some fixes
[jalview.git] / forester / java / src / org / forester / surfacing / DomainParsimonyCalculator.java
1 // $Id:
2 //
3 // FORESTER -- software libraries and applications
4 // for evolutionary biology research and applications.
5 //
6 // Copyright (C) 2008-2009 Christian M. Zmasek
7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
8 // All rights reserved
9 //
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 //
24 // Contact: phylosoft @ gmail . com
25 // WWW: www.phylosoft.org/forester
26
27 package org.forester.surfacing;
28
29 import java.util.HashMap;
30 import java.util.HashSet;
31 import java.util.Iterator;
32 import java.util.List;
33 import java.util.Map;
34 import java.util.Map.Entry;
35 import java.util.Set;
36 import java.util.SortedSet;
37 import java.util.TreeSet;
38
39 import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix;
40 import org.forester.evoinference.matrix.character.CharacterStateMatrix;
41 import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates;
42 import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates;
43 import org.forester.evoinference.parsimony.DolloParsimony;
44 import org.forester.evoinference.parsimony.FitchParsimony;
45 import org.forester.phylogeny.Phylogeny;
46 import org.forester.phylogeny.PhylogenyNode;
47 import org.forester.phylogeny.data.BinaryCharacters;
48 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
49 import org.forester.surfacing.BinaryDomainCombination.DomainCombinationType;
50 import org.forester.util.ForesterUtil;
51
52 public final class DomainParsimonyCalculator {
53
54     private static final String                     TYPE_FORBINARY_CHARACTERS = "parsimony inferred";
55     private CharacterStateMatrix<GainLossStates>    _gain_loss_matrix;
56     private CharacterStateMatrix<BinaryStates>      _binary_internal_states_matrix;
57     private final List<GenomeWideCombinableDomains> _gwcd_list;
58     private final Phylogeny                         _phylogeny;
59     private int                                     _total_losses;
60     private int                                     _total_gains;
61     private int                                     _total_unchanged;
62     private int                                     _cost;
63     private Map<DomainId, Set<String>>              _domain_id_to_secondary_features_map;
64     private SortedSet<DomainId>                     _positive_filter;
65
66     private DomainParsimonyCalculator( final Phylogeny phylogeny ) {
67         init();
68         _phylogeny = phylogeny;
69         _gwcd_list = null;
70     }
71
72     private DomainParsimonyCalculator( final Phylogeny phylogeny, final List<GenomeWideCombinableDomains> gwcd_list ) {
73         init();
74         _phylogeny = phylogeny;
75         _gwcd_list = gwcd_list;
76     }
77
78     private DomainParsimonyCalculator( final Phylogeny phylogeny,
79                                        final List<GenomeWideCombinableDomains> gwcd_list,
80                                        final Map<DomainId, Set<String>> domain_id_to_secondary_features_map ) {
81         init();
82         _phylogeny = phylogeny;
83         _gwcd_list = gwcd_list;
84         setDomainIdToSecondaryFeaturesMap( domain_id_to_secondary_features_map );
85     }
86
87     int calculateNumberOfBinaryDomainCombination() {
88         if ( getGenomeWideCombinableDomainsList().isEmpty() ) {
89             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
90         }
91         final Set<BinaryDomainCombination> all_binary_combinations = new HashSet<BinaryDomainCombination>();
92         for( final GenomeWideCombinableDomains gwcd : getGenomeWideCombinableDomainsList() ) {
93             for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
94                 all_binary_combinations.add( bc );
95             }
96         }
97         return all_binary_combinations.size();
98     }
99
100     CharacterStateMatrix<BinaryStates> createMatrixOfBinaryDomainCombinationPresenceOrAbsence() {
101         return createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
102     }
103
104     CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence() {
105         return createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList(), getPositiveFilter() );
106     }
107
108     CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeaturePresenceOrAbsence( final Map<Species, MappingResults> mapping_results_map ) {
109         return createMatrixOfSecondaryFeaturePresenceOrAbsence( getGenomeWideCombinableDomainsList(),
110                                                                 getDomainIdToSecondaryFeaturesMap(),
111                                                                 mapping_results_map );
112     }
113
114     Phylogeny decoratePhylogenyWithDomains( final Phylogeny phylogeny ) {
115         for( final PhylogenyNodeIterator it = phylogeny.iteratorPostorder(); it.hasNext(); ) {
116             final PhylogenyNode node = it.next();
117             final String node_identifier = node.getName();
118             final BinaryCharacters bc = new BinaryCharacters( getUnitsOnNode( node_identifier ),
119                                                               getUnitsGainedOnNode( node_identifier ),
120                                                               getUnitsLostOnNode( node_identifier ),
121                                                               TYPE_FORBINARY_CHARACTERS,
122                                                               getSumOfPresentOnNode( node_identifier ),
123                                                               getSumOfGainsOnNode( node_identifier ),
124                                                               getSumOfLossesOnNode( node_identifier ) );
125             node.getNodeData().setBinaryCharacters( bc );
126         }
127         return phylogeny;
128     }
129
130     private void executeDolloParsimony( final boolean on_domain_presence ) {
131         reset();
132         final DolloParsimony dollo = DolloParsimony.createInstance();
133         dollo.setReturnGainLossMatrix( true );
134         dollo.setReturnInternalStates( true );
135         CharacterStateMatrix<BinaryStates> states = null;
136         if ( on_domain_presence ) {
137             states = createMatrixOfDomainPresenceOrAbsence();
138         }
139         else {
140             states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence();
141         }
142         dollo.execute( getPhylogeny(), states );
143         setGainLossMatrix( dollo.getGainLossMatrix() );
144         setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() );
145         setCost( dollo.getCost() );
146         setTotalGains( dollo.getTotalGains() );
147         setTotalLosses( dollo.getTotalLosses() );
148         setTotalUnchanged( dollo.getTotalUnchanged() );
149     }
150
151     public void executeDolloParsimonyOnBinaryDomainCombintionPresence() {
152         executeDolloParsimony( false );
153     }
154
155     public void executeDolloParsimonyOnDomainPresence() {
156         executeDolloParsimony( true );
157     }
158
159     public void executeDolloParsimonyOnDomainPresence( final SortedSet<DomainId> positive_filter ) {
160         setPositiveFilter( positive_filter );
161         executeDolloParsimony( true );
162         setPositiveFilter( null );
163     }
164
165     public void executeDolloParsimonyOnSecondaryFeatures( final Map<Species, MappingResults> mapping_results_map ) {
166         if ( getDomainIdToSecondaryFeaturesMap() == null ) {
167             throw new RuntimeException( "Domain id to secondary features map has apparently not been set" );
168         }
169         reset();
170         final DolloParsimony dollo = DolloParsimony.createInstance();
171         dollo.setReturnGainLossMatrix( true );
172         dollo.setReturnInternalStates( true );
173         final CharacterStateMatrix<BinaryStates> states = createMatrixOfSecondaryFeaturePresenceOrAbsence( mapping_results_map );
174         dollo.execute( getPhylogeny(), states );
175         setGainLossMatrix( dollo.getGainLossMatrix() );
176         setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() );
177         setCost( dollo.getCost() );
178         setTotalGains( dollo.getTotalGains() );
179         setTotalLosses( dollo.getTotalLosses() );
180         setTotalUnchanged( dollo.getTotalUnchanged() );
181     }
182
183     private void executeFitchParsimony( final boolean on_domain_presence,
184                                         final boolean use_last,
185                                         final boolean randomize,
186                                         final long random_number_seed ) {
187         reset();
188         if ( use_last ) {
189             System.out.println( "   Fitch parsimony: use_last = true" );
190         }
191         final FitchParsimony<BinaryStates> fitch = new FitchParsimony<BinaryStates>();
192         fitch.setRandomize( randomize );
193         if ( randomize ) {
194             fitch.setRandomNumberSeed( random_number_seed );
195         }
196         fitch.setUseLast( use_last );
197         fitch.setReturnGainLossMatrix( true );
198         fitch.setReturnInternalStates( true );
199         CharacterStateMatrix<BinaryStates> states = null;
200         if ( on_domain_presence ) {
201             states = createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
202         }
203         else {
204             states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
205         }
206         fitch.execute( getPhylogeny(), states );
207         setGainLossMatrix( fitch.getGainLossMatrix() );
208         setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() );
209         setCost( fitch.getCost() );
210         setTotalGains( fitch.getTotalGains() );
211         setTotalLosses( fitch.getTotalLosses() );
212         setTotalUnchanged( fitch.getTotalUnchanged() );
213     }
214     
215     private void executeFitchParsimonyOnSecondaryFeatures( 
216                                         final boolean use_last,
217                                         final boolean randomize,
218                                         final long random_number_seed ) {
219         reset();
220         if ( use_last ) {
221             System.out.println( "   Fitch parsimony: use_last = true" );
222         }
223         final FitchParsimony<BinaryStates> fitch = new FitchParsimony<BinaryStates>();
224         fitch.setRandomize( randomize );
225         if ( randomize ) {
226             fitch.setRandomNumberSeed( random_number_seed );
227         }
228         fitch.setUseLast( use_last );
229         fitch.setReturnGainLossMatrix( true );
230         fitch.setReturnInternalStates( true );
231         
232         final Map<DomainId, Set<String>> map = getDomainIdToSecondaryFeaturesMap();
233         final Map<DomainId, String> newmap = new HashMap<DomainId, String>();
234         final Iterator<Entry<DomainId, Set<String>>> it = map.entrySet().iterator();
235         while (it.hasNext()) {
236             final Map.Entry<DomainId, Set<String>> pair =  (Map.Entry<DomainId, Set<String>>)it.next();
237             if ( pair.getValue().size() != 1 ) {
238                 throw new IllegalArgumentException( pair.getKey().getId() + " mapps to " + pair.getValue().size() + " items" );
239             }
240             newmap.put( pair.getKey(), ( String ) pair.getValue().toArray()[ 0 ] );
241         }
242         
243         CharacterStateMatrix<BinaryStates> states  =createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList(),
244                                                                                 newmap );
245        
246         fitch.execute( getPhylogeny(), states );
247         setGainLossMatrix( fitch.getGainLossMatrix() );
248         setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() );
249         setCost( fitch.getCost() );
250         setTotalGains( fitch.getTotalGains() );
251         setTotalLosses( fitch.getTotalLosses() );
252         setTotalUnchanged( fitch.getTotalUnchanged() );
253     }
254
255     public void executeFitchParsimonyOnBinaryDomainCombintion( final boolean use_last ) {
256         executeFitchParsimony( false, use_last, false, 0 );
257     }
258     
259     public void executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( final boolean use_last ) {
260         executeFitchParsimonyOnSecondaryFeatures( use_last, false, 0 );
261     }
262
263     public void executeFitchParsimonyOnBinaryDomainCombintion( final long random_number_seed ) {
264         executeFitchParsimony( false, false, true, random_number_seed );
265     }
266
267     public void executeFitchParsimonyOnDomainPresence( final boolean use_last ) {
268         executeFitchParsimony( true, use_last, false, 0 );
269     }
270
271     public void executeFitchParsimonyOnDomainPresence( final long random_number_seed ) {
272         executeFitchParsimony( true, false, true, random_number_seed );
273     }
274
275     public void executeOnGivenBinaryStatesMatrix( final CharacterStateMatrix<BinaryStates> binary_states_matrix,
276                                                   final String[] character_labels ) {
277         reset();
278         if ( binary_states_matrix.getNumberOfCharacters() != character_labels.length ) {
279             throw new IllegalArgumentException( "binary states matrix number of characters is not equal to the number of character labels provided" );
280         }
281         if ( binary_states_matrix.getNumberOfIdentifiers() != getPhylogeny().getNumberOfBranches() ) {
282             throw new IllegalArgumentException( "binary states matrix number of identifiers is not equal to the number of tree nodes provided" );
283         }
284         final CharacterStateMatrix<GainLossStates> gl_matrix = new BasicCharacterStateMatrix<GainLossStates>( binary_states_matrix
285                                                                                                                       .getNumberOfIdentifiers(),
286                                                                                                               binary_states_matrix
287                                                                                                                       .getNumberOfCharacters() );
288         int total_gains = 0;
289         int total_losses = 0;
290         int total_unchanged = 0;
291         int i = 0;
292         for( final PhylogenyNodeIterator it = getPhylogeny().iteratorPostorder(); it.hasNext(); ) {
293             gl_matrix.setIdentifier( i++, it.next().getName() );
294         }
295         for( int c = 0; c < character_labels.length; ++c ) {
296             gl_matrix.setCharacter( c, character_labels[ c ] );
297             final PhylogenyNodeIterator it = getPhylogeny().iteratorPostorder();
298             while ( it.hasNext() ) {
299                 final PhylogenyNode node = it.next();
300                 final String name = node.getName();
301                 final BinaryStates bin_state = binary_states_matrix.getState( binary_states_matrix
302                         .getIdentifierIndex( name ), c );
303                 final PhylogenyNode parent_node = getPhylogeny().getNode( name ).getParent();
304                 GainLossStates gl_state = null;
305                 if ( node.isRoot() ) {
306                     ++total_unchanged;
307                     if ( bin_state == BinaryStates.ABSENT ) {
308                         gl_state = GainLossStates.UNCHANGED_ABSENT;
309                     }
310                     else {
311                         gl_state = GainLossStates.UNCHANGED_PRESENT;
312                     }
313                 }
314                 else {
315                     final BinaryStates parent_bin_state = binary_states_matrix.getState( binary_states_matrix
316                             .getIdentifierIndex( parent_node.getName() ), c );
317                     if ( bin_state == BinaryStates.ABSENT ) {
318                         if ( parent_bin_state == BinaryStates.ABSENT ) {
319                             ++total_unchanged;
320                             gl_state = GainLossStates.UNCHANGED_ABSENT;
321                         }
322                         else {
323                             ++total_losses;
324                             gl_state = GainLossStates.LOSS;
325                         }
326                     }
327                     else {
328                         if ( parent_bin_state == BinaryStates.ABSENT ) {
329                             ++total_gains;
330                             gl_state = GainLossStates.GAIN;
331                         }
332                         else {
333                             ++total_unchanged;
334                             gl_state = GainLossStates.UNCHANGED_PRESENT;
335                         }
336                     }
337                 }
338                 gl_matrix.setState( name, c, gl_state );
339             }
340         }
341         setTotalGains( total_gains );
342         setTotalLosses( total_losses );
343         setTotalUnchanged( total_unchanged );
344         setCost( total_gains + total_losses );
345         setGainLossMatrix( gl_matrix );
346     }
347
348     public int getCost() {
349         return _cost;
350     }
351
352     private Map<DomainId, Set<String>> getDomainIdToSecondaryFeaturesMap() {
353         return _domain_id_to_secondary_features_map;
354     }
355
356     public CharacterStateMatrix<Integer> getGainLossCountsMatrix() {
357         final CharacterStateMatrix<Integer> matrix = new BasicCharacterStateMatrix<Integer>( getGainLossMatrix()
358                 .getNumberOfIdentifiers(), 3 );
359         for( int i = 0; i < getGainLossMatrix().getNumberOfIdentifiers(); ++i ) {
360             matrix.setIdentifier( i, getGainLossMatrix().getIdentifier( i ) );
361         }
362         matrix.setCharacter( 0, "GAINS" );
363         matrix.setCharacter( 1, "LOSSES" );
364         matrix.setCharacter( 2, "NET" );
365         for( int i = 0; i < getGainLossMatrix().getNumberOfIdentifiers(); ++i ) {
366             int gains = 0;
367             int losses = 0;
368             for( int c = 0; c < getGainLossMatrix().getNumberOfCharacters(); ++c ) {
369                 final GainLossStates s = getGainLossMatrix().getState( i, c );
370                 if ( s == GainLossStates.GAIN ) {
371                     ++gains;
372                 }
373                 else if ( s == GainLossStates.LOSS ) {
374                     ++losses;
375                 }
376             }
377             matrix.setState( i, 0, gains );
378             matrix.setState( i, 1, losses );
379             matrix.setState( i, 2, gains - losses );
380         }
381         return matrix;
382     }
383
384     public CharacterStateMatrix<GainLossStates> getGainLossMatrix() {
385         return _gain_loss_matrix;
386     }
387
388     private List<GenomeWideCombinableDomains> getGenomeWideCombinableDomainsList() {
389         return _gwcd_list;
390     }
391
392     public CharacterStateMatrix<BinaryStates> getInternalStatesMatrix() {
393         return _binary_internal_states_matrix;
394     }
395
396     public int getNetGainsOnNode( final String node_identifier ) {
397         if ( getGainLossMatrix() == null ) {
398             throw new RuntimeException( "no gain loss matrix has been calculated" );
399         }
400         int net = 0;
401         final int id_index = getGainLossMatrix().getIdentifierIndex( node_identifier );
402         for( int c = 0; c < getGainLossMatrix().getNumberOfCharacters(); ++c ) {
403             if ( getGainLossMatrix().getState( id_index, c ) == GainLossStates.GAIN ) {
404                 ++net;
405             }
406             else if ( getGainLossMatrix().getState( id_index, c ) == GainLossStates.LOSS ) {
407                 --net;
408             }
409         }
410         return net;
411     }
412
413     private Phylogeny getPhylogeny() {
414         return _phylogeny;
415     }
416
417     private SortedSet<DomainId> getPositiveFilter() {
418         return _positive_filter;
419     }
420
421     public int getSumOfGainsOnNode( final String node_identifier ) {
422         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN );
423     }
424
425     public int getSumOfLossesOnNode( final String node_identifier ) {
426         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.LOSS );
427     }
428
429     public int getSumOfPresentOnNode( final String node_identifier ) {
430         return getSumOfGainsOnNode( node_identifier ) + getSumOfUnchangedPresentOnNode( node_identifier );
431     }
432
433     int getSumOfUnchangedAbsentOnNode( final String node_identifier ) {
434         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT );
435     }
436
437     int getSumOfUnchangedOnNode( final String node_identifier ) {
438         return getSumOfUnchangedPresentOnNode( node_identifier ) + getSumOfUnchangedAbsentOnNode( node_identifier );
439     }
440
441     int getSumOfUnchangedPresentOnNode( final String node_identifier ) {
442         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT );
443     }
444
445     public int getTotalGains() {
446         return _total_gains;
447     }
448
449     public int getTotalLosses() {
450         return _total_losses;
451     }
452
453     public int getTotalUnchanged() {
454         return _total_unchanged;
455     }
456
457     public SortedSet<String> getUnitsGainedOnNode( final String node_identifier ) {
458         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN );
459     }
460
461     public SortedSet<String> getUnitsLostOnNode( final String node_identifier ) {
462         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.LOSS );
463     }
464
465     public SortedSet<String> getUnitsOnNode( final String node_identifier ) {
466         final SortedSet<String> present = getUnitsGainedOnNode( node_identifier );
467         present.addAll( getUnitsUnchangedPresentOnNode( node_identifier ) );
468         return present;
469     }
470
471     SortedSet<String> getUnitsUnchangedAbsentOnNode( final String node_identifier ) {
472         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT );
473     }
474
475     SortedSet<String> getUnitsUnchangedPresentOnNode( final String node_identifier ) {
476         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT );
477     }
478
479     private void init() {
480         setDomainIdToSecondaryFeaturesMap( null );
481         setPositiveFilter( null );
482         reset();
483     }
484
485     private void reset() {
486         setGainLossMatrix( null );
487         setBinaryInternalStatesMatrix( null );
488         setCost( -1 );
489         setTotalGains( -1 );
490         setTotalLosses( -1 );
491         setTotalUnchanged( -1 );
492     }
493
494     private void setBinaryInternalStatesMatrix( final CharacterStateMatrix<BinaryStates> binary_states_matrix ) {
495         _binary_internal_states_matrix = binary_states_matrix;
496     }
497
498     private void setCost( final int cost ) {
499         _cost = cost;
500     }
501
502     private void setDomainIdToSecondaryFeaturesMap( final Map<DomainId, Set<String>> domain_id_to_secondary_features_map ) {
503         _domain_id_to_secondary_features_map = domain_id_to_secondary_features_map;
504     }
505
506     private void setGainLossMatrix( final CharacterStateMatrix<GainLossStates> gain_loss_matrix ) {
507         _gain_loss_matrix = gain_loss_matrix;
508     }
509
510     private void setPositiveFilter( final SortedSet<DomainId> positive_filter ) {
511         _positive_filter = positive_filter;
512     }
513
514     private void setTotalGains( final int total_gains ) {
515         _total_gains = total_gains;
516     }
517
518     private void setTotalLosses( final int total_losses ) {
519         _total_losses = total_losses;
520     }
521
522     private void setTotalUnchanged( final int total_unchanged ) {
523         _total_unchanged = total_unchanged;
524     }
525
526     public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny ) {
527         return new DomainParsimonyCalculator( phylogeny );
528     }
529
530     public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny,
531                                                             final List<GenomeWideCombinableDomains> gwcd_list ) {
532         if ( phylogeny.getNumberOfExternalNodes() != gwcd_list.size() ) {
533             throw new IllegalArgumentException( "number of external nodes [" + phylogeny.getNumberOfExternalNodes()
534                     + "] does not equal size of genome wide combinable domains list [" + gwcd_list.size() + "]" );
535         }
536         return new DomainParsimonyCalculator( phylogeny, gwcd_list );
537     }
538
539     public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny,
540                                                             final List<GenomeWideCombinableDomains> gwcd_list,
541                                                             final Map<DomainId, Set<String>> domain_id_to_secondary_features_map ) {
542         if ( phylogeny.getNumberOfExternalNodes() != gwcd_list.size() ) {
543             throw new IllegalArgumentException( "size of external nodes does not equal size of genome wide combinable domains list" );
544         }
545         return new DomainParsimonyCalculator( phylogeny, gwcd_list, domain_id_to_secondary_features_map );
546     }
547
548     
549     /**
550      * For folds instead of Pfam-domains, for example
551      * 
552      * 
553      * @param gwcd_list
554      * @return
555      */
556     static CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeaturePresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
557                                                                                                final Map<DomainId, Set<String>> domain_id_to_second_features_map,
558                                                                                                final Map<Species, MappingResults> mapping_results_map ) {
559         if ( gwcd_list.isEmpty() ) {
560             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
561         }
562         if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) {
563             throw new IllegalArgumentException( "domain id to secondary features map is null or empty" );
564         }
565         final int number_of_identifiers = gwcd_list.size();
566         final SortedSet<String> all_secondary_features = new TreeSet<String>();
567         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
568             int mapped = 0;
569             int not_mapped = 0;
570             for( final DomainId domain : gwcd.getAllDomainIds() ) {
571                 if ( domain_id_to_second_features_map.containsKey( domain ) ) {
572                     all_secondary_features.addAll( domain_id_to_second_features_map.get( domain ) );
573                     mapped++;
574                 }
575                 else {
576                     not_mapped++;
577                 }
578             }
579             if ( mapping_results_map != null ) {
580                 final MappingResults mr = new MappingResults();
581                 mr.setDescription( gwcd.getSpecies().getSpeciesId() );
582                 mr.setSumOfSuccesses( mapped );
583                 mr.setSumOfFailures( not_mapped );
584                 mapping_results_map.put( gwcd.getSpecies(), mr );
585             }
586         }
587         final int number_of_characters = all_secondary_features.size();
588         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
589                                                                                                                                                  number_of_characters );
590         int character_index = 0;
591         for( final String second_id : all_secondary_features ) {
592             matrix.setCharacter( character_index++, second_id );
593         }
594         int identifier_index = 0;
595         final Set<String> all_identifiers = new HashSet<String>();
596         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
597             final String species_id = gwcd.getSpecies().getSpeciesId();
598             if ( all_identifiers.contains( species_id ) ) {
599                 throw new IllegalArgumentException( "species [" + species_id + "] is not unique" );
600             }
601             all_identifiers.add( species_id );
602             matrix.setIdentifier( identifier_index, species_id );
603             final Set<String> all_second_per_gwcd = new HashSet<String>();
604             for( final DomainId domain : gwcd.getAllDomainIds() ) {
605                 if ( domain_id_to_second_features_map.containsKey( domain ) ) {
606                     all_second_per_gwcd.addAll( domain_id_to_second_features_map.get( domain ) );
607                 }
608             }
609             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
610                 if ( all_second_per_gwcd.contains( matrix.getCharacter( ci ) ) ) {
611                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
612                 }
613                 else {
614                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
615                 }
616             }
617             ++identifier_index;
618         }
619         return matrix;
620     }
621     
622     
623     public static CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
624                                                                                                                              final Map<DomainId, String> domain_id_to_second_features_map) {
625         if ( gwcd_list.isEmpty() ) {
626             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
627         }
628         if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) {
629             throw new IllegalArgumentException( "domain id to secondary features map is null or empty" );
630         }
631         final int number_of_identifiers = gwcd_list.size();
632         final SortedSet<BinaryDomainCombination> all_binary_combinations_mapped = new TreeSet<BinaryDomainCombination>();
633         final Set<BinaryDomainCombination>[] binary_combinations_per_genome_mapped = new HashSet[ number_of_identifiers ];
634         int identifier_index = 0;
635         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
636             binary_combinations_per_genome_mapped[ identifier_index ] = new HashSet<BinaryDomainCombination>();
637             for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
638                 
639                 if ( !domain_id_to_second_features_map.containsKey( bc.getId0() ) ) {
640                     throw new IllegalArgumentException( "no mapping found for " + bc.getId0() );
641                 }
642                 if ( !domain_id_to_second_features_map.containsKey( bc.getId1() ) ) {
643                     throw new IllegalArgumentException( "no mapping found for " + bc.getId1() );
644                 }
645                 
646                 
647                 final BinaryDomainCombination mapped_bc = new  BasicBinaryDomainCombination( domain_id_to_second_features_map.get(  bc.getId0()) ,
648                                                                                              domain_id_to_second_features_map.get( bc.getId1()) );
649                 all_binary_combinations_mapped.add( mapped_bc );
650                 binary_combinations_per_genome_mapped[ identifier_index ].add( mapped_bc );
651             }
652             ++identifier_index;
653         }
654        
655         final int number_of_characters = all_binary_combinations_mapped.size();
656         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
657                                                                                                                                                  number_of_characters );
658         int character_index = 0;
659         for( final BinaryDomainCombination bc : all_binary_combinations_mapped ) {
660             matrix.setCharacter( character_index++, bc.toString() );
661         }
662         identifier_index = 0;
663         final Set<String> all_identifiers = new HashSet<String>();
664         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
665             final String species_id = gwcd.getSpecies().getSpeciesId();
666             if ( all_identifiers.contains( species_id ) ) {
667                 throw new AssertionError( "species [" + species_id + "] is not unique" );
668             }
669             all_identifiers.add( species_id );
670             matrix.setIdentifier( identifier_index, species_id );
671             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
672                 BinaryDomainCombination bc = null;
673                 if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) {
674                     bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
675                 }
676                 else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) {
677                     bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
678                 }
679                 else {
680                     bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
681                 }
682                 if ( binary_combinations_per_genome_mapped[ identifier_index ].contains( bc ) ) {
683                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
684                 }
685                 else {
686                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
687                 }
688             }
689             ++identifier_index;
690         }
691         return matrix;
692     }
693     
694     
695     
696     
697     public static CharacterStateMatrix<BinaryStates> createMatrixOfBinaryDomainCombinationPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list ) {
698         if ( gwcd_list.isEmpty() ) {
699             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
700         }
701         final int number_of_identifiers = gwcd_list.size();
702         final SortedSet<BinaryDomainCombination> all_binary_combinations = new TreeSet<BinaryDomainCombination>();
703         final Set<BinaryDomainCombination>[] binary_combinations_per_genome = new HashSet[ number_of_identifiers ];
704         int identifier_index = 0;
705         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
706             binary_combinations_per_genome[ identifier_index ] = new HashSet<BinaryDomainCombination>();
707             for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
708                 all_binary_combinations.add( bc );
709                 binary_combinations_per_genome[ identifier_index ].add( bc );
710             }
711             ++identifier_index;
712         }
713         final int number_of_characters = all_binary_combinations.size();
714         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
715                                                                                                                                                  number_of_characters );
716         int character_index = 0;
717         for( final BinaryDomainCombination bc : all_binary_combinations ) {
718             matrix.setCharacter( character_index++, bc.toString() );
719         }
720         identifier_index = 0;
721         final Set<String> all_identifiers = new HashSet<String>();
722         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
723             final String species_id = gwcd.getSpecies().getSpeciesId();
724             if ( all_identifiers.contains( species_id ) ) {
725                 throw new AssertionError( "species [" + species_id + "] is not unique" );
726             }
727             all_identifiers.add( species_id );
728             matrix.setIdentifier( identifier_index, species_id );
729             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
730                 BinaryDomainCombination bc = null;
731                 if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) {
732                     bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
733                 }
734                 else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) {
735                     bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
736                 }
737                 else {
738                     bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
739                 }
740                 if ( binary_combinations_per_genome[ identifier_index ].contains( bc ) ) {
741                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
742                 }
743                 else {
744                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
745                 }
746             }
747             ++identifier_index;
748         }
749         return matrix;
750     }
751
752     static CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list ) {
753         return createMatrixOfDomainPresenceOrAbsence( gwcd_list, null );
754     }
755
756     public static CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
757                                                                                             final SortedSet<DomainId> positive_filter ) {
758         if ( gwcd_list.isEmpty() ) {
759             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
760         }
761         if ( ( positive_filter != null ) && ( positive_filter.size() < 1 ) ) {
762             throw new IllegalArgumentException( "positive filter is empty" );
763         }
764         final int number_of_identifiers = gwcd_list.size();
765         final SortedSet<DomainId> all_domain_ids = new TreeSet<DomainId>();
766         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
767             for( final DomainId domain : gwcd.getAllDomainIds() ) {
768                 all_domain_ids.add( domain );
769             }
770         }
771         int number_of_characters = all_domain_ids.size();
772         if ( positive_filter != null ) {
773             //number_of_characters = positive_filter.size(); -- bad if doms in filter but not in genomes 
774             number_of_characters = 0;
775             for( final DomainId id : all_domain_ids ) {
776                 if ( positive_filter.contains( id ) ) {
777                     number_of_characters++;
778                 }
779             }
780         }
781         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
782                                                                                                                                                  number_of_characters );
783         int character_index = 0;
784         for( final DomainId id : all_domain_ids ) {
785             if ( positive_filter == null ) {
786                 matrix.setCharacter( character_index++, id.getId() );
787             }
788             else {
789                 if ( positive_filter.contains( id ) ) {
790                     matrix.setCharacter( character_index++, id.getId() );
791                 }
792             }
793         }
794         int identifier_index = 0;
795         final Set<String> all_identifiers = new HashSet<String>();
796         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
797             final String species_id = gwcd.getSpecies().getSpeciesId();
798             if ( all_identifiers.contains( species_id ) ) {
799                 throw new IllegalArgumentException( "species [" + species_id + "] is not unique" );
800             }
801             all_identifiers.add( species_id );
802             matrix.setIdentifier( identifier_index, species_id );
803             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
804                 if ( ForesterUtil.isEmpty( matrix.getCharacter( ci ) ) ) {
805                     throw new RuntimeException( "this should not have happened: problem with character #" + ci );
806                 }
807                 final DomainId id = new DomainId( matrix.getCharacter( ci ) );
808                 if ( gwcd.contains( id ) ) {
809                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
810                 }
811                 else {
812                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
813                 }
814             }
815             ++identifier_index;
816         }
817         return matrix;
818     }
819
820    
821
822     private static int getStateSumDeltaOnNode( final String node_identifier,
823                                                final CharacterStateMatrix<GainLossStates> gain_loss_matrix,
824                                                final GainLossStates state ) {
825         if ( gain_loss_matrix == null ) {
826             throw new RuntimeException( "no gain loss matrix has been calculated" );
827         }
828         if ( ForesterUtil.isEmpty( node_identifier ) ) {
829             throw new IllegalArgumentException( "node identifier must not be empty" );
830         }
831         if ( gain_loss_matrix.isEmpty() ) {
832             throw new RuntimeException( "gain loss matrix is empty" );
833         }
834         int sum = 0;
835         final int id_index = gain_loss_matrix.getIdentifierIndex( node_identifier );
836         for( int c = 0; c < gain_loss_matrix.getNumberOfCharacters(); ++c ) {
837             if ( gain_loss_matrix.getState( id_index, c ) == state ) {
838                 ++sum;
839             }
840         }
841         return sum;
842     }
843
844     private static SortedSet<String> getUnitsDeltaOnNode( final String node_identifier,
845                                                           final CharacterStateMatrix<GainLossStates> gain_loss_matrix,
846                                                           final GainLossStates state ) {
847         if ( gain_loss_matrix == null ) {
848             throw new RuntimeException( "no gain loss matrix has been calculated" );
849         }
850         if ( ForesterUtil.isEmpty( node_identifier ) ) {
851             throw new IllegalArgumentException( "node identifier must not be empty" );
852         }
853         if ( gain_loss_matrix.isEmpty() ) {
854             throw new RuntimeException( "gain loss matrix is empty" );
855         }
856         final SortedSet<String> d = new TreeSet<String>();
857         final int id_index = gain_loss_matrix.getIdentifierIndex( node_identifier );
858         for( int c = 0; c < gain_loss_matrix.getNumberOfCharacters(); ++c ) {
859             if ( gain_loss_matrix.getState( id_index, c ) == state ) {
860                 if ( d.contains( gain_loss_matrix.getCharacter( c ) ) ) {
861                     throw new AssertionError( "this should not have happended: character ["
862                             + gain_loss_matrix.getCharacter( c ) + "] already in set" );
863                 }
864                 d.add( gain_loss_matrix.getCharacter( c ) );
865             }
866         }
867         return d;
868     }
869 }