cleanup
[jalview.git] / forester / java / src / org / forester / surfacing / DomainParsimonyCalculator.java
1 // $Id:
2 //
3 // FORESTER -- software libraries and applications
4 // for evolutionary biology research and applications.
5 //
6 // Copyright (C) 2008-2009 Christian M. Zmasek
7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
8 // All rights reserved
9 //
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 //
24 // Contact: phylosoft @ gmail . com
25 // WWW: www.phylosoft.org/forester
26
27 package org.forester.surfacing;
28
29 import java.util.HashMap;
30 import java.util.HashSet;
31 import java.util.Iterator;
32 import java.util.List;
33 import java.util.Map;
34 import java.util.Map.Entry;
35 import java.util.Set;
36 import java.util.SortedSet;
37 import java.util.TreeSet;
38
39 import org.forester.application.surfacing;
40 import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix;
41 import org.forester.evoinference.matrix.character.CharacterStateMatrix;
42 import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates;
43 import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates;
44 import org.forester.evoinference.parsimony.DolloParsimony;
45 import org.forester.evoinference.parsimony.FitchParsimony;
46 import org.forester.phylogeny.Phylogeny;
47 import org.forester.phylogeny.PhylogenyNode;
48 import org.forester.phylogeny.data.BinaryCharacters;
49 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
50 import org.forester.surfacing.BinaryDomainCombination.DomainCombinationType;
51 import org.forester.util.ForesterUtil;
52
53 public final class DomainParsimonyCalculator {
54
55     private static final String                     TYPE_FORBINARY_CHARACTERS = "parsimony inferred";
56     private CharacterStateMatrix<GainLossStates>    _gain_loss_matrix;
57     private CharacterStateMatrix<BinaryStates>      _binary_internal_states_matrix;
58     private final List<GenomeWideCombinableDomains> _gwcd_list;
59     private final Phylogeny                         _phylogeny;
60     private int                                     _total_losses;
61     private int                                     _total_gains;
62     private int                                     _total_unchanged;
63     private int                                     _cost;
64     private Map<DomainId, Set<String>>              _domain_id_to_secondary_features_map;
65     private SortedSet<DomainId>                     _positive_filter;
66
67     private DomainParsimonyCalculator( final Phylogeny phylogeny ) {
68         init();
69         _phylogeny = phylogeny;
70         _gwcd_list = null;
71     }
72
73     private DomainParsimonyCalculator( final Phylogeny phylogeny, final List<GenomeWideCombinableDomains> gwcd_list ) {
74         init();
75         _phylogeny = phylogeny;
76         _gwcd_list = gwcd_list;
77     }
78
79     private DomainParsimonyCalculator( final Phylogeny phylogeny,
80                                        final List<GenomeWideCombinableDomains> gwcd_list,
81                                        final Map<DomainId, Set<String>> domain_id_to_secondary_features_map ) {
82         init();
83         _phylogeny = phylogeny;
84         _gwcd_list = gwcd_list;
85         setDomainIdToSecondaryFeaturesMap( domain_id_to_secondary_features_map );
86     }
87
88     int calculateNumberOfBinaryDomainCombination() {
89         if ( getGenomeWideCombinableDomainsList().isEmpty() ) {
90             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
91         }
92         final Set<BinaryDomainCombination> all_binary_combinations = new HashSet<BinaryDomainCombination>();
93         for( final GenomeWideCombinableDomains gwcd : getGenomeWideCombinableDomainsList() ) {
94             for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
95                 all_binary_combinations.add( bc );
96             }
97         }
98         return all_binary_combinations.size();
99     }
100
101     CharacterStateMatrix<BinaryStates> createMatrixOfBinaryDomainCombinationPresenceOrAbsence() {
102         return createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
103     }
104
105     CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence() {
106         return createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList(), getPositiveFilter() );
107     }
108
109     CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeaturePresenceOrAbsence( final Map<Species, MappingResults> mapping_results_map ) {
110         return createMatrixOfSecondaryFeaturePresenceOrAbsence( getGenomeWideCombinableDomainsList(),
111                                                                 getDomainIdToSecondaryFeaturesMap(),
112                                                                 mapping_results_map );
113     }
114
115     Phylogeny decoratePhylogenyWithDomains( final Phylogeny phylogeny ) {
116         for( final PhylogenyNodeIterator it = phylogeny.iteratorPostorder(); it.hasNext(); ) {
117             final PhylogenyNode node = it.next();
118             final String node_identifier = node.getName();
119             final BinaryCharacters bc = new BinaryCharacters( getUnitsOnNode( node_identifier ),
120                                                               getUnitsGainedOnNode( node_identifier ),
121                                                               getUnitsLostOnNode( node_identifier ),
122                                                               TYPE_FORBINARY_CHARACTERS,
123                                                               getSumOfPresentOnNode( node_identifier ),
124                                                               getSumOfGainsOnNode( node_identifier ),
125                                                               getSumOfLossesOnNode( node_identifier ) );
126             node.getNodeData().setBinaryCharacters( bc );
127         }
128         return phylogeny;
129     }
130
131     private void executeDolloParsimony( final boolean on_domain_presence ) {
132         reset();
133         final DolloParsimony dollo = DolloParsimony.createInstance();
134         dollo.setReturnGainLossMatrix( true );
135         dollo.setReturnInternalStates( true );
136         CharacterStateMatrix<BinaryStates> states = null;
137         if ( on_domain_presence ) {
138             states = createMatrixOfDomainPresenceOrAbsence();
139         }
140         else {
141             states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence();
142         }
143         dollo.execute( getPhylogeny(), states );
144         setGainLossMatrix( dollo.getGainLossMatrix() );
145         setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() );
146         setCost( dollo.getCost() );
147         setTotalGains( dollo.getTotalGains() );
148         setTotalLosses( dollo.getTotalLosses() );
149         setTotalUnchanged( dollo.getTotalUnchanged() );
150     }
151
152     public void executeDolloParsimonyOnBinaryDomainCombintionPresence() {
153         executeDolloParsimony( false );
154     }
155
156     public void executeDolloParsimonyOnDomainPresence() {
157         executeDolloParsimony( true );
158     }
159
160     public void executeDolloParsimonyOnDomainPresence( final SortedSet<DomainId> positive_filter ) {
161         setPositiveFilter( positive_filter );
162         executeDolloParsimony( true );
163         setPositiveFilter( null );
164     }
165
166     public void executeDolloParsimonyOnSecondaryFeatures( final Map<Species, MappingResults> mapping_results_map ) {
167         if ( getDomainIdToSecondaryFeaturesMap() == null ) {
168             throw new RuntimeException( "Domain id to secondary features map has apparently not been set" );
169         }
170         reset();
171         final DolloParsimony dollo = DolloParsimony.createInstance();
172         dollo.setReturnGainLossMatrix( true );
173         dollo.setReturnInternalStates( true );
174         final CharacterStateMatrix<BinaryStates> states = createMatrixOfSecondaryFeaturePresenceOrAbsence( mapping_results_map );
175         dollo.execute( getPhylogeny(), states );
176         setGainLossMatrix( dollo.getGainLossMatrix() );
177         setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() );
178         setCost( dollo.getCost() );
179         setTotalGains( dollo.getTotalGains() );
180         setTotalLosses( dollo.getTotalLosses() );
181         setTotalUnchanged( dollo.getTotalUnchanged() );
182     }
183
184     private void executeFitchParsimony( final boolean on_domain_presence,
185                                         final boolean use_last,
186                                         final boolean randomize,
187                                         final long random_number_seed ) {
188         reset();
189         if ( use_last ) {
190             System.out.println( "   Fitch parsimony: use_last = true" );
191         }
192         final FitchParsimony<BinaryStates> fitch = new FitchParsimony<BinaryStates>();
193         fitch.setRandomize( randomize );
194         if ( randomize ) {
195             fitch.setRandomNumberSeed( random_number_seed );
196         }
197         fitch.setUseLast( use_last );
198         fitch.setReturnGainLossMatrix( true );
199         fitch.setReturnInternalStates( true );
200         CharacterStateMatrix<BinaryStates> states = null;
201         if ( on_domain_presence ) {
202             states = createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
203         }
204         else {
205             states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
206         }
207         fitch.execute( getPhylogeny(), states );
208         setGainLossMatrix( fitch.getGainLossMatrix() );
209         setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() );
210         setCost( fitch.getCost() );
211         setTotalGains( fitch.getTotalGains() );
212         setTotalLosses( fitch.getTotalLosses() );
213         setTotalUnchanged( fitch.getTotalUnchanged() );
214     }
215
216     private void executeFitchParsimonyOnSecondaryFeatures( final boolean use_last,
217                                                            final boolean randomize,
218                                                            final long random_number_seed ) {
219         reset();
220         if ( use_last ) {
221             System.out.println( "   Fitch parsimony: use_last = true" );
222         }
223         final FitchParsimony<BinaryStates> fitch = new FitchParsimony<BinaryStates>();
224         fitch.setRandomize( randomize );
225         if ( randomize ) {
226             fitch.setRandomNumberSeed( random_number_seed );
227         }
228         fitch.setUseLast( use_last );
229         fitch.setReturnGainLossMatrix( true );
230         fitch.setReturnInternalStates( true );
231         final Map<DomainId, Set<String>> map = getDomainIdToSecondaryFeaturesMap();
232         final Map<DomainId, String> newmap = new HashMap<DomainId, String>();
233         final Iterator<Entry<DomainId, Set<String>>> it = map.entrySet().iterator();
234         while ( it.hasNext() ) {
235             final Map.Entry<DomainId, Set<String>> pair = it.next();
236             if ( pair.getValue().size() != 1 ) {
237                 throw new IllegalArgumentException( pair.getKey().getId() + " mapps to " + pair.getValue().size()
238                         + " items" );
239             }
240             newmap.put( pair.getKey(), ( String ) pair.getValue().toArray()[ 0 ] );
241         }
242         final CharacterStateMatrix<BinaryStates> states = createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList(),
243                                                                                                                                   newmap );
244         fitch.execute( getPhylogeny(), states );
245         setGainLossMatrix( fitch.getGainLossMatrix() );
246         setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() );
247         setCost( fitch.getCost() );
248         setTotalGains( fitch.getTotalGains() );
249         setTotalLosses( fitch.getTotalLosses() );
250         setTotalUnchanged( fitch.getTotalUnchanged() );
251     }
252
253     public void executeFitchParsimonyOnBinaryDomainCombintion( final boolean use_last ) {
254         executeFitchParsimony( false, use_last, false, 0 );
255     }
256
257     public void executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( final boolean use_last ) {
258         executeFitchParsimonyOnSecondaryFeatures( use_last, false, 0 );
259     }
260
261     public void executeFitchParsimonyOnBinaryDomainCombintion( final long random_number_seed ) {
262         executeFitchParsimony( false, false, true, random_number_seed );
263     }
264
265     public void executeFitchParsimonyOnDomainPresence( final boolean use_last ) {
266         executeFitchParsimony( true, use_last, false, 0 );
267     }
268
269     public void executeFitchParsimonyOnDomainPresence( final long random_number_seed ) {
270         executeFitchParsimony( true, false, true, random_number_seed );
271     }
272
273     public void executeOnGivenBinaryStatesMatrix( final CharacterStateMatrix<BinaryStates> binary_states_matrix,
274                                                   final String[] character_labels ) {
275         reset();
276         if ( binary_states_matrix.getNumberOfCharacters() != character_labels.length ) {
277             throw new IllegalArgumentException( "binary states matrix number of characters is not equal to the number of character labels provided" );
278         }
279         if ( binary_states_matrix.getNumberOfIdentifiers() != getPhylogeny().getNumberOfBranches() ) {
280             throw new IllegalArgumentException( "binary states matrix number of identifiers is not equal to the number of tree nodes provided" );
281         }
282         final CharacterStateMatrix<GainLossStates> gl_matrix = new BasicCharacterStateMatrix<GainLossStates>( binary_states_matrix
283                                                                                                                       .getNumberOfIdentifiers(),
284                                                                                                               binary_states_matrix
285                                                                                                                       .getNumberOfCharacters() );
286         int total_gains = 0;
287         int total_losses = 0;
288         int total_unchanged = 0;
289         int i = 0;
290         for( final PhylogenyNodeIterator it = getPhylogeny().iteratorPostorder(); it.hasNext(); ) {
291             gl_matrix.setIdentifier( i++, it.next().getName() );
292         }
293         for( int c = 0; c < character_labels.length; ++c ) {
294             gl_matrix.setCharacter( c, character_labels[ c ] );
295             final PhylogenyNodeIterator it = getPhylogeny().iteratorPostorder();
296             while ( it.hasNext() ) {
297                 final PhylogenyNode node = it.next();
298                 final String name = node.getName();
299                 final BinaryStates bin_state = binary_states_matrix.getState( binary_states_matrix
300                         .getIdentifierIndex( name ), c );
301                 final PhylogenyNode parent_node = getPhylogeny().getNode( name ).getParent();
302                 GainLossStates gl_state = null;
303                 if ( node.isRoot() ) {
304                     ++total_unchanged;
305                     if ( bin_state == BinaryStates.ABSENT ) {
306                         gl_state = GainLossStates.UNCHANGED_ABSENT;
307                     }
308                     else {
309                         gl_state = GainLossStates.UNCHANGED_PRESENT;
310                     }
311                 }
312                 else {
313                     final BinaryStates parent_bin_state = binary_states_matrix.getState( binary_states_matrix
314                             .getIdentifierIndex( parent_node.getName() ), c );
315                     if ( bin_state == BinaryStates.ABSENT ) {
316                         if ( parent_bin_state == BinaryStates.ABSENT ) {
317                             ++total_unchanged;
318                             gl_state = GainLossStates.UNCHANGED_ABSENT;
319                         }
320                         else {
321                             ++total_losses;
322                             gl_state = GainLossStates.LOSS;
323                         }
324                     }
325                     else {
326                         if ( parent_bin_state == BinaryStates.ABSENT ) {
327                             ++total_gains;
328                             gl_state = GainLossStates.GAIN;
329                         }
330                         else {
331                             ++total_unchanged;
332                             gl_state = GainLossStates.UNCHANGED_PRESENT;
333                         }
334                     }
335                 }
336                 gl_matrix.setState( name, c, gl_state );
337             }
338         }
339         setTotalGains( total_gains );
340         setTotalLosses( total_losses );
341         setTotalUnchanged( total_unchanged );
342         setCost( total_gains + total_losses );
343         setGainLossMatrix( gl_matrix );
344     }
345
346     public int getCost() {
347         return _cost;
348     }
349
350     private Map<DomainId, Set<String>> getDomainIdToSecondaryFeaturesMap() {
351         return _domain_id_to_secondary_features_map;
352     }
353
354     public CharacterStateMatrix<Integer> getGainLossCountsMatrix() {
355         final CharacterStateMatrix<Integer> matrix = new BasicCharacterStateMatrix<Integer>( getGainLossMatrix()
356                 .getNumberOfIdentifiers(), 3 );
357         for( int i = 0; i < getGainLossMatrix().getNumberOfIdentifiers(); ++i ) {
358             matrix.setIdentifier( i, getGainLossMatrix().getIdentifier( i ) );
359         }
360         matrix.setCharacter( 0, "GAINS" );
361         matrix.setCharacter( 1, "LOSSES" );
362         matrix.setCharacter( 2, "NET" );
363         for( int i = 0; i < getGainLossMatrix().getNumberOfIdentifiers(); ++i ) {
364             int gains = 0;
365             int losses = 0;
366             for( int c = 0; c < getGainLossMatrix().getNumberOfCharacters(); ++c ) {
367                 final GainLossStates s = getGainLossMatrix().getState( i, c );
368                 if ( s == GainLossStates.GAIN ) {
369                     ++gains;
370                 }
371                 else if ( s == GainLossStates.LOSS ) {
372                     ++losses;
373                 }
374             }
375             matrix.setState( i, 0, gains );
376             matrix.setState( i, 1, losses );
377             matrix.setState( i, 2, gains - losses );
378         }
379         return matrix;
380     }
381
382     public CharacterStateMatrix<GainLossStates> getGainLossMatrix() {
383         return _gain_loss_matrix;
384     }
385
386     private List<GenomeWideCombinableDomains> getGenomeWideCombinableDomainsList() {
387         return _gwcd_list;
388     }
389
390     public CharacterStateMatrix<BinaryStates> getInternalStatesMatrix() {
391         return _binary_internal_states_matrix;
392     }
393
394     public int getNetGainsOnNode( final String node_identifier ) {
395         if ( getGainLossMatrix() == null ) {
396             throw new RuntimeException( "no gain loss matrix has been calculated" );
397         }
398         int net = 0;
399         final int id_index = getGainLossMatrix().getIdentifierIndex( node_identifier );
400         for( int c = 0; c < getGainLossMatrix().getNumberOfCharacters(); ++c ) {
401             if ( getGainLossMatrix().getState( id_index, c ) == GainLossStates.GAIN ) {
402                 ++net;
403             }
404             else if ( getGainLossMatrix().getState( id_index, c ) == GainLossStates.LOSS ) {
405                 --net;
406             }
407         }
408         return net;
409     }
410
411     private Phylogeny getPhylogeny() {
412         return _phylogeny;
413     }
414
415     private SortedSet<DomainId> getPositiveFilter() {
416         return _positive_filter;
417     }
418
419     public int getSumOfGainsOnNode( final String node_identifier ) {
420         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN );
421     }
422
423     public int getSumOfLossesOnNode( final String node_identifier ) {
424         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.LOSS );
425     }
426
427     public int getSumOfPresentOnNode( final String node_identifier ) {
428         return getSumOfGainsOnNode( node_identifier ) + getSumOfUnchangedPresentOnNode( node_identifier );
429     }
430
431     int getSumOfUnchangedAbsentOnNode( final String node_identifier ) {
432         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT );
433     }
434
435     int getSumOfUnchangedOnNode( final String node_identifier ) {
436         return getSumOfUnchangedPresentOnNode( node_identifier ) + getSumOfUnchangedAbsentOnNode( node_identifier );
437     }
438
439     int getSumOfUnchangedPresentOnNode( final String node_identifier ) {
440         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT );
441     }
442
443     public int getTotalGains() {
444         return _total_gains;
445     }
446
447     public int getTotalLosses() {
448         return _total_losses;
449     }
450
451     public int getTotalUnchanged() {
452         return _total_unchanged;
453     }
454
455     public SortedSet<String> getUnitsGainedOnNode( final String node_identifier ) {
456         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN );
457     }
458
459     public SortedSet<String> getUnitsLostOnNode( final String node_identifier ) {
460         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.LOSS );
461     }
462
463     public SortedSet<String> getUnitsOnNode( final String node_identifier ) {
464         final SortedSet<String> present = getUnitsGainedOnNode( node_identifier );
465         present.addAll( getUnitsUnchangedPresentOnNode( node_identifier ) );
466         return present;
467     }
468
469     SortedSet<String> getUnitsUnchangedAbsentOnNode( final String node_identifier ) {
470         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT );
471     }
472
473     SortedSet<String> getUnitsUnchangedPresentOnNode( final String node_identifier ) {
474         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT );
475     }
476
477     private void init() {
478         setDomainIdToSecondaryFeaturesMap( null );
479         setPositiveFilter( null );
480         reset();
481     }
482
483     private void reset() {
484         setGainLossMatrix( null );
485         setBinaryInternalStatesMatrix( null );
486         setCost( -1 );
487         setTotalGains( -1 );
488         setTotalLosses( -1 );
489         setTotalUnchanged( -1 );
490     }
491
492     private void setBinaryInternalStatesMatrix( final CharacterStateMatrix<BinaryStates> binary_states_matrix ) {
493         _binary_internal_states_matrix = binary_states_matrix;
494     }
495
496     private void setCost( final int cost ) {
497         _cost = cost;
498     }
499
500     private void setDomainIdToSecondaryFeaturesMap( final Map<DomainId, Set<String>> domain_id_to_secondary_features_map ) {
501         _domain_id_to_secondary_features_map = domain_id_to_secondary_features_map;
502     }
503
504     private void setGainLossMatrix( final CharacterStateMatrix<GainLossStates> gain_loss_matrix ) {
505         _gain_loss_matrix = gain_loss_matrix;
506     }
507
508     private void setPositiveFilter( final SortedSet<DomainId> positive_filter ) {
509         _positive_filter = positive_filter;
510     }
511
512     private void setTotalGains( final int total_gains ) {
513         _total_gains = total_gains;
514     }
515
516     private void setTotalLosses( final int total_losses ) {
517         _total_losses = total_losses;
518     }
519
520     private void setTotalUnchanged( final int total_unchanged ) {
521         _total_unchanged = total_unchanged;
522     }
523
524     public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny ) {
525         return new DomainParsimonyCalculator( phylogeny );
526     }
527
528     public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny,
529                                                             final List<GenomeWideCombinableDomains> gwcd_list ) {
530         if ( phylogeny.getNumberOfExternalNodes() != gwcd_list.size() ) {
531             throw new IllegalArgumentException( "number of external nodes [" + phylogeny.getNumberOfExternalNodes()
532                     + "] does not equal size of genome wide combinable domains list [" + gwcd_list.size() + "]" );
533         }
534         return new DomainParsimonyCalculator( phylogeny, gwcd_list );
535     }
536
537     public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny,
538                                                             final List<GenomeWideCombinableDomains> gwcd_list,
539                                                             final Map<DomainId, Set<String>> domain_id_to_secondary_features_map ) {
540         if ( phylogeny.getNumberOfExternalNodes() != gwcd_list.size() ) {
541             throw new IllegalArgumentException( "size of external nodes does not equal size of genome wide combinable domains list" );
542         }
543         return new DomainParsimonyCalculator( phylogeny, gwcd_list, domain_id_to_secondary_features_map );
544     }
545
546     /**
547      * For folds instead of Pfam-domains, for example
548      * 
549      * 
550      * @param gwcd_list
551      * @return
552      */
553     static CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeaturePresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
554                                                                                                final Map<DomainId, Set<String>> domain_id_to_second_features_map,
555                                                                                                final Map<Species, MappingResults> mapping_results_map ) {
556         if ( gwcd_list.isEmpty() ) {
557             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
558         }
559         if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) {
560             throw new IllegalArgumentException( "domain id to secondary features map is null or empty" );
561         }
562         final int number_of_identifiers = gwcd_list.size();
563         final SortedSet<String> all_secondary_features = new TreeSet<String>();
564         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
565             int mapped = 0;
566             int not_mapped = 0;
567             for( final DomainId domain : gwcd.getAllDomainIds() ) {
568                 if ( domain_id_to_second_features_map.containsKey( domain ) ) {
569                     all_secondary_features.addAll( domain_id_to_second_features_map.get( domain ) );
570                     mapped++;
571                 }
572                 else {
573                     not_mapped++;
574                 }
575             }
576             if ( mapping_results_map != null ) {
577                 final MappingResults mr = new MappingResults();
578                 mr.setDescription( gwcd.getSpecies().getSpeciesId() );
579                 mr.setSumOfSuccesses( mapped );
580                 mr.setSumOfFailures( not_mapped );
581                 mapping_results_map.put( gwcd.getSpecies(), mr );
582             }
583         }
584         final int number_of_characters = all_secondary_features.size();
585         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
586                                                                                                                                                  number_of_characters );
587         int character_index = 0;
588         for( final String second_id : all_secondary_features ) {
589             matrix.setCharacter( character_index++, second_id );
590         }
591         int identifier_index = 0;
592         final Set<String> all_identifiers = new HashSet<String>();
593         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
594             final String species_id = gwcd.getSpecies().getSpeciesId();
595             if ( all_identifiers.contains( species_id ) ) {
596                 throw new IllegalArgumentException( "species [" + species_id + "] is not unique" );
597             }
598             all_identifiers.add( species_id );
599             matrix.setIdentifier( identifier_index, species_id );
600             final Set<String> all_second_per_gwcd = new HashSet<String>();
601             for( final DomainId domain : gwcd.getAllDomainIds() ) {
602                 if ( domain_id_to_second_features_map.containsKey( domain ) ) {
603                     all_second_per_gwcd.addAll( domain_id_to_second_features_map.get( domain ) );
604                 }
605             }
606             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
607                 if ( all_second_per_gwcd.contains( matrix.getCharacter( ci ) ) ) {
608                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
609                 }
610                 else {
611                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
612                 }
613             }
614             ++identifier_index;
615         }
616         return matrix;
617     }
618
619     public static CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
620                                                                                                                              final Map<DomainId, String> domain_id_to_second_features_map ) {
621         if ( gwcd_list.isEmpty() ) {
622             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
623         }
624         if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) {
625             throw new IllegalArgumentException( "domain id to secondary features map is null or empty" );
626         }
627         final int number_of_identifiers = gwcd_list.size();
628         final SortedSet<BinaryDomainCombination> all_binary_combinations_mapped = new TreeSet<BinaryDomainCombination>();
629         final Set<BinaryDomainCombination>[] binary_combinations_per_genome_mapped = new HashSet[ number_of_identifiers ];
630         int identifier_index = 0;
631         final SortedSet<String> no_mappings = new TreeSet<String>();
632         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
633             binary_combinations_per_genome_mapped[ identifier_index ] = new HashSet<BinaryDomainCombination>();
634             for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
635                 final BinaryDomainCombination mapped_bc = mapBinaryDomainCombination( domain_id_to_second_features_map,
636                                                                                       bc,
637                                                                                       no_mappings );
638                 all_binary_combinations_mapped.add( mapped_bc );
639                 binary_combinations_per_genome_mapped[ identifier_index ].add( mapped_bc );
640             }
641             ++identifier_index;
642         }
643         if ( !no_mappings.isEmpty() ) {
644             ForesterUtil.programMessage( surfacing.PRG_NAME, "No mappings for the following (" + no_mappings.size()
645                     + "):" );
646             for( final String id : no_mappings ) {
647                 ForesterUtil.programMessage( surfacing.PRG_NAME, id );
648             }
649         }
650         final int number_of_characters = all_binary_combinations_mapped.size();
651         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
652                                                                                                                                                  number_of_characters );
653         int character_index = 0;
654         for( final BinaryDomainCombination bc : all_binary_combinations_mapped ) {
655             matrix.setCharacter( character_index++, bc.toString() );
656         }
657         identifier_index = 0;
658         final Set<String> all_identifiers = new HashSet<String>();
659         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
660             final String species_id = gwcd.getSpecies().getSpeciesId();
661             if ( all_identifiers.contains( species_id ) ) {
662                 throw new AssertionError( "species [" + species_id + "] is not unique" );
663             }
664             all_identifiers.add( species_id );
665             matrix.setIdentifier( identifier_index, species_id );
666             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
667                 BinaryDomainCombination bc = null;
668                 if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) {
669                     bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
670                 }
671                 else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) {
672                     bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
673                 }
674                 else {
675                     bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
676                 }
677                 if ( binary_combinations_per_genome_mapped[ identifier_index ].contains( bc ) ) {
678                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
679                 }
680                 else {
681                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
682                 }
683             }
684             ++identifier_index;
685         }
686         return matrix;
687     }
688
689     private static BinaryDomainCombination mapBinaryDomainCombination( final Map<DomainId, String> domain_id_to_second_features_map,
690                                                                        final BinaryDomainCombination bc,
691                                                                        final SortedSet<String> no_mappings ) {
692         String id0 = "";
693         String id1 = "";
694         if ( !domain_id_to_second_features_map.containsKey( bc.getId0() ) ) {
695             no_mappings.add( bc.getId0().getId() );
696             id0 = bc.getId0().getId();
697         }
698         else {
699             id0 = domain_id_to_second_features_map.get( bc.getId0() );
700         }
701         if ( !domain_id_to_second_features_map.containsKey( bc.getId1() ) ) {
702             no_mappings.add( bc.getId1().getId() );
703             id1 = bc.getId1().getId();
704         }
705         else {
706             id1 = domain_id_to_second_features_map.get( bc.getId1() );
707         }
708         return new BasicBinaryDomainCombination( id0, id1 );
709     }
710
711     public static CharacterStateMatrix<BinaryStates> createMatrixOfBinaryDomainCombinationPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list ) {
712         if ( gwcd_list.isEmpty() ) {
713             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
714         }
715         final int number_of_identifiers = gwcd_list.size();
716         final SortedSet<BinaryDomainCombination> all_binary_combinations = new TreeSet<BinaryDomainCombination>();
717         final Set<BinaryDomainCombination>[] binary_combinations_per_genome = new HashSet[ number_of_identifiers ];
718         int identifier_index = 0;
719         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
720             binary_combinations_per_genome[ identifier_index ] = new HashSet<BinaryDomainCombination>();
721             for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
722                 all_binary_combinations.add( bc );
723                 binary_combinations_per_genome[ identifier_index ].add( bc );
724             }
725             ++identifier_index;
726         }
727         final int number_of_characters = all_binary_combinations.size();
728         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
729                                                                                                                                                  number_of_characters );
730         int character_index = 0;
731         for( final BinaryDomainCombination bc : all_binary_combinations ) {
732             matrix.setCharacter( character_index++, bc.toString() );
733         }
734         identifier_index = 0;
735         final Set<String> all_identifiers = new HashSet<String>();
736         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
737             final String species_id = gwcd.getSpecies().getSpeciesId();
738             if ( all_identifiers.contains( species_id ) ) {
739                 throw new AssertionError( "species [" + species_id + "] is not unique" );
740             }
741             all_identifiers.add( species_id );
742             matrix.setIdentifier( identifier_index, species_id );
743             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
744                 BinaryDomainCombination bc = null;
745                 if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) {
746                     bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
747                 }
748                 else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) {
749                     bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
750                 }
751                 else {
752                     bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
753                 }
754                 if ( binary_combinations_per_genome[ identifier_index ].contains( bc ) ) {
755                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
756                 }
757                 else {
758                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
759                 }
760             }
761             ++identifier_index;
762         }
763         return matrix;
764     }
765
766     static CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list ) {
767         return createMatrixOfDomainPresenceOrAbsence( gwcd_list, null );
768     }
769
770     public static CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
771                                                                                             final SortedSet<DomainId> positive_filter ) {
772         if ( gwcd_list.isEmpty() ) {
773             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
774         }
775         if ( ( positive_filter != null ) && ( positive_filter.size() < 1 ) ) {
776             throw new IllegalArgumentException( "positive filter is empty" );
777         }
778         final int number_of_identifiers = gwcd_list.size();
779         final SortedSet<DomainId> all_domain_ids = new TreeSet<DomainId>();
780         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
781             for( final DomainId domain : gwcd.getAllDomainIds() ) {
782                 all_domain_ids.add( domain );
783             }
784         }
785         int number_of_characters = all_domain_ids.size();
786         if ( positive_filter != null ) {
787             //number_of_characters = positive_filter.size(); -- bad if doms in filter but not in genomes 
788             number_of_characters = 0;
789             for( final DomainId id : all_domain_ids ) {
790                 if ( positive_filter.contains( id ) ) {
791                     number_of_characters++;
792                 }
793             }
794         }
795         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
796                                                                                                                                                  number_of_characters );
797         int character_index = 0;
798         for( final DomainId id : all_domain_ids ) {
799             if ( positive_filter == null ) {
800                 matrix.setCharacter( character_index++, id.getId() );
801             }
802             else {
803                 if ( positive_filter.contains( id ) ) {
804                     matrix.setCharacter( character_index++, id.getId() );
805                 }
806             }
807         }
808         int identifier_index = 0;
809         final Set<String> all_identifiers = new HashSet<String>();
810         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
811             final String species_id = gwcd.getSpecies().getSpeciesId();
812             if ( all_identifiers.contains( species_id ) ) {
813                 throw new IllegalArgumentException( "species [" + species_id + "] is not unique" );
814             }
815             all_identifiers.add( species_id );
816             matrix.setIdentifier( identifier_index, species_id );
817             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
818                 if ( ForesterUtil.isEmpty( matrix.getCharacter( ci ) ) ) {
819                     throw new RuntimeException( "this should not have happened: problem with character #" + ci );
820                 }
821                 final DomainId id = new DomainId( matrix.getCharacter( ci ) );
822                 if ( gwcd.contains( id ) ) {
823                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
824                 }
825                 else {
826                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
827                 }
828             }
829             ++identifier_index;
830         }
831         return matrix;
832     }
833
834     private static int getStateSumDeltaOnNode( final String node_identifier,
835                                                final CharacterStateMatrix<GainLossStates> gain_loss_matrix,
836                                                final GainLossStates state ) {
837         if ( gain_loss_matrix == null ) {
838             throw new RuntimeException( "no gain loss matrix has been calculated" );
839         }
840         if ( ForesterUtil.isEmpty( node_identifier ) ) {
841             throw new IllegalArgumentException( "node identifier must not be empty" );
842         }
843         if ( gain_loss_matrix.isEmpty() ) {
844             throw new RuntimeException( "gain loss matrix is empty" );
845         }
846         int sum = 0;
847         final int id_index = gain_loss_matrix.getIdentifierIndex( node_identifier );
848         for( int c = 0; c < gain_loss_matrix.getNumberOfCharacters(); ++c ) {
849             if ( gain_loss_matrix.getState( id_index, c ) == state ) {
850                 ++sum;
851             }
852         }
853         return sum;
854     }
855
856     private static SortedSet<String> getUnitsDeltaOnNode( final String node_identifier,
857                                                           final CharacterStateMatrix<GainLossStates> gain_loss_matrix,
858                                                           final GainLossStates state ) {
859         if ( gain_loss_matrix == null ) {
860             throw new RuntimeException( "no gain loss matrix has been calculated" );
861         }
862         if ( ForesterUtil.isEmpty( node_identifier ) ) {
863             throw new IllegalArgumentException( "node identifier must not be empty" );
864         }
865         if ( gain_loss_matrix.isEmpty() ) {
866             throw new RuntimeException( "gain loss matrix is empty" );
867         }
868         final SortedSet<String> d = new TreeSet<String>();
869         final int id_index = gain_loss_matrix.getIdentifierIndex( node_identifier );
870         for( int c = 0; c < gain_loss_matrix.getNumberOfCharacters(); ++c ) {
871             if ( gain_loss_matrix.getState( id_index, c ) == state ) {
872                 if ( d.contains( gain_loss_matrix.getCharacter( c ) ) ) {
873                     throw new AssertionError( "this should not have happended: character ["
874                             + gain_loss_matrix.getCharacter( c ) + "] already in set" );
875                 }
876                 d.add( gain_loss_matrix.getCharacter( c ) );
877             }
878         }
879         return d;
880     }
881 }