inprogress
[jalview.git] / forester / java / src / org / forester / surfacing / DomainParsimonyCalculator.java
1 // $Id:
2 //
3 // FORESTER -- software libraries and applications
4 // for evolutionary biology research and applications.
5 //
6 // Copyright (C) 2008-2009 Christian M. Zmasek
7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
8 // All rights reserved
9 //
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 //
24 // Contact: phylosoft @ gmail . com
25 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26
27 package org.forester.surfacing;
28
29 import java.util.HashMap;
30 import java.util.HashSet;
31 import java.util.Iterator;
32 import java.util.List;
33 import java.util.Map;
34 import java.util.Map.Entry;
35 import java.util.Set;
36 import java.util.SortedSet;
37 import java.util.TreeSet;
38
39 import org.forester.application.surfacing;
40 import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix;
41 import org.forester.evoinference.matrix.character.CharacterStateMatrix;
42 import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates;
43 import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates;
44 import org.forester.evoinference.parsimony.DolloParsimony;
45 import org.forester.evoinference.parsimony.FitchParsimony;
46 import org.forester.phylogeny.Phylogeny;
47 import org.forester.phylogeny.PhylogenyNode;
48 import org.forester.phylogeny.data.BinaryCharacters;
49 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
50 import org.forester.protein.BinaryDomainCombination;
51 import org.forester.protein.BinaryDomainCombination.DomainCombinationType;
52 import org.forester.species.Species;
53 import org.forester.util.ForesterUtil;
54
55 public final class DomainParsimonyCalculator {
56
57     private static final String                     TYPE_FORBINARY_CHARACTERS = "parsimony inferred";
58     private CharacterStateMatrix<GainLossStates>    _gain_loss_matrix;
59     private CharacterStateMatrix<BinaryStates>      _binary_internal_states_matrix;
60     private final List<GenomeWideCombinableDomains> _gwcd_list;
61     private final Phylogeny                         _phylogeny;
62     private int                                     _total_losses;
63     private int                                     _total_gains;
64     private int                                     _total_unchanged;
65     private int                                     _cost;
66     private Map<String, Set<String>>                _domain_id_to_secondary_features_map;
67     private SortedSet<String>                       _positive_filter;
68
69     private DomainParsimonyCalculator( final Phylogeny phylogeny ) {
70         init();
71         _phylogeny = phylogeny;
72         _gwcd_list = null;
73     }
74
75     private DomainParsimonyCalculator( final Phylogeny phylogeny, final List<GenomeWideCombinableDomains> gwcd_list ) {
76         init();
77         _phylogeny = phylogeny;
78         _gwcd_list = gwcd_list;
79     }
80
81     private DomainParsimonyCalculator( final Phylogeny phylogeny,
82                                        final List<GenomeWideCombinableDomains> gwcd_list,
83                                        final Map<String, Set<String>> domain_id_to_secondary_features_map ) {
84         init();
85         _phylogeny = phylogeny;
86         _gwcd_list = gwcd_list;
87         setDomainIdToSecondaryFeaturesMap( domain_id_to_secondary_features_map );
88     }
89
90     int calculateNumberOfBinaryDomainCombination() {
91         if ( getGenomeWideCombinableDomainsList().isEmpty() ) {
92             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
93         }
94         final Set<BinaryDomainCombination> all_binary_combinations = new HashSet<BinaryDomainCombination>();
95         for( final GenomeWideCombinableDomains gwcd : getGenomeWideCombinableDomainsList() ) {
96             for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
97                 all_binary_combinations.add( bc );
98             }
99         }
100         return all_binary_combinations.size();
101     }
102
103     CharacterStateMatrix<BinaryStates> createMatrixOfBinaryDomainCombinationPresenceOrAbsence() {
104         return createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
105     }
106
107     CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence() {
108         return createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList(), getPositiveFilter() );
109     }
110
111     CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeaturePresenceOrAbsence( final Map<Species, MappingResults> mapping_results_map ) {
112         return createMatrixOfSecondaryFeaturePresenceOrAbsence( getGenomeWideCombinableDomainsList(),
113                                                                 getDomainIdToSecondaryFeaturesMap(),
114                                                                 mapping_results_map );
115     }
116
117     Phylogeny decoratePhylogenyWithDomains( final Phylogeny phylogeny ) {
118         for( final PhylogenyNodeIterator it = phylogeny.iteratorPostorder(); it.hasNext(); ) {
119             final PhylogenyNode node = it.next();
120             final String node_identifier = node.getName();
121             final BinaryCharacters bc = new BinaryCharacters( getUnitsOnNode( node_identifier ),
122                                                               getUnitsGainedOnNode( node_identifier ),
123                                                               getUnitsLostOnNode( node_identifier ),
124                                                               TYPE_FORBINARY_CHARACTERS,
125                                                               getSumOfPresentOnNode( node_identifier ),
126                                                               getSumOfGainsOnNode( node_identifier ),
127                                                               getSumOfLossesOnNode( node_identifier ) );
128             node.getNodeData().setBinaryCharacters( bc );
129         }
130         return phylogeny;
131     }
132
133     private void executeDolloParsimony( final boolean on_domain_presence ) {
134         reset();
135         final DolloParsimony dollo = DolloParsimony.createInstance();
136         dollo.setReturnGainLossMatrix( true );
137         dollo.setReturnInternalStates( true );
138         CharacterStateMatrix<BinaryStates> states = null;
139         if ( on_domain_presence ) {
140             states = createMatrixOfDomainPresenceOrAbsence();
141         }
142         else {
143             states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence();
144         }
145         dollo.execute( getPhylogeny(), states );
146         setGainLossMatrix( dollo.getGainLossMatrix() );
147         setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() );
148         setCost( dollo.getCost() );
149         setTotalGains( dollo.getTotalGains() );
150         setTotalLosses( dollo.getTotalLosses() );
151         setTotalUnchanged( dollo.getTotalUnchanged() );
152     }
153
154     public void executeDolloParsimonyOnBinaryDomainCombintionPresence() {
155         executeDolloParsimony( false );
156     }
157
158     public void executeDolloParsimonyOnDomainPresence() {
159         executeDolloParsimony( true );
160     }
161
162     public void executeDolloParsimonyOnDomainPresence( final SortedSet<String> positive_filter ) {
163         setPositiveFilter( positive_filter );
164         executeDolloParsimony( true );
165         setPositiveFilter( null );
166     }
167
168     public void executeDolloParsimonyOnSecondaryFeatures( final Map<Species, MappingResults> mapping_results_map ) {
169         if ( getDomainIdToSecondaryFeaturesMap() == null ) {
170             throw new RuntimeException( "Domain id to secondary features map has apparently not been set" );
171         }
172         reset();
173         final DolloParsimony dollo = DolloParsimony.createInstance();
174         dollo.setReturnGainLossMatrix( true );
175         dollo.setReturnInternalStates( true );
176         final CharacterStateMatrix<BinaryStates> states = createMatrixOfSecondaryFeaturePresenceOrAbsence( mapping_results_map );
177         dollo.execute( getPhylogeny(), states );
178         setGainLossMatrix( dollo.getGainLossMatrix() );
179         setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() );
180         setCost( dollo.getCost() );
181         setTotalGains( dollo.getTotalGains() );
182         setTotalLosses( dollo.getTotalLosses() );
183         setTotalUnchanged( dollo.getTotalUnchanged() );
184     }
185
186     private void executeFitchParsimony( final boolean on_domain_presence,
187                                         final boolean use_last,
188                                         final boolean randomize,
189                                         final long random_number_seed ) {
190         reset();
191         if ( use_last ) {
192             System.out.println( "   Fitch parsimony: use_last = true" );
193         }
194         final FitchParsimony<BinaryStates> fitch = new FitchParsimony<BinaryStates>();
195         fitch.setRandomize( randomize );
196         if ( randomize ) {
197             fitch.setRandomNumberSeed( random_number_seed );
198         }
199         fitch.setUseLast( use_last );
200         fitch.setReturnGainLossMatrix( true );
201         fitch.setReturnInternalStates( true );
202         CharacterStateMatrix<BinaryStates> states = null;
203         if ( on_domain_presence ) {
204             states = createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
205         }
206         else {
207             states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
208         }
209         fitch.execute( getPhylogeny(), states );
210         setGainLossMatrix( fitch.getGainLossMatrix() );
211         setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() );
212         setCost( fitch.getCost() );
213         setTotalGains( fitch.getTotalGains() );
214         setTotalLosses( fitch.getTotalLosses() );
215         setTotalUnchanged( fitch.getTotalUnchanged() );
216     }
217
218     private void executeFitchParsimonyOnSecondaryFeatures( final boolean use_last,
219                                                            final boolean randomize,
220                                                            final long random_number_seed ) {
221         reset();
222         if ( use_last ) {
223             System.out.println( "   Fitch parsimony: use_last = true" );
224         }
225         final FitchParsimony<BinaryStates> fitch = new FitchParsimony<BinaryStates>();
226         fitch.setRandomize( randomize );
227         if ( randomize ) {
228             fitch.setRandomNumberSeed( random_number_seed );
229         }
230         fitch.setUseLast( use_last );
231         fitch.setReturnGainLossMatrix( true );
232         fitch.setReturnInternalStates( true );
233         final Map<String, Set<String>> map = getDomainIdToSecondaryFeaturesMap();
234         final Map<String, String> newmap = new HashMap<String, String>();
235         final Iterator<Entry<String, Set<String>>> it = map.entrySet().iterator();
236         while ( it.hasNext() ) {
237             final Map.Entry<String, Set<String>> pair = it.next();
238             if ( pair.getValue().size() != 1 ) {
239                 throw new IllegalArgumentException( pair.getKey() + " mapps to " + pair.getValue().size() + " items" );
240             }
241             newmap.put( pair.getKey(), ( String ) pair.getValue().toArray()[ 0 ] );
242         }
243         final CharacterStateMatrix<BinaryStates> states = createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList(),
244                                                                                                                                   newmap );
245         fitch.execute( getPhylogeny(), states );
246         setGainLossMatrix( fitch.getGainLossMatrix() );
247         setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() );
248         setCost( fitch.getCost() );
249         setTotalGains( fitch.getTotalGains() );
250         setTotalLosses( fitch.getTotalLosses() );
251         setTotalUnchanged( fitch.getTotalUnchanged() );
252     }
253
254     public void executeFitchParsimonyOnBinaryDomainCombintion( final boolean use_last ) {
255         executeFitchParsimony( false, use_last, false, 0 );
256     }
257
258     public void executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( final boolean use_last ) {
259         executeFitchParsimonyOnSecondaryFeatures( use_last, false, 0 );
260     }
261
262     public void executeFitchParsimonyOnBinaryDomainCombintion( final long random_number_seed ) {
263         executeFitchParsimony( false, false, true, random_number_seed );
264     }
265
266     public void executeFitchParsimonyOnDomainPresence( final boolean use_last ) {
267         executeFitchParsimony( true, use_last, false, 0 );
268     }
269
270     public void executeFitchParsimonyOnDomainPresence( final long random_number_seed ) {
271         executeFitchParsimony( true, false, true, random_number_seed );
272     }
273
274     public void executeOnGivenBinaryStatesMatrix( final CharacterStateMatrix<BinaryStates> binary_states_matrix,
275                                                   final String[] character_labels ) {
276         reset();
277         if ( binary_states_matrix.getNumberOfCharacters() != character_labels.length ) {
278             throw new IllegalArgumentException( "binary states matrix number of characters is not equal to the number of character labels provided" );
279         }
280         if ( binary_states_matrix.getNumberOfIdentifiers() != getPhylogeny().getNumberOfBranches() ) {
281             throw new IllegalArgumentException( "binary states matrix number of identifiers is not equal to the number of tree nodes provided" );
282         }
283         final CharacterStateMatrix<GainLossStates> gl_matrix = new BasicCharacterStateMatrix<GainLossStates>( binary_states_matrix
284                                                                                                                       .getNumberOfIdentifiers(),
285                                                                                                               binary_states_matrix
286                                                                                                                       .getNumberOfCharacters() );
287         int total_gains = 0;
288         int total_losses = 0;
289         int total_unchanged = 0;
290         int i = 0;
291         for( final PhylogenyNodeIterator it = getPhylogeny().iteratorPostorder(); it.hasNext(); ) {
292             gl_matrix.setIdentifier( i++, it.next().getName() );
293         }
294         for( int c = 0; c < character_labels.length; ++c ) {
295             gl_matrix.setCharacter( c, character_labels[ c ] );
296             final PhylogenyNodeIterator it = getPhylogeny().iteratorPostorder();
297             while ( it.hasNext() ) {
298                 final PhylogenyNode node = it.next();
299                 final String name = node.getName();
300                 final BinaryStates bin_state = binary_states_matrix.getState( binary_states_matrix
301                         .getIdentifierIndex( name ), c );
302                 final PhylogenyNode parent_node = getPhylogeny().getNode( name ).getParent();
303                 GainLossStates gl_state = null;
304                 if ( node.isRoot() ) {
305                     ++total_unchanged;
306                     if ( bin_state == BinaryStates.ABSENT ) {
307                         gl_state = GainLossStates.UNCHANGED_ABSENT;
308                     }
309                     else {
310                         gl_state = GainLossStates.UNCHANGED_PRESENT;
311                     }
312                 }
313                 else {
314                     final BinaryStates parent_bin_state = binary_states_matrix.getState( binary_states_matrix
315                             .getIdentifierIndex( parent_node.getName() ), c );
316                     if ( bin_state == BinaryStates.ABSENT ) {
317                         if ( parent_bin_state == BinaryStates.ABSENT ) {
318                             ++total_unchanged;
319                             gl_state = GainLossStates.UNCHANGED_ABSENT;
320                         }
321                         else {
322                             ++total_losses;
323                             gl_state = GainLossStates.LOSS;
324                         }
325                     }
326                     else {
327                         if ( parent_bin_state == BinaryStates.ABSENT ) {
328                             ++total_gains;
329                             gl_state = GainLossStates.GAIN;
330                         }
331                         else {
332                             ++total_unchanged;
333                             gl_state = GainLossStates.UNCHANGED_PRESENT;
334                         }
335                     }
336                 }
337                 gl_matrix.setState( name, c, gl_state );
338             }
339         }
340         setTotalGains( total_gains );
341         setTotalLosses( total_losses );
342         setTotalUnchanged( total_unchanged );
343         setCost( total_gains + total_losses );
344         setGainLossMatrix( gl_matrix );
345     }
346
347     public int getCost() {
348         return _cost;
349     }
350
351     private Map<String, Set<String>> getDomainIdToSecondaryFeaturesMap() {
352         return _domain_id_to_secondary_features_map;
353     }
354
355     public CharacterStateMatrix<Integer> getGainLossCountsMatrix() {
356         final CharacterStateMatrix<Integer> matrix = new BasicCharacterStateMatrix<Integer>( getGainLossMatrix()
357                 .getNumberOfIdentifiers(), 3 );
358         for( int i = 0; i < getGainLossMatrix().getNumberOfIdentifiers(); ++i ) {
359             matrix.setIdentifier( i, getGainLossMatrix().getIdentifier( i ) );
360         }
361         matrix.setCharacter( 0, "GAINS" );
362         matrix.setCharacter( 1, "LOSSES" );
363         matrix.setCharacter( 2, "NET" );
364         for( int i = 0; i < getGainLossMatrix().getNumberOfIdentifiers(); ++i ) {
365             int gains = 0;
366             int losses = 0;
367             for( int c = 0; c < getGainLossMatrix().getNumberOfCharacters(); ++c ) {
368                 final GainLossStates s = getGainLossMatrix().getState( i, c );
369                 if ( s == GainLossStates.GAIN ) {
370                     ++gains;
371                 }
372                 else if ( s == GainLossStates.LOSS ) {
373                     ++losses;
374                 }
375             }
376             matrix.setState( i, 0, gains );
377             matrix.setState( i, 1, losses );
378             matrix.setState( i, 2, gains - losses );
379         }
380         return matrix;
381     }
382
383     public CharacterStateMatrix<GainLossStates> getGainLossMatrix() {
384         return _gain_loss_matrix;
385     }
386
387     private List<GenomeWideCombinableDomains> getGenomeWideCombinableDomainsList() {
388         return _gwcd_list;
389     }
390
391     public CharacterStateMatrix<BinaryStates> getInternalStatesMatrix() {
392         return _binary_internal_states_matrix;
393     }
394
395     public int getNetGainsOnNode( final String node_identifier ) {
396         if ( getGainLossMatrix() == null ) {
397             throw new RuntimeException( "no gain loss matrix has been calculated" );
398         }
399         int net = 0;
400         final int id_index = getGainLossMatrix().getIdentifierIndex( node_identifier );
401         for( int c = 0; c < getGainLossMatrix().getNumberOfCharacters(); ++c ) {
402             if ( getGainLossMatrix().getState( id_index, c ) == GainLossStates.GAIN ) {
403                 ++net;
404             }
405             else if ( getGainLossMatrix().getState( id_index, c ) == GainLossStates.LOSS ) {
406                 --net;
407             }
408         }
409         return net;
410     }
411
412     private Phylogeny getPhylogeny() {
413         return _phylogeny;
414     }
415
416     private SortedSet<String> getPositiveFilter() {
417         return _positive_filter;
418     }
419
420     public int getSumOfGainsOnNode( final String node_identifier ) {
421         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN );
422     }
423
424     public int getSumOfLossesOnNode( final String node_identifier ) {
425         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.LOSS );
426     }
427
428     public int getSumOfPresentOnNode( final String node_identifier ) {
429         return getSumOfGainsOnNode( node_identifier ) + getSumOfUnchangedPresentOnNode( node_identifier );
430     }
431
432     int getSumOfUnchangedAbsentOnNode( final String node_identifier ) {
433         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT );
434     }
435
436     int getSumOfUnchangedOnNode( final String node_identifier ) {
437         return getSumOfUnchangedPresentOnNode( node_identifier ) + getSumOfUnchangedAbsentOnNode( node_identifier );
438     }
439
440     int getSumOfUnchangedPresentOnNode( final String node_identifier ) {
441         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT );
442     }
443
444     public int getTotalGains() {
445         return _total_gains;
446     }
447
448     public int getTotalLosses() {
449         return _total_losses;
450     }
451
452     public int getTotalUnchanged() {
453         return _total_unchanged;
454     }
455
456     public SortedSet<String> getUnitsGainedOnNode( final String node_identifier ) {
457         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN );
458     }
459
460     public SortedSet<String> getUnitsLostOnNode( final String node_identifier ) {
461         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.LOSS );
462     }
463
464     public SortedSet<String> getUnitsOnNode( final String node_identifier ) {
465         final SortedSet<String> present = getUnitsGainedOnNode( node_identifier );
466         present.addAll( getUnitsUnchangedPresentOnNode( node_identifier ) );
467         return present;
468     }
469
470     SortedSet<String> getUnitsUnchangedAbsentOnNode( final String node_identifier ) {
471         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT );
472     }
473
474     SortedSet<String> getUnitsUnchangedPresentOnNode( final String node_identifier ) {
475         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT );
476     }
477
478     private void init() {
479         setDomainIdToSecondaryFeaturesMap( null );
480         setPositiveFilter( null );
481         reset();
482     }
483
484     private void reset() {
485         setGainLossMatrix( null );
486         setBinaryInternalStatesMatrix( null );
487         setCost( -1 );
488         setTotalGains( -1 );
489         setTotalLosses( -1 );
490         setTotalUnchanged( -1 );
491     }
492
493     private void setBinaryInternalStatesMatrix( final CharacterStateMatrix<BinaryStates> binary_states_matrix ) {
494         _binary_internal_states_matrix = binary_states_matrix;
495     }
496
497     private void setCost( final int cost ) {
498         _cost = cost;
499     }
500
501     private void setDomainIdToSecondaryFeaturesMap( final Map<String, Set<String>> domain_id_to_secondary_features_map ) {
502         _domain_id_to_secondary_features_map = domain_id_to_secondary_features_map;
503     }
504
505     private void setGainLossMatrix( final CharacterStateMatrix<GainLossStates> gain_loss_matrix ) {
506         _gain_loss_matrix = gain_loss_matrix;
507     }
508
509     private void setPositiveFilter( final SortedSet<String> positive_filter ) {
510         _positive_filter = positive_filter;
511     }
512
513     private void setTotalGains( final int total_gains ) {
514         _total_gains = total_gains;
515     }
516
517     private void setTotalLosses( final int total_losses ) {
518         _total_losses = total_losses;
519     }
520
521     private void setTotalUnchanged( final int total_unchanged ) {
522         _total_unchanged = total_unchanged;
523     }
524
525     public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny ) {
526         return new DomainParsimonyCalculator( phylogeny );
527     }
528
529     public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny,
530                                                             final List<GenomeWideCombinableDomains> gwcd_list ) {
531         if ( phylogeny.getNumberOfExternalNodes() != gwcd_list.size() ) {
532             throw new IllegalArgumentException( "number of external nodes [" + phylogeny.getNumberOfExternalNodes()
533                     + "] does not equal size of genome wide combinable domains list [" + gwcd_list.size() + "]" );
534         }
535         return new DomainParsimonyCalculator( phylogeny, gwcd_list );
536     }
537
538     public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny,
539                                                             final List<GenomeWideCombinableDomains> gwcd_list,
540                                                             final Map<String, Set<String>> domain_id_to_secondary_features_map ) {
541         if ( phylogeny.getNumberOfExternalNodes() != gwcd_list.size() ) {
542             throw new IllegalArgumentException( "size of external nodes does not equal size of genome wide combinable domains list" );
543         }
544         return new DomainParsimonyCalculator( phylogeny, gwcd_list, domain_id_to_secondary_features_map );
545     }
546
547     /**
548      * For folds instead of Pfam-domains, for example
549      * 
550      * 
551      * @param gwcd_list
552      * @return
553      */
554     static CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeaturePresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
555                                                                                                final Map<String, Set<String>> domain_id_to_second_features_map,
556                                                                                                final Map<Species, MappingResults> mapping_results_map ) {
557         if ( gwcd_list.isEmpty() ) {
558             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
559         }
560         if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) {
561             throw new IllegalArgumentException( "domain id to secondary features map is null or empty" );
562         }
563         final int number_of_identifiers = gwcd_list.size();
564         final SortedSet<String> all_secondary_features = new TreeSet<String>();
565         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
566             int mapped = 0;
567             int not_mapped = 0;
568             for( final String domain : gwcd.getAllDomainIds() ) {
569                 if ( domain_id_to_second_features_map.containsKey( domain ) ) {
570                     all_secondary_features.addAll( domain_id_to_second_features_map.get( domain ) );
571                     mapped++;
572                 }
573                 else {
574                     not_mapped++;
575                 }
576             }
577             if ( mapping_results_map != null ) {
578                 final MappingResults mr = new MappingResults();
579                 mr.setDescription( gwcd.getSpecies().getSpeciesId() );
580                 mr.setSumOfSuccesses( mapped );
581                 mr.setSumOfFailures( not_mapped );
582                 mapping_results_map.put( gwcd.getSpecies(), mr );
583             }
584         }
585         final int number_of_characters = all_secondary_features.size();
586         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
587                                                                                                                                                  number_of_characters );
588         int character_index = 0;
589         for( final String second_id : all_secondary_features ) {
590             matrix.setCharacter( character_index++, second_id );
591         }
592         int identifier_index = 0;
593         final Set<String> all_identifiers = new HashSet<String>();
594         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
595             final String species_id = gwcd.getSpecies().getSpeciesId();
596             if ( all_identifiers.contains( species_id ) ) {
597                 throw new IllegalArgumentException( "species [" + species_id + "] is not unique" );
598             }
599             all_identifiers.add( species_id );
600             matrix.setIdentifier( identifier_index, species_id );
601             final Set<String> all_second_per_gwcd = new HashSet<String>();
602             for( final String domain : gwcd.getAllDomainIds() ) {
603                 if ( domain_id_to_second_features_map.containsKey( domain ) ) {
604                     all_second_per_gwcd.addAll( domain_id_to_second_features_map.get( domain ) );
605                 }
606             }
607             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
608                 if ( all_second_per_gwcd.contains( matrix.getCharacter( ci ) ) ) {
609                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
610                 }
611                 else {
612                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
613                 }
614             }
615             ++identifier_index;
616         }
617         return matrix;
618     }
619
620     public static CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
621                                                                                                                              final Map<String, String> domain_id_to_second_features_map ) {
622         if ( gwcd_list.isEmpty() ) {
623             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
624         }
625         if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) {
626             throw new IllegalArgumentException( "domain id to secondary features map is null or empty" );
627         }
628         final int number_of_identifiers = gwcd_list.size();
629         final SortedSet<BinaryDomainCombination> all_binary_combinations_mapped = new TreeSet<BinaryDomainCombination>();
630         final Set<BinaryDomainCombination>[] binary_combinations_per_genome_mapped = new HashSet[ number_of_identifiers ];
631         int identifier_index = 0;
632         final SortedSet<String> no_mappings = new TreeSet<String>();
633         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
634             binary_combinations_per_genome_mapped[ identifier_index ] = new HashSet<BinaryDomainCombination>();
635             for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
636                 final BinaryDomainCombination mapped_bc = mapBinaryDomainCombination( domain_id_to_second_features_map,
637                                                                                       bc,
638                                                                                       no_mappings );
639                 all_binary_combinations_mapped.add( mapped_bc );
640                 binary_combinations_per_genome_mapped[ identifier_index ].add( mapped_bc );
641             }
642             ++identifier_index;
643         }
644         if ( !no_mappings.isEmpty() ) {
645             ForesterUtil.programMessage( surfacing.PRG_NAME, "No mappings for the following (" + no_mappings.size()
646                     + "):" );
647             for( final String id : no_mappings ) {
648                 ForesterUtil.programMessage( surfacing.PRG_NAME, id );
649             }
650         }
651         final int number_of_characters = all_binary_combinations_mapped.size();
652         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
653                                                                                                                                                  number_of_characters );
654         int character_index = 0;
655         for( final BinaryDomainCombination bc : all_binary_combinations_mapped ) {
656             matrix.setCharacter( character_index++, bc.toString() );
657         }
658         identifier_index = 0;
659         final Set<String> all_identifiers = new HashSet<String>();
660         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
661             final String species_id = gwcd.getSpecies().getSpeciesId();
662             if ( all_identifiers.contains( species_id ) ) {
663                 throw new AssertionError( "species [" + species_id + "] is not unique" );
664             }
665             all_identifiers.add( species_id );
666             matrix.setIdentifier( identifier_index, species_id );
667             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
668                 BinaryDomainCombination bc = null;
669                 if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) {
670                     bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
671                 }
672                 else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) {
673                     bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
674                 }
675                 else {
676                     bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
677                 }
678                 if ( binary_combinations_per_genome_mapped[ identifier_index ].contains( bc ) ) {
679                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
680                 }
681                 else {
682                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
683                 }
684             }
685             ++identifier_index;
686         }
687         return matrix;
688     }
689
690     private static BinaryDomainCombination mapBinaryDomainCombination( final Map<String, String> domain_id_to_second_features_map,
691                                                                        final BinaryDomainCombination bc,
692                                                                        final SortedSet<String> no_mappings ) {
693         String id0 = "";
694         String id1 = "";
695         if ( !domain_id_to_second_features_map.containsKey( bc.getId0() ) ) {
696             no_mappings.add( bc.getId0() );
697             id0 = bc.getId0();
698         }
699         else {
700             id0 = domain_id_to_second_features_map.get( bc.getId0() );
701         }
702         if ( !domain_id_to_second_features_map.containsKey( bc.getId1() ) ) {
703             no_mappings.add( bc.getId1() );
704             id1 = bc.getId1();
705         }
706         else {
707             id1 = domain_id_to_second_features_map.get( bc.getId1() );
708         }
709         return new BasicBinaryDomainCombination( id0, id1 );
710     }
711
712     public static CharacterStateMatrix<BinaryStates> createMatrixOfBinaryDomainCombinationPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list ) {
713         if ( gwcd_list.isEmpty() ) {
714             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
715         }
716         final int number_of_identifiers = gwcd_list.size();
717         final SortedSet<BinaryDomainCombination> all_binary_combinations = new TreeSet<BinaryDomainCombination>();
718         final Set<BinaryDomainCombination>[] binary_combinations_per_genome = new HashSet[ number_of_identifiers ];
719         int identifier_index = 0;
720         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
721             binary_combinations_per_genome[ identifier_index ] = new HashSet<BinaryDomainCombination>();
722             for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
723                 all_binary_combinations.add( bc );
724                 binary_combinations_per_genome[ identifier_index ].add( bc );
725             }
726             ++identifier_index;
727         }
728         final int number_of_characters = all_binary_combinations.size();
729         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
730                                                                                                                                                  number_of_characters );
731         int character_index = 0;
732         for( final BinaryDomainCombination bc : all_binary_combinations ) {
733             matrix.setCharacter( character_index++, bc.toString() );
734         }
735         identifier_index = 0;
736         final Set<String> all_identifiers = new HashSet<String>();
737         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
738             final String species_id = gwcd.getSpecies().getSpeciesId();
739             if ( all_identifiers.contains( species_id ) ) {
740                 throw new AssertionError( "species [" + species_id + "] is not unique" );
741             }
742             all_identifiers.add( species_id );
743             matrix.setIdentifier( identifier_index, species_id );
744             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
745                 BinaryDomainCombination bc = null;
746                 if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) {
747                     bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
748                 }
749                 else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) {
750                     bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
751                 }
752                 else {
753                     bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
754                 }
755                 if ( binary_combinations_per_genome[ identifier_index ].contains( bc ) ) {
756                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
757                 }
758                 else {
759                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
760                 }
761             }
762             ++identifier_index;
763         }
764         return matrix;
765     }
766
767     static CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list ) {
768         return createMatrixOfDomainPresenceOrAbsence( gwcd_list, null );
769     }
770
771     public static CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
772                                                                                             final SortedSet<String> positive_filter ) {
773         if ( gwcd_list.isEmpty() ) {
774             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
775         }
776         if ( ( positive_filter != null ) && ( positive_filter.size() < 1 ) ) {
777             throw new IllegalArgumentException( "positive filter is empty" );
778         }
779         final int number_of_identifiers = gwcd_list.size();
780         final SortedSet<String> all_domain_ids = new TreeSet<String>();
781         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
782             for( final String domain : gwcd.getAllDomainIds() ) {
783                 all_domain_ids.add( domain );
784             }
785         }
786         int number_of_characters = all_domain_ids.size();
787         if ( positive_filter != null ) {
788             //number_of_characters = positive_filter.size(); -- bad if doms in filter but not in genomes 
789             number_of_characters = 0;
790             for( final String id : all_domain_ids ) {
791                 if ( positive_filter.contains( id ) ) {
792                     number_of_characters++;
793                 }
794             }
795         }
796         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
797                                                                                                                                                  number_of_characters );
798         int character_index = 0;
799         for( final String id : all_domain_ids ) {
800             if ( positive_filter == null ) {
801                 matrix.setCharacter( character_index++, id );
802             }
803             else {
804                 if ( positive_filter.contains( id ) ) {
805                     matrix.setCharacter( character_index++, id );
806                 }
807             }
808         }
809         int identifier_index = 0;
810         final Set<String> all_identifiers = new HashSet<String>();
811         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
812             final String species_id = gwcd.getSpecies().getSpeciesId();
813             if ( all_identifiers.contains( species_id ) ) {
814                 throw new IllegalArgumentException( "species [" + species_id + "] is not unique" );
815             }
816             all_identifiers.add( species_id );
817             matrix.setIdentifier( identifier_index, species_id );
818             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
819                 if ( ForesterUtil.isEmpty( matrix.getCharacter( ci ) ) ) {
820                     throw new RuntimeException( "this should not have happened: problem with character #" + ci );
821                 }
822                 final String id = matrix.getCharacter( ci );
823                 if ( gwcd.contains( id ) ) {
824                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
825                 }
826                 else {
827                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
828                 }
829             }
830             ++identifier_index;
831         }
832         return matrix;
833     }
834
835     private static int getStateSumDeltaOnNode( final String node_identifier,
836                                                final CharacterStateMatrix<GainLossStates> gain_loss_matrix,
837                                                final GainLossStates state ) {
838         if ( gain_loss_matrix == null ) {
839             throw new RuntimeException( "no gain loss matrix has been calculated" );
840         }
841         if ( ForesterUtil.isEmpty( node_identifier ) ) {
842             throw new IllegalArgumentException( "node identifier must not be empty" );
843         }
844         if ( gain_loss_matrix.isEmpty() ) {
845             throw new RuntimeException( "gain loss matrix is empty" );
846         }
847         int sum = 0;
848         final int id_index = gain_loss_matrix.getIdentifierIndex( node_identifier );
849         for( int c = 0; c < gain_loss_matrix.getNumberOfCharacters(); ++c ) {
850             if ( gain_loss_matrix.getState( id_index, c ) == state ) {
851                 ++sum;
852             }
853         }
854         return sum;
855     }
856
857     private static SortedSet<String> getUnitsDeltaOnNode( final String node_identifier,
858                                                           final CharacterStateMatrix<GainLossStates> gain_loss_matrix,
859                                                           final GainLossStates state ) {
860         if ( gain_loss_matrix == null ) {
861             throw new RuntimeException( "no gain loss matrix has been calculated" );
862         }
863         if ( ForesterUtil.isEmpty( node_identifier ) ) {
864             throw new IllegalArgumentException( "node identifier must not be empty" );
865         }
866         if ( gain_loss_matrix.isEmpty() ) {
867             throw new RuntimeException( "gain loss matrix is empty" );
868         }
869         final SortedSet<String> d = new TreeSet<String>();
870         final int id_index = gain_loss_matrix.getIdentifierIndex( node_identifier );
871         for( int c = 0; c < gain_loss_matrix.getNumberOfCharacters(); ++c ) {
872             if ( gain_loss_matrix.getState( id_index, c ) == state ) {
873                 if ( d.contains( gain_loss_matrix.getCharacter( c ) ) ) {
874                     throw new AssertionError( "this should not have happended: character ["
875                             + gain_loss_matrix.getCharacter( c ) + "] already in set" );
876                 }
877                 d.add( gain_loss_matrix.getCharacter( c ) );
878             }
879         }
880         return d;
881     }
882 }