in progress
[jalview.git] / forester / java / src / org / forester / surfacing / DomainParsimonyCalculator.java
1 // $Id:
2 //
3 // FORESTER -- software libraries and applications
4 // for evolutionary biology research and applications.
5 //
6 // Copyright (C) 2008-2009 Christian M. Zmasek
7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
8 // All rights reserved
9 //
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 //
24 // Contact: phylosoft @ gmail . com
25 // WWW: www.phylosoft.org/forester
26
27 package org.forester.surfacing;
28
29 import java.util.HashMap;
30 import java.util.HashSet;
31 import java.util.Iterator;
32 import java.util.List;
33 import java.util.Map;
34 import java.util.Map.Entry;
35 import java.util.Set;
36 import java.util.SortedSet;
37 import java.util.TreeSet;
38
39 import org.forester.application.surfacing;
40 import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix;
41 import org.forester.evoinference.matrix.character.CharacterStateMatrix;
42 import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates;
43 import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates;
44 import org.forester.evoinference.parsimony.DolloParsimony;
45 import org.forester.evoinference.parsimony.FitchParsimony;
46 import org.forester.phylogeny.Phylogeny;
47 import org.forester.phylogeny.PhylogenyNode;
48 import org.forester.phylogeny.data.BinaryCharacters;
49 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
50 import org.forester.protein.BinaryDomainCombination;
51 import org.forester.protein.BinaryDomainCombination.DomainCombinationType;
52 import org.forester.protein.DomainId;
53 import org.forester.species.Species;
54 import org.forester.util.ForesterUtil;
55
56 public final class DomainParsimonyCalculator {
57
58     private static final String                     TYPE_FORBINARY_CHARACTERS = "parsimony inferred";
59     private CharacterStateMatrix<GainLossStates>    _gain_loss_matrix;
60     private CharacterStateMatrix<BinaryStates>      _binary_internal_states_matrix;
61     private final List<GenomeWideCombinableDomains> _gwcd_list;
62     private final Phylogeny                         _phylogeny;
63     private int                                     _total_losses;
64     private int                                     _total_gains;
65     private int                                     _total_unchanged;
66     private int                                     _cost;
67     private Map<DomainId, Set<String>>              _domain_id_to_secondary_features_map;
68     private SortedSet<DomainId>                     _positive_filter;
69
70     private DomainParsimonyCalculator( final Phylogeny phylogeny ) {
71         init();
72         _phylogeny = phylogeny;
73         _gwcd_list = null;
74     }
75
76     private DomainParsimonyCalculator( final Phylogeny phylogeny, final List<GenomeWideCombinableDomains> gwcd_list ) {
77         init();
78         _phylogeny = phylogeny;
79         _gwcd_list = gwcd_list;
80     }
81
82     private DomainParsimonyCalculator( final Phylogeny phylogeny,
83                                        final List<GenomeWideCombinableDomains> gwcd_list,
84                                        final Map<DomainId, Set<String>> domain_id_to_secondary_features_map ) {
85         init();
86         _phylogeny = phylogeny;
87         _gwcd_list = gwcd_list;
88         setDomainIdToSecondaryFeaturesMap( domain_id_to_secondary_features_map );
89     }
90
91     int calculateNumberOfBinaryDomainCombination() {
92         if ( getGenomeWideCombinableDomainsList().isEmpty() ) {
93             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
94         }
95         final Set<BinaryDomainCombination> all_binary_combinations = new HashSet<BinaryDomainCombination>();
96         for( final GenomeWideCombinableDomains gwcd : getGenomeWideCombinableDomainsList() ) {
97             for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
98                 all_binary_combinations.add( bc );
99             }
100         }
101         return all_binary_combinations.size();
102     }
103
104     CharacterStateMatrix<BinaryStates> createMatrixOfBinaryDomainCombinationPresenceOrAbsence() {
105         return createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
106     }
107
108     CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence() {
109         return createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList(), getPositiveFilter() );
110     }
111
112     CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeaturePresenceOrAbsence( final Map<Species, MappingResults> mapping_results_map ) {
113         return createMatrixOfSecondaryFeaturePresenceOrAbsence( getGenomeWideCombinableDomainsList(),
114                                                                 getDomainIdToSecondaryFeaturesMap(),
115                                                                 mapping_results_map );
116     }
117
118     Phylogeny decoratePhylogenyWithDomains( final Phylogeny phylogeny ) {
119         for( final PhylogenyNodeIterator it = phylogeny.iteratorPostorder(); it.hasNext(); ) {
120             final PhylogenyNode node = it.next();
121             final String node_identifier = node.getName();
122             final BinaryCharacters bc = new BinaryCharacters( getUnitsOnNode( node_identifier ),
123                                                               getUnitsGainedOnNode( node_identifier ),
124                                                               getUnitsLostOnNode( node_identifier ),
125                                                               TYPE_FORBINARY_CHARACTERS,
126                                                               getSumOfPresentOnNode( node_identifier ),
127                                                               getSumOfGainsOnNode( node_identifier ),
128                                                               getSumOfLossesOnNode( node_identifier ) );
129             node.getNodeData().setBinaryCharacters( bc );
130         }
131         return phylogeny;
132     }
133
134     private void executeDolloParsimony( final boolean on_domain_presence ) {
135         reset();
136         final DolloParsimony dollo = DolloParsimony.createInstance();
137         dollo.setReturnGainLossMatrix( true );
138         dollo.setReturnInternalStates( true );
139         CharacterStateMatrix<BinaryStates> states = null;
140         if ( on_domain_presence ) {
141             states = createMatrixOfDomainPresenceOrAbsence();
142         }
143         else {
144             states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence();
145         }
146         dollo.execute( getPhylogeny(), states );
147         setGainLossMatrix( dollo.getGainLossMatrix() );
148         setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() );
149         setCost( dollo.getCost() );
150         setTotalGains( dollo.getTotalGains() );
151         setTotalLosses( dollo.getTotalLosses() );
152         setTotalUnchanged( dollo.getTotalUnchanged() );
153     }
154
155     public void executeDolloParsimonyOnBinaryDomainCombintionPresence() {
156         executeDolloParsimony( false );
157     }
158
159     public void executeDolloParsimonyOnDomainPresence() {
160         executeDolloParsimony( true );
161     }
162
163     public void executeDolloParsimonyOnDomainPresence( final SortedSet<DomainId> positive_filter ) {
164         setPositiveFilter( positive_filter );
165         executeDolloParsimony( true );
166         setPositiveFilter( null );
167     }
168
169     public void executeDolloParsimonyOnSecondaryFeatures( final Map<Species, MappingResults> mapping_results_map ) {
170         if ( getDomainIdToSecondaryFeaturesMap() == null ) {
171             throw new RuntimeException( "Domain id to secondary features map has apparently not been set" );
172         }
173         reset();
174         final DolloParsimony dollo = DolloParsimony.createInstance();
175         dollo.setReturnGainLossMatrix( true );
176         dollo.setReturnInternalStates( true );
177         final CharacterStateMatrix<BinaryStates> states = createMatrixOfSecondaryFeaturePresenceOrAbsence( mapping_results_map );
178         dollo.execute( getPhylogeny(), states );
179         setGainLossMatrix( dollo.getGainLossMatrix() );
180         setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() );
181         setCost( dollo.getCost() );
182         setTotalGains( dollo.getTotalGains() );
183         setTotalLosses( dollo.getTotalLosses() );
184         setTotalUnchanged( dollo.getTotalUnchanged() );
185     }
186
187     private void executeFitchParsimony( final boolean on_domain_presence,
188                                         final boolean use_last,
189                                         final boolean randomize,
190                                         final long random_number_seed ) {
191         reset();
192         if ( use_last ) {
193             System.out.println( "   Fitch parsimony: use_last = true" );
194         }
195         final FitchParsimony<BinaryStates> fitch = new FitchParsimony<BinaryStates>();
196         fitch.setRandomize( randomize );
197         if ( randomize ) {
198             fitch.setRandomNumberSeed( random_number_seed );
199         }
200         fitch.setUseLast( use_last );
201         fitch.setReturnGainLossMatrix( true );
202         fitch.setReturnInternalStates( true );
203         CharacterStateMatrix<BinaryStates> states = null;
204         if ( on_domain_presence ) {
205             states = createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
206         }
207         else {
208             states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
209         }
210         fitch.execute( getPhylogeny(), states );
211         setGainLossMatrix( fitch.getGainLossMatrix() );
212         setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() );
213         setCost( fitch.getCost() );
214         setTotalGains( fitch.getTotalGains() );
215         setTotalLosses( fitch.getTotalLosses() );
216         setTotalUnchanged( fitch.getTotalUnchanged() );
217     }
218
219     private void executeFitchParsimonyOnSecondaryFeatures( final boolean use_last,
220                                                            final boolean randomize,
221                                                            final long random_number_seed ) {
222         reset();
223         if ( use_last ) {
224             System.out.println( "   Fitch parsimony: use_last = true" );
225         }
226         final FitchParsimony<BinaryStates> fitch = new FitchParsimony<BinaryStates>();
227         fitch.setRandomize( randomize );
228         if ( randomize ) {
229             fitch.setRandomNumberSeed( random_number_seed );
230         }
231         fitch.setUseLast( use_last );
232         fitch.setReturnGainLossMatrix( true );
233         fitch.setReturnInternalStates( true );
234         final Map<DomainId, Set<String>> map = getDomainIdToSecondaryFeaturesMap();
235         final Map<DomainId, String> newmap = new HashMap<DomainId, String>();
236         final Iterator<Entry<DomainId, Set<String>>> it = map.entrySet().iterator();
237         while ( it.hasNext() ) {
238             final Map.Entry<DomainId, Set<String>> pair = it.next();
239             if ( pair.getValue().size() != 1 ) {
240                 throw new IllegalArgumentException( pair.getKey().getId() + " mapps to " + pair.getValue().size()
241                         + " items" );
242             }
243             newmap.put( pair.getKey(), ( String ) pair.getValue().toArray()[ 0 ] );
244         }
245         final CharacterStateMatrix<BinaryStates> states = createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList(),
246                                                                                                                                   newmap );
247         fitch.execute( getPhylogeny(), states );
248         setGainLossMatrix( fitch.getGainLossMatrix() );
249         setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() );
250         setCost( fitch.getCost() );
251         setTotalGains( fitch.getTotalGains() );
252         setTotalLosses( fitch.getTotalLosses() );
253         setTotalUnchanged( fitch.getTotalUnchanged() );
254     }
255
256     public void executeFitchParsimonyOnBinaryDomainCombintion( final boolean use_last ) {
257         executeFitchParsimony( false, use_last, false, 0 );
258     }
259
260     public void executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( final boolean use_last ) {
261         executeFitchParsimonyOnSecondaryFeatures( use_last, false, 0 );
262     }
263
264     public void executeFitchParsimonyOnBinaryDomainCombintion( final long random_number_seed ) {
265         executeFitchParsimony( false, false, true, random_number_seed );
266     }
267
268     public void executeFitchParsimonyOnDomainPresence( final boolean use_last ) {
269         executeFitchParsimony( true, use_last, false, 0 );
270     }
271
272     public void executeFitchParsimonyOnDomainPresence( final long random_number_seed ) {
273         executeFitchParsimony( true, false, true, random_number_seed );
274     }
275
276     public void executeOnGivenBinaryStatesMatrix( final CharacterStateMatrix<BinaryStates> binary_states_matrix,
277                                                   final String[] character_labels ) {
278         reset();
279         if ( binary_states_matrix.getNumberOfCharacters() != character_labels.length ) {
280             throw new IllegalArgumentException( "binary states matrix number of characters is not equal to the number of character labels provided" );
281         }
282         if ( binary_states_matrix.getNumberOfIdentifiers() != getPhylogeny().getNumberOfBranches() ) {
283             throw new IllegalArgumentException( "binary states matrix number of identifiers is not equal to the number of tree nodes provided" );
284         }
285         final CharacterStateMatrix<GainLossStates> gl_matrix = new BasicCharacterStateMatrix<GainLossStates>( binary_states_matrix
286                                                                                                                       .getNumberOfIdentifiers(),
287                                                                                                               binary_states_matrix
288                                                                                                                       .getNumberOfCharacters() );
289         int total_gains = 0;
290         int total_losses = 0;
291         int total_unchanged = 0;
292         int i = 0;
293         for( final PhylogenyNodeIterator it = getPhylogeny().iteratorPostorder(); it.hasNext(); ) {
294             gl_matrix.setIdentifier( i++, it.next().getName() );
295         }
296         for( int c = 0; c < character_labels.length; ++c ) {
297             gl_matrix.setCharacter( c, character_labels[ c ] );
298             final PhylogenyNodeIterator it = getPhylogeny().iteratorPostorder();
299             while ( it.hasNext() ) {
300                 final PhylogenyNode node = it.next();
301                 final String name = node.getName();
302                 final BinaryStates bin_state = binary_states_matrix.getState( binary_states_matrix
303                         .getIdentifierIndex( name ), c );
304                 final PhylogenyNode parent_node = getPhylogeny().getNode( name ).getParent();
305                 GainLossStates gl_state = null;
306                 if ( node.isRoot() ) {
307                     ++total_unchanged;
308                     if ( bin_state == BinaryStates.ABSENT ) {
309                         gl_state = GainLossStates.UNCHANGED_ABSENT;
310                     }
311                     else {
312                         gl_state = GainLossStates.UNCHANGED_PRESENT;
313                     }
314                 }
315                 else {
316                     final BinaryStates parent_bin_state = binary_states_matrix.getState( binary_states_matrix
317                             .getIdentifierIndex( parent_node.getName() ), c );
318                     if ( bin_state == BinaryStates.ABSENT ) {
319                         if ( parent_bin_state == BinaryStates.ABSENT ) {
320                             ++total_unchanged;
321                             gl_state = GainLossStates.UNCHANGED_ABSENT;
322                         }
323                         else {
324                             ++total_losses;
325                             gl_state = GainLossStates.LOSS;
326                         }
327                     }
328                     else {
329                         if ( parent_bin_state == BinaryStates.ABSENT ) {
330                             ++total_gains;
331                             gl_state = GainLossStates.GAIN;
332                         }
333                         else {
334                             ++total_unchanged;
335                             gl_state = GainLossStates.UNCHANGED_PRESENT;
336                         }
337                     }
338                 }
339                 gl_matrix.setState( name, c, gl_state );
340             }
341         }
342         setTotalGains( total_gains );
343         setTotalLosses( total_losses );
344         setTotalUnchanged( total_unchanged );
345         setCost( total_gains + total_losses );
346         setGainLossMatrix( gl_matrix );
347     }
348
349     public int getCost() {
350         return _cost;
351     }
352
353     private Map<DomainId, Set<String>> getDomainIdToSecondaryFeaturesMap() {
354         return _domain_id_to_secondary_features_map;
355     }
356
357     public CharacterStateMatrix<Integer> getGainLossCountsMatrix() {
358         final CharacterStateMatrix<Integer> matrix = new BasicCharacterStateMatrix<Integer>( getGainLossMatrix()
359                 .getNumberOfIdentifiers(), 3 );
360         for( int i = 0; i < getGainLossMatrix().getNumberOfIdentifiers(); ++i ) {
361             matrix.setIdentifier( i, getGainLossMatrix().getIdentifier( i ) );
362         }
363         matrix.setCharacter( 0, "GAINS" );
364         matrix.setCharacter( 1, "LOSSES" );
365         matrix.setCharacter( 2, "NET" );
366         for( int i = 0; i < getGainLossMatrix().getNumberOfIdentifiers(); ++i ) {
367             int gains = 0;
368             int losses = 0;
369             for( int c = 0; c < getGainLossMatrix().getNumberOfCharacters(); ++c ) {
370                 final GainLossStates s = getGainLossMatrix().getState( i, c );
371                 if ( s == GainLossStates.GAIN ) {
372                     ++gains;
373                 }
374                 else if ( s == GainLossStates.LOSS ) {
375                     ++losses;
376                 }
377             }
378             matrix.setState( i, 0, gains );
379             matrix.setState( i, 1, losses );
380             matrix.setState( i, 2, gains - losses );
381         }
382         return matrix;
383     }
384
385     public CharacterStateMatrix<GainLossStates> getGainLossMatrix() {
386         return _gain_loss_matrix;
387     }
388
389     private List<GenomeWideCombinableDomains> getGenomeWideCombinableDomainsList() {
390         return _gwcd_list;
391     }
392
393     public CharacterStateMatrix<BinaryStates> getInternalStatesMatrix() {
394         return _binary_internal_states_matrix;
395     }
396
397     public int getNetGainsOnNode( final String node_identifier ) {
398         if ( getGainLossMatrix() == null ) {
399             throw new RuntimeException( "no gain loss matrix has been calculated" );
400         }
401         int net = 0;
402         final int id_index = getGainLossMatrix().getIdentifierIndex( node_identifier );
403         for( int c = 0; c < getGainLossMatrix().getNumberOfCharacters(); ++c ) {
404             if ( getGainLossMatrix().getState( id_index, c ) == GainLossStates.GAIN ) {
405                 ++net;
406             }
407             else if ( getGainLossMatrix().getState( id_index, c ) == GainLossStates.LOSS ) {
408                 --net;
409             }
410         }
411         return net;
412     }
413
414     private Phylogeny getPhylogeny() {
415         return _phylogeny;
416     }
417
418     private SortedSet<DomainId> getPositiveFilter() {
419         return _positive_filter;
420     }
421
422     public int getSumOfGainsOnNode( final String node_identifier ) {
423         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN );
424     }
425
426     public int getSumOfLossesOnNode( final String node_identifier ) {
427         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.LOSS );
428     }
429
430     public int getSumOfPresentOnNode( final String node_identifier ) {
431         return getSumOfGainsOnNode( node_identifier ) + getSumOfUnchangedPresentOnNode( node_identifier );
432     }
433
434     int getSumOfUnchangedAbsentOnNode( final String node_identifier ) {
435         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT );
436     }
437
438     int getSumOfUnchangedOnNode( final String node_identifier ) {
439         return getSumOfUnchangedPresentOnNode( node_identifier ) + getSumOfUnchangedAbsentOnNode( node_identifier );
440     }
441
442     int getSumOfUnchangedPresentOnNode( final String node_identifier ) {
443         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT );
444     }
445
446     public int getTotalGains() {
447         return _total_gains;
448     }
449
450     public int getTotalLosses() {
451         return _total_losses;
452     }
453
454     public int getTotalUnchanged() {
455         return _total_unchanged;
456     }
457
458     public SortedSet<String> getUnitsGainedOnNode( final String node_identifier ) {
459         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN );
460     }
461
462     public SortedSet<String> getUnitsLostOnNode( final String node_identifier ) {
463         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.LOSS );
464     }
465
466     public SortedSet<String> getUnitsOnNode( final String node_identifier ) {
467         final SortedSet<String> present = getUnitsGainedOnNode( node_identifier );
468         present.addAll( getUnitsUnchangedPresentOnNode( node_identifier ) );
469         return present;
470     }
471
472     SortedSet<String> getUnitsUnchangedAbsentOnNode( final String node_identifier ) {
473         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT );
474     }
475
476     SortedSet<String> getUnitsUnchangedPresentOnNode( final String node_identifier ) {
477         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT );
478     }
479
480     private void init() {
481         setDomainIdToSecondaryFeaturesMap( null );
482         setPositiveFilter( null );
483         reset();
484     }
485
486     private void reset() {
487         setGainLossMatrix( null );
488         setBinaryInternalStatesMatrix( null );
489         setCost( -1 );
490         setTotalGains( -1 );
491         setTotalLosses( -1 );
492         setTotalUnchanged( -1 );
493     }
494
495     private void setBinaryInternalStatesMatrix( final CharacterStateMatrix<BinaryStates> binary_states_matrix ) {
496         _binary_internal_states_matrix = binary_states_matrix;
497     }
498
499     private void setCost( final int cost ) {
500         _cost = cost;
501     }
502
503     private void setDomainIdToSecondaryFeaturesMap( final Map<DomainId, Set<String>> domain_id_to_secondary_features_map ) {
504         _domain_id_to_secondary_features_map = domain_id_to_secondary_features_map;
505     }
506
507     private void setGainLossMatrix( final CharacterStateMatrix<GainLossStates> gain_loss_matrix ) {
508         _gain_loss_matrix = gain_loss_matrix;
509     }
510
511     private void setPositiveFilter( final SortedSet<DomainId> positive_filter ) {
512         _positive_filter = positive_filter;
513     }
514
515     private void setTotalGains( final int total_gains ) {
516         _total_gains = total_gains;
517     }
518
519     private void setTotalLosses( final int total_losses ) {
520         _total_losses = total_losses;
521     }
522
523     private void setTotalUnchanged( final int total_unchanged ) {
524         _total_unchanged = total_unchanged;
525     }
526
527     public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny ) {
528         return new DomainParsimonyCalculator( phylogeny );
529     }
530
531     public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny,
532                                                             final List<GenomeWideCombinableDomains> gwcd_list ) {
533         if ( phylogeny.getNumberOfExternalNodes() != gwcd_list.size() ) {
534             throw new IllegalArgumentException( "number of external nodes [" + phylogeny.getNumberOfExternalNodes()
535                     + "] does not equal size of genome wide combinable domains list [" + gwcd_list.size() + "]" );
536         }
537         return new DomainParsimonyCalculator( phylogeny, gwcd_list );
538     }
539
540     public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny,
541                                                             final List<GenomeWideCombinableDomains> gwcd_list,
542                                                             final Map<DomainId, Set<String>> domain_id_to_secondary_features_map ) {
543         if ( phylogeny.getNumberOfExternalNodes() != gwcd_list.size() ) {
544             throw new IllegalArgumentException( "size of external nodes does not equal size of genome wide combinable domains list" );
545         }
546         return new DomainParsimonyCalculator( phylogeny, gwcd_list, domain_id_to_secondary_features_map );
547     }
548
549     /**
550      * For folds instead of Pfam-domains, for example
551      * 
552      * 
553      * @param gwcd_list
554      * @return
555      */
556     static CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeaturePresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
557                                                                                                final Map<DomainId, Set<String>> domain_id_to_second_features_map,
558                                                                                                final Map<Species, MappingResults> mapping_results_map ) {
559         if ( gwcd_list.isEmpty() ) {
560             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
561         }
562         if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) {
563             throw new IllegalArgumentException( "domain id to secondary features map is null or empty" );
564         }
565         final int number_of_identifiers = gwcd_list.size();
566         final SortedSet<String> all_secondary_features = new TreeSet<String>();
567         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
568             int mapped = 0;
569             int not_mapped = 0;
570             for( final DomainId domain : gwcd.getAllDomainIds() ) {
571                 if ( domain_id_to_second_features_map.containsKey( domain ) ) {
572                     all_secondary_features.addAll( domain_id_to_second_features_map.get( domain ) );
573                     mapped++;
574                 }
575                 else {
576                     not_mapped++;
577                 }
578             }
579             if ( mapping_results_map != null ) {
580                 final MappingResults mr = new MappingResults();
581                 mr.setDescription( gwcd.getSpecies().getSpeciesId() );
582                 mr.setSumOfSuccesses( mapped );
583                 mr.setSumOfFailures( not_mapped );
584                 mapping_results_map.put( gwcd.getSpecies(), mr );
585             }
586         }
587         final int number_of_characters = all_secondary_features.size();
588         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
589                                                                                                                                                  number_of_characters );
590         int character_index = 0;
591         for( final String second_id : all_secondary_features ) {
592             matrix.setCharacter( character_index++, second_id );
593         }
594         int identifier_index = 0;
595         final Set<String> all_identifiers = new HashSet<String>();
596         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
597             final String species_id = gwcd.getSpecies().getSpeciesId();
598             if ( all_identifiers.contains( species_id ) ) {
599                 throw new IllegalArgumentException( "species [" + species_id + "] is not unique" );
600             }
601             all_identifiers.add( species_id );
602             matrix.setIdentifier( identifier_index, species_id );
603             final Set<String> all_second_per_gwcd = new HashSet<String>();
604             for( final DomainId domain : gwcd.getAllDomainIds() ) {
605                 if ( domain_id_to_second_features_map.containsKey( domain ) ) {
606                     all_second_per_gwcd.addAll( domain_id_to_second_features_map.get( domain ) );
607                 }
608             }
609             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
610                 if ( all_second_per_gwcd.contains( matrix.getCharacter( ci ) ) ) {
611                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
612                 }
613                 else {
614                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
615                 }
616             }
617             ++identifier_index;
618         }
619         return matrix;
620     }
621
622     public static CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
623                                                                                                                              final Map<DomainId, String> domain_id_to_second_features_map ) {
624         if ( gwcd_list.isEmpty() ) {
625             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
626         }
627         if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) {
628             throw new IllegalArgumentException( "domain id to secondary features map is null or empty" );
629         }
630         final int number_of_identifiers = gwcd_list.size();
631         final SortedSet<BinaryDomainCombination> all_binary_combinations_mapped = new TreeSet<BinaryDomainCombination>();
632         final Set<BinaryDomainCombination>[] binary_combinations_per_genome_mapped = new HashSet[ number_of_identifiers ];
633         int identifier_index = 0;
634         final SortedSet<String> no_mappings = new TreeSet<String>();
635         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
636             binary_combinations_per_genome_mapped[ identifier_index ] = new HashSet<BinaryDomainCombination>();
637             for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
638                 final BinaryDomainCombination mapped_bc = mapBinaryDomainCombination( domain_id_to_second_features_map,
639                                                                                       bc,
640                                                                                       no_mappings );
641                 all_binary_combinations_mapped.add( mapped_bc );
642                 binary_combinations_per_genome_mapped[ identifier_index ].add( mapped_bc );
643             }
644             ++identifier_index;
645         }
646         if ( !no_mappings.isEmpty() ) {
647             ForesterUtil.programMessage( surfacing.PRG_NAME, "No mappings for the following (" + no_mappings.size()
648                     + "):" );
649             for( final String id : no_mappings ) {
650                 ForesterUtil.programMessage( surfacing.PRG_NAME, id );
651             }
652         }
653         final int number_of_characters = all_binary_combinations_mapped.size();
654         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
655                                                                                                                                                  number_of_characters );
656         int character_index = 0;
657         for( final BinaryDomainCombination bc : all_binary_combinations_mapped ) {
658             matrix.setCharacter( character_index++, bc.toString() );
659         }
660         identifier_index = 0;
661         final Set<String> all_identifiers = new HashSet<String>();
662         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
663             final String species_id = gwcd.getSpecies().getSpeciesId();
664             if ( all_identifiers.contains( species_id ) ) {
665                 throw new AssertionError( "species [" + species_id + "] is not unique" );
666             }
667             all_identifiers.add( species_id );
668             matrix.setIdentifier( identifier_index, species_id );
669             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
670                 BinaryDomainCombination bc = null;
671                 if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) {
672                     bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
673                 }
674                 else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) {
675                     bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
676                 }
677                 else {
678                     bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
679                 }
680                 if ( binary_combinations_per_genome_mapped[ identifier_index ].contains( bc ) ) {
681                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
682                 }
683                 else {
684                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
685                 }
686             }
687             ++identifier_index;
688         }
689         return matrix;
690     }
691
692     private static BinaryDomainCombination mapBinaryDomainCombination( final Map<DomainId, String> domain_id_to_second_features_map,
693                                                                        final BinaryDomainCombination bc,
694                                                                        final SortedSet<String> no_mappings ) {
695         String id0 = "";
696         String id1 = "";
697         if ( !domain_id_to_second_features_map.containsKey( bc.getId0() ) ) {
698             no_mappings.add( bc.getId0().getId() );
699             id0 = bc.getId0().getId();
700         }
701         else {
702             id0 = domain_id_to_second_features_map.get( bc.getId0() );
703         }
704         if ( !domain_id_to_second_features_map.containsKey( bc.getId1() ) ) {
705             no_mappings.add( bc.getId1().getId() );
706             id1 = bc.getId1().getId();
707         }
708         else {
709             id1 = domain_id_to_second_features_map.get( bc.getId1() );
710         }
711         return new BasicBinaryDomainCombination( id0, id1 );
712     }
713
714     public static CharacterStateMatrix<BinaryStates> createMatrixOfBinaryDomainCombinationPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list ) {
715         if ( gwcd_list.isEmpty() ) {
716             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
717         }
718         final int number_of_identifiers = gwcd_list.size();
719         final SortedSet<BinaryDomainCombination> all_binary_combinations = new TreeSet<BinaryDomainCombination>();
720         final Set<BinaryDomainCombination>[] binary_combinations_per_genome = new HashSet[ number_of_identifiers ];
721         int identifier_index = 0;
722         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
723             binary_combinations_per_genome[ identifier_index ] = new HashSet<BinaryDomainCombination>();
724             for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
725                 all_binary_combinations.add( bc );
726                 binary_combinations_per_genome[ identifier_index ].add( bc );
727             }
728             ++identifier_index;
729         }
730         final int number_of_characters = all_binary_combinations.size();
731         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
732                                                                                                                                                  number_of_characters );
733         int character_index = 0;
734         for( final BinaryDomainCombination bc : all_binary_combinations ) {
735             matrix.setCharacter( character_index++, bc.toString() );
736         }
737         identifier_index = 0;
738         final Set<String> all_identifiers = new HashSet<String>();
739         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
740             final String species_id = gwcd.getSpecies().getSpeciesId();
741             if ( all_identifiers.contains( species_id ) ) {
742                 throw new AssertionError( "species [" + species_id + "] is not unique" );
743             }
744             all_identifiers.add( species_id );
745             matrix.setIdentifier( identifier_index, species_id );
746             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
747                 BinaryDomainCombination bc = null;
748                 if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) {
749                     bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
750                 }
751                 else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) {
752                     bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
753                 }
754                 else {
755                     bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
756                 }
757                 if ( binary_combinations_per_genome[ identifier_index ].contains( bc ) ) {
758                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
759                 }
760                 else {
761                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
762                 }
763             }
764             ++identifier_index;
765         }
766         return matrix;
767     }
768
769     static CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list ) {
770         return createMatrixOfDomainPresenceOrAbsence( gwcd_list, null );
771     }
772
773     public static CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
774                                                                                             final SortedSet<DomainId> positive_filter ) {
775         if ( gwcd_list.isEmpty() ) {
776             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
777         }
778         if ( ( positive_filter != null ) && ( positive_filter.size() < 1 ) ) {
779             throw new IllegalArgumentException( "positive filter is empty" );
780         }
781         final int number_of_identifiers = gwcd_list.size();
782         final SortedSet<DomainId> all_domain_ids = new TreeSet<DomainId>();
783         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
784             for( final DomainId domain : gwcd.getAllDomainIds() ) {
785                 all_domain_ids.add( domain );
786             }
787         }
788         int number_of_characters = all_domain_ids.size();
789         if ( positive_filter != null ) {
790             //number_of_characters = positive_filter.size(); -- bad if doms in filter but not in genomes 
791             number_of_characters = 0;
792             for( final DomainId id : all_domain_ids ) {
793                 if ( positive_filter.contains( id ) ) {
794                     number_of_characters++;
795                 }
796             }
797         }
798         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
799                                                                                                                                                  number_of_characters );
800         int character_index = 0;
801         for( final DomainId id : all_domain_ids ) {
802             if ( positive_filter == null ) {
803                 matrix.setCharacter( character_index++, id.getId() );
804             }
805             else {
806                 if ( positive_filter.contains( id ) ) {
807                     matrix.setCharacter( character_index++, id.getId() );
808                 }
809             }
810         }
811         int identifier_index = 0;
812         final Set<String> all_identifiers = new HashSet<String>();
813         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
814             final String species_id = gwcd.getSpecies().getSpeciesId();
815             if ( all_identifiers.contains( species_id ) ) {
816                 throw new IllegalArgumentException( "species [" + species_id + "] is not unique" );
817             }
818             all_identifiers.add( species_id );
819             matrix.setIdentifier( identifier_index, species_id );
820             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
821                 if ( ForesterUtil.isEmpty( matrix.getCharacter( ci ) ) ) {
822                     throw new RuntimeException( "this should not have happened: problem with character #" + ci );
823                 }
824                 final DomainId id = new DomainId( matrix.getCharacter( ci ) );
825                 if ( gwcd.contains( id ) ) {
826                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
827                 }
828                 else {
829                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
830                 }
831             }
832             ++identifier_index;
833         }
834         return matrix;
835     }
836
837     private static int getStateSumDeltaOnNode( final String node_identifier,
838                                                final CharacterStateMatrix<GainLossStates> gain_loss_matrix,
839                                                final GainLossStates state ) {
840         if ( gain_loss_matrix == null ) {
841             throw new RuntimeException( "no gain loss matrix has been calculated" );
842         }
843         if ( ForesterUtil.isEmpty( node_identifier ) ) {
844             throw new IllegalArgumentException( "node identifier must not be empty" );
845         }
846         if ( gain_loss_matrix.isEmpty() ) {
847             throw new RuntimeException( "gain loss matrix is empty" );
848         }
849         int sum = 0;
850         final int id_index = gain_loss_matrix.getIdentifierIndex( node_identifier );
851         for( int c = 0; c < gain_loss_matrix.getNumberOfCharacters(); ++c ) {
852             if ( gain_loss_matrix.getState( id_index, c ) == state ) {
853                 ++sum;
854             }
855         }
856         return sum;
857     }
858
859     private static SortedSet<String> getUnitsDeltaOnNode( final String node_identifier,
860                                                           final CharacterStateMatrix<GainLossStates> gain_loss_matrix,
861                                                           final GainLossStates state ) {
862         if ( gain_loss_matrix == null ) {
863             throw new RuntimeException( "no gain loss matrix has been calculated" );
864         }
865         if ( ForesterUtil.isEmpty( node_identifier ) ) {
866             throw new IllegalArgumentException( "node identifier must not be empty" );
867         }
868         if ( gain_loss_matrix.isEmpty() ) {
869             throw new RuntimeException( "gain loss matrix is empty" );
870         }
871         final SortedSet<String> d = new TreeSet<String>();
872         final int id_index = gain_loss_matrix.getIdentifierIndex( node_identifier );
873         for( int c = 0; c < gain_loss_matrix.getNumberOfCharacters(); ++c ) {
874             if ( gain_loss_matrix.getState( id_index, c ) == state ) {
875                 if ( d.contains( gain_loss_matrix.getCharacter( c ) ) ) {
876                     throw new AssertionError( "this should not have happended: character ["
877                             + gain_loss_matrix.getCharacter( c ) + "] already in set" );
878                 }
879                 d.add( gain_loss_matrix.getCharacter( c ) );
880             }
881         }
882         return d;
883     }
884 }