4658371a9d0097ffddcb081f532bde07840424d9
[jalview.git] / forester / java / src / org / forester / surfacing / DomainParsimonyCalculator.java
1 // $Id:
2 //
3 // FORESTER -- software libraries and applications
4 // for evolutionary biology research and applications.
5 //
6 // Copyright (C) 2008-2009 Christian M. Zmasek
7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
8 // All rights reserved
9 //
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 //
24 // Contact: phylosoft @ gmail . com
25 // WWW: www.phylosoft.org/forester
26
27 package org.forester.surfacing;
28
29 import java.util.HashMap;
30 import java.util.HashSet;
31 import java.util.Iterator;
32 import java.util.List;
33 import java.util.Map;
34 import java.util.Map.Entry;
35 import java.util.Set;
36 import java.util.SortedSet;
37 import java.util.TreeSet;
38
39 import org.forester.application.surfacing;
40 import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix;
41 import org.forester.evoinference.matrix.character.CharacterStateMatrix;
42 import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates;
43 import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates;
44 import org.forester.evoinference.parsimony.DolloParsimony;
45 import org.forester.evoinference.parsimony.FitchParsimony;
46 import org.forester.phylogeny.Phylogeny;
47 import org.forester.phylogeny.PhylogenyNode;
48 import org.forester.phylogeny.data.BinaryCharacters;
49 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
50 import org.forester.surfacing.BinaryDomainCombination.DomainCombinationType;
51 import org.forester.util.ForesterUtil;
52
53 public final class DomainParsimonyCalculator {
54
55     private static final String                     TYPE_FORBINARY_CHARACTERS = "parsimony inferred";
56     private CharacterStateMatrix<GainLossStates>    _gain_loss_matrix;
57     private CharacterStateMatrix<BinaryStates>      _binary_internal_states_matrix;
58     private final List<GenomeWideCombinableDomains> _gwcd_list;
59     private final Phylogeny                         _phylogeny;
60     private int                                     _total_losses;
61     private int                                     _total_gains;
62     private int                                     _total_unchanged;
63     private int                                     _cost;
64     private Map<DomainId, Set<String>>              _domain_id_to_secondary_features_map;
65     private SortedSet<DomainId>                     _positive_filter;
66
67     private DomainParsimonyCalculator( final Phylogeny phylogeny ) {
68         init();
69         _phylogeny = phylogeny;
70         _gwcd_list = null;
71     }
72
73     private DomainParsimonyCalculator( final Phylogeny phylogeny, final List<GenomeWideCombinableDomains> gwcd_list ) {
74         init();
75         _phylogeny = phylogeny;
76         _gwcd_list = gwcd_list;
77     }
78
79     private DomainParsimonyCalculator( final Phylogeny phylogeny,
80                                        final List<GenomeWideCombinableDomains> gwcd_list,
81                                        final Map<DomainId, Set<String>> domain_id_to_secondary_features_map ) {
82         init();
83         _phylogeny = phylogeny;
84         _gwcd_list = gwcd_list;
85         setDomainIdToSecondaryFeaturesMap( domain_id_to_secondary_features_map );
86     }
87
88     int calculateNumberOfBinaryDomainCombination() {
89         if ( getGenomeWideCombinableDomainsList().isEmpty() ) {
90             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
91         }
92         final Set<BinaryDomainCombination> all_binary_combinations = new HashSet<BinaryDomainCombination>();
93         for( final GenomeWideCombinableDomains gwcd : getGenomeWideCombinableDomainsList() ) {
94             for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
95                 all_binary_combinations.add( bc );
96             }
97         }
98         return all_binary_combinations.size();
99     }
100
101     CharacterStateMatrix<BinaryStates> createMatrixOfBinaryDomainCombinationPresenceOrAbsence() {
102         return createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
103     }
104
105     CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence() {
106         return createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList(), getPositiveFilter() );
107     }
108
109     CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeaturePresenceOrAbsence( final Map<Species, MappingResults> mapping_results_map ) {
110         return createMatrixOfSecondaryFeaturePresenceOrAbsence( getGenomeWideCombinableDomainsList(),
111                                                                 getDomainIdToSecondaryFeaturesMap(),
112                                                                 mapping_results_map );
113     }
114
115     Phylogeny decoratePhylogenyWithDomains( final Phylogeny phylogeny ) {
116         for( final PhylogenyNodeIterator it = phylogeny.iteratorPostorder(); it.hasNext(); ) {
117             final PhylogenyNode node = it.next();
118             final String node_identifier = node.getName();
119             final BinaryCharacters bc = new BinaryCharacters( getUnitsOnNode( node_identifier ),
120                                                               getUnitsGainedOnNode( node_identifier ),
121                                                               getUnitsLostOnNode( node_identifier ),
122                                                               TYPE_FORBINARY_CHARACTERS,
123                                                               getSumOfPresentOnNode( node_identifier ),
124                                                               getSumOfGainsOnNode( node_identifier ),
125                                                               getSumOfLossesOnNode( node_identifier ) );
126             node.getNodeData().setBinaryCharacters( bc );
127         }
128         return phylogeny;
129     }
130
131     private void executeDolloParsimony( final boolean on_domain_presence ) {
132         reset();
133         final DolloParsimony dollo = DolloParsimony.createInstance();
134         dollo.setReturnGainLossMatrix( true );
135         dollo.setReturnInternalStates( true );
136         CharacterStateMatrix<BinaryStates> states = null;
137         if ( on_domain_presence ) {
138             states = createMatrixOfDomainPresenceOrAbsence();
139         }
140         else {
141             states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence();
142         }
143         dollo.execute( getPhylogeny(), states );
144         setGainLossMatrix( dollo.getGainLossMatrix() );
145         setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() );
146         setCost( dollo.getCost() );
147         setTotalGains( dollo.getTotalGains() );
148         setTotalLosses( dollo.getTotalLosses() );
149         setTotalUnchanged( dollo.getTotalUnchanged() );
150     }
151
152     public void executeDolloParsimonyOnBinaryDomainCombintionPresence() {
153         executeDolloParsimony( false );
154     }
155
156     public void executeDolloParsimonyOnDomainPresence() {
157         executeDolloParsimony( true );
158     }
159
160     public void executeDolloParsimonyOnDomainPresence( final SortedSet<DomainId> positive_filter ) {
161         setPositiveFilter( positive_filter );
162         executeDolloParsimony( true );
163         setPositiveFilter( null );
164     }
165
166     public void executeDolloParsimonyOnSecondaryFeatures( final Map<Species, MappingResults> mapping_results_map ) {
167         if ( getDomainIdToSecondaryFeaturesMap() == null ) {
168             throw new RuntimeException( "Domain id to secondary features map has apparently not been set" );
169         }
170         reset();
171         final DolloParsimony dollo = DolloParsimony.createInstance();
172         dollo.setReturnGainLossMatrix( true );
173         dollo.setReturnInternalStates( true );
174         final CharacterStateMatrix<BinaryStates> states = createMatrixOfSecondaryFeaturePresenceOrAbsence( mapping_results_map );
175         dollo.execute( getPhylogeny(), states );
176         setGainLossMatrix( dollo.getGainLossMatrix() );
177         setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() );
178         setCost( dollo.getCost() );
179         setTotalGains( dollo.getTotalGains() );
180         setTotalLosses( dollo.getTotalLosses() );
181         setTotalUnchanged( dollo.getTotalUnchanged() );
182     }
183
184     private void executeFitchParsimony( final boolean on_domain_presence,
185                                         final boolean use_last,
186                                         final boolean randomize,
187                                         final long random_number_seed ) {
188         reset();
189         if ( use_last ) {
190             System.out.println( "   Fitch parsimony: use_last = true" );
191         }
192         final FitchParsimony<BinaryStates> fitch = new FitchParsimony<BinaryStates>();
193         fitch.setRandomize( randomize );
194         if ( randomize ) {
195             fitch.setRandomNumberSeed( random_number_seed );
196         }
197         fitch.setUseLast( use_last );
198         fitch.setReturnGainLossMatrix( true );
199         fitch.setReturnInternalStates( true );
200         CharacterStateMatrix<BinaryStates> states = null;
201         if ( on_domain_presence ) {
202             states = createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
203         }
204         else {
205             states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
206         }
207         fitch.execute( getPhylogeny(), states );
208         setGainLossMatrix( fitch.getGainLossMatrix() );
209         setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() );
210         setCost( fitch.getCost() );
211         setTotalGains( fitch.getTotalGains() );
212         setTotalLosses( fitch.getTotalLosses() );
213         setTotalUnchanged( fitch.getTotalUnchanged() );
214     }
215     
216     private void executeFitchParsimonyOnSecondaryFeatures( 
217                                         final boolean use_last,
218                                         final boolean randomize,
219                                         final long random_number_seed ) {
220         reset();
221         if ( use_last ) {
222             System.out.println( "   Fitch parsimony: use_last = true" );
223         }
224         final FitchParsimony<BinaryStates> fitch = new FitchParsimony<BinaryStates>();
225         fitch.setRandomize( randomize );
226         if ( randomize ) {
227             fitch.setRandomNumberSeed( random_number_seed );
228         }
229         fitch.setUseLast( use_last );
230         fitch.setReturnGainLossMatrix( true );
231         fitch.setReturnInternalStates( true );
232         
233         final Map<DomainId, Set<String>> map = getDomainIdToSecondaryFeaturesMap();
234         final Map<DomainId, String> newmap = new HashMap<DomainId, String>();
235         final Iterator<Entry<DomainId, Set<String>>> it = map.entrySet().iterator();
236         while (it.hasNext()) {
237             final Map.Entry<DomainId, Set<String>> pair =  (Map.Entry<DomainId, Set<String>>)it.next();
238             if ( pair.getValue().size() != 1 ) {
239                 throw new IllegalArgumentException( pair.getKey().getId() + " mapps to " + pair.getValue().size() + " items" );
240             }
241             newmap.put( pair.getKey(), ( String ) pair.getValue().toArray()[ 0 ] );
242         }
243         
244         CharacterStateMatrix<BinaryStates> states  =createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList(),
245                                                                                 newmap );
246        
247         fitch.execute( getPhylogeny(), states );
248         setGainLossMatrix( fitch.getGainLossMatrix() );
249         setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() );
250         setCost( fitch.getCost() );
251         setTotalGains( fitch.getTotalGains() );
252         setTotalLosses( fitch.getTotalLosses() );
253         setTotalUnchanged( fitch.getTotalUnchanged() );
254     }
255
256     public void executeFitchParsimonyOnBinaryDomainCombintion( final boolean use_last ) {
257         executeFitchParsimony( false, use_last, false, 0 );
258     }
259     
260     public void executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( final boolean use_last ) {
261         executeFitchParsimonyOnSecondaryFeatures( use_last, false, 0 );
262     }
263
264     public void executeFitchParsimonyOnBinaryDomainCombintion( final long random_number_seed ) {
265         executeFitchParsimony( false, false, true, random_number_seed );
266     }
267
268     public void executeFitchParsimonyOnDomainPresence( final boolean use_last ) {
269         executeFitchParsimony( true, use_last, false, 0 );
270     }
271
272     public void executeFitchParsimonyOnDomainPresence( final long random_number_seed ) {
273         executeFitchParsimony( true, false, true, random_number_seed );
274     }
275
276     public void executeOnGivenBinaryStatesMatrix( final CharacterStateMatrix<BinaryStates> binary_states_matrix,
277                                                   final String[] character_labels ) {
278         reset();
279         if ( binary_states_matrix.getNumberOfCharacters() != character_labels.length ) {
280             throw new IllegalArgumentException( "binary states matrix number of characters is not equal to the number of character labels provided" );
281         }
282         if ( binary_states_matrix.getNumberOfIdentifiers() != getPhylogeny().getNumberOfBranches() ) {
283             throw new IllegalArgumentException( "binary states matrix number of identifiers is not equal to the number of tree nodes provided" );
284         }
285         final CharacterStateMatrix<GainLossStates> gl_matrix = new BasicCharacterStateMatrix<GainLossStates>( binary_states_matrix
286                                                                                                                       .getNumberOfIdentifiers(),
287                                                                                                               binary_states_matrix
288                                                                                                                       .getNumberOfCharacters() );
289         int total_gains = 0;
290         int total_losses = 0;
291         int total_unchanged = 0;
292         int i = 0;
293         for( final PhylogenyNodeIterator it = getPhylogeny().iteratorPostorder(); it.hasNext(); ) {
294             gl_matrix.setIdentifier( i++, it.next().getName() );
295         }
296         for( int c = 0; c < character_labels.length; ++c ) {
297             gl_matrix.setCharacter( c, character_labels[ c ] );
298             final PhylogenyNodeIterator it = getPhylogeny().iteratorPostorder();
299             while ( it.hasNext() ) {
300                 final PhylogenyNode node = it.next();
301                 final String name = node.getName();
302                 final BinaryStates bin_state = binary_states_matrix.getState( binary_states_matrix
303                         .getIdentifierIndex( name ), c );
304                 final PhylogenyNode parent_node = getPhylogeny().getNode( name ).getParent();
305                 GainLossStates gl_state = null;
306                 if ( node.isRoot() ) {
307                     ++total_unchanged;
308                     if ( bin_state == BinaryStates.ABSENT ) {
309                         gl_state = GainLossStates.UNCHANGED_ABSENT;
310                     }
311                     else {
312                         gl_state = GainLossStates.UNCHANGED_PRESENT;
313                     }
314                 }
315                 else {
316                     final BinaryStates parent_bin_state = binary_states_matrix.getState( binary_states_matrix
317                             .getIdentifierIndex( parent_node.getName() ), c );
318                     if ( bin_state == BinaryStates.ABSENT ) {
319                         if ( parent_bin_state == BinaryStates.ABSENT ) {
320                             ++total_unchanged;
321                             gl_state = GainLossStates.UNCHANGED_ABSENT;
322                         }
323                         else {
324                             ++total_losses;
325                             gl_state = GainLossStates.LOSS;
326                         }
327                     }
328                     else {
329                         if ( parent_bin_state == BinaryStates.ABSENT ) {
330                             ++total_gains;
331                             gl_state = GainLossStates.GAIN;
332                         }
333                         else {
334                             ++total_unchanged;
335                             gl_state = GainLossStates.UNCHANGED_PRESENT;
336                         }
337                     }
338                 }
339                 gl_matrix.setState( name, c, gl_state );
340             }
341         }
342         setTotalGains( total_gains );
343         setTotalLosses( total_losses );
344         setTotalUnchanged( total_unchanged );
345         setCost( total_gains + total_losses );
346         setGainLossMatrix( gl_matrix );
347     }
348
349     public int getCost() {
350         return _cost;
351     }
352
353     private Map<DomainId, Set<String>> getDomainIdToSecondaryFeaturesMap() {
354         return _domain_id_to_secondary_features_map;
355     }
356
357     public CharacterStateMatrix<Integer> getGainLossCountsMatrix() {
358         final CharacterStateMatrix<Integer> matrix = new BasicCharacterStateMatrix<Integer>( getGainLossMatrix()
359                 .getNumberOfIdentifiers(), 3 );
360         for( int i = 0; i < getGainLossMatrix().getNumberOfIdentifiers(); ++i ) {
361             matrix.setIdentifier( i, getGainLossMatrix().getIdentifier( i ) );
362         }
363         matrix.setCharacter( 0, "GAINS" );
364         matrix.setCharacter( 1, "LOSSES" );
365         matrix.setCharacter( 2, "NET" );
366         for( int i = 0; i < getGainLossMatrix().getNumberOfIdentifiers(); ++i ) {
367             int gains = 0;
368             int losses = 0;
369             for( int c = 0; c < getGainLossMatrix().getNumberOfCharacters(); ++c ) {
370                 final GainLossStates s = getGainLossMatrix().getState( i, c );
371                 if ( s == GainLossStates.GAIN ) {
372                     ++gains;
373                 }
374                 else if ( s == GainLossStates.LOSS ) {
375                     ++losses;
376                 }
377             }
378             matrix.setState( i, 0, gains );
379             matrix.setState( i, 1, losses );
380             matrix.setState( i, 2, gains - losses );
381         }
382         return matrix;
383     }
384
385     public CharacterStateMatrix<GainLossStates> getGainLossMatrix() {
386         return _gain_loss_matrix;
387     }
388
389     private List<GenomeWideCombinableDomains> getGenomeWideCombinableDomainsList() {
390         return _gwcd_list;
391     }
392
393     public CharacterStateMatrix<BinaryStates> getInternalStatesMatrix() {
394         return _binary_internal_states_matrix;
395     }
396
397     public int getNetGainsOnNode( final String node_identifier ) {
398         if ( getGainLossMatrix() == null ) {
399             throw new RuntimeException( "no gain loss matrix has been calculated" );
400         }
401         int net = 0;
402         final int id_index = getGainLossMatrix().getIdentifierIndex( node_identifier );
403         for( int c = 0; c < getGainLossMatrix().getNumberOfCharacters(); ++c ) {
404             if ( getGainLossMatrix().getState( id_index, c ) == GainLossStates.GAIN ) {
405                 ++net;
406             }
407             else if ( getGainLossMatrix().getState( id_index, c ) == GainLossStates.LOSS ) {
408                 --net;
409             }
410         }
411         return net;
412     }
413
414     private Phylogeny getPhylogeny() {
415         return _phylogeny;
416     }
417
418     private SortedSet<DomainId> getPositiveFilter() {
419         return _positive_filter;
420     }
421
422     public int getSumOfGainsOnNode( final String node_identifier ) {
423         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN );
424     }
425
426     public int getSumOfLossesOnNode( final String node_identifier ) {
427         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.LOSS );
428     }
429
430     public int getSumOfPresentOnNode( final String node_identifier ) {
431         return getSumOfGainsOnNode( node_identifier ) + getSumOfUnchangedPresentOnNode( node_identifier );
432     }
433
434     int getSumOfUnchangedAbsentOnNode( final String node_identifier ) {
435         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT );
436     }
437
438     int getSumOfUnchangedOnNode( final String node_identifier ) {
439         return getSumOfUnchangedPresentOnNode( node_identifier ) + getSumOfUnchangedAbsentOnNode( node_identifier );
440     }
441
442     int getSumOfUnchangedPresentOnNode( final String node_identifier ) {
443         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT );
444     }
445
446     public int getTotalGains() {
447         return _total_gains;
448     }
449
450     public int getTotalLosses() {
451         return _total_losses;
452     }
453
454     public int getTotalUnchanged() {
455         return _total_unchanged;
456     }
457
458     public SortedSet<String> getUnitsGainedOnNode( final String node_identifier ) {
459         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN );
460     }
461
462     public SortedSet<String> getUnitsLostOnNode( final String node_identifier ) {
463         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.LOSS );
464     }
465
466     public SortedSet<String> getUnitsOnNode( final String node_identifier ) {
467         final SortedSet<String> present = getUnitsGainedOnNode( node_identifier );
468         present.addAll( getUnitsUnchangedPresentOnNode( node_identifier ) );
469         return present;
470     }
471
472     SortedSet<String> getUnitsUnchangedAbsentOnNode( final String node_identifier ) {
473         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT );
474     }
475
476     SortedSet<String> getUnitsUnchangedPresentOnNode( final String node_identifier ) {
477         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT );
478     }
479
480     private void init() {
481         setDomainIdToSecondaryFeaturesMap( null );
482         setPositiveFilter( null );
483         reset();
484     }
485
486     private void reset() {
487         setGainLossMatrix( null );
488         setBinaryInternalStatesMatrix( null );
489         setCost( -1 );
490         setTotalGains( -1 );
491         setTotalLosses( -1 );
492         setTotalUnchanged( -1 );
493     }
494
495     private void setBinaryInternalStatesMatrix( final CharacterStateMatrix<BinaryStates> binary_states_matrix ) {
496         _binary_internal_states_matrix = binary_states_matrix;
497     }
498
499     private void setCost( final int cost ) {
500         _cost = cost;
501     }
502
503     private void setDomainIdToSecondaryFeaturesMap( final Map<DomainId, Set<String>> domain_id_to_secondary_features_map ) {
504         _domain_id_to_secondary_features_map = domain_id_to_secondary_features_map;
505     }
506
507     private void setGainLossMatrix( final CharacterStateMatrix<GainLossStates> gain_loss_matrix ) {
508         _gain_loss_matrix = gain_loss_matrix;
509     }
510
511     private void setPositiveFilter( final SortedSet<DomainId> positive_filter ) {
512         _positive_filter = positive_filter;
513     }
514
515     private void setTotalGains( final int total_gains ) {
516         _total_gains = total_gains;
517     }
518
519     private void setTotalLosses( final int total_losses ) {
520         _total_losses = total_losses;
521     }
522
523     private void setTotalUnchanged( final int total_unchanged ) {
524         _total_unchanged = total_unchanged;
525     }
526
527     public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny ) {
528         return new DomainParsimonyCalculator( phylogeny );
529     }
530
531     public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny,
532                                                             final List<GenomeWideCombinableDomains> gwcd_list ) {
533         if ( phylogeny.getNumberOfExternalNodes() != gwcd_list.size() ) {
534             throw new IllegalArgumentException( "number of external nodes [" + phylogeny.getNumberOfExternalNodes()
535                     + "] does not equal size of genome wide combinable domains list [" + gwcd_list.size() + "]" );
536         }
537         return new DomainParsimonyCalculator( phylogeny, gwcd_list );
538     }
539
540     public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny,
541                                                             final List<GenomeWideCombinableDomains> gwcd_list,
542                                                             final Map<DomainId, Set<String>> domain_id_to_secondary_features_map ) {
543         if ( phylogeny.getNumberOfExternalNodes() != gwcd_list.size() ) {
544             throw new IllegalArgumentException( "size of external nodes does not equal size of genome wide combinable domains list" );
545         }
546         return new DomainParsimonyCalculator( phylogeny, gwcd_list, domain_id_to_secondary_features_map );
547     }
548
549     
550     /**
551      * For folds instead of Pfam-domains, for example
552      * 
553      * 
554      * @param gwcd_list
555      * @return
556      */
557     static CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeaturePresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
558                                                                                                final Map<DomainId, Set<String>> domain_id_to_second_features_map,
559                                                                                                final Map<Species, MappingResults> mapping_results_map ) {
560         if ( gwcd_list.isEmpty() ) {
561             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
562         }
563         if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) {
564             throw new IllegalArgumentException( "domain id to secondary features map is null or empty" );
565         }
566         final int number_of_identifiers = gwcd_list.size();
567         final SortedSet<String> all_secondary_features = new TreeSet<String>();
568         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
569             int mapped = 0;
570             int not_mapped = 0;
571             for( final DomainId domain : gwcd.getAllDomainIds() ) {
572                 if ( domain_id_to_second_features_map.containsKey( domain ) ) {
573                     all_secondary_features.addAll( domain_id_to_second_features_map.get( domain ) );
574                     mapped++;
575                 }
576                 else {
577                     not_mapped++;
578                 }
579             }
580             if ( mapping_results_map != null ) {
581                 final MappingResults mr = new MappingResults();
582                 mr.setDescription( gwcd.getSpecies().getSpeciesId() );
583                 mr.setSumOfSuccesses( mapped );
584                 mr.setSumOfFailures( not_mapped );
585                 mapping_results_map.put( gwcd.getSpecies(), mr );
586             }
587         }
588         final int number_of_characters = all_secondary_features.size();
589         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
590                                                                                                                                                  number_of_characters );
591         int character_index = 0;
592         for( final String second_id : all_secondary_features ) {
593             matrix.setCharacter( character_index++, second_id );
594         }
595         int identifier_index = 0;
596         final Set<String> all_identifiers = new HashSet<String>();
597         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
598             final String species_id = gwcd.getSpecies().getSpeciesId();
599             if ( all_identifiers.contains( species_id ) ) {
600                 throw new IllegalArgumentException( "species [" + species_id + "] is not unique" );
601             }
602             all_identifiers.add( species_id );
603             matrix.setIdentifier( identifier_index, species_id );
604             final Set<String> all_second_per_gwcd = new HashSet<String>();
605             for( final DomainId domain : gwcd.getAllDomainIds() ) {
606                 if ( domain_id_to_second_features_map.containsKey( domain ) ) {
607                     all_second_per_gwcd.addAll( domain_id_to_second_features_map.get( domain ) );
608                 }
609             }
610             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
611                 if ( all_second_per_gwcd.contains( matrix.getCharacter( ci ) ) ) {
612                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
613                 }
614                 else {
615                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
616                 }
617             }
618             ++identifier_index;
619         }
620         return matrix;
621     }
622     
623     
624     public static CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
625                                                                                                                              final Map<DomainId, String> domain_id_to_second_features_map) {
626         if ( gwcd_list.isEmpty() ) {
627             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
628         }
629         if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) {
630             throw new IllegalArgumentException( "domain id to secondary features map is null or empty" );
631         }
632         final int number_of_identifiers = gwcd_list.size();
633         final SortedSet<BinaryDomainCombination> all_binary_combinations_mapped = new TreeSet<BinaryDomainCombination>();
634         final Set<BinaryDomainCombination>[] binary_combinations_per_genome_mapped = new HashSet[ number_of_identifiers ];
635         int identifier_index = 0;
636         
637         final SortedSet<String> no_mappings = new TreeSet<String>();
638         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
639             binary_combinations_per_genome_mapped[ identifier_index ] = new HashSet<BinaryDomainCombination>();
640             for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
641                 
642                 final BinaryDomainCombination mapped_bc = mapBinaryDomainCombination( domain_id_to_second_features_map,
643                                                                                       bc,
644                                                                                       no_mappings );
645                 all_binary_combinations_mapped.add( mapped_bc );
646                 binary_combinations_per_genome_mapped[ identifier_index ].add( mapped_bc );
647             }
648             ++identifier_index;
649         }
650        
651         if ( !no_mappings.isEmpty() ) {
652             ForesterUtil.programMessage( surfacing.PRG_NAME, "No mappings for the following (" + no_mappings.size()  + "):" );
653             for( final String id : no_mappings ) {
654                 ForesterUtil.programMessage( surfacing.PRG_NAME, id);
655             }
656         }
657         
658         final int number_of_characters = all_binary_combinations_mapped.size();
659         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
660                                                                                                                                                  number_of_characters );
661         int character_index = 0;
662         for( final BinaryDomainCombination bc : all_binary_combinations_mapped ) {
663             matrix.setCharacter( character_index++, bc.toString() );
664         }
665         identifier_index = 0;
666         final Set<String> all_identifiers = new HashSet<String>();
667         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
668             final String species_id = gwcd.getSpecies().getSpeciesId();
669             if ( all_identifiers.contains( species_id ) ) {
670                 throw new AssertionError( "species [" + species_id + "] is not unique" );
671             }
672             all_identifiers.add( species_id );
673             matrix.setIdentifier( identifier_index, species_id );
674             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
675                 BinaryDomainCombination bc = null;
676                 if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) {
677                     bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
678                 }
679                 else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) {
680                     bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
681                 }
682                 else {
683                     bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
684                 }
685                 if ( binary_combinations_per_genome_mapped[ identifier_index ].contains( bc ) ) {
686                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
687                 }
688                 else {
689                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
690                 }
691             }
692             ++identifier_index;
693         }
694         return matrix;
695     }
696
697     private static BinaryDomainCombination mapBinaryDomainCombination( final Map<DomainId, String> domain_id_to_second_features_map,
698                                                                        final BinaryDomainCombination bc,
699                                                                        final SortedSet<String> no_mappings ) {
700         String id0 = "";
701         String id1 = "";
702         
703         if ( !domain_id_to_second_features_map.containsKey( bc.getId0() ) ) {
704            
705             no_mappings.add(bc.getId0().getId()  );
706             id0 = bc.getId0().getId();
707         }
708         else {
709             id0 = domain_id_to_second_features_map.get(  bc.getId0());
710         }
711         if ( !domain_id_to_second_features_map.containsKey( bc.getId1() ) ) {
712            
713             no_mappings.add(bc.getId1().getId()  );
714             id1 = bc.getId1().getId();
715         }
716         else {
717             id1 = domain_id_to_second_features_map.get(  bc.getId1());
718         }
719         
720         return new BasicBinaryDomainCombination( id0, id1 );
721     }
722     
723     
724     
725     
726     public static CharacterStateMatrix<BinaryStates> createMatrixOfBinaryDomainCombinationPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list ) {
727         if ( gwcd_list.isEmpty() ) {
728             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
729         }
730         final int number_of_identifiers = gwcd_list.size();
731         final SortedSet<BinaryDomainCombination> all_binary_combinations = new TreeSet<BinaryDomainCombination>();
732         final Set<BinaryDomainCombination>[] binary_combinations_per_genome = new HashSet[ number_of_identifiers ];
733         int identifier_index = 0;
734         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
735             binary_combinations_per_genome[ identifier_index ] = new HashSet<BinaryDomainCombination>();
736             for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
737                 all_binary_combinations.add( bc );
738                 binary_combinations_per_genome[ identifier_index ].add( bc );
739             }
740             ++identifier_index;
741         }
742         final int number_of_characters = all_binary_combinations.size();
743         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
744                                                                                                                                                  number_of_characters );
745         int character_index = 0;
746         for( final BinaryDomainCombination bc : all_binary_combinations ) {
747             matrix.setCharacter( character_index++, bc.toString() );
748         }
749         identifier_index = 0;
750         final Set<String> all_identifiers = new HashSet<String>();
751         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
752             final String species_id = gwcd.getSpecies().getSpeciesId();
753             if ( all_identifiers.contains( species_id ) ) {
754                 throw new AssertionError( "species [" + species_id + "] is not unique" );
755             }
756             all_identifiers.add( species_id );
757             matrix.setIdentifier( identifier_index, species_id );
758             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
759                 BinaryDomainCombination bc = null;
760                 if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) {
761                     bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
762                 }
763                 else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) {
764                     bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
765                 }
766                 else {
767                     bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
768                 }
769                 if ( binary_combinations_per_genome[ identifier_index ].contains( bc ) ) {
770                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
771                 }
772                 else {
773                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
774                 }
775             }
776             ++identifier_index;
777         }
778         return matrix;
779     }
780
781     static CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list ) {
782         return createMatrixOfDomainPresenceOrAbsence( gwcd_list, null );
783     }
784
785     public static CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
786                                                                                             final SortedSet<DomainId> positive_filter ) {
787         if ( gwcd_list.isEmpty() ) {
788             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
789         }
790         if ( ( positive_filter != null ) && ( positive_filter.size() < 1 ) ) {
791             throw new IllegalArgumentException( "positive filter is empty" );
792         }
793         final int number_of_identifiers = gwcd_list.size();
794         final SortedSet<DomainId> all_domain_ids = new TreeSet<DomainId>();
795         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
796             for( final DomainId domain : gwcd.getAllDomainIds() ) {
797                 all_domain_ids.add( domain );
798             }
799         }
800         int number_of_characters = all_domain_ids.size();
801         if ( positive_filter != null ) {
802             //number_of_characters = positive_filter.size(); -- bad if doms in filter but not in genomes 
803             number_of_characters = 0;
804             for( final DomainId id : all_domain_ids ) {
805                 if ( positive_filter.contains( id ) ) {
806                     number_of_characters++;
807                 }
808             }
809         }
810         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
811                                                                                                                                                  number_of_characters );
812         int character_index = 0;
813         for( final DomainId id : all_domain_ids ) {
814             if ( positive_filter == null ) {
815                 matrix.setCharacter( character_index++, id.getId() );
816             }
817             else {
818                 if ( positive_filter.contains( id ) ) {
819                     matrix.setCharacter( character_index++, id.getId() );
820                 }
821             }
822         }
823         int identifier_index = 0;
824         final Set<String> all_identifiers = new HashSet<String>();
825         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
826             final String species_id = gwcd.getSpecies().getSpeciesId();
827             if ( all_identifiers.contains( species_id ) ) {
828                 throw new IllegalArgumentException( "species [" + species_id + "] is not unique" );
829             }
830             all_identifiers.add( species_id );
831             matrix.setIdentifier( identifier_index, species_id );
832             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
833                 if ( ForesterUtil.isEmpty( matrix.getCharacter( ci ) ) ) {
834                     throw new RuntimeException( "this should not have happened: problem with character #" + ci );
835                 }
836                 final DomainId id = new DomainId( matrix.getCharacter( ci ) );
837                 if ( gwcd.contains( id ) ) {
838                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
839                 }
840                 else {
841                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
842                 }
843             }
844             ++identifier_index;
845         }
846         return matrix;
847     }
848
849    
850
851     private static int getStateSumDeltaOnNode( final String node_identifier,
852                                                final CharacterStateMatrix<GainLossStates> gain_loss_matrix,
853                                                final GainLossStates state ) {
854         if ( gain_loss_matrix == null ) {
855             throw new RuntimeException( "no gain loss matrix has been calculated" );
856         }
857         if ( ForesterUtil.isEmpty( node_identifier ) ) {
858             throw new IllegalArgumentException( "node identifier must not be empty" );
859         }
860         if ( gain_loss_matrix.isEmpty() ) {
861             throw new RuntimeException( "gain loss matrix is empty" );
862         }
863         int sum = 0;
864         final int id_index = gain_loss_matrix.getIdentifierIndex( node_identifier );
865         for( int c = 0; c < gain_loss_matrix.getNumberOfCharacters(); ++c ) {
866             if ( gain_loss_matrix.getState( id_index, c ) == state ) {
867                 ++sum;
868             }
869         }
870         return sum;
871     }
872
873     private static SortedSet<String> getUnitsDeltaOnNode( final String node_identifier,
874                                                           final CharacterStateMatrix<GainLossStates> gain_loss_matrix,
875                                                           final GainLossStates state ) {
876         if ( gain_loss_matrix == null ) {
877             throw new RuntimeException( "no gain loss matrix has been calculated" );
878         }
879         if ( ForesterUtil.isEmpty( node_identifier ) ) {
880             throw new IllegalArgumentException( "node identifier must not be empty" );
881         }
882         if ( gain_loss_matrix.isEmpty() ) {
883             throw new RuntimeException( "gain loss matrix is empty" );
884         }
885         final SortedSet<String> d = new TreeSet<String>();
886         final int id_index = gain_loss_matrix.getIdentifierIndex( node_identifier );
887         for( int c = 0; c < gain_loss_matrix.getNumberOfCharacters(); ++c ) {
888             if ( gain_loss_matrix.getState( id_index, c ) == state ) {
889                 if ( d.contains( gain_loss_matrix.getCharacter( c ) ) ) {
890                     throw new AssertionError( "this should not have happended: character ["
891                             + gain_loss_matrix.getCharacter( c ) + "] already in set" );
892                 }
893                 d.add( gain_loss_matrix.getCharacter( c ) );
894             }
895         }
896         return d;
897     }
898 }