inprogress
[jalview.git] / forester / java / src / org / forester / surfacing / DomainParsimonyCalculator.java
1 // $Id:
2 //
3 // FORESTER -- software libraries and applications
4 // for evolutionary biology research and applications.
5 //
6 // Copyright (C) 2008-2009 Christian M. Zmasek
7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
8 // All rights reserved
9 //
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 //
24 // Contact: phylosoft @ gmail . com
25 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26
27 package org.forester.surfacing;
28
29 import java.util.HashMap;
30 import java.util.HashSet;
31 import java.util.Iterator;
32 import java.util.List;
33 import java.util.Map;
34 import java.util.Map.Entry;
35 import java.util.Set;
36 import java.util.SortedSet;
37 import java.util.TreeSet;
38
39 import org.forester.application.surfacing;
40 import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix;
41 import org.forester.evoinference.matrix.character.CharacterStateMatrix;
42 import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates;
43 import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates;
44 import org.forester.evoinference.parsimony.DolloParsimony;
45 import org.forester.evoinference.parsimony.FitchParsimony;
46 import org.forester.phylogeny.Phylogeny;
47 import org.forester.phylogeny.PhylogenyNode;
48 import org.forester.phylogeny.data.BinaryCharacters;
49 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
50 import org.forester.protein.BinaryDomainCombination;
51 import org.forester.protein.BinaryDomainCombination.DomainCombinationType;
52 import org.forester.species.Species;
53 import org.forester.util.ForesterUtil;
54
55 public final class DomainParsimonyCalculator {
56
57     private static final String                     TYPE_FORBINARY_CHARACTERS = "parsimony inferred";
58     private CharacterStateMatrix<BinaryStates>      _binary_internal_states_matrix;
59     private int                                     _cost;
60     private Map<String, Set<String>>                _domain_id_to_secondary_features_map;
61     private CharacterStateMatrix<GainLossStates>    _gain_loss_matrix;
62     private final List<GenomeWideCombinableDomains> _gwcd_list;
63     private final Phylogeny                         _phylogeny;
64     private SortedSet<String>                       _positive_filter;
65     private int                                     _total_gains;
66     private int                                     _total_losses;
67     private int                                     _total_unchanged;
68
69     private DomainParsimonyCalculator( final Phylogeny phylogeny ) {
70         init();
71         _phylogeny = phylogeny;
72         _gwcd_list = null;
73     }
74
75     private DomainParsimonyCalculator( final Phylogeny phylogeny, final List<GenomeWideCombinableDomains> gwcd_list ) {
76         init();
77         _phylogeny = phylogeny;
78         _gwcd_list = gwcd_list;
79     }
80
81     private DomainParsimonyCalculator( final Phylogeny phylogeny,
82                                        final List<GenomeWideCombinableDomains> gwcd_list,
83                                        final Map<String, Set<String>> domain_id_to_secondary_features_map ) {
84         init();
85         _phylogeny = phylogeny;
86         _gwcd_list = gwcd_list;
87         setDomainIdToSecondaryFeaturesMap( domain_id_to_secondary_features_map );
88     }
89
90     public void executeDolloParsimonyOnBinaryDomainCombintionPresence() {
91         executeDolloParsimony( false );
92     }
93
94     public void executeDolloParsimonyOnDomainPresence() {
95         executeDolloParsimony( true );
96     }
97
98     public void executeDolloParsimonyOnDomainPresence( final SortedSet<String> positive_filter ) {
99         setPositiveFilter( positive_filter );
100         executeDolloParsimony( true );
101         setPositiveFilter( null );
102     }
103
104     public void executeDolloParsimonyOnSecondaryFeatures( final Map<Species, MappingResults> mapping_results_map ) {
105         if ( getDomainIdToSecondaryFeaturesMap() == null ) {
106             throw new RuntimeException( "Domain id to secondary features map has apparently not been set" );
107         }
108         reset();
109         final DolloParsimony dollo = DolloParsimony.createInstance();
110         dollo.setReturnGainLossMatrix( true );
111         dollo.setReturnInternalStates( true );
112         final CharacterStateMatrix<BinaryStates> states = createMatrixOfSecondaryFeaturePresenceOrAbsence( mapping_results_map );
113         dollo.execute( getPhylogeny(), states );
114         setGainLossMatrix( dollo.getGainLossMatrix() );
115         setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() );
116         setCost( dollo.getCost() );
117         setTotalGains( dollo.getTotalGains() );
118         setTotalLosses( dollo.getTotalLosses() );
119         setTotalUnchanged( dollo.getTotalUnchanged() );
120     }
121
122     public void executeFitchParsimonyOnBinaryDomainCombintion( final boolean use_last ) {
123         executeFitchParsimony( false, use_last, false, 0 );
124     }
125
126     public void executeFitchParsimonyOnBinaryDomainCombintion( final long random_number_seed ) {
127         executeFitchParsimony( false, false, true, random_number_seed );
128     }
129
130     public void executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( final boolean use_last ) {
131         executeFitchParsimonyOnSecondaryFeatures( use_last, false, 0 );
132     }
133
134     public void executeFitchParsimonyOnDomainPresence( final boolean use_last ) {
135         executeFitchParsimony( true, use_last, false, 0 );
136     }
137
138     public void executeFitchParsimonyOnDomainPresence( final long random_number_seed ) {
139         executeFitchParsimony( true, false, true, random_number_seed );
140     }
141
142     public void executeOnGivenBinaryStatesMatrix( final CharacterStateMatrix<BinaryStates> binary_states_matrix,
143                                                   final String[] character_labels ) {
144         reset();
145         if ( binary_states_matrix.getNumberOfCharacters() != character_labels.length ) {
146             throw new IllegalArgumentException( "binary states matrix number of characters is not equal to the number of character labels provided" );
147         }
148         if ( binary_states_matrix.getNumberOfIdentifiers() != getPhylogeny().getNumberOfBranches() ) {
149             throw new IllegalArgumentException( "binary states matrix number of identifiers is not equal to the number of tree nodes provided" );
150         }
151         final CharacterStateMatrix<GainLossStates> gl_matrix = new BasicCharacterStateMatrix<GainLossStates>( binary_states_matrix
152                                                                                                                       .getNumberOfIdentifiers(),
153                                                                                                               binary_states_matrix
154                                                                                                                       .getNumberOfCharacters() );
155         int total_gains = 0;
156         int total_losses = 0;
157         int total_unchanged = 0;
158         int i = 0;
159         for( final PhylogenyNodeIterator it = getPhylogeny().iteratorPostorder(); it.hasNext(); ) {
160             gl_matrix.setIdentifier( i++, it.next().getName() );
161         }
162         for( int c = 0; c < character_labels.length; ++c ) {
163             gl_matrix.setCharacter( c, character_labels[ c ] );
164             final PhylogenyNodeIterator it = getPhylogeny().iteratorPostorder();
165             while ( it.hasNext() ) {
166                 final PhylogenyNode node = it.next();
167                 final String name = node.getName();
168                 final BinaryStates bin_state = binary_states_matrix.getState( binary_states_matrix
169                         .getIdentifierIndex( name ), c );
170                 final PhylogenyNode parent_node = getPhylogeny().getNode( name ).getParent();
171                 GainLossStates gl_state = null;
172                 if ( node.isRoot() ) {
173                     ++total_unchanged;
174                     if ( bin_state == BinaryStates.ABSENT ) {
175                         gl_state = GainLossStates.UNCHANGED_ABSENT;
176                     }
177                     else {
178                         gl_state = GainLossStates.UNCHANGED_PRESENT;
179                     }
180                 }
181                 else {
182                     final BinaryStates parent_bin_state = binary_states_matrix.getState( binary_states_matrix
183                             .getIdentifierIndex( parent_node.getName() ), c );
184                     if ( bin_state == BinaryStates.ABSENT ) {
185                         if ( parent_bin_state == BinaryStates.ABSENT ) {
186                             ++total_unchanged;
187                             gl_state = GainLossStates.UNCHANGED_ABSENT;
188                         }
189                         else {
190                             ++total_losses;
191                             gl_state = GainLossStates.LOSS;
192                         }
193                     }
194                     else {
195                         if ( parent_bin_state == BinaryStates.ABSENT ) {
196                             ++total_gains;
197                             gl_state = GainLossStates.GAIN;
198                         }
199                         else {
200                             ++total_unchanged;
201                             gl_state = GainLossStates.UNCHANGED_PRESENT;
202                         }
203                     }
204                 }
205                 gl_matrix.setState( name, c, gl_state );
206             }
207         }
208         setTotalGains( total_gains );
209         setTotalLosses( total_losses );
210         setTotalUnchanged( total_unchanged );
211         setCost( total_gains + total_losses );
212         setGainLossMatrix( gl_matrix );
213     }
214
215     public int getCost() {
216         return _cost;
217     }
218
219     public CharacterStateMatrix<Integer> getGainLossCountsMatrix() {
220         final CharacterStateMatrix<Integer> matrix = new BasicCharacterStateMatrix<Integer>( getGainLossMatrix()
221                 .getNumberOfIdentifiers(), 3 );
222         for( int i = 0; i < getGainLossMatrix().getNumberOfIdentifiers(); ++i ) {
223             matrix.setIdentifier( i, getGainLossMatrix().getIdentifier( i ) );
224         }
225         matrix.setCharacter( 0, "GAINS" );
226         matrix.setCharacter( 1, "LOSSES" );
227         matrix.setCharacter( 2, "NET" );
228         for( int i = 0; i < getGainLossMatrix().getNumberOfIdentifiers(); ++i ) {
229             int gains = 0;
230             int losses = 0;
231             for( int c = 0; c < getGainLossMatrix().getNumberOfCharacters(); ++c ) {
232                 final GainLossStates s = getGainLossMatrix().getState( i, c );
233                 if ( s == GainLossStates.GAIN ) {
234                     ++gains;
235                 }
236                 else if ( s == GainLossStates.LOSS ) {
237                     ++losses;
238                 }
239             }
240             matrix.setState( i, 0, gains );
241             matrix.setState( i, 1, losses );
242             matrix.setState( i, 2, gains - losses );
243         }
244         return matrix;
245     }
246
247     public CharacterStateMatrix<GainLossStates> getGainLossMatrix() {
248         return _gain_loss_matrix;
249     }
250
251     public CharacterStateMatrix<BinaryStates> getInternalStatesMatrix() {
252         return _binary_internal_states_matrix;
253     }
254
255     public int getNetGainsOnNode( final String node_identifier ) {
256         if ( getGainLossMatrix() == null ) {
257             throw new RuntimeException( "no gain loss matrix has been calculated" );
258         }
259         int net = 0;
260         final int id_index = getGainLossMatrix().getIdentifierIndex( node_identifier );
261         for( int c = 0; c < getGainLossMatrix().getNumberOfCharacters(); ++c ) {
262             if ( getGainLossMatrix().getState( id_index, c ) == GainLossStates.GAIN ) {
263                 ++net;
264             }
265             else if ( getGainLossMatrix().getState( id_index, c ) == GainLossStates.LOSS ) {
266                 --net;
267             }
268         }
269         return net;
270     }
271
272     public int getSumOfGainsOnNode( final String node_identifier ) {
273         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN );
274     }
275
276     public int getSumOfLossesOnNode( final String node_identifier ) {
277         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.LOSS );
278     }
279
280     public int getSumOfPresentOnNode( final String node_identifier ) {
281         return getSumOfGainsOnNode( node_identifier ) + getSumOfUnchangedPresentOnNode( node_identifier );
282     }
283
284     public int getTotalGains() {
285         return _total_gains;
286     }
287
288     public int getTotalLosses() {
289         return _total_losses;
290     }
291
292     public int getTotalUnchanged() {
293         return _total_unchanged;
294     }
295
296     public SortedSet<String> getUnitsGainedOnNode( final String node_identifier ) {
297         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN );
298     }
299
300     public SortedSet<String> getUnitsLostOnNode( final String node_identifier ) {
301         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.LOSS );
302     }
303
304     public SortedSet<String> getUnitsOnNode( final String node_identifier ) {
305         final SortedSet<String> present = getUnitsGainedOnNode( node_identifier );
306         present.addAll( getUnitsUnchangedPresentOnNode( node_identifier ) );
307         return present;
308     }
309
310     int calculateNumberOfBinaryDomainCombination() {
311         if ( getGenomeWideCombinableDomainsList().isEmpty() ) {
312             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
313         }
314         final Set<BinaryDomainCombination> all_binary_combinations = new HashSet<BinaryDomainCombination>();
315         for( final GenomeWideCombinableDomains gwcd : getGenomeWideCombinableDomainsList() ) {
316             for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
317                 all_binary_combinations.add( bc );
318             }
319         }
320         return all_binary_combinations.size();
321     }
322
323     CharacterStateMatrix<BinaryStates> createMatrixOfBinaryDomainCombinationPresenceOrAbsence() {
324         return createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
325     }
326
327     CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence() {
328         return createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList(), getPositiveFilter() );
329     }
330
331     CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeaturePresenceOrAbsence( final Map<Species, MappingResults> mapping_results_map ) {
332         return createMatrixOfSecondaryFeaturePresenceOrAbsence( getGenomeWideCombinableDomainsList(),
333                                                                 getDomainIdToSecondaryFeaturesMap(),
334                                                                 mapping_results_map );
335     }
336
337     Phylogeny decoratePhylogenyWithDomains( final Phylogeny phylogeny ) {
338         for( final PhylogenyNodeIterator it = phylogeny.iteratorPostorder(); it.hasNext(); ) {
339             final PhylogenyNode node = it.next();
340             final String node_identifier = node.getName();
341             final BinaryCharacters bc = new BinaryCharacters( getUnitsOnNode( node_identifier ),
342                                                               getUnitsGainedOnNode( node_identifier ),
343                                                               getUnitsLostOnNode( node_identifier ),
344                                                               TYPE_FORBINARY_CHARACTERS,
345                                                               getSumOfPresentOnNode( node_identifier ),
346                                                               getSumOfGainsOnNode( node_identifier ),
347                                                               getSumOfLossesOnNode( node_identifier ) );
348             node.getNodeData().setBinaryCharacters( bc );
349         }
350         return phylogeny;
351     }
352
353     int getSumOfUnchangedAbsentOnNode( final String node_identifier ) {
354         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT );
355     }
356
357     int getSumOfUnchangedOnNode( final String node_identifier ) {
358         return getSumOfUnchangedPresentOnNode( node_identifier ) + getSumOfUnchangedAbsentOnNode( node_identifier );
359     }
360
361     int getSumOfUnchangedPresentOnNode( final String node_identifier ) {
362         return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT );
363     }
364
365     SortedSet<String> getUnitsUnchangedAbsentOnNode( final String node_identifier ) {
366         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT );
367     }
368
369     SortedSet<String> getUnitsUnchangedPresentOnNode( final String node_identifier ) {
370         return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT );
371     }
372
373     private void executeDolloParsimony( final boolean on_domain_presence ) {
374         reset();
375         final DolloParsimony dollo = DolloParsimony.createInstance();
376         dollo.setReturnGainLossMatrix( true );
377         dollo.setReturnInternalStates( true );
378         CharacterStateMatrix<BinaryStates> states = null;
379         if ( on_domain_presence ) {
380             states = createMatrixOfDomainPresenceOrAbsence();
381         }
382         else {
383             states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence();
384         }
385         dollo.execute( getPhylogeny(), states );
386         setGainLossMatrix( dollo.getGainLossMatrix() );
387         setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() );
388         setCost( dollo.getCost() );
389         setTotalGains( dollo.getTotalGains() );
390         setTotalLosses( dollo.getTotalLosses() );
391         setTotalUnchanged( dollo.getTotalUnchanged() );
392     }
393
394     private void executeFitchParsimony( final boolean on_domain_presence,
395                                         final boolean use_last,
396                                         final boolean randomize,
397                                         final long random_number_seed ) {
398         reset();
399         if ( use_last ) {
400             System.out.println( "   Fitch parsimony: use_last = true" );
401         }
402         final FitchParsimony<BinaryStates> fitch = new FitchParsimony<BinaryStates>();
403         fitch.setRandomize( randomize );
404         if ( randomize ) {
405             fitch.setRandomNumberSeed( random_number_seed );
406         }
407         fitch.setUseLast( use_last );
408         fitch.setReturnGainLossMatrix( true );
409         fitch.setReturnInternalStates( true );
410         CharacterStateMatrix<BinaryStates> states = null;
411         if ( on_domain_presence ) {
412             states = createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
413         }
414         else {
415             states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
416         }
417         fitch.execute( getPhylogeny(), states, true );
418         setGainLossMatrix( fitch.getGainLossMatrix() );
419         setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() );
420         setCost( fitch.getCost() );
421         setTotalGains( fitch.getTotalGains() );
422         setTotalLosses( fitch.getTotalLosses() );
423         setTotalUnchanged( fitch.getTotalUnchanged() );
424     }
425
426     private void executeFitchParsimonyOnSecondaryFeatures( final boolean use_last,
427                                                            final boolean randomize,
428                                                            final long random_number_seed ) {
429         reset();
430         if ( use_last ) {
431             System.out.println( "   Fitch parsimony: use_last = true" );
432         }
433         final FitchParsimony<BinaryStates> fitch = new FitchParsimony<BinaryStates>();
434         fitch.setRandomize( randomize );
435         if ( randomize ) {
436             fitch.setRandomNumberSeed( random_number_seed );
437         }
438         fitch.setUseLast( use_last );
439         fitch.setReturnGainLossMatrix( true );
440         fitch.setReturnInternalStates( true );
441         final Map<String, Set<String>> map = getDomainIdToSecondaryFeaturesMap();
442         final Map<String, String> newmap = new HashMap<String, String>();
443         final Iterator<Entry<String, Set<String>>> it = map.entrySet().iterator();
444         while ( it.hasNext() ) {
445             final Map.Entry<String, Set<String>> pair = it.next();
446             if ( pair.getValue().size() != 1 ) {
447                 throw new IllegalArgumentException( pair.getKey() + " mapps to " + pair.getValue().size() + " items" );
448             }
449             newmap.put( pair.getKey(), ( String ) pair.getValue().toArray()[ 0 ] );
450         }
451         final CharacterStateMatrix<BinaryStates> states = createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList(),
452                                                                                                                                   newmap );
453         fitch.execute( getPhylogeny(), states, true );
454         setGainLossMatrix( fitch.getGainLossMatrix() );
455         setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() );
456         setCost( fitch.getCost() );
457         setTotalGains( fitch.getTotalGains() );
458         setTotalLosses( fitch.getTotalLosses() );
459         setTotalUnchanged( fitch.getTotalUnchanged() );
460     }
461
462     private Map<String, Set<String>> getDomainIdToSecondaryFeaturesMap() {
463         return _domain_id_to_secondary_features_map;
464     }
465
466     private List<GenomeWideCombinableDomains> getGenomeWideCombinableDomainsList() {
467         return _gwcd_list;
468     }
469
470     private Phylogeny getPhylogeny() {
471         return _phylogeny;
472     }
473
474     private SortedSet<String> getPositiveFilter() {
475         return _positive_filter;
476     }
477
478     private void init() {
479         setDomainIdToSecondaryFeaturesMap( null );
480         setPositiveFilter( null );
481         reset();
482     }
483
484     private void reset() {
485         setGainLossMatrix( null );
486         setBinaryInternalStatesMatrix( null );
487         setCost( -1 );
488         setTotalGains( -1 );
489         setTotalLosses( -1 );
490         setTotalUnchanged( -1 );
491     }
492
493     private void setBinaryInternalStatesMatrix( final CharacterStateMatrix<BinaryStates> binary_states_matrix ) {
494         _binary_internal_states_matrix = binary_states_matrix;
495     }
496
497     private void setCost( final int cost ) {
498         _cost = cost;
499     }
500
501     private void setDomainIdToSecondaryFeaturesMap( final Map<String, Set<String>> domain_id_to_secondary_features_map ) {
502         _domain_id_to_secondary_features_map = domain_id_to_secondary_features_map;
503     }
504
505     private void setGainLossMatrix( final CharacterStateMatrix<GainLossStates> gain_loss_matrix ) {
506         _gain_loss_matrix = gain_loss_matrix;
507     }
508
509     private void setPositiveFilter( final SortedSet<String> positive_filter ) {
510         _positive_filter = positive_filter;
511     }
512
513     private void setTotalGains( final int total_gains ) {
514         _total_gains = total_gains;
515     }
516
517     private void setTotalLosses( final int total_losses ) {
518         _total_losses = total_losses;
519     }
520
521     private void setTotalUnchanged( final int total_unchanged ) {
522         _total_unchanged = total_unchanged;
523     }
524
525     public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny ) {
526         return new DomainParsimonyCalculator( phylogeny );
527     }
528
529     public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny,
530                                                             final List<GenomeWideCombinableDomains> gwcd_list ) {
531         if ( phylogeny.getNumberOfExternalNodes() != gwcd_list.size() ) {
532             throw new IllegalArgumentException( "number of external nodes [" + phylogeny.getNumberOfExternalNodes()
533                     + "] does not equal size of genome wide combinable domains list [" + gwcd_list.size() + "]" );
534         }
535         return new DomainParsimonyCalculator( phylogeny, gwcd_list );
536     }
537
538     public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny,
539                                                             final List<GenomeWideCombinableDomains> gwcd_list,
540                                                             final Map<String, Set<String>> domain_id_to_secondary_features_map ) {
541         if ( phylogeny.getNumberOfExternalNodes() != gwcd_list.size() ) {
542             throw new IllegalArgumentException( "size of external nodes does not equal size of genome wide combinable domains list" );
543         }
544         return new DomainParsimonyCalculator( phylogeny, gwcd_list, domain_id_to_secondary_features_map );
545     }
546
547     public static CharacterStateMatrix<BinaryStates> createMatrixOfBinaryDomainCombinationPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list ) {
548         if ( gwcd_list.isEmpty() ) {
549             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
550         }
551         final int number_of_identifiers = gwcd_list.size();
552         final SortedSet<BinaryDomainCombination> all_binary_combinations = new TreeSet<BinaryDomainCombination>();
553         final Set<BinaryDomainCombination>[] binary_combinations_per_genome = new HashSet[ number_of_identifiers ];
554         int identifier_index = 0;
555         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
556             binary_combinations_per_genome[ identifier_index ] = new HashSet<BinaryDomainCombination>();
557             for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
558                 all_binary_combinations.add( bc );
559                 binary_combinations_per_genome[ identifier_index ].add( bc );
560             }
561             ++identifier_index;
562         }
563         final int number_of_characters = all_binary_combinations.size();
564         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
565                                                                                                                                                  number_of_characters );
566         int character_index = 0;
567         for( final BinaryDomainCombination bc : all_binary_combinations ) {
568             matrix.setCharacter( character_index++, bc.toString() );
569         }
570         identifier_index = 0;
571         final Set<String> all_identifiers = new HashSet<String>();
572         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
573             final String species_id = gwcd.getSpecies().getSpeciesId();
574             if ( all_identifiers.contains( species_id ) ) {
575                 throw new AssertionError( "species [" + species_id + "] is not unique" );
576             }
577             all_identifiers.add( species_id );
578             matrix.setIdentifier( identifier_index, species_id );
579             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
580                 BinaryDomainCombination bc = null;
581                 if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) {
582                     bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
583                 }
584                 else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) {
585                     bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
586                 }
587                 else {
588                     bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
589                 }
590                 if ( binary_combinations_per_genome[ identifier_index ].contains( bc ) ) {
591                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
592                 }
593                 else {
594                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
595                 }
596             }
597             ++identifier_index;
598         }
599         return matrix;
600     }
601
602     public static CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
603                                                                                             final SortedSet<String> positive_filter ) {
604         if ( gwcd_list.isEmpty() ) {
605             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
606         }
607         if ( ( positive_filter != null ) && ( positive_filter.size() < 1 ) ) {
608             throw new IllegalArgumentException( "positive filter is empty" );
609         }
610         final int number_of_identifiers = gwcd_list.size();
611         final SortedSet<String> all_domain_ids = new TreeSet<String>();
612         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
613             for( final String domain : gwcd.getAllDomainIds() ) {
614                 all_domain_ids.add( domain );
615             }
616         }
617         int number_of_characters = all_domain_ids.size();
618         if ( positive_filter != null ) {
619             //number_of_characters = positive_filter.size(); -- bad if doms in filter but not in genomes 
620             number_of_characters = 0;
621             for( final String id : all_domain_ids ) {
622                 if ( positive_filter.contains( id ) ) {
623                     number_of_characters++;
624                 }
625             }
626         }
627         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
628                                                                                                                                                  number_of_characters );
629         int character_index = 0;
630         for( final String id : all_domain_ids ) {
631             if ( positive_filter == null ) {
632                 matrix.setCharacter( character_index++, id );
633             }
634             else {
635                 if ( positive_filter.contains( id ) ) {
636                     matrix.setCharacter( character_index++, id );
637                 }
638             }
639         }
640         int identifier_index = 0;
641         final Set<String> all_identifiers = new HashSet<String>();
642         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
643             final String species_id = gwcd.getSpecies().getSpeciesId();
644             if ( all_identifiers.contains( species_id ) ) {
645                 throw new IllegalArgumentException( "species [" + species_id + "] is not unique" );
646             }
647             all_identifiers.add( species_id );
648             matrix.setIdentifier( identifier_index, species_id );
649             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
650                 if ( ForesterUtil.isEmpty( matrix.getCharacter( ci ) ) ) {
651                     throw new RuntimeException( "this should not have happened: problem with character #" + ci );
652                 }
653                 final String id = matrix.getCharacter( ci );
654                 if ( gwcd.contains( id ) ) {
655                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
656                 }
657                 else {
658                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
659                 }
660             }
661             ++identifier_index;
662         }
663         return matrix;
664     }
665
666     public static CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
667                                                                                                                              final Map<String, String> domain_id_to_second_features_map ) {
668         if ( gwcd_list.isEmpty() ) {
669             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
670         }
671         if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) {
672             throw new IllegalArgumentException( "domain id to secondary features map is null or empty" );
673         }
674         final int number_of_identifiers = gwcd_list.size();
675         final SortedSet<BinaryDomainCombination> all_binary_combinations_mapped = new TreeSet<BinaryDomainCombination>();
676         final Set<BinaryDomainCombination>[] binary_combinations_per_genome_mapped = new HashSet[ number_of_identifiers ];
677         int identifier_index = 0;
678         final SortedSet<String> no_mappings = new TreeSet<String>();
679         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
680             binary_combinations_per_genome_mapped[ identifier_index ] = new HashSet<BinaryDomainCombination>();
681             for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
682                 final BinaryDomainCombination mapped_bc = mapBinaryDomainCombination( domain_id_to_second_features_map,
683                                                                                       bc,
684                                                                                       no_mappings );
685                 all_binary_combinations_mapped.add( mapped_bc );
686                 binary_combinations_per_genome_mapped[ identifier_index ].add( mapped_bc );
687             }
688             ++identifier_index;
689         }
690         if ( !no_mappings.isEmpty() ) {
691             ForesterUtil.programMessage( surfacing.PRG_NAME, "No mappings for the following (" + no_mappings.size()
692                     + "):" );
693             for( final String id : no_mappings ) {
694                 ForesterUtil.programMessage( surfacing.PRG_NAME, id );
695             }
696         }
697         final int number_of_characters = all_binary_combinations_mapped.size();
698         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
699                                                                                                                                                  number_of_characters );
700         int character_index = 0;
701         for( final BinaryDomainCombination bc : all_binary_combinations_mapped ) {
702             matrix.setCharacter( character_index++, bc.toString() );
703         }
704         identifier_index = 0;
705         final Set<String> all_identifiers = new HashSet<String>();
706         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
707             final String species_id = gwcd.getSpecies().getSpeciesId();
708             if ( all_identifiers.contains( species_id ) ) {
709                 throw new AssertionError( "species [" + species_id + "] is not unique" );
710             }
711             all_identifiers.add( species_id );
712             matrix.setIdentifier( identifier_index, species_id );
713             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
714                 BinaryDomainCombination bc = null;
715                 if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) {
716                     bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
717                 }
718                 else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) {
719                     bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
720                 }
721                 else {
722                     bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
723                 }
724                 if ( binary_combinations_per_genome_mapped[ identifier_index ].contains( bc ) ) {
725                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
726                 }
727                 else {
728                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
729                 }
730             }
731             ++identifier_index;
732         }
733         return matrix;
734     }
735
736     static CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list ) {
737         return createMatrixOfDomainPresenceOrAbsence( gwcd_list, null );
738     }
739
740     /**
741      * For folds instead of Pfam-domains, for example
742      * 
743      * 
744      * @param gwcd_list
745      * @return
746      */
747     static CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeaturePresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
748                                                                                                final Map<String, Set<String>> domain_id_to_second_features_map,
749                                                                                                final Map<Species, MappingResults> mapping_results_map ) {
750         if ( gwcd_list.isEmpty() ) {
751             throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
752         }
753         if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) {
754             throw new IllegalArgumentException( "domain id to secondary features map is null or empty" );
755         }
756         final int number_of_identifiers = gwcd_list.size();
757         final SortedSet<String> all_secondary_features = new TreeSet<String>();
758         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
759             int mapped = 0;
760             int not_mapped = 0;
761             for( final String domain : gwcd.getAllDomainIds() ) {
762                 if ( domain_id_to_second_features_map.containsKey( domain ) ) {
763                     all_secondary_features.addAll( domain_id_to_second_features_map.get( domain ) );
764                     mapped++;
765                 }
766                 else {
767                     not_mapped++;
768                 }
769             }
770             if ( mapping_results_map != null ) {
771                 final MappingResults mr = new MappingResults();
772                 mr.setDescription( gwcd.getSpecies().getSpeciesId() );
773                 mr.setSumOfSuccesses( mapped );
774                 mr.setSumOfFailures( not_mapped );
775                 mapping_results_map.put( gwcd.getSpecies(), mr );
776             }
777         }
778         final int number_of_characters = all_secondary_features.size();
779         final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
780                                                                                                                                                  number_of_characters );
781         int character_index = 0;
782         for( final String second_id : all_secondary_features ) {
783             matrix.setCharacter( character_index++, second_id );
784         }
785         int identifier_index = 0;
786         final Set<String> all_identifiers = new HashSet<String>();
787         for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
788             final String species_id = gwcd.getSpecies().getSpeciesId();
789             if ( all_identifiers.contains( species_id ) ) {
790                 throw new IllegalArgumentException( "species [" + species_id + "] is not unique" );
791             }
792             all_identifiers.add( species_id );
793             matrix.setIdentifier( identifier_index, species_id );
794             final Set<String> all_second_per_gwcd = new HashSet<String>();
795             for( final String domain : gwcd.getAllDomainIds() ) {
796                 if ( domain_id_to_second_features_map.containsKey( domain ) ) {
797                     all_second_per_gwcd.addAll( domain_id_to_second_features_map.get( domain ) );
798                 }
799             }
800             for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
801                 if ( all_second_per_gwcd.contains( matrix.getCharacter( ci ) ) ) {
802                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
803                 }
804                 else {
805                     matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
806                 }
807             }
808             ++identifier_index;
809         }
810         return matrix;
811     }
812
813     private static int getStateSumDeltaOnNode( final String node_identifier,
814                                                final CharacterStateMatrix<GainLossStates> gain_loss_matrix,
815                                                final GainLossStates state ) {
816         if ( gain_loss_matrix == null ) {
817             throw new RuntimeException( "no gain loss matrix has been calculated" );
818         }
819         if ( ForesterUtil.isEmpty( node_identifier ) ) {
820             throw new IllegalArgumentException( "node identifier must not be empty" );
821         }
822         if ( gain_loss_matrix.isEmpty() ) {
823             throw new RuntimeException( "gain loss matrix is empty" );
824         }
825         int sum = 0;
826         final int id_index = gain_loss_matrix.getIdentifierIndex( node_identifier );
827         for( int c = 0; c < gain_loss_matrix.getNumberOfCharacters(); ++c ) {
828             if ( gain_loss_matrix.getState( id_index, c ) == state ) {
829                 ++sum;
830             }
831         }
832         return sum;
833     }
834
835     private static SortedSet<String> getUnitsDeltaOnNode( final String node_identifier,
836                                                           final CharacterStateMatrix<GainLossStates> gain_loss_matrix,
837                                                           final GainLossStates state ) {
838         if ( gain_loss_matrix == null ) {
839             throw new RuntimeException( "no gain loss matrix has been calculated" );
840         }
841         if ( ForesterUtil.isEmpty( node_identifier ) ) {
842             throw new IllegalArgumentException( "node identifier must not be empty" );
843         }
844         if ( gain_loss_matrix.isEmpty() ) {
845             throw new RuntimeException( "gain loss matrix is empty" );
846         }
847         final SortedSet<String> d = new TreeSet<String>();
848         final int id_index = gain_loss_matrix.getIdentifierIndex( node_identifier );
849         for( int c = 0; c < gain_loss_matrix.getNumberOfCharacters(); ++c ) {
850             if ( gain_loss_matrix.getState( id_index, c ) == state ) {
851                 if ( d.contains( gain_loss_matrix.getCharacter( c ) ) ) {
852                     throw new AssertionError( "this should not have happended: character ["
853                             + gain_loss_matrix.getCharacter( c ) + "] already in set" );
854                 }
855                 d.add( gain_loss_matrix.getCharacter( c ) );
856             }
857         }
858         return d;
859     }
860
861     private static BinaryDomainCombination mapBinaryDomainCombination( final Map<String, String> domain_id_to_second_features_map,
862                                                                        final BinaryDomainCombination bc,
863                                                                        final SortedSet<String> no_mappings ) {
864         String id0 = "";
865         String id1 = "";
866         if ( !domain_id_to_second_features_map.containsKey( bc.getId0() ) ) {
867             no_mappings.add( bc.getId0() );
868             id0 = bc.getId0();
869         }
870         else {
871             id0 = domain_id_to_second_features_map.get( bc.getId0() );
872         }
873         if ( !domain_id_to_second_features_map.containsKey( bc.getId1() ) ) {
874             no_mappings.add( bc.getId1() );
875             id1 = bc.getId1();
876         }
877         else {
878             id1 = domain_id_to_second_features_map.get( bc.getId1() );
879         }
880         return new BasicBinaryDomainCombination( id0, id1 );
881     }
882 }