3 // FORESTER -- software libraries and applications
4 // for evolutionary biology research and applications.
6 // Copyright (C) 2008-2009 Christian M. Zmasek
7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 // Contact: phylosoft @ gmail . com
25 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
27 package org.forester.surfacing;
29 import java.util.HashMap;
30 import java.util.HashSet;
31 import java.util.Iterator;
32 import java.util.List;
34 import java.util.Map.Entry;
36 import java.util.SortedSet;
37 import java.util.TreeSet;
39 import org.forester.application.surfacing;
40 import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix;
41 import org.forester.evoinference.matrix.character.CharacterStateMatrix;
42 import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates;
43 import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates;
44 import org.forester.evoinference.parsimony.DolloParsimony;
45 import org.forester.evoinference.parsimony.FitchParsimony;
46 import org.forester.phylogeny.Phylogeny;
47 import org.forester.phylogeny.PhylogenyNode;
48 import org.forester.phylogeny.data.BinaryCharacters;
49 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
50 import org.forester.protein.BinaryDomainCombination;
51 import org.forester.protein.BinaryDomainCombination.DomainCombinationType;
52 import org.forester.species.Species;
53 import org.forester.util.ForesterUtil;
55 public final class DomainParsimonyCalculator {
57 private static final String TYPE_FORBINARY_CHARACTERS = "parsimony inferred";
58 private CharacterStateMatrix<GainLossStates> _gain_loss_matrix;
59 private CharacterStateMatrix<BinaryStates> _binary_internal_states_matrix;
60 private final List<GenomeWideCombinableDomains> _gwcd_list;
61 private final Phylogeny _phylogeny;
62 private int _total_losses;
63 private int _total_gains;
64 private int _total_unchanged;
66 private Map<String, Set<String>> _domain_id_to_secondary_features_map;
67 private SortedSet<String> _positive_filter;
69 private DomainParsimonyCalculator( final Phylogeny phylogeny ) {
71 _phylogeny = phylogeny;
75 private DomainParsimonyCalculator( final Phylogeny phylogeny, final List<GenomeWideCombinableDomains> gwcd_list ) {
77 _phylogeny = phylogeny;
78 _gwcd_list = gwcd_list;
81 private DomainParsimonyCalculator( final Phylogeny phylogeny,
82 final List<GenomeWideCombinableDomains> gwcd_list,
83 final Map<String, Set<String>> domain_id_to_secondary_features_map ) {
85 _phylogeny = phylogeny;
86 _gwcd_list = gwcd_list;
87 setDomainIdToSecondaryFeaturesMap( domain_id_to_secondary_features_map );
90 int calculateNumberOfBinaryDomainCombination() {
91 if ( getGenomeWideCombinableDomainsList().isEmpty() ) {
92 throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
94 final Set<BinaryDomainCombination> all_binary_combinations = new HashSet<BinaryDomainCombination>();
95 for( final GenomeWideCombinableDomains gwcd : getGenomeWideCombinableDomainsList() ) {
96 for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
97 all_binary_combinations.add( bc );
100 return all_binary_combinations.size();
103 CharacterStateMatrix<BinaryStates> createMatrixOfBinaryDomainCombinationPresenceOrAbsence() {
104 return createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
107 CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence() {
108 return createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList(), getPositiveFilter() );
111 CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeaturePresenceOrAbsence( final Map<Species, MappingResults> mapping_results_map ) {
112 return createMatrixOfSecondaryFeaturePresenceOrAbsence( getGenomeWideCombinableDomainsList(),
113 getDomainIdToSecondaryFeaturesMap(),
114 mapping_results_map );
117 Phylogeny decoratePhylogenyWithDomains( final Phylogeny phylogeny ) {
118 for( final PhylogenyNodeIterator it = phylogeny.iteratorPostorder(); it.hasNext(); ) {
119 final PhylogenyNode node = it.next();
120 final String node_identifier = node.getName();
121 final BinaryCharacters bc = new BinaryCharacters( getUnitsOnNode( node_identifier ),
122 getUnitsGainedOnNode( node_identifier ),
123 getUnitsLostOnNode( node_identifier ),
124 TYPE_FORBINARY_CHARACTERS,
125 getSumOfPresentOnNode( node_identifier ),
126 getSumOfGainsOnNode( node_identifier ),
127 getSumOfLossesOnNode( node_identifier ) );
128 node.getNodeData().setBinaryCharacters( bc );
133 private void executeDolloParsimony( final boolean on_domain_presence ) {
135 final DolloParsimony dollo = DolloParsimony.createInstance();
136 dollo.setReturnGainLossMatrix( true );
137 dollo.setReturnInternalStates( true );
138 CharacterStateMatrix<BinaryStates> states = null;
139 if ( on_domain_presence ) {
140 states = createMatrixOfDomainPresenceOrAbsence();
143 states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence();
145 dollo.execute( getPhylogeny(), states );
146 setGainLossMatrix( dollo.getGainLossMatrix() );
147 setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() );
148 setCost( dollo.getCost() );
149 setTotalGains( dollo.getTotalGains() );
150 setTotalLosses( dollo.getTotalLosses() );
151 setTotalUnchanged( dollo.getTotalUnchanged() );
154 public void executeDolloParsimonyOnBinaryDomainCombintionPresence() {
155 executeDolloParsimony( false );
158 public void executeDolloParsimonyOnDomainPresence() {
159 executeDolloParsimony( true );
162 public void executeDolloParsimonyOnDomainPresence( final SortedSet<String> positive_filter ) {
163 setPositiveFilter( positive_filter );
164 executeDolloParsimony( true );
165 setPositiveFilter( null );
168 public void executeDolloParsimonyOnSecondaryFeatures( final Map<Species, MappingResults> mapping_results_map ) {
169 if ( getDomainIdToSecondaryFeaturesMap() == null ) {
170 throw new RuntimeException( "Domain id to secondary features map has apparently not been set" );
173 final DolloParsimony dollo = DolloParsimony.createInstance();
174 dollo.setReturnGainLossMatrix( true );
175 dollo.setReturnInternalStates( true );
176 final CharacterStateMatrix<BinaryStates> states = createMatrixOfSecondaryFeaturePresenceOrAbsence( mapping_results_map );
177 dollo.execute( getPhylogeny(), states );
178 setGainLossMatrix( dollo.getGainLossMatrix() );
179 setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() );
180 setCost( dollo.getCost() );
181 setTotalGains( dollo.getTotalGains() );
182 setTotalLosses( dollo.getTotalLosses() );
183 setTotalUnchanged( dollo.getTotalUnchanged() );
186 private void executeFitchParsimony( final boolean on_domain_presence,
187 final boolean use_last,
188 final boolean randomize,
189 final long random_number_seed ) {
192 System.out.println( " Fitch parsimony: use_last = true" );
194 final FitchParsimony<BinaryStates> fitch = new FitchParsimony<BinaryStates>();
195 fitch.setRandomize( randomize );
197 fitch.setRandomNumberSeed( random_number_seed );
199 fitch.setUseLast( use_last );
200 fitch.setReturnGainLossMatrix( true );
201 fitch.setReturnInternalStates( true );
202 CharacterStateMatrix<BinaryStates> states = null;
203 if ( on_domain_presence ) {
204 states = createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
207 states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
209 fitch.execute( getPhylogeny(), states, true );
210 setGainLossMatrix( fitch.getGainLossMatrix() );
211 setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() );
212 setCost( fitch.getCost() );
213 setTotalGains( fitch.getTotalGains() );
214 setTotalLosses( fitch.getTotalLosses() );
215 setTotalUnchanged( fitch.getTotalUnchanged() );
218 private void executeFitchParsimonyOnSecondaryFeatures( final boolean use_last,
219 final boolean randomize,
220 final long random_number_seed ) {
223 System.out.println( " Fitch parsimony: use_last = true" );
225 final FitchParsimony<BinaryStates> fitch = new FitchParsimony<BinaryStates>();
226 fitch.setRandomize( randomize );
228 fitch.setRandomNumberSeed( random_number_seed );
230 fitch.setUseLast( use_last );
231 fitch.setReturnGainLossMatrix( true );
232 fitch.setReturnInternalStates( true );
233 final Map<String, Set<String>> map = getDomainIdToSecondaryFeaturesMap();
234 final Map<String, String> newmap = new HashMap<String, String>();
235 final Iterator<Entry<String, Set<String>>> it = map.entrySet().iterator();
236 while ( it.hasNext() ) {
237 final Map.Entry<String, Set<String>> pair = it.next();
238 if ( pair.getValue().size() != 1 ) {
239 throw new IllegalArgumentException( pair.getKey() + " mapps to " + pair.getValue().size() + " items" );
241 newmap.put( pair.getKey(), ( String ) pair.getValue().toArray()[ 0 ] );
243 final CharacterStateMatrix<BinaryStates> states = createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList(),
245 fitch.execute( getPhylogeny(), states, true );
246 setGainLossMatrix( fitch.getGainLossMatrix() );
247 setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() );
248 setCost( fitch.getCost() );
249 setTotalGains( fitch.getTotalGains() );
250 setTotalLosses( fitch.getTotalLosses() );
251 setTotalUnchanged( fitch.getTotalUnchanged() );
254 public void executeFitchParsimonyOnBinaryDomainCombintion( final boolean use_last ) {
255 executeFitchParsimony( false, use_last, false, 0 );
258 public void executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( final boolean use_last ) {
259 executeFitchParsimonyOnSecondaryFeatures( use_last, false, 0 );
262 public void executeFitchParsimonyOnBinaryDomainCombintion( final long random_number_seed ) {
263 executeFitchParsimony( false, false, true, random_number_seed );
266 public void executeFitchParsimonyOnDomainPresence( final boolean use_last ) {
267 executeFitchParsimony( true, use_last, false, 0 );
270 public void executeFitchParsimonyOnDomainPresence( final long random_number_seed ) {
271 executeFitchParsimony( true, false, true, random_number_seed );
274 public void executeOnGivenBinaryStatesMatrix( final CharacterStateMatrix<BinaryStates> binary_states_matrix,
275 final String[] character_labels ) {
277 if ( binary_states_matrix.getNumberOfCharacters() != character_labels.length ) {
278 throw new IllegalArgumentException( "binary states matrix number of characters is not equal to the number of character labels provided" );
280 if ( binary_states_matrix.getNumberOfIdentifiers() != getPhylogeny().getNumberOfBranches() ) {
281 throw new IllegalArgumentException( "binary states matrix number of identifiers is not equal to the number of tree nodes provided" );
283 final CharacterStateMatrix<GainLossStates> gl_matrix = new BasicCharacterStateMatrix<GainLossStates>( binary_states_matrix
284 .getNumberOfIdentifiers(),
286 .getNumberOfCharacters() );
288 int total_losses = 0;
289 int total_unchanged = 0;
291 for( final PhylogenyNodeIterator it = getPhylogeny().iteratorPostorder(); it.hasNext(); ) {
292 gl_matrix.setIdentifier( i++, it.next().getName() );
294 for( int c = 0; c < character_labels.length; ++c ) {
295 gl_matrix.setCharacter( c, character_labels[ c ] );
296 final PhylogenyNodeIterator it = getPhylogeny().iteratorPostorder();
297 while ( it.hasNext() ) {
298 final PhylogenyNode node = it.next();
299 final String name = node.getName();
300 final BinaryStates bin_state = binary_states_matrix.getState( binary_states_matrix
301 .getIdentifierIndex( name ), c );
302 final PhylogenyNode parent_node = getPhylogeny().getNode( name ).getParent();
303 GainLossStates gl_state = null;
304 if ( node.isRoot() ) {
306 if ( bin_state == BinaryStates.ABSENT ) {
307 gl_state = GainLossStates.UNCHANGED_ABSENT;
310 gl_state = GainLossStates.UNCHANGED_PRESENT;
314 final BinaryStates parent_bin_state = binary_states_matrix.getState( binary_states_matrix
315 .getIdentifierIndex( parent_node.getName() ), c );
316 if ( bin_state == BinaryStates.ABSENT ) {
317 if ( parent_bin_state == BinaryStates.ABSENT ) {
319 gl_state = GainLossStates.UNCHANGED_ABSENT;
323 gl_state = GainLossStates.LOSS;
327 if ( parent_bin_state == BinaryStates.ABSENT ) {
329 gl_state = GainLossStates.GAIN;
333 gl_state = GainLossStates.UNCHANGED_PRESENT;
337 gl_matrix.setState( name, c, gl_state );
340 setTotalGains( total_gains );
341 setTotalLosses( total_losses );
342 setTotalUnchanged( total_unchanged );
343 setCost( total_gains + total_losses );
344 setGainLossMatrix( gl_matrix );
347 public int getCost() {
351 private Map<String, Set<String>> getDomainIdToSecondaryFeaturesMap() {
352 return _domain_id_to_secondary_features_map;
355 public CharacterStateMatrix<Integer> getGainLossCountsMatrix() {
356 final CharacterStateMatrix<Integer> matrix = new BasicCharacterStateMatrix<Integer>( getGainLossMatrix()
357 .getNumberOfIdentifiers(), 3 );
358 for( int i = 0; i < getGainLossMatrix().getNumberOfIdentifiers(); ++i ) {
359 matrix.setIdentifier( i, getGainLossMatrix().getIdentifier( i ) );
361 matrix.setCharacter( 0, "GAINS" );
362 matrix.setCharacter( 1, "LOSSES" );
363 matrix.setCharacter( 2, "NET" );
364 for( int i = 0; i < getGainLossMatrix().getNumberOfIdentifiers(); ++i ) {
367 for( int c = 0; c < getGainLossMatrix().getNumberOfCharacters(); ++c ) {
368 final GainLossStates s = getGainLossMatrix().getState( i, c );
369 if ( s == GainLossStates.GAIN ) {
372 else if ( s == GainLossStates.LOSS ) {
376 matrix.setState( i, 0, gains );
377 matrix.setState( i, 1, losses );
378 matrix.setState( i, 2, gains - losses );
383 public CharacterStateMatrix<GainLossStates> getGainLossMatrix() {
384 return _gain_loss_matrix;
387 private List<GenomeWideCombinableDomains> getGenomeWideCombinableDomainsList() {
391 public CharacterStateMatrix<BinaryStates> getInternalStatesMatrix() {
392 return _binary_internal_states_matrix;
395 public int getNetGainsOnNode( final String node_identifier ) {
396 if ( getGainLossMatrix() == null ) {
397 throw new RuntimeException( "no gain loss matrix has been calculated" );
400 final int id_index = getGainLossMatrix().getIdentifierIndex( node_identifier );
401 for( int c = 0; c < getGainLossMatrix().getNumberOfCharacters(); ++c ) {
402 if ( getGainLossMatrix().getState( id_index, c ) == GainLossStates.GAIN ) {
405 else if ( getGainLossMatrix().getState( id_index, c ) == GainLossStates.LOSS ) {
412 private Phylogeny getPhylogeny() {
416 private SortedSet<String> getPositiveFilter() {
417 return _positive_filter;
420 public int getSumOfGainsOnNode( final String node_identifier ) {
421 return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN );
424 public int getSumOfLossesOnNode( final String node_identifier ) {
425 return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.LOSS );
428 public int getSumOfPresentOnNode( final String node_identifier ) {
429 return getSumOfGainsOnNode( node_identifier ) + getSumOfUnchangedPresentOnNode( node_identifier );
432 int getSumOfUnchangedAbsentOnNode( final String node_identifier ) {
433 return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT );
436 int getSumOfUnchangedOnNode( final String node_identifier ) {
437 return getSumOfUnchangedPresentOnNode( node_identifier ) + getSumOfUnchangedAbsentOnNode( node_identifier );
440 int getSumOfUnchangedPresentOnNode( final String node_identifier ) {
441 return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT );
444 public int getTotalGains() {
448 public int getTotalLosses() {
449 return _total_losses;
452 public int getTotalUnchanged() {
453 return _total_unchanged;
456 public SortedSet<String> getUnitsGainedOnNode( final String node_identifier ) {
457 return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN );
460 public SortedSet<String> getUnitsLostOnNode( final String node_identifier ) {
461 return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.LOSS );
464 public SortedSet<String> getUnitsOnNode( final String node_identifier ) {
465 final SortedSet<String> present = getUnitsGainedOnNode( node_identifier );
466 present.addAll( getUnitsUnchangedPresentOnNode( node_identifier ) );
470 SortedSet<String> getUnitsUnchangedAbsentOnNode( final String node_identifier ) {
471 return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT );
474 SortedSet<String> getUnitsUnchangedPresentOnNode( final String node_identifier ) {
475 return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT );
478 private void init() {
479 setDomainIdToSecondaryFeaturesMap( null );
480 setPositiveFilter( null );
484 private void reset() {
485 setGainLossMatrix( null );
486 setBinaryInternalStatesMatrix( null );
489 setTotalLosses( -1 );
490 setTotalUnchanged( -1 );
493 private void setBinaryInternalStatesMatrix( final CharacterStateMatrix<BinaryStates> binary_states_matrix ) {
494 _binary_internal_states_matrix = binary_states_matrix;
497 private void setCost( final int cost ) {
501 private void setDomainIdToSecondaryFeaturesMap( final Map<String, Set<String>> domain_id_to_secondary_features_map ) {
502 _domain_id_to_secondary_features_map = domain_id_to_secondary_features_map;
505 private void setGainLossMatrix( final CharacterStateMatrix<GainLossStates> gain_loss_matrix ) {
506 _gain_loss_matrix = gain_loss_matrix;
509 private void setPositiveFilter( final SortedSet<String> positive_filter ) {
510 _positive_filter = positive_filter;
513 private void setTotalGains( final int total_gains ) {
514 _total_gains = total_gains;
517 private void setTotalLosses( final int total_losses ) {
518 _total_losses = total_losses;
521 private void setTotalUnchanged( final int total_unchanged ) {
522 _total_unchanged = total_unchanged;
525 public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny ) {
526 return new DomainParsimonyCalculator( phylogeny );
529 public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny,
530 final List<GenomeWideCombinableDomains> gwcd_list ) {
531 if ( phylogeny.getNumberOfExternalNodes() != gwcd_list.size() ) {
532 throw new IllegalArgumentException( "number of external nodes [" + phylogeny.getNumberOfExternalNodes()
533 + "] does not equal size of genome wide combinable domains list [" + gwcd_list.size() + "]" );
535 return new DomainParsimonyCalculator( phylogeny, gwcd_list );
538 public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny,
539 final List<GenomeWideCombinableDomains> gwcd_list,
540 final Map<String, Set<String>> domain_id_to_secondary_features_map ) {
541 if ( phylogeny.getNumberOfExternalNodes() != gwcd_list.size() ) {
542 throw new IllegalArgumentException( "size of external nodes does not equal size of genome wide combinable domains list" );
544 return new DomainParsimonyCalculator( phylogeny, gwcd_list, domain_id_to_secondary_features_map );
548 * For folds instead of Pfam-domains, for example
554 static CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeaturePresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
555 final Map<String, Set<String>> domain_id_to_second_features_map,
556 final Map<Species, MappingResults> mapping_results_map ) {
557 if ( gwcd_list.isEmpty() ) {
558 throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
560 if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) {
561 throw new IllegalArgumentException( "domain id to secondary features map is null or empty" );
563 final int number_of_identifiers = gwcd_list.size();
564 final SortedSet<String> all_secondary_features = new TreeSet<String>();
565 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
568 for( final String domain : gwcd.getAllDomainIds() ) {
569 if ( domain_id_to_second_features_map.containsKey( domain ) ) {
570 all_secondary_features.addAll( domain_id_to_second_features_map.get( domain ) );
577 if ( mapping_results_map != null ) {
578 final MappingResults mr = new MappingResults();
579 mr.setDescription( gwcd.getSpecies().getSpeciesId() );
580 mr.setSumOfSuccesses( mapped );
581 mr.setSumOfFailures( not_mapped );
582 mapping_results_map.put( gwcd.getSpecies(), mr );
585 final int number_of_characters = all_secondary_features.size();
586 final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
587 number_of_characters );
588 int character_index = 0;
589 for( final String second_id : all_secondary_features ) {
590 matrix.setCharacter( character_index++, second_id );
592 int identifier_index = 0;
593 final Set<String> all_identifiers = new HashSet<String>();
594 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
595 final String species_id = gwcd.getSpecies().getSpeciesId();
596 if ( all_identifiers.contains( species_id ) ) {
597 throw new IllegalArgumentException( "species [" + species_id + "] is not unique" );
599 all_identifiers.add( species_id );
600 matrix.setIdentifier( identifier_index, species_id );
601 final Set<String> all_second_per_gwcd = new HashSet<String>();
602 for( final String domain : gwcd.getAllDomainIds() ) {
603 if ( domain_id_to_second_features_map.containsKey( domain ) ) {
604 all_second_per_gwcd.addAll( domain_id_to_second_features_map.get( domain ) );
607 for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
608 if ( all_second_per_gwcd.contains( matrix.getCharacter( ci ) ) ) {
609 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
612 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
620 public static CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
621 final Map<String, String> domain_id_to_second_features_map ) {
622 if ( gwcd_list.isEmpty() ) {
623 throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
625 if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) {
626 throw new IllegalArgumentException( "domain id to secondary features map is null or empty" );
628 final int number_of_identifiers = gwcd_list.size();
629 final SortedSet<BinaryDomainCombination> all_binary_combinations_mapped = new TreeSet<BinaryDomainCombination>();
630 final Set<BinaryDomainCombination>[] binary_combinations_per_genome_mapped = new HashSet[ number_of_identifiers ];
631 int identifier_index = 0;
632 final SortedSet<String> no_mappings = new TreeSet<String>();
633 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
634 binary_combinations_per_genome_mapped[ identifier_index ] = new HashSet<BinaryDomainCombination>();
635 for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
636 final BinaryDomainCombination mapped_bc = mapBinaryDomainCombination( domain_id_to_second_features_map,
639 all_binary_combinations_mapped.add( mapped_bc );
640 binary_combinations_per_genome_mapped[ identifier_index ].add( mapped_bc );
644 if ( !no_mappings.isEmpty() ) {
645 ForesterUtil.programMessage( surfacing.PRG_NAME, "No mappings for the following (" + no_mappings.size()
647 for( final String id : no_mappings ) {
648 ForesterUtil.programMessage( surfacing.PRG_NAME, id );
651 final int number_of_characters = all_binary_combinations_mapped.size();
652 final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
653 number_of_characters );
654 int character_index = 0;
655 for( final BinaryDomainCombination bc : all_binary_combinations_mapped ) {
656 matrix.setCharacter( character_index++, bc.toString() );
658 identifier_index = 0;
659 final Set<String> all_identifiers = new HashSet<String>();
660 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
661 final String species_id = gwcd.getSpecies().getSpeciesId();
662 if ( all_identifiers.contains( species_id ) ) {
663 throw new AssertionError( "species [" + species_id + "] is not unique" );
665 all_identifiers.add( species_id );
666 matrix.setIdentifier( identifier_index, species_id );
667 for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
668 BinaryDomainCombination bc = null;
669 if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) {
670 bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
672 else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) {
673 bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
676 bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
678 if ( binary_combinations_per_genome_mapped[ identifier_index ].contains( bc ) ) {
679 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
682 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
690 private static BinaryDomainCombination mapBinaryDomainCombination( final Map<String, String> domain_id_to_second_features_map,
691 final BinaryDomainCombination bc,
692 final SortedSet<String> no_mappings ) {
695 if ( !domain_id_to_second_features_map.containsKey( bc.getId0() ) ) {
696 no_mappings.add( bc.getId0() );
700 id0 = domain_id_to_second_features_map.get( bc.getId0() );
702 if ( !domain_id_to_second_features_map.containsKey( bc.getId1() ) ) {
703 no_mappings.add( bc.getId1() );
707 id1 = domain_id_to_second_features_map.get( bc.getId1() );
709 return new BasicBinaryDomainCombination( id0, id1 );
712 public static CharacterStateMatrix<BinaryStates> createMatrixOfBinaryDomainCombinationPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list ) {
713 if ( gwcd_list.isEmpty() ) {
714 throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
716 final int number_of_identifiers = gwcd_list.size();
717 final SortedSet<BinaryDomainCombination> all_binary_combinations = new TreeSet<BinaryDomainCombination>();
718 final Set<BinaryDomainCombination>[] binary_combinations_per_genome = new HashSet[ number_of_identifiers ];
719 int identifier_index = 0;
720 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
721 binary_combinations_per_genome[ identifier_index ] = new HashSet<BinaryDomainCombination>();
722 for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
723 all_binary_combinations.add( bc );
724 binary_combinations_per_genome[ identifier_index ].add( bc );
728 final int number_of_characters = all_binary_combinations.size();
729 final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
730 number_of_characters );
731 int character_index = 0;
732 for( final BinaryDomainCombination bc : all_binary_combinations ) {
733 matrix.setCharacter( character_index++, bc.toString() );
735 identifier_index = 0;
736 final Set<String> all_identifiers = new HashSet<String>();
737 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
738 final String species_id = gwcd.getSpecies().getSpeciesId();
739 if ( all_identifiers.contains( species_id ) ) {
740 throw new AssertionError( "species [" + species_id + "] is not unique" );
742 all_identifiers.add( species_id );
743 matrix.setIdentifier( identifier_index, species_id );
744 for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
745 BinaryDomainCombination bc = null;
746 if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) {
747 bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
749 else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) {
750 bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
753 bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
755 if ( binary_combinations_per_genome[ identifier_index ].contains( bc ) ) {
756 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
759 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
767 static CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list ) {
768 return createMatrixOfDomainPresenceOrAbsence( gwcd_list, null );
771 public static CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
772 final SortedSet<String> positive_filter ) {
773 if ( gwcd_list.isEmpty() ) {
774 throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
776 if ( ( positive_filter != null ) && ( positive_filter.size() < 1 ) ) {
777 throw new IllegalArgumentException( "positive filter is empty" );
779 final int number_of_identifiers = gwcd_list.size();
780 final SortedSet<String> all_domain_ids = new TreeSet<String>();
781 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
782 for( final String domain : gwcd.getAllDomainIds() ) {
783 all_domain_ids.add( domain );
786 int number_of_characters = all_domain_ids.size();
787 if ( positive_filter != null ) {
788 //number_of_characters = positive_filter.size(); -- bad if doms in filter but not in genomes
789 number_of_characters = 0;
790 for( final String id : all_domain_ids ) {
791 if ( positive_filter.contains( id ) ) {
792 number_of_characters++;
796 final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
797 number_of_characters );
798 int character_index = 0;
799 for( final String id : all_domain_ids ) {
800 if ( positive_filter == null ) {
801 matrix.setCharacter( character_index++, id );
804 if ( positive_filter.contains( id ) ) {
805 matrix.setCharacter( character_index++, id );
809 int identifier_index = 0;
810 final Set<String> all_identifiers = new HashSet<String>();
811 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
812 final String species_id = gwcd.getSpecies().getSpeciesId();
813 if ( all_identifiers.contains( species_id ) ) {
814 throw new IllegalArgumentException( "species [" + species_id + "] is not unique" );
816 all_identifiers.add( species_id );
817 matrix.setIdentifier( identifier_index, species_id );
818 for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
819 if ( ForesterUtil.isEmpty( matrix.getCharacter( ci ) ) ) {
820 throw new RuntimeException( "this should not have happened: problem with character #" + ci );
822 final String id = matrix.getCharacter( ci );
823 if ( gwcd.contains( id ) ) {
824 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
827 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
835 private static int getStateSumDeltaOnNode( final String node_identifier,
836 final CharacterStateMatrix<GainLossStates> gain_loss_matrix,
837 final GainLossStates state ) {
838 if ( gain_loss_matrix == null ) {
839 throw new RuntimeException( "no gain loss matrix has been calculated" );
841 if ( ForesterUtil.isEmpty( node_identifier ) ) {
842 throw new IllegalArgumentException( "node identifier must not be empty" );
844 if ( gain_loss_matrix.isEmpty() ) {
845 throw new RuntimeException( "gain loss matrix is empty" );
848 final int id_index = gain_loss_matrix.getIdentifierIndex( node_identifier );
849 for( int c = 0; c < gain_loss_matrix.getNumberOfCharacters(); ++c ) {
850 if ( gain_loss_matrix.getState( id_index, c ) == state ) {
857 private static SortedSet<String> getUnitsDeltaOnNode( final String node_identifier,
858 final CharacterStateMatrix<GainLossStates> gain_loss_matrix,
859 final GainLossStates state ) {
860 if ( gain_loss_matrix == null ) {
861 throw new RuntimeException( "no gain loss matrix has been calculated" );
863 if ( ForesterUtil.isEmpty( node_identifier ) ) {
864 throw new IllegalArgumentException( "node identifier must not be empty" );
866 if ( gain_loss_matrix.isEmpty() ) {
867 throw new RuntimeException( "gain loss matrix is empty" );
869 final SortedSet<String> d = new TreeSet<String>();
870 final int id_index = gain_loss_matrix.getIdentifierIndex( node_identifier );
871 for( int c = 0; c < gain_loss_matrix.getNumberOfCharacters(); ++c ) {
872 if ( gain_loss_matrix.getState( id_index, c ) == state ) {
873 if ( d.contains( gain_loss_matrix.getCharacter( c ) ) ) {
874 throw new AssertionError( "this should not have happended: character ["
875 + gain_loss_matrix.getCharacter( c ) + "] already in set" );
877 d.add( gain_loss_matrix.getCharacter( c ) );