3 // FORESTER -- software libraries and applications
4 // for evolutionary biology research and applications.
6 // Copyright (C) 2008-2009 Christian M. Zmasek
7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 // Contact: phylosoft @ gmail . com
25 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
27 package org.forester.surfacing;
29 import java.util.HashMap;
30 import java.util.HashSet;
31 import java.util.Iterator;
32 import java.util.List;
34 import java.util.Map.Entry;
36 import java.util.SortedSet;
37 import java.util.TreeSet;
39 import org.forester.application.surfacing;
40 import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix;
41 import org.forester.evoinference.matrix.character.CharacterStateMatrix;
42 import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates;
43 import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates;
44 import org.forester.evoinference.parsimony.DolloParsimony;
45 import org.forester.evoinference.parsimony.FitchParsimony;
46 import org.forester.phylogeny.Phylogeny;
47 import org.forester.phylogeny.PhylogenyNode;
48 import org.forester.phylogeny.data.BinaryCharacters;
49 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
50 import org.forester.protein.BinaryDomainCombination;
51 import org.forester.protein.BinaryDomainCombination.DomainCombinationType;
52 import org.forester.protein.DomainId;
53 import org.forester.species.Species;
54 import org.forester.util.ForesterUtil;
56 public final class DomainParsimonyCalculator {
58 private static final String TYPE_FORBINARY_CHARACTERS = "parsimony inferred";
59 private CharacterStateMatrix<GainLossStates> _gain_loss_matrix;
60 private CharacterStateMatrix<BinaryStates> _binary_internal_states_matrix;
61 private final List<GenomeWideCombinableDomains> _gwcd_list;
62 private final Phylogeny _phylogeny;
63 private int _total_losses;
64 private int _total_gains;
65 private int _total_unchanged;
67 private Map<DomainId, Set<String>> _domain_id_to_secondary_features_map;
68 private SortedSet<DomainId> _positive_filter;
70 private DomainParsimonyCalculator( final Phylogeny phylogeny ) {
72 _phylogeny = phylogeny;
76 private DomainParsimonyCalculator( final Phylogeny phylogeny, final List<GenomeWideCombinableDomains> gwcd_list ) {
78 _phylogeny = phylogeny;
79 _gwcd_list = gwcd_list;
82 private DomainParsimonyCalculator( final Phylogeny phylogeny,
83 final List<GenomeWideCombinableDomains> gwcd_list,
84 final Map<DomainId, Set<String>> domain_id_to_secondary_features_map ) {
86 _phylogeny = phylogeny;
87 _gwcd_list = gwcd_list;
88 setDomainIdToSecondaryFeaturesMap( domain_id_to_secondary_features_map );
91 int calculateNumberOfBinaryDomainCombination() {
92 if ( getGenomeWideCombinableDomainsList().isEmpty() ) {
93 throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
95 final Set<BinaryDomainCombination> all_binary_combinations = new HashSet<BinaryDomainCombination>();
96 for( final GenomeWideCombinableDomains gwcd : getGenomeWideCombinableDomainsList() ) {
97 for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
98 all_binary_combinations.add( bc );
101 return all_binary_combinations.size();
104 CharacterStateMatrix<BinaryStates> createMatrixOfBinaryDomainCombinationPresenceOrAbsence() {
105 return createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
108 CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence() {
109 return createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList(), getPositiveFilter() );
112 CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeaturePresenceOrAbsence( final Map<Species, MappingResults> mapping_results_map ) {
113 return createMatrixOfSecondaryFeaturePresenceOrAbsence( getGenomeWideCombinableDomainsList(),
114 getDomainIdToSecondaryFeaturesMap(),
115 mapping_results_map );
118 Phylogeny decoratePhylogenyWithDomains( final Phylogeny phylogeny ) {
119 for( final PhylogenyNodeIterator it = phylogeny.iteratorPostorder(); it.hasNext(); ) {
120 final PhylogenyNode node = it.next();
121 final String node_identifier = node.getName();
122 final BinaryCharacters bc = new BinaryCharacters( getUnitsOnNode( node_identifier ),
123 getUnitsGainedOnNode( node_identifier ),
124 getUnitsLostOnNode( node_identifier ),
125 TYPE_FORBINARY_CHARACTERS,
126 getSumOfPresentOnNode( node_identifier ),
127 getSumOfGainsOnNode( node_identifier ),
128 getSumOfLossesOnNode( node_identifier ) );
129 node.getNodeData().setBinaryCharacters( bc );
134 private void executeDolloParsimony( final boolean on_domain_presence ) {
136 final DolloParsimony dollo = DolloParsimony.createInstance();
137 dollo.setReturnGainLossMatrix( true );
138 dollo.setReturnInternalStates( true );
139 CharacterStateMatrix<BinaryStates> states = null;
140 if ( on_domain_presence ) {
141 states = createMatrixOfDomainPresenceOrAbsence();
144 states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence();
146 dollo.execute( getPhylogeny(), states );
147 setGainLossMatrix( dollo.getGainLossMatrix() );
148 setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() );
149 setCost( dollo.getCost() );
150 setTotalGains( dollo.getTotalGains() );
151 setTotalLosses( dollo.getTotalLosses() );
152 setTotalUnchanged( dollo.getTotalUnchanged() );
155 public void executeDolloParsimonyOnBinaryDomainCombintionPresence() {
156 executeDolloParsimony( false );
159 public void executeDolloParsimonyOnDomainPresence() {
160 executeDolloParsimony( true );
163 public void executeDolloParsimonyOnDomainPresence( final SortedSet<DomainId> positive_filter ) {
164 setPositiveFilter( positive_filter );
165 executeDolloParsimony( true );
166 setPositiveFilter( null );
169 public void executeDolloParsimonyOnSecondaryFeatures( final Map<Species, MappingResults> mapping_results_map ) {
170 if ( getDomainIdToSecondaryFeaturesMap() == null ) {
171 throw new RuntimeException( "Domain id to secondary features map has apparently not been set" );
174 final DolloParsimony dollo = DolloParsimony.createInstance();
175 dollo.setReturnGainLossMatrix( true );
176 dollo.setReturnInternalStates( true );
177 final CharacterStateMatrix<BinaryStates> states = createMatrixOfSecondaryFeaturePresenceOrAbsence( mapping_results_map );
178 dollo.execute( getPhylogeny(), states );
179 setGainLossMatrix( dollo.getGainLossMatrix() );
180 setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() );
181 setCost( dollo.getCost() );
182 setTotalGains( dollo.getTotalGains() );
183 setTotalLosses( dollo.getTotalLosses() );
184 setTotalUnchanged( dollo.getTotalUnchanged() );
187 private void executeFitchParsimony( final boolean on_domain_presence,
188 final boolean use_last,
189 final boolean randomize,
190 final long random_number_seed ) {
193 System.out.println( " Fitch parsimony: use_last = true" );
195 final FitchParsimony<BinaryStates> fitch = new FitchParsimony<BinaryStates>();
196 fitch.setRandomize( randomize );
198 fitch.setRandomNumberSeed( random_number_seed );
200 fitch.setUseLast( use_last );
201 fitch.setReturnGainLossMatrix( true );
202 fitch.setReturnInternalStates( true );
203 CharacterStateMatrix<BinaryStates> states = null;
204 if ( on_domain_presence ) {
205 states = createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
208 states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
210 fitch.execute( getPhylogeny(), states );
211 setGainLossMatrix( fitch.getGainLossMatrix() );
212 setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() );
213 setCost( fitch.getCost() );
214 setTotalGains( fitch.getTotalGains() );
215 setTotalLosses( fitch.getTotalLosses() );
216 setTotalUnchanged( fitch.getTotalUnchanged() );
219 private void executeFitchParsimonyOnSecondaryFeatures( final boolean use_last,
220 final boolean randomize,
221 final long random_number_seed ) {
224 System.out.println( " Fitch parsimony: use_last = true" );
226 final FitchParsimony<BinaryStates> fitch = new FitchParsimony<BinaryStates>();
227 fitch.setRandomize( randomize );
229 fitch.setRandomNumberSeed( random_number_seed );
231 fitch.setUseLast( use_last );
232 fitch.setReturnGainLossMatrix( true );
233 fitch.setReturnInternalStates( true );
234 final Map<DomainId, Set<String>> map = getDomainIdToSecondaryFeaturesMap();
235 final Map<DomainId, String> newmap = new HashMap<DomainId, String>();
236 final Iterator<Entry<DomainId, Set<String>>> it = map.entrySet().iterator();
237 while ( it.hasNext() ) {
238 final Map.Entry<DomainId, Set<String>> pair = it.next();
239 if ( pair.getValue().size() != 1 ) {
240 throw new IllegalArgumentException( pair.getKey().getId() + " mapps to " + pair.getValue().size()
243 newmap.put( pair.getKey(), ( String ) pair.getValue().toArray()[ 0 ] );
245 final CharacterStateMatrix<BinaryStates> states = createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList(),
247 fitch.execute( getPhylogeny(), states );
248 setGainLossMatrix( fitch.getGainLossMatrix() );
249 setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() );
250 setCost( fitch.getCost() );
251 setTotalGains( fitch.getTotalGains() );
252 setTotalLosses( fitch.getTotalLosses() );
253 setTotalUnchanged( fitch.getTotalUnchanged() );
256 public void executeFitchParsimonyOnBinaryDomainCombintion( final boolean use_last ) {
257 executeFitchParsimony( false, use_last, false, 0 );
260 public void executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( final boolean use_last ) {
261 executeFitchParsimonyOnSecondaryFeatures( use_last, false, 0 );
264 public void executeFitchParsimonyOnBinaryDomainCombintion( final long random_number_seed ) {
265 executeFitchParsimony( false, false, true, random_number_seed );
268 public void executeFitchParsimonyOnDomainPresence( final boolean use_last ) {
269 executeFitchParsimony( true, use_last, false, 0 );
272 public void executeFitchParsimonyOnDomainPresence( final long random_number_seed ) {
273 executeFitchParsimony( true, false, true, random_number_seed );
276 public void executeOnGivenBinaryStatesMatrix( final CharacterStateMatrix<BinaryStates> binary_states_matrix,
277 final String[] character_labels ) {
279 if ( binary_states_matrix.getNumberOfCharacters() != character_labels.length ) {
280 throw new IllegalArgumentException( "binary states matrix number of characters is not equal to the number of character labels provided" );
282 if ( binary_states_matrix.getNumberOfIdentifiers() != getPhylogeny().getNumberOfBranches() ) {
283 throw new IllegalArgumentException( "binary states matrix number of identifiers is not equal to the number of tree nodes provided" );
285 final CharacterStateMatrix<GainLossStates> gl_matrix = new BasicCharacterStateMatrix<GainLossStates>( binary_states_matrix
286 .getNumberOfIdentifiers(),
288 .getNumberOfCharacters() );
290 int total_losses = 0;
291 int total_unchanged = 0;
293 for( final PhylogenyNodeIterator it = getPhylogeny().iteratorPostorder(); it.hasNext(); ) {
294 gl_matrix.setIdentifier( i++, it.next().getName() );
296 for( int c = 0; c < character_labels.length; ++c ) {
297 gl_matrix.setCharacter( c, character_labels[ c ] );
298 final PhylogenyNodeIterator it = getPhylogeny().iteratorPostorder();
299 while ( it.hasNext() ) {
300 final PhylogenyNode node = it.next();
301 final String name = node.getName();
302 final BinaryStates bin_state = binary_states_matrix.getState( binary_states_matrix
303 .getIdentifierIndex( name ), c );
304 final PhylogenyNode parent_node = getPhylogeny().getNode( name ).getParent();
305 GainLossStates gl_state = null;
306 if ( node.isRoot() ) {
308 if ( bin_state == BinaryStates.ABSENT ) {
309 gl_state = GainLossStates.UNCHANGED_ABSENT;
312 gl_state = GainLossStates.UNCHANGED_PRESENT;
316 final BinaryStates parent_bin_state = binary_states_matrix.getState( binary_states_matrix
317 .getIdentifierIndex( parent_node.getName() ), c );
318 if ( bin_state == BinaryStates.ABSENT ) {
319 if ( parent_bin_state == BinaryStates.ABSENT ) {
321 gl_state = GainLossStates.UNCHANGED_ABSENT;
325 gl_state = GainLossStates.LOSS;
329 if ( parent_bin_state == BinaryStates.ABSENT ) {
331 gl_state = GainLossStates.GAIN;
335 gl_state = GainLossStates.UNCHANGED_PRESENT;
339 gl_matrix.setState( name, c, gl_state );
342 setTotalGains( total_gains );
343 setTotalLosses( total_losses );
344 setTotalUnchanged( total_unchanged );
345 setCost( total_gains + total_losses );
346 setGainLossMatrix( gl_matrix );
349 public int getCost() {
353 private Map<DomainId, Set<String>> getDomainIdToSecondaryFeaturesMap() {
354 return _domain_id_to_secondary_features_map;
357 public CharacterStateMatrix<Integer> getGainLossCountsMatrix() {
358 final CharacterStateMatrix<Integer> matrix = new BasicCharacterStateMatrix<Integer>( getGainLossMatrix()
359 .getNumberOfIdentifiers(), 3 );
360 for( int i = 0; i < getGainLossMatrix().getNumberOfIdentifiers(); ++i ) {
361 matrix.setIdentifier( i, getGainLossMatrix().getIdentifier( i ) );
363 matrix.setCharacter( 0, "GAINS" );
364 matrix.setCharacter( 1, "LOSSES" );
365 matrix.setCharacter( 2, "NET" );
366 for( int i = 0; i < getGainLossMatrix().getNumberOfIdentifiers(); ++i ) {
369 for( int c = 0; c < getGainLossMatrix().getNumberOfCharacters(); ++c ) {
370 final GainLossStates s = getGainLossMatrix().getState( i, c );
371 if ( s == GainLossStates.GAIN ) {
374 else if ( s == GainLossStates.LOSS ) {
378 matrix.setState( i, 0, gains );
379 matrix.setState( i, 1, losses );
380 matrix.setState( i, 2, gains - losses );
385 public CharacterStateMatrix<GainLossStates> getGainLossMatrix() {
386 return _gain_loss_matrix;
389 private List<GenomeWideCombinableDomains> getGenomeWideCombinableDomainsList() {
393 public CharacterStateMatrix<BinaryStates> getInternalStatesMatrix() {
394 return _binary_internal_states_matrix;
397 public int getNetGainsOnNode( final String node_identifier ) {
398 if ( getGainLossMatrix() == null ) {
399 throw new RuntimeException( "no gain loss matrix has been calculated" );
402 final int id_index = getGainLossMatrix().getIdentifierIndex( node_identifier );
403 for( int c = 0; c < getGainLossMatrix().getNumberOfCharacters(); ++c ) {
404 if ( getGainLossMatrix().getState( id_index, c ) == GainLossStates.GAIN ) {
407 else if ( getGainLossMatrix().getState( id_index, c ) == GainLossStates.LOSS ) {
414 private Phylogeny getPhylogeny() {
418 private SortedSet<DomainId> getPositiveFilter() {
419 return _positive_filter;
422 public int getSumOfGainsOnNode( final String node_identifier ) {
423 return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN );
426 public int getSumOfLossesOnNode( final String node_identifier ) {
427 return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.LOSS );
430 public int getSumOfPresentOnNode( final String node_identifier ) {
431 return getSumOfGainsOnNode( node_identifier ) + getSumOfUnchangedPresentOnNode( node_identifier );
434 int getSumOfUnchangedAbsentOnNode( final String node_identifier ) {
435 return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT );
438 int getSumOfUnchangedOnNode( final String node_identifier ) {
439 return getSumOfUnchangedPresentOnNode( node_identifier ) + getSumOfUnchangedAbsentOnNode( node_identifier );
442 int getSumOfUnchangedPresentOnNode( final String node_identifier ) {
443 return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT );
446 public int getTotalGains() {
450 public int getTotalLosses() {
451 return _total_losses;
454 public int getTotalUnchanged() {
455 return _total_unchanged;
458 public SortedSet<String> getUnitsGainedOnNode( final String node_identifier ) {
459 return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN );
462 public SortedSet<String> getUnitsLostOnNode( final String node_identifier ) {
463 return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.LOSS );
466 public SortedSet<String> getUnitsOnNode( final String node_identifier ) {
467 final SortedSet<String> present = getUnitsGainedOnNode( node_identifier );
468 present.addAll( getUnitsUnchangedPresentOnNode( node_identifier ) );
472 SortedSet<String> getUnitsUnchangedAbsentOnNode( final String node_identifier ) {
473 return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT );
476 SortedSet<String> getUnitsUnchangedPresentOnNode( final String node_identifier ) {
477 return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT );
480 private void init() {
481 setDomainIdToSecondaryFeaturesMap( null );
482 setPositiveFilter( null );
486 private void reset() {
487 setGainLossMatrix( null );
488 setBinaryInternalStatesMatrix( null );
491 setTotalLosses( -1 );
492 setTotalUnchanged( -1 );
495 private void setBinaryInternalStatesMatrix( final CharacterStateMatrix<BinaryStates> binary_states_matrix ) {
496 _binary_internal_states_matrix = binary_states_matrix;
499 private void setCost( final int cost ) {
503 private void setDomainIdToSecondaryFeaturesMap( final Map<DomainId, Set<String>> domain_id_to_secondary_features_map ) {
504 _domain_id_to_secondary_features_map = domain_id_to_secondary_features_map;
507 private void setGainLossMatrix( final CharacterStateMatrix<GainLossStates> gain_loss_matrix ) {
508 _gain_loss_matrix = gain_loss_matrix;
511 private void setPositiveFilter( final SortedSet<DomainId> positive_filter ) {
512 _positive_filter = positive_filter;
515 private void setTotalGains( final int total_gains ) {
516 _total_gains = total_gains;
519 private void setTotalLosses( final int total_losses ) {
520 _total_losses = total_losses;
523 private void setTotalUnchanged( final int total_unchanged ) {
524 _total_unchanged = total_unchanged;
527 public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny ) {
528 return new DomainParsimonyCalculator( phylogeny );
531 public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny,
532 final List<GenomeWideCombinableDomains> gwcd_list ) {
533 if ( phylogeny.getNumberOfExternalNodes() != gwcd_list.size() ) {
534 throw new IllegalArgumentException( "number of external nodes [" + phylogeny.getNumberOfExternalNodes()
535 + "] does not equal size of genome wide combinable domains list [" + gwcd_list.size() + "]" );
537 return new DomainParsimonyCalculator( phylogeny, gwcd_list );
540 public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny,
541 final List<GenomeWideCombinableDomains> gwcd_list,
542 final Map<DomainId, Set<String>> domain_id_to_secondary_features_map ) {
543 if ( phylogeny.getNumberOfExternalNodes() != gwcd_list.size() ) {
544 throw new IllegalArgumentException( "size of external nodes does not equal size of genome wide combinable domains list" );
546 return new DomainParsimonyCalculator( phylogeny, gwcd_list, domain_id_to_secondary_features_map );
550 * For folds instead of Pfam-domains, for example
556 static CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeaturePresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
557 final Map<DomainId, Set<String>> domain_id_to_second_features_map,
558 final Map<Species, MappingResults> mapping_results_map ) {
559 if ( gwcd_list.isEmpty() ) {
560 throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
562 if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) {
563 throw new IllegalArgumentException( "domain id to secondary features map is null or empty" );
565 final int number_of_identifiers = gwcd_list.size();
566 final SortedSet<String> all_secondary_features = new TreeSet<String>();
567 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
570 for( final DomainId domain : gwcd.getAllDomainIds() ) {
571 if ( domain_id_to_second_features_map.containsKey( domain ) ) {
572 all_secondary_features.addAll( domain_id_to_second_features_map.get( domain ) );
579 if ( mapping_results_map != null ) {
580 final MappingResults mr = new MappingResults();
581 mr.setDescription( gwcd.getSpecies().getSpeciesId() );
582 mr.setSumOfSuccesses( mapped );
583 mr.setSumOfFailures( not_mapped );
584 mapping_results_map.put( gwcd.getSpecies(), mr );
587 final int number_of_characters = all_secondary_features.size();
588 final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
589 number_of_characters );
590 int character_index = 0;
591 for( final String second_id : all_secondary_features ) {
592 matrix.setCharacter( character_index++, second_id );
594 int identifier_index = 0;
595 final Set<String> all_identifiers = new HashSet<String>();
596 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
597 final String species_id = gwcd.getSpecies().getSpeciesId();
598 if ( all_identifiers.contains( species_id ) ) {
599 throw new IllegalArgumentException( "species [" + species_id + "] is not unique" );
601 all_identifiers.add( species_id );
602 matrix.setIdentifier( identifier_index, species_id );
603 final Set<String> all_second_per_gwcd = new HashSet<String>();
604 for( final DomainId domain : gwcd.getAllDomainIds() ) {
605 if ( domain_id_to_second_features_map.containsKey( domain ) ) {
606 all_second_per_gwcd.addAll( domain_id_to_second_features_map.get( domain ) );
609 for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
610 if ( all_second_per_gwcd.contains( matrix.getCharacter( ci ) ) ) {
611 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
614 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
622 public static CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
623 final Map<DomainId, String> domain_id_to_second_features_map ) {
624 if ( gwcd_list.isEmpty() ) {
625 throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
627 if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) {
628 throw new IllegalArgumentException( "domain id to secondary features map is null or empty" );
630 final int number_of_identifiers = gwcd_list.size();
631 final SortedSet<BinaryDomainCombination> all_binary_combinations_mapped = new TreeSet<BinaryDomainCombination>();
632 final Set<BinaryDomainCombination>[] binary_combinations_per_genome_mapped = new HashSet[ number_of_identifiers ];
633 int identifier_index = 0;
634 final SortedSet<String> no_mappings = new TreeSet<String>();
635 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
636 binary_combinations_per_genome_mapped[ identifier_index ] = new HashSet<BinaryDomainCombination>();
637 for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
638 final BinaryDomainCombination mapped_bc = mapBinaryDomainCombination( domain_id_to_second_features_map,
641 all_binary_combinations_mapped.add( mapped_bc );
642 binary_combinations_per_genome_mapped[ identifier_index ].add( mapped_bc );
646 if ( !no_mappings.isEmpty() ) {
647 ForesterUtil.programMessage( surfacing.PRG_NAME, "No mappings for the following (" + no_mappings.size()
649 for( final String id : no_mappings ) {
650 ForesterUtil.programMessage( surfacing.PRG_NAME, id );
653 final int number_of_characters = all_binary_combinations_mapped.size();
654 final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
655 number_of_characters );
656 int character_index = 0;
657 for( final BinaryDomainCombination bc : all_binary_combinations_mapped ) {
658 matrix.setCharacter( character_index++, bc.toString() );
660 identifier_index = 0;
661 final Set<String> all_identifiers = new HashSet<String>();
662 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
663 final String species_id = gwcd.getSpecies().getSpeciesId();
664 if ( all_identifiers.contains( species_id ) ) {
665 throw new AssertionError( "species [" + species_id + "] is not unique" );
667 all_identifiers.add( species_id );
668 matrix.setIdentifier( identifier_index, species_id );
669 for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
670 BinaryDomainCombination bc = null;
671 if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) {
672 bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
674 else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) {
675 bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
678 bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
680 if ( binary_combinations_per_genome_mapped[ identifier_index ].contains( bc ) ) {
681 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
684 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
692 private static BinaryDomainCombination mapBinaryDomainCombination( final Map<DomainId, String> domain_id_to_second_features_map,
693 final BinaryDomainCombination bc,
694 final SortedSet<String> no_mappings ) {
697 if ( !domain_id_to_second_features_map.containsKey( bc.getId0() ) ) {
698 no_mappings.add( bc.getId0().getId() );
699 id0 = bc.getId0().getId();
702 id0 = domain_id_to_second_features_map.get( bc.getId0() );
704 if ( !domain_id_to_second_features_map.containsKey( bc.getId1() ) ) {
705 no_mappings.add( bc.getId1().getId() );
706 id1 = bc.getId1().getId();
709 id1 = domain_id_to_second_features_map.get( bc.getId1() );
711 return new BasicBinaryDomainCombination( id0, id1 );
714 public static CharacterStateMatrix<BinaryStates> createMatrixOfBinaryDomainCombinationPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list ) {
715 if ( gwcd_list.isEmpty() ) {
716 throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
718 final int number_of_identifiers = gwcd_list.size();
719 final SortedSet<BinaryDomainCombination> all_binary_combinations = new TreeSet<BinaryDomainCombination>();
720 final Set<BinaryDomainCombination>[] binary_combinations_per_genome = new HashSet[ number_of_identifiers ];
721 int identifier_index = 0;
722 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
723 binary_combinations_per_genome[ identifier_index ] = new HashSet<BinaryDomainCombination>();
724 for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
725 all_binary_combinations.add( bc );
726 binary_combinations_per_genome[ identifier_index ].add( bc );
730 final int number_of_characters = all_binary_combinations.size();
731 final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
732 number_of_characters );
733 int character_index = 0;
734 for( final BinaryDomainCombination bc : all_binary_combinations ) {
735 matrix.setCharacter( character_index++, bc.toString() );
737 identifier_index = 0;
738 final Set<String> all_identifiers = new HashSet<String>();
739 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
740 final String species_id = gwcd.getSpecies().getSpeciesId();
741 if ( all_identifiers.contains( species_id ) ) {
742 throw new AssertionError( "species [" + species_id + "] is not unique" );
744 all_identifiers.add( species_id );
745 matrix.setIdentifier( identifier_index, species_id );
746 for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
747 BinaryDomainCombination bc = null;
748 if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) {
749 bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
751 else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) {
752 bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
755 bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
757 if ( binary_combinations_per_genome[ identifier_index ].contains( bc ) ) {
758 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
761 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
769 static CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list ) {
770 return createMatrixOfDomainPresenceOrAbsence( gwcd_list, null );
773 public static CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
774 final SortedSet<DomainId> positive_filter ) {
775 if ( gwcd_list.isEmpty() ) {
776 throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
778 if ( ( positive_filter != null ) && ( positive_filter.size() < 1 ) ) {
779 throw new IllegalArgumentException( "positive filter is empty" );
781 final int number_of_identifiers = gwcd_list.size();
782 final SortedSet<DomainId> all_domain_ids = new TreeSet<DomainId>();
783 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
784 for( final DomainId domain : gwcd.getAllDomainIds() ) {
785 all_domain_ids.add( domain );
788 int number_of_characters = all_domain_ids.size();
789 if ( positive_filter != null ) {
790 //number_of_characters = positive_filter.size(); -- bad if doms in filter but not in genomes
791 number_of_characters = 0;
792 for( final DomainId id : all_domain_ids ) {
793 if ( positive_filter.contains( id ) ) {
794 number_of_characters++;
798 final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
799 number_of_characters );
800 int character_index = 0;
801 for( final DomainId id : all_domain_ids ) {
802 if ( positive_filter == null ) {
803 matrix.setCharacter( character_index++, id.getId() );
806 if ( positive_filter.contains( id ) ) {
807 matrix.setCharacter( character_index++, id.getId() );
811 int identifier_index = 0;
812 final Set<String> all_identifiers = new HashSet<String>();
813 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
814 final String species_id = gwcd.getSpecies().getSpeciesId();
815 if ( all_identifiers.contains( species_id ) ) {
816 throw new IllegalArgumentException( "species [" + species_id + "] is not unique" );
818 all_identifiers.add( species_id );
819 matrix.setIdentifier( identifier_index, species_id );
820 for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
821 if ( ForesterUtil.isEmpty( matrix.getCharacter( ci ) ) ) {
822 throw new RuntimeException( "this should not have happened: problem with character #" + ci );
824 final DomainId id = new DomainId( matrix.getCharacter( ci ) );
825 if ( gwcd.contains( id ) ) {
826 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
829 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
837 private static int getStateSumDeltaOnNode( final String node_identifier,
838 final CharacterStateMatrix<GainLossStates> gain_loss_matrix,
839 final GainLossStates state ) {
840 if ( gain_loss_matrix == null ) {
841 throw new RuntimeException( "no gain loss matrix has been calculated" );
843 if ( ForesterUtil.isEmpty( node_identifier ) ) {
844 throw new IllegalArgumentException( "node identifier must not be empty" );
846 if ( gain_loss_matrix.isEmpty() ) {
847 throw new RuntimeException( "gain loss matrix is empty" );
850 final int id_index = gain_loss_matrix.getIdentifierIndex( node_identifier );
851 for( int c = 0; c < gain_loss_matrix.getNumberOfCharacters(); ++c ) {
852 if ( gain_loss_matrix.getState( id_index, c ) == state ) {
859 private static SortedSet<String> getUnitsDeltaOnNode( final String node_identifier,
860 final CharacterStateMatrix<GainLossStates> gain_loss_matrix,
861 final GainLossStates state ) {
862 if ( gain_loss_matrix == null ) {
863 throw new RuntimeException( "no gain loss matrix has been calculated" );
865 if ( ForesterUtil.isEmpty( node_identifier ) ) {
866 throw new IllegalArgumentException( "node identifier must not be empty" );
868 if ( gain_loss_matrix.isEmpty() ) {
869 throw new RuntimeException( "gain loss matrix is empty" );
871 final SortedSet<String> d = new TreeSet<String>();
872 final int id_index = gain_loss_matrix.getIdentifierIndex( node_identifier );
873 for( int c = 0; c < gain_loss_matrix.getNumberOfCharacters(); ++c ) {
874 if ( gain_loss_matrix.getState( id_index, c ) == state ) {
875 if ( d.contains( gain_loss_matrix.getCharacter( c ) ) ) {
876 throw new AssertionError( "this should not have happended: character ["
877 + gain_loss_matrix.getCharacter( c ) + "] already in set" );
879 d.add( gain_loss_matrix.getCharacter( c ) );