3 // FORESTER -- software libraries and applications
4 // for evolutionary biology research and applications.
6 // Copyright (C) 2008-2009 Christian M. Zmasek
7 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 // Contact: phylosoft @ gmail . com
25 // WWW: www.phylosoft.org/forester
27 package org.forester.surfacing;
29 import java.util.HashMap;
30 import java.util.HashSet;
31 import java.util.Iterator;
32 import java.util.List;
34 import java.util.Map.Entry;
36 import java.util.SortedSet;
37 import java.util.TreeSet;
39 import org.forester.application.surfacing;
40 import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix;
41 import org.forester.evoinference.matrix.character.CharacterStateMatrix;
42 import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates;
43 import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates;
44 import org.forester.evoinference.parsimony.DolloParsimony;
45 import org.forester.evoinference.parsimony.FitchParsimony;
46 import org.forester.phylogeny.Phylogeny;
47 import org.forester.phylogeny.PhylogenyNode;
48 import org.forester.phylogeny.data.BinaryCharacters;
49 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
50 import org.forester.surfacing.BinaryDomainCombination.DomainCombinationType;
51 import org.forester.util.ForesterUtil;
53 public final class DomainParsimonyCalculator {
55 private static final String TYPE_FORBINARY_CHARACTERS = "parsimony inferred";
56 private CharacterStateMatrix<GainLossStates> _gain_loss_matrix;
57 private CharacterStateMatrix<BinaryStates> _binary_internal_states_matrix;
58 private final List<GenomeWideCombinableDomains> _gwcd_list;
59 private final Phylogeny _phylogeny;
60 private int _total_losses;
61 private int _total_gains;
62 private int _total_unchanged;
64 private Map<DomainId, Set<String>> _domain_id_to_secondary_features_map;
65 private SortedSet<DomainId> _positive_filter;
67 private DomainParsimonyCalculator( final Phylogeny phylogeny ) {
69 _phylogeny = phylogeny;
73 private DomainParsimonyCalculator( final Phylogeny phylogeny, final List<GenomeWideCombinableDomains> gwcd_list ) {
75 _phylogeny = phylogeny;
76 _gwcd_list = gwcd_list;
79 private DomainParsimonyCalculator( final Phylogeny phylogeny,
80 final List<GenomeWideCombinableDomains> gwcd_list,
81 final Map<DomainId, Set<String>> domain_id_to_secondary_features_map ) {
83 _phylogeny = phylogeny;
84 _gwcd_list = gwcd_list;
85 setDomainIdToSecondaryFeaturesMap( domain_id_to_secondary_features_map );
88 int calculateNumberOfBinaryDomainCombination() {
89 if ( getGenomeWideCombinableDomainsList().isEmpty() ) {
90 throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
92 final Set<BinaryDomainCombination> all_binary_combinations = new HashSet<BinaryDomainCombination>();
93 for( final GenomeWideCombinableDomains gwcd : getGenomeWideCombinableDomainsList() ) {
94 for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
95 all_binary_combinations.add( bc );
98 return all_binary_combinations.size();
101 CharacterStateMatrix<BinaryStates> createMatrixOfBinaryDomainCombinationPresenceOrAbsence() {
102 return createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
105 CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence() {
106 return createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList(), getPositiveFilter() );
109 CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeaturePresenceOrAbsence( final Map<Species, MappingResults> mapping_results_map ) {
110 return createMatrixOfSecondaryFeaturePresenceOrAbsence( getGenomeWideCombinableDomainsList(),
111 getDomainIdToSecondaryFeaturesMap(),
112 mapping_results_map );
115 Phylogeny decoratePhylogenyWithDomains( final Phylogeny phylogeny ) {
116 for( final PhylogenyNodeIterator it = phylogeny.iteratorPostorder(); it.hasNext(); ) {
117 final PhylogenyNode node = it.next();
118 final String node_identifier = node.getName();
119 final BinaryCharacters bc = new BinaryCharacters( getUnitsOnNode( node_identifier ),
120 getUnitsGainedOnNode( node_identifier ),
121 getUnitsLostOnNode( node_identifier ),
122 TYPE_FORBINARY_CHARACTERS,
123 getSumOfPresentOnNode( node_identifier ),
124 getSumOfGainsOnNode( node_identifier ),
125 getSumOfLossesOnNode( node_identifier ) );
126 node.getNodeData().setBinaryCharacters( bc );
131 private void executeDolloParsimony( final boolean on_domain_presence ) {
133 final DolloParsimony dollo = DolloParsimony.createInstance();
134 dollo.setReturnGainLossMatrix( true );
135 dollo.setReturnInternalStates( true );
136 CharacterStateMatrix<BinaryStates> states = null;
137 if ( on_domain_presence ) {
138 states = createMatrixOfDomainPresenceOrAbsence();
141 states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence();
143 dollo.execute( getPhylogeny(), states );
144 setGainLossMatrix( dollo.getGainLossMatrix() );
145 setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() );
146 setCost( dollo.getCost() );
147 setTotalGains( dollo.getTotalGains() );
148 setTotalLosses( dollo.getTotalLosses() );
149 setTotalUnchanged( dollo.getTotalUnchanged() );
152 public void executeDolloParsimonyOnBinaryDomainCombintionPresence() {
153 executeDolloParsimony( false );
156 public void executeDolloParsimonyOnDomainPresence() {
157 executeDolloParsimony( true );
160 public void executeDolloParsimonyOnDomainPresence( final SortedSet<DomainId> positive_filter ) {
161 setPositiveFilter( positive_filter );
162 executeDolloParsimony( true );
163 setPositiveFilter( null );
166 public void executeDolloParsimonyOnSecondaryFeatures( final Map<Species, MappingResults> mapping_results_map ) {
167 if ( getDomainIdToSecondaryFeaturesMap() == null ) {
168 throw new RuntimeException( "Domain id to secondary features map has apparently not been set" );
171 final DolloParsimony dollo = DolloParsimony.createInstance();
172 dollo.setReturnGainLossMatrix( true );
173 dollo.setReturnInternalStates( true );
174 final CharacterStateMatrix<BinaryStates> states = createMatrixOfSecondaryFeaturePresenceOrAbsence( mapping_results_map );
175 dollo.execute( getPhylogeny(), states );
176 setGainLossMatrix( dollo.getGainLossMatrix() );
177 setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() );
178 setCost( dollo.getCost() );
179 setTotalGains( dollo.getTotalGains() );
180 setTotalLosses( dollo.getTotalLosses() );
181 setTotalUnchanged( dollo.getTotalUnchanged() );
184 private void executeFitchParsimony( final boolean on_domain_presence,
185 final boolean use_last,
186 final boolean randomize,
187 final long random_number_seed ) {
190 System.out.println( " Fitch parsimony: use_last = true" );
192 final FitchParsimony<BinaryStates> fitch = new FitchParsimony<BinaryStates>();
193 fitch.setRandomize( randomize );
195 fitch.setRandomNumberSeed( random_number_seed );
197 fitch.setUseLast( use_last );
198 fitch.setReturnGainLossMatrix( true );
199 fitch.setReturnInternalStates( true );
200 CharacterStateMatrix<BinaryStates> states = null;
201 if ( on_domain_presence ) {
202 states = createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
205 states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() );
207 fitch.execute( getPhylogeny(), states );
208 setGainLossMatrix( fitch.getGainLossMatrix() );
209 setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() );
210 setCost( fitch.getCost() );
211 setTotalGains( fitch.getTotalGains() );
212 setTotalLosses( fitch.getTotalLosses() );
213 setTotalUnchanged( fitch.getTotalUnchanged() );
216 private void executeFitchParsimonyOnSecondaryFeatures(
217 final boolean use_last,
218 final boolean randomize,
219 final long random_number_seed ) {
222 System.out.println( " Fitch parsimony: use_last = true" );
224 final FitchParsimony<BinaryStates> fitch = new FitchParsimony<BinaryStates>();
225 fitch.setRandomize( randomize );
227 fitch.setRandomNumberSeed( random_number_seed );
229 fitch.setUseLast( use_last );
230 fitch.setReturnGainLossMatrix( true );
231 fitch.setReturnInternalStates( true );
233 final Map<DomainId, Set<String>> map = getDomainIdToSecondaryFeaturesMap();
234 final Map<DomainId, String> newmap = new HashMap<DomainId, String>();
235 final Iterator<Entry<DomainId, Set<String>>> it = map.entrySet().iterator();
236 while (it.hasNext()) {
237 final Map.Entry<DomainId, Set<String>> pair = (Map.Entry<DomainId, Set<String>>)it.next();
238 if ( pair.getValue().size() != 1 ) {
239 throw new IllegalArgumentException( pair.getKey().getId() + " mapps to " + pair.getValue().size() + " items" );
241 newmap.put( pair.getKey(), ( String ) pair.getValue().toArray()[ 0 ] );
244 CharacterStateMatrix<BinaryStates> states =createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList(),
247 fitch.execute( getPhylogeny(), states );
248 setGainLossMatrix( fitch.getGainLossMatrix() );
249 setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() );
250 setCost( fitch.getCost() );
251 setTotalGains( fitch.getTotalGains() );
252 setTotalLosses( fitch.getTotalLosses() );
253 setTotalUnchanged( fitch.getTotalUnchanged() );
256 public void executeFitchParsimonyOnBinaryDomainCombintion( final boolean use_last ) {
257 executeFitchParsimony( false, use_last, false, 0 );
260 public void executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( final boolean use_last ) {
261 executeFitchParsimonyOnSecondaryFeatures( use_last, false, 0 );
264 public void executeFitchParsimonyOnBinaryDomainCombintion( final long random_number_seed ) {
265 executeFitchParsimony( false, false, true, random_number_seed );
268 public void executeFitchParsimonyOnDomainPresence( final boolean use_last ) {
269 executeFitchParsimony( true, use_last, false, 0 );
272 public void executeFitchParsimonyOnDomainPresence( final long random_number_seed ) {
273 executeFitchParsimony( true, false, true, random_number_seed );
276 public void executeOnGivenBinaryStatesMatrix( final CharacterStateMatrix<BinaryStates> binary_states_matrix,
277 final String[] character_labels ) {
279 if ( binary_states_matrix.getNumberOfCharacters() != character_labels.length ) {
280 throw new IllegalArgumentException( "binary states matrix number of characters is not equal to the number of character labels provided" );
282 if ( binary_states_matrix.getNumberOfIdentifiers() != getPhylogeny().getNumberOfBranches() ) {
283 throw new IllegalArgumentException( "binary states matrix number of identifiers is not equal to the number of tree nodes provided" );
285 final CharacterStateMatrix<GainLossStates> gl_matrix = new BasicCharacterStateMatrix<GainLossStates>( binary_states_matrix
286 .getNumberOfIdentifiers(),
288 .getNumberOfCharacters() );
290 int total_losses = 0;
291 int total_unchanged = 0;
293 for( final PhylogenyNodeIterator it = getPhylogeny().iteratorPostorder(); it.hasNext(); ) {
294 gl_matrix.setIdentifier( i++, it.next().getName() );
296 for( int c = 0; c < character_labels.length; ++c ) {
297 gl_matrix.setCharacter( c, character_labels[ c ] );
298 final PhylogenyNodeIterator it = getPhylogeny().iteratorPostorder();
299 while ( it.hasNext() ) {
300 final PhylogenyNode node = it.next();
301 final String name = node.getName();
302 final BinaryStates bin_state = binary_states_matrix.getState( binary_states_matrix
303 .getIdentifierIndex( name ), c );
304 final PhylogenyNode parent_node = getPhylogeny().getNode( name ).getParent();
305 GainLossStates gl_state = null;
306 if ( node.isRoot() ) {
308 if ( bin_state == BinaryStates.ABSENT ) {
309 gl_state = GainLossStates.UNCHANGED_ABSENT;
312 gl_state = GainLossStates.UNCHANGED_PRESENT;
316 final BinaryStates parent_bin_state = binary_states_matrix.getState( binary_states_matrix
317 .getIdentifierIndex( parent_node.getName() ), c );
318 if ( bin_state == BinaryStates.ABSENT ) {
319 if ( parent_bin_state == BinaryStates.ABSENT ) {
321 gl_state = GainLossStates.UNCHANGED_ABSENT;
325 gl_state = GainLossStates.LOSS;
329 if ( parent_bin_state == BinaryStates.ABSENT ) {
331 gl_state = GainLossStates.GAIN;
335 gl_state = GainLossStates.UNCHANGED_PRESENT;
339 gl_matrix.setState( name, c, gl_state );
342 setTotalGains( total_gains );
343 setTotalLosses( total_losses );
344 setTotalUnchanged( total_unchanged );
345 setCost( total_gains + total_losses );
346 setGainLossMatrix( gl_matrix );
349 public int getCost() {
353 private Map<DomainId, Set<String>> getDomainIdToSecondaryFeaturesMap() {
354 return _domain_id_to_secondary_features_map;
357 public CharacterStateMatrix<Integer> getGainLossCountsMatrix() {
358 final CharacterStateMatrix<Integer> matrix = new BasicCharacterStateMatrix<Integer>( getGainLossMatrix()
359 .getNumberOfIdentifiers(), 3 );
360 for( int i = 0; i < getGainLossMatrix().getNumberOfIdentifiers(); ++i ) {
361 matrix.setIdentifier( i, getGainLossMatrix().getIdentifier( i ) );
363 matrix.setCharacter( 0, "GAINS" );
364 matrix.setCharacter( 1, "LOSSES" );
365 matrix.setCharacter( 2, "NET" );
366 for( int i = 0; i < getGainLossMatrix().getNumberOfIdentifiers(); ++i ) {
369 for( int c = 0; c < getGainLossMatrix().getNumberOfCharacters(); ++c ) {
370 final GainLossStates s = getGainLossMatrix().getState( i, c );
371 if ( s == GainLossStates.GAIN ) {
374 else if ( s == GainLossStates.LOSS ) {
378 matrix.setState( i, 0, gains );
379 matrix.setState( i, 1, losses );
380 matrix.setState( i, 2, gains - losses );
385 public CharacterStateMatrix<GainLossStates> getGainLossMatrix() {
386 return _gain_loss_matrix;
389 private List<GenomeWideCombinableDomains> getGenomeWideCombinableDomainsList() {
393 public CharacterStateMatrix<BinaryStates> getInternalStatesMatrix() {
394 return _binary_internal_states_matrix;
397 public int getNetGainsOnNode( final String node_identifier ) {
398 if ( getGainLossMatrix() == null ) {
399 throw new RuntimeException( "no gain loss matrix has been calculated" );
402 final int id_index = getGainLossMatrix().getIdentifierIndex( node_identifier );
403 for( int c = 0; c < getGainLossMatrix().getNumberOfCharacters(); ++c ) {
404 if ( getGainLossMatrix().getState( id_index, c ) == GainLossStates.GAIN ) {
407 else if ( getGainLossMatrix().getState( id_index, c ) == GainLossStates.LOSS ) {
414 private Phylogeny getPhylogeny() {
418 private SortedSet<DomainId> getPositiveFilter() {
419 return _positive_filter;
422 public int getSumOfGainsOnNode( final String node_identifier ) {
423 return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN );
426 public int getSumOfLossesOnNode( final String node_identifier ) {
427 return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.LOSS );
430 public int getSumOfPresentOnNode( final String node_identifier ) {
431 return getSumOfGainsOnNode( node_identifier ) + getSumOfUnchangedPresentOnNode( node_identifier );
434 int getSumOfUnchangedAbsentOnNode( final String node_identifier ) {
435 return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT );
438 int getSumOfUnchangedOnNode( final String node_identifier ) {
439 return getSumOfUnchangedPresentOnNode( node_identifier ) + getSumOfUnchangedAbsentOnNode( node_identifier );
442 int getSumOfUnchangedPresentOnNode( final String node_identifier ) {
443 return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT );
446 public int getTotalGains() {
450 public int getTotalLosses() {
451 return _total_losses;
454 public int getTotalUnchanged() {
455 return _total_unchanged;
458 public SortedSet<String> getUnitsGainedOnNode( final String node_identifier ) {
459 return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN );
462 public SortedSet<String> getUnitsLostOnNode( final String node_identifier ) {
463 return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.LOSS );
466 public SortedSet<String> getUnitsOnNode( final String node_identifier ) {
467 final SortedSet<String> present = getUnitsGainedOnNode( node_identifier );
468 present.addAll( getUnitsUnchangedPresentOnNode( node_identifier ) );
472 SortedSet<String> getUnitsUnchangedAbsentOnNode( final String node_identifier ) {
473 return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT );
476 SortedSet<String> getUnitsUnchangedPresentOnNode( final String node_identifier ) {
477 return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT );
480 private void init() {
481 setDomainIdToSecondaryFeaturesMap( null );
482 setPositiveFilter( null );
486 private void reset() {
487 setGainLossMatrix( null );
488 setBinaryInternalStatesMatrix( null );
491 setTotalLosses( -1 );
492 setTotalUnchanged( -1 );
495 private void setBinaryInternalStatesMatrix( final CharacterStateMatrix<BinaryStates> binary_states_matrix ) {
496 _binary_internal_states_matrix = binary_states_matrix;
499 private void setCost( final int cost ) {
503 private void setDomainIdToSecondaryFeaturesMap( final Map<DomainId, Set<String>> domain_id_to_secondary_features_map ) {
504 _domain_id_to_secondary_features_map = domain_id_to_secondary_features_map;
507 private void setGainLossMatrix( final CharacterStateMatrix<GainLossStates> gain_loss_matrix ) {
508 _gain_loss_matrix = gain_loss_matrix;
511 private void setPositiveFilter( final SortedSet<DomainId> positive_filter ) {
512 _positive_filter = positive_filter;
515 private void setTotalGains( final int total_gains ) {
516 _total_gains = total_gains;
519 private void setTotalLosses( final int total_losses ) {
520 _total_losses = total_losses;
523 private void setTotalUnchanged( final int total_unchanged ) {
524 _total_unchanged = total_unchanged;
527 public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny ) {
528 return new DomainParsimonyCalculator( phylogeny );
531 public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny,
532 final List<GenomeWideCombinableDomains> gwcd_list ) {
533 if ( phylogeny.getNumberOfExternalNodes() != gwcd_list.size() ) {
534 throw new IllegalArgumentException( "number of external nodes [" + phylogeny.getNumberOfExternalNodes()
535 + "] does not equal size of genome wide combinable domains list [" + gwcd_list.size() + "]" );
537 return new DomainParsimonyCalculator( phylogeny, gwcd_list );
540 public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny,
541 final List<GenomeWideCombinableDomains> gwcd_list,
542 final Map<DomainId, Set<String>> domain_id_to_secondary_features_map ) {
543 if ( phylogeny.getNumberOfExternalNodes() != gwcd_list.size() ) {
544 throw new IllegalArgumentException( "size of external nodes does not equal size of genome wide combinable domains list" );
546 return new DomainParsimonyCalculator( phylogeny, gwcd_list, domain_id_to_secondary_features_map );
551 * For folds instead of Pfam-domains, for example
557 static CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeaturePresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
558 final Map<DomainId, Set<String>> domain_id_to_second_features_map,
559 final Map<Species, MappingResults> mapping_results_map ) {
560 if ( gwcd_list.isEmpty() ) {
561 throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
563 if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) {
564 throw new IllegalArgumentException( "domain id to secondary features map is null or empty" );
566 final int number_of_identifiers = gwcd_list.size();
567 final SortedSet<String> all_secondary_features = new TreeSet<String>();
568 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
571 for( final DomainId domain : gwcd.getAllDomainIds() ) {
572 if ( domain_id_to_second_features_map.containsKey( domain ) ) {
573 all_secondary_features.addAll( domain_id_to_second_features_map.get( domain ) );
580 if ( mapping_results_map != null ) {
581 final MappingResults mr = new MappingResults();
582 mr.setDescription( gwcd.getSpecies().getSpeciesId() );
583 mr.setSumOfSuccesses( mapped );
584 mr.setSumOfFailures( not_mapped );
585 mapping_results_map.put( gwcd.getSpecies(), mr );
588 final int number_of_characters = all_secondary_features.size();
589 final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
590 number_of_characters );
591 int character_index = 0;
592 for( final String second_id : all_secondary_features ) {
593 matrix.setCharacter( character_index++, second_id );
595 int identifier_index = 0;
596 final Set<String> all_identifiers = new HashSet<String>();
597 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
598 final String species_id = gwcd.getSpecies().getSpeciesId();
599 if ( all_identifiers.contains( species_id ) ) {
600 throw new IllegalArgumentException( "species [" + species_id + "] is not unique" );
602 all_identifiers.add( species_id );
603 matrix.setIdentifier( identifier_index, species_id );
604 final Set<String> all_second_per_gwcd = new HashSet<String>();
605 for( final DomainId domain : gwcd.getAllDomainIds() ) {
606 if ( domain_id_to_second_features_map.containsKey( domain ) ) {
607 all_second_per_gwcd.addAll( domain_id_to_second_features_map.get( domain ) );
610 for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
611 if ( all_second_per_gwcd.contains( matrix.getCharacter( ci ) ) ) {
612 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
615 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
624 public static CharacterStateMatrix<BinaryStates> createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
625 final Map<DomainId, String> domain_id_to_second_features_map) {
626 if ( gwcd_list.isEmpty() ) {
627 throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
629 if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) {
630 throw new IllegalArgumentException( "domain id to secondary features map is null or empty" );
632 final int number_of_identifiers = gwcd_list.size();
633 final SortedSet<BinaryDomainCombination> all_binary_combinations_mapped = new TreeSet<BinaryDomainCombination>();
634 final Set<BinaryDomainCombination>[] binary_combinations_per_genome_mapped = new HashSet[ number_of_identifiers ];
635 int identifier_index = 0;
637 final SortedSet<String> no_mappings = new TreeSet<String>();
638 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
639 binary_combinations_per_genome_mapped[ identifier_index ] = new HashSet<BinaryDomainCombination>();
640 for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
642 final BinaryDomainCombination mapped_bc = mapBinaryDomainCombination( domain_id_to_second_features_map,
645 all_binary_combinations_mapped.add( mapped_bc );
646 binary_combinations_per_genome_mapped[ identifier_index ].add( mapped_bc );
651 if ( !no_mappings.isEmpty() ) {
652 ForesterUtil.programMessage( surfacing.PRG_NAME, "No mappings for the following (" + no_mappings.size() + "):" );
653 for( final String id : no_mappings ) {
654 ForesterUtil.programMessage( surfacing.PRG_NAME, id);
658 final int number_of_characters = all_binary_combinations_mapped.size();
659 final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
660 number_of_characters );
661 int character_index = 0;
662 for( final BinaryDomainCombination bc : all_binary_combinations_mapped ) {
663 matrix.setCharacter( character_index++, bc.toString() );
665 identifier_index = 0;
666 final Set<String> all_identifiers = new HashSet<String>();
667 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
668 final String species_id = gwcd.getSpecies().getSpeciesId();
669 if ( all_identifiers.contains( species_id ) ) {
670 throw new AssertionError( "species [" + species_id + "] is not unique" );
672 all_identifiers.add( species_id );
673 matrix.setIdentifier( identifier_index, species_id );
674 for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
675 BinaryDomainCombination bc = null;
676 if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) {
677 bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
679 else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) {
680 bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
683 bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
685 if ( binary_combinations_per_genome_mapped[ identifier_index ].contains( bc ) ) {
686 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
689 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
697 private static BinaryDomainCombination mapBinaryDomainCombination( final Map<DomainId, String> domain_id_to_second_features_map,
698 final BinaryDomainCombination bc,
699 final SortedSet<String> no_mappings ) {
703 if ( !domain_id_to_second_features_map.containsKey( bc.getId0() ) ) {
705 no_mappings.add(bc.getId0().getId() );
706 id0 = bc.getId0().getId();
709 id0 = domain_id_to_second_features_map.get( bc.getId0());
711 if ( !domain_id_to_second_features_map.containsKey( bc.getId1() ) ) {
713 no_mappings.add(bc.getId1().getId() );
714 id1 = bc.getId1().getId();
717 id1 = domain_id_to_second_features_map.get( bc.getId1());
720 return new BasicBinaryDomainCombination( id0, id1 );
726 public static CharacterStateMatrix<BinaryStates> createMatrixOfBinaryDomainCombinationPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list ) {
727 if ( gwcd_list.isEmpty() ) {
728 throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
730 final int number_of_identifiers = gwcd_list.size();
731 final SortedSet<BinaryDomainCombination> all_binary_combinations = new TreeSet<BinaryDomainCombination>();
732 final Set<BinaryDomainCombination>[] binary_combinations_per_genome = new HashSet[ number_of_identifiers ];
733 int identifier_index = 0;
734 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
735 binary_combinations_per_genome[ identifier_index ] = new HashSet<BinaryDomainCombination>();
736 for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) {
737 all_binary_combinations.add( bc );
738 binary_combinations_per_genome[ identifier_index ].add( bc );
742 final int number_of_characters = all_binary_combinations.size();
743 final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
744 number_of_characters );
745 int character_index = 0;
746 for( final BinaryDomainCombination bc : all_binary_combinations ) {
747 matrix.setCharacter( character_index++, bc.toString() );
749 identifier_index = 0;
750 final Set<String> all_identifiers = new HashSet<String>();
751 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
752 final String species_id = gwcd.getSpecies().getSpeciesId();
753 if ( all_identifiers.contains( species_id ) ) {
754 throw new AssertionError( "species [" + species_id + "] is not unique" );
756 all_identifiers.add( species_id );
757 matrix.setIdentifier( identifier_index, species_id );
758 for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
759 BinaryDomainCombination bc = null;
760 if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) {
761 bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
763 else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) {
764 bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
767 bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) );
769 if ( binary_combinations_per_genome[ identifier_index ].contains( bc ) ) {
770 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
773 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
781 static CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list ) {
782 return createMatrixOfDomainPresenceOrAbsence( gwcd_list, null );
785 public static CharacterStateMatrix<BinaryStates> createMatrixOfDomainPresenceOrAbsence( final List<GenomeWideCombinableDomains> gwcd_list,
786 final SortedSet<DomainId> positive_filter ) {
787 if ( gwcd_list.isEmpty() ) {
788 throw new IllegalArgumentException( "genome wide combinable domains list is empty" );
790 if ( ( positive_filter != null ) && ( positive_filter.size() < 1 ) ) {
791 throw new IllegalArgumentException( "positive filter is empty" );
793 final int number_of_identifiers = gwcd_list.size();
794 final SortedSet<DomainId> all_domain_ids = new TreeSet<DomainId>();
795 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
796 for( final DomainId domain : gwcd.getAllDomainIds() ) {
797 all_domain_ids.add( domain );
800 int number_of_characters = all_domain_ids.size();
801 if ( positive_filter != null ) {
802 //number_of_characters = positive_filter.size(); -- bad if doms in filter but not in genomes
803 number_of_characters = 0;
804 for( final DomainId id : all_domain_ids ) {
805 if ( positive_filter.contains( id ) ) {
806 number_of_characters++;
810 final CharacterStateMatrix<CharacterStateMatrix.BinaryStates> matrix = new BasicCharacterStateMatrix<CharacterStateMatrix.BinaryStates>( number_of_identifiers,
811 number_of_characters );
812 int character_index = 0;
813 for( final DomainId id : all_domain_ids ) {
814 if ( positive_filter == null ) {
815 matrix.setCharacter( character_index++, id.getId() );
818 if ( positive_filter.contains( id ) ) {
819 matrix.setCharacter( character_index++, id.getId() );
823 int identifier_index = 0;
824 final Set<String> all_identifiers = new HashSet<String>();
825 for( final GenomeWideCombinableDomains gwcd : gwcd_list ) {
826 final String species_id = gwcd.getSpecies().getSpeciesId();
827 if ( all_identifiers.contains( species_id ) ) {
828 throw new IllegalArgumentException( "species [" + species_id + "] is not unique" );
830 all_identifiers.add( species_id );
831 matrix.setIdentifier( identifier_index, species_id );
832 for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) {
833 if ( ForesterUtil.isEmpty( matrix.getCharacter( ci ) ) ) {
834 throw new RuntimeException( "this should not have happened: problem with character #" + ci );
836 final DomainId id = new DomainId( matrix.getCharacter( ci ) );
837 if ( gwcd.contains( id ) ) {
838 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT );
841 matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT );
851 private static int getStateSumDeltaOnNode( final String node_identifier,
852 final CharacterStateMatrix<GainLossStates> gain_loss_matrix,
853 final GainLossStates state ) {
854 if ( gain_loss_matrix == null ) {
855 throw new RuntimeException( "no gain loss matrix has been calculated" );
857 if ( ForesterUtil.isEmpty( node_identifier ) ) {
858 throw new IllegalArgumentException( "node identifier must not be empty" );
860 if ( gain_loss_matrix.isEmpty() ) {
861 throw new RuntimeException( "gain loss matrix is empty" );
864 final int id_index = gain_loss_matrix.getIdentifierIndex( node_identifier );
865 for( int c = 0; c < gain_loss_matrix.getNumberOfCharacters(); ++c ) {
866 if ( gain_loss_matrix.getState( id_index, c ) == state ) {
873 private static SortedSet<String> getUnitsDeltaOnNode( final String node_identifier,
874 final CharacterStateMatrix<GainLossStates> gain_loss_matrix,
875 final GainLossStates state ) {
876 if ( gain_loss_matrix == null ) {
877 throw new RuntimeException( "no gain loss matrix has been calculated" );
879 if ( ForesterUtil.isEmpty( node_identifier ) ) {
880 throw new IllegalArgumentException( "node identifier must not be empty" );
882 if ( gain_loss_matrix.isEmpty() ) {
883 throw new RuntimeException( "gain loss matrix is empty" );
885 final SortedSet<String> d = new TreeSet<String>();
886 final int id_index = gain_loss_matrix.getIdentifierIndex( node_identifier );
887 for( int c = 0; c < gain_loss_matrix.getNumberOfCharacters(); ++c ) {
888 if ( gain_loss_matrix.getState( id_index, c ) == state ) {
889 if ( d.contains( gain_loss_matrix.getCharacter( c ) ) ) {
890 throw new AssertionError( "this should not have happended: character ["
891 + gain_loss_matrix.getCharacter( c ) + "] already in set" );
893 d.add( gain_loss_matrix.getCharacter( c ) );