X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fsurfacing%2FDomainParsimonyCalculator.java;h=b20ee0f1dbd5abfcddfc42c0727eddbdf49ad6b1;hb=63685dd0262a506f0bf17cba6be1aa1ada5330f3;hp=e2e0af3566e4fd464e8aad018c5873dec339d24f;hpb=656be28debec520e0e35a8b311114398a40ea366;p=jalview.git diff --git a/forester/java/src/org/forester/surfacing/DomainParsimonyCalculator.java b/forester/java/src/org/forester/surfacing/DomainParsimonyCalculator.java index e2e0af3..b20ee0f 100644 --- a/forester/java/src/org/forester/surfacing/DomainParsimonyCalculator.java +++ b/forester/java/src/org/forester/surfacing/DomainParsimonyCalculator.java @@ -49,23 +49,22 @@ import org.forester.phylogeny.data.BinaryCharacters; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.protein.BinaryDomainCombination; import org.forester.protein.BinaryDomainCombination.DomainCombinationType; -import org.forester.protein.DomainId; import org.forester.species.Species; import org.forester.util.ForesterUtil; public final class DomainParsimonyCalculator { private static final String TYPE_FORBINARY_CHARACTERS = "parsimony inferred"; - private CharacterStateMatrix _gain_loss_matrix; private CharacterStateMatrix _binary_internal_states_matrix; + private int _cost; + private Map> _domain_id_to_secondary_features_map; + private CharacterStateMatrix _gain_loss_matrix; private final List _gwcd_list; private final Phylogeny _phylogeny; - private int _total_losses; + private SortedSet _positive_filter; private int _total_gains; + private int _total_losses; private int _total_unchanged; - private int _cost; - private Map> _domain_id_to_secondary_features_map; - private SortedSet _positive_filter; private DomainParsimonyCalculator( final Phylogeny phylogeny ) { init(); @@ -81,77 +80,13 @@ public final class DomainParsimonyCalculator { private DomainParsimonyCalculator( final Phylogeny phylogeny, final List gwcd_list, - final Map> domain_id_to_secondary_features_map ) { + final Map> domain_id_to_secondary_features_map ) { init(); _phylogeny = phylogeny; _gwcd_list = gwcd_list; setDomainIdToSecondaryFeaturesMap( domain_id_to_secondary_features_map ); } - int calculateNumberOfBinaryDomainCombination() { - if ( getGenomeWideCombinableDomainsList().isEmpty() ) { - throw new IllegalArgumentException( "genome wide combinable domains list is empty" ); - } - final Set all_binary_combinations = new HashSet(); - for( final GenomeWideCombinableDomains gwcd : getGenomeWideCombinableDomainsList() ) { - for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) { - all_binary_combinations.add( bc ); - } - } - return all_binary_combinations.size(); - } - - CharacterStateMatrix createMatrixOfBinaryDomainCombinationPresenceOrAbsence() { - return createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() ); - } - - CharacterStateMatrix createMatrixOfDomainPresenceOrAbsence() { - return createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList(), getPositiveFilter() ); - } - - CharacterStateMatrix createMatrixOfSecondaryFeaturePresenceOrAbsence( final Map mapping_results_map ) { - return createMatrixOfSecondaryFeaturePresenceOrAbsence( getGenomeWideCombinableDomainsList(), - getDomainIdToSecondaryFeaturesMap(), - mapping_results_map ); - } - - Phylogeny decoratePhylogenyWithDomains( final Phylogeny phylogeny ) { - for( final PhylogenyNodeIterator it = phylogeny.iteratorPostorder(); it.hasNext(); ) { - final PhylogenyNode node = it.next(); - final String node_identifier = node.getName(); - final BinaryCharacters bc = new BinaryCharacters( getUnitsOnNode( node_identifier ), - getUnitsGainedOnNode( node_identifier ), - getUnitsLostOnNode( node_identifier ), - TYPE_FORBINARY_CHARACTERS, - getSumOfPresentOnNode( node_identifier ), - getSumOfGainsOnNode( node_identifier ), - getSumOfLossesOnNode( node_identifier ) ); - node.getNodeData().setBinaryCharacters( bc ); - } - return phylogeny; - } - - private void executeDolloParsimony( final boolean on_domain_presence ) { - reset(); - final DolloParsimony dollo = DolloParsimony.createInstance(); - dollo.setReturnGainLossMatrix( true ); - dollo.setReturnInternalStates( true ); - CharacterStateMatrix states = null; - if ( on_domain_presence ) { - states = createMatrixOfDomainPresenceOrAbsence(); - } - else { - states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence(); - } - dollo.execute( getPhylogeny(), states ); - setGainLossMatrix( dollo.getGainLossMatrix() ); - setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() ); - setCost( dollo.getCost() ); - setTotalGains( dollo.getTotalGains() ); - setTotalLosses( dollo.getTotalLosses() ); - setTotalUnchanged( dollo.getTotalUnchanged() ); - } - public void executeDolloParsimonyOnBinaryDomainCombintionPresence() { executeDolloParsimony( false ); } @@ -160,7 +95,7 @@ public final class DomainParsimonyCalculator { executeDolloParsimony( true ); } - public void executeDolloParsimonyOnDomainPresence( final SortedSet positive_filter ) { + public void executeDolloParsimonyOnDomainPresence( final SortedSet positive_filter ) { setPositiveFilter( positive_filter ); executeDolloParsimony( true ); setPositiveFilter( null ); @@ -184,87 +119,18 @@ public final class DomainParsimonyCalculator { setTotalUnchanged( dollo.getTotalUnchanged() ); } - private void executeFitchParsimony( final boolean on_domain_presence, - final boolean use_last, - final boolean randomize, - final long random_number_seed ) { - reset(); - if ( use_last ) { - System.out.println( " Fitch parsimony: use_last = true" ); - } - final FitchParsimony fitch = new FitchParsimony(); - fitch.setRandomize( randomize ); - if ( randomize ) { - fitch.setRandomNumberSeed( random_number_seed ); - } - fitch.setUseLast( use_last ); - fitch.setReturnGainLossMatrix( true ); - fitch.setReturnInternalStates( true ); - CharacterStateMatrix states = null; - if ( on_domain_presence ) { - states = createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList() ); - } - else { - states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() ); - } - fitch.execute( getPhylogeny(), states ); - setGainLossMatrix( fitch.getGainLossMatrix() ); - setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() ); - setCost( fitch.getCost() ); - setTotalGains( fitch.getTotalGains() ); - setTotalLosses( fitch.getTotalLosses() ); - setTotalUnchanged( fitch.getTotalUnchanged() ); - } - - private void executeFitchParsimonyOnSecondaryFeatures( final boolean use_last, - final boolean randomize, - final long random_number_seed ) { - reset(); - if ( use_last ) { - System.out.println( " Fitch parsimony: use_last = true" ); - } - final FitchParsimony fitch = new FitchParsimony(); - fitch.setRandomize( randomize ); - if ( randomize ) { - fitch.setRandomNumberSeed( random_number_seed ); - } - fitch.setUseLast( use_last ); - fitch.setReturnGainLossMatrix( true ); - fitch.setReturnInternalStates( true ); - final Map> map = getDomainIdToSecondaryFeaturesMap(); - final Map newmap = new HashMap(); - final Iterator>> it = map.entrySet().iterator(); - while ( it.hasNext() ) { - final Map.Entry> pair = it.next(); - if ( pair.getValue().size() != 1 ) { - throw new IllegalArgumentException( pair.getKey().getId() + " mapps to " + pair.getValue().size() - + " items" ); - } - newmap.put( pair.getKey(), ( String ) pair.getValue().toArray()[ 0 ] ); - } - final CharacterStateMatrix states = createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList(), - newmap ); - fitch.execute( getPhylogeny(), states ); - setGainLossMatrix( fitch.getGainLossMatrix() ); - setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() ); - setCost( fitch.getCost() ); - setTotalGains( fitch.getTotalGains() ); - setTotalLosses( fitch.getTotalLosses() ); - setTotalUnchanged( fitch.getTotalUnchanged() ); - } - public void executeFitchParsimonyOnBinaryDomainCombintion( final boolean use_last ) { executeFitchParsimony( false, use_last, false, 0 ); } - public void executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( final boolean use_last ) { - executeFitchParsimonyOnSecondaryFeatures( use_last, false, 0 ); - } - public void executeFitchParsimonyOnBinaryDomainCombintion( final long random_number_seed ) { executeFitchParsimony( false, false, true, random_number_seed ); } + public void executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( final boolean use_last ) { + executeFitchParsimonyOnSecondaryFeatures( use_last, false, 0 ); + } + public void executeFitchParsimonyOnDomainPresence( final boolean use_last ) { executeFitchParsimony( true, use_last, false, 0 ); } @@ -283,9 +149,9 @@ public final class DomainParsimonyCalculator { throw new IllegalArgumentException( "binary states matrix number of identifiers is not equal to the number of tree nodes provided" ); } final CharacterStateMatrix gl_matrix = new BasicCharacterStateMatrix( binary_states_matrix - .getNumberOfIdentifiers(), - binary_states_matrix - .getNumberOfCharacters() ); + .getNumberOfIdentifiers(), + binary_states_matrix + .getNumberOfCharacters() ); int total_gains = 0; int total_losses = 0; int total_unchanged = 0; @@ -300,7 +166,7 @@ public final class DomainParsimonyCalculator { final PhylogenyNode node = it.next(); final String name = node.getName(); final BinaryStates bin_state = binary_states_matrix.getState( binary_states_matrix - .getIdentifierIndex( name ), c ); + .getIdentifierIndex( name ), c ); final PhylogenyNode parent_node = getPhylogeny().getNode( name ).getParent(); GainLossStates gl_state = null; if ( node.isRoot() ) { @@ -314,7 +180,7 @@ public final class DomainParsimonyCalculator { } else { final BinaryStates parent_bin_state = binary_states_matrix.getState( binary_states_matrix - .getIdentifierIndex( parent_node.getName() ), c ); + .getIdentifierIndex( parent_node.getName() ), c ); if ( bin_state == BinaryStates.ABSENT ) { if ( parent_bin_state == BinaryStates.ABSENT ) { ++total_unchanged; @@ -350,10 +216,6 @@ public final class DomainParsimonyCalculator { return _cost; } - private Map> getDomainIdToSecondaryFeaturesMap() { - return _domain_id_to_secondary_features_map; - } - public CharacterStateMatrix getGainLossCountsMatrix() { final CharacterStateMatrix matrix = new BasicCharacterStateMatrix( getGainLossMatrix() .getNumberOfIdentifiers(), 3 ); @@ -386,10 +248,6 @@ public final class DomainParsimonyCalculator { return _gain_loss_matrix; } - private List getGenomeWideCombinableDomainsList() { - return _gwcd_list; - } - public CharacterStateMatrix getInternalStatesMatrix() { return _binary_internal_states_matrix; } @@ -411,14 +269,6 @@ public final class DomainParsimonyCalculator { return net; } - private Phylogeny getPhylogeny() { - return _phylogeny; - } - - private SortedSet getPositiveFilter() { - return _positive_filter; - } - public int getSumOfGainsOnNode( final String node_identifier ) { return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN ); } @@ -431,6 +281,75 @@ public final class DomainParsimonyCalculator { return getSumOfGainsOnNode( node_identifier ) + getSumOfUnchangedPresentOnNode( node_identifier ); } + public int getTotalGains() { + return _total_gains; + } + + public int getTotalLosses() { + return _total_losses; + } + + public int getTotalUnchanged() { + return _total_unchanged; + } + + public SortedSet getUnitsGainedOnNode( final String node_identifier ) { + return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN ); + } + + public SortedSet getUnitsLostOnNode( final String node_identifier ) { + return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.LOSS ); + } + + public SortedSet getUnitsOnNode( final String node_identifier ) { + final SortedSet present = getUnitsGainedOnNode( node_identifier ); + present.addAll( getUnitsUnchangedPresentOnNode( node_identifier ) ); + return present; + } + + int calculateNumberOfBinaryDomainCombination() { + if ( getGenomeWideCombinableDomainsList().isEmpty() ) { + throw new IllegalArgumentException( "genome wide combinable domains list is empty" ); + } + final Set all_binary_combinations = new HashSet(); + for( final GenomeWideCombinableDomains gwcd : getGenomeWideCombinableDomainsList() ) { + for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) { + all_binary_combinations.add( bc ); + } + } + return all_binary_combinations.size(); + } + + CharacterStateMatrix createMatrixOfBinaryDomainCombinationPresenceOrAbsence() { + return createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() ); + } + + CharacterStateMatrix createMatrixOfDomainPresenceOrAbsence() { + return createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList(), getPositiveFilter() ); + } + + CharacterStateMatrix createMatrixOfSecondaryFeaturePresenceOrAbsence( final Map mapping_results_map ) { + return createMatrixOfSecondaryFeaturePresenceOrAbsence( getGenomeWideCombinableDomainsList(), + getDomainIdToSecondaryFeaturesMap(), + mapping_results_map ); + } + + Phylogeny decoratePhylogenyWithDomains( final Phylogeny phylogeny ) { + for( final PhylogenyNodeIterator it = phylogeny.iteratorPostorder(); it.hasNext(); ) { + final PhylogenyNode node = it.next(); + final String node_identifier = node.getName(); + final BinaryCharacters bc = new BinaryCharacters( getUnitsOnNode( node_identifier ), + getUnitsGainedOnNode( node_identifier ), + getUnitsLostOnNode( node_identifier ), + TYPE_FORBINARY_CHARACTERS, + getSumOfPresentOnNode( node_identifier ), + getSumOfGainsOnNode( node_identifier ), + getSumOfLossesOnNode( node_identifier ) ); + node.getNodeData().setBinaryCharacters( bc ); + } + return phylogeny; + } + int getSumOfUnchangedAbsentOnNode( final String node_identifier ) { return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT ); } @@ -443,38 +362,117 @@ public final class DomainParsimonyCalculator { return getStateSumDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT ); } - public int getTotalGains() { - return _total_gains; + SortedSet getUnitsUnchangedAbsentOnNode( final String node_identifier ) { + return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT ); } - public int getTotalLosses() { - return _total_losses; + SortedSet getUnitsUnchangedPresentOnNode( final String node_identifier ) { + return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT ); } - public int getTotalUnchanged() { - return _total_unchanged; + private void executeDolloParsimony( final boolean on_domain_presence ) { + reset(); + final DolloParsimony dollo = DolloParsimony.createInstance(); + dollo.setReturnGainLossMatrix( true ); + dollo.setReturnInternalStates( true ); + CharacterStateMatrix states = null; + if ( on_domain_presence ) { + states = createMatrixOfDomainPresenceOrAbsence(); + } + else { + states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence(); + } + dollo.execute( getPhylogeny(), states ); + setGainLossMatrix( dollo.getGainLossMatrix() ); + setBinaryInternalStatesMatrix( dollo.getInternalStatesMatrix() ); + setCost( dollo.getCost() ); + setTotalGains( dollo.getTotalGains() ); + setTotalLosses( dollo.getTotalLosses() ); + setTotalUnchanged( dollo.getTotalUnchanged() ); } - public SortedSet getUnitsGainedOnNode( final String node_identifier ) { - return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.GAIN ); + private void executeFitchParsimony( final boolean on_domain_presence, + final boolean use_last, + final boolean randomize, + final long random_number_seed ) { + reset(); + if ( use_last ) { + System.out.println( " Fitch parsimony: use_last = true" ); + } + final FitchParsimony fitch = new FitchParsimony(); + fitch.setRandomize( randomize ); + if ( randomize ) { + fitch.setRandomNumberSeed( random_number_seed ); + } + fitch.setUseLast( use_last ); + fitch.setReturnGainLossMatrix( true ); + fitch.setReturnInternalStates( true ); + CharacterStateMatrix states = null; + if ( on_domain_presence ) { + states = createMatrixOfDomainPresenceOrAbsence( getGenomeWideCombinableDomainsList() ); + } + else { + states = createMatrixOfBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList() ); + } + fitch.execute( getPhylogeny(), states, true ); + setGainLossMatrix( fitch.getGainLossMatrix() ); + setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() ); + setCost( fitch.getCost() ); + setTotalGains( fitch.getTotalGains() ); + setTotalLosses( fitch.getTotalLosses() ); + setTotalUnchanged( fitch.getTotalUnchanged() ); + } + + private void executeFitchParsimonyOnSecondaryFeatures( final boolean use_last, + final boolean randomize, + final long random_number_seed ) { + reset(); + if ( use_last ) { + System.out.println( " Fitch parsimony: use_last = true" ); + } + final FitchParsimony fitch = new FitchParsimony(); + fitch.setRandomize( randomize ); + if ( randomize ) { + fitch.setRandomNumberSeed( random_number_seed ); + } + fitch.setUseLast( use_last ); + fitch.setReturnGainLossMatrix( true ); + fitch.setReturnInternalStates( true ); + final Map> map = getDomainIdToSecondaryFeaturesMap(); + final Map newmap = new HashMap(); + final Iterator>> it = map.entrySet().iterator(); + while ( it.hasNext() ) { + final Map.Entry> pair = it.next(); + if ( pair.getValue().size() != 1 ) { + throw new IllegalArgumentException( pair.getKey() + " mapps to " + pair.getValue().size() + " items" ); + } + newmap.put( pair.getKey(), ( String ) pair.getValue().toArray()[ 0 ] ); + } + final CharacterStateMatrix states = createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList(), + newmap ); + fitch.execute( getPhylogeny(), states, true ); + setGainLossMatrix( fitch.getGainLossMatrix() ); + setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() ); + setCost( fitch.getCost() ); + setTotalGains( fitch.getTotalGains() ); + setTotalLosses( fitch.getTotalLosses() ); + setTotalUnchanged( fitch.getTotalUnchanged() ); } - public SortedSet getUnitsLostOnNode( final String node_identifier ) { - return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.LOSS ); + private Map> getDomainIdToSecondaryFeaturesMap() { + return _domain_id_to_secondary_features_map; } - public SortedSet getUnitsOnNode( final String node_identifier ) { - final SortedSet present = getUnitsGainedOnNode( node_identifier ); - present.addAll( getUnitsUnchangedPresentOnNode( node_identifier ) ); - return present; + private List getGenomeWideCombinableDomainsList() { + return _gwcd_list; } - SortedSet getUnitsUnchangedAbsentOnNode( final String node_identifier ) { - return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_ABSENT ); + private Phylogeny getPhylogeny() { + return _phylogeny; } - SortedSet getUnitsUnchangedPresentOnNode( final String node_identifier ) { - return getUnitsDeltaOnNode( node_identifier, getGainLossMatrix(), GainLossStates.UNCHANGED_PRESENT ); + private SortedSet getPositiveFilter() { + return _positive_filter; } private void init() { @@ -500,7 +498,7 @@ public final class DomainParsimonyCalculator { _cost = cost; } - private void setDomainIdToSecondaryFeaturesMap( final Map> domain_id_to_secondary_features_map ) { + private void setDomainIdToSecondaryFeaturesMap( final Map> domain_id_to_secondary_features_map ) { _domain_id_to_secondary_features_map = domain_id_to_secondary_features_map; } @@ -508,7 +506,7 @@ public final class DomainParsimonyCalculator { _gain_loss_matrix = gain_loss_matrix; } - private void setPositiveFilter( final SortedSet positive_filter ) { + private void setPositiveFilter( final SortedSet positive_filter ) { _positive_filter = positive_filter; } @@ -532,82 +530,64 @@ public final class DomainParsimonyCalculator { final List gwcd_list ) { if ( phylogeny.getNumberOfExternalNodes() != gwcd_list.size() ) { throw new IllegalArgumentException( "number of external nodes [" + phylogeny.getNumberOfExternalNodes() - + "] does not equal size of genome wide combinable domains list [" + gwcd_list.size() + "]" ); + + "] does not equal size of genome wide combinable domains list [" + gwcd_list.size() + "]" ); } return new DomainParsimonyCalculator( phylogeny, gwcd_list ); } public static DomainParsimonyCalculator createInstance( final Phylogeny phylogeny, final List gwcd_list, - final Map> domain_id_to_secondary_features_map ) { + final Map> domain_id_to_secondary_features_map ) { if ( phylogeny.getNumberOfExternalNodes() != gwcd_list.size() ) { throw new IllegalArgumentException( "size of external nodes does not equal size of genome wide combinable domains list" ); } return new DomainParsimonyCalculator( phylogeny, gwcd_list, domain_id_to_secondary_features_map ); } - - /** - * For folds instead of Pfam-domains, for example - * - * - * @param gwcd_list - * @return - */ - static CharacterStateMatrix createMatrixOfSecondaryFeaturePresenceOrAbsence( final List gwcd_list, - final Map> domain_id_to_second_features_map, - final Map mapping_results_map ) { + @SuppressWarnings("unchecked") + public static CharacterStateMatrix createMatrixOfBinaryDomainCombinationPresenceOrAbsence( final List gwcd_list ) { if ( gwcd_list.isEmpty() ) { throw new IllegalArgumentException( "genome wide combinable domains list is empty" ); } - if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) { - throw new IllegalArgumentException( "domain id to secondary features map is null or empty" ); - } final int number_of_identifiers = gwcd_list.size(); - final SortedSet all_secondary_features = new TreeSet(); + final SortedSet all_binary_combinations = new TreeSet(); + final Set[] binary_combinations_per_genome = new HashSet[ number_of_identifiers ]; + int identifier_index = 0; for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { - int mapped = 0; - int not_mapped = 0; - for( final DomainId domain : gwcd.getAllDomainIds() ) { - if ( domain_id_to_second_features_map.containsKey( domain ) ) { - all_secondary_features.addAll( domain_id_to_second_features_map.get( domain ) ); - mapped++; - } - else { - not_mapped++; - } - } - if ( mapping_results_map != null ) { - final MappingResults mr = new MappingResults(); - mr.setDescription( gwcd.getSpecies().getSpeciesId() ); - mr.setSumOfSuccesses( mapped ); - mr.setSumOfFailures( not_mapped ); - mapping_results_map.put( gwcd.getSpecies(), mr ); + binary_combinations_per_genome[ identifier_index ] = new HashSet(); + for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) { + all_binary_combinations.add( bc ); + binary_combinations_per_genome[ identifier_index ].add( bc ); } + ++identifier_index; } - final int number_of_characters = all_secondary_features.size(); + final int number_of_characters = all_binary_combinations.size(); final CharacterStateMatrix matrix = new BasicCharacterStateMatrix( number_of_identifiers, - number_of_characters ); + number_of_characters ); int character_index = 0; - for( final String second_id : all_secondary_features ) { - matrix.setCharacter( character_index++, second_id ); + for( final BinaryDomainCombination bc : all_binary_combinations ) { + matrix.setCharacter( character_index++, bc.toString() ); } - int identifier_index = 0; + identifier_index = 0; final Set all_identifiers = new HashSet(); for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { final String species_id = gwcd.getSpecies().getSpeciesId(); if ( all_identifiers.contains( species_id ) ) { - throw new IllegalArgumentException( "species [" + species_id + "] is not unique" ); + throw new AssertionError( "species [" + species_id + "] is not unique" ); } all_identifiers.add( species_id ); matrix.setIdentifier( identifier_index, species_id ); - final Set all_second_per_gwcd = new HashSet(); - for( final DomainId domain : gwcd.getAllDomainIds() ) { - if ( domain_id_to_second_features_map.containsKey( domain ) ) { - all_second_per_gwcd.addAll( domain_id_to_second_features_map.get( domain ) ); - } - } for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) { - if ( all_second_per_gwcd.contains( matrix.getCharacter( ci ) ) ) { + BinaryDomainCombination bc = null; + if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) { + bc = AdjactantDirectedBinaryDomainCombination.obtainInstance( matrix.getCharacter( ci ) ); + } + else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) { + bc = DirectedBinaryDomainCombination.obtainInstance( matrix.getCharacter( ci ) ); + } + else { + bc = BasicBinaryDomainCombination.obtainInstance( matrix.getCharacter( ci ) ); + } + if ( binary_combinations_per_genome[ identifier_index ].contains( bc ) ) { matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT ); } else { @@ -619,65 +599,59 @@ public final class DomainParsimonyCalculator { return matrix; } - public static CharacterStateMatrix createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( final List gwcd_list, - final Map domain_id_to_second_features_map ) { + public static CharacterStateMatrix createMatrixOfDomainPresenceOrAbsence( final List gwcd_list, + final SortedSet positive_filter ) { if ( gwcd_list.isEmpty() ) { throw new IllegalArgumentException( "genome wide combinable domains list is empty" ); } - if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) { - throw new IllegalArgumentException( "domain id to secondary features map is null or empty" ); + if ( ( positive_filter != null ) && ( positive_filter.size() < 1 ) ) { + throw new IllegalArgumentException( "positive filter is empty" ); } final int number_of_identifiers = gwcd_list.size(); - final SortedSet all_binary_combinations_mapped = new TreeSet(); - final Set[] binary_combinations_per_genome_mapped = new HashSet[ number_of_identifiers ]; - int identifier_index = 0; - final SortedSet no_mappings = new TreeSet(); + final SortedSet all_domain_ids = new TreeSet(); for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { - binary_combinations_per_genome_mapped[ identifier_index ] = new HashSet(); - for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) { - final BinaryDomainCombination mapped_bc = mapBinaryDomainCombination( domain_id_to_second_features_map, - bc, - no_mappings ); - all_binary_combinations_mapped.add( mapped_bc ); - binary_combinations_per_genome_mapped[ identifier_index ].add( mapped_bc ); + for( final String domain : gwcd.getAllDomainIds() ) { + all_domain_ids.add( domain ); } - ++identifier_index; } - if ( !no_mappings.isEmpty() ) { - ForesterUtil.programMessage( surfacing.PRG_NAME, "No mappings for the following (" + no_mappings.size() - + "):" ); - for( final String id : no_mappings ) { - ForesterUtil.programMessage( surfacing.PRG_NAME, id ); + int number_of_characters = all_domain_ids.size(); + if ( positive_filter != null ) { + //number_of_characters = positive_filter.size(); -- bad if doms in filter but not in genomes + number_of_characters = 0; + for( final String id : all_domain_ids ) { + if ( positive_filter.contains( id ) ) { + number_of_characters++; + } } } - final int number_of_characters = all_binary_combinations_mapped.size(); final CharacterStateMatrix matrix = new BasicCharacterStateMatrix( number_of_identifiers, - number_of_characters ); + number_of_characters ); int character_index = 0; - for( final BinaryDomainCombination bc : all_binary_combinations_mapped ) { - matrix.setCharacter( character_index++, bc.toString() ); + for( final String id : all_domain_ids ) { + if ( positive_filter == null ) { + matrix.setCharacter( character_index++, id ); + } + else { + if ( positive_filter.contains( id ) ) { + matrix.setCharacter( character_index++, id ); + } + } } - identifier_index = 0; + int identifier_index = 0; final Set all_identifiers = new HashSet(); for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { final String species_id = gwcd.getSpecies().getSpeciesId(); if ( all_identifiers.contains( species_id ) ) { - throw new AssertionError( "species [" + species_id + "] is not unique" ); + throw new IllegalArgumentException( "species [" + species_id + "] is not unique" ); } all_identifiers.add( species_id ); matrix.setIdentifier( identifier_index, species_id ); for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) { - BinaryDomainCombination bc = null; - if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) { - bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) ); - } - else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) { - bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) ); - } - else { - bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) ); + if ( ForesterUtil.isEmpty( matrix.getCharacter( ci ) ) ) { + throw new RuntimeException( "this should not have happened: problem with character #" + ci ); } - if ( binary_combinations_per_genome_mapped[ identifier_index ].contains( bc ) ) { + final String id = matrix.getCharacter( ci ); + if ( gwcd.contains( id ) ) { matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT ); } else { @@ -688,50 +662,43 @@ public final class DomainParsimonyCalculator { } return matrix; } - - private static BinaryDomainCombination mapBinaryDomainCombination( final Map domain_id_to_second_features_map, - final BinaryDomainCombination bc, - final SortedSet no_mappings ) { - String id0 = ""; - String id1 = ""; - if ( !domain_id_to_second_features_map.containsKey( bc.getId0() ) ) { - no_mappings.add( bc.getId0().getId() ); - id0 = bc.getId0().getId(); - } - else { - id0 = domain_id_to_second_features_map.get( bc.getId0() ); - } - if ( !domain_id_to_second_features_map.containsKey( bc.getId1() ) ) { - no_mappings.add( bc.getId1().getId() ); - id1 = bc.getId1().getId(); - } - else { - id1 = domain_id_to_second_features_map.get( bc.getId1() ); - } - return new BasicBinaryDomainCombination( id0, id1 ); - } - - public static CharacterStateMatrix createMatrixOfBinaryDomainCombinationPresenceOrAbsence( final List gwcd_list ) { + @SuppressWarnings("unchecked") + public static CharacterStateMatrix createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( final List gwcd_list, + final Map domain_id_to_second_features_map ) { if ( gwcd_list.isEmpty() ) { throw new IllegalArgumentException( "genome wide combinable domains list is empty" ); } + if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) { + throw new IllegalArgumentException( "domain id to secondary features map is null or empty" ); + } final int number_of_identifiers = gwcd_list.size(); - final SortedSet all_binary_combinations = new TreeSet(); - final Set[] binary_combinations_per_genome = new HashSet[ number_of_identifiers ]; + final SortedSet all_binary_combinations_mapped = new TreeSet(); + final Set[] binary_combinations_per_genome_mapped = new HashSet[ number_of_identifiers ]; int identifier_index = 0; + final SortedSet no_mappings = new TreeSet(); for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { - binary_combinations_per_genome[ identifier_index ] = new HashSet(); + binary_combinations_per_genome_mapped[ identifier_index ] = new HashSet(); for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) { - all_binary_combinations.add( bc ); - binary_combinations_per_genome[ identifier_index ].add( bc ); + final BinaryDomainCombination mapped_bc = mapBinaryDomainCombination( domain_id_to_second_features_map, + bc, + no_mappings ); + all_binary_combinations_mapped.add( mapped_bc ); + binary_combinations_per_genome_mapped[ identifier_index ].add( mapped_bc ); } ++identifier_index; } - final int number_of_characters = all_binary_combinations.size(); + if ( !no_mappings.isEmpty() ) { + ForesterUtil.programMessage( surfacing.PRG_NAME, "No mappings for the following (" + no_mappings.size() + + "):" ); + for( final String id : no_mappings ) { + ForesterUtil.programMessage( surfacing.PRG_NAME, id ); + } + } + final int number_of_characters = all_binary_combinations_mapped.size(); final CharacterStateMatrix matrix = new BasicCharacterStateMatrix( number_of_identifiers, - number_of_characters ); + number_of_characters ); int character_index = 0; - for( final BinaryDomainCombination bc : all_binary_combinations ) { + for( final BinaryDomainCombination bc : all_binary_combinations_mapped ) { matrix.setCharacter( character_index++, bc.toString() ); } identifier_index = 0; @@ -746,15 +713,15 @@ public final class DomainParsimonyCalculator { for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) { BinaryDomainCombination bc = null; if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) { - bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) ); + bc = AdjactantDirectedBinaryDomainCombination.obtainInstance( matrix.getCharacter( ci ) ); } else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) { - bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) ); + bc = DirectedBinaryDomainCombination.obtainInstance( matrix.getCharacter( ci ) ); } else { - bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) ); + bc = BasicBinaryDomainCombination.obtainInstance( matrix.getCharacter( ci ) ); } - if ( binary_combinations_per_genome[ identifier_index ].contains( bc ) ) { + if ( binary_combinations_per_genome_mapped[ identifier_index ].contains( bc ) ) { matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT ); } else { @@ -770,43 +737,50 @@ public final class DomainParsimonyCalculator { return createMatrixOfDomainPresenceOrAbsence( gwcd_list, null ); } - public static CharacterStateMatrix createMatrixOfDomainPresenceOrAbsence( final List gwcd_list, - final SortedSet positive_filter ) { + /** + * For folds instead of Pfam-domains, for example + * + * + * @param gwcd_list + * @return + */ + static CharacterStateMatrix createMatrixOfSecondaryFeaturePresenceOrAbsence( final List gwcd_list, + final Map> domain_id_to_second_features_map, + final Map mapping_results_map ) { if ( gwcd_list.isEmpty() ) { throw new IllegalArgumentException( "genome wide combinable domains list is empty" ); } - if ( ( positive_filter != null ) && ( positive_filter.size() < 1 ) ) { - throw new IllegalArgumentException( "positive filter is empty" ); + if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) { + throw new IllegalArgumentException( "domain id to secondary features map is null or empty" ); } final int number_of_identifiers = gwcd_list.size(); - final SortedSet all_domain_ids = new TreeSet(); + final SortedSet all_secondary_features = new TreeSet(); for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { - for( final DomainId domain : gwcd.getAllDomainIds() ) { - all_domain_ids.add( domain ); - } - } - int number_of_characters = all_domain_ids.size(); - if ( positive_filter != null ) { - //number_of_characters = positive_filter.size(); -- bad if doms in filter but not in genomes - number_of_characters = 0; - for( final DomainId id : all_domain_ids ) { - if ( positive_filter.contains( id ) ) { - number_of_characters++; + int mapped = 0; + int not_mapped = 0; + for( final String domain : gwcd.getAllDomainIds() ) { + if ( domain_id_to_second_features_map.containsKey( domain ) ) { + all_secondary_features.addAll( domain_id_to_second_features_map.get( domain ) ); + mapped++; + } + else { + not_mapped++; } } + if ( mapping_results_map != null ) { + final MappingResults mr = new MappingResults(); + mr.setDescription( gwcd.getSpecies().getSpeciesId() ); + mr.setSumOfSuccesses( mapped ); + mr.setSumOfFailures( not_mapped ); + mapping_results_map.put( gwcd.getSpecies(), mr ); + } } + final int number_of_characters = all_secondary_features.size(); final CharacterStateMatrix matrix = new BasicCharacterStateMatrix( number_of_identifiers, - number_of_characters ); + number_of_characters ); int character_index = 0; - for( final DomainId id : all_domain_ids ) { - if ( positive_filter == null ) { - matrix.setCharacter( character_index++, id.getId() ); - } - else { - if ( positive_filter.contains( id ) ) { - matrix.setCharacter( character_index++, id.getId() ); - } - } + for( final String second_id : all_secondary_features ) { + matrix.setCharacter( character_index++, second_id ); } int identifier_index = 0; final Set all_identifiers = new HashSet(); @@ -817,12 +791,14 @@ public final class DomainParsimonyCalculator { } all_identifiers.add( species_id ); matrix.setIdentifier( identifier_index, species_id ); - for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) { - if ( ForesterUtil.isEmpty( matrix.getCharacter( ci ) ) ) { - throw new RuntimeException( "this should not have happened: problem with character #" + ci ); + final Set all_second_per_gwcd = new HashSet(); + for( final String domain : gwcd.getAllDomainIds() ) { + if ( domain_id_to_second_features_map.containsKey( domain ) ) { + all_second_per_gwcd.addAll( domain_id_to_second_features_map.get( domain ) ); } - final DomainId id = new DomainId( matrix.getCharacter( ci ) ); - if ( gwcd.contains( id ) ) { + } + for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) { + if ( all_second_per_gwcd.contains( matrix.getCharacter( ci ) ) ) { matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT ); } else { @@ -881,4 +857,27 @@ public final class DomainParsimonyCalculator { } return d; } + + private static BinaryDomainCombination mapBinaryDomainCombination( final Map domain_id_to_second_features_map, + final BinaryDomainCombination bc, + final SortedSet no_mappings ) { + String id0 = ""; + String id1 = ""; + if ( !domain_id_to_second_features_map.containsKey( bc.getId0() ) ) { + no_mappings.add( bc.getId0() ); + id0 = bc.getId0(); + } + else { + id0 = domain_id_to_second_features_map.get( bc.getId0() ); + } + if ( !domain_id_to_second_features_map.containsKey( bc.getId1() ) ) { + no_mappings.add( bc.getId1() ); + id1 = bc.getId1(); + } + else { + id1 = domain_id_to_second_features_map.get( bc.getId1() ); + } + // return new BasicBinaryDomainCombination( id0, id1 ); + return BasicBinaryDomainCombination.obtainInstance( id0, id1 ); + } }