From: cmzmasek Date: Sun, 22 Apr 2012 05:09:17 +0000 (+0000) Subject: in progress X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=ffc6085cfd61eabca43b39c0423881b6f4fa7706;p=jalview.git in progress --- diff --git a/forester/java/src/org/forester/application/surfacing.java b/forester/java/src/org/forester/application/surfacing.java index 951ce21..62c6981 100644 --- a/forester/java/src/org/forester/application/surfacing.java +++ b/forester/java/src/org/forester/application/surfacing.java @@ -279,6 +279,12 @@ public class surfacing { public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping.txt"; public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_unique.txt"; public static final String LIMIT_SPEC_FOR_PROT_EX = null; // e.g. "HUMAN"; set to null for not using this feature (default). + public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH_MAPPED = "_dc_MAPPED_secondary_features_fitch" + + ForesterConstants.PHYLO_XML_SUFFIX; + public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_counts_MAPPED.txt"; + public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_MAPPED.txt"; + public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_MAPPED.txt"; + public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_unique_MAPPED.txt"; private static void checkWriteabilityForPairwiseComparisons( final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option, final String[][] input_file_properties, diff --git a/forester/java/src/org/forester/surfacing/DomainParsimonyCalculator.java b/forester/java/src/org/forester/surfacing/DomainParsimonyCalculator.java index 2396b90..18e0879 100644 --- a/forester/java/src/org/forester/surfacing/DomainParsimonyCalculator.java +++ b/forester/java/src/org/forester/surfacing/DomainParsimonyCalculator.java @@ -26,9 +26,12 @@ package org.forester.surfacing; +import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; @@ -208,10 +211,54 @@ public final class DomainParsimonyCalculator { setTotalLosses( fitch.getTotalLosses() ); setTotalUnchanged( fitch.getTotalUnchanged() ); } + + private void executeFitchParsimonyOnSecondaryFeatures( + final boolean use_last, + final boolean randomize, + final long random_number_seed ) { + reset(); + if ( use_last ) { + System.out.println( " Fitch parsimony: use_last = true" ); + } + final FitchParsimony fitch = new FitchParsimony(); + fitch.setRandomize( randomize ); + if ( randomize ) { + fitch.setRandomNumberSeed( random_number_seed ); + } + fitch.setUseLast( use_last ); + fitch.setReturnGainLossMatrix( true ); + fitch.setReturnInternalStates( true ); + + final Map> map = getDomainIdToSecondaryFeaturesMap(); + final Map newmap = new HashMap(); + final Iterator>> it = map.entrySet().iterator(); + while (it.hasNext()) { + final Map.Entry> pair = (Map.Entry>)it.next(); + if ( pair.getValue().size() != 1 ) { + throw new IllegalArgumentException( pair.getKey().getId() + " mapps to " + pair.getValue().size() + " items" ); + } + newmap.put( pair.getKey(), ( String ) pair.getValue().toArray()[ 0 ] ); + } + + CharacterStateMatrix states =createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( getGenomeWideCombinableDomainsList(), + newmap ); + + fitch.execute( getPhylogeny(), states ); + setGainLossMatrix( fitch.getGainLossMatrix() ); + setBinaryInternalStatesMatrix( fitch.getInternalStatesMatrix() ); + setCost( fitch.getCost() ); + setTotalGains( fitch.getTotalGains() ); + setTotalLosses( fitch.getTotalLosses() ); + setTotalUnchanged( fitch.getTotalUnchanged() ); + } public void executeFitchParsimonyOnBinaryDomainCombintion( final boolean use_last ) { executeFitchParsimony( false, use_last, false, 0 ); } + + public void executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( final boolean use_last ) { + executeFitchParsimonyOnSecondaryFeatures( use_last, false, 0 ); + } public void executeFitchParsimonyOnBinaryDomainCombintion( final long random_number_seed ) { executeFitchParsimony( false, false, true, random_number_seed ); @@ -498,6 +545,155 @@ public final class DomainParsimonyCalculator { return new DomainParsimonyCalculator( phylogeny, gwcd_list, domain_id_to_secondary_features_map ); } + + /** + * For folds instead of Pfam-domains, for example + * + * + * @param gwcd_list + * @return + */ + static CharacterStateMatrix createMatrixOfSecondaryFeaturePresenceOrAbsence( final List gwcd_list, + final Map> domain_id_to_second_features_map, + final Map mapping_results_map ) { + if ( gwcd_list.isEmpty() ) { + throw new IllegalArgumentException( "genome wide combinable domains list is empty" ); + } + if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) { + throw new IllegalArgumentException( "domain id to secondary features map is null or empty" ); + } + final int number_of_identifiers = gwcd_list.size(); + final SortedSet all_secondary_features = new TreeSet(); + for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + int mapped = 0; + int not_mapped = 0; + for( final DomainId domain : gwcd.getAllDomainIds() ) { + if ( domain_id_to_second_features_map.containsKey( domain ) ) { + all_secondary_features.addAll( domain_id_to_second_features_map.get( domain ) ); + mapped++; + } + else { + not_mapped++; + } + } + if ( mapping_results_map != null ) { + final MappingResults mr = new MappingResults(); + mr.setDescription( gwcd.getSpecies().getSpeciesId() ); + mr.setSumOfSuccesses( mapped ); + mr.setSumOfFailures( not_mapped ); + mapping_results_map.put( gwcd.getSpecies(), mr ); + } + } + final int number_of_characters = all_secondary_features.size(); + final CharacterStateMatrix matrix = new BasicCharacterStateMatrix( number_of_identifiers, + number_of_characters ); + int character_index = 0; + for( final String second_id : all_secondary_features ) { + matrix.setCharacter( character_index++, second_id ); + } + int identifier_index = 0; + final Set all_identifiers = new HashSet(); + for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + final String species_id = gwcd.getSpecies().getSpeciesId(); + if ( all_identifiers.contains( species_id ) ) { + throw new IllegalArgumentException( "species [" + species_id + "] is not unique" ); + } + all_identifiers.add( species_id ); + matrix.setIdentifier( identifier_index, species_id ); + final Set all_second_per_gwcd = new HashSet(); + for( final DomainId domain : gwcd.getAllDomainIds() ) { + if ( domain_id_to_second_features_map.containsKey( domain ) ) { + all_second_per_gwcd.addAll( domain_id_to_second_features_map.get( domain ) ); + } + } + for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) { + if ( all_second_per_gwcd.contains( matrix.getCharacter( ci ) ) ) { + matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT ); + } + else { + matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT ); + } + } + ++identifier_index; + } + return matrix; + } + + + public static CharacterStateMatrix createMatrixOfSecondaryFeatureBinaryDomainCombinationPresenceOrAbsence( final List gwcd_list, + final Map domain_id_to_second_features_map) { + if ( gwcd_list.isEmpty() ) { + throw new IllegalArgumentException( "genome wide combinable domains list is empty" ); + } + if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) { + throw new IllegalArgumentException( "domain id to secondary features map is null or empty" ); + } + final int number_of_identifiers = gwcd_list.size(); + final SortedSet all_binary_combinations_mapped = new TreeSet(); + final Set[] binary_combinations_per_genome_mapped = new HashSet[ number_of_identifiers ]; + int identifier_index = 0; + for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + binary_combinations_per_genome_mapped[ identifier_index ] = new HashSet(); + for( final BinaryDomainCombination bc : gwcd.toBinaryDomainCombinations() ) { + + if ( !domain_id_to_second_features_map.containsKey( bc.getId0() ) ) { + throw new IllegalArgumentException( "no mapping found for " + bc.getId0() ); + } + if ( !domain_id_to_second_features_map.containsKey( bc.getId1() ) ) { + throw new IllegalArgumentException( "no mapping found for " + bc.getId1() ); + } + + + final BinaryDomainCombination mapped_bc = new BasicBinaryDomainCombination( domain_id_to_second_features_map.get( bc.getId0()) , + domain_id_to_second_features_map.get( bc.getId1()) ); + all_binary_combinations_mapped.add( mapped_bc ); + binary_combinations_per_genome_mapped[ identifier_index ].add( mapped_bc ); + } + ++identifier_index; + } + + final int number_of_characters = all_binary_combinations_mapped.size(); + final CharacterStateMatrix matrix = new BasicCharacterStateMatrix( number_of_identifiers, + number_of_characters ); + int character_index = 0; + for( final BinaryDomainCombination bc : all_binary_combinations_mapped ) { + matrix.setCharacter( character_index++, bc.toString() ); + } + identifier_index = 0; + final Set all_identifiers = new HashSet(); + for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + final String species_id = gwcd.getSpecies().getSpeciesId(); + if ( all_identifiers.contains( species_id ) ) { + throw new AssertionError( "species [" + species_id + "] is not unique" ); + } + all_identifiers.add( species_id ); + matrix.setIdentifier( identifier_index, species_id ); + for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) { + BinaryDomainCombination bc = null; + if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED_ADJACTANT ) { + bc = AdjactantDirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) ); + } + else if ( gwcd.getDomainCombinationType() == DomainCombinationType.DIRECTED ) { + bc = DirectedBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) ); + } + else { + bc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( ci ) ); + } + if ( binary_combinations_per_genome_mapped[ identifier_index ].contains( bc ) ) { + matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT ); + } + else { + matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT ); + } + } + ++identifier_index; + } + return matrix; + } + + + + public static CharacterStateMatrix createMatrixOfBinaryDomainCombinationPresenceOrAbsence( final List gwcd_list ) { if ( gwcd_list.isEmpty() ) { throw new IllegalArgumentException( "genome wide combinable domains list is empty" ); @@ -621,78 +817,7 @@ public final class DomainParsimonyCalculator { return matrix; } - /** - * For folds instead of Pfam-domains, for example - * - * - * @param gwcd_list - * @return - */ - static CharacterStateMatrix createMatrixOfSecondaryFeaturePresenceOrAbsence( final List gwcd_list, - final Map> domain_id_to_second_features_map, - final Map mapping_results_map ) { - if ( gwcd_list.isEmpty() ) { - throw new IllegalArgumentException( "genome wide combinable domains list is empty" ); - } - if ( ( domain_id_to_second_features_map == null ) || domain_id_to_second_features_map.isEmpty() ) { - throw new IllegalArgumentException( "domain id to secondary features map is null or empty" ); - } - final int number_of_identifiers = gwcd_list.size(); - final SortedSet all_secondary_features = new TreeSet(); - for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { - int mapped = 0; - int not_mapped = 0; - for( final DomainId domain : gwcd.getAllDomainIds() ) { - if ( domain_id_to_second_features_map.containsKey( domain ) ) { - all_secondary_features.addAll( domain_id_to_second_features_map.get( domain ) ); - mapped++; - } - else { - not_mapped++; - } - } - if ( mapping_results_map != null ) { - final MappingResults mr = new MappingResults(); - mr.setDescription( gwcd.getSpecies().getSpeciesId() ); - mr.setSumOfSuccesses( mapped ); - mr.setSumOfFailures( not_mapped ); - mapping_results_map.put( gwcd.getSpecies(), mr ); - } - } - final int number_of_characters = all_secondary_features.size(); - final CharacterStateMatrix matrix = new BasicCharacterStateMatrix( number_of_identifiers, - number_of_characters ); - int character_index = 0; - for( final String second_id : all_secondary_features ) { - matrix.setCharacter( character_index++, second_id ); - } - int identifier_index = 0; - final Set all_identifiers = new HashSet(); - for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { - final String species_id = gwcd.getSpecies().getSpeciesId(); - if ( all_identifiers.contains( species_id ) ) { - throw new IllegalArgumentException( "species [" + species_id + "] is not unique" ); - } - all_identifiers.add( species_id ); - matrix.setIdentifier( identifier_index, species_id ); - final Set all_second_per_gwcd = new HashSet(); - for( final DomainId domain : gwcd.getAllDomainIds() ) { - if ( domain_id_to_second_features_map.containsKey( domain ) ) { - all_second_per_gwcd.addAll( domain_id_to_second_features_map.get( domain ) ); - } - } - for( int ci = 0; ci < matrix.getNumberOfCharacters(); ++ci ) { - if ( all_second_per_gwcd.contains( matrix.getCharacter( ci ) ) ) { - matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.PRESENT ); - } - else { - matrix.setState( identifier_index, ci, CharacterStateMatrix.BinaryStates.ABSENT ); - } - } - ++identifier_index; - } - return matrix; - } + private static int getStateSumDeltaOnNode( final String node_identifier, final CharacterStateMatrix gain_loss_matrix, diff --git a/forester/java/src/org/forester/surfacing/SurfacingUtil.java b/forester/java/src/org/forester/surfacing/SurfacingUtil.java index f979897..9118dd4 100644 --- a/forester/java/src/org/forester/surfacing/SurfacingUtil.java +++ b/forester/java/src/org/forester/surfacing/SurfacingUtil.java @@ -802,7 +802,7 @@ public final class SurfacingUtil { writeToNexus( outfile_name + surfacing.NEXUS_SECONDARY_FEATURES, secondary_features_parsimony.createMatrixOfSecondaryFeaturePresenceOrAbsence( null ), phylogeny ); - final Phylogeny local_phylogeny_copy = phylogeny.copy(); + Phylogeny local_phylogeny_copy = phylogeny.copy(); secondary_features_parsimony.executeDolloParsimonyOnSecondaryFeatures( mapping_results_map ); SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossMatrix(), outfile_name + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER ); @@ -840,6 +840,35 @@ public final class SurfacingUtil { parameters_str ); SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name + surfacing.SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO ); + + + // FITCH DOMAIN COMBINATIONS + // ------------------------- + local_phylogeny_copy = phylogeny.copy(); + String randomization = "no"; + + secondary_features_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( true ); + + + + + preparePhylogeny( local_phylogeny_copy, + secondary_features_parsimony, + date_time, + "Fitch parsimony on secondary binary domain combination presence/absence randomization: " + + randomization, + "fitch_on_binary_domain_combinations_" + outfile_name, + parameters_str ); + SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name + + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH_MAPPED ); + calculateIndependentDomainCombinationGains( local_phylogeny_copy, outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_MAPPED_OUTPUT_SUFFIX, outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX, outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX, outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX, outfile_name + + "_MAPPED_indep_dc_gains_fitch_lca_ranks.txt", outfile_name + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt" ); + + } public static void extractProteinNames( final List proteins,