From bb7849228e35e5c27c39ba5b53d559f8620be730 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Sat, 5 May 2012 04:16:19 +0000 Subject: [PATCH] in progress --- .../src/org/forester/application/surfacing.java | 16 ++- .../BasicGenomeWideCombinableDomains.java | 31 +++++- .../src/org/forester/surfacing/SurfacingUtil.java | 112 ++++++++++++++++++-- .../src/org/forester/surfacing/TestSurfacing.java | 4 +- 4 files changed, 145 insertions(+), 18 deletions(-) diff --git a/forester/java/src/org/forester/application/surfacing.java b/forester/java/src/org/forester/application/surfacing.java index fc9567d..08dde5e 100644 --- a/forester/java/src/org/forester/application/surfacing.java +++ b/forester/java/src/org/forester/application/surfacing.java @@ -234,8 +234,8 @@ public class surfacing { final static private String INPUT_SPECIES_TREE_OPTION = "species_tree"; final static private String SEQ_EXTRACT_OPTION = "prot_extract"; final static private char SEPARATOR_FOR_INPUT_VALUES = '#'; - final static private String PRG_VERSION = "2.230"; - final static private String PRG_DATE = "2012.04.22"; + final static private String PRG_VERSION = "2.240"; + final static private String PRG_DATE = "2012.05.04"; final static private String E_MAIL = "czmasek@burnham.org"; final static private String WWW = "www.phylosoft.org/forester/applications/surfacing"; final static private boolean IGNORE_DUFS_DEFAULT = true; @@ -358,6 +358,7 @@ public class surfacing { * @param sum_of_all_domains_encountered * @param all_bin_domain_combinations_encountered * @param is_gains_analysis + * @param protein_length_stats_by_dc * @throws IOException */ private static void executeFitchGainsAnalysis( final File output_file, @@ -1412,7 +1413,6 @@ public class surfacing { System.out.println( "Ignore combination with self: " + ignore_combination_with_same ); html_desc.append( "Ignore combination with self for domain combination similarity analyses:" + ignore_combination_with_same + "" + nl ); - ; System.out.println( "Consider directedness : " + ( dc_type != BinaryDomainCombination.DomainCombinationType.BASIC ) ); html_desc.append( "Consider directedness of binary domain combinations:" @@ -1759,6 +1759,8 @@ public class surfacing { catch ( final IOException e3 ) { e3.printStackTrace(); } + final Map protein_length_stats_by_dc = new HashMap(); + final Map domain_number_stats_by_dc = new HashMap(); // Main loop: for( int i = 0; i < number_of_genomes; ++i ) { System.out.println(); @@ -1927,7 +1929,9 @@ public class surfacing { ignore_combination_with_same, new BasicSpecies( input_file_properties[ i ][ 1 ] ), domain_id_to_go_ids_map, - dc_type ) ); + dc_type, + protein_length_stats_by_dc, + domain_number_stats_by_dc ) ); domain_lengths_table.addLengths( protein_list ); if ( gwcd_list.get( i ).getSize() > 0 ) { SurfacingUtil.writeDomainCombinationsCountsFile( input_file_properties, @@ -2255,7 +2259,9 @@ public class surfacing { output_binary_domain_combinationsfor_graph_analysis, all_bin_domain_combinations_gained_fitch, all_bin_domain_combinations_lost_fitch, - dc_type ); + dc_type, + protein_length_stats_by_dc, + domain_number_stats_by_dc ); // Listing of all domain combinations gained is only done if only one input tree is used. if ( ( domain_id_to_secondary_features_maps != null ) && ( domain_id_to_secondary_features_maps.length > 0 ) ) { diff --git a/forester/java/src/org/forester/surfacing/BasicGenomeWideCombinableDomains.java b/forester/java/src/org/forester/surfacing/BasicGenomeWideCombinableDomains.java index 6112bdb..ef98751 100644 --- a/forester/java/src/org/forester/surfacing/BasicGenomeWideCombinableDomains.java +++ b/forester/java/src/org/forester/surfacing/BasicGenomeWideCombinableDomains.java @@ -264,21 +264,25 @@ public class BasicGenomeWideCombinableDomains implements GenomeWideCombinableDom ignore_combination_with_same_domain, species, null, - DomainCombinationType.BASIC ); + DomainCombinationType.BASIC, + null, + null ); } public static BasicGenomeWideCombinableDomains createInstance( final List protein_list, final boolean ignore_combination_with_same_domain, final Species species, final DomainCombinationType dc_type ) { - return createInstance( protein_list, ignore_combination_with_same_domain, species, null, dc_type ); + return createInstance( protein_list, ignore_combination_with_same_domain, species, null, dc_type, null, null ); } public static BasicGenomeWideCombinableDomains createInstance( final List protein_list, final boolean ignore_combination_with_same_domain, final Species species, final Map> domain_id_to_go_ids_map, - final DomainCombinationType dc_type ) { + final DomainCombinationType dc_type, + final Map protein_length_stats_by_dc, + final Map domain_number_stats_by_dc ) { final BasicGenomeWideCombinableDomains instance = new BasicGenomeWideCombinableDomains( species, dc_type ); final Map domain_counts = new HashMap(); final Map domain_protein_counts = new HashMap(); @@ -361,6 +365,27 @@ public class BasicGenomeWideCombinableDomains implements GenomeWideCombinableDom if ( ( dc_type == DomainCombinationType.DIRECTED_ADJACTANT ) && ( closest != null ) ) { domain_combination.addCombinableDomain( closest.getDomainId() ); } + if ( protein_length_stats_by_dc != null ) { + final List dcs = domain_combination.toBinaryDomainCombinations(); + for( final BinaryDomainCombination dc : dcs ) { + final String dc_str = dc.toString(); + if ( !protein_length_stats_by_dc.containsKey( dc_str ) ) { + protein_length_stats_by_dc.put( dc_str, new BasicDescriptiveStatistics() ); + } + protein_length_stats_by_dc.get( dc_str ).addValue( protein.getLength() ); + } + } + if ( domain_number_stats_by_dc != null ) { + final List dcs = domain_combination.toBinaryDomainCombinations(); + for( final BinaryDomainCombination dc : dcs ) { + final String dc_str = dc.toString(); + if ( !domain_number_stats_by_dc.containsKey( dc_str ) ) { + domain_number_stats_by_dc.put( dc_str, new BasicDescriptiveStatistics() ); + } + domain_number_stats_by_dc.get( dc_str ).addValue( protein.getNumberOfProteinDomains() ); + } + } + // } } } diff --git a/forester/java/src/org/forester/surfacing/SurfacingUtil.java b/forester/java/src/org/forester/surfacing/SurfacingUtil.java index c0b8b8f..e998e82 100644 --- a/forester/java/src/org/forester/surfacing/SurfacingUtil.java +++ b/forester/java/src/org/forester/surfacing/SurfacingUtil.java @@ -41,6 +41,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.PriorityQueue; import java.util.Set; import java.util.SortedMap; @@ -176,7 +177,9 @@ public final class SurfacingUtil { final String outfilename_for_dc_for_go_mapping, final String outfilename_for_dc_for_go_mapping_unique, final String outfilename_for_rank_counts, - final String outfilename_for_ancestor_species_counts ) { + final String outfilename_for_ancestor_species_counts, + final Map protein_length_stats_by_dc, + final Map domain_number_stats_by_dc ) { try { final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) ); final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) ); @@ -197,6 +200,8 @@ public final class SurfacingUtil { } final SortedMap histogram = new TreeMap(); final SortedMap domain_lists = new TreeMap(); + final SortedMap dc_reapp_counts_to_protein_length_stats = new TreeMap(); + final SortedMap dc_reapp_counts_to_domain_number_stats = new TreeMap(); final SortedMap> domain_lists_go = new TreeMap>(); final SortedMap> domain_lists_go_unique = new TreeMap>(); final Set dcs = dc_gain_counts.keySet(); @@ -219,6 +224,20 @@ public final class SurfacingUtil { set.addAll( splitDomainCombination( dc ) ); domain_lists_go_unique.put( count, set ); } + if ( protein_length_stats_by_dc != null ) { + if ( !dc_reapp_counts_to_protein_length_stats.containsKey( count ) ) { + dc_reapp_counts_to_protein_length_stats.put( count, new BasicDescriptiveStatistics() ); + } + dc_reapp_counts_to_protein_length_stats.get( count ).addValue( protein_length_stats_by_dc.get( dc ) + .arithmeticMean() ); + } + if ( domain_number_stats_by_dc != null ) { + if ( !dc_reapp_counts_to_domain_number_stats.containsKey( count ) ) { + dc_reapp_counts_to_domain_number_stats.put( count, new BasicDescriptiveStatistics() ); + } + dc_reapp_counts_to_domain_number_stats.get( count ).addValue( domain_number_stats_by_dc.get( dc ) + .arithmeticMean() ); + } if ( count > 1 ) { more_than_once.add( dc ); } @@ -290,6 +309,22 @@ public final class SurfacingUtil { ForesterUtil.LINE_SEPARATOR ); out_for_rank_counts.close(); out_for_ancestor_species_counts.close(); + System.out.println( "Lengths: " ); + if ( protein_length_stats_by_dc != null ) { + for( final Entry entry : dc_reapp_counts_to_protein_length_stats.entrySet() ) { + System.out.println( entry.getKey().toString() ); + System.out.println( ": " ); + System.out.println( entry.getValue().toString() ); + } + } + System.out.println( "Number of domains: " ); + if ( domain_number_stats_by_dc != null ) { + for( final Entry entry : dc_reapp_counts_to_domain_number_stats.entrySet() ) { + System.out.println( entry.getKey().toString() ); + System.out.println( ": " ); + System.out.println( entry.getValue().toString() ); + } + } } catch ( final IOException e ) { ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); @@ -572,7 +607,9 @@ public final class SurfacingUtil { final boolean output_binary_domain_combinations_for_graphs, final List all_binary_domains_combination_gained_fitch, final List all_binary_domains_combination_lost_fitch, - final BinaryDomainCombination.DomainCombinationType dc_type ) { + final BinaryDomainCombination.DomainCombinationType dc_type, + final Map protein_length_stats_by_dc, + final Map domain_number_stats_by_dc ) { final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR; final String date_time = ForesterUtil.getCurrentDateTime(); final SortedSet all_pfams_encountered = new TreeSet(); @@ -789,12 +826,19 @@ public final class SurfacingUtil { parameters_str ); SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH ); - calculateIndependentDomainCombinationGains( local_phylogeny_l, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX, outfile_name - + "_indep_dc_gains_fitch_lca_ranks.txt", outfile_name + "_indep_dc_gains_fitch_lca_taxonomies.txt" ); + calculateIndependentDomainCombinationGains( local_phylogeny_l, + outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX, + outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX, + outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX, + outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX, + outfile_name + "_indep_dc_gains_fitch_lca_ranks.txt", + outfile_name + "_indep_dc_gains_fitch_lca_taxonomies.txt", + protein_length_stats_by_dc, + domain_number_stats_by_dc ); } } @@ -867,7 +911,57 @@ public final class SurfacingUtil { + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX, outfile_name + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX, outfile_name + "_MAPPED_indep_dc_gains_fitch_lca_ranks.txt", outfile_name - + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt" ); + + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt", null, null ); + } + + public static void doit( final List proteins, + final List query_domain_ids_nc_order, + final Writer out, + final String separator, + final String limit_to_species, + final Map> average_protein_lengths_by_dc ) throws IOException { + for( final Protein protein : proteins ) { + if ( ForesterUtil.isEmpty( limit_to_species ) + || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) { + if ( protein.contains( query_domain_ids_nc_order, true ) ) { + out.write( protein.getSpecies().getSpeciesId() ); + out.write( separator ); + out.write( protein.getProteinId().getId() ); + out.write( separator ); + out.write( "[" ); + final Set visited_domain_ids = new HashSet(); + boolean first = true; + for( final Domain domain : protein.getProteinDomains() ) { + if ( !visited_domain_ids.contains( domain.getDomainId() ) ) { + visited_domain_ids.add( domain.getDomainId() ); + if ( first ) { + first = false; + } + else { + out.write( " " ); + } + out.write( domain.getDomainId().getId() ); + out.write( " {" ); + out.write( "" + domain.getTotalCount() ); + out.write( "}" ); + } + } + out.write( "]" ); + out.write( separator ); + if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription() + .equals( SurfacingConstants.NONE ) ) ) { + out.write( protein.getDescription() ); + } + out.write( separator ); + if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession() + .equals( SurfacingConstants.NONE ) ) ) { + out.write( protein.getAccession() ); + } + out.write( SurfacingConstants.NL ); + } + } + } + out.flush(); } public static void extractProteinNames( final List proteins, diff --git a/forester/java/src/org/forester/surfacing/TestSurfacing.java b/forester/java/src/org/forester/surfacing/TestSurfacing.java index 8d4186c..11f4246 100644 --- a/forester/java/src/org/forester/surfacing/TestSurfacing.java +++ b/forester/java/src/org/forester/surfacing/TestSurfacing.java @@ -4108,7 +4108,9 @@ public class TestSurfacing { true, new BasicSpecies( "human" ), null, - DomainCombinationType.BASIC ); + DomainCombinationType.BASIC, + null, + null ); cd = cdcc2.get( new DomainId( "A" ) ); if ( cd.getKeyDomainCount() != 9 ) { return false; -- 1.7.10.2