final static private String INPUT_SPECIES_TREE_OPTION = "species_tree";
final static private String SEQ_EXTRACT_OPTION = "prot_extract";
final static private char SEPARATOR_FOR_INPUT_VALUES = '#';
- final static private String PRG_VERSION = "2.230";
- final static private String PRG_DATE = "2012.04.22";
+ final static private String PRG_VERSION = "2.240";
+ final static private String PRG_DATE = "2012.05.04";
final static private String E_MAIL = "czmasek@burnham.org";
final static private String WWW = "www.phylosoft.org/forester/applications/surfacing";
final static private boolean IGNORE_DUFS_DEFAULT = true;
* @param sum_of_all_domains_encountered
* @param all_bin_domain_combinations_encountered
* @param is_gains_analysis
+ * @param protein_length_stats_by_dc
* @throws IOException
*/
private static void executeFitchGainsAnalysis( final File output_file,
System.out.println( "Ignore combination with self: " + ignore_combination_with_same );
html_desc.append( "<tr><td>Ignore combination with self for domain combination similarity analyses:</td><td>"
+ ignore_combination_with_same + "</td></tr>" + nl );
- ;
System.out.println( "Consider directedness : "
+ ( dc_type != BinaryDomainCombination.DomainCombinationType.BASIC ) );
html_desc.append( "<tr><td>Consider directedness of binary domain combinations:</td><td>"
catch ( final IOException e3 ) {
e3.printStackTrace();
}
+ final Map<String, DescriptiveStatistics> protein_length_stats_by_dc = new HashMap<String, DescriptiveStatistics>();
+ final Map<String, DescriptiveStatistics> domain_number_stats_by_dc = new HashMap<String, DescriptiveStatistics>();
// Main loop:
for( int i = 0; i < number_of_genomes; ++i ) {
System.out.println();
ignore_combination_with_same,
new BasicSpecies( input_file_properties[ i ][ 1 ] ),
domain_id_to_go_ids_map,
- dc_type ) );
+ dc_type,
+ protein_length_stats_by_dc,
+ domain_number_stats_by_dc ) );
domain_lengths_table.addLengths( protein_list );
if ( gwcd_list.get( i ).getSize() > 0 ) {
SurfacingUtil.writeDomainCombinationsCountsFile( input_file_properties,
output_binary_domain_combinationsfor_graph_analysis,
all_bin_domain_combinations_gained_fitch,
all_bin_domain_combinations_lost_fitch,
- dc_type );
+ dc_type,
+ protein_length_stats_by_dc,
+ domain_number_stats_by_dc );
// Listing of all domain combinations gained is only done if only one input tree is used.
if ( ( domain_id_to_secondary_features_maps != null )
&& ( domain_id_to_secondary_features_maps.length > 0 ) ) {
ignore_combination_with_same_domain,
species,
null,
- DomainCombinationType.BASIC );
+ DomainCombinationType.BASIC,
+ null,
+ null );
}
/**
 * Convenience overload that builds a BasicGenomeWideCombinableDomains without a
 * domain-id-to-GO-ids map and without per-domain-combination statistics.
 * <p>
 * Delegates to the longer createInstance overload, passing null for
 * domain_id_to_go_ids_map. In the updated (added) form of the call it also passes
 * null for protein_length_stats_by_dc and domain_number_stats_by_dc; that overload
 * only fills those maps when they are non-null, so no statistics are gathered here.
 *
 * @param protein_list proteins forwarded unchanged to the delegate
 * @param ignore_combination_with_same_domain forwarded unchanged to the delegate
 * @param species forwarded unchanged to the delegate
 * @param dc_type forwarded unchanged to the delegate
 * @return the instance produced by the delegate overload
 */
public static BasicGenomeWideCombinableDomains createInstance( final List<Protein> protein_list,
final boolean ignore_combination_with_same_domain,
final Species species,
final DomainCombinationType dc_type ) {
- return createInstance( protein_list, ignore_combination_with_same_domain, species, null, dc_type );
+ return createInstance( protein_list, ignore_combination_with_same_domain, species, null, dc_type, null, null );
}
public static BasicGenomeWideCombinableDomains createInstance( final List<Protein> protein_list,
final boolean ignore_combination_with_same_domain,
final Species species,
final Map<DomainId, List<GoId>> domain_id_to_go_ids_map,
- final DomainCombinationType dc_type ) {
+ final DomainCombinationType dc_type,
+ final Map<String, DescriptiveStatistics> protein_length_stats_by_dc,
+ final Map<String, DescriptiveStatistics> domain_number_stats_by_dc ) {
final BasicGenomeWideCombinableDomains instance = new BasicGenomeWideCombinableDomains( species, dc_type );
final Map<DomainId, Integer> domain_counts = new HashMap<DomainId, Integer>();
final Map<DomainId, Integer> domain_protein_counts = new HashMap<DomainId, Integer>();
if ( ( dc_type == DomainCombinationType.DIRECTED_ADJACTANT ) && ( closest != null ) ) {
domain_combination.addCombinableDomain( closest.getDomainId() );
}
+ if ( protein_length_stats_by_dc != null ) {
+ final List<BinaryDomainCombination> dcs = domain_combination.toBinaryDomainCombinations();
+ for( final BinaryDomainCombination dc : dcs ) {
+ final String dc_str = dc.toString();
+ if ( !protein_length_stats_by_dc.containsKey( dc_str ) ) {
+ protein_length_stats_by_dc.put( dc_str, new BasicDescriptiveStatistics() );
+ }
+ protein_length_stats_by_dc.get( dc_str ).addValue( protein.getLength() );
+ }
+ }
+ if ( domain_number_stats_by_dc != null ) {
+ final List<BinaryDomainCombination> dcs = domain_combination.toBinaryDomainCombinations();
+ for( final BinaryDomainCombination dc : dcs ) {
+ final String dc_str = dc.toString();
+ if ( !domain_number_stats_by_dc.containsKey( dc_str ) ) {
+ domain_number_stats_by_dc.put( dc_str, new BasicDescriptiveStatistics() );
+ }
+ domain_number_stats_by_dc.get( dc_str ).addValue( protein.getNumberOfProteinDomains() );
+ }
+ }
+ //
}
}
}
import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Map.Entry;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.SortedMap;
final String outfilename_for_dc_for_go_mapping,
final String outfilename_for_dc_for_go_mapping_unique,
final String outfilename_for_rank_counts,
- final String outfilename_for_ancestor_species_counts ) {
+ final String outfilename_for_ancestor_species_counts,
+ final Map<String, DescriptiveStatistics> protein_length_stats_by_dc,
+ final Map<String, DescriptiveStatistics> domain_number_stats_by_dc ) {
try {
final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) );
final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) );
}
final SortedMap<Integer, Integer> histogram = new TreeMap<Integer, Integer>();
final SortedMap<Integer, StringBuilder> domain_lists = new TreeMap<Integer, StringBuilder>();
+ final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_protein_length_stats = new TreeMap<Integer, DescriptiveStatistics>();
+ final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_domain_number_stats = new TreeMap<Integer, DescriptiveStatistics>();
final SortedMap<Integer, PriorityQueue<String>> domain_lists_go = new TreeMap<Integer, PriorityQueue<String>>();
final SortedMap<Integer, SortedSet<String>> domain_lists_go_unique = new TreeMap<Integer, SortedSet<String>>();
final Set<String> dcs = dc_gain_counts.keySet();
set.addAll( splitDomainCombination( dc ) );
domain_lists_go_unique.put( count, set );
}
+ if ( protein_length_stats_by_dc != null ) {
+ if ( !dc_reapp_counts_to_protein_length_stats.containsKey( count ) ) {
+ dc_reapp_counts_to_protein_length_stats.put( count, new BasicDescriptiveStatistics() );
+ }
+ dc_reapp_counts_to_protein_length_stats.get( count ).addValue( protein_length_stats_by_dc.get( dc )
+ .arithmeticMean() );
+ }
+ if ( domain_number_stats_by_dc != null ) {
+ if ( !dc_reapp_counts_to_domain_number_stats.containsKey( count ) ) {
+ dc_reapp_counts_to_domain_number_stats.put( count, new BasicDescriptiveStatistics() );
+ }
+ dc_reapp_counts_to_domain_number_stats.get( count ).addValue( domain_number_stats_by_dc.get( dc )
+ .arithmeticMean() );
+ }
if ( count > 1 ) {
more_than_once.add( dc );
}
ForesterUtil.LINE_SEPARATOR );
out_for_rank_counts.close();
out_for_ancestor_species_counts.close();
+ System.out.println( "Lengths: " );
+ if ( protein_length_stats_by_dc != null ) {
+ for( final Entry<?, ?> entry : dc_reapp_counts_to_protein_length_stats.entrySet() ) {
+ System.out.println( entry.getKey().toString() );
+ System.out.println( ": " );
+ System.out.println( entry.getValue().toString() );
+ }
+ }
+ System.out.println( "Number of domains: " );
+ if ( domain_number_stats_by_dc != null ) {
+ for( final Entry<?, ?> entry : dc_reapp_counts_to_domain_number_stats.entrySet() ) {
+ System.out.println( entry.getKey().toString() );
+ System.out.println( ": " );
+ System.out.println( entry.getValue().toString() );
+ }
+ }
}
catch ( final IOException e ) {
ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e );
final boolean output_binary_domain_combinations_for_graphs,
final List<BinaryDomainCombination> all_binary_domains_combination_gained_fitch,
final List<BinaryDomainCombination> all_binary_domains_combination_lost_fitch,
- final BinaryDomainCombination.DomainCombinationType dc_type ) {
+ final BinaryDomainCombination.DomainCombinationType dc_type,
+ final Map<String, DescriptiveStatistics> protein_length_stats_by_dc,
+ final Map<String, DescriptiveStatistics> domain_number_stats_by_dc ) {
final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR;
final String date_time = ForesterUtil.getCurrentDateTime();
final SortedSet<String> all_pfams_encountered = new TreeSet<String>();
parameters_str );
SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name
+ surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH );
- calculateIndependentDomainCombinationGains( local_phylogeny_l, outfile_name
- + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX, outfile_name
- + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX, outfile_name
- + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX, outfile_name
- + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX, outfile_name
- + "_indep_dc_gains_fitch_lca_ranks.txt", outfile_name + "_indep_dc_gains_fitch_lca_taxonomies.txt" );
+ calculateIndependentDomainCombinationGains( local_phylogeny_l,
+ outfile_name
+ + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX,
+ outfile_name
+ + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX,
+ outfile_name
+ + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX,
+ outfile_name
+ + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX,
+ outfile_name + "_indep_dc_gains_fitch_lca_ranks.txt",
+ outfile_name + "_indep_dc_gains_fitch_lca_taxonomies.txt",
+ protein_length_stats_by_dc,
+ domain_number_stats_by_dc );
}
}
+ surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX, outfile_name
+ surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX, outfile_name
+ "_MAPPED_indep_dc_gains_fitch_lca_ranks.txt", outfile_name
- + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt" );
+ + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt", null, null );
+ }
+
+ public static void doit( final List<Protein> proteins,
+ final List<DomainId> query_domain_ids_nc_order,
+ final Writer out,
+ final String separator,
+ final String limit_to_species,
+ final Map<String, List<Integer>> average_protein_lengths_by_dc ) throws IOException {
+ for( final Protein protein : proteins ) {
+ if ( ForesterUtil.isEmpty( limit_to_species )
+ || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) {
+ if ( protein.contains( query_domain_ids_nc_order, true ) ) {
+ out.write( protein.getSpecies().getSpeciesId() );
+ out.write( separator );
+ out.write( protein.getProteinId().getId() );
+ out.write( separator );
+ out.write( "[" );
+ final Set<DomainId> visited_domain_ids = new HashSet<DomainId>();
+ boolean first = true;
+ for( final Domain domain : protein.getProteinDomains() ) {
+ if ( !visited_domain_ids.contains( domain.getDomainId() ) ) {
+ visited_domain_ids.add( domain.getDomainId() );
+ if ( first ) {
+ first = false;
+ }
+ else {
+ out.write( " " );
+ }
+ out.write( domain.getDomainId().getId() );
+ out.write( " {" );
+ out.write( "" + domain.getTotalCount() );
+ out.write( "}" );
+ }
+ }
+ out.write( "]" );
+ out.write( separator );
+ if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription()
+ .equals( SurfacingConstants.NONE ) ) ) {
+ out.write( protein.getDescription() );
+ }
+ out.write( separator );
+ if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession()
+ .equals( SurfacingConstants.NONE ) ) ) {
+ out.write( protein.getAccession() );
+ }
+ out.write( SurfacingConstants.NL );
+ }
+ }
+ }
+ out.flush();
}
public static void extractProteinNames( final List<Protein> proteins,