// Copyright (C) 2008-2009 Christian M. Zmasek
// Copyright (C) 2008-2009 Burnham Institute for Medical Research
// All rights reserved
-//
+//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
-//
+//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
import java.io.Writer;
import java.util.ArrayList;
import java.util.Date;
+import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import org.forester.go.PfamToGoParser;
import org.forester.io.parsers.HmmscanPerDomainTableParser;
import org.forester.io.parsers.HmmscanPerDomainTableParser.INDIVIDUAL_SCORE_CUTOFF;
+import org.forester.io.parsers.util.ParserUtils;
import org.forester.io.writers.PhylogenyWriter;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyMethods;
import org.forester.surfacing.DomainLengthsTable;
import org.forester.surfacing.DomainParsimonyCalculator;
import org.forester.surfacing.DomainSimilarity;
+import org.forester.surfacing.DomainSimilarity.DomainSimilarityScoring;
+import org.forester.surfacing.DomainSimilarity.DomainSimilaritySortField;
import org.forester.surfacing.DomainSimilarityCalculator;
+import org.forester.surfacing.DomainSimilarityCalculator.Detailedness;
import org.forester.surfacing.GenomeWideCombinableDomains;
+import org.forester.surfacing.GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder;
import org.forester.surfacing.MappingResults;
import org.forester.surfacing.PairwiseDomainSimilarityCalculator;
import org.forester.surfacing.PairwiseGenomeComparator;
import org.forester.surfacing.PrintableDomainSimilarity;
+import org.forester.surfacing.PrintableDomainSimilarity.PRINT_OPTION;
import org.forester.surfacing.Protein;
import org.forester.surfacing.ProteinCountsBasedPairwiseDomainSimilarityCalculator;
import org.forester.surfacing.Species;
import org.forester.surfacing.SurfacingUtil;
-import org.forester.surfacing.DomainSimilarity.DomainSimilarityScoring;
-import org.forester.surfacing.DomainSimilarity.DomainSimilaritySortField;
-import org.forester.surfacing.DomainSimilarityCalculator.Detailedness;
-import org.forester.surfacing.GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder;
-import org.forester.surfacing.PrintableDomainSimilarity.PRINT_OPTION;
+import org.forester.util.BasicDescriptiveStatistics;
import org.forester.util.BasicTable;
import org.forester.util.BasicTableParser;
import org.forester.util.CommandLineArguments;
public class surfacing {
+ private static final int MINIMAL_NUMBER_OF_SIMILARITIES_FOR_SPLITTING = 1000;
public final static String DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS = "graph_analysis_out";
public final static String DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS = "_dc.dot";
public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS = "_fitch_present_dc.dot";
public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_DOMAINS = "_fitch_glc_d";
public final static String PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_BINARY_COMBINATIONS = "_fitch_glc_dc";
// tables:
- // public final static String PARSIMONY_OUTPUT_FITCH_GAINS_BC = "_fitch_gains_dc";
- // public final static String PARSIMONY_OUTPUT_FITCH_GAINS_HTML_BC = "_fitch_gains_dc.html";
- // public final static String PARSIMONY_OUTPUT_FITCH_LOSSES_BC = "_fitch_losses_dc";
- // public final static String PARSIMONY_OUTPUT_FITCH_LOSSES_HTML_BC = "_fitch_losses_dc.html";
- // public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_BC = "_fitch_present_dc";
- // public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC = "_fitch_present_dc.html";
- // public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_D = "_dollo_gains_d";
- // public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_GOID_D = "_dollo_gains_goid_d";
- // public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_HTML_D = "_dollo_gains_d.html";
- // public final static String PARSIMONY_OUTPUT_DOLLO_LOSSES_D = "_dollo_losses_d";
- //public final static String PARSIMONY_OUTPUT_DOLLO_LOSSES_HTML_D = "_dollo_losses_d.html";
- // public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_D = "_dollo_present_d";
- public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_GOID_D = "_dollo_present_goid_d";
- //public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D = "_dollo_present_d.html";
+ public final static String PARSIMONY_OUTPUT_FITCH_GAINS_BC = "_fitch_gains_dc";
+ public final static String PARSIMONY_OUTPUT_FITCH_GAINS_HTML_BC = "_fitch_gains_dc.html";
+ public final static String PARSIMONY_OUTPUT_FITCH_LOSSES_BC = "_fitch_losses_dc";
+ public final static String PARSIMONY_OUTPUT_FITCH_LOSSES_HTML_BC = "_fitch_losses_dc.html";
+ public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_BC = "_fitch_present_dc";
+ public final static String PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC = "_fitch_present_dc.html";
+ public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_D = "_dollo_gains_d";
+ public final static String PARSIMONY_OUTPUT_DOLLO_GAINS_HTML_D = "_dollo_gains_d.html";
+ public final static String PARSIMONY_OUTPUT_DOLLO_LOSSES_D = "_dollo_losses_d";
+ public final static String PARSIMONY_OUTPUT_DOLLO_LOSSES_HTML_D = "_dollo_losses_d.html";
+ public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_D = "_dollo_present_d";
+ public final static String PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D = "_dollo_present_d.html";
public final static String DOMAINS_PRESENT_NEXUS = "_dom.nex";
public final static String BDC_PRESENT_NEXUS = "_dc.nex";
// ---
public static final String PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES = "_dollo_present_secondary_features";
public static final String SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO = "_secondary_features_dollo"
+ ForesterConstants.PHYLO_XML_SUFFIX;
- public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_BIOLOGICAL_PROCESS = "_dollo_biol_proc_goid_d";
- public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_CELLULAR_COMPONENT = "_dollo_cell_comp_goid_d";
- public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_MOLECULAR_FUNCTION = "_dollo_mol_funct_goid_d";
public static final String PARSIMONY_OUTPUT_DOLLO_ALL_GOID_D_ALL_NAMESPACES = "_dollo_goid_d";
- public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_BIOLOGICAL_PROCESS = "_fitch_biol_proc_goid_dc";
- public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_CELLULAR_COMPONENT = "_fitch_cell_comp_goid_dc";
- public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_MOLECULAR_FUNCTION = "_fitch_mol_funct_goid_dc";
public static final String PARSIMONY_OUTPUT_FITCH_ALL_GOID_BC_ALL_NAMESPACES = "_fitch_goid_dc";
final static private String HELP_OPTION_1 = "help";
final static private String HELP_OPTION_2 = "h";
+ ForesterConstants.PHYLO_XML_SUFFIX;
final static private String NJ_TREE_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX = "_bin_combinations_NJ"
+ ForesterConstants.PHYLO_XML_SUFFIX;
- final static private String DISPLAY_M_HISTOGRAMS_OPTION = "mhisto";
- // final static private boolean DISPLAY_M_HISTOGRAMS_OPTION_DEFAULT = false;
final static private String JACKNIFE_OPTION = "jack";
final static private String JACKNIFE_RANDOM_SEED_OPTION = "seed";
final static private String JACKNIFE_RATIO_OPTION = "jack_ratio";
final static private String INPUT_SPECIES_TREE_OPTION = "species_tree";
final static private String SEQ_EXTRACT_OPTION = "prot_extract";
final static private char SEPARATOR_FOR_INPUT_VALUES = '#';
- final static private String PRG_VERSION = "2.003";
- final static private String PRG_DATE = "2010.12.03";
+ final static private String PRG_VERSION = "2.210";
+ final static private String PRG_DATE = "2012.02.21";
final static private String E_MAIL = "czmasek@burnham.org";
final static private String WWW = "www.phylosoft.org/forester/applications/surfacing";
final static private boolean IGNORE_DUFS_DEFAULT = true;
final static private boolean IGNORE_COMBINATION_WITH_SAME_DEFAULLT = false;
final static private double MAX_E_VALUE_DEFAULT = -1;
final static private int MAX_ALLOWED_OVERLAP_DEFAULT = -1;
- final static private String DEFAULT_SEARCH_PARAMETER = "ls";
- final private static boolean VERBOSE_DEFAULT = true;
private static final String RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION = "random_seed";
private static final String CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS = "consider_bdc_direction";
private static final String CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY = "consider_bdc_adj";
private static final String DATA_FILE_SUFFIX = "_domain_combination_data.txt";
private static final String DATA_FILE_DESC = "#SPECIES\tPRTEIN_ID\tN_TERM_DOMAIN\tC_TERM_DOMAIN\tN_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tC_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tN_TERM_DOMAIN_COUNTS_PER_PROTEIN\tC_TERM_DOMAIN_COUNTS_PER_PROTEIN";
private static final INDIVIDUAL_SCORE_CUTOFF INDIVIDUAL_SCORE_CUTOFF_DEFAULT = INDIVIDUAL_SCORE_CUTOFF.FULL_SEQUENCE;
+ public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_counts.txt";
+ public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists.txt";
+ public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping.txt";
+ public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_unique.txt";
+ public static final String LIMIT_SPEC_FOR_PROT_EX = null; // e.g. "HUMAN"; set to null for not using this feature (default).
+ public static final String BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH_MAPPED = "_dc_MAPPED_secondary_features_fitch"
+ + ForesterConstants.PHYLO_XML_SUFFIX;
+ public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_counts_MAPPED.txt";
+ public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_MAPPED.txt";
+ public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_MAPPED.txt";
+ public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_unique_MAPPED.txt";
- // final String error = ForesterUtil.isReadableFile( new File(
- // input_file_properties[ i ][ 0 ] ) );
- // if ( !ForesterUtil.isEmpty( error ) ) {
- // ForesterUtil.fatalError( surfacing.PRG_NAME, error );
- // }
private static void checkWriteabilityForPairwiseComparisons( final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option,
final String[][] input_file_properties,
final String automated_pairwise_comparison_suffix,
}
/**
- * Warning: This sideeffects 'all_bin_domain_combinations_encountered'!
+ * Warning: This side-effects 'all_bin_domain_combinations_encountered'!
*
*
* @param output_file
}
try {
final Phylogeny[] p_array = ParserBasedPhylogenyFactory.getInstance()
- .create( intree_file, ForesterUtil.createParserDependingOnFileType( intree_file, true ) );
+ .create( intree_file, ParserUtils.createParserDependingOnFileType( intree_file, true ) );
if ( p_array.length < 1 ) {
ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file
+ "] does not contain any phylogeny in phyloXML format" );
ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] is not rooted" );
}
if ( intree.getNumberOfExternalNodes() < number_of_genomes ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "number of external nodes ["
- + intree.getNumberOfExternalNodes() + "] of input tree [" + intree_file
- + "] is smaller than the number of genomes the be analyzed [" + number_of_genomes + "]" );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "number of external nodes [" + intree.getNumberOfExternalNodes()
+ + "] of input tree [" + intree_file
+ + "] is smaller than the number of genomes the be analyzed ["
+ + number_of_genomes + "]" );
}
final StringBuilder parent_names = new StringBuilder();
final int nodes_lacking_name = SurfacingUtil.getNumberOfNodesLackingName( intree, parent_names );
allowed_options.add( surfacing.GO_NAMESPACE_LIMIT_OPTION );
allowed_options.add( surfacing.PAIRWISE_DOMAIN_COMPARISONS_OPTION );
allowed_options.add( surfacing.IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION );
- allowed_options.add( surfacing.DISPLAY_M_HISTOGRAMS_OPTION );
allowed_options.add( surfacing.CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS );
allowed_options.add( JACKNIFE_OPTION );
allowed_options.add( JACKNIFE_RANDOM_SEED_OPTION );
.createDomainIdToSecondaryFeaturesMap( secondary_features_map_files[ i ] );
}
catch ( final IOException e ) {
- ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read secondary features map file: "
- + e.getMessage() );
+ ForesterUtil.fatalError( surfacing.PRG_NAME,
+ "cannot read secondary features map file: " + e.getMessage() );
}
catch ( final Exception e ) {
ForesterUtil.fatalError( surfacing.PRG_NAME, "problem with contents of features map file ["
"no (acceptable) go id to term mapping file provided ('GO OBO file') (-"
+ surfacing.GO_OBO_FILE_USE_OPTION + "=<file>)" );
}
- boolean display_histograms = false;
- if ( cla.isOptionSet( surfacing.DISPLAY_M_HISTOGRAMS_OPTION ) ) {
- display_histograms = true;
- }
System.out.println( "Output directory : " + out_dir );
if ( input_file_names_from_file != null ) {
System.out.println( "Input files names from : " + input_files_file + " ["
}
} // if ( perform_pwc ) {
System.out.println();
- html_desc.append( "<tr><td>Command line:</td><td>" + cla.getCommandLineArgsAsString() + "</td></tr>" + nl );
+ html_desc.append( "<tr><td>Command line:</td><td>\n" + cla.getCommandLineArgsAsString() + "\n</td></tr>" + nl );
System.out.println( "Command line : " + cla.getCommandLineArgsAsString() );
BufferedWriter[] query_domains_writer_ary = null;
List<DomainId>[] query_domain_ids_array = null;
catch ( final IOException e2 ) {
ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getMessage() );
}
+ final DescriptiveStatistics protein_coverage_stats = new BasicDescriptiveStatistics();
+ final DescriptiveStatistics all_genomes_domains_per_potein_stats = new BasicDescriptiveStatistics();
+ final SortedMap<Integer, Integer> all_genomes_domains_per_potein_histo = new TreeMap<Integer, Integer>();
+ final SortedSet<String> domains_which_are_always_single = new TreeSet<String>();
+ final SortedSet<String> domains_which_are_sometimes_single_sometimes_not = new TreeSet<String>();
+ final SortedSet<String> domains_which_never_single = new TreeSet<String>();
+ BufferedWriter domains_per_potein_stats_writer = null;
+ try {
+ domains_per_potein_stats_writer = new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR
+ + output_file + "__domains_per_potein_stats.txt" ) );
+ domains_per_potein_stats_writer.write( "Genome" );
+ domains_per_potein_stats_writer.write( "\t" );
+ domains_per_potein_stats_writer.write( "Mean" );
+ domains_per_potein_stats_writer.write( "\t" );
+ domains_per_potein_stats_writer.write( "SD" );
+ domains_per_potein_stats_writer.write( "\t" );
+ domains_per_potein_stats_writer.write( "Median" );
+ domains_per_potein_stats_writer.write( "\t" );
+ domains_per_potein_stats_writer.write( "N" );
+ domains_per_potein_stats_writer.write( "\t" );
+ domains_per_potein_stats_writer.write( "Min" );
+ domains_per_potein_stats_writer.write( "\t" );
+ domains_per_potein_stats_writer.write( "Max" );
+ domains_per_potein_stats_writer.write( "\n" );
+ }
+ catch ( final IOException e3 ) {
+ e3.printStackTrace();
+ }
+ // Main loop:
for( int i = 0; i < number_of_genomes; ++i ) {
System.out.println();
System.out.println( ( i + 1 ) + "/" + number_of_genomes );
input_file_properties[ i ][ 1 ],
filter,
filter_type,
- ind_score_cutoff );
+ ind_score_cutoff,
+ true );
}
else {
parser = new HmmscanPerDomainTableParser( new File( input_file_properties[ i ][ 0 ] ),
input_file_properties[ i ][ 1 ],
- ind_score_cutoff );
+ ind_score_cutoff,
+ true );
}
if ( e_value_max >= 0.0 ) {
parser.setEValueMaximum( e_value_max );
if ( max_allowed_overlap != surfacing.MAX_ALLOWED_OVERLAP_DEFAULT ) {
parser.setMaxAllowedOverlap( max_allowed_overlap );
}
- parser
- .setReturnType( HmmscanPerDomainTableParser.ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN );
+ parser.setReturnType( HmmscanPerDomainTableParser.ReturnType.UNORDERED_PROTEIN_DOMAIN_COLLECTION_PER_PROTEIN );
if ( individual_score_cutoffs != null ) {
parser.setIndividualScoreCutoffs( individual_score_cutoffs );
}
System.out.println( "Domains ignored due to virus like id: " );
ForesterUtil.printCountingMap( parser.getDomainsIgnoredDueToVirusLikeIdCountsMap() );
}
+ final double coverage = ( double ) protein_list.size() / parser.getProteinsEncountered();
+ protein_coverage_stats.addValue( coverage );
System.out.println( "Number of proteins encountered : " + parser.getProteinsEncountered() );
log( "Number of proteins encountered : " + parser.getProteinsEncountered(), log_writer );
System.out.println( "Number of proteins stored : " + protein_list.size() );
log( "Number of proteins stored : " + protein_list.size(), log_writer );
+ System.out.println( "Coverage : "
+ + ForesterUtil.roundToInt( 100.0 * coverage ) + "%" );
+ log( "Coverage : " + ForesterUtil.roundToInt( 100.0 * coverage )
+ + "%", log_writer );
System.out.println( "Domains encountered : " + parser.getDomainsEncountered() );
log( "Domains encountered : " + parser.getDomainsEncountered(), log_writer );
System.out.println( "Domains stored : " + parser.getDomainsStored() );
System.out.println( "Domains ignored due to individual score cutoffs: "
+ parser.getDomainsIgnoredDueToIndividualScoreCutoff() );
log( "Domains ignored due to individual score cutoffs: "
- + parser.getDomainsIgnoredDueToIndividualScoreCutoff(), log_writer );
+ + parser.getDomainsIgnoredDueToIndividualScoreCutoff(),
+ log_writer );
System.out.println( "Domains ignored due to E-value : "
+ parser.getDomainsIgnoredDueToEval() );
log( "Domains ignored due to E-value : " + parser.getDomainsIgnoredDueToEval(), log_writer );
System.out.println( "Domains ignored due negative domain filter : "
+ parser.getDomainsIgnoredDueToNegativeDomainFilter() );
log( "Domains ignored due negative domain filter : "
- + parser.getDomainsIgnoredDueToNegativeDomainFilter(), log_writer );
+ + parser.getDomainsIgnoredDueToNegativeDomainFilter(),
+ log_writer );
System.out.println( "Domains ignored due to overlap : "
+ parser.getDomainsIgnoredDueToOverlap() );
log( "Domains ignored due to overlap : " + parser.getDomainsIgnoredDueToOverlap(),
catch ( final IOException e ) {
ForesterUtil.fatalError( surfacing.PRG_NAME, e.toString() );
}
+ SurfacingUtil.domainsPerProteinsStatistics( input_file_properties[ i ][ 1 ],
+ protein_list,
+ all_genomes_domains_per_potein_stats,
+ all_genomes_domains_per_potein_histo,
+ domains_which_are_always_single,
+ domains_which_are_sometimes_single_sometimes_not,
+ domains_which_never_single,
+ domains_per_potein_stats_writer );
gwcd_list.add( BasicGenomeWideCombinableDomains
.createInstance( protein_list,
ignore_combination_with_same,
SurfacingUtil.extractProteinNames( protein_list,
query_domain_ids_array[ j ],
query_domains_writer_ary[ j ],
- "\t" );
+ "\t",
+ LIMIT_SPEC_FOR_PROT_EX );
query_domains_writer_ary[ j ].flush();
}
catch ( final IOException e ) {
}
System.gc();
} // for( int i = 0; i < number_of_genomes; ++i ) {
+ ForesterUtil.programMessage( PRG_NAME, "Wrote domain promiscuities to: "
+ + per_genome_domain_promiscuity_statistics_file );
+ //
try {
- per_genome_domain_promiscuity_statistics_writer.flush();
- per_genome_domain_promiscuity_statistics_writer.close();
- dc_data_writer.flush();
- dc_data_writer.close();
- log_writer.flush();
- log_writer.close();
+ domains_per_potein_stats_writer.write( "ALL" );
+ domains_per_potein_stats_writer.write( "\t" );
+ domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.arithmeticMean() + "" );
+ domains_per_potein_stats_writer.write( "\t" );
+ domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.sampleStandardDeviation() + "" );
+ domains_per_potein_stats_writer.write( "\t" );
+ domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.median() + "" );
+ domains_per_potein_stats_writer.write( "\t" );
+ domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.getN() + "" );
+ domains_per_potein_stats_writer.write( "\t" );
+ domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.getMin() + "" );
+ domains_per_potein_stats_writer.write( "\t" );
+ domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.getMax() + "" );
+ domains_per_potein_stats_writer.write( "\n" );
+ domains_per_potein_stats_writer.close();
+ printOutPercentageOfMultidomainProteins( all_genomes_domains_per_potein_histo, log_writer );
+ ForesterUtil.map2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
+ + "__all_genomes_domains_per_potein_histo.txt" ), all_genomes_domains_per_potein_histo, "\t", "\n" );
+ ForesterUtil.collection2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
+ + "__domains_always_single_.txt" ), domains_which_are_always_single, "\n" );
+ ForesterUtil.collection2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
+ + "__domains_single_or_combined.txt" ), domains_which_are_sometimes_single_sometimes_not, "\n" );
+ ForesterUtil.collection2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
+ + "__domains_always_combined.txt" ), domains_which_never_single, "\n" );
+ ForesterUtil.programMessage( PRG_NAME,
+ "Average of proteins with a least one domain assigned: "
+ + ( 100 * protein_coverage_stats.arithmeticMean() ) + "% (+/-"
+ + ( 100 * protein_coverage_stats.sampleStandardDeviation() ) + "%)" );
+ ForesterUtil.programMessage( PRG_NAME, "Range of proteins with a least one domain assigned: " + 100
+ * protein_coverage_stats.getMin() + "%-" + 100 * protein_coverage_stats.getMax() + "%" );
+ log( "Average of prot with a least one dom assigned : " + ( 100 * protein_coverage_stats.arithmeticMean() )
+ + "% (+/-" + ( 100 * protein_coverage_stats.sampleStandardDeviation() ) + "%)", log_writer );
+ log( "Range of prot with a least one dom assigned : " + 100 * protein_coverage_stats.getMin() + "%-"
+ + 100 * protein_coverage_stats.getMax() + "%", log_writer );
}
catch ( final IOException e2 ) {
ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getLocalizedMessage() );
}
- ForesterUtil.programMessage( PRG_NAME, "Wrote domain promiscuities to: "
- + per_genome_domain_promiscuity_statistics_file );
if ( query_domains_writer_ary != null ) {
for( int j = 0; j < query_domain_ids_array.length; j++ ) {
try {
}
}
}
+ try {
+ per_genome_domain_promiscuity_statistics_writer.close();
+ dc_data_writer.close();
+ log_writer.close();
+ }
+ catch ( final IOException e2 ) {
+ ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getLocalizedMessage() );
+ }
if ( PERFORM_DOMAIN_LENGTH_ANALYSIS ) {
try {
SurfacingUtil.executeDomainLengthAnalysis( input_file_properties,
DescriptiveStatistics pw_stats = null;
try {
String my_outfile = output_file.toString();
- if ( !my_outfile.endsWith( ".html" ) ) {
+ Map<Character, Writer> split_writers = null;
+ Writer writer = null;
+ if ( similarities.size() > MINIMAL_NUMBER_OF_SIMILARITIES_FOR_SPLITTING ) {
+ if ( my_outfile.endsWith( ".html" ) ) {
+ my_outfile = my_outfile.substring( 0, my_outfile.length() - 5 );
+ }
+ split_writers = new HashMap<Character, Writer>();
+ createSplitWriters( out_dir, my_outfile, split_writers );
+ }
+ else if ( !my_outfile.endsWith( ".html" ) ) {
my_outfile += ".html";
+ writer = new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile ) );
}
- final Writer writer = new BufferedWriter( new FileWriter( out_dir == null ? my_outfile : out_dir
- + ForesterUtil.FILE_SEPARATOR + my_outfile ) );
List<Species> species_order = null;
if ( species_matrix ) {
species_order = new ArrayList<Species>();
.writeDomainSimilaritiesToFile( html_desc,
new StringBuilder( number_of_genomes + " genomes" ),
writer,
+ split_writers,
similarities,
number_of_genomes == 2,
species_order,
true,
surfacing.PAIRWISE_DOMAIN_COMPARISONS_PREFIX,
surfacing.PRG_NAME,
- display_histograms,
out_dir,
write_pwc_files );
String matrix_output_file = new String( output_file.toString() );
}
SurfacingUtil.writeMatrixToFile( new File( matrix_output_file
+ surfacing.MATRIX_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX ), pwgc.getDomainDistanceScoresMeans() );
+ SurfacingUtil
+ .writeMatrixToFile( new File( matrix_output_file
+ + surfacing.MATRIX_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX ),
+ pwgc.getSharedBinaryCombinationsBasedDistances() );
SurfacingUtil.writeMatrixToFile( new File( matrix_output_file
- + surfacing.MATRIX_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc
- .getSharedBinaryCombinationsBasedDistances() );
- SurfacingUtil.writeMatrixToFile( new File( matrix_output_file
- + surfacing.MATRIX_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc
- .getSharedDomainsBasedDistances() );
+ + surfacing.MATRIX_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX ),
+ pwgc.getSharedDomainsBasedDistances() );
final Phylogeny nj_gd = SurfacingUtil.createNjTreeBasedOnMatrixToFile( new File( matrix_output_file
+ surfacing.NJ_TREE_MEAN_SCORE_BASED_GENOME_DISTANCE_SUFFIX ), pwgc.getDomainDistanceScoresMeans()
.get( 0 ) );
inferred_trees.add( nj_gd );
inferred_trees.add( nj_bc );
inferred_trees.add( nj_d );
- // final List<HistogramData> histogram_datas = pwgc.getHistogramDatas();
- // if ( infer_species_trees ) {
- // inferred_trees = new ArrayList<Phylogeny>();
- // final List<Phylogeny> inferred_trees_bc = inferSpeciesTrees( new File( output_file + INFERRED_SBC_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc
- // .getSharedBinaryCombinationsBasedDistances() );
- // final List<Phylogeny> inferred_trees_d = inferSpeciesTrees( new File( output_file + INFERRED_SD_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc
- // .getSharedDomainsBasedDistances() );
- // inferred_trees.addAll( inferred_trees_bc );
- // inferred_trees.addAll( inferred_trees_d );
- // }
if ( jacknifed_distances ) {
pwgc.performPairwiseComparisonsJacknifed( species,
number_of_genomes,
jacknife_resamplings,
jacknife_ratio,
random_seed );
- SurfacingUtil.writeMatrixToFile( new File( matrix_output_file + "_"
- + ForesterUtil.round( jacknife_ratio, 2 ) + "_" + jacknife_resamplings
- + surfacing.MATRIX_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc
- .getSharedBinaryCombinationsBasedDistances() );
- SurfacingUtil.writeMatrixToFile( new File( matrix_output_file + "_"
- + ForesterUtil.round( jacknife_ratio, 2 ) + "_" + jacknife_resamplings
- + surfacing.MATRIX_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX ), pwgc
- .getSharedDomainsBasedDistances() );
+ SurfacingUtil
+ .writeMatrixToFile( new File( matrix_output_file
+ + "_"
+ + ForesterUtil.round( jacknife_ratio, 2 )
+ + "_"
+ + jacknife_resamplings
+ + surfacing.MATRIX_SHARED_BIN_COMBINATIONS_BASED_GENOME_DISTANCE_SUFFIX ),
+ pwgc.getSharedBinaryCombinationsBasedDistances() );
+ SurfacingUtil
+ .writeMatrixToFile( new File( matrix_output_file + "_" + ForesterUtil.round( jacknife_ratio, 2 )
+ + "_" + jacknife_resamplings
+ + surfacing.MATRIX_SHARED_DOMAINS_BASED_GENOME_DISTANCE_SUFFIX ),
+ pwgc.getSharedDomainsBasedDistances() );
// if ( infer_species_trees ) {
// inferSpeciesTrees( new File( output_file + "_" + jacknife_resamplings
// + INFERRED_SBC_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc
// + INFERRED_SD_BASED_NJ_SPECIES_TREE_SUFFIX ), pwgc.getSharedDomainsBasedDistances() );
// }
}
- if ( display_histograms ) {
- // final List<HistogramData> histogram_datas_all = new ArrayList<HistogramData>();
- // histogram_datas_all.add( new HistogramData( "all",
- // values_for_all_scores_histogram,
- // null,
- // 20 ) );
- // final HistogramsFrame hf_all = new HistogramsFrame( histogram_datas_all );
- // final HistogramsFrame hf = new HistogramsFrame( histogram_datas );
- // hf_all.setVisible( true );
- // hf.setVisible( true );
- }
} // if ( ( output_file != null ) && ( number_of_genomes > 2 ) && !isEmpty( automated_pairwise_comparison_suffix ) )
if ( ( out_dir != null ) && ( !perform_pwc ) ) {
output_file = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file );
System.out.println();
}
+ /**
+  * Opens the output files used when the domain similarity report is split into
+  * several HTML files, and registers one writer per file in 'split_writers'.
+  * <p>
+  * The files are named out_dir + FILE_SEPARATOR + my_outfile + "_domains_X.html",
+  * where X is 'A' to 'Z' or '0'. The writers are stored under the corresponding
+  * lower-case keys 'a' to 'z' and under '0'.
+  * NOTE(review): '0' is presumably the bucket for names that do not start with a
+  * letter -- confirm against the code consuming this map.
+  * <p>
+  * If any file cannot be opened, the writers already opened by this call are
+  * closed again and 'split_writers' is left unchanged, so that no file handles
+  * leak on failure.
+  *
+  * @param out_dir       directory the split files are created in
+  * @param my_outfile    base name of the split files (without ".html")
+  * @param split_writers map receiving the newly opened writers
+  * @throws IOException if one of the files cannot be opened for writing
+  */
+ private static void createSplitWriters( final File out_dir,
+                                         final String my_outfile,
+                                         final Map<Character, Writer> split_writers ) throws IOException {
+     final String path_prefix = out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + "_domains_";
+     // Collect locally first, so that a failure half-way through can be cleaned up
+     // without leaving closed writers behind in the caller's map.
+     final Map<Character, Writer> opened = new HashMap<Character, Writer>();
+     try {
+         for( char key = 'a'; key <= 'z'; ++key ) {
+             opened.put( key, new BufferedWriter( new FileWriter( path_prefix + Character.toUpperCase( key )
+                     + ".html" ) ) );
+         }
+         opened.put( '0', new BufferedWriter( new FileWriter( path_prefix + "0.html" ) ) );
+     }
+     catch ( final IOException e ) {
+         for( final Writer w : opened.values() ) {
+             try {
+                 w.close();
+             }
+             catch ( final IOException ignored ) {
+                 // The original failure is the one worth reporting.
+             }
+         }
+         throw e;
+     }
+     split_writers.putAll( opened );
+ }
+
+ /**
+  * Reports, as a program message and in the log, the percentage of all counted
+  * proteins that are not in histogram bin 1.
+  * <p>
+  * The total is the sum of all histogram values; the value stored under key 1 is
+  * subtracted as the non-multidomain share.
+  * NOTE(review): the histogram is presumably keyed by number of domains per
+  * protein, with the number of such proteins as value -- confirm against
+  * SurfacingUtil.domainsPerProteinsStatistics.
+  * <p>
+  * A missing bin 1 is treated as a count of zero, and an empty histogram is
+  * reported as 0.0%.
+  *
+  * @param all_genomes_domains_per_potein_histo histogram accumulated over all genomes
+  * @param log_writer                           writer the result is logged to
+  */
+ private static void printOutPercentageOfMultidomainProteins( final SortedMap<Integer, Integer> all_genomes_domains_per_potein_histo,
+                                                              final Writer log_writer ) {
+     int sum = 0;
+     for( final Entry<Integer, Integer> entry : all_genomes_domains_per_potein_histo.entrySet() ) {
+         sum += entry.getValue();
+     }
+     // get() returns null for an absent key; unboxing that directly would throw.
+     final Integer single_domain = all_genomes_domains_per_potein_histo.get( 1 );
+     final int single_domain_count = ( single_domain == null ) ? 0 : single_domain.intValue();
+     // Avoid 0/0 (NaN) when nothing has been counted at all.
+     final double percentage = ( sum == 0 ) ? 0.0 : ( 100.0 * ( sum - single_domain_count ) / sum );
+     ForesterUtil.programMessage( PRG_NAME, "Percentage of multidomain proteins: " + percentage + "%" );
+     log( "Percentage of multidomain proteins: : " + percentage + "%", log_writer );
+ }
+
private static void preparePhylogenyForParsimonyAnalyses( final Phylogeny intree,
final String[][] input_file_properties ) {
final String[] genomes = new String[ input_file_properties.length ];
final PhylogenyNode n = it.next();
if ( ForesterUtil.isEmpty( n.getName() ) ) {
if ( n.getNodeData().isHasTaxonomy()
+ && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getTaxonomyCode() ) ) {
+ n.setName( n.getNodeData().getTaxonomy().getTaxonomyCode() );
+ }
+ else if ( n.getNodeData().isHasTaxonomy()
&& !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getScientificName() ) ) {
n.setName( n.getNodeData().getTaxonomy().getScientificName() );
}
+ else if ( n.getNodeData().isHasTaxonomy()
+ && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getCommonName() ) ) {
+ n.setName( n.getNodeData().getTaxonomy().getCommonName() );
+ }
else {
- ForesterUtil.fatalError( surfacing.PRG_NAME,
- "node without both name and scientific taxonomy name found" );
+ ForesterUtil
+ .fatalError( surfacing.PRG_NAME,
+ "node with no name, scientific name, common name, or taxonomy code present" );
}
}
}
System.out.println( surfacing.INPUT_SPECIES_TREE_OPTION
+ ": species tree, to perform (Dollo, Fitch) parismony analyses" );
System.out
- .println( surfacing.DISPLAY_M_HISTOGRAMS_OPTION + ": to display multiple histograms (using fluorite)" );
- System.out
.println( JACKNIFE_OPTION
+ ": perform jacknife resampling for domain and binary domain combination based distance matrices [default resamplings: "
+ JACKNIFE_NUMBER_OF_RESAMPLINGS_DEFAULT + "]" );
System.out.println();
System.out.println();
System.out.println( "Example: java -Xms128m -Xmx512m -cp path/to/forester.jar"
- + "org.forester.application.surfacing -detail=punctilious -o=TEST.html -pwc=TEST"
+ + " org.forester.application.surfacing -detail=punctilious -o=TEST.html -pwc=TEST"
+ " -cos=Pfam_ls_22_TC2 -p2g=pfam2go -obo=gene_ontology_edit.obo "
+ "-dc_sort=dom -ignore_with_self -no_singles -e=0.001 -mo=1 -no_eo "
+ "-ds_output=detailed_html -scoring=domains -sort=alpha -" + JACKNIFE_OPTION
SurfacingUtil.checkForOutputFileWriteability( out );
try {
final Writer proteins_file_writer = new BufferedWriter( new FileWriter( out ) );
- SurfacingUtil.extractProteinNames( protein_lists_per_species, domain, proteins_file_writer, "\t" );
+ SurfacingUtil.extractProteinNames( protein_lists_per_species,
+ domain,
+ proteins_file_writer,
+ "\t",
+ LIMIT_SPEC_FOR_PROT_EX );
proteins_file_writer.close();
}
catch ( final IOException e ) {