From 0fc3bc32fc5be907e3f91a780af68c6baff79db1 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Thu, 24 Oct 2013 02:57:49 +0000 Subject: [PATCH] in progress --- .../src/org/forester/application/surfacing.java | 716 +--- .../java/src/org/forester/phylogeny/Phylogeny.java | 13 + .../surfacing/PrintableDomainSimilarity.java | 84 +- .../surfacing/PrintableSpeciesSpecificDcData.java | 4 +- .../src/org/forester/surfacing/SurfacingUtil.java | 3544 ++++++++++++-------- forester/java/src/org/forester/test/Test.java | 57 +- .../java/src/org/forester/util/ForesterUtil.java | 62 +- 7 files changed, 2308 insertions(+), 2172 deletions(-) diff --git a/forester/java/src/org/forester/application/surfacing.java b/forester/java/src/org/forester/application/surfacing.java index ce11dce..616135a 100644 --- a/forester/java/src/org/forester/application/surfacing.java +++ b/forester/java/src/org/forester/application/surfacing.java @@ -34,17 +34,14 @@ import java.io.Writer; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Map.Entry; import java.util.Set; import java.util.SortedMap; import java.util.SortedSet; import java.util.TreeMap; import java.util.TreeSet; -import org.forester.evoinference.matrix.character.CharacterStateMatrix.Format; import org.forester.go.GoId; import org.forester.go.GoNameSpace; import org.forester.go.GoTerm; @@ -54,13 +51,7 @@ import org.forester.go.PfamToGoMapping; import org.forester.go.PfamToGoParser; import org.forester.io.parsers.HmmscanPerDomainTableParser; import org.forester.io.parsers.HmmscanPerDomainTableParser.INDIVIDUAL_SCORE_CUTOFF; -import org.forester.io.parsers.phyloxml.PhyloXmlUtil; -import org.forester.io.parsers.util.ParserUtils; import org.forester.phylogeny.Phylogeny; -import org.forester.phylogeny.PhylogenyMethods; -import org.forester.phylogeny.PhylogenyNode; -import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; -import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.protein.BinaryDomainCombination; import org.forester.protein.Domain; import org.forester.protein.Protein; @@ -70,7 +61,6 @@ import org.forester.surfacing.BasicDomainSimilarityCalculator; import org.forester.surfacing.BasicGenomeWideCombinableDomains; import org.forester.surfacing.CombinationsBasedPairwiseDomainSimilarityCalculator; import org.forester.surfacing.DomainCountsBasedPairwiseSimilarityCalculator; -import org.forester.surfacing.DomainCountsDifferenceUtil; import org.forester.surfacing.DomainLengthsTable; import org.forester.surfacing.DomainParsimonyCalculator; import org.forester.surfacing.DomainSimilarity; @@ -190,7 +180,7 @@ public class surfacing { final static private String PERFORM_DC_REGAIN_PROTEINS_STATS_OPTION = "dc_regain_stats"; final static private String DA_ANALYSIS_OPTION = "DA_analyis"; final static private String USE_LAST_IN_FITCH_OPTION = "last"; - final static private String PAIRWISE_DOMAIN_COMPARISONS_PREFIX = "pwc_"; + public final static String PAIRWISE_DOMAIN_COMPARISONS_PREFIX = "pwc_"; final static private String PAIRWISE_DOMAIN_COMPARISONS_OPTION = "pwc"; final static private String OUTPUT_FILE_OPTION = "o"; final static private String PFAM_TO_GO_FILE_USE_OPTION = "p2g"; @@ -232,22 +222,22 @@ public class surfacing { final static private boolean IGNORE_DUFS_DEFAULT = true; final static private boolean IGNORE_COMBINATION_WITH_SAME_DEFAULLT = false; final static private double MAX_E_VALUE_DEFAULT = -1; - final static private int MAX_ALLOWED_OVERLAP_DEFAULT = -1; + public final static int MAX_ALLOWED_OVERLAP_DEFAULT = -1; private static final String RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION = "random_seed"; private static final String CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS = "consider_bdc_direction"; private static final String CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY = "consider_bdc_adj"; - private static final String SEQ_EXTRACT_SUFFIX = ".prot"; - private static final String PLUS_MINUS_ANALYSIS_OPTION = "plus_minus"; - private static final String PLUS_MINUS_DOM_SUFFIX = "_plus_minus_dom.txt"; - private static final String PLUS_MINUS_DOM_SUFFIX_HTML = "_plus_minus_dom.html"; - private static final String PLUS_MINUS_DC_SUFFIX_HTML = "_plus_minus_dc.html"; - private static final int PLUS_MINUS_ANALYSIS_MIN_DIFF_DEFAULT = 0; - private static final double PLUS_MINUS_ANALYSIS_FACTOR_DEFAULT = 1.0; - private static final String PLUS_MINUS_ALL_GO_IDS_DOM_SUFFIX = "_plus_minus_go_ids_all.txt"; - private static final String PLUS_MINUS_PASSING_GO_IDS_DOM_SUFFIX = "_plus_minus_go_ids_passing.txt"; + public static final String SEQ_EXTRACT_SUFFIX = ".prot"; + public static final String PLUS_MINUS_ANALYSIS_OPTION = "plus_minus"; + public static final String PLUS_MINUS_DOM_SUFFIX = "_plus_minus_dom.txt"; + public static final String PLUS_MINUS_DOM_SUFFIX_HTML = "_plus_minus_dom.html"; + public static final String PLUS_MINUS_DC_SUFFIX_HTML = "_plus_minus_dc.html"; + public static final int PLUS_MINUS_ANALYSIS_MIN_DIFF_DEFAULT = 0; + public static final double PLUS_MINUS_ANALYSIS_FACTOR_DEFAULT = 1.0; + public static final String PLUS_MINUS_ALL_GO_IDS_DOM_SUFFIX = "_plus_minus_go_ids_all.txt"; + public static final String PLUS_MINUS_PASSING_GO_IDS_DOM_SUFFIX = "_plus_minus_go_ids_passing.txt"; private static final String OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS = "all_prot"; final static private String OUTPUT_LIST_OF_ALL_PROTEINS_PER_DOMAIN_E_VALUE_OPTION = "all_prot_e"; - private static final boolean VERBOSE = false; + public static final boolean VERBOSE = false; private static final String OUTPUT_DOMAIN_COMBINATIONS_GAINED_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_gains_counts"; private static final String OUTPUT_DOMAIN_COMBINATIONS_LOST_MORE_THAN_ONCE_ANALYSIS_SUFFIX = "_fitch_dc_losses_counts"; private static final String DOMAIN_LENGTHS_ANALYSIS_SUFFIX = "_domain_lengths_analysis"; @@ -280,274 +270,6 @@ public class surfacing { public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX = "_indep_dc_gains_fitch_lists_for_go_mapping_unique_MAPPED.txt"; private static final boolean CALC_SIMILARITY_SCORES = false; - private static void checkWriteabilityForPairwiseComparisons( final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option, - final String[][] input_file_properties, - final String automated_pairwise_comparison_suffix, - final File outdir ) { - for( int i = 0; i < input_file_properties.length; ++i ) { - for( int j = 0; j < i; ++j ) { - final String species_i = input_file_properties[ i ][ 1 ]; - final String species_j = input_file_properties[ j ][ 1 ]; - String pairwise_similarities_output_file_str = PAIRWISE_DOMAIN_COMPARISONS_PREFIX + species_i + "_" - + species_j + automated_pairwise_comparison_suffix; - switch ( domain_similarity_print_option ) { - case HTML: - if ( !pairwise_similarities_output_file_str.endsWith( ".html" ) ) { - pairwise_similarities_output_file_str += ".html"; - } - break; - } - final String error = ForesterUtil - .isWritableFile( new File( outdir == null ? pairwise_similarities_output_file_str : outdir - + ForesterUtil.FILE_SEPARATOR + pairwise_similarities_output_file_str ) ); - if ( !ForesterUtil.isEmpty( error ) ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, error ); - } - } - } - } - - private static StringBuilder createParametersAsString( final boolean ignore_dufs, - final double e_value_max, - final int max_allowed_overlap, - final boolean no_engulfing_overlaps, - final File cutoff_scores_file, - final BinaryDomainCombination.DomainCombinationType dc_type ) { - final StringBuilder parameters_sb = new StringBuilder(); - parameters_sb.append( "E-value: " + e_value_max ); - if ( cutoff_scores_file != null ) { - parameters_sb.append( ", Cutoff-scores-file: " + cutoff_scores_file ); - } - else { - parameters_sb.append( ", Cutoff-scores-file: not-set" ); - } - if ( max_allowed_overlap != surfacing.MAX_ALLOWED_OVERLAP_DEFAULT ) { - parameters_sb.append( ", Max-overlap: " + max_allowed_overlap ); - } - else { - parameters_sb.append( ", Max-overlap: not-set" ); - } - if ( no_engulfing_overlaps ) { - parameters_sb.append( ", Engulfing-overlaps: not-allowed" ); - } - else { - parameters_sb.append( ", Engulfing-overlaps: allowed" ); - } - if ( ignore_dufs ) { - parameters_sb.append( ", Ignore-dufs: true" ); - } - else { - parameters_sb.append( ", Ignore-dufs: false" ); - } - parameters_sb.append( ", DC type (if applicable): " + dc_type ); - return parameters_sb; - } - - /** - * Warning: This side-effects 'all_bin_domain_combinations_encountered'! - * - * - * @param output_file - * @param all_bin_domain_combinations_changed - * @param sum_of_all_domains_encountered - * @param all_bin_domain_combinations_encountered - * @param is_gains_analysis - * @param protein_length_stats_by_dc - * @throws IOException - */ - private static void executeFitchGainsAnalysis( final File output_file, - final List all_bin_domain_combinations_changed, - final int sum_of_all_domains_encountered, - final SortedSet all_bin_domain_combinations_encountered, - final boolean is_gains_analysis ) throws IOException { - SurfacingUtil.checkForOutputFileWriteability( output_file ); - final Writer out = ForesterUtil.createBufferedWriter( output_file ); - final SortedMap bdc_to_counts = ForesterUtil - .listToSortedCountsMap( all_bin_domain_combinations_changed ); - final SortedSet all_domains_in_combination_changed_more_than_once = new TreeSet(); - final SortedSet all_domains_in_combination_changed_only_once = new TreeSet(); - int above_one = 0; - int one = 0; - for( final Object bdc_object : bdc_to_counts.keySet() ) { - final BinaryDomainCombination bdc = ( BinaryDomainCombination ) bdc_object; - final int count = bdc_to_counts.get( bdc_object ); - if ( count < 1 ) { - ForesterUtil.unexpectedFatalError( PRG_NAME, "count < 1 " ); - } - out.write( bdc + "\t" + count + ForesterUtil.LINE_SEPARATOR ); - if ( count > 1 ) { - all_domains_in_combination_changed_more_than_once.add( bdc.getId0() ); - all_domains_in_combination_changed_more_than_once.add( bdc.getId1() ); - above_one++; - } - else if ( count == 1 ) { - all_domains_in_combination_changed_only_once.add( bdc.getId0() ); - all_domains_in_combination_changed_only_once.add( bdc.getId1() ); - one++; - } - } - final int all = all_bin_domain_combinations_encountered.size(); - int never_lost = -1; - if ( !is_gains_analysis ) { - all_bin_domain_combinations_encountered.removeAll( all_bin_domain_combinations_changed ); - never_lost = all_bin_domain_combinations_encountered.size(); - for( final BinaryDomainCombination bdc : all_bin_domain_combinations_encountered ) { - out.write( bdc + "\t" + "0" + ForesterUtil.LINE_SEPARATOR ); - } - } - if ( is_gains_analysis ) { - out.write( "Sum of all distinct domain combinations appearing once : " + one - + ForesterUtil.LINE_SEPARATOR ); - out.write( "Sum of all distinct domain combinations appearing more than once : " + above_one - + ForesterUtil.LINE_SEPARATOR ); - out.write( "Sum of all distinct domains in combinations apppearing only once : " - + all_domains_in_combination_changed_only_once.size() + ForesterUtil.LINE_SEPARATOR ); - out.write( "Sum of all distinct domains in combinations apppearing more than once: " - + all_domains_in_combination_changed_more_than_once.size() + ForesterUtil.LINE_SEPARATOR ); - } - else { - out.write( "Sum of all distinct domain combinations never lost : " + never_lost - + ForesterUtil.LINE_SEPARATOR ); - out.write( "Sum of all distinct domain combinations lost once : " + one - + ForesterUtil.LINE_SEPARATOR ); - out.write( "Sum of all distinct domain combinations lost more than once : " + above_one - + ForesterUtil.LINE_SEPARATOR ); - out.write( "Sum of all distinct domains in combinations lost only once : " - + all_domains_in_combination_changed_only_once.size() + ForesterUtil.LINE_SEPARATOR ); - out.write( "Sum of all distinct domains in combinations lost more than once: " - + all_domains_in_combination_changed_more_than_once.size() + ForesterUtil.LINE_SEPARATOR ); - } - out.write( "All binary combinations : " + all - + ForesterUtil.LINE_SEPARATOR ); - out.write( "All domains : " - + sum_of_all_domains_encountered ); - out.close(); - ForesterUtil.programMessage( surfacing.PRG_NAME, - "Wrote fitch domain combination dynamics counts analysis to \"" + output_file - + "\"" ); - } - - private static void executePlusMinusAnalysis( final File output_file, - final List plus_minus_analysis_high_copy_base, - final List plus_minus_analysis_high_copy_target, - final List plus_minus_analysis_low_copy, - final List gwcd_list, - final SortedMap> protein_lists_per_species, - final Map> domain_id_to_go_ids_map, - final Map go_id_to_term_map, - final List plus_minus_analysis_numbers ) { - final Set all_spec = new HashSet(); - for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { - all_spec.add( gwcd.getSpecies().getSpeciesId() ); - } - final File html_out_dom = new File( output_file + PLUS_MINUS_DOM_SUFFIX_HTML ); - final File plain_out_dom = new File( output_file + PLUS_MINUS_DOM_SUFFIX ); - final File html_out_dc = new File( output_file + PLUS_MINUS_DC_SUFFIX_HTML ); - final File all_domains_go_ids_out_dom = new File( output_file + PLUS_MINUS_ALL_GO_IDS_DOM_SUFFIX ); - final File passing_domains_go_ids_out_dom = new File( output_file + PLUS_MINUS_PASSING_GO_IDS_DOM_SUFFIX ); - final File proteins_file_base = new File( output_file + "" ); - final int min_diff = ( ( Integer ) plus_minus_analysis_numbers.get( 0 ) ).intValue(); - final double factor = ( ( Double ) plus_minus_analysis_numbers.get( 1 ) ).doubleValue(); - try { - DomainCountsDifferenceUtil.calculateCopyNumberDifferences( gwcd_list, - protein_lists_per_species, - plus_minus_analysis_high_copy_base, - plus_minus_analysis_high_copy_target, - plus_minus_analysis_low_copy, - min_diff, - factor, - plain_out_dom, - html_out_dom, - html_out_dc, - domain_id_to_go_ids_map, - go_id_to_term_map, - all_domains_go_ids_out_dom, - passing_domains_go_ids_out_dom, - proteins_file_base ); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); - } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis results to \"" - + html_out_dom + "\"" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis results to \"" - + plain_out_dom + "\"" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis results to \"" + html_out_dc - + "\"" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis based passing GO ids to \"" - + passing_domains_go_ids_out_dom + "\"" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis based all GO ids to \"" - + all_domains_go_ids_out_dom + "\"" ); - } - - private static Phylogeny[] getIntrees( final File[] intree_files, - final int number_of_genomes, - final String[][] input_file_properties ) { - final Phylogeny[] intrees = new Phylogeny[ intree_files.length ]; - int i = 0; - for( final File intree_file : intree_files ) { - Phylogeny intree = null; - final String error = ForesterUtil.isReadableFile( intree_file ); - if ( !ForesterUtil.isEmpty( error ) ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read input tree file [" + intree_file + "]: " - + error ); - } - try { - final Phylogeny[] p_array = ParserBasedPhylogenyFactory.getInstance() - .create( intree_file, ParserUtils.createParserDependingOnFileType( intree_file, true ) ); - if ( p_array.length < 1 ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file - + "] does not contain any phylogeny in phyloXML format" ); - } - else if ( p_array.length > 1 ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file - + "] contains more than one phylogeny in phyloXML format" ); - } - intree = p_array[ 0 ]; - } - catch ( final Exception e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "failed to read input tree from file [" + intree_file - + "]: " + error ); - } - if ( ( intree == null ) || intree.isEmpty() ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] is empty" ); - } - if ( !intree.isRooted() ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] is not rooted" ); - } - if ( intree.getNumberOfExternalNodes() < number_of_genomes ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, - "number of external nodes [" + intree.getNumberOfExternalNodes() - + "] of input tree [" + intree_file - + "] is smaller than the number of genomes the be analyzed [" - + number_of_genomes + "]" ); - } - final StringBuilder parent_names = new StringBuilder(); - final int nodes_lacking_name = SurfacingUtil.getNumberOfNodesLackingName( intree, parent_names ); - if ( nodes_lacking_name > 0 ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] has " - + nodes_lacking_name + " node(s) lacking a name [parent names:" + parent_names + "]" ); - } - preparePhylogenyForParsimonyAnalyses( intree, input_file_properties ); - if ( !intree.isCompletelyBinary() ) { - ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "input tree [" + intree_file - + "] is not completely binary" ); - } - intrees[ i++ ] = intree; - } - return intrees; - } - - private static void log( final String msg, final Writer w ) { - try { - w.write( msg ); - w.write( ForesterUtil.LINE_SEPARATOR ); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); - } - } - public static void main( final String args[] ) { final long start_time = new Date().getTime(); // final StringBuffer log = new StringBuffer(); @@ -685,6 +407,17 @@ public class surfacing { if ( cla.isOptionSet( surfacing.IGNORE_DOMAINS_SPECIFIC_TO_ONE_SPECIES_OPTION ) ) { ignore_species_specific_domains = true; } + + + + if ( !cla.isOptionValueSet( surfacing.INPUT_SPECIES_TREE_OPTION ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no input species tree file given: " + + surfacing.INPUT_SPECIES_TREE_OPTION + "=" ); + } + + + + File output_file = null; if ( cla.isOptionSet( surfacing.OUTPUT_FILE_OPTION ) ) { if ( !cla.isOptionValueSet( surfacing.OUTPUT_FILE_OPTION ) ) { @@ -796,7 +529,7 @@ public class surfacing { final List plus_minus_analysis_high_copy_target_species = new ArrayList(); final List plus_minus_analysis_high_low_copy_species = new ArrayList(); final List plus_minus_analysis_numbers = new ArrayList(); - processPlusMinusAnalysisOption( cla, + SurfacingUtil.processPlusMinusAnalysisOption( cla, plus_minus_analysis_high_copy_base_species, plus_minus_analysis_high_copy_target_species, plus_minus_analysis_high_low_copy_species, @@ -818,6 +551,9 @@ public class surfacing { ForesterUtil.fatalError( surfacing.PRG_NAME, "no input genomes file given: " + surfacing.INPUT_GENOMES_FILE_OPTION + "=" ); } + + + DomainSimilarity.DomainSimilarityScoring scoring = SCORING_DEFAULT; if ( cla.isOptionSet( surfacing.SCORING_OPTION ) ) { if ( !cla.isOptionValueSet( surfacing.SCORING_OPTION ) ) { @@ -1032,7 +768,7 @@ public class surfacing { + surfacing.DOMAIN_COUNT_SORT_COMBINATIONS_COUNT + ">\"" ); } } - final String[][] input_file_properties = processInputGenomesFile( input_genomes_file ); + final String[][] input_file_properties = SurfacingUtil.processInputGenomesFile( input_genomes_file ); final int number_of_genomes = input_file_properties.length; if ( number_of_genomes < 2 ) { ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot analyze less than two files" ); @@ -1042,7 +778,7 @@ public class surfacing { + surfacing.PAIRWISE_DOMAIN_COMPARISONS_OPTION + "= to turn on pairwise analyses with less than three input files" ); } - checkWriteabilityForPairwiseComparisons( domain_similarity_print_option, + SurfacingUtil.checkWriteabilityForPairwiseComparisons( domain_similarity_print_option, input_file_properties, automated_pairwise_comparison_suffix, out_dir ); @@ -1179,8 +915,10 @@ public class surfacing { intree_files = new File[ 1 ]; intree_files[ 0 ] = new File( intrees_str ); } - intrees = getIntrees( intree_files, number_of_genomes, input_file_properties ); + intrees = SurfacingUtil.obtainAndPreProcessIntrees( intree_files, number_of_genomes, input_file_properties ); + } + final Phylogeny intree_0_orig = SurfacingUtil.obtainFirstIntree( intree_files[ 0 ]); long random_number_seed_for_fitch_parsimony = 0l; boolean radomize_fitch_parsimony = false; if ( cla.isOptionSet( surfacing.RANDOM_SEED_FOR_FITCH_PARSIMONY_OPTION ) ) { @@ -1202,13 +940,13 @@ public class surfacing { || ( negative_domains_filter_file != null ) ) { filter = new TreeSet(); if ( positive_filter_file != null ) { - processFilter( positive_filter_file, filter ); + SurfacingUtil.processFilter( positive_filter_file, filter ); } else if ( negative_filter_file != null ) { - processFilter( negative_filter_file, filter ); + SurfacingUtil.processFilter( negative_filter_file, filter ); } else if ( negative_domains_filter_file != null ) { - processFilter( negative_domains_filter_file, filter ); + SurfacingUtil.processFilter( negative_domains_filter_file, filter ); } } Map>[] domain_id_to_secondary_features_maps = null; @@ -1709,10 +1447,10 @@ public class surfacing { for( int i = 0; i < number_of_genomes; ++i ) { System.out.println(); System.out.println( ( i + 1 ) + "/" + number_of_genomes ); - log( ( i + 1 ) + "/" + number_of_genomes, log_writer ); + SurfacingUtil.log( ( i + 1 ) + "/" + number_of_genomes, log_writer ); System.out.println( "Processing : " + input_file_properties[ i ][ 1 ] + " [" + input_file_properties[ i ][ 0 ] + "]" ); - log( "Genome : " + input_file_properties[ i ][ 1 ] + " [" + SurfacingUtil.log( "Genome : " + input_file_properties[ i ][ 1 ] + " [" + input_file_properties[ i ][ 0 ] + "]", log_writer ); HmmscanPerDomainTableParser parser = null; INDIVIDUAL_SCORE_CUTOFF ind_score_cutoff = INDIVIDUAL_SCORE_CUTOFF.NONE; @@ -1784,64 +1522,64 @@ public class surfacing { distinct_domain_architecuture_counts ); } System.out.println( "Number of proteins encountered : " + parser.getProteinsEncountered() ); - log( "Number of proteins encountered : " + parser.getProteinsEncountered(), log_writer ); + SurfacingUtil.log( "Number of proteins encountered : " + parser.getProteinsEncountered(), log_writer ); System.out.println( "Number of proteins stored : " + protein_list.size() ); - log( "Number of proteins stored : " + protein_list.size(), log_writer ); + SurfacingUtil.log( "Number of proteins stored : " + protein_list.size(), log_writer ); System.out.println( "Coverage : " + ForesterUtil.roundToInt( 100.0 * coverage ) + "%" ); - log( "Coverage : " + ForesterUtil.roundToInt( 100.0 * coverage ) + SurfacingUtil.log( "Coverage : " + ForesterUtil.roundToInt( 100.0 * coverage ) + "%", log_writer ); System.out.println( "Domains encountered : " + parser.getDomainsEncountered() ); - log( "Domains encountered : " + parser.getDomainsEncountered(), log_writer ); + SurfacingUtil.log( "Domains encountered : " + parser.getDomainsEncountered(), log_writer ); System.out.println( "Domains stored : " + parser.getDomainsStored() ); - log( "Domains stored : " + parser.getDomainsStored(), log_writer ); + SurfacingUtil.log( "Domains stored : " + parser.getDomainsStored(), log_writer ); System.out.println( "Distinct domains stored : " + parser.getDomainsStoredSet().size() ); - log( "Distinct domains stored : " + parser.getDomainsStoredSet().size(), log_writer ); + SurfacingUtil.log( "Distinct domains stored : " + parser.getDomainsStoredSet().size(), log_writer ); System.out.println( "Domains ignored due to individual score cutoffs: " + parser.getDomainsIgnoredDueToIndividualScoreCutoff() ); - log( "Domains ignored due to individual score cutoffs: " + SurfacingUtil.log( "Domains ignored due to individual score cutoffs: " + parser.getDomainsIgnoredDueToIndividualScoreCutoff(), log_writer ); System.out.println( "Domains ignored due to E-value : " + parser.getDomainsIgnoredDueToEval() ); - log( "Domains ignored due to E-value : " + parser.getDomainsIgnoredDueToEval(), log_writer ); + SurfacingUtil.log( "Domains ignored due to E-value : " + parser.getDomainsIgnoredDueToEval(), log_writer ); System.out.println( "Domains ignored due to DUF designation : " + parser.getDomainsIgnoredDueToDuf() ); - log( "Domains ignored due to DUF designation : " + parser.getDomainsIgnoredDueToDuf(), log_writer ); + SurfacingUtil.log( "Domains ignored due to DUF designation : " + parser.getDomainsIgnoredDueToDuf(), log_writer ); if ( ignore_virus_like_ids ) { System.out.println( "Domains ignored due virus like ids : " + parser.getDomainsIgnoredDueToVirusLikeIds() ); - log( "Domains ignored due virus like ids : " + parser.getDomainsIgnoredDueToVirusLikeIds(), + SurfacingUtil.log( "Domains ignored due virus like ids : " + parser.getDomainsIgnoredDueToVirusLikeIds(), log_writer ); } System.out.println( "Domains ignored due negative domain filter : " + parser.getDomainsIgnoredDueToNegativeDomainFilter() ); - log( "Domains ignored due negative domain filter : " + SurfacingUtil.log( "Domains ignored due negative domain filter : " + parser.getDomainsIgnoredDueToNegativeDomainFilter(), log_writer ); System.out.println( "Domains ignored due to overlap : " + parser.getDomainsIgnoredDueToOverlap() ); - log( "Domains ignored due to overlap : " + parser.getDomainsIgnoredDueToOverlap(), + SurfacingUtil.log( "Domains ignored due to overlap : " + parser.getDomainsIgnoredDueToOverlap(), log_writer ); if ( negative_filter_file != null ) { System.out.println( "Proteins ignored due to negative filter : " + parser.getProteinsIgnoredDueToFilter() ); - log( "Proteins ignored due to negative filter : " + parser.getProteinsIgnoredDueToFilter(), + SurfacingUtil.log( "Proteins ignored due to negative filter : " + parser.getProteinsIgnoredDueToFilter(), log_writer ); } if ( positive_filter_file != null ) { System.out.println( "Proteins ignored due to positive filter : " + parser.getProteinsIgnoredDueToFilter() ); - log( "Proteins ignored due to positive filter : " + parser.getProteinsIgnoredDueToFilter(), + SurfacingUtil.log( "Proteins ignored due to positive filter : " + parser.getProteinsIgnoredDueToFilter(), log_writer ); } if ( da_analysis ) { System.out.println( "Distinct domain architectures stored : " + distinct_das ); - log( "Distinct domain architectures stored : " + distinct_das, log_writer ); + SurfacingUtil.log( "Distinct domain architectures stored : " + distinct_das, log_writer ); } System.out.println( "Time for processing : " + parser.getTime() + "ms" ); - log( "", log_writer ); + SurfacingUtil.log( "", log_writer ); try { int count = 0; for( final Protein protein : protein_list ) { @@ -1954,7 +1692,7 @@ public class surfacing { domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.getMax() + "" ); domains_per_potein_stats_writer.write( "\n" ); domains_per_potein_stats_writer.close(); - printOutPercentageOfMultidomainProteins( all_genomes_domains_per_potein_histo, log_writer ); + SurfacingUtil.printOutPercentageOfMultidomainProteins( all_genomes_domains_per_potein_histo, log_writer ); ForesterUtil.map2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file + "_all_genomes_domains_per_potein_histo.txt" ), all_genomes_domains_per_potein_histo, "\t", "\n" ); ForesterUtil.collection2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file @@ -1970,9 +1708,9 @@ public class surfacing { ForesterUtil.programMessage( PRG_NAME, "Range of proteins with a least one domain assigned: " + ( 100 * protein_coverage_stats.getMin() ) + "%-" + ( 100 * protein_coverage_stats.getMax() ) + "%" ); - log( "Average of prot with a least one dom assigned : " + ( 100 * protein_coverage_stats.arithmeticMean() ) + SurfacingUtil.log( "Average of prot with a least one dom assigned : " + ( 100 * protein_coverage_stats.arithmeticMean() ) + "% (+/-" + ( 100 * protein_coverage_stats.sampleStandardDeviation() ) + "%)", log_writer ); - log( "Range of prot with a least one dom assigned : " + ( 100 * protein_coverage_stats.getMin() ) + "%-" + SurfacingUtil.log( "Range of prot with a least one dom assigned : " + ( 100 * protein_coverage_stats.getMin() ) + "%-" + ( 100 * protein_coverage_stats.getMax() ) + "%", log_writer ); } catch ( final IOException e2 ) { @@ -2050,7 +1788,7 @@ public class surfacing { my_outfile = my_outfile.substring( 0, my_outfile.length() - 5 ); } split_writers = new HashMap(); - createSplitWriters( out_dir, my_outfile, split_writers ); + SurfacingUtil.createSplitWriters( out_dir, my_outfile, split_writers ); } else if ( !my_outfile.endsWith( ".html" ) ) { my_outfile += ".html"; @@ -2085,7 +1823,7 @@ public class surfacing { scoring, true, tax_code_to_id_map, - intrees[ 0 ] ); + intree_0_orig ); simple_tab_writer.close(); ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote main output (includes domain similarities) to: \"" + ( out_dir == null ? my_outfile : out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile ) + "\"" ); @@ -2125,7 +1863,7 @@ public class surfacing { write_pwc_files, tax_code_to_id_map, CALC_SIMILARITY_SCORES, - intrees[ 0 ] ); + intree_0_orig ); String matrix_output_file = new String( output_file.toString() ); if ( matrix_output_file.indexOf( '.' ) > 1 ) { matrix_output_file = matrix_output_file.substring( 0, matrix_output_file.indexOf( '.' ) ); @@ -2161,10 +1899,10 @@ public class surfacing { output_file = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file ); } if ( write_to_nexus ) { - writePresentToNexus( output_file, positive_filter_file, filter, gwcd_list ); + SurfacingUtil.writePresentToNexus( output_file, positive_filter_file, filter, gwcd_list ); } if ( ( ( intrees != null ) && ( intrees.length > 0 ) ) && ( number_of_genomes > 2 ) ) { - final StringBuilder parameters_sb = createParametersAsString( ignore_dufs, + final StringBuilder parameters_sb = SurfacingUtil.createParametersAsString( ignore_dufs, e_value_max, max_allowed_overlap, no_engulfing_overlaps, @@ -2243,7 +1981,7 @@ public class surfacing { } // for( final Phylogeny intree : intrees ) { } if ( plus_minus_analysis_high_copy_base_species.size() > 0 ) { - executePlusMinusAnalysis( output_file, + SurfacingUtil.executePlusMinusAnalysis( output_file, plus_minus_analysis_high_copy_base_species, plus_minus_analysis_high_copy_target_species, plus_minus_analysis_high_low_copy_species, @@ -2254,7 +1992,7 @@ public class surfacing { plus_minus_analysis_numbers ); } if ( output_protein_lists_for_all_domains ) { - writeProteinListsForAllSpecies( out_dir, + SurfacingUtil.writeProteinListsForAllSpecies( out_dir, protein_lists_per_species, gwcd_list, output_list_of_all_proteins_per_domain_e_value_max ); @@ -2262,7 +2000,7 @@ public class surfacing { gwcd_list = null; if ( all_bin_domain_combinations_gained_fitch != null ) { try { - executeFitchGainsAnalysis( new File( output_file + SurfacingUtil.executeFitchGainsAnalysis( new File( output_file + surfacing.OUTPUT_DOMAIN_COMBINATIONS_GAINED_MORE_THAN_ONCE_ANALYSIS_SUFFIX ), all_bin_domain_combinations_gained_fitch, all_domains_encountered.size(), @@ -2275,7 +2013,7 @@ public class surfacing { } if ( all_bin_domain_combinations_lost_fitch != null ) { try { - executeFitchGainsAnalysis( new File( output_file + SurfacingUtil.executeFitchGainsAnalysis( new File( output_file + surfacing.OUTPUT_DOMAIN_COMBINATIONS_LOST_MORE_THAN_ONCE_ANALYSIS_SUFFIX ), all_bin_domain_combinations_lost_fitch, all_domains_encountered.size(), @@ -2300,130 +2038,6 @@ public class surfacing { System.out.println(); } - private static void createSplitWriters( final File out_dir, - final String my_outfile, - final Map split_writers ) throws IOException { - split_writers.put( 'a', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_A.html" ) ) ); - split_writers.put( 'b', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_B.html" ) ) ); - split_writers.put( 'c', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_C.html" ) ) ); - split_writers.put( 'd', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_D.html" ) ) ); - split_writers.put( 'e', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_E.html" ) ) ); - split_writers.put( 'f', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_F.html" ) ) ); - split_writers.put( 'g', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_G.html" ) ) ); - split_writers.put( 'h', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_H.html" ) ) ); - split_writers.put( 'i', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_I.html" ) ) ); - split_writers.put( 'j', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_J.html" ) ) ); - split_writers.put( 'k', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_K.html" ) ) ); - split_writers.put( 'l', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_L.html" ) ) ); - split_writers.put( 'm', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_M.html" ) ) ); - split_writers.put( 'n', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_N.html" ) ) ); - split_writers.put( 'o', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_O.html" ) ) ); - split_writers.put( 'p', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_P.html" ) ) ); - split_writers.put( 'q', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_Q.html" ) ) ); - split_writers.put( 'r', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_R.html" ) ) ); - split_writers.put( 's', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_S.html" ) ) ); - split_writers.put( 't', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_T.html" ) ) ); - split_writers.put( 'u', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_U.html" ) ) ); - split_writers.put( 'v', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_V.html" ) ) ); - split_writers.put( 'w', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_W.html" ) ) ); - split_writers.put( 'x', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_X.html" ) ) ); - split_writers.put( 'y', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_Y.html" ) ) ); - split_writers.put( 'z', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_Z.html" ) ) ); - split_writers.put( '0', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_0.html" ) ) ); - } - - private static void printOutPercentageOfMultidomainProteins( final SortedMap all_genomes_domains_per_potein_histo, - final Writer log_writer ) { - int sum = 0; - for( final Entry entry : all_genomes_domains_per_potein_histo.entrySet() ) { - sum += entry.getValue(); - } - final double percentage = ( 100.0 * ( sum - all_genomes_domains_per_potein_histo.get( 1 ) ) ) / sum; - ForesterUtil.programMessage( PRG_NAME, "Percentage of multidomain proteins: " + percentage + "%" ); - log( "Percentage of multidomain proteins: : " + percentage + "%", log_writer ); - } - - private static void preparePhylogenyForParsimonyAnalyses( final Phylogeny intree, - final String[][] input_file_properties ) { - final String[] genomes = new String[ input_file_properties.length ]; - for( int i = 0; i < input_file_properties.length; ++i ) { - if ( intree.getNodes( input_file_properties[ i ][ 1 ] ).size() > 1 ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "node named [" + input_file_properties[ i ][ 1 ] - + "] is not unique in input tree " + intree.getName() ); - } - genomes[ i ] = input_file_properties[ i ][ 1 ]; - } - // - final PhylogenyNodeIterator it = intree.iteratorPostorder(); - while ( it.hasNext() ) { - final PhylogenyNode n = it.next(); - if ( ForesterUtil.isEmpty( n.getName() ) ) { - if ( n.getNodeData().isHasTaxonomy() - && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getTaxonomyCode() ) ) { - n.setName( n.getNodeData().getTaxonomy().getTaxonomyCode() ); - } - else if ( n.getNodeData().isHasTaxonomy() - && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getScientificName() ) ) { - n.setName( n.getNodeData().getTaxonomy().getScientificName() ); - } - else if ( n.getNodeData().isHasTaxonomy() - && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getCommonName() ) ) { - n.setName( n.getNodeData().getTaxonomy().getCommonName() ); - } - else { - ForesterUtil - .fatalError( surfacing.PRG_NAME, - "node with no name, scientific name, common name, or taxonomy code present" ); - } - } - } - // - final List igns = PhylogenyMethods.deleteExternalNodesPositiveSelection( genomes, intree ); - if ( igns.size() > 0 ) { - System.out.println( "Not using the following " + igns.size() + " nodes:" ); - for( int i = 0; i < igns.size(); ++i ) { - System.out.println( " " + i + ": " + igns.get( i ) ); - } - System.out.println( "--" ); - } - for( final String[] input_file_propertie : input_file_properties ) { - try { - intree.getNode( input_file_propertie[ 1 ] ); - } - catch ( final IllegalArgumentException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "node named [" + input_file_propertie[ 1 ] - + "] not present/not unique in input tree" ); - } - } - } - private static void printHelp() { System.out.println(); System.out.println( "Usage:" ); @@ -2519,204 +2133,4 @@ public class surfacing { + "-ds_output=detailed_html -scoring=domains -sort=alpha " ); System.out.println(); } - - private static void processFilter( final File filter_file, final SortedSet filter ) { - SortedSet filter_str = null; - try { - filter_str = ForesterUtil.file2set( filter_file ); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); - } - if ( filter_str != null ) { - for( final String string : filter_str ) { - filter.add( string ); - } - } - if ( VERBOSE ) { - System.out.println( "Filter:" ); - for( final String domainId : filter ) { - System.out.println( domainId ); - } - } - } - - private static String[][] processInputGenomesFile( final File input_genomes ) { - String[][] input_file_properties = null; - try { - input_file_properties = ForesterUtil.file22dArray( input_genomes ); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, - "genomes files is to be in the following format \" \": " - + e.getLocalizedMessage() ); - } - final Set specs = new HashSet(); - final Set paths = new HashSet(); - for( int i = 0; i < input_file_properties.length; ++i ) { - if ( !PhyloXmlUtil.TAXOMONY_CODE_PATTERN.matcher( input_file_properties[ i ][ 1 ] ).matches() ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "illegal format for species code: " - + input_file_properties[ i ][ 1 ] ); - } - if ( specs.contains( input_file_properties[ i ][ 1 ] ) ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "species code " + input_file_properties[ i ][ 1 ] - + " is not unique" ); - } - specs.add( input_file_properties[ i ][ 1 ] ); - if ( paths.contains( input_file_properties[ i ][ 0 ] ) ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "path " + input_file_properties[ i ][ 0 ] - + " is not unique" ); - } - paths.add( input_file_properties[ i ][ 0 ] ); - final String error = ForesterUtil.isReadableFile( new File( input_file_properties[ i ][ 0 ] ) ); - if ( !ForesterUtil.isEmpty( error ) ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, error ); - } - } - return input_file_properties; - } - - private static void processPlusMinusAnalysisOption( final CommandLineArguments cla, - final List high_copy_base, - final List high_copy_target, - final List low_copy, - final List numbers ) { - if ( cla.isOptionSet( surfacing.PLUS_MINUS_ANALYSIS_OPTION ) ) { - if ( !cla.isOptionValueSet( surfacing.PLUS_MINUS_ANALYSIS_OPTION ) ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for 'plus-minus' file: -" - + surfacing.PLUS_MINUS_ANALYSIS_OPTION + "=" ); - } - final File plus_minus_file = new File( cla.getOptionValue( surfacing.PLUS_MINUS_ANALYSIS_OPTION ) ); - final String msg = ForesterUtil.isReadableFile( plus_minus_file ); - if ( !ForesterUtil.isEmpty( msg ) ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "can not read from \"" + plus_minus_file + "\": " + msg ); - } - processPlusMinusFile( plus_minus_file, high_copy_base, high_copy_target, low_copy, numbers ); - } - } - - // First numbers is minimal difference, second is factor. - private static void processPlusMinusFile( final File plus_minus_file, - final List high_copy_base, - final List high_copy_target, - final List low_copy, - final List numbers ) { - Set species_set = null; - int min_diff = PLUS_MINUS_ANALYSIS_MIN_DIFF_DEFAULT; - double factor = PLUS_MINUS_ANALYSIS_FACTOR_DEFAULT; - try { - species_set = ForesterUtil.file2set( plus_minus_file ); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); - } - if ( species_set != null ) { - for( final String species : species_set ) { - final String species_trimmed = species.substring( 1 ); - if ( species.startsWith( "+" ) ) { - if ( low_copy.contains( species_trimmed ) ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, - "species/genome names can not appear with both '+' and '-' suffix, as appears the case for: \"" - + species_trimmed + "\"" ); - } - high_copy_base.add( species_trimmed ); - } - else if ( species.startsWith( "*" ) ) { - if ( low_copy.contains( species_trimmed ) ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, - "species/genome names can not appear with both '*' and '-' suffix, as appears the case for: \"" - + species_trimmed + "\"" ); - } - high_copy_target.add( species_trimmed ); - } - else if ( species.startsWith( "-" ) ) { - if ( high_copy_base.contains( species_trimmed ) || high_copy_target.contains( species_trimmed ) ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, - "species/genome names can not appear with both '+' or '*' and '-' suffix, as appears the case for: \"" - + species_trimmed + "\"" ); - } - low_copy.add( species_trimmed ); - } - else if ( species.startsWith( "$D" ) ) { - try { - min_diff = Integer.parseInt( species.substring( 3 ) ); - } - catch ( final NumberFormatException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, - "could not parse integer value for minimal difference from: \"" - + species.substring( 3 ) + "\"" ); - } - } - else if ( species.startsWith( "$F" ) ) { - try { - factor = Double.parseDouble( species.substring( 3 ) ); - } - catch ( final NumberFormatException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "could not parse double value for factor from: \"" - + species.substring( 3 ) + "\"" ); - } - } - else if ( species.startsWith( "#" ) ) { - // Comment, ignore. - } - else { - ForesterUtil - .fatalError( surfacing.PRG_NAME, - "species/genome names in 'plus minus' file must begin with '*' (high copy target genome), '+' (high copy base genomes), '-' (low copy genomes), '$D=' minimal Difference (default is 1), '$F=' factor (default is 1.0), double), or '#' (ignore) suffix, encountered: \"" - + species + "\"" ); - } - numbers.add( new Integer( min_diff + "" ) ); - numbers.add( new Double( factor + "" ) ); - } - } - else { - ForesterUtil.fatalError( surfacing.PRG_NAME, "'plus minus' file [" + plus_minus_file + "] appears empty" ); - } - } - - private static void writePresentToNexus( final File output_file, - final File positive_filter_file, - final SortedSet filter, - final List gwcd_list ) { - try { - SurfacingUtil - .writeMatrixToFile( DomainParsimonyCalculator - .createMatrixOfDomainPresenceOrAbsence( gwcd_list, positive_filter_file == null ? null - : filter ), output_file + DOMAINS_PRESENT_NEXUS, Format.NEXUS_BINARY ); - SurfacingUtil.writeMatrixToFile( DomainParsimonyCalculator - .createMatrixOfBinaryDomainCombinationPresenceOrAbsence( gwcd_list ), output_file - + BDC_PRESENT_NEXUS, Format.NEXUS_BINARY ); - } - catch ( final Exception e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); - } - } - - private static void writeProteinListsForAllSpecies( final File output_dir, - final SortedMap> protein_lists_per_species, - final List gwcd_list, - final double domain_e_cutoff ) { - final SortedSet all_domains = new TreeSet(); - for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { - all_domains.addAll( gwcd.getAllDomainIds() ); - } - for( final String domain : all_domains ) { - final File out = new File( output_dir + ForesterUtil.FILE_SEPARATOR + domain + SEQ_EXTRACT_SUFFIX ); - SurfacingUtil.checkForOutputFileWriteability( out ); - try { - final Writer proteins_file_writer = new BufferedWriter( new FileWriter( out ) ); - SurfacingUtil.extractProteinNames( protein_lists_per_species, - domain, - proteins_file_writer, - "\t", - LIMIT_SPEC_FOR_PROT_EX, - domain_e_cutoff ); - proteins_file_writer.close(); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); - } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote proteins list to \"" + out + "\"" ); - } - } } diff --git a/forester/java/src/org/forester/phylogeny/Phylogeny.java b/forester/java/src/org/forester/phylogeny/Phylogeny.java index 3679cd5..eda37d3 100644 --- a/forester/java/src/org/forester/phylogeny/Phylogeny.java +++ b/forester/java/src/org/forester/phylogeny/Phylogeny.java @@ -38,6 +38,7 @@ import java.util.Map; import java.util.NoSuchElementException; import java.util.Vector; +import org.forester.io.parsers.nhx.NHXParser; import org.forester.io.writers.PhylogenyWriter; import org.forester.phylogeny.PhylogenyNode.NH_CONVERSION_SUPPORT_VALUE_STYLE; import org.forester.phylogeny.data.BranchData; @@ -47,6 +48,8 @@ import org.forester.phylogeny.data.PhylogenyDataUtil; import org.forester.phylogeny.data.Sequence; import org.forester.phylogeny.data.SequenceRelation; import org.forester.phylogeny.data.SequenceRelation.SEQUENCE_RELATION_TYPE; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; +import org.forester.phylogeny.factories.PhylogenyFactory; import org.forester.phylogeny.iterators.ExternalForwardIterator; import org.forester.phylogeny.iterators.LevelOrderTreeIterator; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; @@ -304,6 +307,16 @@ public class Phylogeny { return _distance_unit; } + public final static Phylogeny createInstanceFromNhxString( final String nhx ) throws IOException { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + + return factory.create( nhx, new NHXParser() )[ 0 ]; + + + + } + + /** * * Warning. The order of the returned nodes is random diff --git a/forester/java/src/org/forester/surfacing/PrintableDomainSimilarity.java b/forester/java/src/org/forester/surfacing/PrintableDomainSimilarity.java index 931fa32..9abbe22 100644 --- a/forester/java/src/org/forester/surfacing/PrintableDomainSimilarity.java +++ b/forester/java/src/org/forester/surfacing/PrintableDomainSimilarity.java @@ -27,6 +27,7 @@ package org.forester.surfacing; import java.awt.Color; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.SortedMap; @@ -42,9 +43,9 @@ import org.forester.util.ForesterUtil; public class PrintableDomainSimilarity implements DomainSimilarity { - final public static String SPECIES_SEPARATOR = " "; - final private static int EQUAL = 0; - final private static String NO_SPECIES = " "; + final public static String SPECIES_SEPARATOR = " "; + final private static int EQUAL = 0; + final private static String NO_SPECIES = " "; final private double _min; final private double _max; final private double _mean; @@ -57,6 +58,7 @@ public class PrintableDomainSimilarity implements DomainSimilarity { private List _species_order; private DomainSimilarityCalculator.Detailedness _detailedness; private final boolean _treat_as_binary_comparison; + private final static Map _TAXCODE_HEXCOLORSTRING_MAP = new HashMap(); public PrintableDomainSimilarity( final CombinableDomains combinable_domains, final double min, @@ -186,19 +188,22 @@ public class PrintableDomainSimilarity implements DomainSimilarity { final String tax_code, final Map tax_code_to_id_map, final Phylogeny phy ) { - Color c = null; + String hex = null; if ( phy != null && !phy.isEmpty() ) { - c = getColorDependingOnTaxonomy( tax_code, phy ); + hex = obtainHexColorStringDependingOnTaxonomyGroup( tax_code, phy ); } - if ( c == null ) { - c = new Color( 0, 0, 0 ); - } - final String hex = String.format( "#%02x%02x%02x", c.getRed(), c.getGreen(), c.getBlue() ); sb.append( "" ); if ( !ForesterUtil.isEmpty( tax_code ) && ( ( tax_code_to_id_map != null ) && tax_code_to_id_map.containsKey( tax_code ) ) ) { - sb.append( "" + tax_code + "" ); + if ( !ForesterUtil.isEmpty( hex ) ) { + sb.append( "" + + tax_code + "" ); + } + else { + sb.append( "" + tax_code + "" ); + } } else { sb.append( tax_code ); @@ -206,31 +211,38 @@ public class PrintableDomainSimilarity implements DomainSimilarity { sb.append( "" ); } - private Color getColorDependingOnTaxonomy( final String tax_code, final Phylogeny phy ) { - List nodes = phy.getNodesViaTaxonomyCode( tax_code ); - Color c = null; - if ( nodes == null || nodes.isEmpty() ) { - throw new RuntimeException( tax_code + " is not found" ); - } - if ( nodes.size() != 1 ) { - throw new RuntimeException( tax_code + " is not unique" ); - } - PhylogenyNode n = nodes.get( 0 ); - while ( n != null ) { - c = null; - if ( n.getNodeData().isHasTaxonomy() - && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getScientificName() ) ) { - c = SurfacingUtil.getColorForTaxCode( n.getNodeData().getTaxonomy().getScientificName() ); + private String obtainHexColorStringDependingOnTaxonomyGroup( final String tax_code, final Phylogeny phy ) { + if ( phy != null && !_TAXCODE_HEXCOLORSTRING_MAP.containsKey( tax_code ) ) { + List nodes = phy.getNodesViaTaxonomyCode( tax_code ); + Color c = null; + if ( nodes == null || nodes.isEmpty() ) { + throw new RuntimeException( tax_code + " is not found" ); } - if ( c == null && !ForesterUtil.isEmpty( n.getName() ) ) { - c = SurfacingUtil.getColorForTaxCode( n.getName() ); + if ( nodes.size() != 1 ) { + throw new RuntimeException( tax_code + " is not unique" ); } - if ( c != null ) { - break; + PhylogenyNode n = nodes.get( 0 ); + while ( n != null ) { + if ( n.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getScientificName() ) ) { + c = ForesterUtil.obtainColorDependingOnTaxonomyGroup( n.getNodeData().getTaxonomy() + .getScientificName() ); + } + if ( c == null && !ForesterUtil.isEmpty( n.getName() ) ) { + c = ForesterUtil.obtainColorDependingOnTaxonomyGroup( n.getName() ); + } + if ( c != null ) { + break; + } + n = n.getParent(); } - n = n.getParent(); + if ( c == null ) { + throw new RuntimeException( "no color found for taxonomy code \"" + tax_code + "\"" ); + } + final String hex = String.format( "#%02x%02x%02x", c.getRed(), c.getGreen(), c.getBlue() ); + _TAXCODE_HEXCOLORSTRING_MAP.put( tax_code, hex ); } - return c; + return _TAXCODE_HEXCOLORSTRING_MAP.get( tax_code ); } private int compareByDomainId( final DomainSimilarity other ) { @@ -350,7 +362,13 @@ public class PrintableDomainSimilarity implements DomainSimilarity { sb.append( "" + e.getKey() + "" ); sb.append( ": " ); for( final String s : e.getValue() ) { - sb.append( s ); + final String hex = obtainHexColorStringDependingOnTaxonomyGroup( s, null ); + if ( !ForesterUtil.isEmpty( hex ) ) { + sb.append( "" + s + "" ); + } + else { + sb.append( s ); + } sb.append( " " ); } sb.append( "
" ); diff --git a/forester/java/src/org/forester/surfacing/PrintableSpeciesSpecificDcData.java b/forester/java/src/org/forester/surfacing/PrintableSpeciesSpecificDcData.java index 6c6e6fd..55db1bd 100644 --- a/forester/java/src/org/forester/surfacing/PrintableSpeciesSpecificDcData.java +++ b/forester/java/src/org/forester/surfacing/PrintableSpeciesSpecificDcData.java @@ -121,9 +121,9 @@ class PrintableSpeciesSpecificDcData implements SpeciesSpecificDcData { } if ( html ) { final Set ids = getCombinableDomainIdToCountsMap().keySet(); - int i = 0; + for( final String domain_id : ids ) { - ++i; + sb.append( " " ); if ( html ) { sb.append( "" + domain_id diff --git a/forester/java/src/org/forester/surfacing/SurfacingUtil.java b/forester/java/src/org/forester/surfacing/SurfacingUtil.java index 27678c4..599fd73 100644 --- a/forester/java/src/org/forester/surfacing/SurfacingUtil.java +++ b/forester/java/src/org/forester/surfacing/SurfacingUtil.java @@ -26,7 +26,6 @@ package org.forester.surfacing; -import java.awt.Color; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; @@ -67,6 +66,8 @@ import org.forester.go.GoNameSpace; import org.forester.go.GoTerm; import org.forester.go.PfamToGoMapping; import org.forester.io.parsers.nexus.NexusConstants; +import org.forester.io.parsers.phyloxml.PhyloXmlUtil; +import org.forester.io.parsers.util.ParserUtils; import org.forester.io.writers.PhylogenyWriter; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyMethods; @@ -75,6 +76,7 @@ import org.forester.phylogeny.PhylogenyNode.NH_CONVERSION_SUPPORT_VALUE_STYLE; import org.forester.phylogeny.data.BinaryCharacters; import org.forester.phylogeny.data.Confidence; import org.forester.phylogeny.data.Taxonomy; +import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.protein.BasicDomain; import org.forester.protein.BasicProtein; @@ -89,11 +91,31 @@ import org.forester.util.AsciiHistogram; import org.forester.util.BasicDescriptiveStatistics; import org.forester.util.BasicTable; import org.forester.util.BasicTableParser; +import org.forester.util.CommandLineArguments; import org.forester.util.DescriptiveStatistics; import org.forester.util.ForesterUtil; public final class SurfacingUtil { + final static class DomainComparator implements Comparator { + + final private boolean _ascending; + + public DomainComparator( final boolean ascending ) { + _ascending = ascending; + } + + @Override + public final int compare( final Domain d0, final Domain d1 ) { + if ( d0.getFrom() < d1.getFrom() ) { + return _ascending ? -1 : 1; + } + else if ( d0.getFrom() > d1.getFrom() ) { + return _ascending ? 1 : -1; + } + return 0; + } + } private final static NumberFormat FORMATTER_3 = new DecimalFormat( "0.000" ); private static final Comparator ASCENDING_CONFIDENCE_VALUE_ORDER = new Comparator() { @@ -116,10 +138,6 @@ public final class SurfacingUtil { }; public final static Pattern PATTERN_SP_STYLE_TAXONOMY = Pattern.compile( "^[A-Z0-9]{3,5}$" ); - private SurfacingUtil() { - // Hidden constructor. - } - public static void addAllBinaryDomainCombinationToSet( final GenomeWideCombinableDomains genome, final SortedSet binary_domain_combinations ) { final SortedMap all_cd = genome.getAllCombinableDomainsIds(); @@ -184,6 +202,15 @@ public final class SurfacingUtil { w.write( SurfacingConstants.NL ); } + private final static void addToCountMap( final Map map, final String s ) { + if ( map.containsKey( s ) ) { + map.put( s, map.get( s ) + 1 ); + } + else { + map.put( s, 1 ); + } + } + public static DescriptiveStatistics calculateDescriptiveStatisticsForMeanValues( final Set similarities ) { final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); for( final DomainSimilarity similarity : similarities ) { @@ -192,279 +219,799 @@ public final class SurfacingUtil { return stats; } - public static void checkForOutputFileWriteability( final File outfile ) { - final String error = ForesterUtil.isWritableFile( outfile ); - if ( !ForesterUtil.isEmpty( error ) ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, error ); - } - } - - public static void collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( final CharacterStateMatrix matrix, - final BinaryDomainCombination.DomainCombinationType dc_type, - final List all_binary_domains_combination_gained, - final boolean get_gains ) { - final SortedSet sorted_ids = new TreeSet(); - for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { - sorted_ids.add( matrix.getIdentifier( i ) ); - } - for( final String id : sorted_ids ) { - for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { - if ( ( get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) - || ( !get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.LOSS ) ) ) { - if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) { - all_binary_domains_combination_gained.add( AdjactantDirectedBinaryDomainCombination - .createInstance( matrix.getCharacter( c ) ) ); - } - else if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED ) { - all_binary_domains_combination_gained.add( DirectedBinaryDomainCombination - .createInstance( matrix.getCharacter( c ) ) ); + private static void calculateIndependentDomainCombinationGains( final Phylogeny local_phylogeny_l, + final String outfilename_for_counts, + final String outfilename_for_dc, + final String outfilename_for_dc_for_go_mapping, + final String outfilename_for_dc_for_go_mapping_unique, + final String outfilename_for_rank_counts, + final String outfilename_for_ancestor_species_counts, + final String outfilename_for_protein_stats, + final Map protein_length_stats_by_dc, + final Map domain_number_stats_by_dc, + final Map domain_length_stats_by_domain ) { + try { + // + // if ( protein_length_stats_by_dc != null ) { + // for( final Entry entry : protein_length_stats_by_dc.entrySet() ) { + // System.out.print( entry.getKey().toString() ); + // System.out.print( ": " ); + // double[] a = entry.getValue().getDataAsDoubleArray(); + // for( int i = 0; i < a.length; i++ ) { + // System.out.print( a[ i ] + " " ); + // } + // System.out.println(); + // } + // } + // if ( domain_number_stats_by_dc != null ) { + // for( final Entry entry : domain_number_stats_by_dc.entrySet() ) { + // System.out.print( entry.getKey().toString() ); + // System.out.print( ": " ); + // double[] a = entry.getValue().getDataAsDoubleArray(); + // for( int i = 0; i < a.length; i++ ) { + // System.out.print( a[ i ] + " " ); + // } + // System.out.println(); + // } + // } + // + final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) ); + final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) ); + final BufferedWriter out_dc_for_go_mapping = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping ) ); + final BufferedWriter out_dc_for_go_mapping_unique = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping_unique ) ); + final SortedMap dc_gain_counts = new TreeMap(); + for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorPostorder(); it.hasNext(); ) { + final PhylogenyNode n = it.next(); + final Set gained_dc = n.getNodeData().getBinaryCharacters().getGainedCharacters(); + for( final String dc : gained_dc ) { + if ( dc_gain_counts.containsKey( dc ) ) { + dc_gain_counts.put( dc, dc_gain_counts.get( dc ) + 1 ); } else { - all_binary_domains_combination_gained.add( BasicBinaryDomainCombination.createInstance( matrix - .getCharacter( c ) ) ); + dc_gain_counts.put( dc, 1 ); } } } - } - } - - public static Map> createDomainIdToGoIdMap( final List pfam_to_go_mappings ) { - final Map> domain_id_to_go_ids_map = new HashMap>( pfam_to_go_mappings.size() ); - for( final PfamToGoMapping pfam_to_go : pfam_to_go_mappings ) { - if ( !domain_id_to_go_ids_map.containsKey( pfam_to_go.getKey() ) ) { - domain_id_to_go_ids_map.put( pfam_to_go.getKey(), new ArrayList() ); - } - domain_id_to_go_ids_map.get( pfam_to_go.getKey() ).add( pfam_to_go.getValue() ); - } - return domain_id_to_go_ids_map; - } - - public static Map> createDomainIdToSecondaryFeaturesMap( final File secondary_features_map_file ) - throws IOException { - final BasicTable primary_table = BasicTableParser.parse( secondary_features_map_file, '\t' ); - final Map> map = new TreeMap>(); - for( int r = 0; r < primary_table.getNumberOfRows(); ++r ) { - final String domain_id = primary_table.getValue( 0, r ); - if ( !map.containsKey( domain_id ) ) { - map.put( domain_id, new HashSet() ); - } - map.get( domain_id ).add( primary_table.getValue( 1, r ) ); - } - return map; - } - - public static Phylogeny createNjTreeBasedOnMatrixToFile( final File nj_tree_outfile, final DistanceMatrix distance ) { - checkForOutputFileWriteability( nj_tree_outfile ); - final NeighborJoining nj = NeighborJoining.createInstance(); - final Phylogeny phylogeny = nj.execute( ( BasicSymmetricalDistanceMatrix ) distance ); - phylogeny.setName( nj_tree_outfile.getName() ); - writePhylogenyToFile( phylogeny, nj_tree_outfile.toString() ); - return phylogeny; - } - - public static Map createTaxCodeToIdMap( final Phylogeny phy ) { - final Map m = new HashMap(); - for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) { - final PhylogenyNode n = iter.next(); - if ( n.getNodeData().isHasTaxonomy() ) { - final Taxonomy t = n.getNodeData().getTaxonomy(); - final String c = t.getTaxonomyCode(); - if ( !ForesterUtil.isEmpty( c ) ) { - if ( n.getNodeData().getTaxonomy() == null ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy id for node " + n ); + final SortedMap histogram = new TreeMap(); + final SortedMap domain_lists = new TreeMap(); + final SortedMap dc_reapp_counts_to_protein_length_stats = new TreeMap(); + final SortedMap dc_reapp_counts_to_domain_number_stats = new TreeMap(); + final SortedMap dc_reapp_counts_to_domain_lengths_stats = new TreeMap(); + final SortedMap> domain_lists_go = new TreeMap>(); + final SortedMap> domain_lists_go_unique = new TreeMap>(); + final Set dcs = dc_gain_counts.keySet(); + final SortedSet more_than_once = new TreeSet(); + DescriptiveStatistics gained_once_lengths_stats = new BasicDescriptiveStatistics(); + DescriptiveStatistics gained_once_domain_count_stats = new BasicDescriptiveStatistics(); + DescriptiveStatistics gained_multiple_times_lengths_stats = new BasicDescriptiveStatistics(); + final DescriptiveStatistics gained_multiple_times_domain_count_stats = new BasicDescriptiveStatistics(); + long gained_multiple_times_domain_length_sum = 0; + long gained_once_domain_length_sum = 0; + long gained_multiple_times_domain_length_count = 0; + long gained_once_domain_length_count = 0; + for( final String dc : dcs ) { + final int count = dc_gain_counts.get( dc ); + if ( histogram.containsKey( count ) ) { + histogram.put( count, histogram.get( count ) + 1 ); + domain_lists.get( count ).append( ", " + dc ); + domain_lists_go.get( count ).addAll( splitDomainCombination( dc ) ); + domain_lists_go_unique.get( count ).addAll( splitDomainCombination( dc ) ); + } + else { + histogram.put( count, 1 ); + domain_lists.put( count, new StringBuilder( dc ) ); + final PriorityQueue q = new PriorityQueue(); + q.addAll( splitDomainCombination( dc ) ); + domain_lists_go.put( count, q ); + final SortedSet set = new TreeSet(); + set.addAll( splitDomainCombination( dc ) ); + domain_lists_go_unique.put( count, set ); + } + if ( protein_length_stats_by_dc != null ) { + if ( !dc_reapp_counts_to_protein_length_stats.containsKey( count ) ) { + dc_reapp_counts_to_protein_length_stats.put( count, new BasicDescriptiveStatistics() ); } - final String id = n.getNodeData().getTaxonomy().getIdentifier().getValue(); - if ( ForesterUtil.isEmpty( id ) ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy id for node " + n ); + dc_reapp_counts_to_protein_length_stats.get( count ).addValue( protein_length_stats_by_dc.get( dc ) + .arithmeticMean() ); + } + if ( domain_number_stats_by_dc != null ) { + if ( !dc_reapp_counts_to_domain_number_stats.containsKey( count ) ) { + dc_reapp_counts_to_domain_number_stats.put( count, new BasicDescriptiveStatistics() ); } - if ( m.containsKey( c ) ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "taxonomy code " + c + " is not unique" ); + dc_reapp_counts_to_domain_number_stats.get( count ).addValue( domain_number_stats_by_dc.get( dc ) + .arithmeticMean() ); + } + if ( domain_length_stats_by_domain != null ) { + if ( !dc_reapp_counts_to_domain_lengths_stats.containsKey( count ) ) { + dc_reapp_counts_to_domain_lengths_stats.put( count, new BasicDescriptiveStatistics() ); } - final int iid = Integer.valueOf( id ); - if ( m.containsValue( iid ) ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, "taxonomy id " + iid + " is not unique" ); + final String[] ds = dc.split( "=" ); + dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain + .get( ds[ 0 ] ).arithmeticMean() ); + dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain + .get( ds[ 1 ] ).arithmeticMean() ); + } + if ( count > 1 ) { + more_than_once.add( dc ); + if ( protein_length_stats_by_dc != null ) { + final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc ); + for( final double element : s.getData() ) { + gained_multiple_times_lengths_stats.addValue( element ); + } + } + if ( domain_number_stats_by_dc != null ) { + final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc ); + for( final double element : s.getData() ) { + gained_multiple_times_domain_count_stats.addValue( element ); + } + } + if ( domain_length_stats_by_domain != null ) { + final String[] ds = dc.split( "=" ); + final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] ); + final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] ); + for( final double element : s0.getData() ) { + gained_multiple_times_domain_length_sum += element; + ++gained_multiple_times_domain_length_count; + } + for( final double element : s1.getData() ) { + gained_multiple_times_domain_length_sum += element; + ++gained_multiple_times_domain_length_count; + } } - m.put( c, iid ); } - } - else { - ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy for node " + n ); - } - } - return m; - } - - public static void decoratePrintableDomainSimilarities( final SortedSet domain_similarities, - final Detailedness detailedness ) { - for( final DomainSimilarity domain_similarity : domain_similarities ) { - if ( domain_similarity instanceof PrintableDomainSimilarity ) { - final PrintableDomainSimilarity printable_domain_similarity = ( PrintableDomainSimilarity ) domain_similarity; - printable_domain_similarity.setDetailedness( detailedness ); - } - } - } - - public static void doit( final List proteins, - final List query_domain_ids_nc_order, - final Writer out, - final String separator, - final String limit_to_species, - final Map> average_protein_lengths_by_dc ) throws IOException { - for( final Protein protein : proteins ) { - if ( ForesterUtil.isEmpty( limit_to_species ) - || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) { - if ( protein.contains( query_domain_ids_nc_order, true ) ) { - out.write( protein.getSpecies().getSpeciesId() ); - out.write( separator ); - out.write( protein.getProteinId().getId() ); - out.write( separator ); - out.write( "[" ); - final Set visited_domain_ids = new HashSet(); - boolean first = true; - for( final Domain domain : protein.getProteinDomains() ) { - if ( !visited_domain_ids.contains( domain.getDomainId() ) ) { - visited_domain_ids.add( domain.getDomainId() ); - if ( first ) { - first = false; - } - else { - out.write( " " ); - } - out.write( domain.getDomainId() ); - out.write( " {" ); - out.write( "" + domain.getTotalCount() ); - out.write( "}" ); + else { + if ( protein_length_stats_by_dc != null ) { + final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc ); + for( final double element : s.getData() ) { + gained_once_lengths_stats.addValue( element ); } } - out.write( "]" ); - out.write( separator ); - if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription() - .equals( SurfacingConstants.NONE ) ) ) { - out.write( protein.getDescription() ); + if ( domain_number_stats_by_dc != null ) { + final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc ); + for( final double element : s.getData() ) { + gained_once_domain_count_stats.addValue( element ); + } } - out.write( separator ); - if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession() - .equals( SurfacingConstants.NONE ) ) ) { - out.write( protein.getAccession() ); + if ( domain_length_stats_by_domain != null ) { + final String[] ds = dc.split( "=" ); + final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] ); + final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] ); + for( final double element : s0.getData() ) { + gained_once_domain_length_sum += element; + ++gained_once_domain_length_count; + } + for( final double element : s1.getData() ) { + gained_once_domain_length_sum += element; + ++gained_once_domain_length_count; + } } - out.write( SurfacingConstants.NL ); } } - } - out.flush(); - } - - public static void domainsPerProteinsStatistics( final String genome, - final List protein_list, - final DescriptiveStatistics all_genomes_domains_per_potein_stats, - final SortedMap all_genomes_domains_per_potein_histo, - final SortedSet domains_which_are_always_single, - final SortedSet domains_which_are_sometimes_single_sometimes_not, - final SortedSet domains_which_never_single, - final Writer writer ) { - final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); - for( final Protein protein : protein_list ) { - final int domains = protein.getNumberOfProteinDomains(); - //System.out.println( domains ); - stats.addValue( domains ); - all_genomes_domains_per_potein_stats.addValue( domains ); - if ( !all_genomes_domains_per_potein_histo.containsKey( domains ) ) { - all_genomes_domains_per_potein_histo.put( domains, 1 ); - } - else { - all_genomes_domains_per_potein_histo.put( domains, - 1 + all_genomes_domains_per_potein_histo.get( domains ) ); + final Set histogram_keys = histogram.keySet(); + for( final Integer histogram_key : histogram_keys ) { + final int count = histogram.get( histogram_key ); + final StringBuilder dc = domain_lists.get( histogram_key ); + out_counts.write( histogram_key + "\t" + count + ForesterUtil.LINE_SEPARATOR ); + out_dc.write( histogram_key + "\t" + dc + ForesterUtil.LINE_SEPARATOR ); + out_dc_for_go_mapping.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR ); + final Object[] sorted = domain_lists_go.get( histogram_key ).toArray(); + Arrays.sort( sorted ); + for( final Object domain : sorted ) { + out_dc_for_go_mapping.write( domain + ForesterUtil.LINE_SEPARATOR ); + } + out_dc_for_go_mapping_unique.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR ); + for( final String domain : domain_lists_go_unique.get( histogram_key ) ) { + out_dc_for_go_mapping_unique.write( domain + ForesterUtil.LINE_SEPARATOR ); + } } - if ( domains == 1 ) { - final String domain = protein.getProteinDomain( 0 ).getDomainId(); - if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) { - if ( domains_which_never_single.contains( domain ) ) { - domains_which_never_single.remove( domain ); - domains_which_are_sometimes_single_sometimes_not.add( domain ); - } - else { - domains_which_are_always_single.add( domain ); + out_counts.close(); + out_dc.close(); + out_dc_for_go_mapping.close(); + out_dc_for_go_mapping_unique.close(); + final SortedMap lca_rank_counts = new TreeMap(); + final SortedMap lca_ancestor_species_counts = new TreeMap(); + for( final String dc : more_than_once ) { + final List nodes = new ArrayList(); + for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorExternalForward(); it.hasNext(); ) { + final PhylogenyNode n = it.next(); + if ( n.getNodeData().getBinaryCharacters().getGainedCharacters().contains( dc ) ) { + nodes.add( n ); } } - } - else if ( domains > 1 ) { - for( final Domain d : protein.getProteinDomains() ) { - final String domain = d.getDomainId(); - // System.out.println( domain ); - if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) { - if ( domains_which_are_always_single.contains( domain ) ) { - domains_which_are_always_single.remove( domain ); - domains_which_are_sometimes_single_sometimes_not.add( domain ); + for( int i = 0; i < ( nodes.size() - 1 ); ++i ) { + for( int j = i + 1; j < nodes.size(); ++j ) { + final PhylogenyNode lca = PhylogenyMethods.calculateLCA( nodes.get( i ), nodes.get( j ) ); + String rank = "unknown"; + if ( lca.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getRank() ) ) { + rank = lca.getNodeData().getTaxonomy().getRank(); + } + addToCountMap( lca_rank_counts, rank ); + String lca_species; + if ( lca.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getScientificName() ) ) { + lca_species = lca.getNodeData().getTaxonomy().getScientificName(); + } + else if ( lca.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getCommonName() ) ) { + lca_species = lca.getNodeData().getTaxonomy().getCommonName(); } else { - domains_which_never_single.add( domain ); + lca_species = lca.getName(); } + addToCountMap( lca_ancestor_species_counts, lca_species ); } } } - } - try { - writer.write( genome ); - writer.write( "\t" ); - if ( stats.getN() >= 1 ) { - writer.write( stats.arithmeticMean() + "" ); - writer.write( "\t" ); - if ( stats.getN() >= 2 ) { - writer.write( stats.sampleStandardDeviation() + "" ); - } - else { - writer.write( "" ); + final BufferedWriter out_for_rank_counts = new BufferedWriter( new FileWriter( outfilename_for_rank_counts ) ); + final BufferedWriter out_for_ancestor_species_counts = new BufferedWriter( new FileWriter( outfilename_for_ancestor_species_counts ) ); + ForesterUtil.map2writer( out_for_rank_counts, lca_rank_counts, "\t", ForesterUtil.LINE_SEPARATOR ); + ForesterUtil.map2writer( out_for_ancestor_species_counts, + lca_ancestor_species_counts, + "\t", + ForesterUtil.LINE_SEPARATOR ); + out_for_rank_counts.close(); + out_for_ancestor_species_counts.close(); + if ( !ForesterUtil.isEmpty( outfilename_for_protein_stats ) + && ( ( domain_length_stats_by_domain != null ) || ( protein_length_stats_by_dc != null ) || ( domain_number_stats_by_dc != null ) ) ) { + final BufferedWriter w = new BufferedWriter( new FileWriter( outfilename_for_protein_stats ) ); + w.write( "Domain Lengths: " ); + w.write( "\n" ); + if ( domain_length_stats_by_domain != null ) { + for( final Entry entry : dc_reapp_counts_to_domain_lengths_stats + .entrySet() ) { + w.write( entry.getKey().toString() ); + w.write( "\t" + entry.getValue().arithmeticMean() ); + w.write( "\t" + entry.getValue().median() ); + w.write( "\n" ); + } } - writer.write( "\t" ); - writer.write( stats.median() + "" ); - writer.write( "\t" ); - writer.write( stats.getN() + "" ); - writer.write( "\t" ); - writer.write( stats.getMin() + "" ); - writer.write( "\t" ); - writer.write( stats.getMax() + "" ); - } - else { - writer.write( "\t" ); - writer.write( "\t" ); - writer.write( "\t" ); - writer.write( "0" ); - writer.write( "\t" ); - writer.write( "\t" ); - } - writer.write( "\n" ); - } - catch ( final IOException e ) { - e.printStackTrace(); - } - } - - public static void executeDomainLengthAnalysis( final String[][] input_file_properties, - final int number_of_genomes, - final DomainLengthsTable domain_lengths_table, - final File outfile ) throws IOException { - final DecimalFormat df = new DecimalFormat( "#.00" ); - checkForOutputFileWriteability( outfile ); - final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); - out.write( "MEAN BASED STATISTICS PER SPECIES" ); - out.write( ForesterUtil.LINE_SEPARATOR ); - out.write( domain_lengths_table.createMeanBasedStatisticsPerSpeciesTable().toString() ); - out.write( ForesterUtil.LINE_SEPARATOR ); - out.write( ForesterUtil.LINE_SEPARATOR ); - final List domain_lengths_list = domain_lengths_table.getDomainLengthsList(); - out.write( "OUTLIER SPECIES PER DOMAIN (Z>=1.5)" ); - out.write( ForesterUtil.LINE_SEPARATOR ); - for( final DomainLengths domain_lengths : domain_lengths_list ) { - final List species_list = domain_lengths.getMeanBasedOutlierSpecies( 1.5 ); - if ( species_list.size() > 0 ) { - out.write( domain_lengths.getDomainId() + "\t" ); - for( final Species species : species_list ) { - out.write( species + "\t" ); + w.flush(); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Protein Lengths: " ); + w.write( "\n" ); + if ( protein_length_stats_by_dc != null ) { + for( final Entry entry : dc_reapp_counts_to_protein_length_stats + .entrySet() ) { + w.write( entry.getKey().toString() ); + w.write( "\t" + entry.getValue().arithmeticMean() ); + w.write( "\t" + entry.getValue().median() ); + w.write( "\n" ); + } } - out.write( ForesterUtil.LINE_SEPARATOR ); - } - } - out.write( ForesterUtil.LINE_SEPARATOR ); + w.flush(); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Number of domains: " ); + w.write( "\n" ); + if ( domain_number_stats_by_dc != null ) { + for( final Entry entry : dc_reapp_counts_to_domain_number_stats + .entrySet() ) { + w.write( entry.getKey().toString() ); + w.write( "\t" + entry.getValue().arithmeticMean() ); + w.write( "\t" + entry.getValue().median() ); + w.write( "\n" ); + } + } + w.flush(); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained once, domain lengths:" ); + w.write( "\n" ); + w.write( "N: " + gained_once_domain_length_count ); + w.write( "\n" ); + w.write( "Avg: " + ( ( double ) gained_once_domain_length_sum / gained_once_domain_length_count ) ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained multiple times, domain lengths:" ); + w.write( "\n" ); + w.write( "N: " + gained_multiple_times_domain_length_count ); + w.write( "\n" ); + w.write( "Avg: " + + ( ( double ) gained_multiple_times_domain_length_sum / gained_multiple_times_domain_length_count ) ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained once, protein lengths:" ); + w.write( "\n" ); + w.write( gained_once_lengths_stats.toString() ); + gained_once_lengths_stats = null; + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained once, domain counts:" ); + w.write( "\n" ); + w.write( gained_once_domain_count_stats.toString() ); + gained_once_domain_count_stats = null; + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained multiple times, protein lengths:" ); + w.write( "\n" ); + w.write( gained_multiple_times_lengths_stats.toString() ); + gained_multiple_times_lengths_stats = null; + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained multiple times, domain counts:" ); + w.write( "\n" ); + w.write( gained_multiple_times_domain_count_stats.toString() ); + w.flush(); + w.close(); + } + } + catch ( final IOException e ) { + ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch counts to [" + + outfilename_for_counts + "]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch lists to [" + + outfilename_for_dc + "]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, + "Wrote independent domain combination gains fitch lists to (for GO mapping) [" + + outfilename_for_dc_for_go_mapping + "]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, + "Wrote independent domain combination gains fitch lists to (for GO mapping, unique) [" + + outfilename_for_dc_for_go_mapping_unique + "]" ); + } + + public static void checkForOutputFileWriteability( final File outfile ) { + final String error = ForesterUtil.isWritableFile( outfile ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, error ); + } + } + + public static void checkWriteabilityForPairwiseComparisons( final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option, + final String[][] input_file_properties, + final String automated_pairwise_comparison_suffix, + final File outdir ) { + for( int i = 0; i < input_file_properties.length; ++i ) { + for( int j = 0; j < i; ++j ) { + final String species_i = input_file_properties[ i ][ 1 ]; + final String species_j = input_file_properties[ j ][ 1 ]; + String pairwise_similarities_output_file_str = surfacing.PAIRWISE_DOMAIN_COMPARISONS_PREFIX + species_i + + "_" + species_j + automated_pairwise_comparison_suffix; + switch ( domain_similarity_print_option ) { + case HTML: + if ( !pairwise_similarities_output_file_str.endsWith( ".html" ) ) { + pairwise_similarities_output_file_str += ".html"; + } + break; + } + final String error = ForesterUtil + .isWritableFile( new File( outdir == null ? pairwise_similarities_output_file_str : outdir + + ForesterUtil.FILE_SEPARATOR + pairwise_similarities_output_file_str ) ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, error ); + } + } + } + } + + private static SortedSet collectAllDomainsChangedOnSubtree( final PhylogenyNode subtree_root, + final boolean get_gains ) { + final SortedSet domains = new TreeSet(); + for( final PhylogenyNode descendant : PhylogenyMethods.getAllDescendants( subtree_root ) ) { + final BinaryCharacters chars = descendant.getNodeData().getBinaryCharacters(); + if ( get_gains ) { + domains.addAll( chars.getGainedCharacters() ); + } + else { + domains.addAll( chars.getLostCharacters() ); + } + } + return domains; + } + + public static void collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( final CharacterStateMatrix matrix, + final BinaryDomainCombination.DomainCombinationType dc_type, + final List all_binary_domains_combination_gained, + final boolean get_gains ) { + final SortedSet sorted_ids = new TreeSet(); + for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { + sorted_ids.add( matrix.getIdentifier( i ) ); + } + for( final String id : sorted_ids ) { + for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { + if ( ( get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) + || ( !get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.LOSS ) ) ) { + if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) { + all_binary_domains_combination_gained.add( AdjactantDirectedBinaryDomainCombination + .createInstance( matrix.getCharacter( c ) ) ); + } + else if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED ) { + all_binary_domains_combination_gained.add( DirectedBinaryDomainCombination + .createInstance( matrix.getCharacter( c ) ) ); + } + else { + all_binary_domains_combination_gained.add( BasicBinaryDomainCombination.createInstance( matrix + .getCharacter( c ) ) ); + } + } + } + } + } + + private static File createBaseDirForPerNodeDomainFiles( final String base_dir, + final boolean domain_combinations, + final CharacterStateMatrix.GainLossStates state, + final String outfile ) { + File per_node_go_mapped_domain_gain_loss_files_base_dir = new File( new File( outfile ).getParent() + + ForesterUtil.FILE_SEPARATOR + base_dir ); + if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { + per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); + } + if ( domain_combinations ) { + per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + "DC" ); + } + else { + per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + "DOMAINS" ); + } + if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { + per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); + } + if ( state == GainLossStates.GAIN ) { + per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + "GAINS" ); + } + else if ( state == GainLossStates.LOSS ) { + per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + "LOSSES" ); + } + else { + per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + "PRESENT" ); + } + if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { + per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); + } + return per_node_go_mapped_domain_gain_loss_files_base_dir; + } + + public static Map> createDomainIdToGoIdMap( final List pfam_to_go_mappings ) { + final Map> domain_id_to_go_ids_map = new HashMap>( pfam_to_go_mappings.size() ); + for( final PfamToGoMapping pfam_to_go : pfam_to_go_mappings ) { + if ( !domain_id_to_go_ids_map.containsKey( pfam_to_go.getKey() ) ) { + domain_id_to_go_ids_map.put( pfam_to_go.getKey(), new ArrayList() ); + } + domain_id_to_go_ids_map.get( pfam_to_go.getKey() ).add( pfam_to_go.getValue() ); + } + return domain_id_to_go_ids_map; + } + + public static Map> createDomainIdToSecondaryFeaturesMap( final File secondary_features_map_file ) + throws IOException { + final BasicTable primary_table = BasicTableParser.parse( secondary_features_map_file, '\t' ); + final Map> map = new TreeMap>(); + for( int r = 0; r < primary_table.getNumberOfRows(); ++r ) { + final String domain_id = primary_table.getValue( 0, r ); + if ( !map.containsKey( domain_id ) ) { + map.put( domain_id, new HashSet() ); + } + map.get( domain_id ).add( primary_table.getValue( 1, r ) ); + } + return map; + } + + public static Phylogeny createNjTreeBasedOnMatrixToFile( final File nj_tree_outfile, final DistanceMatrix distance ) { + checkForOutputFileWriteability( nj_tree_outfile ); + final NeighborJoining nj = NeighborJoining.createInstance(); + final Phylogeny phylogeny = nj.execute( ( BasicSymmetricalDistanceMatrix ) distance ); + phylogeny.setName( nj_tree_outfile.getName() ); + writePhylogenyToFile( phylogeny, nj_tree_outfile.toString() ); + return phylogeny; + } + + public static StringBuilder createParametersAsString( final boolean ignore_dufs, + final double e_value_max, + final int max_allowed_overlap, + final boolean no_engulfing_overlaps, + final File cutoff_scores_file, + final BinaryDomainCombination.DomainCombinationType dc_type ) { + final StringBuilder parameters_sb = new StringBuilder(); + parameters_sb.append( "E-value: " + e_value_max ); + if ( cutoff_scores_file != null ) { + parameters_sb.append( ", Cutoff-scores-file: " + cutoff_scores_file ); + } + else { + parameters_sb.append( ", Cutoff-scores-file: not-set" ); + } + if ( max_allowed_overlap != surfacing.MAX_ALLOWED_OVERLAP_DEFAULT ) { + parameters_sb.append( ", Max-overlap: " + max_allowed_overlap ); + } + else { + parameters_sb.append( ", Max-overlap: not-set" ); + } + if ( no_engulfing_overlaps ) { + parameters_sb.append( ", Engulfing-overlaps: not-allowed" ); + } + else { + parameters_sb.append( ", Engulfing-overlaps: allowed" ); + } + if ( ignore_dufs ) { + parameters_sb.append( ", Ignore-dufs: true" ); + } + else { + parameters_sb.append( ", Ignore-dufs: false" ); + } + parameters_sb.append( ", DC type (if applicable): " + dc_type ); + return parameters_sb; + } + + private static SortedSet createSetOfAllBinaryDomainCombinationsPerGenome( final GenomeWideCombinableDomains gwcd ) { + final SortedMap cds = gwcd.getAllCombinableDomainsIds(); + final SortedSet binary_combinations = new TreeSet(); + for( final String domain_id : cds.keySet() ) { + final CombinableDomains cd = cds.get( domain_id ); + binary_combinations.addAll( cd.toBinaryDomainCombinations() ); + } + return binary_combinations; + } + + public static void createSplitWriters( final File out_dir, + final String my_outfile, + final Map split_writers ) throws IOException { + split_writers.put( 'a', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_A.html" ) ) ); + split_writers.put( 'b', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_B.html" ) ) ); + split_writers.put( 'c', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_C.html" ) ) ); + split_writers.put( 'd', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_D.html" ) ) ); + split_writers.put( 'e', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_E.html" ) ) ); + split_writers.put( 'f', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_F.html" ) ) ); + split_writers.put( 'g', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_G.html" ) ) ); + split_writers.put( 'h', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_H.html" ) ) ); + split_writers.put( 'i', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_I.html" ) ) ); + split_writers.put( 'j', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_J.html" ) ) ); + split_writers.put( 'k', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_K.html" ) ) ); + split_writers.put( 'l', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_L.html" ) ) ); + split_writers.put( 'm', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_M.html" ) ) ); + split_writers.put( 'n', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_N.html" ) ) ); + split_writers.put( 'o', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_O.html" ) ) ); + split_writers.put( 'p', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_P.html" ) ) ); + split_writers.put( 'q', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_Q.html" ) ) ); + split_writers.put( 'r', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_R.html" ) ) ); + split_writers.put( 's', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_S.html" ) ) ); + split_writers.put( 't', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_T.html" ) ) ); + split_writers.put( 'u', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_U.html" ) ) ); + split_writers.put( 'v', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_V.html" ) ) ); + split_writers.put( 'w', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_W.html" ) ) ); + split_writers.put( 'x', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_X.html" ) ) ); + split_writers.put( 'y', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_Y.html" ) ) ); + split_writers.put( 'z', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_Z.html" ) ) ); + split_writers.put( '0', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile + + "_domains_0.html" ) ) ); + } + + public static Map createTaxCodeToIdMap( final Phylogeny phy ) { + final Map m = new HashMap(); + for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode n = iter.next(); + if ( n.getNodeData().isHasTaxonomy() ) { + final Taxonomy t = n.getNodeData().getTaxonomy(); + final String c = t.getTaxonomyCode(); + if ( !ForesterUtil.isEmpty( c ) ) { + if ( n.getNodeData().getTaxonomy() == null ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy id for node " + n ); + } + final String id = n.getNodeData().getTaxonomy().getIdentifier().getValue(); + if ( ForesterUtil.isEmpty( id ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy id for node " + n ); + } + if ( m.containsKey( c ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "taxonomy code " + c + " is not unique" ); + } + final int iid = Integer.valueOf( id ); + if ( m.containsValue( iid ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "taxonomy id " + iid + " is not unique" ); + } + m.put( c, iid ); + } + } + else { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy for node " + n ); + } + } + return m; + } + + public static void decoratePrintableDomainSimilarities( final SortedSet domain_similarities, + final Detailedness detailedness ) { + for( final DomainSimilarity domain_similarity : domain_similarities ) { + if ( domain_similarity instanceof PrintableDomainSimilarity ) { + final PrintableDomainSimilarity printable_domain_similarity = ( PrintableDomainSimilarity ) domain_similarity; + printable_domain_similarity.setDetailedness( detailedness ); + } + } + } + + public static void doit( final List proteins, + final List query_domain_ids_nc_order, + final Writer out, + final String separator, + final String limit_to_species, + final Map> average_protein_lengths_by_dc ) throws IOException { + for( final Protein protein : proteins ) { + if ( ForesterUtil.isEmpty( limit_to_species ) + || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) { + if ( protein.contains( query_domain_ids_nc_order, true ) ) { + out.write( protein.getSpecies().getSpeciesId() ); + out.write( separator ); + out.write( protein.getProteinId().getId() ); + out.write( separator ); + out.write( "[" ); + final Set visited_domain_ids = new HashSet(); + boolean first = true; + for( final Domain domain : protein.getProteinDomains() ) { + if ( !visited_domain_ids.contains( domain.getDomainId() ) ) { + visited_domain_ids.add( domain.getDomainId() ); + if ( first ) { + first = false; + } + else { + out.write( " " ); + } + out.write( domain.getDomainId() ); + out.write( " {" ); + out.write( "" + domain.getTotalCount() ); + out.write( "}" ); + } + } + out.write( "]" ); + out.write( separator ); + if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription() + .equals( SurfacingConstants.NONE ) ) ) { + out.write( protein.getDescription() ); + } + out.write( separator ); + if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession() + .equals( SurfacingConstants.NONE ) ) ) { + out.write( protein.getAccession() ); + } + out.write( SurfacingConstants.NL ); + } + } + } + out.flush(); + } + + public static void domainsPerProteinsStatistics( final String genome, + final List protein_list, + final DescriptiveStatistics all_genomes_domains_per_potein_stats, + final SortedMap all_genomes_domains_per_potein_histo, + final SortedSet domains_which_are_always_single, + final SortedSet domains_which_are_sometimes_single_sometimes_not, + final SortedSet domains_which_never_single, + final Writer writer ) { + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( final Protein protein : protein_list ) { + final int domains = protein.getNumberOfProteinDomains(); + //System.out.println( domains ); + stats.addValue( domains ); + all_genomes_domains_per_potein_stats.addValue( domains ); + if ( !all_genomes_domains_per_potein_histo.containsKey( domains ) ) { + all_genomes_domains_per_potein_histo.put( domains, 1 ); + } + else { + all_genomes_domains_per_potein_histo.put( domains, + 1 + all_genomes_domains_per_potein_histo.get( domains ) ); + } + if ( domains == 1 ) { + final String domain = protein.getProteinDomain( 0 ).getDomainId(); + if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) { + if ( domains_which_never_single.contains( domain ) ) { + domains_which_never_single.remove( domain ); + domains_which_are_sometimes_single_sometimes_not.add( domain ); + } + else { + domains_which_are_always_single.add( domain ); + } + } + } + else if ( domains > 1 ) { + for( final Domain d : protein.getProteinDomains() ) { + final String domain = d.getDomainId(); + // System.out.println( domain ); + if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) { + if ( domains_which_are_always_single.contains( domain ) ) { + domains_which_are_always_single.remove( domain ); + domains_which_are_sometimes_single_sometimes_not.add( domain ); + } + else { + domains_which_never_single.add( domain ); + } + } + } + } + } + try { + writer.write( genome ); + writer.write( "\t" ); + if ( stats.getN() >= 1 ) { + writer.write( stats.arithmeticMean() + "" ); + writer.write( "\t" ); + if ( stats.getN() >= 2 ) { + writer.write( stats.sampleStandardDeviation() + "" ); + } + else { + writer.write( "" ); + } + writer.write( "\t" ); + writer.write( stats.median() + "" ); + writer.write( "\t" ); + writer.write( stats.getN() + "" ); + writer.write( "\t" ); + writer.write( stats.getMin() + "" ); + writer.write( "\t" ); + writer.write( stats.getMax() + "" ); + } + else { + writer.write( "\t" ); + writer.write( "\t" ); + writer.write( "\t" ); + writer.write( "0" ); + writer.write( "\t" ); + writer.write( "\t" ); + } + writer.write( "\n" ); + } + catch ( final IOException e ) { + e.printStackTrace(); + } + } + + public static void executeDomainLengthAnalysis( final String[][] input_file_properties, + final int number_of_genomes, + final DomainLengthsTable domain_lengths_table, + final File outfile ) throws IOException { + final DecimalFormat df = new DecimalFormat( "#.00" ); + checkForOutputFileWriteability( outfile ); + final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); + out.write( "MEAN BASED STATISTICS PER SPECIES" ); + out.write( ForesterUtil.LINE_SEPARATOR ); + out.write( domain_lengths_table.createMeanBasedStatisticsPerSpeciesTable().toString() ); + out.write( ForesterUtil.LINE_SEPARATOR ); + out.write( ForesterUtil.LINE_SEPARATOR ); + final List domain_lengths_list = domain_lengths_table.getDomainLengthsList(); + out.write( "OUTLIER SPECIES PER DOMAIN (Z>=1.5)" ); + out.write( ForesterUtil.LINE_SEPARATOR ); + for( final DomainLengths domain_lengths : domain_lengths_list ) { + final List species_list = domain_lengths.getMeanBasedOutlierSpecies( 1.5 ); + if ( species_list.size() > 0 ) { + out.write( domain_lengths.getDomainId() + "\t" ); + for( final Species species : species_list ) { + out.write( species + "\t" ); + } + out.write( ForesterUtil.LINE_SEPARATOR ); + } + } + out.write( ForesterUtil.LINE_SEPARATOR ); out.write( ForesterUtil.LINE_SEPARATOR ); out.write( "OUTLIER SPECIES (Z 1.0)" ); out.write( ForesterUtil.LINE_SEPARATOR ); @@ -499,6 +1046,90 @@ public final class SurfacingUtil { } /** + * Warning: This side-effects 'all_bin_domain_combinations_encountered'! + * + * + * @param output_file + * @param all_bin_domain_combinations_changed + * @param sum_of_all_domains_encountered + * @param all_bin_domain_combinations_encountered + * @param is_gains_analysis + * @param protein_length_stats_by_dc + * @throws IOException + */ + public static void executeFitchGainsAnalysis( final File output_file, + final List all_bin_domain_combinations_changed, + final int sum_of_all_domains_encountered, + final SortedSet all_bin_domain_combinations_encountered, + final boolean is_gains_analysis ) throws IOException { + checkForOutputFileWriteability( output_file ); + final Writer out = ForesterUtil.createBufferedWriter( output_file ); + final SortedMap bdc_to_counts = ForesterUtil + .listToSortedCountsMap( all_bin_domain_combinations_changed ); + final SortedSet all_domains_in_combination_changed_more_than_once = new TreeSet(); + final SortedSet all_domains_in_combination_changed_only_once = new TreeSet(); + int above_one = 0; + int one = 0; + for( final Object bdc_object : bdc_to_counts.keySet() ) { + final BinaryDomainCombination bdc = ( BinaryDomainCombination ) bdc_object; + final int count = bdc_to_counts.get( bdc_object ); + if ( count < 1 ) { + ForesterUtil.unexpectedFatalError( surfacing.PRG_NAME, "count < 1 " ); + } + out.write( bdc + "\t" + count + ForesterUtil.LINE_SEPARATOR ); + if ( count > 1 ) { + all_domains_in_combination_changed_more_than_once.add( bdc.getId0() ); + all_domains_in_combination_changed_more_than_once.add( bdc.getId1() ); + above_one++; + } + else if ( count == 1 ) { + all_domains_in_combination_changed_only_once.add( bdc.getId0() ); + all_domains_in_combination_changed_only_once.add( bdc.getId1() ); + one++; + } + } + final int all = all_bin_domain_combinations_encountered.size(); + int never_lost = -1; + if ( !is_gains_analysis ) { + all_bin_domain_combinations_encountered.removeAll( all_bin_domain_combinations_changed ); + never_lost = all_bin_domain_combinations_encountered.size(); + for( final BinaryDomainCombination bdc : all_bin_domain_combinations_encountered ) { + out.write( bdc + "\t" + "0" + ForesterUtil.LINE_SEPARATOR ); + } + } + if ( is_gains_analysis ) { + out.write( "Sum of all distinct domain combinations appearing once : " + one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domain combinations appearing more than once : " + above_one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations apppearing only once : " + + all_domains_in_combination_changed_only_once.size() + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations apppearing more than once: " + + all_domains_in_combination_changed_more_than_once.size() + ForesterUtil.LINE_SEPARATOR ); + } + else { + out.write( "Sum of all distinct domain combinations never lost : " + never_lost + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domain combinations lost once : " + one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domain combinations lost more than once : " + above_one + + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations lost only once : " + + all_domains_in_combination_changed_only_once.size() + ForesterUtil.LINE_SEPARATOR ); + out.write( "Sum of all distinct domains in combinations lost more than once: " + + all_domains_in_combination_changed_more_than_once.size() + ForesterUtil.LINE_SEPARATOR ); + } + out.write( "All binary combinations : " + all + + ForesterUtil.LINE_SEPARATOR ); + out.write( "All domains : " + + sum_of_all_domains_encountered ); + out.close(); + ForesterUtil.programMessage( surfacing.PRG_NAME, + "Wrote fitch domain combination dynamics counts analysis to \"" + output_file + + "\"" ); + } + + /** * * @param all_binary_domains_combination_lost_fitch * @param use_last_in_fitch_parsimony @@ -843,6 +1474,60 @@ public final class SurfacingUtil { + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt", null, null, null, null ); } + public static void executePlusMinusAnalysis( final File output_file, + final List plus_minus_analysis_high_copy_base, + final List plus_minus_analysis_high_copy_target, + final List plus_minus_analysis_low_copy, + final List gwcd_list, + final SortedMap> protein_lists_per_species, + final Map> domain_id_to_go_ids_map, + final Map go_id_to_term_map, + final List plus_minus_analysis_numbers ) { + final Set all_spec = new HashSet(); + for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + all_spec.add( gwcd.getSpecies().getSpeciesId() ); + } + final File html_out_dom = new File( output_file + surfacing.PLUS_MINUS_DOM_SUFFIX_HTML ); + final File plain_out_dom = new File( output_file + surfacing.PLUS_MINUS_DOM_SUFFIX ); + final File html_out_dc = new File( output_file + surfacing.PLUS_MINUS_DC_SUFFIX_HTML ); + final File all_domains_go_ids_out_dom = new File( output_file + surfacing.PLUS_MINUS_ALL_GO_IDS_DOM_SUFFIX ); + final File passing_domains_go_ids_out_dom = new File( output_file + + surfacing.PLUS_MINUS_PASSING_GO_IDS_DOM_SUFFIX ); + final File proteins_file_base = new File( output_file + "" ); + final int min_diff = ( ( Integer ) plus_minus_analysis_numbers.get( 0 ) ).intValue(); + final double factor = ( ( Double ) plus_minus_analysis_numbers.get( 1 ) ).doubleValue(); + try { + DomainCountsDifferenceUtil.calculateCopyNumberDifferences( gwcd_list, + protein_lists_per_species, + plus_minus_analysis_high_copy_base, + plus_minus_analysis_high_copy_target, + plus_minus_analysis_low_copy, + min_diff, + factor, + plain_out_dom, + html_out_dom, + html_out_dc, + domain_id_to_go_ids_map, + go_id_to_term_map, + all_domains_go_ids_out_dom, + passing_domains_go_ids_out_dom, + proteins_file_base ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis results to \"" + + html_out_dom + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis results to \"" + + plain_out_dom + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis results to \"" + html_out_dc + + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis based passing GO ids to \"" + + passing_domains_go_ids_out_dom + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis based all GO ids to \"" + + all_domains_go_ids_out_dom + "\"" ); + } + public static void extractProteinNames( final List proteins, final List query_domain_ids_nc_order, final Writer out, @@ -898,9 +1583,9 @@ public final class SurfacingUtil { final String separator, final String limit_to_species, final double domain_e_cutoff ) throws IOException { - System.out.println( "Per domain E-value: " + domain_e_cutoff ); + //System.out.println( "Per domain E-value: " + domain_e_cutoff ); for( final Species species : protein_lists_per_species.keySet() ) { - System.out.println( species + ":" ); + //System.out.println( species + ":" ); for( final Protein protein : protein_lists_per_species.get( species ) ) { if ( ForesterUtil.isEmpty( limit_to_species ) || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) { @@ -919,7 +1604,7 @@ public final class SurfacingUtil { out.write( domain.getFrom() + "-" + domain.getTo() ); if ( prev_to >= 0 ) { final int l = domain.getFrom() - prev_to; - System.out.println( l ); + // System.out.println( l ); } prev_to = domain.getTo(); } @@ -1017,7 +1702,107 @@ public final class SurfacingUtil { ++c; } } - return c; + return c; + } + + public static void log( final String msg, final Writer w ) { + try { + w.write( msg ); + w.write( ForesterUtil.LINE_SEPARATOR ); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); + } + } + + public static Phylogeny[] obtainAndPreProcessIntrees( final File[] intree_files, + final int number_of_genomes, + final String[][] input_file_properties ) { + final Phylogeny[] intrees = new Phylogeny[ intree_files.length ]; + int i = 0; + for( final File intree_file : intree_files ) { + Phylogeny intree = null; + final String error = ForesterUtil.isReadableFile( intree_file ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read input tree file [" + intree_file + "]: " + + error ); + } + try { + final Phylogeny[] p_array = ParserBasedPhylogenyFactory.getInstance() + .create( intree_file, ParserUtils.createParserDependingOnFileType( intree_file, true ) ); + if ( p_array.length < 1 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file + + "] does not contain any phylogeny in phyloXML format" ); + } + else if ( p_array.length > 1 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file + + "] contains more than one phylogeny in phyloXML format" ); + } + intree = p_array[ 0 ]; + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "failed to read input tree from file [" + intree_file + + "]: " + error ); + } + if ( ( intree == null ) || intree.isEmpty() ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] is empty" ); + } + if ( !intree.isRooted() ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] is not rooted" ); + } + if ( intree.getNumberOfExternalNodes() < number_of_genomes ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, + "number of external nodes [" + intree.getNumberOfExternalNodes() + + "] of input tree [" + intree_file + + "] is smaller than the number of genomes the be analyzed [" + + number_of_genomes + "]" ); + } + final StringBuilder parent_names = new StringBuilder(); + final int nodes_lacking_name = getNumberOfNodesLackingName( intree, parent_names ); + if ( nodes_lacking_name > 0 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] has " + + nodes_lacking_name + " node(s) lacking a name [parent names:" + parent_names + "]" ); + } + preparePhylogenyForParsimonyAnalyses( intree, input_file_properties ); + if ( !intree.isCompletelyBinary() ) { + ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "input tree [" + intree_file + + "] is not completely binary" ); + } + intrees[ i++ ] = intree; + } + return intrees; + } + + public static Phylogeny obtainFirstIntree( final File intree_file ) { + Phylogeny intree = null; + final String error = ForesterUtil.isReadableFile( intree_file ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot read input tree file [" + intree_file + "]: " + error ); + } + try { + final Phylogeny[] phys = ParserBasedPhylogenyFactory.getInstance() + .create( intree_file, ParserUtils.createParserDependingOnFileType( intree_file, true ) ); + if ( phys.length < 1 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file + + "] does not contain any phylogeny in phyloXML format" ); + } + else if ( phys.length > 1 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file + + "] contains more than one phylogeny in phyloXML format" ); + } + intree = phys[ 0 ]; + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "failed to read input tree from file [" + intree_file + "]: " + + error ); + } + if ( ( intree == null ) || intree.isEmpty() ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] is empty" ); + } + if ( !intree.isRooted() ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] is not rooted" ); + } + return intree; } public static void performDomainArchitectureAnalysis( final SortedMap> domain_architecutures, @@ -1088,1289 +1873,912 @@ public final class SurfacingUtil { p.setRooted( true ); } - /* - * species | protein id | n-terminal domain | c-terminal domain | n-terminal domain per domain E-value | c-terminal domain per domain E-value - * - * - */ - static public StringBuffer proteinToDomainCombinations( final Protein protein, - final String protein_id, - final String separator ) { - final StringBuffer sb = new StringBuffer(); - if ( protein.getSpecies() == null ) { - throw new IllegalArgumentException( "species must not be null" ); - } - if ( ForesterUtil.isEmpty( protein.getSpecies().getSpeciesId() ) ) { - throw new IllegalArgumentException( "species id must not be empty" ); + public static void preparePhylogenyForParsimonyAnalyses( final Phylogeny intree, + final String[][] input_file_properties ) { + final String[] genomes = new String[ input_file_properties.length ]; + for( int i = 0; i < input_file_properties.length; ++i ) { + if ( intree.getNodes( input_file_properties[ i ][ 1 ] ).size() > 1 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "node named [" + input_file_properties[ i ][ 1 ] + + "] is not unique in input tree " + intree.getName() ); + } + genomes[ i ] = input_file_properties[ i ][ 1 ]; } - final List domains = protein.getProteinDomains(); - if ( domains.size() > 1 ) { - final Map counts = new HashMap(); - for( final Domain domain : domains ) { - final String id = domain.getDomainId(); - if ( counts.containsKey( id ) ) { - counts.put( id, counts.get( id ) + 1 ); + // + final PhylogenyNodeIterator it = intree.iteratorPostorder(); + while ( it.hasNext() ) { + final PhylogenyNode n = it.next(); + if ( ForesterUtil.isEmpty( n.getName() ) ) { + if ( n.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getTaxonomyCode() ) ) { + n.setName( n.getNodeData().getTaxonomy().getTaxonomyCode() ); } - else { - counts.put( id, 1 ); + else if ( n.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getScientificName() ) ) { + n.setName( n.getNodeData().getTaxonomy().getScientificName() ); } - } - final Set dcs = new HashSet(); - for( int i = 1; i < domains.size(); ++i ) { - for( int j = 0; j < i; ++j ) { - Domain domain_n = domains.get( i ); - Domain domain_c = domains.get( j ); - if ( domain_n.getFrom() > domain_c.getFrom() ) { - domain_n = domains.get( j ); - domain_c = domains.get( i ); - } - final String dc = domain_n.getDomainId() + domain_c.getDomainId(); - if ( !dcs.contains( dc ) ) { - dcs.add( dc ); - sb.append( protein.getSpecies() ); - sb.append( separator ); - sb.append( protein_id ); - sb.append( separator ); - sb.append( domain_n.getDomainId() ); - sb.append( separator ); - sb.append( domain_c.getDomainId() ); - sb.append( separator ); - sb.append( domain_n.getPerDomainEvalue() ); - sb.append( separator ); - sb.append( domain_c.getPerDomainEvalue() ); - sb.append( separator ); - sb.append( counts.get( domain_n.getDomainId() ) ); - sb.append( separator ); - sb.append( counts.get( domain_c.getDomainId() ) ); - sb.append( ForesterUtil.LINE_SEPARATOR ); - } + else if ( n.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getCommonName() ) ) { + n.setName( n.getNodeData().getTaxonomy().getCommonName() ); + } + else { + ForesterUtil + .fatalError( surfacing.PRG_NAME, + "node with no name, scientific name, common name, or taxonomy code present" ); } } } - else if ( domains.size() == 1 ) { - sb.append( protein.getSpecies() ); - sb.append( separator ); - sb.append( protein_id ); - sb.append( separator ); - sb.append( domains.get( 0 ).getDomainId() ); - sb.append( separator ); - sb.append( separator ); - sb.append( domains.get( 0 ).getPerDomainEvalue() ); - sb.append( separator ); - sb.append( separator ); - sb.append( 1 ); - sb.append( separator ); - sb.append( ForesterUtil.LINE_SEPARATOR ); + // + final List igns = PhylogenyMethods.deleteExternalNodesPositiveSelection( genomes, intree ); + if ( igns.size() > 0 ) { + System.out.println( "Not using the following " + igns.size() + " nodes:" ); + for( int i = 0; i < igns.size(); ++i ) { + System.out.println( " " + i + ": " + igns.get( i ) ); + } + System.out.println( "--" ); } - else { - sb.append( protein.getSpecies() ); - sb.append( separator ); - sb.append( protein_id ); - sb.append( separator ); - sb.append( separator ); - sb.append( separator ); - sb.append( separator ); - sb.append( separator ); - sb.append( separator ); - sb.append( ForesterUtil.LINE_SEPARATOR ); + for( final String[] input_file_propertie : input_file_properties ) { + try { + intree.getNode( input_file_propertie[ 1 ] ); + } + catch ( final IllegalArgumentException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "node named [" + input_file_propertie[ 1 ] + + "] not present/not unique in input tree" ); + } } - return sb; } - public static List sortDomainsWithAscendingConfidenceValues( final Protein protein ) { - final List domains = new ArrayList(); - for( final Domain d : protein.getProteinDomains() ) { - domains.add( d ); - } - Collections.sort( domains, SurfacingUtil.ASCENDING_CONFIDENCE_VALUE_ORDER ); - return domains; + public static void printOutPercentageOfMultidomainProteins( final SortedMap all_genomes_domains_per_potein_histo, + final Writer log_writer ) { + int sum = 0; + for( final Entry entry : all_genomes_domains_per_potein_histo.entrySet() ) { + sum += entry.getValue(); + } + final double percentage = ( 100.0 * ( sum - all_genomes_domains_per_potein_histo.get( 1 ) ) ) / sum; + ForesterUtil.programMessage( surfacing.PRG_NAME, "Percentage of multidomain proteins: " + percentage + "%" ); + log( "Percentage of multidomain proteins: : " + percentage + "%", log_writer ); } - public static int storeDomainArchitectures( final String genome, - final SortedMap> domain_architecutures, - final List protein_list, - final Map distinct_domain_architecuture_counts ) { - final Set da = new HashSet(); - domain_architecutures.put( genome, da ); - for( final Protein protein : protein_list ) { - final String da_str = ( ( BasicProtein ) protein ).toDomainArchitectureString( "~", 3, "=" ); - if ( !da.contains( da_str ) ) { - if ( !distinct_domain_architecuture_counts.containsKey( da_str ) ) { - distinct_domain_architecuture_counts.put( da_str, 1 ); - } - else { - distinct_domain_architecuture_counts.put( da_str, - distinct_domain_architecuture_counts.get( da_str ) + 1 ); - } - da.add( da_str ); - } + private static void printSomeStats( final DescriptiveStatistics stats, final AsciiHistogram histo, final Writer w ) + throws IOException { + w.write( "
" ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "
" );
+        w.write( SurfacingConstants.NL );
+        if ( histo != null ) {
+            w.write( histo.toStringBuffer( 20, '|', 40, 5 ).toString() );
+            w.write( SurfacingConstants.NL );
         }
-        return da.size();
+        w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + if ( stats.getN() > 1 ) { + w.write( "" ); + } + else { + w.write( "" ); + } + w.write( SurfacingConstants.NL ); + w.write( "
N: " + stats.getN() + "
Min: " + stats.getMin() + "
Max: " + stats.getMax() + "
Mean: " + stats.arithmeticMean() + "
SD: " + stats.sampleStandardDeviation() + "
SD: n/a
" ); + w.write( SurfacingConstants.NL ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); } - public static void writeAllDomainsChangedOnAllSubtrees( final Phylogeny p, - final boolean get_gains, - final String outdir, - final String suffix_for_filename ) throws IOException { - CharacterStateMatrix.GainLossStates state = CharacterStateMatrix.GainLossStates.GAIN; - if ( !get_gains ) { - state = CharacterStateMatrix.GainLossStates.LOSS; + public static void processFilter( final File filter_file, final SortedSet filter ) { + SortedSet filter_str = null; + try { + filter_str = ForesterUtil.file2set( filter_file ); } - final File base_dir = createBaseDirForPerNodeDomainFiles( surfacing.BASE_DIRECTORY_PER_SUBTREE_DOMAIN_GAIN_LOSS_FILES, - false, - state, - outdir ); - for( final PhylogenyNodeIterator it = p.iteratorPostorder(); it.hasNext(); ) { - final PhylogenyNode node = it.next(); - if ( !node.isExternal() ) { - final SortedSet domains = collectAllDomainsChangedOnSubtree( node, get_gains ); - if ( domains.size() > 0 ) { - final Writer writer = ForesterUtil.createBufferedWriter( base_dir + ForesterUtil.FILE_SEPARATOR - + node.getName() + suffix_for_filename ); - for( final String domain : domains ) { - writer.write( domain ); - writer.write( ForesterUtil.LINE_SEPARATOR ); - } - writer.close(); - } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + } + if ( filter_str != null ) { + for( final String string : filter_str ) { + filter.add( string ); + } + } + if ( surfacing.VERBOSE ) { + System.out.println( "Filter:" ); + for( final String domainId : filter ) { + System.out.println( domainId ); } } } - public static void writeBinaryDomainCombinationsFileForGraphAnalysis( final String[][] input_file_properties, - final File output_dir, - final GenomeWideCombinableDomains gwcd, - final int i, - final GenomeWideCombinableDomainsSortOrder dc_sort_order ) { - File dc_outfile_dot = new File( input_file_properties[ i ][ 1 ] - + surfacing.DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS ); - if ( output_dir != null ) { - dc_outfile_dot = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile_dot ); - } - checkForOutputFileWriteability( dc_outfile_dot ); - final SortedSet binary_combinations = createSetOfAllBinaryDomainCombinationsPerGenome( gwcd ); + public static String[][] processInputGenomesFile( final File input_genomes ) { + String[][] input_file_properties = null; try { - final BufferedWriter out_dot = new BufferedWriter( new FileWriter( dc_outfile_dot ) ); - for( final BinaryDomainCombination bdc : binary_combinations ) { - out_dot.write( bdc.toGraphDescribingLanguage( BinaryDomainCombination.OutputFormat.DOT, null, null ) - .toString() ); - out_dot.write( SurfacingConstants.NL ); - } - out_dot.close(); + input_file_properties = ForesterUtil.file22dArray( input_genomes ); } catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + ForesterUtil.fatalError( surfacing.PRG_NAME, + "genomes files is to be in the following format \" \": " + + e.getLocalizedMessage() ); + } + final Set specs = new HashSet(); + final Set paths = new HashSet(); + for( int i = 0; i < input_file_properties.length; ++i ) { + if ( !PhyloXmlUtil.TAXOMONY_CODE_PATTERN.matcher( input_file_properties[ i ][ 1 ] ).matches() ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "illegal format for species code: " + + input_file_properties[ i ][ 1 ] ); + } + if ( specs.contains( input_file_properties[ i ][ 1 ] ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "species code " + input_file_properties[ i ][ 1 ] + + " is not unique" ); + } + specs.add( input_file_properties[ i ][ 1 ] ); + if ( paths.contains( input_file_properties[ i ][ 0 ] ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "path " + input_file_properties[ i ][ 0 ] + + " is not unique" ); + } + paths.add( input_file_properties[ i ][ 0 ] ); + final String error = ForesterUtil.isReadableFile( new File( input_file_properties[ i ][ 0 ] ) ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, error ); + } } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote binary domain combination for \"" - + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", " - + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile_dot + "\"" ); + return input_file_properties; } - public static void writeBinaryStatesMatrixAsListToFile( final CharacterStateMatrix matrix, - final CharacterStateMatrix.GainLossStates state, - final String filename, - final String indentifier_characters_separator, - final String character_separator, - final Map descriptions ) { - final File outfile = new File( filename ); - checkForOutputFileWriteability( outfile ); - final SortedSet sorted_ids = new TreeSet(); - for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { - sorted_ids.add( matrix.getIdentifier( i ) ); + public static void processPlusMinusAnalysisOption( final CommandLineArguments cla, + final List high_copy_base, + final List high_copy_target, + final List low_copy, + final List numbers ) { + if ( cla.isOptionSet( surfacing.PLUS_MINUS_ANALYSIS_OPTION ) ) { + if ( !cla.isOptionValueSet( surfacing.PLUS_MINUS_ANALYSIS_OPTION ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no value for 'plus-minus' file: -" + + surfacing.PLUS_MINUS_ANALYSIS_OPTION + "=" ); + } + final File plus_minus_file = new File( cla.getOptionValue( surfacing.PLUS_MINUS_ANALYSIS_OPTION ) ); + final String msg = ForesterUtil.isReadableFile( plus_minus_file ); + if ( !ForesterUtil.isEmpty( msg ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "can not read from \"" + plus_minus_file + "\": " + msg ); + } + processPlusMinusFile( plus_minus_file, high_copy_base, high_copy_target, low_copy, numbers ); } + } + + // First numbers is minimal difference, second is factor. + public static void processPlusMinusFile( final File plus_minus_file, + final List high_copy_base, + final List high_copy_target, + final List low_copy, + final List numbers ) { + Set species_set = null; + int min_diff = surfacing.PLUS_MINUS_ANALYSIS_MIN_DIFF_DEFAULT; + double factor = surfacing.PLUS_MINUS_ANALYSIS_FACTOR_DEFAULT; try { - final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); - for( final String id : sorted_ids ) { - out.write( indentifier_characters_separator ); - out.write( "#" + id ); - out.write( indentifier_characters_separator ); - for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { - // Not nice: - // using null to indicate either UNCHANGED_PRESENT or GAIN. - if ( ( matrix.getState( id, c ) == state ) - || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) || ( matrix - .getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) ) ) ) { - out.write( matrix.getCharacter( c ) ); - if ( ( descriptions != null ) && !descriptions.isEmpty() - && descriptions.containsKey( matrix.getCharacter( c ) ) ) { - out.write( "\t" ); - out.write( descriptions.get( matrix.getCharacter( c ) ) ); - } - out.write( character_separator ); - } - } - } - out.flush(); - out.close(); + species_set = ForesterUtil.file2set( plus_minus_file ); } catch ( final IOException e ) { ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters list: \"" + filename + "\"" ); - } - - public static void writeBinaryStatesMatrixAsListToFileForBinaryCombinationsForGraphAnalysis( final CharacterStateMatrix matrix, - final CharacterStateMatrix.GainLossStates state, - final String filename, - final String indentifier_characters_separator, - final String character_separator, - final BinaryDomainCombination.OutputFormat bc_output_format ) { - final File outfile = new File( filename ); - checkForOutputFileWriteability( outfile ); - final SortedSet sorted_ids = new TreeSet(); - for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { - sorted_ids.add( matrix.getIdentifier( i ) ); - } - try { - final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); - for( final String id : sorted_ids ) { - out.write( indentifier_characters_separator ); - out.write( "#" + id ); - out.write( indentifier_characters_separator ); - for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { - // Not nice: - // using null to indicate either UNCHANGED_PRESENT or GAIN. - if ( ( matrix.getState( id, c ) == state ) - || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) || ( matrix - .getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) ) ) ) { - BinaryDomainCombination bdc = null; - try { - bdc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( c ) ); - } - catch ( final Exception e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); - } - out.write( bdc.toGraphDescribingLanguage( bc_output_format, null, null ).toString() ); - out.write( character_separator ); + if ( species_set != null ) { + for( final String species : species_set ) { + final String species_trimmed = species.substring( 1 ); + if ( species.startsWith( "+" ) ) { + if ( low_copy.contains( species_trimmed ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, + "species/genome names can not appear with both '+' and '-' suffix, as appears the case for: \"" + + species_trimmed + "\"" ); + } + high_copy_base.add( species_trimmed ); + } + else if ( species.startsWith( "*" ) ) { + if ( low_copy.contains( species_trimmed ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, + "species/genome names can not appear with both '*' and '-' suffix, as appears the case for: \"" + + species_trimmed + "\"" ); + } + high_copy_target.add( species_trimmed ); + } + else if ( species.startsWith( "-" ) ) { + if ( high_copy_base.contains( species_trimmed ) || high_copy_target.contains( species_trimmed ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, + "species/genome names can not appear with both '+' or '*' and '-' suffix, as appears the case for: \"" + + species_trimmed + "\"" ); } + low_copy.add( species_trimmed ); + } + else if ( species.startsWith( "$D" ) ) { + try { + min_diff = Integer.parseInt( species.substring( 3 ) ); + } + catch ( final NumberFormatException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, + "could not parse integer value for minimal difference from: \"" + + species.substring( 3 ) + "\"" ); + } + } + else if ( species.startsWith( "$F" ) ) { + try { + factor = Double.parseDouble( species.substring( 3 ) ); + } + catch ( final NumberFormatException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "could not parse double value for factor from: \"" + + species.substring( 3 ) + "\"" ); + } + } + else if ( species.startsWith( "#" ) ) { + // Comment, ignore. + } + else { + ForesterUtil + .fatalError( surfacing.PRG_NAME, + "species/genome names in 'plus minus' file must begin with '*' (high copy target genome), '+' (high copy base genomes), '-' (low copy genomes), '$D=' minimal Difference (default is 1), '$F=' factor (default is 1.0), double), or '#' (ignore) suffix, encountered: \"" + + species + "\"" ); } + numbers.add( new Integer( min_diff + "" ) ); + numbers.add( new Double( factor + "" ) ); } - out.flush(); - out.close(); } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + else { + ForesterUtil.fatalError( surfacing.PRG_NAME, "'plus minus' file [" + plus_minus_file + "] appears empty" ); } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters list: \"" + filename + "\"" ); } - public static void writeBinaryStatesMatrixToList( final Map> domain_id_to_go_ids_map, - final Map go_id_to_term_map, - final GoNameSpace go_namespace_limit, - final boolean domain_combinations, - final CharacterStateMatrix matrix, - final CharacterStateMatrix.GainLossStates state, - final String filename, - final String indentifier_characters_separator, - final String character_separator, - final String title_for_html, - final String prefix_for_html, - final Map>[] domain_id_to_secondary_features_maps, - final SortedSet all_pfams_encountered, - final SortedSet pfams_gained_or_lost, - final String suffix_for_per_node_events_file, - final Map tax_code_to_id_map ) { - if ( ( go_namespace_limit != null ) && ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) { - throw new IllegalArgumentException( "attempt to use GO namespace limit without a GO-id to term map" ); - } - else if ( ( ( domain_id_to_go_ids_map == null ) || ( domain_id_to_go_ids_map.size() < 1 ) ) ) { - throw new IllegalArgumentException( "attempt to output detailed HTML without a Pfam to GO map" ); - } - else if ( ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) { - throw new IllegalArgumentException( "attempt to output detailed HTML without a GO-id to term map" ); + /* + * species | protein id | n-terminal domain | c-terminal domain | n-terminal domain per domain E-value | c-terminal domain per domain E-value + * + * + */ + static public StringBuffer proteinToDomainCombinations( final Protein protein, + final String protein_id, + final String separator ) { + final StringBuffer sb = new StringBuffer(); + if ( protein.getSpecies() == null ) { + throw new IllegalArgumentException( "species must not be null" ); } - final File outfile = new File( filename ); - checkForOutputFileWriteability( outfile ); - final SortedSet sorted_ids = new TreeSet(); - for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { - sorted_ids.add( matrix.getIdentifier( i ) ); + if ( ForesterUtil.isEmpty( protein.getSpecies().getSpeciesId() ) ) { + throw new IllegalArgumentException( "species id must not be empty" ); } - try { - final Writer out = new BufferedWriter( new FileWriter( outfile ) ); - final File per_node_go_mapped_domain_gain_loss_files_base_dir = createBaseDirForPerNodeDomainFiles( surfacing.BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES, - domain_combinations, - state, - filename ); - Writer per_node_go_mapped_domain_gain_loss_outfile_writer = null; - File per_node_go_mapped_domain_gain_loss_outfile = null; - int per_node_counter = 0; - out.write( "" ); - out.write( SurfacingConstants.NL ); - addHtmlHead( out, title_for_html ); - out.write( SurfacingConstants.NL ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - out.write( "

" ); - out.write( SurfacingConstants.NL ); - out.write( title_for_html ); - out.write( SurfacingConstants.NL ); - out.write( "

" ); - out.write( SurfacingConstants.NL ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - for( final String id : sorted_ids ) { - final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( id ); - if ( matcher.matches() ) { - continue; - } - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - } - out.write( "
" ); - out.write( "" + id + "" ); - out.write( "
" ); - out.write( SurfacingConstants.NL ); - for( final String id : sorted_ids ) { - final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( id ); - if ( matcher.matches() ) { - continue; - } - out.write( SurfacingConstants.NL ); - out.write( "

" ); - out.write( "" + id + "" ); - writeTaxonomyLinks( out, id, tax_code_to_id_map ); - out.write( "

" ); - out.write( SurfacingConstants.NL ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - per_node_counter = 0; - if ( matrix.getNumberOfCharacters() > 0 ) { - per_node_go_mapped_domain_gain_loss_outfile = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + id + suffix_for_per_node_events_file ); - SurfacingUtil.checkForOutputFileWriteability( per_node_go_mapped_domain_gain_loss_outfile ); - per_node_go_mapped_domain_gain_loss_outfile_writer = ForesterUtil - .createBufferedWriter( per_node_go_mapped_domain_gain_loss_outfile ); + final List domains = protein.getProteinDomains(); + if ( domains.size() > 1 ) { + final Map counts = new HashMap(); + for( final Domain domain : domains ) { + final String id = domain.getDomainId(); + if ( counts.containsKey( id ) ) { + counts.put( id, counts.get( id ) + 1 ); } else { - per_node_go_mapped_domain_gain_loss_outfile = null; - per_node_go_mapped_domain_gain_loss_outfile_writer = null; + counts.put( id, 1 ); } - for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { - // Not nice: - // using null to indicate either UNCHANGED_PRESENT or GAIN. - if ( ( matrix.getState( id, c ) == state ) - || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) || ( matrix - .getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) ) ) { - final String character = matrix.getCharacter( c ); - String domain_0 = ""; - String domain_1 = ""; - if ( character.indexOf( BinaryDomainCombination.SEPARATOR ) > 0 ) { - final String[] s = character.split( BinaryDomainCombination.SEPARATOR ); - if ( s.length != 2 ) { - throw new AssertionError( "this should not have happened: unexpected format for domain combination: [" - + character + "]" ); - } - domain_0 = s[ 0 ]; - domain_1 = s[ 1 ]; - } - else { - domain_0 = character; - } - writeDomainData( domain_id_to_go_ids_map, - go_id_to_term_map, - go_namespace_limit, - out, - domain_0, - domain_1, - prefix_for_html, - character_separator, - domain_id_to_secondary_features_maps, - null ); - all_pfams_encountered.add( domain_0 ); - if ( pfams_gained_or_lost != null ) { - pfams_gained_or_lost.add( domain_0 ); - } - if ( !ForesterUtil.isEmpty( domain_1 ) ) { - all_pfams_encountered.add( domain_1 ); - if ( pfams_gained_or_lost != null ) { - pfams_gained_or_lost.add( domain_1 ); - } - } - if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) { - writeDomainsToIndividualFilePerTreeNode( per_node_go_mapped_domain_gain_loss_outfile_writer, - domain_0, - domain_1 ); - per_node_counter++; - } + } + final Set dcs = new HashSet(); + for( int i = 1; i < domains.size(); ++i ) { + for( int j = 0; j < i; ++j ) { + Domain domain_n = domains.get( i ); + Domain domain_c = domains.get( j ); + if ( domain_n.getFrom() > domain_c.getFrom() ) { + domain_n = domains.get( j ); + domain_c = domains.get( i ); } - } - if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) { - per_node_go_mapped_domain_gain_loss_outfile_writer.close(); - if ( per_node_counter < 1 ) { - per_node_go_mapped_domain_gain_loss_outfile.delete(); + final String dc = domain_n.getDomainId() + domain_c.getDomainId(); + if ( !dcs.contains( dc ) ) { + dcs.add( dc ); + sb.append( protein.getSpecies() ); + sb.append( separator ); + sb.append( protein_id ); + sb.append( separator ); + sb.append( domain_n.getDomainId() ); + sb.append( separator ); + sb.append( domain_c.getDomainId() ); + sb.append( separator ); + sb.append( domain_n.getPerDomainEvalue() ); + sb.append( separator ); + sb.append( domain_c.getPerDomainEvalue() ); + sb.append( separator ); + sb.append( counts.get( domain_n.getDomainId() ) ); + sb.append( separator ); + sb.append( counts.get( domain_c.getDomainId() ) ); + sb.append( ForesterUtil.LINE_SEPARATOR ); } - per_node_counter = 0; } - out.write( "
" ); - out.write( "Pfam domain(s)" ); - out.write( "" ); - out.write( "GO term acc" ); - out.write( "" ); - out.write( "GO term" ); - out.write( "" ); - out.write( "GO namespace" ); - out.write( "
" ); - out.write( SurfacingConstants.NL ); - out.write( "
" ); - out.write( SurfacingConstants.NL ); - } // for( final String id : sorted_ids ) { - out.write( "" ); - out.write( SurfacingConstants.NL ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - out.flush(); - out.close(); + } } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + else if ( domains.size() == 1 ) { + sb.append( protein.getSpecies() ); + sb.append( separator ); + sb.append( protein_id ); + sb.append( separator ); + sb.append( domains.get( 0 ).getDomainId() ); + sb.append( separator ); + sb.append( separator ); + sb.append( domains.get( 0 ).getPerDomainEvalue() ); + sb.append( separator ); + sb.append( separator ); + sb.append( 1 ); + sb.append( separator ); + sb.append( ForesterUtil.LINE_SEPARATOR ); } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters detailed HTML list: \"" + filename + "\"" ); + else { + sb.append( protein.getSpecies() ); + sb.append( separator ); + sb.append( protein_id ); + sb.append( separator ); + sb.append( separator ); + sb.append( separator ); + sb.append( separator ); + sb.append( separator ); + sb.append( separator ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + } + return sb; } - public static void writeDomainCombinationsCountsFile( final String[][] input_file_properties, - final File output_dir, - final Writer per_genome_domain_promiscuity_statistics_writer, - final GenomeWideCombinableDomains gwcd, - final int i, - final GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder dc_sort_order ) { - File dc_outfile = new File( input_file_properties[ i ][ 1 ] - + surfacing.DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX ); - if ( output_dir != null ) { - dc_outfile = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile ); - } - checkForOutputFileWriteability( dc_outfile ); - try { - final BufferedWriter out = new BufferedWriter( new FileWriter( dc_outfile ) ); - out.write( gwcd.toStringBuilder( dc_sort_order ).toString() ); - out.close(); + public static List sortDomainsWithAscendingConfidenceValues( final Protein protein ) { + final List domains = new ArrayList(); + for( final Domain d : protein.getProteinDomains() ) { + domains.add( d ); } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + Collections.sort( domains, SurfacingUtil.ASCENDING_CONFIDENCE_VALUE_ORDER ); + return domains; + } + + private static List splitDomainCombination( final String dc ) { + final String[] s = dc.split( "=" ); + if ( s.length != 2 ) { + ForesterUtil.printErrorMessage( surfacing.PRG_NAME, "Stringyfied domain combination has illegal format: " + + dc ); + System.exit( -1 ); } - final DescriptiveStatistics stats = gwcd.getPerGenomeDomainPromiscuityStatistics(); - try { - per_genome_domain_promiscuity_statistics_writer.write( input_file_properties[ i ][ 1 ] + "\t" ); - per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats.arithmeticMean() ) + "\t" ); - if ( stats.getN() < 2 ) { - per_genome_domain_promiscuity_statistics_writer.write( "n/a" + "\t" ); - } - else { - per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats - .sampleStandardDeviation() ) + "\t" ); - } - per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats.median() ) + "\t" ); - per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMin() + "\t" ); - per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMax() + "\t" ); - per_genome_domain_promiscuity_statistics_writer.write( stats.getN() + "\t" ); - final SortedSet mpds = gwcd.getMostPromiscuosDomain(); - for( final String mpd : mpds ) { - per_genome_domain_promiscuity_statistics_writer.write( mpd + " " ); + final List l = new ArrayList( 2 ); + l.add( s[ 0 ] ); + l.add( s[ 1 ] ); + return l; + } + + public static int storeDomainArchitectures( final String genome, + final SortedMap> domain_architecutures, + final List protein_list, + final Map distinct_domain_architecuture_counts ) { + final Set da = new HashSet(); + domain_architecutures.put( genome, da ); + for( final Protein protein : protein_list ) { + final String da_str = ( ( BasicProtein ) protein ).toDomainArchitectureString( "~", 3, "=" ); + if ( !da.contains( da_str ) ) { + if ( !distinct_domain_architecuture_counts.containsKey( da_str ) ) { + distinct_domain_architecuture_counts.put( da_str, 1 ); + } + else { + distinct_domain_architecuture_counts.put( da_str, + distinct_domain_architecuture_counts.get( da_str ) + 1 ); + } + da.add( da_str ); } - per_genome_domain_promiscuity_statistics_writer.write( ForesterUtil.LINE_SEPARATOR ); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); - } - if ( input_file_properties[ i ].length == 3 ) { - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \"" - + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", " - + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile + "\"" ); - } - else { - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \"" - + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ") to: \"" - + dc_outfile + "\"" ); } + return da.size(); } - public static void writeDomainSimilaritiesToFile( final StringBuilder html_desc, - final StringBuilder html_title, - final Writer simple_tab_writer, - final Writer single_writer, - Map split_writers, - final SortedSet similarities, - final boolean treat_as_binary, - final List species_order, - final PrintableDomainSimilarity.PRINT_OPTION print_option, - final DomainSimilarity.DomainSimilarityScoring scoring, - final boolean verbose, - final Map tax_code_to_id_map, - Phylogeny phy ) throws IOException { - if ( ( single_writer != null ) && ( ( split_writers == null ) || split_writers.isEmpty() ) ) { - split_writers = new HashMap(); - split_writers.put( '_', single_writer ); + public static void writeAllDomainsChangedOnAllSubtrees( final Phylogeny p, + final boolean get_gains, + final String outdir, + final String suffix_for_filename ) throws IOException { + CharacterStateMatrix.GainLossStates state = CharacterStateMatrix.GainLossStates.GAIN; + if ( !get_gains ) { + state = CharacterStateMatrix.GainLossStates.LOSS; } - switch ( print_option ) { - case SIMPLE_TAB_DELIMITED: - break; - case HTML: - for( final Character key : split_writers.keySet() ) { - final Writer w = split_writers.get( key ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - if ( key != '_' ) { - addHtmlHead( w, "DC analysis (" + html_title + ") " + key.toString().toUpperCase() ); - } - else { - addHtmlHead( w, "DC analysis (" + html_title + ")" ); + final File base_dir = createBaseDirForPerNodeDomainFiles( surfacing.BASE_DIRECTORY_PER_SUBTREE_DOMAIN_GAIN_LOSS_FILES, + false, + state, + outdir ); + for( final PhylogenyNodeIterator it = p.iteratorPostorder(); it.hasNext(); ) { + final PhylogenyNode node = it.next(); + if ( !node.isExternal() ) { + final SortedSet domains = collectAllDomainsChangedOnSubtree( node, get_gains ); + if ( domains.size() > 0 ) { + final Writer writer = ForesterUtil.createBufferedWriter( base_dir + ForesterUtil.FILE_SEPARATOR + + node.getName() + suffix_for_filename ); + for( final String domain : domains ) { + writer.write( domain ); + writer.write( ForesterUtil.LINE_SEPARATOR ); } - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( html_desc.toString() ); - w.write( SurfacingConstants.NL ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); + writer.close(); } - break; - } - // - for( final DomainSimilarity similarity : similarities ) { - if ( ( species_order != null ) && !species_order.isEmpty() ) { - ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order ); - } - if ( single_writer != null ) { - single_writer.write( "" ); - single_writer.write( SurfacingConstants.NL ); } - else { - Writer local_writer = split_writers.get( ( similarity.getDomainId().charAt( 0 ) + "" ).toLowerCase() - .charAt( 0 ) ); - if ( local_writer == null ) { - local_writer = split_writers.get( '0' ); + } + } + + private static void writeAllEncounteredPfamsToFile( final Map> domain_id_to_go_ids_map, + final Map go_id_to_term_map, + final String outfile_name, + final SortedSet all_pfams_encountered ) { + final File all_pfams_encountered_file = new File( outfile_name + surfacing.ALL_PFAMS_ENCOUNTERED_SUFFIX ); + final File all_pfams_encountered_with_go_annotation_file = new File( outfile_name + + surfacing.ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX ); + final File encountered_pfams_summary_file = new File( outfile_name + surfacing.ENCOUNTERED_PFAMS_SUMMARY_SUFFIX ); + int biological_process_counter = 0; + int cellular_component_counter = 0; + int molecular_function_counter = 0; + int pfams_with_mappings_counter = 0; + int pfams_without_mappings_counter = 0; + int pfams_without_mappings_to_bp_or_mf_counter = 0; + int pfams_with_mappings_to_bp_or_mf_counter = 0; + try { + final Writer all_pfams_encountered_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_file ) ); + final Writer all_pfams_encountered_with_go_annotation_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_with_go_annotation_file ) ); + final Writer summary_writer = new BufferedWriter( new FileWriter( encountered_pfams_summary_file ) ); + summary_writer.write( "# Pfam to GO mapping summary" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Actual summary is at the end of this file." ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Encountered Pfams without a GO mapping:" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + for( final String pfam : all_pfams_encountered ) { + all_pfams_encountered_writer.write( pfam ); + all_pfams_encountered_writer.write( ForesterUtil.LINE_SEPARATOR ); + final String domain_id = new String( pfam ); + if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) { + ++pfams_with_mappings_counter; + all_pfams_encountered_with_go_annotation_writer.write( pfam ); + all_pfams_encountered_with_go_annotation_writer.write( ForesterUtil.LINE_SEPARATOR ); + final List go_ids = domain_id_to_go_ids_map.get( domain_id ); + boolean maps_to_bp = false; + boolean maps_to_cc = false; + boolean maps_to_mf = false; + for( final GoId go_id : go_ids ) { + final GoTerm go_term = go_id_to_term_map.get( go_id ); + if ( go_term.getGoNameSpace().isBiologicalProcess() ) { + maps_to_bp = true; + } + else if ( go_term.getGoNameSpace().isCellularComponent() ) { + maps_to_cc = true; + } + else if ( go_term.getGoNameSpace().isMolecularFunction() ) { + maps_to_mf = true; + } + } + if ( maps_to_bp ) { + ++biological_process_counter; + } + if ( maps_to_cc ) { + ++cellular_component_counter; + } + if ( maps_to_mf ) { + ++molecular_function_counter; + } + if ( maps_to_bp || maps_to_mf ) { + ++pfams_with_mappings_to_bp_or_mf_counter; + } + else { + ++pfams_without_mappings_to_bp_or_mf_counter; + } } - local_writer.write( "" ); - local_writer.write( SurfacingConstants.NL ); - } - } - for( final Writer w : split_writers.values() ) { - w.write( "
Domains:
" - + similarity.getDomainId() + "
" - + similarity.getDomainId() + "
" ); - w.write( SurfacingConstants.NL ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - } - // - for( final DomainSimilarity similarity : similarities ) { - if ( ( species_order != null ) && !species_order.isEmpty() ) { - ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order ); - } - if ( simple_tab_writer != null ) { - simple_tab_writer.write( similarity.toStringBuffer( PRINT_OPTION.SIMPLE_TAB_DELIMITED, - tax_code_to_id_map, - null ).toString() ); - } - if ( single_writer != null ) { - single_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map, phy ).toString() ); - single_writer.write( SurfacingConstants.NL ); - } - else { - Writer local_writer = split_writers.get( ( similarity.getDomainId().charAt( 0 ) + "" ).toLowerCase() - .charAt( 0 ) ); - if ( local_writer == null ) { - local_writer = split_writers.get( '0' ); + else { + ++pfams_without_mappings_to_bp_or_mf_counter; + ++pfams_without_mappings_counter; + summary_writer.write( pfam ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); } - local_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map, phy ).toString() ); - local_writer.write( SurfacingConstants.NL ); } + all_pfams_encountered_writer.close(); + all_pfams_encountered_with_go_annotation_writer.close(); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + all_pfams_encountered.size() + + "] encountered Pfams to: \"" + all_pfams_encountered_file + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + pfams_with_mappings_counter + + "] encountered Pfams with GO mappings to: \"" + all_pfams_encountered_with_go_annotation_file + + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote summary (including all [" + + pfams_without_mappings_counter + "] encountered Pfams without GO mappings) to: \"" + + encountered_pfams_summary_file + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Sum of Pfams encountered : " + + all_pfams_encountered.size() ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without a mapping : " + + pfams_without_mappings_counter + " [" + + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without mapping to proc. or func. : " + + pfams_without_mappings_to_bp_or_mf_counter + " [" + + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping : " + + pfams_with_mappings_counter + " [" + + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping to proc. or func. : " + + pfams_with_mappings_to_bp_or_mf_counter + " [" + + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to biological process: " + + biological_process_counter + " [" + + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to molecular function: " + + molecular_function_counter + " [" + + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to cellular component: " + + cellular_component_counter + " [" + + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Sum of Pfams encountered : " + all_pfams_encountered.size() ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams without a mapping : " + pfams_without_mappings_counter + + " [" + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams without mapping to proc. or func. : " + + pfams_without_mappings_to_bp_or_mf_counter + " [" + + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams with a mapping : " + pfams_with_mappings_counter + " [" + + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams with a mapping to proc. or func. : " + + pfams_with_mappings_to_bp_or_mf_counter + " [" + + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams with mapping to biological process: " + biological_process_counter + " [" + + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams with mapping to molecular function: " + molecular_function_counter + " [" + + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams with mapping to cellular component: " + cellular_component_counter + " [" + + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.close(); } - switch ( print_option ) { - case HTML: - for( final Writer w : split_writers.values() ) { - w.write( SurfacingConstants.NL ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - } - break; - } - for( final Writer w : split_writers.values() ) { - w.close(); + catch ( final IOException e ) { + ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); } } - private static void printSomeStats( final DescriptiveStatistics stats, final AsciiHistogram histo, final Writer w ) - throws IOException { - w.write( "
" ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "
" );
-        w.write( SurfacingConstants.NL );
-        if ( histo != null ) {
-            w.write( histo.toStringBuffer( 20, '|', 40, 5 ).toString() );
-            w.write( SurfacingConstants.NL );
+    public static void writeBinaryDomainCombinationsFileForGraphAnalysis( final String[][] input_file_properties,
+                                                                          final File output_dir,
+                                                                          final GenomeWideCombinableDomains gwcd,
+                                                                          final int i,
+                                                                          final GenomeWideCombinableDomainsSortOrder dc_sort_order ) {
+        File dc_outfile_dot = new File( input_file_properties[ i ][ 1 ]
+                + surfacing.DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS );
+        if ( output_dir != null ) {
+            dc_outfile_dot = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile_dot );
         }
-        w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - if ( stats.getN() > 1 ) { - w.write( "" ); + checkForOutputFileWriteability( dc_outfile_dot ); + final SortedSet binary_combinations = createSetOfAllBinaryDomainCombinationsPerGenome( gwcd ); + try { + final BufferedWriter out_dot = new BufferedWriter( new FileWriter( dc_outfile_dot ) ); + for( final BinaryDomainCombination bdc : binary_combinations ) { + out_dot.write( bdc.toGraphDescribingLanguage( BinaryDomainCombination.OutputFormat.DOT, null, null ) + .toString() ); + out_dot.write( SurfacingConstants.NL ); + } + out_dot.close(); } - else { - w.write( "" ); + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); } - w.write( SurfacingConstants.NL ); - w.write( "
N: " + stats.getN() + "
Min: " + stats.getMin() + "
Max: " + stats.getMax() + "
Mean: " + stats.arithmeticMean() + "
SD: " + stats.sampleStandardDeviation() + "
SD: n/a
" ); - w.write( SurfacingConstants.NL ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote binary domain combination for \"" + + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", " + + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile_dot + "\"" ); } - public static void writeMatrixToFile( final CharacterStateMatrix matrix, - final String filename, - final Format format ) { + public static void writeBinaryStatesMatrixAsListToFile( final CharacterStateMatrix matrix, + final CharacterStateMatrix.GainLossStates state, + final String filename, + final String indentifier_characters_separator, + final String character_separator, + final Map descriptions ) { final File outfile = new File( filename ); checkForOutputFileWriteability( outfile ); + final SortedSet sorted_ids = new TreeSet(); + for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { + sorted_ids.add( matrix.getIdentifier( i ) ); + } try { final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); - matrix.toWriter( out, format ); + for( final String id : sorted_ids ) { + out.write( indentifier_characters_separator ); + out.write( "#" + id ); + out.write( indentifier_characters_separator ); + for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { + // Not nice: + // using null to indicate either UNCHANGED_PRESENT or GAIN. + if ( ( matrix.getState( id, c ) == state ) + || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) || ( matrix + .getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) ) ) ) { + out.write( matrix.getCharacter( c ) ); + if ( ( descriptions != null ) && !descriptions.isEmpty() + && descriptions.containsKey( matrix.getCharacter( c ) ) ) { + out.write( "\t" ); + out.write( descriptions.get( matrix.getCharacter( c ) ) ); + } + out.write( character_separator ); + } + } + } out.flush(); out.close(); } catch ( final IOException e ) { ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote matrix: \"" + filename + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters list: \"" + filename + "\"" ); } - public static void writeMatrixToFile( final File matrix_outfile, final List matrices ) { - checkForOutputFileWriteability( matrix_outfile ); + public static void writeBinaryStatesMatrixAsListToFileForBinaryCombinationsForGraphAnalysis( final CharacterStateMatrix matrix, + final CharacterStateMatrix.GainLossStates state, + final String filename, + final String indentifier_characters_separator, + final String character_separator, + final BinaryDomainCombination.OutputFormat bc_output_format ) { + final File outfile = new File( filename ); + checkForOutputFileWriteability( outfile ); + final SortedSet sorted_ids = new TreeSet(); + for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { + sorted_ids.add( matrix.getIdentifier( i ) ); + } try { - final BufferedWriter out = new BufferedWriter( new FileWriter( matrix_outfile ) ); - for( final DistanceMatrix distance_matrix : matrices ) { - out.write( distance_matrix.toStringBuffer( DistanceMatrix.Format.PHYLIP ).toString() ); - out.write( ForesterUtil.LINE_SEPARATOR ); - out.flush(); + final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); + for( final String id : sorted_ids ) { + out.write( indentifier_characters_separator ); + out.write( "#" + id ); + out.write( indentifier_characters_separator ); + for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { + // Not nice: + // using null to indicate either UNCHANGED_PRESENT or GAIN. + if ( ( matrix.getState( id, c ) == state ) + || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) || ( matrix + .getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) ) ) ) { + BinaryDomainCombination bdc = null; + try { + bdc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( c ) ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); + } + out.write( bdc.toGraphDescribingLanguage( bc_output_format, null, null ).toString() ); + out.write( character_separator ); + } + } } + out.flush(); out.close(); } catch ( final IOException e ) { ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + matrix_outfile + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters list: \"" + filename + "\"" ); } - public static void writePhylogenyToFile( final Phylogeny phylogeny, final String filename ) { - final PhylogenyWriter writer = new PhylogenyWriter(); - try { - writer.toPhyloXML( new File( filename ), phylogeny, 1 ); - } - catch ( final IOException e ) { - ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "failed to write phylogeny to \"" + filename + "\": " - + e ); + public static void writeBinaryStatesMatrixToList( final Map> domain_id_to_go_ids_map, + final Map go_id_to_term_map, + final GoNameSpace go_namespace_limit, + final boolean domain_combinations, + final CharacterStateMatrix matrix, + final CharacterStateMatrix.GainLossStates state, + final String filename, + final String indentifier_characters_separator, + final String character_separator, + final String title_for_html, + final String prefix_for_html, + final Map>[] domain_id_to_secondary_features_maps, + final SortedSet all_pfams_encountered, + final SortedSet pfams_gained_or_lost, + final String suffix_for_per_node_events_file, + final Map tax_code_to_id_map ) { + if ( ( go_namespace_limit != null ) && ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) { + throw new IllegalArgumentException( "attempt to use GO namespace limit without a GO-id to term map" ); } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote phylogeny to \"" + filename + "\"" ); - } - - public static void writeTaxonomyLinks( final Writer writer, - final String species, - final Map tax_code_to_id_map ) throws IOException { - if ( ( species.length() > 1 ) && ( species.indexOf( '_' ) < 1 ) ) { - writer.write( " [" ); - if ( ( tax_code_to_id_map != null ) && tax_code_to_id_map.containsKey( species ) ) { - writer.write( "uniprot" ); - } - else { - writer.write( "eol" ); - writer.write( "|" ); - writer.write( "scholar" ); - writer.write( "|" ); - writer.write( "google" ); - } - writer.write( "]" ); + else if ( ( ( domain_id_to_go_ids_map == null ) || ( domain_id_to_go_ids_map.size() < 1 ) ) ) { + throw new IllegalArgumentException( "attempt to output detailed HTML without a Pfam to GO map" ); } - } - - private final static void addToCountMap( final Map map, final String s ) { - if ( map.containsKey( s ) ) { - map.put( s, map.get( s ) + 1 ); + else if ( ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) { + throw new IllegalArgumentException( "attempt to output detailed HTML without a GO-id to term map" ); } - else { - map.put( s, 1 ); + final File outfile = new File( filename ); + checkForOutputFileWriteability( outfile ); + final SortedSet sorted_ids = new TreeSet(); + for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { + sorted_ids.add( matrix.getIdentifier( i ) ); } - } - - private static void calculateIndependentDomainCombinationGains( final Phylogeny local_phylogeny_l, - final String outfilename_for_counts, - final String outfilename_for_dc, - final String outfilename_for_dc_for_go_mapping, - final String outfilename_for_dc_for_go_mapping_unique, - final String outfilename_for_rank_counts, - final String outfilename_for_ancestor_species_counts, - final String outfilename_for_protein_stats, - final Map protein_length_stats_by_dc, - final Map domain_number_stats_by_dc, - final Map domain_length_stats_by_domain ) { try { - // - // if ( protein_length_stats_by_dc != null ) { - // for( final Entry entry : protein_length_stats_by_dc.entrySet() ) { - // System.out.print( entry.getKey().toString() ); - // System.out.print( ": " ); - // double[] a = entry.getValue().getDataAsDoubleArray(); - // for( int i = 0; i < a.length; i++ ) { - // System.out.print( a[ i ] + " " ); - // } - // System.out.println(); - // } - // } - // if ( domain_number_stats_by_dc != null ) { - // for( final Entry entry : domain_number_stats_by_dc.entrySet() ) { - // System.out.print( entry.getKey().toString() ); - // System.out.print( ": " ); - // double[] a = entry.getValue().getDataAsDoubleArray(); - // for( int i = 0; i < a.length; i++ ) { - // System.out.print( a[ i ] + " " ); - // } - // System.out.println(); - // } - // } - // - final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) ); - final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) ); - final BufferedWriter out_dc_for_go_mapping = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping ) ); - final BufferedWriter out_dc_for_go_mapping_unique = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping_unique ) ); - final SortedMap dc_gain_counts = new TreeMap(); - for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorPostorder(); it.hasNext(); ) { - final PhylogenyNode n = it.next(); - final Set gained_dc = n.getNodeData().getBinaryCharacters().getGainedCharacters(); - for( final String dc : gained_dc ) { - if ( dc_gain_counts.containsKey( dc ) ) { - dc_gain_counts.put( dc, dc_gain_counts.get( dc ) + 1 ); - } - else { - dc_gain_counts.put( dc, 1 ); - } - } - } - final SortedMap histogram = new TreeMap(); - final SortedMap domain_lists = new TreeMap(); - final SortedMap dc_reapp_counts_to_protein_length_stats = new TreeMap(); - final SortedMap dc_reapp_counts_to_domain_number_stats = new TreeMap(); - final SortedMap dc_reapp_counts_to_domain_lengths_stats = new TreeMap(); - final SortedMap> domain_lists_go = new TreeMap>(); - final SortedMap> domain_lists_go_unique = new TreeMap>(); - final Set dcs = dc_gain_counts.keySet(); - final SortedSet more_than_once = new TreeSet(); - DescriptiveStatistics gained_once_lengths_stats = new BasicDescriptiveStatistics(); - DescriptiveStatistics gained_once_domain_count_stats = new BasicDescriptiveStatistics(); - DescriptiveStatistics gained_multiple_times_lengths_stats = new BasicDescriptiveStatistics(); - final DescriptiveStatistics gained_multiple_times_domain_count_stats = new BasicDescriptiveStatistics(); - long gained_multiple_times_domain_length_sum = 0; - long gained_once_domain_length_sum = 0; - long gained_multiple_times_domain_length_count = 0; - long gained_once_domain_length_count = 0; - for( final String dc : dcs ) { - final int count = dc_gain_counts.get( dc ); - if ( histogram.containsKey( count ) ) { - histogram.put( count, histogram.get( count ) + 1 ); - domain_lists.get( count ).append( ", " + dc ); - domain_lists_go.get( count ).addAll( splitDomainCombination( dc ) ); - domain_lists_go_unique.get( count ).addAll( splitDomainCombination( dc ) ); - } - else { - histogram.put( count, 1 ); - domain_lists.put( count, new StringBuilder( dc ) ); - final PriorityQueue q = new PriorityQueue(); - q.addAll( splitDomainCombination( dc ) ); - domain_lists_go.put( count, q ); - final SortedSet set = new TreeSet(); - set.addAll( splitDomainCombination( dc ) ); - domain_lists_go_unique.put( count, set ); - } - if ( protein_length_stats_by_dc != null ) { - if ( !dc_reapp_counts_to_protein_length_stats.containsKey( count ) ) { - dc_reapp_counts_to_protein_length_stats.put( count, new BasicDescriptiveStatistics() ); - } - dc_reapp_counts_to_protein_length_stats.get( count ).addValue( protein_length_stats_by_dc.get( dc ) - .arithmeticMean() ); - } - if ( domain_number_stats_by_dc != null ) { - if ( !dc_reapp_counts_to_domain_number_stats.containsKey( count ) ) { - dc_reapp_counts_to_domain_number_stats.put( count, new BasicDescriptiveStatistics() ); - } - dc_reapp_counts_to_domain_number_stats.get( count ).addValue( domain_number_stats_by_dc.get( dc ) - .arithmeticMean() ); - } - if ( domain_length_stats_by_domain != null ) { - if ( !dc_reapp_counts_to_domain_lengths_stats.containsKey( count ) ) { - dc_reapp_counts_to_domain_lengths_stats.put( count, new BasicDescriptiveStatistics() ); - } - final String[] ds = dc.split( "=" ); - dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain - .get( ds[ 0 ] ).arithmeticMean() ); - dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain - .get( ds[ 1 ] ).arithmeticMean() ); - } - if ( count > 1 ) { - more_than_once.add( dc ); - if ( protein_length_stats_by_dc != null ) { - final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc ); - for( final double element : s.getData() ) { - gained_multiple_times_lengths_stats.addValue( element ); - } - } - if ( domain_number_stats_by_dc != null ) { - final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc ); - for( final double element : s.getData() ) { - gained_multiple_times_domain_count_stats.addValue( element ); - } - } - if ( domain_length_stats_by_domain != null ) { - final String[] ds = dc.split( "=" ); - final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] ); - final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] ); - for( final double element : s0.getData() ) { - gained_multiple_times_domain_length_sum += element; - ++gained_multiple_times_domain_length_count; - } - for( final double element : s1.getData() ) { - gained_multiple_times_domain_length_sum += element; - ++gained_multiple_times_domain_length_count; - } - } - } - else { - if ( protein_length_stats_by_dc != null ) { - final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc ); - for( final double element : s.getData() ) { - gained_once_lengths_stats.addValue( element ); - } - } - if ( domain_number_stats_by_dc != null ) { - final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc ); - for( final double element : s.getData() ) { - gained_once_domain_count_stats.addValue( element ); - } - } - if ( domain_length_stats_by_domain != null ) { - final String[] ds = dc.split( "=" ); - final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] ); - final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] ); - for( final double element : s0.getData() ) { - gained_once_domain_length_sum += element; - ++gained_once_domain_length_count; - } - for( final double element : s1.getData() ) { - gained_once_domain_length_sum += element; - ++gained_once_domain_length_count; - } - } + final Writer out = new BufferedWriter( new FileWriter( outfile ) ); + final File per_node_go_mapped_domain_gain_loss_files_base_dir = createBaseDirForPerNodeDomainFiles( surfacing.BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES, + domain_combinations, + state, + filename ); + Writer per_node_go_mapped_domain_gain_loss_outfile_writer = null; + File per_node_go_mapped_domain_gain_loss_outfile = null; + int per_node_counter = 0; + out.write( "" ); + out.write( SurfacingConstants.NL ); + addHtmlHead( out, title_for_html ); + out.write( SurfacingConstants.NL ); + out.write( "" ); + out.write( SurfacingConstants.NL ); + out.write( "

" ); + out.write( SurfacingConstants.NL ); + out.write( title_for_html ); + out.write( SurfacingConstants.NL ); + out.write( "

" ); + out.write( SurfacingConstants.NL ); + out.write( "" ); + out.write( SurfacingConstants.NL ); + for( final String id : sorted_ids ) { + final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( id ); + if ( matcher.matches() ) { + continue; } + out.write( "" ); + out.write( "" ); + out.write( "" ); + out.write( SurfacingConstants.NL ); } - final Set histogram_keys = histogram.keySet(); - for( final Integer histogram_key : histogram_keys ) { - final int count = histogram.get( histogram_key ); - final StringBuilder dc = domain_lists.get( histogram_key ); - out_counts.write( histogram_key + "\t" + count + ForesterUtil.LINE_SEPARATOR ); - out_dc.write( histogram_key + "\t" + dc + ForesterUtil.LINE_SEPARATOR ); - out_dc_for_go_mapping.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR ); - final Object[] sorted = domain_lists_go.get( histogram_key ).toArray(); - Arrays.sort( sorted ); - for( final Object domain : sorted ) { - out_dc_for_go_mapping.write( domain + ForesterUtil.LINE_SEPARATOR ); + out.write( "
" ); + out.write( "" + id + "" ); + out.write( "
" ); + out.write( SurfacingConstants.NL ); + for( final String id : sorted_ids ) { + final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( id ); + if ( matcher.matches() ) { + continue; } - out_dc_for_go_mapping_unique.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR ); - for( final String domain : domain_lists_go_unique.get( histogram_key ) ) { - out_dc_for_go_mapping_unique.write( domain + ForesterUtil.LINE_SEPARATOR ); + out.write( SurfacingConstants.NL ); + out.write( "

" ); + out.write( "" + id + "" ); + writeTaxonomyLinks( out, id, tax_code_to_id_map ); + out.write( "

" ); + out.write( SurfacingConstants.NL ); + out.write( "" ); + out.write( SurfacingConstants.NL ); + out.write( "" ); + out.write( "" ); + out.write( "" ); + out.write( SurfacingConstants.NL ); + out.write( "" ); + out.write( SurfacingConstants.NL ); + per_node_counter = 0; + if ( matrix.getNumberOfCharacters() > 0 ) { + per_node_go_mapped_domain_gain_loss_outfile = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + id + suffix_for_per_node_events_file ); + SurfacingUtil.checkForOutputFileWriteability( per_node_go_mapped_domain_gain_loss_outfile ); + per_node_go_mapped_domain_gain_loss_outfile_writer = ForesterUtil + .createBufferedWriter( per_node_go_mapped_domain_gain_loss_outfile ); } - } - out_counts.close(); - out_dc.close(); - out_dc_for_go_mapping.close(); - out_dc_for_go_mapping_unique.close(); - final SortedMap lca_rank_counts = new TreeMap(); - final SortedMap lca_ancestor_species_counts = new TreeMap(); - for( final String dc : more_than_once ) { - final List nodes = new ArrayList(); - for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorExternalForward(); it.hasNext(); ) { - final PhylogenyNode n = it.next(); - if ( n.getNodeData().getBinaryCharacters().getGainedCharacters().contains( dc ) ) { - nodes.add( n ); - } + else { + per_node_go_mapped_domain_gain_loss_outfile = null; + per_node_go_mapped_domain_gain_loss_outfile_writer = null; } - for( int i = 0; i < ( nodes.size() - 1 ); ++i ) { - for( int j = i + 1; j < nodes.size(); ++j ) { - final PhylogenyNode lca = PhylogenyMethods.calculateLCA( nodes.get( i ), nodes.get( j ) ); - String rank = "unknown"; - if ( lca.getNodeData().isHasTaxonomy() - && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getRank() ) ) { - rank = lca.getNodeData().getTaxonomy().getRank(); + for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { + // Not nice: + // using null to indicate either UNCHANGED_PRESENT or GAIN. + if ( ( matrix.getState( id, c ) == state ) + || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) || ( matrix + .getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) ) ) { + final String character = matrix.getCharacter( c ); + String domain_0 = ""; + String domain_1 = ""; + if ( character.indexOf( BinaryDomainCombination.SEPARATOR ) > 0 ) { + final String[] s = character.split( BinaryDomainCombination.SEPARATOR ); + if ( s.length != 2 ) { + throw new AssertionError( "this should not have happened: unexpected format for domain combination: [" + + character + "]" ); + } + domain_0 = s[ 0 ]; + domain_1 = s[ 1 ]; } - addToCountMap( lca_rank_counts, rank ); - String lca_species; - if ( lca.getNodeData().isHasTaxonomy() - && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getScientificName() ) ) { - lca_species = lca.getNodeData().getTaxonomy().getScientificName(); + else { + domain_0 = character; } - else if ( lca.getNodeData().isHasTaxonomy() - && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getCommonName() ) ) { - lca_species = lca.getNodeData().getTaxonomy().getCommonName(); + writeDomainData( domain_id_to_go_ids_map, + go_id_to_term_map, + go_namespace_limit, + out, + domain_0, + domain_1, + prefix_for_html, + character_separator, + domain_id_to_secondary_features_maps, + null ); + all_pfams_encountered.add( domain_0 ); + if ( pfams_gained_or_lost != null ) { + pfams_gained_or_lost.add( domain_0 ); } - else { - lca_species = lca.getName(); + if ( !ForesterUtil.isEmpty( domain_1 ) ) { + all_pfams_encountered.add( domain_1 ); + if ( pfams_gained_or_lost != null ) { + pfams_gained_or_lost.add( domain_1 ); + } + } + if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) { + writeDomainsToIndividualFilePerTreeNode( per_node_go_mapped_domain_gain_loss_outfile_writer, + domain_0, + domain_1 ); + per_node_counter++; } - addToCountMap( lca_ancestor_species_counts, lca_species ); - } - } - } - final BufferedWriter out_for_rank_counts = new BufferedWriter( new FileWriter( outfilename_for_rank_counts ) ); - final BufferedWriter out_for_ancestor_species_counts = new BufferedWriter( new FileWriter( outfilename_for_ancestor_species_counts ) ); - ForesterUtil.map2writer( out_for_rank_counts, lca_rank_counts, "\t", ForesterUtil.LINE_SEPARATOR ); - ForesterUtil.map2writer( out_for_ancestor_species_counts, - lca_ancestor_species_counts, - "\t", - ForesterUtil.LINE_SEPARATOR ); - out_for_rank_counts.close(); - out_for_ancestor_species_counts.close(); - if ( !ForesterUtil.isEmpty( outfilename_for_protein_stats ) - && ( ( domain_length_stats_by_domain != null ) || ( protein_length_stats_by_dc != null ) || ( domain_number_stats_by_dc != null ) ) ) { - final BufferedWriter w = new BufferedWriter( new FileWriter( outfilename_for_protein_stats ) ); - w.write( "Domain Lengths: " ); - w.write( "\n" ); - if ( domain_length_stats_by_domain != null ) { - for( final Entry entry : dc_reapp_counts_to_domain_lengths_stats - .entrySet() ) { - w.write( entry.getKey().toString() ); - w.write( "\t" + entry.getValue().arithmeticMean() ); - w.write( "\t" + entry.getValue().median() ); - w.write( "\n" ); - } - } - w.flush(); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Protein Lengths: " ); - w.write( "\n" ); - if ( protein_length_stats_by_dc != null ) { - for( final Entry entry : dc_reapp_counts_to_protein_length_stats - .entrySet() ) { - w.write( entry.getKey().toString() ); - w.write( "\t" + entry.getValue().arithmeticMean() ); - w.write( "\t" + entry.getValue().median() ); - w.write( "\n" ); } } - w.flush(); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Number of domains: " ); - w.write( "\n" ); - if ( domain_number_stats_by_dc != null ) { - for( final Entry entry : dc_reapp_counts_to_domain_number_stats - .entrySet() ) { - w.write( entry.getKey().toString() ); - w.write( "\t" + entry.getValue().arithmeticMean() ); - w.write( "\t" + entry.getValue().median() ); - w.write( "\n" ); + if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) { + per_node_go_mapped_domain_gain_loss_outfile_writer.close(); + if ( per_node_counter < 1 ) { + per_node_go_mapped_domain_gain_loss_outfile.delete(); } + per_node_counter = 0; } - w.flush(); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained once, domain lengths:" ); - w.write( "\n" ); - w.write( "N: " + gained_once_domain_length_count ); - w.write( "\n" ); - w.write( "Avg: " + ( ( double ) gained_once_domain_length_sum / gained_once_domain_length_count ) ); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained multiple times, domain lengths:" ); - w.write( "\n" ); - w.write( "N: " + gained_multiple_times_domain_length_count ); - w.write( "\n" ); - w.write( "Avg: " - + ( ( double ) gained_multiple_times_domain_length_sum / gained_multiple_times_domain_length_count ) ); - w.write( "\n" ); - w.write( "\n" ); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained once, protein lengths:" ); - w.write( "\n" ); - w.write( gained_once_lengths_stats.toString() ); - gained_once_lengths_stats = null; - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained once, domain counts:" ); - w.write( "\n" ); - w.write( gained_once_domain_count_stats.toString() ); - gained_once_domain_count_stats = null; - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained multiple times, protein lengths:" ); - w.write( "\n" ); - w.write( gained_multiple_times_lengths_stats.toString() ); - gained_multiple_times_lengths_stats = null; - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained multiple times, domain counts:" ); - w.write( "\n" ); - w.write( gained_multiple_times_domain_count_stats.toString() ); - w.flush(); - w.close(); - } + out.write( "
" ); + out.write( "Pfam domain(s)" ); + out.write( "" ); + out.write( "GO term acc" ); + out.write( "" ); + out.write( "GO term" ); + out.write( "" ); + out.write( "GO namespace" ); + out.write( "
" ); + out.write( SurfacingConstants.NL ); + out.write( "
" ); + out.write( SurfacingConstants.NL ); + } // for( final String id : sorted_ids ) { + out.write( "" ); + out.write( SurfacingConstants.NL ); + out.write( "" ); + out.write( SurfacingConstants.NL ); + out.flush(); + out.close(); } catch ( final IOException e ) { - ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); - } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch counts to [" - + outfilename_for_counts + "]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch lists to [" - + outfilename_for_dc + "]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, - "Wrote independent domain combination gains fitch lists to (for GO mapping) [" - + outfilename_for_dc_for_go_mapping + "]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, - "Wrote independent domain combination gains fitch lists to (for GO mapping, unique) [" - + outfilename_for_dc_for_go_mapping_unique + "]" ); - } - - private static SortedSet collectAllDomainsChangedOnSubtree( final PhylogenyNode subtree_root, - final boolean get_gains ) { - final SortedSet domains = new TreeSet(); - for( final PhylogenyNode descendant : PhylogenyMethods.getAllDescendants( subtree_root ) ) { - final BinaryCharacters chars = descendant.getNodeData().getBinaryCharacters(); - if ( get_gains ) { - domains.addAll( chars.getGainedCharacters() ); - } - else { - domains.addAll( chars.getLostCharacters() ); - } + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); } - return domains; + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters detailed HTML list: \"" + filename + "\"" ); } - private static File createBaseDirForPerNodeDomainFiles( final String base_dir, - final boolean domain_combinations, - final CharacterStateMatrix.GainLossStates state, - final String outfile ) { - File per_node_go_mapped_domain_gain_loss_files_base_dir = new File( new File( outfile ).getParent() - + ForesterUtil.FILE_SEPARATOR + base_dir ); - if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { - per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); - } - if ( domain_combinations ) { - per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "DC" ); - } - else { - per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "DOMAINS" ); - } - if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { - per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); - } - if ( state == GainLossStates.GAIN ) { - per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "GAINS" ); - } - else if ( state == GainLossStates.LOSS ) { - per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "LOSSES" ); - } - else { - per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "PRESENT" ); - } - if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { - per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); + public static void writeDomainCombinationsCountsFile( final String[][] input_file_properties, + final File output_dir, + final Writer per_genome_domain_promiscuity_statistics_writer, + final GenomeWideCombinableDomains gwcd, + final int i, + final GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder dc_sort_order ) { + File dc_outfile = new File( input_file_properties[ i ][ 1 ] + + surfacing.DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX ); + if ( output_dir != null ) { + dc_outfile = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile ); } - return per_node_go_mapped_domain_gain_loss_files_base_dir; - } - - private static SortedSet createSetOfAllBinaryDomainCombinationsPerGenome( final GenomeWideCombinableDomains gwcd ) { - final SortedMap cds = gwcd.getAllCombinableDomainsIds(); - final SortedSet binary_combinations = new TreeSet(); - for( final String domain_id : cds.keySet() ) { - final CombinableDomains cd = cds.get( domain_id ); - binary_combinations.addAll( cd.toBinaryDomainCombinations() ); + checkForOutputFileWriteability( dc_outfile ); + try { + final BufferedWriter out = new BufferedWriter( new FileWriter( dc_outfile ) ); + out.write( gwcd.toStringBuilder( dc_sort_order ).toString() ); + out.close(); } - return binary_combinations; - } - - private static List splitDomainCombination( final String dc ) { - final String[] s = dc.split( "=" ); - if ( s.length != 2 ) { - ForesterUtil.printErrorMessage( surfacing.PRG_NAME, "Stringyfied domain combination has illegal format: " - + dc ); - System.exit( -1 ); + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); } - final List l = new ArrayList( 2 ); - l.add( s[ 0 ] ); - l.add( s[ 1 ] ); - return l; - } - - private static void writeAllEncounteredPfamsToFile( final Map> domain_id_to_go_ids_map, - final Map go_id_to_term_map, - final String outfile_name, - final SortedSet all_pfams_encountered ) { - final File all_pfams_encountered_file = new File( outfile_name + surfacing.ALL_PFAMS_ENCOUNTERED_SUFFIX ); - final File all_pfams_encountered_with_go_annotation_file = new File( outfile_name - + surfacing.ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX ); - final File encountered_pfams_summary_file = new File( outfile_name + surfacing.ENCOUNTERED_PFAMS_SUMMARY_SUFFIX ); - int biological_process_counter = 0; - int cellular_component_counter = 0; - int molecular_function_counter = 0; - int pfams_with_mappings_counter = 0; - int pfams_without_mappings_counter = 0; - int pfams_without_mappings_to_bp_or_mf_counter = 0; - int pfams_with_mappings_to_bp_or_mf_counter = 0; + final DescriptiveStatistics stats = gwcd.getPerGenomeDomainPromiscuityStatistics(); try { - final Writer all_pfams_encountered_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_file ) ); - final Writer all_pfams_encountered_with_go_annotation_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_with_go_annotation_file ) ); - final Writer summary_writer = new BufferedWriter( new FileWriter( encountered_pfams_summary_file ) ); - summary_writer.write( "# Pfam to GO mapping summary" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Actual summary is at the end of this file." ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Encountered Pfams without a GO mapping:" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - for( final String pfam : all_pfams_encountered ) { - all_pfams_encountered_writer.write( pfam ); - all_pfams_encountered_writer.write( ForesterUtil.LINE_SEPARATOR ); - final String domain_id = new String( pfam ); - if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) { - ++pfams_with_mappings_counter; - all_pfams_encountered_with_go_annotation_writer.write( pfam ); - all_pfams_encountered_with_go_annotation_writer.write( ForesterUtil.LINE_SEPARATOR ); - final List go_ids = domain_id_to_go_ids_map.get( domain_id ); - boolean maps_to_bp = false; - boolean maps_to_cc = false; - boolean maps_to_mf = false; - for( final GoId go_id : go_ids ) { - final GoTerm go_term = go_id_to_term_map.get( go_id ); - if ( go_term.getGoNameSpace().isBiologicalProcess() ) { - maps_to_bp = true; - } - else if ( go_term.getGoNameSpace().isCellularComponent() ) { - maps_to_cc = true; - } - else if ( go_term.getGoNameSpace().isMolecularFunction() ) { - maps_to_mf = true; - } - } - if ( maps_to_bp ) { - ++biological_process_counter; - } - if ( maps_to_cc ) { - ++cellular_component_counter; - } - if ( maps_to_mf ) { - ++molecular_function_counter; - } - if ( maps_to_bp || maps_to_mf ) { - ++pfams_with_mappings_to_bp_or_mf_counter; - } - else { - ++pfams_without_mappings_to_bp_or_mf_counter; - } - } - else { - ++pfams_without_mappings_to_bp_or_mf_counter; - ++pfams_without_mappings_counter; - summary_writer.write( pfam ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - } + per_genome_domain_promiscuity_statistics_writer.write( input_file_properties[ i ][ 1 ] + "\t" ); + per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats.arithmeticMean() ) + "\t" ); + if ( stats.getN() < 2 ) { + per_genome_domain_promiscuity_statistics_writer.write( "n/a" + "\t" ); } - all_pfams_encountered_writer.close(); - all_pfams_encountered_with_go_annotation_writer.close(); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + all_pfams_encountered.size() - + "] encountered Pfams to: \"" + all_pfams_encountered_file + "\"" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + pfams_with_mappings_counter - + "] encountered Pfams with GO mappings to: \"" + all_pfams_encountered_with_go_annotation_file - + "\"" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote summary (including all [" - + pfams_without_mappings_counter + "] encountered Pfams without GO mappings) to: \"" - + encountered_pfams_summary_file + "\"" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Sum of Pfams encountered : " - + all_pfams_encountered.size() ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without a mapping : " - + pfams_without_mappings_counter + " [" - + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without mapping to proc. or func. : " - + pfams_without_mappings_to_bp_or_mf_counter + " [" - + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping : " - + pfams_with_mappings_counter + " [" - + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping to proc. or func. : " - + pfams_with_mappings_to_bp_or_mf_counter + " [" - + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to biological process: " - + biological_process_counter + " [" - + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to molecular function: " - + molecular_function_counter + " [" - + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to cellular component: " - + cellular_component_counter + " [" - + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Sum of Pfams encountered : " + all_pfams_encountered.size() ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Pfams without a mapping : " + pfams_without_mappings_counter - + " [" + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Pfams without mapping to proc. or func. : " - + pfams_without_mappings_to_bp_or_mf_counter + " [" - + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Pfams with a mapping : " + pfams_with_mappings_counter + " [" - + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Pfams with a mapping to proc. or func. : " - + pfams_with_mappings_to_bp_or_mf_counter + " [" - + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Pfams with mapping to biological process: " + biological_process_counter + " [" - + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Pfams with mapping to molecular function: " + molecular_function_counter + " [" - + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Pfams with mapping to cellular component: " + cellular_component_counter + " [" - + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.close(); + else { + per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats + .sampleStandardDeviation() ) + "\t" ); + } + per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats.median() ) + "\t" ); + per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMin() + "\t" ); + per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMax() + "\t" ); + per_genome_domain_promiscuity_statistics_writer.write( stats.getN() + "\t" ); + final SortedSet mpds = gwcd.getMostPromiscuosDomain(); + for( final String mpd : mpds ) { + per_genome_domain_promiscuity_statistics_writer.write( mpd + " " ); + } + per_genome_domain_promiscuity_statistics_writer.write( ForesterUtil.LINE_SEPARATOR ); } catch ( final IOException e ) { - ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + } + if ( input_file_properties[ i ].length == 3 ) { + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \"" + + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", " + + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile + "\"" ); + } + else { + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \"" + + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ") to: \"" + + dc_outfile + "\"" ); } } @@ -2492,6 +2900,126 @@ public final class SurfacingUtil { out.write( "" ); } + public static void writeDomainSimilaritiesToFile( final StringBuilder html_desc, + final StringBuilder html_title, + final Writer simple_tab_writer, + final Writer single_writer, + Map split_writers, + final SortedSet similarities, + final boolean treat_as_binary, + final List species_order, + final PrintableDomainSimilarity.PRINT_OPTION print_option, + final DomainSimilarity.DomainSimilarityScoring scoring, + final boolean verbose, + final Map tax_code_to_id_map, + Phylogeny phy ) throws IOException { + if ( ( single_writer != null ) && ( ( split_writers == null ) || split_writers.isEmpty() ) ) { + split_writers = new HashMap(); + split_writers.put( '_', single_writer ); + } + switch ( print_option ) { + case SIMPLE_TAB_DELIMITED: + break; + case HTML: + for( final Character key : split_writers.keySet() ) { + final Writer w = split_writers.get( key ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + if ( key != '_' ) { + addHtmlHead( w, "DC analysis (" + html_title + ") " + key.toString().toUpperCase() ); + } + else { + addHtmlHead( w, "DC analysis (" + html_title + ")" ); + } + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( html_desc.toString() ); + w.write( SurfacingConstants.NL ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + } + break; + } + // + for( final DomainSimilarity similarity : similarities ) { + if ( ( species_order != null ) && !species_order.isEmpty() ) { + ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order ); + } + if ( single_writer != null ) { + single_writer.write( "" ); + single_writer.write( SurfacingConstants.NL ); + } + else { + Writer local_writer = split_writers.get( ( similarity.getDomainId().charAt( 0 ) + "" ).toLowerCase() + .charAt( 0 ) ); + if ( local_writer == null ) { + local_writer = split_writers.get( '0' ); + } + local_writer.write( "" ); + local_writer.write( SurfacingConstants.NL ); + } + } + for( final Writer w : split_writers.values() ) { + w.write( "
Domains:
" + + similarity.getDomainId() + "
" + + similarity.getDomainId() + "
" ); + w.write( SurfacingConstants.NL ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + } + // + for( final DomainSimilarity similarity : similarities ) { + if ( ( species_order != null ) && !species_order.isEmpty() ) { + ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order ); + } + if ( simple_tab_writer != null ) { + simple_tab_writer.write( similarity.toStringBuffer( PRINT_OPTION.SIMPLE_TAB_DELIMITED, + tax_code_to_id_map, + null ).toString() ); + } + if ( single_writer != null ) { + single_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map, phy ).toString() ); + single_writer.write( SurfacingConstants.NL ); + } + else { + Writer local_writer = split_writers.get( ( similarity.getDomainId().charAt( 0 ) + "" ).toLowerCase() + .charAt( 0 ) ); + if ( local_writer == null ) { + local_writer = split_writers.get( '0' ); + } + local_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map, phy ).toString() ); + local_writer.write( SurfacingConstants.NL ); + } + } + switch ( print_option ) { + case HTML: + for( final Writer w : split_writers.values() ) { + w.write( SurfacingConstants.NL ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + } + break; + } + for( final Writer w : split_writers.values() ) { + w.close(); + } + } + private static void writeDomainsToIndividualFilePerTreeNode( final Writer individual_files_writer, final String domain_0, final String domain_1 ) throws IOException { @@ -2503,6 +3031,40 @@ public final class SurfacingUtil { } } + public static void writeMatrixToFile( final CharacterStateMatrix matrix, + final String filename, + final Format format ) { + final File outfile = new File( filename ); + checkForOutputFileWriteability( outfile ); + try { + final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); + matrix.toWriter( out, format ); + out.flush(); + out.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote matrix: \"" + filename + "\"" ); + } + + public static void writeMatrixToFile( final File matrix_outfile, final List matrices ) { + checkForOutputFileWriteability( matrix_outfile ); + try { + final BufferedWriter out = new BufferedWriter( new FileWriter( matrix_outfile ) ); + for( final DistanceMatrix distance_matrix : matrices ) { + out.write( distance_matrix.toStringBuffer( DistanceMatrix.Format.PHYLIP ).toString() ); + out.write( ForesterUtil.LINE_SEPARATOR ); + out.flush(); + } + out.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + matrix_outfile + "\"" ); + } + private static void writePfamsToFile( final String outfile_name, final SortedSet pfams ) { try { final Writer writer = new BufferedWriter( new FileWriter( new File( outfile_name ) ) ); @@ -2519,6 +3081,88 @@ public final class SurfacingUtil { } } + public static void writePhylogenyToFile( final Phylogeny phylogeny, final String filename ) { + final PhylogenyWriter writer = new PhylogenyWriter(); + try { + writer.toPhyloXML( new File( filename ), phylogeny, 1 ); + } + catch ( final IOException e ) { + ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "failed to write phylogeny to \"" + filename + "\": " + + e ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote phylogeny to \"" + filename + "\"" ); + } + + public static void writePresentToNexus( final File output_file, + final File positive_filter_file, + final SortedSet filter, + final List gwcd_list ) { + try { + writeMatrixToFile( DomainParsimonyCalculator.createMatrixOfDomainPresenceOrAbsence( gwcd_list, + positive_filter_file == null ? null + : filter ), + output_file + surfacing.DOMAINS_PRESENT_NEXUS, + Format.NEXUS_BINARY ); + writeMatrixToFile( DomainParsimonyCalculator.createMatrixOfBinaryDomainCombinationPresenceOrAbsence( gwcd_list ), + output_file + surfacing.BDC_PRESENT_NEXUS, + Format.NEXUS_BINARY ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); + } + } + + public static void writeProteinListsForAllSpecies( final File output_dir, + final SortedMap> protein_lists_per_species, + final List gwcd_list, + final double domain_e_cutoff ) { + final SortedSet all_domains = new TreeSet(); + for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + all_domains.addAll( gwcd.getAllDomainIds() ); + } + for( final String domain : all_domains ) { + final File out = new File( output_dir + ForesterUtil.FILE_SEPARATOR + domain + surfacing.SEQ_EXTRACT_SUFFIX ); + checkForOutputFileWriteability( out ); + try { + final Writer proteins_file_writer = new BufferedWriter( new FileWriter( out ) ); + extractProteinNames( protein_lists_per_species, + domain, + proteins_file_writer, + "\t", + surfacing.LIMIT_SPEC_FOR_PROT_EX, + domain_e_cutoff ); + proteins_file_writer.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote proteins list to \"" + out + "\"" ); + } + } + + public static void writeTaxonomyLinks( final Writer writer, + final String species, + final Map tax_code_to_id_map ) throws IOException { + if ( ( species.length() > 1 ) && ( species.indexOf( '_' ) < 1 ) ) { + writer.write( " [" ); + if ( ( tax_code_to_id_map != null ) && tax_code_to_id_map.containsKey( species ) ) { + writer.write( "uniprot" ); + } + else { + writer.write( "eol" ); + writer.write( "|" ); + writer.write( "scholar" ); + writer.write( "|" ); + writer.write( "google" ); + } + writer.write( "]" ); + } + } + private static void writeToNexus( final String outfile_name, final CharacterStateMatrix matrix, final Phylogeny phylogeny ) { @@ -2556,75 +3200,7 @@ public final class SurfacingUtil { phylogeny ); } - final static class DomainComparator implements Comparator { - - final private boolean _ascending; - - public DomainComparator( final boolean ascending ) { - _ascending = ascending; - } - - @Override - public final int compare( final Domain d0, final Domain d1 ) { - if ( d0.getFrom() < d1.getFrom() ) { - return _ascending ? -1 : 1; - } - else if ( d0.getFrom() > d1.getFrom() ) { - return _ascending ? 1 : -1; - } - return 0; - } - } - - final static Color getColorForTaxCode( final String tax ) { - if ( tax.equals( "Deuterostomia" ) ) { - return ForesterUtil.DEUTEROSTOMIA_COLOR; - } - else if ( tax.equals( "Protostomia" ) ) { - return ForesterUtil.PROTOSTOMIA_COLOR; - } - else if ( tax.equals( "Metazoa" ) ) { - return ForesterUtil.METAZOA_COLOR; - } - else if ( tax.equals( "Holozoa" ) ) { - return ForesterUtil.HOLOZOA_COLOR; - } - else if ( tax.equals( "Fungi" ) ) { - return ForesterUtil.FUNGI_COLOR; - } - else if ( tax.equals( "Holomycota" ) ) { - return ForesterUtil.HOLOMYCOTA_COLOR; - } - else if ( tax.equals( "Amoebozoa" ) ) { - return ForesterUtil.AMOEBOZOA_COLOR; - } - else if ( tax.equals( "Viridiplantae" ) ) { - return ForesterUtil.VIRIDPLANTAE_COLOR; - } - else if ( tax.equals( "Rhodophytaa" ) ) { - return ForesterUtil.RHODOPHYTA_COLOR; - } - else if ( tax.startsWith( "Hacrobia" ) ) { - return ForesterUtil.HACROBIA_COLOR; - } - else if ( tax.equals( "Stramenopiles" ) ) { - return ForesterUtil.STRAMENOPILES_COLOR; - } - else if ( tax.equals( "Alveolata" ) ) { - return ForesterUtil.ALVEOLATA_COLOR; - } - else if ( tax.equals( "Rhizaria" ) ) { - return ForesterUtil.RHIZARIA_COLOR; - } - else if ( tax.equals( "Excavata" ) ) { - return ForesterUtil.EXCAVATA_COLOR; - } - else if ( tax.equals( "Archaea" ) ) { - return ForesterUtil.ARCHAEA_COLOR; - } - else if ( tax.equals( "Bacteria" ) ) { - return ForesterUtil.BACTERIA_COLOR; - } - return null; + private SurfacingUtil() { + // Hidden constructor. } } diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index 9326fd1..68777de 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -127,7 +127,7 @@ import org.forester.ws.wabi.TxSearch.TAX_RANK; @SuppressWarnings( "unused") public final class Test { - private final static boolean PERFORM_DB_TESTS = true; + private final static boolean PERFORM_DB_TESTS = false; private final static double ZERO_DIFF = 1.0E-9; private final static String PATH_TO_TEST_DATA = System.getProperty( "user.dir" ) + ForesterUtil.getFileSeparator() + "test_data" @@ -759,6 +759,24 @@ public final class Test { System.out.println( "failed." ); failed++; } + + + + System.out.print( "Tree copy: " ); + if ( Test.testTreeCopy() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + + + + + + System.out.print( "Basic tree methods: " ); if ( Test.testBasicTreeMethods() ) { System.out.println( "OK." ); @@ -2764,6 +2782,43 @@ public final class Test { } return true; } + + + private static boolean testTreeCopy() { + try { + final String str_0 = "((((a,b),c),d)[&&NHX:S=lizards],e[&&NHX:S=reptiles])"; + final Phylogeny t0 = Phylogeny.createInstanceFromNhxString( str_0 ); + final Phylogeny t1 = t0.copy(); + if ( !t1.toNewHampshireX().equals( t0.toNewHampshireX() ) ) { + return false; + } + if ( !t1.toNewHampshireX().equals( str_0 )) { + return false; + } + t0.deleteSubtree( t0.getNode( "c" ), true ); + t0.deleteSubtree( t0.getNode( "a" ), true ); + t0.deleteSubtree( t0.getNode( "e" ), true ); + if ( !t0.toNewHampshireX().equals( "(b,d)[&&NHX:S=lizards]" )) { + return false; + } + + if ( !t1.toNewHampshireX().equals( str_0 )) { + return false; + } + t0.deleteSubtree( t0.getNode( "b" ), true ); + t0.deleteSubtree( t0.getNode( "d" ), true ); + if ( !t1.toNewHampshireX().equals( str_0 )) { + return false; + } + + } + catch ( final Exception e ) { + e.printStackTrace(); + return false; + } + return true; + } + private static boolean testCreateBalancedPhylogeny() { try { diff --git a/forester/java/src/org/forester/util/ForesterUtil.java b/forester/java/src/org/forester/util/ForesterUtil.java index 4b8da89..8036c2d 100644 --- a/forester/java/src/org/forester/util/ForesterUtil.java +++ b/forester/java/src/org/forester/util/ForesterUtil.java @@ -96,15 +96,17 @@ public final class ForesterUtil { public final static Color PROTOSTOMIA_COLOR = new Color( 204, 0, 0 ); public final static Color METAZOA_COLOR = new Color( 204, 0, 102 ); public final static Color HOLOZOA_COLOR = new Color( 127, 0, 255 ); - public final static Color FUNGI_COLOR = new Color( 255, 128, 0 ); + public final static Color FUNGI_COLOR = new Color( 255, 153, 0 ); public final static Color HOLOMYCOTA_COLOR = new Color( 204, 102, 0 ); public final static Color AMOEBOZOA_COLOR = new Color( 255, 0, 255 ); public final static Color VIRIDPLANTAE_COLOR = new Color( 0, 255, 0 ); public final static Color RHODOPHYTA_COLOR = new Color( 0, 153, 76 ); public final static Color HACROBIA_COLOR = new Color( 0, 102, 51 ); + public final static Color GLAUCOPHYTA_COLOR = new Color( 0, 102, 51 ); public final static Color STRAMENOPILES_COLOR = new Color( 0, 0, 255 ); public final static Color ALVEOLATA_COLOR = new Color( 0, 128, 255 ); public final static Color RHIZARIA_COLOR = new Color( 0, 255, 255 ); + public static final Color APUSOZOA_COLOR = new Color( 204, 255, 255 ); public final static Color EXCAVATA_COLOR = new Color( 204, 204, 0 ); public final static Color ARCHAEA_COLOR = new Color( 160, 160, 160 ); public final static Color BACTERIA_COLOR = new Color( 64, 64, 64 ); @@ -1221,4 +1223,62 @@ public final class ForesterUtil { System.err.println(); System.exit( -1 ); } + + public final static Color obtainColorDependingOnTaxonomyGroup( final String tax ) { + if ( tax.equalsIgnoreCase( "deuterostomia" ) ) { + return DEUTEROSTOMIA_COLOR; + } + else if ( tax.equalsIgnoreCase( "protostomia" ) ) { + return PROTOSTOMIA_COLOR; + } + else if ( tax.equalsIgnoreCase( "metazoa" ) ) { + return METAZOA_COLOR; + } + else if ( tax.equalsIgnoreCase( "holozoa" ) ) { + return HOLOZOA_COLOR; + } + else if ( tax.equalsIgnoreCase( "fungi" ) ) { + return FUNGI_COLOR; + } + else if ( tax.equalsIgnoreCase( "holomycota" ) ) { + return HOLOMYCOTA_COLOR; + } + else if ( tax.equalsIgnoreCase( "amoebozoa" ) ) { + return AMOEBOZOA_COLOR; + } + else if ( tax.equalsIgnoreCase( "viridiplantae" ) ) { + return VIRIDPLANTAE_COLOR; + } + else if ( tax.equalsIgnoreCase( "rhodophyta" ) ) { + return RHODOPHYTA_COLOR; + } + else if ( tax.toLowerCase().startsWith( "hacrobia" ) ) { + return HACROBIA_COLOR; + } + else if ( tax.equalsIgnoreCase( "glaucocystophyceae" ) || tax.equalsIgnoreCase( "glaucophyta" ) ) { + return GLAUCOPHYTA_COLOR; + } + else if ( tax.equalsIgnoreCase( "stramenopiles" ) ) { + return STRAMENOPILES_COLOR; + } + else if ( tax.equalsIgnoreCase( "alveolata" ) ) { + return ALVEOLATA_COLOR; + } + else if ( tax.equalsIgnoreCase( "rhizaria" ) ) { + return RHIZARIA_COLOR; + } + else if ( tax.equalsIgnoreCase( "excavata" ) ) { + return EXCAVATA_COLOR; + } + else if ( tax.equalsIgnoreCase( "apusozoa" ) ) { + return APUSOZOA_COLOR; + } + else if ( tax.equalsIgnoreCase( "archaea" ) ) { + return ARCHAEA_COLOR; + } + else if ( tax.equalsIgnoreCase( "bacteria" ) ) { + return BACTERIA_COLOR; + } + return null; + } } -- 1.7.10.2