From 6ec28d1a3a137c14c9647aec7b9e40b77fe968bf Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Fri, 28 Jun 2013 23:32:32 +0000 Subject: [PATCH] inprogress --- .../src/org/forester/application/surfacing.java | 37 +- .../src/org/forester/go/etc/MetaOntologizer.java | 2 +- .../org/forester/surfacing/DomainSimilarity.java | 20 +- .../surfacing/PairwiseGenomeComparator.java | 6 +- .../surfacing/PrintableDomainSimilarity.java | 49 +- .../org/forester/surfacing/SurfacingConstants.java | 18 +- .../src/org/forester/surfacing/SurfacingUtil.java | 2606 ++++++++++---------- 7 files changed, 1386 insertions(+), 1352 deletions(-) diff --git a/forester/java/src/org/forester/application/surfacing.java b/forester/java/src/org/forester/application/surfacing.java index f2f0c9e..dbba361 100644 --- a/forester/java/src/org/forester/application/surfacing.java +++ b/forester/java/src/org/forester/application/surfacing.java @@ -229,8 +229,8 @@ public class surfacing { final static private String INPUT_GENOMES_FILE_OPTION = "genomes"; final static private String INPUT_SPECIES_TREE_OPTION = "species_tree"; final static private String SEQ_EXTRACT_OPTION = "prot_extract"; - final static private String PRG_VERSION = "2.260"; - final static private String PRG_DATE = "130721"; + final static private String PRG_VERSION = "2.270"; + final static private String PRG_DATE = "130628"; final static private String E_MAIL = "czmasek@burnham.org"; final static private String WWW = "www.phylosoft.org/forester/applications/surfacing"; final static private boolean IGNORE_DUFS_DEFAULT = true; @@ -269,6 +269,7 @@ public class surfacing { private static final String LOG_FILE_SUFFIX = "_log.txt"; private static final String DATA_FILE_SUFFIX = "_domain_combination_data.txt"; private static final String DATA_FILE_DESC = "#SPECIES\tPRTEIN_ID\tN_TERM_DOMAIN\tC_TERM_DOMAIN\tN_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tC_TERM_DOMAIN_PER_DOMAIN_E_VALUE\tN_TERM_DOMAIN_COUNTS_PER_PROTEIN\tC_TERM_DOMAIN_COUNTS_PER_PROTEIN"; + private static final String WRITE_TO_NEXUS_OPTION = "nexus"; private static final INDIVIDUAL_SCORE_CUTOFF INDIVIDUAL_SCORE_CUTOFF_DEFAULT = INDIVIDUAL_SCORE_CUTOFF.FULL_SEQUENCE; //TODO look at me! change? public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_counts.txt"; public static final String INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX = "_indep_dc_gains_fitch_lists.txt"; @@ -622,6 +623,7 @@ public class surfacing { allowed_options.add( DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS ); allowed_options.add( OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS ); allowed_options.add( CONSIDER_DOMAIN_COMBINATION_DIRECTEDNESS_AND_ADJACENCY ); + allowed_options.add( WRITE_TO_NEXUS_OPTION ); boolean ignore_dufs = surfacing.IGNORE_DUFS_DEFAULT; boolean ignore_combination_with_same = surfacing.IGNORE_COMBINATION_WITH_SAME_DEFAULLT; double e_value_max = surfacing.MAX_E_VALUE_DEFAULT; @@ -630,6 +632,10 @@ public class surfacing { if ( dissallowed_options.length() > 0 ) { ForesterUtil.fatalError( surfacing.PRG_NAME, "unknown option(s): " + dissallowed_options ); } + boolean write_to_nexus = false; + if ( cla.isOptionSet( WRITE_TO_NEXUS_OPTION ) ) { + write_to_nexus = true; + } boolean output_binary_domain_combinationsfor_graph_analysis = false; if ( cla.isOptionSet( DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS ) ) { output_binary_domain_combinationsfor_graph_analysis = true; @@ -1020,12 +1026,6 @@ public class surfacing { } } final String[][] input_file_properties = processInputGenomesFile( input_genomes_file ); - // for( final String[] input_file_propertie : input_file_properties ) { - // for( final String element : input_file_propertie ) { - // System.out.print( element + " " ); - // } - // System.out.println(); - // } final int number_of_genomes = input_file_properties.length; if ( number_of_genomes < 2 ) { ForesterUtil.fatalError( surfacing.PRG_NAME, "cannot analyze less than two files" ); @@ -1399,6 +1399,7 @@ public class surfacing { + ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) + "" + nl ); } + System.out.println( "Write to Nexus files : " + write_to_nexus ); System.out.print( "Domain counts sort order : " ); switch ( dc_sort_order ) { case ALPHABETICAL_KEY_ID: @@ -2094,7 +2095,7 @@ public class surfacing { go_annotation_output, go_id_to_term_map, go_namespace_limit ); - DescriptiveStatistics pw_stats = null; + final Map tax_code_to_id_map = SurfacingUtil.createTaxCodeToIdMap( intrees[ 0 ] ); try { String my_outfile = output_file.toString(); Map split_writers = null; @@ -2125,7 +2126,7 @@ public class surfacing { + new java.text.SimpleDateFormat( "yyyy.MM.dd HH:mm:ss" ).format( new java.util.Date() ) + "" + nl ); html_desc.append( "" + nl ); - pw_stats = SurfacingUtil + final DescriptiveStatistics pw_stats = SurfacingUtil .writeDomainSimilaritiesToFile( html_desc, new StringBuilder( number_of_genomes + " genomes" ), writer, @@ -2136,7 +2137,8 @@ public class surfacing { domain_similarity_print_option, domain_similarity_sort_field, scoring, - true ); + true, + tax_code_to_id_map ); ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote main output (includes domain similarities) to: \"" + ( out_dir == null ? my_outfile : out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile ) + "\"" ); } @@ -2145,7 +2147,6 @@ public class surfacing { + e.getMessage() + "]" ); } System.out.println(); - // values_for_all_scores_histogram = pw_stats.getDataAsDoubleArray(); final Species[] species = new Species[ number_of_genomes ]; for( int i = 0; i < number_of_genomes; ++i ) { species[ i ] = new BasicSpecies( input_file_properties[ i ][ 1 ] ); @@ -2173,7 +2174,8 @@ public class surfacing { surfacing.PAIRWISE_DOMAIN_COMPARISONS_PREFIX, surfacing.PRG_NAME, out_dir, - write_pwc_files ); + write_pwc_files, + tax_code_to_id_map ); String matrix_output_file = new String( output_file.toString() ); if ( matrix_output_file.indexOf( '.' ) > 1 ) { matrix_output_file = matrix_output_file.substring( 0, matrix_output_file.indexOf( '.' ) ); @@ -2237,7 +2239,9 @@ public class surfacing { if ( ( out_dir != null ) && ( !perform_pwc ) ) { output_file = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file ); } - writePresentToNexus( output_file, positive_filter_file, filter, gwcd_list ); + if ( write_to_nexus ) { + writePresentToNexus( output_file, positive_filter_file, filter, gwcd_list ); + } if ( ( ( intrees != null ) && ( intrees.length > 0 ) ) && ( number_of_genomes > 2 ) ) { final StringBuilder parameters_sb = createParametersAsString( ignore_dufs, e_value_max, @@ -2272,7 +2276,9 @@ public class surfacing { dc_type, protein_length_stats_by_dc, domain_number_stats_by_dc, - domain_length_stats_by_domain ); + domain_length_stats_by_domain, + tax_code_to_id_map, + write_to_nexus ); // Listing of all domain combinations gained is only done if only one input tree is used. if ( ( domain_id_to_secondary_features_maps != null ) && ( domain_id_to_secondary_features_maps.length > 0 ) ) { @@ -2583,6 +2589,7 @@ public class surfacing { System.out.println( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_OPTIONS + ": to output all proteins per domain" ); System.out.println( surfacing.OUTPUT_LIST_OF_ALL_PROTEINS_PER_DOMAIN_E_VALUE_OPTION + ": e value max per domain for output of all proteins per domain" ); + System.out.println( surfacing.WRITE_TO_NEXUS_OPTION + ": to output in Nexus format" ); System.out.println(); System.out.println( "Example 1: java -Xms128m -Xmx512m -cp path/to/forester.jar" + " org.forester.application.surfacing p2g=pfam2go_2012_02_07.txt -dufs -cos=Pfam_260_NC1" diff --git a/forester/java/src/org/forester/go/etc/MetaOntologizer.java b/forester/java/src/org/forester/go/etc/MetaOntologizer.java index 11afb54..32f841d 100644 --- a/forester/java/src/org/forester/go/etc/MetaOntologizer.java +++ b/forester/java/src/org/forester/go/etc/MetaOntologizer.java @@ -520,7 +520,7 @@ public class MetaOntologizer { writer.write( "" ); writer.write( "

" ); writer.write( species ); - SurfacingUtil.writeTaxonomyLinks( writer, species ); + SurfacingUtil.writeTaxonomyLinks( writer, species, null ); writer.write( "

" ); writer.write( "" ); writer.write( ForesterUtil.LINE_SEPARATOR ); diff --git a/forester/java/src/org/forester/surfacing/DomainSimilarity.java b/forester/java/src/org/forester/surfacing/DomainSimilarity.java index b9e9860..b6ae576 100644 --- a/forester/java/src/org/forester/surfacing/DomainSimilarity.java +++ b/forester/java/src/org/forester/surfacing/DomainSimilarity.java @@ -26,11 +26,13 @@ package org.forester.surfacing; +import java.util.Map; import java.util.SortedMap; import java.util.SortedSet; import org.forester.protein.DomainId; import org.forester.species.Species; +import org.forester.surfacing.PrintableDomainSimilarity.PRINT_OPTION; /* * This is to represent a measure of similarity between two or more domains from @@ -38,6 +40,14 @@ import org.forester.species.Species; */ public interface DomainSimilarity extends Comparable { + static public enum DomainSimilarityScoring { + DOMAINS, PROTEINS, COMBINATIONS; + } + + public static enum DomainSimilaritySortField { + MIN, MAX, SD, MEAN, ABS_MAX_COUNTS_DIFFERENCE, MAX_COUNTS_DIFFERENCE, MAX_DIFFERENCE, SPECIES_COUNT, DOMAIN_ID, + } + public SortedSet getCombinableDomainIds( final Species species_of_combinable_domain );; public DomainId getDomainId(); @@ -92,13 +102,5 @@ public interface DomainSimilarity extends Comparable { public double getStandardDeviationOfSimilarityScore(); - public StringBuffer toStringBuffer( final PrintableDomainSimilarity.PRINT_OPTION print_option ); - - static public enum DomainSimilarityScoring { - DOMAINS, PROTEINS, COMBINATIONS; - } - - public static enum DomainSimilaritySortField { - MIN, MAX, SD, MEAN, ABS_MAX_COUNTS_DIFFERENCE, MAX_COUNTS_DIFFERENCE, MAX_DIFFERENCE, SPECIES_COUNT, DOMAIN_ID, - } + public StringBuffer toStringBuffer( PRINT_OPTION print_option, Map tax_code_to_id_map ); } diff --git a/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java b/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java index 6281764..a811450 100644 --- a/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java +++ b/forester/java/src/org/forester/surfacing/PairwiseGenomeComparator.java @@ -102,7 +102,8 @@ public class PairwiseGenomeComparator { final String automated_pairwise_comparison_prefix, final String command_line_prg_name, final File out_dir, - final boolean write_pairwise_comparisons ) { + final boolean write_pairwise_comparisons, + final Map tax_code_to_id_map ) { init(); final BasicSymmetricalDistanceMatrix domain_distance_scores_means = new BasicSymmetricalDistanceMatrix( number_of_genomes ); final BasicSymmetricalDistanceMatrix shared_domains_based_distances = new BasicSymmetricalDistanceMatrix( number_of_genomes ); @@ -223,7 +224,8 @@ public class PairwiseGenomeComparator { domain_similarity_print_option, domain_similarity_sort_field, scoring, - false ); + false, + tax_code_to_id_map ); } catch ( final IOException e ) { ForesterUtil.fatalError( command_line_prg_name, "Failed to write similarites to: \"" diff --git a/forester/java/src/org/forester/surfacing/PrintableDomainSimilarity.java b/forester/java/src/org/forester/surfacing/PrintableDomainSimilarity.java index 6966aee..31d0e55 100644 --- a/forester/java/src/org/forester/surfacing/PrintableDomainSimilarity.java +++ b/forester/java/src/org/forester/surfacing/PrintableDomainSimilarity.java @@ -185,19 +185,23 @@ public class PrintableDomainSimilarity implements DomainSimilarity { } } - private void addSpeciesSpecificDomainData( final StringBuffer sb, final Species species, final boolean html ) { + private void addSpeciesSpecificDomainData( final StringBuffer sb, + final Species species, + final boolean html, + final Map tax_code_to_id_map ) { if ( getDetaildness() != DomainSimilarityCalculator.Detailedness.BASIC ) { sb.append( "[" ); } if ( html ) { sb.append( "" ); - if ( ( SurfacingConstants.TAXONOMY_LINK != null ) && ( species.getSpeciesId().length() > 2 ) - && ( species.getSpeciesId().length() < 6 ) ) { - sb.append( "" + species.getSpeciesId() + "" ); + final String tax_code = species.getSpeciesId(); + if ( !ForesterUtil.isEmpty( tax_code ) + && ( ( tax_code_to_id_map != null ) && tax_code_to_id_map.containsKey( tax_code ) ) ) { + sb.append( "" + tax_code + "" ); } else { - sb.append( species.getSpeciesId() ); + sb.append( tax_code ); } sb.append( "" ); } @@ -510,19 +514,20 @@ public class PrintableDomainSimilarity implements DomainSimilarity { return _species_data; } - private StringBuffer getSpeciesDataInAlphabeticalOrder( final boolean html ) { + private StringBuffer getSpeciesDataInAlphabeticalOrder( final boolean html, + final Map tax_code_to_id_map ) { final StringBuffer sb = new StringBuffer(); for( final Species species : getSpeciesData().keySet() ) { - addSpeciesSpecificDomainData( sb, species, html ); + addSpeciesSpecificDomainData( sb, species, html, tax_code_to_id_map ); } return sb; } - private StringBuffer getSpeciesDataInCustomOrder( final boolean html ) { + private StringBuffer getSpeciesDataInCustomOrder( final boolean html, final Map tax_code_to_id_map ) { final StringBuffer sb = new StringBuffer(); for( final Species order_species : getSpeciesCustomOrder() ) { if ( getSpeciesData().keySet().contains( order_species ) ) { - addSpeciesSpecificDomainData( sb, order_species, html ); + addSpeciesSpecificDomainData( sb, order_species, html, tax_code_to_id_map ); } else { sb.append( PrintableDomainSimilarity.NO_SPECIES ); @@ -575,32 +580,32 @@ public class PrintableDomainSimilarity implements DomainSimilarity { } @Override - public String toString() { - return toStringBuffer( null ).toString(); - } - - @Override - public StringBuffer toStringBuffer( final PrintableDomainSimilarity.PRINT_OPTION print_option ) { + public StringBuffer toStringBuffer( final PrintableDomainSimilarity.PRINT_OPTION print_option, + final Map tax_code_to_id_map ) { switch ( print_option ) { case SIMPLE_TAB_DELIMITED: return toStringBufferSimpleTabDelimited(); case HTML: - return toStringBufferDetailedHTML(); + return toStringBufferDetailedHTML( tax_code_to_id_map ); default: throw new AssertionError( "Unknown print option: " + print_option ); } } - private StringBuffer toStringBufferDetailedHTML() { + private StringBuffer toStringBufferDetailedHTML( final Map tax_code_to_id_map ) { final StringBuffer sb = new StringBuffer(); sb.append( "" ); sb.append( "" ); boldStartIfSortedBy( DomainSimilaritySortField.DOMAIN_ID, sb ); - sb.append( "" + getDomainId() - + "" ); + sb.append( "" + + getDomainId() + "" ); boldEndIfSortedBy( DomainSimilaritySortField.DOMAIN_ID, sb ); sb.append( "" ); sb.append( "" ); + sb.append( "gs" ); + sb.append( "" ); + sb.append( "" ); boldStartIfSortedBy( DomainSimilaritySortField.MEAN, sb ); sb.append( ForesterUtil.round( getMeanSimilarityScore(), 3 ) ); boldEndIfSortedBy( DomainSimilaritySortField.MEAN, sb ); @@ -664,12 +669,12 @@ public class PrintableDomainSimilarity implements DomainSimilarity { } if ( ( getSpeciesCustomOrder() == null ) || getSpeciesCustomOrder().isEmpty() ) { sb.append( "" ); - sb.append( getSpeciesDataInAlphabeticalOrder( true ) ); + sb.append( getSpeciesDataInAlphabeticalOrder( true, tax_code_to_id_map ) ); sb.append( "" ); } else { sb.append( "" ); - sb.append( getSpeciesDataInCustomOrder( true ) ); + sb.append( getSpeciesDataInCustomOrder( true, tax_code_to_id_map ) ); sb.append( "" ); } sb.append( "" ); diff --git a/forester/java/src/org/forester/surfacing/SurfacingConstants.java b/forester/java/src/org/forester/surfacing/SurfacingConstants.java index c1991f6..9709508 100644 --- a/forester/java/src/org/forester/surfacing/SurfacingConstants.java +++ b/forester/java/src/org/forester/surfacing/SurfacingConstants.java @@ -30,19 +30,15 @@ import org.forester.util.ForesterUtil; public class SurfacingConstants { - public static final String GOOGLE_WEB_SEARCH_LINK = "http://www.google.com/search?q="; - public static final String GOOGLE_SCHOLAR_LINK = "http://scholar.google.com/scholar?q="; - public static final String GOOGLE_SCHOLAR_LIMITS = "&as_subj=bio&as_subj=med&as_subj=chm&num=100"; public static final String AMIGO_LINK = "http://amigo.geneontology.org/cgi-bin/amigo/go.cgi?view=details&search_constraint=terms&query="; - public static final String PFAM_FAMILY_ID_LINK = "http://pfam.sanger.ac.uk/family?id="; + public static final String EOL_LINK = "http://www.eol.org/search?q="; + public static final String GO_LINK = "http://amigo.geneontology.org/cgi-bin/amigo/go.cgi?view=details&search_constraint=terms&query="; + public static final String GOOGLE_SCHOLAR_SEARCH = "http://scholar.google.com/scholar?q="; + public static final String GOOGLE_WEB_SEARCH_LINK = "http://www.google.com/search?q="; public static final String NL = ForesterUtil.LINE_SEPARATOR; - public static final String TAXONOMY_LINK = "http://beta.uniprot.org/taxonomy/?query="; + public static final String NONE = "[none]"; + public static final String PFAM_FAMILY_ID_LINK = "http://pfam.janelia.org/family/"; + public static final String UNIPROT_TAXONOMY_ID_LINK = "http://www.uniprot.org/taxonomy/"; static final boolean SECONDARY_FEATURES_ARE_SCOP = true; static final String SECONDARY_FEATURES_SCOP_LINK = "http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi?key="; - public static final String NONE = "[none]"; - public static final String UNIPROT_LINK = "http://www.uniprot.org/taxonomy/?query="; - public static final String GO_LINK = "http://amigo.geneontology.org/cgi-bin/amigo/go.cgi?view=details&search_constraint=terms&query="; - public static final String EOL_LINK = "http://www.eol.org/search?q="; - public static final String TOL_LINK = "http://www.googlesyndicatedsearch.com/u/TreeofLife?q="; - public static final String WIKIPEDIA_LINK = "http://wikipedia.org/wiki/"; } diff --git a/forester/java/src/org/forester/surfacing/SurfacingUtil.java b/forester/java/src/org/forester/surfacing/SurfacingUtil.java index 9497ea9..ebce1cc 100644 --- a/forester/java/src/org/forester/surfacing/SurfacingUtil.java +++ b/forester/java/src/org/forester/surfacing/SurfacingUtil.java @@ -73,6 +73,7 @@ import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.PhylogenyNode.NH_CONVERSION_SUPPORT_VALUE_STYLE; import org.forester.phylogeny.data.BinaryCharacters; import org.forester.phylogeny.data.Confidence; +import org.forester.phylogeny.data.Taxonomy; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.protein.BasicDomain; import org.forester.protein.BasicProtein; @@ -93,7 +94,6 @@ import org.forester.util.ForesterUtil; public final class SurfacingUtil { - private final static NumberFormat FORMATTER = new DecimalFormat( "0.0E0" ); private final static NumberFormat FORMATTER_3 = new DecimalFormat( "0.000" ); private static final Comparator ASCENDING_CONFIDENCE_VALUE_ORDER = new Comparator() { @@ -121,75 +121,6 @@ public final class SurfacingUtil { // Hidden constructor. } - public static void performDomainArchitectureAnalysis( final SortedMap> domain_architecutures, - final SortedMap domain_architecuture_counts, - final int min_count, - final File da_counts_outfile, - final File unique_da_outfile ) { - checkForOutputFileWriteability( da_counts_outfile ); - checkForOutputFileWriteability( unique_da_outfile ); - try { - final BufferedWriter da_counts_out = new BufferedWriter( new FileWriter( da_counts_outfile ) ); - final BufferedWriter unique_da_out = new BufferedWriter( new FileWriter( unique_da_outfile ) ); - final Iterator> it = domain_architecuture_counts.entrySet().iterator(); - while ( it.hasNext() ) { - final Map.Entry e = it.next(); - final String da = e.getKey(); - final int count = e.getValue(); - if ( count >= min_count ) { - da_counts_out.write( da ); - da_counts_out.write( "\t" ); - da_counts_out.write( String.valueOf( count ) ); - da_counts_out.write( ForesterUtil.LINE_SEPARATOR ); - } - if ( count == 1 ) { - final Iterator>> it2 = domain_architecutures.entrySet().iterator(); - while ( it2.hasNext() ) { - final Map.Entry> e2 = it2.next(); - final String genome = e2.getKey(); - final Set das = e2.getValue(); - if ( das.contains( da ) ) { - unique_da_out.write( genome ); - unique_da_out.write( "\t" ); - unique_da_out.write( da ); - unique_da_out.write( ForesterUtil.LINE_SEPARATOR ); - } - } - } - } - unique_da_out.close(); - da_counts_out.close(); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); - } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + da_counts_outfile + "\"" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + unique_da_outfile + "\"" ); - // - } - - public static int storeDomainArchitectures( final String genome, - final SortedMap> domain_architecutures, - final List protein_list, - final Map distinct_domain_architecuture_counts ) { - final Set da = new HashSet(); - domain_architecutures.put( genome, da ); - for( final Protein protein : protein_list ) { - final String da_str = ( ( BasicProtein ) protein ).toDomainArchitectureString( "~", 3, "=" ); - if ( !da.contains( da_str ) ) { - if ( !distinct_domain_architecuture_counts.containsKey( da_str ) ) { - distinct_domain_architecuture_counts.put( da_str, 1 ); - } - else { - distinct_domain_architecuture_counts.put( da_str, - distinct_domain_architecuture_counts.get( da_str ) + 1 ); - } - da.add( da_str ); - } - } - return da.size(); - } - public static void addAllBinaryDomainCombinationToSet( final GenomeWideCombinableDomains genome, final SortedSet binary_domain_combinations ) { final SortedMap all_cd = genome.getAllCombinableDomainsIds(); @@ -243,507 +174,270 @@ public final class SurfacingUtil { return stats; } - private static void calculateIndependentDomainCombinationGains( final Phylogeny local_phylogeny_l, - final String outfilename_for_counts, - final String outfilename_for_dc, - final String outfilename_for_dc_for_go_mapping, - final String outfilename_for_dc_for_go_mapping_unique, - final String outfilename_for_rank_counts, - final String outfilename_for_ancestor_species_counts, - final String outfilename_for_protein_stats, - final Map protein_length_stats_by_dc, - final Map domain_number_stats_by_dc, - final Map domain_length_stats_by_domain ) { - try { - // - // if ( protein_length_stats_by_dc != null ) { - // for( final Entry entry : protein_length_stats_by_dc.entrySet() ) { - // System.out.print( entry.getKey().toString() ); - // System.out.print( ": " ); - // double[] a = entry.getValue().getDataAsDoubleArray(); - // for( int i = 0; i < a.length; i++ ) { - // System.out.print( a[ i ] + " " ); - // } - // System.out.println(); - // } - // } - // if ( domain_number_stats_by_dc != null ) { - // for( final Entry entry : domain_number_stats_by_dc.entrySet() ) { - // System.out.print( entry.getKey().toString() ); - // System.out.print( ": " ); - // double[] a = entry.getValue().getDataAsDoubleArray(); - // for( int i = 0; i < a.length; i++ ) { - // System.out.print( a[ i ] + " " ); - // } - // System.out.println(); - // } - // } - // - final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) ); - final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) ); - final BufferedWriter out_dc_for_go_mapping = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping ) ); - final BufferedWriter out_dc_for_go_mapping_unique = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping_unique ) ); - final SortedMap dc_gain_counts = new TreeMap(); - for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorPostorder(); it.hasNext(); ) { - final PhylogenyNode n = it.next(); - final Set gained_dc = n.getNodeData().getBinaryCharacters().getGainedCharacters(); - for( final String dc : gained_dc ) { - if ( dc_gain_counts.containsKey( dc ) ) { - dc_gain_counts.put( dc, dc_gain_counts.get( dc ) + 1 ); + public static int calculateOverlap( final Domain domain, final List covered_positions ) { + int overlap_count = 0; + for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) { + if ( ( i < covered_positions.size() ) && ( covered_positions.get( i ) == true ) ) { + ++overlap_count; + } + } + return overlap_count; + } + + public static void checkForOutputFileWriteability( final File outfile ) { + final String error = ForesterUtil.isWritableFile( outfile ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, error ); + } + } + + public static void collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( final CharacterStateMatrix matrix, + final BinaryDomainCombination.DomainCombinationType dc_type, + final List all_binary_domains_combination_gained, + final boolean get_gains ) { + final SortedSet sorted_ids = new TreeSet(); + for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { + sorted_ids.add( matrix.getIdentifier( i ) ); + } + for( final String id : sorted_ids ) { + for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { + if ( ( get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) + || ( !get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.LOSS ) ) ) { + if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) { + all_binary_domains_combination_gained.add( AdjactantDirectedBinaryDomainCombination + .createInstance( matrix.getCharacter( c ) ) ); + } + else if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED ) { + all_binary_domains_combination_gained.add( DirectedBinaryDomainCombination + .createInstance( matrix.getCharacter( c ) ) ); } else { - dc_gain_counts.put( dc, 1 ); + all_binary_domains_combination_gained.add( BasicBinaryDomainCombination.createInstance( matrix + .getCharacter( c ) ) ); } } } - final SortedMap histogram = new TreeMap(); - final SortedMap domain_lists = new TreeMap(); - final SortedMap dc_reapp_counts_to_protein_length_stats = new TreeMap(); - final SortedMap dc_reapp_counts_to_domain_number_stats = new TreeMap(); - final SortedMap dc_reapp_counts_to_domain_lengths_stats = new TreeMap(); - final SortedMap> domain_lists_go = new TreeMap>(); - final SortedMap> domain_lists_go_unique = new TreeMap>(); - final Set dcs = dc_gain_counts.keySet(); - final SortedSet more_than_once = new TreeSet(); - final DescriptiveStatistics gained_once_lengths_stats = new BasicDescriptiveStatistics(); - final DescriptiveStatistics gained_once_domain_count_stats = new BasicDescriptiveStatistics(); - final DescriptiveStatistics gained_multiple_times_lengths_stats = new BasicDescriptiveStatistics(); - final DescriptiveStatistics gained_multiple_times_domain_count_stats = new BasicDescriptiveStatistics(); - long gained_multiple_times_domain_length_sum = 0; - long gained_once_domain_length_sum = 0; - long gained_multiple_times_domain_length_count = 0; - long gained_once_domain_length_count = 0; - for( final String dc : dcs ) { - final int count = dc_gain_counts.get( dc ); - if ( histogram.containsKey( count ) ) { - histogram.put( count, histogram.get( count ) + 1 ); - domain_lists.get( count ).append( ", " + dc ); - domain_lists_go.get( count ).addAll( splitDomainCombination( dc ) ); - domain_lists_go_unique.get( count ).addAll( splitDomainCombination( dc ) ); - } - else { - histogram.put( count, 1 ); - domain_lists.put( count, new StringBuilder( dc ) ); - final PriorityQueue q = new PriorityQueue(); - q.addAll( splitDomainCombination( dc ) ); - domain_lists_go.put( count, q ); - final SortedSet set = new TreeSet(); - set.addAll( splitDomainCombination( dc ) ); - domain_lists_go_unique.put( count, set ); - } - if ( protein_length_stats_by_dc != null ) { - if ( !dc_reapp_counts_to_protein_length_stats.containsKey( count ) ) { - dc_reapp_counts_to_protein_length_stats.put( count, new BasicDescriptiveStatistics() ); + } + } + + public static Map> createDomainIdToGoIdMap( final List pfam_to_go_mappings ) { + final Map> domain_id_to_go_ids_map = new HashMap>( pfam_to_go_mappings + .size() ); + for( final PfamToGoMapping pfam_to_go : pfam_to_go_mappings ) { + if ( !domain_id_to_go_ids_map.containsKey( pfam_to_go.getKey() ) ) { + domain_id_to_go_ids_map.put( pfam_to_go.getKey(), new ArrayList() ); + } + domain_id_to_go_ids_map.get( pfam_to_go.getKey() ).add( pfam_to_go.getValue() ); + } + return domain_id_to_go_ids_map; + } + + public static Map> createDomainIdToSecondaryFeaturesMap( final File secondary_features_map_file ) + throws IOException { + final BasicTable primary_table = BasicTableParser.parse( secondary_features_map_file, '\t' ); + final Map> map = new TreeMap>(); + for( int r = 0; r < primary_table.getNumberOfRows(); ++r ) { + final DomainId domain_id = new DomainId( primary_table.getValue( 0, r ) ); + if ( !map.containsKey( domain_id ) ) { + map.put( domain_id, new HashSet() ); + } + map.get( domain_id ).add( primary_table.getValue( 1, r ) ); + } + return map; + } + + public static Phylogeny createNjTreeBasedOnMatrixToFile( final File nj_tree_outfile, final DistanceMatrix distance ) { + checkForOutputFileWriteability( nj_tree_outfile ); + final NeighborJoining nj = NeighborJoining.createInstance(); + final Phylogeny phylogeny = nj.execute( ( BasicSymmetricalDistanceMatrix ) distance ); + phylogeny.setName( nj_tree_outfile.getName() ); + writePhylogenyToFile( phylogeny, nj_tree_outfile.toString() ); + return phylogeny; + } + + public static Map createTaxCodeToIdMap( final Phylogeny phy ) { + final Map m = new HashMap(); + for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) { + final PhylogenyNode n = iter.next(); + if ( n.getNodeData().isHasTaxonomy() ) { + final Taxonomy t = n.getNodeData().getTaxonomy(); + final String c = t.getTaxonomyCode(); + if ( !ForesterUtil.isEmpty( c ) ) { + if ( n.getNodeData().getTaxonomy() == null ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy id for node " + n ); } - dc_reapp_counts_to_protein_length_stats.get( count ).addValue( protein_length_stats_by_dc.get( dc ) - .arithmeticMean() ); - } - if ( domain_number_stats_by_dc != null ) { - if ( !dc_reapp_counts_to_domain_number_stats.containsKey( count ) ) { - dc_reapp_counts_to_domain_number_stats.put( count, new BasicDescriptiveStatistics() ); + final String id = n.getNodeData().getTaxonomy().getIdentifier().getValue(); + if ( ForesterUtil.isEmpty( id ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy id for node " + n ); } - dc_reapp_counts_to_domain_number_stats.get( count ).addValue( domain_number_stats_by_dc.get( dc ) - .arithmeticMean() ); - } - if ( domain_length_stats_by_domain != null ) { - if ( !dc_reapp_counts_to_domain_lengths_stats.containsKey( count ) ) { - dc_reapp_counts_to_domain_lengths_stats.put( count, new BasicDescriptiveStatistics() ); + if ( m.containsKey( c ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "taxonomy code " + c + " is not unique" ); } - final String[] ds = dc.split( "=" ); - dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain - .get( ds[ 0 ] ).arithmeticMean() ); - dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain - .get( ds[ 1 ] ).arithmeticMean() ); - } - if ( count > 1 ) { - more_than_once.add( dc ); - if ( protein_length_stats_by_dc != null ) { - final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc ); - for( final double element : s.getData() ) { - gained_multiple_times_lengths_stats.addValue( element ); - } - } - if ( domain_number_stats_by_dc != null ) { - final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc ); - for( final double element : s.getData() ) { - gained_multiple_times_domain_count_stats.addValue( element ); - } - } - if ( domain_length_stats_by_domain != null ) { - final String[] ds = dc.split( "=" ); - final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] ); - final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] ); - for( final double element : s0.getData() ) { - gained_multiple_times_domain_length_sum += element; - ++gained_multiple_times_domain_length_count; - } - for( final double element : s1.getData() ) { - gained_multiple_times_domain_length_sum += element; - ++gained_multiple_times_domain_length_count; - } + final int iid = Integer.valueOf( id ); + if ( m.containsValue( iid ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, "taxonomy id " + iid + " is not unique" ); } + m.put( c, iid ); } - else { - if ( protein_length_stats_by_dc != null ) { - final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc ); - for( final double element : s.getData() ) { - gained_once_lengths_stats.addValue( element ); + } + else { + ForesterUtil.fatalError( surfacing.PRG_NAME, "no taxonomy for node " + n ); + } + } + return m; + } + + public static void decoratePrintableDomainSimilarities( final SortedSet domain_similarities, + final Detailedness detailedness, + final GoAnnotationOutput go_annotation_output, + final Map go_id_to_term_map, + final GoNameSpace go_namespace_limit ) { + if ( ( go_namespace_limit != null ) && ( ( go_id_to_term_map == null ) || go_id_to_term_map.isEmpty() ) ) { + throw new IllegalArgumentException( "attempt to use a GO namespace limit without a GO id to term map" ); + } + for( final DomainSimilarity domain_similarity : domain_similarities ) { + if ( domain_similarity instanceof PrintableDomainSimilarity ) { + final PrintableDomainSimilarity printable_domain_similarity = ( PrintableDomainSimilarity ) domain_similarity; + printable_domain_similarity.setDetailedness( detailedness ); + printable_domain_similarity.setGoAnnotationOutput( go_annotation_output ); + printable_domain_similarity.setGoIdToTermMap( go_id_to_term_map ); + printable_domain_similarity.setGoNamespaceLimit( go_namespace_limit ); + } + } + } + + public static void doit( final List proteins, + final List query_domain_ids_nc_order, + final Writer out, + final String separator, + final String limit_to_species, + final Map> average_protein_lengths_by_dc ) throws IOException { + for( final Protein protein : proteins ) { + if ( ForesterUtil.isEmpty( limit_to_species ) + || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) { + if ( protein.contains( query_domain_ids_nc_order, true ) ) { + out.write( protein.getSpecies().getSpeciesId() ); + out.write( separator ); + out.write( protein.getProteinId().getId() ); + out.write( separator ); + out.write( "[" ); + final Set visited_domain_ids = new HashSet(); + boolean first = true; + for( final Domain domain : protein.getProteinDomains() ) { + if ( !visited_domain_ids.contains( domain.getDomainId() ) ) { + visited_domain_ids.add( domain.getDomainId() ); + if ( first ) { + first = false; + } + else { + out.write( " " ); + } + out.write( domain.getDomainId().getId() ); + out.write( " {" ); + out.write( "" + domain.getTotalCount() ); + out.write( "}" ); } } - if ( domain_number_stats_by_dc != null ) { - final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc ); - for( final double element : s.getData() ) { - gained_once_domain_count_stats.addValue( element ); - } + out.write( "]" ); + out.write( separator ); + if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription() + .equals( SurfacingConstants.NONE ) ) ) { + out.write( protein.getDescription() ); } - if ( domain_length_stats_by_domain != null ) { - final String[] ds = dc.split( "=" ); - final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] ); - final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] ); - for( final double element : s0.getData() ) { - gained_once_domain_length_sum += element; - ++gained_once_domain_length_count; - } - for( final double element : s1.getData() ) { - gained_once_domain_length_sum += element; - ++gained_once_domain_length_count; - } + out.write( separator ); + if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession() + .equals( SurfacingConstants.NONE ) ) ) { + out.write( protein.getAccession() ); } + out.write( SurfacingConstants.NL ); } } - final Set histogram_keys = histogram.keySet(); - for( final Integer histogram_key : histogram_keys ) { - final int count = histogram.get( histogram_key ); - final StringBuilder dc = domain_lists.get( histogram_key ); - out_counts.write( histogram_key + "\t" + count + ForesterUtil.LINE_SEPARATOR ); - out_dc.write( histogram_key + "\t" + dc + ForesterUtil.LINE_SEPARATOR ); - out_dc_for_go_mapping.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR ); - final Object[] sorted = domain_lists_go.get( histogram_key ).toArray(); - Arrays.sort( sorted ); - for( final Object domain : sorted ) { - out_dc_for_go_mapping.write( domain + ForesterUtil.LINE_SEPARATOR ); - } - out_dc_for_go_mapping_unique.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR ); - for( final String domain : domain_lists_go_unique.get( histogram_key ) ) { - out_dc_for_go_mapping_unique.write( domain + ForesterUtil.LINE_SEPARATOR ); - } + } + out.flush(); + } + + public static void domainsPerProteinsStatistics( final String genome, + final List protein_list, + final DescriptiveStatistics all_genomes_domains_per_potein_stats, + final SortedMap all_genomes_domains_per_potein_histo, + final SortedSet domains_which_are_always_single, + final SortedSet domains_which_are_sometimes_single_sometimes_not, + final SortedSet domains_which_never_single, + final Writer writer ) { + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( final Protein protein : protein_list ) { + final int domains = protein.getNumberOfProteinDomains(); + //System.out.println( domains ); + stats.addValue( domains ); + all_genomes_domains_per_potein_stats.addValue( domains ); + if ( !all_genomes_domains_per_potein_histo.containsKey( domains ) ) { + all_genomes_domains_per_potein_histo.put( domains, 1 ); } - out_counts.close(); - out_dc.close(); - out_dc_for_go_mapping.close(); - out_dc_for_go_mapping_unique.close(); - final SortedMap lca_rank_counts = new TreeMap(); - final SortedMap lca_ancestor_species_counts = new TreeMap(); - for( final String dc : more_than_once ) { - final List nodes = new ArrayList(); - for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorExternalForward(); it.hasNext(); ) { - final PhylogenyNode n = it.next(); - if ( n.getNodeData().getBinaryCharacters().getGainedCharacters().contains( dc ) ) { - nodes.add( n ); + else { + all_genomes_domains_per_potein_histo.put( domains, + 1 + all_genomes_domains_per_potein_histo.get( domains ) ); + } + if ( domains == 1 ) { + final String domain = protein.getProteinDomain( 0 ).getDomainId().getId(); + if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) { + if ( domains_which_never_single.contains( domain ) ) { + domains_which_never_single.remove( domain ); + domains_which_are_sometimes_single_sometimes_not.add( domain ); + } + else { + domains_which_are_always_single.add( domain ); } } - for( int i = 0; i < ( nodes.size() - 1 ); ++i ) { - for( int j = i + 1; j < nodes.size(); ++j ) { - final PhylogenyNode lca = PhylogenyMethods.calculateLCA( nodes.get( i ), nodes.get( j ) ); - String rank = "unknown"; - if ( lca.getNodeData().isHasTaxonomy() - && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getRank() ) ) { - rank = lca.getNodeData().getTaxonomy().getRank(); - } - addToCountMap( lca_rank_counts, rank ); - String lca_species; - if ( lca.getNodeData().isHasTaxonomy() - && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getScientificName() ) ) { - lca_species = lca.getNodeData().getTaxonomy().getScientificName(); - } - else if ( lca.getNodeData().isHasTaxonomy() - && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getCommonName() ) ) { - lca_species = lca.getNodeData().getTaxonomy().getCommonName(); + } + else if ( domains > 1 ) { + for( final Domain d : protein.getProteinDomains() ) { + final String domain = d.getDomainId().getId(); + // System.out.println( domain ); + if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) { + if ( domains_which_are_always_single.contains( domain ) ) { + domains_which_are_always_single.remove( domain ); + domains_which_are_sometimes_single_sometimes_not.add( domain ); } else { - lca_species = lca.getName(); + domains_which_never_single.add( domain ); } - addToCountMap( lca_ancestor_species_counts, lca_species ); } } } - final BufferedWriter out_for_rank_counts = new BufferedWriter( new FileWriter( outfilename_for_rank_counts ) ); - final BufferedWriter out_for_ancestor_species_counts = new BufferedWriter( new FileWriter( outfilename_for_ancestor_species_counts ) ); - ForesterUtil.map2writer( out_for_rank_counts, lca_rank_counts, "\t", ForesterUtil.LINE_SEPARATOR ); - ForesterUtil.map2writer( out_for_ancestor_species_counts, - lca_ancestor_species_counts, - "\t", - ForesterUtil.LINE_SEPARATOR ); - out_for_rank_counts.close(); - out_for_ancestor_species_counts.close(); - if ( !ForesterUtil.isEmpty( outfilename_for_protein_stats ) - && ( ( domain_length_stats_by_domain != null ) || ( protein_length_stats_by_dc != null ) || ( domain_number_stats_by_dc != null ) ) ) { - final BufferedWriter w = new BufferedWriter( new FileWriter( outfilename_for_protein_stats ) ); - w.write( "Domain Lengths: " ); - w.write( "\n" ); - if ( domain_length_stats_by_domain != null ) { - for( final Entry entry : dc_reapp_counts_to_domain_lengths_stats - .entrySet() ) { - w.write( entry.getKey().toString() ); - w.write( "\t" + entry.getValue().arithmeticMean() ); - w.write( "\t" + entry.getValue().median() ); - w.write( "\n" ); - } - } - w.flush(); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Protein Lengths: " ); - w.write( "\n" ); - if ( protein_length_stats_by_dc != null ) { - for( final Entry entry : dc_reapp_counts_to_protein_length_stats - .entrySet() ) { - w.write( entry.getKey().toString() ); - w.write( "\t" + entry.getValue().arithmeticMean() ); - w.write( "\t" + entry.getValue().median() ); - w.write( "\n" ); - } + } + try { + writer.write( genome ); + writer.write( "\t" ); + if ( stats.getN() >= 1 ) { + writer.write( stats.arithmeticMean() + "" ); + writer.write( "\t" ); + if ( stats.getN() >= 2 ) { + writer.write( stats.sampleStandardDeviation() + "" ); } - w.flush(); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Number of domains: " ); - w.write( "\n" ); - if ( domain_number_stats_by_dc != null ) { - for( final Entry entry : dc_reapp_counts_to_domain_number_stats - .entrySet() ) { - w.write( entry.getKey().toString() ); - w.write( "\t" + entry.getValue().arithmeticMean() ); - w.write( "\t" + entry.getValue().median() ); - w.write( "\n" ); - } + else { + writer.write( "" ); } - w.flush(); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained once, domain lengths:" ); - w.write( "\n" ); - w.write( "N: " + gained_once_domain_length_count ); - w.write( "\n" ); - w.write( "Avg: " + ( ( double ) gained_once_domain_length_sum / gained_once_domain_length_count ) ); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained multiple times, domain lengths:" ); - w.write( "\n" ); - w.write( "N: " + gained_multiple_times_domain_length_count ); - w.write( "\n" ); - w.write( "Avg: " - + ( ( double ) gained_multiple_times_domain_length_sum / gained_multiple_times_domain_length_count ) ); - w.write( "\n" ); - w.write( "\n" ); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained once, protein lengths:" ); - w.write( "\n" ); - w.write( gained_once_lengths_stats.toString() ); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained once, domain counts:" ); - w.write( "\n" ); - w.write( gained_once_domain_count_stats.toString() ); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained multiple times, protein lengths:" ); - w.write( "\n" ); - w.write( gained_multiple_times_lengths_stats.toString() ); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained multiple times, domain counts:" ); - w.write( "\n" ); - w.write( gained_multiple_times_domain_count_stats.toString() ); - w.flush(); - w.close(); - } - } - catch ( final IOException e ) { - ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); - } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch counts to [" - + outfilename_for_counts + "]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch lists to [" - + outfilename_for_dc + "]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, - "Wrote independent domain combination gains fitch lists to (for GO mapping) [" - + outfilename_for_dc_for_go_mapping + "]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, - "Wrote independent domain combination gains fitch lists to (for GO mapping, unique) [" - + outfilename_for_dc_for_go_mapping_unique + "]" ); - } - - private final static void addToCountMap( final Map map, final String s ) { - if ( map.containsKey( s ) ) { - map.put( s, map.get( s ) + 1 ); - } - else { - map.put( s, 1 ); - } - } - - public static int calculateOverlap( final Domain domain, final List covered_positions ) { - int overlap_count = 0; - for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) { - if ( ( i < covered_positions.size() ) && ( covered_positions.get( i ) == true ) ) { - ++overlap_count; - } - } - return overlap_count; - } - - public static void checkForOutputFileWriteability( final File outfile ) { - final String error = ForesterUtil.isWritableFile( outfile ); - if ( !ForesterUtil.isEmpty( error ) ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, error ); - } - } - - private static SortedSet collectAllDomainsChangedOnSubtree( final PhylogenyNode subtree_root, - final boolean get_gains ) { - final SortedSet domains = new TreeSet(); - for( final PhylogenyNode descendant : PhylogenyMethods.getAllDescendants( subtree_root ) ) { - final BinaryCharacters chars = descendant.getNodeData().getBinaryCharacters(); - if ( get_gains ) { - domains.addAll( chars.getGainedCharacters() ); + writer.write( "\t" ); + writer.write( stats.median() + "" ); + writer.write( "\t" ); + writer.write( stats.getN() + "" ); + writer.write( "\t" ); + writer.write( stats.getMin() + "" ); + writer.write( "\t" ); + writer.write( stats.getMax() + "" ); } else { - domains.addAll( chars.getLostCharacters() ); - } - } - return domains; - } - - public static void collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( final CharacterStateMatrix matrix, - final BinaryDomainCombination.DomainCombinationType dc_type, - final List all_binary_domains_combination_gained, - final boolean get_gains ) { - final SortedSet sorted_ids = new TreeSet(); - for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { - sorted_ids.add( matrix.getIdentifier( i ) ); - } - for( final String id : sorted_ids ) { - for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { - if ( ( get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) - || ( !get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.LOSS ) ) ) { - if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) { - all_binary_domains_combination_gained.add( AdjactantDirectedBinaryDomainCombination - .createInstance( matrix.getCharacter( c ) ) ); - } - else if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED ) { - all_binary_domains_combination_gained.add( DirectedBinaryDomainCombination - .createInstance( matrix.getCharacter( c ) ) ); - } - else { - all_binary_domains_combination_gained.add( BasicBinaryDomainCombination.createInstance( matrix - .getCharacter( c ) ) ); - } - } - } - } - } - - private static File createBaseDirForPerNodeDomainFiles( final String base_dir, - final boolean domain_combinations, - final CharacterStateMatrix.GainLossStates state, - final String outfile ) { - File per_node_go_mapped_domain_gain_loss_files_base_dir = new File( new File( outfile ).getParent() - + ForesterUtil.FILE_SEPARATOR + base_dir ); - if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { - per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); - } - if ( domain_combinations ) { - per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "DC" ); - } - else { - per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "DOMAINS" ); - } - if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { - per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); - } - if ( state == GainLossStates.GAIN ) { - per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "GAINS" ); - } - else if ( state == GainLossStates.LOSS ) { - per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "LOSSES" ); - } - else { - per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "PRESENT" ); - } - if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { - per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); - } - return per_node_go_mapped_domain_gain_loss_files_base_dir; - } - - public static Map> createDomainIdToGoIdMap( final List pfam_to_go_mappings ) { - final Map> domain_id_to_go_ids_map = new HashMap>( pfam_to_go_mappings - .size() ); - for( final PfamToGoMapping pfam_to_go : pfam_to_go_mappings ) { - if ( !domain_id_to_go_ids_map.containsKey( pfam_to_go.getKey() ) ) { - domain_id_to_go_ids_map.put( pfam_to_go.getKey(), new ArrayList() ); - } - domain_id_to_go_ids_map.get( pfam_to_go.getKey() ).add( pfam_to_go.getValue() ); - } - return domain_id_to_go_ids_map; - } - - public static Map> createDomainIdToSecondaryFeaturesMap( final File secondary_features_map_file ) - throws IOException { - final BasicTable primary_table = BasicTableParser.parse( secondary_features_map_file, '\t' ); - final Map> map = new TreeMap>(); - for( int r = 0; r < primary_table.getNumberOfRows(); ++r ) { - final DomainId domain_id = new DomainId( primary_table.getValue( 0, r ) ); - if ( !map.containsKey( domain_id ) ) { - map.put( domain_id, new HashSet() ); + writer.write( "\t" ); + writer.write( "\t" ); + writer.write( "\t" ); + writer.write( "0" ); + writer.write( "\t" ); + writer.write( "\t" ); } - map.get( domain_id ).add( primary_table.getValue( 1, r ) ); - } - return map; - } - - public static Phylogeny createNjTreeBasedOnMatrixToFile( final File nj_tree_outfile, final DistanceMatrix distance ) { - checkForOutputFileWriteability( nj_tree_outfile ); - final NeighborJoining nj = NeighborJoining.createInstance(); - final Phylogeny phylogeny = nj.execute( ( BasicSymmetricalDistanceMatrix ) distance ); - phylogeny.setName( nj_tree_outfile.getName() ); - writePhylogenyToFile( phylogeny, nj_tree_outfile.toString() ); - return phylogeny; - } - - private static SortedSet createSetOfAllBinaryDomainCombinationsPerGenome( final GenomeWideCombinableDomains gwcd ) { - final SortedMap cds = gwcd.getAllCombinableDomainsIds(); - final SortedSet binary_combinations = new TreeSet(); - for( final DomainId domain_id : cds.keySet() ) { - final CombinableDomains cd = cds.get( domain_id ); - binary_combinations.addAll( cd.toBinaryDomainCombinations() ); - } - return binary_combinations; - } - - public static void decoratePrintableDomainSimilarities( final SortedSet domain_similarities, - final Detailedness detailedness, - final GoAnnotationOutput go_annotation_output, - final Map go_id_to_term_map, - final GoNameSpace go_namespace_limit ) { - if ( ( go_namespace_limit != null ) && ( ( go_id_to_term_map == null ) || go_id_to_term_map.isEmpty() ) ) { - throw new IllegalArgumentException( "attempt to use a GO namespace limit without a GO id to term map" ); + writer.write( "\n" ); } - for( final DomainSimilarity domain_similarity : domain_similarities ) { - if ( domain_similarity instanceof PrintableDomainSimilarity ) { - final PrintableDomainSimilarity printable_domain_similarity = ( PrintableDomainSimilarity ) domain_similarity; - printable_domain_similarity.setDetailedness( detailedness ); - printable_domain_similarity.setGoAnnotationOutput( go_annotation_output ); - printable_domain_similarity.setGoIdToTermMap( go_id_to_term_map ); - printable_domain_similarity.setGoNamespaceLimit( go_namespace_limit ); - } + catch ( final IOException e ) { + e.printStackTrace(); } } @@ -844,7 +538,9 @@ public final class SurfacingUtil { final BinaryDomainCombination.DomainCombinationType dc_type, final Map protein_length_stats_by_dc, final Map domain_number_stats_by_dc, - final Map domain_length_stats_by_domain ) { + final Map domain_length_stats_by_domain, + final Map tax_code_to_id_map, + final boolean write_to_nexus ) { final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR; final String date_time = ForesterUtil.getCurrentDateTime(); final SortedSet all_pfams_encountered = new TreeSet(); @@ -852,7 +548,9 @@ public final class SurfacingUtil { final SortedSet all_pfams_lost_as_domains = new TreeSet(); final SortedSet all_pfams_gained_as_dom_combinations = new TreeSet(); final SortedSet all_pfams_lost_as_dom_combinations = new TreeSet(); - writeToNexus( outfile_name, domain_parsimony, phylogeny ); + if ( write_to_nexus ) { + writeToNexus( outfile_name, domain_parsimony, phylogeny ); + } // DOLLO DOMAINS // ------------- Phylogeny local_phylogeny_l = phylogeny.copy(); @@ -895,7 +593,8 @@ public final class SurfacingUtil { domain_id_to_secondary_features_maps, all_pfams_encountered, all_pfams_gained_as_domains, - "_dollo_gains_d" ); + "_dollo_gains_d", + tax_code_to_id_map ); writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, go_id_to_term_map, go_namespace_limit, @@ -910,7 +609,8 @@ public final class SurfacingUtil { domain_id_to_secondary_features_maps, all_pfams_encountered, all_pfams_lost_as_domains, - "_dollo_losses_d" ); + "_dollo_losses_d", + tax_code_to_id_map ); writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, go_id_to_term_map, go_namespace_limit, @@ -925,7 +625,8 @@ public final class SurfacingUtil { domain_id_to_secondary_features_maps, all_pfams_encountered, null, - "_dollo_present_d" ); + "_dollo_present_d", + tax_code_to_id_map ); preparePhylogeny( local_phylogeny_l, domain_parsimony, date_time, @@ -1012,7 +713,8 @@ public final class SurfacingUtil { null, all_pfams_encountered, all_pfams_gained_as_dom_combinations, - "_fitch_gains_dc" ); + "_fitch_gains_dc", + tax_code_to_id_map ); writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, go_id_to_term_map, go_namespace_limit, @@ -1027,7 +729,8 @@ public final class SurfacingUtil { null, all_pfams_encountered, all_pfams_lost_as_dom_combinations, - "_fitch_losses_dc" ); + "_fitch_losses_dc", + tax_code_to_id_map ); writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, go_id_to_term_map, go_namespace_limit, @@ -1042,7 +745,8 @@ public final class SurfacingUtil { null, all_pfams_encountered, null, - "_fitch_present_dc" ); + "_fitch_present_dc", + tax_code_to_id_map ); writeAllEncounteredPfamsToFile( domain_id_to_go_ids_map, go_id_to_term_map, outfile_name, @@ -1151,12 +855,11 @@ public final class SurfacingUtil { + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt", null, null, null, null ); } - public static void doit( final List proteins, - final List query_domain_ids_nc_order, - final Writer out, - final String separator, - final String limit_to_species, - final Map> average_protein_lengths_by_dc ) throws IOException { + public static void extractProteinNames( final List proteins, + final List query_domain_ids_nc_order, + final Writer out, + final String separator, + final String limit_to_species ) throws IOException { for( final Protein protein : proteins ) { if ( ForesterUtil.isEmpty( limit_to_species ) || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) { @@ -1201,57 +904,8 @@ public final class SurfacingUtil { out.flush(); } - public static void extractProteinNames( final List proteins, - final List query_domain_ids_nc_order, - final Writer out, - final String separator, - final String limit_to_species ) throws IOException { - for( final Protein protein : proteins ) { - if ( ForesterUtil.isEmpty( limit_to_species ) - || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) { - if ( protein.contains( query_domain_ids_nc_order, true ) ) { - out.write( protein.getSpecies().getSpeciesId() ); - out.write( separator ); - out.write( protein.getProteinId().getId() ); - out.write( separator ); - out.write( "[" ); - final Set visited_domain_ids = new HashSet(); - boolean first = true; - for( final Domain domain : protein.getProteinDomains() ) { - if ( !visited_domain_ids.contains( domain.getDomainId() ) ) { - visited_domain_ids.add( domain.getDomainId() ); - if ( first ) { - first = false; - } - else { - out.write( " " ); - } - out.write( domain.getDomainId().getId() ); - out.write( " {" ); - out.write( "" + domain.getTotalCount() ); - out.write( "}" ); - } - } - out.write( "]" ); - out.write( separator ); - if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription() - .equals( SurfacingConstants.NONE ) ) ) { - out.write( protein.getDescription() ); - } - out.write( separator ); - if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession() - .equals( SurfacingConstants.NONE ) ) ) { - out.write( protein.getAccession() ); - } - out.write( SurfacingConstants.NL ); - } - } - } - out.flush(); - } - - public static void extractProteinNames( final SortedMap> protein_lists_per_species, - final DomainId domain_id, + public static void extractProteinNames( final SortedMap> protein_lists_per_species, + final DomainId domain_id, final Writer out, final String separator, final String limit_to_species, @@ -1395,6 +1049,53 @@ public final class SurfacingUtil { return true; } + public static void performDomainArchitectureAnalysis( final SortedMap> domain_architecutures, + final SortedMap domain_architecuture_counts, + final int min_count, + final File da_counts_outfile, + final File unique_da_outfile ) { + checkForOutputFileWriteability( da_counts_outfile ); + checkForOutputFileWriteability( unique_da_outfile ); + try { + final BufferedWriter da_counts_out = new BufferedWriter( new FileWriter( da_counts_outfile ) ); + final BufferedWriter unique_da_out = new BufferedWriter( new FileWriter( unique_da_outfile ) ); + final Iterator> it = domain_architecuture_counts.entrySet().iterator(); + while ( it.hasNext() ) { + final Map.Entry e = it.next(); + final String da = e.getKey(); + final int count = e.getValue(); + if ( count >= min_count ) { + da_counts_out.write( da ); + da_counts_out.write( "\t" ); + da_counts_out.write( String.valueOf( count ) ); + da_counts_out.write( ForesterUtil.LINE_SEPARATOR ); + } + if ( count == 1 ) { + final Iterator>> it2 = domain_architecutures.entrySet().iterator(); + while ( it2.hasNext() ) { + final Map.Entry> e2 = it2.next(); + final String genome = e2.getKey(); + final Set das = e2.getValue(); + if ( das.contains( da ) ) { + unique_da_out.write( genome ); + unique_da_out.write( "\t" ); + unique_da_out.write( da ); + unique_da_out.write( ForesterUtil.LINE_SEPARATOR ); + } + } + } + } + unique_da_out.close(); + da_counts_out.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + da_counts_outfile + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + unique_da_outfile + "\"" ); + // + } + public static void preparePhylogeny( final Phylogeny p, final DomainParsimonyCalculator domain_parsimony, final String date_time, @@ -1563,17 +1264,26 @@ public final class SurfacingUtil { return domains; } - private static List splitDomainCombination( final String dc ) { - final String[] s = dc.split( "=" ); - if ( s.length != 2 ) { - ForesterUtil.printErrorMessage( surfacing.PRG_NAME, "Stringyfied domain combination has illegal format: " - + dc ); - System.exit( -1 ); + public static int storeDomainArchitectures( final String genome, + final SortedMap> domain_architecutures, + final List protein_list, + final Map distinct_domain_architecuture_counts ) { + final Set da = new HashSet(); + domain_architecutures.put( genome, da ); + for( final Protein protein : protein_list ) { + final String da_str = ( ( BasicProtein ) protein ).toDomainArchitectureString( "~", 3, "=" ); + if ( !da.contains( da_str ) ) { + if ( !distinct_domain_architecuture_counts.containsKey( da_str ) ) { + distinct_domain_architecuture_counts.put( da_str, 1 ); + } + else { + distinct_domain_architecuture_counts.put( da_str, + distinct_domain_architecuture_counts.get( da_str ) + 1 ); + } + da.add( da_str ); + } } - final List l = new ArrayList( 2 ); - l.add( s[ 0 ] ); - l.add( s[ 1 ] ); - return l; + return da.size(); } public static void writeAllDomainsChangedOnAllSubtrees( final Phylogeny p, @@ -1605,204 +1315,66 @@ public final class SurfacingUtil { } } - private static void writeAllEncounteredPfamsToFile( final Map> domain_id_to_go_ids_map, - final Map go_id_to_term_map, - final String outfile_name, - final SortedSet all_pfams_encountered ) { - final File all_pfams_encountered_file = new File( outfile_name + surfacing.ALL_PFAMS_ENCOUNTERED_SUFFIX ); - final File all_pfams_encountered_with_go_annotation_file = new File( outfile_name - + surfacing.ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX ); - final File encountered_pfams_summary_file = new File( outfile_name + surfacing.ENCOUNTERED_PFAMS_SUMMARY_SUFFIX ); - int biological_process_counter = 0; - int cellular_component_counter = 0; - int molecular_function_counter = 0; - int pfams_with_mappings_counter = 0; - int pfams_without_mappings_counter = 0; - int pfams_without_mappings_to_bp_or_mf_counter = 0; - int pfams_with_mappings_to_bp_or_mf_counter = 0; + public static void writeBinaryDomainCombinationsFileForGraphAnalysis( final String[][] input_file_properties, + final File output_dir, + final GenomeWideCombinableDomains gwcd, + final int i, + final GenomeWideCombinableDomainsSortOrder dc_sort_order ) { + File dc_outfile_dot = new File( input_file_properties[ i ][ 1 ] + + surfacing.DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS ); + if ( output_dir != null ) { + dc_outfile_dot = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile_dot ); + } + checkForOutputFileWriteability( dc_outfile_dot ); + final SortedSet binary_combinations = createSetOfAllBinaryDomainCombinationsPerGenome( gwcd ); try { - final Writer all_pfams_encountered_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_file ) ); - final Writer all_pfams_encountered_with_go_annotation_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_with_go_annotation_file ) ); - final Writer summary_writer = new BufferedWriter( new FileWriter( encountered_pfams_summary_file ) ); - summary_writer.write( "# Pfam to GO mapping summary" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Actual summary is at the end of this file." ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Encountered Pfams without a GO mapping:" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - for( final String pfam : all_pfams_encountered ) { - all_pfams_encountered_writer.write( pfam ); - all_pfams_encountered_writer.write( ForesterUtil.LINE_SEPARATOR ); - final DomainId domain_id = new DomainId( pfam ); - if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) { - ++pfams_with_mappings_counter; - all_pfams_encountered_with_go_annotation_writer.write( pfam ); - all_pfams_encountered_with_go_annotation_writer.write( ForesterUtil.LINE_SEPARATOR ); - final List go_ids = domain_id_to_go_ids_map.get( domain_id ); - boolean maps_to_bp = false; - boolean maps_to_cc = false; - boolean maps_to_mf = false; - for( final GoId go_id : go_ids ) { - final GoTerm go_term = go_id_to_term_map.get( go_id ); - if ( go_term.getGoNameSpace().isBiologicalProcess() ) { - maps_to_bp = true; - } - else if ( go_term.getGoNameSpace().isCellularComponent() ) { - maps_to_cc = true; - } - else if ( go_term.getGoNameSpace().isMolecularFunction() ) { - maps_to_mf = true; + final BufferedWriter out_dot = new BufferedWriter( new FileWriter( dc_outfile_dot ) ); + for( final BinaryDomainCombination bdc : binary_combinations ) { + out_dot.write( bdc.toGraphDescribingLanguage( BinaryDomainCombination.OutputFormat.DOT, null, null ) + .toString() ); + out_dot.write( SurfacingConstants.NL ); + } + out_dot.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote binary domain combination for \"" + + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", " + + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile_dot + "\"" ); + } + + public static void writeBinaryStatesMatrixAsListToFile( final CharacterStateMatrix matrix, + final CharacterStateMatrix.GainLossStates state, + final String filename, + final String indentifier_characters_separator, + final String character_separator, + final Map descriptions ) { + final File outfile = new File( filename ); + checkForOutputFileWriteability( outfile ); + final SortedSet sorted_ids = new TreeSet(); + for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { + sorted_ids.add( matrix.getIdentifier( i ) ); + } + try { + final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); + for( final String id : sorted_ids ) { + out.write( indentifier_characters_separator ); + out.write( "#" + id ); + out.write( indentifier_characters_separator ); + for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { + // Not nice: + // using null to indicate either UNCHANGED_PRESENT or GAIN. + if ( ( matrix.getState( id, c ) == state ) + || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) || ( matrix + .getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) ) ) ) { + out.write( matrix.getCharacter( c ) ); + if ( ( descriptions != null ) && !descriptions.isEmpty() + && descriptions.containsKey( matrix.getCharacter( c ) ) ) { + out.write( "\t" ); + out.write( descriptions.get( matrix.getCharacter( c ) ) ); } - } - if ( maps_to_bp ) { - ++biological_process_counter; - } - if ( maps_to_cc ) { - ++cellular_component_counter; - } - if ( maps_to_mf ) { - ++molecular_function_counter; - } - if ( maps_to_bp || maps_to_mf ) { - ++pfams_with_mappings_to_bp_or_mf_counter; - } - else { - ++pfams_without_mappings_to_bp_or_mf_counter; - } - } - else { - ++pfams_without_mappings_to_bp_or_mf_counter; - ++pfams_without_mappings_counter; - summary_writer.write( pfam ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - } - } - all_pfams_encountered_writer.close(); - all_pfams_encountered_with_go_annotation_writer.close(); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + all_pfams_encountered.size() - + "] encountered Pfams to: \"" + all_pfams_encountered_file + "\"" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + pfams_with_mappings_counter - + "] encountered Pfams with GO mappings to: \"" + all_pfams_encountered_with_go_annotation_file - + "\"" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote summary (including all [" - + pfams_without_mappings_counter + "] encountered Pfams without GO mappings) to: \"" - + encountered_pfams_summary_file + "\"" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Sum of Pfams encountered : " - + all_pfams_encountered.size() ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without a mapping : " - + pfams_without_mappings_counter + " [" - + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without mapping to proc. or func. : " - + pfams_without_mappings_to_bp_or_mf_counter + " [" - + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping : " - + pfams_with_mappings_counter + " [" - + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping to proc. or func. : " - + pfams_with_mappings_to_bp_or_mf_counter + " [" - + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to biological process: " - + biological_process_counter + " [" - + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to molecular function: " - + molecular_function_counter + " [" - + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to cellular component: " - + cellular_component_counter + " [" - + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Sum of Pfams encountered : " + all_pfams_encountered.size() ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Pfams without a mapping : " + pfams_without_mappings_counter - + " [" + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Pfams without mapping to proc. or func. : " - + pfams_without_mappings_to_bp_or_mf_counter + " [" - + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Pfams with a mapping : " + pfams_with_mappings_counter + " [" - + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Pfams with a mapping to proc. or func. : " - + pfams_with_mappings_to_bp_or_mf_counter + " [" - + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Pfams with mapping to biological process: " + biological_process_counter + " [" - + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Pfams with mapping to molecular function: " + molecular_function_counter + " [" - + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Pfams with mapping to cellular component: " + cellular_component_counter + " [" - + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.close(); - } - catch ( final IOException e ) { - ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); - } - } - - public static void writeBinaryDomainCombinationsFileForGraphAnalysis( final String[][] input_file_properties, - final File output_dir, - final GenomeWideCombinableDomains gwcd, - final int i, - final GenomeWideCombinableDomainsSortOrder dc_sort_order ) { - File dc_outfile_dot = new File( input_file_properties[ i ][ 1 ] - + surfacing.DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS ); - if ( output_dir != null ) { - dc_outfile_dot = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile_dot ); - } - checkForOutputFileWriteability( dc_outfile_dot ); - final SortedSet binary_combinations = createSetOfAllBinaryDomainCombinationsPerGenome( gwcd ); - try { - final BufferedWriter out_dot = new BufferedWriter( new FileWriter( dc_outfile_dot ) ); - for( final BinaryDomainCombination bdc : binary_combinations ) { - out_dot.write( bdc.toGraphDescribingLanguage( BinaryDomainCombination.OutputFormat.DOT, null, null ) - .toString() ); - out_dot.write( SurfacingConstants.NL ); - } - out_dot.close(); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); - } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote binary domain combination for \"" - + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", " - + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile_dot + "\"" ); - } - - public static void writeBinaryStatesMatrixAsListToFile( final CharacterStateMatrix matrix, - final CharacterStateMatrix.GainLossStates state, - final String filename, - final String indentifier_characters_separator, - final String character_separator, - final Map descriptions ) { - final File outfile = new File( filename ); - checkForOutputFileWriteability( outfile ); - final SortedSet sorted_ids = new TreeSet(); - for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { - sorted_ids.add( matrix.getIdentifier( i ) ); - } - try { - final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); - for( final String id : sorted_ids ) { - out.write( indentifier_characters_separator ); - out.write( "#" + id ); - out.write( indentifier_characters_separator ); - for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { - // Not nice: - // using null to indicate either UNCHANGED_PRESENT or GAIN. - if ( ( matrix.getState( id, c ) == state ) - || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) || ( matrix - .getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) ) ) ) { - out.write( matrix.getCharacter( c ) ); - if ( ( descriptions != null ) && !descriptions.isEmpty() - && descriptions.containsKey( matrix.getCharacter( c ) ) ) { - out.write( "\t" ); - out.write( descriptions.get( matrix.getCharacter( c ) ) ); - } - out.write( character_separator ); + out.write( character_separator ); } } } @@ -1874,7 +1446,8 @@ public final class SurfacingUtil { final Map>[] domain_id_to_secondary_features_maps, final SortedSet all_pfams_encountered, final SortedSet pfams_gained_or_lost, - final String suffix_for_per_node_events_file ) { + final String suffix_for_per_node_events_file, + final Map tax_code_to_id_map ) { if ( ( go_namespace_limit != null ) && ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) { throw new IllegalArgumentException( "attempt to use GO namespace limit without a GO-id to term map" ); } @@ -1935,7 +1508,7 @@ public final class SurfacingUtil { out.write( SurfacingConstants.NL ); out.write( "

" ); out.write( "" + id + "" ); - writeTaxonomyLinks( out, id ); + writeTaxonomyLinks( out, id, tax_code_to_id_map ); out.write( "

" ); out.write( SurfacingConstants.NL ); out.write( "" ); @@ -2082,17 +1655,878 @@ public final class SurfacingUtil { per_genome_domain_promiscuity_statistics_writer.write( ForesterUtil.LINE_SEPARATOR ); } catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); - } - if ( input_file_properties[ i ].length == 3 ) { - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \"" - + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", " - + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile + "\"" ); - } - else { - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \"" - + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ") to: \"" - + dc_outfile + "\"" ); + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + } + if ( input_file_properties[ i ].length == 3 ) { + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \"" + + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", " + + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile + "\"" ); + } + else { + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \"" + + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ") to: \"" + + dc_outfile + "\"" ); + } + } + + public static DescriptiveStatistics writeDomainSimilaritiesToFile( final StringBuilder html_desc, + final StringBuilder html_title, + final Writer single_writer, + Map split_writers, + final SortedSet similarities, + final boolean treat_as_binary, + final List species_order, + final PrintableDomainSimilarity.PRINT_OPTION print_option, + final DomainSimilarity.DomainSimilaritySortField sort_field, + final DomainSimilarity.DomainSimilarityScoring scoring, + final boolean verbose, + final Map tax_code_to_id_map ) + throws IOException { + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + String histogram_title = null; + switch ( sort_field ) { + case ABS_MAX_COUNTS_DIFFERENCE: + if ( treat_as_binary ) { + histogram_title = "absolute counts difference:"; + } + else { + histogram_title = "absolute (maximal) counts difference:"; + } + break; + case MAX_COUNTS_DIFFERENCE: + if ( treat_as_binary ) { + histogram_title = "counts difference:"; + } + else { + histogram_title = "(maximal) counts difference:"; + } + break; + case DOMAIN_ID: + histogram_title = "score mean:"; + break; + case MIN: + histogram_title = "score minimum:"; + break; + case MAX: + histogram_title = "score maximum:"; + break; + case MAX_DIFFERENCE: + if ( treat_as_binary ) { + histogram_title = "difference:"; + } + else { + histogram_title = "(maximal) difference:"; + } + break; + case MEAN: + histogram_title = "score mean:"; + break; + case SD: + histogram_title = "score standard deviation:"; + break; + case SPECIES_COUNT: + histogram_title = "species number:"; + break; + default: + throw new AssertionError( "Unknown sort field: " + sort_field ); + } + for( final DomainSimilarity similarity : similarities ) { + switch ( sort_field ) { + case ABS_MAX_COUNTS_DIFFERENCE: + stats.addValue( Math.abs( similarity.getMaximalDifferenceInCounts() ) ); + break; + case MAX_COUNTS_DIFFERENCE: + stats.addValue( similarity.getMaximalDifferenceInCounts() ); + break; + case DOMAIN_ID: + stats.addValue( similarity.getMeanSimilarityScore() ); + break; + case MIN: + stats.addValue( similarity.getMinimalSimilarityScore() ); + break; + case MAX: + stats.addValue( similarity.getMaximalSimilarityScore() ); + break; + case MAX_DIFFERENCE: + stats.addValue( similarity.getMaximalDifference() ); + break; + case MEAN: + stats.addValue( similarity.getMeanSimilarityScore() ); + break; + case SD: + stats.addValue( similarity.getStandardDeviationOfSimilarityScore() ); + break; + case SPECIES_COUNT: + stats.addValue( similarity.getSpecies().size() ); + break; + default: + throw new AssertionError( "Unknown sort field: " + sort_field ); + } + } + AsciiHistogram histo = null; + if ( stats.getMin() < stats.getMin() ) { + histo = new AsciiHistogram( stats, histogram_title ); + } + if ( verbose ) { + if ( histo != null ) { + System.out.println( histo.toStringBuffer( 20, '|', 40, 5 ) ); + } + System.out.println(); + System.out.println( "N : " + stats.getN() ); + System.out.println( "Min : " + stats.getMin() ); + System.out.println( "Max : " + stats.getMax() ); + System.out.println( "Mean : " + stats.arithmeticMean() ); + if ( stats.getN() > 1 ) { + System.out.println( "SD : " + stats.sampleStandardDeviation() ); + } + else { + System.out.println( "SD : n/a" ); + } + System.out.println( "Median : " + stats.median() ); + if ( stats.getN() > 1 ) { + System.out.println( "Pearsonian skewness : " + stats.pearsonianSkewness() ); + } + else { + System.out.println( "Pearsonian skewness : n/a" ); + } + } + if ( ( single_writer != null ) && ( ( split_writers == null ) || split_writers.isEmpty() ) ) { + split_writers = new HashMap(); + split_writers.put( '_', single_writer ); + } + switch ( print_option ) { + case SIMPLE_TAB_DELIMITED: + break; + case HTML: + for( final Character key : split_writers.keySet() ) { + final Writer w = split_writers.get( key ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + if ( key != '_' ) { + addHtmlHead( w, "DCs (" + html_title + ") " + key.toString().toUpperCase() ); + } + else { + addHtmlHead( w, "DCs (" + html_title + ")" ); + } + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( html_desc.toString() ); + w.write( SurfacingConstants.NL ); + w.write( "
" ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "
" );
+                    w.write( SurfacingConstants.NL );
+                    if ( histo != null ) {
+                        w.write( histo.toStringBuffer( 20, '|', 40, 5 ).toString() );
+                        w.write( SurfacingConstants.NL );
+                    }
+                    w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + if ( stats.getN() > 1 ) { + w.write( "" ); + } + else { + w.write( "" ); + } + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + if ( stats.getN() > 1 ) { + w.write( "" ); + } + else { + w.write( "" ); + } + w.write( SurfacingConstants.NL ); + w.write( "
N: " + stats.getN() + "
Min: " + stats.getMin() + "
Max: " + stats.getMax() + "
Mean: " + stats.arithmeticMean() + "
SD: " + stats.sampleStandardDeviation() + "
SD: n/a
Median: " + stats.median() + "
Pearsonian skewness: " + stats.pearsonianSkewness() + "
Pearsonian skewness: n/a
" ); + w.write( SurfacingConstants.NL ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + } + break; + } + for( final Writer w : split_writers.values() ) { + w.write( SurfacingConstants.NL ); + } + for( final DomainSimilarity similarity : similarities ) { + if ( ( species_order != null ) && !species_order.isEmpty() ) { + ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order ); + } + if ( single_writer != null ) { + single_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map ).toString() ); + } + else { + Writer local_writer = split_writers.get( ( similarity.getDomainId().getId().charAt( 0 ) + "" ) + .toLowerCase().charAt( 0 ) ); + if ( local_writer == null ) { + local_writer = split_writers.get( '0' ); + } + local_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map ).toString() ); + } + for( final Writer w : split_writers.values() ) { + w.write( SurfacingConstants.NL ); + } + } + switch ( print_option ) { + case HTML: + for( final Writer w : split_writers.values() ) { + w.write( SurfacingConstants.NL ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + } + break; + } + for( final Writer w : split_writers.values() ) { + w.close(); + } + return stats; + } + + public static void writeMatrixToFile( final CharacterStateMatrix matrix, + final String filename, + final Format format ) { + final File outfile = new File( filename ); + checkForOutputFileWriteability( outfile ); + try { + final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); + matrix.toWriter( out, format ); + out.flush(); + out.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote matrix: \"" + filename + "\"" ); + } + + public static void writeMatrixToFile( final File matrix_outfile, final List matrices ) { + checkForOutputFileWriteability( matrix_outfile ); + try { + final BufferedWriter out = new BufferedWriter( new FileWriter( matrix_outfile ) ); + for( final DistanceMatrix distance_matrix : matrices ) { + out.write( distance_matrix.toStringBuffer( DistanceMatrix.Format.PHYLIP ).toString() ); + out.write( ForesterUtil.LINE_SEPARATOR ); + out.flush(); + } + out.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + matrix_outfile + "\"" ); + } + + public static void writePhylogenyToFile( final Phylogeny phylogeny, final String filename ) { + final PhylogenyWriter writer = new PhylogenyWriter(); + try { + writer.toPhyloXML( new File( filename ), phylogeny, 1 ); + } + catch ( final IOException e ) { + ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "failed to write phylogeny to \"" + filename + "\": " + + e ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote phylogeny to \"" + filename + "\"" ); + } + + public static void writeTaxonomyLinks( final Writer writer, + final String species, + final Map tax_code_to_id_map ) throws IOException { + if ( ( species.length() > 1 ) && ( species.indexOf( '_' ) < 1 ) ) { + writer.write( " [" ); + if ( ( tax_code_to_id_map != null ) && tax_code_to_id_map.containsKey( species ) ) { + writer.write( "uniprot" ); + } + else { + writer.write( "eol" ); + writer.write( "|" ); + writer.write( "scholar" ); + writer.write( "|" ); + writer.write( "google" ); + } + writer.write( "]" ); + } + } + + private final static void addToCountMap( final Map map, final String s ) { + if ( map.containsKey( s ) ) { + map.put( s, map.get( s ) + 1 ); + } + else { + map.put( s, 1 ); + } + } + + private static void calculateIndependentDomainCombinationGains( final Phylogeny local_phylogeny_l, + final String outfilename_for_counts, + final String outfilename_for_dc, + final String outfilename_for_dc_for_go_mapping, + final String outfilename_for_dc_for_go_mapping_unique, + final String outfilename_for_rank_counts, + final String outfilename_for_ancestor_species_counts, + final String outfilename_for_protein_stats, + final Map protein_length_stats_by_dc, + final Map domain_number_stats_by_dc, + final Map domain_length_stats_by_domain ) { + try { + // + // if ( protein_length_stats_by_dc != null ) { + // for( final Entry entry : protein_length_stats_by_dc.entrySet() ) { + // System.out.print( entry.getKey().toString() ); + // System.out.print( ": " ); + // double[] a = entry.getValue().getDataAsDoubleArray(); + // for( int i = 0; i < a.length; i++ ) { + // System.out.print( a[ i ] + " " ); + // } + // System.out.println(); + // } + // } + // if ( domain_number_stats_by_dc != null ) { + // for( final Entry entry : domain_number_stats_by_dc.entrySet() ) { + // System.out.print( entry.getKey().toString() ); + // System.out.print( ": " ); + // double[] a = entry.getValue().getDataAsDoubleArray(); + // for( int i = 0; i < a.length; i++ ) { + // System.out.print( a[ i ] + " " ); + // } + // System.out.println(); + // } + // } + // + final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) ); + final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) ); + final BufferedWriter out_dc_for_go_mapping = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping ) ); + final BufferedWriter out_dc_for_go_mapping_unique = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping_unique ) ); + final SortedMap dc_gain_counts = new TreeMap(); + for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorPostorder(); it.hasNext(); ) { + final PhylogenyNode n = it.next(); + final Set gained_dc = n.getNodeData().getBinaryCharacters().getGainedCharacters(); + for( final String dc : gained_dc ) { + if ( dc_gain_counts.containsKey( dc ) ) { + dc_gain_counts.put( dc, dc_gain_counts.get( dc ) + 1 ); + } + else { + dc_gain_counts.put( dc, 1 ); + } + } + } + final SortedMap histogram = new TreeMap(); + final SortedMap domain_lists = new TreeMap(); + final SortedMap dc_reapp_counts_to_protein_length_stats = new TreeMap(); + final SortedMap dc_reapp_counts_to_domain_number_stats = new TreeMap(); + final SortedMap dc_reapp_counts_to_domain_lengths_stats = new TreeMap(); + final SortedMap> domain_lists_go = new TreeMap>(); + final SortedMap> domain_lists_go_unique = new TreeMap>(); + final Set dcs = dc_gain_counts.keySet(); + final SortedSet more_than_once = new TreeSet(); + final DescriptiveStatistics gained_once_lengths_stats = new BasicDescriptiveStatistics(); + final DescriptiveStatistics gained_once_domain_count_stats = new BasicDescriptiveStatistics(); + final DescriptiveStatistics gained_multiple_times_lengths_stats = new BasicDescriptiveStatistics(); + final DescriptiveStatistics gained_multiple_times_domain_count_stats = new BasicDescriptiveStatistics(); + long gained_multiple_times_domain_length_sum = 0; + long gained_once_domain_length_sum = 0; + long gained_multiple_times_domain_length_count = 0; + long gained_once_domain_length_count = 0; + for( final String dc : dcs ) { + final int count = dc_gain_counts.get( dc ); + if ( histogram.containsKey( count ) ) { + histogram.put( count, histogram.get( count ) + 1 ); + domain_lists.get( count ).append( ", " + dc ); + domain_lists_go.get( count ).addAll( splitDomainCombination( dc ) ); + domain_lists_go_unique.get( count ).addAll( splitDomainCombination( dc ) ); + } + else { + histogram.put( count, 1 ); + domain_lists.put( count, new StringBuilder( dc ) ); + final PriorityQueue q = new PriorityQueue(); + q.addAll( splitDomainCombination( dc ) ); + domain_lists_go.put( count, q ); + final SortedSet set = new TreeSet(); + set.addAll( splitDomainCombination( dc ) ); + domain_lists_go_unique.put( count, set ); + } + if ( protein_length_stats_by_dc != null ) { + if ( !dc_reapp_counts_to_protein_length_stats.containsKey( count ) ) { + dc_reapp_counts_to_protein_length_stats.put( count, new BasicDescriptiveStatistics() ); + } + dc_reapp_counts_to_protein_length_stats.get( count ).addValue( protein_length_stats_by_dc.get( dc ) + .arithmeticMean() ); + } + if ( domain_number_stats_by_dc != null ) { + if ( !dc_reapp_counts_to_domain_number_stats.containsKey( count ) ) { + dc_reapp_counts_to_domain_number_stats.put( count, new BasicDescriptiveStatistics() ); + } + dc_reapp_counts_to_domain_number_stats.get( count ).addValue( domain_number_stats_by_dc.get( dc ) + .arithmeticMean() ); + } + if ( domain_length_stats_by_domain != null ) { + if ( !dc_reapp_counts_to_domain_lengths_stats.containsKey( count ) ) { + dc_reapp_counts_to_domain_lengths_stats.put( count, new BasicDescriptiveStatistics() ); + } + final String[] ds = dc.split( "=" ); + dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain + .get( ds[ 0 ] ).arithmeticMean() ); + dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain + .get( ds[ 1 ] ).arithmeticMean() ); + } + if ( count > 1 ) { + more_than_once.add( dc ); + if ( protein_length_stats_by_dc != null ) { + final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc ); + for( final double element : s.getData() ) { + gained_multiple_times_lengths_stats.addValue( element ); + } + } + if ( domain_number_stats_by_dc != null ) { + final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc ); + for( final double element : s.getData() ) { + gained_multiple_times_domain_count_stats.addValue( element ); + } + } + if ( domain_length_stats_by_domain != null ) { + final String[] ds = dc.split( "=" ); + final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] ); + final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] ); + for( final double element : s0.getData() ) { + gained_multiple_times_domain_length_sum += element; + ++gained_multiple_times_domain_length_count; + } + for( final double element : s1.getData() ) { + gained_multiple_times_domain_length_sum += element; + ++gained_multiple_times_domain_length_count; + } + } + } + else { + if ( protein_length_stats_by_dc != null ) { + final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc ); + for( final double element : s.getData() ) { + gained_once_lengths_stats.addValue( element ); + } + } + if ( domain_number_stats_by_dc != null ) { + final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc ); + for( final double element : s.getData() ) { + gained_once_domain_count_stats.addValue( element ); + } + } + if ( domain_length_stats_by_domain != null ) { + final String[] ds = dc.split( "=" ); + final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] ); + final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] ); + for( final double element : s0.getData() ) { + gained_once_domain_length_sum += element; + ++gained_once_domain_length_count; + } + for( final double element : s1.getData() ) { + gained_once_domain_length_sum += element; + ++gained_once_domain_length_count; + } + } + } + } + final Set histogram_keys = histogram.keySet(); + for( final Integer histogram_key : histogram_keys ) { + final int count = histogram.get( histogram_key ); + final StringBuilder dc = domain_lists.get( histogram_key ); + out_counts.write( histogram_key + "\t" + count + ForesterUtil.LINE_SEPARATOR ); + out_dc.write( histogram_key + "\t" + dc + ForesterUtil.LINE_SEPARATOR ); + out_dc_for_go_mapping.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR ); + final Object[] sorted = domain_lists_go.get( histogram_key ).toArray(); + Arrays.sort( sorted ); + for( final Object domain : sorted ) { + out_dc_for_go_mapping.write( domain + ForesterUtil.LINE_SEPARATOR ); + } + out_dc_for_go_mapping_unique.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR ); + for( final String domain : domain_lists_go_unique.get( histogram_key ) ) { + out_dc_for_go_mapping_unique.write( domain + ForesterUtil.LINE_SEPARATOR ); + } + } + out_counts.close(); + out_dc.close(); + out_dc_for_go_mapping.close(); + out_dc_for_go_mapping_unique.close(); + final SortedMap lca_rank_counts = new TreeMap(); + final SortedMap lca_ancestor_species_counts = new TreeMap(); + for( final String dc : more_than_once ) { + final List nodes = new ArrayList(); + for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorExternalForward(); it.hasNext(); ) { + final PhylogenyNode n = it.next(); + if ( n.getNodeData().getBinaryCharacters().getGainedCharacters().contains( dc ) ) { + nodes.add( n ); + } + } + for( int i = 0; i < ( nodes.size() - 1 ); ++i ) { + for( int j = i + 1; j < nodes.size(); ++j ) { + final PhylogenyNode lca = PhylogenyMethods.calculateLCA( nodes.get( i ), nodes.get( j ) ); + String rank = "unknown"; + if ( lca.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getRank() ) ) { + rank = lca.getNodeData().getTaxonomy().getRank(); + } + addToCountMap( lca_rank_counts, rank ); + String lca_species; + if ( lca.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getScientificName() ) ) { + lca_species = lca.getNodeData().getTaxonomy().getScientificName(); + } + else if ( lca.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getCommonName() ) ) { + lca_species = lca.getNodeData().getTaxonomy().getCommonName(); + } + else { + lca_species = lca.getName(); + } + addToCountMap( lca_ancestor_species_counts, lca_species ); + } + } + } + final BufferedWriter out_for_rank_counts = new BufferedWriter( new FileWriter( outfilename_for_rank_counts ) ); + final BufferedWriter out_for_ancestor_species_counts = new BufferedWriter( new FileWriter( outfilename_for_ancestor_species_counts ) ); + ForesterUtil.map2writer( out_for_rank_counts, lca_rank_counts, "\t", ForesterUtil.LINE_SEPARATOR ); + ForesterUtil.map2writer( out_for_ancestor_species_counts, + lca_ancestor_species_counts, + "\t", + ForesterUtil.LINE_SEPARATOR ); + out_for_rank_counts.close(); + out_for_ancestor_species_counts.close(); + if ( !ForesterUtil.isEmpty( outfilename_for_protein_stats ) + && ( ( domain_length_stats_by_domain != null ) || ( protein_length_stats_by_dc != null ) || ( domain_number_stats_by_dc != null ) ) ) { + final BufferedWriter w = new BufferedWriter( new FileWriter( outfilename_for_protein_stats ) ); + w.write( "Domain Lengths: " ); + w.write( "\n" ); + if ( domain_length_stats_by_domain != null ) { + for( final Entry entry : dc_reapp_counts_to_domain_lengths_stats + .entrySet() ) { + w.write( entry.getKey().toString() ); + w.write( "\t" + entry.getValue().arithmeticMean() ); + w.write( "\t" + entry.getValue().median() ); + w.write( "\n" ); + } + } + w.flush(); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Protein Lengths: " ); + w.write( "\n" ); + if ( protein_length_stats_by_dc != null ) { + for( final Entry entry : dc_reapp_counts_to_protein_length_stats + .entrySet() ) { + w.write( entry.getKey().toString() ); + w.write( "\t" + entry.getValue().arithmeticMean() ); + w.write( "\t" + entry.getValue().median() ); + w.write( "\n" ); + } + } + w.flush(); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Number of domains: " ); + w.write( "\n" ); + if ( domain_number_stats_by_dc != null ) { + for( final Entry entry : dc_reapp_counts_to_domain_number_stats + .entrySet() ) { + w.write( entry.getKey().toString() ); + w.write( "\t" + entry.getValue().arithmeticMean() ); + w.write( "\t" + entry.getValue().median() ); + w.write( "\n" ); + } + } + w.flush(); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained once, domain lengths:" ); + w.write( "\n" ); + w.write( "N: " + gained_once_domain_length_count ); + w.write( "\n" ); + w.write( "Avg: " + ( ( double ) gained_once_domain_length_sum / gained_once_domain_length_count ) ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained multiple times, domain lengths:" ); + w.write( "\n" ); + w.write( "N: " + gained_multiple_times_domain_length_count ); + w.write( "\n" ); + w.write( "Avg: " + + ( ( double ) gained_multiple_times_domain_length_sum / gained_multiple_times_domain_length_count ) ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained once, protein lengths:" ); + w.write( "\n" ); + w.write( gained_once_lengths_stats.toString() ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained once, domain counts:" ); + w.write( "\n" ); + w.write( gained_once_domain_count_stats.toString() ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained multiple times, protein lengths:" ); + w.write( "\n" ); + w.write( gained_multiple_times_lengths_stats.toString() ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained multiple times, domain counts:" ); + w.write( "\n" ); + w.write( gained_multiple_times_domain_count_stats.toString() ); + w.flush(); + w.close(); + } + } + catch ( final IOException e ) { + ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch counts to [" + + outfilename_for_counts + "]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch lists to [" + + outfilename_for_dc + "]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, + "Wrote independent domain combination gains fitch lists to (for GO mapping) [" + + outfilename_for_dc_for_go_mapping + "]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, + "Wrote independent domain combination gains fitch lists to (for GO mapping, unique) [" + + outfilename_for_dc_for_go_mapping_unique + "]" ); + } + + private static SortedSet collectAllDomainsChangedOnSubtree( final PhylogenyNode subtree_root, + final boolean get_gains ) { + final SortedSet domains = new TreeSet(); + for( final PhylogenyNode descendant : PhylogenyMethods.getAllDescendants( subtree_root ) ) { + final BinaryCharacters chars = descendant.getNodeData().getBinaryCharacters(); + if ( get_gains ) { + domains.addAll( chars.getGainedCharacters() ); + } + else { + domains.addAll( chars.getLostCharacters() ); + } + } + return domains; + } + + private static File createBaseDirForPerNodeDomainFiles( final String base_dir, + final boolean domain_combinations, + final CharacterStateMatrix.GainLossStates state, + final String outfile ) { + File per_node_go_mapped_domain_gain_loss_files_base_dir = new File( new File( outfile ).getParent() + + ForesterUtil.FILE_SEPARATOR + base_dir ); + if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { + per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); + } + if ( domain_combinations ) { + per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + "DC" ); + } + else { + per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + "DOMAINS" ); + } + if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { + per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); + } + if ( state == GainLossStates.GAIN ) { + per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + "GAINS" ); + } + else if ( state == GainLossStates.LOSS ) { + per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + "LOSSES" ); + } + else { + per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + "PRESENT" ); + } + if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { + per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); + } + return per_node_go_mapped_domain_gain_loss_files_base_dir; + } + + private static SortedSet createSetOfAllBinaryDomainCombinationsPerGenome( final GenomeWideCombinableDomains gwcd ) { + final SortedMap cds = gwcd.getAllCombinableDomainsIds(); + final SortedSet binary_combinations = new TreeSet(); + for( final DomainId domain_id : cds.keySet() ) { + final CombinableDomains cd = cds.get( domain_id ); + binary_combinations.addAll( cd.toBinaryDomainCombinations() ); + } + return binary_combinations; + } + + private static List splitDomainCombination( final String dc ) { + final String[] s = dc.split( "=" ); + if ( s.length != 2 ) { + ForesterUtil.printErrorMessage( surfacing.PRG_NAME, "Stringyfied domain combination has illegal format: " + + dc ); + System.exit( -1 ); + } + final List l = new ArrayList( 2 ); + l.add( s[ 0 ] ); + l.add( s[ 1 ] ); + return l; + } + + private static void writeAllEncounteredPfamsToFile( final Map> domain_id_to_go_ids_map, + final Map go_id_to_term_map, + final String outfile_name, + final SortedSet all_pfams_encountered ) { + final File all_pfams_encountered_file = new File( outfile_name + surfacing.ALL_PFAMS_ENCOUNTERED_SUFFIX ); + final File all_pfams_encountered_with_go_annotation_file = new File( outfile_name + + surfacing.ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX ); + final File encountered_pfams_summary_file = new File( outfile_name + surfacing.ENCOUNTERED_PFAMS_SUMMARY_SUFFIX ); + int biological_process_counter = 0; + int cellular_component_counter = 0; + int molecular_function_counter = 0; + int pfams_with_mappings_counter = 0; + int pfams_without_mappings_counter = 0; + int pfams_without_mappings_to_bp_or_mf_counter = 0; + int pfams_with_mappings_to_bp_or_mf_counter = 0; + try { + final Writer all_pfams_encountered_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_file ) ); + final Writer all_pfams_encountered_with_go_annotation_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_with_go_annotation_file ) ); + final Writer summary_writer = new BufferedWriter( new FileWriter( encountered_pfams_summary_file ) ); + summary_writer.write( "# Pfam to GO mapping summary" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Actual summary is at the end of this file." ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Encountered Pfams without a GO mapping:" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + for( final String pfam : all_pfams_encountered ) { + all_pfams_encountered_writer.write( pfam ); + all_pfams_encountered_writer.write( ForesterUtil.LINE_SEPARATOR ); + final DomainId domain_id = new DomainId( pfam ); + if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) { + ++pfams_with_mappings_counter; + all_pfams_encountered_with_go_annotation_writer.write( pfam ); + all_pfams_encountered_with_go_annotation_writer.write( ForesterUtil.LINE_SEPARATOR ); + final List go_ids = domain_id_to_go_ids_map.get( domain_id ); + boolean maps_to_bp = false; + boolean maps_to_cc = false; + boolean maps_to_mf = false; + for( final GoId go_id : go_ids ) { + final GoTerm go_term = go_id_to_term_map.get( go_id ); + if ( go_term.getGoNameSpace().isBiologicalProcess() ) { + maps_to_bp = true; + } + else if ( go_term.getGoNameSpace().isCellularComponent() ) { + maps_to_cc = true; + } + else if ( go_term.getGoNameSpace().isMolecularFunction() ) { + maps_to_mf = true; + } + } + if ( maps_to_bp ) { + ++biological_process_counter; + } + if ( maps_to_cc ) { + ++cellular_component_counter; + } + if ( maps_to_mf ) { + ++molecular_function_counter; + } + if ( maps_to_bp || maps_to_mf ) { + ++pfams_with_mappings_to_bp_or_mf_counter; + } + else { + ++pfams_without_mappings_to_bp_or_mf_counter; + } + } + else { + ++pfams_without_mappings_to_bp_or_mf_counter; + ++pfams_without_mappings_counter; + summary_writer.write( pfam ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + } + } + all_pfams_encountered_writer.close(); + all_pfams_encountered_with_go_annotation_writer.close(); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + all_pfams_encountered.size() + + "] encountered Pfams to: \"" + all_pfams_encountered_file + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + pfams_with_mappings_counter + + "] encountered Pfams with GO mappings to: \"" + all_pfams_encountered_with_go_annotation_file + + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote summary (including all [" + + pfams_without_mappings_counter + "] encountered Pfams without GO mappings) to: \"" + + encountered_pfams_summary_file + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Sum of Pfams encountered : " + + all_pfams_encountered.size() ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without a mapping : " + + pfams_without_mappings_counter + " [" + + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without mapping to proc. or func. : " + + pfams_without_mappings_to_bp_or_mf_counter + " [" + + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping : " + + pfams_with_mappings_counter + " [" + + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping to proc. or func. : " + + pfams_with_mappings_to_bp_or_mf_counter + " [" + + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to biological process: " + + biological_process_counter + " [" + + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to molecular function: " + + molecular_function_counter + " [" + + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to cellular component: " + + cellular_component_counter + " [" + + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Sum of Pfams encountered : " + all_pfams_encountered.size() ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams without a mapping : " + pfams_without_mappings_counter + + " [" + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams without mapping to proc. or func. : " + + pfams_without_mappings_to_bp_or_mf_counter + " [" + + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams with a mapping : " + pfams_with_mappings_counter + " [" + + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams with a mapping to proc. or func. : " + + pfams_with_mappings_to_bp_or_mf_counter + " [" + + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams with mapping to biological process: " + biological_process_counter + " [" + + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams with mapping to molecular function: " + molecular_function_counter + " [" + + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams with mapping to cellular component: " + cellular_component_counter + " [" + + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.close(); + } + catch ( final IOException e ) { + ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); } } @@ -2150,332 +2584,70 @@ public final class SurfacingUtil { else { out.write( "" ); } - if ( !go_id_to_term_map.containsKey( go_id ) ) { - throw new IllegalArgumentException( "GO-id [" + go_id + "] not found in GO-id to GO-term map" ); - } - final GoTerm go_term = go_id_to_term_map.get( go_id ); - if ( ( go_namespace_limit == null ) || go_namespace_limit.equals( go_term.getGoNameSpace() ) ) { - // final String top = GoUtils.getPenultimateGoTerm( go_term, go_id_to_term_map ).getName(); - final String go_id_str = go_id.getId(); - out.write( "" ); - out.write( "" + go_id_str + "" ); - out.write( "" ); - out.write( go_term.getName() ); - if ( domain_count == 2 ) { - out.write( " (" + d + ")" ); - } - out.write( "" ); - // out.write( top ); - // out.write( "" ); - out.write( "[" ); - out.write( go_term.getGoNameSpace().toShortString() ); - out.write( "]" ); - out.write( "" ); - if ( all_go_ids != null ) { - all_go_ids.add( go_id ); - } - } - else { - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( "" ); - } - out.write( "" ); - out.write( SurfacingConstants.NL ); - } - } - } // for( int d = 0; d < domain_count; ++d ) - if ( !any_go_annotation_present ) { - out.write( "" ); - writeDomainIdsToHtml( out, domain_0, domain_1, prefix_for_html, domain_id_to_secondary_features_maps ); - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - } - } - - private static void writeDomainIdsToHtml( final Writer out, - final String domain_0, - final String domain_1, - final String prefix_for_detailed_html, - final Map>[] domain_id_to_secondary_features_maps ) - throws IOException { - out.write( "" ); - if ( !ForesterUtil.isEmpty( prefix_for_detailed_html ) ) { - out.write( prefix_for_detailed_html ); - out.write( " " ); - } - out.write( "" + domain_0 + "" ); - out.write( "" ); - } - - public static DescriptiveStatistics writeDomainSimilaritiesToFile( final StringBuilder html_desc, - final StringBuilder html_title, - final Writer single_writer, - Map split_writers, - final SortedSet similarities, - final boolean treat_as_binary, - final List species_order, - final PrintableDomainSimilarity.PRINT_OPTION print_option, - final DomainSimilarity.DomainSimilaritySortField sort_field, - final DomainSimilarity.DomainSimilarityScoring scoring, - final boolean verbose ) throws IOException { - final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); - String histogram_title = null; - switch ( sort_field ) { - case ABS_MAX_COUNTS_DIFFERENCE: - if ( treat_as_binary ) { - histogram_title = "absolute counts difference:"; - } - else { - histogram_title = "absolute (maximal) counts difference:"; - } - break; - case MAX_COUNTS_DIFFERENCE: - if ( treat_as_binary ) { - histogram_title = "counts difference:"; - } - else { - histogram_title = "(maximal) counts difference:"; - } - break; - case DOMAIN_ID: - histogram_title = "score mean:"; - break; - case MIN: - histogram_title = "score minimum:"; - break; - case MAX: - histogram_title = "score maximum:"; - break; - case MAX_DIFFERENCE: - if ( treat_as_binary ) { - histogram_title = "difference:"; - } - else { - histogram_title = "(maximal) difference:"; - } - break; - case MEAN: - histogram_title = "score mean:"; - break; - case SD: - histogram_title = "score standard deviation:"; - break; - case SPECIES_COUNT: - histogram_title = "species number:"; - break; - default: - throw new AssertionError( "Unknown sort field: " + sort_field ); - } - for( final DomainSimilarity similarity : similarities ) { - switch ( sort_field ) { - case ABS_MAX_COUNTS_DIFFERENCE: - stats.addValue( Math.abs( similarity.getMaximalDifferenceInCounts() ) ); - break; - case MAX_COUNTS_DIFFERENCE: - stats.addValue( similarity.getMaximalDifferenceInCounts() ); - break; - case DOMAIN_ID: - stats.addValue( similarity.getMeanSimilarityScore() ); - break; - case MIN: - stats.addValue( similarity.getMinimalSimilarityScore() ); - break; - case MAX: - stats.addValue( similarity.getMaximalSimilarityScore() ); - break; - case MAX_DIFFERENCE: - stats.addValue( similarity.getMaximalDifference() ); - break; - case MEAN: - stats.addValue( similarity.getMeanSimilarityScore() ); - break; - case SD: - stats.addValue( similarity.getStandardDeviationOfSimilarityScore() ); - break; - case SPECIES_COUNT: - stats.addValue( similarity.getSpecies().size() ); - break; - default: - throw new AssertionError( "Unknown sort field: " + sort_field ); - } - } - // - // final HistogramData[] hists = new HistogramData[ 1 ]; - // - // - // List data_items = new - // ArrayList(); - // double[] values = stats.getDataAsDoubleArray(); - // for( int i = 0; i < values.length; i++ ) { - // HistogramDataItem data_item = new BasicHistogramDataItem( "", values[ - // i ] ); - // data_items.add( data_item ); - // } - // - // - // HistogramData hd0 = new HistogramData( "name", - // data_items, - // null, 20, - // 40 ); - // - // - // - // - // hists[ 0 ] = hd0; - // - // final HistogramsFrame hf = new HistogramsFrame( hists ); - // hf.setVisible( true ); - // - AsciiHistogram histo = null; - if ( stats.getMin() < stats.getMin() ) { - histo = new AsciiHistogram( stats, histogram_title ); - } - if ( verbose ) { - if ( histo != null ) { - System.out.println( histo.toStringBuffer( 20, '|', 40, 5 ) ); - } - System.out.println(); - System.out.println( "N : " + stats.getN() ); - System.out.println( "Min : " + stats.getMin() ); - System.out.println( "Max : " + stats.getMax() ); - System.out.println( "Mean : " + stats.arithmeticMean() ); - if ( stats.getN() > 1 ) { - System.out.println( "SD : " + stats.sampleStandardDeviation() ); - } - else { - System.out.println( "SD : n/a" ); - } - System.out.println( "Median : " + stats.median() ); - if ( stats.getN() > 1 ) { - System.out.println( "Pearsonian skewness : " + stats.pearsonianSkewness() ); - } - else { - System.out.println( "Pearsonian skewness : n/a" ); - } - } - if ( ( single_writer != null ) && ( ( split_writers == null ) || split_writers.isEmpty() ) ) { - split_writers = new HashMap(); - split_writers.put( '_', single_writer ); - } - switch ( print_option ) { - case SIMPLE_TAB_DELIMITED: - break; - case HTML: - for( final Character key : split_writers.keySet() ) { - final Writer w = split_writers.get( key ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - if ( key != '_' ) { - addHtmlHead( w, "DCs (" + html_title + ") " + key.toString().toUpperCase() ); - } - else { - addHtmlHead( w, "DCs (" + html_title + ")" ); - } - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( html_desc.toString() ); - w.write( SurfacingConstants.NL ); - w.write( "
" ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "
" );
-                    w.write( SurfacingConstants.NL );
-                    if ( histo != null ) {
-                        w.write( histo.toStringBuffer( 20, '|', 40, 5 ).toString() );
-                        w.write( SurfacingConstants.NL );
-                    }
-                    w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - if ( stats.getN() > 1 ) { - w.write( "" ); - } - else { - w.write( "" ); + if ( !go_id_to_term_map.containsKey( go_id ) ) { + throw new IllegalArgumentException( "GO-id [" + go_id + "] not found in GO-id to GO-term map" ); } - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - if ( stats.getN() > 1 ) { - w.write( "" ); + final GoTerm go_term = go_id_to_term_map.get( go_id ); + if ( ( go_namespace_limit == null ) || go_namespace_limit.equals( go_term.getGoNameSpace() ) ) { + // final String top = GoUtils.getPenultimateGoTerm( go_term, go_id_to_term_map ).getName(); + final String go_id_str = go_id.getId(); + out.write( "" ); + if ( all_go_ids != null ) { + all_go_ids.add( go_id ); + } } else { - w.write( "" ); + out.write( "" ); } - w.write( SurfacingConstants.NL ); - w.write( "
N: " + stats.getN() + "
Min: " + stats.getMin() + "
Max: " + stats.getMax() + "
Mean: " + stats.arithmeticMean() + "
SD: " + stats.sampleStandardDeviation() + "
SD: n/a
Median: " + stats.median() + "
Pearsonian skewness: " + stats.pearsonianSkewness() + "
" ); + out.write( "" + go_id_str + "" ); + out.write( "" ); + out.write( go_term.getName() ); + if ( domain_count == 2 ) { + out.write( " (" + d + ")" ); + } + out.write( "" ); + // out.write( top ); + // out.write( "" ); + out.write( "[" ); + out.write( go_term.getGoNameSpace().toShortString() ); + out.write( "]" ); + out.write( "
Pearsonian skewness: n/a
" ); + out.write( "" ); + out.write( "" ); + out.write( "" ); + out.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - } - break; - } - for( final Writer w : split_writers.values() ) { - w.write( SurfacingConstants.NL ); - } - for( final DomainSimilarity similarity : similarities ) { - if ( ( species_order != null ) && !species_order.isEmpty() ) { - ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order ); - } - if ( single_writer != null ) { - single_writer.write( similarity.toStringBuffer( print_option ).toString() ); - } - else { - Writer local_writer = split_writers.get( ( similarity.getDomainId().getId().charAt( 0 ) + "" ) - .toLowerCase().charAt( 0 ) ); - if ( local_writer == null ) { - local_writer = split_writers.get( '0' ); + out.write( "" ); + out.write( SurfacingConstants.NL ); } - local_writer.write( similarity.toStringBuffer( print_option ).toString() ); - } - for( final Writer w : split_writers.values() ) { - w.write( SurfacingConstants.NL ); } + } // for( int d = 0; d < domain_count; ++d ) + if ( !any_go_annotation_present ) { + out.write( "" ); + writeDomainIdsToHtml( out, domain_0, domain_1, prefix_for_html, domain_id_to_secondary_features_maps ); + out.write( "" ); + out.write( "" ); + out.write( SurfacingConstants.NL ); } - switch ( print_option ) { - case HTML: - for( final Writer w : split_writers.values() ) { - w.write( SurfacingConstants.NL ); - w.write( "
" ); + out.write( "" ); + out.write( "" ); + out.write( "" ); + out.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - } - break; - } - for( final Writer w : split_writers.values() ) { - w.close(); + } + + private static void writeDomainIdsToHtml( final Writer out, + final String domain_0, + final String domain_1, + final String prefix_for_detailed_html, + final Map>[] domain_id_to_secondary_features_maps ) + throws IOException { + out.write( "" ); + if ( !ForesterUtil.isEmpty( prefix_for_detailed_html ) ) { + out.write( prefix_for_detailed_html ); + out.write( " " ); } - return stats; + out.write( "" + domain_0 + "" ); + out.write( "" ); } private static void writeDomainsToIndividualFilePerTreeNode( final Writer individual_files_writer, @@ -2489,40 +2661,6 @@ public final class SurfacingUtil { } } - public static void writeMatrixToFile( final CharacterStateMatrix matrix, - final String filename, - final Format format ) { - final File outfile = new File( filename ); - checkForOutputFileWriteability( outfile ); - try { - final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); - matrix.toWriter( out, format ); - out.flush(); - out.close(); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); - } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote matrix: \"" + filename + "\"" ); - } - - public static void writeMatrixToFile( final File matrix_outfile, final List matrices ) { - checkForOutputFileWriteability( matrix_outfile ); - try { - final BufferedWriter out = new BufferedWriter( new FileWriter( matrix_outfile ) ); - for( final DistanceMatrix distance_matrix : matrices ) { - out.write( distance_matrix.toStringBuffer( DistanceMatrix.Format.PHYLIP ).toString() ); - out.write( ForesterUtil.LINE_SEPARATOR ); - out.flush(); - } - out.close(); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); - } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + matrix_outfile + "\"" ); - } - private static void writePfamsToFile( final String outfile_name, final SortedSet pfams ) { try { final Writer writer = new BufferedWriter( new FileWriter( new File( outfile_name ) ) ); @@ -2539,37 +2677,6 @@ public final class SurfacingUtil { } } - public static void writePhylogenyToFile( final Phylogeny phylogeny, final String filename ) { - final PhylogenyWriter writer = new PhylogenyWriter(); - try { - writer.toPhyloXML( new File( filename ), phylogeny, 1 ); - } - catch ( final IOException e ) { - ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "failed to write phylogeny to \"" + filename + "\": " - + e ); - } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote phylogeny to \"" + filename + "\"" ); - } - - public static void writeTaxonomyLinks( final Writer writer, final String species ) throws IOException { - if ( ( species.length() > 1 ) && ( species.indexOf( '_' ) < 1 ) ) { - final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( species ); - writer.write( " [" ); - if ( matcher.matches() ) { - writer.write( "uniprot" ); - } - else { - writer.write( "eol" ); - writer.write( "|" ); - writer.write( "tol" ); - } - writer.write( "]" ); - } - } - private static void writeToNexus( final String outfile_name, final CharacterStateMatrix matrix, final Phylogeny phylogeny ) { @@ -2607,91 +2714,6 @@ public final class SurfacingUtil { phylogeny ); } - public static void domainsPerProteinsStatistics( final String genome, - final List protein_list, - final DescriptiveStatistics all_genomes_domains_per_potein_stats, - final SortedMap all_genomes_domains_per_potein_histo, - final SortedSet domains_which_are_always_single, - final SortedSet domains_which_are_sometimes_single_sometimes_not, - final SortedSet domains_which_never_single, - final Writer writer ) { - final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); - for( final Protein protein : protein_list ) { - final int domains = protein.getNumberOfProteinDomains(); - //System.out.println( domains ); - stats.addValue( domains ); - all_genomes_domains_per_potein_stats.addValue( domains ); - if ( !all_genomes_domains_per_potein_histo.containsKey( domains ) ) { - all_genomes_domains_per_potein_histo.put( domains, 1 ); - } - else { - all_genomes_domains_per_potein_histo.put( domains, - 1 + all_genomes_domains_per_potein_histo.get( domains ) ); - } - if ( domains == 1 ) { - final String domain = protein.getProteinDomain( 0 ).getDomainId().getId(); - if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) { - if ( domains_which_never_single.contains( domain ) ) { - domains_which_never_single.remove( domain ); - domains_which_are_sometimes_single_sometimes_not.add( domain ); - } - else { - domains_which_are_always_single.add( domain ); - } - } - } - else if ( domains > 1 ) { - for( final Domain d : protein.getProteinDomains() ) { - final String domain = d.getDomainId().getId(); - // System.out.println( domain ); - if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) { - if ( domains_which_are_always_single.contains( domain ) ) { - domains_which_are_always_single.remove( domain ); - domains_which_are_sometimes_single_sometimes_not.add( domain ); - } - else { - domains_which_never_single.add( domain ); - } - } - } - } - } - try { - writer.write( genome ); - writer.write( "\t" ); - if ( stats.getN() >= 1 ) { - writer.write( stats.arithmeticMean() + "" ); - writer.write( "\t" ); - if ( stats.getN() >= 2 ) { - writer.write( stats.sampleStandardDeviation() + "" ); - } - else { - writer.write( "" ); - } - writer.write( "\t" ); - writer.write( stats.median() + "" ); - writer.write( "\t" ); - writer.write( stats.getN() + "" ); - writer.write( "\t" ); - writer.write( stats.getMin() + "" ); - writer.write( "\t" ); - writer.write( stats.getMax() + "" ); - } - else { - writer.write( "\t" ); - writer.write( "\t" ); - writer.write( "\t" ); - writer.write( "0" ); - writer.write( "\t" ); - writer.write( "\t" ); - } - writer.write( "\n" ); - } - catch ( final IOException e ) { - e.printStackTrace(); - } - } - final static class DomainComparator implements Comparator { final private boolean _ascending; -- 1.7.10.2