X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fsurfacing%2FSurfacingUtil.java;h=ea51d190da7ae4d2cc19980eabf460ac71157233;hb=e572f0142daf64409db0461f15215288137603f8;hp=599fd73c30b52856ef3ed6df734a132e09b0ebed;hpb=0fc3bc32fc5be907e3f91a780af68c6baff79db1;p=jalview.git diff --git a/forester/java/src/org/forester/surfacing/SurfacingUtil.java b/forester/java/src/org/forester/surfacing/SurfacingUtil.java index 599fd73..ea51d19 100644 --- a/forester/java/src/org/forester/surfacing/SurfacingUtil.java +++ b/forester/java/src/org/forester/surfacing/SurfacingUtil.java @@ -26,6 +26,7 @@ package org.forester.surfacing; +import java.awt.Color; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; @@ -84,9 +85,9 @@ import org.forester.protein.BinaryDomainCombination; import org.forester.protein.Domain; import org.forester.protein.Protein; import org.forester.species.Species; +import org.forester.surfacing.DomainSimilarity.PRINT_OPTION; import org.forester.surfacing.DomainSimilarityCalculator.Detailedness; import org.forester.surfacing.GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder; -import org.forester.surfacing.PrintableDomainSimilarity.PRINT_OPTION; import org.forester.util.AsciiHistogram; import org.forester.util.BasicDescriptiveStatistics; import org.forester.util.BasicTable; @@ -94,49 +95,36 @@ import org.forester.util.BasicTableParser; import org.forester.util.CommandLineArguments; import org.forester.util.DescriptiveStatistics; import org.forester.util.ForesterUtil; +import org.forester.util.TaxonomyColors; public final class SurfacingUtil { - final static class DomainComparator implements Comparator { - - final private boolean _ascending; - - public DomainComparator( final boolean ascending ) { - _ascending = ascending; - } + public final static Pattern PATTERN_SP_STYLE_TAXONOMY = Pattern.compile( "^[A-Z0-9]{3,5}$" ); + private final static Map _TAXCODE_HEXCOLORSTRING_MAP = new HashMap(); + private final static Map _TAXCODE_TAXGROUP_MAP = new HashMap(); + private static final Comparator ASCENDING_CONFIDENCE_VALUE_ORDER = new Comparator() { @Override - public final int compare( final Domain d0, final Domain d1 ) { - if ( d0.getFrom() < d1.getFrom() ) { - return _ascending ? -1 : 1; + public int compare( final Domain d1, + final Domain d2 ) { + if ( d1.getPerDomainEvalue() < d2 + .getPerDomainEvalue() ) { + return -1; } - else if ( d0.getFrom() > d1.getFrom() ) { - return _ascending ? 1 : -1; + else if ( d1.getPerDomainEvalue() > d2 + .getPerDomainEvalue() ) { + return 1; + } + else { + return d1.compareTo( d2 ); } - return 0; } + }; + private final static NumberFormat FORMATTER_3 = new DecimalFormat( "0.000" ); + + private SurfacingUtil() { + // Hidden constructor. 
} - private final static NumberFormat FORMATTER_3 = new DecimalFormat( "0.000" ); - private static final Comparator ASCENDING_CONFIDENCE_VALUE_ORDER = new Comparator() { - - @Override - public int compare( final Domain d1, - final Domain d2 ) { - if ( d1.getPerSequenceEvalue() < d2 - .getPerSequenceEvalue() ) { - return -1; - } - else if ( d1 - .getPerSequenceEvalue() > d2 - .getPerSequenceEvalue() ) { - return 1; - } - else { - return d1.compareTo( d2 ); - } - } - }; - public final static Pattern PATTERN_SP_STYLE_TAXONOMY = Pattern.compile( "^[A-Z0-9]{3,5}$" ); public static void addAllBinaryDomainCombinationToSet( final GenomeWideCombinableDomains genome, final SortedSet binary_domain_combinations ) { @@ -154,63 +142,6 @@ public final class SurfacingUtil { } } - public static void addHtmlHead( final Writer w, final String title ) throws IOException { - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( "" ); - w.write( title ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - } - - private final static void addToCountMap( final Map map, final String s ) { - if ( map.containsKey( s ) ) { - map.put( s, map.get( s ) + 1 ); - } - else { - map.put( s, 1 ); - } - } - public static DescriptiveStatistics calculateDescriptiveStatisticsForMeanValues( final Set similarities ) { final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); for( final DomainSimilarity similarity : similarities ) { @@ -219,501 +150,115 @@ public final class SurfacingUtil { return stats; } - private static void calculateIndependentDomainCombinationGains( final Phylogeny local_phylogeny_l, - final String outfilename_for_counts, - final String outfilename_for_dc, - final String outfilename_for_dc_for_go_mapping, - final String outfilename_for_dc_for_go_mapping_unique, - final String outfilename_for_rank_counts, - final String outfilename_for_ancestor_species_counts, - final String outfilename_for_protein_stats, - final Map protein_length_stats_by_dc, - final Map domain_number_stats_by_dc, - final Map domain_length_stats_by_domain ) { - try { - // - // if ( protein_length_stats_by_dc != null ) { - // for( final Entry entry : protein_length_stats_by_dc.entrySet() ) { - // System.out.print( entry.getKey().toString() ); - // System.out.print( ": " ); - // double[] a = entry.getValue().getDataAsDoubleArray(); - // for( int i = 0; i < a.length; i++ ) { - // System.out.print( a[ i ] + " " ); - // } - // System.out.println(); - // } - // } - // if ( domain_number_stats_by_dc != null ) { - // for( final Entry entry : domain_number_stats_by_dc.entrySet() ) { - // System.out.print( entry.getKey().toString() ); - // System.out.print( ": " ); - // double[] a = entry.getValue().getDataAsDoubleArray(); - // for( int i = 0; i < a.length; i++ ) { - // System.out.print( a[ i ] + " " ); - // } - // System.out.println(); - // } - // } - // - final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) ); - final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) ); - final BufferedWriter out_dc_for_go_mapping = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping ) ); - final BufferedWriter out_dc_for_go_mapping_unique = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping_unique ) ); - final SortedMap dc_gain_counts = new TreeMap(); - for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorPostorder(); 
it.hasNext(); ) { - final PhylogenyNode n = it.next(); - final Set gained_dc = n.getNodeData().getBinaryCharacters().getGainedCharacters(); - for( final String dc : gained_dc ) { - if ( dc_gain_counts.containsKey( dc ) ) { - dc_gain_counts.put( dc, dc_gain_counts.get( dc ) + 1 ); - } - else { - dc_gain_counts.put( dc, 1 ); - } - } - } - final SortedMap histogram = new TreeMap(); - final SortedMap domain_lists = new TreeMap(); - final SortedMap dc_reapp_counts_to_protein_length_stats = new TreeMap(); - final SortedMap dc_reapp_counts_to_domain_number_stats = new TreeMap(); - final SortedMap dc_reapp_counts_to_domain_lengths_stats = new TreeMap(); - final SortedMap> domain_lists_go = new TreeMap>(); - final SortedMap> domain_lists_go_unique = new TreeMap>(); - final Set dcs = dc_gain_counts.keySet(); - final SortedSet more_than_once = new TreeSet(); - DescriptiveStatistics gained_once_lengths_stats = new BasicDescriptiveStatistics(); - DescriptiveStatistics gained_once_domain_count_stats = new BasicDescriptiveStatistics(); - DescriptiveStatistics gained_multiple_times_lengths_stats = new BasicDescriptiveStatistics(); - final DescriptiveStatistics gained_multiple_times_domain_count_stats = new BasicDescriptiveStatistics(); - long gained_multiple_times_domain_length_sum = 0; - long gained_once_domain_length_sum = 0; - long gained_multiple_times_domain_length_count = 0; - long gained_once_domain_length_count = 0; - for( final String dc : dcs ) { - final int count = dc_gain_counts.get( dc ); - if ( histogram.containsKey( count ) ) { - histogram.put( count, histogram.get( count ) + 1 ); - domain_lists.get( count ).append( ", " + dc ); - domain_lists_go.get( count ).addAll( splitDomainCombination( dc ) ); - domain_lists_go_unique.get( count ).addAll( splitDomainCombination( dc ) ); - } - else { - histogram.put( count, 1 ); - domain_lists.put( count, new StringBuilder( dc ) ); - final PriorityQueue q = new PriorityQueue(); - q.addAll( splitDomainCombination( dc ) ); - domain_lists_go.put( count, q ); - final SortedSet set = new TreeSet(); - set.addAll( splitDomainCombination( dc ) ); - domain_lists_go_unique.put( count, set ); - } - if ( protein_length_stats_by_dc != null ) { - if ( !dc_reapp_counts_to_protein_length_stats.containsKey( count ) ) { - dc_reapp_counts_to_protein_length_stats.put( count, new BasicDescriptiveStatistics() ); - } - dc_reapp_counts_to_protein_length_stats.get( count ).addValue( protein_length_stats_by_dc.get( dc ) - .arithmeticMean() ); - } - if ( domain_number_stats_by_dc != null ) { - if ( !dc_reapp_counts_to_domain_number_stats.containsKey( count ) ) { - dc_reapp_counts_to_domain_number_stats.put( count, new BasicDescriptiveStatistics() ); - } - dc_reapp_counts_to_domain_number_stats.get( count ).addValue( domain_number_stats_by_dc.get( dc ) - .arithmeticMean() ); + public static void checkForOutputFileWriteability( final File outfile ) { + final String error = ForesterUtil.isWritableFile( outfile ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, error ); + } + } + + public static void checkWriteabilityForPairwiseComparisons( final DomainSimilarity.PRINT_OPTION domain_similarity_print_option, + final String[][] input_file_properties, + final String automated_pairwise_comparison_suffix, + final File outdir ) { + for( int i = 0; i < input_file_properties.length; ++i ) { + for( int j = 0; j < i; ++j ) { + final String species_i = input_file_properties[ i ][ 1 ]; + final String species_j = input_file_properties[ j ][ 1 ]; + String 
pairwise_similarities_output_file_str = surfacing.PAIRWISE_DOMAIN_COMPARISONS_PREFIX + species_i + + "_" + species_j + automated_pairwise_comparison_suffix; + switch ( domain_similarity_print_option ) { + case HTML: + if ( !pairwise_similarities_output_file_str.endsWith( ".html" ) ) { + pairwise_similarities_output_file_str += ".html"; + } + break; } - if ( domain_length_stats_by_domain != null ) { - if ( !dc_reapp_counts_to_domain_lengths_stats.containsKey( count ) ) { - dc_reapp_counts_to_domain_lengths_stats.put( count, new BasicDescriptiveStatistics() ); - } - final String[] ds = dc.split( "=" ); - dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain - .get( ds[ 0 ] ).arithmeticMean() ); - dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain - .get( ds[ 1 ] ).arithmeticMean() ); + final String error = ForesterUtil + .isWritableFile( new File( outdir == null ? pairwise_similarities_output_file_str : outdir + + ForesterUtil.FILE_SEPARATOR + pairwise_similarities_output_file_str ) ); + if ( !ForesterUtil.isEmpty( error ) ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, error ); } - if ( count > 1 ) { - more_than_once.add( dc ); - if ( protein_length_stats_by_dc != null ) { - final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc ); - for( final double element : s.getData() ) { - gained_multiple_times_lengths_stats.addValue( element ); - } + } + } + } + + public static void collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( final CharacterStateMatrix matrix, + final BinaryDomainCombination.DomainCombinationType dc_type, + final List all_binary_domains_combination_gained, + final boolean get_gains ) { + final SortedSet sorted_ids = new TreeSet(); + for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { + sorted_ids.add( matrix.getIdentifier( i ) ); + } + for( final String id : sorted_ids ) { + for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { + if ( ( get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) + || ( !get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.LOSS ) ) ) { + if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) { + all_binary_domains_combination_gained.add( AdjactantDirectedBinaryDomainCombination + .obtainInstance( matrix.getCharacter( c ) ) ); } - if ( domain_number_stats_by_dc != null ) { - final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc ); - for( final double element : s.getData() ) { - gained_multiple_times_domain_count_stats.addValue( element ); - } + else if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED ) { + all_binary_domains_combination_gained.add( DirectedBinaryDomainCombination + .obtainInstance( matrix.getCharacter( c ) ) ); } - if ( domain_length_stats_by_domain != null ) { - final String[] ds = dc.split( "=" ); - final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] ); - final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] ); - for( final double element : s0.getData() ) { - gained_multiple_times_domain_length_sum += element; - ++gained_multiple_times_domain_length_count; - } - for( final double element : s1.getData() ) { - gained_multiple_times_domain_length_sum += element; - ++gained_multiple_times_domain_length_count; - } + else { + all_binary_domains_combination_gained.add( BasicBinaryDomainCombination.obtainInstance( matrix + .getCharacter( c ) ) ); } } - 
else { - if ( protein_length_stats_by_dc != null ) { - final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc ); - for( final double element : s.getData() ) { - gained_once_lengths_stats.addValue( element ); - } - } - if ( domain_number_stats_by_dc != null ) { - final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc ); - for( final double element : s.getData() ) { - gained_once_domain_count_stats.addValue( element ); - } - } - if ( domain_length_stats_by_domain != null ) { - final String[] ds = dc.split( "=" ); - final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] ); - final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] ); - for( final double element : s0.getData() ) { - gained_once_domain_length_sum += element; - ++gained_once_domain_length_count; - } - for( final double element : s1.getData() ) { - gained_once_domain_length_sum += element; - ++gained_once_domain_length_count; - } - } - } - } - final Set histogram_keys = histogram.keySet(); - for( final Integer histogram_key : histogram_keys ) { - final int count = histogram.get( histogram_key ); - final StringBuilder dc = domain_lists.get( histogram_key ); - out_counts.write( histogram_key + "\t" + count + ForesterUtil.LINE_SEPARATOR ); - out_dc.write( histogram_key + "\t" + dc + ForesterUtil.LINE_SEPARATOR ); - out_dc_for_go_mapping.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR ); - final Object[] sorted = domain_lists_go.get( histogram_key ).toArray(); - Arrays.sort( sorted ); - for( final Object domain : sorted ) { - out_dc_for_go_mapping.write( domain + ForesterUtil.LINE_SEPARATOR ); - } - out_dc_for_go_mapping_unique.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR ); - for( final String domain : domain_lists_go_unique.get( histogram_key ) ) { - out_dc_for_go_mapping_unique.write( domain + ForesterUtil.LINE_SEPARATOR ); - } - } - out_counts.close(); - out_dc.close(); - out_dc_for_go_mapping.close(); - out_dc_for_go_mapping_unique.close(); - final SortedMap lca_rank_counts = new TreeMap(); - final SortedMap lca_ancestor_species_counts = new TreeMap(); - for( final String dc : more_than_once ) { - final List nodes = new ArrayList(); - for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorExternalForward(); it.hasNext(); ) { - final PhylogenyNode n = it.next(); - if ( n.getNodeData().getBinaryCharacters().getGainedCharacters().contains( dc ) ) { - nodes.add( n ); - } - } - for( int i = 0; i < ( nodes.size() - 1 ); ++i ) { - for( int j = i + 1; j < nodes.size(); ++j ) { - final PhylogenyNode lca = PhylogenyMethods.calculateLCA( nodes.get( i ), nodes.get( j ) ); - String rank = "unknown"; - if ( lca.getNodeData().isHasTaxonomy() - && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getRank() ) ) { - rank = lca.getNodeData().getTaxonomy().getRank(); - } - addToCountMap( lca_rank_counts, rank ); - String lca_species; - if ( lca.getNodeData().isHasTaxonomy() - && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getScientificName() ) ) { - lca_species = lca.getNodeData().getTaxonomy().getScientificName(); - } - else if ( lca.getNodeData().isHasTaxonomy() - && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getCommonName() ) ) { - lca_species = lca.getNodeData().getTaxonomy().getCommonName(); - } - else { - lca_species = lca.getName(); - } - addToCountMap( lca_ancestor_species_counts, lca_species ); - } - } - } - final BufferedWriter out_for_rank_counts = new BufferedWriter( new FileWriter( outfilename_for_rank_counts ) ); - 
final BufferedWriter out_for_ancestor_species_counts = new BufferedWriter( new FileWriter( outfilename_for_ancestor_species_counts ) ); - ForesterUtil.map2writer( out_for_rank_counts, lca_rank_counts, "\t", ForesterUtil.LINE_SEPARATOR ); - ForesterUtil.map2writer( out_for_ancestor_species_counts, - lca_ancestor_species_counts, - "\t", - ForesterUtil.LINE_SEPARATOR ); - out_for_rank_counts.close(); - out_for_ancestor_species_counts.close(); - if ( !ForesterUtil.isEmpty( outfilename_for_protein_stats ) - && ( ( domain_length_stats_by_domain != null ) || ( protein_length_stats_by_dc != null ) || ( domain_number_stats_by_dc != null ) ) ) { - final BufferedWriter w = new BufferedWriter( new FileWriter( outfilename_for_protein_stats ) ); - w.write( "Domain Lengths: " ); - w.write( "\n" ); - if ( domain_length_stats_by_domain != null ) { - for( final Entry entry : dc_reapp_counts_to_domain_lengths_stats - .entrySet() ) { - w.write( entry.getKey().toString() ); - w.write( "\t" + entry.getValue().arithmeticMean() ); - w.write( "\t" + entry.getValue().median() ); - w.write( "\n" ); - } - } - w.flush(); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Protein Lengths: " ); - w.write( "\n" ); - if ( protein_length_stats_by_dc != null ) { - for( final Entry entry : dc_reapp_counts_to_protein_length_stats - .entrySet() ) { - w.write( entry.getKey().toString() ); - w.write( "\t" + entry.getValue().arithmeticMean() ); - w.write( "\t" + entry.getValue().median() ); - w.write( "\n" ); - } - } - w.flush(); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Number of domains: " ); - w.write( "\n" ); - if ( domain_number_stats_by_dc != null ) { - for( final Entry entry : dc_reapp_counts_to_domain_number_stats - .entrySet() ) { - w.write( entry.getKey().toString() ); - w.write( "\t" + entry.getValue().arithmeticMean() ); - w.write( "\t" + entry.getValue().median() ); - w.write( "\n" ); - } - } - w.flush(); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained once, domain lengths:" ); - w.write( "\n" ); - w.write( "N: " + gained_once_domain_length_count ); - w.write( "\n" ); - w.write( "Avg: " + ( ( double ) gained_once_domain_length_sum / gained_once_domain_length_count ) ); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained multiple times, domain lengths:" ); - w.write( "\n" ); - w.write( "N: " + gained_multiple_times_domain_length_count ); - w.write( "\n" ); - w.write( "Avg: " - + ( ( double ) gained_multiple_times_domain_length_sum / gained_multiple_times_domain_length_count ) ); - w.write( "\n" ); - w.write( "\n" ); - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained once, protein lengths:" ); - w.write( "\n" ); - w.write( gained_once_lengths_stats.toString() ); - gained_once_lengths_stats = null; - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained once, domain counts:" ); - w.write( "\n" ); - w.write( gained_once_domain_count_stats.toString() ); - gained_once_domain_count_stats = null; - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained multiple times, protein lengths:" ); - w.write( "\n" ); - w.write( gained_multiple_times_lengths_stats.toString() ); - gained_multiple_times_lengths_stats = null; - w.write( "\n" ); - w.write( "\n" ); - w.write( "Gained multiple times, domain counts:" ); - w.write( "\n" ); - w.write( gained_multiple_times_domain_count_stats.toString() ); - w.flush(); - w.close(); } } - catch ( final IOException e ) { - ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); - } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote 
independent domain combination gains fitch counts to [" - + outfilename_for_counts + "]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch lists to [" - + outfilename_for_dc + "]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, - "Wrote independent domain combination gains fitch lists to (for GO mapping) [" - + outfilename_for_dc_for_go_mapping + "]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, - "Wrote independent domain combination gains fitch lists to (for GO mapping, unique) [" - + outfilename_for_dc_for_go_mapping_unique + "]" ); } - public static void checkForOutputFileWriteability( final File outfile ) { - final String error = ForesterUtil.isWritableFile( outfile ); - if ( !ForesterUtil.isEmpty( error ) ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, error ); + public static Map> createDomainIdToGoIdMap( final List pfam_to_go_mappings ) { + final Map> domain_id_to_go_ids_map = new HashMap>( pfam_to_go_mappings.size() ); + for( final PfamToGoMapping pfam_to_go : pfam_to_go_mappings ) { + if ( !domain_id_to_go_ids_map.containsKey( pfam_to_go.getKey() ) ) { + domain_id_to_go_ids_map.put( pfam_to_go.getKey(), new ArrayList() ); + } + domain_id_to_go_ids_map.get( pfam_to_go.getKey() ).add( pfam_to_go.getValue() ); } + return domain_id_to_go_ids_map; } - public static void checkWriteabilityForPairwiseComparisons( final PrintableDomainSimilarity.PRINT_OPTION domain_similarity_print_option, - final String[][] input_file_properties, - final String automated_pairwise_comparison_suffix, - final File outdir ) { - for( int i = 0; i < input_file_properties.length; ++i ) { - for( int j = 0; j < i; ++j ) { - final String species_i = input_file_properties[ i ][ 1 ]; - final String species_j = input_file_properties[ j ][ 1 ]; - String pairwise_similarities_output_file_str = surfacing.PAIRWISE_DOMAIN_COMPARISONS_PREFIX + species_i - + "_" + species_j + automated_pairwise_comparison_suffix; - switch ( domain_similarity_print_option ) { - case HTML: - if ( !pairwise_similarities_output_file_str.endsWith( ".html" ) ) { - pairwise_similarities_output_file_str += ".html"; - } - break; - } - final String error = ForesterUtil - .isWritableFile( new File( outdir == null ? 
pairwise_similarities_output_file_str : outdir - + ForesterUtil.FILE_SEPARATOR + pairwise_similarities_output_file_str ) ); - if ( !ForesterUtil.isEmpty( error ) ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, error ); - } + public static Map> createDomainIdToSecondaryFeaturesMap( final File secondary_features_map_file ) + throws IOException { + final BasicTable primary_table = BasicTableParser.parse( secondary_features_map_file, '\t' ); + final Map> map = new TreeMap>(); + for( int r = 0; r < primary_table.getNumberOfRows(); ++r ) { + final String domain_id = primary_table.getValue( 0, r ); + if ( !map.containsKey( domain_id ) ) { + map.put( domain_id, new HashSet() ); } + map.get( domain_id ).add( primary_table.getValue( 1, r ) ); } + return map; } - private static SortedSet collectAllDomainsChangedOnSubtree( final PhylogenyNode subtree_root, - final boolean get_gains ) { - final SortedSet domains = new TreeSet(); - for( final PhylogenyNode descendant : PhylogenyMethods.getAllDescendants( subtree_root ) ) { - final BinaryCharacters chars = descendant.getNodeData().getBinaryCharacters(); - if ( get_gains ) { - domains.addAll( chars.getGainedCharacters() ); - } - else { - domains.addAll( chars.getLostCharacters() ); - } - } - return domains; + public static Phylogeny createNjTreeBasedOnMatrixToFile( final File nj_tree_outfile, final DistanceMatrix distance ) { + checkForOutputFileWriteability( nj_tree_outfile ); + final NeighborJoining nj = NeighborJoining.createInstance(); + final Phylogeny phylogeny = nj.execute( ( BasicSymmetricalDistanceMatrix ) distance ); + phylogeny.setName( nj_tree_outfile.getName() ); + writePhylogenyToFile( phylogeny, nj_tree_outfile.toString() ); + return phylogeny; } - public static void collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( final CharacterStateMatrix matrix, - final BinaryDomainCombination.DomainCombinationType dc_type, - final List all_binary_domains_combination_gained, - final boolean get_gains ) { - final SortedSet sorted_ids = new TreeSet(); - for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { - sorted_ids.add( matrix.getIdentifier( i ) ); - } - for( final String id : sorted_ids ) { - for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { - if ( ( get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) - || ( !get_gains && ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.LOSS ) ) ) { - if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED_ADJACTANT ) { - all_binary_domains_combination_gained.add( AdjactantDirectedBinaryDomainCombination - .createInstance( matrix.getCharacter( c ) ) ); - } - else if ( dc_type == BinaryDomainCombination.DomainCombinationType.DIRECTED ) { - all_binary_domains_combination_gained.add( DirectedBinaryDomainCombination - .createInstance( matrix.getCharacter( c ) ) ); - } - else { - all_binary_domains_combination_gained.add( BasicBinaryDomainCombination.createInstance( matrix - .getCharacter( c ) ) ); - } - } - } - } - } - - private static File createBaseDirForPerNodeDomainFiles( final String base_dir, - final boolean domain_combinations, - final CharacterStateMatrix.GainLossStates state, - final String outfile ) { - File per_node_go_mapped_domain_gain_loss_files_base_dir = new File( new File( outfile ).getParent() - + ForesterUtil.FILE_SEPARATOR + base_dir ); - if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { - per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); - } - if ( 
domain_combinations ) { - per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "DC" ); - } - else { - per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "DOMAINS" ); - } - if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { - per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); - } - if ( state == GainLossStates.GAIN ) { - per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "GAINS" ); - } - else if ( state == GainLossStates.LOSS ) { - per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "LOSSES" ); - } - else { - per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + "PRESENT" ); - } - if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { - per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); - } - return per_node_go_mapped_domain_gain_loss_files_base_dir; - } - - public static Map> createDomainIdToGoIdMap( final List pfam_to_go_mappings ) { - final Map> domain_id_to_go_ids_map = new HashMap>( pfam_to_go_mappings.size() ); - for( final PfamToGoMapping pfam_to_go : pfam_to_go_mappings ) { - if ( !domain_id_to_go_ids_map.containsKey( pfam_to_go.getKey() ) ) { - domain_id_to_go_ids_map.put( pfam_to_go.getKey(), new ArrayList() ); - } - domain_id_to_go_ids_map.get( pfam_to_go.getKey() ).add( pfam_to_go.getValue() ); - } - return domain_id_to_go_ids_map; - } - - public static Map> createDomainIdToSecondaryFeaturesMap( final File secondary_features_map_file ) - throws IOException { - final BasicTable primary_table = BasicTableParser.parse( secondary_features_map_file, '\t' ); - final Map> map = new TreeMap>(); - for( int r = 0; r < primary_table.getNumberOfRows(); ++r ) { - final String domain_id = primary_table.getValue( 0, r ); - if ( !map.containsKey( domain_id ) ) { - map.put( domain_id, new HashSet() ); - } - map.get( domain_id ).add( primary_table.getValue( 1, r ) ); - } - return map; - } - - public static Phylogeny createNjTreeBasedOnMatrixToFile( final File nj_tree_outfile, final DistanceMatrix distance ) { - checkForOutputFileWriteability( nj_tree_outfile ); - final NeighborJoining nj = NeighborJoining.createInstance(); - final Phylogeny phylogeny = nj.execute( ( BasicSymmetricalDistanceMatrix ) distance ); - phylogeny.setName( nj_tree_outfile.getName() ); - writePhylogenyToFile( phylogeny, nj_tree_outfile.toString() ); - return phylogeny; - } - - public static StringBuilder createParametersAsString( final boolean ignore_dufs, - final double e_value_max, - final int max_allowed_overlap, - final boolean no_engulfing_overlaps, - final File cutoff_scores_file, - final BinaryDomainCombination.DomainCombinationType dc_type ) { - final StringBuilder parameters_sb = new StringBuilder(); - parameters_sb.append( "E-value: " + e_value_max ); - if ( cutoff_scores_file != null ) { - parameters_sb.append( ", Cutoff-scores-file: " + cutoff_scores_file ); + public static StringBuilder createParametersAsString( final boolean ignore_dufs, + final double ie_value_max, + final double fs_e_value_max, + final int max_allowed_overlap, + final boolean no_engulfing_overlaps, + final File 
cutoff_scores_file, + final BinaryDomainCombination.DomainCombinationType dc_type ) { + final StringBuilder parameters_sb = new StringBuilder(); + parameters_sb.append( "iE-value: " + ie_value_max ); + parameters_sb.append( ", FS E-value: " + fs_e_value_max ); + if ( cutoff_scores_file != null ) { + parameters_sb.append( ", Cutoff-scores-file: " + cutoff_scores_file ); } else { parameters_sb.append( ", Cutoff-scores-file: not-set" ); @@ -740,73 +285,63 @@ public final class SurfacingUtil { return parameters_sb; } - private static SortedSet createSetOfAllBinaryDomainCombinationsPerGenome( final GenomeWideCombinableDomains gwcd ) { - final SortedMap cds = gwcd.getAllCombinableDomainsIds(); - final SortedSet binary_combinations = new TreeSet(); - for( final String domain_id : cds.keySet() ) { - final CombinableDomains cd = cds.get( domain_id ); - binary_combinations.addAll( cd.toBinaryDomainCombinations() ); - } - return binary_combinations; - } - public static void createSplitWriters( final File out_dir, final String my_outfile, final Map split_writers ) throws IOException { split_writers.put( 'a', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_A.html" ) ) ); + + "_domains_A.html" ) ) ); split_writers.put( 'b', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_B.html" ) ) ); + + "_domains_B.html" ) ) ); split_writers.put( 'c', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_C.html" ) ) ); + + "_domains_C.html" ) ) ); split_writers.put( 'd', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_D.html" ) ) ); + + "_domains_D.html" ) ) ); split_writers.put( 'e', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_E.html" ) ) ); + + "_domains_E.html" ) ) ); split_writers.put( 'f', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_F.html" ) ) ); + + "_domains_F.html" ) ) ); split_writers.put( 'g', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_G.html" ) ) ); + + "_domains_G.html" ) ) ); split_writers.put( 'h', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_H.html" ) ) ); + + "_domains_H.html" ) ) ); split_writers.put( 'i', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_I.html" ) ) ); + + "_domains_I.html" ) ) ); split_writers.put( 'j', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_J.html" ) ) ); + + "_domains_J.html" ) ) ); split_writers.put( 'k', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_K.html" ) ) ); + + "_domains_K.html" ) ) ); split_writers.put( 'l', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_L.html" ) ) ); + + "_domains_L.html" ) ) ); split_writers.put( 'm', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_M.html" ) ) ); + + "_domains_M.html" ) ) ); split_writers.put( 'n', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_N.html" ) ) ); + + "_domains_N.html" ) ) ); split_writers.put( 'o', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR 
+ my_outfile - + "_domains_O.html" ) ) ); + + "_domains_O.html" ) ) ); split_writers.put( 'p', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_P.html" ) ) ); + + "_domains_P.html" ) ) ); split_writers.put( 'q', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_Q.html" ) ) ); + + "_domains_Q.html" ) ) ); split_writers.put( 'r', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_R.html" ) ) ); + + "_domains_R.html" ) ) ); split_writers.put( 's', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_S.html" ) ) ); + + "_domains_S.html" ) ) ); split_writers.put( 't', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_T.html" ) ) ); + + "_domains_T.html" ) ) ); split_writers.put( 'u', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_U.html" ) ) ); + + "_domains_U.html" ) ) ); split_writers.put( 'v', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_V.html" ) ) ); + + "_domains_V.html" ) ) ); split_writers.put( 'w', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_W.html" ) ) ); + + "_domains_W.html" ) ) ); split_writers.put( 'x', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_X.html" ) ) ); + + "_domains_X.html" ) ) ); split_writers.put( 'y', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_Y.html" ) ) ); + + "_domains_Y.html" ) ) ); split_writers.put( 'z', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_Z.html" ) ) ); + + "_domains_Z.html" ) ) ); split_writers.put( '0', new BufferedWriter( new FileWriter( out_dir + ForesterUtil.FILE_SEPARATOR + my_outfile - + "_domains_0.html" ) ) ); + + "_domains_0.html" ) ) ); } public static Map createTaxCodeToIdMap( final Phylogeny phy ) { @@ -844,8 +379,8 @@ public final class SurfacingUtil { public static void decoratePrintableDomainSimilarities( final SortedSet domain_similarities, final Detailedness detailedness ) { for( final DomainSimilarity domain_similarity : domain_similarities ) { - if ( domain_similarity instanceof PrintableDomainSimilarity ) { - final PrintableDomainSimilarity printable_domain_similarity = ( PrintableDomainSimilarity ) domain_similarity; + if ( domain_similarity instanceof DomainSimilarity ) { + final DomainSimilarity printable_domain_similarity = domain_similarity; printable_domain_similarity.setDetailedness( detailedness ); } } @@ -1047,14 +582,14 @@ public final class SurfacingUtil { /** * Warning: This side-effects 'all_bin_domain_combinations_encountered'! 
- * - * + * + * * @param output_file * @param all_bin_domain_combinations_changed * @param sum_of_all_domains_encountered * @param all_bin_domain_combinations_encountered * @param is_gains_analysis - * @param protein_length_stats_by_dc + * @param protein_length_stats_by_dc * @throws IOException */ public static void executeFitchGainsAnalysis( final File output_file, @@ -1099,9 +634,9 @@ public final class SurfacingUtil { } if ( is_gains_analysis ) { out.write( "Sum of all distinct domain combinations appearing once : " + one - + ForesterUtil.LINE_SEPARATOR ); + + ForesterUtil.LINE_SEPARATOR ); out.write( "Sum of all distinct domain combinations appearing more than once : " + above_one - + ForesterUtil.LINE_SEPARATOR ); + + ForesterUtil.LINE_SEPARATOR ); out.write( "Sum of all distinct domains in combinations apppearing only once : " + all_domains_in_combination_changed_only_once.size() + ForesterUtil.LINE_SEPARATOR ); out.write( "Sum of all distinct domains in combinations apppearing more than once: " @@ -1109,31 +644,32 @@ public final class SurfacingUtil { } else { out.write( "Sum of all distinct domain combinations never lost : " + never_lost - + ForesterUtil.LINE_SEPARATOR ); + + ForesterUtil.LINE_SEPARATOR ); out.write( "Sum of all distinct domain combinations lost once : " + one - + ForesterUtil.LINE_SEPARATOR ); + + ForesterUtil.LINE_SEPARATOR ); out.write( "Sum of all distinct domain combinations lost more than once : " + above_one - + ForesterUtil.LINE_SEPARATOR ); + + ForesterUtil.LINE_SEPARATOR ); out.write( "Sum of all distinct domains in combinations lost only once : " + all_domains_in_combination_changed_only_once.size() + ForesterUtil.LINE_SEPARATOR ); out.write( "Sum of all distinct domains in combinations lost more than once: " + all_domains_in_combination_changed_more_than_once.size() + ForesterUtil.LINE_SEPARATOR ); } out.write( "All binary combinations : " + all - + ForesterUtil.LINE_SEPARATOR ); + + ForesterUtil.LINE_SEPARATOR ); out.write( "All domains : " + sum_of_all_domains_encountered ); out.close(); ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote fitch domain combination dynamics counts analysis to \"" + output_file - + "\"" ); + + "\"" ); } /** - * - * @param all_binary_domains_combination_lost_fitch - * @param use_last_in_fitch_parsimony - * @param consider_directedness_and_adjacency_for_bin_combinations + * + * @param all_binary_domains_combination_lost_fitch + * @param use_last_in_fitch_parsimony + * @param perform_dc_fich + * @param consider_directedness_and_adjacency_for_bin_combinations * @param all_binary_domains_combination_gained if null ignored, otherwise this is to list all binary domain combinations * which were gained under unweighted (Fitch) parsimony. 
*/ @@ -1157,7 +693,8 @@ public final class SurfacingUtil { final Map domain_length_stats_by_domain, final Map tax_code_to_id_map, final boolean write_to_nexus, - final boolean use_last_in_fitch_parsimony ) { + final boolean use_last_in_fitch_parsimony, + final boolean perform_dc_fich ) { final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR; final String date_time = ForesterUtil.getCurrentDateTime(); final SortedSet all_pfams_encountered = new TreeSet(); @@ -1178,9 +715,9 @@ public final class SurfacingUtil { domain_parsimony.executeDolloParsimonyOnDomainPresence(); } SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name - + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_DOMAINS, Format.FORESTER ); + + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_DOMAINS, Format.FORESTER ); SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossCountsMatrix(), outfile_name - + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_DOMAINS, Format.FORESTER ); + + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_DOMAINS, Format.FORESTER ); SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), CharacterStateMatrix.GainLossStates.GAIN, outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_D, @@ -1194,7 +731,7 @@ public final class SurfacingUtil { ForesterUtil.LINE_SEPARATOR, null ); SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), null, outfile_name - + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_D, sep, ForesterUtil.LINE_SEPARATOR, null ); + + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_D, sep, ForesterUtil.LINE_SEPARATOR, null ); //HTML: writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, go_id_to_term_map, @@ -1251,7 +788,7 @@ public final class SurfacingUtil { "dollo_on_domains_" + outfile_name, parameters_str ); SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name - + surfacing.DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO ); + + surfacing.DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO ); try { writeAllDomainsChangedOnAllSubtrees( local_phylogeny_l, true, outfile_name, "_dollo_all_gains_d" ); writeAllDomainsChangedOnAllSubtrees( local_phylogeny_l, false, outfile_name, "_dollo_all_losses_d" ); @@ -1260,7 +797,7 @@ public final class SurfacingUtil { e.printStackTrace(); ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); } - if ( domain_parsimony.calculateNumberOfBinaryDomainCombination() > 0 ) { + if ( perform_dc_fich && ( domain_parsimony.calculateNumberOfBinaryDomainCombination() > 0 ) ) { // FITCH DOMAIN COMBINATIONS // ------------------------- local_phylogeny_l = phylogeny.copy(); @@ -1273,25 +810,25 @@ public final class SurfacingUtil { domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( use_last_in_fitch_parsimony ); } SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name - + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER ); + + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER ); SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossCountsMatrix(), outfile_name - + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER ); + + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER ); SurfacingUtil - .writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), - CharacterStateMatrix.GainLossStates.GAIN, - outfile_name + 
surfacing.PARSIMONY_OUTPUT_FITCH_GAINS_BC, - sep, - ForesterUtil.LINE_SEPARATOR, - null ); + .writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), + CharacterStateMatrix.GainLossStates.GAIN, + outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_GAINS_BC, + sep, + ForesterUtil.LINE_SEPARATOR, + null ); SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), CharacterStateMatrix.GainLossStates.LOSS, outfile_name - + surfacing.PARSIMONY_OUTPUT_FITCH_LOSSES_BC, + + surfacing.PARSIMONY_OUTPUT_FITCH_LOSSES_BC, sep, ForesterUtil.LINE_SEPARATOR, null ); SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), null, outfile_name - + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_BC, sep, ForesterUtil.LINE_SEPARATOR, null ); + + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_BC, sep, ForesterUtil.LINE_SEPARATOR, null ); if ( all_binary_domains_combination_gained_fitch != null ) { collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), dc_type, @@ -1306,14 +843,14 @@ public final class SurfacingUtil { } if ( output_binary_domain_combinations_for_graphs ) { SurfacingUtil - .writeBinaryStatesMatrixAsListToFileForBinaryCombinationsForGraphAnalysis( domain_parsimony - .getGainLossMatrix(), - null, - outfile_name - + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS, - sep, - ForesterUtil.LINE_SEPARATOR, - BinaryDomainCombination.OutputFormat.DOT ); + .writeBinaryStatesMatrixAsListToFileForBinaryCombinationsForGraphAnalysis( domain_parsimony + .getGainLossMatrix(), + null, + outfile_name + + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS, + sep, + ForesterUtil.LINE_SEPARATOR, + BinaryDomainCombination.OutputFormat.DOT ); } // HTML: writeBinaryStatesMatrixToList( domain_id_to_go_ids_map, @@ -1378,19 +915,19 @@ public final class SurfacingUtil { date_time, "Fitch parsimony on binary domain combination presence/absence randomization: " + randomization, - "fitch_on_binary_domain_combinations_" + outfile_name, - parameters_str ); + "fitch_on_binary_domain_combinations_" + outfile_name, + parameters_str ); SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name - + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH ); + + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH ); calculateIndependentDomainCombinationGains( local_phylogeny_l, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX, + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX, + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX, + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX, + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX, outfile_name + "_indep_dc_gains_fitch_lca_ranks.txt", outfile_name + "_indep_dc_gains_fitch_lca_taxonomies.txt", outfile_name + "_indep_dc_gains_fitch_protein_statistics.txt", @@ -1415,33 +952,33 @@ public final class SurfacingUtil { Phylogeny local_phylogeny_copy = phylogeny.copy(); secondary_features_parsimony.executeDolloParsimonyOnSecondaryFeatures( mapping_results_map ); SurfacingUtil.writeMatrixToFile( 
secondary_features_parsimony.getGainLossMatrix(), outfile_name - + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER ); + + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER ); SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossCountsMatrix(), outfile_name - + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER ); + + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER ); SurfacingUtil - .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(), - CharacterStateMatrix.GainLossStates.GAIN, - outfile_name - + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_SECONDARY_FEATURES, - sep, - ForesterUtil.LINE_SEPARATOR, - null ); + .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(), + CharacterStateMatrix.GainLossStates.GAIN, + outfile_name + + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_SECONDARY_FEATURES, + sep, + ForesterUtil.LINE_SEPARATOR, + null ); SurfacingUtil - .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(), - CharacterStateMatrix.GainLossStates.LOSS, - outfile_name - + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_SECONDARY_FEATURES, - sep, - ForesterUtil.LINE_SEPARATOR, - null ); + .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(), + CharacterStateMatrix.GainLossStates.LOSS, + outfile_name + + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_SECONDARY_FEATURES, + sep, + ForesterUtil.LINE_SEPARATOR, + null ); SurfacingUtil - .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(), - null, - outfile_name - + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES, - sep, - ForesterUtil.LINE_SEPARATOR, - null ); + .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(), + null, + outfile_name + + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES, + sep, + ForesterUtil.LINE_SEPARATOR, + null ); preparePhylogeny( local_phylogeny_copy, secondary_features_parsimony, date_time, @@ -1449,29 +986,29 @@ public final class SurfacingUtil { "dollo_on_secondary_features_" + outfile_name, parameters_str ); SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name - + surfacing.SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO ); + + surfacing.SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO ); // FITCH DOMAIN COMBINATIONS // ------------------------- local_phylogeny_copy = phylogeny.copy(); final String randomization = "no"; secondary_features_parsimony - .executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( use_last_in_fitch_parsimony ); + .executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( use_last_in_fitch_parsimony ); preparePhylogeny( local_phylogeny_copy, secondary_features_parsimony, date_time, "Fitch parsimony on secondary binary domain combination presence/absence randomization: " + randomization, - "fitch_on_binary_domain_combinations_" + outfile_name, - parameters_str ); + "fitch_on_binary_domain_combinations_" + outfile_name, + parameters_str ); SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name - + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH_MAPPED ); + + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH_MAPPED ); calculateIndependentDomainCombinationGains( local_phylogeny_copy, outfile_name - + 
surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_MAPPED_OUTPUT_SUFFIX, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX, outfile_name - + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX, outfile_name - + "_MAPPED_indep_dc_gains_fitch_lca_ranks.txt", outfile_name - + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt", null, null, null, null ); + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_MAPPED_OUTPUT_SUFFIX, outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX, outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX, outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX, outfile_name + + "_MAPPED_indep_dc_gains_fitch_lca_ranks.txt", outfile_name + + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt", null, null, null, null ); } public static void executePlusMinusAnalysis( final File output_file, @@ -1492,7 +1029,7 @@ public final class SurfacingUtil { final File html_out_dc = new File( output_file + surfacing.PLUS_MINUS_DC_SUFFIX_HTML ); final File all_domains_go_ids_out_dom = new File( output_file + surfacing.PLUS_MINUS_ALL_GO_IDS_DOM_SUFFIX ); final File passing_domains_go_ids_out_dom = new File( output_file - + surfacing.PLUS_MINUS_PASSING_GO_IDS_DOM_SUFFIX ); + + surfacing.PLUS_MINUS_PASSING_GO_IDS_DOM_SUFFIX ); final File proteins_file_base = new File( output_file + "" ); final int min_diff = ( ( Integer ) plus_minus_analysis_numbers.get( 0 ) ).intValue(); final double factor = ( ( Double ) plus_minus_analysis_numbers.get( 1 ) ).doubleValue(); @@ -1521,7 +1058,7 @@ public final class SurfacingUtil { ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis results to \"" + plain_out_dom + "\"" ); ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis results to \"" + html_out_dc - + "\"" ); + + "\"" ); ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis based passing GO ids to \"" + passing_domains_go_ids_out_dom + "\"" ); ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote plus minus domain analysis based all GO ids to \"" @@ -1688,9 +1225,9 @@ public final class SurfacingUtil { final PhylogenyNode n = it.next(); if ( ForesterUtil.isEmpty( n.getName() ) && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy() - .getScientificName() ) ) - && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy() - .getCommonName() ) ) ) { + .getScientificName() ) ) + && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy() + .getCommonName() ) ) ) { if ( n.getParent() != null ) { names.append( " " ); names.append( n.getParent().getName() ); @@ -1732,17 +1269,17 @@ public final class SurfacingUtil { .create( intree_file, ParserUtils.createParserDependingOnFileType( intree_file, true ) ); if ( p_array.length < 1 ) { ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file - + "] does not contain any phylogeny in phyloXML format" ); + + "] does not contain any phylogeny in phyloXML format" ); } else if ( p_array.length > 1 ) { ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file - + "] contains more than one phylogeny in phyloXML format" ); + + "] contains more than one phylogeny in phyloXML 
format" ); } intree = p_array[ 0 ]; } catch ( final Exception e ) { ForesterUtil.fatalError( surfacing.PRG_NAME, "failed to read input tree from file [" + intree_file - + "]: " + error ); + + "]: " + error ); } if ( ( intree == null ) || intree.isEmpty() ) { ForesterUtil.fatalError( surfacing.PRG_NAME, "input tree [" + intree_file + "] is empty" ); @@ -1753,9 +1290,9 @@ public final class SurfacingUtil { if ( intree.getNumberOfExternalNodes() < number_of_genomes ) { ForesterUtil.fatalError( surfacing.PRG_NAME, "number of external nodes [" + intree.getNumberOfExternalNodes() - + "] of input tree [" + intree_file - + "] is smaller than the number of genomes the be analyzed [" - + number_of_genomes + "]" ); + + "] of input tree [" + intree_file + + "] is smaller than the number of genomes the be analyzed [" + + number_of_genomes + "]" ); } final StringBuilder parent_names = new StringBuilder(); final int nodes_lacking_name = getNumberOfNodesLackingName( intree, parent_names ); @@ -1766,7 +1303,7 @@ public final class SurfacingUtil { preparePhylogenyForParsimonyAnalyses( intree, input_file_properties ); if ( !intree.isCompletelyBinary() ) { ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "input tree [" + intree_file - + "] is not completely binary" ); + + "] is not completely binary" ); } intrees[ i++ ] = intree; } @@ -1784,11 +1321,11 @@ public final class SurfacingUtil { .create( intree_file, ParserUtils.createParserDependingOnFileType( intree_file, true ) ); if ( phys.length < 1 ) { ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file - + "] does not contain any phylogeny in phyloXML format" ); + + "] does not contain any phylogeny in phyloXML format" ); } else if ( phys.length > 1 ) { ForesterUtil.fatalError( surfacing.PRG_NAME, "file [" + intree_file - + "] contains more than one phylogeny in phyloXML format" ); + + "] contains more than one phylogeny in phyloXML format" ); } intree = phys[ 0 ]; } @@ -1805,6 +1342,90 @@ public final class SurfacingUtil { return intree; } + public static String obtainHexColorStringDependingOnTaxonomyGroup( final String tax_code, final Phylogeny phy ) + throws IllegalArgumentException { + if ( !_TAXCODE_HEXCOLORSTRING_MAP.containsKey( tax_code ) ) { + if ( ( phy != null ) && !phy.isEmpty() ) { + // final List nodes = phy.getNodesViaTaxonomyCode( tax_code ); + // Color c = null; + // if ( ( nodes == null ) || nodes.isEmpty() ) { + // throw new IllegalArgumentException( "code " + tax_code + " is not found" ); + // } + // if ( nodes.size() != 1 ) { + // throw new IllegalArgumentException( "code " + tax_code + " is not unique" ); + // } + // PhylogenyNode n = nodes.get( 0 ); + // while ( n != null ) { + // if ( n.getNodeData().isHasTaxonomy() + // && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getScientificName() ) ) { + // c = ForesterUtil.obtainColorDependingOnTaxonomyGroup( n.getNodeData().getTaxonomy() + // .getScientificName(), tax_code ); + // } + // if ( ( c == null ) && !ForesterUtil.isEmpty( n.getName() ) ) { + // c = ForesterUtil.obtainColorDependingOnTaxonomyGroup( n.getName(), tax_code ); + // } + // if ( c != null ) { + // break; + // } + // n = n.getParent(); + // } + final String group = obtainTaxonomyGroup( tax_code, phy ); + final Color c = ForesterUtil.obtainColorDependingOnTaxonomyGroup( group ); + if ( c == null ) { + throw new IllegalArgumentException( "no color found for taxonomy group \"" + group + + "\" for code \"" + tax_code + "\"" ); + } + final String hex = String.format( "#%02x%02x%02x", c.getRed(), 
c.getGreen(), c.getBlue() ); + _TAXCODE_HEXCOLORSTRING_MAP.put( tax_code, hex ); + } + else { + throw new IllegalArgumentException( "unable to obtain color for code " + tax_code + + " (tree is null or empty and code is not in map)" ); + } + } + return _TAXCODE_HEXCOLORSTRING_MAP.get( tax_code ); + } + + public static String obtainTaxonomyGroup( final String tax_code, final Phylogeny species_tree ) + throws IllegalArgumentException { + if ( !_TAXCODE_TAXGROUP_MAP.containsKey( tax_code ) ) { + if ( ( species_tree != null ) && !species_tree.isEmpty() ) { + final List nodes = species_tree.getNodesViaTaxonomyCode( tax_code ); + if ( ( nodes == null ) || nodes.isEmpty() ) { + throw new IllegalArgumentException( "code " + tax_code + " is not found" ); + } + if ( nodes.size() != 1 ) { + throw new IllegalArgumentException( "code " + tax_code + " is not unique" ); + } + PhylogenyNode n = nodes.get( 0 ); + String group = null; + while ( n != null ) { + if ( n.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( n.getNodeData().getTaxonomy().getScientificName() ) ) { + group = ForesterUtil.obtainNormalizedTaxonomyGroup( n.getNodeData().getTaxonomy() + .getScientificName() ); + } + if ( ForesterUtil.isEmpty( group ) && !ForesterUtil.isEmpty( n.getName() ) ) { + group = ForesterUtil.obtainNormalizedTaxonomyGroup( n.getName() ); + } + if ( !ForesterUtil.isEmpty( group ) ) { + break; + } + n = n.getParent(); + } + if ( ForesterUtil.isEmpty( group ) ) { + throw new IllegalArgumentException( "no group found for taxonomy code \"" + tax_code + "\"" ); + } + _TAXCODE_TAXGROUP_MAP.put( tax_code, group ); + } + else { + throw new IllegalArgumentException( "unable to obtain group for code " + tax_code + + " (tree is null or empty and code is not in map)" ); + } + } + return _TAXCODE_TAXGROUP_MAP.get( tax_code ); + } + public static void performDomainArchitectureAnalysis( final SortedMap> domain_architecutures, final SortedMap domain_architecuture_counts, final int min_count, @@ -1902,8 +1523,8 @@ public final class SurfacingUtil { } else { ForesterUtil - .fatalError( surfacing.PRG_NAME, - "node with no name, scientific name, common name, or taxonomy code present" ); + .fatalError( surfacing.PRG_NAME, + "node with no name, scientific name, common name, or taxonomy code present" ); } } } @@ -1938,42 +1559,6 @@ public final class SurfacingUtil { log( "Percentage of multidomain proteins: : " + percentage + "%", log_writer ); } - private static void printSomeStats( final DescriptiveStatistics stats, final AsciiHistogram histo, final Writer w ) - throws IOException { - w.write( "
" ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "
" );
-        w.write( SurfacingConstants.NL );
-        if ( histo != null ) {
-            w.write( histo.toStringBuffer( 20, '|', 40, 5 ).toString() );
-            w.write( SurfacingConstants.NL );
-        }
-        w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - if ( stats.getN() > 1 ) { - w.write( "" ); - } - else { - w.write( "" ); - } - w.write( SurfacingConstants.NL ); - w.write( "
N: " + stats.getN() + "
Min: " + stats.getMin() + "
Max: " + stats.getMax() + "
Mean: " + stats.arithmeticMean() + "
SD: " + stats.sampleStandardDeviation() + "
SD: n/a
" ); - w.write( SurfacingConstants.NL ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - } - public static void processFilter( final File filter_file, final SortedSet filter ) { SortedSet filter_str = null; try { @@ -2115,9 +1700,9 @@ public final class SurfacingUtil { } else { ForesterUtil - .fatalError( surfacing.PRG_NAME, - "species/genome names in 'plus minus' file must begin with '*' (high copy target genome), '+' (high copy base genomes), '-' (low copy genomes), '$D=' minimal Difference (default is 1), '$F=' factor (default is 1.0), double), or '#' (ignore) suffix, encountered: \"" - + species + "\"" ); + .fatalError( surfacing.PRG_NAME, + "species/genome names in 'plus minus' file must begin with '*' (high copy target genome), '+' (high copy base genomes), '-' (low copy genomes), '$D=' minimal Difference (default is 1), '$F=' factor (default is 1.0), double), or '#' (ignore) suffix, encountered: \"" + + species + "\"" ); } numbers.add( new Integer( min_diff + "" ) ); numbers.add( new Double( factor + "" ) ); @@ -2130,8 +1715,8 @@ public final class SurfacingUtil { /* * species | protein id | n-terminal domain | c-terminal domain | n-terminal domain per domain E-value | c-terminal domain per domain E-value - * - * + * + * */ static public StringBuffer proteinToDomainCombinations( final Protein protein, final String protein_id, @@ -2226,19 +1811,6 @@ public final class SurfacingUtil { return domains; } - private static List splitDomainCombination( final String dc ) { - final String[] s = dc.split( "=" ); - if ( s.length != 2 ) { - ForesterUtil.printErrorMessage( surfacing.PRG_NAME, "Stringyfied domain combination has illegal format: " - + dc ); - System.exit( -1 ); - } - final List l = new ArrayList( 2 ); - l.add( s[ 0 ] ); - l.add( s[ 1 ] ); - return l; - } - public static int storeDomainArchitectures( final String genome, final SortedMap> domain_architecutures, final List protein_list, @@ -2279,7 +1851,7 @@ public final class SurfacingUtil { final SortedSet domains = collectAllDomainsChangedOnSubtree( node, get_gains ); if ( domains.size() > 0 ) { final Writer writer = ForesterUtil.createBufferedWriter( base_dir + ForesterUtil.FILE_SEPARATOR - + node.getName() + suffix_for_filename ); + + node.getName() + suffix_for_filename ); for( final String domain : domains ) { writer.write( domain ); writer.write( ForesterUtil.LINE_SEPARATOR ); @@ -2290,171 +1862,40 @@ public final class SurfacingUtil { } } - private static void writeAllEncounteredPfamsToFile( final Map> domain_id_to_go_ids_map, - final Map go_id_to_term_map, - final String outfile_name, - final SortedSet all_pfams_encountered ) { - final File all_pfams_encountered_file = new File( outfile_name + surfacing.ALL_PFAMS_ENCOUNTERED_SUFFIX ); - final File all_pfams_encountered_with_go_annotation_file = new File( outfile_name - + surfacing.ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX ); - final File encountered_pfams_summary_file = new File( outfile_name + surfacing.ENCOUNTERED_PFAMS_SUMMARY_SUFFIX ); - int biological_process_counter = 0; - int cellular_component_counter = 0; - int molecular_function_counter = 0; - int pfams_with_mappings_counter = 0; - int pfams_without_mappings_counter = 0; - int pfams_without_mappings_to_bp_or_mf_counter = 0; - int pfams_with_mappings_to_bp_or_mf_counter = 0; + public static void writeBinaryDomainCombinationsFileForGraphAnalysis( final String[][] input_file_properties, + final File output_dir, + final GenomeWideCombinableDomains gwcd, + final int i, + final GenomeWideCombinableDomainsSortOrder dc_sort_order ) { + File dc_outfile_dot = new 
File( input_file_properties[ i ][ 1 ] + + surfacing.DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS ); + if ( output_dir != null ) { + dc_outfile_dot = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile_dot ); + } + checkForOutputFileWriteability( dc_outfile_dot ); + final SortedSet binary_combinations = createSetOfAllBinaryDomainCombinationsPerGenome( gwcd ); try { - final Writer all_pfams_encountered_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_file ) ); - final Writer all_pfams_encountered_with_go_annotation_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_with_go_annotation_file ) ); - final Writer summary_writer = new BufferedWriter( new FileWriter( encountered_pfams_summary_file ) ); - summary_writer.write( "# Pfam to GO mapping summary" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Actual summary is at the end of this file." ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Encountered Pfams without a GO mapping:" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - for( final String pfam : all_pfams_encountered ) { - all_pfams_encountered_writer.write( pfam ); - all_pfams_encountered_writer.write( ForesterUtil.LINE_SEPARATOR ); - final String domain_id = new String( pfam ); - if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) { - ++pfams_with_mappings_counter; - all_pfams_encountered_with_go_annotation_writer.write( pfam ); - all_pfams_encountered_with_go_annotation_writer.write( ForesterUtil.LINE_SEPARATOR ); - final List go_ids = domain_id_to_go_ids_map.get( domain_id ); - boolean maps_to_bp = false; - boolean maps_to_cc = false; - boolean maps_to_mf = false; - for( final GoId go_id : go_ids ) { - final GoTerm go_term = go_id_to_term_map.get( go_id ); - if ( go_term.getGoNameSpace().isBiologicalProcess() ) { - maps_to_bp = true; - } - else if ( go_term.getGoNameSpace().isCellularComponent() ) { - maps_to_cc = true; - } - else if ( go_term.getGoNameSpace().isMolecularFunction() ) { - maps_to_mf = true; - } - } - if ( maps_to_bp ) { - ++biological_process_counter; - } - if ( maps_to_cc ) { - ++cellular_component_counter; - } - if ( maps_to_mf ) { - ++molecular_function_counter; - } - if ( maps_to_bp || maps_to_mf ) { - ++pfams_with_mappings_to_bp_or_mf_counter; - } - else { - ++pfams_without_mappings_to_bp_or_mf_counter; - } - } - else { - ++pfams_without_mappings_to_bp_or_mf_counter; - ++pfams_without_mappings_counter; - summary_writer.write( pfam ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - } - } - all_pfams_encountered_writer.close(); - all_pfams_encountered_with_go_annotation_writer.close(); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + all_pfams_encountered.size() - + "] encountered Pfams to: \"" + all_pfams_encountered_file + "\"" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + pfams_with_mappings_counter - + "] encountered Pfams with GO mappings to: \"" + all_pfams_encountered_with_go_annotation_file - + "\"" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote summary (including all [" - + pfams_without_mappings_counter + "] encountered Pfams without GO mappings) to: \"" - + encountered_pfams_summary_file + "\"" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Sum of Pfams encountered : " - + all_pfams_encountered.size() ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without a mapping : " - + pfams_without_mappings_counter + " [" - + ( ( 100 * 
pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without mapping to proc. or func. : " - + pfams_without_mappings_to_bp_or_mf_counter + " [" - + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping : " - + pfams_with_mappings_counter + " [" - + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping to proc. or func. : " - + pfams_with_mappings_to_bp_or_mf_counter + " [" - + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to biological process: " - + biological_process_counter + " [" - + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to molecular function: " - + molecular_function_counter + " [" - + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to cellular component: " - + cellular_component_counter + " [" - + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Sum of Pfams encountered : " + all_pfams_encountered.size() ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Pfams without a mapping : " + pfams_without_mappings_counter - + " [" + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Pfams without mapping to proc. or func. : " - + pfams_without_mappings_to_bp_or_mf_counter + " [" - + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Pfams with a mapping : " + pfams_with_mappings_counter + " [" - + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Pfams with a mapping to proc. or func. 
: " - + pfams_with_mappings_to_bp_or_mf_counter + " [" - + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Pfams with mapping to biological process: " + biological_process_counter + " [" - + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Pfams with mapping to molecular function: " + molecular_function_counter + " [" - + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.write( "# Pfams with mapping to cellular component: " + cellular_component_counter + " [" - + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" ); - summary_writer.write( ForesterUtil.LINE_SEPARATOR ); - summary_writer.close(); - } - catch ( final IOException e ) { - ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); - } - } - - public static void writeBinaryDomainCombinationsFileForGraphAnalysis( final String[][] input_file_properties, - final File output_dir, - final GenomeWideCombinableDomains gwcd, - final int i, - final GenomeWideCombinableDomainsSortOrder dc_sort_order ) { - File dc_outfile_dot = new File( input_file_properties[ i ][ 1 ] - + surfacing.DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS ); - if ( output_dir != null ) { - dc_outfile_dot = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile_dot ); - } - checkForOutputFileWriteability( dc_outfile_dot ); - final SortedSet binary_combinations = createSetOfAllBinaryDomainCombinationsPerGenome( gwcd ); - try { - final BufferedWriter out_dot = new BufferedWriter( new FileWriter( dc_outfile_dot ) ); - for( final BinaryDomainCombination bdc : binary_combinations ) { - out_dot.write( bdc.toGraphDescribingLanguage( BinaryDomainCombination.OutputFormat.DOT, null, null ) - .toString() ); - out_dot.write( SurfacingConstants.NL ); + final BufferedWriter out_dot = new BufferedWriter( new FileWriter( dc_outfile_dot ) ); + for( final BinaryDomainCombination bdc : binary_combinations ) { + out_dot.write( bdc.toGraphDescribingLanguage( BinaryDomainCombination.OutputFormat.DOT, null, null ) + .toString() ); + out_dot.write( SurfacingConstants.NL ); } out_dot.close(); } catch ( final IOException e ) { ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote binary domain combination for \"" - + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", " - + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile_dot + "\"" ); + if ( input_file_properties[ i ].length == 3 ) { + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote binary domain combination for \"" + + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", " + + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile_dot + "\"" ); + } + else { + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote binary domain combination for \"" + + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ") to: \"" + + dc_outfile_dot + "\"" ); + } } public static void writeBinaryStatesMatrixAsListToFile( final CharacterStateMatrix matrix, @@ -2526,7 +1967,7 @@ public final class SurfacingUtil { .getState( id, c ) == 
CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) ) ) ) { BinaryDomainCombination bdc = null; try { - bdc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( c ) ); + bdc = BasicBinaryDomainCombination.obtainInstance( matrix.getCharacter( c ) ); } catch ( final Exception e ) { ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); @@ -2587,7 +2028,7 @@ public final class SurfacingUtil { int per_node_counter = 0; out.write( "" ); out.write( SurfacingConstants.NL ); - addHtmlHead( out, title_for_html ); + writeHtmlHead( out, title_for_html ); out.write( SurfacingConstants.NL ); out.write( "" ); out.write( SurfacingConstants.NL ); @@ -2643,7 +2084,7 @@ public final class SurfacingUtil { per_node_counter = 0; if ( matrix.getNumberOfCharacters() > 0 ) { per_node_go_mapped_domain_gain_loss_outfile = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + id + suffix_for_per_node_events_file ); + + ForesterUtil.FILE_SEPARATOR + id + suffix_for_per_node_events_file ); SurfacingUtil.checkForOutputFileWriteability( per_node_go_mapped_domain_gain_loss_outfile ); per_node_go_mapped_domain_gain_loss_outfile_writer = ForesterUtil .createBufferedWriter( per_node_go_mapped_domain_gain_loss_outfile ); @@ -2712,7 +2153,7 @@ public final class SurfacingUtil { out.write( SurfacingConstants.NL ); out.write( "
" ); out.write( SurfacingConstants.NL ); - } // for( final String id : sorted_ids ) { + } // for( final String id : sorted_ids ) { out.write( "" ); out.write( SurfacingConstants.NL ); out.write( "" ); @@ -2755,7 +2196,7 @@ public final class SurfacingUtil { } else { per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats - .sampleStandardDeviation() ) + "\t" ); + .sampleStandardDeviation() ) + "\t" ); } per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats.median() ) + "\t" ); per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMin() + "\t" ); @@ -2768,20 +2209,970 @@ public final class SurfacingUtil { per_genome_domain_promiscuity_statistics_writer.write( ForesterUtil.LINE_SEPARATOR ); } catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); - } - if ( input_file_properties[ i ].length == 3 ) { - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \"" - + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", " - + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile + "\"" ); - } - else { - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \"" - + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ") to: \"" - + dc_outfile + "\"" ); + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + } + if ( input_file_properties[ i ].length == 3 ) { + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \"" + + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", " + + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile + "\"" ); + } + else { + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \"" + + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ") to: \"" + + dc_outfile + "\"" ); + } + } + + public static void writeDomainSimilaritiesToFile( final StringBuilder html_desc, + final StringBuilder html_title, + final Writer simple_tab_writer, + final Writer single_writer, + Map split_writers, + final SortedSet similarities, + final boolean treat_as_binary, + final List species_order, + final DomainSimilarity.PRINT_OPTION print_option, + final DomainSimilarity.DomainSimilarityScoring scoring, + final boolean verbose, + final Map tax_code_to_id_map, + final Phylogeny phy, + final Set pos_filter_doms ) throws IOException { + if ( ( single_writer != null ) && ( ( split_writers == null ) || split_writers.isEmpty() ) ) { + split_writers = new HashMap(); + split_writers.put( '_', single_writer ); + } + switch ( print_option ) { + case SIMPLE_TAB_DELIMITED: + break; + case HTML: + for( final Character key : split_writers.keySet() ) { + final Writer w = split_writers.get( key ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + if ( key != '_' ) { + writeHtmlHead( w, "DC analysis (" + html_title + ") " + key.toString().toUpperCase() ); + } + else { + writeHtmlHead( w, "DC analysis (" + html_title + ")" ); + } + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( html_desc.toString() ); + w.write( SurfacingConstants.NL ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + } + break; + } + // + for( final DomainSimilarity similarity : similarities ) { + if ( ( species_order != null ) && !species_order.isEmpty() ) { + ( similarity ).setSpeciesOrder( species_order ); + } + if ( single_writer != null ) { + if ( !ForesterUtil.isEmpty( pos_filter_doms ) && pos_filter_doms.contains( similarity.getDomainId() ) ) { + single_writer.write( "" ); + } + else { + single_writer.write( "" ); + } + single_writer.write( SurfacingConstants.NL ); + } + else { + Writer local_writer = split_writers.get( ( similarity.getDomainId().charAt( 0 ) + "" ).toLowerCase() + .charAt( 0 ) ); + if ( local_writer == null ) { + local_writer = split_writers.get( '0' ); + } + if ( !ForesterUtil.isEmpty( pos_filter_doms ) && pos_filter_doms.contains( similarity.getDomainId() ) ) { + local_writer.write( "" ); + } + else { + local_writer.write( "" ); + } + local_writer.write( SurfacingConstants.NL ); + } + } + for( final Writer w : split_writers.values() ) { + w.write( "
Domains:
" + similarity.getDomainId() + + "
" + + similarity.getDomainId() + "
" + similarity.getDomainId() + + "
" + + similarity.getDomainId() + "
" ); + w.write( SurfacingConstants.NL ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + // + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + writeColorLabels( "Deuterostomia", TaxonomyColors.DEUTEROSTOMIA_COLOR, w ); + writeColorLabels( "Protostomia", TaxonomyColors.PROTOSTOMIA_COLOR, w ); + writeColorLabels( "Cnidaria", TaxonomyColors.CNIDARIA_COLOR, w ); + writeColorLabels( "Placozoa", TaxonomyColors.PLACOZOA_COLOR, w ); + writeColorLabels( "Ctenophora (comb jellies)", TaxonomyColors.CTENOPHORA_COLOR, w ); + writeColorLabels( "Porifera (sponges)", TaxonomyColors.PORIFERA_COLOR, w ); + writeColorLabels( "Choanoflagellida", TaxonomyColors.CHOANOFLAGELLIDA, w ); + writeColorLabels( "Ichthyosporea & Filasterea", TaxonomyColors.ICHTHYOSPOREA_AND_FILASTEREA, w ); + writeColorLabels( "Dikarya (Ascomycota & Basidiomycota, so-called \"higher fungi\")", + TaxonomyColors.DIKARYA_COLOR, + w ); + writeColorLabels( "other Fungi", TaxonomyColors.OTHER_FUNGI_COLOR, w ); + writeColorLabels( "Nucleariidae and Fonticula group", + TaxonomyColors.NUCLEARIIDAE_AND_FONTICULA_GROUP_COLOR, + w ); + writeColorLabels( "Amoebozoa", TaxonomyColors.AMOEBOZOA_COLOR, w ); + writeColorLabels( "Embryophyta (plants)", TaxonomyColors.EMBRYOPHYTA_COLOR, w ); + writeColorLabels( "Chlorophyta (green algae)", TaxonomyColors.CHLOROPHYTA_COLOR, w ); + writeColorLabels( "Rhodophyta (red algae)", TaxonomyColors.RHODOPHYTA_COLOR, w ); + writeColorLabels( "Glaucocystophyce (Glaucophyta)", TaxonomyColors.GLAUCOPHYTA_COLOR, w ); + writeColorLabels( "Hacrobia (Cryptophyta & Haptophyceae & Centroheliozoa)", + TaxonomyColors.HACROBIA_COLOR, + w ); + writeColorLabels( "Stramenopiles (Chromophyta, heterokonts)", TaxonomyColors.STRAMENOPILES_COLOR, w ); + writeColorLabels( "Alveolata", TaxonomyColors.ALVEOLATA_COLOR, w ); + writeColorLabels( "Rhizaria", TaxonomyColors.RHIZARIA_COLOR, w ); + writeColorLabels( "Excavata", TaxonomyColors.EXCAVATA_COLOR, w ); + writeColorLabels( "Apusozoa", TaxonomyColors.APUSOZOA_COLOR, w ); + writeColorLabels( "Archaea", TaxonomyColors.ARCHAEA_COLOR, w ); + writeColorLabels( "Bacteria", TaxonomyColors.BACTERIA_COLOR, w ); + w.write( "
" ); + w.write( "Species group colors:" ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + // + w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + } + // + for( final DomainSimilarity similarity : similarities ) { + if ( ( species_order != null ) && !species_order.isEmpty() ) { + ( similarity ).setSpeciesOrder( species_order ); + } + if ( simple_tab_writer != null ) { + simple_tab_writer.write( similarity.toStringBuffer( PRINT_OPTION.SIMPLE_TAB_DELIMITED, + tax_code_to_id_map, + null ).toString() ); + } + if ( single_writer != null ) { + single_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map, phy ).toString() ); + single_writer.write( SurfacingConstants.NL ); + } + else { + Writer local_writer = split_writers.get( ( similarity.getDomainId().charAt( 0 ) + "" ).toLowerCase() + .charAt( 0 ) ); + if ( local_writer == null ) { + local_writer = split_writers.get( '0' ); + } + local_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map, phy ).toString() ); + local_writer.write( SurfacingConstants.NL ); + } + } + switch ( print_option ) { + case HTML: + for( final Writer w : split_writers.values() ) { + w.write( SurfacingConstants.NL ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + } + break; + default: + break; + } + for( final Writer w : split_writers.values() ) { + w.close(); + } + } + + public static void writeHtmlHead( final Writer w, final String title ) throws IOException { + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( "" ); + w.write( title ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + } + + public static void writeMatrixToFile( final CharacterStateMatrix matrix, + final String filename, + final Format format ) { + final File outfile = new File( filename ); + checkForOutputFileWriteability( outfile ); + try { + final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); + matrix.toWriter( out, format ); + out.flush(); + out.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote matrix: \"" + filename + "\"" ); + } + + public static void writeMatrixToFile( final File matrix_outfile, final List matrices ) { + checkForOutputFileWriteability( matrix_outfile ); + try { + final BufferedWriter out = new BufferedWriter( new FileWriter( matrix_outfile ) ); + for( final DistanceMatrix distance_matrix : matrices ) { + out.write( distance_matrix.toStringBuffer( DistanceMatrix.Format.PHYLIP ).toString() ); + out.write( ForesterUtil.LINE_SEPARATOR ); + out.flush(); + } + out.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + matrix_outfile + "\"" ); + } + + public static void writePhylogenyToFile( final Phylogeny phylogeny, final String filename ) { + final PhylogenyWriter writer = new PhylogenyWriter(); + try { + writer.toPhyloXML( new File( filename ), phylogeny, 1 ); + } + catch ( final IOException e ) { + ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "failed to write phylogeny to \"" + filename + "\": " + + e ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote phylogeny to \"" + filename + "\"" ); + } + + public static void writePresentToNexus( final File output_file, + final File positive_filter_file, + final SortedSet filter, + final List gwcd_list ) { + try { + writeMatrixToFile( DomainParsimonyCalculator.createMatrixOfDomainPresenceOrAbsence( gwcd_list, + positive_filter_file == null ? 
null + : filter ), + output_file + surfacing.DOMAINS_PRESENT_NEXUS, + Format.NEXUS_BINARY ); + writeMatrixToFile( DomainParsimonyCalculator.createMatrixOfBinaryDomainCombinationPresenceOrAbsence( gwcd_list ), + output_file + surfacing.BDC_PRESENT_NEXUS, + Format.NEXUS_BINARY ); + } + catch ( final Exception e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); + } + } + + public static void writeProteinListsForAllSpecies( final File output_dir, + final SortedMap> protein_lists_per_species, + final List gwcd_list, + final double domain_e_cutoff, + final Set pos_filter_doms ) { + final SortedSet all_domains = new TreeSet(); + for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { + all_domains.addAll( gwcd.getAllDomainIds() ); + } + for( final String domain : all_domains ) { + if ( !ForesterUtil.isEmpty( pos_filter_doms ) && !pos_filter_doms.contains( domain ) ) { + continue; + } + final File out = new File( output_dir + ForesterUtil.FILE_SEPARATOR + domain + surfacing.SEQ_EXTRACT_SUFFIX ); + checkForOutputFileWriteability( out ); + try { + final Writer proteins_file_writer = new BufferedWriter( new FileWriter( out ) ); + extractProteinNames( protein_lists_per_species, + domain, + proteins_file_writer, + "\t", + surfacing.LIMIT_SPEC_FOR_PROT_EX, + domain_e_cutoff ); + proteins_file_writer.close(); + } + catch ( final IOException e ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote proteins list to \"" + out + "\"" ); + } + } + + public static void writeTaxonomyLinks( final Writer writer, + final String species, + final Map tax_code_to_id_map ) throws IOException { + if ( ( species.length() > 1 ) && ( species.indexOf( '_' ) < 1 ) ) { + writer.write( " [" ); + if ( ( tax_code_to_id_map != null ) && tax_code_to_id_map.containsKey( species ) ) { + writer.write( "uniprot" ); + } + else { + writer.write( "eol" ); + writer.write( "|" ); + writer.write( "scholar" ); + writer.write( "|" ); + writer.write( "google" ); + } + writer.write( "]" ); + } + } + + private final static void addToCountMap( final Map map, final String s ) { + if ( map.containsKey( s ) ) { + map.put( s, map.get( s ) + 1 ); + } + else { + map.put( s, 1 ); + } + } + + private static void calculateIndependentDomainCombinationGains( final Phylogeny local_phylogeny_l, + final String outfilename_for_counts, + final String outfilename_for_dc, + final String outfilename_for_dc_for_go_mapping, + final String outfilename_for_dc_for_go_mapping_unique, + final String outfilename_for_rank_counts, + final String outfilename_for_ancestor_species_counts, + final String outfilename_for_protein_stats, + final Map protein_length_stats_by_dc, + final Map domain_number_stats_by_dc, + final Map domain_length_stats_by_domain ) { + try { + // + // if ( protein_length_stats_by_dc != null ) { + // for( final Entry entry : protein_length_stats_by_dc.entrySet() ) { + // System.out.print( entry.getKey().toString() ); + // System.out.print( ": " ); + // double[] a = entry.getValue().getDataAsDoubleArray(); + // for( int i = 0; i < a.length; i++ ) { + // System.out.print( a[ i ] + " " ); + // } + // System.out.println(); + // } + // } + // if ( domain_number_stats_by_dc != null ) { + // for( final Entry entry : domain_number_stats_by_dc.entrySet() ) { + // System.out.print( entry.getKey().toString() ); + // System.out.print( ": " ); + // double[] a = entry.getValue().getDataAsDoubleArray(); + // for( int i = 0; i < a.length; i++ ) { + // 
System.out.print( a[ i ] + " " ); + // } + // System.out.println(); + // } + // } + // + final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) ); + final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) ); + final BufferedWriter out_dc_for_go_mapping = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping ) ); + final BufferedWriter out_dc_for_go_mapping_unique = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping_unique ) ); + final SortedMap dc_gain_counts = new TreeMap(); + for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorPostorder(); it.hasNext(); ) { + final PhylogenyNode n = it.next(); + final Set gained_dc = n.getNodeData().getBinaryCharacters().getGainedCharacters(); + for( final String dc : gained_dc ) { + if ( dc_gain_counts.containsKey( dc ) ) { + dc_gain_counts.put( dc, dc_gain_counts.get( dc ) + 1 ); + } + else { + dc_gain_counts.put( dc, 1 ); + } + } + } + final SortedMap histogram = new TreeMap(); + final SortedMap domain_lists = new TreeMap(); + final SortedMap dc_reapp_counts_to_protein_length_stats = new TreeMap(); + final SortedMap dc_reapp_counts_to_domain_number_stats = new TreeMap(); + final SortedMap dc_reapp_counts_to_domain_lengths_stats = new TreeMap(); + final SortedMap> domain_lists_go = new TreeMap>(); + final SortedMap> domain_lists_go_unique = new TreeMap>(); + final Set dcs = dc_gain_counts.keySet(); + final SortedSet more_than_once = new TreeSet(); + DescriptiveStatistics gained_once_lengths_stats = new BasicDescriptiveStatistics(); + DescriptiveStatistics gained_once_domain_count_stats = new BasicDescriptiveStatistics(); + DescriptiveStatistics gained_multiple_times_lengths_stats = new BasicDescriptiveStatistics(); + final DescriptiveStatistics gained_multiple_times_domain_count_stats = new BasicDescriptiveStatistics(); + long gained_multiple_times_domain_length_sum = 0; + long gained_once_domain_length_sum = 0; + long gained_multiple_times_domain_length_count = 0; + long gained_once_domain_length_count = 0; + for( final String dc : dcs ) { + final int count = dc_gain_counts.get( dc ); + if ( histogram.containsKey( count ) ) { + histogram.put( count, histogram.get( count ) + 1 ); + domain_lists.get( count ).append( ", " + dc ); + domain_lists_go.get( count ).addAll( splitDomainCombination( dc ) ); + domain_lists_go_unique.get( count ).addAll( splitDomainCombination( dc ) ); + } + else { + histogram.put( count, 1 ); + domain_lists.put( count, new StringBuilder( dc ) ); + final PriorityQueue q = new PriorityQueue(); + q.addAll( splitDomainCombination( dc ) ); + domain_lists_go.put( count, q ); + final SortedSet set = new TreeSet(); + set.addAll( splitDomainCombination( dc ) ); + domain_lists_go_unique.put( count, set ); + } + if ( protein_length_stats_by_dc != null ) { + if ( !dc_reapp_counts_to_protein_length_stats.containsKey( count ) ) { + dc_reapp_counts_to_protein_length_stats.put( count, new BasicDescriptiveStatistics() ); + } + dc_reapp_counts_to_protein_length_stats.get( count ).addValue( protein_length_stats_by_dc.get( dc ) + .arithmeticMean() ); + } + if ( domain_number_stats_by_dc != null ) { + if ( !dc_reapp_counts_to_domain_number_stats.containsKey( count ) ) { + dc_reapp_counts_to_domain_number_stats.put( count, new BasicDescriptiveStatistics() ); + } + dc_reapp_counts_to_domain_number_stats.get( count ).addValue( domain_number_stats_by_dc.get( dc ) + .arithmeticMean() ); + } + if ( domain_length_stats_by_domain != null ) { 
+ if ( !dc_reapp_counts_to_domain_lengths_stats.containsKey( count ) ) { + dc_reapp_counts_to_domain_lengths_stats.put( count, new BasicDescriptiveStatistics() ); + } + final String[] ds = dc.split( "=" ); + dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain + .get( ds[ 0 ] ).arithmeticMean() ); + dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain + .get( ds[ 1 ] ).arithmeticMean() ); + } + if ( count > 1 ) { + more_than_once.add( dc ); + if ( protein_length_stats_by_dc != null ) { + final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc ); + for( final double element : s.getData() ) { + gained_multiple_times_lengths_stats.addValue( element ); + } + } + if ( domain_number_stats_by_dc != null ) { + final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc ); + for( final double element : s.getData() ) { + gained_multiple_times_domain_count_stats.addValue( element ); + } + } + if ( domain_length_stats_by_domain != null ) { + final String[] ds = dc.split( "=" ); + final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] ); + final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] ); + for( final double element : s0.getData() ) { + gained_multiple_times_domain_length_sum += element; + ++gained_multiple_times_domain_length_count; + } + for( final double element : s1.getData() ) { + gained_multiple_times_domain_length_sum += element; + ++gained_multiple_times_domain_length_count; + } + } + } + else { + if ( protein_length_stats_by_dc != null ) { + final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc ); + for( final double element : s.getData() ) { + gained_once_lengths_stats.addValue( element ); + } + } + if ( domain_number_stats_by_dc != null ) { + final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc ); + for( final double element : s.getData() ) { + gained_once_domain_count_stats.addValue( element ); + } + } + if ( domain_length_stats_by_domain != null ) { + final String[] ds = dc.split( "=" ); + final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] ); + final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] ); + for( final double element : s0.getData() ) { + gained_once_domain_length_sum += element; + ++gained_once_domain_length_count; + } + for( final double element : s1.getData() ) { + gained_once_domain_length_sum += element; + ++gained_once_domain_length_count; + } + } + } + } + final Set histogram_keys = histogram.keySet(); + for( final Integer histogram_key : histogram_keys ) { + final int count = histogram.get( histogram_key ); + final StringBuilder dc = domain_lists.get( histogram_key ); + out_counts.write( histogram_key + "\t" + count + ForesterUtil.LINE_SEPARATOR ); + out_dc.write( histogram_key + "\t" + dc + ForesterUtil.LINE_SEPARATOR ); + out_dc_for_go_mapping.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR ); + final Object[] sorted = domain_lists_go.get( histogram_key ).toArray(); + Arrays.sort( sorted ); + for( final Object domain : sorted ) { + out_dc_for_go_mapping.write( domain + ForesterUtil.LINE_SEPARATOR ); + } + out_dc_for_go_mapping_unique.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR ); + for( final String domain : domain_lists_go_unique.get( histogram_key ) ) { + out_dc_for_go_mapping_unique.write( domain + ForesterUtil.LINE_SEPARATOR ); + } + } + out_counts.close(); + out_dc.close(); + out_dc_for_go_mapping.close(); + 
out_dc_for_go_mapping_unique.close(); + final SortedMap lca_rank_counts = new TreeMap(); + final SortedMap lca_ancestor_species_counts = new TreeMap(); + for( final String dc : more_than_once ) { + final List nodes = new ArrayList(); + for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorExternalForward(); it.hasNext(); ) { + final PhylogenyNode n = it.next(); + if ( n.getNodeData().getBinaryCharacters().getGainedCharacters().contains( dc ) ) { + nodes.add( n ); + } + } + for( int i = 0; i < ( nodes.size() - 1 ); ++i ) { + for( int j = i + 1; j < nodes.size(); ++j ) { + final PhylogenyNode lca = PhylogenyMethods.calculateLCA( nodes.get( i ), nodes.get( j ) ); + String rank = "unknown"; + if ( lca.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getRank() ) ) { + rank = lca.getNodeData().getTaxonomy().getRank(); + } + addToCountMap( lca_rank_counts, rank ); + String lca_species; + if ( lca.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getScientificName() ) ) { + lca_species = lca.getNodeData().getTaxonomy().getScientificName(); + } + else if ( lca.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getCommonName() ) ) { + lca_species = lca.getNodeData().getTaxonomy().getCommonName(); + } + else { + lca_species = lca.getName(); + } + addToCountMap( lca_ancestor_species_counts, lca_species ); + } + } + } + final BufferedWriter out_for_rank_counts = new BufferedWriter( new FileWriter( outfilename_for_rank_counts ) ); + final BufferedWriter out_for_ancestor_species_counts = new BufferedWriter( new FileWriter( outfilename_for_ancestor_species_counts ) ); + ForesterUtil.map2writer( out_for_rank_counts, lca_rank_counts, "\t", ForesterUtil.LINE_SEPARATOR ); + ForesterUtil.map2writer( out_for_ancestor_species_counts, + lca_ancestor_species_counts, + "\t", + ForesterUtil.LINE_SEPARATOR ); + out_for_rank_counts.close(); + out_for_ancestor_species_counts.close(); + if ( !ForesterUtil.isEmpty( outfilename_for_protein_stats ) + && ( ( domain_length_stats_by_domain != null ) || ( protein_length_stats_by_dc != null ) || ( domain_number_stats_by_dc != null ) ) ) { + final BufferedWriter w = new BufferedWriter( new FileWriter( outfilename_for_protein_stats ) ); + w.write( "Domain Lengths: " ); + w.write( "\n" ); + if ( domain_length_stats_by_domain != null ) { + for( final Entry entry : dc_reapp_counts_to_domain_lengths_stats + .entrySet() ) { + w.write( entry.getKey().toString() ); + w.write( "\t" + entry.getValue().arithmeticMean() ); + w.write( "\t" + entry.getValue().median() ); + w.write( "\n" ); + } + } + w.flush(); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Protein Lengths: " ); + w.write( "\n" ); + if ( protein_length_stats_by_dc != null ) { + for( final Entry entry : dc_reapp_counts_to_protein_length_stats + .entrySet() ) { + w.write( entry.getKey().toString() ); + w.write( "\t" + entry.getValue().arithmeticMean() ); + w.write( "\t" + entry.getValue().median() ); + w.write( "\n" ); + } + } + w.flush(); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Number of domains: " ); + w.write( "\n" ); + if ( domain_number_stats_by_dc != null ) { + for( final Entry entry : dc_reapp_counts_to_domain_number_stats + .entrySet() ) { + w.write( entry.getKey().toString() ); + w.write( "\t" + entry.getValue().arithmeticMean() ); + w.write( "\t" + entry.getValue().median() ); + w.write( "\n" ); + } + } + w.flush(); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained once, 
domain lengths:" ); + w.write( "\n" ); + w.write( "N: " + gained_once_domain_length_count ); + w.write( "\n" ); + w.write( "Avg: " + ( ( double ) gained_once_domain_length_sum / gained_once_domain_length_count ) ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained multiple times, domain lengths:" ); + w.write( "\n" ); + w.write( "N: " + gained_multiple_times_domain_length_count ); + w.write( "\n" ); + w.write( "Avg: " + + ( ( double ) gained_multiple_times_domain_length_sum / gained_multiple_times_domain_length_count ) ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained once, protein lengths:" ); + w.write( "\n" ); + w.write( gained_once_lengths_stats.toString() ); + gained_once_lengths_stats = null; + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained once, domain counts:" ); + w.write( "\n" ); + w.write( gained_once_domain_count_stats.toString() ); + gained_once_domain_count_stats = null; + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained multiple times, protein lengths:" ); + w.write( "\n" ); + w.write( gained_multiple_times_lengths_stats.toString() ); + gained_multiple_times_lengths_stats = null; + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained multiple times, domain counts:" ); + w.write( "\n" ); + w.write( gained_multiple_times_domain_count_stats.toString() ); + w.flush(); + w.close(); + } + } + catch ( final IOException e ) { + ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch counts to [" + + outfilename_for_counts + "]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch lists to [" + + outfilename_for_dc + "]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, + "Wrote independent domain combination gains fitch lists to (for GO mapping) [" + + outfilename_for_dc_for_go_mapping + "]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, + "Wrote independent domain combination gains fitch lists to (for GO mapping, unique) [" + + outfilename_for_dc_for_go_mapping_unique + "]" ); + } + + private static SortedSet collectAllDomainsChangedOnSubtree( final PhylogenyNode subtree_root, + final boolean get_gains ) { + final SortedSet domains = new TreeSet(); + for( final PhylogenyNode descendant : PhylogenyMethods.getAllDescendants( subtree_root ) ) { + final BinaryCharacters chars = descendant.getNodeData().getBinaryCharacters(); + if ( get_gains ) { + domains.addAll( chars.getGainedCharacters() ); + } + else { + domains.addAll( chars.getLostCharacters() ); + } + } + return domains; + } + + private static File createBaseDirForPerNodeDomainFiles( final String base_dir, + final boolean domain_combinations, + final CharacterStateMatrix.GainLossStates state, + final String outfile ) { + File per_node_go_mapped_domain_gain_loss_files_base_dir = new File( new File( outfile ).getParent() + + ForesterUtil.FILE_SEPARATOR + base_dir ); + if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { + per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); + } + if ( domain_combinations ) { + per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + "DC" ); + } + else { + per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + "DOMAINS" ); + } + if ( 
!per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { + per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); + } + if ( state == GainLossStates.GAIN ) { + per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + "GAINS" ); + } + else if ( state == GainLossStates.LOSS ) { + per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + "LOSSES" ); + } + else { + per_node_go_mapped_domain_gain_loss_files_base_dir = new File( per_node_go_mapped_domain_gain_loss_files_base_dir + + ForesterUtil.FILE_SEPARATOR + "PRESENT" ); + } + if ( !per_node_go_mapped_domain_gain_loss_files_base_dir.exists() ) { + per_node_go_mapped_domain_gain_loss_files_base_dir.mkdir(); + } + return per_node_go_mapped_domain_gain_loss_files_base_dir; + } + + private static SortedSet createSetOfAllBinaryDomainCombinationsPerGenome( final GenomeWideCombinableDomains gwcd ) { + final SortedMap cds = gwcd.getAllCombinableDomainsIds(); + final SortedSet binary_combinations = new TreeSet(); + for( final String domain_id : cds.keySet() ) { + final CombinableDomains cd = cds.get( domain_id ); + binary_combinations.addAll( cd.toBinaryDomainCombinations() ); + } + return binary_combinations; + } + + private static void printSomeStats( final DescriptiveStatistics stats, final AsciiHistogram histo, final Writer w ) + throws IOException { + w.write( "
" ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "
" );
+        w.write( SurfacingConstants.NL );
+        if ( histo != null ) {
+            w.write( histo.toStringBuffer( 20, '|', 40, 5 ).toString() );
+            w.write( SurfacingConstants.NL );
+        }
+        w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + if ( stats.getN() > 1 ) { + w.write( "" ); + } + else { + w.write( "" ); + } + w.write( SurfacingConstants.NL ); + w.write( "
N: " + stats.getN() + "
Min: " + stats.getMin() + "
Max: " + stats.getMax() + "
Mean: " + stats.arithmeticMean() + "
SD: " + stats.sampleStandardDeviation() + "
SD: n/a
" ); + w.write( SurfacingConstants.NL ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + } + + private static List splitDomainCombination( final String dc ) { + final String[] s = dc.split( "=" ); + if ( s.length != 2 ) { + ForesterUtil.printErrorMessage( surfacing.PRG_NAME, "Stringyfied domain combination has illegal format: " + + dc ); + System.exit( -1 ); + } + final List l = new ArrayList( 2 ); + l.add( s[ 0 ] ); + l.add( s[ 1 ] ); + return l; + } + + private static void writeAllEncounteredPfamsToFile( final Map> domain_id_to_go_ids_map, + final Map go_id_to_term_map, + final String outfile_name, + final SortedSet all_pfams_encountered ) { + final File all_pfams_encountered_file = new File( outfile_name + surfacing.ALL_PFAMS_ENCOUNTERED_SUFFIX ); + final File all_pfams_encountered_with_go_annotation_file = new File( outfile_name + + surfacing.ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX ); + final File encountered_pfams_summary_file = new File( outfile_name + surfacing.ENCOUNTERED_PFAMS_SUMMARY_SUFFIX ); + int biological_process_counter = 0; + int cellular_component_counter = 0; + int molecular_function_counter = 0; + int pfams_with_mappings_counter = 0; + int pfams_without_mappings_counter = 0; + int pfams_without_mappings_to_bp_or_mf_counter = 0; + int pfams_with_mappings_to_bp_or_mf_counter = 0; + try { + final Writer all_pfams_encountered_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_file ) ); + final Writer all_pfams_encountered_with_go_annotation_writer = new BufferedWriter( new FileWriter( all_pfams_encountered_with_go_annotation_file ) ); + final Writer summary_writer = new BufferedWriter( new FileWriter( encountered_pfams_summary_file ) ); + summary_writer.write( "# Pfam to GO mapping summary" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Actual summary is at the end of this file." 
); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Encountered Pfams without a GO mapping:" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + for( final String pfam : all_pfams_encountered ) { + all_pfams_encountered_writer.write( pfam ); + all_pfams_encountered_writer.write( ForesterUtil.LINE_SEPARATOR ); + final String domain_id = new String( pfam ); + if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) { + ++pfams_with_mappings_counter; + all_pfams_encountered_with_go_annotation_writer.write( pfam ); + all_pfams_encountered_with_go_annotation_writer.write( ForesterUtil.LINE_SEPARATOR ); + final List go_ids = domain_id_to_go_ids_map.get( domain_id ); + boolean maps_to_bp = false; + boolean maps_to_cc = false; + boolean maps_to_mf = false; + for( final GoId go_id : go_ids ) { + final GoTerm go_term = go_id_to_term_map.get( go_id ); + if ( go_term.getGoNameSpace().isBiologicalProcess() ) { + maps_to_bp = true; + } + else if ( go_term.getGoNameSpace().isCellularComponent() ) { + maps_to_cc = true; + } + else if ( go_term.getGoNameSpace().isMolecularFunction() ) { + maps_to_mf = true; + } + } + if ( maps_to_bp ) { + ++biological_process_counter; + } + if ( maps_to_cc ) { + ++cellular_component_counter; + } + if ( maps_to_mf ) { + ++molecular_function_counter; + } + if ( maps_to_bp || maps_to_mf ) { + ++pfams_with_mappings_to_bp_or_mf_counter; + } + else { + ++pfams_without_mappings_to_bp_or_mf_counter; + } + } + else { + ++pfams_without_mappings_to_bp_or_mf_counter; + ++pfams_without_mappings_counter; + summary_writer.write( pfam ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + } + } + all_pfams_encountered_writer.close(); + all_pfams_encountered_with_go_annotation_writer.close(); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + all_pfams_encountered.size() + + "] encountered Pfams to: \"" + all_pfams_encountered_file + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + pfams_with_mappings_counter + + "] encountered Pfams with GO mappings to: \"" + all_pfams_encountered_with_go_annotation_file + + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote summary (including all [" + + pfams_without_mappings_counter + "] encountered Pfams without GO mappings) to: \"" + + encountered_pfams_summary_file + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Sum of Pfams encountered : " + + all_pfams_encountered.size() ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without a mapping : " + + pfams_without_mappings_counter + " [" + + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without mapping to proc. or func. : " + + pfams_without_mappings_to_bp_or_mf_counter + " [" + + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping : " + + pfams_with_mappings_counter + " [" + + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping to proc. or func. 
: " + + pfams_with_mappings_to_bp_or_mf_counter + " [" + + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to biological process: " + + biological_process_counter + " [" + + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to molecular function: " + + molecular_function_counter + " [" + + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to cellular component: " + + cellular_component_counter + " [" + + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Sum of Pfams encountered : " + all_pfams_encountered.size() ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams without a mapping : " + pfams_without_mappings_counter + + " [" + ( ( 100 * pfams_without_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams without mapping to proc. or func. : " + + pfams_without_mappings_to_bp_or_mf_counter + " [" + + ( ( 100 * pfams_without_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams with a mapping : " + pfams_with_mappings_counter + " [" + + ( ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams with a mapping to proc. or func. : " + + pfams_with_mappings_to_bp_or_mf_counter + " [" + + ( ( 100 * pfams_with_mappings_to_bp_or_mf_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams with mapping to biological process: " + biological_process_counter + " [" + + ( ( 100 * biological_process_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams with mapping to molecular function: " + molecular_function_counter + " [" + + ( ( 100 * molecular_function_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.write( "# Pfams with mapping to cellular component: " + cellular_component_counter + " [" + + ( ( 100 * cellular_component_counter ) / all_pfams_encountered.size() ) + "%]" ); + summary_writer.write( ForesterUtil.LINE_SEPARATOR ); + summary_writer.close(); + } + catch ( final IOException e ) { + ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); } } + private final static void writeColorLabels( final String l, final Color c, final Writer w ) throws IOException { + w.write( "" ); + w.write( l ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + } + private static void writeDomainData( final Map> domain_id_to_go_ids_map, final Map go_id_to_term_map, final GoNameSpace go_namespace_limit, @@ -2798,7 +3189,7 @@ public final class SurfacingUtil { if ( ForesterUtil.isEmpty( domain_1 ) ) { domain_count = 1; } - // The following has a difficult to understand logic. + // The following has a difficult to understand logic. 
for( int d = 0; d < domain_count; ++d ) { List go_ids = null; boolean go_annotation_present = false; @@ -2843,7 +3234,7 @@ public final class SurfacingUtil { final String go_id_str = go_id.getId(); out.write( "" ); out.write( "" + go_id_str + "" ); + + "\" target=\"amigo_window\">" + go_id_str + "" ); out.write( "" ); out.write( go_term.getName() ); if ( domain_count == 2 ) { @@ -2871,7 +3262,7 @@ public final class SurfacingUtil { out.write( SurfacingConstants.NL ); } } - } // for( int d = 0; d < domain_count; ++d ) + } // for( int d = 0; d < domain_count; ++d ) if ( !any_go_annotation_present ) { out.write( "" ); writeDomainIdsToHtml( out, domain_0, domain_1, prefix_for_html, domain_id_to_secondary_features_maps ); @@ -2890,7 +3281,7 @@ public final class SurfacingUtil { final String domain_1, final String prefix_for_detailed_html, final Map>[] domain_id_to_secondary_features_maps ) - throws IOException { + throws IOException { out.write( "" ); if ( !ForesterUtil.isEmpty( prefix_for_detailed_html ) ) { out.write( prefix_for_detailed_html ); @@ -2900,126 +3291,6 @@ public final class SurfacingUtil { out.write( "" ); } - public static void writeDomainSimilaritiesToFile( final StringBuilder html_desc, - final StringBuilder html_title, - final Writer simple_tab_writer, - final Writer single_writer, - Map split_writers, - final SortedSet similarities, - final boolean treat_as_binary, - final List species_order, - final PrintableDomainSimilarity.PRINT_OPTION print_option, - final DomainSimilarity.DomainSimilarityScoring scoring, - final boolean verbose, - final Map tax_code_to_id_map, - Phylogeny phy ) throws IOException { - if ( ( single_writer != null ) && ( ( split_writers == null ) || split_writers.isEmpty() ) ) { - split_writers = new HashMap(); - split_writers.put( '_', single_writer ); - } - switch ( print_option ) { - case SIMPLE_TAB_DELIMITED: - break; - case HTML: - for( final Character key : split_writers.keySet() ) { - final Writer w = split_writers.get( key ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - if ( key != '_' ) { - addHtmlHead( w, "DC analysis (" + html_title + ") " + key.toString().toUpperCase() ); - } - else { - addHtmlHead( w, "DC analysis (" + html_title + ")" ); - } - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( html_desc.toString() ); - w.write( SurfacingConstants.NL ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - } - break; - } - // - for( final DomainSimilarity similarity : similarities ) { - if ( ( species_order != null ) && !species_order.isEmpty() ) { - ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order ); - } - if ( single_writer != null ) { - single_writer.write( "" ); - single_writer.write( SurfacingConstants.NL ); - } - else { - Writer local_writer = split_writers.get( ( similarity.getDomainId().charAt( 0 ) + "" ).toLowerCase() - .charAt( 0 ) ); - if ( local_writer == null ) { - local_writer = split_writers.get( '0' ); - } - local_writer.write( "" ); - local_writer.write( SurfacingConstants.NL ); - } - } - for( final Writer w : split_writers.values() ) { - w.write( "
Domains:
" - + similarity.getDomainId() + "
" - + similarity.getDomainId() + "
" ); - w.write( SurfacingConstants.NL ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - } - // - for( final DomainSimilarity similarity : similarities ) { - if ( ( species_order != null ) && !species_order.isEmpty() ) { - ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order ); - } - if ( simple_tab_writer != null ) { - simple_tab_writer.write( similarity.toStringBuffer( PRINT_OPTION.SIMPLE_TAB_DELIMITED, - tax_code_to_id_map, - null ).toString() ); - } - if ( single_writer != null ) { - single_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map, phy ).toString() ); - single_writer.write( SurfacingConstants.NL ); - } - else { - Writer local_writer = split_writers.get( ( similarity.getDomainId().charAt( 0 ) + "" ).toLowerCase() - .charAt( 0 ) ); - if ( local_writer == null ) { - local_writer = split_writers.get( '0' ); - } - local_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map, phy ).toString() ); - local_writer.write( SurfacingConstants.NL ); - } - } - switch ( print_option ) { - case HTML: - for( final Writer w : split_writers.values() ) { - w.write( SurfacingConstants.NL ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - } - break; - } - for( final Writer w : split_writers.values() ) { - w.close(); - } - } - private static void writeDomainsToIndividualFilePerTreeNode( final Writer individual_files_writer, final String domain_0, final String domain_1 ) throws IOException { @@ -3031,40 +3302,6 @@ public final class SurfacingUtil { } } - public static void writeMatrixToFile( final CharacterStateMatrix matrix, - final String filename, - final Format format ) { - final File outfile = new File( filename ); - checkForOutputFileWriteability( outfile ); - try { - final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) ); - matrix.toWriter( out, format ); - out.flush(); - out.close(); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); - } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote matrix: \"" + filename + "\"" ); - } - - public static void writeMatrixToFile( final File matrix_outfile, final List matrices ) { - checkForOutputFileWriteability( matrix_outfile ); - try { - final BufferedWriter out = new BufferedWriter( new FileWriter( matrix_outfile ) ); - for( final DistanceMatrix distance_matrix : matrices ) { - out.write( distance_matrix.toStringBuffer( DistanceMatrix.Format.PHYLIP ).toString() ); - out.write( ForesterUtil.LINE_SEPARATOR ); - out.flush(); - } - out.close(); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); - } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + matrix_outfile + "\"" ); - } - private static void writePfamsToFile( final String outfile_name, final SortedSet pfams ) { try { final Writer writer = new BufferedWriter( new FileWriter( new File( outfile_name ) ) ); @@ -3074,101 +3311,19 @@ public final class SurfacingUtil { } writer.close(); ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote " + pfams.size() + " pfams to [" + outfile_name - + "]" ); + + "]" ); } catch ( final IOException e ) { ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); } } - public static void writePhylogenyToFile( final Phylogeny phylogeny, final String filename ) { - final PhylogenyWriter writer = new PhylogenyWriter(); - try { - writer.toPhyloXML( new File( filename ), phylogeny, 1 ); - } - catch ( final IOException e ) { - ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "failed to write phylogeny to \"" + filename + "\": " - + e ); - } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote phylogeny to \"" + filename + "\"" ); - } - - public static void writePresentToNexus( final File output_file, - final File positive_filter_file, - final SortedSet filter, - final List gwcd_list ) { - try { - writeMatrixToFile( DomainParsimonyCalculator.createMatrixOfDomainPresenceOrAbsence( gwcd_list, - positive_filter_file == null ? 
null - : filter ), - output_file + surfacing.DOMAINS_PRESENT_NEXUS, - Format.NEXUS_BINARY ); - writeMatrixToFile( DomainParsimonyCalculator.createMatrixOfBinaryDomainCombinationPresenceOrAbsence( gwcd_list ), - output_file + surfacing.BDC_PRESENT_NEXUS, - Format.NEXUS_BINARY ); - } - catch ( final Exception e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); - } - } - - public static void writeProteinListsForAllSpecies( final File output_dir, - final SortedMap> protein_lists_per_species, - final List gwcd_list, - final double domain_e_cutoff ) { - final SortedSet all_domains = new TreeSet(); - for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { - all_domains.addAll( gwcd.getAllDomainIds() ); - } - for( final String domain : all_domains ) { - final File out = new File( output_dir + ForesterUtil.FILE_SEPARATOR + domain + surfacing.SEQ_EXTRACT_SUFFIX ); - checkForOutputFileWriteability( out ); - try { - final Writer proteins_file_writer = new BufferedWriter( new FileWriter( out ) ); - extractProteinNames( protein_lists_per_species, - domain, - proteins_file_writer, - "\t", - surfacing.LIMIT_SPEC_FOR_PROT_EX, - domain_e_cutoff ); - proteins_file_writer.close(); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); - } - ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote proteins list to \"" + out + "\"" ); - } - } - - public static void writeTaxonomyLinks( final Writer writer, - final String species, - final Map tax_code_to_id_map ) throws IOException { - if ( ( species.length() > 1 ) && ( species.indexOf( '_' ) < 1 ) ) { - writer.write( " [" ); - if ( ( tax_code_to_id_map != null ) && tax_code_to_id_map.containsKey( species ) ) { - writer.write( "uniprot" ); - } - else { - writer.write( "eol" ); - writer.write( "|" ); - writer.write( "scholar" ); - writer.write( "|" ); - writer.write( "google" ); - } - writer.write( "]" ); - } - } - private static void writeToNexus( final String outfile_name, final CharacterStateMatrix matrix, final Phylogeny phylogeny ) { if ( !( matrix instanceof BasicCharacterStateMatrix ) ) { throw new IllegalArgumentException( "can only write matrices of type [" + BasicCharacterStateMatrix.class - + "] to nexus" ); + + "] to nexus" ); } final BasicCharacterStateMatrix my_matrix = ( org.forester.evoinference.matrix.character.BasicCharacterStateMatrix ) matrix; final List phylogenies = new ArrayList( 1 ); @@ -3200,7 +3355,23 @@ public final class SurfacingUtil { phylogeny ); } - private SurfacingUtil() { - // Hidden constructor. + final static class DomainComparator implements Comparator { + + final private boolean _ascending; + + public DomainComparator( final boolean ascending ) { + _ascending = ascending; + } + + @Override + public final int compare( final Domain d0, final Domain d1 ) { + if ( d0.getFrom() < d1.getFrom() ) { + return _ascending ? -1 : 1; + } + else if ( d0.getFrom() > d1.getFrom() ) { + return _ascending ? 1 : -1; + } + return 0; + } } }
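A note on the percentage figures in the GO-mapping summary written above: the counters and all_pfams_encountered.size() are int values, so expressions of the form ( 100 * counter ) / size() use integer division and the reported percentages are truncated toward zero. A minimal, self-contained sketch of the arithmetic (hypothetical class and values, not forester API):

    public final class PercentTruncationSketch {

        public static void main( final String[] args ) {
            final int pfams_with_mappings_counter = 2499;  // hypothetical counter value
            final int all_pfams_encountered       = 10000; // hypothetical total
            // Integer arithmetic, as in the summary output above: truncates to 24.
            final int as_reported = ( 100 * pfams_with_mappings_counter ) / all_pfams_encountered;
            // Floating-point arithmetic would retain the fractional part: 24.99.
            final double exact = ( 100.0 * pfams_with_mappings_counter ) / all_pfams_encountered;
            System.out.println( as_reported + " vs " + exact ); // prints "24 vs 24.99"
        }
    }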
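The string literals inside writeColorLabels( final String l, final Color c, final Writer w ) appear empty in this rendering because the HTML markup they contained was swallowed when the page was converted to text; the actual forester markup is not recoverable from this diff. The following is only a generic sketch, assuming the method wraps the label in color-keyed table-row markup, with a hypothetical helper name:

    import java.awt.Color;
    import java.io.IOException;
    import java.io.StringWriter;
    import java.io.Writer;

    public final class ColorLabelSketch {

        // Hypothetical stand-in for the stripped markup: a table row whose cell
        // background is the RGB hex value of the supplied color.
        static void writeColorLabel( final String label, final Color color, final Writer w )
                throws IOException {
            final String hex = String.format( "%02x%02x%02x",
                                              color.getRed(),
                                              color.getGreen(),
                                              color.getBlue() );
            w.write( "<tr><td style=\"background-color:#" + hex + "\">" );
            w.write( label );
            w.write( "</td></tr>" );
            w.write( System.lineSeparator() );
        }

        public static void main( final String[] args ) throws IOException {
            final Writer w = new StringWriter();
            writeColorLabel( "DEINOCOCCUS", new Color( 0, 190, 0 ), w );
            System.out.print( w ); // <tr><td style="background-color:#00be00">DEINOCOCCUS</td></tr>
        }
    }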
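The DomainComparator nested class, relocated to the end of SurfacingUtil in this revision, orders domains by their start coordinate, ascending or descending depending on the flag passed to its constructor. A self-contained sketch of the same ordering logic against a minimal stand-in type (SimpleDomain is hypothetical, not the forester Domain interface):

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.Comparator;
    import java.util.List;

    public final class DomainOrderingSketch {

        // Minimal stand-in for a domain hit; only the start coordinate matters here.
        static final class SimpleDomain {

            final String id;
            final int    from;

            SimpleDomain( final String id, final int from ) {
                this.id = id;
                this.from = from;
            }

            @Override
            public String toString() {
                return id + "@" + from;
            }
        }

        // Same ordering idea as SurfacingUtil.DomainComparator: by start position,
        // ascending or descending depending on the flag.
        static Comparator<SimpleDomain> byFrom( final boolean ascending ) {
            return new Comparator<SimpleDomain>() {

                @Override
                public int compare( final SimpleDomain d0, final SimpleDomain d1 ) {
                    if ( d0.from < d1.from ) {
                        return ascending ? -1 : 1;
                    }
                    else if ( d0.from > d1.from ) {
                        return ascending ? 1 : -1;
                    }
                    return 0;
                }
            };
        }

        public static void main( final String[] args ) {
            final List<SimpleDomain> domains = new ArrayList<SimpleDomain>();
            domains.add( new SimpleDomain( "SH3", 60 ) );
            domains.add( new SimpleDomain( "SH2", 10 ) );
            domains.add( new SimpleDomain( "Kinase", 120 ) );
            Collections.sort( domains, byFrom( true ) );
            System.out.println( domains ); // [SH2@10, SH3@60, Kinase@120]
        }
    }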