+ else {
+ per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats
+ .sampleStandardDeviation() ) + "\t" );
+ }
+ per_genome_domain_promiscuity_statistics_writer.write( FORMATTER_3.format( stats.median() ) + "\t" );
+ per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMin() + "\t" );
+ per_genome_domain_promiscuity_statistics_writer.write( ( int ) stats.getMax() + "\t" );
+ per_genome_domain_promiscuity_statistics_writer.write( stats.getN() + "\t" );
+ final SortedSet<String> mpds = gwcd.getMostPromiscuosDomain();
+ for( final String mpd : mpds ) {
+ per_genome_domain_promiscuity_statistics_writer.write( mpd + " " );
+ }
+ per_genome_domain_promiscuity_statistics_writer.write( ForesterUtil.LINE_SEPARATOR );
+ }
+ catch ( final IOException e ) {
+ ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
+ }
+ if ( input_file_properties[ i ].length == 3 ) {
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \""
+ + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", "
+ + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile + "\"" );
+ }
+ else {
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \""
+ + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ") to: \""
+ + dc_outfile + "\"" );
+ }
+ }
+
+ public static void writeDomainSimilaritiesToFile( final StringBuilder html_desc,
+ final StringBuilder html_title,
+ final Writer simple_tab_writer,
+ final Writer single_writer,
+ Map<Character, Writer> split_writers,
+ final SortedSet<DomainSimilarity> similarities,
+ final boolean treat_as_binary,
+ final List<Species> species_order,
+ final PrintableDomainSimilarity.PRINT_OPTION print_option,
+ final DomainSimilarity.DomainSimilarityScoring scoring,
+ final boolean verbose,
+ final Map<String, Integer> tax_code_to_id_map )
+ throws IOException {
+ if ( ( single_writer != null ) && ( ( split_writers == null ) || split_writers.isEmpty() ) ) {
+ split_writers = new HashMap<Character, Writer>();
+ split_writers.put( '_', single_writer );
+ }
+ switch ( print_option ) {
+ case SIMPLE_TAB_DELIMITED:
+ break;
+ case HTML:
+ for( final Character key : split_writers.keySet() ) {
+ final Writer w = split_writers.get( key );
+ w.write( "<html>" );
+ w.write( SurfacingConstants.NL );
+ if ( key != '_' ) {
+ addHtmlHead( w, "DC analysis (" + html_title + ") " + key.toString().toUpperCase() );
+ }
+ else {
+ addHtmlHead( w, "DC analysis (" + html_title + ")" );
+ }
+ w.write( SurfacingConstants.NL );
+ w.write( "<body>" );
+ w.write( SurfacingConstants.NL );
+ w.write( html_desc.toString() );
+ w.write( SurfacingConstants.NL );
+ w.write( "<hr>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "<br>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "<table>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "<tr><td><b>Domains:</b></td></tr>" );
+ w.write( SurfacingConstants.NL );
+ }
+ break;
+ }
+ //
+ for( final DomainSimilarity similarity : similarities ) {
+ if ( ( species_order != null ) && !species_order.isEmpty() ) {
+ ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order );
+ }
+ if ( single_writer != null ) {
+ single_writer.write( "<tr><td><b><a href=\"#" + similarity.getDomainId() + "\">"
+ + similarity.getDomainId() + "</a></b></td></tr>" );
+ single_writer.write( SurfacingConstants.NL );
+ }
+ else {
+ Writer local_writer = split_writers.get( ( similarity.getDomainId().charAt( 0 ) + "" ).toLowerCase()
+ .charAt( 0 ) );
+ if ( local_writer == null ) {
+ local_writer = split_writers.get( '0' );
+ }
+ local_writer.write( "<tr><td><b><a href=\"#" + similarity.getDomainId() + "\">"
+ + similarity.getDomainId() + "</a></b></td></tr>" );
+ local_writer.write( SurfacingConstants.NL );
+ }
+ }
+ for( final Writer w : split_writers.values() ) {
+ w.write( "</table>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "<hr>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "<table>" );
+ w.write( SurfacingConstants.NL );
+ }
+ //
+ for( final DomainSimilarity similarity : similarities ) {
+ if ( ( species_order != null ) && !species_order.isEmpty() ) {
+ ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order );
+ }
+ if ( simple_tab_writer != null ) {
+ simple_tab_writer.write( similarity.toStringBuffer( PRINT_OPTION.SIMPLE_TAB_DELIMITED,
+ tax_code_to_id_map ).toString() );
+ }
+ if ( single_writer != null ) {
+ single_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map ).toString() );
+ single_writer.write( SurfacingConstants.NL );
+ }
+ else {
+ Writer local_writer = split_writers.get( ( similarity.getDomainId().charAt( 0 ) + "" ).toLowerCase()
+ .charAt( 0 ) );
+ if ( local_writer == null ) {
+ local_writer = split_writers.get( '0' );
+ }
+ local_writer.write( similarity.toStringBuffer( print_option, tax_code_to_id_map ).toString() );
+ local_writer.write( SurfacingConstants.NL );
+ }
+ }
+ switch ( print_option ) {
+ case HTML:
+ for( final Writer w : split_writers.values() ) {
+ w.write( SurfacingConstants.NL );
+ w.write( "</table>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "</font>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "</body>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "</html>" );
+ w.write( SurfacingConstants.NL );
+ }
+ break;
+ }
+ for( final Writer w : split_writers.values() ) {
+ w.close();
+ }
+ }
+
+ private static void printSomeStats( final DescriptiveStatistics stats, final AsciiHistogram histo, final Writer w )
+ throws IOException {
+ w.write( "<hr>" );
+ w.write( "<br>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "<tt><pre>" );
+ w.write( SurfacingConstants.NL );
+ if ( histo != null ) {
+ w.write( histo.toStringBuffer( 20, '|', 40, 5 ).toString() );
+ w.write( SurfacingConstants.NL );
+ }
+ w.write( "</pre></tt>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "<table>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "<tr><td>N: </td><td>" + stats.getN() + "</td></tr>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "<tr><td>Min: </td><td>" + stats.getMin() + "</td></tr>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "<tr><td>Max: </td><td>" + stats.getMax() + "</td></tr>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "<tr><td>Mean: </td><td>" + stats.arithmeticMean() + "</td></tr>" );
+ w.write( SurfacingConstants.NL );
+ if ( stats.getN() > 1 ) {
+ w.write( "<tr><td>SD: </td><td>" + stats.sampleStandardDeviation() + "</td></tr>" );
+ }
+ else {
+ w.write( "<tr><td>SD: </td><td>n/a</td></tr>" );
+ }
+ w.write( SurfacingConstants.NL );
+ w.write( "</table>" );
+ w.write( SurfacingConstants.NL );
+ w.write( "<br>" );
+ w.write( SurfacingConstants.NL );
+ }
+
+ public static void writeMatrixToFile( final CharacterStateMatrix<?> matrix,
+ final String filename,
+ final Format format ) {
+ final File outfile = new File( filename );
+ checkForOutputFileWriteability( outfile );
+ try {
+ final BufferedWriter out = new BufferedWriter( new FileWriter( outfile ) );
+ matrix.toWriter( out, format );
+ out.flush();
+ out.close();
+ }
+ catch ( final IOException e ) {
+ ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
+ }
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote matrix: \"" + filename + "\"" );
+ }
+
+ public static void writeMatrixToFile( final File matrix_outfile, final List<DistanceMatrix> matrices ) {
+ checkForOutputFileWriteability( matrix_outfile );
+ try {
+ final BufferedWriter out = new BufferedWriter( new FileWriter( matrix_outfile ) );
+ for( final DistanceMatrix distance_matrix : matrices ) {
+ out.write( distance_matrix.toStringBuffer( DistanceMatrix.Format.PHYLIP ).toString() );
+ out.write( ForesterUtil.LINE_SEPARATOR );
+ out.flush();
+ }
+ out.close();
+ }
+ catch ( final IOException e ) {
+ ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() );
+ }
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + matrix_outfile + "\"" );
+ }
+
+ public static void writePhylogenyToFile( final Phylogeny phylogeny, final String filename ) {
+ final PhylogenyWriter writer = new PhylogenyWriter();
+ try {
+ writer.toPhyloXML( new File( filename ), phylogeny, 1 );
+ }
+ catch ( final IOException e ) {
+ ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "failed to write phylogeny to \"" + filename + "\": "
+ + e );
+ }
+ ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote phylogeny to \"" + filename + "\"" );
+ }
+
+ public static void writeTaxonomyLinks( final Writer writer,
+ final String species,
+ final Map<String, Integer> tax_code_to_id_map ) throws IOException {
+ if ( ( species.length() > 1 ) && ( species.indexOf( '_' ) < 1 ) ) {
+ writer.write( " [" );
+ if ( ( tax_code_to_id_map != null ) && tax_code_to_id_map.containsKey( species ) ) {
+ writer.write( "<a href=\"" + SurfacingConstants.UNIPROT_TAXONOMY_ID_LINK
+ + tax_code_to_id_map.get( species ) + "\" target=\"taxonomy_window\">uniprot</a>" );
+ }
+ else {
+ writer.write( "<a href=\"" + SurfacingConstants.EOL_LINK + species
+ + "\" target=\"taxonomy_window\">eol</a>" );
+ writer.write( "|" );
+ writer.write( "<a href=\"" + SurfacingConstants.GOOGLE_SCHOLAR_SEARCH + species
+ + "\" target=\"taxonomy_window\">scholar</a>" );
+ writer.write( "|" );
+ writer.write( "<a href=\"" + SurfacingConstants.GOOGLE_WEB_SEARCH_LINK + species
+ + "\" target=\"taxonomy_window\">google</a>" );
+ }
+ writer.write( "]" );
+ }
+ }
+
+ private final static void addToCountMap( final Map<String, Integer> map, final String s ) {
+ if ( map.containsKey( s ) ) {
+ map.put( s, map.get( s ) + 1 );
+ }
+ else {
+ map.put( s, 1 );
+ }
+ }
+
+ private static void calculateIndependentDomainCombinationGains( final Phylogeny local_phylogeny_l,
+ final String outfilename_for_counts,
+ final String outfilename_for_dc,
+ final String outfilename_for_dc_for_go_mapping,
+ final String outfilename_for_dc_for_go_mapping_unique,
+ final String outfilename_for_rank_counts,
+ final String outfilename_for_ancestor_species_counts,
+ final String outfilename_for_protein_stats,
+ final Map<String, DescriptiveStatistics> protein_length_stats_by_dc,
+ final Map<String, DescriptiveStatistics> domain_number_stats_by_dc,
+ final Map<String, DescriptiveStatistics> domain_length_stats_by_domain ) {
+ try {
+ //
+ // if ( protein_length_stats_by_dc != null ) {
+ // for( final Entry<?, DescriptiveStatistics> entry : protein_length_stats_by_dc.entrySet() ) {
+ // System.out.print( entry.getKey().toString() );
+ // System.out.print( ": " );
+ // double[] a = entry.getValue().getDataAsDoubleArray();
+ // for( int i = 0; i < a.length; i++ ) {
+ // System.out.print( a[ i ] + " " );
+ // }
+ // System.out.println();
+ // }
+ // }
+ // if ( domain_number_stats_by_dc != null ) {
+ // for( final Entry<?, DescriptiveStatistics> entry : domain_number_stats_by_dc.entrySet() ) {
+ // System.out.print( entry.getKey().toString() );
+ // System.out.print( ": " );
+ // double[] a = entry.getValue().getDataAsDoubleArray();
+ // for( int i = 0; i < a.length; i++ ) {
+ // System.out.print( a[ i ] + " " );
+ // }
+ // System.out.println();
+ // }
+ // }
+ //
+ final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) );
+ final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) );
+ final BufferedWriter out_dc_for_go_mapping = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping ) );
+ final BufferedWriter out_dc_for_go_mapping_unique = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping_unique ) );
+ final SortedMap<String, Integer> dc_gain_counts = new TreeMap<String, Integer>();
+ for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorPostorder(); it.hasNext(); ) {
+ final PhylogenyNode n = it.next();
+ final Set<String> gained_dc = n.getNodeData().getBinaryCharacters().getGainedCharacters();
+ for( final String dc : gained_dc ) {
+ if ( dc_gain_counts.containsKey( dc ) ) {
+ dc_gain_counts.put( dc, dc_gain_counts.get( dc ) + 1 );
+ }
+ else {
+ dc_gain_counts.put( dc, 1 );
+ }
+ }
+ }
+ final SortedMap<Integer, Integer> histogram = new TreeMap<Integer, Integer>();
+ final SortedMap<Integer, StringBuilder> domain_lists = new TreeMap<Integer, StringBuilder>();
+ final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_protein_length_stats = new TreeMap<Integer, DescriptiveStatistics>();
+ final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_domain_number_stats = new TreeMap<Integer, DescriptiveStatistics>();
+ final SortedMap<Integer, DescriptiveStatistics> dc_reapp_counts_to_domain_lengths_stats = new TreeMap<Integer, DescriptiveStatistics>();
+ final SortedMap<Integer, PriorityQueue<String>> domain_lists_go = new TreeMap<Integer, PriorityQueue<String>>();
+ final SortedMap<Integer, SortedSet<String>> domain_lists_go_unique = new TreeMap<Integer, SortedSet<String>>();
+ final Set<String> dcs = dc_gain_counts.keySet();
+ final SortedSet<String> more_than_once = new TreeSet<String>();
+ DescriptiveStatistics gained_once_lengths_stats = new BasicDescriptiveStatistics();
+ DescriptiveStatistics gained_once_domain_count_stats = new BasicDescriptiveStatistics();
+ DescriptiveStatistics gained_multiple_times_lengths_stats = new BasicDescriptiveStatistics();
+ final DescriptiveStatistics gained_multiple_times_domain_count_stats = new BasicDescriptiveStatistics();
+ long gained_multiple_times_domain_length_sum = 0;
+ long gained_once_domain_length_sum = 0;
+ long gained_multiple_times_domain_length_count = 0;
+ long gained_once_domain_length_count = 0;
+ for( final String dc : dcs ) {
+ final int count = dc_gain_counts.get( dc );
+ if ( histogram.containsKey( count ) ) {
+ histogram.put( count, histogram.get( count ) + 1 );
+ domain_lists.get( count ).append( ", " + dc );
+ domain_lists_go.get( count ).addAll( splitDomainCombination( dc ) );
+ domain_lists_go_unique.get( count ).addAll( splitDomainCombination( dc ) );
+ }
+ else {
+ histogram.put( count, 1 );
+ domain_lists.put( count, new StringBuilder( dc ) );
+ final PriorityQueue<String> q = new PriorityQueue<String>();
+ q.addAll( splitDomainCombination( dc ) );
+ domain_lists_go.put( count, q );
+ final SortedSet<String> set = new TreeSet<String>();
+ set.addAll( splitDomainCombination( dc ) );
+ domain_lists_go_unique.put( count, set );
+ }
+ if ( protein_length_stats_by_dc != null ) {
+ if ( !dc_reapp_counts_to_protein_length_stats.containsKey( count ) ) {
+ dc_reapp_counts_to_protein_length_stats.put( count, new BasicDescriptiveStatistics() );
+ }
+ dc_reapp_counts_to_protein_length_stats.get( count ).addValue( protein_length_stats_by_dc.get( dc )
+ .arithmeticMean() );
+ }
+ if ( domain_number_stats_by_dc != null ) {
+ if ( !dc_reapp_counts_to_domain_number_stats.containsKey( count ) ) {
+ dc_reapp_counts_to_domain_number_stats.put( count, new BasicDescriptiveStatistics() );
+ }
+ dc_reapp_counts_to_domain_number_stats.get( count ).addValue( domain_number_stats_by_dc.get( dc )
+ .arithmeticMean() );
+ }
+ if ( domain_length_stats_by_domain != null ) {
+ if ( !dc_reapp_counts_to_domain_lengths_stats.containsKey( count ) ) {
+ dc_reapp_counts_to_domain_lengths_stats.put( count, new BasicDescriptiveStatistics() );
+ }
+ final String[] ds = dc.split( "=" );
+ dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain
+ .get( ds[ 0 ] ).arithmeticMean() );
+ dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain
+ .get( ds[ 1 ] ).arithmeticMean() );
+ }
+ if ( count > 1 ) {
+ more_than_once.add( dc );
+ if ( protein_length_stats_by_dc != null ) {
+ final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc );
+ for( final double element : s.getData() ) {
+ gained_multiple_times_lengths_stats.addValue( element );
+ }
+ }
+ if ( domain_number_stats_by_dc != null ) {
+ final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc );
+ for( final double element : s.getData() ) {
+ gained_multiple_times_domain_count_stats.addValue( element );
+ }
+ }
+ if ( domain_length_stats_by_domain != null ) {
+ final String[] ds = dc.split( "=" );
+ final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] );
+ final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] );
+ for( final double element : s0.getData() ) {
+ gained_multiple_times_domain_length_sum += element;
+ ++gained_multiple_times_domain_length_count;
+ }
+ for( final double element : s1.getData() ) {
+ gained_multiple_times_domain_length_sum += element;
+ ++gained_multiple_times_domain_length_count;
+ }
+ }