X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fsurfacing%2FSurfacingUtil.java;h=ab83ebfed2477a6649ed95ad9082077c01d3ea97;hb=fffc26ac5f8cf4eaa5faea6a7e369b94d381d859;hp=8b1ffe72f5fb9a750d6016e08fbd8f01da5e23f1;hpb=5383c68bc6041a8e8eaab4258e7901b64d4cc081;p=jalview.git

diff --git a/forester/java/src/org/forester/surfacing/SurfacingUtil.java b/forester/java/src/org/forester/surfacing/SurfacingUtil.java
index 8b1ffe7..ab83ebf 100644
--- a/forester/java/src/org/forester/surfacing/SurfacingUtil.java
+++ b/forester/java/src/org/forester/surfacing/SurfacingUtil.java
@@ -34,12 +34,15 @@ import java.io.Writer;
 import java.text.DecimalFormat;
 import java.text.NumberFormat;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
+import java.util.PriorityQueue;
 import java.util.Set;
 import java.util.SortedMap;
 import java.util.SortedSet;
@@ -48,27 +51,35 @@ import java.util.TreeSet;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-import org.forester.application.surfacing_old;
+import org.forester.application.surfacing;
 import org.forester.evoinference.distance.NeighborJoining;
 import org.forester.evoinference.matrix.character.BasicCharacterStateMatrix;
 import org.forester.evoinference.matrix.character.CharacterStateMatrix;
 import org.forester.evoinference.matrix.character.CharacterStateMatrix.BinaryStates;
 import org.forester.evoinference.matrix.character.CharacterStateMatrix.Format;
 import org.forester.evoinference.matrix.character.CharacterStateMatrix.GainLossStates;
+import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
 import org.forester.evoinference.matrix.distance.DistanceMatrix;
 import org.forester.go.GoId;
 import org.forester.go.GoNameSpace;
 import org.forester.go.GoTerm;
-import org.forester.go.GoUtils;
 import org.forester.go.PfamToGoMapping;
 import org.forester.io.parsers.nexus.NexusConstants;
 import org.forester.io.writers.PhylogenyWriter;
 import org.forester.phylogeny.Phylogeny;
 import org.forester.phylogeny.PhylogenyMethods;
 import org.forester.phylogeny.PhylogenyNode;
+import org.forester.phylogeny.PhylogenyNodeI.NH_CONVERSION_SUPPORT_VALUE_STYLE;
 import org.forester.phylogeny.data.BinaryCharacters;
 import org.forester.phylogeny.data.Confidence;
 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
+import org.forester.protein.BasicDomain;
+import org.forester.protein.BasicProtein;
+import org.forester.protein.BinaryDomainCombination;
+import org.forester.protein.Domain;
+import org.forester.protein.DomainId;
+import org.forester.protein.Protein;
+import org.forester.species.Species;
 import org.forester.surfacing.DomainSimilarityCalculator.Detailedness;
 import org.forester.surfacing.DomainSimilarityCalculator.GoAnnotationOutput;
 import org.forester.surfacing.GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder;
@@ -103,6 +114,7 @@ public final class SurfacingUtil {
         }
     };
     public final static Pattern PATTERN_SP_STYLE_TAXONOMY = Pattern.compile( "^[A-Z0-9]{3,5}$" );
+    private static final boolean USE_LAST = true;
 
     private SurfacingUtil() {
         // Hidden constructor.
@@ -161,6 +173,348 @@ public final class SurfacingUtil { return stats; } + private static void calculateIndependentDomainCombinationGains( final Phylogeny local_phylogeny_l, + final String outfilename_for_counts, + final String outfilename_for_dc, + final String outfilename_for_dc_for_go_mapping, + final String outfilename_for_dc_for_go_mapping_unique, + final String outfilename_for_rank_counts, + final String outfilename_for_ancestor_species_counts, + final String outfilename_for_protein_stats, + final Map protein_length_stats_by_dc, + final Map domain_number_stats_by_dc, + final Map domain_length_stats_by_domain ) { + try { + // + // if ( protein_length_stats_by_dc != null ) { + // for( final Entry entry : protein_length_stats_by_dc.entrySet() ) { + // System.out.print( entry.getKey().toString() ); + // System.out.print( ": " ); + // double[] a = entry.getValue().getDataAsDoubleArray(); + // for( int i = 0; i < a.length; i++ ) { + // System.out.print( a[ i ] + " " ); + // } + // System.out.println(); + // } + // } + // if ( domain_number_stats_by_dc != null ) { + // for( final Entry entry : domain_number_stats_by_dc.entrySet() ) { + // System.out.print( entry.getKey().toString() ); + // System.out.print( ": " ); + // double[] a = entry.getValue().getDataAsDoubleArray(); + // for( int i = 0; i < a.length; i++ ) { + // System.out.print( a[ i ] + " " ); + // } + // System.out.println(); + // } + // } + // + final BufferedWriter out_counts = new BufferedWriter( new FileWriter( outfilename_for_counts ) ); + final BufferedWriter out_dc = new BufferedWriter( new FileWriter( outfilename_for_dc ) ); + final BufferedWriter out_dc_for_go_mapping = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping ) ); + final BufferedWriter out_dc_for_go_mapping_unique = new BufferedWriter( new FileWriter( outfilename_for_dc_for_go_mapping_unique ) ); + final SortedMap dc_gain_counts = new TreeMap(); + for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorPostorder(); it.hasNext(); ) { + final PhylogenyNode n = it.next(); + final Set gained_dc = n.getNodeData().getBinaryCharacters().getGainedCharacters(); + for( final String dc : gained_dc ) { + if ( dc_gain_counts.containsKey( dc ) ) { + dc_gain_counts.put( dc, dc_gain_counts.get( dc ) + 1 ); + } + else { + dc_gain_counts.put( dc, 1 ); + } + } + } + final SortedMap histogram = new TreeMap(); + final SortedMap domain_lists = new TreeMap(); + final SortedMap dc_reapp_counts_to_protein_length_stats = new TreeMap(); + final SortedMap dc_reapp_counts_to_domain_number_stats = new TreeMap(); + final SortedMap dc_reapp_counts_to_domain_lengths_stats = new TreeMap(); + final SortedMap> domain_lists_go = new TreeMap>(); + final SortedMap> domain_lists_go_unique = new TreeMap>(); + final Set dcs = dc_gain_counts.keySet(); + final SortedSet more_than_once = new TreeSet(); + final DescriptiveStatistics gained_once_lengths_stats = new BasicDescriptiveStatistics(); + final DescriptiveStatistics gained_once_domain_count_stats = new BasicDescriptiveStatistics(); + final DescriptiveStatistics gained_multiple_times_lengths_stats = new BasicDescriptiveStatistics(); + final DescriptiveStatistics gained_multiple_times_domain_count_stats = new BasicDescriptiveStatistics(); + long gained_multiple_times_domain_length_sum = 0; + long gained_once_domain_length_sum = 0; + long gained_multiple_times_domain_length_count = 0; + long gained_once_domain_length_count = 0; + for( final String dc : dcs ) { + final int count = dc_gain_counts.get( dc ); + if ( 
histogram.containsKey( count ) ) { + histogram.put( count, histogram.get( count ) + 1 ); + domain_lists.get( count ).append( ", " + dc ); + domain_lists_go.get( count ).addAll( splitDomainCombination( dc ) ); + domain_lists_go_unique.get( count ).addAll( splitDomainCombination( dc ) ); + } + else { + histogram.put( count, 1 ); + domain_lists.put( count, new StringBuilder( dc ) ); + final PriorityQueue q = new PriorityQueue(); + q.addAll( splitDomainCombination( dc ) ); + domain_lists_go.put( count, q ); + final SortedSet set = new TreeSet(); + set.addAll( splitDomainCombination( dc ) ); + domain_lists_go_unique.put( count, set ); + } + if ( protein_length_stats_by_dc != null ) { + if ( !dc_reapp_counts_to_protein_length_stats.containsKey( count ) ) { + dc_reapp_counts_to_protein_length_stats.put( count, new BasicDescriptiveStatistics() ); + } + dc_reapp_counts_to_protein_length_stats.get( count ).addValue( protein_length_stats_by_dc.get( dc ) + .arithmeticMean() ); + } + if ( domain_number_stats_by_dc != null ) { + if ( !dc_reapp_counts_to_domain_number_stats.containsKey( count ) ) { + dc_reapp_counts_to_domain_number_stats.put( count, new BasicDescriptiveStatistics() ); + } + dc_reapp_counts_to_domain_number_stats.get( count ).addValue( domain_number_stats_by_dc.get( dc ) + .arithmeticMean() ); + } + if ( domain_length_stats_by_domain != null ) { + if ( !dc_reapp_counts_to_domain_lengths_stats.containsKey( count ) ) { + dc_reapp_counts_to_domain_lengths_stats.put( count, new BasicDescriptiveStatistics() ); + } + final String[] ds = dc.split( "=" ); + dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain + .get( ds[ 0 ] ).arithmeticMean() ); + dc_reapp_counts_to_domain_lengths_stats.get( count ).addValue( domain_length_stats_by_domain + .get( ds[ 1 ] ).arithmeticMean() ); + } + if ( count > 1 ) { + more_than_once.add( dc ); + if ( protein_length_stats_by_dc != null ) { + final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc ); + for( final double element : s.getData() ) { + gained_multiple_times_lengths_stats.addValue( element ); + } + } + if ( domain_number_stats_by_dc != null ) { + final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc ); + for( final double element : s.getData() ) { + gained_multiple_times_domain_count_stats.addValue( element ); + } + } + if ( domain_length_stats_by_domain != null ) { + final String[] ds = dc.split( "=" ); + final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] ); + final DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] ); + for( final double element : s0.getData() ) { + gained_multiple_times_domain_length_sum += element; + ++gained_multiple_times_domain_length_count; + } + for( final double element : s1.getData() ) { + gained_multiple_times_domain_length_sum += element; + ++gained_multiple_times_domain_length_count; + } + } + } + else { + if ( protein_length_stats_by_dc != null ) { + final DescriptiveStatistics s = protein_length_stats_by_dc.get( dc ); + for( final double element : s.getData() ) { + gained_once_lengths_stats.addValue( element ); + } + } + if ( domain_number_stats_by_dc != null ) { + final DescriptiveStatistics s = domain_number_stats_by_dc.get( dc ); + for( final double element : s.getData() ) { + gained_once_domain_count_stats.addValue( element ); + } + } + if ( domain_length_stats_by_domain != null ) { + final String[] ds = dc.split( "=" ); + final DescriptiveStatistics s0 = domain_length_stats_by_domain.get( ds[ 0 ] ); + final 
DescriptiveStatistics s1 = domain_length_stats_by_domain.get( ds[ 1 ] ); + for( final double element : s0.getData() ) { + gained_once_domain_length_sum += element; + ++gained_once_domain_length_count; + } + for( final double element : s1.getData() ) { + gained_once_domain_length_sum += element; + ++gained_once_domain_length_count; + } + } + } + } + final Set histogram_keys = histogram.keySet(); + for( final Integer histogram_key : histogram_keys ) { + final int count = histogram.get( histogram_key ); + final StringBuilder dc = domain_lists.get( histogram_key ); + out_counts.write( histogram_key + "\t" + count + ForesterUtil.LINE_SEPARATOR ); + out_dc.write( histogram_key + "\t" + dc + ForesterUtil.LINE_SEPARATOR ); + out_dc_for_go_mapping.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR ); + final Object[] sorted = domain_lists_go.get( histogram_key ).toArray(); + Arrays.sort( sorted ); + for( final Object domain : sorted ) { + out_dc_for_go_mapping.write( domain + ForesterUtil.LINE_SEPARATOR ); + } + out_dc_for_go_mapping_unique.write( "#" + histogram_key + ForesterUtil.LINE_SEPARATOR ); + for( final String domain : domain_lists_go_unique.get( histogram_key ) ) { + out_dc_for_go_mapping_unique.write( domain + ForesterUtil.LINE_SEPARATOR ); + } + } + out_counts.close(); + out_dc.close(); + out_dc_for_go_mapping.close(); + out_dc_for_go_mapping_unique.close(); + final SortedMap lca_rank_counts = new TreeMap(); + final SortedMap lca_ancestor_species_counts = new TreeMap(); + for( final String dc : more_than_once ) { + final List nodes = new ArrayList(); + for( final PhylogenyNodeIterator it = local_phylogeny_l.iteratorExternalForward(); it.hasNext(); ) { + final PhylogenyNode n = it.next(); + if ( n.getNodeData().getBinaryCharacters().getGainedCharacters().contains( dc ) ) { + nodes.add( n ); + } + } + for( int i = 0; i < nodes.size() - 1; ++i ) { + for( int j = i + 1; j < nodes.size(); ++j ) { + final PhylogenyNode lca = PhylogenyMethods.obtainLCA( nodes.get( i ), + nodes.get( j ) ); + String rank = "unknown"; + if ( lca.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getRank() ) ) { + rank = lca.getNodeData().getTaxonomy().getRank(); + } + addToCountMap( lca_rank_counts, rank ); + String lca_species; + if ( lca.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getScientificName() ) ) { + lca_species = lca.getNodeData().getTaxonomy().getScientificName(); + } + else if ( lca.getNodeData().isHasTaxonomy() + && !ForesterUtil.isEmpty( lca.getNodeData().getTaxonomy().getCommonName() ) ) { + lca_species = lca.getNodeData().getTaxonomy().getCommonName(); + } + else { + lca_species = lca.getName(); + } + addToCountMap( lca_ancestor_species_counts, lca_species ); + } + } + } + final BufferedWriter out_for_rank_counts = new BufferedWriter( new FileWriter( outfilename_for_rank_counts ) ); + final BufferedWriter out_for_ancestor_species_counts = new BufferedWriter( new FileWriter( outfilename_for_ancestor_species_counts ) ); + ForesterUtil.map2writer( out_for_rank_counts, lca_rank_counts, "\t", ForesterUtil.LINE_SEPARATOR ); + ForesterUtil.map2writer( out_for_ancestor_species_counts, + lca_ancestor_species_counts, + "\t", + ForesterUtil.LINE_SEPARATOR ); + out_for_rank_counts.close(); + out_for_ancestor_species_counts.close(); + if ( !ForesterUtil.isEmpty( outfilename_for_protein_stats ) + && ( ( domain_length_stats_by_domain != null ) || ( protein_length_stats_by_dc != null ) || ( domain_number_stats_by_dc != 
null ) ) ) { + final BufferedWriter w = new BufferedWriter( new FileWriter( outfilename_for_protein_stats ) ); + w.write( "Domain Lengths: " ); + w.write( "\n" ); + if ( domain_length_stats_by_domain != null ) { + for( final Entry entry : dc_reapp_counts_to_domain_lengths_stats + .entrySet() ) { + w.write( entry.getKey().toString() ); + w.write( "\t" + entry.getValue().arithmeticMean() ); + w.write( "\t" + entry.getValue().median() ); + w.write( "\n" ); + } + } + w.flush(); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Protein Lengths: " ); + w.write( "\n" ); + if ( protein_length_stats_by_dc != null ) { + for( final Entry entry : dc_reapp_counts_to_protein_length_stats + .entrySet() ) { + w.write( entry.getKey().toString() ); + w.write( "\t" + entry.getValue().arithmeticMean() ); + w.write( "\t" + entry.getValue().median() ); + w.write( "\n" ); + } + } + w.flush(); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Number of domains: " ); + w.write( "\n" ); + if ( domain_number_stats_by_dc != null ) { + for( final Entry entry : dc_reapp_counts_to_domain_number_stats + .entrySet() ) { + w.write( entry.getKey().toString() ); + w.write( "\t" + entry.getValue().arithmeticMean() ); + w.write( "\t" + entry.getValue().median() ); + w.write( "\n" ); + } + } + w.flush(); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained once, domain lengths:" ); + w.write( "\n" ); + w.write( "N: " + gained_once_domain_length_count ); + w.write( "\n" ); + w.write( "Avg: " + ( ( double ) gained_once_domain_length_sum / gained_once_domain_length_count ) ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained multiple times, domain lengths:" ); + w.write( "\n" ); + w.write( "N: " + gained_multiple_times_domain_length_count ); + w.write( "\n" ); + w.write( "Avg: " + + ( ( double ) gained_multiple_times_domain_length_sum / gained_multiple_times_domain_length_count ) ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained once, protein lengths:" ); + w.write( "\n" ); + w.write( gained_once_lengths_stats.toString() ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained once, domain counts:" ); + w.write( "\n" ); + w.write( gained_once_domain_count_stats.toString() ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained multiple times, protein lengths:" ); + w.write( "\n" ); + w.write( gained_multiple_times_lengths_stats.toString() ); + w.write( "\n" ); + w.write( "\n" ); + w.write( "Gained multiple times, domain counts:" ); + w.write( "\n" ); + w.write( gained_multiple_times_domain_count_stats.toString() ); + w.flush(); + w.close(); + } + } + catch ( final IOException e ) { + ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); + } + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch counts to [" + + outfilename_for_counts + "]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote independent domain combination gains fitch lists to [" + + outfilename_for_dc + "]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, + "Wrote independent domain combination gains fitch lists to (for GO mapping) [" + + outfilename_for_dc_for_go_mapping + "]" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, + "Wrote independent domain combination gains fitch lists to (for GO mapping, unique) [" + + outfilename_for_dc_for_go_mapping_unique + "]" ); + } + + private final static void addToCountMap( final Map map, final String s ) { + if ( map.containsKey( s ) ) { + map.put( s, map.get( s ) + 1 
); + } + else { + map.put( s, 1 ); + } + } + public static int calculateOverlap( final Domain domain, final List covered_positions ) { int overlap_count = 0; for( int i = domain.getFrom(); i <= domain.getTo(); ++i ) { @@ -174,7 +528,7 @@ public final class SurfacingUtil { public static void checkForOutputFileWriteability( final File outfile ) { final String error = ForesterUtil.isWritableFile( outfile ); if ( !ForesterUtil.isEmpty( error ) ) { - ForesterUtil.fatalError( surfacing_old.PRG_NAME, error ); + ForesterUtil.fatalError( surfacing.PRG_NAME, error ); } } @@ -289,7 +643,7 @@ public final class SurfacingUtil { public static Phylogeny createNjTreeBasedOnMatrixToFile( final File nj_tree_outfile, final DistanceMatrix distance ) { checkForOutputFileWriteability( nj_tree_outfile ); final NeighborJoining nj = NeighborJoining.createInstance(); - final Phylogeny phylogeny = nj.execute( distance ); + final Phylogeny phylogeny = nj.execute( ( BasicSymmetricalDistanceMatrix ) distance ); phylogeny.setName( nj_tree_outfile.getName() ); writePhylogenyToFile( phylogeny, nj_tree_outfile.toString() ); return phylogeny; @@ -418,7 +772,10 @@ public final class SurfacingUtil { final boolean output_binary_domain_combinations_for_graphs, final List all_binary_domains_combination_gained_fitch, final List all_binary_domains_combination_lost_fitch, - final BinaryDomainCombination.DomainCombinationType dc_type ) { + final BinaryDomainCombination.DomainCombinationType dc_type, + final Map protein_length_stats_by_dc, + final Map domain_number_stats_by_dc, + final Map domain_length_stats_by_domain ) { final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR; final String date_time = ForesterUtil.getCurrentDateTime(); final SortedSet all_pfams_encountered = new TreeSet(); @@ -437,24 +794,23 @@ public final class SurfacingUtil { domain_parsimony.executeDolloParsimonyOnDomainPresence(); } SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name - + surfacing_old.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_DOMAINS, Format.FORESTER ); + + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_DOMAINS, Format.FORESTER ); SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossCountsMatrix(), outfile_name - + surfacing_old.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_DOMAINS, Format.FORESTER ); + + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_DOMAINS, Format.FORESTER ); SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), CharacterStateMatrix.GainLossStates.GAIN, - outfile_name + surfacing_old.PARSIMONY_OUTPUT_DOLLO_GAINS_D, + outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_D, + sep, + ForesterUtil.LINE_SEPARATOR, + null ); + SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), + CharacterStateMatrix.GainLossStates.LOSS, + outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_D, sep, ForesterUtil.LINE_SEPARATOR, null ); - SurfacingUtil - .writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), - CharacterStateMatrix.GainLossStates.LOSS, - outfile_name + surfacing_old.PARSIMONY_OUTPUT_DOLLO_LOSSES_D, - sep, - ForesterUtil.LINE_SEPARATOR, - null ); SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), null, outfile_name - + surfacing_old.PARSIMONY_OUTPUT_DOLLO_PRESENT_D, sep, ForesterUtil.LINE_SEPARATOR, null ); + + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_D, sep, ForesterUtil.LINE_SEPARATOR, null ); //HTML: writeBinaryStatesMatrixToList( 
domain_id_to_go_ids_map, go_id_to_term_map, @@ -462,7 +818,7 @@ public final class SurfacingUtil { false, domain_parsimony.getGainLossMatrix(), CharacterStateMatrix.GainLossStates.GAIN, - outfile_name + surfacing_old.PARSIMONY_OUTPUT_DOLLO_GAINS_HTML_D, + outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_HTML_D, sep, ForesterUtil.LINE_SEPARATOR, "Dollo Parsimony | Gains | Domains", @@ -477,7 +833,7 @@ public final class SurfacingUtil { false, domain_parsimony.getGainLossMatrix(), CharacterStateMatrix.GainLossStates.LOSS, - outfile_name + surfacing_old.PARSIMONY_OUTPUT_DOLLO_LOSSES_HTML_D, + outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_HTML_D, sep, ForesterUtil.LINE_SEPARATOR, "Dollo Parsimony | Losses | Domains", @@ -492,7 +848,7 @@ public final class SurfacingUtil { false, domain_parsimony.getGainLossMatrix(), null, - outfile_name + surfacing_old.PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D, + outfile_name + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_HTML_D, sep, ForesterUtil.LINE_SEPARATOR, "Dollo Parsimony | Present | Domains", @@ -508,14 +864,14 @@ public final class SurfacingUtil { "dollo_on_domains_" + outfile_name, parameters_str ); SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name - + surfacing_old.DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO ); + + surfacing.DOMAINS_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO ); try { writeAllDomainsChangedOnAllSubtrees( local_phylogeny_l, true, outfile_name, "_dollo_all_gains_d" ); writeAllDomainsChangedOnAllSubtrees( local_phylogeny_l, false, outfile_name, "_dollo_all_losses_d" ); } catch ( final IOException e ) { e.printStackTrace(); - ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getLocalizedMessage() ); + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); } if ( domain_parsimony.calculateNumberOfBinaryDomainCombination() > 0 ) { // FITCH DOMAIN COMBINATIONS @@ -527,28 +883,28 @@ public final class SurfacingUtil { randomization = "yes, seed = " + random_number_seed_for_fitch_parsimony; } else { - domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( false ); + domain_parsimony.executeFitchParsimonyOnBinaryDomainCombintion( USE_LAST ); } SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossMatrix(), outfile_name - + surfacing_old.PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER ); + + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER ); SurfacingUtil.writeMatrixToFile( domain_parsimony.getGainLossCountsMatrix(), outfile_name - + surfacing_old.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER ); - SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), - CharacterStateMatrix.GainLossStates.GAIN, - outfile_name - + surfacing_old.PARSIMONY_OUTPUT_FITCH_GAINS_BC, - sep, - ForesterUtil.LINE_SEPARATOR, - null ); + + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_FITCH_BINARY_COMBINATIONS, Format.FORESTER ); SurfacingUtil .writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), - CharacterStateMatrix.GainLossStates.LOSS, - outfile_name + surfacing_old.PARSIMONY_OUTPUT_FITCH_LOSSES_BC, + CharacterStateMatrix.GainLossStates.GAIN, + outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_GAINS_BC, sep, ForesterUtil.LINE_SEPARATOR, null ); + SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), + CharacterStateMatrix.GainLossStates.LOSS, + outfile_name + + surfacing.PARSIMONY_OUTPUT_FITCH_LOSSES_BC, + sep, + ForesterUtil.LINE_SEPARATOR, + null ); 
SurfacingUtil.writeBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), null, outfile_name - + surfacing_old.PARSIMONY_OUTPUT_FITCH_PRESENT_BC, sep, ForesterUtil.LINE_SEPARATOR, null ); + + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_BC, sep, ForesterUtil.LINE_SEPARATOR, null ); if ( all_binary_domains_combination_gained_fitch != null ) { collectChangedDomainCombinationsFromBinaryStatesMatrixAsListToFile( domain_parsimony.getGainLossMatrix(), dc_type, @@ -567,7 +923,7 @@ public final class SurfacingUtil { .getGainLossMatrix(), null, outfile_name - + surfacing_old.PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS, + + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_BC_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS, sep, ForesterUtil.LINE_SEPARATOR, BinaryDomainCombination.OutputFormat.DOT ); @@ -579,7 +935,7 @@ public final class SurfacingUtil { true, domain_parsimony.getGainLossMatrix(), CharacterStateMatrix.GainLossStates.GAIN, - outfile_name + surfacing_old.PARSIMONY_OUTPUT_FITCH_GAINS_HTML_BC, + outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_GAINS_HTML_BC, sep, ForesterUtil.LINE_SEPARATOR, "Fitch Parsimony | Gains | Domain Combinations", @@ -594,7 +950,7 @@ public final class SurfacingUtil { true, domain_parsimony.getGainLossMatrix(), CharacterStateMatrix.GainLossStates.LOSS, - outfile_name + surfacing_old.PARSIMONY_OUTPUT_FITCH_LOSSES_HTML_BC, + outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_LOSSES_HTML_BC, sep, ForesterUtil.LINE_SEPARATOR, "Fitch Parsimony | Losses | Domain Combinations", @@ -609,7 +965,7 @@ public final class SurfacingUtil { true, domain_parsimony.getGainLossMatrix(), null, - outfile_name + surfacing_old.PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC, + outfile_name + surfacing.PARSIMONY_OUTPUT_FITCH_PRESENT_HTML_BC, sep, ForesterUtil.LINE_SEPARATOR, "Fitch Parsimony | Present | Domain Combinations", @@ -622,13 +978,11 @@ public final class SurfacingUtil { go_id_to_term_map, outfile_name, all_pfams_encountered ); - writePfamsToFile( outfile_name + surfacing_old.ALL_PFAMS_GAINED_AS_DOMAINS_SUFFIX, - all_pfams_gained_as_domains ); - writePfamsToFile( outfile_name + surfacing_old.ALL_PFAMS_LOST_AS_DOMAINS_SUFFIX, all_pfams_lost_as_domains ); - writePfamsToFile( outfile_name + surfacing_old.ALL_PFAMS_GAINED_AS_DC_SUFFIX, + writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_GAINED_AS_DOMAINS_SUFFIX, all_pfams_gained_as_domains ); + writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_LOST_AS_DOMAINS_SUFFIX, all_pfams_lost_as_domains ); + writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_GAINED_AS_DC_SUFFIX, all_pfams_gained_as_dom_combinations ); - writePfamsToFile( outfile_name + surfacing_old.ALL_PFAMS_LOST_AS_DC_SUFFIX, - all_pfams_lost_as_dom_combinations ); + writePfamsToFile( outfile_name + surfacing.ALL_PFAMS_LOST_AS_DC_SUFFIX, all_pfams_lost_as_dom_combinations ); preparePhylogeny( local_phylogeny_l, domain_parsimony, date_time, @@ -637,7 +991,22 @@ public final class SurfacingUtil { "fitch_on_binary_domain_combinations_" + outfile_name, parameters_str ); SurfacingUtil.writePhylogenyToFile( local_phylogeny_l, outfile_name - + surfacing_old.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH ); + + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH ); + calculateIndependentDomainCombinationGains( local_phylogeny_l, + outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_OUTPUT_SUFFIX, + outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_OUTPUT_SUFFIX, + outfile_name + + 
surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_SUFFIX, + outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_OUTPUT_UNIQUE_SUFFIX, + outfile_name + "_indep_dc_gains_fitch_lca_ranks.txt", + outfile_name + "_indep_dc_gains_fitch_lca_taxonomies.txt", + outfile_name + "_indep_dc_gains_fitch_protein_statistics.txt", + protein_length_stats_by_dc, + domain_number_stats_by_dc, + domain_length_stats_by_domain ); } } @@ -649,20 +1018,20 @@ public final class SurfacingUtil { final String sep = ForesterUtil.LINE_SEPARATOR + "###################" + ForesterUtil.LINE_SEPARATOR; final String date_time = ForesterUtil.getCurrentDateTime(); System.out.println(); - writeToNexus( outfile_name + surfacing_old.NEXUS_SECONDARY_FEATURES, + writeToNexus( outfile_name + surfacing.NEXUS_SECONDARY_FEATURES, secondary_features_parsimony.createMatrixOfSecondaryFeaturePresenceOrAbsence( null ), phylogeny ); - final Phylogeny local_phylogeny_copy = phylogeny.copy(); + Phylogeny local_phylogeny_copy = phylogeny.copy(); secondary_features_parsimony.executeDolloParsimonyOnSecondaryFeatures( mapping_results_map ); SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossMatrix(), outfile_name - + surfacing_old.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER ); + + surfacing.PARSIMONY_OUTPUT_GL_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER ); SurfacingUtil.writeMatrixToFile( secondary_features_parsimony.getGainLossCountsMatrix(), outfile_name - + surfacing_old.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER ); + + surfacing.PARSIMONY_OUTPUT_GL_COUNTS_SUFFIX_DOLLO_SECONDARY_FEATURES, Format.FORESTER ); SurfacingUtil .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(), CharacterStateMatrix.GainLossStates.GAIN, outfile_name - + surfacing_old.PARSIMONY_OUTPUT_DOLLO_GAINS_SECONDARY_FEATURES, + + surfacing.PARSIMONY_OUTPUT_DOLLO_GAINS_SECONDARY_FEATURES, sep, ForesterUtil.LINE_SEPARATOR, null ); @@ -670,7 +1039,7 @@ public final class SurfacingUtil { .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(), CharacterStateMatrix.GainLossStates.LOSS, outfile_name - + surfacing_old.PARSIMONY_OUTPUT_DOLLO_LOSSES_SECONDARY_FEATURES, + + surfacing.PARSIMONY_OUTPUT_DOLLO_LOSSES_SECONDARY_FEATURES, sep, ForesterUtil.LINE_SEPARATOR, null ); @@ -678,7 +1047,7 @@ public final class SurfacingUtil { .writeBinaryStatesMatrixAsListToFile( secondary_features_parsimony.getGainLossMatrix(), null, outfile_name - + surfacing_old.PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES, + + surfacing.PARSIMONY_OUTPUT_DOLLO_PRESENT_SECONDARY_FEATURES, sep, ForesterUtil.LINE_SEPARATOR, null ); @@ -689,71 +1058,112 @@ public final class SurfacingUtil { "dollo_on_secondary_features_" + outfile_name, parameters_str ); SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name - + surfacing_old.SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO ); + + surfacing.SECONDARY_FEATURES_PARSIMONY_TREE_OUTPUT_SUFFIX_DOLLO ); + // FITCH DOMAIN COMBINATIONS + // ------------------------- + local_phylogeny_copy = phylogeny.copy(); + final String randomization = "no"; + secondary_features_parsimony.executeFitchParsimonyOnBinaryDomainCombintionOnSecondaryFeatures( USE_LAST ); + preparePhylogeny( local_phylogeny_copy, + secondary_features_parsimony, + date_time, + "Fitch parsimony on secondary binary domain combination presence/absence randomization: " + + randomization, + 
"fitch_on_binary_domain_combinations_" + outfile_name, + parameters_str ); + SurfacingUtil.writePhylogenyToFile( local_phylogeny_copy, outfile_name + + surfacing.BINARY_DOMAIN_COMBINATIONS_PARSIMONY_TREE_OUTPUT_SUFFIX_FITCH_MAPPED ); + calculateIndependentDomainCombinationGains( local_phylogeny_copy, outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_COUNTS_MAPPED_OUTPUT_SUFFIX, outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_MAPPED_OUTPUT_SUFFIX, outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_SUFFIX, outfile_name + + surfacing.INDEPENDENT_DC_GAINS_FITCH_PARS_DC_FOR_GO_MAPPING_MAPPED_OUTPUT_UNIQUE_SUFFIX, outfile_name + + "_MAPPED_indep_dc_gains_fitch_lca_ranks.txt", outfile_name + + "_MAPPED_indep_dc_gains_fitch_lca_taxonomies.txt", null, null, null, null ); } - public static void extractProteinNames( final List proteins, - final List query_domain_ids_nc_order, - final Writer out, - final String separator ) throws IOException { + public static void doit( final List proteins, + final List query_domain_ids_nc_order, + final Writer out, + final String separator, + final String limit_to_species, + final Map> average_protein_lengths_by_dc ) throws IOException { for( final Protein protein : proteins ) { - if ( protein.contains( query_domain_ids_nc_order, true ) ) { - out.write( protein.getSpecies().getSpeciesId() ); - out.write( separator ); - out.write( protein.getProteinId().getId() ); - out.write( separator ); - out.write( "[" ); - final Set visited_domain_ids = new HashSet(); - boolean first = true; - for( final Domain domain : protein.getProteinDomains() ) { - if ( !visited_domain_ids.contains( domain.getDomainId() ) ) { - visited_domain_ids.add( domain.getDomainId() ); - if ( first ) { - first = false; - } - else { - out.write( " " ); + if ( ForesterUtil.isEmpty( limit_to_species ) + || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) { + if ( protein.contains( query_domain_ids_nc_order, true ) ) { + out.write( protein.getSpecies().getSpeciesId() ); + out.write( separator ); + out.write( protein.getProteinId().getId() ); + out.write( separator ); + out.write( "[" ); + final Set visited_domain_ids = new HashSet(); + boolean first = true; + for( final Domain domain : protein.getProteinDomains() ) { + if ( !visited_domain_ids.contains( domain.getDomainId() ) ) { + visited_domain_ids.add( domain.getDomainId() ); + if ( first ) { + first = false; + } + else { + out.write( " " ); + } + out.write( domain.getDomainId().getId() ); + out.write( " {" ); + out.write( "" + domain.getTotalCount() ); + out.write( "}" ); } - out.write( domain.getDomainId().getId() ); - out.write( " {" ); - out.write( "" + domain.getTotalCount() ); - out.write( "}" ); } + out.write( "]" ); + out.write( separator ); + if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription() + .equals( SurfacingConstants.NONE ) ) ) { + out.write( protein.getDescription() ); + } + out.write( separator ); + if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || protein.getAccession() + .equals( SurfacingConstants.NONE ) ) ) { + out.write( protein.getAccession() ); + } + out.write( SurfacingConstants.NL ); } - out.write( "]" ); - out.write( separator ); - if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription() - .equals( SurfacingConstants.NONE ) ) ) { - out.write( protein.getDescription() ); - } - out.write( separator ); - if ( !( ForesterUtil.isEmpty( protein.getAccession() ) || 
protein.getAccession() - .equals( SurfacingConstants.NONE ) ) ) { - out.write( protein.getAccession() ); - } - out.write( SurfacingConstants.NL ); } } out.flush(); } - public static void extractProteinNames( final SortedMap> protein_lists_per_species, - final DomainId domain_id, + public static void extractProteinNames( final List proteins, + final List query_domain_ids_nc_order, final Writer out, - final String separator ) throws IOException { - for( final Species species : protein_lists_per_species.keySet() ) { - for( final Protein protein : protein_lists_per_species.get( species ) ) { - final List domains = protein.getProteinDomains( domain_id ); - if ( domains.size() > 0 ) { - final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); - for( final Domain domain : domains ) { - stats.addValue( domain.getPerSequenceEvalue() ); - } + final String separator, + final String limit_to_species ) throws IOException { + for( final Protein protein : proteins ) { + if ( ForesterUtil.isEmpty( limit_to_species ) + || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) { + if ( protein.contains( query_domain_ids_nc_order, true ) ) { out.write( protein.getSpecies().getSpeciesId() ); out.write( separator ); out.write( protein.getProteinId().getId() ); out.write( separator ); - out.write( "[" + FORMATTER.format( stats.median() ) + "]" ); + out.write( "[" ); + final Set visited_domain_ids = new HashSet(); + boolean first = true; + for( final Domain domain : protein.getProteinDomains() ) { + if ( !visited_domain_ids.contains( domain.getDomainId() ) ) { + visited_domain_ids.add( domain.getDomainId() ); + if ( first ) { + first = false; + } + else { + out.write( " " ); + } + out.write( domain.getDomainId().getId() ); + out.write( " {" ); + out.write( "" + domain.getTotalCount() ); + out.write( "}" ); + } + } + out.write( "]" ); out.write( separator ); if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription() .equals( SurfacingConstants.NONE ) ) ) { @@ -771,6 +1181,58 @@ public final class SurfacingUtil { out.flush(); } + public static void extractProteinNames( final SortedMap> protein_lists_per_species, + final DomainId domain_id, + final Writer out, + final String separator, + final String limit_to_species ) throws IOException { + for( final Species species : protein_lists_per_species.keySet() ) { + for( final Protein protein : protein_lists_per_species.get( species ) ) { + if ( ForesterUtil.isEmpty( limit_to_species ) + || protein.getSpecies().getSpeciesId().equalsIgnoreCase( limit_to_species ) ) { + final List domains = protein.getProteinDomains( domain_id ); + if ( domains.size() > 0 ) { + out.write( protein.getSpecies().getSpeciesId() ); + out.write( separator ); + out.write( protein.getProteinId().getId() ); + out.write( separator ); + out.write( domain_id.toString() ); + out.write( separator ); + for( final Domain domain : domains ) { + out.write( "/" ); + out.write( domain.getFrom() + "-" + domain.getTo() ); + } + out.write( "/" ); + out.write( separator ); + out.write( "{" ); + boolean first = true; + for( final Domain domain : protein.getProteinDomains() ) { + if ( first ) { + first = false; + } + else { + out.write( "," ); + } + out.write( domain.getDomainId().toString() ); + } + out.write( "}" ); + if ( !( ForesterUtil.isEmpty( protein.getDescription() ) || protein.getDescription() + .equals( SurfacingConstants.NONE ) ) ) { + out.write( protein.getDescription() ); + } + out.write( separator ); + if ( !( ForesterUtil.isEmpty( 
protein.getAccession() ) || protein.getAccession() + .equals( SurfacingConstants.NONE ) ) ) { + out.write( protein.getAccession() ); + } + out.write( SurfacingConstants.NL ); + } + } + } + } + out.flush(); + } + public static SortedSet getAllDomainIds( final List gwcd_list ) { final SortedSet all_domains_ids = new TreeSet(); for( final GenomeWideCombinableDomains gwcd : gwcd_list ) { @@ -806,11 +1268,17 @@ public final class SurfacingUtil { final PhylogenyNode n = it.next(); if ( ForesterUtil.isEmpty( n.getName() ) && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy() - .getScientificName() ) ) ) { + .getScientificName() ) ) + && ( !n.getNodeData().isHasTaxonomy() || ForesterUtil.isEmpty( n.getNodeData().getTaxonomy() + .getCommonName() ) ) ) { if ( n.getParent() != null ) { names.append( " " ); names.append( n.getParent().getName() ); } + final List l = n.getAllExternalDescendants(); + for( final Object object : l ) { + System.out.println( l.toString() ); + } ++c; } } @@ -855,6 +1323,95 @@ public final class SurfacingUtil { p.setRooted( true ); } + /* + * species | protein id | n-terminal domain | c-terminal domain | n-terminal domain per domain E-value | c-terminal domain per domain E-value + * + * + */ + static public StringBuffer proteinToDomainCombinations( final Protein protein, + final String protein_id, + final String separator ) { + final StringBuffer sb = new StringBuffer(); + if ( protein.getSpecies() == null ) { + throw new IllegalArgumentException( "species must not be null" ); + } + if ( ForesterUtil.isEmpty( protein.getSpecies().getSpeciesId() ) ) { + throw new IllegalArgumentException( "species id must not be empty" ); + } + final List domains = protein.getProteinDomains(); + if ( domains.size() > 1 ) { + final Map counts = new HashMap(); + for( final Domain domain : domains ) { + final String id = domain.getDomainId().getId(); + if ( counts.containsKey( id ) ) { + counts.put( id, counts.get( id ) + 1 ); + } + else { + counts.put( id, 1 ); + } + } + final Set dcs = new HashSet(); + for( int i = 1; i < domains.size(); ++i ) { + for( int j = 0; j < i; ++j ) { + Domain domain_n = domains.get( i ); + Domain domain_c = domains.get( j ); + if ( domain_n.getFrom() > domain_c.getFrom() ) { + domain_n = domains.get( j ); + domain_c = domains.get( i ); + } + final String dc = domain_n.getDomainId().getId() + domain_c.getDomainId().getId(); + if ( !dcs.contains( dc ) ) { + dcs.add( dc ); + sb.append( protein.getSpecies() ); + sb.append( separator ); + sb.append( protein_id ); + sb.append( separator ); + sb.append( domain_n.getDomainId().getId() ); + sb.append( separator ); + sb.append( domain_c.getDomainId().getId() ); + sb.append( separator ); + sb.append( domain_n.getPerDomainEvalue() ); + sb.append( separator ); + sb.append( domain_c.getPerDomainEvalue() ); + sb.append( separator ); + sb.append( counts.get( domain_n.getDomainId().getId() ) ); + sb.append( separator ); + sb.append( counts.get( domain_c.getDomainId().getId() ) ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + } + } + } + } + else if ( domains.size() == 1 ) { + sb.append( protein.getSpecies() ); + sb.append( separator ); + sb.append( protein_id ); + sb.append( separator ); + sb.append( domains.get( 0 ).getDomainId().getId() ); + sb.append( separator ); + sb.append( separator ); + sb.append( domains.get( 0 ).getPerDomainEvalue() ); + sb.append( separator ); + sb.append( separator ); + sb.append( 1 ); + sb.append( separator ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + } + else { 
+ sb.append( protein.getSpecies() ); + sb.append( separator ); + sb.append( protein_id ); + sb.append( separator ); + sb.append( separator ); + sb.append( separator ); + sb.append( separator ); + sb.append( separator ); + sb.append( separator ); + sb.append( ForesterUtil.LINE_SEPARATOR ); + } + return sb; + } + /** * * Example regarding engulfment: ------------0.1 ----------0.2 --0.3 => @@ -879,7 +1436,7 @@ public final class SurfacingUtil { final boolean remove_engulfed_domains, final Protein protein ) { final Protein pruned_protein = new BasicProtein( protein.getProteinId().getId(), protein.getSpecies() - .getSpeciesId() ); + .getSpeciesId(), protein.getLength() ); final List sorted = SurfacingUtil.sortDomainsWithAscendingConfidenceValues( protein ); final List covered_positions = new ArrayList(); for( final Domain domain : sorted ) { @@ -904,7 +1461,7 @@ public final class SurfacingUtil { return pruned_protein; } - static List sortDomainsWithAscendingConfidenceValues( final Protein protein ) { + public static List sortDomainsWithAscendingConfidenceValues( final Protein protein ) { final List domains = new ArrayList(); for( final Domain d : protein.getProteinDomains() ) { domains.add( d ); @@ -913,6 +1470,19 @@ public final class SurfacingUtil { return domains; } + private static List splitDomainCombination( final String dc ) { + final String[] s = dc.split( "=" ); + if ( s.length != 2 ) { + ForesterUtil.printErrorMessage( surfacing.PRG_NAME, "Stringyfied domain combination has illegal format: " + + dc ); + System.exit( -1 ); + } + final List l = new ArrayList( 2 ); + l.add( s[ 0 ] ); + l.add( s[ 1 ] ); + return l; + } + public static void writeAllDomainsChangedOnAllSubtrees( final Phylogeny p, final boolean get_gains, final String outdir, @@ -921,7 +1491,7 @@ public final class SurfacingUtil { if ( !get_gains ) { state = CharacterStateMatrix.GainLossStates.LOSS; } - final File base_dir = createBaseDirForPerNodeDomainFiles( surfacing_old.BASE_DIRECTORY_PER_SUBTREE_DOMAIN_GAIN_LOSS_FILES, + final File base_dir = createBaseDirForPerNodeDomainFiles( surfacing.BASE_DIRECTORY_PER_SUBTREE_DOMAIN_GAIN_LOSS_FILES, false, state, outdir ); @@ -946,11 +1516,10 @@ public final class SurfacingUtil { final Map go_id_to_term_map, final String outfile_name, final SortedSet all_pfams_encountered ) { - final File all_pfams_encountered_file = new File( outfile_name + surfacing_old.ALL_PFAMS_ENCOUNTERED_SUFFIX ); + final File all_pfams_encountered_file = new File( outfile_name + surfacing.ALL_PFAMS_ENCOUNTERED_SUFFIX ); final File all_pfams_encountered_with_go_annotation_file = new File( outfile_name - + surfacing_old.ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX ); - final File encountered_pfams_summary_file = new File( outfile_name - + surfacing_old.ENCOUNTERED_PFAMS_SUMMARY_SUFFIX ); + + surfacing.ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX ); + final File encountered_pfams_summary_file = new File( outfile_name + surfacing.ENCOUNTERED_PFAMS_SUMMARY_SUFFIX ); int biological_process_counter = 0; int cellular_component_counter = 0; int molecular_function_counter = 0; @@ -1017,41 +1586,41 @@ public final class SurfacingUtil { } all_pfams_encountered_writer.close(); all_pfams_encountered_with_go_annotation_writer.close(); - ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote all [" + all_pfams_encountered.size() + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + all_pfams_encountered.size() + "] encountered Pfams to: \"" + all_pfams_encountered_file + "\"" ); - 
ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote all [" + pfams_with_mappings_counter + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote all [" + pfams_with_mappings_counter + "] encountered Pfams with GO mappings to: \"" + all_pfams_encountered_with_go_annotation_file + "\"" ); - ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote summary (including all [" + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote summary (including all [" + pfams_without_mappings_counter + "] encountered Pfams without GO mappings) to: \"" + encountered_pfams_summary_file + "\"" ); - ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Sum of Pfams encountered : " + ForesterUtil.programMessage( surfacing.PRG_NAME, "Sum of Pfams encountered : " + all_pfams_encountered.size() ); - ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Pfams without a mapping : " + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without a mapping : " + pfams_without_mappings_counter + " [" + ( 100 * pfams_without_mappings_counter / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Pfams without mapping to proc. or func. : " + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams without mapping to proc. or func. : " + pfams_without_mappings_to_bp_or_mf_counter + " [" + ( 100 * pfams_without_mappings_to_bp_or_mf_counter / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing_old.PRG_NAME, + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping : " + pfams_with_mappings_counter + " [" + ( 100 * pfams_with_mappings_counter / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Pfams with a mapping to proc. or func. : " + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with a mapping to proc. or func. 
: " + pfams_with_mappings_to_bp_or_mf_counter + " [" + ( 100 * pfams_with_mappings_to_bp_or_mf_counter / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing_old.PRG_NAME, + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to biological process: " + biological_process_counter + " [" + ( 100 * biological_process_counter / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing_old.PRG_NAME, + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to molecular function: " + molecular_function_counter + " [" + ( 100 * molecular_function_counter / all_pfams_encountered.size() ) + "%]" ); - ForesterUtil.programMessage( surfacing_old.PRG_NAME, + ForesterUtil.programMessage( surfacing.PRG_NAME, "Pfams with mapping to cellular component: " + cellular_component_counter + " [" + ( 100 * cellular_component_counter / all_pfams_encountered.size() ) @@ -1085,7 +1654,7 @@ public final class SurfacingUtil { summary_writer.close(); } catch ( final IOException e ) { - ForesterUtil.printWarningMessage( surfacing_old.PRG_NAME, "Failure to write: " + e ); + ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); } } @@ -1095,7 +1664,7 @@ public final class SurfacingUtil { final int i, final GenomeWideCombinableDomainsSortOrder dc_sort_order ) { File dc_outfile_dot = new File( input_file_properties[ i ][ 0 ] - + surfacing_old.DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS ); + + surfacing.DOMAIN_COMBINITONS_OUTPUTFILE_SUFFIX_FOR_GRAPH_ANALYSIS ); if ( output_dir != null ) { dc_outfile_dot = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile_dot ); } @@ -1111,102 +1680,13 @@ public final class SurfacingUtil { out_dot.close(); } catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); } - ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote binary domain combination for \"" + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote binary domain combination for \"" + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", " + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile_dot + "\"" ); } - /* - * species | protein id | n-terminal domain | c-terminal domain | n-terminal domain per domain E-value | c-terminal domain per domain E-value - * - * - */ - static public StringBuffer proteinToDomainCombinations( final Protein protein, - final String protein_id, - final String separator ) { - final StringBuffer sb = new StringBuffer(); - if ( protein.getSpecies() == null ) { - throw new IllegalArgumentException( "species must not be null" ); - } - if ( ForesterUtil.isEmpty( protein.getSpecies().getSpeciesId() ) ) { - throw new IllegalArgumentException( "species id must not be empty" ); - } - final List domains = protein.getProteinDomains(); - if ( domains.size() > 1 ) { - final Map counts = new HashMap(); - for( final Domain domain : domains ) { - final String id = domain.getDomainId().getId(); - if ( counts.containsKey( id ) ) { - counts.put( id, counts.get( id ) + 1 ); - } - else { - counts.put( id, 1 ); - } - } - final Set dcs = new HashSet(); - for( int i = 1; i < domains.size(); ++i ) { - for( int j = 0; j < i; ++j ) { - Domain domain_n = domains.get( i ); - Domain domain_c = domains.get( j ); - if ( domain_n.getFrom() > domain_c.getFrom() ) { - domain_n = domains.get( j ); - domain_c = domains.get( i ); - } - final String dc = 
domain_n.getDomainId().getId() + domain_c.getDomainId().getId(); - if ( !dcs.contains( dc ) ) { - dcs.add( dc ); - sb.append( protein.getSpecies() ); - sb.append( separator ); - sb.append( protein_id ); - sb.append( separator ); - sb.append( domain_n.getDomainId().getId() ); - sb.append( separator ); - sb.append( domain_c.getDomainId().getId() ); - sb.append( separator ); - sb.append( domain_n.getPerDomainEvalue() ); - sb.append( separator ); - sb.append( domain_c.getPerDomainEvalue() ); - sb.append( separator ); - sb.append( counts.get( domain_n.getDomainId().getId() ) ); - sb.append( separator ); - sb.append( counts.get( domain_c.getDomainId().getId() ) ); - sb.append( ForesterUtil.LINE_SEPARATOR ); - } - } - } - } - else if ( domains.size() == 1 ) { - sb.append( protein.getSpecies() ); - sb.append( separator ); - sb.append( protein_id ); - sb.append( separator ); - sb.append( domains.get( 0 ).getDomainId().getId() ); - sb.append( separator ); - sb.append( separator ); - sb.append( domains.get( 0 ).getPerDomainEvalue() ); - sb.append( separator ); - sb.append( separator ); - sb.append( 1 ); - sb.append( separator ); - sb.append( ForesterUtil.LINE_SEPARATOR ); - } - else { - sb.append( protein.getSpecies() ); - sb.append( separator ); - sb.append( protein_id ); - sb.append( separator ); - sb.append( separator ); - sb.append( separator ); - sb.append( separator ); - sb.append( separator ); - sb.append( separator ); - sb.append( ForesterUtil.LINE_SEPARATOR ); - } - return sb; - } - public static void writeBinaryStatesMatrixAsListToFile( final CharacterStateMatrix matrix, final CharacterStateMatrix.GainLossStates state, final String filename, @@ -1245,9 +1725,9 @@ public final class SurfacingUtil { out.close(); } catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); } - ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote characters list: \"" + filename + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters list: \"" + filename + "\"" ); } public static void writeBinaryStatesMatrixAsListToFileForBinaryCombinationsForGraphAnalysis( final CharacterStateMatrix matrix, @@ -1279,7 +1759,7 @@ public final class SurfacingUtil { bdc = BasicBinaryDomainCombination.createInstance( matrix.getCharacter( c ) ); } catch ( final Exception e ) { - ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getLocalizedMessage() ); + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getLocalizedMessage() ); } out.write( bdc.toGraphDescribingLanguage( bc_output_format, null, null ).toString() ); out.write( character_separator ); @@ -1290,9 +1770,9 @@ public final class SurfacingUtil { out.close(); } catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); } - ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote characters list: \"" + filename + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters list: \"" + filename + "\"" ); } public static void writeBinaryStatesMatrixToList( final Map> domain_id_to_go_ids_map, @@ -1327,7 +1807,7 @@ public final class SurfacingUtil { } try { final Writer out = new BufferedWriter( new FileWriter( outfile ) ); - final File per_node_go_mapped_domain_gain_loss_files_base_dir = createBaseDirForPerNodeDomainFiles( surfacing_old.BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES, + final File 
per_node_go_mapped_domain_gain_loss_files_base_dir = createBaseDirForPerNodeDomainFiles( surfacing.BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES, domain_combinations, state, filename ); @@ -1470,186 +1950,9 @@ public final class SurfacingUtil { out.close(); } catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); } - ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote characters detailed HTML list: \"" + filename - + "\"" ); - } - - public static void writeBinaryStatesMatrixToListORIGIG( final Map> domain_id_to_go_ids_map, - final Map go_id_to_term_map, - final GoNameSpace go_namespace_limit, - final boolean domain_combinations, - final CharacterStateMatrix matrix, - final CharacterStateMatrix.GainLossStates state, - final String filename, - final String indentifier_characters_separator, - final String character_separator, - final String title_for_html, - final String prefix_for_html, - final Map>[] domain_id_to_secondary_features_maps, - final SortedSet all_pfams_encountered, - final SortedSet pfams_gained_or_lost, - final String suffix_for_per_node_events_file ) { - if ( ( go_namespace_limit != null ) && ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) { - throw new IllegalArgumentException( "attempt to use GO namespace limit without a GO-id to term map" ); - } - else if ( ( ( domain_id_to_go_ids_map == null ) || ( domain_id_to_go_ids_map.size() < 1 ) ) ) { - throw new IllegalArgumentException( "attempt to output detailed HTML without a Pfam to GO map" ); - } - else if ( ( ( go_id_to_term_map == null ) || ( go_id_to_term_map.size() < 1 ) ) ) { - throw new IllegalArgumentException( "attempt to output detailed HTML without a GO-id to term map" ); - } - final File outfile = new File( filename ); - checkForOutputFileWriteability( outfile ); - final SortedSet sorted_ids = new TreeSet(); - for( int i = 0; i < matrix.getNumberOfIdentifiers(); ++i ) { - sorted_ids.add( matrix.getIdentifier( i ) ); - } - try { - final Writer out = new BufferedWriter( new FileWriter( outfile ) ); - final File per_node_go_mapped_domain_gain_loss_files_base_dir = createBaseDirForPerNodeDomainFiles( surfacing_old.BASE_DIRECTORY_PER_NODE_DOMAIN_GAIN_LOSS_FILES, - domain_combinations, - state, - filename ); - Writer per_node_go_mapped_domain_gain_loss_outfile_writer = null; - File per_node_go_mapped_domain_gain_loss_outfile = null; - int per_node_counter = 0; - out.write( "" ); - out.write( SurfacingConstants.NL ); - addHtmlHead( out, title_for_html ); - out.write( SurfacingConstants.NL ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - out.write( "

" ); - out.write( SurfacingConstants.NL ); - out.write( title_for_html ); - out.write( SurfacingConstants.NL ); - out.write( "

" ); - out.write( SurfacingConstants.NL ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - for( final String id : sorted_ids ) { - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - } - out.write( "
" ); - out.write( "" + id + "" ); - writeTaxonomyLinks( out, id ); - out.write( "
" ); - out.write( SurfacingConstants.NL ); - for( final String id : sorted_ids ) { - out.write( SurfacingConstants.NL ); - out.write( "

" ); - out.write( "" + id + "" ); - writeTaxonomyLinks( out, id ); - out.write( "

" ); - out.write( SurfacingConstants.NL ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - per_node_counter = 0; - if ( matrix.getNumberOfCharacters() > 0 ) { - per_node_go_mapped_domain_gain_loss_outfile = new File( per_node_go_mapped_domain_gain_loss_files_base_dir - + ForesterUtil.FILE_SEPARATOR + id + suffix_for_per_node_events_file ); - SurfacingUtil.checkForOutputFileWriteability( per_node_go_mapped_domain_gain_loss_outfile ); - per_node_go_mapped_domain_gain_loss_outfile_writer = ForesterUtil - .createBufferedWriter( per_node_go_mapped_domain_gain_loss_outfile ); - } - else { - per_node_go_mapped_domain_gain_loss_outfile = null; - per_node_go_mapped_domain_gain_loss_outfile_writer = null; - } - for( int c = 0; c < matrix.getNumberOfCharacters(); ++c ) { - // Not nice: - // using null to indicate either UNCHANGED_PRESENT or GAIN. - if ( ( matrix.getState( id, c ) == state ) - || ( ( state == null ) && ( ( matrix.getState( id, c ) == CharacterStateMatrix.GainLossStates.UNCHANGED_PRESENT ) || ( matrix - .getState( id, c ) == CharacterStateMatrix.GainLossStates.GAIN ) ) ) ) { - final String character = matrix.getCharacter( c ); - String domain_0 = ""; - String domain_1 = ""; - if ( character.indexOf( BinaryDomainCombination.SEPARATOR ) > 0 ) { - final String[] s = character.split( BinaryDomainCombination.SEPARATOR ); - if ( s.length != 2 ) { - throw new AssertionError( "this should not have happened: unexpected format for domain combination: [" - + character + "]" ); - } - domain_0 = s[ 0 ]; - domain_1 = s[ 1 ]; - } - else { - domain_0 = character; - } - writeDomainData( domain_id_to_go_ids_map, - go_id_to_term_map, - go_namespace_limit, - out, - domain_0, - domain_1, - prefix_for_html, - character_separator, - domain_id_to_secondary_features_maps, - null ); - all_pfams_encountered.add( domain_0 ); - if ( pfams_gained_or_lost != null ) { - pfams_gained_or_lost.add( domain_0 ); - } - if ( !ForesterUtil.isEmpty( domain_1 ) ) { - all_pfams_encountered.add( domain_1 ); - if ( pfams_gained_or_lost != null ) { - pfams_gained_or_lost.add( domain_1 ); - } - } - if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) { - writeDomainsToIndividualFilePerTreeNode( per_node_go_mapped_domain_gain_loss_outfile_writer, - domain_0, - domain_1 ); - per_node_counter++; - } - } - } - if ( per_node_go_mapped_domain_gain_loss_outfile_writer != null ) { - per_node_go_mapped_domain_gain_loss_outfile_writer.close(); - if ( per_node_counter < 1 ) { - per_node_go_mapped_domain_gain_loss_outfile.delete(); - } - per_node_counter = 0; - } - out.write( "
" ); - out.write( "Pfam domain(s)" ); - out.write( "" ); - out.write( "GO term acc" ); - out.write( "" ); - out.write( "GO term" ); - out.write( "" ); - out.write( "Penultimate GO term" ); - out.write( "" ); - out.write( "GO namespace" ); - out.write( "
" ); - out.write( SurfacingConstants.NL ); - out.write( "
" ); - out.write( SurfacingConstants.NL ); - } // for( final String id : sorted_ids ) { - out.write( "" ); - out.write( SurfacingConstants.NL ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - out.flush(); - out.close(); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); - } - ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote characters detailed HTML list: \"" + filename - + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote characters detailed HTML list: \"" + filename + "\"" ); } public static void writeDomainCombinationsCountsFile( final String[][] input_file_properties, @@ -1659,7 +1962,7 @@ public final class SurfacingUtil { final int i, final GenomeWideCombinableDomains.GenomeWideCombinableDomainsSortOrder dc_sort_order ) { File dc_outfile = new File( input_file_properties[ i ][ 0 ] - + surfacing_old.DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX ); + + surfacing.DOMAIN_COMBINITON_COUNTS_OUTPUTFILE_SUFFIX ); if ( output_dir != null ) { dc_outfile = new File( output_dir + ForesterUtil.FILE_SEPARATOR + dc_outfile ); } @@ -1670,7 +1973,7 @@ public final class SurfacingUtil { out.close(); } catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); } final DescriptiveStatistics stats = gwcd.getPerGenomeDomainPromiscuityStatistics(); try { @@ -1694,15 +1997,15 @@ public final class SurfacingUtil { per_genome_domain_promiscuity_statistics_writer.write( ForesterUtil.LINE_SEPARATOR ); } catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); } if ( input_file_properties[ i ].length == 3 ) { - ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote domain combination counts for \"" + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \"" + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ", " + input_file_properties[ i ][ 2 ] + ") to: \"" + dc_outfile + "\"" ); } else { - ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote domain combination counts for \"" + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote domain combination counts for \"" + input_file_properties[ i ][ 0 ] + "\" (" + input_file_properties[ i ][ 1 ] + ") to: \"" + dc_outfile + "\"" ); } @@ -1813,111 +2116,6 @@ public final class SurfacingUtil { } } - private static void writeDomainDataORIG( final Map> domain_id_to_go_ids_map, - final Map go_id_to_term_map, - final GoNameSpace go_namespace_limit, - final Writer out, - final String domain_0, - final String domain_1, - final String prefix_for_html, - final String character_separator_for_non_html_output, - final Map>[] domain_id_to_secondary_features_maps, - final Set all_go_ids ) throws IOException { - boolean any_go_annotation_present = false; - boolean first_has_no_go = false; - int domain_count = 2; // To distinguish between domains and binary domain combinations. - if ( ForesterUtil.isEmpty( domain_1 ) ) { - domain_count = 1; - } - // The following has a difficult to understand logic. 
- for( int d = 0; d < domain_count; ++d ) { - List go_ids = null; - boolean go_annotation_present = false; - if ( d == 0 ) { - final DomainId domain_id = new DomainId( domain_0 ); - if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) { - go_annotation_present = true; - any_go_annotation_present = true; - go_ids = domain_id_to_go_ids_map.get( domain_id ); - } - else { - first_has_no_go = true; - } - } - else { - final DomainId domain_id = new DomainId( domain_1 ); - if ( domain_id_to_go_ids_map.containsKey( domain_id ) ) { - go_annotation_present = true; - any_go_annotation_present = true; - go_ids = domain_id_to_go_ids_map.get( domain_id ); - } - } - if ( go_annotation_present ) { - boolean first = ( ( d == 0 ) || ( ( d == 1 ) && first_has_no_go ) ); - for( final GoId go_id : go_ids ) { - out.write( "" ); - if ( first ) { - first = false; - writeDomainIdsToHtml( out, - domain_0, - domain_1, - prefix_for_html, - domain_id_to_secondary_features_maps ); - } - else { - out.write( "" ); - } - if ( !go_id_to_term_map.containsKey( go_id ) ) { - throw new IllegalArgumentException( "GO-id [" + go_id + "] not found in GO-id to GO-term map" ); - } - final GoTerm go_term = go_id_to_term_map.get( go_id ); - if ( ( go_namespace_limit == null ) || go_namespace_limit.equals( go_term.getGoNameSpace() ) ) { - final String top = GoUtils.getPenultimateGoTerm( go_term, go_id_to_term_map ).getName(); - final String go_id_str = go_id.getId(); - out.write( "" ); - out.write( "" + go_id_str + "" ); - out.write( "" ); - out.write( go_term.getName() ); - if ( domain_count == 2 ) { - out.write( " (" + d + ")" ); - } - out.write( "" ); - out.write( top ); - out.write( "" ); - out.write( "[" ); - out.write( go_term.getGoNameSpace().toShortString() ); - out.write( "]" ); - out.write( "" ); - if ( all_go_ids != null ) { - all_go_ids.add( go_id ); - } - } - else { - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( "" ); - } - out.write( "" ); - out.write( SurfacingConstants.NL ); - } - } - } // for( int d = 0; d < domain_count; ++d ) - if ( !any_go_annotation_present ) { - out.write( "" ); - writeDomainIdsToHtml( out, domain_0, domain_1, prefix_for_html, domain_id_to_secondary_features_maps ); - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( "" ); - out.write( SurfacingConstants.NL ); - } - } - private static void writeDomainIdsToHtml( final Writer out, final String domain_0, final String domain_1, @@ -1930,97 +2128,13 @@ public final class SurfacingUtil { out.write( " " ); } out.write( "" + domain_0 + "" ); - //if ( ForesterUtil.isEmpty( domain_1 ) ) { - // out.write( " [gs]" ); - //} - // if ( !ForesterUtil.isEmpty( domain_1 ) ) { - // out.write( "=" ); - // out.write( "" + domain_1 + "" ); - //} - // else if ( ( domain_id_to_secondary_features_maps != null ) - // && ( domain_id_to_secondary_features_maps.length > 0 ) ) { - // out.write( " [" ); - // boolean first = true; - // for( final Map> domain_id_to_secondary_features_map : domain_id_to_secondary_features_maps ) { - // final Set sec_features = domain_id_to_secondary_features_map.get( new DomainId( domain_0 ) ); - // if ( ( sec_features != null ) && ( sec_features.size() > 0 ) ) { - // for( final String sec_feature : sec_features ) { - // if ( first ) { - // first = false; - // } - // else { - // out.write( ", " ); - // } - // if ( SurfacingConstants.SECONDARY_FEATURES_ARE_SCOP - // && ( SurfacingConstants.SECONDARY_FEATURES_SCOP_LINK != null ) ) { - // out.write( 
"" + sec_feature + "" ); - // } - // else { - // out.write( sec_feature ); - // } - // } - // } - // } - // out.write( "]" ); - // } - out.write( "" ); - } - - private static void writeDomainIdsToHtmlORIG( final Writer out, - final String domain_0, - final String domain_1, - final String prefix_for_detailed_html, - final Map>[] domain_id_to_secondary_features_maps ) - throws IOException { - out.write( "" ); - if ( !ForesterUtil.isEmpty( prefix_for_detailed_html ) ) { - out.write( prefix_for_detailed_html ); - out.write( " " ); - } - out.write( "" + domain_0 + "" ); - if ( ForesterUtil.isEmpty( domain_1 ) ) { - out.write( " [gs]" ); - } - if ( !ForesterUtil.isEmpty( domain_1 ) ) { - out.write( "=" ); - out.write( "" + domain_1 + "" ); - } - else if ( ( domain_id_to_secondary_features_maps != null ) - && ( domain_id_to_secondary_features_maps.length > 0 ) ) { - out.write( " [" ); - boolean first = true; - for( final Map> domain_id_to_secondary_features_map : domain_id_to_secondary_features_maps ) { - final Set sec_features = domain_id_to_secondary_features_map.get( new DomainId( domain_0 ) ); - if ( ( sec_features != null ) && ( sec_features.size() > 0 ) ) { - for( final String sec_feature : sec_features ) { - if ( first ) { - first = false; - } - else { - out.write( ", " ); - } - if ( SurfacingConstants.SECONDARY_FEATURES_ARE_SCOP - && ( SurfacingConstants.SECONDARY_FEATURES_SCOP_LINK != null ) ) { - out.write( "" + sec_feature + "" ); - } - else { - out.write( sec_feature ); - } - } - } - } - out.write( "]" ); - } out.write( "" ); } public static DescriptiveStatistics writeDomainSimilaritiesToFile( final StringBuilder html_desc, final StringBuilder html_title, - final Writer w, + final Writer single_writer, + Map split_writers, final SortedSet similarities, final boolean treat_as_binary, final List species_order, @@ -2163,90 +2277,119 @@ public final class SurfacingUtil { System.out.println( "Pearsonian skewness : n/a" ); } } + if ( ( single_writer != null ) && ( ( split_writers == null ) || split_writers.isEmpty() ) ) { + split_writers = new HashMap(); + split_writers.put( '_', single_writer ); + } switch ( print_option ) { case SIMPLE_TAB_DELIMITED: break; case HTML: - w.write( "" ); - w.write( SurfacingConstants.NL ); - addHtmlHead( w, "SURFACING :: " + html_title ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( html_desc.toString() ); - w.write( SurfacingConstants.NL ); - w.write( "
" ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "
" );
-                w.write( SurfacingConstants.NL );
-                if ( histo != null ) {
-                    w.write( histo.toStringBuffer( 20, '|', 40, 5 ).toString() );
+                for( final Character key : split_writers.keySet() ) {
+                    final Writer w = split_writers.get( key );
+                    w.write( "" );
+                    w.write( SurfacingConstants.NL );
+                    if ( key != '_' ) {
+                        addHtmlHead( w, "DCs (" + html_title + ") " + key.toString().toUpperCase() );
+                    }
+                    else {
+                        addHtmlHead( w, "DCs (" + html_title + ")" );
+                    }
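// Sketch, not part of the diff: the loop above keys its output writers by the first character
// of a domain id, with '_' as the sentinel for the single, unsplit output file; only the
// per-letter writers get the letter appended to the HTML title. The class below is a
// hypothetical illustration of that dispatch (plain java.io/java.util only, names assumed).
import java.io.StringWriter;
import java.io.Writer;
import java.util.HashMap;
import java.util.Map;

class SplitWriterTitleSketch {

    static void writeTitles( final Map<Character, Writer> split_writers, final String html_title )
            throws java.io.IOException {
        for( final Character key : split_writers.keySet() ) {
            final Writer w = split_writers.get( key );
            if ( key != '_' ) {
                // one output file per starting letter: suffix the title with the letter
                w.write( "DCs (" + html_title + ") " + Character.toUpperCase( key ) );
            }
            else {
                // '_' sentinel: everything goes to a single writer, no letter suffix
                w.write( "DCs (" + html_title + ")" );
            }
        }
    }

    public static void main( final String[] args ) throws java.io.IOException {
        final Map<Character, Writer> writers = new HashMap<Character, Writer>();
        writers.put( '_', new StringWriter() ); // single-writer case
        writers.put( 'a', new StringWriter() ); // split case: domain ids starting with 'a'
        writeTitles( writers, "domain similarities" );
    }
}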
+                    w.write( SurfacingConstants.NL );
+                    w.write( "" );
+                    w.write( SurfacingConstants.NL );
+                    w.write( html_desc.toString() );
+                    w.write( SurfacingConstants.NL );
+                    w.write( "" );
+                    w.write( "" );
+                    w.write( SurfacingConstants.NL );
+                    w.write( "" );
+                    w.write( SurfacingConstants.NL );
+                    if ( histo != null ) {
+                        w.write( histo.toStringBuffer( 20, '|', 40, 5 ).toString() );
+                        w.write( SurfacingConstants.NL );
+                    }
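// Sketch, not forester's API: the histo.toStringBuffer( 20, '|', 40, 5 ) call above emits a
// plain-text histogram into the preformatted part of the HTML report. Below is a minimal,
// self-contained version of that idea (bins rendered as rows of '|' bars scaled to a maximum
// width); the class name, parameters, and scaling scheme are illustrative assumptions.
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;

class AsciiHistogramSketch {

    static String render( final SortedMap<Integer, Integer> counts, final int max_width ) {
        int max = 1;
        for( final int count : counts.values() ) {
            max = Math.max( max, count ); // tallest bin sets the scale
        }
        final StringBuilder sb = new StringBuilder();
        for( final Map.Entry<Integer, Integer> bin : counts.entrySet() ) {
            final int bar = Math.max( 1, ( bin.getValue() * max_width ) / max );
            sb.append( String.format( "%6s %6d ", bin.getKey(), bin.getValue() ) );
            for( int i = 0; i < bar; ++i ) {
                sb.append( '|' );
            }
            sb.append( '\n' );
        }
        return sb.toString();
    }

    public static void main( final String[] args ) {
        final SortedMap<Integer, Integer> histo = new TreeMap<Integer, Integer>();
        histo.put( 1, 120 );
        histo.put( 2, 35 );
        histo.put( 3, 4 );
        System.out.print( render( histo, 40 ) );
    }
}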
+                    w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + if ( stats.getN() > 1 ) { + w.write( "" ); + } + else { + w.write( "" ); + } + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + if ( stats.getN() > 1 ) { + w.write( "" ); + } + else { + w.write( "" ); + } + w.write( SurfacingConstants.NL ); + w.write( "
N: " + stats.getN() + "
Min: " + stats.getMin() + "
Max: " + stats.getMax() + "
Mean: " + stats.arithmeticMean() + "
SD: " + stats.sampleStandardDeviation() + "
SD: n/a
Median: " + stats.median() + "
Pearsonian skewness: " + stats.pearsonianSkewness() + "
Pearsonian skewness: n/a
" ); + w.write( SurfacingConstants.NL ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "
" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); w.write( SurfacingConstants.NL ); } - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - if ( stats.getN() > 1 ) { - w.write( "" ); - } - else { - w.write( "" ); - } - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - if ( stats.getN() > 1 ) { - w.write( "" ); - } - else { - w.write( "" ); - } - w.write( SurfacingConstants.NL ); - w.write( "
N: " + stats.getN() + "
Min: " + stats.getMin() + "
Max: " + stats.getMax() + "
Mean: " + stats.arithmeticMean() + "
SD: " + stats.sampleStandardDeviation() + "
SD: n/a
Median: " + stats.median() + "
Pearsonian skewness: " + stats.pearsonianSkewness() + "
Pearsonian skewness: n/a
" ); - w.write( SurfacingConstants.NL ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); break; } - w.write( SurfacingConstants.NL ); + for( final Writer w : split_writers.values() ) { + w.write( SurfacingConstants.NL ); + } for( final DomainSimilarity similarity : similarities ) { if ( ( species_order != null ) && !species_order.isEmpty() ) { ( ( PrintableDomainSimilarity ) similarity ).setSpeciesOrder( species_order ); } - w.write( similarity.toStringBuffer( print_option ).toString() ); - w.write( SurfacingConstants.NL ); + if ( single_writer != null ) { + single_writer.write( similarity.toStringBuffer( print_option ).toString() ); + } + else { + Writer local_writer = split_writers.get( ( similarity.getDomainId().getId().charAt( 0 ) + "" ) + .toLowerCase().charAt( 0 ) ); + if ( local_writer == null ) { + local_writer = split_writers.get( '0' ); + } + local_writer.write( similarity.toStringBuffer( print_option ).toString() ); + } + for( final Writer w : split_writers.values() ) { + w.write( SurfacingConstants.NL ); + } } switch ( print_option ) { case HTML: - w.write( SurfacingConstants.NL ); - w.write( "
" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); - w.write( "" ); - w.write( SurfacingConstants.NL ); + for( final Writer w : split_writers.values() ) { + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + w.write( "" ); + w.write( SurfacingConstants.NL ); + } break; } - w.flush(); - w.close(); + for( final Writer w : split_writers.values() ) { + w.close(); + } return stats; } @@ -2273,9 +2416,9 @@ public final class SurfacingUtil { out.close(); } catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); } - ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote matrix: \"" + filename + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote matrix: \"" + filename + "\"" ); } public static void writeMatrixToFile( final File matrix_outfile, final List matrices ) { @@ -2290,9 +2433,9 @@ public final class SurfacingUtil { out.close(); } catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); } - ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote distance matrices to \"" + matrix_outfile + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote distance matrices to \"" + matrix_outfile + "\"" ); } private static void writePfamsToFile( final String outfile_name, final SortedSet pfams ) { @@ -2303,11 +2446,11 @@ public final class SurfacingUtil { writer.write( ForesterUtil.LINE_SEPARATOR ); } writer.close(); - ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote " + pfams.size() + " pfams to [" + outfile_name + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote " + pfams.size() + " pfams to [" + outfile_name + "]" ); } catch ( final IOException e ) { - ForesterUtil.printWarningMessage( surfacing_old.PRG_NAME, "Failure to write: " + e ); + ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "Failure to write: " + e ); } } @@ -2317,10 +2460,10 @@ public final class SurfacingUtil { writer.toPhyloXML( new File( filename ), phylogeny, 1 ); } catch ( final IOException e ) { - ForesterUtil.printWarningMessage( surfacing_old.PRG_NAME, "failed to write phylogeny to \"" + filename - + "\": " + e ); + ForesterUtil.printWarningMessage( surfacing.PRG_NAME, "failed to write phylogeny to \"" + filename + "\": " + + e ); } - ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote phylogeny to \"" + filename + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote phylogeny to \"" + filename + "\"" ); } public static void writeTaxonomyLinks( final Writer writer, final String species ) throws IOException { @@ -2342,52 +2485,6 @@ public final class SurfacingUtil { } } - public static void writeTaxonomyLinksORIG( final Writer writer, final String species ) throws IOException { - if ( ( species.length() > 1 ) && ( species.indexOf( '_' ) < 1 ) ) { - final Matcher matcher = PATTERN_SP_STYLE_TAXONOMY.matcher( species ); - writer.write( " [" ); - if ( matcher.matches() ) { - writer.write( "uniprot" ); - } - else { - writer.write( "eol" ); - writer.write( "|" ); - writer.write( "tol" ); - writer.write( "|" ); - writer.write( "wikipedia" ); - writer.write( "|" ); - writer.write( "gs" ); - } - writer.write( 
"]" ); - } - } - - private static void writeToNexus( final String outfile_name, final CharacterStateMatrix matrix ) { - if ( !( matrix instanceof BasicCharacterStateMatrix ) ) { - throw new IllegalArgumentException( "can only write matrices of type [" + BasicCharacterStateMatrix.class - + "] to nexus" ); - } - final BasicCharacterStateMatrix my_matrix = ( org.forester.evoinference.matrix.character.BasicCharacterStateMatrix ) matrix; - try { - final BufferedWriter w = new BufferedWriter( new FileWriter( outfile_name ) ); - w.write( NexusConstants.NEXUS ); - w.write( ForesterUtil.LINE_SEPARATOR ); - my_matrix.writeNexusTaxaBlock( w ); - my_matrix.writeNexusBinaryChractersBlock( w ); - w.flush(); - w.close(); - ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote Nexus file: \"" + outfile_name + "\"" ); - } - catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); - } - } - private static void writeToNexus( final String outfile_name, final CharacterStateMatrix matrix, final Phylogeny phylogeny ) { @@ -2404,31 +2501,109 @@ public final class SurfacingUtil { w.write( ForesterUtil.LINE_SEPARATOR ); my_matrix.writeNexusTaxaBlock( w ); my_matrix.writeNexusBinaryChractersBlock( w ); - PhylogenyWriter.writeNexusTreesBlock( w, phylogenies ); + PhylogenyWriter.writeNexusTreesBlock( w, phylogenies, NH_CONVERSION_SUPPORT_VALUE_STYLE.NONE ); w.flush(); w.close(); - ForesterUtil.programMessage( surfacing_old.PRG_NAME, "Wrote Nexus file: \"" + outfile_name + "\"" ); + ForesterUtil.programMessage( surfacing.PRG_NAME, "Wrote Nexus file: \"" + outfile_name + "\"" ); } catch ( final IOException e ) { - ForesterUtil.fatalError( surfacing_old.PRG_NAME, e.getMessage() ); + ForesterUtil.fatalError( surfacing.PRG_NAME, e.getMessage() ); } } - private static void writeToNexus( final String outfile_name, final DomainParsimonyCalculator domain_parsimony ) { - writeToNexus( outfile_name + surfacing_old.NEXUS_EXTERNAL_DOMAINS, - domain_parsimony.createMatrixOfDomainPresenceOrAbsence() ); - writeToNexus( outfile_name + surfacing_old.NEXUS_EXTERNAL_DOMAIN_COMBINATIONS, - domain_parsimony.createMatrixOfBinaryDomainCombinationPresenceOrAbsence() ); - } - private static void writeToNexus( final String outfile_name, final DomainParsimonyCalculator domain_parsimony, final Phylogeny phylogeny ) { - writeToNexus( outfile_name + surfacing_old.NEXUS_EXTERNAL_DOMAINS, + writeToNexus( outfile_name + surfacing.NEXUS_EXTERNAL_DOMAINS, domain_parsimony.createMatrixOfDomainPresenceOrAbsence(), phylogeny ); - writeToNexus( outfile_name + surfacing_old.NEXUS_EXTERNAL_DOMAIN_COMBINATIONS, + writeToNexus( outfile_name + surfacing.NEXUS_EXTERNAL_DOMAIN_COMBINATIONS, domain_parsimony.createMatrixOfBinaryDomainCombinationPresenceOrAbsence(), phylogeny ); } + + public static void domainsPerProteinsStatistics( final String genome, + final List protein_list, + final DescriptiveStatistics all_genomes_domains_per_potein_stats, + final SortedMap all_genomes_domains_per_potein_histo, + final SortedSet domains_which_are_always_single, + final SortedSet domains_which_are_sometimes_single_sometimes_not, + final SortedSet domains_which_never_single, + final Writer writer ) { + final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); + for( final Protein protein : protein_list ) { + final int domains = protein.getNumberOfProteinDomains(); + //System.out.println( domains ); + stats.addValue( domains ); + all_genomes_domains_per_potein_stats.addValue( domains ); + if ( 
!all_genomes_domains_per_potein_histo.containsKey( domains ) ) { + all_genomes_domains_per_potein_histo.put( domains, 1 ); + } + else { + all_genomes_domains_per_potein_histo.put( domains, + 1 + all_genomes_domains_per_potein_histo.get( domains ) ); + } + if ( domains == 1 ) { + final String domain = protein.getProteinDomain( 0 ).getDomainId().getId(); + if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) { + if ( domains_which_never_single.contains( domain ) ) { + domains_which_never_single.remove( domain ); + domains_which_are_sometimes_single_sometimes_not.add( domain ); + } + else { + domains_which_are_always_single.add( domain ); + } + } + } + else if ( domains > 1 ) { + for( final Domain d : protein.getProteinDomains() ) { + final String domain = d.getDomainId().getId(); + // System.out.println( domain ); + if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) { + if ( domains_which_are_always_single.contains( domain ) ) { + domains_which_are_always_single.remove( domain ); + domains_which_are_sometimes_single_sometimes_not.add( domain ); + } + else { + domains_which_never_single.add( domain ); + } + } + } + } + } + try { + writer.write( genome ); + writer.write( "\t" ); + if ( stats.getN() >= 1 ) { + writer.write( stats.arithmeticMean() + "" ); + writer.write( "\t" ); + if ( stats.getN() >= 2 ) { + writer.write( stats.sampleStandardDeviation() + "" ); + } + else { + writer.write( "" ); + } + writer.write( "\t" ); + writer.write( stats.median() + "" ); + writer.write( "\t" ); + writer.write( stats.getN() + "" ); + writer.write( "\t" ); + writer.write( stats.getMin() + "" ); + writer.write( "\t" ); + writer.write( stats.getMax() + "" ); + } + else { + writer.write( "\t" ); + writer.write( "\t" ); + writer.write( "\t" ); + writer.write( "0" ); + writer.write( "\t" ); + writer.write( "\t" ); + } + writer.write( "\n" ); + } + catch ( final IOException e ) { + e.printStackTrace(); + } + } }
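The added domainsPerProteinsStatistics method above does two kinds of per-genome bookkeeping: it accumulates a histogram of domains-per-protein counts, and it maintains three disjoint sets that classify each domain as always single-domain, sometimes single, or never single, promoting a domain to the "sometimes" set as soon as it has been observed both ways. Its summary row also prints the sample standard deviation only when stats.getN() >= 2, which is the right guard since a sample standard deviation is undefined for fewer than two observations. A minimal, self-contained sketch of the set logic follows, assuming plain lists of domain names in place of forester's Protein/Domain objects; all class and variable names are illustrative.

import java.util.Arrays;
import java.util.List;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

class DomainsPerProteinSketch {

    public static void main( final String[] args ) {
        // each inner list stands in for the domains of one protein
        final List<List<String>> proteins = Arrays.asList( Arrays.asList( "PF00001" ),
                                                           Arrays.asList( "PF00001", "PF00002" ),
                                                           Arrays.asList( "PF00003" ) );
        final SortedMap<Integer, Integer> histo = new TreeMap<Integer, Integer>();
        final SortedSet<String> always_single = new TreeSet<String>();
        final SortedSet<String> sometimes_single = new TreeSet<String>();
        final SortedSet<String> never_single = new TreeSet<String>();
        for( final List<String> domains : proteins ) {
            final int n = domains.size();
            histo.put( n, histo.containsKey( n ) ? histo.get( n ) + 1 : 1 );
            for( final String d : domains ) {
                if ( sometimes_single.contains( d ) ) {
                    continue; // already known to occur both ways
                }
                if ( n == 1 ) {
                    if ( never_single.remove( d ) ) {
                        sometimes_single.add( d ); // seen multi-domain before, single now
                    }
                    else {
                        always_single.add( d );
                    }
                }
                else {
                    if ( always_single.remove( d ) ) {
                        sometimes_single.add( d ); // seen single before, multi-domain now
                    }
                    else {
                        never_single.add( d );
                    }
                }
            }
        }
        System.out.println( "histogram        : " + histo );
        System.out.println( "always single    : " + always_single );
        System.out.println( "sometimes single : " + sometimes_single );
        System.out.println( "never single     : " + never_single );
    }
}

Running the sketch on the three example "proteins" prints PF00003 as always single, PF00001 as sometimes single, and PF00002 as never single, matching the promotion rules used by the diff.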