From: cmzmasek@gmail.com Date: Wed, 2 Nov 2011 20:24:45 +0000 (+0000) Subject: in progress X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=756e207d17b773c707ff2315091bcef338ddcf85;p=jalview.git in progress --- diff --git a/forester/java/src/org/forester/application/surfacing.java b/forester/java/src/org/forester/application/surfacing.java index b795c3e..12274f7 100644 --- a/forester/java/src/org/forester/application/surfacing.java +++ b/forester/java/src/org/forester/application/surfacing.java @@ -36,6 +36,7 @@ import java.util.Date; import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Set; import java.util.SortedMap; import java.util.SortedSet; @@ -1721,6 +1722,7 @@ public class surfacing { catch ( final IOException e2 ) { ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getMessage() ); } + final DescriptiveStatistics protein_coverage_stats = new BasicDescriptiveStatistics(); final DescriptiveStatistics all_genomes_domains_per_potein_stats = new BasicDescriptiveStatistics(); final SortedMap all_genomes_domains_per_potein_histo = new TreeMap(); final SortedSet domains_which_are_always_single = new TreeSet(); @@ -1748,6 +1750,7 @@ public class surfacing { catch ( final IOException e3 ) { e3.printStackTrace(); } + // Main loop: for( int i = 0; i < number_of_genomes; ++i ) { System.out.println(); System.out.println( ( i + 1 ) + "/" + number_of_genomes ); @@ -1811,10 +1814,16 @@ public class surfacing { System.out.println( "Domains ignored due to virus like id: " ); ForesterUtil.printCountingMap( parser.getDomainsIgnoredDueToVirusLikeIdCountsMap() ); } + final double coverage = ( double ) protein_list.size() / parser.getProteinsEncountered(); + protein_coverage_stats.addValue( coverage ); System.out.println( "Number of proteins encountered : " + parser.getProteinsEncountered() ); log( "Number of proteins encountered : " + parser.getProteinsEncountered(), log_writer ); System.out.println( "Number of proteins stored : " + protein_list.size() ); log( "Number of proteins stored : " + protein_list.size(), log_writer ); + System.out.println( "Coverage : " + + ForesterUtil.roundToInt( 100.0 * coverage ) + "%" ); + log( "Coverage : " + ForesterUtil.roundToInt( 100.0 * coverage ) + + "%", log_writer ); System.out.println( "Domains encountered : " + parser.getDomainsEncountered() ); log( "Domains encountered : " + parser.getDomainsEncountered(), log_writer ); System.out.println( "Domains stored : " + parser.getDomainsStored() ); @@ -1894,7 +1903,7 @@ public class surfacing { catch ( final IOException e ) { ForesterUtil.fatalError( surfacing.PRG_NAME, e.toString() ); } - SurfacingUtil.domainsPerProteinsStatistics( input_file_properties[ i ][ 0 ], + SurfacingUtil.domainsPerProteinsStatistics( input_file_properties[ i ][ 1 ], protein_list, all_genomes_domains_per_potein_stats, all_genomes_domains_per_potein_histo, @@ -1952,17 +1961,6 @@ public class surfacing { } System.gc(); } // for( int i = 0; i < number_of_genomes; ++i ) { - try { - per_genome_domain_promiscuity_statistics_writer.flush(); - per_genome_domain_promiscuity_statistics_writer.close(); - dc_data_writer.flush(); - dc_data_writer.close(); - log_writer.flush(); - log_writer.close(); - } - catch ( final IOException e2 ) { - ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getLocalizedMessage() ); - } ForesterUtil.programMessage( PRG_NAME, "Wrote domain promiscuities to: " + per_genome_domain_promiscuity_statistics_file ); // @@ -1981,8 +1979,8 @@ public class surfacing { domains_per_potein_stats_writer.write( "\t" ); domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.getMax() + "" ); domains_per_potein_stats_writer.write( "\n" ); - domains_per_potein_stats_writer.flush(); domains_per_potein_stats_writer.close(); + printOutPercentageOfMultidomainProteins( all_genomes_domains_per_potein_histo, log_writer ); ForesterUtil.map2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file + "__all_genomes_domains_per_potein_histo.txt" ), all_genomes_domains_per_potein_histo, "\t", "\n" ); ForesterUtil.collection2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file @@ -1991,6 +1989,16 @@ public class surfacing { + "__domains_single_or_combined.txt" ), domains_which_are_sometimes_single_sometimes_not, "\n" ); ForesterUtil.collection2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file + "__domains_always_combined.txt" ), domains_which_never_single, "\n" ); + ForesterUtil.programMessage( PRG_NAME, + "Average of proteins with a least one domain assigned: " + + ( 100 * protein_coverage_stats.arithmeticMean() ) + "% (+/-" + + ( 100 * protein_coverage_stats.sampleStandardDeviation() ) + "%)" ); + ForesterUtil.programMessage( PRG_NAME, "Range of proteins with a least one domain assigned: " + 100 + * protein_coverage_stats.getMin() + "%-" + 100 * protein_coverage_stats.getMax() + "%" ); + log( "Average of prot with a least one dom assigned : " + ( 100 * protein_coverage_stats.arithmeticMean() ) + + "% (+/-" + ( 100 * protein_coverage_stats.sampleStandardDeviation() ) + "%)", log_writer ); + log( "Range of prot with a least one dom assigned : " + 100 * protein_coverage_stats.getMin() + "%-" + + 100 * protein_coverage_stats.getMax() + "%", log_writer ); } catch ( final IOException e2 ) { ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getLocalizedMessage() ); @@ -2005,6 +2013,14 @@ public class surfacing { } } } + try { + per_genome_domain_promiscuity_statistics_writer.close(); + dc_data_writer.close(); + log_writer.close(); + } + catch ( final IOException e2 ) { + ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getLocalizedMessage() ); + } if ( PERFORM_DOMAIN_LENGTH_ANALYSIS ) { try { SurfacingUtil.executeDomainLengthAnalysis( input_file_properties, @@ -2345,6 +2361,17 @@ public class surfacing { System.out.println(); } + private static void printOutPercentageOfMultidomainProteins( final SortedMap all_genomes_domains_per_potein_histo, + final Writer log_writer ) { + int sum = 0; + for( final Entry entry : all_genomes_domains_per_potein_histo.entrySet() ) { + sum += entry.getValue(); + } + final double percentage = 100.0 * ( sum - all_genomes_domains_per_potein_histo.get( 1 ) ) / sum; + ForesterUtil.programMessage( PRG_NAME, "Percentage of multidomain proteins: " + percentage + "%" ); + log( "Percentage of multidomain proteins: : " + percentage + "%", log_writer ); + } + private static void preparePhylogenyForParsimonyAnalyses( final Phylogeny intree, final String[][] input_file_properties ) { final String[] genomes = new String[ input_file_properties.length ]; diff --git a/forester/java/src/org/forester/archaeopteryx/Constants.java b/forester/java/src/org/forester/archaeopteryx/Constants.java index ea932fe..c77853d 100644 --- a/forester/java/src/org/forester/archaeopteryx/Constants.java +++ b/forester/java/src/org/forester/archaeopteryx/Constants.java @@ -41,8 +41,8 @@ public final class Constants { public final static boolean __SNAPSHOT_RELEASE = true; // TODO remove me public final static boolean __SYNTH_LF = false; // TODO remove me public final static String PRG_NAME = "Archaeopteryx"; - final static String VERSION = "0.962 beta 2N"; - final static String PRG_DATE = "2011.09.17"; + final static String VERSION = "0.963 beta BG"; + final static String PRG_DATE = "2011.11.02"; final static String DEFAULT_CONFIGURATION_FILE_NAME = "_aptx_configuration_file"; final static String[] DEFAULT_FONT_CHOICES = { "Verdana", "Tahoma", "Arial", "Helvetica", "Dialog", "Lucida Sans", "SansSerif", "Sans-serif", "Sans" }; diff --git a/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java b/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java index ac6a2bc..186ecc8 100644 --- a/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java +++ b/forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java @@ -482,11 +482,11 @@ public final class HmmscanPerDomainTableParser { _domains_ignored_due_to_duf = domains_ignored_due_to_duf; } - public void setDomainsIgnoredDueToEval( final int domains_ignored_due_to_e_value ) { + private void setDomainsIgnoredDueToEval( final int domains_ignored_due_to_e_value ) { _domains_ignored_due_to_e_value = domains_ignored_due_to_e_value; } - public void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) { + private void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) { _domains_ignored_due_to_individual_score_cutoff = domains_ignored_due_to_individual_score_cutoff; } diff --git a/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlUtil.java b/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlUtil.java index 9cd9bd9..dddecec 100644 --- a/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlUtil.java +++ b/forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlUtil.java @@ -81,6 +81,7 @@ public final class PhyloXmlUtil { TAXONOMY_RANKS_LIST.add( "superorder" ); TAXONOMY_RANKS_LIST.add( "order" ); TAXONOMY_RANKS_LIST.add( "suborder" ); + TAXONOMY_RANKS_LIST.add( "infraorder" ); TAXONOMY_RANKS_LIST.add( "superfamily" ); TAXONOMY_RANKS_LIST.add( "family" ); TAXONOMY_RANKS_LIST.add( "subfamily" ); @@ -132,6 +133,7 @@ public final class PhyloXmlUtil { TAXONOMY_RANKS_SET.add( "superorder" ); TAXONOMY_RANKS_SET.add( "order" ); TAXONOMY_RANKS_SET.add( "suborder" ); + TAXONOMY_RANKS_SET.add( "infraorder" ); TAXONOMY_RANKS_SET.add( "superfamily" ); TAXONOMY_RANKS_SET.add( "family" ); TAXONOMY_RANKS_SET.add( "subfamily" ); diff --git a/forester/java/src/org/forester/surfacing/SurfacingUtil.java b/forester/java/src/org/forester/surfacing/SurfacingUtil.java index e1ee0ab..eef4ff9 100644 --- a/forester/java/src/org/forester/surfacing/SurfacingUtil.java +++ b/forester/java/src/org/forester/surfacing/SurfacingUtil.java @@ -2297,6 +2297,7 @@ public final class SurfacingUtil { final DescriptiveStatistics stats = new BasicDescriptiveStatistics(); for( final Protein protein : protein_list ) { final int domains = protein.getNumberOfProteinDomains(); + //System.out.println( domains ); stats.addValue( domains ); all_genomes_domains_per_potein_stats.addValue( domains ); if ( !all_genomes_domains_per_potein_histo.containsKey( domains ) ) { @@ -2321,6 +2322,7 @@ public final class SurfacingUtil { else if ( domains > 1 ) { for( final Domain d : protein.getProteinDomains() ) { final String domain = d.getDomainId().getId(); + // System.out.println( domain ); if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) { if ( domains_which_are_always_single.contains( domain ) ) { domains_which_are_always_single.remove( domain ); diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index 8b440a3..723ace8 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -3402,11 +3402,11 @@ public final class Test { parser1.parse(); final HmmscanPerDomainTableParser parser2 = new HmmscanPerDomainTableParser( new File( test_dir + ForesterUtil.getFileSeparator() + "hmmscan30b3_output_2" ), "MONBR", INDIVIDUAL_SCORE_CUTOFF.NONE ); - final List domain_collections = parser2.parse(); + final List proteins = parser2.parse(); if ( parser2.getProteinsEncountered() != 4 ) { return false; } - if ( domain_collections.size() != 4 ) { + if ( proteins.size() != 4 ) { return false; } if ( parser2.getDomainsEncountered() != 69 ) { @@ -3418,11 +3418,19 @@ public final class Test { if ( parser2.getDomainsIgnoredDueToEval() != 0 ) { return false; } - final Protein p1 = domain_collections.get( 0 ); + final Protein p1 = proteins.get( 0 ); if ( p1.getNumberOfProteinDomains() != 15 ) { return false; } - final Protein p4 = domain_collections.get( 3 ); + final Protein p2 = proteins.get( 1 ); + if ( p2.getNumberOfProteinDomains() != 51 ) { + return false; + } + final Protein p3 = proteins.get( 2 ); + if ( p3.getNumberOfProteinDomains() != 2 ) { + return false; + } + final Protein p4 = proteins.get( 3 ); if ( p4.getNumberOfProteinDomains() != 1 ) { return false; } diff --git a/forester/java/src/org/forester/util/ForesterUtil.java b/forester/java/src/org/forester/util/ForesterUtil.java index 5af580b..de93bd1 100644 --- a/forester/java/src/org/forester/util/ForesterUtil.java +++ b/forester/java/src/org/forester/util/ForesterUtil.java @@ -573,8 +573,8 @@ public final class ForesterUtil { final Map data, final String entry_separator, final String data_separator ) throws IOException { + boolean first = true; for( final Entry entry : data.entrySet() ) { - boolean first = true; if ( !first ) { writer.write( data_separator ); }