in progress
authorcmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 2 Nov 2011 20:24:45 +0000 (20:24 +0000)
committercmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 2 Nov 2011 20:24:45 +0000 (20:24 +0000)
forester/java/src/org/forester/application/surfacing.java
forester/java/src/org/forester/archaeopteryx/Constants.java
forester/java/src/org/forester/io/parsers/HmmscanPerDomainTableParser.java
forester/java/src/org/forester/io/parsers/phyloxml/PhyloXmlUtil.java
forester/java/src/org/forester/surfacing/SurfacingUtil.java
forester/java/src/org/forester/test/Test.java
forester/java/src/org/forester/util/ForesterUtil.java

index b795c3e..12274f7 100644 (file)
@@ -36,6 +36,7 @@ import java.util.Date;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
 import java.util.Set;
 import java.util.SortedMap;
 import java.util.SortedSet;
 import java.util.Set;
 import java.util.SortedMap;
 import java.util.SortedSet;
@@ -1721,6 +1722,7 @@ public class surfacing {
         catch ( final IOException e2 ) {
             ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getMessage() );
         }
         catch ( final IOException e2 ) {
             ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getMessage() );
         }
+        final DescriptiveStatistics protein_coverage_stats = new BasicDescriptiveStatistics();
         final DescriptiveStatistics all_genomes_domains_per_potein_stats = new BasicDescriptiveStatistics();
         final SortedMap<Integer, Integer> all_genomes_domains_per_potein_histo = new TreeMap<Integer, Integer>();
         final SortedSet<String> domains_which_are_always_single = new TreeSet<String>();
         final DescriptiveStatistics all_genomes_domains_per_potein_stats = new BasicDescriptiveStatistics();
         final SortedMap<Integer, Integer> all_genomes_domains_per_potein_histo = new TreeMap<Integer, Integer>();
         final SortedSet<String> domains_which_are_always_single = new TreeSet<String>();
@@ -1748,6 +1750,7 @@ public class surfacing {
         catch ( final IOException e3 ) {
             e3.printStackTrace();
         }
         catch ( final IOException e3 ) {
             e3.printStackTrace();
         }
+        // Main loop:
         for( int i = 0; i < number_of_genomes; ++i ) {
             System.out.println();
             System.out.println( ( i + 1 ) + "/" + number_of_genomes );
         for( int i = 0; i < number_of_genomes; ++i ) {
             System.out.println();
             System.out.println( ( i + 1 ) + "/" + number_of_genomes );
@@ -1811,10 +1814,16 @@ public class surfacing {
                 System.out.println( "Domains ignored due to virus like id: " );
                 ForesterUtil.printCountingMap( parser.getDomainsIgnoredDueToVirusLikeIdCountsMap() );
             }
                 System.out.println( "Domains ignored due to virus like id: " );
                 ForesterUtil.printCountingMap( parser.getDomainsIgnoredDueToVirusLikeIdCountsMap() );
             }
+            final double coverage = ( double ) protein_list.size() / parser.getProteinsEncountered();
+            protein_coverage_stats.addValue( coverage );
             System.out.println( "Number of proteins encountered                 : " + parser.getProteinsEncountered() );
             log( "Number of proteins encountered                 : " + parser.getProteinsEncountered(), log_writer );
             System.out.println( "Number of proteins stored                      : " + protein_list.size() );
             log( "Number of proteins stored                      : " + protein_list.size(), log_writer );
             System.out.println( "Number of proteins encountered                 : " + parser.getProteinsEncountered() );
             log( "Number of proteins encountered                 : " + parser.getProteinsEncountered(), log_writer );
             System.out.println( "Number of proteins stored                      : " + protein_list.size() );
             log( "Number of proteins stored                      : " + protein_list.size(), log_writer );
+            System.out.println( "Coverage                                       : "
+                    + ForesterUtil.roundToInt( 100.0 * coverage ) + "%" );
+            log( "Coverage                                       : " + ForesterUtil.roundToInt( 100.0 * coverage )
+                    + "%", log_writer );
             System.out.println( "Domains encountered                            : " + parser.getDomainsEncountered() );
             log( "Domains encountered                            : " + parser.getDomainsEncountered(), log_writer );
             System.out.println( "Domains stored                                 : " + parser.getDomainsStored() );
             System.out.println( "Domains encountered                            : " + parser.getDomainsEncountered() );
             log( "Domains encountered                            : " + parser.getDomainsEncountered(), log_writer );
             System.out.println( "Domains stored                                 : " + parser.getDomainsStored() );
@@ -1894,7 +1903,7 @@ public class surfacing {
             catch ( final IOException e ) {
                 ForesterUtil.fatalError( surfacing.PRG_NAME, e.toString() );
             }
             catch ( final IOException e ) {
                 ForesterUtil.fatalError( surfacing.PRG_NAME, e.toString() );
             }
-            SurfacingUtil.domainsPerProteinsStatistics( input_file_properties[ i ][ 0 ],
+            SurfacingUtil.domainsPerProteinsStatistics( input_file_properties[ i ][ 1 ],
                                                         protein_list,
                                                         all_genomes_domains_per_potein_stats,
                                                         all_genomes_domains_per_potein_histo,
                                                         protein_list,
                                                         all_genomes_domains_per_potein_stats,
                                                         all_genomes_domains_per_potein_histo,
@@ -1952,17 +1961,6 @@ public class surfacing {
             }
             System.gc();
         } // for( int i = 0; i < number_of_genomes; ++i ) {
             }
             System.gc();
         } // for( int i = 0; i < number_of_genomes; ++i ) {
-        try {
-            per_genome_domain_promiscuity_statistics_writer.flush();
-            per_genome_domain_promiscuity_statistics_writer.close();
-            dc_data_writer.flush();
-            dc_data_writer.close();
-            log_writer.flush();
-            log_writer.close();
-        }
-        catch ( final IOException e2 ) {
-            ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getLocalizedMessage() );
-        }
         ForesterUtil.programMessage( PRG_NAME, "Wrote domain promiscuities to: "
                 + per_genome_domain_promiscuity_statistics_file );
         //
         ForesterUtil.programMessage( PRG_NAME, "Wrote domain promiscuities to: "
                 + per_genome_domain_promiscuity_statistics_file );
         //
@@ -1981,8 +1979,8 @@ public class surfacing {
             domains_per_potein_stats_writer.write( "\t" );
             domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.getMax() + "" );
             domains_per_potein_stats_writer.write( "\n" );
             domains_per_potein_stats_writer.write( "\t" );
             domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.getMax() + "" );
             domains_per_potein_stats_writer.write( "\n" );
-            domains_per_potein_stats_writer.flush();
             domains_per_potein_stats_writer.close();
             domains_per_potein_stats_writer.close();
+            printOutPercentageOfMultidomainProteins( all_genomes_domains_per_potein_histo, log_writer );
             ForesterUtil.map2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
                     + "__all_genomes_domains_per_potein_histo.txt" ), all_genomes_domains_per_potein_histo, "\t", "\n" );
             ForesterUtil.collection2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
             ForesterUtil.map2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
                     + "__all_genomes_domains_per_potein_histo.txt" ), all_genomes_domains_per_potein_histo, "\t", "\n" );
             ForesterUtil.collection2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
@@ -1991,6 +1989,16 @@ public class surfacing {
                     + "__domains_single_or_combined.txt" ), domains_which_are_sometimes_single_sometimes_not, "\n" );
             ForesterUtil.collection2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
                     + "__domains_always_combined.txt" ), domains_which_never_single, "\n" );
                     + "__domains_single_or_combined.txt" ), domains_which_are_sometimes_single_sometimes_not, "\n" );
             ForesterUtil.collection2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
                     + "__domains_always_combined.txt" ), domains_which_never_single, "\n" );
+            ForesterUtil.programMessage( PRG_NAME,
+                                         "Average of proteins with a least one domain assigned: "
+                                                 + ( 100 * protein_coverage_stats.arithmeticMean() ) + "% (+/-"
+                                                 + ( 100 * protein_coverage_stats.sampleStandardDeviation() ) + "%)" );
+            ForesterUtil.programMessage( PRG_NAME, "Range of proteins with a least one domain assigned: " + 100
+                    * protein_coverage_stats.getMin() + "%-" + 100 * protein_coverage_stats.getMax() + "%" );
+            log( "Average of prot with a least one dom assigned  : " + ( 100 * protein_coverage_stats.arithmeticMean() )
+                    + "% (+/-" + ( 100 * protein_coverage_stats.sampleStandardDeviation() ) + "%)", log_writer );
+            log( "Range of prot with a least one dom assigned    : " + 100 * protein_coverage_stats.getMin() + "%-"
+                    + 100 * protein_coverage_stats.getMax() + "%", log_writer );
         }
         catch ( final IOException e2 ) {
             ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getLocalizedMessage() );
         }
         catch ( final IOException e2 ) {
             ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getLocalizedMessage() );
@@ -2005,6 +2013,14 @@ public class surfacing {
                 }
             }
         }
                 }
             }
         }
+        try {
+            per_genome_domain_promiscuity_statistics_writer.close();
+            dc_data_writer.close();
+            log_writer.close();
+        }
+        catch ( final IOException e2 ) {
+            ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getLocalizedMessage() );
+        }
         if ( PERFORM_DOMAIN_LENGTH_ANALYSIS ) {
             try {
                 SurfacingUtil.executeDomainLengthAnalysis( input_file_properties,
         if ( PERFORM_DOMAIN_LENGTH_ANALYSIS ) {
             try {
                 SurfacingUtil.executeDomainLengthAnalysis( input_file_properties,
@@ -2345,6 +2361,17 @@ public class surfacing {
         System.out.println();
     }
 
         System.out.println();
     }
 
+    private static void printOutPercentageOfMultidomainProteins( final SortedMap<Integer, Integer> all_genomes_domains_per_potein_histo,
+                                                                 final Writer log_writer ) {
+        int sum = 0;
+        for( final Entry<Integer, Integer> entry : all_genomes_domains_per_potein_histo.entrySet() ) {
+            sum += entry.getValue();
+        }
+        final double percentage = 100.0 * ( sum - all_genomes_domains_per_potein_histo.get( 1 ) ) / sum;
+        ForesterUtil.programMessage( PRG_NAME, "Percentage of multidomain proteins: " + percentage + "%" );
+        log( "Percentage of multidomain proteins:            : " + percentage + "%", log_writer );
+    }
+
     private static void preparePhylogenyForParsimonyAnalyses( final Phylogeny intree,
                                                               final String[][] input_file_properties ) {
         final String[] genomes = new String[ input_file_properties.length ];
     private static void preparePhylogenyForParsimonyAnalyses( final Phylogeny intree,
                                                               final String[][] input_file_properties ) {
         final String[] genomes = new String[ input_file_properties.length ];
index ea932fe..c77853d 100644 (file)
@@ -41,8 +41,8 @@ public final class Constants {
     public final static boolean __SNAPSHOT_RELEASE                                            = true;                                                     // TODO remove me
     public final static boolean __SYNTH_LF                                                    = false;                                                    // TODO remove me
     public final static String  PRG_NAME                                                      = "Archaeopteryx";
     public final static boolean __SNAPSHOT_RELEASE                                            = true;                                                     // TODO remove me
     public final static boolean __SYNTH_LF                                                    = false;                                                    // TODO remove me
     public final static String  PRG_NAME                                                      = "Archaeopteryx";
-    final static String         VERSION                                                       = "0.962 beta 2N";
-    final static String         PRG_DATE                                                      = "2011.09.17";
+    final static String         VERSION                                                       = "0.963 beta BG";
+    final static String         PRG_DATE                                                      = "2011.11.02";
     final static String         DEFAULT_CONFIGURATION_FILE_NAME                               = "_aptx_configuration_file";
     final static String[]       DEFAULT_FONT_CHOICES                                          = { "Verdana", "Tahoma",
             "Arial", "Helvetica", "Dialog", "Lucida Sans", "SansSerif", "Sans-serif", "Sans" };
     final static String         DEFAULT_CONFIGURATION_FILE_NAME                               = "_aptx_configuration_file";
     final static String[]       DEFAULT_FONT_CHOICES                                          = { "Verdana", "Tahoma",
             "Arial", "Helvetica", "Dialog", "Lucida Sans", "SansSerif", "Sans-serif", "Sans" };
index ac6a2bc..186ecc8 100644 (file)
@@ -482,11 +482,11 @@ public final class HmmscanPerDomainTableParser {
         _domains_ignored_due_to_duf = domains_ignored_due_to_duf;
     }
 
         _domains_ignored_due_to_duf = domains_ignored_due_to_duf;
     }
 
-    public void setDomainsIgnoredDueToEval( final int domains_ignored_due_to_e_value ) {
+    private void setDomainsIgnoredDueToEval( final int domains_ignored_due_to_e_value ) {
         _domains_ignored_due_to_e_value = domains_ignored_due_to_e_value;
     }
 
         _domains_ignored_due_to_e_value = domains_ignored_due_to_e_value;
     }
 
-    public void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) {
+    private void setDomainsIgnoredDueToIndividualScoreCutoff( final int domains_ignored_due_to_individual_score_cutoff ) {
         _domains_ignored_due_to_individual_score_cutoff = domains_ignored_due_to_individual_score_cutoff;
     }
 
         _domains_ignored_due_to_individual_score_cutoff = domains_ignored_due_to_individual_score_cutoff;
     }
 
index 9cd9bd9..dddecec 100644 (file)
@@ -81,6 +81,7 @@ public final class PhyloXmlUtil {
         TAXONOMY_RANKS_LIST.add( "superorder" );
         TAXONOMY_RANKS_LIST.add( "order" );
         TAXONOMY_RANKS_LIST.add( "suborder" );
         TAXONOMY_RANKS_LIST.add( "superorder" );
         TAXONOMY_RANKS_LIST.add( "order" );
         TAXONOMY_RANKS_LIST.add( "suborder" );
+        TAXONOMY_RANKS_LIST.add( "infraorder" );
         TAXONOMY_RANKS_LIST.add( "superfamily" );
         TAXONOMY_RANKS_LIST.add( "family" );
         TAXONOMY_RANKS_LIST.add( "subfamily" );
         TAXONOMY_RANKS_LIST.add( "superfamily" );
         TAXONOMY_RANKS_LIST.add( "family" );
         TAXONOMY_RANKS_LIST.add( "subfamily" );
@@ -132,6 +133,7 @@ public final class PhyloXmlUtil {
         TAXONOMY_RANKS_SET.add( "superorder" );
         TAXONOMY_RANKS_SET.add( "order" );
         TAXONOMY_RANKS_SET.add( "suborder" );
         TAXONOMY_RANKS_SET.add( "superorder" );
         TAXONOMY_RANKS_SET.add( "order" );
         TAXONOMY_RANKS_SET.add( "suborder" );
+        TAXONOMY_RANKS_SET.add( "infraorder" );
         TAXONOMY_RANKS_SET.add( "superfamily" );
         TAXONOMY_RANKS_SET.add( "family" );
         TAXONOMY_RANKS_SET.add( "subfamily" );
         TAXONOMY_RANKS_SET.add( "superfamily" );
         TAXONOMY_RANKS_SET.add( "family" );
         TAXONOMY_RANKS_SET.add( "subfamily" );
index e1ee0ab..eef4ff9 100644 (file)
@@ -2297,6 +2297,7 @@ public final class SurfacingUtil {
         final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
         for( final Protein protein : protein_list ) {
             final int domains = protein.getNumberOfProteinDomains();
         final DescriptiveStatistics stats = new BasicDescriptiveStatistics();
         for( final Protein protein : protein_list ) {
             final int domains = protein.getNumberOfProteinDomains();
+            //System.out.println( domains );
             stats.addValue( domains );
             all_genomes_domains_per_potein_stats.addValue( domains );
             if ( !all_genomes_domains_per_potein_histo.containsKey( domains ) ) {
             stats.addValue( domains );
             all_genomes_domains_per_potein_stats.addValue( domains );
             if ( !all_genomes_domains_per_potein_histo.containsKey( domains ) ) {
@@ -2321,6 +2322,7 @@ public final class SurfacingUtil {
             else if ( domains > 1 ) {
                 for( final Domain d : protein.getProteinDomains() ) {
                     final String domain = d.getDomainId().getId();
             else if ( domains > 1 ) {
                 for( final Domain d : protein.getProteinDomains() ) {
                     final String domain = d.getDomainId().getId();
+                    // System.out.println( domain );
                     if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) {
                         if ( domains_which_are_always_single.contains( domain ) ) {
                             domains_which_are_always_single.remove( domain );
                     if ( !domains_which_are_sometimes_single_sometimes_not.contains( domain ) ) {
                         if ( domains_which_are_always_single.contains( domain ) ) {
                             domains_which_are_always_single.remove( domain );
index 8b440a3..723ace8 100644 (file)
@@ -3402,11 +3402,11 @@ public final class Test {
             parser1.parse();
             final HmmscanPerDomainTableParser parser2 = new HmmscanPerDomainTableParser( new File( test_dir
                     + ForesterUtil.getFileSeparator() + "hmmscan30b3_output_2" ), "MONBR", INDIVIDUAL_SCORE_CUTOFF.NONE );
             parser1.parse();
             final HmmscanPerDomainTableParser parser2 = new HmmscanPerDomainTableParser( new File( test_dir
                     + ForesterUtil.getFileSeparator() + "hmmscan30b3_output_2" ), "MONBR", INDIVIDUAL_SCORE_CUTOFF.NONE );
-            final List<Protein> domain_collections = parser2.parse();
+            final List<Protein> proteins = parser2.parse();
             if ( parser2.getProteinsEncountered() != 4 ) {
                 return false;
             }
             if ( parser2.getProteinsEncountered() != 4 ) {
                 return false;
             }
-            if ( domain_collections.size() != 4 ) {
+            if ( proteins.size() != 4 ) {
                 return false;
             }
             if ( parser2.getDomainsEncountered() != 69 ) {
                 return false;
             }
             if ( parser2.getDomainsEncountered() != 69 ) {
@@ -3418,11 +3418,19 @@ public final class Test {
             if ( parser2.getDomainsIgnoredDueToEval() != 0 ) {
                 return false;
             }
             if ( parser2.getDomainsIgnoredDueToEval() != 0 ) {
                 return false;
             }
-            final Protein p1 = domain_collections.get( 0 );
+            final Protein p1 = proteins.get( 0 );
             if ( p1.getNumberOfProteinDomains() != 15 ) {
                 return false;
             }
             if ( p1.getNumberOfProteinDomains() != 15 ) {
                 return false;
             }
-            final Protein p4 = domain_collections.get( 3 );
+            final Protein p2 = proteins.get( 1 );
+            if ( p2.getNumberOfProteinDomains() != 51 ) {
+                return false;
+            }
+            final Protein p3 = proteins.get( 2 );
+            if ( p3.getNumberOfProteinDomains() != 2 ) {
+                return false;
+            }
+            final Protein p4 = proteins.get( 3 );
             if ( p4.getNumberOfProteinDomains() != 1 ) {
                 return false;
             }
             if ( p4.getNumberOfProteinDomains() != 1 ) {
                 return false;
             }
index 5af580b..de93bd1 100644 (file)
@@ -573,8 +573,8 @@ public final class ForesterUtil {
                                          final Map<?, ?> data,
                                          final String entry_separator,
                                          final String data_separator ) throws IOException {
                                          final Map<?, ?> data,
                                          final String entry_separator,
                                          final String data_separator ) throws IOException {
+        boolean first = true;
         for( final Entry<?, ?> entry : data.entrySet() ) {
         for( final Entry<?, ?> entry : data.entrySet() ) {
-            boolean first = true;
             if ( !first ) {
                 writer.write( data_separator );
             }
             if ( !first ) {
                 writer.write( data_separator );
             }