inprogress
[jalview.git] / forester / java / src / org / forester / application / surfacing.java
index 66f1b8f..89ce71c 100644 (file)
@@ -178,7 +178,7 @@ public class surfacing {
     final static private String                                     NO_ENGULFING_OVERLAP_OPTION                                                   = "no_eo";
     final static private String                                     IGNORE_COMBINATION_WITH_SAME_OPTION                                           = "ignore_self_comb";
     final static private String                                     PERFORM_DC_REGAIN_PROTEINS_STATS_OPTION                                       = "dc_regain_stats";
-    final static private String                                     DA_ANALYSIS_OPTION                                                            = "DA_analyis";
+    final static private String                                     DA_ANALYSIS_OPTION                                                            = "da_analyis";
     final static private String                                     USE_LAST_IN_FITCH_OPTION                                                      = "last";
     public final static String                                      PAIRWISE_DOMAIN_COMPARISONS_PREFIX                                            = "pwc_";
     final static private String                                     PAIRWISE_DOMAIN_COMPARISONS_OPTION                                            = "pwc";
@@ -215,8 +215,8 @@ public class surfacing {
     final static private String                                     INPUT_GENOMES_FILE_OPTION                                                     = "genomes";
     final static private String                                     INPUT_SPECIES_TREE_OPTION                                                     = "species_tree";
     final static private String                                     SEQ_EXTRACT_OPTION                                                            = "prot_extract";
-    final static private String                                     PRG_VERSION                                                                   = "2.403";
-    final static private String                                     PRG_DATE                                                                      = "131127";
+    final static private String                                     PRG_VERSION                                                                   = "2.404";
+    final static private String                                     PRG_DATE                                                                      = "140319";
     final static private String                                     E_MAIL                                                                        = "czmasek@burnham.org";
     final static private String                                     WWW                                                                           = "https://sites.google.com/site/cmzmasek/home/software/forester/surfacing";
     final static private boolean                                    IGNORE_DUFS_DEFAULT                                                           = true;
@@ -241,7 +241,7 @@ public class surfacing {
     private static final String                                     OUTPUT_DOMAIN_COMBINATIONS_GAINED_MORE_THAN_ONCE_ANALYSIS_SUFFIX              = "_fitch_dc_gains_counts";
     private static final String                                     OUTPUT_DOMAIN_COMBINATIONS_LOST_MORE_THAN_ONCE_ANALYSIS_SUFFIX                = "_fitch_dc_losses_counts";
     private static final String                                     DOMAIN_LENGTHS_ANALYSIS_SUFFIX                                                = "_domain_lengths_analysis";
-    private static final boolean                                    PERFORM_DOMAIN_LENGTH_ANALYSIS                                                = true;
+    private static final String                                     PERFORM_DOMAIN_LENGTH_ANALYSIS_OPTION                                         = "dla";
     public static final String                                      ALL_PFAMS_ENCOUNTERED_SUFFIX                                                  = "_all_encountered_pfams";
     public static final String                                      ALL_PFAMS_ENCOUNTERED_WITH_GO_ANNOTATION_SUFFIX                               = "_all_encountered_pfams_with_go_annotation";
     public static final String                                      ENCOUNTERED_PFAMS_SUMMARY_SUFFIX                                              = "_encountered_pfams_summary";
@@ -345,6 +345,7 @@ public class surfacing {
         allowed_options.add( DA_ANALYSIS_OPTION );
         allowed_options.add( USE_LAST_IN_FITCH_OPTION );
         allowed_options.add( PERFORM_DC_FITCH );
+        allowed_options.add( PERFORM_DOMAIN_LENGTH_ANALYSIS_OPTION );
         boolean ignore_dufs = surfacing.IGNORE_DUFS_DEFAULT;
         boolean ignore_combination_with_same = surfacing.IGNORE_COMBINATION_WITH_SAME_DEFAULLT;
         double fs_e_value_max = surfacing.MAX_E_VALUE_DEFAULT;
@@ -378,9 +379,9 @@ public class surfacing {
         if ( cla.isOptionSet( DOMAIN_COMBINITONS_OUTPUT_OPTION_FOR_GRAPH_ANALYSIS ) ) {
             output_binary_domain_combinationsfor_graph_analysis = true;
         }
-        final boolean output_binary_domain_combinationsfor_counts = false;
+        boolean output_binary_domain_combinationsfor_counts = false;
         if ( cla.isOptionSet( DOMAIN_COMBINITONS_COUNTS_OUTPUT_OPTION ) ) {
-            output_binary_domain_combinationsfor_graph_analysis = true;
+            output_binary_domain_combinationsfor_counts = true;
         }
         if ( cla.isOptionSet( surfacing.MAX_FS_E_VALUE_OPTION ) ) {
             try {
@@ -420,6 +421,10 @@ public class surfacing {
         if ( cla.isOptionSet( surfacing.IGNORE_COMBINATION_WITH_SAME_OPTION ) ) {
             ignore_combination_with_same = true;
         }
+        boolean domain_length_analysis = false;
+        if ( cla.isOptionSet( surfacing.PERFORM_DOMAIN_LENGTH_ANALYSIS_OPTION ) ) {
+            domain_length_analysis = true;
+        }
         boolean ignore_domains_without_combs_in_all_spec = IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_DEFAULT;
         if ( cla.isOptionSet( surfacing.IGNORE_DOMAINS_WITHOUT_COMBINATIONS_IN_ALL_SPECIES_OPTION ) ) {
             ignore_domains_without_combs_in_all_spec = true;
@@ -963,7 +968,7 @@ public class surfacing {
         File[] secondary_features_map_files = null;
         final File domain_lengths_analysis_outfile = new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
                 + DOMAIN_LENGTHS_ANALYSIS_SUFFIX );
-        if ( PERFORM_DOMAIN_LENGTH_ANALYSIS ) {
+        if ( domain_length_analysis ) {
             SurfacingUtil.checkForOutputFileWriteability( domain_lengths_analysis_outfile );
         }
         if ( cla.isOptionSet( surfacing.SECONDARY_FEATURES_PARSIMONY_MAP_FILE ) ) {
@@ -1391,7 +1396,6 @@ public class surfacing {
             all_bin_domain_combinations_gained_fitch = new ArrayList<BinaryDomainCombination>();
             all_bin_domain_combinations_lost_fitch = new ArrayList<BinaryDomainCombination>();
         }
-        DomainLengthsTable domain_lengths_table = new DomainLengthsTable();
         final File per_genome_domain_promiscuity_statistics_file = new File( out_dir + ForesterUtil.FILE_SEPARATOR
                 + output_file + D_PROMISCUITY_FILE_SUFFIX );
         BufferedWriter per_genome_domain_promiscuity_statistics_writer = null;
@@ -1427,8 +1431,8 @@ public class surfacing {
         catch ( final IOException e2 ) {
             ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getMessage() );
         }
-        final DescriptiveStatistics protein_coverage_stats = new BasicDescriptiveStatistics();
-        final DescriptiveStatistics all_genomes_domains_per_potein_stats = new BasicDescriptiveStatistics();
+        DescriptiveStatistics protein_coverage_stats = new BasicDescriptiveStatistics();
+        DescriptiveStatistics all_genomes_domains_per_potein_stats = new BasicDescriptiveStatistics();
         final SortedMap<Integer, Integer> all_genomes_domains_per_potein_histo = new TreeMap<Integer, Integer>();
         final SortedSet<String> domains_which_are_always_single = new TreeSet<String>();
         final SortedSet<String> domains_which_are_sometimes_single_sometimes_not = new TreeSet<String>();
@@ -1462,6 +1466,10 @@ public class surfacing {
             protein_length_stats_by_dc = new HashMap<String, DescriptiveStatistics>();
             domain_number_stats_by_dc = new HashMap<String, DescriptiveStatistics>();
         }
+        DomainLengthsTable domain_lengths_table = null;
+        if ( domain_length_analysis ) {
+            domain_lengths_table = new DomainLengthsTable();
+        }
         // Main loop:
         final SortedMap<String, Set<String>> distinct_domain_architecutures_per_genome = new TreeMap<String, Set<String>>();
         final SortedMap<String, Integer> distinct_domain_architecuture_counts = new TreeMap<String, Integer>();
@@ -1648,7 +1656,9 @@ public class surfacing {
                                                         domains_which_are_sometimes_single_sometimes_not,
                                                         domains_which_never_single,
                                                         domains_per_potein_stats_writer );
-            domain_lengths_table.addLengths( protein_list );
+            if ( domain_length_analysis ) {
+                domain_lengths_table.addLengths( protein_list );
+            }
             if ( !da_analysis ) {
                 gwcd_list.add( BasicGenomeWideCombinableDomains
                         .createInstance( protein_list,
@@ -1728,8 +1738,10 @@ public class surfacing {
             domains_per_potein_stats_writer.write( "\t" );
             domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.sampleStandardDeviation() + "" );
             domains_per_potein_stats_writer.write( "\t" );
-            domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.median() + "" );
-            domains_per_potein_stats_writer.write( "\t" );
+            if ( all_genomes_domains_per_potein_stats.getN() <= 300 ) {
+                domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.median() + "" );
+                domains_per_potein_stats_writer.write( "\t" );
+            }
             domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.getN() + "" );
             domains_per_potein_stats_writer.write( "\t" );
             domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.getMin() + "" );
@@ -1737,6 +1749,7 @@ public class surfacing {
             domains_per_potein_stats_writer.write( all_genomes_domains_per_potein_stats.getMax() + "" );
             domains_per_potein_stats_writer.write( "\n" );
             domains_per_potein_stats_writer.close();
+            all_genomes_domains_per_potein_stats = null;
             SurfacingUtil.printOutPercentageOfMultidomainProteins( all_genomes_domains_per_potein_histo, log_writer );
             ForesterUtil.map2file( new File( out_dir + ForesterUtil.FILE_SEPARATOR + output_file
                     + "_all_genomes_domains_per_potein_histo.txt" ), all_genomes_domains_per_potein_histo, "\t", "\n" );
@@ -1761,6 +1774,7 @@ public class surfacing {
                                        + ( 100 * protein_coverage_stats.getMin() ) + "%-"
                                        + ( 100 * protein_coverage_stats.getMax() ) + "%",
                                log_writer );
+            protein_coverage_stats = null;
         }
         catch ( final IOException e2 ) {
             ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getLocalizedMessage() );
@@ -1783,7 +1797,7 @@ public class surfacing {
         catch ( final IOException e2 ) {
             ForesterUtil.fatalError( surfacing.PRG_NAME, e2.getLocalizedMessage() );
         }
-        if ( PERFORM_DOMAIN_LENGTH_ANALYSIS ) {
+        if ( domain_length_analysis ) {
             try {
                 SurfacingUtil.executeDomainLengthAnalysis( input_file_properties,
                                                            number_of_genomes,
@@ -1803,7 +1817,8 @@ public class surfacing {
         final DomainSimilarityCalculator calc = new BasicDomainSimilarityCalculator( domain_similarity_sort_field,
                                                                                      sort_by_species_count_first,
                                                                                      number_of_genomes == 2,
-                                                                                     CALC_SIMILARITY_SCORES );
+                                                                                     CALC_SIMILARITY_SCORES,
+                                                                                     true );
         switch ( scoring ) {
             case COMBINATIONS:
                 pw_calc = new CombinationsBasedPairwiseDomainSimilarityCalculator();
@@ -2047,7 +2062,8 @@ public class surfacing {
             SurfacingUtil.writeProteinListsForAllSpecies( out_dir,
                                                           protein_lists_per_species,
                                                           gwcd_list,
-                                                          output_list_of_all_proteins_per_domain_e_value_max );
+                                                          output_list_of_all_proteins_per_domain_e_value_max,
+                                                          positive_filter_file != null ? filter : null );
         }
         gwcd_list = null;
         if ( all_bin_domain_combinations_gained_fitch != null ) {
@@ -2177,18 +2193,12 @@ public class surfacing {
         System.out.println( surfacing.WRITE_TO_NEXUS_OPTION + ": to output in Nexus format" );
         System.out.println( PERFORM_DC_FITCH + ": to perform DC Fitch parsimony" );
         System.out.println( PERFORM_DC_REGAIN_PROTEINS_STATS_OPTION + ": to perform DC regain protein statistics" );
-        System.out.println( DA_ANALYSIS_OPTION + ": to do DA analysis" );
+        System.out.println( DA_ANALYSIS_OPTION + ": to perform DA analysis" );
+        System.out.println( PERFORM_DOMAIN_LENGTH_ANALYSIS_OPTION + ": to perform domain length analysis" );
         System.out.println();
-        System.out.println( "Example 1: java -Xms128m -Xmx512m -cp path/to/forester.jar"
-                + " org.forester.application.surfacing p2g=pfam2go_2012_02_07.txt -dufs -cos=Pfam_260_NC1"
-                + " -no_eo -mo=0 -genomes=eukaryotes.txt -out_dir=out -o=o "
-                + " -species_tree=tol.xml -obo=gene_ontology_2012_02_07.obo -pos_filter=f.txt -all_prot" );
         System.out.println();
-        System.out.println( "Example 2: java -Xms128m -Xmx512m -cp path/to/forester.jar"
-                + " org.forester.application.surfacing -detail=punctilious -o=TEST.html -pwc=TEST"
-                + " -cos=Pfam_ls_22_TC2 -p2g=pfam2go -obo=gene_ontology_edit.obo "
-                + "-dc_sort=dom -ignore_with_self -no_singles -ie=0.001 -mo=1 -no_eo -genomes=eukaryotes.txt "
-                + "-ds_output=detailed_html -scoring=domains -sort=alpha " );
+        System.out
+                .println( "Example: surfacing -p2g=pfam2go_130621.txt -obo=gene_ontology_130621.obo -species_tree=tol_156.xml -last -detail=punctilious -ignore_viral_ids -no_eo -ie=0.1 -dufs -genomes=genomes_all.txt -pos_filter=tf_1.txt -all_prot -all_prot_e=0.1 -out_dir=_tf1_e01_ape01 -o=tf1_e01_ape01" );
         System.out.println();
     }
 }