in progress (not working)
author cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 23 Apr 2014 01:32:25 +0000 (01:32 +0000)
committer cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 23 Apr 2014 01:32:25 +0000 (01:32 +0000)
forester/java/src/org/forester/application/msa_compactor.java

index 4c50028..91a706f 100644 (file)
@@ -46,6 +46,14 @@ public class msa_compactor {
     final static private String STEP_OPTION                            = "s";
     final static private String LENGTH_OPTION                          = "l";
     final static private String REALIGN_OPTION                         = "a";
+    //
+    final static private String STEP_FOR_DIAGNOSTICS_OPTION            = "sd";
+    final static private String MIN_LENGTH_OPTION                      = "ml";
+    final static private String GAP_RATIO_LENGTH_OPTION                = "gr";
+    final static private String REPORT_ALN_MEAN_IDENTITY               = "q";
+    final static private String OUTPUT_FORMAT_PHYLIP_OPTION            = "f";
+    final static private String OUTPUT_REMOVED_SEQS_OPTION             = "ro";
+    //
     final static private String PATH_TO_MAFFT_OPTION                   = "mafft";
     final static private String DO_NOT_NORMALIZE_FOR_EFF_LENGTH_OPTION = "nn";
     final static private String PRG_NAME                               = "msa_compactor";
@@ -75,6 +83,53 @@ public class msa_compactor {
             boolean realign = false;
             boolean norm = true;
             String path_to_mafft = null;
+            // Optional settings; -1 / false / null are placeholders meaning "option not given".
+            // step_for_diagnostics: step for diagnostics reports (STEP_FOR_DIAGNOSTICS_OPTION)
+            // min_length: minimal effective sequence length (MIN_LENGTH_OPTION)
+            // gap_ratio: maximal allowed gap ratio per column, 0.0-1.0 (GAP_RATIO_LENGTH_OPTION)
+            // report_aln_mean_identity, output_format: flags without a value (FASTA unless overridden)
+            // removed_seqs_out_base: file for the removed sequences (OUTPUT_REMOVED_SEQS_OPTION)
+            int step_for_diagnostics = -1;
+            int min_length = -1;
+            double gap_ratio = -1;
+            boolean report_aln_mean_identity = false;
+            Format output_format = FASTA;
+            File removed_seqs_out_base = null;
+            //
+            if ( cla.isOptionSet( STEP_FOR_DIAGNOSTICS_OPTION ) ) {
+                step_for_diagnostics = cla.getOptionValueAsInt( STEP_FOR_DIAGNOSTICS_OPTION );
+                if ( ( step_for_diagnostics < 1 )
+                        || ( ( step_for_diagnostics > msa.getNumberOfSequences() ) || ( ( worst_remove > 0 ) && ( step_for_diagnostics > worst_remove ) ) ) ) {
+                    ForesterUtil.fatalError( PRG_NAME, "value for diagnostic step is out of range: "
+                            + step_for_diagnostics );
+                }
+            }
+            if ( cla.isOptionSet( MIN_LENGTH_OPTION ) ) {
+                min_length = cla.getOptionValueAsInt( MIN_LENGTH_OPTION );
+                if ( ( min_length < 1 ) || ( min_length > longest_msa_seq ) ) {
+                    ForesterUtil.fatalError( PRG_NAME, "value for minimal sequence length is out of range: "
+                            + min_length );
+                }
+            }
+            if ( cla.isOptionSet( GAP_RATIO_LENGTH_OPTION ) ) {
+                gap_ratio = cla.getOptionValueAsDouble( GAP_RATIO_LENGTH_OPTION );
+                if ( ( gap_ratio < 0 ) || ( gap_ratio > 1 ) ) {
+                    ForesterUtil.fatalError( PRG_NAME, "gap ratio is out of range: " + gap_ratio );
+                }
+            }
+            if ( cla.isOptionSet( REPORT_ALN_MEAN_IDENTITY ) ) {
+                report_aln_mean_identity = true;
+            }
+            if ( cla.isOptionSet( OUTPUT_FORMAT_PHYLIP_OPTION ) ) {
+                output_format = PHYLIP;
+            }
+            if ( cla.isOptionSet( OUTPUT_REMOVED_SEQS_OPTION ) ) {
+                // This option carries a file name, not a number: keep it as a File and
+                // leave gap_ratio (set only by GAP_RATIO_LENGTH_OPTION) untouched.
+                final String removed_seqs_out = cla.getOptionValueAsCleanString( OUTPUT_REMOVED_SEQS_OPTION );
+                removed_seqs_out_base = new File( removed_seqs_out );
+            }
+            //
             final List<String> allowed_options = new ArrayList<String>();
             allowed_options.add( REMOVE_WORST_OFFENDERS_OPTION );
             allowed_options.add( AV_GAPINESS_OPTION );
@@ -83,6 +138,12 @@ public class msa_compactor {
             allowed_options.add( DO_NOT_NORMALIZE_FOR_EFF_LENGTH_OPTION );
             allowed_options.add( STEP_OPTION );
             allowed_options.add( PATH_TO_MAFFT_OPTION );
+            allowed_options.add( STEP_FOR_DIAGNOSTICS_OPTION );
+            allowed_options.add( MIN_LENGTH_OPTION );
+            allowed_options.add( GAP_RATIO_LENGTH_OPTION );
+            allowed_options.add( REPORT_ALN_MEAN_IDENTITY );
+            allowed_options.add( OUTPUT_FORMAT_PHYLIP_OPTION );
+            allowed_options.add( OUTPUT_REMOVED_SEQS_OPTION );
             final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
             if ( dissallowed_options.length() > 0 ) {
                 ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options );
@@ -205,11 +266,22 @@ public class msa_compactor {
         System.out.println( " options: " );
         System.out.println();
         System.out.println( "   -" + REMOVE_WORST_OFFENDERS_OPTION
-                + "=<integer>  number of worst offender sequences to remove" );
-        System.out.println( "   -" + LENGTH_OPTION + "=<integer>  target MSA length" );
-        System.out.println( "   -" + AV_GAPINESS_OPTION + "=<decimal>  target gap-ratio (0.0-1.0)" );
-        System.out.println( "   -" + STEP_OPTION + "=<integer>  step (for output and re-aligning)" );
-        System.out.println( "   -" + REALIGN_OPTION + "            to realign using MAFFT" + mafft_comment );
+                + "=<integer>   number of worst offender sequences to remove" );
+        System.out.println( "   -" + LENGTH_OPTION + "=<integer>   target MSA length" );
+        System.out.println( "   -" + AV_GAPINESS_OPTION + "=<decimal>   target gap-ratio (0.0-1.0)" );
+        System.out.println( "   -" + STEP_OPTION + "=<integer>   step for output and re-aligning (default: 1)" );
+        System.out.println( "   -" + REALIGN_OPTION + "             to realign using MAFFT" + mafft_comment );
+        System.out.println( "   -" + STEP_FOR_DIAGNOSTICS_OPTION
+                + "=<integer>  step for diagnostics reports (default: 1)" );
+        System.out.println( "   -" + MIN_LENGTH_OPTION
+                + "=<integer>  minimal effective sequence length (for deleting of shorter sequences)" );
+        System.out.println( "   -" + GAP_RATIO_LENGTH_OPTION
+                + "=<decimal>  maximal allowed gap ratio per column (for deleting of columns) (0.0-1.0)" );
+        System.out.println( "   -" + REPORT_ALN_MEAN_IDENTITY
+                + "             to report mean identity diagnostic (not recommended for very large alignments)" );
+        System.out.println( "   -" + OUTPUT_FORMAT_PHYLIP_OPTION
+                + "             to write output alignments in phylip format instead of fasta" );
+        System.out.println( "   -" + OUTPUT_REMOVED_SEQS_OPTION + "=<file>     to output the removed sequences" );
         System.out.println();
         System.out.println();
         System.out.println();