JAL-2844 partitioning code made slightly clearer
[jalview.git] / forester / java / src / org / forester / application / rio.java
index fac270c..24acfe8 100644 (file)
@@ -39,19 +39,14 @@ import org.forester.util.EasyWriter;
 import org.forester.util.ForesterUtil;
 
 public class rio {
+    //
 
-    public final static String PRG_NAME                       = "rio";
-    public final static String PRG_VERSION                    = "5.000";
-    public final static String PRG_DATE                       = "170411";
+    public final static String  PRG_NAME                       = "rio";
+    public final static String  PRG_VERSION                    = "5.900";
+    public final static String  PRG_DATE                       = "170420";
     final static private String E_MAIL                         = "phyloxml@gmail.com";
     final static private String WWW                            = "https://sites.google.com/site/cmzmasek/home/software/forester";
     final static private String HELP_OPTION_1                  = "help";
-    final static private String LOGFILE_SUFFIX                 = "_RIO_log.tsv";
-    final static private String STRIPPED_SPECIES_TREE_SUFFIX   = "_RIO_sst.xml";
-    final static private String ORTHO_OUTTABLE_SUFFIX          = "_RIO_orthologies.tsv";
-    final static private String OUT_MIN_DUP_GENE_TREE_SUFFIX   = "_RIO_gene_tree_min_dup_";
-    final static private String OUT_MED_DUP_GENE_TREE_SUFFIX   = "_RIO_gene_tree_med_dup_";
-    final static private String ORTHOLOG_GROUPS_SUFFIX         = "_RIO_ortholog_groups.tsv";
     final static private String HELP_OPTION_2                  = "h";
     final static private String GT_FIRST                       = "f";
     final static private String GT_LAST                        = "l";
@@ -59,7 +54,14 @@ public class rio {
     final static private String OUTGROUP                       = "o";
     final static private String USE_SDIR                       = "s";
     final static private String GENE_TREES_SUFFIX_OPTION       = "g";
+    final static private String MAPPINGS_DIR_OPTION            = "m";
+    final static private String MAPPINGS_SUFFIX_OPTION         = "ms";
+    final static private String CONSENSUS_TREES_DIR_OPTION     = "co";
+    final static private String CONSENSUS_TREES_SUFFIX_OPTION  = "cos";
+    final static private String MAPPINGS_SUFFIX_DEFAULT        = ".nim";
+    final static private String CONSENSUS_TREE_SUFFIX_DEFAULT  = ".xml";
     final static private String ORTHOLOG_GROUPS_CUTOFF_OPTION  = "c";
+    final static private String GENE_TREES_SUFFIX_DEFAULT      = ".mlt";
     final static private double ORTHOLOG_GROUPS_CUTOFF_DEFAULT = 0.5;
 
     public static void main( final String[] args ) {
@@ -94,6 +96,10 @@ public class rio {
         allowed_options.add( USE_SDIR );
         allowed_options.add( GENE_TREES_SUFFIX_OPTION );
         allowed_options.add( ORTHOLOG_GROUPS_CUTOFF_OPTION );
+        allowed_options.add( MAPPINGS_DIR_OPTION );
+        allowed_options.add( MAPPINGS_SUFFIX_OPTION );
+        allowed_options.add( CONSENSUS_TREES_DIR_OPTION );
+        allowed_options.add( CONSENSUS_TREES_SUFFIX_OPTION );
         final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
         if ( dissallowed_options.length() > 0 ) {
             ForesterUtil.fatalError( "unknown option(s): " + dissallowed_options );
@@ -106,6 +112,9 @@ public class rio {
             if ( !gene_trees_file.exists() ) {
                 ForesterUtil.fatalError( "gene trees directory \"" + gene_trees_file + "\" does not exist" );
             }
+            if ( gene_trees_file.listFiles().length < 1 ) {
+                ForesterUtil.fatalError( "gene trees directory \"" + gene_trees_file + "\" is empty" );
+            }
             use_dir = true;
             indir = gene_trees_file;
         }
@@ -274,7 +283,88 @@ public class rio {
             gene_trees_suffix = cla.getOptionValueAsCleanString( GENE_TREES_SUFFIX_OPTION );
         }
         else {
-            gene_trees_suffix = ".mlt";
+            gene_trees_suffix = GENE_TREES_SUFFIX_DEFAULT;
+        }
+        final boolean perform_id_mapping;
+        final File id_mapping_dir;
+        if ( cla.isOptionSet( MAPPINGS_DIR_OPTION ) ) {
+            id_mapping_dir = new File( cla.getOptionValue( MAPPINGS_DIR_OPTION ) );
+            perform_id_mapping = true;
+            if ( !use_dir ) {
+                ForesterUtil.fatalError( "no id mapping when operating on individual gene trees" );
+            }
+            if ( !id_mapping_dir.exists() ) {
+                ForesterUtil.fatalError( "id mappings directory \"" + id_mapping_dir + "\" does not exist" );
+            }
+            if ( !id_mapping_dir.isDirectory() ) {
+                ForesterUtil.fatalError( "id mappings directory \"" + id_mapping_dir + "\" is not a directory" );
+            }
+            if ( id_mapping_dir.listFiles().length < 1 ) {
+                ForesterUtil.fatalError( "id mappings directory \"" + id_mapping_dir + "\" is empty" );
+            }
+        }
+        else {
+            id_mapping_dir = null;
+            perform_id_mapping = false;
+        }
+        final String id_mapping_suffix;
+        if ( cla.isOptionSet( MAPPINGS_SUFFIX_OPTION ) ) {
+            if ( !use_dir ) {
+                ForesterUtil.fatalError( "no id mapping file suffix option when operating on individual gene trees" );
+            }
+            if ( !perform_id_mapping ) {
+                ForesterUtil.fatalError( "no id mapping directory given" );
+            }
+            if ( !cla.isOptionHasAValue( MAPPINGS_SUFFIX_OPTION ) ) {
+                ForesterUtil.fatalError( "no value for -" + MAPPINGS_SUFFIX_OPTION );
+            }
+            id_mapping_suffix = cla.getOptionValueAsCleanString( MAPPINGS_SUFFIX_OPTION );
+        }
+        else {
+            id_mapping_suffix = MAPPINGS_SUFFIX_DEFAULT;
+        }
+        boolean perform_gsdir_on_best_tree;
+        final File best_trees_indir;
+        if ( cla.isOptionSet( CONSENSUS_TREES_DIR_OPTION ) ) {
+            best_trees_indir = new File( cla.getOptionValue( CONSENSUS_TREES_DIR_OPTION ) );
+            perform_gsdir_on_best_tree = true;
+            if ( !use_dir ) {
+                ForesterUtil
+                        .fatalError( "no consensus (\"best\") gene tree GSDIR analysis when operating on individual gene trees" );
+            }
+            if ( !best_trees_indir.exists() ) {
+                ForesterUtil.fatalError( "consensus (\"best\") gene tree directory \"" + best_trees_indir
+                        + "\" does not exist" );
+            }
+            if ( !best_trees_indir.isDirectory() ) {
+                ForesterUtil.fatalError( "consensus (\"best\") gene tree directory \"" + best_trees_indir
+                        + "\" is not a directory" );
+            }
+            if ( best_trees_indir.listFiles().length < 1 ) {
+                ForesterUtil
+                        .fatalError( "consensus (\"best\") gene tree directory \"" + best_trees_indir + "\" is empty" );
+            }
+        }
+        else {
+            best_trees_indir = null;
+            perform_gsdir_on_best_tree = false;
+        }
+        final String best_trees_suffix;
+        if ( cla.isOptionSet( CONSENSUS_TREES_SUFFIX_OPTION ) ) {
+            if ( !use_dir ) {
+                ForesterUtil
+                        .fatalError( "no consensus (\"best\") gene tree suffix option when operating on individual gene trees" );
+            }
+            if ( !perform_gsdir_on_best_tree ) {
+                ForesterUtil.fatalError( "no consensus (\"best\") gene tree directory given" );
+            }
+            if ( !cla.isOptionHasAValue( CONSENSUS_TREES_SUFFIX_OPTION ) ) {
+                ForesterUtil.fatalError( "no value for -" + CONSENSUS_TREES_SUFFIX_OPTION );
+            }
+            best_trees_suffix = cla.getOptionValueAsCleanString( CONSENSUS_TREES_SUFFIX_OPTION );
+        }
+        else {
+            best_trees_suffix = CONSENSUS_TREE_SUFFIX_DEFAULT;
         }
         ForesterUtil.fatalErrorIfFileNotReadable( species_tree_file );
         if ( !use_dir && orthology_outtable.exists() ) {
@@ -294,6 +384,24 @@ public class rio {
         catch ( final IOException e ) {
             ForesterUtil.fatalError( e.getLocalizedMessage() );
         }
+        if ( perform_id_mapping ) {
+            try {
+                System.out.println( "Id mappings in-dir                  :\t" + id_mapping_dir.getCanonicalPath() );
+            }
+            catch ( IOException e ) {
+                ForesterUtil.fatalError( e.getLocalizedMessage() );
+            }
+            System.out.println( "Id mappings suffix                  :\t" + id_mapping_suffix );
+        }
+        if ( perform_gsdir_on_best_tree ) {
+            try {
+                System.out.println( "Consensus (\"best\") gene trees in-dir:\t" + best_trees_indir.getCanonicalPath() );
+            }
+            catch ( IOException e ) {
+                ForesterUtil.fatalError( e.getLocalizedMessage() );
+            }
+            System.out.println( "Consensus (\"best\") gene trees suffix:\t" + best_trees_suffix );
+        }
         if ( use_dir ) {
             System.out.println( "Out-dir                             :\t" + outdir );
         }
@@ -408,6 +516,26 @@ public class rio {
                 log.print( "\t" );
                 log.print( species_tree_file.getCanonicalPath() );
                 log.println();
+                if ( perform_id_mapping ) {
+                    log.print( "# Id mappings in-dir" );
+                    log.print( "\t" );
+                    log.print( id_mapping_dir.getCanonicalPath() );
+                    log.println();
+                    log.print( "# Id mappings suffix" );
+                    log.print( "\t" );
+                    log.print( id_mapping_suffix );
+                    log.println();
+                }
+                if ( perform_gsdir_on_best_tree ) {
+                    log.print( "# Consensus (\"best\") gene tree dir" );
+                    log.print( "\t" );
+                    log.print( best_trees_indir.getCanonicalPath() );
+                    log.println();
+                    log.print( "# Consensus (\"best\") gene tree suffix" );
+                    log.print( "\t" );
+                    log.print( best_trees_suffix );
+                    log.println();
+                }
                 log.print( "# Out-dir" );
                 log.print( "\t" );
                 log.print( outdir.getCanonicalPath() );
@@ -450,7 +578,7 @@ public class rio {
                 log.print( "\t" );
                 log.print( "EXT NODES" );
                 log.print( "\t" );
-                log.print(  ortholog_group_cutoff + " O GROUPS" );
+                log.print( ortholog_group_cutoff + " O GROUPS" );
                 log.print( "\t" );
                 log.print( "0.05 O GROUPS" );
                 log.print( "\t" );
@@ -462,6 +590,12 @@ public class rio {
                 log.print( "\t" );
                 log.print( "0.95 O GROUPS" );
                 log.print( "\t" );
+                if ( perform_gsdir_on_best_tree ) {
+                    log.print( "BEST TREE DUP" );
+                    log.print( "\t" );
+                    log.print( "MEDIAN DUP - BEST TREE DUP" );
+                    log.print( "\t" );
+                }
                 log.print( "MEDIAN DUP" );
                 log.print( "\t" );
                 log.print( "MEAN DUP" );
@@ -492,25 +626,37 @@ public class rio {
                 }
                 try {
                     RIOUtil.executeAnalysis( gf,
-                                     species_tree_file,
-                                     new File( outdir.getCanonicalFile() + "/" + outname + ORTHO_OUTTABLE_SUFFIX ),
-                                     new File( outdir.getCanonicalFile() + "/" + outname + ORTHOLOG_GROUPS_SUFFIX ),
-                                     new File( outdir.getCanonicalFile() + "/" + outname + LOGFILE_SUFFIX ),
-                                     outgroup,
-                                     rerooting,
-                                     gt_first,
-                                     gt_last,
-                                     new File( outdir.getCanonicalFile() + "/" + outname
-                                             + STRIPPED_SPECIES_TREE_SUFFIX ),
-                                     new File( outdir.getCanonicalFile() + "/" + outname
-                                             + OUT_MIN_DUP_GENE_TREE_SUFFIX ),
-                                     new File( outdir.getCanonicalFile() + "/" + outname
-                                             + OUT_MED_DUP_GENE_TREE_SUFFIX ),
-                                     true,
-                                     algorithm,
-                                     true,
-                                     log,
-                                     ortholog_group_cutoff );
+                                             species_tree_file,
+                                             new File( outdir.getCanonicalFile() + "/" + outname
+                                                     + RIOUtil.ORTHO_OUTTABLE_SUFFIX ),
+                                             new File( outdir.getCanonicalFile() + "/" + outname
+                                                     + RIOUtil.ORTHO_OUTTABLE_WITH_MAP_SUFFIX ),
+                                             new File( outdir.getCanonicalFile() + "/" + outname
+                                                     + RIOUtil.ORTHOLOG_GROUPS_SUFFIX ),
+                                             new File( outdir.getCanonicalFile() + "/" + outname
+                                                     + RIOUtil.LOGFILE_SUFFIX ),
+                                             outgroup,
+                                             rerooting,
+                                             gt_first,
+                                             gt_last,
+                                             new File( outdir.getCanonicalFile() + "/" + outname
+                                                     + RIOUtil.STRIPPED_SPECIES_TREE_SUFFIX ),
+                                             new File( outdir.getCanonicalFile() + "/" + outname
+                                                     + RIOUtil.OUT_MIN_DUP_GENE_TREE_SUFFIX ),
+                                             new File( outdir.getCanonicalFile() + "/" + outname
+                                                     + RIOUtil.OUT_MED_DUP_GENE_TREE_SUFFIX ),
+                                             true,
+                                             algorithm,
+                                             true,
+                                             log,
+                                             ortholog_group_cutoff,
+                                             perform_id_mapping,
+                                             id_mapping_dir,
+                                             id_mapping_suffix,
+                                             perform_gsdir_on_best_tree,
+                                             outdir,
+                                             best_trees_indir,
+                                             best_trees_suffix );
                 }
                 catch ( IOException e ) {
                     ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() );
@@ -521,27 +667,32 @@ public class rio {
             System.out.println();
         }
         else {
-            String outname = orthology_outtable.toString();
-            if ( outname.indexOf( "." ) > 0 ) {
-                outname = outname.substring( 0, outname.lastIndexOf( "." ) );
-            }
+            String outname = ForesterUtil.removeFileExtension( orthology_outtable.toString() );
             RIOUtil.executeAnalysis( gene_trees_file,
-                             species_tree_file,
-                             orthology_outtable,
-                             new File( outname + ORTHOLOG_GROUPS_SUFFIX ),
-                             logfile,
-                             outgroup,
-                             rerooting,
-                             gt_first,
-                             gt_last,
-                             new File( outname + STRIPPED_SPECIES_TREE_SUFFIX ),
-                             new File( outname + OUT_MIN_DUP_GENE_TREE_SUFFIX ),
-                             new File( outname + OUT_MED_DUP_GENE_TREE_SUFFIX ),
-                             algorithm == ALGORITHM.GSDIR,
-                             algorithm,
-                             false,
-                             null,
-                             ortholog_group_cutoff );
+                                     species_tree_file,
+                                     orthology_outtable,
+                                     null,
+                                     new File( outname + RIOUtil.ORTHOLOG_GROUPS_SUFFIX ),
+                                     logfile,
+                                     outgroup,
+                                     rerooting,
+                                     gt_first,
+                                     gt_last,
+                                     new File( outname + RIOUtil.STRIPPED_SPECIES_TREE_SUFFIX ),
+                                     new File( outname + RIOUtil.OUT_MIN_DUP_GENE_TREE_SUFFIX ),
+                                     new File( outname + RIOUtil.OUT_MED_DUP_GENE_TREE_SUFFIX ),
+                                     algorithm == ALGORITHM.GSDIR,
+                                     algorithm,
+                                     false,
+                                     null,
+                                     ortholog_group_cutoff,
+                                     false,
+                                     null,
+                                     null,
+                                     false,
+                                     null,
+                                     null,
+                                     null );
         }
         if ( !use_dir ) {
             time = System.currentTimeMillis() - time;
@@ -583,7 +734,16 @@ public class rio {
                 + "             : to use SDIR instead of GSDIR (faster, but non-binary species trees are" );
         System.out.println( "                   disallowed, as are most options)" );
         System.out.println( "  -" + GENE_TREES_SUFFIX_OPTION
-                + "=<suffix>    : suffix for gene trees when operating on gene tree directories (default: .mlt)" );
+                + "=<suffix>    : suffix for gene trees when operating on gene tree directories (default: "
+                + GENE_TREES_SUFFIX_DEFAULT + ")" );
+        System.out.println( "  -" + MAPPINGS_DIR_OPTION + "=<dir>       : directory for id mapping files" );
+        System.out.println( "  -" + MAPPINGS_SUFFIX_OPTION + "=<suffix>   : suffix for id mapping files (default: "
+                + MAPPINGS_SUFFIX_DEFAULT + ")" );
+        System.out.println( "  -" + CONSENSUS_TREES_DIR_OPTION
+                + "=<dir>      : directory with consensus (\"best\") gene trees to be analyzed with GSDIR" );
+        System.out.println( "  -" + CONSENSUS_TREES_SUFFIX_OPTION
+                + "=<suffix>  : suffix for consensus (\"best\") gene trees (default: " + CONSENSUS_TREE_SUFFIX_DEFAULT
+                + ")" );
         System.out.println();
         System.out.println( " Formats" );
         System.out
@@ -599,6 +759,10 @@ public class rio {
         System.out.println( "  rio gene_trees.nh species.xml outtable.tsv log.txt" );
         System.out.println( "  rio -c=0.9 -f=10 -l=100 -r=none gene_trees.xml species.xml outtable.tsv log.txt" );
         System.out.println( "  rio -g=.xml gene_trees_dir species.xml out_dir log.tsv" );
+        System.out.println( "  rio -g=.mlt -m=id_maps_dir -ms=.nim -c=0.8 gene_trees_dir species.xml out_dir log.tsv" );
+        System.out.println( "  rio -m=id_maps_dir -c=0.8 gene_trees_dir species.xml out_dir log.tsv" );
+        System.out
+                .println( "  rio -m=id_maps_dir -co=consensus_dir -cos=.xml -c=0.8 gene_trees_dir species.xml out_dir log.tsv" );
         System.out.println();
         System.exit( -1 );
     }