in progress...
author cmzmasek <chris.zma@outlook.com>
Tue, 25 Apr 2017 01:01:53 +0000 (18:01 -0700)
committer cmzmasek <chris.zma@outlook.com>
Tue, 25 Apr 2017 01:01:53 +0000 (18:01 -0700)
forester/java/src/org/forester/application/rio.java
forester/java/src/org/forester/rio/RIOUtil.java

index aa50639..d915234 100644 (file)
@@ -47,13 +47,6 @@ public class rio {
     final static private String E_MAIL                         = "phyloxml@gmail.com";
     final static private String WWW                            = "https://sites.google.com/site/cmzmasek/home/software/forester";
     final static private String HELP_OPTION_1                  = "help";
-    final static private String LOGFILE_SUFFIX                 = "_RIO_log.tsv";
-    final static private String STRIPPED_SPECIES_TREE_SUFFIX   = "_RIO_sst.xml";
-    final static private String ORTHO_OUTTABLE_SUFFIX          = "_RIO_orthologies.tsv";
-    final static private String ORTHO_OUTTABLE_WITH_MAP_SUFFIX = "_RIO_orthologies_ext_map.tsv";
-    final static private String OUT_MIN_DUP_GENE_TREE_SUFFIX   = "_RIO_gene_tree_min_dup_";
-    final static private String OUT_MED_DUP_GENE_TREE_SUFFIX   = "_RIO_gene_tree_med_dup_";
-    final static private String ORTHOLOG_GROUPS_SUFFIX         = "_RIO_ortholog_groups.tsv";
     final static private String HELP_OPTION_2                  = "h";
     final static private String GT_FIRST                       = "f";
     final static private String GT_LAST                        = "l";
@@ -63,7 +56,10 @@ public class rio {
     final static private String GENE_TREES_SUFFIX_OPTION       = "g";
     final static private String MAPPINGS_DIR_OPTION            = "m";
     final static private String MAPPINGS_SUFFIX_OPTION         = "ms";
+    final static private String CONSENSUS_TREES_DIR_OPTION     = "co";
+    final static private String CONSENSUS_TREES_SUFFIX_OPTION  = "cos";
     final static private String MAPPINGS_SUFFIX_DEFAULT        = ".nim";
+    final static private String CONSENSUS_TREE_SUFFIX_DEFAULT  = ".xml";
     final static private String ORTHOLOG_GROUPS_CUTOFF_OPTION  = "c";
     final static private String GENE_TREES_SUFFIX_DEFAULT      = ".mlt";
     final static private double ORTHOLOG_GROUPS_CUTOFF_DEFAULT = 0.5;
@@ -102,6 +98,8 @@ public class rio {
         allowed_options.add( ORTHOLOG_GROUPS_CUTOFF_OPTION );
         allowed_options.add( MAPPINGS_DIR_OPTION );
         allowed_options.add( MAPPINGS_SUFFIX_OPTION );
+        allowed_options.add( CONSENSUS_TREES_DIR_OPTION );
+        allowed_options.add( CONSENSUS_TREES_SUFFIX_OPTION );
         final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
         if ( dissallowed_options.length() > 0 ) {
             ForesterUtil.fatalError( "unknown option(s): " + dissallowed_options );
@@ -325,6 +323,50 @@ public class rio {
         else {
             id_mapping_suffix = MAPPINGS_SUFFIX_DEFAULT;
         }
+        boolean perform_gsdir_on_best_tree;
+        final File best_trees_indir;
+        if ( cla.isOptionSet( CONSENSUS_TREES_DIR_OPTION ) ) {
+            best_trees_indir = new File( cla.getOptionValue( CONSENSUS_TREES_DIR_OPTION ) );
+            perform_gsdir_on_best_tree = true;
+            if ( !use_dir ) {
+                ForesterUtil
+                        .fatalError( "no consensus (\"best\") gene tree GSDIR analysis when operating on individual gene trees" );
+            }
+            if ( !best_trees_indir.exists() ) {
+                ForesterUtil.fatalError( "consensus (\"best\") gene tree directory \"" + best_trees_indir
+                        + "\" does not exist" );
+            }
+            if ( !best_trees_indir.isDirectory() ) {
+                ForesterUtil.fatalError( "consensus (\"best\") gene tree directory \"" + best_trees_indir
+                        + "\" is not a directory" );
+            }
+            if ( best_trees_indir.listFiles().length < 1 ) {
+                ForesterUtil
+                        .fatalError( "consensus (\"best\") gene tree directory \"" + best_trees_indir + "\" is empty" );
+            }
+        }
+        else {
+            best_trees_indir = null;
+            perform_gsdir_on_best_tree = false;
+        }
+        final String best_trees_suffix;
+        if ( cla.isOptionSet( CONSENSUS_TREES_SUFFIX_OPTION ) ) {
+            if ( !use_dir ) {
+                ForesterUtil
+                        .fatalError( "no consensus (\"best\") gene tree suffix option when operating on individual gene trees" );
+            }
+            if ( !perform_gsdir_on_best_tree ) {
+                ForesterUtil.fatalError( "no consensus (\"best\") gene tree directory given" );
+            }
+            if ( !cla.isOptionHasAValue( CONSENSUS_TREES_SUFFIX_OPTION ) ) {
+                ForesterUtil.fatalError( "no value for -" + CONSENSUS_TREES_SUFFIX_OPTION );
+            }
+            best_trees_suffix = cla.getOptionValueAsCleanString( CONSENSUS_TREES_SUFFIX_OPTION );
+        }
+        else {
+            best_trees_suffix = CONSENSUS_TREE_SUFFIX_DEFAULT;
+        }
+        ////////////////////////////////
         ForesterUtil.fatalErrorIfFileNotReadable( species_tree_file );
         if ( !use_dir && orthology_outtable.exists() ) {
             ForesterUtil.fatalError( "\"" + orthology_outtable + "\" already exists" );
@@ -352,6 +394,15 @@ public class rio {
             }
             System.out.println( "Id mappings suffix                  :\t" + id_mapping_suffix );
         }
+        if ( perform_gsdir_on_best_tree ) {
+            try {
+                System.out.println( "Consensus (\"best\") gene tree dir    :\t" + best_trees_indir.getCanonicalPath() );
+            }
+            catch ( IOException e ) {
+                ForesterUtil.fatalError( e.getLocalizedMessage() );
+            }
+            System.out.println( "Consensus (\"best\") gene tree suffix :\t" + best_trees_suffix );
+        }
         if ( use_dir ) {
             System.out.println( "Out-dir                             :\t" + outdir );
         }
@@ -520,9 +571,11 @@ public class rio {
                 log.print( "\t" );
                 log.print( "0.95 O GROUPS" );
                 log.print( "\t" );
-                if ( true ) { //TODO
+                if ( perform_gsdir_on_best_tree ) {
                     log.print( "BEST TREE DUP" );
                     log.print( "\t" );
+                    log.print( "MEDIAN DUP - BEST TREE DUP" );
+                    log.print( "\t" );
                 }
                 log.print( "MEDIAN DUP" );
                 log.print( "\t" );
@@ -556,22 +609,23 @@ public class rio {
                     RIOUtil.executeAnalysis( gf,
                                              species_tree_file,
                                              new File( outdir.getCanonicalFile() + "/" + outname
-                                                     + ORTHO_OUTTABLE_SUFFIX ),
+                                                     + RIOUtil.ORTHO_OUTTABLE_SUFFIX ),
+                                             new File( outdir.getCanonicalFile() + "/" + outname
+                                                     + RIOUtil.ORTHO_OUTTABLE_WITH_MAP_SUFFIX ),
                                              new File( outdir.getCanonicalFile() + "/" + outname
-                                                     + ORTHO_OUTTABLE_WITH_MAP_SUFFIX ),
+                                                     + RIOUtil.ORTHOLOG_GROUPS_SUFFIX ),
                                              new File( outdir.getCanonicalFile() + "/" + outname
-                                                     + ORTHOLOG_GROUPS_SUFFIX ),
-                                             new File( outdir.getCanonicalFile() + "/" + outname + LOGFILE_SUFFIX ),
+                                                     + RIOUtil.LOGFILE_SUFFIX ),
                                              outgroup,
                                              rerooting,
                                              gt_first,
                                              gt_last,
                                              new File( outdir.getCanonicalFile() + "/" + outname
-                                                     + STRIPPED_SPECIES_TREE_SUFFIX ),
+                                                     + RIOUtil.STRIPPED_SPECIES_TREE_SUFFIX ),
                                              new File( outdir.getCanonicalFile() + "/" + outname
-                                                     + OUT_MIN_DUP_GENE_TREE_SUFFIX ),
+                                                     + RIOUtil.OUT_MIN_DUP_GENE_TREE_SUFFIX ),
                                              new File( outdir.getCanonicalFile() + "/" + outname
-                                                     + OUT_MED_DUP_GENE_TREE_SUFFIX ),
+                                                     + RIOUtil.OUT_MED_DUP_GENE_TREE_SUFFIX ),
                                              true,
                                              algorithm,
                                              true,
@@ -579,7 +633,11 @@ public class rio {
                                              ortholog_group_cutoff,
                                              perform_id_mapping,
                                              id_mapping_dir,
-                                             id_mapping_suffix );
+                                             id_mapping_suffix,
+                                             perform_gsdir_on_best_tree,
+                                             outdir,
+                                             best_trees_indir,
+                                             best_trees_suffix );
                 }
                 catch ( IOException e ) {
                     ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() );
@@ -590,23 +648,20 @@ public class rio {
             System.out.println();
         }
         else {
-            String outname = orthology_outtable.toString();
-            if ( outname.indexOf( "." ) > 0 ) {
-                outname = outname.substring( 0, outname.lastIndexOf( "." ) );
-            }
+            String outname = ForesterUtil.removeFileExtension( orthology_outtable.toString() );
             RIOUtil.executeAnalysis( gene_trees_file,
                                      species_tree_file,
                                      orthology_outtable,
                                      null,
-                                     new File( outname + ORTHOLOG_GROUPS_SUFFIX ),
+                                     new File( outname + RIOUtil.ORTHOLOG_GROUPS_SUFFIX ),
                                      logfile,
                                      outgroup,
                                      rerooting,
                                      gt_first,
                                      gt_last,
-                                     new File( outname + STRIPPED_SPECIES_TREE_SUFFIX ),
-                                     new File( outname + OUT_MIN_DUP_GENE_TREE_SUFFIX ),
-                                     new File( outname + OUT_MED_DUP_GENE_TREE_SUFFIX ),
+                                     new File( outname + RIOUtil.STRIPPED_SPECIES_TREE_SUFFIX ),
+                                     new File( outname + RIOUtil.OUT_MIN_DUP_GENE_TREE_SUFFIX ),
+                                     new File( outname + RIOUtil.OUT_MED_DUP_GENE_TREE_SUFFIX ),
                                      algorithm == ALGORITHM.GSDIR,
                                      algorithm,
                                      false,
@@ -614,6 +669,10 @@ public class rio {
                                      ortholog_group_cutoff,
                                      false,
                                      null,
+                                     null,
+                                     false,
+                                     null,
+                                     null,
                                      null );
         }
         if ( !use_dir ) {
@@ -661,6 +720,12 @@ public class rio {
         System.out.println( "  -" + MAPPINGS_DIR_OPTION + "=<dir>       : directory for id mapping files" );
         System.out.println( "  -" + MAPPINGS_SUFFIX_OPTION + "=<suffix>   : suffix for id mapping files (default: "
                 + MAPPINGS_SUFFIX_DEFAULT + ")" );
+        System.out.println( "  -" + CONSENSUS_TREES_DIR_OPTION
+                + "=<dir>      : directory with consenus (\"best\") gene trees to be analyzed with GSDIR" );
+        System.out.println( "  -" + CONSENSUS_TREES_SUFFIX_OPTION
+                + "=<suffix>  : suffix for consenus (\"best\") gene trees (default: " + CONSENSUS_TREE_SUFFIX_DEFAULT
+                + ")" );
+        ///
         System.out.println();
         System.out.println( " Formats" );
         System.out
@@ -678,6 +743,8 @@ public class rio {
         System.out.println( "  rio -g=.xml gene_trees_dir species.xml out_dir log.tsv" );
         System.out.println( "  rio -g=.mlt -m=id_maps_dir -ms=.nim -c=0.8 gene_trees_dir species.xml out_dir log.tsv" );
         System.out.println( "  rio -m=id_maps_dir -c=0.8 gene_trees_dir species.xml out_dir log.tsv" );
+        System.out
+                .println( "  rio -m=id_maps_dir -co=consensus_dir -cos=.xml -c=0.8 gene_trees_dir species.xml out_dir log.tsv" );
         System.out.println();
         System.exit( -1 );
     }
index 03d2796..89b753c 100644 (file)
@@ -2,18 +2,15 @@
 package org.forester.rio;
 
 import java.io.File;
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.math.RoundingMode;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.Map;
 import java.util.SortedMap;
 import java.util.SortedSet;
 import java.util.TreeSet;
 
-import javax.swing.JOptionPane;
-
-import org.forester.archaeopteryx.AptxUtil;
 import org.forester.datastructures.IntMatrix;
 import org.forester.io.parsers.IteratingPhylogenyParser;
 import org.forester.io.parsers.PhylogenyParser;
@@ -45,6 +42,15 @@ import org.forester.util.ForesterUtil;
 
 public final class RIOUtil {
 
+    public final static String STRIPPED_SPECIES_TREE_SUFFIX   = "_RIO_stripped_species_tree.xml";
+    public final static String ORTHO_OUTTABLE_SUFFIX          = "_RIO_orthologies.tsv";
+    public final static String ORTHO_OUTTABLE_WITH_MAP_SUFFIX = "_RIO_orthologies_ext_map.tsv";
+    public final static String OUT_MIN_DUP_GENE_TREE_SUFFIX   = "_RIO_gene_tree_min_dup_";
+    public final static String OUT_MED_DUP_GENE_TREE_SUFFIX   = "_RIO_gene_tree_med_dup_";
+    public final static String BEST_TREE_SUFFIX               = "_RIO_consensus_gene_tree_dup_";
+    public final static String ORTHOLOG_GROUPS_SUFFIX         = "_RIO_ortholog_groups.tsv";
+    public final static String LOGFILE_SUFFIX                 = "_RIO_log.tsv";
+
     public static final void executeAnalysis( final File gene_trees_file,
                                               final File species_tree_file,
                                               final File orthology_outtable,
@@ -65,7 +71,11 @@ public final class RIOUtil {
                                               final double ortholog_group_cutoff,
                                               final boolean perform_id_mapping,
                                               final File id_mapping_dir,
-                                              final String id_mapping_suffix ) {
+                                              final String id_mapping_suffix,
+                                              final boolean perform_gsdir_on_best_tree,
+                                              final File outdir,
+                                              final File best_trees_indir,
+                                              final String best_trees_suffix ) {
         try {
             final SortedMap<String, String> id_map;
             if ( perform_id_mapping ) {
@@ -131,53 +141,18 @@ public final class RIOUtil {
             else {
                 m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true );
             }
-            ////////////////////////////////////////////
-            ////////////////////////////////////////////
-            //TODO
-            final boolean perform_gsdir_on_best_tree = true;
-            final File best_trees_dir = new File( "best_trees" );
-            final String best_trees_suffix = ".xml";
             final GSDIR gsdir_for_best_tree;
             if ( perform_gsdir_on_best_tree ) {
-                final Phylogeny best_tree = obtainTree( best_trees_dir, gene_trees_file.getName(), best_trees_suffix );
-                final Phylogeny species_tree = SDIutil
-                        .parseSpeciesTree( best_tree, species_tree_file, false, true, TAXONOMY_EXTRACTION.NO );
-                PhylogenyMethods.deleteInternalNodesWithOnlyOneDescendent( species_tree );
-                best_tree.setRooted( true );
-                species_tree.setRooted( true );
-                if ( !best_tree.isCompletelyBinaryAllow3ChildrenAtRoot() ) {
-                    throw new IOException( "gene tree matching to ["
-                            + ForesterUtil.removeFileExtension( gene_trees_file.getName() )
-                            + "] is not completely binary" );
-                }
-                final PhylogenyNodeIterator it = best_tree.iteratorExternalForward();
-                while ( it.hasNext() ) {
-                    final PhylogenyNode n = it.next();
-                    final String name = n.getName().trim();
-                    if ( !ForesterUtil.isEmpty( name ) ) {
-                        try {
-                            ParserUtils.extractTaxonomyDataFromNodeName( n, TAXONOMY_EXTRACTION.AGGRESSIVE );
-                        }
-                        catch ( final PhyloXmlDataFormatException e ) {
-                            // Ignore.
-                        }
-                    }
-                }
-                gsdir_for_best_tree = new GSDIR( best_tree, species_tree, true, true, true );
-                final Phylogeny result_gene_tree = gsdir_for_best_tree.getMinDuplicationsSumGeneTree();
-                System.out.println( gsdir_for_best_tree.getMinDuplicationsSum() );
-                result_gene_tree.setRerootable( false );
-                PhylogenyMethods.orderAppearance( result_gene_tree.getRoot(),
-                                                  true,
-                                                  true,
-                                                  DESCENDANT_SORT_PRIORITY.NODE_NAME );
-                writeTree( result_gene_tree, new File( gene_trees_file.getName() + "____.xml" ), null, id_map );
+                gsdir_for_best_tree = analyzeConsensusTree( gene_trees_file,
+                                                            species_tree_file,
+                                                            outdir,
+                                                            best_trees_indir,
+                                                            id_map,
+                                                            best_trees_suffix );
             }
             else {
                 gsdir_for_best_tree = null;
             }
-            ////////////////////////////////////////////
-            ////////////////////////////////////////////
             final BasicDescriptiveStatistics stats = rio.getDuplicationsStatistics();
             if ( perform_id_mapping ) {
                 writeOrthologyTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir, id_map, true );
@@ -278,6 +253,13 @@ public final class RIOUtil {
                 log.print( "\t" );
                 log.print( Integer.toString( ortholog_groups_095 ) );
                 //
+                if ( true ) {
+                    log.print( "\t" );
+                    log.print( Integer.toString( gsdir_for_best_tree.getMinDuplicationsSum() ) );
+                    log.print( "\t" );
+                    log.print( df.format( median - gsdir_for_best_tree.getMinDuplicationsSum() ) );
+                }
+                //
                 log.print( "\t" );
                 if ( stats.getN() > 3 ) {
                     log.print( df.format( median ) );
@@ -350,6 +332,51 @@ public final class RIOUtil {
         }
     }
 
+    private final static GSDIR analyzeConsensusTree( final File gene_trees_file, // Runs a GSDIR analysis on the consensus ("best") tree belonging to gene_trees_file, writes the resulting gene tree into outdir and returns the GSDIR object.
+                                                     final File species_tree_file,
+                                                     final File outdir,
+                                                     final File best_trees_indir,
+                                                     final SortedMap<String, String> id_map,
+                                                     final String best_trees_suffix )
+            throws IOException, FileNotFoundException, PhyloXmlDataFormatException, SDIException {
+        final File the_one = ForesterUtil.getMatchingFile( best_trees_indir, // presumably the single file in best_trees_indir that matches the gene trees file name and the suffix -- confirm in ForesterUtil
+                                                           gene_trees_file.getName(),
+                                                           best_trees_suffix );
+        final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
+        final Phylogeny best_tree = factory.create( the_one, PhyloXmlParser.createPhyloXmlParserXsdValidating() )[ 0 ]; // only the first phylogeny parsed from the file is used
+        final Phylogeny species_tree = SDIutil
+                .parseSpeciesTree( best_tree, species_tree_file, false, true, TAXONOMY_EXTRACTION.NO );
+        PhylogenyMethods.deleteInternalNodesWithOnlyOneDescendent( species_tree );
+        best_tree.setRooted( true );
+        species_tree.setRooted( true );
+        if ( !best_tree.isCompletelyBinaryAllow3ChildrenAtRoot() ) { // reject consensus trees that are not binary (going by the method name, three children at the root are tolerated)
+            throw new IOException( "gene tree matching to ["
+                    + ForesterUtil.removeFileExtension( gene_trees_file.getName() ) + "] is not completely binary" );
+        }
+        final PhylogenyNodeIterator it = best_tree.iteratorExternalForward();
+        while ( it.hasNext() ) {
+            final PhylogenyNode n = it.next();
+            final String name = n.getName().trim();
+            if ( !ForesterUtil.isEmpty( name ) ) { // only external nodes with a non-empty name are processed
+                try {
+                    ParserUtils.extractTaxonomyDataFromNodeName( n, TAXONOMY_EXTRACTION.AGGRESSIVE );
+                }
+                catch ( final PhyloXmlDataFormatException e ) {
+                    // Ignore: extraction is best effort, a failure on one node does not abort the loop.
+                }
+            }
+        }
+        final GSDIR gsdir_for_best_tree = new GSDIR( best_tree, species_tree, true, true, true ); // NOTE(review): the meaning of the three boolean flags is not visible here -- see the GSDIR constructor
+        final Phylogeny result_gene_tree = gsdir_for_best_tree.getMinDuplicationsSumGeneTree();
+        result_gene_tree.setRerootable( false );
+        PhylogenyMethods.orderAppearance( result_gene_tree.getRoot(), true, true, DESCENDANT_SORT_PRIORITY.NODE_NAME );
+        final String outname = ForesterUtil.removeFileExtension( the_one.getName() ); // output name is derived from the consensus tree file, not from the gene trees file
+        final File outfile = new File( outdir.getCanonicalFile() + "/" + outname + RIOUtil.BEST_TREE_SUFFIX // file name: <outname> + BEST_TREE_SUFFIX + <minimal duplications sum> + ".xml"
+                + gsdir_for_best_tree.getMinDuplicationsSum() + ".xml" );
+        writeTree( result_gene_tree, outfile, null, id_map );
+        return gsdir_for_best_tree;
+    }
+
     private static final void writeOrthologyTable( final File table_outfile,
                                                    final int gene_trees_analyzed,
                                                    final IntMatrix m,
@@ -589,11 +616,4 @@ public final class RIOUtil {
         final BasicTable<String> t = BasicTableParser.parse( the_one, '\t' );
         return t.getColumnsAsMap( 0, 1 );
     }
-
-    private final static Phylogeny obtainTree( final File dir, final String prefix, final String suffix )
-            throws IOException {
-        final File the_one = ForesterUtil.getMatchingFile( dir, prefix, suffix );
-        final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
-        return factory.create( the_one, PhyloXmlParser.createPhyloXmlParserXsdValidating() )[ 0 ];
-    }
 }