"rio" work
authorcmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 12 Dec 2012 04:45:47 +0000 (04:45 +0000)
committercmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Wed, 12 Dec 2012 04:45:47 +0000 (04:45 +0000)
forester/java/src/org/forester/application/rio.java
forester/java/src/org/forester/io/parsers/util/ParserUtils.java
forester/java/src/org/forester/rio/RIO.java
forester/java/src/org/forester/sdi/GSDIR.java

index 397329c..5bf670b 100644 (file)
@@ -31,10 +31,14 @@ import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.SortedSet;
+import java.util.TreeSet;
 
 import org.forester.datastructures.IntMatrix;
 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
 import org.forester.phylogeny.Phylogeny;
+import org.forester.phylogeny.PhylogenyNode;
+import org.forester.phylogeny.data.Taxonomy;
 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
 import org.forester.phylogeny.factories.PhylogenyFactory;
 import org.forester.rio.RIO;
@@ -48,8 +52,8 @@ import org.forester.util.ForesterUtil;
 public class rio {
 
     final static private String PRG_NAME      = "rio";
-    final static private String PRG_VERSION   = "3.00 beta 4";
-    final static private String PRG_DATE      = "2012.12.10";
+    final static private String PRG_VERSION   = "4.000 beta 1";
+    final static private String PRG_DATE      = "2012.12.11";
     final static private String E_MAIL        = "czmasek@burnham.org";
     final static private String WWW           = "www.phylosoft.org/forester/";
     final static private String HELP_OPTION_1 = "help";
@@ -74,7 +78,7 @@ public class rio {
         if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) || ( args.length == 0 ) ) {
             printHelp();
         }
-        if ( ( args.length < 2 ) || ( args.length > 10 ) ) {
+        if ( ( args.length < 3 ) || ( args.length > 5 ) ) {
             System.out.println();
             System.out.println( "[" + PRG_NAME + "] incorrect number of arguments" );
             System.out.println();
@@ -104,18 +108,23 @@ public class rio {
         boolean sdir = false;
         if ( cla.isOptionSet( USE_SDIR ) ) {
             sdir = true;
+            if ( logfile != null ) {
+                ForesterUtil.fatalError( PRG_NAME, "logfile output only for GSDIR algorithm" );
+            }
         }
         long time = 0;
         System.out.println( "Gene trees                : " + gene_trees_file );
         System.out.println( "Species tree              : " + species_tree_file );
         System.out.println( "All vs all orthology table: " + othology_outtable );
         if ( !sdir ) {
+            if ( logfile != null ) {
+                System.out.println( "Logfile                   : " + logfile );
+            }
             System.out.println( "Non binary species tree   : allowed (GSDIR algorithm)" );
         }
         else {
             System.out.println( "Non binary species tree   : disallowed (SDIR algorithm)" );
         }
-        System.out.println();
         time = System.currentTimeMillis();
         Phylogeny species_tree = null;
         try {
@@ -137,8 +146,14 @@ public class rio {
             algorithm = ALGORITHM.GSDIR;
         }
         try {
-            final RIO rio = new RIO( gene_trees_file, species_tree, algorithm );
+            final RIO rio = new RIO( gene_trees_file, species_tree, algorithm, logfile != null );
+            if ( algorithm == ALGORITHM.GSDIR ) {
+                System.out.println( "Taxonomy linking based on : " + rio.getGSDIRtaxCompBase() );
+            }
             tableOutput( othology_outtable, rio );
+            if ( ( algorithm == ALGORITHM.GSDIR ) && ( logfile != null ) ) {
+                writeLogFile( logfile, rio );
+            }
         }
         catch ( final RIOException e ) {
             ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() );
@@ -152,15 +167,43 @@ public class rio {
         catch ( final Exception e ) {
             ForesterUtil.unexpectedFatalError( PRG_NAME, e );
         }
-        if ( othology_outtable != null ) {
-            ForesterUtil.programMessage( PRG_NAME, "wrote results to \"" + othology_outtable + "\"" );
-        }
         time = System.currentTimeMillis() - time;
         ForesterUtil.programMessage( PRG_NAME, "time: " + time + "ms" );
         ForesterUtil.programMessage( PRG_NAME, "OK" );
         System.exit( 0 );
     }
 
+    private static void writeLogFile( final File logfile, final RIO rio ) throws IOException {
+        final EasyWriter out = ForesterUtil.createEasyWriter( logfile );
+        out.println( "Species stripped from gene trees:" );
+        final SortedSet<String> rn = new TreeSet<String>();
+        for( final PhylogenyNode n : rio.getRemovedGeneTreeNodes() ) {
+            final Taxonomy t = n.getNodeData().getTaxonomy();
+            switch ( rio.getGSDIRtaxCompBase() ) {
+                case CODE: {
+                    rn.add( t.getTaxonomyCode() );
+                    break;
+                }
+                case ID: {
+                    rn.add( t.getIdentifier().toString() );
+                    break;
+                }
+                case SCIENTIFIC_NAME: {
+                    rn.add( t.getScientificName() );
+                    break;
+                }
+            }
+        }
+        for( final String s : rn ) {
+            out.println( s );
+        }
+        out.println();
+        out.println( "Some information about duplication numbers in gene trees:" );
+        out.println( rio.getLog().toString() );
+        out.close();
+        ForesterUtil.programMessage( PRG_NAME, "wrote log to \"" + logfile + "\"" );
+    }
+
     private static void tableOutput( final File table_outfile, final RIO rio ) throws IOException, RIOException {
         final IntMatrix m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true );
         writeTable( table_outfile, rio, m );
@@ -204,7 +247,7 @@ public class rio {
         System.out.println();
         System.out.println( " Options" );
         System.out.println( "  -" + USE_SDIR
-                + "  : to use SDIR instead of GSDIR (faster, but non-binary species trees are disallowed)" );
+                + " : to use SDIR instead of GSDIR (faster, but non-binary species trees are disallowed)" );
         System.out.println();
         System.out.println( " Formats" );
         System.out.println( "  The species tree is expected to be in phyloXML format." );
index 35ec60c..4baf7ce 100644 (file)
@@ -51,9 +51,10 @@ import org.forester.util.ForesterUtil;
 
 public final class ParserUtils {
 
-    final private static Pattern TAXOMONY_CODE_PATTERN_1  = Pattern.compile( "[A-Z0-9]{5}|RAT|PIG|PEA" );
-    final private static Pattern TAXOMONY_CODE_PATTERN_2  = Pattern.compile( "([A-Z0-9]{5}|RAT|PIG|PEA)[^A-Za-z].*" );
-    final private static Pattern TAXOMONY_CODE_PATTERN_PF = Pattern.compile( "([A-Z0-9]{5}|RAT|PIG|PEA)/\\d+-\\d+" );
+    final private static Pattern TAXOMONY_CODE_PATTERN_1  = Pattern.compile( "[A-Z0-9]{5}|RAT|PIG|PEA|CAP" );
+    final private static Pattern TAXOMONY_CODE_PATTERN_2  = Pattern
+                                                                  .compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)[^A-Za-z].*" );
+    final private static Pattern TAXOMONY_CODE_PATTERN_PF = Pattern.compile( "([A-Z0-9]{5}|RAT|PIG|PEA|CAP)/\\d+-\\d+" );
 
     final public static PhylogenyParser createParserDependingFileContents( final File file,
                                                                            final boolean phyloxml_validate_against_xsd )
index 2bcd5e9..fd8ae3f 100644 (file)
@@ -51,6 +51,7 @@ import org.forester.sdi.SDI.ALGORITHM;
 import org.forester.sdi.SDI.TaxonomyComparisonBase;
 import org.forester.sdi.SDIException;
 import org.forester.sdi.SDIR;
+import org.forester.util.BasicDescriptiveStatistics;
 import org.forester.util.ForesterUtil;
 
 public final class RIO {
@@ -62,13 +63,27 @@ public final class RIO {
     private int                    _samples;
     private int                    _ext_nodes;
     private TaxonomyComparisonBase _gsdir_tax_comp_base;
+    private StringBuilder          _log;
+    private boolean                _produce_log;
 
-    public RIO( final File gene_trees_file, final Phylogeny species_tree, final ALGORITHM algorithm )
-            throws IOException, SDIException, RIOException {
-        init();
+    public RIO( final File gene_trees_file,
+                final Phylogeny species_tree,
+                final ALGORITHM algorithm,
+                final boolean produce_log ) throws IOException, SDIException, RIOException {
+        init( produce_log );
         inferOrthologs( gene_trees_file, species_tree, algorithm );
     }
 
+    private final void init( final boolean produce_log ) {
+        _produce_log = produce_log;
+        _samples = -1;
+        _ext_nodes = -1;
+        _log = null;
+        _gsdir_tax_comp_base = null;
+        _analyzed_gene_trees = null;
+        _removed_gene_tree_nodes = null;
+    }
+
     public final Phylogeny[] getAnalyzedGeneTrees() {
         return _analyzed_gene_trees;
     }
@@ -106,15 +121,32 @@ public final class RIO {
             nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.YES );
         }
         final Phylogeny[] gene_trees = factory.create( gene_trees_file, p );
-        // Removes from species_tree all species not found in gene_tree.
-        final List<PhylogenyNode> _removed_species_tree_ext_nodes = PhylogenyMethods
-                .taxonomyBasedDeletionOfExternalNodes( gene_trees[ 0 ], species_tree );
-        if ( species_tree.isEmpty() ) {
-            throw new RIOException( "failed to establish species based mapping between gene and species trees" );
+        if ( algorithm == ALGORITHM.SDIR ) {
+            // Removes from species_tree all species not found in gene_tree.
+            PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gene_trees[ 0 ], species_tree );
+            if ( species_tree.isEmpty() ) {
+                throw new RIOException( "failed to establish species based mapping between gene and species trees" );
+            }
+        }
+        if ( _produce_log ) {
+            _log = new StringBuilder();
         }
         _analyzed_gene_trees = new Phylogeny[ gene_trees.length ];
         int i = 0;
         int gene_tree_ext_nodes = 0;
+        if ( _produce_log ) {
+            _log.append( "#" );
+            _log.append( "\t" );
+            _log.append( "with minimal number of duplications" );
+            _log.append( "/" );
+            _log.append( "root placements" );
+            _log.append( "\t[" );
+            _log.append( "min" );
+            _log.append( "-" );
+            _log.append( "max" );
+            _log.append( "]" );
+            _log.append( ForesterUtil.LINE_SEPARATOR );
+        }
         for( final Phylogeny gt : gene_trees ) {
             if ( algorithm == ALGORITHM.SDIR ) {
                 // Removes from gene_tree all species not found in species_tree.
@@ -137,11 +169,6 @@ public final class RIO {
         setNumberOfSamples( gene_trees.length );
     }
 
-    private final void init() {
-        _samples = 1;
-        _ext_nodes = 0;
-    }
-
     private final Phylogeny performOrthologInference( final Phylogeny gene_tree,
                                                       final Phylogeny species_tree,
                                                       final ALGORITHM algorithm,
@@ -160,8 +187,29 @@ public final class RIO {
                 break;
             }
             case GSDIR: {
+                //  System.out.println( "gene/species tree size before: " + gene_tree.getNumberOfExternalNodes() + "/"
+                //         + species_tree.getNumberOfExternalNodes() );
                 final GSDIR gsdir = new GSDIR( gene_tree, species_tree, true, i == 0 );
+                // System.out.println( "gene/species tree size before: " + gene_tree.getNumberOfExternalNodes() + "/"
+                //         + species_tree.getNumberOfExternalNodes() );
                 assigned_tree = gsdir.getMinDuplicationsSumGeneTrees().get( 0 );
+                if ( i == 0 ) {
+                    _removed_gene_tree_nodes = gsdir.getStrippedExternalGeneTreeNodes();
+                }
+                if ( _produce_log ) {
+                    final BasicDescriptiveStatistics stats = gsdir.getDuplicationsSumStats();
+                    _log.append( i );
+                    _log.append( "\t" );
+                    _log.append( gsdir.getMinDuplicationsSumGeneTrees().size() );
+                    _log.append( "/" );
+                    _log.append( stats.getN() );
+                    _log.append( "\t[" );
+                    _log.append( ( int ) stats.getMin() );
+                    _log.append( "-" );
+                    _log.append( ( int ) stats.getMax() );
+                    _log.append( "]" );
+                    _log.append( ForesterUtil.LINE_SEPARATOR );
+                }
                 _gsdir_tax_comp_base = gsdir.getTaxCompBase();
                 break;
             }
@@ -177,10 +225,7 @@ public final class RIO {
         _ext_nodes = i;
     }
 
-    private final void setNumberOfSamples( int i ) {
-        if ( i < 1 ) {
-            i = 1;
-        }
+    private final void setNumberOfSamples( final int i ) {
         _samples = i;
     }
 
@@ -240,4 +285,12 @@ public final class RIO {
         }
         return m;
     }
+
+    public final TaxonomyComparisonBase getGSDIRtaxCompBase() {
+        return _gsdir_tax_comp_base;
+    }
+
+    public final StringBuilder getLog() {
+        return _log;
+    }
 }
index 035a443..70cc009 100644 (file)
@@ -79,7 +79,7 @@ public class GSDIR extends GSDI {
             }\r
             _duplications_sum_stats.addValue( _duplications_sum );\r
         }\r
-        System.out.println( _duplications_sum_stats.getSummaryAsString() );\r
+        //System.out.println( _duplications_sum_stats.getSummaryAsString() );\r
     }\r
 \r
     public int getMinDuplicationsSum() {\r