rio
authorcmzmasek <cmzmasek@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Mon, 17 Dec 2012 06:30:02 +0000 (06:30 +0000)
committercmzmasek <cmzmasek@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Mon, 17 Dec 2012 06:30:02 +0000 (06:30 +0000)
forester/java/src/org/forester/application/rio.java
forester/java/src/org/forester/rio/RIO.java
forester/java/src/org/forester/rio/TestRIO.java

index b95c032..8af0486 100644 (file)
@@ -43,8 +43,10 @@ import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
 import org.forester.phylogeny.factories.PhylogenyFactory;
 import org.forester.rio.RIO;
 import org.forester.rio.RIOException;
+import org.forester.rio.RIO.REROOTING;
 import org.forester.sdi.SDIException;
 import org.forester.sdi.SDIutil.ALGORITHM;
+import org.forester.util.BasicDescriptiveStatistics;
 import org.forester.util.CommandLineArguments;
 import org.forester.util.EasyWriter;
 import org.forester.util.ForesterUtil;
@@ -103,6 +105,7 @@ public class rio {
         else {
             logfile = null;
         }
+        String outgroup = "";
         ForesterUtil.fatalErrorIfFileNotReadable( PRG_NAME, gene_trees_file );
         ForesterUtil.fatalErrorIfFileNotReadable( PRG_NAME, species_tree_file );
         if ( othology_outtable.exists() ) {
@@ -149,7 +152,7 @@ public class rio {
             algorithm = ALGORITHM.GSDIR;
         }
         try {
-            final RIO rio = new RIO( gene_trees_file, species_tree, algorithm, logfile != null, true );
+            final RIO rio = new RIO( gene_trees_file, species_tree, algorithm, REROOTING.BY_ALGORITHM, outgroup ,  logfile != null, true );
             if ( algorithm == ALGORITHM.GSDIR ) {
                 ForesterUtil.programMessage( PRG_NAME, "taxonomy linking based on: " + rio.getGSDIRtaxCompBase() );
             }
@@ -157,6 +160,11 @@ public class rio {
             if ( ( algorithm == ALGORITHM.GSDIR ) && ( logfile != null ) ) {
                 writeLogFile( logfile, rio );
             }
+            final BasicDescriptiveStatistics stats = rio.getDuplicationsStatistics();
+            ForesterUtil.programMessage( PRG_NAME, "Mean: " + stats.arithmeticMean() + "("  + stats.sampleStandardDeviation() + ")" );
+            ForesterUtil.programMessage( PRG_NAME, "Min: " + (int) stats.getMin() );
+            ForesterUtil.programMessage( PRG_NAME, "Max: " + (int) stats.getMax() );
+            
         }
         catch ( final RIOException e ) {
             ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() );
@@ -170,6 +178,7 @@ public class rio {
         catch ( final Exception e ) {
             ForesterUtil.unexpectedFatalError( PRG_NAME, e );
         }
+      
         time = System.currentTimeMillis() - time;
         ForesterUtil.programMessage( PRG_NAME, "time: " + time + "ms" );
         ForesterUtil.programMessage( PRG_NAME, "OK" );
index 053927d..8e1e5bb 100644 (file)
@@ -46,6 +46,7 @@ import org.forester.phylogeny.PhylogenyMethods;
 import org.forester.phylogeny.PhylogenyNode;
 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
 import org.forester.phylogeny.factories.PhylogenyFactory;
+import org.forester.sdi.GSDI;
 import org.forester.sdi.GSDIR;
 import org.forester.sdi.SDIException;
 import org.forester.sdi.SDIR;
@@ -56,32 +57,41 @@ import org.forester.util.ForesterUtil;
 
 public final class RIO {
 
-    private final static boolean   ROOT_BY_MINIMIZING_SUM_OF_DUPS = true;
-    private final static boolean   ROOT_BY_MINIMIZING_TREE_HEIGHT = true;
-    private Phylogeny[]            _analyzed_gene_trees;
-    private List<PhylogenyNode>    _removed_gene_tree_nodes;
-    private int                    _ext_nodes;
-    private TaxonomyComparisonBase _gsdir_tax_comp_base;
-    private StringBuilder          _log;
-    private boolean                _produce_log;
-    private boolean                _verbose;
+    public enum REROOTING {
+        NONE, BY_ALGORITHM, MIDPOINT, OUTGROUP;
+    }
+    private Phylogeny[]                _analyzed_gene_trees;
+    private List<PhylogenyNode>        _removed_gene_tree_nodes;
+    private int                        _ext_nodes;
+    private TaxonomyComparisonBase     _gsdir_tax_comp_base;
+    private StringBuilder              _log;
+    private BasicDescriptiveStatistics _duplications_stats;
+    private boolean                    _produce_log;
+    private boolean                    _verbose;
+    private REROOTING                  _rerooting;
 
     public RIO( final File gene_trees_file,
                 final Phylogeny species_tree,
                 final ALGORITHM algorithm,
+                final REROOTING rerooting,
+                final String outgroup,
                 final boolean produce_log,
                 final boolean verbose ) throws IOException, SDIException, RIOException {
-        init( produce_log, verbose );
-        inferOrthologs( gene_trees_file, species_tree, algorithm );
+        checkReRooting( rerooting, outgroup );
+        init( produce_log, verbose, rerooting );
+        inferOrthologs( gene_trees_file, species_tree, algorithm, outgroup );
     }
 
     public RIO( final Phylogeny[] gene_trees,
                 final Phylogeny species_tree,
                 final ALGORITHM algorithm,
+                final REROOTING rerooting,
+                final String outgroup,
                 final boolean produce_log,
                 final boolean verbose ) throws IOException, SDIException, RIOException {
-        init( produce_log, verbose );
-        inferOrthologs( gene_trees, species_tree, algorithm );
+        checkReRooting( rerooting, outgroup );
+        init( produce_log, verbose, rerooting );
+        inferOrthologs( gene_trees, species_tree, algorithm, outgroup );
     }
 
     public final Phylogeny[] getAnalyzedGeneTrees() {
@@ -112,7 +122,8 @@ public final class RIO {
 
     private final void inferOrthologs( final File gene_trees_file,
                                        final Phylogeny species_tree,
-                                       final ALGORITHM algorithm ) throws SDIException, RIOException,
+                                       final ALGORITHM algorithm,
+                                       final String outgroup ) throws SDIException, RIOException,
             FileNotFoundException, IOException {
         final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
         final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
@@ -123,12 +134,13 @@ public final class RIO {
             nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.YES );
         }
         final Phylogeny[] gene_trees = factory.create( gene_trees_file, p );
-        inferOrthologs( gene_trees, species_tree, algorithm );
+        inferOrthologs( gene_trees, species_tree, algorithm, outgroup );
     }
 
     private final void inferOrthologs( final Phylogeny[] gene_trees,
                                        final Phylogeny species_tree,
-                                       final ALGORITHM algorithm ) throws SDIException, RIOException,
+                                       final ALGORITHM algorithm,
+                                       final String outgroup ) throws SDIException, RIOException,
             FileNotFoundException, IOException {
         if ( algorithm == ALGORITHM.SDIR ) {
             // Removes from species_tree all species not found in gene_tree.
@@ -139,7 +151,9 @@ public final class RIO {
         }
         if ( _produce_log ) {
             _log = new StringBuilder();
-            writeLogSubHeader();
+            if ( _rerooting == REROOTING.BY_ALGORITHM ) {
+                writeLogSubHeader();
+            }
         }
         _analyzed_gene_trees = new Phylogeny[ gene_trees.length ];
         int gene_tree_ext_nodes = 0;
@@ -166,7 +180,7 @@ public final class RIO {
                     throw new RIOException( "failed to establish species based mapping between gene and species trees" );
                 }
             }
-            _analyzed_gene_trees[ i ] = performOrthologInference( gt, species_tree, algorithm, i );
+            _analyzed_gene_trees[ i ] = performOrthologInference( gt, species_tree, algorithm, outgroup, i );
         }
         if ( _verbose ) {
             System.out.println();
@@ -174,51 +188,31 @@ public final class RIO {
         }
     }
 
-    private final void init( final boolean produce_log, final boolean verbose ) {
+    private final void init( final boolean produce_log, final boolean verbose, final REROOTING rerooting ) {
         _produce_log = produce_log;
         _verbose = verbose;
+        _rerooting = rerooting;
         _ext_nodes = -1;
         _log = null;
         _gsdir_tax_comp_base = null;
         _analyzed_gene_trees = null;
         _removed_gene_tree_nodes = null;
+        _duplications_stats = new BasicDescriptiveStatistics();
     }
 
     private final Phylogeny performOrthologInference( final Phylogeny gene_tree,
                                                       final Phylogeny species_tree,
                                                       final ALGORITHM algorithm,
+                                                      final String outgroup,
                                                       final int i ) throws SDIException, RIOException {
         final Phylogeny assigned_tree;
         switch ( algorithm ) {
             case SDIR: {
-                final SDIR sdir = new SDIR();
-                assigned_tree = sdir.infer( gene_tree,
-                                            species_tree,
-                                            false,
-                                            RIO.ROOT_BY_MINIMIZING_SUM_OF_DUPS,
-                                            RIO.ROOT_BY_MINIMIZING_TREE_HEIGHT,
-                                            true,
-                                            1 )[ 0 ];
+                assigned_tree = performOrthologInferenceBySDI( gene_tree, species_tree );
                 break;
             }
             case GSDIR: {
-                final GSDIR gsdir = new GSDIR( gene_tree, species_tree, true, i == 0 );
-                final List<Phylogeny> assigned_trees = gsdir.getMinDuplicationsSumGeneTrees();
-                if ( i == 0 ) {
-                    _removed_gene_tree_nodes = gsdir.getStrippedExternalGeneTreeNodes();
-                    for( final PhylogenyNode r : _removed_gene_tree_nodes ) {
-                        if ( !r.getNodeData().isHasTaxonomy() ) {
-                            throw new RIOException( "node with no (appropriate) taxonomic information found in gene tree #1: "
-                                    + r.toString() );
-                        }
-                    }
-                }
-                final List<Integer> shortests = GSDIR.getIndexesOfShortestTree( assigned_trees );
-                assigned_tree = assigned_trees.get( shortests.get( 0 ) );
-                if ( _produce_log ) {
-                    writeStatsToLog( i, gsdir, shortests );
-                }
-                _gsdir_tax_comp_base = gsdir.getTaxCompBase();
+                assigned_tree = performOrthologInferenceByGSDI( gene_tree, species_tree, outgroup, i );
                 break;
             }
             default: {
@@ -236,6 +230,70 @@ public final class RIO {
         return assigned_tree;
     }
 
+    private final Phylogeny performOrthologInferenceByGSDI( final Phylogeny gene_tree,
+                                                            final Phylogeny species_tree,
+                                                            final String outgroup,
+                                                            final int i ) throws SDIException, RIOException {
+        final Phylogeny assigned_tree;
+        if ( _rerooting == REROOTING.BY_ALGORITHM ) {
+            final GSDIR gsdir = new GSDIR( gene_tree, species_tree, true, i == 0 );
+            final List<Phylogeny> assigned_trees = gsdir.getMinDuplicationsSumGeneTrees();
+            if ( i == 0 ) {
+                _removed_gene_tree_nodes = gsdir.getStrippedExternalGeneTreeNodes();
+                for( final PhylogenyNode r : _removed_gene_tree_nodes ) {
+                    if ( !r.getNodeData().isHasTaxonomy() ) {
+                        throw new RIOException( "node with no (appropriate) taxonomic information found in gene tree #1: "
+                                + r.toString() );
+                    }
+                }
+            }
+            final List<Integer> shortests = GSDIR.getIndexesOfShortestTree( assigned_trees );
+            assigned_tree = assigned_trees.get( shortests.get( 0 ) );
+            if ( _produce_log ) {
+                writeStatsToLog( i, gsdir, shortests );
+            }
+            if ( i == 0 ) {
+                _gsdir_tax_comp_base = gsdir.getTaxCompBase();
+            }
+            _duplications_stats.addValue( gsdir.getMinDuplicationsSum() );
+        }
+        else {
+            if ( _rerooting == REROOTING.MIDPOINT ) {
+                PhylogenyMethods.midpointRoot( gene_tree );
+            }
+            else if ( _rerooting == REROOTING.OUTGROUP ) {
+                PhylogenyNode n;
+                try {
+                    n = gene_tree.getNode( outgroup );
+                }
+                catch ( IllegalArgumentException e ) {
+                    throw new RIOException( "failed to perform re-rooting by outgroup: " + e.getLocalizedMessage() );
+                }
+                gene_tree.reRoot( n );
+            }
+            final GSDI gsdi = new GSDI( gene_tree, species_tree, true, true, true );
+            _removed_gene_tree_nodes = gsdi.getStrippedExternalGeneTreeNodes();
+            for( final PhylogenyNode r : _removed_gene_tree_nodes ) {
+                if ( !r.getNodeData().isHasTaxonomy() ) {
+                    throw new RIOException( "node with no (appropriate) taxonomic information found in gene tree #1: "
+                            + r.toString() );
+                }
+            }
+            assigned_tree = gene_tree;
+            if ( i == 0 ) {
+                _gsdir_tax_comp_base = gsdi.getTaxCompBase();
+            }
+            _duplications_stats.addValue( gsdi.getDuplicationsSum() );
+        }
+        return assigned_tree;
+    }
+
+    private final Phylogeny performOrthologInferenceBySDI( final Phylogeny gene_tree, final Phylogeny species_tree )
+            throws SDIException {
+        final SDIR sdir = new SDIR();
+        return sdir.infer( gene_tree, species_tree, false, true, true, true, 1 )[ 0 ];
+    }
+
     private void writeLogSubHeader() {
         _log.append( "#" );
         _log.append( "\t" );
@@ -325,4 +383,14 @@ public final class RIO {
         }
         return m;
     }
+
+    private final static void checkReRooting( final REROOTING rerooting, final String outgroup ) {
+        if ( !ForesterUtil.isEmpty( outgroup ) && rerooting != REROOTING.OUTGROUP ) {
+            throw new IllegalArgumentException( "can only use outgroup when re-rooting in this manner" );
+        }
+    }
+
+    public BasicDescriptiveStatistics getDuplicationsStatistics() {
+        return _duplications_stats;
+    }
 }
index 06468af..1480e8a 100644 (file)
@@ -8,6 +8,7 @@ import org.forester.phylogeny.PhylogenyMethods;
 import org.forester.phylogeny.PhylogenyMethods.PhylogenyNodeField;
 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
 import org.forester.phylogeny.factories.PhylogenyFactory;
+import org.forester.rio.RIO.REROOTING;
 import org.forester.sdi.SDIutil.ALGORITHM;
 import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
 import org.forester.util.ForesterUtil;
@@ -40,7 +41,7 @@ public final class TestRIO {
             species_tree_1.setRooted( true );
             PhylogenyMethods.transferNodeNameToField( species_tree_1, PhylogenyNodeField.TAXONOMY_CODE, true );
             //Archaeopteryx.createApplication( species_trees_1 );
-            RIO rio = new RIO( gene_trees_1, species_tree_1, ALGORITHM.GSDIR, true, false );
+            RIO rio = new RIO( gene_trees_1, species_tree_1, ALGORITHM.GSDIR, REROOTING.BY_ALGORITHM, "", true, false );
             if ( rio.getAnalyzedGeneTrees().length != 5 ) {
                 return false;
             }
@@ -79,7 +80,7 @@ public final class TestRIO {
             final Phylogeny species_tree_2 = factory.create( species_trees_2_str, new NHXParser() )[ 0 ];
             species_tree_2.setRooted( true );
             PhylogenyMethods.transferNodeNameToField( species_tree_2, PhylogenyNodeField.TAXONOMY_CODE, true );
-            rio = new RIO( gene_trees_2, species_tree_2, ALGORITHM.GSDIR, true, false );
+            rio = new RIO( gene_trees_2, species_tree_2, ALGORITHM.GSDIR, REROOTING.BY_ALGORITHM, null, true, false );
             m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true );
             // System.out.println( m.toString() );
             if ( !m.getRowAsString( 0, ',' ).equals( "ARATH,5,5,5,5,5,5" ) ) {