rio - gsdir work...
[jalview.git] / forester / java / src / org / forester / rio / RIO.java
index 84ae28d..d844d2b 100644 (file)
@@ -41,7 +41,9 @@ import java.util.TreeSet;
 
 import org.forester.datastructures.IntMatrix;
 import org.forester.io.parsers.PhylogenyParser;
+import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
 import org.forester.io.parsers.nhx.NHXParser;
+import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
 import org.forester.io.parsers.util.ParserUtils;
 import org.forester.phylogeny.Phylogeny;
 import org.forester.phylogeny.PhylogenyMethods;
@@ -53,6 +55,7 @@ import org.forester.sdi.GSDI;
 import org.forester.sdi.GSDIR;
 import org.forester.sdi.SDIException;
 import org.forester.sdi.SDIR;
+import org.forester.sdi.SDIutil;
 import org.forester.sdi.SDIutil.ALGORITHM;
 import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
 import org.forester.util.BasicDescriptiveStatistics;
@@ -64,6 +67,7 @@ public final class RIO {
     private Phylogeny[]                      _analyzed_gene_trees;
     private List<PhylogenyNode>              _removed_gene_tree_nodes;
     private int                              _ext_nodes;
+    private int                              _int_nodes;
     private TaxonomyComparisonBase           _gsdir_tax_comp_base;
     private final StringBuilder              _log;
     private final BasicDescriptiveStatistics _duplications_stats;
@@ -86,11 +90,13 @@ public final class RIO {
         else if ( ( first == DEFAULT_RANGE ) && ( last >= 0 ) ) {
             first = 0;
         }
-        checkPreconditions( gene_trees, rerooting, outgroup, first, last );
+        removeSingleDescendentsNodes( species_tree, verbose );
+        checkPreconditions( gene_trees, species_tree, rerooting, outgroup, first, last );
         _produce_log = produce_log;
         _verbose = verbose;
         _rerooting = rerooting;
         _ext_nodes = -1;
+        _int_nodes = -1;
         _log = new StringBuilder();
         _gsdir_tax_comp_base = null;
         _analyzed_gene_trees = null;
@@ -117,6 +123,16 @@ public final class RIO {
         return _ext_nodes;
     }
 
+    /**
+     * Returns the numbers of number of int nodes in gene trees analyzed (after
+     * stripping).
+     * 
+     * @return number of int nodes in gene trees analyzed (after stripping)
+     */
+    public final int getIntNodesOfAnalyzedGeneTrees() {
+        return _int_nodes;
+    }
+
     public final TaxonomyComparisonBase getGSDIRtaxCompBase() {
         return _gsdir_tax_comp_base;
     }
@@ -145,7 +161,7 @@ public final class RIO {
         }
         final Phylogeny[] my_gene_trees;
         if ( ( first >= 0 ) && ( last >= first ) && ( last < gene_trees.length ) ) {
-            my_gene_trees = new Phylogeny[ 1 + last - first ];
+            my_gene_trees = new Phylogeny[ ( 1 + last ) - first ];
             int c = 0;
             for( int i = first; i <= last; ++i ) {
                 my_gene_trees[ c++ ] = gene_trees[ i ];
@@ -249,6 +265,7 @@ public final class RIO {
         }
         if ( i == 0 ) {
             _ext_nodes = assigned_tree.getNumberOfExternalNodes();
+            _int_nodes = assigned_tree.getNumberOfInternalNodes();
         }
         else if ( _ext_nodes != assigned_tree.getNumberOfExternalNodes() ) {
             throw new RIOException( "after stripping gene tree #" + ( i + 1 )
@@ -328,12 +345,20 @@ public final class RIO {
         final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.#" );
         log( "Gene trees analyzed                             : " + _duplications_stats.getN() );
         log( "Mean number of duplications                     : " + df.format( _duplications_stats.arithmeticMean() )
-                + " (sd: " + df.format( _duplications_stats.sampleStandardDeviation() ) + ")" );
+                + " (sd: " + df.format( _duplications_stats.sampleStandardDeviation() ) + ")" + " ("
+                + df.format( ( 100.0 * _duplications_stats.arithmeticMean() ) / getIntNodesOfAnalyzedGeneTrees() )
+                + "%)" );
         if ( _duplications_stats.getN() > 3 ) {
-            log( "Median number of duplications                   : " + df.format( _duplications_stats.median() ) );
-        }
-        log( "Minimum duplications                            : " + ( int ) _duplications_stats.getMin() );
-        log( "Maximum duplications                            : " + ( int ) _duplications_stats.getMax() );
+            log( "Median number of duplications                   : " + df.format( _duplications_stats.median() )
+                    + " (" + df.format( ( 100.0 * _duplications_stats.median() ) / getIntNodesOfAnalyzedGeneTrees() )
+                    + "%)" );
+        }
+        log( "Minimum duplications                            : " + ( int ) _duplications_stats.getMin() + " ("
+                + df.format( ( 100.0 * _duplications_stats.getMin() ) / getIntNodesOfAnalyzedGeneTrees() ) + "%)" );
+        log( "Maximum duplications                            : " + ( int ) _duplications_stats.getMax() + " ("
+                + df.format( ( 100.0 * _duplications_stats.getMax() ) / getIntNodesOfAnalyzedGeneTrees() ) + "%)" );
+        log( "Gene tree internal nodes                        : " + getIntNodesOfAnalyzedGeneTrees() );
+        log( "Gene tree external nodes                        : " + getExtNodesOfAnalyzedGeneTrees() );
     }
 
     private final void preLog( final Phylogeny[] gene_trees,
@@ -342,7 +367,7 @@ public final class RIO {
                                final String outgroup,
                                final int first,
                                final int last ) {
-        log( "Number of gene tree (total)                     : " + gene_trees.length );
+        log( "Number of gene trees (total)                    : " + gene_trees.length );
         log( "Algorithm                                       : " + algorithm );
         log( "Species tree external nodes (prior to stripping): " + species_tree.getNumberOfExternalNodes() );
         log( "Species tree polytomies (prior to stripping)    : "
@@ -465,22 +490,34 @@ public final class RIO {
     }
 
     public final static RIO executeAnalysis( final File gene_trees_file,
+                                             final File species_tree_file,
+                                             final ALGORITHM algorithm,
+                                             final REROOTING rerooting,
+                                             final String outgroup,
+                                             final int first,
+                                             final int last,
+                                             final boolean produce_log,
+                                             final boolean verbose ) throws IOException, SDIException, RIOException {
+        final Phylogeny[] gene_trees = parseGeneTrees( gene_trees_file );
+        if ( gene_trees.length < 1 ) {
+            throw new RIOException( "\"" + gene_trees_file + "\" is devoid of appropriate gene trees" );
+        }
+        final Phylogeny species_tree = SDIutil.parseSpeciesTree( gene_trees[ 0 ],
+                                                                 species_tree_file,
+                                                                 false,
+                                                                 true,
+                                                                 TAXONOMY_EXTRACTION.NO );
+        return new RIO( gene_trees, species_tree, algorithm, rerooting, outgroup, first, last, produce_log, verbose );
+    }
+
+    public final static RIO executeAnalysis( final File gene_trees_file,
                                              final Phylogeny species_tree,
                                              final ALGORITHM algorithm,
                                              final REROOTING rerooting,
                                              final String outgroup,
                                              final boolean produce_log,
                                              final boolean verbose ) throws IOException, SDIException, RIOException {
-        final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
-        final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
-        if ( p instanceof NHXParser ) {
-            final NHXParser nhx = ( NHXParser ) p;
-            nhx.setReplaceUnderscores( false );
-            nhx.setIgnoreQuotes( true );
-            nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.YES );
-        }
-        final Phylogeny[] gene_trees = factory.create( gene_trees_file, p );
-        return new RIO( gene_trees,
+        return new RIO( parseGeneTrees( gene_trees_file ),
                         species_tree,
                         algorithm,
                         rerooting,
@@ -500,16 +537,15 @@ public final class RIO {
                                              final int last,
                                              final boolean produce_log,
                                              final boolean verbose ) throws IOException, SDIException, RIOException {
-        final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
-        final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
-        if ( p instanceof NHXParser ) {
-            final NHXParser nhx = ( NHXParser ) p;
-            nhx.setReplaceUnderscores( false );
-            nhx.setIgnoreQuotes( true );
-            nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.YES );
-        }
-        final Phylogeny[] gene_trees = factory.create( gene_trees_file, p );
-        return new RIO( gene_trees, species_tree, algorithm, rerooting, outgroup, first, last, produce_log, verbose );
+        return new RIO( parseGeneTrees( gene_trees_file ),
+                        species_tree,
+                        algorithm,
+                        rerooting,
+                        outgroup,
+                        first,
+                        last,
+                        produce_log,
+                        verbose );
     }
 
     public final static RIO executeAnalysis( final Phylogeny[] gene_trees, final Phylogeny species_tree )
@@ -556,10 +592,14 @@ public final class RIO {
     }
 
     private final static void checkPreconditions( final Phylogeny[] gene_trees,
+                                                  final Phylogeny species_tree,
                                                   final REROOTING rerooting,
                                                   final String outgroup,
                                                   final int first,
                                                   final int last ) throws RIOException {
+        if ( !species_tree.isRooted() ) {
+            throw new RIOException( "species tree is not rooted" );
+        }
         if ( !( ( last == DEFAULT_RANGE ) && ( first == DEFAULT_RANGE ) )
                 && ( ( last < first ) || ( last >= gene_trees.length ) || ( last < 0 ) || ( first < 0 ) ) ) {
             throw new RIOException( "attempt to set range (0-based) of gene to analyze to: from " + first + " to "
@@ -585,6 +625,36 @@ public final class RIO {
         }
     }
 
+    private final static Phylogeny[] parseGeneTrees( final File gene_trees_file ) throws FileNotFoundException,
+            IOException {
+        final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
+        final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
+        if ( p instanceof NHXParser ) {
+            final NHXParser nhx = ( NHXParser ) p;
+            nhx.setReplaceUnderscores( false );
+            nhx.setIgnoreQuotes( true );
+            nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.YES );
+        }
+        else if ( p instanceof NexusPhylogeniesParser ) {
+            final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p;
+            nex.setReplaceUnderscores( false );
+            nex.setIgnoreQuotes( true );
+            nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.YES );
+        }
+        return factory.create( gene_trees_file, p );
+    }
+
+    private final static void removeSingleDescendentsNodes( final Phylogeny species_tree, final boolean verbose ) {
+        final int o = PhylogenyMethods.countNumberOfOneDescendantNodes( species_tree );
+        if ( o > 0 ) {
+            if ( verbose ) {
+                System.out.println( "warning: species tree has " + o
+                        + " internal nodes with only one descendent which are therefore going to be removed" );
+            }
+            PhylogenyMethods.deleteInternalNodesWithOnlyOneDescendent( species_tree );
+        }
+    }
+
     public enum REROOTING {
         NONE, BY_ALGORITHM, MIDPOINT, OUTGROUP;
     }