rio - gsdir work...
[jalview.git] / forester / java / src / org / forester / rio / RIO.java
index 5ba246d..d844d2b 100644 (file)
@@ -41,7 +41,9 @@ import java.util.TreeSet;
 
 import org.forester.datastructures.IntMatrix;
 import org.forester.io.parsers.PhylogenyParser;
+import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
 import org.forester.io.parsers.nhx.NHXParser;
+import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
 import org.forester.io.parsers.util.ParserUtils;
 import org.forester.phylogeny.Phylogeny;
 import org.forester.phylogeny.PhylogenyMethods;
@@ -53,6 +55,7 @@ import org.forester.sdi.GSDI;
 import org.forester.sdi.GSDIR;
 import org.forester.sdi.SDIException;
 import org.forester.sdi.SDIR;
+import org.forester.sdi.SDIutil;
 import org.forester.sdi.SDIutil.ALGORITHM;
 import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
 import org.forester.util.BasicDescriptiveStatistics;
@@ -60,9 +63,11 @@ import org.forester.util.ForesterUtil;
 
 public final class RIO {
 
+    public static final int                  DEFAULT_RANGE = -1;
     private Phylogeny[]                      _analyzed_gene_trees;
     private List<PhylogenyNode>              _removed_gene_tree_nodes;
     private int                              _ext_nodes;
+    private int                              _int_nodes;
     private TaxonomyComparisonBase           _gsdir_tax_comp_base;
     private final StringBuilder              _log;
     private final BasicDescriptiveStatistics _duplications_stats;
@@ -75,21 +80,23 @@ public final class RIO {
                  final ALGORITHM algorithm,
                  final REROOTING rerooting,
                  final String outgroup,
-                 final int first,
-                 final int last,
+                 int first,
+                 int last,
                  final boolean produce_log,
                  final boolean verbose ) throws IOException, SDIException, RIOException {
-        if ( !ForesterUtil.isEmpty( outgroup ) && ( rerooting != REROOTING.OUTGROUP ) ) {
-            throw new IllegalArgumentException( "can only use outgroup when re-rooting by outgroup" );
+        if ( ( last == DEFAULT_RANGE ) && ( first >= 0 ) ) {
+            last = gene_trees.length - 1;
         }
-        if ( !( ( last == -1 ) && ( first == -1 ) )
-                && ( ( last < first ) || ( last >= gene_trees.length ) || ( first >= gene_trees.length ) || ( last < 0 ) || ( first < 0 ) ) ) {
-            throw new IllegalArgumentException( "gene tree range is out of range: " + first + "-" + last );
+        else if ( ( first == DEFAULT_RANGE ) && ( last >= 0 ) ) {
+            first = 0;
         }
+        removeSingleDescendentsNodes( species_tree, verbose );
+        checkPreconditions( gene_trees, species_tree, rerooting, outgroup, first, last );
         _produce_log = produce_log;
         _verbose = verbose;
         _rerooting = rerooting;
         _ext_nodes = -1;
+        _int_nodes = -1;
         _log = new StringBuilder();
         _gsdir_tax_comp_base = null;
         _analyzed_gene_trees = null;
@@ -116,6 +123,16 @@ public final class RIO {
         return _ext_nodes;
     }
 
+    /**
+     * Returns the numbers of number of int nodes in gene trees analyzed (after
+     * stripping).
+     * 
+     * @return number of int nodes in gene trees analyzed (after stripping)
+     */
+    public final int getIntNodesOfAnalyzedGeneTrees() {
+        return _int_nodes;
+    }
+
     public final TaxonomyComparisonBase getGSDIRtaxCompBase() {
         return _gsdir_tax_comp_base;
     }
@@ -142,15 +159,9 @@ public final class RIO {
                 throw new RIOException( "failed to establish species based mapping between gene and species trees" );
             }
         }
-        if ( log() ) {
-            preLog( gene_trees, species_tree, algorithm, outgroup );
-        }
         final Phylogeny[] my_gene_trees;
         if ( ( first >= 0 ) && ( last >= first ) && ( last < gene_trees.length ) ) {
-            if ( log() ) {
-                log( "Gene tree range: " + first + "-" + last );
-            }
-            my_gene_trees = new Phylogeny[ 1 + last - first ];
+            my_gene_trees = new Phylogeny[ ( 1 + last ) - first ];
             int c = 0;
             for( int i = first; i <= last; ++i ) {
                 my_gene_trees[ c++ ] = gene_trees[ i ];
@@ -159,14 +170,17 @@ public final class RIO {
         else {
             my_gene_trees = gene_trees;
         }
-        if ( _verbose && ( my_gene_trees.length > 10 ) ) {
+        if ( log() ) {
+            preLog( gene_trees, species_tree, algorithm, outgroup, first, last );
+        }
+        if ( _verbose && ( my_gene_trees.length >= 4 ) ) {
             System.out.println();
         }
         _analyzed_gene_trees = new Phylogeny[ my_gene_trees.length ];
         int gene_tree_ext_nodes = 0;
         for( int i = 0; i < my_gene_trees.length; ++i ) {
             final Phylogeny gt = my_gene_trees[ i ];
-            if ( _verbose && ( my_gene_trees.length > 10 ) ) {
+            if ( _verbose && ( my_gene_trees.length > 4 ) ) {
                 ForesterUtil.updateProgress( ( ( double ) i ) / my_gene_trees.length );
             }
             if ( i == 0 ) {
@@ -189,7 +203,7 @@ public final class RIO {
         if ( log() ) {
             postLog( species_tree );
         }
-        if ( _verbose && ( my_gene_trees.length > 10 ) ) {
+        if ( _verbose && ( my_gene_trees.length > 4 ) ) {
             System.out.println();
             System.out.println();
         }
@@ -251,6 +265,7 @@ public final class RIO {
         }
         if ( i == 0 ) {
             _ext_nodes = assigned_tree.getNumberOfExternalNodes();
+            _int_nodes = assigned_tree.getNumberOfInternalNodes();
         }
         else if ( _ext_nodes != assigned_tree.getNumberOfExternalNodes() ) {
             throw new RIOException( "after stripping gene tree #" + ( i + 1 )
@@ -292,13 +307,7 @@ public final class RIO {
                 PhylogenyMethods.midpointRoot( gene_tree );
             }
             else if ( _rerooting == REROOTING.OUTGROUP ) {
-                PhylogenyNode n;
-                try {
-                    n = gene_tree.getNode( outgroup );
-                }
-                catch ( final IllegalArgumentException e ) {
-                    throw new RIOException( "failed to perform re-rooting by outgroup: " + e.getLocalizedMessage() );
-                }
+                final PhylogenyNode n = gene_tree.getNode( outgroup );
                 gene_tree.reRoot( n );
             }
             final GSDI gsdi = new GSDI( gene_tree, species_tree, true, true, true );
@@ -336,19 +345,29 @@ public final class RIO {
         final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.#" );
         log( "Gene trees analyzed                             : " + _duplications_stats.getN() );
         log( "Mean number of duplications                     : " + df.format( _duplications_stats.arithmeticMean() )
-                + " (sd: " + df.format( _duplications_stats.sampleStandardDeviation() ) + ")" );
+                + " (sd: " + df.format( _duplications_stats.sampleStandardDeviation() ) + ")" + " ("
+                + df.format( ( 100.0 * _duplications_stats.arithmeticMean() ) / getIntNodesOfAnalyzedGeneTrees() )
+                + "%)" );
         if ( _duplications_stats.getN() > 3 ) {
-            log( "Median number of duplications                   : " + df.format( _duplications_stats.median() ) );
+            log( "Median number of duplications                   : " + df.format( _duplications_stats.median() )
+                    + " (" + df.format( ( 100.0 * _duplications_stats.median() ) / getIntNodesOfAnalyzedGeneTrees() )
+                    + "%)" );
         }
-        log( "Minimum duplications                            : " + ( int ) _duplications_stats.getMin() );
-        log( "Maximum duplications                            : " + ( int ) _duplications_stats.getMax() );
+        log( "Minimum duplications                            : " + ( int ) _duplications_stats.getMin() + " ("
+                + df.format( ( 100.0 * _duplications_stats.getMin() ) / getIntNodesOfAnalyzedGeneTrees() ) + "%)" );
+        log( "Maximum duplications                            : " + ( int ) _duplications_stats.getMax() + " ("
+                + df.format( ( 100.0 * _duplications_stats.getMax() ) / getIntNodesOfAnalyzedGeneTrees() ) + "%)" );
+        log( "Gene tree internal nodes                        : " + getIntNodesOfAnalyzedGeneTrees() );
+        log( "Gene tree external nodes                        : " + getExtNodesOfAnalyzedGeneTrees() );
     }
 
     private final void preLog( final Phylogeny[] gene_trees,
                                final Phylogeny species_tree,
                                final ALGORITHM algorithm,
-                               final String outgroup ) {
-        log( "Number of gene tree (total)                     : " + gene_trees.length );
+                               final String outgroup,
+                               final int first,
+                               final int last ) {
+        log( "Number of gene trees (total)                    : " + gene_trees.length );
         log( "Algorithm                                       : " + algorithm );
         log( "Species tree external nodes (prior to stripping): " + species_tree.getNumberOfExternalNodes() );
         log( "Species tree polytomies (prior to stripping)    : "
@@ -373,6 +392,9 @@ public final class RIO {
             }
         }
         log( "Re-rooting                                      : " + rs );
+        if ( ( first >= 0 ) || ( last >= 0 ) ) {
+            log( "Gene trees analyzed range                       : " + first + "-" + last );
+        }
         if ( _rerooting == REROOTING.BY_ALGORITHM ) {
             writeLogSubHeader();
         }
@@ -468,24 +490,44 @@ public final class RIO {
     }
 
     public final static RIO executeAnalysis( final File gene_trees_file,
-                                             final Phylogeny species_tree,
+                                             final File species_tree_file,
                                              final ALGORITHM algorithm,
                                              final REROOTING rerooting,
                                              final String outgroup,
+                                             final int first,
+                                             final int last,
                                              final boolean produce_log,
                                              final boolean verbose ) throws IOException, SDIException, RIOException {
-        final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
-        final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
-        if ( p instanceof NHXParser ) {
-            final NHXParser nhx = ( NHXParser ) p;
-            nhx.setReplaceUnderscores( false );
-            nhx.setIgnoreQuotes( true );
-            nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.YES );
+        final Phylogeny[] gene_trees = parseGeneTrees( gene_trees_file );
+        if ( gene_trees.length < 1 ) {
+            throw new RIOException( "\"" + gene_trees_file + "\" is devoid of appropriate gene trees" );
         }
-        final Phylogeny[] gene_trees = factory.create( gene_trees_file, p );
-        return new RIO( gene_trees, species_tree, algorithm, rerooting, outgroup, -1, -1, produce_log, verbose );
+        final Phylogeny species_tree = SDIutil.parseSpeciesTree( gene_trees[ 0 ],
+                                                                 species_tree_file,
+                                                                 false,
+                                                                 true,
+                                                                 TAXONOMY_EXTRACTION.NO );
+        return new RIO( gene_trees, species_tree, algorithm, rerooting, outgroup, first, last, produce_log, verbose );
+    }
+
+    public final static RIO executeAnalysis( final File gene_trees_file,
+                                             final Phylogeny species_tree,
+                                             final ALGORITHM algorithm,
+                                             final REROOTING rerooting,
+                                             final String outgroup,
+                                             final boolean produce_log,
+                                             final boolean verbose ) throws IOException, SDIException, RIOException {
+        return new RIO( parseGeneTrees( gene_trees_file ),
+                        species_tree,
+                        algorithm,
+                        rerooting,
+                        outgroup,
+                        DEFAULT_RANGE,
+                        DEFAULT_RANGE,
+                        produce_log,
+                        verbose );
     }
-    
+
     public final static RIO executeAnalysis( final File gene_trees_file,
                                              final Phylogeny species_tree,
                                              final ALGORITHM algorithm,
@@ -495,21 +537,28 @@ public final class RIO {
                                              final int last,
                                              final boolean produce_log,
                                              final boolean verbose ) throws IOException, SDIException, RIOException {
-        final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
-        final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
-        if ( p instanceof NHXParser ) {
-            final NHXParser nhx = ( NHXParser ) p;
-            nhx.setReplaceUnderscores( false );
-            nhx.setIgnoreQuotes( true );
-            nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.YES );
-        }
-        final Phylogeny[] gene_trees = factory.create( gene_trees_file, p );
-        return new RIO( gene_trees, species_tree, algorithm, rerooting, outgroup, first, last, produce_log, verbose );
+        return new RIO( parseGeneTrees( gene_trees_file ),
+                        species_tree,
+                        algorithm,
+                        rerooting,
+                        outgroup,
+                        first,
+                        last,
+                        produce_log,
+                        verbose );
     }
 
     public final static RIO executeAnalysis( final Phylogeny[] gene_trees, final Phylogeny species_tree )
             throws IOException, SDIException, RIOException {
-        return new RIO( gene_trees, species_tree, ALGORITHM.GSDIR, REROOTING.BY_ALGORITHM, null, -1, -1, false, false );
+        return new RIO( gene_trees,
+                        species_tree,
+                        ALGORITHM.GSDIR,
+                        REROOTING.BY_ALGORITHM,
+                        null,
+                        DEFAULT_RANGE,
+                        DEFAULT_RANGE,
+                        false,
+                        false );
     }
 
     public final static RIO executeAnalysis( final Phylogeny[] gene_trees,
@@ -519,9 +568,17 @@ public final class RIO {
                                              final String outgroup,
                                              final boolean produce_log,
                                              final boolean verbose ) throws IOException, SDIException, RIOException {
-        return new RIO( gene_trees, species_tree, algorithm, rerooting, outgroup, -1, -1, produce_log, verbose );
+        return new RIO( gene_trees,
+                        species_tree,
+                        algorithm,
+                        rerooting,
+                        outgroup,
+                        DEFAULT_RANGE,
+                        DEFAULT_RANGE,
+                        produce_log,
+                        verbose );
     }
-    
+
     public final static RIO executeAnalysis( final Phylogeny[] gene_trees,
                                              final Phylogeny species_tree,
                                              final ALGORITHM algorithm,
@@ -534,6 +591,70 @@ public final class RIO {
         return new RIO( gene_trees, species_tree, algorithm, rerooting, outgroup, first, last, produce_log, verbose );
     }
 
+    private final static void checkPreconditions( final Phylogeny[] gene_trees,
+                                                  final Phylogeny species_tree,
+                                                  final REROOTING rerooting,
+                                                  final String outgroup,
+                                                  final int first,
+                                                  final int last ) throws RIOException {
+        if ( !species_tree.isRooted() ) {
+            throw new RIOException( "species tree is not rooted" );
+        }
+        if ( !( ( last == DEFAULT_RANGE ) && ( first == DEFAULT_RANGE ) )
+                && ( ( last < first ) || ( last >= gene_trees.length ) || ( last < 0 ) || ( first < 0 ) ) ) {
+            throw new RIOException( "attempt to set range (0-based) of gene to analyze to: from " + first + " to "
+                    + last + " (out of " + gene_trees.length + ")" );
+        }
+        if ( ( rerooting == REROOTING.OUTGROUP ) && ForesterUtil.isEmpty( outgroup ) ) {
+            throw new RIOException( "outgroup not set for midpoint rooting" );
+        }
+        if ( ( rerooting != REROOTING.OUTGROUP ) && !ForesterUtil.isEmpty( outgroup ) ) {
+            throw new RIOException( "outgroup only used for midpoint rooting" );
+        }
+        if ( ( rerooting == REROOTING.MIDPOINT )
+                && ( PhylogenyMethods.calculateMaxDistanceToRoot( gene_trees[ 0 ] ) <= 0 ) ) {
+            throw new RIOException( "attempt to use midpoint rooting on gene trees which seem to have no (positive) branch lengths (cladograms)" );
+        }
+        if ( rerooting == REROOTING.OUTGROUP ) {
+            try {
+                gene_trees[ 0 ].getNode( outgroup );
+            }
+            catch ( final IllegalArgumentException e ) {
+                throw new RIOException( "cannot perform re-rooting by outgroup: " + e.getLocalizedMessage() );
+            }
+        }
+    }
+
+    private final static Phylogeny[] parseGeneTrees( final File gene_trees_file ) throws FileNotFoundException,
+            IOException {
+        final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
+        final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
+        if ( p instanceof NHXParser ) {
+            final NHXParser nhx = ( NHXParser ) p;
+            nhx.setReplaceUnderscores( false );
+            nhx.setIgnoreQuotes( true );
+            nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.YES );
+        }
+        else if ( p instanceof NexusPhylogeniesParser ) {
+            final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p;
+            nex.setReplaceUnderscores( false );
+            nex.setIgnoreQuotes( true );
+            nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.YES );
+        }
+        return factory.create( gene_trees_file, p );
+    }
+
+    private final static void removeSingleDescendentsNodes( final Phylogeny species_tree, final boolean verbose ) {
+        final int o = PhylogenyMethods.countNumberOfOneDescendantNodes( species_tree );
+        if ( o > 0 ) {
+            if ( verbose ) {
+                System.out.println( "warning: species tree has " + o
+                        + " internal nodes with only one descendent which are therefore going to be removed" );
+            }
+            PhylogenyMethods.deleteInternalNodesWithOnlyOneDescendent( species_tree );
+        }
+    }
+
     public enum REROOTING {
         NONE, BY_ALGORITHM, MIDPOINT, OUTGROUP;
     }