X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Frio%2FRIO.java;h=d844d2b709aaac20c6e65f790add9b407d6c16e5;hb=5309828c338cdf84ef10f70dc2472cf27016a75b;hp=30f2fffa709ac24941894067f948f68a0b3133f4;hpb=89f36e42fb462f8d7b59def3ead995fb16a87b59;p=jalview.git diff --git a/forester/java/src/org/forester/rio/RIO.java b/forester/java/src/org/forester/rio/RIO.java index 30f2fff..d844d2b 100644 --- a/forester/java/src/org/forester/rio/RIO.java +++ b/forester/java/src/org/forester/rio/RIO.java @@ -41,7 +41,9 @@ import java.util.TreeSet; import org.forester.datastructures.IntMatrix; import org.forester.io.parsers.PhylogenyParser; +import org.forester.io.parsers.nexus.NexusPhylogeniesParser; import org.forester.io.parsers.nhx.NHXParser; +import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION; import org.forester.io.parsers.util.ParserUtils; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyMethods; @@ -53,6 +55,7 @@ import org.forester.sdi.GSDI; import org.forester.sdi.GSDIR; import org.forester.sdi.SDIException; import org.forester.sdi.SDIR; +import org.forester.sdi.SDIutil; import org.forester.sdi.SDIutil.ALGORITHM; import org.forester.sdi.SDIutil.TaxonomyComparisonBase; import org.forester.util.BasicDescriptiveStatistics; @@ -60,9 +63,11 @@ import org.forester.util.ForesterUtil; public final class RIO { + public static final int DEFAULT_RANGE = -1; private Phylogeny[] _analyzed_gene_trees; private List _removed_gene_tree_nodes; private int _ext_nodes; + private int _int_nodes; private TaxonomyComparisonBase _gsdir_tax_comp_base; private final StringBuilder _log; private final BasicDescriptiveStatistics _duplications_stats; @@ -75,21 +80,23 @@ public final class RIO { final ALGORITHM algorithm, final REROOTING rerooting, final String outgroup, - final int first, - final int last, + int first, + int last, final boolean produce_log, final boolean verbose ) throws IOException, SDIException, RIOException { - if ( !ForesterUtil.isEmpty( outgroup ) && ( rerooting != REROOTING.OUTGROUP ) ) { - throw new IllegalArgumentException( "can only use outgroup when re-rooting by outgroup" ); + if ( ( last == DEFAULT_RANGE ) && ( first >= 0 ) ) { + last = gene_trees.length - 1; } - if ( !( ( last == -1 ) && ( first == -1 ) ) - && ( ( last < first ) || ( last >= gene_trees.length ) || ( first >= gene_trees.length ) || ( last < 0 ) || ( first < 0 ) ) ) { - throw new IllegalArgumentException( "gene tree range is out of range: " + first + "-" + last ); + else if ( ( first == DEFAULT_RANGE ) && ( last >= 0 ) ) { + first = 0; } + removeSingleDescendentsNodes( species_tree, verbose ); + checkPreconditions( gene_trees, species_tree, rerooting, outgroup, first, last ); _produce_log = produce_log; _verbose = verbose; _rerooting = rerooting; _ext_nodes = -1; + _int_nodes = -1; _log = new StringBuilder(); _gsdir_tax_comp_base = null; _analyzed_gene_trees = null; @@ -116,6 +123,16 @@ public final class RIO { return _ext_nodes; } + /** + * Returns the numbers of number of int nodes in gene trees analyzed (after + * stripping). + * + * @return number of int nodes in gene trees analyzed (after stripping) + */ + public final int getIntNodesOfAnalyzedGeneTrees() { + return _int_nodes; + } + public final TaxonomyComparisonBase getGSDIRtaxCompBase() { return _gsdir_tax_comp_base; } @@ -142,15 +159,9 @@ public final class RIO { throw new RIOException( "failed to establish species based mapping between gene and species trees" ); } } - if ( log() ) { - preLog( gene_trees, species_tree, algorithm, outgroup ); - } final Phylogeny[] my_gene_trees; if ( ( first >= 0 ) && ( last >= first ) && ( last < gene_trees.length ) ) { - if ( log() ) { - log( "Gene tree range: " + first + "-" + last ); - } - my_gene_trees = new Phylogeny[ 1 + last - first ]; + my_gene_trees = new Phylogeny[ ( 1 + last ) - first ]; int c = 0; for( int i = first; i <= last; ++i ) { my_gene_trees[ c++ ] = gene_trees[ i ]; @@ -159,14 +170,17 @@ public final class RIO { else { my_gene_trees = gene_trees; } - if ( _verbose && ( my_gene_trees.length > 10 ) ) { + if ( log() ) { + preLog( gene_trees, species_tree, algorithm, outgroup, first, last ); + } + if ( _verbose && ( my_gene_trees.length >= 4 ) ) { System.out.println(); } _analyzed_gene_trees = new Phylogeny[ my_gene_trees.length ]; int gene_tree_ext_nodes = 0; for( int i = 0; i < my_gene_trees.length; ++i ) { final Phylogeny gt = my_gene_trees[ i ]; - if ( _verbose && ( my_gene_trees.length > 10 ) ) { + if ( _verbose && ( my_gene_trees.length > 4 ) ) { ForesterUtil.updateProgress( ( ( double ) i ) / my_gene_trees.length ); } if ( i == 0 ) { @@ -189,7 +203,7 @@ public final class RIO { if ( log() ) { postLog( species_tree ); } - if ( _verbose && ( my_gene_trees.length > 10 ) ) { + if ( _verbose && ( my_gene_trees.length > 4 ) ) { System.out.println(); System.out.println(); } @@ -251,6 +265,7 @@ public final class RIO { } if ( i == 0 ) { _ext_nodes = assigned_tree.getNumberOfExternalNodes(); + _int_nodes = assigned_tree.getNumberOfInternalNodes(); } else if ( _ext_nodes != assigned_tree.getNumberOfExternalNodes() ) { throw new RIOException( "after stripping gene tree #" + ( i + 1 ) @@ -292,13 +307,7 @@ public final class RIO { PhylogenyMethods.midpointRoot( gene_tree ); } else if ( _rerooting == REROOTING.OUTGROUP ) { - PhylogenyNode n; - try { - n = gene_tree.getNode( outgroup ); - } - catch ( final IllegalArgumentException e ) { - throw new RIOException( "failed to perform re-rooting by outgroup: " + e.getLocalizedMessage() ); - } + final PhylogenyNode n = gene_tree.getNode( outgroup ); gene_tree.reRoot( n ); } final GSDI gsdi = new GSDI( gene_tree, species_tree, true, true, true ); @@ -336,19 +345,29 @@ public final class RIO { final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.#" ); log( "Gene trees analyzed : " + _duplications_stats.getN() ); log( "Mean number of duplications : " + df.format( _duplications_stats.arithmeticMean() ) - + " (sd: " + df.format( _duplications_stats.sampleStandardDeviation() ) + ")" ); + + " (sd: " + df.format( _duplications_stats.sampleStandardDeviation() ) + ")" + " (" + + df.format( ( 100.0 * _duplications_stats.arithmeticMean() ) / getIntNodesOfAnalyzedGeneTrees() ) + + "%)" ); if ( _duplications_stats.getN() > 3 ) { - log( "Median number of duplications : " + df.format( _duplications_stats.median() ) ); + log( "Median number of duplications : " + df.format( _duplications_stats.median() ) + + " (" + df.format( ( 100.0 * _duplications_stats.median() ) / getIntNodesOfAnalyzedGeneTrees() ) + + "%)" ); } - log( "Minimum duplications : " + ( int ) _duplications_stats.getMin() ); - log( "Maximum duplications : " + ( int ) _duplications_stats.getMax() ); + log( "Minimum duplications : " + ( int ) _duplications_stats.getMin() + " (" + + df.format( ( 100.0 * _duplications_stats.getMin() ) / getIntNodesOfAnalyzedGeneTrees() ) + "%)" ); + log( "Maximum duplications : " + ( int ) _duplications_stats.getMax() + " (" + + df.format( ( 100.0 * _duplications_stats.getMax() ) / getIntNodesOfAnalyzedGeneTrees() ) + "%)" ); + log( "Gene tree internal nodes : " + getIntNodesOfAnalyzedGeneTrees() ); + log( "Gene tree external nodes : " + getExtNodesOfAnalyzedGeneTrees() ); } private final void preLog( final Phylogeny[] gene_trees, final Phylogeny species_tree, final ALGORITHM algorithm, - final String outgroup ) { - log( "Number of gene tree (total) : " + gene_trees.length ); + final String outgroup, + final int first, + final int last ) { + log( "Number of gene trees (total) : " + gene_trees.length ); log( "Algorithm : " + algorithm ); log( "Species tree external nodes (prior to stripping): " + species_tree.getNumberOfExternalNodes() ); log( "Species tree polytomies (prior to stripping) : " @@ -373,6 +392,9 @@ public final class RIO { } } log( "Re-rooting : " + rs ); + if ( ( first >= 0 ) || ( last >= 0 ) ) { + log( "Gene trees analyzed range : " + first + "-" + last ); + } if ( _rerooting == REROOTING.BY_ALGORITHM ) { writeLogSubHeader(); } @@ -468,27 +490,75 @@ public final class RIO { } public final static RIO executeAnalysis( final File gene_trees_file, - final Phylogeny species_tree, + final File species_tree_file, final ALGORITHM algorithm, final REROOTING rerooting, final String outgroup, + final int first, + final int last, final boolean produce_log, final boolean verbose ) throws IOException, SDIException, RIOException { - final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); - final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true ); - if ( p instanceof NHXParser ) { - final NHXParser nhx = ( NHXParser ) p; - nhx.setReplaceUnderscores( false ); - nhx.setIgnoreQuotes( true ); - nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.YES ); + final Phylogeny[] gene_trees = parseGeneTrees( gene_trees_file ); + if ( gene_trees.length < 1 ) { + throw new RIOException( "\"" + gene_trees_file + "\" is devoid of appropriate gene trees" ); } - final Phylogeny[] gene_trees = factory.create( gene_trees_file, p ); - return new RIO( gene_trees, species_tree, algorithm, rerooting, outgroup, -1, -1, produce_log, verbose ); + final Phylogeny species_tree = SDIutil.parseSpeciesTree( gene_trees[ 0 ], + species_tree_file, + false, + true, + TAXONOMY_EXTRACTION.NO ); + return new RIO( gene_trees, species_tree, algorithm, rerooting, outgroup, first, last, produce_log, verbose ); + } + + public final static RIO executeAnalysis( final File gene_trees_file, + final Phylogeny species_tree, + final ALGORITHM algorithm, + final REROOTING rerooting, + final String outgroup, + final boolean produce_log, + final boolean verbose ) throws IOException, SDIException, RIOException { + return new RIO( parseGeneTrees( gene_trees_file ), + species_tree, + algorithm, + rerooting, + outgroup, + DEFAULT_RANGE, + DEFAULT_RANGE, + produce_log, + verbose ); + } + + public final static RIO executeAnalysis( final File gene_trees_file, + final Phylogeny species_tree, + final ALGORITHM algorithm, + final REROOTING rerooting, + final String outgroup, + final int first, + final int last, + final boolean produce_log, + final boolean verbose ) throws IOException, SDIException, RIOException { + return new RIO( parseGeneTrees( gene_trees_file ), + species_tree, + algorithm, + rerooting, + outgroup, + first, + last, + produce_log, + verbose ); } public final static RIO executeAnalysis( final Phylogeny[] gene_trees, final Phylogeny species_tree ) throws IOException, SDIException, RIOException { - return new RIO( gene_trees, species_tree, ALGORITHM.GSDIR, REROOTING.BY_ALGORITHM, null, -1, -1, false, false ); + return new RIO( gene_trees, + species_tree, + ALGORITHM.GSDIR, + REROOTING.BY_ALGORITHM, + null, + DEFAULT_RANGE, + DEFAULT_RANGE, + false, + false ); } public final static RIO executeAnalysis( final Phylogeny[] gene_trees, @@ -498,7 +568,91 @@ public final class RIO { final String outgroup, final boolean produce_log, final boolean verbose ) throws IOException, SDIException, RIOException { - return new RIO( gene_trees, species_tree, algorithm, rerooting, outgroup, -1, -1, produce_log, verbose ); + return new RIO( gene_trees, + species_tree, + algorithm, + rerooting, + outgroup, + DEFAULT_RANGE, + DEFAULT_RANGE, + produce_log, + verbose ); + } + + public final static RIO executeAnalysis( final Phylogeny[] gene_trees, + final Phylogeny species_tree, + final ALGORITHM algorithm, + final REROOTING rerooting, + final String outgroup, + final int first, + final int last, + final boolean produce_log, + final boolean verbose ) throws IOException, SDIException, RIOException { + return new RIO( gene_trees, species_tree, algorithm, rerooting, outgroup, first, last, produce_log, verbose ); + } + + private final static void checkPreconditions( final Phylogeny[] gene_trees, + final Phylogeny species_tree, + final REROOTING rerooting, + final String outgroup, + final int first, + final int last ) throws RIOException { + if ( !species_tree.isRooted() ) { + throw new RIOException( "species tree is not rooted" ); + } + if ( !( ( last == DEFAULT_RANGE ) && ( first == DEFAULT_RANGE ) ) + && ( ( last < first ) || ( last >= gene_trees.length ) || ( last < 0 ) || ( first < 0 ) ) ) { + throw new RIOException( "attempt to set range (0-based) of gene to analyze to: from " + first + " to " + + last + " (out of " + gene_trees.length + ")" ); + } + if ( ( rerooting == REROOTING.OUTGROUP ) && ForesterUtil.isEmpty( outgroup ) ) { + throw new RIOException( "outgroup not set for midpoint rooting" ); + } + if ( ( rerooting != REROOTING.OUTGROUP ) && !ForesterUtil.isEmpty( outgroup ) ) { + throw new RIOException( "outgroup only used for midpoint rooting" ); + } + if ( ( rerooting == REROOTING.MIDPOINT ) + && ( PhylogenyMethods.calculateMaxDistanceToRoot( gene_trees[ 0 ] ) <= 0 ) ) { + throw new RIOException( "attempt to use midpoint rooting on gene trees which seem to have no (positive) branch lengths (cladograms)" ); + } + if ( rerooting == REROOTING.OUTGROUP ) { + try { + gene_trees[ 0 ].getNode( outgroup ); + } + catch ( final IllegalArgumentException e ) { + throw new RIOException( "cannot perform re-rooting by outgroup: " + e.getLocalizedMessage() ); + } + } + } + + private final static Phylogeny[] parseGeneTrees( final File gene_trees_file ) throws FileNotFoundException, + IOException { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true ); + if ( p instanceof NHXParser ) { + final NHXParser nhx = ( NHXParser ) p; + nhx.setReplaceUnderscores( false ); + nhx.setIgnoreQuotes( true ); + nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.YES ); + } + else if ( p instanceof NexusPhylogeniesParser ) { + final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p; + nex.setReplaceUnderscores( false ); + nex.setIgnoreQuotes( true ); + nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.YES ); + } + return factory.create( gene_trees_file, p ); + } + + private final static void removeSingleDescendentsNodes( final Phylogeny species_tree, final boolean verbose ) { + final int o = PhylogenyMethods.countNumberOfOneDescendantNodes( species_tree ); + if ( o > 0 ) { + if ( verbose ) { + System.out.println( "warning: species tree has " + o + + " internal nodes with only one descendent which are therefore going to be removed" ); + } + PhylogenyMethods.deleteInternalNodesWithOnlyOneDescendent( species_tree ); + } } public enum REROOTING {