X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Frio%2FRIO.java;h=1f4f3e6df9c95fa9db101a2be6627ebf80b2b5ae;hb=06b38f91bc061d8ab1dfea3b6238c94c95a30d26;hp=dea0492091a1c6aad4b51f9661c2279ce177a596;hpb=9a9e40a5768c100cbbb70e32a2637c82890d2d53;p=jalview.git diff --git a/forester/java/src/org/forester/rio/RIO.java b/forester/java/src/org/forester/rio/RIO.java index dea0492..1f4f3e6 100644 --- a/forester/java/src/org/forester/rio/RIO.java +++ b/forester/java/src/org/forester/rio/RIO.java @@ -23,77 +23,138 @@ // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA // // Contact: phylosoft @ gmail . com -// WWW: www.phylosoft.org/forester +// WWW: https://sites.google.com/site/cmzmasek/home/software/forester package org.forester.rio; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; +import java.text.DecimalFormat; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; import org.forester.datastructures.IntMatrix; +import org.forester.io.parsers.IteratingPhylogenyParser; import org.forester.io.parsers.PhylogenyParser; +import org.forester.io.parsers.nexus.NexusPhylogeniesParser; import org.forester.io.parsers.nhx.NHXParser; +import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION; import org.forester.io.parsers.util.ParserUtils; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Taxonomy; import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; import org.forester.phylogeny.factories.PhylogenyFactory; -import org.forester.phylogeny.iterators.PhylogenyNodeIterator; +import org.forester.sdi.GSDI; import org.forester.sdi.GSDIR; -import org.forester.sdi.SDI; import org.forester.sdi.SDIException; import org.forester.sdi.SDIR; +import org.forester.sdi.SDIutil; +import org.forester.sdi.SDIutil.ALGORITHM; +import org.forester.sdi.SDIutil.TaxonomyComparisonBase; +import org.forester.util.BasicDescriptiveStatistics; import org.forester.util.ForesterUtil; public final class RIO { - private final static boolean ROOT_BY_MINIMIZING_SUM_OF_DUPS = true; - private final static boolean ROOT_BY_MINIMIZING_TREE_HEIGHT = true; - private Phylogeny[] _analyzed_gene_trees; - private HashMap> _o_maps; - private HashMap> _so_maps; - private HashMap> _up_maps; - private List _seq_names; - private List _removed_gene_tree_nodes; - private int _samples; - private int _ext_nodes; - - /** - * Default constructor. - * @throws SDIException - * @throws IOException - * @throws RIOException - */ - public RIO( final File gene_trees_file, - final Phylogeny species_tree, - final String query, - final SDI.ALGORITHM algorithm ) throws IOException, SDIException, RIOException { - if ( ForesterUtil.isEmpty( query ) ) { - throw new IllegalArgumentException( "query is empty" ); - } - init(); - inferOrthologs( gene_trees_file, species_tree, query, algorithm ); + public static final int DEFAULT_RANGE = -1; + private static final int END_OF_GT = Integer.MAX_VALUE; + private static IntMatrix _m; + private Phylogeny[] _analyzed_gene_trees; + private List _removed_gene_tree_nodes; + private int _ext_nodes; + private int _int_nodes; + private TaxonomyComparisonBase _gsdir_tax_comp_base; + private final StringBuilder _log; + private final BasicDescriptiveStatistics _duplications_stats; + private final boolean _produce_log; + private final boolean _verbose; + private final REROOTING _rerooting; + private final Phylogeny _species_tree; + private Phylogeny _min_dub_gene_tree; + + private RIO( final IteratingPhylogenyParser p, + final Phylogeny species_tree, + final ALGORITHM algorithm, + final REROOTING rerooting, + final String outgroup, + int first, + int last, + final boolean produce_log, + final boolean verbose, + final boolean transfer_taxonomy ) throws IOException, SDIException, RIOException { + if ( ( last == DEFAULT_RANGE ) && ( first >= 0 ) ) { + last = END_OF_GT; + } + else if ( ( first == DEFAULT_RANGE ) && ( last >= 0 ) ) { + first = 0; + } + removeSingleDescendentsNodes( species_tree, verbose ); + p.reset(); + checkPreconditions( p, species_tree, rerooting, outgroup, first, last ); + _produce_log = produce_log; + _verbose = verbose; + _rerooting = rerooting; + _ext_nodes = -1; + _int_nodes = -1; + _log = new StringBuilder(); + _gsdir_tax_comp_base = null; + _analyzed_gene_trees = null; + _removed_gene_tree_nodes = null; + _duplications_stats = new BasicDescriptiveStatistics(); + p.reset(); + inferOrthologs( p, species_tree, algorithm, outgroup, first, last, transfer_taxonomy ); + _species_tree = species_tree; } - public RIO( final File gene_trees_file, final Phylogeny species_tree, final SDI.ALGORITHM algorithm ) - throws IOException, SDIException, RIOException { - init(); - inferOrthologs( gene_trees_file, species_tree, null, algorithm ); + private RIO( final Phylogeny[] gene_trees, + final Phylogeny species_tree, + final ALGORITHM algorithm, + final REROOTING rerooting, + final String outgroup, + int first, + int last, + final boolean produce_log, + final boolean verbose, + final boolean transfer_taxonomy ) throws IOException, SDIException, RIOException { + if ( ( last == DEFAULT_RANGE ) && ( first >= 0 ) ) { + last = gene_trees.length - 1; + } + else if ( ( first == DEFAULT_RANGE ) && ( last >= 0 ) ) { + first = 0; + } + removeSingleDescendentsNodes( species_tree, verbose ); + checkPreconditions( gene_trees, species_tree, rerooting, outgroup, first, last ); + _produce_log = produce_log; + _verbose = verbose; + _rerooting = rerooting; + _ext_nodes = -1; + _int_nodes = -1; + _log = new StringBuilder(); + _gsdir_tax_comp_base = null; + _analyzed_gene_trees = null; + _removed_gene_tree_nodes = null; + _duplications_stats = new BasicDescriptiveStatistics(); + inferOrthologs( gene_trees, species_tree, algorithm, outgroup, first, last, transfer_taxonomy ); + _species_tree = species_tree; } public final Phylogeny[] getAnalyzedGeneTrees() { return _analyzed_gene_trees; } + public final BasicDescriptiveStatistics getDuplicationsStatistics() { + return _duplications_stats; + } + /** * Returns the numbers of number of ext nodes in gene trees analyzed (after * stripping). @@ -104,460 +165,396 @@ public final class RIO { return _ext_nodes; } + public final TaxonomyComparisonBase getGSDIRtaxCompBase() { + return _gsdir_tax_comp_base; + } + /** - * Returns a HashMap containing the inferred "ultra paralogs" of the - * external gene tree node with the sequence name seq_name. Sequence names - * are the keys (String), numbers of observations are the values (Int). - * "ultra paralogs" are to be inferred by method "inferOrthologs". Throws an - * exception if seq_name is not found. + * Returns the numbers of number of int nodes in gene trees analyzed (after + * stripping). * - * @param seq_name - * sequence name of a external node of the gene trees - * @return HashMap containing the inferred ultra paralogs - * (name(String)->value(Int)) + * @return number of int nodes in gene trees analyzed (after stripping) */ - public final HashMap getInferredUltraParalogs( final String seq_name ) { - if ( _up_maps == null ) { - return null; - } - return _up_maps.get( seq_name ); + public final int getIntNodesOfAnalyzedGeneTrees() { + return _int_nodes; } - public final int getNumberOfSamples() { - return _samples; + public final StringBuilder getLog() { + return _log; } - /** - * Returns a String containg the names of orthologs of the PhylogenyNode - * with seq name query_name. The String also contains how many times a - * particular ortholog has been observed. - *

- *

    - * The output order is (per line): Name, Ortholog, Subtree neighbor, Super - * ortholog, Distance - *
- *

- * The sort priority of this is determined by sort in the following manner: - *

    - *
  • 0 : Ortholog - *
  • 1 : Ortholog, Super ortholog - *
  • 2 : Super ortholog, Ortholog - *
- *

- * Returns "-" if no putative orthologs have been found (given - * threshold_orthologs). - *

- * Orthologs are to be inferred by method "inferOrthologs". - *

- * (Last modified: 05/08/01) - * - * @param query_name - * sequence name of a external node of the gene trees - * @param sort - * order and sort priority - * @param threshold_orthologs - * the minimal number of observations for a a sequence to be - * reported as orthologous, in percents (0.0-100.0%) - * @param threshold_subtreeneighborings - * the minimal number of observations for a a sequence to be - * reported as orthologous, in percents (0.0-100.0%) - * @return String containing the inferred orthologs, String containing "-" - * if no orthologs have been found null in case of error - */ - public final StringBuffer inferredOrthologsToString( final String query_name, int sort, double threshold_orthologs ) { - HashMap o_hashmap = null; - HashMap s_hashmap = null; - String name = ""; - double o = 0.0; // Orthologs. - double s = 0.0; // Super orthologs. - double value1 = 0.0; - double value2 = 0.0; - final ArrayList nv = new ArrayList(); - if ( ( _o_maps == null ) || ( _so_maps == null ) ) { - throw new RuntimeException( "orthologs have not been calculated (successfully)" ); - } - if ( ( sort < 0 ) || ( sort > 2 ) ) { - sort = 1; - } - if ( threshold_orthologs < 0.0 ) { - threshold_orthologs = 0.0; - } - else if ( threshold_orthologs > 100.0 ) { - threshold_orthologs = 100.0; - } - o_hashmap = getInferredOrthologs( query_name ); - s_hashmap = getInferredSuperOrthologs( query_name ); - if ( ( o_hashmap == null ) || ( s_hashmap == null ) ) { - throw new RuntimeException( "Orthologs for " + query_name + " were not established" ); - } - final StringBuffer orthologs = new StringBuffer(); - if ( _seq_names.size() > 0 ) { - I: for( int i = 0; i < _seq_names.size(); ++i ) { - name = _seq_names.get( i ); - if ( name.equals( query_name ) ) { - continue I; - } - o = getBootstrapValueFromHash( o_hashmap, name ); - if ( o < threshold_orthologs ) { - continue I; - } - s = getBootstrapValueFromHash( s_hashmap, name ); - switch ( sort ) { - case 0: - nv.add( new ResultLine( name, o, 5 ) ); - break; - case 1: - nv.add( new ResultLine( name, o, s, 5 ) ); - break; - case 2: - nv.add( new ResultLine( name, s, o, 5 ) ); - break; - default: - nv.add( new ResultLine( name, o, 5 ) ); - } - } // End of I for loop. - if ( ( nv != null ) && ( nv.size() > 0 ) ) { - orthologs.append( "seq name\t\tortho\ts-ortho" + ForesterUtil.LINE_SEPARATOR ); - final ResultLine[] nv_array = new ResultLine[ nv.size() ]; - for( int j = 0; j < nv.size(); ++j ) { - nv_array[ j ] = nv.get( j ); - } - Arrays.sort( nv_array ); - for( final ResultLine element : nv_array ) { - name = element.getKey(); - value1 = element.getValue1(); - value2 = element.getValue2(); - orthologs.append( addNameAndValues( name, value1, value2, sort ) ); - } - } - } - // No orthologs found. - if ( ( orthologs == null ) || ( orthologs.length() < 1 ) ) { - orthologs.append( "-" ); - } - return orthologs; + final public Phylogeny getMinDuplicationsGeneTree() { + return _min_dub_gene_tree; } - /** - * Returns a String containg the names of orthologs of the PhylogenyNode - * with seq name query_name. The String also contains how many times a - * particular ortholog has been observed. Returns "-" if no putative - * orthologs have been found (given threshold_orthologs). - *

- * Orthologs are to be inferred by method "inferOrthologs". - * - * @param query_name - * sequence name of a external node of the gene trees - * @param return_dists - * @param threshold_ultra_paralogs - * between 1 and 100 - * @return String containing the inferred orthologs, String containing "-" - * if no orthologs have been found null in case of error - */ - public final String inferredUltraParalogsToString( final String query_name, double threshold_ultra_paralogs ) { - HashMap sp_hashmap = null; - String name = "", ultra_paralogs = ""; - int sort = 0; - double sp = 0.0; - double value1 = 0.0; - double value2 = 0.0; - final List nv = new ArrayList(); - if ( threshold_ultra_paralogs < 1.0 ) { - threshold_ultra_paralogs = 1.0; - } - else if ( threshold_ultra_paralogs > 100.0 ) { - threshold_ultra_paralogs = 100.0; - } - if ( _up_maps == null ) { - throw new RuntimeException( "Ultra paralogs have not been calculated (successfully)." ); - } - sp_hashmap = getInferredUltraParalogs( query_name ); - if ( sp_hashmap == null ) { - throw new RuntimeException( "Ultra paralogs for " + query_name + " were not established" ); - } - if ( _seq_names.size() > 0 ) { - I: for( int i = 0; i < _seq_names.size(); ++i ) { - name = _seq_names.get( i ); - if ( name.equals( query_name ) ) { - continue I; + public final IntMatrix getOrthologTable() { + return _m; + } + + public final List getRemovedGeneTreeNodes() { + return _removed_gene_tree_nodes; + } + + public final Phylogeny getSpeciesTree() { + return _species_tree; + } + + private final void inferOrthologs( final IteratingPhylogenyParser parser, + final Phylogeny species_tree, + final ALGORITHM algorithm, + final String outgroup, + int first, + final int last, + final boolean transfer_taxonomy ) throws SDIException, RIOException, + FileNotFoundException, IOException { + if ( !parser.hasNext() ) { + throw new RIOException( "no gene trees to analyze" ); + } + if ( log() ) { + preLog( -1, species_tree, algorithm, outgroup ); + } + if ( _verbose ) { + System.out.println(); + } + final DecimalFormat pf = new java.text.DecimalFormat( "000" ); + int gene_tree_ext_nodes = 0; + int i = 0; + int counter = 0; + final boolean no_range = ( first < 0 ) || ( last < first ); + while ( parser.hasNext() ) { + final Phylogeny gt = parser.next(); + if ( no_range || ( ( i >= first ) && ( i <= last ) ) ) { + if ( gt.isEmpty() ) { + throw new RIOException( "gene tree #" + i + " is empty" ); + } + if ( gt.getNumberOfExternalNodes() == 1 ) { + throw new RIOException( "gene tree #" + i + " has only one external node" ); } - sp = getBootstrapValueFromHash( sp_hashmap, name ); - if ( sp < threshold_ultra_paralogs ) { - continue I; + if ( _verbose ) { + ForesterUtil.updateProgress( i, pf ); } - nv.add( new ResultLine( name, sp, 5 ) ); - } // End of I for loop. - if ( ( nv != null ) && ( nv.size() > 0 ) ) { - final ResultLine[] nv_array = new ResultLine[ nv.size() ]; - for( int j = 0; j < nv.size(); ++j ) { - nv_array[ j ] = nv.get( j ); + if ( counter == 0 ) { + if ( algorithm == ALGORITHM.SDIR ) { + // Removes from species_tree all species not found in gene_tree. + PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gt, species_tree ); + if ( species_tree.isEmpty() ) { + throw new RIOException( "failed to establish species based mapping between gene and species trees" ); + } + } + gene_tree_ext_nodes = gt.getNumberOfExternalNodes(); + } + else if ( gene_tree_ext_nodes != gt.getNumberOfExternalNodes() ) { + throw new RIOException( "gene tree #" + i + " has a different number of external nodes (" + + gt.getNumberOfExternalNodes() + ") than the preceding gene tree(s) (" + + gene_tree_ext_nodes + ")" ); } - Arrays.sort( nv_array ); - sort = 90; - for( final ResultLine element : nv_array ) { - name = element.getKey(); - value1 = element.getValue1(); - value2 = element.getValue2(); - ultra_paralogs += addNameAndValues( name, value1, value2, sort ); + if ( algorithm == ALGORITHM.SDIR ) { + // Removes from gene_tree all species not found in species_tree. + PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt ); + if ( gt.isEmpty() ) { + throw new RIOException( "failed to establish species based mapping between gene and species trees" ); + } } + final Phylogeny analyzed_gt = performOrthologInference( gt, + species_tree, + algorithm, + outgroup, + counter, + transfer_taxonomy ); + RIO.calculateOrthologTable( analyzed_gt, true, counter ); + ++counter; } + ++i; } - // No ultra paralogs found. - if ( ( ultra_paralogs == null ) || ( ultra_paralogs.length() < 1 ) ) { - ultra_paralogs = "-"; + if ( ( first >= 0 ) && ( counter == 0 ) && ( i > 0 ) ) { + throw new RIOException( "attempt to analyze first gene tree #" + first + " in a set of " + i ); } - return ultra_paralogs; - } - - // Helper method for inferredOrthologsToString. - // inferredOrthologsToArrayList, - // and inferredUltraParalogsToString. - private final double getBootstrapValueFromHash( final HashMap h, final String name ) { - if ( !h.containsKey( name ) ) { - return 0.0; + if ( no_range ) { + first = 0; } - final int i = h.get( name ); - return ( ( i * 100.0 ) / getNumberOfSamples() ); - } - - /** - * Returns a HashMap containing the inferred orthologs of the external gene - * tree node with the sequence name seq_name. Sequence names are the keys - * (String), numbers of observations are the values (Int). Orthologs are to - * be inferred by method "inferOrthologs". Throws an exception if seq_name - * is not found. - * - * @param seq_name - * sequence name of a external node of the gene trees - * @return HashMap containing the inferred orthologs - * (name(String)->value(Int)) - */ - private final HashMap getInferredOrthologs( final String seq_name ) { - if ( _o_maps == null ) { - return null; + if ( log() ) { + postLog( species_tree, first, ( first + counter ) - 1 ); } - return _o_maps.get( seq_name ); - } - - /** - * Returns a HashMap containing the inferred "super orthologs" of the - * external gene tree node with the sequence name seq_name. Sequence names - * are the keys (String), numbers of observations are the values (Int). - * Super orthologs are to be inferred by method "inferOrthologs". Throws an - * exception if seq_name is not found. - * - * @param seq_name - * sequence name of a external node of the gene trees - * @return HashMap containing the inferred super orthologs - * (name(String)->value(Int)) - */ - private final HashMap getInferredSuperOrthologs( final String seq_name ) { - if ( _so_maps == null ) { - return null; + if ( _verbose ) { + System.out.println(); + System.out.println(); } - return _so_maps.get( seq_name ); } - /** - * Infers the orthologs (as well the "super orthologs", the "subtree - * neighbors", and the "ultra paralogs") for each external node of the gene - * Trees in multiple tree File gene_trees_file (=output of PHYLIP NEIGHBOR, - * for example). Tallies how many times each sequence is (super-) - * orthologous towards the query. Tallies how many times each sequence is - * ultra paralogous towards the query. Tallies how many times each sequence - * is a subtree neighbor of the query. Gene duplications are inferred using - * SDI. Modifies its argument species_tree. Is a little faster than - * "inferOrthologs(File,Phylogeny)" since orthologs are only inferred for - * query. - *

- * To obtain the results use the methods listed below. - * - * @param gene_trees_file - * a File containing gene Trees in NH format, which is the result - * of performing a bootstrap analysis in PHYLIP - * @param species_tree - * a species Phylogeny, which has species names in its species - * fields - * @param query - * the sequence name of the squence whose orthologs are to be - * inferred - * @throws SDIException - * @throws RIOException - * @throws IOException - * @throws FileNotFoundException - */ - private final void inferOrthologs( final File gene_trees_file, + private final void inferOrthologs( final Phylogeny[] gene_trees, final Phylogeny species_tree, - final String query, - final SDI.ALGORITHM algorithm ) throws SDIException, RIOException, + final ALGORITHM algorithm, + final String outgroup, + final int first, + final int last, + final boolean transfer_taxonomy ) throws SDIException, RIOException, FileNotFoundException, IOException { - // Read in first tree to get its sequence names - // and strip species_tree. - final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); - final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true ); - if ( p instanceof NHXParser ) { - final NHXParser nhx = ( NHXParser ) p; - nhx.setReplaceUnderscores( false ); - nhx.setIgnoreQuotes( true ); - nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.YES ); - } - final Phylogeny[] gene_trees = factory.create( gene_trees_file, p ); - // Removes from species_tree all species not found in gene_tree. - final List _removed_gene_tree_nodes = PhylogenyMethods - .taxonomyBasedDeletionOfExternalNodes( gene_trees[ 0 ], species_tree ); - if ( species_tree.isEmpty() ) { - throw new RIOException( "failed to establish species based mapping between gene and species trees" ); - } - if ( !ForesterUtil.isEmpty( query ) ) { - PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gene_trees[ 0 ] ); - if ( gene_trees[ 0 ].isEmpty() ) { + if ( algorithm == ALGORITHM.SDIR ) { + // Removes from species_tree all species not found in gene_tree. + PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gene_trees[ 0 ], species_tree ); + if ( species_tree.isEmpty() ) { throw new RIOException( "failed to establish species based mapping between gene and species trees" ); } - _seq_names = getAllExternalSequenceNames( gene_trees[ 0 ] ); - if ( ( _seq_names == null ) || ( _seq_names.size() < 1 ) ) { - throw new RIOException( "could not get sequence names" ); + } + final Phylogeny[] my_gene_trees; + if ( ( first >= 0 ) && ( last >= first ) && ( last < gene_trees.length ) ) { + my_gene_trees = new Phylogeny[ ( 1 + last ) - first ]; + int c = 0; + for( int i = first; i <= last; ++i ) { + my_gene_trees[ c++ ] = gene_trees[ i ]; } - _o_maps = new HashMap>(); - _so_maps = new HashMap>(); - _up_maps = new HashMap>(); - _o_maps.put( query, new HashMap( _seq_names.size() ) ); - _so_maps.put( query, new HashMap( _seq_names.size() ) ); - _up_maps.put( query, new HashMap( _seq_names.size() ) ); } - _analyzed_gene_trees = new Phylogeny[ gene_trees.length ]; - int c = 0; + else { + my_gene_trees = gene_trees; + } + if ( log() ) { + preLog( gene_trees.length, species_tree, algorithm, outgroup ); + } + if ( _verbose && ( my_gene_trees.length >= 4 ) ) { + System.out.println(); + } + _analyzed_gene_trees = new Phylogeny[ my_gene_trees.length ]; int gene_tree_ext_nodes = 0; - for( final Phylogeny gt : gene_trees ) { - // Removes from gene_tree all species not found in species_tree. - PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt ); + for( int i = 0; i < my_gene_trees.length; ++i ) { + final Phylogeny gt = my_gene_trees[ i ]; if ( gt.isEmpty() ) { - throw new RIOException( "failed to establish species based mapping between gene and species trees" ); + throw new RIOException( "gene tree #" + i + " is empty" ); + } + if ( gt.getNumberOfExternalNodes() == 1 ) { + throw new RIOException( "gene tree #" + i + " has only one external node" ); } - if ( c == 0 ) { + if ( _verbose && ( my_gene_trees.length > 4 ) ) { + ForesterUtil.updateProgress( ( ( double ) i ) / my_gene_trees.length ); + } + if ( i == 0 ) { gene_tree_ext_nodes = gt.getNumberOfExternalNodes(); } else if ( gene_tree_ext_nodes != gt.getNumberOfExternalNodes() ) { - throw new RIOException( "(cleaned up) gene tree #" + ( c + 1 ) - + " has a different number of external nodes (" + gt.getNumberOfExternalNodes() - + ") than those gene trees preceding it (" + gene_tree_ext_nodes + ")" ); + throw new RIOException( "gene tree #" + i + " has a different number of external nodes (" + + gt.getNumberOfExternalNodes() + ") than the preceding gene tree(s) (" + gene_tree_ext_nodes + + ")" ); + } + if ( algorithm == ALGORITHM.SDIR ) { + // Removes from gene_tree all species not found in species_tree. + PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt ); + if ( gt.isEmpty() ) { + throw new RIOException( "failed to establish species based mapping between gene and species trees" ); + } } - _analyzed_gene_trees[ c++ ] = performOrthologInference( gt, species_tree, query, algorithm ); + _analyzed_gene_trees[ i ] = performOrthologInference( gt, + species_tree, + algorithm, + outgroup, + i, + transfer_taxonomy ); } - setNumberOfSamples( gene_trees.length ); + if ( log() ) { + postLog( species_tree, first, last ); + } + if ( _verbose && ( my_gene_trees.length > 4 ) ) { + System.out.println(); + System.out.println(); + } + } + + private final boolean log() { + return _produce_log; + } + + private final void log( final String s ) { + _log.append( s ); + _log.append( ForesterUtil.LINE_SEPARATOR ); + } + + private final void logRemovedGeneTreeNodes() { + log( "Species stripped from gene trees:" ); + final SortedSet rn = new TreeSet(); + for( final PhylogenyNode n : getRemovedGeneTreeNodes() ) { + final Taxonomy t = n.getNodeData().getTaxonomy(); + switch ( getGSDIRtaxCompBase() ) { + case CODE: { + rn.add( t.getTaxonomyCode() ); + break; + } + case ID: { + rn.add( t.getIdentifier().toString() ); + break; + } + case SCIENTIFIC_NAME: { + rn.add( t.getScientificName() ); + break; + } + } + } + for( final String s : rn ) { + log( s ); + } + log( "" ); } private final Phylogeny performOrthologInference( final Phylogeny gene_tree, final Phylogeny species_tree, - final String query, - final SDI.ALGORITHM algorithm ) throws SDIException, RIOException { + final ALGORITHM algorithm, + final String outgroup, + final int i, + final boolean transfer_taxonomy ) throws SDIException, + RIOException { final Phylogeny assigned_tree; switch ( algorithm ) { case SDIR: { - final SDIR sdir = new SDIR(); - assigned_tree = sdir.infer( gene_tree, - species_tree, - false, - RIO.ROOT_BY_MINIMIZING_SUM_OF_DUPS, - RIO.ROOT_BY_MINIMIZING_TREE_HEIGHT, - true, - 1 )[ 0 ]; + assigned_tree = performOrthologInferenceBySDI( gene_tree, species_tree ); break; } case GSDIR: { - final GSDIR gsdir = new GSDIR( gene_tree, species_tree, true, 1 ); - assigned_tree = gsdir.getMinDuplicationsSumGeneTrees().get( 1 ); + assigned_tree = performOrthologInferenceByGSDI( gene_tree, species_tree, outgroup, i, transfer_taxonomy ); break; } default: { throw new IllegalArgumentException( "illegal algorithm: " + algorithm ); } } - setExtNodesOfAnalyzedGeneTrees( assigned_tree.getNumberOfExternalNodes() ); - if ( !ForesterUtil.isEmpty( query ) ) { - final List nodes = getNodesViaSequenceName( assigned_tree, query ); - if ( nodes.size() > 1 ) { - throw new RIOException( "node named [" + query + "] not unique" ); - } - else if ( nodes.isEmpty() ) { - throw new RIOException( "no node containing a sequence named [" + query + "] found" ); - } - final PhylogenyNode query_node = nodes.get( 0 ); - updateCounts( _o_maps, query, PhylogenyMethods.getOrthologousNodes( assigned_tree, query_node ) ); - updateCounts( _so_maps, query, PhylogenyMethods.getSuperOrthologousNodes( query_node ) ); - updateCounts( _up_maps, query, PhylogenyMethods.getUltraParalogousNodes( query_node ) ); + if ( i == 0 ) { + _ext_nodes = assigned_tree.getNumberOfExternalNodes(); + _int_nodes = assigned_tree.getNumberOfInternalNodes(); + } + else if ( _ext_nodes != assigned_tree.getNumberOfExternalNodes() ) { + throw new RIOException( "after stripping gene tree #" + i + " has a different number of external nodes (" + + assigned_tree.getNumberOfExternalNodes() + ") than the preceding gene tree(s) (" + _ext_nodes + + ")" ); } return assigned_tree; } - private final void init() { - _o_maps = null; - _so_maps = null; - _up_maps = null; - _seq_names = null; - _samples = 1; - _ext_nodes = 0; + private final Phylogeny performOrthologInferenceByGSDI( final Phylogeny gene_tree, + final Phylogeny species_tree, + final String outgroup, + final int i, + final boolean transfer_taxonomy ) throws SDIException, + RIOException { + final Phylogeny assigned_tree; + final int dups; + if ( _rerooting == REROOTING.BY_ALGORITHM ) { + final GSDIR gsdir = new GSDIR( gene_tree, species_tree, true, i == 0, transfer_taxonomy ); + assigned_tree = gsdir.getMinDuplicationsSumGeneTree(); + if ( i == 0 ) { + _removed_gene_tree_nodes = gsdir.getStrippedExternalGeneTreeNodes(); + for( final PhylogenyNode r : _removed_gene_tree_nodes ) { + if ( !r.getNodeData().isHasTaxonomy() ) { + throw new RIOException( "node with no (appropriate) taxonomic information found in gene tree #" + + i + ": " + r.toString() ); + } + } + } + if ( i == 0 ) { + _gsdir_tax_comp_base = gsdir.getTaxCompBase(); + } + dups = gsdir.getMinDuplicationsSum(); + } + else { + if ( _rerooting == REROOTING.MIDPOINT ) { + PhylogenyMethods.midpointRoot( gene_tree ); + } + else if ( _rerooting == REROOTING.OUTGROUP ) { + final PhylogenyNode n = gene_tree.getNode( outgroup ); + gene_tree.reRoot( n ); + } + final GSDI gsdi = new GSDI( gene_tree, species_tree, true, true, true, transfer_taxonomy ); + _removed_gene_tree_nodes = gsdi.getStrippedExternalGeneTreeNodes(); + for( final PhylogenyNode r : _removed_gene_tree_nodes ) { + if ( !r.getNodeData().isHasTaxonomy() ) { + throw new RIOException( "node with no (appropriate) taxonomic information found in gene tree #" + i + + ": " + r.toString() ); + } + } + assigned_tree = gene_tree; + if ( i == 0 ) { + _gsdir_tax_comp_base = gsdi.getTaxCompBase(); + } + dups = gsdi.getDuplicationsSum(); + } + if ( ( i == 0 ) || ( dups < _duplications_stats.getMin() ) ) { + _min_dub_gene_tree = assigned_tree; + } + _duplications_stats.addValue( dups ); + return assigned_tree; } - private final void setExtNodesOfAnalyzedGeneTrees( final int i ) { - _ext_nodes = i; + private final Phylogeny performOrthologInferenceBySDI( final Phylogeny gene_tree, final Phylogeny species_tree ) + throws SDIException { + final SDIR sdir = new SDIR(); + return sdir.infer( gene_tree, species_tree, false, true, true, true, 1 )[ 0 ]; } - private final void setNumberOfSamples( int i ) { - if ( i < 1 ) { - i = 1; - } - _samples = i; + private final void postLog( final Phylogeny species_tree, final int first, final int last ) { + log( "" ); + if ( ( getRemovedGeneTreeNodes() != null ) && ( getRemovedGeneTreeNodes().size() > 0 ) ) { + logRemovedGeneTreeNodes(); + } + log( "Species tree external nodes (after stripping) : " + species_tree.getNumberOfExternalNodes() ); + log( "Species tree polytomies (after stripping) : " + + PhylogenyMethods.countNumberOfPolytomies( species_tree ) ); + log( "Taxonomy linking based on : " + getGSDIRtaxCompBase() ); + final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.#" ); + if ( ( first >= 0 ) && ( last >= 0 ) ) { + log( "Gene trees analyzed range : " + first + "-" + last ); + } + log( "Gene trees analyzed : " + _duplications_stats.getN() ); + log( "Mean number of duplications : " + df.format( _duplications_stats.arithmeticMean() ) + + " (sd: " + df.format( _duplications_stats.sampleStandardDeviation() ) + ")" + " (" + + df.format( ( 100.0 * _duplications_stats.arithmeticMean() ) / getIntNodesOfAnalyzedGeneTrees() ) + + "%)" ); + if ( _duplications_stats.getN() > 3 ) { + log( "Median number of duplications : " + df.format( _duplications_stats.median() ) + + " (" + df.format( ( 100.0 * _duplications_stats.median() ) / getIntNodesOfAnalyzedGeneTrees() ) + + "%)" ); + } + log( "Minimum duplications : " + ( int ) _duplications_stats.getMin() + " (" + + df.format( ( 100.0 * _duplications_stats.getMin() ) / getIntNodesOfAnalyzedGeneTrees() ) + "%)" ); + log( "Maximum duplications : " + ( int ) _duplications_stats.getMax() + " (" + + df.format( ( 100.0 * _duplications_stats.getMax() ) / getIntNodesOfAnalyzedGeneTrees() ) + "%)" ); + log( "Gene tree internal nodes : " + getIntNodesOfAnalyzedGeneTrees() ); + log( "Gene tree external nodes : " + getExtNodesOfAnalyzedGeneTrees() ); } - // Helper for doInferOrthologs( Phylogeny, Phylogeny, String ) - // and doInferOrthologs( Phylogeny, Phylogeny ). - private final void updateCounts( final HashMap> counter_map, - final String query_seq_name, - final List nodes ) { - final HashMap hash_map = counter_map.get( query_seq_name ); - if ( hash_map == null ) { - throw new RuntimeException( "unexpected error in updateCounts" ); - } - for( int j = 0; j < nodes.size(); ++j ) { - String seq_name; - if ( ( nodes.get( j ) ).getNodeData().isHasSequence() - && !ForesterUtil.isEmpty( ( nodes.get( j ) ).getNodeData().getSequence().getName() ) ) { - seq_name = ( nodes.get( j ) ).getNodeData().getSequence().getName(); + private final void preLog( final int gene_trees, + final Phylogeny species_tree, + final ALGORITHM algorithm, + final String outgroup ) { + if ( gene_trees > 0 ) { + log( "Number of gene trees (total) : " + gene_trees ); + } + log( "Algorithm : " + algorithm ); + log( "Species tree external nodes (prior to stripping): " + species_tree.getNumberOfExternalNodes() ); + log( "Species tree polytomies (prior to stripping) : " + + PhylogenyMethods.countNumberOfPolytomies( species_tree ) ); + String rs = ""; + switch ( _rerooting ) { + case BY_ALGORITHM: { + rs = "minimizing duplications"; + break; } - else { - seq_name = ( nodes.get( j ) ).getName(); + case MIDPOINT: { + rs = "midpoint"; + break; } - if ( hash_map.containsKey( seq_name ) ) { - hash_map.put( seq_name, hash_map.get( seq_name ) + 1 ); + case OUTGROUP: { + rs = "outgroup: " + outgroup; + break; } - else { - hash_map.put( seq_name, 1 ); + case NONE: { + rs = "none"; + break; } } + log( "Re-rooting : " + rs ); } public final static IntMatrix calculateOrthologTable( final Phylogeny[] analyzed_gene_trees, final boolean sort ) throws RIOException { final List labels = new ArrayList(); final Set labels_set = new HashSet(); - String label; for( final PhylogenyNode n : analyzed_gene_trees[ 0 ].getExternalNodes() ) { - if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getName() ) ) { - label = n.getNodeData().getSequence().getName(); - } - else if ( n.getNodeData().isHasSequence() - && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getSymbol() ) ) { - label = n.getNodeData().getSequence().getSymbol(); - } - else if ( !ForesterUtil.isEmpty( n.getName() ) ) { - label = n.getName(); - } - else { - throw new IllegalArgumentException( "node " + n + " has no appropriate label" ); - } - if ( labels_set.contains( label ) ) { - throw new IllegalArgumentException( "label " + label + " is not unique" ); - } + final String label = obtainLabel( labels_set, n ); labels_set.add( label ); labels.add( label ); } @@ -568,224 +565,394 @@ public final class RIO { int counter = 0; for( final Phylogeny gt : analyzed_gene_trees ) { counter++; - PhylogenyMethods.preOrderReId( gt ); - final HashMap map = PhylogenyMethods.createNameToExtNodeMap( gt ); - for( int x = 0; x < m.size(); ++x ) { - final String mx = m.getLabel( x ); - final PhylogenyNode nx = map.get( mx ); - if ( nx == null ) { - throw new RIOException( "node \"" + mx + "\" not present in gene tree #" + counter ); - } - String my; - PhylogenyNode ny; - for( int y = 0; y < m.size(); ++y ) { - my = m.getLabel( y ); - ny = map.get( my ); - if ( ny == null ) { - throw new RIOException( "node \"" + my + "\" not present in gene tree #" + counter ); - } - if ( !PhylogenyMethods.calculateLCAonTreeWithIdsInPreOrder( nx, ny ).isDuplication() ) { - m.inreaseByOne( x, y ); - } - } - } + updateCounts( m, counter, gt ); } return m; } - /** - * Returns the order in which ortholog (o), "super ortholog" (s) and - * distance (d) are returned and sorted (priority of sort always goes from - * left to right), given sort. For the meaning of sort - * - * @see #inferredOrthologsToString(String,int,double,double) - * - * @param sort - * determines order and sort priority - * @return String indicating the order - */ - public final static String getOrder( final int sort ) { - String order = ""; - switch ( sort ) { - case 0: - order = "orthologies"; - break; - case 1: - order = "orthologies > super orthologies"; - break; - case 2: - order = "super orthologies > orthologies"; - break; - default: - order = "orthologies"; - break; - } - return order; + public final static RIO executeAnalysis( final File gene_trees_file, + final File species_tree_file, + final ALGORITHM algorithm, + final REROOTING rerooting, + final String outgroup, + final int first, + final int last, + final boolean produce_log, + final boolean verbose, + final boolean transfer_taxonomy ) throws IOException, SDIException, + RIOException { + final Phylogeny[] gene_trees = parseGeneTrees( gene_trees_file ); + if ( gene_trees.length < 1 ) { + throw new RIOException( "\"" + gene_trees_file + "\" is devoid of appropriate gene trees" ); + } + final Phylogeny species_tree = SDIutil.parseSpeciesTree( gene_trees[ 0 ], + species_tree_file, + false, + true, + TAXONOMY_EXTRACTION.NO ); + return new RIO( gene_trees, + species_tree, + algorithm, + rerooting, + outgroup, + first, + last, + produce_log, + verbose, + transfer_taxonomy ); } - public final static StringBuffer getOrderHelp() { - final StringBuffer sb = new StringBuffer(); - sb.append( " 0: orthologies" + ForesterUtil.LINE_SEPARATOR ); - sb.append( " 1: orthologies > super orthologies" + ForesterUtil.LINE_SEPARATOR ); - sb.append( " 2: super orthologies > orthologies" + ForesterUtil.LINE_SEPARATOR ); - return sb; + public final static RIO executeAnalysis( final File gene_trees_file, + final Phylogeny species_tree, + final ALGORITHM algorithm, + final REROOTING rerooting, + final String outgroup, + final boolean produce_log, + final boolean verbose, + final boolean transfer_taxonomy ) throws IOException, SDIException, + RIOException { + return new RIO( parseGeneTrees( gene_trees_file ), + species_tree, + algorithm, + rerooting, + outgroup, + DEFAULT_RANGE, + DEFAULT_RANGE, + produce_log, + verbose, + transfer_taxonomy ); } - // Helper method for inferredOrthologsToString - // and inferredUltraParalogsToString. - private final static String addNameAndValues( final String name, - final double value1, - final double value2, - final int sort ) { - final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.#####" ); - df.setDecimalSeparatorAlwaysShown( false ); - String line = ""; - if ( name.length() < 8 ) { - line += ( name + "\t\t\t" ); - } - else if ( name.length() < 16 ) { - line += ( name + "\t\t" ); - } - else { - line += ( name + "\t" ); - } - switch ( sort ) { - case 0: - line += addToLine( value1, df ); - line += "-\t"; - break; - case 1: - line += addToLine( value1, df ); - line += addToLine( value2, df ); - break; - case 2: - line += addToLine( value2, df ); - line += addToLine( value1, df ); - break; - case 90: - line += addToLine( value1, df ); - line += "-\t"; - break; - case 91: - line += addToLine( value1, df ); - line += addToLine( value2, df ); - break; - } - line += ForesterUtil.LINE_SEPARATOR; - return line; + public final static RIO executeAnalysis( final File gene_trees_file, + final Phylogeny species_tree, + final ALGORITHM algorithm, + final REROOTING rerooting, + final String outgroup, + final int first, + final int last, + final boolean produce_log, + final boolean verbose, + final boolean transfer_taxonomy ) throws IOException, SDIException, + RIOException { + return new RIO( parseGeneTrees( gene_trees_file ), + species_tree, + algorithm, + rerooting, + outgroup, + first, + last, + produce_log, + verbose, + transfer_taxonomy ); } - // Helper for addNameAndValues. - private final static String addToLine( final double value, final java.text.DecimalFormat df ) { - String s = ""; - if ( value != ResultLine.DEFAULT ) { - s = df.format( value ) + "\t"; - } - else { - s = "-\t"; - } - return s; + public final static RIO executeAnalysis( final IteratingPhylogenyParser p, + final File species_tree_file, + final ALGORITHM algorithm, + final REROOTING rerooting, + final String outgroup, + final int first, + final int last, + final boolean produce_log, + final boolean verbose, + final boolean transfer_taxonomy ) throws IOException, SDIException, + RIOException { + final Phylogeny g0 = p.next(); + if ( ( g0 == null ) || g0.isEmpty() || ( g0.getNumberOfExternalNodes() < 2 ) ) { + throw new RIOException( "input file does not seem to contain any gene trees" ); + } + final Phylogeny species_tree = SDIutil.parseSpeciesTree( g0, + species_tree_file, + false, + true, + TAXONOMY_EXTRACTION.NO ); + p.reset(); + return new RIO( p, + species_tree, + algorithm, + rerooting, + outgroup, + first, + last, + produce_log, + verbose, + transfer_taxonomy ); } - private final static List getAllExternalSequenceNames( final Phylogeny phy ) throws RIOException { - final List names = new ArrayList(); - for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) { - final PhylogenyNode n = iter.next(); - if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getName() ) ) { - names.add( n.getNodeData().getSequence().getName() ); - } - else if ( !ForesterUtil.isEmpty( n.getName() ) ) { - names.add( n.getName() ); - } - else { - throw new RIOException( "node has no (sequence) name: " + n ); - } - } - return names; + public final static RIO executeAnalysis( final IteratingPhylogenyParser p, + final Phylogeny species_tree, + final ALGORITHM algorithm, + final REROOTING rerooting, + final String outgroup, + final boolean produce_log, + final boolean verbose, + final boolean transfer_taxonomy ) throws IOException, SDIException, + RIOException { + return new RIO( p, + species_tree, + algorithm, + rerooting, + outgroup, + DEFAULT_RANGE, + DEFAULT_RANGE, + produce_log, + verbose, + transfer_taxonomy ); } - private final static List getNodesViaSequenceName( final Phylogeny phy, final String seq_name ) { - final List nodes = new ArrayList(); - for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) { - final PhylogenyNode n = iter.next(); - if ( n.getNodeData().isHasSequence() && n.getNodeData().getSequence().getName().equals( seq_name ) ) { - nodes.add( n ); - } - if ( !n.getNodeData().isHasSequence() && n.getName().equals( seq_name ) ) { - nodes.add( n ); - } - } - return nodes; + public final static RIO executeAnalysis( final IteratingPhylogenyParser p, + final Phylogeny species_tree, + final ALGORITHM algorithm, + final REROOTING rerooting, + final String outgroup, + final int first, + final int last, + final boolean produce_log, + final boolean verbose, + final boolean transfer_taxonomy ) throws IOException, SDIException, + RIOException { + return new RIO( p, + species_tree, + algorithm, + rerooting, + outgroup, + first, + last, + produce_log, + verbose, + transfer_taxonomy ); } - public final List getRemovedGeneTreeNodes() { - return _removed_gene_tree_nodes; + public final static RIO executeAnalysis( final Phylogeny[] gene_trees, final Phylogeny species_tree ) + throws IOException, SDIException, RIOException { + return new RIO( gene_trees, + species_tree, + ALGORITHM.GSDIR, + REROOTING.BY_ALGORITHM, + null, + DEFAULT_RANGE, + DEFAULT_RANGE, + false, + false, + false ); } - private final class ResultLine implements Comparable { + public final static RIO executeAnalysis( final Phylogeny[] gene_trees, + final Phylogeny species_tree, + final ALGORITHM algorithm, + final REROOTING rerooting, + final String outgroup, + final boolean produce_log, + final boolean verbose, + final boolean transfer_taxonomy ) throws IOException, SDIException, + RIOException { + return new RIO( gene_trees, + species_tree, + algorithm, + rerooting, + outgroup, + DEFAULT_RANGE, + DEFAULT_RANGE, + produce_log, + verbose, + transfer_taxonomy ); + } - public static final int DEFAULT = -999; - private final String _key; - private final double _value1; - private final double _value2; - private int[] _p; + public final static RIO executeAnalysis( final Phylogeny[] gene_trees, + final Phylogeny species_tree, + final ALGORITHM algorithm, + final REROOTING rerooting, + final String outgroup, + final int first, + final int last, + final boolean produce_log, + final boolean verbose, + final boolean transfer_taxonomy ) throws IOException, SDIException, + RIOException { + return new RIO( gene_trees, + species_tree, + algorithm, + rerooting, + outgroup, + first, + last, + produce_log, + verbose, + transfer_taxonomy ); + } - ResultLine( final String name, final double value1, final double value2, final int c ) { - setSigns(); - _key = name; - _value1 = value1; - _value2 = value2; - if ( ( c >= 0 ) && ( c <= 2 ) ) { - _p[ c ] = -1; + private final static void calculateOrthologTable( final Phylogeny g, final boolean sort, final int counter ) + throws RIOException { + if ( counter == 0 ) { + final List labels = new ArrayList(); + final Set labels_set = new HashSet(); + for( final PhylogenyNode n : g.getExternalNodes() ) { + final String label = obtainLabel( labels_set, n ); + labels_set.add( label ); + labels.add( label ); + } + if ( sort ) { + Collections.sort( labels ); } + _m = new IntMatrix( labels ); } + updateCounts( _m, counter, g ); + } - ResultLine( final String name, final double value1, final int c ) { - setSigns(); - _key = name; - _value1 = value1; - _value2 = ResultLine.DEFAULT; - if ( c == 0 ) { - _p[ 0 ] = -1; + private final static void checkPreconditions( final IteratingPhylogenyParser p, + final Phylogeny species_tree, + final REROOTING rerooting, + final String outgroup, + final int first, + final int last ) throws RIOException, IOException { + final Phylogeny g0 = p.next(); + if ( ( g0 == null ) || g0.isEmpty() ) { + throw new RIOException( "input file does not seem to contain any gene trees" ); + } + if ( g0.getNumberOfExternalNodes() < 2 ) { + throw new RIOException( "input file does not seem to contain any useable gene trees" ); + } + if ( !species_tree.isRooted() ) { + throw new RIOException( "species tree is not rooted" ); + } + if ( !( ( last == DEFAULT_RANGE ) && ( first == DEFAULT_RANGE ) ) + && ( ( last < first ) || ( last < 0 ) || ( first < 0 ) ) ) { + throw new RIOException( "attempt to set range (0-based) of gene to analyze to: from " + first + " to " + + last ); + } + if ( ( rerooting == REROOTING.OUTGROUP ) && ForesterUtil.isEmpty( outgroup ) ) { + throw new RIOException( "outgroup not set for midpoint rooting" ); + } + if ( ( rerooting != REROOTING.OUTGROUP ) && !ForesterUtil.isEmpty( outgroup ) ) { + throw new RIOException( "outgroup only used for midpoint rooting" ); + } + if ( ( rerooting == REROOTING.MIDPOINT ) && ( PhylogenyMethods.calculateMaxDistanceToRoot( g0 ) <= 0 ) ) { + throw new RIOException( "attempt to use midpoint rooting on gene trees which seem to have no (positive) branch lengths (cladograms)" ); + } + if ( rerooting == REROOTING.OUTGROUP ) { + try { + g0.getNode( outgroup ); + } + catch ( final IllegalArgumentException e ) { + throw new RIOException( "cannot perform re-rooting by outgroup: " + e.getLocalizedMessage() ); } } + } - @Override - public int compareTo( final ResultLine n ) { - if ( ( getValue1() != ResultLine.DEFAULT ) && ( n.getValue1() != ResultLine.DEFAULT ) ) { - if ( getValue1() < n.getValue1() ) { - return _p[ 0 ]; - } - if ( getValue1() > n.getValue1() ) { - return ( -_p[ 0 ] ); - } + private final static void checkPreconditions( final Phylogeny[] gene_trees, + final Phylogeny species_tree, + final REROOTING rerooting, + final String outgroup, + final int first, + final int last ) throws RIOException { + if ( !species_tree.isRooted() ) { + throw new RIOException( "species tree is not rooted" ); + } + if ( !( ( last == DEFAULT_RANGE ) && ( first == DEFAULT_RANGE ) ) + && ( ( last < first ) || ( last >= gene_trees.length ) || ( last < 0 ) || ( first < 0 ) ) ) { + throw new RIOException( "attempt to set range (0-based) of gene to analyze to: from " + first + " to " + + last + " (out of " + gene_trees.length + ")" ); + } + if ( ( rerooting == REROOTING.OUTGROUP ) && ForesterUtil.isEmpty( outgroup ) ) { + throw new RIOException( "outgroup not set for midpoint rooting" ); + } + if ( ( rerooting != REROOTING.OUTGROUP ) && !ForesterUtil.isEmpty( outgroup ) ) { + throw new RIOException( "outgroup only used for midpoint rooting" ); + } + if ( ( rerooting == REROOTING.MIDPOINT ) + && ( PhylogenyMethods.calculateMaxDistanceToRoot( gene_trees[ 0 ] ) <= 0 ) ) { + throw new RIOException( "attempt to use midpoint rooting on gene trees which seem to have no (positive) branch lengths (cladograms)" ); + } + if ( rerooting == REROOTING.OUTGROUP ) { + try { + gene_trees[ 0 ].getNode( outgroup ); } - if ( ( getValue2() != ResultLine.DEFAULT ) && ( n.getValue2() != ResultLine.DEFAULT ) ) { - if ( getValue2() < n.getValue2() ) { - return _p[ 1 ]; - } - if ( getValue2() > n.getValue2() ) { - return ( -_p[ 1 ] ); - } + catch ( final IllegalArgumentException e ) { + throw new RIOException( "cannot perform re-rooting by outgroup: " + e.getLocalizedMessage() ); } - return ( getKey().compareTo( n.getKey() ) ); } + } - String getKey() { - return _key; + private final static String obtainLabel( final Set labels_set, final PhylogenyNode n ) throws RIOException { + String label; + if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getName() ) ) { + label = n.getNodeData().getSequence().getName(); } + else if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getSymbol() ) ) { + label = n.getNodeData().getSequence().getSymbol(); + } + else if ( n.getNodeData().isHasSequence() + && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getGeneName() ) ) { + label = n.getNodeData().getSequence().getGeneName(); + } + else if ( !ForesterUtil.isEmpty( n.getName() ) ) { + label = n.getName(); + } + else { + throw new RIOException( "node " + n + " has no appropriate label" ); + } + if ( labels_set.contains( label ) ) { + throw new RIOException( "label " + label + " is not unique" ); + } + return label; + } - double getValue1() { - return _value1; + private final static Phylogeny[] parseGeneTrees( final File gene_trees_file ) throws FileNotFoundException, + IOException { + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true ); + if ( p instanceof NHXParser ) { + final NHXParser nhx = ( NHXParser ) p; + nhx.setReplaceUnderscores( false ); + nhx.setIgnoreQuotes( true ); + nhx.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE ); } + else if ( p instanceof NexusPhylogeniesParser ) { + final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p; + nex.setReplaceUnderscores( false ); + nex.setIgnoreQuotes( true ); + nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE ); + } + return factory.create( gene_trees_file, p ); + } - double getValue2() { - return _value2; + private final static void removeSingleDescendentsNodes( final Phylogeny species_tree, final boolean verbose ) { + final int o = PhylogenyMethods.countNumberOfOneDescendantNodes( species_tree ); + if ( o > 0 ) { + if ( verbose ) { + System.out.println( "warning: species tree has " + o + + " internal nodes with only one descendent which are therefore going to be removed" ); + } + PhylogenyMethods.deleteInternalNodesWithOnlyOneDescendent( species_tree ); } + } - private void setSigns() { - _p = new int[ 2 ]; - _p[ 0 ] = _p[ 1 ] = +1; + private final static void updateCounts( final IntMatrix m, final int counter, final Phylogeny g ) + throws RIOException { + PhylogenyMethods.preOrderReId( g ); + final HashMap map = PhylogenyMethods.createNameToExtNodeMap( g ); + for( int x = 0; x < m.size(); ++x ) { + final String mx = m.getLabel( x ); + final PhylogenyNode nx = map.get( mx ); + if ( nx == null ) { + throw new RIOException( "node \"" + mx + "\" not present in gene tree #" + counter ); + } + String my; + PhylogenyNode ny; + for( int y = 0; y < m.size(); ++y ) { + my = m.getLabel( y ); + ny = map.get( my ); + if ( ny == null ) { + throw new RIOException( "node \"" + my + "\" not present in gene tree #" + counter ); + } + if ( !PhylogenyMethods.calculateLCAonTreeWithIdsInPreOrder( nx, ny ).isDuplication() ) { + m.inreaseByOne( x, y ); + } + } } - } // ResultLine + } + + public enum REROOTING { + NONE, BY_ALGORITHM, MIDPOINT, OUTGROUP; + } }