X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fsdi%2FRIO.java;h=31e2fc7c009f75187309e1485a6e7af7b319b1e8;hb=e9ca0dc1764303d53fc6b9b087f33cdee53726ea;hp=47a88b8fe225a37d1d5a9f548d591cd2c529653e;hpb=a6973e954d3086547591afe91f6ea76624eb45c8;p=jalview.git diff --git a/forester/java/src/org/forester/sdi/RIO.java b/forester/java/src/org/forester/sdi/RIO.java index 47a88b8..31e2fc7 100644 --- a/forester/java/src/org/forester/sdi/RIO.java +++ b/forester/java/src/org/forester/sdi/RIO.java @@ -28,6 +28,7 @@ package org.forester.sdi; import java.io.File; +import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; @@ -37,9 +38,7 @@ import java.util.List; import java.util.Set; import org.forester.datastructures.IntMatrix; -import org.forester.evoinference.matrix.distance.DistanceMatrix; import org.forester.io.parsers.PhylogenyParser; -import org.forester.io.parsers.SymmetricalDistanceMatrixParser; import org.forester.io.parsers.nhx.NHXParser; import org.forester.io.parsers.util.ParserUtils; import org.forester.phylogeny.Phylogeny; @@ -50,124 +49,41 @@ import org.forester.phylogeny.factories.PhylogenyFactory; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.util.ForesterUtil; -/* - * @author Christian M. Zmasek - */ public final class RIO { - private final static boolean ROOT_BY_MINIMIZING_MAPPING_COST = false; - private final static boolean ROOT_BY_MINIMIZING_SUM_OF_DUPS = true; - private final static boolean ROOT_BY_MINIMIZING_TREE_HEIGHT = true; - private final static boolean TIME = false; - private HashMap> _o_hash_maps; - private HashMap> _so_hash_maps; - private HashMap> _up_hash_maps; - private HashMap> _sn_hash_maps; // HashMap of HashMaps - private DistanceMatrix _m; - private HashMap _l; + private final static boolean ROOT_BY_MINIMIZING_SUM_OF_DUPS = true; + private final static boolean ROOT_BY_MINIMIZING_TREE_HEIGHT = true; + private Phylogeny[] _analyzed_gene_trees; + private HashMap> _o_maps; + private HashMap> _so_maps; + private HashMap> _up_maps; private List _seq_names; - private int _bootstraps; - private int _ext_nodes_; - private long _time; + private int _samples; + private int _ext_nodes; /** * Default constructor. + * @throws SDIException + * @throws IOException + * @throws RIOException */ - public RIO() { - reset(); - } - - public IntMatrix calculateOrthologTable( Phylogeny[] gene_trees ) { - List labels = new ArrayList(); - Set labels_set = new HashSet(); - String label; - for( PhylogenyNode n : gene_trees[ 0 ].getExternalNodes() ) { - if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getName() ) ) { - label = n.getNodeData().getSequence().getName(); - } - else if ( n.getNodeData().isHasSequence() - && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getSymbol() ) ) { - label = n.getNodeData().getSequence().getSymbol(); - } - else if ( !ForesterUtil.isEmpty( n.getName() ) ) { - label = n.getName(); - } - else { - throw new IllegalArgumentException( "node " + n + " has no appropriate label" ); - } - if ( labels_set.contains( label ) ) { - throw new IllegalArgumentException( "label " + label + " is not unique" ); - } - labels_set.add( label ); - labels.add( label ); - } - IntMatrix m = new IntMatrix( labels ); - int counter = 0; - for( Phylogeny gt : gene_trees ) { - System.out.println( counter ); - counter++; - for( int x = 0; x < m.size(); ++x ) { - PhylogenyNode nx = gt.getNode( m.getLabel( x ) ); - for( int y = 0; y < m.size(); ++y ) { - PhylogenyNode ny = gt.getNode( m.getLabel( y ) ); - if ( PhylogenyMethods.isAreOrthologous( nx, ny ) ) { - m.set( x, y, m.get( x, y ) + 1 ); - //System.out.println( x + " " + y ); - } - } - } - } - return m; - } - - /** - * Returns the numbers of trees analyzed. - * - * @return the numbers of trees analyzed - */ - public final int getBootstraps() { - return _bootstraps; - } - - // Helper method for inferredOrthologsToString. - // inferredOrthologsToArrayList, - // and inferredUltraParalogsToString. - private final double getBootstrapValueFromHash( final HashMap h, final String name ) { - if ( !h.containsKey( name ) ) { - return 0.0; + public RIO( final File gene_trees_file, final Phylogeny species_tree, final String query ) throws IOException, + SDIException, RIOException { + if ( ForesterUtil.isEmpty( query ) ) { + throw new IllegalArgumentException( "query is empty" ); } - final int i = h.get( name ); - return ( i * 100.0 / getBootstraps() ); + init(); + inferOrthologs( gene_trees_file, species_tree, query ); } - /** - * Returns the distance to a sequences/taxa after a distance list file has - * been read in with readDistanceList(File). Throws an exception if name is - * not found or if no list has been read in. - * - * @param name - * a sequence name - */ - public final double getDistance( String name ) { - double distance = 0.0; - name = name.trim(); - if ( _l == null ) { - throw new RuntimeException( "Distance list has probably not been read in (successfully)." ); - } - if ( _l.get( name ) == null ) { - throw new IllegalArgumentException( name + " not found." ); - } - distance = ( _l.get( name ) ).doubleValue(); - return distance; + public RIO( final File gene_trees_file, final Phylogeny species_tree ) throws IOException, SDIException, + RIOException { + init(); + inferOrthologs( gene_trees_file, species_tree, null ); } - public final double getDistance( final String name1, final String name2 ) { - try { - return _m.getValue( _m.getIndex( name1 ), _m.getIndex( name2 ) ); - } - catch ( final Exception e ) { - return 1; - } + public final Phylogeny[] getAnalyzedGeneTrees() { + return _analyzed_gene_trees; } /** @@ -177,52 +93,7 @@ public final class RIO { * @return number of ext nodes in gene trees analyzed (after stripping) */ public final int getExtNodesOfAnalyzedGeneTrees() { - return _ext_nodes_; - } - - /** - * Returns a HashMap containing the inferred orthologs of the external gene - * tree node with the sequence name seq_name. Sequence names are the keys - * (String), numbers of observations are the values (Int). Orthologs are to - * be inferred by method "inferOrthologs". Throws an exception if seq_name - * is not found. - * - * @param seq_name - * sequence name of a external node of the gene trees - * @return HashMap containing the inferred orthologs - * (name(String)->value(Int)) - */ - public final HashMap getInferredOrthologs( final String seq_name ) { - if ( _o_hash_maps == null ) { - return null; - } - return _o_hash_maps.get( seq_name ); - } - - private final HashMap getInferredSubtreeNeighbors( final String seq_name ) { - if ( _sn_hash_maps == null ) { - return null; - } - return _sn_hash_maps.get( seq_name ); - } - - /** - * Returns a HashMap containing the inferred "super orthologs" of the - * external gene tree node with the sequence name seq_name. Sequence names - * are the keys (String), numbers of observations are the values (Int). - * Super orthologs are to be inferred by method "inferOrthologs". Throws an - * exception if seq_name is not found. - * - * @param seq_name - * sequence name of a external node of the gene trees - * @return HashMap containing the inferred super orthologs - * (name(String)->value(Int)) - */ - public final HashMap getInferredSuperOrthologs( final String seq_name ) { - if ( _so_hash_maps == null ) { - return null; - } - return _so_hash_maps.get( seq_name ); + return _ext_nodes; } /** @@ -238,196 +109,14 @@ public final class RIO { * (name(String)->value(Int)) */ public final HashMap getInferredUltraParalogs( final String seq_name ) { - if ( _up_hash_maps == null ) { + if ( _up_maps == null ) { return null; } - return _up_hash_maps.get( seq_name ); - } - - /** - * Returns the time (in ms) needed to run "inferOrthologs". Final variable - * TIME needs to be set to true. - * - * @return time (in ms) needed to run method "inferOrthologs" - */ - public long getTime() { - return _time; - } - - /** - * Infers the orthologs (as well the "super orthologs", the "subtree - * neighbors", and the "ultra paralogs") for each external node of the gene - * Trees in multiple tree File gene_trees_file (=output of PHYLIP NEIGHBOR, - * for example). Tallies how many times each sequence is (super-) - * orthologous towards the query. Tallies how many times each sequence is - * ultra paralogous towards the query. Tallies how many times each sequence - * is a subtree neighbor of the query. Gene duplications are inferred using - * SDI. Modifies its argument species_tree. Is a little faster than - * "inferOrthologs(File,Phylogeny)" since orthologs are only inferred for - * query. - *

- * To obtain the results use the methods listed below. - * - * @param gene_trees_file - * a File containing gene Trees in NH format, which is the result - * of performing a bootstrap analysis in PHYLIP - * @param species_tree - * a species Phylogeny, which has species names in its species - * fields - * @param query - * the sequence name of the squence whose orthologs are to be - * inferred - * @throws SDIException - */ - public void inferOrthologs( final File gene_trees_file, final Phylogeny species_tree, final String query ) - throws IOException, SDIException { - int bs = 0; - if ( RIO.TIME ) { - _time = System.currentTimeMillis(); - } - // Read in first tree to get its sequence names - // and strip species_tree. - final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); - final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true ); - if ( p instanceof NHXParser ) { - final NHXParser nhx = ( NHXParser ) p; - nhx.setReplaceUnderscores( false ); - nhx.setIgnoreQuotes( true ); - nhx.setTaxonomyExtraction( PhylogenyMethods.TAXONOMY_EXTRACTION.YES ); - } - final Phylogeny gene_tree = factory.create( gene_trees_file, p )[ 0 ]; - System.out.println( "species " + species_tree.toString() ); - // Removes from species_tree all species not found in gene_tree. - PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gene_tree, species_tree ); - PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gene_tree ); - _seq_names = getAllExternalSequenceNames( gene_tree ); - if ( ( _seq_names == null ) || ( _seq_names.size() < 1 ) ) { - throw new IOException( "could not get sequence names" ); - } - _o_hash_maps = new HashMap>(); - _so_hash_maps = new HashMap>(); - _up_hash_maps = new HashMap>(); - _sn_hash_maps = new HashMap>(); - _o_hash_maps.put( query, new HashMap( _seq_names.size() ) ); - _so_hash_maps.put( query, new HashMap( _seq_names.size() ) ); - _up_hash_maps.put( query, new HashMap( _seq_names.size() ) ); - _sn_hash_maps.put( query, new HashMap( _seq_names.size() ) ); - // Go through all gene trees in the file. - final Phylogeny[] gene_trees = factory.create( gene_trees_file, p ); - Phylogeny[] assigned_trees = new Phylogeny[ gene_trees.length ]; - int c = 0; - for( final Phylogeny gt : gene_trees ) { - bs++; - // Removes from gene_tree all species not found in species_tree. - PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt ); - assigned_trees[ c++ ] = inferOrthologsHelper( gt, species_tree, query ); - // System.out.println( bs ); - } - IntMatrix m = calculateOrthologTable( assigned_trees ); - System.out.println( m.toString() ); - setBootstraps( bs ); - if ( RIO.TIME ) { - _time = ( System.currentTimeMillis() - _time ); - } - } - - public List getNodesViaSequenceName( final Phylogeny phy, final String seq_name ) { - final List nodes = new ArrayList(); - for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) { - final PhylogenyNode n = iter.next(); - if ( n.getNodeData().isHasSequence() && n.getNodeData().getSequence().getName().equals( seq_name ) ) { - nodes.add( n ); - } - if ( !n.getNodeData().isHasSequence() && n.getName().equals( seq_name ) ) { - nodes.add( n ); - } - } - return nodes; - } - - // Helper method which performs the actual ortholog inference for - // the external node with seqname query. - private Phylogeny inferOrthologsHelper( final Phylogeny gene_tree, final Phylogeny species_tree, final String query ) - throws SDIException { - Phylogeny assigned_tree = null; - List nodes = null; - final SDIR sdiunrooted = new SDIR(); - List orthologs = null; - List super_orthologs = null; - List ultra_paralogs = null; - List subtree_neighbors = null; - assigned_tree = sdiunrooted.infer( gene_tree, - species_tree, - RIO.ROOT_BY_MINIMIZING_MAPPING_COST, - RIO.ROOT_BY_MINIMIZING_SUM_OF_DUPS, - RIO.ROOT_BY_MINIMIZING_TREE_HEIGHT, - true, - 1 )[ 0 ]; - setExtNodesOfAnalyzedGeneTrees( assigned_tree.getNumberOfExternalNodes() ); - nodes = getNodesViaSequenceName( assigned_tree, query ); - if ( nodes.size() > 1 ) { - throw new IllegalArgumentException( "node named [" + query + "] not unique" ); - } - else if ( nodes.isEmpty() ) { - throw new IllegalArgumentException( "no node containing a sequence named [" + query + "] found" ); - } - final PhylogenyNode query_node = nodes.get( 0 ); - final PhylogenyMethods methods = PhylogenyMethods.getInstance(); - orthologs = methods.getOrthologousNodes( assigned_tree, query_node ); - updateHash( _o_hash_maps, query, orthologs ); - super_orthologs = PhylogenyMethods.getSuperOrthologousNodes( query_node ); - updateHash( _so_hash_maps, query, super_orthologs ); - subtree_neighbors = getSubtreeNeighbors( query_node, 2 ); - updateHash( _sn_hash_maps, query, subtree_neighbors ); - ultra_paralogs = PhylogenyMethods.getUltraParalogousNodes( query_node ); - updateHash( _up_hash_maps, query, ultra_paralogs ); - return assigned_tree; + return _up_maps.get( seq_name ); } - /** - * Returns an ArrayList containg the names of orthologs of the PhylogenyNode - * with seq name seq_name. - * - * @param seq_name - * sequence name of a external node of the gene trees - * @param threshold_orthologs - * the minimal number of observations for a a sequence to be - * reported as orthologous as percentage (0.0-100.0%) - * @return ArrayList containg the names of orthologs of the PhylogenyNode - * with seq name seq_name - */ - public ArrayList inferredOrthologsToArrayList( final String seq_name, double threshold_orthologs ) { - HashMap o_hashmap = null; - String name = null; - double o = 0.0; - final ArrayList arraylist = new ArrayList(); - if ( _o_hash_maps == null ) { - throw new RuntimeException( "Orthologs have not been calculated (successfully)." ); - } - if ( threshold_orthologs < 0.0 ) { - threshold_orthologs = 0.0; - } - else if ( threshold_orthologs > 100.0 ) { - threshold_orthologs = 100.0; - } - o_hashmap = getInferredOrthologs( seq_name ); - if ( o_hashmap == null ) { - throw new RuntimeException( "Orthologs for " + seq_name + " were not established." ); - } - if ( _seq_names.size() > 0 ) { - I: for( int i = 0; i < _seq_names.size(); ++i ) { - name = _seq_names.get( i ); - if ( name.equals( seq_name ) ) { - continue I; - } - o = getBootstrapValueFromHash( o_hashmap, name ); - if ( o < threshold_orthologs ) { - continue I; - } - arraylist.add( name ); - } - } - return arraylist; + public final int getNumberOfSamples() { + return _samples; } /** @@ -445,21 +134,6 @@ public final class RIO { *

  • 0 : Ortholog *
  • 1 : Ortholog, Super ortholog *
  • 2 : Super ortholog, Ortholog - *
  • 3 : Ortholog, Distance - *
  • 4 : Distance, Ortholog - *
  • 5 : Ortholog, Super ortholog, Distance - *
  • 6 : Ortholog, Distance, Super ortholog - *
  • 7 : Super ortholog, Ortholog, Distance - *
  • 8 : Super ortholog, Distance, Ortholog - *
  • 9 : Distance, Ortholog, Super ortholog - *
  • 10 : Distance, Super ortholog, Ortholog - *
  • 11 : Ortholog, Subtree neighbor, Distance - *
  • 12 : Ortholog, Subtree neighbor, Super ortholog, Distance (default) - *
  • 13 : Ortholog, Super ortholog, Subtree neighbor, Distance - *
  • 14 : Subtree neighbor, Ortholog, Super ortholog, Distance - *
  • 15 : Subtree neighbor, Distance, Ortholog, Super ortholog - *
  • 16 : Ortholog, Distance, Subtree neighbor, Super ortholog - *
  • 17 : Ortholog, Subtree neighbor, Distance, Super ortholog * *

    * Returns "-" if no putative orthologs have been found (given @@ -481,32 +155,21 @@ public final class RIO { * reported as orthologous, in percents (0.0-100.0%) * @return String containing the inferred orthologs, String containing "-" * if no orthologs have been found null in case of error - * @see #inferOrthologs(File,Phylogeny,String) - * @see #inferOrthologs(Phylogeny[],Phylogeny) - * @see #inferOrthologs(File,Phylogeny) - * @see #getOrder(int) */ - public StringBuffer inferredOrthologsToString( final String query_name, - int sort, - double threshold_orthologs, - double threshold_subtreeneighborings ) { + public final StringBuffer inferredOrthologsToString( final String query_name, int sort, double threshold_orthologs ) { HashMap o_hashmap = null; HashMap s_hashmap = null; - HashMap n_hashmap = null; String name = ""; - double o = 0.0, // Orthologs. - s = 0.0, // Super orthologs. - sn = 0.0, // Subtree neighbors. - value1 = 0.0, value2 = 0.0, value3 = 0.0, value4 = 0.0, d = 0.0; - final ArrayList nv = new ArrayList(); - if ( ( _o_hash_maps == null ) || ( _so_hash_maps == null ) || ( _sn_hash_maps == null ) ) { - throw new RuntimeException( "Orthologs have not been calculated (successfully)" ); - } - if ( ( sort < 0 ) || ( sort > 17 ) ) { - sort = 12; + double o = 0.0; // Orthologs. + double s = 0.0; // Super orthologs. + double value1 = 0.0; + double value2 = 0.0; + final ArrayList nv = new ArrayList(); + if ( ( _o_maps == null ) || ( _so_maps == null ) ) { + throw new RuntimeException( "orthologs have not been calculated (successfully)" ); } - if ( ( sort > 2 ) && ( _m == null ) && ( _l == null ) ) { - throw new RuntimeException( "Distance list or matrix have not been read in (successfully)" ); + if ( ( sort < 0 ) || ( sort > 2 ) ) { + sort = 1; } if ( threshold_orthologs < 0.0 ) { threshold_orthologs = 0.0; @@ -514,16 +177,9 @@ public final class RIO { else if ( threshold_orthologs > 100.0 ) { threshold_orthologs = 100.0; } - if ( threshold_subtreeneighborings < 0.0 ) { - threshold_subtreeneighborings = 0.0; - } - else if ( threshold_subtreeneighborings > 100.0 ) { - threshold_subtreeneighborings = 100.0; - } o_hashmap = getInferredOrthologs( query_name ); s_hashmap = getInferredSuperOrthologs( query_name ); - n_hashmap = getInferredSubtreeNeighbors( query_name ); - if ( ( o_hashmap == null ) || ( s_hashmap == null ) || ( n_hashmap == null ) ) { + if ( ( o_hashmap == null ) || ( s_hashmap == null ) ) { throw new RuntimeException( "Orthologs for " + query_name + " were not established" ); } final StringBuffer orthologs = new StringBuffer(); @@ -537,92 +193,33 @@ public final class RIO { if ( o < threshold_orthologs ) { continue I; } - sn = getBootstrapValueFromHash( n_hashmap, name ); - if ( sn < threshold_subtreeneighborings ) { - continue I; - } s = getBootstrapValueFromHash( s_hashmap, name ); - if ( sort >= 3 ) { - if ( _m != null ) { - d = getDistance( query_name, name ); - } - else { - d = getDistance( name ); - } - } switch ( sort ) { case 0: - nv.add( new Tuplet( name, o, 5 ) ); + nv.add( new ResultLine( name, o, 5 ) ); break; case 1: - nv.add( new Tuplet( name, o, s, 5 ) ); + nv.add( new ResultLine( name, o, s, 5 ) ); break; case 2: - nv.add( new Tuplet( name, s, o, 5 ) ); - break; - case 3: - nv.add( new Tuplet( name, o, d, 1 ) ); - break; - case 4: - nv.add( new Tuplet( name, d, o, 0 ) ); - break; - case 5: - nv.add( new Tuplet( name, o, s, d, 2 ) ); - break; - case 6: - nv.add( new Tuplet( name, o, d, s, 1 ) ); - break; - case 7: - nv.add( new Tuplet( name, s, o, d, 2 ) ); - break; - case 8: - nv.add( new Tuplet( name, s, d, o, 1 ) ); - break; - case 9: - nv.add( new Tuplet( name, d, o, s, 0 ) ); - break; - case 10: - nv.add( new Tuplet( name, d, s, o, 0 ) ); - break; - case 11: - nv.add( new Tuplet( name, o, sn, d, 2 ) ); - break; - case 12: - nv.add( new Tuplet( name, o, sn, s, d, 3 ) ); - break; - case 13: - nv.add( new Tuplet( name, o, s, sn, d, 3 ) ); - break; - case 14: - nv.add( new Tuplet( name, sn, o, s, d, 3 ) ); - break; - case 15: - nv.add( new Tuplet( name, sn, d, o, s, 1 ) ); - break; - case 16: - nv.add( new Tuplet( name, o, d, sn, s, 1 ) ); - break; - case 17: - nv.add( new Tuplet( name, o, sn, d, s, 2 ) ); + nv.add( new ResultLine( name, s, o, 5 ) ); break; default: - nv.add( new Tuplet( name, o, 5 ) ); + nv.add( new ResultLine( name, o, 5 ) ); } } // End of I for loop. if ( ( nv != null ) && ( nv.size() > 0 ) ) { - orthologs.append( "[seq name]\t\t[ortho]\t[st-n]\t[sup-o]\t[dist]" + ForesterUtil.LINE_SEPARATOR ); - final Tuplet[] nv_array = new Tuplet[ nv.size() ]; + orthologs.append( "seq name\t\tortho\ts-ortho" + ForesterUtil.LINE_SEPARATOR ); + final ResultLine[] nv_array = new ResultLine[ nv.size() ]; for( int j = 0; j < nv.size(); ++j ) { nv_array[ j ] = nv.get( j ); } Arrays.sort( nv_array ); - for( int i = 0; i < nv_array.length; ++i ) { - name = nv_array[ i ].getKey(); - value1 = nv_array[ i ].getValue1(); - value2 = nv_array[ i ].getValue2(); - value3 = nv_array[ i ].getValue3(); - value4 = nv_array[ i ].getValue4(); - orthologs.append( addNameAndValues( name, value1, value2, value3, value4, sort ) ); + for( final ResultLine element : nv_array ) { + name = element.getKey(); + value1 = element.getValue1(); + value2 = element.getValue2(); + orthologs.append( addNameAndValues( name, value1, value2, sort ) ); } } } @@ -631,7 +228,7 @@ public final class RIO { orthologs.append( "-" ); } return orthologs; - } // inferredOrthologsToString( String, int, double ) + } /** * Returns a String containg the names of orthologs of the PhylogenyNode @@ -649,26 +246,23 @@ public final class RIO { * @return String containing the inferred orthologs, String containing "-" * if no orthologs have been found null in case of error */ - public String inferredUltraParalogsToString( final String query_name, - final boolean return_dists, - double threshold_ultra_paralogs ) { + public final String inferredUltraParalogsToString( final String query_name, double threshold_ultra_paralogs ) { HashMap sp_hashmap = null; String name = "", ultra_paralogs = ""; int sort = 0; - double sp = 0.0, value1 = 0.0, value2 = 0.0, d = 0.0; - final List nv = new ArrayList(); + double sp = 0.0; + double value1 = 0.0; + double value2 = 0.0; + final List nv = new ArrayList(); if ( threshold_ultra_paralogs < 1.0 ) { threshold_ultra_paralogs = 1.0; } else if ( threshold_ultra_paralogs > 100.0 ) { threshold_ultra_paralogs = 100.0; } - if ( _up_hash_maps == null ) { + if ( _up_maps == null ) { throw new RuntimeException( "Ultra paralogs have not been calculated (successfully)." ); } - if ( return_dists && ( _m == null ) && ( _l == null ) ) { - throw new RuntimeException( "Distance list or matrix have not been read in (successfully)." ); - } sp_hashmap = getInferredUltraParalogs( query_name ); if ( sp_hashmap == null ) { throw new RuntimeException( "Ultra paralogs for " + query_name + " were not established" ); @@ -683,36 +277,20 @@ public final class RIO { if ( sp < threshold_ultra_paralogs ) { continue I; } - if ( return_dists ) { - if ( _m != null ) { - d = getDistance( query_name, name ); - } - else { - d = getDistance( name ); - } - nv.add( new Tuplet( name, sp, d, 1 ) ); - } - else { - nv.add( new Tuplet( name, sp, 5 ) ); - } + nv.add( new ResultLine( name, sp, 5 ) ); } // End of I for loop. if ( ( nv != null ) && ( nv.size() > 0 ) ) { - final Tuplet[] nv_array = new Tuplet[ nv.size() ]; + final ResultLine[] nv_array = new ResultLine[ nv.size() ]; for( int j = 0; j < nv.size(); ++j ) { nv_array[ j ] = nv.get( j ); } Arrays.sort( nv_array ); - if ( return_dists ) { - sort = 91; - } - else { - sort = 90; - } - for( int i = 0; i < nv_array.length; ++i ) { - name = nv_array[ i ].getKey(); - value1 = nv_array[ i ].getValue1(); - value2 = nv_array[ i ].getValue2(); - ultra_paralogs += addNameAndValues( name, value1, value2, 0.0, 0.0, sort ); + sort = 90; + for( final ResultLine element : nv_array ) { + name = element.getKey(); + value1 = element.getValue1(); + value2 = element.getValue2(); + ultra_paralogs += addNameAndValues( name, value1, value2, sort ); } } } @@ -723,66 +301,188 @@ public final class RIO { return ultra_paralogs; } - public final void readDistanceMatrix( final File matrix_file ) throws IOException { - DistanceMatrix[] matrices = null; - final SymmetricalDistanceMatrixParser parser = SymmetricalDistanceMatrixParser.createInstance(); - matrices = parser.parse( matrix_file ); - if ( ( matrices == null ) || ( matrices.length == 0 ) ) { - throw new IOException( "failed to parse distance matrix from [" + matrix_file + "]" ); - } - if ( matrices.length > 1 ) { - throw new IOException( "[" + matrix_file + "] contains more than once distance matrix" ); + // Helper method for inferredOrthologsToString. + // inferredOrthologsToArrayList, + // and inferredUltraParalogsToString. + private final double getBootstrapValueFromHash( final HashMap h, final String name ) { + if ( !h.containsKey( name ) ) { + return 0.0; } - _m = matrices[ 0 ]; + final int i = h.get( name ); + return ( ( i * 100.0 ) / getNumberOfSamples() ); } /** - * Brings this into the same state as immediately after construction. + * Returns a HashMap containing the inferred orthologs of the external gene + * tree node with the sequence name seq_name. Sequence names are the keys + * (String), numbers of observations are the values (Int). Orthologs are to + * be inferred by method "inferOrthologs". Throws an exception if seq_name + * is not found. + * + * @param seq_name + * sequence name of a external node of the gene trees + * @return HashMap containing the inferred orthologs + * (name(String)->value(Int)) */ - private final void reset() { - _o_hash_maps = null; - _so_hash_maps = null; - _up_hash_maps = null; - _seq_names = null; - _m = null; - _l = null; - _bootstraps = 1; - _ext_nodes_ = 0; - _time = 0; + private final HashMap getInferredOrthologs( final String seq_name ) { + if ( _o_maps == null ) { + return null; + } + return _o_maps.get( seq_name ); } /** - * Sets the numbers of trees analyzed. - * @param the - * numbers of trees analyzed + * Returns a HashMap containing the inferred "super orthologs" of the + * external gene tree node with the sequence name seq_name. Sequence names + * are the keys (String), numbers of observations are the values (Int). + * Super orthologs are to be inferred by method "inferOrthologs". Throws an + * exception if seq_name is not found. + * + * @param seq_name + * sequence name of a external node of the gene trees + * @return HashMap containing the inferred super orthologs + * (name(String)->value(Int)) */ - private void setBootstraps( int i ) { - if ( i < 1 ) { - i = 1; + private final HashMap getInferredSuperOrthologs( final String seq_name ) { + if ( _so_maps == null ) { + return null; } - _bootstraps = i; + return _so_maps.get( seq_name ); } /** - * Sets number of ext nodes in gene trees analyzed (after stripping). - * @param the - * number of ext nodes in gene trees analyzed (after stripping) - */ - private void setExtNodesOfAnalyzedGeneTrees( int i ) { - if ( i < 1 ) { - i = 0; + * Infers the orthologs (as well the "super orthologs", the "subtree + * neighbors", and the "ultra paralogs") for each external node of the gene + * Trees in multiple tree File gene_trees_file (=output of PHYLIP NEIGHBOR, + * for example). Tallies how many times each sequence is (super-) + * orthologous towards the query. Tallies how many times each sequence is + * ultra paralogous towards the query. Tallies how many times each sequence + * is a subtree neighbor of the query. Gene duplications are inferred using + * SDI. Modifies its argument species_tree. Is a little faster than + * "inferOrthologs(File,Phylogeny)" since orthologs are only inferred for + * query. + *

    + * To obtain the results use the methods listed below. + * + * @param gene_trees_file + * a File containing gene Trees in NH format, which is the result + * of performing a bootstrap analysis in PHYLIP + * @param species_tree + * a species Phylogeny, which has species names in its species + * fields + * @param query + * the sequence name of the squence whose orthologs are to be + * inferred + * @throws SDIException + * @throws RIOException + * @throws IOException + * @throws FileNotFoundException + */ + private final void inferOrthologs( final File gene_trees_file, final Phylogeny species_tree, final String query ) + throws SDIException, RIOException, FileNotFoundException, IOException { + // Read in first tree to get its sequence names + // and strip species_tree. + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true ); + if ( p instanceof NHXParser ) { + final NHXParser nhx = ( NHXParser ) p; + nhx.setReplaceUnderscores( false ); + nhx.setIgnoreQuotes( true ); + nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.YES ); } - _ext_nodes_ = i; + final Phylogeny[] gene_trees = factory.create( gene_trees_file, p ); + // Removes from species_tree all species not found in gene_tree. + PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gene_trees[ 0 ], species_tree ); + if ( species_tree.isEmpty() ) { + throw new RIOException( "failed to establish species based mapping between gene and species trees" ); + } + if ( !ForesterUtil.isEmpty( query ) ) { + PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gene_trees[ 0 ] ); + if ( gene_trees[ 0 ].isEmpty() ) { + throw new RIOException( "failed to establish species based mapping between gene and species trees" ); + } + _seq_names = getAllExternalSequenceNames( gene_trees[ 0 ] ); + if ( ( _seq_names == null ) || ( _seq_names.size() < 1 ) ) { + throw new RIOException( "could not get sequence names" ); + } + _o_maps = new HashMap>(); + _so_maps = new HashMap>(); + _up_maps = new HashMap>(); + _o_maps.put( query, new HashMap( _seq_names.size() ) ); + _so_maps.put( query, new HashMap( _seq_names.size() ) ); + _up_maps.put( query, new HashMap( _seq_names.size() ) ); + } + _analyzed_gene_trees = new Phylogeny[ gene_trees.length ]; + int c = 0; + for( final Phylogeny gt : gene_trees ) { + // Removes from gene_tree all species not found in species_tree. + PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt ); + if ( gt.isEmpty() ) { + throw new RIOException( "failed to establish species based mapping between gene and species trees" ); + } + _analyzed_gene_trees[ c++ ] = inferOrthologsHelper( gt, species_tree, query ); + } + setNumberOfSamples( gene_trees.length ); + } + + // Helper method which performs the actual ortholog inference for + // the external node with seqname query. + private final Phylogeny inferOrthologsHelper( final Phylogeny gene_tree, + final Phylogeny species_tree, + final String query ) throws SDIException, RIOException { + final SDIR sdiunrooted = new SDIR(); + final Phylogeny assigned_tree = sdiunrooted.infer( gene_tree, + species_tree, + false, + RIO.ROOT_BY_MINIMIZING_SUM_OF_DUPS, + RIO.ROOT_BY_MINIMIZING_TREE_HEIGHT, + true, + 1 )[ 0 ]; + setExtNodesOfAnalyzedGeneTrees( assigned_tree.getNumberOfExternalNodes() ); + if ( !ForesterUtil.isEmpty( query ) ) { + final List nodes = getNodesViaSequenceName( assigned_tree, query ); + if ( nodes.size() > 1 ) { + throw new RIOException( "node named [" + query + "] not unique" ); + } + else if ( nodes.isEmpty() ) { + throw new RIOException( "no node containing a sequence named [" + query + "] found" ); + } + final PhylogenyNode query_node = nodes.get( 0 ); + updateCounts( _o_maps, query, PhylogenyMethods.getOrthologousNodes( assigned_tree, query_node ) ); + updateCounts( _so_maps, query, PhylogenyMethods.getSuperOrthologousNodes( query_node ) ); + updateCounts( _up_maps, query, PhylogenyMethods.getUltraParalogousNodes( query_node ) ); + } + return assigned_tree; + } + + private final void init() { + _o_maps = null; + _so_maps = null; + _up_maps = null; + _seq_names = null; + _samples = 1; + _ext_nodes = 0; + } + + private final void setExtNodesOfAnalyzedGeneTrees( final int i ) { + _ext_nodes = i; + } + + private final void setNumberOfSamples( int i ) { + if ( i < 1 ) { + i = 1; + } + _samples = i; } // Helper for doInferOrthologs( Phylogeny, Phylogeny, String ) // and doInferOrthologs( Phylogeny, Phylogeny ). - private void updateHash( final HashMap> counter_map, - final String query_seq_name, - final List nodes ) { + private final void updateCounts( final HashMap> counter_map, + final String query_seq_name, + final List nodes ) { final HashMap hash_map = counter_map.get( query_seq_name ); if ( hash_map == null ) { - throw new RuntimeException( "Unexpected failure in method updateHash." ); + throw new RuntimeException( "unexpected error in updateCounts" ); } for( int j = 0; j < nodes.size(); ++j ) { String seq_name; @@ -802,13 +502,102 @@ public final class RIO { } } + public final static IntMatrix calculateOrthologTable( final Phylogeny[] analyzed_gene_trees ) throws RIOException { + final List labels = new ArrayList(); + final Set labels_set = new HashSet(); + String label; + for( final PhylogenyNode n : analyzed_gene_trees[ 0 ].getExternalNodes() ) { + if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getName() ) ) { + label = n.getNodeData().getSequence().getName(); + } + else if ( n.getNodeData().isHasSequence() + && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getSymbol() ) ) { + label = n.getNodeData().getSequence().getSymbol(); + } + else if ( !ForesterUtil.isEmpty( n.getName() ) ) { + label = n.getName(); + } + else { + throw new IllegalArgumentException( "node " + n + " has no appropriate label" ); + } + if ( labels_set.contains( label ) ) { + throw new IllegalArgumentException( "label " + label + " is not unique" ); + } + labels_set.add( label ); + labels.add( label ); + } + final IntMatrix m = new IntMatrix( labels ); + int counter = 0; + for( final Phylogeny gt : analyzed_gene_trees ) { + counter++; + PhylogenyMethods.preOrderReId( gt ); + final HashMap map = PhylogenyMethods.createNameToExtNodeMap( gt ); + for( int x = 0; x < m.size(); ++x ) { + final String mx = m.getLabel( x ); + final PhylogenyNode nx = map.get( mx ); + if ( nx == null ) { + throw new RIOException( "node \"" + mx + "\" not present in gene tree #" + counter ); + } + String my; + PhylogenyNode ny; + for( int y = 0; y < m.size(); ++y ) { + my = m.getLabel( y ); + ny = map.get( my ); + if ( ny == null ) { + throw new RIOException( "node \"" + my + "\" not present in gene tree #" + counter ); + } + if ( !PhylogenyMethods.calculateLCAonTreeWithIdsInPreOrder( nx, ny ).isDuplication() ) { + m.inreaseByOne( x, y ); + } + } + } + } + return m; + } + + /** + * Returns the order in which ortholog (o), "super ortholog" (s) and + * distance (d) are returned and sorted (priority of sort always goes from + * left to right), given sort. For the meaning of sort + * + * @see #inferredOrthologsToString(String,int,double,double) + * + * @param sort + * determines order and sort priority + * @return String indicating the order + */ + public final static String getOrder( final int sort ) { + String order = ""; + switch ( sort ) { + case 0: + order = "orthologies"; + break; + case 1: + order = "orthologies > super orthologies"; + break; + case 2: + order = "super orthologies > orthologies"; + break; + default: + order = "orthologies"; + break; + } + return order; + } + + public final static StringBuffer getOrderHelp() { + final StringBuffer sb = new StringBuffer(); + sb.append( " 0: orthologies" + ForesterUtil.LINE_SEPARATOR ); + sb.append( " 1: orthologies > super orthologies" + ForesterUtil.LINE_SEPARATOR ); + sb.append( " 2: super orthologies > orthologies" + ForesterUtil.LINE_SEPARATOR ); + return sb; + } + // Helper method for inferredOrthologsToString // and inferredUltraParalogsToString. private final static String addNameAndValues( final String name, final double value1, final double value2, - final double value3, - final double value4, final int sort ) { final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.#####" ); df.setDecimalSeparatorAlwaysShown( false ); @@ -826,111 +615,15 @@ public final class RIO { case 0: line += addToLine( value1, df ); line += "-\t"; - line += "-\t"; - line += "-\t"; break; case 1: line += addToLine( value1, df ); - line += "-\t"; line += addToLine( value2, df ); - line += "-\t"; break; case 2: line += addToLine( value2, df ); - line += "-\t"; - line += addToLine( value1, df ); - line += "-\t"; - break; - case 3: - line += addToLine( value1, df ); - line += "-\t"; - line += "-\t"; - line += addToLine( value2, df ); - break; - case 4: - line += addToLine( value2, df ); - line += "-\t"; - line += "-\t"; line += addToLine( value1, df ); break; - case 5: - line += addToLine( value1, df ); - line += "-\t"; - line += addToLine( value2, df ); - line += addToLine( value3, df ); - break; - case 6: - line += addToLine( value1, df ); - line += "-\t"; - line += addToLine( value3, df ); - line += addToLine( value2, df ); - break; - case 7: - line += addToLine( value2, df ); - line += "-\t"; - line += addToLine( value1, df ); - line += addToLine( value3, df ); - break; - case 8: - line += addToLine( value3, df ); - line += "-\t"; - line += addToLine( value1, df ); - line += addToLine( value2, df ); - break; - case 9: - line += addToLine( value2, df ); - line += "-\t"; - line += addToLine( value3, df ); - line += addToLine( value1, df ); - break; - case 10: - line += addToLine( value3, df ); - line += "-\t"; - line += addToLine( value2, df ); - line += addToLine( value1, df ); - break; - case 11: - line += addToLine( value1, df ); - line += addToLine( value2, df ); - line += "-\t"; - line += addToLine( value3, df ); - break; - case 12: - line += addToLine( value1, df ); - line += addToLine( value2, df ); - line += addToLine( value3, df ); - line += addToLine( value4, df ); - break; - case 13: - line += addToLine( value1, df ); - line += addToLine( value3, df ); - line += addToLine( value2, df ); - line += addToLine( value4, df ); - break; - case 14: - line += addToLine( value2, df ); - line += addToLine( value1, df ); - line += addToLine( value3, df ); - line += addToLine( value4, df ); - break; - case 15: - line += addToLine( value3, df ); - line += addToLine( value1, df ); - line += addToLine( value4, df ); - line += addToLine( value2, df ); - break; - case 16: - line += addToLine( value1, df ); - line += addToLine( value3, df ); - line += addToLine( value4, df ); - line += addToLine( value2, df ); - break; - case 17: - line += addToLine( value1, df ); - line += addToLine( value2, df ); - line += addToLine( value4, df ); - line += addToLine( value3, df ); - break; case 90: line += addToLine( value1, df ); line += "-\t"; @@ -947,7 +640,7 @@ public final class RIO { // Helper for addNameAndValues. private final static String addToLine( final double value, final java.text.DecimalFormat df ) { String s = ""; - if ( value != Tuplet.DEFAULT ) { + if ( value != ResultLine.DEFAULT ) { s = df.format( value ) + "\t"; } else { @@ -956,7 +649,7 @@ public final class RIO { return s; } - private static List getAllExternalSequenceNames( final Phylogeny phy ) { + private final static List getAllExternalSequenceNames( final Phylogeny phy ) throws RIOException { final List names = new ArrayList(); for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) { final PhylogenyNode n = iter.next(); @@ -967,134 +660,90 @@ public final class RIO { names.add( n.getName() ); } else { - throw new IllegalArgumentException( "node has no (sequence) name: " + n ); + throw new RIOException( "node has no (sequence) name: " + n ); } } return names; } - /** - * Returns the order in which ortholog (o), "super ortholog" (s) and - * distance (d) are returned and sorted (priority of sort always goes from - * left to right), given sort. For the meaning of sort - * - * @see #inferredOrthologsToString(String,int,double,double) - * - * @param sort - * determines order and sort priority - * @return String indicating the order - */ - public final static String getOrder( final int sort ) { - String order = ""; - switch ( sort ) { - case 0: - order = "orthologies"; - break; - case 1: - order = "orthologies > super orthologies"; - break; - case 2: - order = "super orthologies > orthologies"; - break; - case 3: - order = "orthologies > distance to query"; - break; - case 4: - order = "distance to query > orthologies"; - break; - case 5: - order = "orthologies > super orthologies > distance to query"; - break; - case 6: - order = "orthologies > distance to query > super orthologies"; - break; - case 7: - order = "super orthologies > orthologies > distance to query"; - break; - case 8: - order = "super orthologies > distance to query > orthologies"; - break; - case 9: - order = "distance to query > orthologies > super orthologies"; - break; - case 10: - order = "distance to query > super orthologies > orthologies"; - break; - case 11: - order = "orthologies > subtree neighbors > distance to query"; - break; - case 12: - order = "orthologies > subtree neighbors > super orthologies > distance to query"; - break; - case 13: - order = "orthologies > super orthologies > subtree neighbors > distance to query"; - break; - case 14: - order = "subtree neighbors > orthologies > super orthologies > distance to query"; - break; - case 15: - order = "subtree neighbors > distance to query > orthologies > super orthologies"; - break; - case 16: - order = "orthologies > distance to query > subtree neighbors > super orthologies"; - break; - case 17: - order = "orthologies > subtree neighbors > distance to query > super orthologies"; - break; - default: - order = "orthologies"; - break; + private final static List getNodesViaSequenceName( final Phylogeny phy, final String seq_name ) { + final List nodes = new ArrayList(); + for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) { + final PhylogenyNode n = iter.next(); + if ( n.getNodeData().isHasSequence() && n.getNodeData().getSequence().getName().equals( seq_name ) ) { + nodes.add( n ); + } + if ( !n.getNodeData().isHasSequence() && n.getName().equals( seq_name ) ) { + nodes.add( n ); + } } - return order; - } - - public final static StringBuffer getOrderHelp() { - final StringBuffer sb = new StringBuffer(); - sb.append( " 0: orthologies" + ForesterUtil.LINE_SEPARATOR ); - sb.append( " 1: orthologies > super orthologies" + ForesterUtil.LINE_SEPARATOR ); - sb.append( " 2: super orthologies > orthologies" + ForesterUtil.LINE_SEPARATOR ); - sb.append( " 3: orthologies > distance to query" + ForesterUtil.LINE_SEPARATOR ); - sb.append( " 4: distance to query > orthologies" + ForesterUtil.LINE_SEPARATOR ); - sb.append( " 5: orthologies > super orthologies > distance to query" + ForesterUtil.LINE_SEPARATOR ); - sb.append( " 6: orthologies > distance to query > super orthologies" + ForesterUtil.LINE_SEPARATOR ); - sb.append( " 7: super orthologies > orthologies > distance to query" + ForesterUtil.LINE_SEPARATOR ); - sb.append( " 8: super orthologies > distance to query > orthologies" + ForesterUtil.LINE_SEPARATOR ); - sb.append( " 9: distance to query > orthologies > super orthologies" + ForesterUtil.LINE_SEPARATOR ); - sb.append( " 10: distance to query > super orthologies > orthologies" + ForesterUtil.LINE_SEPARATOR ); - sb.append( " 11: orthologies > subtree neighbors > distance to query" + ForesterUtil.LINE_SEPARATOR ); - sb.append( " 12: orthologies > subtree neighbors > super orthologies > distance to query" - + ForesterUtil.LINE_SEPARATOR ); - sb.append( " 13: orthologies > super orthologies > subtree neighbors > distance to query" - + ForesterUtil.LINE_SEPARATOR ); - sb.append( " 14: subtree neighbors > orthologies > super orthologies > distance to query" - + ForesterUtil.LINE_SEPARATOR ); - sb.append( " 15: subtree neighbors > distance to query > orthologies > super orthologies" - + ForesterUtil.LINE_SEPARATOR ); - sb.append( " 16: orthologies > distance to query > subtree neighbors > super orthologies" - + ForesterUtil.LINE_SEPARATOR ); - sb.append( " 17: orthologies > subtree neighbors > distance to query > super orthologies" - + ForesterUtil.LINE_SEPARATOR ); - return sb; + return nodes; } - private final static List getSubtreeNeighbors( final PhylogenyNode query, final int level ) { - PhylogenyNode node = query; - if ( !node.isExternal() ) { - return null; + private final class ResultLine implements Comparable { + + public static final int DEFAULT = -999; + private final String _key; + private final double _value1; + private final double _value2; + private int[] _p; + + ResultLine( final String name, final double value1, final double value2, final int c ) { + setSigns(); + _key = name; + _value1 = value1; + _value2 = value2; + if ( ( c >= 0 ) && ( c <= 2 ) ) { + _p[ c ] = -1; + } } - if ( !node.isRoot() ) { - node = node.getParent(); + + ResultLine( final String name, final double value1, final int c ) { + setSigns(); + _key = name; + _value1 = value1; + _value2 = ResultLine.DEFAULT; + if ( c == 0 ) { + _p[ 0 ] = -1; + } } - if ( level == 2 ) { - if ( !node.isRoot() ) { - node = node.getParent(); + + @Override + public int compareTo( final ResultLine n ) { + if ( ( getValue1() != ResultLine.DEFAULT ) && ( n.getValue1() != ResultLine.DEFAULT ) ) { + if ( getValue1() < n.getValue1() ) { + return _p[ 0 ]; + } + if ( getValue1() > n.getValue1() ) { + return ( -_p[ 0 ] ); + } } + if ( ( getValue2() != ResultLine.DEFAULT ) && ( n.getValue2() != ResultLine.DEFAULT ) ) { + if ( getValue2() < n.getValue2() ) { + return _p[ 1 ]; + } + if ( getValue2() > n.getValue2() ) { + return ( -_p[ 1 ] ); + } + } + return ( getKey().compareTo( n.getKey() ) ); } - else { - throw new IllegalArgumentException( "currently only supporting level 2 subtree neighbors " ); + + String getKey() { + return _key; } - final List sn = node.getAllExternalDescendants(); - sn.remove( query ); - return sn; - } + + double getValue1() { + return _value1; + } + + double getValue2() { + return _value2; + } + + private void setSigns() { + _p = new int[ 2 ]; + _p[ 0 ] = _p[ 1 ] = +1; + } + } // ResultLine }