From 5309073b8da93adc2ece844bb84ff2735805b38c Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Wed, 28 Nov 2012 04:28:44 +0000 Subject: [PATCH] "rio" work --- .../java/src/org/forester/application/rio.java | 184 ++++-- .../org/forester/phylogeny/PhylogenyMethods.java | 5 +- forester/java/src/org/forester/sdi/RIO.java | 615 +++++++++----------- 3 files changed, 398 insertions(+), 406 deletions(-) diff --git a/forester/java/src/org/forester/application/rio.java b/forester/java/src/org/forester/application/rio.java index 18fa912..7610fc5 100644 --- a/forester/java/src/org/forester/application/rio.java +++ b/forester/java/src/org/forester/application/rio.java @@ -29,23 +29,26 @@ package org.forester.application; import java.io.File; import java.io.FileWriter; +import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; import java.util.List; +import org.forester.datastructures.IntMatrix; import org.forester.io.parsers.phyloxml.PhyloXmlParser; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; import org.forester.phylogeny.factories.PhylogenyFactory; import org.forester.sdi.RIO; import org.forester.util.CommandLineArguments; +import org.forester.util.EasyWriter; import org.forester.util.ForesterUtil; public class rio { final static private String PRG_NAME = "rio"; final static private String PRG_VERSION = "3.00 beta 1"; - final static private String PRG_DATE = "2010.01.15"; + final static private String PRG_DATE = "2012.11.27"; final static private String E_MAIL = "czmasek@burnham.org"; final static private String WWW = "www.phylosoft.org/forester/"; final static private String HELP_OPTION_1 = "help"; @@ -74,14 +77,12 @@ public class rio { } if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) || ( args.length == 0 ) ) { printHelp(); - System.exit( 0 ); } - if ( ( args.length < 3 ) || ( args.length > 10 ) ) { + if ( ( args.length < 2 ) || ( args.length > 10 ) ) { System.out.println(); System.out.println( "[" + PRG_NAME + "] incorrect number of arguments" ); System.out.println(); printHelp(); - System.exit( -1 ); } final List allowed_options = new ArrayList(); allowed_options.add( QUERY_OPTION ); @@ -94,38 +95,44 @@ public class rio { if ( dissallowed_options.length() > 0 ) { ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options ); } - final File multiple_trees_file = cla.getFile( 0 ); + final File gene_trees_file = cla.getFile( 0 ); final File species_tree_file = cla.getFile( 1 ); - final File outfile = cla.getFile( 2 ); - ForesterUtil.fatalErrorIfFileNotReadable( PRG_NAME, multiple_trees_file ); + File outfile = null; + if ( cla.getNumberOfNames() > 2 ) { + outfile = cla.getFile( 2 ); + } + ForesterUtil.fatalErrorIfFileNotReadable( PRG_NAME, gene_trees_file ); ForesterUtil.fatalErrorIfFileNotReadable( PRG_NAME, species_tree_file ); if ( outfile.exists() ) { ForesterUtil.fatalError( PRG_NAME, "[" + outfile + "] already exists" ); } - String seq_name = null; + String query = null; if ( cla.isOptionSet( QUERY_OPTION ) ) { - seq_name = cla.getOptionValue( QUERY_OPTION ); + query = cla.getOptionValue( QUERY_OPTION ); } File table_outfile = null; if ( cla.isOptionSet( TABLE_OUTPUT_OPTION ) ) { table_outfile = new File( cla.getOptionValue( TABLE_OUTPUT_OPTION ) ); if ( table_outfile.exists() ) { - ForesterUtil.fatalError( PRG_NAME, "[" + outfile + "] already exists" ); + ForesterUtil.fatalError( PRG_NAME, "[" + table_outfile + "] already exists" ); } } boolean output_ultraparalogs = false; if ( cla.isOptionSet( OUTPUT_ULTRA_P_OPTION ) ) { output_ultraparalogs = true; } - double t_orthologs = 0.0; - double threshold_ultra_paralogs = 0.0; + double cutoff_for_orthologs = 50; + double cutoff_for_ultra_paralogs = 50; int sort = 2; try { if ( cla.isOptionSet( CUTOFF_ORTHO_OPTION ) ) { - t_orthologs = cla.getOptionValueAsDouble( CUTOFF_ORTHO_OPTION ); + cutoff_for_orthologs = cla.getOptionValueAsDouble( CUTOFF_ORTHO_OPTION ); } if ( cla.isOptionSet( CUTOFF_ULTRA_P_OPTION ) ) { - threshold_ultra_paralogs = cla.getOptionValueAsDouble( CUTOFF_ULTRA_P_OPTION ); + cutoff_for_ultra_paralogs = cla.getOptionValueAsDouble( CUTOFF_ULTRA_P_OPTION ); + if ( !output_ultraparalogs ) { + printHelp(); + } } if ( cla.isOptionSet( SORT_OPTION ) ) { sort = cla.getOptionValueAsInt( SORT_OPTION ); @@ -134,24 +141,31 @@ public class rio { catch ( final Exception e ) { ForesterUtil.fatalError( PRG_NAME, "error in command line: " + e.getLocalizedMessage() ); } - if ( sort < 0 ) { - sort = 0; + if ( ( cutoff_for_orthologs < 0 ) || ( cutoff_for_ultra_paralogs < 0 ) || ( sort < 0 ) || ( sort > 2 ) ) { + printHelp(); + } + if ( ( ( query == null ) && ( outfile != null ) ) || ( ( query != null ) && ( outfile == null ) ) ) { + printHelp(); } - else if ( sort > 2 ) { - sort = 2; + if ( output_ultraparalogs && ( outfile == null ) ) { + printHelp(); } long time = 0; - System.out.println( "\n" ); - System.out.println( "Gene trees: " + multiple_trees_file ); - System.out.println( "Species tree: " + species_tree_file ); - System.out.println( "Query: " + seq_name ); - System.out.println( "Outfile: " + outfile ); - System.out.println( "Outfile: " + table_outfile ); - System.out.println( "Sort: " + sort ); - System.out.println( "Threshold orthologs: " + t_orthologs ); - if ( output_ultraparalogs ) { - System.out.println( "Threshold ultra paralogs: " + threshold_ultra_paralogs ); + System.out.println( "Gene trees : " + gene_trees_file ); + System.out.println( "Species tree : " + species_tree_file ); + if ( query != null ) { + System.out.println( "Query : " + query ); + System.out.println( "Outfile : " + outfile ); + System.out.println( "Sort : " + sort ); + System.out.println( "Cutoff for orthologs : " + cutoff_for_orthologs ); + if ( output_ultraparalogs ) { + System.out.println( "Cutoff for ultra paralogs : " + cutoff_for_ultra_paralogs ); + } + } + if ( table_outfile != null ) { + System.out.println( "Table output : " + table_outfile ); } + System.out.println(); time = System.currentTimeMillis(); Phylogeny species_tree = null; try { @@ -163,68 +177,110 @@ public class rio { System.exit( -1 ); } if ( !species_tree.isRooted() ) { - ForesterUtil.printErrorMessage( PRG_NAME, "Species tree is not rooted" ); + ForesterUtil.printErrorMessage( PRG_NAME, "species tree is not rooted" ); System.exit( -1 ); } if ( !species_tree.isCompletelyBinary() ) { - ForesterUtil.printErrorMessage( PRG_NAME, "Species tree is not completely binary" ); + ForesterUtil.printErrorMessage( PRG_NAME, "species tree is not completely binary" ); System.exit( -1 ); } - final RIO rio_instance = new RIO(); - final StringBuffer output = new StringBuffer(); - PrintWriter out = null; try { - rio_instance.inferOrthologs( multiple_trees_file, species_tree.copy(), seq_name ); - output.append( rio_instance.inferredOrthologsToString( seq_name, sort, t_orthologs ) ); - if ( output_ultraparalogs ) { - output.append( "\n\nUltra paralogs:\n" ); - output.append( rio_instance.inferredUltraParalogsToString( seq_name, threshold_ultra_paralogs ) ); + RIO rio; + if ( ForesterUtil.isEmpty( query ) ) { + rio = new RIO( gene_trees_file, species_tree ); + } + else { + rio = new RIO( gene_trees_file, species_tree, query ); + } + if ( outfile != null ) { + final StringBuilder output = new StringBuilder(); + output.append( rio.inferredOrthologsToString( query, sort, cutoff_for_orthologs ) ); + if ( output_ultraparalogs ) { + output.append( "\n\nUltra paralogs:\n" ); + output.append( rio.inferredUltraParalogsToString( query, cutoff_for_ultra_paralogs ) ); + } + output.append( "\n\nSort priority: " + RIO.getOrder( sort ) ); + output.append( "\nExt nodes : " + rio.getExtNodesOfAnalyzedGeneTrees() ); + output.append( "\nSamples : " + rio.getNumberOfSamples() + "\n" ); + final PrintWriter out = new PrintWriter( new FileWriter( outfile ), true ); + out.println( output ); + out.close(); + } + if ( table_outfile != null ) { + tableOutput( table_outfile, rio ); } - output.append( "\n\nSort priority: " + RIO.getOrder( sort ) ); - output.append( "\nExt nodes : " + rio_instance.getExtNodesOfAnalyzedGeneTrees() ); - output.append( "\nSamples : " + rio_instance.getNumberOfSamples() + "\n" ); - out = new PrintWriter( new FileWriter( outfile ), true ); } catch ( final Exception e ) { ForesterUtil.printErrorMessage( PRG_NAME, e.getLocalizedMessage() ); e.printStackTrace(); System.exit( -1 ); } - out.println( output ); - out.close(); - ForesterUtil.programMessage( PRG_NAME, "wrote results to \"" + outfile + "\"" ); + if ( outfile != null ) { + ForesterUtil.programMessage( PRG_NAME, "wrote results to \"" + outfile + "\"" ); + } time = System.currentTimeMillis() - time; ForesterUtil.programMessage( PRG_NAME, "time: " + time + "ms" ); - ForesterUtil.programMessage( PRG_NAME, "OK." ); + ForesterUtil.programMessage( PRG_NAME, "OK" ); System.exit( 0 ); } + private static void tableOutput( final File table_outfile, final RIO rio ) throws IOException { + final IntMatrix m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees() ); + writeTable( table_outfile, rio, m ); + } + + private static void writeTable( final File table_outfile, final RIO rio, final IntMatrix m ) throws IOException { + final EasyWriter w = ForesterUtil.createEasyWriter( table_outfile ); + final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.###" ); + df.setDecimalSeparatorAlwaysShown( false ); + w.print( "\t" ); + for( int i = 0; i < m.size(); ++i ) { + w.print( "\t" ); + w.print( m.getLabel( i ) ); + } + w.println(); + for( int x = 0; x < m.size(); ++x ) { + w.print( m.getLabel( x ) ); + w.print( "\t" ); + for( int y = 0; y < m.size(); ++y ) { + w.print( "\t" ); + if ( x == y ) { + if ( m.get( x, y ) != rio.getNumberOfSamples() ) { + ForesterUtil.unexpectedFatalError( PRG_NAME, "diagonal value is off" ); + } + w.print( "-" ); + } + else { + w.print( df.format( ( ( double ) m.get( x, y ) ) / rio.getNumberOfSamples() ) ); + } + } + w.println(); + } + w.close(); + ForesterUtil.programMessage( PRG_NAME, "wrote table to \"" + table_outfile + "\"" ); + } + private final static void printHelp() { - System.out.println( "Usage:" ); + System.out.println( "usage:" ); System.out.println(); System.out.println( PRG_NAME + " [options] [outfile]" ); System.out.println(); - System.out.println( "options:" ); + System.out.println( " options:" ); + System.out.println(); + System.out.println( " -" + CUTOFF_ORTHO_OPTION + " : cutoff for ortholog output (default: 50)" ); + System.out.println( " -" + TABLE_OUTPUT_OPTION + " : file-name for output table" ); + System.out.println( " -" + QUERY_OPTION + " : name for query (sequence/node)" ); + System.out.println( " -" + SORT_OPTION + " : sort (default: 2)" ); + System.out.println( " -" + OUTPUT_ULTRA_P_OPTION + + " : to output ultra-paralogs (species specific expansions/paralogs)" ); + System.out.println( " -" + CUTOFF_ULTRA_P_OPTION + " : cutoff for ultra-paralog output (default: 50)" ); System.out.println(); - // System.out.println( " -" + STRICT_OPTION - // + " : strict [default: non-strict]: all nodes between 'target' and 'evaluators' must match" ); - // System.out.println( " -" + NORMALIZE_OPTION - // + "=: normalize to this value (e.g. 100 for most bootstrap analyses) [default: no normalization]" ); - // System.out.println( " -" + FIRST_OPTION + "=: first evaluator topology to use (0-based) [default: 0]" ); - // System.out.println( " -" + LAST_OPTION - // + "=: last evaluator topology to use (0-based) [default: use all until final topology]" ); - // System.out.println(); - // System.out.println( "M= (String) Multiple gene tree file (mandatory)" ); - // System.out.println( "N= (String) Query sequence name (mandatory)" ); - // System.out.println( "S= (String) Species tree file (mandatory)" ); - // System.out.println( "O= (String) Output file name -- overwritten without warning! (mandatory)" ); - // System.out.println( "P= (int) Sort priority" ); - // System.out.println( "L= (double) Threshold orthologs for output" ); - // System.out.println( " Sort priority (\"P=\"):" ); + System.out.println( " sort:" ); System.out.println( RIO.getOrderHelp().toString() ); System.out.println(); System.out - .println( " Example: \"rio -q=D_NEMVE -s=1 -t=out -u Bcl-2_e1_20_mafft_05_40_fme.mlt species.xml out\"" ); + .println( " example: \"rio gene_trees.nh species.xml outfile -q=D_HUMAN -t=outtable -u -cu=60 -co=60\"" ); System.out.println(); + System.exit( -1 ); } } diff --git a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java index 3c33301..64c7965 100644 --- a/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java +++ b/forester/java/src/org/forester/phylogeny/PhylogenyMethods.java @@ -1063,13 +1063,14 @@ public class PhylogenyMethods { * @param n * external PhylogenyNode whose strictly speciation related Nodes * are to be returned - * @return Vector of references to all strictly speciation related Nodes of + * @return References to all strictly speciation related Nodes of * PhylogenyNode n of this Phylogeny, null if this Phylogeny is * empty or if n is internal */ public static List getSuperOrthologousNodes( final PhylogenyNode n ) { // FIXME - PhylogenyNode node = n, deepest = null; + PhylogenyNode node = n; + PhylogenyNode deepest = null; final List v = new ArrayList(); if ( !node.isExternal() ) { return null; diff --git a/forester/java/src/org/forester/sdi/RIO.java b/forester/java/src/org/forester/sdi/RIO.java index 3c2c298..c2402b3 100644 --- a/forester/java/src/org/forester/sdi/RIO.java +++ b/forester/java/src/org/forester/sdi/RIO.java @@ -48,84 +48,39 @@ import org.forester.phylogeny.factories.PhylogenyFactory; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.util.ForesterUtil; -/* - * @author Christian M. Zmasek - */ public final class RIO { private final static boolean ROOT_BY_MINIMIZING_SUM_OF_DUPS = true; private final static boolean ROOT_BY_MINIMIZING_TREE_HEIGHT = true; - private HashMap> _o_hash_maps; - private HashMap> _so_hash_maps; - private HashMap> _up_hash_maps; + private Phylogeny[] _analyzed_gene_trees; + private HashMap> _o_maps; + private HashMap> _so_maps; + private HashMap> _up_maps; private List _seq_names; private int _samples; - private int _ext_nodes_; + private int _ext_nodes; /** * Default constructor. + * @throws SDIException + * @throws IOException */ - public RIO() { - reset(); - } - - public static IntMatrix calculateOrthologTable( final Phylogeny[] gene_trees ) { - final List labels = new ArrayList(); - final Set labels_set = new HashSet(); - String label; - for( final PhylogenyNode n : gene_trees[ 0 ].getExternalNodes() ) { - if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getName() ) ) { - label = n.getNodeData().getSequence().getName(); - } - else if ( n.getNodeData().isHasSequence() - && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getSymbol() ) ) { - label = n.getNodeData().getSequence().getSymbol(); - } - else if ( !ForesterUtil.isEmpty( n.getName() ) ) { - label = n.getName(); - } - else { - throw new IllegalArgumentException( "node " + n + " has no appropriate label" ); - } - if ( labels_set.contains( label ) ) { - throw new IllegalArgumentException( "label " + label + " is not unique" ); - } - labels_set.add( label ); - labels.add( label ); - } - final IntMatrix m = new IntMatrix( labels ); - int counter = 0; - for( final Phylogeny gt : gene_trees ) { - System.out.println( counter ); - counter++; - PhylogenyMethods.preOrderReId( gt ); - final HashMap map = PhylogenyMethods.createNameToExtNodeMap( gt ); - for( int x = 0; x < m.size(); ++x ) { - final PhylogenyNode nx = map.get( m.getLabel( x ) ); - for( int y = 0; y < m.size(); ++y ) { - if ( !PhylogenyMethods.calculateLCAonTreeWithIdsInPreOrder( nx, map.get( m.getLabel( y ) ) ) - .isDuplication() ) { - m.inreaseByOne( x, y ); - } - } - } + public RIO( final File gene_trees_file, final Phylogeny species_tree, final String query ) throws IOException, + SDIException { + if ( ForesterUtil.isEmpty( query ) ) { + throw new IllegalArgumentException( "query is empty" ); } - return m; + init(); + inferOrthologs( gene_trees_file, species_tree, query ); } - public final int getNumberOfSamples() { - return _samples; + public RIO( final File gene_trees_file, final Phylogeny species_tree ) throws IOException, SDIException { + init(); + inferOrthologs( gene_trees_file, species_tree, null ); } - // Helper method for inferredOrthologsToString. - // inferredOrthologsToArrayList, - // and inferredUltraParalogsToString. - private final double getBootstrapValueFromHash( final HashMap h, final String name ) { - if ( !h.containsKey( name ) ) { - return 0.0; - } - final int i = h.get( name ); - return ( ( i * 100.0 ) / getNumberOfSamples() ); + public final Phylogeny[] getAnalyzedGeneTrees() { + return _analyzed_gene_trees; } /** @@ -135,45 +90,7 @@ public final class RIO { * @return number of ext nodes in gene trees analyzed (after stripping) */ public final int getExtNodesOfAnalyzedGeneTrees() { - return _ext_nodes_; - } - - /** - * Returns a HashMap containing the inferred orthologs of the external gene - * tree node with the sequence name seq_name. Sequence names are the keys - * (String), numbers of observations are the values (Int). Orthologs are to - * be inferred by method "inferOrthologs". Throws an exception if seq_name - * is not found. - * - * @param seq_name - * sequence name of a external node of the gene trees - * @return HashMap containing the inferred orthologs - * (name(String)->value(Int)) - */ - public final HashMap getInferredOrthologs( final String seq_name ) { - if ( _o_hash_maps == null ) { - return null; - } - return _o_hash_maps.get( seq_name ); - } - - /** - * Returns a HashMap containing the inferred "super orthologs" of the - * external gene tree node with the sequence name seq_name. Sequence names - * are the keys (String), numbers of observations are the values (Int). - * Super orthologs are to be inferred by method "inferOrthologs". Throws an - * exception if seq_name is not found. - * - * @param seq_name - * sequence name of a external node of the gene trees - * @return HashMap containing the inferred super orthologs - * (name(String)->value(Int)) - */ - public final HashMap getInferredSuperOrthologs( final String seq_name ) { - if ( _so_hash_maps == null ) { - return null; - } - return _so_hash_maps.get( seq_name ); + return _ext_nodes; } /** @@ -189,172 +106,14 @@ public final class RIO { * (name(String)->value(Int)) */ public final HashMap getInferredUltraParalogs( final String seq_name ) { - if ( _up_hash_maps == null ) { + if ( _up_maps == null ) { return null; } - return _up_hash_maps.get( seq_name ); + return _up_maps.get( seq_name ); } - /** - * Infers the orthologs (as well the "super orthologs", the "subtree - * neighbors", and the "ultra paralogs") for each external node of the gene - * Trees in multiple tree File gene_trees_file (=output of PHYLIP NEIGHBOR, - * for example). Tallies how many times each sequence is (super-) - * orthologous towards the query. Tallies how many times each sequence is - * ultra paralogous towards the query. Tallies how many times each sequence - * is a subtree neighbor of the query. Gene duplications are inferred using - * SDI. Modifies its argument species_tree. Is a little faster than - * "inferOrthologs(File,Phylogeny)" since orthologs are only inferred for - * query. - *

- * To obtain the results use the methods listed below. - * - * @param gene_trees_file - * a File containing gene Trees in NH format, which is the result - * of performing a bootstrap analysis in PHYLIP - * @param species_tree - * a species Phylogeny, which has species names in its species - * fields - * @param query - * the sequence name of the squence whose orthologs are to be - * inferred - * @throws SDIException - */ - public void inferOrthologs( final File gene_trees_file, final Phylogeny species_tree, final String query ) - throws IOException, SDIException { - int bs = 0; - // Read in first tree to get its sequence names - // and strip species_tree. - final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); - final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true ); - if ( p instanceof NHXParser ) { - final NHXParser nhx = ( NHXParser ) p; - nhx.setReplaceUnderscores( false ); - nhx.setIgnoreQuotes( true ); - nhx.setTaxonomyExtraction( PhylogenyMethods.TAXONOMY_EXTRACTION.YES ); - } - final Phylogeny[] gene_trees = factory.create( gene_trees_file, p ); - // Removes from species_tree all species not found in gene_tree. - PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gene_trees[ 0 ], species_tree ); - PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gene_trees[ 0 ] ); - _seq_names = getAllExternalSequenceNames( gene_trees[ 0 ] ); - if ( ( _seq_names == null ) || ( _seq_names.size() < 1 ) ) { - throw new IOException( "could not get sequence names" ); - } - _o_hash_maps = new HashMap>(); - _so_hash_maps = new HashMap>(); - _up_hash_maps = new HashMap>(); - _o_hash_maps.put( query, new HashMap( _seq_names.size() ) ); - _so_hash_maps.put( query, new HashMap( _seq_names.size() ) ); - _up_hash_maps.put( query, new HashMap( _seq_names.size() ) ); - // Go through all gene trees in the file. - final Phylogeny[] assigned_trees = new Phylogeny[ gene_trees.length ]; - System.out.println( "gene trees" + gene_trees.length ); - int c = 0; - for( final Phylogeny gt : gene_trees ) { - bs++; - // Removes from gene_tree all species not found in species_tree. - PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt ); - assigned_trees[ c++ ] = inferOrthologsHelper( gt, species_tree, query ); - } - final IntMatrix m = calculateOrthologTable( assigned_trees ); - System.out.println( m.toString() ); - setNumberOfSamples( gene_trees.length ); - } - - public List getNodesViaSequenceName( final Phylogeny phy, final String seq_name ) { - final List nodes = new ArrayList(); - for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) { - final PhylogenyNode n = iter.next(); - if ( n.getNodeData().isHasSequence() && n.getNodeData().getSequence().getName().equals( seq_name ) ) { - nodes.add( n ); - } - if ( !n.getNodeData().isHasSequence() && n.getName().equals( seq_name ) ) { - nodes.add( n ); - } - } - return nodes; - } - - // Helper method which performs the actual ortholog inference for - // the external node with seqname query. - private Phylogeny inferOrthologsHelper( final Phylogeny gene_tree, final Phylogeny species_tree, final String query ) - throws SDIException { - Phylogeny assigned_tree = null; - List nodes = null; - final SDIR sdiunrooted = new SDIR(); - List orthologs = null; - List super_orthologs = null; - List ultra_paralogs = null; - assigned_tree = sdiunrooted.infer( gene_tree, - species_tree, - false, - RIO.ROOT_BY_MINIMIZING_SUM_OF_DUPS, - RIO.ROOT_BY_MINIMIZING_TREE_HEIGHT, - true, - 1 )[ 0 ]; - setExtNodesOfAnalyzedGeneTrees( assigned_tree.getNumberOfExternalNodes() ); - nodes = getNodesViaSequenceName( assigned_tree, query ); - if ( nodes.size() > 1 ) { - throw new IllegalArgumentException( "node named [" + query + "] not unique" ); - } - else if ( nodes.isEmpty() ) { - throw new IllegalArgumentException( "no node containing a sequence named [" + query + "] found" ); - } - final PhylogenyNode query_node = nodes.get( 0 ); - orthologs = PhylogenyMethods.getOrthologousNodes( assigned_tree, query_node ); - updateHash( _o_hash_maps, query, orthologs ); - super_orthologs = PhylogenyMethods.getSuperOrthologousNodes( query_node ); - updateHash( _so_hash_maps, query, super_orthologs ); - ultra_paralogs = PhylogenyMethods.getUltraParalogousNodes( query_node ); - updateHash( _up_hash_maps, query, ultra_paralogs ); - return assigned_tree; - } - - /** - * Returns an ArrayList containg the names of orthologs of the PhylogenyNode - * with seq name seq_name. - * - * @param seq_name - * sequence name of a external node of the gene trees - * @param threshold_orthologs - * the minimal number of observations for a a sequence to be - * reported as orthologous as percentage (0.0-100.0%) - * @return ArrayList containg the names of orthologs of the PhylogenyNode - * with seq name seq_name - */ - public ArrayList inferredOrthologsToArrayList( final String seq_name, double threshold_orthologs ) { - HashMap o_hashmap = null; - String name = null; - double o = 0.0; - final ArrayList arraylist = new ArrayList(); - if ( _o_hash_maps == null ) { - throw new RuntimeException( "Orthologs have not been calculated (successfully)." ); - } - if ( threshold_orthologs < 0.0 ) { - threshold_orthologs = 0.0; - } - else if ( threshold_orthologs > 100.0 ) { - threshold_orthologs = 100.0; - } - o_hashmap = getInferredOrthologs( seq_name ); - if ( o_hashmap == null ) { - throw new RuntimeException( "Orthologs for " + seq_name + " were not established." ); - } - if ( _seq_names.size() > 0 ) { - I: for( int i = 0; i < _seq_names.size(); ++i ) { - name = _seq_names.get( i ); - if ( name.equals( seq_name ) ) { - continue I; - } - o = getBootstrapValueFromHash( o_hashmap, name ); - if ( o < threshold_orthologs ) { - continue I; - } - arraylist.add( name ); - } - } - return arraylist; + public final int getNumberOfSamples() { + return _samples; } /** @@ -393,12 +152,8 @@ public final class RIO { * reported as orthologous, in percents (0.0-100.0%) * @return String containing the inferred orthologs, String containing "-" * if no orthologs have been found null in case of error - * @see #inferOrthologs(File,Phylogeny,String) - * @see #inferOrthologs(Phylogeny[],Phylogeny) - * @see #inferOrthologs(File,Phylogeny) - * @see #getOrder(int) */ - public StringBuffer inferredOrthologsToString( final String query_name, int sort, double threshold_orthologs ) { + public final StringBuffer inferredOrthologsToString( final String query_name, int sort, double threshold_orthologs ) { HashMap o_hashmap = null; HashMap s_hashmap = null; String name = ""; @@ -407,7 +162,7 @@ public final class RIO { double value1 = 0.0; double value2 = 0.0; final ArrayList nv = new ArrayList(); - if ( ( _o_hash_maps == null ) || ( _so_hash_maps == null ) ) { + if ( ( _o_maps == null ) || ( _so_maps == null ) ) { throw new RuntimeException( "orthologs have not been calculated (successfully)" ); } if ( ( sort < 0 ) || ( sort > 2 ) ) { @@ -451,7 +206,7 @@ public final class RIO { } } // End of I for loop. if ( ( nv != null ) && ( nv.size() > 0 ) ) { - orthologs.append( "[seq name]\t\t[ortho]\t[st-n]\t[sup-o]\t[dist]" + ForesterUtil.LINE_SEPARATOR ); + orthologs.append( "seq name\t\tortho\ts-ortho" + ForesterUtil.LINE_SEPARATOR ); final ResultLine[] nv_array = new ResultLine[ nv.size() ]; for( int j = 0; j < nv.size(); ++j ) { nv_array[ j ] = nv.get( j ); @@ -470,7 +225,7 @@ public final class RIO { orthologs.append( "-" ); } return orthologs; - } // inferredOrthologsToString( String, int, double ) + } /** * Returns a String containg the names of orthologs of the PhylogenyNode @@ -488,7 +243,7 @@ public final class RIO { * @return String containing the inferred orthologs, String containing "-" * if no orthologs have been found null in case of error */ - public String inferredUltraParalogsToString( final String query_name, double threshold_ultra_paralogs ) { + public final String inferredUltraParalogsToString( final String query_name, double threshold_ultra_paralogs ) { HashMap sp_hashmap = null; String name = "", ultra_paralogs = ""; int sort = 0; @@ -502,7 +257,7 @@ public final class RIO { else if ( threshold_ultra_paralogs > 100.0 ) { threshold_ultra_paralogs = 100.0; } - if ( _up_hash_maps == null ) { + if ( _up_maps == null ) { throw new RuntimeException( "Ultra paralogs have not been calculated (successfully)." ); } sp_hashmap = getInferredUltraParalogs( query_name ); @@ -543,43 +298,173 @@ public final class RIO { return ultra_paralogs; } + // Helper method for inferredOrthologsToString. + // inferredOrthologsToArrayList, + // and inferredUltraParalogsToString. + private final double getBootstrapValueFromHash( final HashMap h, final String name ) { + if ( !h.containsKey( name ) ) { + return 0.0; + } + final int i = h.get( name ); + return ( ( i * 100.0 ) / getNumberOfSamples() ); + } + /** - * Brings this into the same state as immediately after construction. + * Returns a HashMap containing the inferred orthologs of the external gene + * tree node with the sequence name seq_name. Sequence names are the keys + * (String), numbers of observations are the values (Int). Orthologs are to + * be inferred by method "inferOrthologs". Throws an exception if seq_name + * is not found. + * + * @param seq_name + * sequence name of a external node of the gene trees + * @return HashMap containing the inferred orthologs + * (name(String)->value(Int)) */ - private final void reset() { - _o_hash_maps = null; - _so_hash_maps = null; - _up_hash_maps = null; - _seq_names = null; - _samples = 1; - _ext_nodes_ = 0; + private final HashMap getInferredOrthologs( final String seq_name ) { + if ( _o_maps == null ) { + return null; + } + return _o_maps.get( seq_name ); } - private void setNumberOfSamples( int i ) { - if ( i < 1 ) { - i = 1; + /** + * Returns a HashMap containing the inferred "super orthologs" of the + * external gene tree node with the sequence name seq_name. Sequence names + * are the keys (String), numbers of observations are the values (Int). + * Super orthologs are to be inferred by method "inferOrthologs". Throws an + * exception if seq_name is not found. + * + * @param seq_name + * sequence name of a external node of the gene trees + * @return HashMap containing the inferred super orthologs + * (name(String)->value(Int)) + */ + private final HashMap getInferredSuperOrthologs( final String seq_name ) { + if ( _so_maps == null ) { + return null; } - System.out.println( "samples: " + i ); - _samples = i; + return _so_maps.get( seq_name ); } /** - * Sets number of ext nodes in gene trees analyzed (after stripping). - * @param the - * number of ext nodes in gene trees analyzed (after stripping) + * Infers the orthologs (as well the "super orthologs", the "subtree + * neighbors", and the "ultra paralogs") for each external node of the gene + * Trees in multiple tree File gene_trees_file (=output of PHYLIP NEIGHBOR, + * for example). Tallies how many times each sequence is (super-) + * orthologous towards the query. Tallies how many times each sequence is + * ultra paralogous towards the query. Tallies how many times each sequence + * is a subtree neighbor of the query. Gene duplications are inferred using + * SDI. Modifies its argument species_tree. Is a little faster than + * "inferOrthologs(File,Phylogeny)" since orthologs are only inferred for + * query. + *

+ * To obtain the results use the methods listed below. + * + * @param gene_trees_file + * a File containing gene Trees in NH format, which is the result + * of performing a bootstrap analysis in PHYLIP + * @param species_tree + * a species Phylogeny, which has species names in its species + * fields + * @param query + * the sequence name of the squence whose orthologs are to be + * inferred + * @throws SDIException */ - private void setExtNodesOfAnalyzedGeneTrees( int i ) { + private final void inferOrthologs( final File gene_trees_file, final Phylogeny species_tree, final String query ) + throws IOException, SDIException { + // Read in first tree to get its sequence names + // and strip species_tree. + final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance(); + final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true ); + if ( p instanceof NHXParser ) { + final NHXParser nhx = ( NHXParser ) p; + nhx.setReplaceUnderscores( false ); + nhx.setIgnoreQuotes( true ); + nhx.setTaxonomyExtraction( PhylogenyMethods.TAXONOMY_EXTRACTION.YES ); + } + final Phylogeny[] gene_trees = factory.create( gene_trees_file, p ); + // Removes from species_tree all species not found in gene_tree. + PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gene_trees[ 0 ], species_tree ); + if ( !ForesterUtil.isEmpty( query ) ) { + PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gene_trees[ 0 ] ); + _seq_names = getAllExternalSequenceNames( gene_trees[ 0 ] ); + if ( ( _seq_names == null ) || ( _seq_names.size() < 1 ) ) { + throw new IOException( "could not get sequence names" ); + } + _o_maps = new HashMap>(); + _so_maps = new HashMap>(); + _up_maps = new HashMap>(); + _o_maps.put( query, new HashMap( _seq_names.size() ) ); + _so_maps.put( query, new HashMap( _seq_names.size() ) ); + _up_maps.put( query, new HashMap( _seq_names.size() ) ); + } + _analyzed_gene_trees = new Phylogeny[ gene_trees.length ]; + int c = 0; + for( final Phylogeny gt : gene_trees ) { + // Removes from gene_tree all species not found in species_tree. + PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt ); + _analyzed_gene_trees[ c++ ] = inferOrthologsHelper( gt, species_tree, query ); + } + setNumberOfSamples( gene_trees.length ); + } + + // Helper method which performs the actual ortholog inference for + // the external node with seqname query. + private final Phylogeny inferOrthologsHelper( final Phylogeny gene_tree, + final Phylogeny species_tree, + final String query ) throws SDIException { + final SDIR sdiunrooted = new SDIR(); + final Phylogeny assigned_tree = sdiunrooted.infer( gene_tree, + species_tree, + false, + RIO.ROOT_BY_MINIMIZING_SUM_OF_DUPS, + RIO.ROOT_BY_MINIMIZING_TREE_HEIGHT, + true, + 1 )[ 0 ]; + setExtNodesOfAnalyzedGeneTrees( assigned_tree.getNumberOfExternalNodes() ); + if ( !ForesterUtil.isEmpty( query ) ) { + final List nodes = getNodesViaSequenceName( assigned_tree, query ); + if ( nodes.size() > 1 ) { + throw new IllegalArgumentException( "node named [" + query + "] not unique" ); + } + else if ( nodes.isEmpty() ) { + throw new IllegalArgumentException( "no node containing a sequence named [" + query + "] found" ); + } + final PhylogenyNode query_node = nodes.get( 0 ); + updateCounts( _o_maps, query, PhylogenyMethods.getOrthologousNodes( assigned_tree, query_node ) ); + updateCounts( _so_maps, query, PhylogenyMethods.getSuperOrthologousNodes( query_node ) ); + updateCounts( _up_maps, query, PhylogenyMethods.getUltraParalogousNodes( query_node ) ); + } + return assigned_tree; + } + + private final void init() { + _o_maps = null; + _so_maps = null; + _up_maps = null; + _seq_names = null; + _samples = 1; + _ext_nodes = 0; + } + + private final void setExtNodesOfAnalyzedGeneTrees( final int i ) { + _ext_nodes = i; + } + + private final void setNumberOfSamples( int i ) { if ( i < 1 ) { - i = 0; + i = 1; } - _ext_nodes_ = i; + _samples = i; } // Helper for doInferOrthologs( Phylogeny, Phylogeny, String ) // and doInferOrthologs( Phylogeny, Phylogeny ). - private void updateHash( final HashMap> counter_map, - final String query_seq_name, - final List nodes ) { + private final void updateCounts( final HashMap> counter_map, + final String query_seq_name, + final List nodes ) { final HashMap hash_map = counter_map.get( query_seq_name ); if ( hash_map == null ) { throw new RuntimeException( "Unexpected failure in method updateHash." ); @@ -602,6 +487,87 @@ public final class RIO { } } + public final static IntMatrix calculateOrthologTable( final Phylogeny[] analyzed_gene_trees ) { + final List labels = new ArrayList(); + final Set labels_set = new HashSet(); + String label; + for( final PhylogenyNode n : analyzed_gene_trees[ 0 ].getExternalNodes() ) { + if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getName() ) ) { + label = n.getNodeData().getSequence().getName(); + } + else if ( n.getNodeData().isHasSequence() + && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getSymbol() ) ) { + label = n.getNodeData().getSequence().getSymbol(); + } + else if ( !ForesterUtil.isEmpty( n.getName() ) ) { + label = n.getName(); + } + else { + throw new IllegalArgumentException( "node " + n + " has no appropriate label" ); + } + if ( labels_set.contains( label ) ) { + throw new IllegalArgumentException( "label " + label + " is not unique" ); + } + labels_set.add( label ); + labels.add( label ); + } + final IntMatrix m = new IntMatrix( labels ); + int counter = 0; + for( final Phylogeny gt : analyzed_gene_trees ) { + counter++; + PhylogenyMethods.preOrderReId( gt ); + final HashMap map = PhylogenyMethods.createNameToExtNodeMap( gt ); + for( int x = 0; x < m.size(); ++x ) { + final PhylogenyNode nx = map.get( m.getLabel( x ) ); + for( int y = 0; y < m.size(); ++y ) { + if ( !PhylogenyMethods.calculateLCAonTreeWithIdsInPreOrder( nx, map.get( m.getLabel( y ) ) ) + .isDuplication() ) { + m.inreaseByOne( x, y ); + } + } + } + } + return m; + } + + /** + * Returns the order in which ortholog (o), "super ortholog" (s) and + * distance (d) are returned and sorted (priority of sort always goes from + * left to right), given sort. For the meaning of sort + * + * @see #inferredOrthologsToString(String,int,double,double) + * + * @param sort + * determines order and sort priority + * @return String indicating the order + */ + public final static String getOrder( final int sort ) { + String order = ""; + switch ( sort ) { + case 0: + order = "orthologies"; + break; + case 1: + order = "orthologies > super orthologies"; + break; + case 2: + order = "super orthologies > orthologies"; + break; + default: + order = "orthologies"; + break; + } + return order; + } + + public final static StringBuffer getOrderHelp() { + final StringBuffer sb = new StringBuffer(); + sb.append( " 0: orthologies" + ForesterUtil.LINE_SEPARATOR ); + sb.append( " 1: orthologies > super orthologies" + ForesterUtil.LINE_SEPARATOR ); + sb.append( " 2: super orthologies > orthologies" + ForesterUtil.LINE_SEPARATOR ); + return sb; + } + // Helper method for inferredOrthologsToString // and inferredUltraParalogsToString. private final static String addNameAndValues( final String name, @@ -658,7 +624,7 @@ public final class RIO { return s; } - private static List getAllExternalSequenceNames( final Phylogeny phy ) { + private final static List getAllExternalSequenceNames( final Phylogeny phy ) { final List names = new ArrayList(); for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) { final PhylogenyNode n = iter.next(); @@ -675,45 +641,21 @@ public final class RIO { return names; } - /** - * Returns the order in which ortholog (o), "super ortholog" (s) and - * distance (d) are returned and sorted (priority of sort always goes from - * left to right), given sort. For the meaning of sort - * - * @see #inferredOrthologsToString(String,int,double,double) - * - * @param sort - * determines order and sort priority - * @return String indicating the order - */ - public final static String getOrder( final int sort ) { - String order = ""; - switch ( sort ) { - case 0: - order = "orthologies"; - break; - case 1: - order = "orthologies > super orthologies"; - break; - case 2: - order = "super orthologies > orthologies"; - break; - default: - order = "orthologies"; - break; + private final static List getNodesViaSequenceName( final Phylogeny phy, final String seq_name ) { + final List nodes = new ArrayList(); + for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) { + final PhylogenyNode n = iter.next(); + if ( n.getNodeData().isHasSequence() && n.getNodeData().getSequence().getName().equals( seq_name ) ) { + nodes.add( n ); + } + if ( !n.getNodeData().isHasSequence() && n.getName().equals( seq_name ) ) { + nodes.add( n ); + } } - return order; - } - - public final static StringBuffer getOrderHelp() { - final StringBuffer sb = new StringBuffer(); - sb.append( " 0: orthologies" + ForesterUtil.LINE_SEPARATOR ); - sb.append( " 1: orthologies > super orthologies" + ForesterUtil.LINE_SEPARATOR ); - sb.append( " 2: super orthologies > orthologies" + ForesterUtil.LINE_SEPARATOR ); - return sb; + return nodes; } - class ResultLine implements Comparable { + private final class ResultLine implements Comparable { public static final int DEFAULT = -999; private final String _key; @@ -721,13 +663,6 @@ public final class RIO { private final double _value2; private int[] _p; - ResultLine() { - setSigns(); - _key = ""; - _value1 = ResultLine.DEFAULT; - _value2 = ResultLine.DEFAULT; - } - ResultLine( final String name, final double value1, final double value2, final int c ) { setSigns(); _key = name; -- 1.7.10.2