import java.io.File;
import java.io.FileWriter;
+import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
+import org.forester.datastructures.IntMatrix;
import org.forester.io.parsers.phyloxml.PhyloXmlParser;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
import org.forester.phylogeny.factories.PhylogenyFactory;
import org.forester.sdi.RIO;
import org.forester.util.CommandLineArguments;
+import org.forester.util.EasyWriter;
import org.forester.util.ForesterUtil;
public class rio {
final static private String PRG_NAME = "rio";
final static private String PRG_VERSION = "3.00 beta 1";
- final static private String PRG_DATE = "2010.01.15";
+ final static private String PRG_DATE = "2012.11.27";
final static private String E_MAIL = "czmasek@burnham.org";
final static private String WWW = "www.phylosoft.org/forester/";
final static private String HELP_OPTION_1 = "help";
}
if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) || ( args.length == 0 ) ) {
printHelp();
- System.exit( 0 );
}
- if ( ( args.length < 3 ) || ( args.length > 10 ) ) {
+ if ( ( args.length < 2 ) || ( args.length > 10 ) ) {
System.out.println();
System.out.println( "[" + PRG_NAME + "] incorrect number of arguments" );
System.out.println();
printHelp();
- System.exit( -1 );
}
final List<String> allowed_options = new ArrayList<String>();
allowed_options.add( QUERY_OPTION );
if ( dissallowed_options.length() > 0 ) {
ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options );
}
- final File multiple_trees_file = cla.getFile( 0 );
+ final File gene_trees_file = cla.getFile( 0 );
final File species_tree_file = cla.getFile( 1 );
- final File outfile = cla.getFile( 2 );
- ForesterUtil.fatalErrorIfFileNotReadable( PRG_NAME, multiple_trees_file );
+ File outfile = null;
+ if ( cla.getNumberOfNames() > 2 ) {
+ outfile = cla.getFile( 2 );
+ }
+ ForesterUtil.fatalErrorIfFileNotReadable( PRG_NAME, gene_trees_file );
ForesterUtil.fatalErrorIfFileNotReadable( PRG_NAME, species_tree_file );
if ( outfile.exists() ) {
ForesterUtil.fatalError( PRG_NAME, "[" + outfile + "] already exists" );
}
- String seq_name = null;
+ String query = null;
if ( cla.isOptionSet( QUERY_OPTION ) ) {
- seq_name = cla.getOptionValue( QUERY_OPTION );
+ query = cla.getOptionValue( QUERY_OPTION );
}
File table_outfile = null;
if ( cla.isOptionSet( TABLE_OUTPUT_OPTION ) ) {
table_outfile = new File( cla.getOptionValue( TABLE_OUTPUT_OPTION ) );
if ( table_outfile.exists() ) {
- ForesterUtil.fatalError( PRG_NAME, "[" + outfile + "] already exists" );
+ ForesterUtil.fatalError( PRG_NAME, "[" + table_outfile + "] already exists" );
}
}
boolean output_ultraparalogs = false;
if ( cla.isOptionSet( OUTPUT_ULTRA_P_OPTION ) ) {
output_ultraparalogs = true;
}
- double t_orthologs = 0.0;
- double threshold_ultra_paralogs = 0.0;
+ double cutoff_for_orthologs = 50;
+ double cutoff_for_ultra_paralogs = 50;
int sort = 2;
try {
if ( cla.isOptionSet( CUTOFF_ORTHO_OPTION ) ) {
- t_orthologs = cla.getOptionValueAsDouble( CUTOFF_ORTHO_OPTION );
+ cutoff_for_orthologs = cla.getOptionValueAsDouble( CUTOFF_ORTHO_OPTION );
}
if ( cla.isOptionSet( CUTOFF_ULTRA_P_OPTION ) ) {
- threshold_ultra_paralogs = cla.getOptionValueAsDouble( CUTOFF_ULTRA_P_OPTION );
+ cutoff_for_ultra_paralogs = cla.getOptionValueAsDouble( CUTOFF_ULTRA_P_OPTION );
+ if ( !output_ultraparalogs ) {
+ printHelp();
+ }
}
if ( cla.isOptionSet( SORT_OPTION ) ) {
sort = cla.getOptionValueAsInt( SORT_OPTION );
catch ( final Exception e ) {
ForesterUtil.fatalError( PRG_NAME, "error in command line: " + e.getLocalizedMessage() );
}
- if ( sort < 0 ) {
- sort = 0;
+ if ( ( cutoff_for_orthologs < 0 ) || ( cutoff_for_ultra_paralogs < 0 ) || ( sort < 0 ) || ( sort > 2 ) ) {
+ printHelp();
+ }
+ if ( ( ( query == null ) && ( outfile != null ) ) || ( ( query != null ) && ( outfile == null ) ) ) {
+ printHelp();
}
- else if ( sort > 2 ) {
- sort = 2;
+ if ( output_ultraparalogs && ( outfile == null ) ) {
+ printHelp();
}
long time = 0;
- System.out.println( "\n" );
- System.out.println( "Gene trees: " + multiple_trees_file );
- System.out.println( "Species tree: " + species_tree_file );
- System.out.println( "Query: " + seq_name );
- System.out.println( "Outfile: " + outfile );
- System.out.println( "Outfile: " + table_outfile );
- System.out.println( "Sort: " + sort );
- System.out.println( "Threshold orthologs: " + t_orthologs );
- if ( output_ultraparalogs ) {
- System.out.println( "Threshold ultra paralogs: " + threshold_ultra_paralogs );
+ System.out.println( "Gene trees : " + gene_trees_file );
+ System.out.println( "Species tree : " + species_tree_file );
+ if ( query != null ) {
+ System.out.println( "Query : " + query );
+ System.out.println( "Outfile : " + outfile );
+ System.out.println( "Sort : " + sort );
+ System.out.println( "Cutoff for orthologs : " + cutoff_for_orthologs );
+ if ( output_ultraparalogs ) {
+ System.out.println( "Cutoff for ultra paralogs : " + cutoff_for_ultra_paralogs );
+ }
+ }
+ if ( table_outfile != null ) {
+ System.out.println( "Table output : " + table_outfile );
}
+ System.out.println();
time = System.currentTimeMillis();
Phylogeny species_tree = null;
try {
System.exit( -1 );
}
if ( !species_tree.isRooted() ) {
- ForesterUtil.printErrorMessage( PRG_NAME, "Species tree is not rooted" );
+ ForesterUtil.printErrorMessage( PRG_NAME, "species tree is not rooted" );
System.exit( -1 );
}
if ( !species_tree.isCompletelyBinary() ) {
- ForesterUtil.printErrorMessage( PRG_NAME, "Species tree is not completely binary" );
+ ForesterUtil.printErrorMessage( PRG_NAME, "species tree is not completely binary" );
System.exit( -1 );
}
- final RIO rio_instance = new RIO();
- final StringBuffer output = new StringBuffer();
- PrintWriter out = null;
try {
- rio_instance.inferOrthologs( multiple_trees_file, species_tree.copy(), seq_name );
- output.append( rio_instance.inferredOrthologsToString( seq_name, sort, t_orthologs ) );
- if ( output_ultraparalogs ) {
- output.append( "\n\nUltra paralogs:\n" );
- output.append( rio_instance.inferredUltraParalogsToString( seq_name, threshold_ultra_paralogs ) );
+ RIO rio;
+ if ( ForesterUtil.isEmpty( query ) ) {
+ rio = new RIO( gene_trees_file, species_tree );
+ }
+ else {
+ rio = new RIO( gene_trees_file, species_tree, query );
+ }
+ if ( outfile != null ) {
+ final StringBuilder output = new StringBuilder();
+ output.append( rio.inferredOrthologsToString( query, sort, cutoff_for_orthologs ) );
+ if ( output_ultraparalogs ) {
+ output.append( "\n\nUltra paralogs:\n" );
+ output.append( rio.inferredUltraParalogsToString( query, cutoff_for_ultra_paralogs ) );
+ }
+ output.append( "\n\nSort priority: " + RIO.getOrder( sort ) );
+ output.append( "\nExt nodes : " + rio.getExtNodesOfAnalyzedGeneTrees() );
+ output.append( "\nSamples : " + rio.getNumberOfSamples() + "\n" );
+ final PrintWriter out = new PrintWriter( new FileWriter( outfile ), true );
+ out.println( output );
+ out.close();
+ }
+ if ( table_outfile != null ) {
+ tableOutput( table_outfile, rio );
}
- output.append( "\n\nSort priority: " + RIO.getOrder( sort ) );
- output.append( "\nExt nodes : " + rio_instance.getExtNodesOfAnalyzedGeneTrees() );
- output.append( "\nSamples : " + rio_instance.getNumberOfSamples() + "\n" );
- out = new PrintWriter( new FileWriter( outfile ), true );
}
catch ( final Exception e ) {
ForesterUtil.printErrorMessage( PRG_NAME, e.getLocalizedMessage() );
e.printStackTrace();
System.exit( -1 );
}
- out.println( output );
- out.close();
- ForesterUtil.programMessage( PRG_NAME, "wrote results to \"" + outfile + "\"" );
+ if ( outfile != null ) {
+ ForesterUtil.programMessage( PRG_NAME, "wrote results to \"" + outfile + "\"" );
+ }
time = System.currentTimeMillis() - time;
ForesterUtil.programMessage( PRG_NAME, "time: " + time + "ms" );
- ForesterUtil.programMessage( PRG_NAME, "OK." );
+ ForesterUtil.programMessage( PRG_NAME, "OK" );
System.exit( 0 );
}
+ private static void tableOutput( final File table_outfile, final RIO rio ) throws IOException {
+ final IntMatrix m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees() );
+ writeTable( table_outfile, rio, m );
+ }
+
+ private static void writeTable( final File table_outfile, final RIO rio, final IntMatrix m ) throws IOException {
+ final EasyWriter w = ForesterUtil.createEasyWriter( table_outfile );
+ final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.###" );
+ df.setDecimalSeparatorAlwaysShown( false );
+ w.print( "\t" );
+ for( int i = 0; i < m.size(); ++i ) {
+ w.print( "\t" );
+ w.print( m.getLabel( i ) );
+ }
+ w.println();
+ for( int x = 0; x < m.size(); ++x ) {
+ w.print( m.getLabel( x ) );
+ w.print( "\t" );
+ for( int y = 0; y < m.size(); ++y ) {
+ w.print( "\t" );
+ if ( x == y ) {
+ if ( m.get( x, y ) != rio.getNumberOfSamples() ) {
+ ForesterUtil.unexpectedFatalError( PRG_NAME, "diagonal value is off" );
+ }
+ w.print( "-" );
+ }
+ else {
+ w.print( df.format( ( ( double ) m.get( x, y ) ) / rio.getNumberOfSamples() ) );
+ }
+ }
+ w.println();
+ }
+ w.close();
+ ForesterUtil.programMessage( PRG_NAME, "wrote table to \"" + table_outfile + "\"" );
+ }
+
private final static void printHelp() {
- System.out.println( "Usage:" );
+ System.out.println( "usage:" );
System.out.println();
System.out.println( PRG_NAME + " [options] <gene trees file> <species tree file> [outfile]" );
System.out.println();
- System.out.println( "options:" );
+ System.out.println( " options:" );
+ System.out.println();
+ System.out.println( " -" + CUTOFF_ORTHO_OPTION + " : cutoff for ortholog output (default: 50)" );
+ System.out.println( " -" + TABLE_OUTPUT_OPTION + " : file-name for output table" );
+ System.out.println( " -" + QUERY_OPTION + " : name for query (sequence/node)" );
+ System.out.println( " -" + SORT_OPTION + " : sort (default: 2)" );
+ System.out.println( " -" + OUTPUT_ULTRA_P_OPTION
+ + " : to output ultra-paralogs (species specific expansions/paralogs)" );
+ System.out.println( " -" + CUTOFF_ULTRA_P_OPTION + " : cutoff for ultra-paralog output (default: 50)" );
System.out.println();
- // System.out.println( " -" + STRICT_OPTION
- // + " : strict [default: non-strict]: all nodes between 'target' and 'evaluators' must match" );
- // System.out.println( " -" + NORMALIZE_OPTION
- // + "=<d>: normalize to this value (e.g. 100 for most bootstrap analyses) [default: no normalization]" );
- // System.out.println( " -" + FIRST_OPTION + "=<i>: first evaluator topology to use (0-based) [default: 0]" );
- // System.out.println( " -" + LAST_OPTION
- // + "=<i>: last evaluator topology to use (0-based) [default: use all until final topology]" );
- // System.out.println();
- // System.out.println( "M= (String) Multiple gene tree file (mandatory)" );
- // System.out.println( "N= (String) Query sequence name (mandatory)" );
- // System.out.println( "S= (String) Species tree file (mandatory)" );
- // System.out.println( "O= (String) Output file name -- overwritten without warning! (mandatory)" );
- // System.out.println( "P= (int) Sort priority" );
- // System.out.println( "L= (double) Threshold orthologs for output" );
- // System.out.println( " Sort priority (\"P=\"):" );
+ System.out.println( " sort:" );
System.out.println( RIO.getOrderHelp().toString() );
System.out.println();
System.out
- .println( " Example: \"rio -q=D_NEMVE -s=1 -t=out -u Bcl-2_e1_20_mafft_05_40_fme.mlt species.xml out\"" );
+ .println( " example: \"rio gene_trees.nh species.xml outfile -q=D_HUMAN -t=outtable -u -cu=60 -co=60\"" );
System.out.println();
+ System.exit( -1 );
}
}
import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
import org.forester.util.ForesterUtil;
-/*
- * @author Christian M. Zmasek
- */
public final class RIO {
private final static boolean ROOT_BY_MINIMIZING_SUM_OF_DUPS = true;
private final static boolean ROOT_BY_MINIMIZING_TREE_HEIGHT = true;
- private HashMap<String, HashMap<String, Integer>> _o_hash_maps;
- private HashMap<String, HashMap<String, Integer>> _so_hash_maps;
- private HashMap<String, HashMap<String, Integer>> _up_hash_maps;
+ private Phylogeny[] _analyzed_gene_trees;
+ private HashMap<String, HashMap<String, Integer>> _o_maps;
+ private HashMap<String, HashMap<String, Integer>> _so_maps;
+ private HashMap<String, HashMap<String, Integer>> _up_maps;
private List<String> _seq_names;
private int _samples;
- private int _ext_nodes_;
+ private int _ext_nodes;
/**
* Default constructor.
+ * @throws SDIException
+ * @throws IOException
*/
- public RIO() {
- reset();
- }
-
- public static IntMatrix calculateOrthologTable( final Phylogeny[] gene_trees ) {
- final List<String> labels = new ArrayList<String>();
- final Set<String> labels_set = new HashSet<String>();
- String label;
- for( final PhylogenyNode n : gene_trees[ 0 ].getExternalNodes() ) {
- if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getName() ) ) {
- label = n.getNodeData().getSequence().getName();
- }
- else if ( n.getNodeData().isHasSequence()
- && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getSymbol() ) ) {
- label = n.getNodeData().getSequence().getSymbol();
- }
- else if ( !ForesterUtil.isEmpty( n.getName() ) ) {
- label = n.getName();
- }
- else {
- throw new IllegalArgumentException( "node " + n + " has no appropriate label" );
- }
- if ( labels_set.contains( label ) ) {
- throw new IllegalArgumentException( "label " + label + " is not unique" );
- }
- labels_set.add( label );
- labels.add( label );
- }
- final IntMatrix m = new IntMatrix( labels );
- int counter = 0;
- for( final Phylogeny gt : gene_trees ) {
- System.out.println( counter );
- counter++;
- PhylogenyMethods.preOrderReId( gt );
- final HashMap<String, PhylogenyNode> map = PhylogenyMethods.createNameToExtNodeMap( gt );
- for( int x = 0; x < m.size(); ++x ) {
- final PhylogenyNode nx = map.get( m.getLabel( x ) );
- for( int y = 0; y < m.size(); ++y ) {
- if ( !PhylogenyMethods.calculateLCAonTreeWithIdsInPreOrder( nx, map.get( m.getLabel( y ) ) )
- .isDuplication() ) {
- m.inreaseByOne( x, y );
- }
- }
- }
+ public RIO( final File gene_trees_file, final Phylogeny species_tree, final String query ) throws IOException,
+ SDIException {
+ if ( ForesterUtil.isEmpty( query ) ) {
+ throw new IllegalArgumentException( "query is empty" );
}
- return m;
+ init();
+ inferOrthologs( gene_trees_file, species_tree, query );
}
- public final int getNumberOfSamples() {
- return _samples;
+ public RIO( final File gene_trees_file, final Phylogeny species_tree ) throws IOException, SDIException {
+ init();
+ inferOrthologs( gene_trees_file, species_tree, null );
}
- // Helper method for inferredOrthologsToString.
- // inferredOrthologsToArrayList,
- // and inferredUltraParalogsToString.
- private final double getBootstrapValueFromHash( final HashMap<String, Integer> h, final String name ) {
- if ( !h.containsKey( name ) ) {
- return 0.0;
- }
- final int i = h.get( name );
- return ( ( i * 100.0 ) / getNumberOfSamples() );
+ public final Phylogeny[] getAnalyzedGeneTrees() {
+ return _analyzed_gene_trees;
}
/**
* @return number of ext nodes in gene trees analyzed (after stripping)
*/
public final int getExtNodesOfAnalyzedGeneTrees() {
- return _ext_nodes_;
- }
-
- /**
- * Returns a HashMap containing the inferred orthologs of the external gene
- * tree node with the sequence name seq_name. Sequence names are the keys
- * (String), numbers of observations are the values (Int). Orthologs are to
- * be inferred by method "inferOrthologs". Throws an exception if seq_name
- * is not found.
- *
- * @param seq_name
- * sequence name of a external node of the gene trees
- * @return HashMap containing the inferred orthologs
- * (name(String)->value(Int))
- */
- public final HashMap<String, Integer> getInferredOrthologs( final String seq_name ) {
- if ( _o_hash_maps == null ) {
- return null;
- }
- return _o_hash_maps.get( seq_name );
- }
-
- /**
- * Returns a HashMap containing the inferred "super orthologs" of the
- * external gene tree node with the sequence name seq_name. Sequence names
- * are the keys (String), numbers of observations are the values (Int).
- * Super orthologs are to be inferred by method "inferOrthologs". Throws an
- * exception if seq_name is not found.
- *
- * @param seq_name
- * sequence name of a external node of the gene trees
- * @return HashMap containing the inferred super orthologs
- * (name(String)->value(Int))
- */
- public final HashMap<String, Integer> getInferredSuperOrthologs( final String seq_name ) {
- if ( _so_hash_maps == null ) {
- return null;
- }
- return _so_hash_maps.get( seq_name );
+ return _ext_nodes;
}
/**
* (name(String)->value(Int))
*/
public final HashMap<String, Integer> getInferredUltraParalogs( final String seq_name ) {
- if ( _up_hash_maps == null ) {
+ if ( _up_maps == null ) {
return null;
}
- return _up_hash_maps.get( seq_name );
+ return _up_maps.get( seq_name );
}
- /**
- * Infers the orthologs (as well the "super orthologs", the "subtree
- * neighbors", and the "ultra paralogs") for each external node of the gene
- * Trees in multiple tree File gene_trees_file (=output of PHYLIP NEIGHBOR,
- * for example). Tallies how many times each sequence is (super-)
- * orthologous towards the query. Tallies how many times each sequence is
- * ultra paralogous towards the query. Tallies how many times each sequence
- * is a subtree neighbor of the query. Gene duplications are inferred using
- * SDI. Modifies its argument species_tree. Is a little faster than
- * "inferOrthologs(File,Phylogeny)" since orthologs are only inferred for
- * query.
- * <p>
- * To obtain the results use the methods listed below.
- *
- * @param gene_trees_file
- * a File containing gene Trees in NH format, which is the result
- * of performing a bootstrap analysis in PHYLIP
- * @param species_tree
- * a species Phylogeny, which has species names in its species
- * fields
- * @param query
- * the sequence name of the squence whose orthologs are to be
- * inferred
- * @throws SDIException
- */
- public void inferOrthologs( final File gene_trees_file, final Phylogeny species_tree, final String query )
- throws IOException, SDIException {
- int bs = 0;
- // Read in first tree to get its sequence names
- // and strip species_tree.
- final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
- final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
- if ( p instanceof NHXParser ) {
- final NHXParser nhx = ( NHXParser ) p;
- nhx.setReplaceUnderscores( false );
- nhx.setIgnoreQuotes( true );
- nhx.setTaxonomyExtraction( PhylogenyMethods.TAXONOMY_EXTRACTION.YES );
- }
- final Phylogeny[] gene_trees = factory.create( gene_trees_file, p );
- // Removes from species_tree all species not found in gene_tree.
- PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gene_trees[ 0 ], species_tree );
- PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gene_trees[ 0 ] );
- _seq_names = getAllExternalSequenceNames( gene_trees[ 0 ] );
- if ( ( _seq_names == null ) || ( _seq_names.size() < 1 ) ) {
- throw new IOException( "could not get sequence names" );
- }
- _o_hash_maps = new HashMap<String, HashMap<String, Integer>>();
- _so_hash_maps = new HashMap<String, HashMap<String, Integer>>();
- _up_hash_maps = new HashMap<String, HashMap<String, Integer>>();
- _o_hash_maps.put( query, new HashMap<String, Integer>( _seq_names.size() ) );
- _so_hash_maps.put( query, new HashMap<String, Integer>( _seq_names.size() ) );
- _up_hash_maps.put( query, new HashMap<String, Integer>( _seq_names.size() ) );
- // Go through all gene trees in the file.
- final Phylogeny[] assigned_trees = new Phylogeny[ gene_trees.length ];
- System.out.println( "gene trees" + gene_trees.length );
- int c = 0;
- for( final Phylogeny gt : gene_trees ) {
- bs++;
- // Removes from gene_tree all species not found in species_tree.
- PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt );
- assigned_trees[ c++ ] = inferOrthologsHelper( gt, species_tree, query );
- }
- final IntMatrix m = calculateOrthologTable( assigned_trees );
- System.out.println( m.toString() );
- setNumberOfSamples( gene_trees.length );
- }
-
- public List<PhylogenyNode> getNodesViaSequenceName( final Phylogeny phy, final String seq_name ) {
- final List<PhylogenyNode> nodes = new ArrayList<PhylogenyNode>();
- for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) {
- final PhylogenyNode n = iter.next();
- if ( n.getNodeData().isHasSequence() && n.getNodeData().getSequence().getName().equals( seq_name ) ) {
- nodes.add( n );
- }
- if ( !n.getNodeData().isHasSequence() && n.getName().equals( seq_name ) ) {
- nodes.add( n );
- }
- }
- return nodes;
- }
-
- // Helper method which performs the actual ortholog inference for
- // the external node with seqname query.
- private Phylogeny inferOrthologsHelper( final Phylogeny gene_tree, final Phylogeny species_tree, final String query )
- throws SDIException {
- Phylogeny assigned_tree = null;
- List<PhylogenyNode> nodes = null;
- final SDIR sdiunrooted = new SDIR();
- List<PhylogenyNode> orthologs = null;
- List<PhylogenyNode> super_orthologs = null;
- List<PhylogenyNode> ultra_paralogs = null;
- assigned_tree = sdiunrooted.infer( gene_tree,
- species_tree,
- false,
- RIO.ROOT_BY_MINIMIZING_SUM_OF_DUPS,
- RIO.ROOT_BY_MINIMIZING_TREE_HEIGHT,
- true,
- 1 )[ 0 ];
- setExtNodesOfAnalyzedGeneTrees( assigned_tree.getNumberOfExternalNodes() );
- nodes = getNodesViaSequenceName( assigned_tree, query );
- if ( nodes.size() > 1 ) {
- throw new IllegalArgumentException( "node named [" + query + "] not unique" );
- }
- else if ( nodes.isEmpty() ) {
- throw new IllegalArgumentException( "no node containing a sequence named [" + query + "] found" );
- }
- final PhylogenyNode query_node = nodes.get( 0 );
- orthologs = PhylogenyMethods.getOrthologousNodes( assigned_tree, query_node );
- updateHash( _o_hash_maps, query, orthologs );
- super_orthologs = PhylogenyMethods.getSuperOrthologousNodes( query_node );
- updateHash( _so_hash_maps, query, super_orthologs );
- ultra_paralogs = PhylogenyMethods.getUltraParalogousNodes( query_node );
- updateHash( _up_hash_maps, query, ultra_paralogs );
- return assigned_tree;
- }
-
- /**
- * Returns an ArrayList containg the names of orthologs of the PhylogenyNode
- * with seq name seq_name.
- *
- * @param seq_name
- * sequence name of a external node of the gene trees
- * @param threshold_orthologs
- * the minimal number of observations for a a sequence to be
- * reported as orthologous as percentage (0.0-100.0%)
- * @return ArrayList containg the names of orthologs of the PhylogenyNode
- * with seq name seq_name
- */
- public ArrayList<String> inferredOrthologsToArrayList( final String seq_name, double threshold_orthologs ) {
- HashMap<String, Integer> o_hashmap = null;
- String name = null;
- double o = 0.0;
- final ArrayList<String> arraylist = new ArrayList<String>();
- if ( _o_hash_maps == null ) {
- throw new RuntimeException( "Orthologs have not been calculated (successfully)." );
- }
- if ( threshold_orthologs < 0.0 ) {
- threshold_orthologs = 0.0;
- }
- else if ( threshold_orthologs > 100.0 ) {
- threshold_orthologs = 100.0;
- }
- o_hashmap = getInferredOrthologs( seq_name );
- if ( o_hashmap == null ) {
- throw new RuntimeException( "Orthologs for " + seq_name + " were not established." );
- }
- if ( _seq_names.size() > 0 ) {
- I: for( int i = 0; i < _seq_names.size(); ++i ) {
- name = _seq_names.get( i );
- if ( name.equals( seq_name ) ) {
- continue I;
- }
- o = getBootstrapValueFromHash( o_hashmap, name );
- if ( o < threshold_orthologs ) {
- continue I;
- }
- arraylist.add( name );
- }
- }
- return arraylist;
+ public final int getNumberOfSamples() {
+ return _samples;
}
/**
* reported as orthologous, in percents (0.0-100.0%)
* @return String containing the inferred orthologs, String containing "-"
* if no orthologs have been found null in case of error
- * @see #inferOrthologs(File,Phylogeny,String)
- * @see #inferOrthologs(Phylogeny[],Phylogeny)
- * @see #inferOrthologs(File,Phylogeny)
- * @see #getOrder(int)
*/
- public StringBuffer inferredOrthologsToString( final String query_name, int sort, double threshold_orthologs ) {
+ public final StringBuffer inferredOrthologsToString( final String query_name, int sort, double threshold_orthologs ) {
HashMap<String, Integer> o_hashmap = null;
HashMap<String, Integer> s_hashmap = null;
String name = "";
double value1 = 0.0;
double value2 = 0.0;
final ArrayList<ResultLine> nv = new ArrayList<ResultLine>();
- if ( ( _o_hash_maps == null ) || ( _so_hash_maps == null ) ) {
+ if ( ( _o_maps == null ) || ( _so_maps == null ) ) {
throw new RuntimeException( "orthologs have not been calculated (successfully)" );
}
if ( ( sort < 0 ) || ( sort > 2 ) ) {
}
} // End of I for loop.
if ( ( nv != null ) && ( nv.size() > 0 ) ) {
- orthologs.append( "[seq name]\t\t[ortho]\t[st-n]\t[sup-o]\t[dist]" + ForesterUtil.LINE_SEPARATOR );
+ orthologs.append( "seq name\t\tortho\ts-ortho" + ForesterUtil.LINE_SEPARATOR );
final ResultLine[] nv_array = new ResultLine[ nv.size() ];
for( int j = 0; j < nv.size(); ++j ) {
nv_array[ j ] = nv.get( j );
orthologs.append( "-" );
}
return orthologs;
- } // inferredOrthologsToString( String, int, double )
+ }
/**
* Returns a String containg the names of orthologs of the PhylogenyNode
* @return String containing the inferred orthologs, String containing "-"
* if no orthologs have been found null in case of error
*/
- public String inferredUltraParalogsToString( final String query_name, double threshold_ultra_paralogs ) {
+ public final String inferredUltraParalogsToString( final String query_name, double threshold_ultra_paralogs ) {
HashMap<String, Integer> sp_hashmap = null;
String name = "", ultra_paralogs = "";
int sort = 0;
else if ( threshold_ultra_paralogs > 100.0 ) {
threshold_ultra_paralogs = 100.0;
}
- if ( _up_hash_maps == null ) {
+ if ( _up_maps == null ) {
throw new RuntimeException( "Ultra paralogs have not been calculated (successfully)." );
}
sp_hashmap = getInferredUltraParalogs( query_name );
return ultra_paralogs;
}
+ // Helper method for inferredOrthologsToString.
+ // inferredOrthologsToArrayList,
+ // and inferredUltraParalogsToString.
+ private final double getBootstrapValueFromHash( final HashMap<String, Integer> h, final String name ) {
+ if ( !h.containsKey( name ) ) {
+ return 0.0;
+ }
+ final int i = h.get( name );
+ return ( ( i * 100.0 ) / getNumberOfSamples() );
+ }
+
/**
- * Brings this into the same state as immediately after construction.
+ * Returns a HashMap containing the inferred orthologs of the external gene
+ * tree node with the sequence name seq_name. Sequence names are the keys
+ * (String), numbers of observations are the values (Int). Orthologs are to
+ * be inferred by method "inferOrthologs". Throws an exception if seq_name
+ * is not found.
+ *
+ * @param seq_name
+ * sequence name of a external node of the gene trees
+ * @return HashMap containing the inferred orthologs
+ * (name(String)->value(Int))
*/
- private final void reset() {
- _o_hash_maps = null;
- _so_hash_maps = null;
- _up_hash_maps = null;
- _seq_names = null;
- _samples = 1;
- _ext_nodes_ = 0;
+ private final HashMap<String, Integer> getInferredOrthologs( final String seq_name ) {
+ if ( _o_maps == null ) {
+ return null;
+ }
+ return _o_maps.get( seq_name );
}
- private void setNumberOfSamples( int i ) {
- if ( i < 1 ) {
- i = 1;
+ /**
+ * Returns a HashMap containing the inferred "super orthologs" of the
+ * external gene tree node with the sequence name seq_name. Sequence names
+ * are the keys (String), numbers of observations are the values (Int).
+ * Super orthologs are to be inferred by method "inferOrthologs". Throws an
+ * exception if seq_name is not found.
+ *
+ * @param seq_name
+ * sequence name of a external node of the gene trees
+ * @return HashMap containing the inferred super orthologs
+ * (name(String)->value(Int))
+ */
+ private final HashMap<String, Integer> getInferredSuperOrthologs( final String seq_name ) {
+ if ( _so_maps == null ) {
+ return null;
}
- System.out.println( "samples: " + i );
- _samples = i;
+ return _so_maps.get( seq_name );
}
/**
- * Sets number of ext nodes in gene trees analyzed (after stripping).
- * @param the
- * number of ext nodes in gene trees analyzed (after stripping)
+ * Infers the orthologs (as well the "super orthologs", the "subtree
+ * neighbors", and the "ultra paralogs") for each external node of the gene
+ * Trees in multiple tree File gene_trees_file (=output of PHYLIP NEIGHBOR,
+ * for example). Tallies how many times each sequence is (super-)
+ * orthologous towards the query. Tallies how many times each sequence is
+ * ultra paralogous towards the query. Tallies how many times each sequence
+ * is a subtree neighbor of the query. Gene duplications are inferred using
+ * SDI. Modifies its argument species_tree. Is a little faster than
+ * "inferOrthologs(File,Phylogeny)" since orthologs are only inferred for
+ * query.
+ * <p>
+ * To obtain the results use the methods listed below.
+ *
+ * @param gene_trees_file
+ * a File containing gene Trees in NH format, which is the result
+ * of performing a bootstrap analysis in PHYLIP
+ * @param species_tree
+ * a species Phylogeny, which has species names in its species
+ * fields
+ * @param query
+ * the sequence name of the squence whose orthologs are to be
+ * inferred
+ * @throws SDIException
*/
- private void setExtNodesOfAnalyzedGeneTrees( int i ) {
+ private final void inferOrthologs( final File gene_trees_file, final Phylogeny species_tree, final String query )
+ throws IOException, SDIException {
+ // Read in first tree to get its sequence names
+ // and strip species_tree.
+ final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
+ final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
+ if ( p instanceof NHXParser ) {
+ final NHXParser nhx = ( NHXParser ) p;
+ nhx.setReplaceUnderscores( false );
+ nhx.setIgnoreQuotes( true );
+ nhx.setTaxonomyExtraction( PhylogenyMethods.TAXONOMY_EXTRACTION.YES );
+ }
+ final Phylogeny[] gene_trees = factory.create( gene_trees_file, p );
+ // Removes from species_tree all species not found in gene_tree.
+ PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gene_trees[ 0 ], species_tree );
+ if ( !ForesterUtil.isEmpty( query ) ) {
+ PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gene_trees[ 0 ] );
+ _seq_names = getAllExternalSequenceNames( gene_trees[ 0 ] );
+ if ( ( _seq_names == null ) || ( _seq_names.size() < 1 ) ) {
+ throw new IOException( "could not get sequence names" );
+ }
+ _o_maps = new HashMap<String, HashMap<String, Integer>>();
+ _so_maps = new HashMap<String, HashMap<String, Integer>>();
+ _up_maps = new HashMap<String, HashMap<String, Integer>>();
+ _o_maps.put( query, new HashMap<String, Integer>( _seq_names.size() ) );
+ _so_maps.put( query, new HashMap<String, Integer>( _seq_names.size() ) );
+ _up_maps.put( query, new HashMap<String, Integer>( _seq_names.size() ) );
+ }
+ _analyzed_gene_trees = new Phylogeny[ gene_trees.length ];
+ int c = 0;
+ for( final Phylogeny gt : gene_trees ) {
+ // Removes from gene_tree all species not found in species_tree.
+ PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt );
+ _analyzed_gene_trees[ c++ ] = inferOrthologsHelper( gt, species_tree, query );
+ }
+ setNumberOfSamples( gene_trees.length );
+ }
+
+ // Helper method which performs the actual ortholog inference for
+ // the external node with seqname query.
+ private final Phylogeny inferOrthologsHelper( final Phylogeny gene_tree,
+ final Phylogeny species_tree,
+ final String query ) throws SDIException {
+ final SDIR sdiunrooted = new SDIR();
+ final Phylogeny assigned_tree = sdiunrooted.infer( gene_tree,
+ species_tree,
+ false,
+ RIO.ROOT_BY_MINIMIZING_SUM_OF_DUPS,
+ RIO.ROOT_BY_MINIMIZING_TREE_HEIGHT,
+ true,
+ 1 )[ 0 ];
+ setExtNodesOfAnalyzedGeneTrees( assigned_tree.getNumberOfExternalNodes() );
+ if ( !ForesterUtil.isEmpty( query ) ) {
+ final List<PhylogenyNode> nodes = getNodesViaSequenceName( assigned_tree, query );
+ if ( nodes.size() > 1 ) {
+ throw new IllegalArgumentException( "node named [" + query + "] not unique" );
+ }
+ else if ( nodes.isEmpty() ) {
+ throw new IllegalArgumentException( "no node containing a sequence named [" + query + "] found" );
+ }
+ final PhylogenyNode query_node = nodes.get( 0 );
+ updateCounts( _o_maps, query, PhylogenyMethods.getOrthologousNodes( assigned_tree, query_node ) );
+ updateCounts( _so_maps, query, PhylogenyMethods.getSuperOrthologousNodes( query_node ) );
+ updateCounts( _up_maps, query, PhylogenyMethods.getUltraParalogousNodes( query_node ) );
+ }
+ return assigned_tree;
+ }
+
+ private final void init() {
+ _o_maps = null;
+ _so_maps = null;
+ _up_maps = null;
+ _seq_names = null;
+ _samples = 1;
+ _ext_nodes = 0;
+ }
+
+ private final void setExtNodesOfAnalyzedGeneTrees( final int i ) {
+ _ext_nodes = i;
+ }
+
+ private final void setNumberOfSamples( int i ) {
if ( i < 1 ) {
- i = 0;
+ i = 1;
}
- _ext_nodes_ = i;
+ _samples = i;
}
// Helper for doInferOrthologs( Phylogeny, Phylogeny, String )
// and doInferOrthologs( Phylogeny, Phylogeny ).
- private void updateHash( final HashMap<String, HashMap<String, Integer>> counter_map,
- final String query_seq_name,
- final List<PhylogenyNode> nodes ) {
+ private final void updateCounts( final HashMap<String, HashMap<String, Integer>> counter_map,
+ final String query_seq_name,
+ final List<PhylogenyNode> nodes ) {
final HashMap<String, Integer> hash_map = counter_map.get( query_seq_name );
if ( hash_map == null ) {
throw new RuntimeException( "Unexpected failure in method updateHash." );
}
}
+ public final static IntMatrix calculateOrthologTable( final Phylogeny[] analyzed_gene_trees ) {
+ final List<String> labels = new ArrayList<String>();
+ final Set<String> labels_set = new HashSet<String>();
+ String label;
+ for( final PhylogenyNode n : analyzed_gene_trees[ 0 ].getExternalNodes() ) {
+ if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getName() ) ) {
+ label = n.getNodeData().getSequence().getName();
+ }
+ else if ( n.getNodeData().isHasSequence()
+ && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getSymbol() ) ) {
+ label = n.getNodeData().getSequence().getSymbol();
+ }
+ else if ( !ForesterUtil.isEmpty( n.getName() ) ) {
+ label = n.getName();
+ }
+ else {
+ throw new IllegalArgumentException( "node " + n + " has no appropriate label" );
+ }
+ if ( labels_set.contains( label ) ) {
+ throw new IllegalArgumentException( "label " + label + " is not unique" );
+ }
+ labels_set.add( label );
+ labels.add( label );
+ }
+ final IntMatrix m = new IntMatrix( labels );
+ int counter = 0;
+ for( final Phylogeny gt : analyzed_gene_trees ) {
+ counter++;
+ PhylogenyMethods.preOrderReId( gt );
+ final HashMap<String, PhylogenyNode> map = PhylogenyMethods.createNameToExtNodeMap( gt );
+ for( int x = 0; x < m.size(); ++x ) {
+ final PhylogenyNode nx = map.get( m.getLabel( x ) );
+ for( int y = 0; y < m.size(); ++y ) {
+ if ( !PhylogenyMethods.calculateLCAonTreeWithIdsInPreOrder( nx, map.get( m.getLabel( y ) ) )
+ .isDuplication() ) {
+ m.inreaseByOne( x, y );
+ }
+ }
+ }
+ }
+ return m;
+ }
+
+ /**
+ * Returns the order in which ortholog (o), "super ortholog" (s) and
+ * distance (d) are returned and sorted (priority of sort always goes from
+ * left to right), given sort. For the meaning of sort
+ *
+ * @see #inferredOrthologsToString(String,int,double,double)
+ *
+ * @param sort
+ * determines order and sort priority
+ * @return String indicating the order
+ */
+ public final static String getOrder( final int sort ) {
+ String order = "";
+ switch ( sort ) {
+ case 0:
+ order = "orthologies";
+ break;
+ case 1:
+ order = "orthologies > super orthologies";
+ break;
+ case 2:
+ order = "super orthologies > orthologies";
+ break;
+ default:
+ order = "orthologies";
+ break;
+ }
+ return order;
+ }
+
+ public final static StringBuffer getOrderHelp() {
+ final StringBuffer sb = new StringBuffer();
+ sb.append( " 0: orthologies" + ForesterUtil.LINE_SEPARATOR );
+ sb.append( " 1: orthologies > super orthologies" + ForesterUtil.LINE_SEPARATOR );
+ sb.append( " 2: super orthologies > orthologies" + ForesterUtil.LINE_SEPARATOR );
+ return sb;
+ }
+
// Helper method for inferredOrthologsToString
// and inferredUltraParalogsToString.
private final static String addNameAndValues( final String name,
return s;
}
- private static List<String> getAllExternalSequenceNames( final Phylogeny phy ) {
+ private final static List<String> getAllExternalSequenceNames( final Phylogeny phy ) {
final List<String> names = new ArrayList<String>();
for( final PhylogenyNodeIterator iter = phy.iteratorExternalForward(); iter.hasNext(); ) {
final PhylogenyNode n = iter.next();
return names;
}
- /**
- * Returns the order in which ortholog (o), "super ortholog" (s) and
- * distance (d) are returned and sorted (priority of sort always goes from
- * left to right), given sort. For the meaning of sort
- *
- * @see #inferredOrthologsToString(String,int,double,double)
- *
- * @param sort
- * determines order and sort priority
- * @return String indicating the order
- */
- public final static String getOrder( final int sort ) {
- String order = "";
- switch ( sort ) {
- case 0:
- order = "orthologies";
- break;
- case 1:
- order = "orthologies > super orthologies";
- break;
- case 2:
- order = "super orthologies > orthologies";
- break;
- default:
- order = "orthologies";
- break;
+ private final static List<PhylogenyNode> getNodesViaSequenceName( final Phylogeny phy, final String seq_name ) {
+ final List<PhylogenyNode> nodes = new ArrayList<PhylogenyNode>();
+ for( final PhylogenyNodeIterator iter = phy.iteratorPreorder(); iter.hasNext(); ) {
+ final PhylogenyNode n = iter.next();
+ if ( n.getNodeData().isHasSequence() && n.getNodeData().getSequence().getName().equals( seq_name ) ) {
+ nodes.add( n );
+ }
+ if ( !n.getNodeData().isHasSequence() && n.getName().equals( seq_name ) ) {
+ nodes.add( n );
+ }
}
- return order;
- }
-
- public final static StringBuffer getOrderHelp() {
- final StringBuffer sb = new StringBuffer();
- sb.append( " 0: orthologies" + ForesterUtil.LINE_SEPARATOR );
- sb.append( " 1: orthologies > super orthologies" + ForesterUtil.LINE_SEPARATOR );
- sb.append( " 2: super orthologies > orthologies" + ForesterUtil.LINE_SEPARATOR );
- return sb;
+ return nodes;
}
- class ResultLine implements Comparable<ResultLine> {
+ private final class ResultLine implements Comparable<ResultLine> {
public static final int DEFAULT = -999;
private final String _key;
private final double _value2;
private int[] _p;
- ResultLine() {
- setSigns();
- _key = "";
- _value1 = ResultLine.DEFAULT;
- _value2 = ResultLine.DEFAULT;
- }
-
ResultLine( final String name, final double value1, final double value2, final int c ) {
setSigns();
_key = name;