2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // Copyright (C) 2000-2001 Washington University School of Medicine
8 // and Howard Hughes Medical Institute
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
25 // Contact: phylosoft @ gmail . com
26 // WWW: www.phylosoft.org/forester
28 package org.forester.application;
31 import java.io.FileWriter;
32 import java.io.IOException;
33 import java.io.PrintWriter;
34 import java.util.ArrayList;
35 import java.util.HashMap;
36 import java.util.Vector;
38 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
39 import org.forester.phylogeny.Phylogeny;
40 import org.forester.phylogeny.PhylogenyMethods;
41 import org.forester.phylogeny.PhylogenyNode;
42 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
43 import org.forester.phylogeny.factories.PhylogenyFactory;
44 import org.forester.phylogeny.iterators.PreorderTreeIterator;
45 import org.forester.sdi.DistanceCalculator;
46 import org.forester.sdi.RIO;
47 import org.forester.sdi.SDIException;
48 import org.forester.sdi.SDIR;
49 import org.forester.util.ForesterUtil;
53 final static private String PRG_NAME = "RIO";
54 final static private String PRG_VERSION = "2.03 ALPHA";
55 final static private String PRG_DATE = "2010.01.15";
56 final static private String E_MAIL = "czmasek@burnham.org";
57 final static private String WWW = "www.phylosoft.org/forester/";
58 final static private boolean TIME = true;
59 final static private boolean VERBOSE = true;
60 // For method getDistances -- calculation of distances.
61 final static private boolean MINIMIZE_COST = false;
62 // For method getDistances -- calculation of distances.
63 final static private boolean MINIMIZE_DUPS = true;
64 // For method getDistances -- calculation of distances.
65 final static private boolean MINIMIZE_HEIGHT = true;
66 final static private int WARN_NO_ORTHOS_DEFAULT = 2;
67 final static private int
68 // How many sd away from mean to root.
69 WARN_MORE_THAN_ONE_ORTHO_DEFAULT = 2;
70 // How many sd away from mean to LCA of orthos.
71 final static private double THRESHOLD_ULTRA_PARALOGS_DEFAULT = 50;
72 // How many sd away from mean to LCA of orthos.
73 final static private double WARN_ONE_ORTHO_DEFAULT = 2;
75 // Factor between the two distances to their LCA
77 // Factor between the two distances to their LCA
80 * Calculates the mean and standard deviation of all nodes of Phylogeny t
81 * which have a bootstrap values zero or more. Returns null in case of
82 * failure (e.g t has no bootstrap values, or just one).
86 * reference to a tree with bootstrap values
87 * @return Array of doubles, [0] is the mean, [1] the standard deviation
89 private static double[] calculateMeanBoostrapValue( final Phylogeny t ) {
93 double x = 0.0, mean = 0.0;
94 final double[] da = new double[ 2 ];
95 final Vector<Double> bv = new Vector<Double>();
96 PhylogenyNode node = null;
97 PreorderTreeIterator i = null;
98 i = new PreorderTreeIterator( t );
99 // Calculates the mean.
100 while ( i.hasNext() ) {
102 if ( !( ( node.getParent() != null ) && node.getParent().isRoot()
103 && ( PhylogenyMethods.getConfidenceValue( node.getParent().getChildNode1() ) > 0 )
104 && ( PhylogenyMethods.getConfidenceValue( node.getParent().getChildNode2() ) > 0 ) && ( node
105 .getParent().getChildNode2() == node ) ) ) {
106 b = PhylogenyMethods.getConfidenceValue( node );
109 bv.addElement( new Double( b ) );
118 mean = ( double ) sum / n;
119 // Calculates the standard deviation.
121 for( int j = 0; j < n; ++j ) {
122 b = ( bv.elementAt( j ) ).intValue();
127 da[ 1 ] = java.lang.Math.sqrt( sum / ( n - 1.0 ) );
131 private final static void errorInCommandLine() {
132 System.out.println( "\nrio: Error in command line.\n" );
137 // Uses DistanceCalculator to calculate distances.
138 private final static StringBuffer getDistances( final File tree_file_for_dist_val,
140 final Phylogeny species_tree,
141 final String seq_name,
142 final ArrayList<String> al_ortholog_names_for_dc,
143 final HashMap<String, Integer> ortholog_hashmap,
144 final HashMap<String, Integer> super_ortholog_hashmap,
145 final int warn_more_than_one_ortho,
146 final int warn_no_orthos,
147 final double warn_one_ortho,
148 final int bootstraps,
149 final double t_orthologs_dc ) throws IOException, SDIException {
150 Phylogeny consensus_tree = null;
152 // to be a consensus tree.
153 assigned_cons_tree = null;
154 final SDIR sdiunrooted = new SDIR();
155 final ArrayList<PhylogenyNode> al_ortholog_nodes = new ArrayList<PhylogenyNode>();
160 final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
161 consensus_tree = factory.create( tree_file_for_dist_val, new PhyloXmlParser() )[ 0 ];
162 PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, consensus_tree );
163 assigned_cons_tree = sdiunrooted.infer( consensus_tree,
170 final DistanceCalculator dc = new DistanceCalculator();
171 final StringBuffer sb = new StringBuffer();
172 sb.append( "Given the threshold for distance calculations (" + ForesterUtil.roundToInt( t_orthologs_dc )
175 if ( al_ortholog_names_for_dc.size() == 0 ) {
176 dc.setTree( assigned_cons_tree );
177 // Remark. Calculation of mean and sd _does_ include the node
180 sd = dc.getStandardDeviation();
181 d = dc.getDistanceToRoot( seq_name );
183 sb.append( "No sequence is considered orthologous to query."
184 + "\ndistance of query to root = " + ForesterUtil.FORMATTER_06.format( d )
185 + "\nmean of distances (for all sequences) to root = " + ForesterUtil.FORMATTER_06.format( m )
186 + "\nsd of distances (for all sequences) to root = " + ForesterUtil.FORMATTER_06.format( sd )
187 + "\nn (sum of sequences in alignment plus query) = " + n );
188 if ( !( ( ( m - ( warn_no_orthos * sd ) ) < d ) && ( ( m + ( warn_no_orthos * sd ) ) > d ) ) ) {
189 sb.append( "\nWARNING: distance of query to root is outside of mean+/-" + warn_no_orthos + "*sd!" );
193 else if ( al_ortholog_names_for_dc.size() == 1 ) {
194 final String name_of_ortholog = al_ortholog_names_for_dc.get( 0 );
195 al_ortholog_nodes.add( assigned_cons_tree.getNode( name_of_ortholog ) );
196 al_ortholog_nodes.add( assigned_cons_tree.getNode( seq_name ) );
197 dc.setTreeAndExtNodes( assigned_cons_tree, al_ortholog_nodes );
198 // Remark. Calculation of mean _does_ include the node
200 d = dc.getDistanceToLCA( seq_name );
201 final double d_o = dc.getDistanceToLCA( name_of_ortholog );
202 sb.append( "One sequence is considered orthologous to query." + "\nLCA is LCA of query and its ortholog."
203 + "\ndistance of query to LCA = " + ForesterUtil.FORMATTER_06.format( d )
204 + "\ndistance of ortholog to LCA = " + ForesterUtil.FORMATTER_06.format( d_o ) );
207 && ( ( ( d_o >= d ) && ( ( d_o / d ) > warn_one_ortho ) ) || ( ( d_o < d ) && ( ( d / d_o ) > warn_one_ortho ) ) ) ) {
208 sb.append( "\nWARNING: Ratio of distances to LCA is greater than " + warn_one_ortho + "!" );
210 else if ( ( ( d_o == 0.0 ) || ( d == 0.0 ) ) && ( ( d_o != 0.0 ) || ( d != 0.0 ) ) ) {
211 sb.append( "\nWARNING: Ratio could not be calculated, " + " one distance is 0.0!" );
214 // More than one ortholog.
216 for( int i = 0; i < al_ortholog_names_for_dc.size(); ++i ) {
217 al_ortholog_nodes.add( assigned_cons_tree.getNodeViaSequenceName( al_ortholog_names_for_dc.get( i ) ) );
219 al_ortholog_nodes.add( assigned_cons_tree.getNodesViaSequenceName( seq_name ).get( 0 ) );
220 dc.setTreeAndExtNodes( assigned_cons_tree, al_ortholog_nodes );
221 // Remark. Calculation of mean and sd _does_ include the node
224 sd = dc.getStandardDeviation();
225 d = dc.getDistanceToLCA( seq_name );
227 sb.append( "More than one sequence is considered orthologous to query."
228 + "\nLCA is LCA of query and its orthologs."
229 + "\ndistance of query to LCA = "
230 + ForesterUtil.FORMATTER_06.format( d )
231 + "\nmean of distances (for query and its orthologs) to LCA = "
232 + ForesterUtil.FORMATTER_06.format( m )
233 + "\nsd of distances (for query and its orthologs) to LCA = "
234 + ForesterUtil.FORMATTER_06.format( sd )
235 + "\nn (sum of orthologs plus query) = " + n );
236 if ( !( ( ( m - ( warn_more_than_one_ortho * sd ) ) < d ) && ( ( m + ( warn_more_than_one_ortho * sd ) ) > d ) ) ) {
237 sb.append( "\n!WARNING: distance of query to LCA is outside of mean+/-" + warn_more_than_one_ortho
244 public static void main( final String[] args ) {
245 ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW );
246 File species_tree_file = null;
247 File multiple_trees_file = null;
249 File distance_matrix_file = null;
250 File tree_file_for_dist_val = null;
251 File tree_file_for_avg_bs = null;
252 String seq_name = "";
254 boolean output_ultraparalogs = false;
255 ArrayList<String> orthologs_al_for_dc = null;
256 double t_orthologs = 0.0;
258 double t_orthologs_dc = 0.0;
259 double[] bs_mean_sd = null;
261 Phylogeny species_tree = null;
262 RIO rio_instance = null;
263 PrintWriter out = null;
265 int warn_no_orthos = WARN_NO_ORTHOS_DEFAULT;
266 int warn_more_than_one_ortho = WARN_MORE_THAN_ONE_ORTHO_DEFAULT;
267 double warn_one_ortho = WARN_ONE_ORTHO_DEFAULT;
268 double threshold_ultra_paralogs = THRESHOLD_ULTRA_PARALOGS_DEFAULT;
269 if ( args.length < 2 ) {
273 else if ( ( args.length < 3 ) || ( args.length > 18 ) ) {
274 errorInCommandLine();
276 for( int i = 0; i < args.length; ++i ) {
277 if ( args[ i ].trim().charAt( 0 ) != 'p' ) {
278 if ( args[ i ].trim().length() < 3 ) {
279 errorInCommandLine();
282 arg = args[ i ].trim().substring( 2 );
286 switch ( args[ i ].trim().charAt( 0 ) ) {
288 multiple_trees_file = new File( arg );
294 species_tree_file = new File( arg );
297 outfile = new File( arg );
300 distance_matrix_file = new File( arg );
303 tree_file_for_dist_val = new File( arg );
306 tree_file_for_avg_bs = new File( arg );
309 output_ultraparalogs = true;
312 sort = Integer.parseInt( arg );
313 if ( ( sort < 0 ) || ( sort > 17 ) ) {
314 errorInCommandLine();
318 t_orthologs = Double.parseDouble( arg );
321 t_sn = Double.parseDouble( arg );
324 t_orthologs_dc = Double.parseDouble( arg );
327 threshold_ultra_paralogs = Double.parseDouble( arg );
330 warn_more_than_one_ortho = Integer.parseInt( arg );
333 warn_no_orthos = Integer.parseInt( arg );
336 warn_one_ortho = Double.parseDouble( arg );
339 errorInCommandLine();
342 catch ( final Exception e ) {
343 errorInCommandLine();
346 if ( ( seq_name == "" ) || ( species_tree_file == null ) || ( multiple_trees_file == null )
347 || ( outfile == null ) ) {
348 errorInCommandLine();
350 if ( ( sort < 0 ) || ( sort > 17 ) ) {
351 errorInCommandLine();
353 if ( ( sort > 2 ) && ( distance_matrix_file == null ) ) {
354 errorInCommandLine();
357 System.out.println( "\nMultiple trees file: " + multiple_trees_file );
358 System.out.println( "Seq name: " + seq_name );
359 System.out.println( "Species tree file: " + species_tree_file );
360 System.out.println( "Outfile: " + outfile );
361 if ( distance_matrix_file != null ) {
362 System.out.println( "Distance matrix file: " + distance_matrix_file );
364 if ( tree_file_for_dist_val != null ) {
365 if ( tree_file_for_avg_bs == null ) {
366 System.out.println( "Phy to read dists and calc mean support from: " + tree_file_for_dist_val );
369 System.out.println( "Phylogeny to read dist values from: " + tree_file_for_dist_val );
372 if ( tree_file_for_avg_bs != null ) {
373 System.out.println( "Phylogeny to calc mean bootstrap from: " + tree_file_for_avg_bs );
375 System.out.println( "Sort: " + sort );
376 System.out.println( "Threshold orthologs: " + t_orthologs );
377 System.out.println( "Threshold subtree neighborings: " + t_sn );
378 System.out.println( "Threshold orthologs for distance calc.: " + t_orthologs_dc );
379 if ( output_ultraparalogs ) {
380 System.out.println( "Threshold ultra paralogs: " + threshold_ultra_paralogs );
382 System.out.println( "More than one ortholog sd diff: " + warn_more_than_one_ortho );
383 System.out.println( "No orthologs sd diff: " + warn_no_orthos );
384 System.out.println( "One ortholog factor : " + warn_one_ortho + "\n" );
386 if ( TIME && VERBOSE ) {
387 time = System.currentTimeMillis();
390 final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
391 species_tree = factory.create( species_tree_file, new PhyloXmlParser() )[ 0 ];
393 catch ( final Exception e ) {
397 if ( !species_tree.isRooted() ) {
398 ForesterUtil.printErrorMessage( PRG_NAME, "Species tree is not rooted" );
401 if ( !species_tree.isCompletelyBinary() ) {
402 ForesterUtil.printErrorMessage( PRG_NAME, "Species tree is not completely binary" );
405 rio_instance = new RIO();
406 final StringBuffer output = new StringBuffer();
408 if ( distance_matrix_file != null ) {
409 rio_instance.readDistanceMatrix( distance_matrix_file );
411 rio_instance.inferOrthologs( multiple_trees_file, species_tree.copy(), seq_name );
412 output.append( rio_instance.inferredOrthologsToString( seq_name, sort, t_orthologs, t_sn ) );
413 if ( tree_file_for_dist_val != null ) {
414 orthologs_al_for_dc = rio_instance.inferredOrthologsToArrayList( seq_name, t_orthologs_dc );
415 final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
416 if ( tree_file_for_avg_bs != null ) {
417 final Phylogeny p = factory.create( tree_file_for_avg_bs, new PhyloXmlParser() )[ 0 ];
418 bs_mean_sd = calculateMeanBoostrapValue( p );
421 final Phylogeny p = factory.create( tree_file_for_dist_val, new PhyloXmlParser() )[ 0 ];
422 bs_mean_sd = calculateMeanBoostrapValue( p );
424 if ( ( bs_mean_sd != null ) && ( bs_mean_sd.length == 2 ) ) {
425 final double bs_mean = bs_mean_sd[ 0 ];
426 final double bs_sd = bs_mean_sd[ 1 ];
427 output.append( "\n\nMean bootstrap value of consensus tree (sd): "
428 + ForesterUtil.roundToInt( ( bs_mean * 100.0 ) / rio_instance.getBootstraps() ) + "% (+/-"
429 + ForesterUtil.roundToInt( ( bs_sd * 100.0 ) / rio_instance.getBootstraps() ) + "%)\n" );
431 output.append( "\n\nDistance values:\n" );
432 output.append( getDistances( tree_file_for_dist_val,
437 rio_instance.getInferredOrthologs( seq_name ),
438 rio_instance.getInferredSuperOrthologs( seq_name ),
439 warn_more_than_one_ortho,
442 rio_instance.getBootstraps(),
445 if ( output_ultraparalogs ) {
446 output.append( "\n\nUltra paralogs:\n" );
447 output.append( rio_instance
448 .inferredUltraParalogsToString( seq_name, sort > 2, threshold_ultra_paralogs ) );
450 output.append( "\n\nSort priority: " + RIO.getOrder( sort ) );
451 output.append( "\nExt nodes : " + rio_instance.getExtNodesOfAnalyzedGeneTrees() );
452 output.append( "\nSamples : " + rio_instance.getBootstraps() + "\n" );
453 out = new PrintWriter( new FileWriter( outfile ), true );
455 catch ( final Exception e ) {
456 ForesterUtil.printErrorMessage( PRG_NAME, e.getLocalizedMessage() );
460 out.println( output );
462 ForesterUtil.programMessage( PRG_NAME, "wrote results to \"" + outfile + "\"" );
463 if ( TIME && VERBOSE ) {
464 time = System.currentTimeMillis() - time;
465 ForesterUtil.programMessage( PRG_NAME, "time: " + time + "ms" );
467 ForesterUtil.programMessage( PRG_NAME, "OK." );
471 private final static void printHelp() {
472 System.out.println( "M= (String) Multiple gene tree file (mandatory)" );
473 System.out.println( "N= (String) Query sequence name (mandatory)" );
474 System.out.println( "S= (String) Species tree file (mandatory)" );
475 System.out.println( "O= (String) Output file name -- overwritten without warning! (mandatory)" );
476 System.out.println( "D= (String) Distance matrix file for pairwise distances" );
477 System.out.println( "T= (String) Phylogeny file for distances of query to LCA" );
478 System.out.println( " of orthologs and for mean bootstrap value (if t= is not used)," );
479 System.out.println( " must be binary )" );
480 System.out.println( "t= (String) Phylogeny file for mean bootstrap value (if this option is used," );
481 System.out.println( " the mean bootstrap value is not calculated from the tree read in" );
482 System.out.println( " with T=), not necessary binary" );
483 System.out.println( "p To output ultra paralogs" );
484 System.out.println( "P= (int) Sort priority" );
485 System.out.println( "L= (double) Threshold orthologs for output" );
486 System.out.println( "U= (double) Threshold orthologs for distance calculation" );
487 System.out.println( "X= (int) More than one ortholog: " );
488 System.out.println( " numbers of sd the dist. to LCA has to differ from mean to generate a warning" );
489 System.out.println( "Y= (int) No orthologs:" );
490 System.out.println( " Numbers of sd the dist to root has to differ from mean to generate a warning" );
491 System.out.println( "Z= (double) One ortholog:" );
492 System.out.println( " threshold for factor between the two distances to their LCA (larger/smaller)" );
493 System.out.println( " to generate a warning" );
494 System.out.println();
495 System.out.println( " Sort priority (\"P=\"):" );
496 System.out.println( RIO.getOrderHelp().toString() );
497 System.out.println();
499 .println( " Example: \"rio M=gene_trees.xml N=bcl2_NEMVE S=species_tree.xml D=distances P=13 p O=out\"" );
500 System.out.println();