2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // Copyright (C) 2000-2001 Washington University School of Medicine
8 // and Howard Hughes Medical Institute
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
25 // Contact: phylosoft @ gmail . com
26 // WWW: www.phylosoft.org/forester
28 package org.forester.application;
31 import java.io.FileWriter;
32 import java.io.IOException;
33 import java.io.PrintWriter;
34 import java.util.ArrayList;
35 import java.util.HashMap;
36 import java.util.Vector;
38 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
39 import org.forester.phylogeny.Phylogeny;
40 import org.forester.phylogeny.PhylogenyMethods;
41 import org.forester.phylogeny.PhylogenyNode;
42 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
43 import org.forester.phylogeny.factories.PhylogenyFactory;
44 import org.forester.phylogeny.iterators.PreorderTreeIterator;
45 import org.forester.sdi.DistanceCalculator;
46 import org.forester.sdi.RIO;
47 import org.forester.sdi.SDIR;
48 import org.forester.util.ForesterUtil;
52 final static private String PRG_NAME = "RIO";
53 final static private String PRG_VERSION = "2.03 ALPHA";
54 final static private String PRG_DATE = "2010.01.15";
55 final static private String E_MAIL = "czmasek@burnham.org";
56 final static private String WWW = "www.phylosoft.org/forester/";
57 final static private boolean TIME = true;
58 final static private boolean VERBOSE = true;
59 // For method getDistances -- calculation of distances.
60 final static private boolean MINIMIZE_COST = false;
61 // For method getDistances -- calculation of distances.
62 final static private boolean MINIMIZE_DUPS = true;
63 // For method getDistances -- calculation of distances.
64 final static private boolean MINIMIZE_HEIGHT = true;
65 final static private int WARN_NO_ORTHOS_DEFAULT = 2;
66 final static private int
67 // How many sd away from mean to root.
68 WARN_MORE_THAN_ONE_ORTHO_DEFAULT = 2;
69 // How many sd away from mean to LCA of orthos.
70 final static private double THRESHOLD_ULTRA_PARALOGS_DEFAULT = 50;
71 // How many sd away from mean to LCA of orthos.
72 final static private double WARN_ONE_ORTHO_DEFAULT = 2;
74 // Factor between the two distances to their LCA
76 // Factor between the two distances to their LCA
79 * Calculates the mean and standard deviation of all nodes of Phylogeny t
80 * which have a bootstrap values zero or more. Returns null in case of
81 * failure (e.g t has no bootstrap values, or just one).
85 * reference to a tree with bootstrap values
86 * @return Array of doubles, [0] is the mean, [1] the standard deviation
88 private static double[] calculateMeanBoostrapValue( final Phylogeny t ) {
92 double x = 0.0, mean = 0.0;
93 final double[] da = new double[ 2 ];
94 final Vector<Double> bv = new Vector<Double>();
95 PhylogenyNode node = null;
96 PreorderTreeIterator i = null;
97 i = new PreorderTreeIterator( t );
98 // Calculates the mean.
99 while ( i.hasNext() ) {
101 if ( !( ( node.getParent() != null ) && node.getParent().isRoot()
102 && ( PhylogenyMethods.getConfidenceValue( node.getParent().getChildNode1() ) > 0 )
103 && ( PhylogenyMethods.getConfidenceValue( node.getParent().getChildNode2() ) > 0 ) && ( node
104 .getParent().getChildNode2() == node ) ) ) {
105 b = PhylogenyMethods.getConfidenceValue( node );
108 bv.addElement( new Double( b ) );
117 mean = ( double ) sum / n;
118 // Calculates the standard deviation.
120 for( int j = 0; j < n; ++j ) {
121 b = ( bv.elementAt( j ) ).intValue();
126 da[ 1 ] = java.lang.Math.sqrt( sum / ( n - 1.0 ) );
130 private final static void errorInCommandLine() {
131 System.out.println( "\nrio: Error in command line.\n" );
136 // Uses DistanceCalculator to calculate distances.
137 private final static StringBuffer getDistances( final File tree_file_for_dist_val,
139 final Phylogeny species_tree,
140 final String seq_name,
141 final ArrayList<String> al_ortholog_names_for_dc,
142 final HashMap<String, Integer> ortholog_hashmap,
143 final HashMap<String, Integer> super_ortholog_hashmap,
144 final int warn_more_than_one_ortho,
145 final int warn_no_orthos,
146 final double warn_one_ortho,
147 final int bootstraps,
148 final double t_orthologs_dc ) throws IOException {
149 Phylogeny consensus_tree = null;
151 // to be a consensus tree.
152 assigned_cons_tree = null;
153 final SDIR sdiunrooted = new SDIR();
154 final ArrayList<PhylogenyNode> al_ortholog_nodes = new ArrayList<PhylogenyNode>();
159 final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
160 consensus_tree = factory.create( tree_file_for_dist_val, new PhyloXmlParser() )[ 0 ];
161 PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, consensus_tree );
162 assigned_cons_tree = sdiunrooted.infer( consensus_tree,
169 final DistanceCalculator dc = new DistanceCalculator();
170 final StringBuffer sb = new StringBuffer();
171 sb.append( "Given the threshold for distance calculations (" + ForesterUtil.roundToInt( t_orthologs_dc )
174 if ( al_ortholog_names_for_dc.size() == 0 ) {
175 dc.setTree( assigned_cons_tree );
176 // Remark. Calculation of mean and sd _does_ include the node
179 sd = dc.getStandardDeviation();
180 d = dc.getDistanceToRoot( seq_name );
182 sb.append( "No sequence is considered orthologous to query."
183 + "\ndistance of query to root = " + ForesterUtil.FORMATTER_06.format( d )
184 + "\nmean of distances (for all sequences) to root = " + ForesterUtil.FORMATTER_06.format( m )
185 + "\nsd of distances (for all sequences) to root = " + ForesterUtil.FORMATTER_06.format( sd )
186 + "\nn (sum of sequences in alignment plus query) = " + n );
187 if ( !( ( ( m - ( warn_no_orthos * sd ) ) < d ) && ( ( m + ( warn_no_orthos * sd ) ) > d ) ) ) {
188 sb.append( "\nWARNING: distance of query to root is outside of mean+/-" + warn_no_orthos + "*sd!" );
192 else if ( al_ortholog_names_for_dc.size() == 1 ) {
193 final String name_of_ortholog = al_ortholog_names_for_dc.get( 0 );
194 al_ortholog_nodes.add( assigned_cons_tree.getNode( name_of_ortholog ) );
195 al_ortholog_nodes.add( assigned_cons_tree.getNode( seq_name ) );
196 dc.setTreeAndExtNodes( assigned_cons_tree, al_ortholog_nodes );
197 // Remark. Calculation of mean _does_ include the node
199 d = dc.getDistanceToLCA( seq_name );
200 final double d_o = dc.getDistanceToLCA( name_of_ortholog );
201 sb.append( "One sequence is considered orthologous to query." + "\nLCA is LCA of query and its ortholog."
202 + "\ndistance of query to LCA = " + ForesterUtil.FORMATTER_06.format( d )
203 + "\ndistance of ortholog to LCA = " + ForesterUtil.FORMATTER_06.format( d_o ) );
206 && ( ( ( d_o >= d ) && ( ( d_o / d ) > warn_one_ortho ) ) || ( ( d_o < d ) && ( ( d / d_o ) > warn_one_ortho ) ) ) ) {
207 sb.append( "\nWARNING: Ratio of distances to LCA is greater than " + warn_one_ortho + "!" );
209 else if ( ( ( d_o == 0.0 ) || ( d == 0.0 ) ) && ( ( d_o != 0.0 ) || ( d != 0.0 ) ) ) {
210 sb.append( "\nWARNING: Ratio could not be calculated, " + " one distance is 0.0!" );
213 // More than one ortholog.
215 for( int i = 0; i < al_ortholog_names_for_dc.size(); ++i ) {
216 al_ortholog_nodes.add( assigned_cons_tree.getNodeViaSequenceName( al_ortholog_names_for_dc.get( i ) ) );
218 al_ortholog_nodes.add( assigned_cons_tree.getNodesViaSequenceName( seq_name ).get( 0 ) );
219 dc.setTreeAndExtNodes( assigned_cons_tree, al_ortholog_nodes );
220 // Remark. Calculation of mean and sd _does_ include the node
223 sd = dc.getStandardDeviation();
224 d = dc.getDistanceToLCA( seq_name );
226 sb.append( "More than one sequence is considered orthologous to query."
227 + "\nLCA is LCA of query and its orthologs."
228 + "\ndistance of query to LCA = "
229 + ForesterUtil.FORMATTER_06.format( d )
230 + "\nmean of distances (for query and its orthologs) to LCA = "
231 + ForesterUtil.FORMATTER_06.format( m )
232 + "\nsd of distances (for query and its orthologs) to LCA = "
233 + ForesterUtil.FORMATTER_06.format( sd )
234 + "\nn (sum of orthologs plus query) = " + n );
235 if ( !( ( ( m - ( warn_more_than_one_ortho * sd ) ) < d ) && ( ( m + ( warn_more_than_one_ortho * sd ) ) > d ) ) ) {
236 sb.append( "\n!WARNING: distance of query to LCA is outside of mean+/-" + warn_more_than_one_ortho
243 public static void main( final String[] args ) {
244 ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW );
245 File species_tree_file = null;
246 File multiple_trees_file = null;
248 File distance_matrix_file = null;
249 File tree_file_for_dist_val = null;
250 File tree_file_for_avg_bs = null;
251 String seq_name = "";
253 boolean output_ultraparalogs = false;
254 ArrayList<String> orthologs_al_for_dc = null;
255 double t_orthologs = 0.0;
257 double t_orthologs_dc = 0.0;
258 double[] bs_mean_sd = null;
260 Phylogeny species_tree = null;
261 RIO rio_instance = null;
262 PrintWriter out = null;
264 int warn_no_orthos = WARN_NO_ORTHOS_DEFAULT;
265 int warn_more_than_one_ortho = WARN_MORE_THAN_ONE_ORTHO_DEFAULT;
266 double warn_one_ortho = WARN_ONE_ORTHO_DEFAULT;
267 double threshold_ultra_paralogs = THRESHOLD_ULTRA_PARALOGS_DEFAULT;
268 if ( args.length < 2 ) {
272 else if ( ( args.length < 3 ) || ( args.length > 18 ) ) {
273 errorInCommandLine();
275 for( int i = 0; i < args.length; ++i ) {
276 if ( args[ i ].trim().charAt( 0 ) != 'p' ) {
277 if ( args[ i ].trim().length() < 3 ) {
278 errorInCommandLine();
281 arg = args[ i ].trim().substring( 2 );
285 switch ( args[ i ].trim().charAt( 0 ) ) {
287 multiple_trees_file = new File( arg );
293 species_tree_file = new File( arg );
296 outfile = new File( arg );
299 distance_matrix_file = new File( arg );
302 tree_file_for_dist_val = new File( arg );
305 tree_file_for_avg_bs = new File( arg );
308 output_ultraparalogs = true;
311 sort = Integer.parseInt( arg );
312 if ( ( sort < 0 ) || ( sort > 17 ) ) {
313 errorInCommandLine();
317 t_orthologs = Double.parseDouble( arg );
320 t_sn = Double.parseDouble( arg );
323 t_orthologs_dc = Double.parseDouble( arg );
326 threshold_ultra_paralogs = Double.parseDouble( arg );
329 warn_more_than_one_ortho = Integer.parseInt( arg );
332 warn_no_orthos = Integer.parseInt( arg );
335 warn_one_ortho = Double.parseDouble( arg );
338 errorInCommandLine();
341 catch ( final Exception e ) {
342 errorInCommandLine();
345 if ( ( seq_name == "" ) || ( species_tree_file == null ) || ( multiple_trees_file == null )
346 || ( outfile == null ) ) {
347 errorInCommandLine();
349 if ( ( sort < 0 ) || ( sort > 17 ) ) {
350 errorInCommandLine();
352 if ( ( sort > 2 ) && ( distance_matrix_file == null ) ) {
353 errorInCommandLine();
356 System.out.println( "\nMultiple trees file: " + multiple_trees_file );
357 System.out.println( "Seq name: " + seq_name );
358 System.out.println( "Species tree file: " + species_tree_file );
359 System.out.println( "Outfile: " + outfile );
360 if ( distance_matrix_file != null ) {
361 System.out.println( "Distance matrix file: " + distance_matrix_file );
363 if ( tree_file_for_dist_val != null ) {
364 if ( tree_file_for_avg_bs == null ) {
365 System.out.println( "Phy to read dists and calc mean support from: " + tree_file_for_dist_val );
368 System.out.println( "Phylogeny to read dist values from: " + tree_file_for_dist_val );
371 if ( tree_file_for_avg_bs != null ) {
372 System.out.println( "Phylogeny to calc mean bootstrap from: " + tree_file_for_avg_bs );
374 System.out.println( "Sort: " + sort );
375 System.out.println( "Threshold orthologs: " + t_orthologs );
376 System.out.println( "Threshold subtree neighborings: " + t_sn );
377 System.out.println( "Threshold orthologs for distance calc.: " + t_orthologs_dc );
378 if ( output_ultraparalogs ) {
379 System.out.println( "Threshold ultra paralogs: " + threshold_ultra_paralogs );
381 System.out.println( "More than one ortholog sd diff: " + warn_more_than_one_ortho );
382 System.out.println( "No orthologs sd diff: " + warn_no_orthos );
383 System.out.println( "One ortholog factor : " + warn_one_ortho + "\n" );
385 if ( TIME && VERBOSE ) {
386 time = System.currentTimeMillis();
389 final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
390 species_tree = factory.create( species_tree_file, new PhyloXmlParser() )[ 0 ];
392 catch ( final Exception e ) {
396 if ( !species_tree.isRooted() ) {
397 ForesterUtil.printErrorMessage( PRG_NAME, "Species tree is not rooted" );
400 if ( !species_tree.isCompletelyBinary() ) {
401 ForesterUtil.printErrorMessage( PRG_NAME, "Species tree is not completely binary" );
404 rio_instance = new RIO();
405 final StringBuffer output = new StringBuffer();
407 if ( distance_matrix_file != null ) {
408 rio_instance.readDistanceMatrix( distance_matrix_file );
410 rio_instance.inferOrthologs( multiple_trees_file, species_tree.copy(), seq_name );
411 output.append( rio_instance.inferredOrthologsToString( seq_name, sort, t_orthologs, t_sn ) );
412 if ( tree_file_for_dist_val != null ) {
413 orthologs_al_for_dc = rio_instance.inferredOrthologsToArrayList( seq_name, t_orthologs_dc );
414 final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
415 if ( tree_file_for_avg_bs != null ) {
416 final Phylogeny p = factory.create( tree_file_for_avg_bs, new PhyloXmlParser() )[ 0 ];
417 bs_mean_sd = calculateMeanBoostrapValue( p );
420 final Phylogeny p = factory.create( tree_file_for_dist_val, new PhyloXmlParser() )[ 0 ];
421 bs_mean_sd = calculateMeanBoostrapValue( p );
423 if ( ( bs_mean_sd != null ) && ( bs_mean_sd.length == 2 ) ) {
424 final double bs_mean = bs_mean_sd[ 0 ];
425 final double bs_sd = bs_mean_sd[ 1 ];
426 output.append( "\n\nMean bootstrap value of consensus tree (sd): "
427 + ForesterUtil.roundToInt( ( bs_mean * 100.0 ) / rio_instance.getBootstraps() ) + "% (+/-"
428 + ForesterUtil.roundToInt( ( bs_sd * 100.0 ) / rio_instance.getBootstraps() ) + "%)\n" );
430 output.append( "\n\nDistance values:\n" );
431 output.append( getDistances( tree_file_for_dist_val,
436 rio_instance.getInferredOrthologs( seq_name ),
437 rio_instance.getInferredSuperOrthologs( seq_name ),
438 warn_more_than_one_ortho,
441 rio_instance.getBootstraps(),
444 if ( output_ultraparalogs ) {
445 output.append( "\n\nUltra paralogs:\n" );
446 output.append( rio_instance
447 .inferredUltraParalogsToString( seq_name, sort > 2, threshold_ultra_paralogs ) );
449 output.append( "\n\nSort priority: " + RIO.getOrder( sort ) );
450 output.append( "\nExt nodes : " + rio_instance.getExtNodesOfAnalyzedGeneTrees() );
451 output.append( "\nSamples : " + rio_instance.getBootstraps() + "\n" );
452 out = new PrintWriter( new FileWriter( outfile ), true );
454 catch ( final Exception e ) {
455 ForesterUtil.printErrorMessage( PRG_NAME, e.getLocalizedMessage() );
459 out.println( output );
461 ForesterUtil.programMessage( PRG_NAME, "wrote results to \"" + outfile + "\"" );
462 if ( TIME && VERBOSE ) {
463 time = System.currentTimeMillis() - time;
464 ForesterUtil.programMessage( PRG_NAME, "time: " + time + "ms" );
466 ForesterUtil.programMessage( PRG_NAME, "OK." );
470 private final static void printHelp() {
471 System.out.println( "M= (String) Multiple gene tree file (mandatory)" );
472 System.out.println( "N= (String) Query sequence name (mandatory)" );
473 System.out.println( "S= (String) Species tree file (mandatory)" );
474 System.out.println( "O= (String) Output file name -- overwritten without warning! (mandatory)" );
475 System.out.println( "D= (String) Distance matrix file for pairwise distances" );
476 System.out.println( "T= (String) Phylogeny file for distances of query to LCA" );
477 System.out.println( " of orthologs and for mean bootstrap value (if t= is not used)," );
478 System.out.println( " must be binary )" );
479 System.out.println( "t= (String) Phylogeny file for mean bootstrap value (if this option is used," );
480 System.out.println( " the mean bootstrap value is not calculated from the tree read in" );
481 System.out.println( " with T=), not necessary binary" );
482 System.out.println( "p To output ultra paralogs" );
483 System.out.println( "P= (int) Sort priority" );
484 System.out.println( "L= (double) Threshold orthologs for output" );
485 System.out.println( "U= (double) Threshold orthologs for distance calculation" );
486 System.out.println( "X= (int) More than one ortholog: " );
487 System.out.println( " numbers of sd the dist. to LCA has to differ from mean to generate a warning" );
488 System.out.println( "Y= (int) No orthologs:" );
489 System.out.println( " Numbers of sd the dist to root has to differ from mean to generate a warning" );
490 System.out.println( "Z= (double) One ortholog:" );
491 System.out.println( " threshold for factor between the two distances to their LCA (larger/smaller)" );
492 System.out.println( " to generate a warning" );
493 System.out.println();
494 System.out.println( " Sort priority (\"P=\"):" );
495 System.out.println( RIO.getOrderHelp().toString() );
496 System.out.println();
498 .println( " Example: \"rio M=gene_trees.xml N=bcl2_NEMVE S=species_tree.xml D=distances P=13 p O=out\"" );
499 System.out.println();