in progress
[jalview.git] / forester / java / src / org / forester / application / rio.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // Copyright (C) 2000-2001 Washington University School of Medicine
8 // and Howard Hughes Medical Institute
9 // All rights reserved
10 //
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
15 //
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
20 //
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 //
25 // Contact: phylosoft @ gmail . com
26 // WWW: www.phylosoft.org/forester
27
28 package org.forester.application;
29
30 import java.io.File;
31 import java.io.FileWriter;
32 import java.io.IOException;
33 import java.io.PrintWriter;
34 import java.util.ArrayList;
35 import java.util.HashMap;
36 import java.util.Vector;
37
38 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
39 import org.forester.phylogeny.Phylogeny;
40 import org.forester.phylogeny.PhylogenyMethods;
41 import org.forester.phylogeny.PhylogenyNode;
42 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
43 import org.forester.phylogeny.factories.PhylogenyFactory;
44 import org.forester.phylogeny.iterators.PreorderTreeIterator;
45 import org.forester.sdi.DistanceCalculator;
46 import org.forester.sdi.RIO;
47 import org.forester.sdi.SDIR;
48 import org.forester.util.ForesterUtil;
49
50 public class rio {
51
52     final static private String  PRG_NAME                         = "RIO";
53     final static private String  PRG_VERSION                      = "2.03 ALPHA";
54     final static private String  PRG_DATE                         = "2010.01.15";
55     final static private String  E_MAIL                           = "czmasek@burnham.org";
56     final static private String  WWW                              = "www.phylosoft.org/forester/";
57     final static private boolean TIME                             = true;
58     final static private boolean VERBOSE                          = true;
59     // For method getDistances -- calculation of distances.
60     final static private boolean MINIMIZE_COST                    = false;
61     // For method getDistances -- calculation of distances.
62     final static private boolean MINIMIZE_DUPS                    = true;
63     // For method getDistances -- calculation of distances.
64     final static private boolean MINIMIZE_HEIGHT                  = true;
65     final static private int     WARN_NO_ORTHOS_DEFAULT           = 2;
66     final static private int
67                                  // How many sd away from mean to root.
68                                  WARN_MORE_THAN_ONE_ORTHO_DEFAULT = 2;
69     // How many sd away from mean to LCA of orthos.
70     final static private double  THRESHOLD_ULTRA_PARALOGS_DEFAULT = 50;
71     // How many sd away from mean to LCA of orthos.
72     final static private double  WARN_ONE_ORTHO_DEFAULT           = 2;
73
74     // Factor between the two distances to their LCA
75     // (larger/smaller).
76     // Factor between the two distances to their LCA
77     // (larger/smaller).
78     /**
79      * Calculates the mean and standard deviation of all nodes of Phylogeny t
80      * which have a bootstrap values zero or more. Returns null in case of
81      * failure (e.g t has no bootstrap values, or just one).
82      * <p>
83      * 
84      * @param t
85      *            reference to a tree with bootstrap values
86      * @return Array of doubles, [0] is the mean, [1] the standard deviation
87      */
88     private static double[] calculateMeanBoostrapValue( final Phylogeny t ) {
89         double b = 0;
90         int n = 0;
91         long sum = 0;
92         double x = 0.0, mean = 0.0;
93         final double[] da = new double[ 2 ];
94         final Vector<Double> bv = new Vector<Double>();
95         PhylogenyNode node = null;
96         PreorderTreeIterator i = null;
97         i = new PreorderTreeIterator( t );
98         // Calculates the mean.
99         while ( i.hasNext() ) {
100             node = i.next();
101             if ( !( ( node.getParent() != null ) && node.getParent().isRoot()
102                     && ( PhylogenyMethods.getConfidenceValue( node.getParent().getChildNode1() ) > 0 )
103                     && ( PhylogenyMethods.getConfidenceValue( node.getParent().getChildNode2() ) > 0 ) && ( node
104                     .getParent().getChildNode2() == node ) ) ) {
105                 b = PhylogenyMethods.getConfidenceValue( node );
106                 if ( b > 0 ) {
107                     sum += b;
108                     bv.addElement( new Double( b ) );
109                     n++;
110                 }
111             }
112             // i.next();
113         }
114         if ( n < 2 ) {
115             return null;
116         }
117         mean = ( double ) sum / n;
118         // Calculates the standard deviation.
119         sum = 0;
120         for( int j = 0; j < n; ++j ) {
121             b = ( bv.elementAt( j ) ).intValue();
122             x = b - mean;
123             sum += ( x * x );
124         }
125         da[ 0 ] = mean;
126         da[ 1 ] = java.lang.Math.sqrt( sum / ( n - 1.0 ) );
127         return da;
128     }
129
130     private final static void errorInCommandLine() {
131         System.out.println( "\nrio: Error in command line.\n" );
132         printHelp();
133         System.exit( -1 );
134     }
135
136     // Uses DistanceCalculator to calculate distances.
137     private final static StringBuffer getDistances( final File tree_file_for_dist_val,
138                                                     final File outfile,
139                                                     final Phylogeny species_tree,
140                                                     final String seq_name,
141                                                     final ArrayList<String> al_ortholog_names_for_dc,
142                                                     final HashMap<String, Integer> ortholog_hashmap,
143                                                     final HashMap<String, Integer> super_ortholog_hashmap,
144                                                     final int warn_more_than_one_ortho,
145                                                     final int warn_no_orthos,
146                                                     final double warn_one_ortho,
147                                                     final int bootstraps,
148                                                     final double t_orthologs_dc ) throws IOException {
149         Phylogeny consensus_tree = null;
150         Phylogeny
151         // to be a consensus tree.
152         assigned_cons_tree = null;
153         final SDIR sdiunrooted = new SDIR();
154         final ArrayList<PhylogenyNode> al_ortholog_nodes = new ArrayList<PhylogenyNode>();
155         double m = 0.0;
156         double sd = 0.0;
157         double d = 0.0;
158         int n = 0;
159         final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
160         consensus_tree = factory.create( tree_file_for_dist_val, new PhyloXmlParser() )[ 0 ];
161         PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, consensus_tree );
162         assigned_cons_tree = sdiunrooted.infer( consensus_tree,
163                                                 species_tree,
164                                                 rio.MINIMIZE_COST,
165                                                 rio.MINIMIZE_DUPS,
166                                                 rio.MINIMIZE_HEIGHT,
167                                                 true,
168                                                 1 )[ 0 ];
169         final DistanceCalculator dc = new DistanceCalculator();
170         final StringBuffer sb = new StringBuffer();
171         sb.append( "Given the threshold for distance calculations (" + ForesterUtil.roundToInt( t_orthologs_dc )
172                 + "): " );
173         // No orthologs.
174         if ( al_ortholog_names_for_dc.size() == 0 ) {
175             dc.setTree( assigned_cons_tree );
176             // Remark. Calculation of mean and sd _does_ include the node
177             // with seq_name.
178             m = dc.getMean();
179             sd = dc.getStandardDeviation();
180             d = dc.getDistanceToRoot( seq_name );
181             n = dc.getN();
182             sb.append( "No sequence is considered orthologous to query."
183                     + "\ndistance of query to root                     = " + ForesterUtil.FORMATTER_06.format( d )
184                     + "\nmean of distances (for all sequences) to root = " + ForesterUtil.FORMATTER_06.format( m )
185                     + "\nsd of distances (for all sequences) to root   = " + ForesterUtil.FORMATTER_06.format( sd )
186                     + "\nn (sum of sequences in alignment plus query)  = " + n );
187             if ( !( ( ( m - ( warn_no_orthos * sd ) ) < d ) && ( ( m + ( warn_no_orthos * sd ) ) > d ) ) ) {
188                 sb.append( "\nWARNING: distance of query to root is outside of mean+/-" + warn_no_orthos + "*sd!" );
189             }
190         }
191         // One ortholog.
192         else if ( al_ortholog_names_for_dc.size() == 1 ) {
193             final String name_of_ortholog = al_ortholog_names_for_dc.get( 0 );
194             al_ortholog_nodes.add( assigned_cons_tree.getNode( name_of_ortholog ) );
195             al_ortholog_nodes.add( assigned_cons_tree.getNode( seq_name ) );
196             dc.setTreeAndExtNodes( assigned_cons_tree, al_ortholog_nodes );
197             // Remark. Calculation of mean _does_ include the node
198             // with seq_name.
199             d = dc.getDistanceToLCA( seq_name );
200             final double d_o = dc.getDistanceToLCA( name_of_ortholog );
201             sb.append( "One sequence is considered orthologous to query." + "\nLCA is LCA of query and its ortholog."
202                     + "\ndistance of query to LCA    = " + ForesterUtil.FORMATTER_06.format( d )
203                     + "\ndistance of ortholog to LCA = " + ForesterUtil.FORMATTER_06.format( d_o ) );
204             if ( ( d_o > 0.0 )
205                     && ( d > 0.0 )
206                     && ( ( ( d_o >= d ) && ( ( d_o / d ) > warn_one_ortho ) ) || ( ( d_o < d ) && ( ( d / d_o ) > warn_one_ortho ) ) ) ) {
207                 sb.append( "\nWARNING: Ratio of distances to LCA is greater than " + warn_one_ortho + "!" );
208             }
209             else if ( ( ( d_o == 0.0 ) || ( d == 0.0 ) ) && ( ( d_o != 0.0 ) || ( d != 0.0 ) ) ) {
210                 sb.append( "\nWARNING: Ratio could not be calculated, " + " one distance is 0.0!" );
211             }
212         }
213         // More than one ortholog.
214         else {
215             for( int i = 0; i < al_ortholog_names_for_dc.size(); ++i ) {
216                 al_ortholog_nodes.add( assigned_cons_tree.getNodeViaSequenceName( al_ortholog_names_for_dc.get( i ) ) );
217             }
218             al_ortholog_nodes.add( assigned_cons_tree.getNodesViaSequenceName( seq_name ).get( 0 ) );
219             dc.setTreeAndExtNodes( assigned_cons_tree, al_ortholog_nodes );
220             // Remark. Calculation of mean and sd _does_ include the node
221             // with seq_name.
222             m = dc.getMean();
223             sd = dc.getStandardDeviation();
224             d = dc.getDistanceToLCA( seq_name );
225             n = dc.getN();
226             sb.append( "More than one sequence is considered orthologous to query."
227                     + "\nLCA is LCA of query and its orthologs."
228                     + "\ndistance of query to LCA                               = "
229                     + ForesterUtil.FORMATTER_06.format( d )
230                     + "\nmean of distances (for query and its orthologs) to LCA = "
231                     + ForesterUtil.FORMATTER_06.format( m )
232                     + "\nsd of distances (for query and its orthologs) to LCA   = "
233                     + ForesterUtil.FORMATTER_06.format( sd )
234                     + "\nn (sum of orthologs plus query)                        = " + n );
235             if ( !( ( ( m - ( warn_more_than_one_ortho * sd ) ) < d ) && ( ( m + ( warn_more_than_one_ortho * sd ) ) > d ) ) ) {
236                 sb.append( "\n!WARNING: distance of query to LCA is outside of mean+/-" + warn_more_than_one_ortho
237                         + "*sd!" );
238             }
239         }
240         return sb;
241     }
242
243     public static void main( final String[] args ) {
244         ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW );
245         File species_tree_file = null;
246         File multiple_trees_file = null;
247         File outfile = null;
248         File distance_matrix_file = null;
249         File tree_file_for_dist_val = null;
250         File tree_file_for_avg_bs = null;
251         String seq_name = "";
252         String arg = "";
253         boolean output_ultraparalogs = false;
254         ArrayList<String> orthologs_al_for_dc = null;
255         double t_orthologs = 0.0;
256         double t_sn = 0.0;
257         double t_orthologs_dc = 0.0;
258         double[] bs_mean_sd = null;
259         int sort = 13;
260         Phylogeny species_tree = null;
261         RIO rio_instance = null;
262         PrintWriter out = null;
263         long time = 0;
264         int warn_no_orthos = WARN_NO_ORTHOS_DEFAULT;
265         int warn_more_than_one_ortho = WARN_MORE_THAN_ONE_ORTHO_DEFAULT;
266         double warn_one_ortho = WARN_ONE_ORTHO_DEFAULT;
267         double threshold_ultra_paralogs = THRESHOLD_ULTRA_PARALOGS_DEFAULT;
268         if ( args.length < 2 ) {
269             printHelp();
270             System.exit( 0 );
271         }
272         else if ( ( args.length < 3 ) || ( args.length > 18 ) ) {
273             errorInCommandLine();
274         }
275         for( int i = 0; i < args.length; ++i ) {
276             if ( args[ i ].trim().charAt( 0 ) != 'p' ) {
277                 if ( args[ i ].trim().length() < 3 ) {
278                     errorInCommandLine();
279                 }
280                 else {
281                     arg = args[ i ].trim().substring( 2 );
282                 }
283             }
284             try {
285                 switch ( args[ i ].trim().charAt( 0 ) ) {
286                     case 'M':
287                         multiple_trees_file = new File( arg );
288                         break;
289                     case 'N':
290                         seq_name = arg;
291                         break;
292                     case 'S':
293                         species_tree_file = new File( arg );
294                         break;
295                     case 'O':
296                         outfile = new File( arg );
297                         break;
298                     case 'D':
299                         distance_matrix_file = new File( arg );
300                         break;
301                     case 'T':
302                         tree_file_for_dist_val = new File( arg );
303                         break;
304                     case 't':
305                         tree_file_for_avg_bs = new File( arg );
306                         break;
307                     case 'p':
308                         output_ultraparalogs = true;
309                         break;
310                     case 'P':
311                         sort = Integer.parseInt( arg );
312                         if ( ( sort < 0 ) || ( sort > 17 ) ) {
313                             errorInCommandLine();
314                         }
315                         break;
316                     case 'L':
317                         t_orthologs = Double.parseDouble( arg );
318                         break;
319                     case 'B':
320                         t_sn = Double.parseDouble( arg );
321                         break;
322                     case 'U':
323                         t_orthologs_dc = Double.parseDouble( arg );
324                         break;
325                     case 'v':
326                         threshold_ultra_paralogs = Double.parseDouble( arg );
327                         break;
328                     case 'X':
329                         warn_more_than_one_ortho = Integer.parseInt( arg );
330                         break;
331                     case 'Y':
332                         warn_no_orthos = Integer.parseInt( arg );
333                         break;
334                     case 'Z':
335                         warn_one_ortho = Double.parseDouble( arg );
336                         break;
337                     default:
338                         errorInCommandLine();
339                 }
340             }
341             catch ( final Exception e ) {
342                 errorInCommandLine();
343             }
344         }
345         if ( ( seq_name == "" ) || ( species_tree_file == null ) || ( multiple_trees_file == null )
346                 || ( outfile == null ) ) {
347             errorInCommandLine();
348         }
349         if ( ( sort < 0 ) || ( sort > 17 ) ) {
350             errorInCommandLine();
351         }
352         if ( ( sort > 2 ) && ( distance_matrix_file == null ) ) {
353             errorInCommandLine();
354         }
355         if ( VERBOSE ) {
356             System.out.println( "\nMultiple trees file:                          " + multiple_trees_file );
357             System.out.println( "Seq name:                                     " + seq_name );
358             System.out.println( "Species tree file:                            " + species_tree_file );
359             System.out.println( "Outfile:                                      " + outfile );
360             if ( distance_matrix_file != null ) {
361                 System.out.println( "Distance matrix file:                         " + distance_matrix_file );
362             }
363             if ( tree_file_for_dist_val != null ) {
364                 if ( tree_file_for_avg_bs == null ) {
365                     System.out.println( "Phy to read dists and calc mean support from: " + tree_file_for_dist_val );
366                 }
367                 else {
368                     System.out.println( "Phylogeny to read dist values from:                " + tree_file_for_dist_val );
369                 }
370             }
371             if ( tree_file_for_avg_bs != null ) {
372                 System.out.println( "Phylogeny to calc mean bootstrap from:             " + tree_file_for_avg_bs );
373             }
374             System.out.println( "Sort:                                         " + sort );
375             System.out.println( "Threshold orthologs:                          " + t_orthologs );
376             System.out.println( "Threshold subtree neighborings:               " + t_sn );
377             System.out.println( "Threshold orthologs for distance calc.:       " + t_orthologs_dc );
378             if ( output_ultraparalogs ) {
379                 System.out.println( "Threshold ultra paralogs:                     " + threshold_ultra_paralogs );
380             }
381             System.out.println( "More than one ortholog  sd diff:              " + warn_more_than_one_ortho );
382             System.out.println( "No  orthologs           sd diff:              " + warn_no_orthos );
383             System.out.println( "One ortholog            factor :              " + warn_one_ortho + "\n" );
384         }
385         if ( TIME && VERBOSE ) {
386             time = System.currentTimeMillis();
387         }
388         try {
389             final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
390             species_tree = factory.create( species_tree_file, new PhyloXmlParser() )[ 0 ];
391         }
392         catch ( final Exception e ) {
393             e.printStackTrace();
394             System.exit( -1 );
395         }
396         if ( !species_tree.isRooted() ) {
397             ForesterUtil.printErrorMessage( PRG_NAME, "Species tree is not rooted" );
398             System.exit( -1 );
399         }
400         if ( !species_tree.isCompletelyBinary() ) {
401             ForesterUtil.printErrorMessage( PRG_NAME, "Species tree is not completely binary" );
402             System.exit( -1 );
403         }
404         rio_instance = new RIO();
405         final StringBuffer output = new StringBuffer();
406         try {
407             if ( distance_matrix_file != null ) {
408                 rio_instance.readDistanceMatrix( distance_matrix_file );
409             }
410             rio_instance.inferOrthologs( multiple_trees_file, species_tree.copy(), seq_name );
411             output.append( rio_instance.inferredOrthologsToString( seq_name, sort, t_orthologs, t_sn ) );
412             if ( tree_file_for_dist_val != null ) {
413                 orthologs_al_for_dc = rio_instance.inferredOrthologsToArrayList( seq_name, t_orthologs_dc );
414                 final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
415                 if ( tree_file_for_avg_bs != null ) {
416                     final Phylogeny p = factory.create( tree_file_for_avg_bs, new PhyloXmlParser() )[ 0 ];
417                     bs_mean_sd = calculateMeanBoostrapValue( p );
418                 }
419                 else {
420                     final Phylogeny p = factory.create( tree_file_for_dist_val, new PhyloXmlParser() )[ 0 ];
421                     bs_mean_sd = calculateMeanBoostrapValue( p );
422                 }
423                 if ( ( bs_mean_sd != null ) && ( bs_mean_sd.length == 2 ) ) {
424                     final double bs_mean = bs_mean_sd[ 0 ];
425                     final double bs_sd = bs_mean_sd[ 1 ];
426                     output.append( "\n\nMean bootstrap value of consensus tree (sd): "
427                             + ForesterUtil.roundToInt( ( bs_mean * 100.0 ) / rio_instance.getBootstraps() ) + "% (+/-"
428                             + ForesterUtil.roundToInt( ( bs_sd * 100.0 ) / rio_instance.getBootstraps() ) + "%)\n" );
429                 }
430                 output.append( "\n\nDistance values:\n" );
431                 output.append( getDistances( tree_file_for_dist_val,
432                                              outfile,
433                                              species_tree,
434                                              seq_name,
435                                              orthologs_al_for_dc,
436                                              rio_instance.getInferredOrthologs( seq_name ),
437                                              rio_instance.getInferredSuperOrthologs( seq_name ),
438                                              warn_more_than_one_ortho,
439                                              warn_no_orthos,
440                                              warn_one_ortho,
441                                              rio_instance.getBootstraps(),
442                                              t_orthologs_dc ) );
443             }
444             if ( output_ultraparalogs ) {
445                 output.append( "\n\nUltra paralogs:\n" );
446                 output.append( rio_instance
447                         .inferredUltraParalogsToString( seq_name, sort > 2, threshold_ultra_paralogs ) );
448             }
449             output.append( "\n\nSort priority: " + RIO.getOrder( sort ) );
450             output.append( "\nExt nodes    : " + rio_instance.getExtNodesOfAnalyzedGeneTrees() );
451             output.append( "\nSamples      : " + rio_instance.getBootstraps() + "\n" );
452             out = new PrintWriter( new FileWriter( outfile ), true );
453         }
454         catch ( final Exception e ) {
455             ForesterUtil.printErrorMessage( PRG_NAME, e.getLocalizedMessage() );
456             e.printStackTrace();
457             System.exit( -1 );
458         }
459         out.println( output );
460         out.close();
461         ForesterUtil.programMessage( PRG_NAME, "wrote results to \"" + outfile + "\"" );
462         if ( TIME && VERBOSE ) {
463             time = System.currentTimeMillis() - time;
464             ForesterUtil.programMessage( PRG_NAME, "time: " + time + "ms" );
465         }
466         ForesterUtil.programMessage( PRG_NAME, "OK." );
467         System.exit( 0 );
468     }
469
470     private final static void printHelp() {
471         System.out.println( "M= (String) Multiple gene tree file (mandatory)" );
472         System.out.println( "N= (String) Query sequence name (mandatory)" );
473         System.out.println( "S= (String) Species tree file (mandatory)" );
474         System.out.println( "O= (String) Output file name -- overwritten without warning! (mandatory)" );
475         System.out.println( "D= (String) Distance matrix file for pairwise distances" );
476         System.out.println( "T= (String) Phylogeny file for distances of query to LCA" );
477         System.out.println( "            of orthologs and for mean bootstrap value (if t= is not used)," );
478         System.out.println( "            must be binary )" );
479         System.out.println( "t= (String) Phylogeny file for mean bootstrap value (if this option is used," );
480         System.out.println( "            the mean bootstrap value is not calculated from the tree read in" );
481         System.out.println( "            with T=), not necessary binary" );
482         System.out.println( "p           To output ultra paralogs" );
483         System.out.println( "P= (int)    Sort priority" );
484         System.out.println( "L= (double) Threshold orthologs for output" );
485         System.out.println( "U= (double) Threshold orthologs for distance calculation" );
486         System.out.println( "X= (int)    More than one ortholog: " );
487         System.out.println( "            numbers of sd the dist. to LCA has to differ from mean to generate a warning" );
488         System.out.println( "Y= (int)    No orthologs:" );
489         System.out.println( "            Numbers of sd the dist to root has to differ from mean to generate a warning" );
490         System.out.println( "Z= (double) One ortholog:" );
491         System.out.println( "            threshold for factor between the two distances to their LCA (larger/smaller)" );
492         System.out.println( "            to generate a warning" );
493         System.out.println();
494         System.out.println( " Sort priority (\"P=\"):" );
495         System.out.println( RIO.getOrderHelp().toString() );
496         System.out.println();
497         System.out
498                 .println( " Example: \"rio M=gene_trees.xml N=bcl2_NEMVE S=species_tree.xml D=distances P=13 p O=out\"" );
499         System.out.println();
500     }
501 }