rio, lca, refactoring
[jalview.git] / forester / java / src / org / forester / application / rio.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // Copyright (C) 2000-2001 Washington University School of Medicine
8 // and Howard Hughes Medical Institute
9 // All rights reserved
10 //
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
15 //
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
20 //
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 //
25 // Contact: phylosoft @ gmail . com
26 // WWW: www.phylosoft.org/forester
27
28 package org.forester.application;
29
30 import java.io.File;
31 import java.io.FileWriter;
32 import java.io.IOException;
33 import java.io.PrintWriter;
34 import java.util.ArrayList;
35 import java.util.HashMap;
36 import java.util.Vector;
37
38 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
39 import org.forester.phylogeny.Phylogeny;
40 import org.forester.phylogeny.PhylogenyMethods;
41 import org.forester.phylogeny.PhylogenyNode;
42 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
43 import org.forester.phylogeny.factories.PhylogenyFactory;
44 import org.forester.phylogeny.iterators.PreorderTreeIterator;
45 import org.forester.sdi.DistanceCalculator;
46 import org.forester.sdi.RIO;
47 import org.forester.sdi.SDIException;
48 import org.forester.sdi.SDIR;
49 import org.forester.util.ForesterUtil;
50
51 public class rio {
52
53     final static private String  PRG_NAME                         = "RIO";
54     final static private String  PRG_VERSION                      = "2.03 ALPHA";
55     final static private String  PRG_DATE                         = "2010.01.15";
56     final static private String  E_MAIL                           = "czmasek@burnham.org";
57     final static private String  WWW                              = "www.phylosoft.org/forester/";
58     final static private boolean TIME                             = true;
59     final static private boolean VERBOSE                          = true;
60     // For method getDistances -- calculation of distances.
61     final static private boolean MINIMIZE_COST                    = false;
62     // For method getDistances -- calculation of distances.
63     final static private boolean MINIMIZE_DUPS                    = true;
64     // For method getDistances -- calculation of distances.
65     final static private boolean MINIMIZE_HEIGHT                  = true;
66     final static private int     WARN_NO_ORTHOS_DEFAULT           = 2;
67     final static private int
68                                  // How many sd away from mean to root.
69                                  WARN_MORE_THAN_ONE_ORTHO_DEFAULT = 2;
70     // How many sd away from mean to LCA of orthos.
71     final static private double  THRESHOLD_ULTRA_PARALOGS_DEFAULT = 50;
72     // How many sd away from mean to LCA of orthos.
73     final static private double  WARN_ONE_ORTHO_DEFAULT           = 2;
74
75     // Factor between the two distances to their LCA
76     // (larger/smaller).
77     // Factor between the two distances to their LCA
78     // (larger/smaller).
79     /**
80      * Calculates the mean and standard deviation of all nodes of Phylogeny t
81      * which have a bootstrap values zero or more. Returns null in case of
82      * failure (e.g t has no bootstrap values, or just one).
83      * <p>
84      * 
85      * @param t
86      *            reference to a tree with bootstrap values
87      * @return Array of doubles, [0] is the mean, [1] the standard deviation
88      */
89     private static double[] calculateMeanBoostrapValue( final Phylogeny t ) {
90         double b = 0;
91         int n = 0;
92         long sum = 0;
93         double x = 0.0, mean = 0.0;
94         final double[] da = new double[ 2 ];
95         final Vector<Double> bv = new Vector<Double>();
96         PhylogenyNode node = null;
97         PreorderTreeIterator i = null;
98         i = new PreorderTreeIterator( t );
99         // Calculates the mean.
100         while ( i.hasNext() ) {
101             node = i.next();
102             if ( !( ( node.getParent() != null ) && node.getParent().isRoot()
103                     && ( PhylogenyMethods.getConfidenceValue( node.getParent().getChildNode1() ) > 0 )
104                     && ( PhylogenyMethods.getConfidenceValue( node.getParent().getChildNode2() ) > 0 ) && ( node
105                     .getParent().getChildNode2() == node ) ) ) {
106                 b = PhylogenyMethods.getConfidenceValue( node );
107                 if ( b > 0 ) {
108                     sum += b;
109                     bv.addElement( new Double( b ) );
110                     n++;
111                 }
112             }
113             // i.next();
114         }
115         if ( n < 2 ) {
116             return null;
117         }
118         mean = ( double ) sum / n;
119         // Calculates the standard deviation.
120         sum = 0;
121         for( int j = 0; j < n; ++j ) {
122             b = ( bv.elementAt( j ) ).intValue();
123             x = b - mean;
124             sum += ( x * x );
125         }
126         da[ 0 ] = mean;
127         da[ 1 ] = java.lang.Math.sqrt( sum / ( n - 1.0 ) );
128         return da;
129     }
130
131     private final static void errorInCommandLine() {
132         System.out.println( "\nrio: Error in command line.\n" );
133         printHelp();
134         System.exit( -1 );
135     }
136
137     // Uses DistanceCalculator to calculate distances.
138     private final static StringBuffer getDistances( final File tree_file_for_dist_val,
139                                                     final File outfile,
140                                                     final Phylogeny species_tree,
141                                                     final String seq_name,
142                                                     final ArrayList<String> al_ortholog_names_for_dc,
143                                                     final HashMap<String, Integer> ortholog_hashmap,
144                                                     final HashMap<String, Integer> super_ortholog_hashmap,
145                                                     final int warn_more_than_one_ortho,
146                                                     final int warn_no_orthos,
147                                                     final double warn_one_ortho,
148                                                     final int bootstraps,
149                                                     final double t_orthologs_dc ) throws IOException, SDIException {
150         Phylogeny consensus_tree = null;
151         Phylogeny
152         // to be a consensus tree.
153         assigned_cons_tree = null;
154         final SDIR sdiunrooted = new SDIR();
155         final ArrayList<PhylogenyNode> al_ortholog_nodes = new ArrayList<PhylogenyNode>();
156         double m = 0.0;
157         double sd = 0.0;
158         double d = 0.0;
159         int n = 0;
160         final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
161         consensus_tree = factory.create( tree_file_for_dist_val, new PhyloXmlParser() )[ 0 ];
162         PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, consensus_tree );
163         assigned_cons_tree = sdiunrooted.infer( consensus_tree,
164                                                 species_tree,
165                                                 rio.MINIMIZE_COST,
166                                                 rio.MINIMIZE_DUPS,
167                                                 rio.MINIMIZE_HEIGHT,
168                                                 true,
169                                                 1 )[ 0 ];
170         final DistanceCalculator dc = new DistanceCalculator();
171         final StringBuffer sb = new StringBuffer();
172         sb.append( "Given the threshold for distance calculations (" + ForesterUtil.roundToInt( t_orthologs_dc )
173                 + "): " );
174         // No orthologs.
175         if ( al_ortholog_names_for_dc.size() == 0 ) {
176             dc.setTree( assigned_cons_tree );
177             // Remark. Calculation of mean and sd _does_ include the node
178             // with seq_name.
179             m = dc.getMean();
180             sd = dc.getStandardDeviation();
181             d = dc.getDistanceToRoot( seq_name );
182             n = dc.getN();
183             sb.append( "No sequence is considered orthologous to query."
184                     + "\ndistance of query to root                     = " + ForesterUtil.FORMATTER_06.format( d )
185                     + "\nmean of distances (for all sequences) to root = " + ForesterUtil.FORMATTER_06.format( m )
186                     + "\nsd of distances (for all sequences) to root   = " + ForesterUtil.FORMATTER_06.format( sd )
187                     + "\nn (sum of sequences in alignment plus query)  = " + n );
188             if ( !( ( ( m - ( warn_no_orthos * sd ) ) < d ) && ( ( m + ( warn_no_orthos * sd ) ) > d ) ) ) {
189                 sb.append( "\nWARNING: distance of query to root is outside of mean+/-" + warn_no_orthos + "*sd!" );
190             }
191         }
192         // One ortholog.
193         else if ( al_ortholog_names_for_dc.size() == 1 ) {
194             final String name_of_ortholog = al_ortholog_names_for_dc.get( 0 );
195             al_ortholog_nodes.add( assigned_cons_tree.getNode( name_of_ortholog ) );
196             al_ortholog_nodes.add( assigned_cons_tree.getNode( seq_name ) );
197             dc.setTreeAndExtNodes( assigned_cons_tree, al_ortholog_nodes );
198             // Remark. Calculation of mean _does_ include the node
199             // with seq_name.
200             d = dc.getDistanceToLCA( seq_name );
201             final double d_o = dc.getDistanceToLCA( name_of_ortholog );
202             sb.append( "One sequence is considered orthologous to query." + "\nLCA is LCA of query and its ortholog."
203                     + "\ndistance of query to LCA    = " + ForesterUtil.FORMATTER_06.format( d )
204                     + "\ndistance of ortholog to LCA = " + ForesterUtil.FORMATTER_06.format( d_o ) );
205             if ( ( d_o > 0.0 )
206                     && ( d > 0.0 )
207                     && ( ( ( d_o >= d ) && ( ( d_o / d ) > warn_one_ortho ) ) || ( ( d_o < d ) && ( ( d / d_o ) > warn_one_ortho ) ) ) ) {
208                 sb.append( "\nWARNING: Ratio of distances to LCA is greater than " + warn_one_ortho + "!" );
209             }
210             else if ( ( ( d_o == 0.0 ) || ( d == 0.0 ) ) && ( ( d_o != 0.0 ) || ( d != 0.0 ) ) ) {
211                 sb.append( "\nWARNING: Ratio could not be calculated, " + " one distance is 0.0!" );
212             }
213         }
214         // More than one ortholog.
215         else {
216             for( int i = 0; i < al_ortholog_names_for_dc.size(); ++i ) {
217                 al_ortholog_nodes.add( assigned_cons_tree.getNodeViaSequenceName( al_ortholog_names_for_dc.get( i ) ) );
218             }
219             al_ortholog_nodes.add( assigned_cons_tree.getNodesViaSequenceName( seq_name ).get( 0 ) );
220             dc.setTreeAndExtNodes( assigned_cons_tree, al_ortholog_nodes );
221             // Remark. Calculation of mean and sd _does_ include the node
222             // with seq_name.
223             m = dc.getMean();
224             sd = dc.getStandardDeviation();
225             d = dc.getDistanceToLCA( seq_name );
226             n = dc.getN();
227             sb.append( "More than one sequence is considered orthologous to query."
228                     + "\nLCA is LCA of query and its orthologs."
229                     + "\ndistance of query to LCA                               = "
230                     + ForesterUtil.FORMATTER_06.format( d )
231                     + "\nmean of distances (for query and its orthologs) to LCA = "
232                     + ForesterUtil.FORMATTER_06.format( m )
233                     + "\nsd of distances (for query and its orthologs) to LCA   = "
234                     + ForesterUtil.FORMATTER_06.format( sd )
235                     + "\nn (sum of orthologs plus query)                        = " + n );
236             if ( !( ( ( m - ( warn_more_than_one_ortho * sd ) ) < d ) && ( ( m + ( warn_more_than_one_ortho * sd ) ) > d ) ) ) {
237                 sb.append( "\n!WARNING: distance of query to LCA is outside of mean+/-" + warn_more_than_one_ortho
238                         + "*sd!" );
239             }
240         }
241         return sb;
242     }
243
244     public static void main( final String[] args ) {
245         ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW );
246         File species_tree_file = null;
247         File multiple_trees_file = null;
248         File outfile = null;
249         File distance_matrix_file = null;
250         File tree_file_for_dist_val = null;
251         File tree_file_for_avg_bs = null;
252         String seq_name = "";
253         String arg = "";
254         boolean output_ultraparalogs = false;
255         ArrayList<String> orthologs_al_for_dc = null;
256         double t_orthologs = 0.0;
257         double t_sn = 0.0;
258         double t_orthologs_dc = 0.0;
259         double[] bs_mean_sd = null;
260         int sort = 13;
261         Phylogeny species_tree = null;
262         RIO rio_instance = null;
263         PrintWriter out = null;
264         long time = 0;
265         int warn_no_orthos = WARN_NO_ORTHOS_DEFAULT;
266         int warn_more_than_one_ortho = WARN_MORE_THAN_ONE_ORTHO_DEFAULT;
267         double warn_one_ortho = WARN_ONE_ORTHO_DEFAULT;
268         double threshold_ultra_paralogs = THRESHOLD_ULTRA_PARALOGS_DEFAULT;
269         if ( args.length < 2 ) {
270             printHelp();
271             System.exit( 0 );
272         }
273         else if ( ( args.length < 3 ) || ( args.length > 18 ) ) {
274             errorInCommandLine();
275         }
276         for( final String arg2 : args ) {
277             if ( arg2.trim().charAt( 0 ) != 'p' ) {
278                 if ( arg2.trim().length() < 3 ) {
279                     errorInCommandLine();
280                 }
281                 else {
282                     arg = arg2.trim().substring( 2 );
283                 }
284             }
285             try {
286                 switch ( arg2.trim().charAt( 0 ) ) {
287                     case 'M':
288                         multiple_trees_file = new File( arg );
289                         break;
290                     case 'N':
291                         seq_name = arg;
292                         break;
293                     case 'S':
294                         species_tree_file = new File( arg );
295                         break;
296                     case 'O':
297                         outfile = new File( arg );
298                         break;
299                     case 'D':
300                         distance_matrix_file = new File( arg );
301                         break;
302                     case 'T':
303                         tree_file_for_dist_val = new File( arg );
304                         break;
305                     case 't':
306                         tree_file_for_avg_bs = new File( arg );
307                         break;
308                     case 'p':
309                         output_ultraparalogs = true;
310                         break;
311                     case 'P':
312                         sort = Integer.parseInt( arg );
313                         if ( ( sort < 0 ) || ( sort > 17 ) ) {
314                             errorInCommandLine();
315                         }
316                         break;
317                     case 'L':
318                         t_orthologs = Double.parseDouble( arg );
319                         break;
320                     case 'B':
321                         t_sn = Double.parseDouble( arg );
322                         break;
323                     case 'U':
324                         t_orthologs_dc = Double.parseDouble( arg );
325                         break;
326                     case 'v':
327                         threshold_ultra_paralogs = Double.parseDouble( arg );
328                         break;
329                     case 'X':
330                         warn_more_than_one_ortho = Integer.parseInt( arg );
331                         break;
332                     case 'Y':
333                         warn_no_orthos = Integer.parseInt( arg );
334                         break;
335                     case 'Z':
336                         warn_one_ortho = Double.parseDouble( arg );
337                         break;
338                     default:
339                         errorInCommandLine();
340                 }
341             }
342             catch ( final Exception e ) {
343                 errorInCommandLine();
344             }
345         }
346         if ( ( seq_name == "" ) || ( species_tree_file == null ) || ( multiple_trees_file == null )
347                 || ( outfile == null ) ) {
348             errorInCommandLine();
349         }
350         if ( ( sort < 0 ) || ( sort > 17 ) ) {
351             errorInCommandLine();
352         }
353         if ( ( sort > 2 ) && ( distance_matrix_file == null ) ) {
354             errorInCommandLine();
355         }
356         if ( VERBOSE ) {
357             System.out.println( "\nMultiple trees file:                          " + multiple_trees_file );
358             System.out.println( "Seq name:                                     " + seq_name );
359             System.out.println( "Species tree file:                            " + species_tree_file );
360             System.out.println( "Outfile:                                      " + outfile );
361             if ( distance_matrix_file != null ) {
362                 System.out.println( "Distance matrix file:                         " + distance_matrix_file );
363             }
364             if ( tree_file_for_dist_val != null ) {
365                 if ( tree_file_for_avg_bs == null ) {
366                     System.out.println( "Phy to read dists and calc mean support from: " + tree_file_for_dist_val );
367                 }
368                 else {
369                     System.out.println( "Phylogeny to read dist values from:                " + tree_file_for_dist_val );
370                 }
371             }
372             if ( tree_file_for_avg_bs != null ) {
373                 System.out.println( "Phylogeny to calc mean bootstrap from:             " + tree_file_for_avg_bs );
374             }
375             System.out.println( "Sort:                                         " + sort );
376             System.out.println( "Threshold orthologs:                          " + t_orthologs );
377             System.out.println( "Threshold subtree neighborings:               " + t_sn );
378             System.out.println( "Threshold orthologs for distance calc.:       " + t_orthologs_dc );
379             if ( output_ultraparalogs ) {
380                 System.out.println( "Threshold ultra paralogs:                     " + threshold_ultra_paralogs );
381             }
382             System.out.println( "More than one ortholog  sd diff:              " + warn_more_than_one_ortho );
383             System.out.println( "No  orthologs           sd diff:              " + warn_no_orthos );
384             System.out.println( "One ortholog            factor :              " + warn_one_ortho + "\n" );
385         }
386         if ( TIME && VERBOSE ) {
387             time = System.currentTimeMillis();
388         }
389         try {
390             final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
391             species_tree = factory.create( species_tree_file, new PhyloXmlParser() )[ 0 ];
392         }
393         catch ( final Exception e ) {
394             e.printStackTrace();
395             System.exit( -1 );
396         }
397         if ( !species_tree.isRooted() ) {
398             ForesterUtil.printErrorMessage( PRG_NAME, "Species tree is not rooted" );
399             System.exit( -1 );
400         }
401         if ( !species_tree.isCompletelyBinary() ) {
402             ForesterUtil.printErrorMessage( PRG_NAME, "Species tree is not completely binary" );
403             System.exit( -1 );
404         }
405         rio_instance = new RIO();
406         final StringBuffer output = new StringBuffer();
407         try {
408             if ( distance_matrix_file != null ) {
409                 rio_instance.readDistanceMatrix( distance_matrix_file );
410             }
411             rio_instance.inferOrthologs( multiple_trees_file, species_tree.copy(), seq_name );
412             output.append( rio_instance.inferredOrthologsToString( seq_name, sort, t_orthologs, t_sn ) );
413             if ( tree_file_for_dist_val != null ) {
414                 orthologs_al_for_dc = rio_instance.inferredOrthologsToArrayList( seq_name, t_orthologs_dc );
415                 final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
416                 if ( tree_file_for_avg_bs != null ) {
417                     final Phylogeny p = factory.create( tree_file_for_avg_bs, new PhyloXmlParser() )[ 0 ];
418                     bs_mean_sd = calculateMeanBoostrapValue( p );
419                 }
420                 else {
421                     final Phylogeny p = factory.create( tree_file_for_dist_val, new PhyloXmlParser() )[ 0 ];
422                     bs_mean_sd = calculateMeanBoostrapValue( p );
423                 }
424                 if ( ( bs_mean_sd != null ) && ( bs_mean_sd.length == 2 ) ) {
425                     final double bs_mean = bs_mean_sd[ 0 ];
426                     final double bs_sd = bs_mean_sd[ 1 ];
427                     output.append( "\n\nMean bootstrap value of consensus tree (sd): "
428                             + ForesterUtil.roundToInt( ( bs_mean * 100.0 ) / rio_instance.getBootstraps() ) + "% (+/-"
429                             + ForesterUtil.roundToInt( ( bs_sd * 100.0 ) / rio_instance.getBootstraps() ) + "%)\n" );
430                 }
431                 output.append( "\n\nDistance values:\n" );
432                 output.append( getDistances( tree_file_for_dist_val,
433                                              outfile,
434                                              species_tree,
435                                              seq_name,
436                                              orthologs_al_for_dc,
437                                              rio_instance.getInferredOrthologs( seq_name ),
438                                              rio_instance.getInferredSuperOrthologs( seq_name ),
439                                              warn_more_than_one_ortho,
440                                              warn_no_orthos,
441                                              warn_one_ortho,
442                                              rio_instance.getBootstraps(),
443                                              t_orthologs_dc ) );
444             }
445             if ( output_ultraparalogs ) {
446                 output.append( "\n\nUltra paralogs:\n" );
447                 output.append( rio_instance
448                         .inferredUltraParalogsToString( seq_name, sort > 2, threshold_ultra_paralogs ) );
449             }
450             output.append( "\n\nSort priority: " + RIO.getOrder( sort ) );
451             output.append( "\nExt nodes    : " + rio_instance.getExtNodesOfAnalyzedGeneTrees() );
452             output.append( "\nSamples      : " + rio_instance.getBootstraps() + "\n" );
453             out = new PrintWriter( new FileWriter( outfile ), true );
454         }
455         catch ( final Exception e ) {
456             ForesterUtil.printErrorMessage( PRG_NAME, e.getLocalizedMessage() );
457             e.printStackTrace();
458             System.exit( -1 );
459         }
460         out.println( output );
461         out.close();
462         ForesterUtil.programMessage( PRG_NAME, "wrote results to \"" + outfile + "\"" );
463         if ( TIME && VERBOSE ) {
464             time = System.currentTimeMillis() - time;
465             ForesterUtil.programMessage( PRG_NAME, "time: " + time + "ms" );
466         }
467         ForesterUtil.programMessage( PRG_NAME, "OK." );
468         System.exit( 0 );
469     }
470
471     private final static void printHelp() {
472         System.out.println( "M= (String) Multiple gene tree file (mandatory)" );
473         System.out.println( "N= (String) Query sequence name (mandatory)" );
474         System.out.println( "S= (String) Species tree file (mandatory)" );
475         System.out.println( "O= (String) Output file name -- overwritten without warning! (mandatory)" );
476         System.out.println( "D= (String) Distance matrix file for pairwise distances" );
477         System.out.println( "T= (String) Phylogeny file for distances of query to LCA" );
478         System.out.println( "            of orthologs and for mean bootstrap value (if t= is not used)," );
479         System.out.println( "            must be binary )" );
480         System.out.println( "t= (String) Phylogeny file for mean bootstrap value (if this option is used," );
481         System.out.println( "            the mean bootstrap value is not calculated from the tree read in" );
482         System.out.println( "            with T=), not necessary binary" );
483         System.out.println( "p           To output ultra paralogs" );
484         System.out.println( "P= (int)    Sort priority" );
485         System.out.println( "L= (double) Threshold orthologs for output" );
486         System.out.println( "U= (double) Threshold orthologs for distance calculation" );
487         System.out.println( "X= (int)    More than one ortholog: " );
488         System.out.println( "            numbers of sd the dist. to LCA has to differ from mean to generate a warning" );
489         System.out.println( "Y= (int)    No orthologs:" );
490         System.out.println( "            Numbers of sd the dist to root has to differ from mean to generate a warning" );
491         System.out.println( "Z= (double) One ortholog:" );
492         System.out.println( "            threshold for factor between the two distances to their LCA (larger/smaller)" );
493         System.out.println( "            to generate a warning" );
494         System.out.println();
495         System.out.println( " Sort priority (\"P=\"):" );
496         System.out.println( RIO.getOrderHelp().toString() );
497         System.out.println();
498         System.out
499                 .println( " Example: \"rio M=gene_trees.xml N=bcl2_NEMVE S=species_tree.xml D=distances P=13 p O=out\"" );
500         System.out.println();
501     }
502 }