cleanup
[jalview.git] / forester / java / src / org / forester / application / sdi_dir.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // Copyright (C) 2000-2001 Washington University School of Medicine
8 // and Howard Hughes Medical Institute
9 // All rights reserved
10 //
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
15 //
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
20 //
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 //
25 // Contact: phylosoft @ gmail . com
26 // WWW: www.phylosoft.org/forester
27
28 package org.forester.application;
29
30 import java.io.File;
31 import java.io.FileWriter;
32 import java.io.IOException;
33 import java.io.PrintWriter;
34 import java.util.Arrays;
35
36 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
37 import org.forester.io.writers.PhylogenyWriter;
38 import org.forester.phylogeny.Phylogeny;
39 import org.forester.phylogeny.PhylogenyMethods;
40 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
41 import org.forester.phylogeny.factories.PhylogenyFactory;
42 import org.forester.sdi.SDIException;
43 import org.forester.sdi.SDIR;
44 import org.forester.sdi.SDIse;
45 import org.forester.util.ForesterUtil;
46
47 /*
48  * Allows to infer duplications - speciations on all (rooted or unrooted) gene
49  * trees in a directory by using method "infer" of class SDIunrooted. <p> The
50  * output of this is a (re)rooted tree with speciation - duplication assigned
51  * for each tree (in "gene tree directory" with suffix "suffix for gene trees"),
52  * as well as a summary list ("outputfile name"). <p> The summary list contains
53  * the following. The number in brackets indicates how many external nodes of
54  * the gene tree had to be removed since the associated species was not found in
55  * the species tree. "en" indicates the number of external nodes in the
56  * resulting (analyzed and returned) gene tree. "d" are the number of
57  * duplications, "L=" the mapping cost, "h=" the height, "d=" the minimal
58  * difference in tree heights (of the two subtrees at the root; this number is
59  * 0.0 for a midpoint rooted tree) of the resulting, analyzed and rooted gene
60  * tree(s). <p> The output file ending with "_Sdist" is a file which lists the
61  * distribution of trees sizes, "_Ddist" lists the distribution of the sums of
62  * duplications (up to a certain maximal size, set with final variables
63  * MAX_EXT_NODE_DIST and MAX_DUP_DIST).
64  * 
65  * @see SDIunrooted
66  * 
67  * @author Christian M. Zmasek
68  */
69 public class sdi_dir {
70
71     final static private String E_MAIL      = "czmasek@burnham.org";
72     final static private String WWW         = "www.phylosoft.org";
73     final static private String PRG_NAME    = "sdi_dir";
74     final static private String PRG_VERSION = "2.00";
75     final static private String PRG_DATE    = "2010.04.26";
76
77     private static void errorInCommandLine() {
78         System.out.println( "\nsdi_dir: Error in command line.\n" );
79         System.out.print( "Usage: % sdi_dir [-options] <gene tree directory> <suffix for gene trees>" );
80         System.out.println( " <species tree file name> <output directory> <outputfile name>" );
81         System.out.println( "\nOptions:" );
82         System.out.println( " -l to root by minimizing the mapping cost L (and also the sum of duplications)" );
83         System.out.println( " -d to root by minimizing the sum of duplications" );
84         System.out.println( " -h to root by minimizing tree height (can be used together with -l or -d)" );
85         System.out.println( " -w to write assigned gene trees into output directory" );
86         System.out.println( "\nGene tree directory" );
87         System.out.println( " The directory from which to read phyloXML formatted gene trees which" );
88         System.out.println( " contain taxonomic information in appropriate sub-elements of taxonomy" );
89         System.out.println( " (see: www.phyloxml.org)." );
90         System.out.println( " The gene trees can either be rooted, in which case no rooting with -l, -d, or -h " );
91         System.out.println( " is necessary, or they can be unrooted, in which case rooting is mandatory." );
92         System.out.println( "\nSuffix for gene trees" );
93         System.out.println( " Suffix of the gene trees to analyze (e.g. \".phyloxml\")." );
94         System.out.println( "\nSpecies tree file" );
95         System.out.println( " In phyloXML format, taxonomic information in appropriate sub-elements of taxonomy." );
96         System.out.println( " (see: www.phyloxml.org)." );
97         System.out.println( "\nOutput directory" );
98         System.out.println( " The directory into which the assigned gene trees will be written." );
99         System.out.println( "\nOutputfile name" );
100         System.out.println( " File name for summary output files." );
101         System.out.println( "" );
102         System.exit( -1 );
103     }
104
105     /**
106      * Runs method "infer" of class SDIunrooted on all gene trees in directory
107      * indir.
108      * <p>
109      * Trees are rooted by minimizing either the sum of duplications, the
110      * mapping cost L, or the tree height (or combinations thereof). One
111      * resulting tree for each (out of possibly many) is stored in outdir and a
112      * summary outfile is created. The distributions of the tree sizes (name of
113      * outfile + _Ddist) and the distributions of the sum of duplications per
114      * tree (name of outfile + _Sdist) are written out as well.
115      * <p>
116      * If both minimize_sum_of_dup and minimize_mapping_cost are true, trees are
117      * rooted by minimizing by minimizing the mapping cost L.
118      * <p>
119      * If minimize_sum_of_dup, minimize_mapping_cost, and minimize_height are
120      * false trees are assumed to be alreadty rooted.
121      * <p>
122      * (Last modified: 02/02/01)
123      * 
124      * @see SDIR#infer(Phylogeny,Phylogeny,boolean,boolean,boolean,boolean,int,boolean)
125      * @param indir
126      *            a directory containing gene trees in NHX format
127      * @param species_tree_file
128      *            a species tree file in NHX format
129      * @param outdir
130      *            a directory where to write trees
131      * @param outfile
132      *            a file name for the summary file
133      * @param suffix
134      *            a suffix for the trees to read (e.g. nhx), is case sensitive
135      * @param write_trees
136      *            set to true to write out one tree with minmal duplications or
137      *            L each
138      * @param minimize_mapping_cost
139      *            set to true to root by minimizing the mapping cost L
140      * @param minimize_sum_of_dup
141      *            set to true to root by minimizing the sum of duplications
142      * @param minimize_height
143      *            set to true to root by minimizing the tree height -- if
144      *            minimize_mapping_cost is set to true or minimize_sum_of_dup is
145      *            set to true, then out of the resulting trees with minimal
146      *            mapping cost or minimal number of duplications the tree with
147      *            the minimal height is chosen
148      * @throws SDIException 
149      */
150     public static void infer( final File indir,
151                               final File species_tree_file,
152                               final File outdir,
153                               final File outfile,
154                               String suffix,
155                               final boolean write_trees,
156                               final boolean minimize_mapping_cost,
157                               boolean minimize_sum_of_dup,
158                               final boolean minimize_height ) throws IOException, SDIException {
159         final int MIN_EXT_NODES = 4; // Minimal size of trees [in ext nodes]
160         // to be analyzed.
161         final int MAX_EXT_NODES = 5000; // Maximal size of trees [in ext nodes]
162         // to be analyzed.
163         final int MAX_DUP_DIST = 50; // Max number of dups to output in dup
164         // distribution ("_Ddist").
165         final int MAX_EXT_NODE_DIST = 1000; // Max number of ext nodes to output
166         // in size
167         // distribution ("_Sdist").
168         int successful = 0, number_of_too_small_trees = 0, number_of_too_large_trees = 0, dups = 0, c = 0, ext_nodes = 0, removed = 0;
169         final int nodecount0 = 0;
170         int j = 0;
171         long total_number_of_d = 0, total_number_of_ext_nodes = 0, sum_costs = 0;
172         double sum_tree_heights = 0.0, sum_subtree_diff = 0.0;
173         Phylogeny species_tree = null;
174         String filename = null;
175         String[] filenames = null;
176         Phylogeny[] trees = null;
177         final int[] duplications = new int[ MAX_EXT_NODES - 1 ], // For dup
178         // distribution.
179         sizes = new int[ MAX_EXT_NODES - 1 ]; // For ext nodes dist.of
180         // successfully assigned trees.
181         File outtree = null;
182         PrintWriter out = null, out_ddist = null, out_sdist = null;
183         final File ddist_outfile = new File( outfile + "_Ddist" ), sdist_outfile = new File( outfile + "_Sdist" );
184         final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.0#####" );
185         df.setDecimalSeparatorAlwaysShown( true );
186         if ( !indir.exists() || !indir.isDirectory() ) {
187             throw new IllegalArgumentException( indir + " does not exist or is not a directory." );
188         }
189         if ( !outdir.exists() || !outdir.isDirectory() ) {
190             throw new IllegalArgumentException( outdir + " does not exist or is not a directory." );
191         }
192         if ( outfile.exists() ) {
193             throw new IllegalArgumentException( outfile + " does already exist." );
194         }
195         if ( ddist_outfile.exists() ) {
196             throw new IllegalArgumentException( ddist_outfile + " does already exist." );
197         }
198         if ( sdist_outfile.exists() ) {
199             throw new IllegalArgumentException( sdist_outfile + " does already exist." );
200         }
201         if ( !species_tree_file.exists() || !species_tree_file.isFile() ) {
202             throw new IllegalArgumentException( species_tree_file + " does not exist or is not a file." );
203         }
204         if ( minimize_mapping_cost && minimize_sum_of_dup ) {
205             minimize_sum_of_dup = false;
206         }
207         final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
208         species_tree = factory.create( species_tree_file, new PhyloXmlParser() )[ 0 ];
209         filenames = indir.list();
210         Arrays.sort( filenames );
211         suffix = suffix.trim();
212         out = new PrintWriter( new FileWriter( outfile ), true );
213         //nodecount0 = PhylogenyNode.getNodeCount();
214         for( int i = 0; i < filenames.length; ++i ) {
215             filename = filenames[ i ];
216             if ( ( suffix.length() < 1 ) || filename.endsWith( suffix ) ) {
217                 final File gene_tree_file = new File( indir.getPath(), filename );
218                 if ( gene_tree_file.exists() && gene_tree_file.isFile() ) {
219                     out.print( j + "\t" + filename );
220                     System.out.println( j + ": " + filename );
221                     j++;
222                     Phylogeny gene_tree = null;
223                     gene_tree = factory.create( gene_tree_file, new PhyloXmlParser() )[ 0 ];
224                     // Removes from gene_tree all species not found in
225                     // species_tree.
226                     removed = PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gene_tree );
227                     if ( filename.length() < 8 ) {
228                         out.print( "\t\t\t[-" + removed + "]" );
229                     }
230                     else if ( filename.length() < 16 ) {
231                         out.print( "\t\t[-" + removed + "]" );
232                     }
233                     else {
234                         out.print( "\t[-" + removed + "]" );
235                     }
236                     if ( gene_tree.getNumberOfExternalNodes() < MIN_EXT_NODES ) {
237                         out.print( "\t<" + MIN_EXT_NODES + "en\n" );
238                         number_of_too_small_trees++;
239                     }
240                     else if ( gene_tree.getNumberOfExternalNodes() > MAX_EXT_NODES ) {
241                         out.print( "\t>" + MAX_EXT_NODES + "en\n" );
242                         number_of_too_large_trees++;
243                     }
244                     else {
245                         SDIR sdiunrooted = null;
246                         // PhylogenyNode.setNodeCount( nodecount0 );
247                         sdiunrooted = new SDIR();
248                         if ( minimize_mapping_cost || minimize_sum_of_dup || minimize_height ) {
249                             trees = sdiunrooted.infer( gene_tree,
250                                                        species_tree,
251                                                        minimize_mapping_cost,
252                                                        minimize_sum_of_dup,
253                                                        minimize_height,
254                                                        write_trees,
255                                                        1 );
256                             dups = sdiunrooted.getMinimalDuplications();
257                         }
258                         else {
259                             final SDIse sdi = new SDIse( gene_tree, species_tree );
260                             trees = new Phylogeny[ 1 ];
261                             trees[ 0 ] = gene_tree;
262                             dups = sdi.getDuplicationsSum();
263                             c = sdi.computeMappingCostL();
264                             sum_costs += c;
265                             out.print( "\t L=" + c );
266                         }
267                         successful++;
268                         ext_nodes = gene_tree.getNumberOfExternalNodes();
269                         total_number_of_ext_nodes += ext_nodes;
270                         sizes[ ext_nodes ]++;
271                         out.print( "\t " + ext_nodes + "en" );
272                         total_number_of_d += dups;
273                         duplications[ dups ]++;
274                         out.print( "\t " + dups + "d" );
275                         if ( minimize_mapping_cost ) {
276                             c = sdiunrooted.getMinimalMappingCost();
277                             sum_costs += c;
278                             out.print( "\t L=" + c );
279                         }
280                         if ( minimize_height ) {
281                             out.print( "\t h=" + df.format( sdiunrooted.getMinimalTreeHeight() ) );
282                             out.print( "\t d=" + df.format( sdiunrooted.getMinimalDiffInSubTreeHeights() ) );
283                             sum_tree_heights += sdiunrooted.getMinimalTreeHeight();
284                             sum_subtree_diff += sdiunrooted.getMinimalDiffInSubTreeHeights();
285                         }
286                         out.println();
287                         if ( write_trees ) {
288                             outtree = new File( outdir, new File( filenames[ i ] ).getName() );
289                             final PhylogenyWriter writer = new PhylogenyWriter();
290                             writer.toPhyloXML( outtree, trees[ 0 ], 1 );
291                         }
292                     }
293                 }
294             }
295         }
296         //PhylogenyNode.setNodeCount( nodecount0 );
297         if ( minimize_mapping_cost ) {
298             out.println( "\nRooted by minimizing mapping cost L." );
299             System.out.println( "\nRooted by minimizing mapping cost L." );
300             if ( minimize_height ) {
301                 out.println( "Selected tree(s) with minimal height out of resulting trees." );
302                 System.out.println( "Selected tree(s) with minimal height out of resulting trees." );
303             }
304         }
305         else if ( minimize_sum_of_dup ) {
306             out.println( "\nRooted by minimizing sum of duplications." );
307             System.out.println( "\nRooted by minimizing sum of duplications." );
308             if ( minimize_height ) {
309                 out.println( "Selected tree(s) with minimal height out of resulting trees." );
310                 System.out.println( "Selected tree(s) with minimal height out of resulting trees." );
311             }
312         }
313         else if ( minimize_height ) {
314             out.println( "\nRooted by minimizing tree height." );
315             System.out.println( "\nRooted by minimizing tree height." );
316         }
317         else {
318             out.println( "\nNo (re) rooting was performed." );
319             System.out.println( "\nNo (re) rooting was performed." );
320         }
321         out.println( "\nTrees directory  : " + indir );
322         out.println( "Suffix for trees : " + suffix );
323         out.println( "Species tree     : " + species_tree_file );
324         out.println( "Output directory : " + outdir );
325         out.println( "Output file      : " + outfile );
326         out.println( "\nTotal number of attempts (tree files read)       : " + j );
327         out.println( "Total number of successfully assigned trees      : " + successful );
328         out.println( "Number of too small trees                        : " + number_of_too_small_trees );
329         out.println( "Number of too large trees                        : " + number_of_too_large_trees );
330         out.println( "\nSum of duplications                                   : " + total_number_of_d );
331         if ( minimize_mapping_cost ) {
332             out.println( "Sum of mapping costs L                                : " + sum_costs );
333         }
334         if ( minimize_height ) {
335             out.println( "Sum of tree heights                                   : " + sum_tree_heights );
336             out.println( "Sum of differences in subtree heights                 : " + sum_subtree_diff );
337         }
338         out.println( "Sum of external nodes (in successfully assigned trees): " + total_number_of_ext_nodes );
339         out.close();
340         System.out.println( "\nTotal number of attempts (tree files read)       : " + j );
341         System.out.println( "Total number of successfully assigned trees      : " + successful );
342         System.out.println( "Number of too small trees                        : " + number_of_too_small_trees );
343         System.out.println( "Number of too large trees                        : " + number_of_too_large_trees );
344         System.out.println( "\nSum of duplications                                   : " + total_number_of_d );
345         if ( minimize_mapping_cost ) {
346             System.out.println( "Sum of mapping costs L                                : " + sum_costs );
347         }
348         if ( minimize_height ) {
349             System.out.println( "Sum of tree heights                                   : " + sum_tree_heights );
350             System.out.println( "Sum of differences in subtree heights                 : " + sum_subtree_diff );
351         }
352         System.out.println( "Sum of external nodes (in successfully assigned trees): " + total_number_of_ext_nodes );
353         out_ddist = new PrintWriter( new FileWriter( ddist_outfile ), true );
354         for( int i = 0; ( i < duplications.length ) && ( i <= MAX_DUP_DIST ); ++i ) {
355             out_ddist.println( i + " " + duplications[ i ] );
356         }
357         out_ddist.close();
358         out_sdist = new PrintWriter( new FileWriter( sdist_outfile ), true );
359         for( int i = 0; ( i < sizes.length ) && ( i <= MAX_EXT_NODE_DIST ); ++i ) {
360             out_sdist.println( i + " " + sizes[ i ] );
361         }
362         out_sdist.close();
363     } // infer
364
365     /**
366      * Main method for this class.
367      * <p>
368      * (Last modified: 04/26/10)
369      * 
370      * @param [args[0]
371      *            -l to root by minimizing mapping cost L]
372      * @param [args[0]
373      *            -d to root by minimizing sum of duplications]
374      * @param [args[0]
375      *            -w to write out trees into outdir]
376      * @param [args[0]
377      *            -h to root by minimizing tree height]
378      * @param [args[0]
379      *            -n input trees are in New Hampshire format instead of NHX --
380      *            or gene tree is in NHX, but species information in gene tree
381      *            is only present in the form of SWISS-PROT sequence names]
382      * @param args[0or1]
383      *            trees directory name
384      * @param args[1or2]
385      *            suffix for gene trees
386      * @param args[2or3]
387      *            speciestree file name
388      * @param args[3or4]
389      *            output directory name
390      * @param args[4or5]
391      *            output file name
392      */
393     public static void main( final String args[] ) {
394         ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW );
395         // These are the default values.
396         boolean minimize_mapping_cost = false;
397         boolean minimize_sum_of_dup = false;
398         boolean minimize_height = false;
399         boolean write_trees = false;
400         File indir = null;
401         File speciestree_file = null;
402         File outdir = null;
403         File outfile = null;
404         String suffix = null;
405         if ( args.length == 5 ) {
406             indir = new File( args[ 0 ] );
407             suffix = args[ 1 ];
408             speciestree_file = new File( args[ 2 ] );
409             outdir = new File( args[ 3 ] );
410             outfile = new File( args[ 4 ] );
411         }
412         else if ( args.length == 6 ) {
413             if ( args[ 0 ].startsWith( "-" ) ) {
414                 minimize_mapping_cost = false;
415                 minimize_sum_of_dup = false;
416                 minimize_height = false;
417                 write_trees = false;
418                 if ( args[ 0 ].toLowerCase().indexOf( "w" ) != -1 ) {
419                     write_trees = true;
420                 }
421                 if ( args[ 0 ].toLowerCase().indexOf( "l" ) != -1 ) {
422                     minimize_mapping_cost = true;
423                 }
424                 if ( args[ 0 ].toLowerCase().indexOf( "d" ) != -1 ) {
425                     minimize_sum_of_dup = true;
426                 }
427                 if ( args[ 0 ].toLowerCase().indexOf( "h" ) != -1 ) {
428                     minimize_height = true;
429                 }
430             }
431             else {
432                 sdi_dir.errorInCommandLine();
433             }
434             indir = new File( args[ 1 ] );
435             suffix = args[ 2 ];
436             speciestree_file = new File( args[ 3 ] );
437             outdir = new File( args[ 4 ] );
438             outfile = new File( args[ 5 ] );
439         }
440         else {
441             sdi_dir.errorInCommandLine();
442         }
443         if ( minimize_mapping_cost && minimize_sum_of_dup ) {
444             minimize_sum_of_dup = false;
445         }
446         try {
447             sdi_dir.infer( indir,
448                            speciestree_file,
449                            outdir,
450                            outfile,
451                            suffix,
452                            write_trees,
453                            minimize_mapping_cost,
454                            minimize_sum_of_dup,
455                            minimize_height );
456         }
457         catch ( final Exception e ) {
458             ForesterUtil.fatalError( PRG_NAME, "error: " + e.getLocalizedMessage() );
459         }
460         ForesterUtil.programMessage( PRG_NAME, "OK." );
461     }
462 }