clean up
[jalview.git] / forester / java / src / org / forester / application / sdi_dir.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // Copyright (C) 2000-2001 Washington University School of Medicine
8 // and Howard Hughes Medical Institute
9 // All rights reserved
10 //
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
15 //
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
20 //
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 //
25 // Contact: phylosoft @ gmail . com
26 // WWW: www.phylosoft.org/forester
27
28 package org.forester.application;
29
30 import java.io.File;
31 import java.io.FileWriter;
32 import java.io.IOException;
33 import java.io.PrintWriter;
34 import java.util.Arrays;
35
36 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
37 import org.forester.io.writers.PhylogenyWriter;
38 import org.forester.phylogeny.Phylogeny;
39 import org.forester.phylogeny.PhylogenyMethods;
40 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
41 import org.forester.phylogeny.factories.PhylogenyFactory;
42 import org.forester.sdi.SDIR;
43 import org.forester.sdi.SDIse;
44 import org.forester.util.ForesterUtil;
45
46 /*
47  * Allows to infer duplications - speciations on all (rooted or unrooted) gene
48  * trees in a directory by using method "infer" of class SDIunrooted. <p> The
49  * output of this is a (re)rooted tree with speciation - duplication assigned
50  * for each tree (in "gene tree directory" with suffix "suffix for gene trees"),
51  * as well as a summary list ("outputfile name"). <p> The summary list contains
52  * the following. The number in brackets indicates how many external nodes of
53  * the gene tree had to be removed since the associated species was not found in
54  * the species tree. "en" indicates the number of external nodes in the
55  * resulting (analyzed and returned) gene tree. "d" are the number of
56  * duplications, "L=" the mapping cost, "h=" the height, "d=" the minimal
57  * difference in tree heights (of the two subtrees at the root; this number is
58  * 0.0 for a midpoint rooted tree) of the resulting, analyzed and rooted gene
59  * tree(s). <p> The output file ending with "_Sdist" is a file which lists the
60  * distribution of trees sizes, "_Ddist" lists the distribution of the sums of
61  * duplications (up to a certain maximal size, set with final variables
62  * MAX_EXT_NODE_DIST and MAX_DUP_DIST).
63  * 
64  * @see SDIunrooted
65  * 
66  * @author Christian M. Zmasek
67  */
68 public class sdi_dir {
69
70     final static private String E_MAIL      = "czmasek@burnham.org";
71     final static private String WWW         = "www.phylosoft.org";
72     final static private String PRG_NAME    = "sdi_dir";
73     final static private String PRG_VERSION = "2.00";
74     final static private String PRG_DATE    = "2010.04.26";
75
76     private static void errorInCommandLine() {
77         System.out.println( "\nsdi_dir: Error in command line.\n" );
78         System.out.print( "Usage: % sdi_dir [-options] <gene tree directory> <suffix for gene trees>" );
79         System.out.println( " <species tree file name> <output directory> <outputfile name>" );
80         System.out.println( "\nOptions:" );
81         System.out.println( " -l to root by minimizing the mapping cost L (and also the sum of duplications)" );
82         System.out.println( " -d to root by minimizing the sum of duplications" );
83         System.out.println( " -h to root by minimizing tree height (can be used together with -l or -d)" );
84         System.out.println( " -w to write assigned gene trees into output directory" );
85         System.out.println( "\nGene tree directory" );
86         System.out.println( " The directory from which to read phyloXML formatted gene trees which" );
87         System.out.println( " contain taxonomic information in appropriate sub-elements of taxonomy" );
88         System.out.println( " (see: www.phyloxml.org)." );
89         System.out.println( " The gene trees can either be rooted, in which case no rooting with -l, -d, or -h " );
90         System.out.println( " is necessary, or they can be unrooted, in which case rooting is mandatory." );
91         System.out.println( "\nSuffix for gene trees" );
92         System.out.println( " Suffix of the gene trees to analyze (e.g. \".phyloxml\")." );
93         System.out.println( "\nSpecies tree file" );
94         System.out.println( " In phyloXML format, taxonomic information in appropriate sub-elements of taxonomy." );
95         System.out.println( " (see: www.phyloxml.org)." );
96         System.out.println( "\nOutput directory" );
97         System.out.println( " The directory into which the assigned gene trees will be written." );
98         System.out.println( "\nOutputfile name" );
99         System.out.println( " File name for summary output files." );
100         System.out.println( "" );
101         System.exit( -1 );
102     }
103
104     /**
105      * Runs method "infer" of class SDIunrooted on all gene trees in directory
106      * indir.
107      * <p>
108      * Trees are rooted by minimizing either the sum of duplications, the
109      * mapping cost L, or the tree height (or combinations thereof). One
110      * resulting tree for each (out of possibly many) is stored in outdir and a
111      * summary outfile is created. The distributions of the tree sizes (name of
112      * outfile + _Ddist) and the distributions of the sum of duplications per
113      * tree (name of outfile + _Sdist) are written out as well.
114      * <p>
115      * If both minimize_sum_of_dup and minimize_mapping_cost are true, trees are
116      * rooted by minimizing by minimizing the mapping cost L.
117      * <p>
118      * If minimize_sum_of_dup, minimize_mapping_cost, and minimize_height are
119      * false trees are assumed to be alreadty rooted.
120      * <p>
121      * (Last modified: 02/02/01)
122      * 
123      * @see SDIR#infer(Phylogeny,Phylogeny,boolean,boolean,boolean,boolean,int,boolean)
124      * @param indir
125      *            a directory containing gene trees in NHX format
126      * @param species_tree_file
127      *            a species tree file in NHX format
128      * @param outdir
129      *            a directory where to write trees
130      * @param outfile
131      *            a file name for the summary file
132      * @param suffix
133      *            a suffix for the trees to read (e.g. nhx), is case sensitive
134      * @param write_trees
135      *            set to true to write out one tree with minmal duplications or
136      *            L each
137      * @param minimize_mapping_cost
138      *            set to true to root by minimizing the mapping cost L
139      * @param minimize_sum_of_dup
140      *            set to true to root by minimizing the sum of duplications
141      * @param minimize_height
142      *            set to true to root by minimizing the tree height -- if
143      *            minimize_mapping_cost is set to true or minimize_sum_of_dup is
144      *            set to true, then out of the resulting trees with minimal
145      *            mapping cost or minimal number of duplications the tree with
146      *            the minimal height is chosen
147      */
148     public static void infer( final File indir,
149                               final File species_tree_file,
150                               final File outdir,
151                               final File outfile,
152                               String suffix,
153                               final boolean write_trees,
154                               final boolean minimize_mapping_cost,
155                               boolean minimize_sum_of_dup,
156                               final boolean minimize_height ) throws IOException {
157         final int MIN_EXT_NODES = 4; // Minimal size of trees [in ext nodes]
158         // to be analyzed.
159         final int MAX_EXT_NODES = 5000; // Maximal size of trees [in ext nodes]
160         // to be analyzed.
161         final int MAX_DUP_DIST = 50; // Max number of dups to output in dup
162         // distribution ("_Ddist").
163         final int MAX_EXT_NODE_DIST = 1000; // Max number of ext nodes to output
164         // in size
165         // distribution ("_Sdist").
166         int successful = 0, number_of_too_small_trees = 0, number_of_too_large_trees = 0, dups = 0, c = 0, ext_nodes = 0, removed = 0;
167         final int nodecount0 = 0;
168         int j = 0;
169         long total_number_of_d = 0, total_number_of_ext_nodes = 0, sum_costs = 0;
170         double sum_tree_heights = 0.0, sum_subtree_diff = 0.0;
171         Phylogeny species_tree = null;
172         String filename = null;
173         String[] filenames = null;
174         Phylogeny[] trees = null;
175         final int[] duplications = new int[ MAX_EXT_NODES - 1 ], // For dup
176         // distribution.
177         sizes = new int[ MAX_EXT_NODES - 1 ]; // For ext nodes dist.of
178         // successfully assigned trees.
179         File outtree = null;
180         PrintWriter out = null, out_ddist = null, out_sdist = null;
181         final File ddist_outfile = new File( outfile + "_Ddist" ), sdist_outfile = new File( outfile + "_Sdist" );
182         final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.0#####" );
183         df.setDecimalSeparatorAlwaysShown( true );
184         if ( !indir.exists() || !indir.isDirectory() ) {
185             throw new IllegalArgumentException( indir + " does not exist or is not a directory." );
186         }
187         if ( !outdir.exists() || !outdir.isDirectory() ) {
188             throw new IllegalArgumentException( outdir + " does not exist or is not a directory." );
189         }
190         if ( outfile.exists() ) {
191             throw new IllegalArgumentException( outfile + " does already exist." );
192         }
193         if ( ddist_outfile.exists() ) {
194             throw new IllegalArgumentException( ddist_outfile + " does already exist." );
195         }
196         if ( sdist_outfile.exists() ) {
197             throw new IllegalArgumentException( sdist_outfile + " does already exist." );
198         }
199         if ( !species_tree_file.exists() || !species_tree_file.isFile() ) {
200             throw new IllegalArgumentException( species_tree_file + " does not exist or is not a file." );
201         }
202         if ( minimize_mapping_cost && minimize_sum_of_dup ) {
203             minimize_sum_of_dup = false;
204         }
205         final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
206         species_tree = factory.create( species_tree_file, new PhyloXmlParser() )[ 0 ];
207         filenames = indir.list();
208         Arrays.sort( filenames );
209         suffix = suffix.trim();
210         out = new PrintWriter( new FileWriter( outfile ), true );
211         //nodecount0 = PhylogenyNode.getNodeCount();
212         for( int i = 0; i < filenames.length; ++i ) {
213             filename = filenames[ i ];
214             if ( ( suffix.length() < 1 ) || filename.endsWith( suffix ) ) {
215                 final File gene_tree_file = new File( indir.getPath(), filename );
216                 if ( gene_tree_file.exists() && gene_tree_file.isFile() ) {
217                     out.print( j + "\t" + filename );
218                     System.out.println( j + ": " + filename );
219                     j++;
220                     Phylogeny gene_tree = null;
221                     gene_tree = factory.create( gene_tree_file, new PhyloXmlParser() )[ 0 ];
222                     // Removes from gene_tree all species not found in
223                     // species_tree.
224                     removed = PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gene_tree );
225                     if ( filename.length() < 8 ) {
226                         out.print( "\t\t\t[-" + removed + "]" );
227                     }
228                     else if ( filename.length() < 16 ) {
229                         out.print( "\t\t[-" + removed + "]" );
230                     }
231                     else {
232                         out.print( "\t[-" + removed + "]" );
233                     }
234                     if ( gene_tree.getNumberOfExternalNodes() < MIN_EXT_NODES ) {
235                         out.print( "\t<" + MIN_EXT_NODES + "en\n" );
236                         number_of_too_small_trees++;
237                     }
238                     else if ( gene_tree.getNumberOfExternalNodes() > MAX_EXT_NODES ) {
239                         out.print( "\t>" + MAX_EXT_NODES + "en\n" );
240                         number_of_too_large_trees++;
241                     }
242                     else {
243                         SDIR sdiunrooted = null;
244                         // PhylogenyNode.setNodeCount( nodecount0 );
245                         sdiunrooted = new SDIR();
246                         if ( minimize_mapping_cost || minimize_sum_of_dup || minimize_height ) {
247                             trees = sdiunrooted.infer( gene_tree,
248                                                        species_tree,
249                                                        minimize_mapping_cost,
250                                                        minimize_sum_of_dup,
251                                                        minimize_height,
252                                                        write_trees,
253                                                        1 );
254                             dups = sdiunrooted.getMinimalDuplications();
255                         }
256                         else {
257                             final SDIse sdi = new SDIse( gene_tree, species_tree );
258                             trees = new Phylogeny[ 1 ];
259                             trees[ 0 ] = gene_tree;
260                             dups = sdi.getDuplicationsSum();
261                             c = sdi.computeMappingCostL();
262                             sum_costs += c;
263                             out.print( "\t L=" + c );
264                         }
265                         successful++;
266                         ext_nodes = gene_tree.getNumberOfExternalNodes();
267                         total_number_of_ext_nodes += ext_nodes;
268                         sizes[ ext_nodes ]++;
269                         out.print( "\t " + ext_nodes + "en" );
270                         total_number_of_d += dups;
271                         duplications[ dups ]++;
272                         out.print( "\t " + dups + "d" );
273                         if ( minimize_mapping_cost ) {
274                             c = sdiunrooted.getMinimalMappingCost();
275                             sum_costs += c;
276                             out.print( "\t L=" + c );
277                         }
278                         if ( minimize_height ) {
279                             out.print( "\t h=" + df.format( sdiunrooted.getMinimalTreeHeight() ) );
280                             out.print( "\t d=" + df.format( sdiunrooted.getMinimalDiffInSubTreeHeights() ) );
281                             sum_tree_heights += sdiunrooted.getMinimalTreeHeight();
282                             sum_subtree_diff += sdiunrooted.getMinimalDiffInSubTreeHeights();
283                         }
284                         out.println();
285                         if ( write_trees ) {
286                             outtree = new File( outdir, new File( filenames[ i ] ).getName() );
287                             final PhylogenyWriter writer = new PhylogenyWriter();
288                             writer.toPhyloXML( outtree, trees[ 0 ], 1 );
289                         }
290                     }
291                 }
292             }
293         }
294         //PhylogenyNode.setNodeCount( nodecount0 );
295         if ( minimize_mapping_cost ) {
296             out.println( "\nRooted by minimizing mapping cost L." );
297             System.out.println( "\nRooted by minimizing mapping cost L." );
298             if ( minimize_height ) {
299                 out.println( "Selected tree(s) with minimal height out of resulting trees." );
300                 System.out.println( "Selected tree(s) with minimal height out of resulting trees." );
301             }
302         }
303         else if ( minimize_sum_of_dup ) {
304             out.println( "\nRooted by minimizing sum of duplications." );
305             System.out.println( "\nRooted by minimizing sum of duplications." );
306             if ( minimize_height ) {
307                 out.println( "Selected tree(s) with minimal height out of resulting trees." );
308                 System.out.println( "Selected tree(s) with minimal height out of resulting trees." );
309             }
310         }
311         else if ( minimize_height ) {
312             out.println( "\nRooted by minimizing tree height." );
313             System.out.println( "\nRooted by minimizing tree height." );
314         }
315         else {
316             out.println( "\nNo (re) rooting was performed." );
317             System.out.println( "\nNo (re) rooting was performed." );
318         }
319         out.println( "\nTrees directory  : " + indir );
320         out.println( "Suffix for trees : " + suffix );
321         out.println( "Species tree     : " + species_tree_file );
322         out.println( "Output directory : " + outdir );
323         out.println( "Output file      : " + outfile );
324         out.println( "\nTotal number of attempts (tree files read)       : " + j );
325         out.println( "Total number of successfully assigned trees      : " + successful );
326         out.println( "Number of too small trees                        : " + number_of_too_small_trees );
327         out.println( "Number of too large trees                        : " + number_of_too_large_trees );
328         out.println( "\nSum of duplications                                   : " + total_number_of_d );
329         if ( minimize_mapping_cost ) {
330             out.println( "Sum of mapping costs L                                : " + sum_costs );
331         }
332         if ( minimize_height ) {
333             out.println( "Sum of tree heights                                   : " + sum_tree_heights );
334             out.println( "Sum of differences in subtree heights                 : " + sum_subtree_diff );
335         }
336         out.println( "Sum of external nodes (in successfully assigned trees): " + total_number_of_ext_nodes );
337         out.close();
338         System.out.println( "\nTotal number of attempts (tree files read)       : " + j );
339         System.out.println( "Total number of successfully assigned trees      : " + successful );
340         System.out.println( "Number of too small trees                        : " + number_of_too_small_trees );
341         System.out.println( "Number of too large trees                        : " + number_of_too_large_trees );
342         System.out.println( "\nSum of duplications                                   : " + total_number_of_d );
343         if ( minimize_mapping_cost ) {
344             System.out.println( "Sum of mapping costs L                                : " + sum_costs );
345         }
346         if ( minimize_height ) {
347             System.out.println( "Sum of tree heights                                   : " + sum_tree_heights );
348             System.out.println( "Sum of differences in subtree heights                 : " + sum_subtree_diff );
349         }
350         System.out.println( "Sum of external nodes (in successfully assigned trees): " + total_number_of_ext_nodes );
351         out_ddist = new PrintWriter( new FileWriter( ddist_outfile ), true );
352         for( int i = 0; ( i < duplications.length ) && ( i <= MAX_DUP_DIST ); ++i ) {
353             out_ddist.println( i + " " + duplications[ i ] );
354         }
355         out_ddist.close();
356         out_sdist = new PrintWriter( new FileWriter( sdist_outfile ), true );
357         for( int i = 0; ( i < sizes.length ) && ( i <= MAX_EXT_NODE_DIST ); ++i ) {
358             out_sdist.println( i + " " + sizes[ i ] );
359         }
360         out_sdist.close();
361     } // infer
362
363     /**
364      * Main method for this class.
365      * <p>
366      * (Last modified: 04/26/10)
367      * 
368      * @param [args[0]
369      *            -l to root by minimizing mapping cost L]
370      * @param [args[0]
371      *            -d to root by minimizing sum of duplications]
372      * @param [args[0]
373      *            -w to write out trees into outdir]
374      * @param [args[0]
375      *            -h to root by minimizing tree height]
376      * @param [args[0]
377      *            -n input trees are in New Hampshire format instead of NHX --
378      *            or gene tree is in NHX, but species information in gene tree
379      *            is only present in the form of SWISS-PROT sequence names]
380      * @param args[0or1]
381      *            trees directory name
382      * @param args[1or2]
383      *            suffix for gene trees
384      * @param args[2or3]
385      *            speciestree file name
386      * @param args[3or4]
387      *            output directory name
388      * @param args[4or5]
389      *            output file name
390      */
391     public static void main( final String args[] ) {
392         ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW );
393         // These are the default values.
394         boolean minimize_mapping_cost = false;
395         boolean minimize_sum_of_dup = false;
396         boolean minimize_height = false;
397         boolean write_trees = false;
398         File indir = null;
399         File speciestree_file = null;
400         File outdir = null;
401         File outfile = null;
402         String suffix = null;
403         if ( args.length == 5 ) {
404             indir = new File( args[ 0 ] );
405             suffix = args[ 1 ];
406             speciestree_file = new File( args[ 2 ] );
407             outdir = new File( args[ 3 ] );
408             outfile = new File( args[ 4 ] );
409         }
410         else if ( args.length == 6 ) {
411             if ( args[ 0 ].startsWith( "-" ) ) {
412                 minimize_mapping_cost = false;
413                 minimize_sum_of_dup = false;
414                 minimize_height = false;
415                 write_trees = false;
416                 if ( args[ 0 ].toLowerCase().indexOf( "w" ) != -1 ) {
417                     write_trees = true;
418                 }
419                 if ( args[ 0 ].toLowerCase().indexOf( "l" ) != -1 ) {
420                     minimize_mapping_cost = true;
421                 }
422                 if ( args[ 0 ].toLowerCase().indexOf( "d" ) != -1 ) {
423                     minimize_sum_of_dup = true;
424                 }
425                 if ( args[ 0 ].toLowerCase().indexOf( "h" ) != -1 ) {
426                     minimize_height = true;
427                 }
428             }
429             else {
430                 sdi_dir.errorInCommandLine();
431             }
432             indir = new File( args[ 1 ] );
433             suffix = args[ 2 ];
434             speciestree_file = new File( args[ 3 ] );
435             outdir = new File( args[ 4 ] );
436             outfile = new File( args[ 5 ] );
437         }
438         else {
439             sdi_dir.errorInCommandLine();
440         }
441         if ( minimize_mapping_cost && minimize_sum_of_dup ) {
442             minimize_sum_of_dup = false;
443         }
444         try {
445             sdi_dir.infer( indir,
446                            speciestree_file,
447                            outdir,
448                            outfile,
449                            suffix,
450                            write_trees,
451                            minimize_mapping_cost,
452                            minimize_sum_of_dup,
453                            minimize_height );
454         }
455         catch ( final Exception e ) {
456             ForesterUtil.fatalError( PRG_NAME, "error: " + e.getLocalizedMessage() );
457         }
458         ForesterUtil.programMessage( PRG_NAME, "OK." );
459     }
460 }