2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // Copyright (C) 2000-2001 Washington University School of Medicine
8 // and Howard Hughes Medical Institute
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
25 // Contact: phylosoft @ gmail . com
26 // WWW: www.phylosoft.org/forester
28 package org.forester.application;
31 import java.io.FileWriter;
32 import java.io.IOException;
33 import java.io.PrintWriter;
34 import java.util.Arrays;
36 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
37 import org.forester.io.writers.PhylogenyWriter;
38 import org.forester.phylogeny.Phylogeny;
39 import org.forester.phylogeny.PhylogenyMethods;
40 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
41 import org.forester.phylogeny.factories.PhylogenyFactory;
42 import org.forester.sdi.SDIException;
43 import org.forester.sdi.SDIR;
44 import org.forester.sdi.SDIse;
45 import org.forester.util.ForesterUtil;
48 * Allows to infer duplications - speciations on all (rooted or unrooted) gene
49 * trees in a directory by using method "infer" of class SDIunrooted. <p> The
50 * output of this is a (re)rooted tree with speciation - duplication assigned
51 * for each tree (in "gene tree directory" with suffix "suffix for gene trees"),
52 * as well as a summary list ("outputfile name"). <p> The summary list contains
53 * the following. The number in brackets indicates how many external nodes of
54 * the gene tree had to be removed since the associated species was not found in
55 * the species tree. "en" indicates the number of external nodes in the
56 * resulting (analyzed and returned) gene tree. "d" are the number of
57 * duplications, "L=" the mapping cost, "h=" the height, "d=" the minimal
58 * difference in tree heights (of the two subtrees at the root; this number is
59 * 0.0 for a midpoint rooted tree) of the resulting, analyzed and rooted gene
60 * tree(s). <p> The output file ending with "_Sdist" is a file which lists the
61 * distribution of trees sizes, "_Ddist" lists the distribution of the sums of
62 * duplications (up to a certain maximal size, set with final variables
63 * MAX_EXT_NODE_DIST and MAX_DUP_DIST).
67 * @author Christian M. Zmasek
69 public class sdi_dir {
71 final static private String E_MAIL = "czmasek@burnham.org";
72 final static private String WWW = "www.phylosoft.org";
73 final static private String PRG_NAME = "sdi_dir";
74 final static private String PRG_VERSION = "2.00";
75 final static private String PRG_DATE = "2010.04.26";
77 private static void errorInCommandLine() {
78 System.out.println( "\nsdi_dir: Error in command line.\n" );
79 System.out.print( "Usage: % sdi_dir [-options] <gene tree directory> <suffix for gene trees>" );
80 System.out.println( " <species tree file name> <output directory> <outputfile name>" );
81 System.out.println( "\nOptions:" );
82 System.out.println( " -l to root by minimizing the mapping cost L (and also the sum of duplications)" );
83 System.out.println( " -d to root by minimizing the sum of duplications" );
84 System.out.println( " -h to root by minimizing tree height (can be used together with -l or -d)" );
85 System.out.println( " -w to write assigned gene trees into output directory" );
86 System.out.println( "\nGene tree directory" );
87 System.out.println( " The directory from which to read phyloXML formatted gene trees which" );
88 System.out.println( " contain taxonomic information in appropriate sub-elements of taxonomy" );
89 System.out.println( " (see: www.phyloxml.org)." );
90 System.out.println( " The gene trees can either be rooted, in which case no rooting with -l, -d, or -h " );
91 System.out.println( " is necessary, or they can be unrooted, in which case rooting is mandatory." );
92 System.out.println( "\nSuffix for gene trees" );
93 System.out.println( " Suffix of the gene trees to analyze (e.g. \".phyloxml\")." );
94 System.out.println( "\nSpecies tree file" );
95 System.out.println( " In phyloXML format, taxonomic information in appropriate sub-elements of taxonomy." );
96 System.out.println( " (see: www.phyloxml.org)." );
97 System.out.println( "\nOutput directory" );
98 System.out.println( " The directory into which the assigned gene trees will be written." );
99 System.out.println( "\nOutputfile name" );
100 System.out.println( " File name for summary output files." );
101 System.out.println( "" );
106 * Runs method "infer" of class SDIunrooted on all gene trees in directory
109 * Trees are rooted by minimizing either the sum of duplications, the
110 * mapping cost L, or the tree height (or combinations thereof). One
111 * resulting tree for each (out of possibly many) is stored in outdir and a
112 * summary outfile is created. The distributions of the tree sizes (name of
113 * outfile + _Ddist) and the distributions of the sum of duplications per
114 * tree (name of outfile + _Sdist) are written out as well.
116 * If both minimize_sum_of_dup and minimize_mapping_cost are true, trees are
117 * rooted by minimizing by minimizing the mapping cost L.
119 * If minimize_sum_of_dup, minimize_mapping_cost, and minimize_height are
120 * false trees are assumed to be alreadty rooted.
122 * (Last modified: 02/02/01)
124 * @see SDIR#infer(Phylogeny,Phylogeny,boolean,boolean,boolean,boolean,int,boolean)
126 * a directory containing gene trees in NHX format
127 * @param species_tree_file
128 * a species tree file in NHX format
130 * a directory where to write trees
132 * a file name for the summary file
134 * a suffix for the trees to read (e.g. nhx), is case sensitive
136 * set to true to write out one tree with minmal duplications or
138 * @param minimize_mapping_cost
139 * set to true to root by minimizing the mapping cost L
140 * @param minimize_sum_of_dup
141 * set to true to root by minimizing the sum of duplications
142 * @param minimize_height
143 * set to true to root by minimizing the tree height -- if
144 * minimize_mapping_cost is set to true or minimize_sum_of_dup is
145 * set to true, then out of the resulting trees with minimal
146 * mapping cost or minimal number of duplications the tree with
147 * the minimal height is chosen
148 * @throws SDIException
150 public static void infer( final File indir,
151 final File species_tree_file,
155 final boolean write_trees,
156 final boolean minimize_mapping_cost,
157 boolean minimize_sum_of_dup,
158 final boolean minimize_height ) throws IOException, SDIException {
159 final int MIN_EXT_NODES = 4; // Minimal size of trees [in ext nodes]
161 final int MAX_EXT_NODES = 5000; // Maximal size of trees [in ext nodes]
163 final int MAX_DUP_DIST = 50; // Max number of dups to output in dup
164 // distribution ("_Ddist").
165 final int MAX_EXT_NODE_DIST = 1000; // Max number of ext nodes to output
167 // distribution ("_Sdist").
168 int successful = 0, number_of_too_small_trees = 0, number_of_too_large_trees = 0, dups = 0, c = 0, ext_nodes = 0, removed = 0;
169 final int nodecount0 = 0;
171 long total_number_of_d = 0, total_number_of_ext_nodes = 0, sum_costs = 0;
172 double sum_tree_heights = 0.0, sum_subtree_diff = 0.0;
173 Phylogeny species_tree = null;
174 String filename = null;
175 String[] filenames = null;
176 Phylogeny[] trees = null;
177 final int[] duplications = new int[ MAX_EXT_NODES - 1 ], // For dup
179 sizes = new int[ MAX_EXT_NODES - 1 ]; // For ext nodes dist.of
180 // successfully assigned trees.
182 PrintWriter out = null, out_ddist = null, out_sdist = null;
183 final File ddist_outfile = new File( outfile + "_Ddist" ), sdist_outfile = new File( outfile + "_Sdist" );
184 final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.0#####" );
185 df.setDecimalSeparatorAlwaysShown( true );
186 if ( !indir.exists() || !indir.isDirectory() ) {
187 throw new IllegalArgumentException( indir + " does not exist or is not a directory." );
189 if ( !outdir.exists() || !outdir.isDirectory() ) {
190 throw new IllegalArgumentException( outdir + " does not exist or is not a directory." );
192 if ( outfile.exists() ) {
193 throw new IllegalArgumentException( outfile + " does already exist." );
195 if ( ddist_outfile.exists() ) {
196 throw new IllegalArgumentException( ddist_outfile + " does already exist." );
198 if ( sdist_outfile.exists() ) {
199 throw new IllegalArgumentException( sdist_outfile + " does already exist." );
201 if ( !species_tree_file.exists() || !species_tree_file.isFile() ) {
202 throw new IllegalArgumentException( species_tree_file + " does not exist or is not a file." );
204 if ( minimize_mapping_cost && minimize_sum_of_dup ) {
205 minimize_sum_of_dup = false;
207 final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
208 species_tree = factory.create( species_tree_file, new PhyloXmlParser() )[ 0 ];
209 filenames = indir.list();
210 Arrays.sort( filenames );
211 suffix = suffix.trim();
212 out = new PrintWriter( new FileWriter( outfile ), true );
213 //nodecount0 = PhylogenyNode.getNodeCount();
214 for( int i = 0; i < filenames.length; ++i ) {
215 filename = filenames[ i ];
216 if ( ( suffix.length() < 1 ) || filename.endsWith( suffix ) ) {
217 final File gene_tree_file = new File( indir.getPath(), filename );
218 if ( gene_tree_file.exists() && gene_tree_file.isFile() ) {
219 out.print( j + "\t" + filename );
220 System.out.println( j + ": " + filename );
222 Phylogeny gene_tree = null;
223 gene_tree = factory.create( gene_tree_file, new PhyloXmlParser() )[ 0 ];
224 // Removes from gene_tree all species not found in
226 removed = PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gene_tree );
227 if ( filename.length() < 8 ) {
228 out.print( "\t\t\t[-" + removed + "]" );
230 else if ( filename.length() < 16 ) {
231 out.print( "\t\t[-" + removed + "]" );
234 out.print( "\t[-" + removed + "]" );
236 if ( gene_tree.getNumberOfExternalNodes() < MIN_EXT_NODES ) {
237 out.print( "\t<" + MIN_EXT_NODES + "en\n" );
238 number_of_too_small_trees++;
240 else if ( gene_tree.getNumberOfExternalNodes() > MAX_EXT_NODES ) {
241 out.print( "\t>" + MAX_EXT_NODES + "en\n" );
242 number_of_too_large_trees++;
245 SDIR sdiunrooted = null;
246 // PhylogenyNode.setNodeCount( nodecount0 );
247 sdiunrooted = new SDIR();
248 if ( minimize_mapping_cost || minimize_sum_of_dup || minimize_height ) {
249 trees = sdiunrooted.infer( gene_tree,
251 minimize_mapping_cost,
256 dups = sdiunrooted.getMinimalDuplications();
259 final SDIse sdi = new SDIse( gene_tree, species_tree );
260 trees = new Phylogeny[ 1 ];
261 trees[ 0 ] = gene_tree;
262 dups = sdi.getDuplicationsSum();
263 c = sdi.computeMappingCostL();
265 out.print( "\t L=" + c );
268 ext_nodes = gene_tree.getNumberOfExternalNodes();
269 total_number_of_ext_nodes += ext_nodes;
270 sizes[ ext_nodes ]++;
271 out.print( "\t " + ext_nodes + "en" );
272 total_number_of_d += dups;
273 duplications[ dups ]++;
274 out.print( "\t " + dups + "d" );
275 if ( minimize_mapping_cost ) {
276 c = sdiunrooted.getMinimalMappingCost();
278 out.print( "\t L=" + c );
280 if ( minimize_height ) {
281 out.print( "\t h=" + df.format( sdiunrooted.getMinimalTreeHeight() ) );
282 out.print( "\t d=" + df.format( sdiunrooted.getMinimalDiffInSubTreeHeights() ) );
283 sum_tree_heights += sdiunrooted.getMinimalTreeHeight();
284 sum_subtree_diff += sdiunrooted.getMinimalDiffInSubTreeHeights();
288 outtree = new File( outdir, new File( filenames[ i ] ).getName() );
289 final PhylogenyWriter writer = new PhylogenyWriter();
290 writer.toPhyloXML( outtree, trees[ 0 ], 1 );
296 //PhylogenyNode.setNodeCount( nodecount0 );
297 if ( minimize_mapping_cost ) {
298 out.println( "\nRooted by minimizing mapping cost L." );
299 System.out.println( "\nRooted by minimizing mapping cost L." );
300 if ( minimize_height ) {
301 out.println( "Selected tree(s) with minimal height out of resulting trees." );
302 System.out.println( "Selected tree(s) with minimal height out of resulting trees." );
305 else if ( minimize_sum_of_dup ) {
306 out.println( "\nRooted by minimizing sum of duplications." );
307 System.out.println( "\nRooted by minimizing sum of duplications." );
308 if ( minimize_height ) {
309 out.println( "Selected tree(s) with minimal height out of resulting trees." );
310 System.out.println( "Selected tree(s) with minimal height out of resulting trees." );
313 else if ( minimize_height ) {
314 out.println( "\nRooted by minimizing tree height." );
315 System.out.println( "\nRooted by minimizing tree height." );
318 out.println( "\nNo (re) rooting was performed." );
319 System.out.println( "\nNo (re) rooting was performed." );
321 out.println( "\nTrees directory : " + indir );
322 out.println( "Suffix for trees : " + suffix );
323 out.println( "Species tree : " + species_tree_file );
324 out.println( "Output directory : " + outdir );
325 out.println( "Output file : " + outfile );
326 out.println( "\nTotal number of attempts (tree files read) : " + j );
327 out.println( "Total number of successfully assigned trees : " + successful );
328 out.println( "Number of too small trees : " + number_of_too_small_trees );
329 out.println( "Number of too large trees : " + number_of_too_large_trees );
330 out.println( "\nSum of duplications : " + total_number_of_d );
331 if ( minimize_mapping_cost ) {
332 out.println( "Sum of mapping costs L : " + sum_costs );
334 if ( minimize_height ) {
335 out.println( "Sum of tree heights : " + sum_tree_heights );
336 out.println( "Sum of differences in subtree heights : " + sum_subtree_diff );
338 out.println( "Sum of external nodes (in successfully assigned trees): " + total_number_of_ext_nodes );
340 System.out.println( "\nTotal number of attempts (tree files read) : " + j );
341 System.out.println( "Total number of successfully assigned trees : " + successful );
342 System.out.println( "Number of too small trees : " + number_of_too_small_trees );
343 System.out.println( "Number of too large trees : " + number_of_too_large_trees );
344 System.out.println( "\nSum of duplications : " + total_number_of_d );
345 if ( minimize_mapping_cost ) {
346 System.out.println( "Sum of mapping costs L : " + sum_costs );
348 if ( minimize_height ) {
349 System.out.println( "Sum of tree heights : " + sum_tree_heights );
350 System.out.println( "Sum of differences in subtree heights : " + sum_subtree_diff );
352 System.out.println( "Sum of external nodes (in successfully assigned trees): " + total_number_of_ext_nodes );
353 out_ddist = new PrintWriter( new FileWriter( ddist_outfile ), true );
354 for( int i = 0; ( i < duplications.length ) && ( i <= MAX_DUP_DIST ); ++i ) {
355 out_ddist.println( i + " " + duplications[ i ] );
358 out_sdist = new PrintWriter( new FileWriter( sdist_outfile ), true );
359 for( int i = 0; ( i < sizes.length ) && ( i <= MAX_EXT_NODE_DIST ); ++i ) {
360 out_sdist.println( i + " " + sizes[ i ] );
366 * Main method for this class.
368 * (Last modified: 04/26/10)
371 * -l to root by minimizing mapping cost L]
373 * -d to root by minimizing sum of duplications]
375 * -w to write out trees into outdir]
377 * -h to root by minimizing tree height]
379 * -n input trees are in New Hampshire format instead of NHX --
380 * or gene tree is in NHX, but species information in gene tree
381 * is only present in the form of SWISS-PROT sequence names]
383 * trees directory name
385 * suffix for gene trees
387 * speciestree file name
389 * output directory name
393 public static void main( final String args[] ) {
394 ForesterUtil.printProgramInformation( PRG_NAME, PRG_VERSION, PRG_DATE, E_MAIL, WWW );
395 // These are the default values.
396 boolean minimize_mapping_cost = false;
397 boolean minimize_sum_of_dup = false;
398 boolean minimize_height = false;
399 boolean write_trees = false;
401 File speciestree_file = null;
404 String suffix = null;
405 if ( args.length == 5 ) {
406 indir = new File( args[ 0 ] );
408 speciestree_file = new File( args[ 2 ] );
409 outdir = new File( args[ 3 ] );
410 outfile = new File( args[ 4 ] );
412 else if ( args.length == 6 ) {
413 if ( args[ 0 ].startsWith( "-" ) ) {
414 minimize_mapping_cost = false;
415 minimize_sum_of_dup = false;
416 minimize_height = false;
418 if ( args[ 0 ].toLowerCase().indexOf( "w" ) != -1 ) {
421 if ( args[ 0 ].toLowerCase().indexOf( "l" ) != -1 ) {
422 minimize_mapping_cost = true;
424 if ( args[ 0 ].toLowerCase().indexOf( "d" ) != -1 ) {
425 minimize_sum_of_dup = true;
427 if ( args[ 0 ].toLowerCase().indexOf( "h" ) != -1 ) {
428 minimize_height = true;
432 sdi_dir.errorInCommandLine();
434 indir = new File( args[ 1 ] );
436 speciestree_file = new File( args[ 3 ] );
437 outdir = new File( args[ 4 ] );
438 outfile = new File( args[ 5 ] );
441 sdi_dir.errorInCommandLine();
443 if ( minimize_mapping_cost && minimize_sum_of_dup ) {
444 minimize_sum_of_dup = false;
447 sdi_dir.infer( indir,
453 minimize_mapping_cost,
457 catch ( final Exception e ) {
458 ForesterUtil.fatalError( PRG_NAME, "error: " + e.getLocalizedMessage() );
460 ForesterUtil.programMessage( PRG_NAME, "OK." );