analyze pplacer output...
[jalview.git] / forester / java / src / org / forester / application / gsdi.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // All rights reserved
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
25
26 package org.forester.application;
27
28 import java.io.File;
29 import java.io.FilenameFilter;
30 import java.io.IOException;
31 import java.text.SimpleDateFormat;
32 import java.util.ArrayList;
33 import java.util.Arrays;
34 import java.util.Date;
35 import java.util.List;
36 import java.util.SortedMap;
37 import java.util.SortedSet;
38 import java.util.TreeMap;
39 import java.util.TreeSet;
40
41 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
42 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
43 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
44 import org.forester.io.writers.PhylogenyWriter;
45 import org.forester.phylogeny.Phylogeny;
46 import org.forester.phylogeny.PhylogenyMethods;
47 import org.forester.phylogeny.PhylogenyNode;
48 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
49 import org.forester.phylogeny.factories.PhylogenyFactory;
50 import org.forester.sdi.GSDI;
51 import org.forester.sdi.GSDII;
52 import org.forester.sdi.GSDIR;
53 import org.forester.sdi.SDIException;
54 import org.forester.sdi.SDIutil;
55 import org.forester.sdi.SDIutil.ALGORITHM;
56 import org.forester.util.CommandLineArguments;
57 import org.forester.util.EasyWriter;
58 import org.forester.util.ForesterConstants;
59 import org.forester.util.ForesterUtil;
60
61 public final class gsdi {
62
63     final static public boolean REPLACE_UNDERSCORES_IN_NH_SPECIES_TREE = true;
64     final static private String ALLOW_STRIPPING_OF_GENE_TREE_OPTION    = "g";
65     final static private String GSDIR_OPTION                           = "r";
66     final static private String MOST_PARSIMONIOUS_OPTION               = "m";
67     final static private String SUFFIX_FOR_DIR_OPTION                  = "s";
68     final static private String GUESS_FORMAT_OF_SPECIES_TREE           = "q";
69     final static private String TRANSFER_TAXONOMY_OPTION               = "t";
70     final static private String HELP_OPTION_1                          = "help";
71     final static private String HELP_OPTION_2                          = "h";
72     final static private String SUFFIX_FOR_SPECIES_TREE_USED           = "_species_tree_used.xml";
73     final static private String OUTTREE_SUFFIX                         = "_gsdir.xml";
74     final static private String LOGFILE_NAME                           = "00_gsdi_log.tsv";
75     final static private String LOGFILE_SUFFIX                         = "_gsdi_log.txt";
76     final static private String REMAPPED_SUFFIX                        = "_gsdi_remapped.txt";
77     final static private String PRG_NAME                               = "gsdi";
78     final static private String PRG_VERSION                            = "1.100";
79     final static private String PRG_DATE                               = "170403";
80     final static private String PRG_DESC                               = "general speciation duplication inference";
81     final static private String E_MAIL                                 = "phyloxml@gmail.com";
82     final static private String WWW                                    = "https://sites.google.com/site/cmzmasek/home/software/forester";
83
84     public static void main( final String args[] ) {
85         try {
86             ForesterUtil.printProgramInformation( PRG_NAME,
87                                                   PRG_DESC,
88                                                   PRG_VERSION,
89                                                   PRG_DATE,
90                                                   E_MAIL,
91                                                   WWW,
92                                                   ForesterUtil.getForesterLibraryInformation() );
93             CommandLineArguments cla = null;
94             try {
95                 cla = new CommandLineArguments( args );
96             }
97             catch ( final Exception e ) {
98                 ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
99             }
100             if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) ) {
101                 System.out.println();
102                 gsdi.print_help();
103                 System.exit( 0 );
104             }
105             else if ( ( args.length < 2 ) || ( cla.getNumberOfNames() != 3 ) ) {
106                 System.out.println();
107                 System.out.println( "Wrong number of arguments." );
108                 System.out.println();
109                 gsdi.print_help();
110                 System.exit( -1 );
111             }
112             final List<String> allowed_options = new ArrayList<String>();
113             allowed_options.add( GSDIR_OPTION );
114             allowed_options.add( GUESS_FORMAT_OF_SPECIES_TREE );
115             allowed_options.add( MOST_PARSIMONIOUS_OPTION );
116             allowed_options.add( ALLOW_STRIPPING_OF_GENE_TREE_OPTION );
117             allowed_options.add( TRANSFER_TAXONOMY_OPTION );
118             allowed_options.add( SUFFIX_FOR_DIR_OPTION );
119             final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
120             if ( dissallowed_options.length() > 0 ) {
121                 ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options );
122             }
123             execute( cla );
124         }
125         catch ( final IOException e ) {
126             ForesterUtil.fatalError( gsdi.PRG_NAME, e.getMessage() );
127         }
128     }
129
130     private final static void execute( final CommandLineArguments cla ) throws IOException {
131         ALGORITHM base_algorithm = ALGORITHM.GSDI;
132         boolean most_parsimonous_duplication_model = false;
133         boolean allow_stripping_of_gene_tree = false;
134         if ( cla.isOptionSet( GSDIR_OPTION ) ) {
135             base_algorithm = ALGORITHM.GSDIR;
136         }
137         if ( cla.isOptionSet( MOST_PARSIMONIOUS_OPTION ) ) {
138             if ( base_algorithm == ALGORITHM.SDI ) {
139                 ForesterUtil.fatalError( PRG_NAME, "Cannot use most parsimonious duplication mode with SDI" );
140             }
141             most_parsimonous_duplication_model = true;
142         }
143         if ( cla.isOptionSet( ALLOW_STRIPPING_OF_GENE_TREE_OPTION ) ) {
144             if ( base_algorithm == ALGORITHM.SDI ) {
145                 ForesterUtil.fatalError( PRG_NAME, "Cannot allow stripping of gene tree with SDI" );
146             }
147             allow_stripping_of_gene_tree = true;
148         }
149         boolean transfer_taxonomy = false;
150         if ( cla.isOptionSet( TRANSFER_TAXONOMY_OPTION ) ) {
151             transfer_taxonomy = true;
152         }
153         boolean use_gene_tree_dir = false;
154         final String gene_tree_suffix;
155         if ( cla.isOptionSet( SUFFIX_FOR_DIR_OPTION ) ) {
156             gene_tree_suffix = cla.getOptionValue( SUFFIX_FOR_DIR_OPTION );
157             use_gene_tree_dir = true;
158         }
159         else {
160             gene_tree_suffix = null;
161         }
162         File gene_tree_file = null;
163         File species_tree_file = null;
164         File out_file = null;
165         File log_file = null;
166         File out_dir = null;
167         try {
168             gene_tree_file = cla.getFile( 0 );
169             species_tree_file = cla.getFile( 1 );
170             if ( use_gene_tree_dir ) {
171                 out_dir = cla.getFile( 2 );
172                 if ( out_dir.exists() ) {
173                     if ( !out_dir.isDirectory() ) {
174                         ForesterUtil
175                                 .fatalError( gsdi.PRG_NAME,
176                                              "out-directory [" + out_dir + "] already exists but is not a directory" );
177                     }
178                 }
179                 else {
180                     final boolean success = out_dir.mkdirs();
181                     if ( !success ) {
182                         ForesterUtil.fatalError( gsdi.PRG_NAME, "could not create out-directory [" + out_dir + "]" );
183                     }
184                 }
185             }
186             else {
187                 out_file = cla.getFile( 2 );
188                 log_file = new File( ForesterUtil.removeSuffix( out_file.toString() ) + LOGFILE_SUFFIX );
189             }
190         }
191         catch ( final IllegalArgumentException e ) {
192             ForesterUtil.fatalError( PRG_NAME, "error in command line: " + e.getMessage() );
193         }
194         if ( use_gene_tree_dir ) {
195             final File indir = new File( gene_tree_file.toString() );
196             if ( !indir.exists() ) {
197                 ForesterUtil.fatalError( gsdi.PRG_NAME, "in-directory [" + indir + "] does not exist" );
198             }
199             if ( !indir.isDirectory() ) {
200                 ForesterUtil.fatalError( gsdi.PRG_NAME, "in-directory [" + indir + "] is not a directory" );
201             }
202             final String species_tree_file_name = species_tree_file.getName();
203             final File gene_tree_files[] = indir.listFiles( new FilenameFilter() {
204
205                 @Override
206                 public boolean accept( final File dir, final String name ) {
207                     return ( ( name.endsWith( gene_tree_suffix ) ) && !( name.equals( species_tree_file_name ) ) );
208                 }
209             } );
210             if ( gene_tree_files.length < 1 ) {
211                 ForesterUtil.fatalError( gsdi.PRG_NAME,
212                                          "in-directory [" + indir
213                                                  + "] does not contain any gene tree files with suffix "
214                                                  + gene_tree_suffix );
215             }
216             executeDir( base_algorithm,
217                         most_parsimonous_duplication_model,
218                         allow_stripping_of_gene_tree,
219                         transfer_taxonomy,
220                         gene_tree_files,
221                         species_tree_file,
222                         out_dir );
223         }
224         else {
225             execute( base_algorithm,
226                      most_parsimonous_duplication_model,
227                      allow_stripping_of_gene_tree,
228                      transfer_taxonomy,
229                      gene_tree_file,
230                      species_tree_file,
231                      out_file,
232                      log_file );
233         }
234     }
235
236     private final static void executeDir( final ALGORITHM base_algorithm,
237                                           final boolean most_parsimonous_duplication_model,
238                                           final boolean allow_stripping_of_gene_tree,
239                                           final boolean transfer_taxonomy,
240                                           final File gene_tree_files[],
241                                           final File species_tree_file,
242                                           final File outdir )
243             throws IOException {
244         final File log_file = new File( outdir, LOGFILE_NAME );
245         if ( ForesterUtil.isWritableFile( log_file ) != null ) {
246             ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isWritableFile( log_file ) );
247         }
248         EasyWriter log_writer = null;
249         try {
250             log_writer = ForesterUtil.createEasyWriter( log_file );
251         }
252         catch ( final IOException e ) {
253             ForesterUtil.fatalError( gsdi.PRG_NAME, "Failed to create [" + log_file + "]: " + e.getMessage() );
254         }
255         log_writer.println( "# " + PRG_NAME );
256         log_writer.println( "# Version\t" + PRG_VERSION );
257         log_writer.println( "# Date\t" + PRG_DATE );
258         log_writer.println( "# Forester version\t" + ForesterConstants.FORESTER_VERSION );
259         log_writer.println( "# Species tree\t" + species_tree_file.getCanonicalPath() );
260         if ( base_algorithm == ALGORITHM.GSDI ) {
261             log_writer.println( "# Algorithm\tGSDI" );
262         }
263         else if ( base_algorithm == ALGORITHM.GSDIR ) {
264             log_writer.println( "# Algorithm\tGSDIR" );
265         }
266         log_writer.println( "# Use most parsimonous duplication model\t" + most_parsimonous_duplication_model );
267         log_writer.println( "# Allow stripping of gene tree nodes\t" + allow_stripping_of_gene_tree );
268         log_writer.println( "# Start time\t" + new SimpleDateFormat( "yyyyMMdd HH:mm:ss" ).format( new Date() ) );
269         log_writer.println();
270         log_writer.print( "Gene-tree file\t" );
271         log_writer.print( "Gene-tree name/#\t" );
272         log_writer.print( "Ext. nodes\t" );
273         log_writer.print( "Speciations\t" );
274         log_writer.print( "Duplications\t" );
275         if ( !most_parsimonous_duplication_model ) {
276             log_writer.print( "Spec. or Dup.\t" );
277         }
278         if ( allow_stripping_of_gene_tree ) {
279             log_writer.print( "Stripped gene-tree ext. nodes\t" );
280         }
281         log_writer.print( "Taxonomy mapping" );
282         log_writer.println();
283         int counter = 0;
284         Arrays.sort( gene_tree_files );
285         for( final File gene_tree_file : gene_tree_files ) {
286             String outname = gene_tree_file.getName();
287             if ( outname.indexOf( "." ) > 0 ) {
288                 outname = outname.substring( 0, outname.lastIndexOf( "." ) );
289             }
290             outname = outname + OUTTREE_SUFFIX;
291             counter += executeOneTreeInDir( base_algorithm,
292                                             most_parsimonous_duplication_model,
293                                             allow_stripping_of_gene_tree,
294                                             transfer_taxonomy,
295                                             gene_tree_file,
296                                             species_tree_file,
297                                             new File( outdir, outname ),
298                                             log_writer );
299             log_writer.flush();
300             System.out.print( "\r" + counter );
301         }
302         System.out.print( "\r" );
303         log_writer.close();
304         System.out.println( "Analyzed " + counter + " gene trees" );
305         System.out.println();
306         System.out.println( "Wrote log to: " + log_file.getCanonicalPath() );
307         System.out.println();
308     }
309
310     private final static int executeOneTreeInDir( final ALGORITHM base_algorithm,
311                                                   final boolean most_parsimonous_duplication_model,
312                                                   final boolean allow_stripping_of_gene_tree,
313                                                   final boolean transfer_taxonomy,
314                                                   final File gene_tree_file,
315                                                   final File species_tree_file,
316                                                   final File out_file,
317                                                   final EasyWriter log_writer )
318             throws IOException {
319         if ( ForesterUtil.isReadableFile( gene_tree_file ) != null ) {
320             ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isReadableFile( gene_tree_file ) );
321         }
322         if ( ForesterUtil.isReadableFile( species_tree_file ) != null ) {
323             ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isReadableFile( species_tree_file ) );
324         }
325         if ( ForesterUtil.isWritableFile( out_file ) != null ) {
326             ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isWritableFile( out_file ) );
327         }
328         Phylogeny gene_trees[] = null;
329         try {
330             final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
331             gene_trees = factory.create( gene_tree_file, PhyloXmlParser.createPhyloXmlParserXsdValidating() );
332         }
333         catch ( final IOException e ) {
334             fatalError( "error",
335                         "failed to read gene tree from [" + gene_tree_file + "]: " + e.getMessage(),
336                         log_writer );
337         }
338         int counter = 0;
339         final List<Phylogeny> out_trees = new ArrayList<Phylogeny>();
340         for( final Phylogeny gene_tree : gene_trees ) {
341             if ( !gene_tree.isEmpty() && gene_tree.getNumberOfExternalNodes() > 1 ) {
342                 Phylogeny species_tree = null;
343                 try {
344                     species_tree = SDIutil.parseSpeciesTree( gene_tree,
345                                                              species_tree_file,
346                                                              REPLACE_UNDERSCORES_IN_NH_SPECIES_TREE,
347                                                              true,
348                                                              TAXONOMY_EXTRACTION.NO );
349                 }
350                 catch ( final PhyloXmlDataFormatException e ) {
351                     fatalError( "user error",
352                                 "failed to transfer general node name, in [" + species_tree_file + "]: "
353                                         + e.getMessage(),
354                                 log_writer );
355                 }
356                 catch ( final SDIException e ) {
357                     fatalError( "user error", e.getMessage(), log_writer );
358                 }
359                 catch ( final IOException e ) {
360                     fatalError( "error",
361                                 "Failed to read species tree from [" + species_tree_file + "]: " + e.getMessage(),
362                                 log_writer );
363                 }
364                 gene_tree.setRooted( true );
365                 species_tree.setRooted( true );
366                 if ( !gene_tree.isCompletelyBinary() ) {
367                     fatalError( "user error",
368                                 "gene tree [" + gene_tree_file + "] is not completely binary",
369                                 log_writer );
370                 }
371                 if ( base_algorithm == ALGORITHM.SDI ) {
372                     if ( !species_tree.isCompletelyBinary() ) {
373                         fatalError( "user error",
374                                     "species tree is not completely binary, use GSDI or GSDIR instead",
375                                     log_writer );
376                     }
377                 }
378                 log_writer.print( gene_tree_file.getName() );
379                 log_writer.print( "\t" );
380                 log_writer.print( ( ForesterUtil.isEmpty( gene_tree.getName() ) ? "" : gene_tree.getName() ) );
381                 if ( gene_trees.length > 1 ) {
382                     log_writer.print( ( ForesterUtil.isEmpty( gene_tree.getName() ) ? Integer.toString( counter )
383                             : ( ":" + Integer.toString( counter ) ) ) );
384                 }
385                 log_writer.print( "\t" );
386                 GSDII gsdii = null;
387                 try {
388                     if ( base_algorithm == ALGORITHM.GSDI ) {
389                         gsdii = new GSDI( gene_tree,
390                                           species_tree,
391                                           most_parsimonous_duplication_model,
392                                           allow_stripping_of_gene_tree,
393                                           true,
394                                           transfer_taxonomy );
395                     }
396                     else if ( base_algorithm == ALGORITHM.GSDIR ) {
397                         gsdii = new GSDIR( gene_tree,
398                                            species_tree,
399                                            allow_stripping_of_gene_tree,
400                                            true,
401                                            transfer_taxonomy );
402                     }
403                 }
404                 catch ( final SDIException e ) {
405                     fatalError( "user error", e.getLocalizedMessage(), log_writer );
406                 }
407                 catch ( final OutOfMemoryError e ) {
408                     ForesterUtil.outOfMemoryError( e );
409                 }
410                 catch ( final Exception e ) {
411                     e.printStackTrace();
412                     fatalError( "unexpected error", e.toString(), log_writer );
413                 }
414                 if ( base_algorithm == ALGORITHM.GSDIR ) {
415                     final Phylogeny gt = ( ( GSDIR ) gsdii ).getMinDuplicationsSumGeneTree();
416                     gt.setRerootable( false );
417                     out_trees.add( gt );
418                 }
419                 else {
420                     gene_tree.setRerootable( false );
421                     out_trees.add( gene_tree );
422                 }
423                 log_writer.print( gene_tree.getNumberOfExternalNodes() + "\t" );
424                 log_writer.print( gsdii.getSpeciationsSum() + "\t" );
425                 if ( ( base_algorithm == ALGORITHM.GSDIR ) ) {
426                     final GSDIR gsdir = ( GSDIR ) gsdii;
427                     log_writer.print( gsdir.getMinDuplicationsSum() + "\t" );
428                 }
429                 else if ( ( base_algorithm == ALGORITHM.GSDI ) ) {
430                     final GSDI gsdi = ( GSDI ) gsdii;
431                     log_writer.print( gsdi.getDuplicationsSum() + "\t" );
432                     if ( !most_parsimonous_duplication_model ) {
433                         log_writer.print( gsdi.getSpeciationOrDuplicationEventsSum() + "\t" );
434                     }
435                 }
436                 if ( allow_stripping_of_gene_tree ) {
437                     log_writer.print( gsdii.getStrippedExternalGeneTreeNodes().size() + "\t" );
438                 }
439                 log_writer.print( gsdii.getTaxCompBase().toString() );
440                 log_writer.println();
441                 ++counter;
442             }
443         }
444         if ( counter > 0 ) {
445             try {
446                 final PhylogenyWriter writer = new PhylogenyWriter();
447                 writer.toPhyloXML( out_file, out_trees, 0, ForesterUtil.LINE_SEPARATOR );
448             }
449             catch ( final IOException e ) {
450                 ForesterUtil
451                         .fatalError( PRG_NAME,
452                                      "Failed to write to [" + out_file.getCanonicalPath() + "]: " + e.getMessage() );
453             }
454         }
455         return counter;
456     }
457
458     private final static void execute( final ALGORITHM base_algorithm,
459                                        final boolean most_parsimonous_duplication_model,
460                                        final boolean allow_stripping_of_gene_tree,
461                                        final boolean transfer_taxonomy,
462                                        final File gene_tree_file,
463                                        final File species_tree_file,
464                                        final File out_file,
465                                        final File log_file )
466             throws IOException {
467         if ( ForesterUtil.isReadableFile( gene_tree_file ) != null ) {
468             ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isReadableFile( gene_tree_file ) );
469         }
470         if ( ForesterUtil.isReadableFile( species_tree_file ) != null ) {
471             ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isReadableFile( species_tree_file ) );
472         }
473         if ( ForesterUtil.isWritableFile( out_file ) != null ) {
474             ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isWritableFile( out_file ) );
475         }
476         if ( ForesterUtil.isWritableFile( log_file ) != null ) {
477             ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isWritableFile( log_file ) );
478         }
479         EasyWriter log_writer = null;
480         try {
481             log_writer = ForesterUtil.createEasyWriter( log_file );
482         }
483         catch ( final IOException e ) {
484             ForesterUtil.fatalError( gsdi.PRG_NAME, "Failed to create [" + log_file + "]: " + e.getMessage() );
485         }
486         Phylogeny species_tree = null;
487         Phylogeny gene_tree = null;
488         try {
489             final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
490             gene_tree = factory.create( gene_tree_file, PhyloXmlParser.createPhyloXmlParserXsdValidating() )[ 0 ];
491         }
492         catch ( final IOException e ) {
493             fatalError( "error",
494                         "failed to read gene tree from [" + gene_tree_file + "]: " + e.getMessage(),
495                         log_writer );
496         }
497         try {
498             species_tree = SDIutil.parseSpeciesTree( gene_tree,
499                                                      species_tree_file,
500                                                      REPLACE_UNDERSCORES_IN_NH_SPECIES_TREE,
501                                                      true,
502                                                      TAXONOMY_EXTRACTION.NO );
503         }
504         catch ( final PhyloXmlDataFormatException e ) {
505             fatalError( "user error",
506                         "failed to transfer general node name, in [" + species_tree_file + "]: " + e.getMessage(),
507                         log_writer );
508         }
509         catch ( final SDIException e ) {
510             fatalError( "user error", e.getMessage(), log_writer );
511         }
512         catch ( final IOException e ) {
513             fatalError( "error",
514                         "Failed to read species tree from [" + species_tree_file + "]: " + e.getMessage(),
515                         log_writer );
516         }
517         gene_tree.setRooted( true );
518         species_tree.setRooted( true );
519         if ( !gene_tree.isCompletelyBinary() ) {
520             fatalError( "user error", "gene tree [" + gene_tree_file + "] is not completely binary", log_writer );
521         }
522         if ( base_algorithm == ALGORITHM.SDI ) {
523             if ( !species_tree.isCompletelyBinary() ) {
524                 fatalError( "user error",
525                             "species tree is not completely binary, use GSDI or GSDIR instead",
526                             log_writer );
527             }
528         }
529         log_writer.println( PRG_NAME + " - " + PRG_DESC );
530         log_writer.println( "  version         : " + PRG_VERSION );
531         log_writer.println( "  date            : " + PRG_DATE );
532         log_writer.println( "  forester version: " + ForesterConstants.FORESTER_VERSION );
533         log_writer.println();
534         log_writer.println( "Start time                               : "
535                 + new SimpleDateFormat( "yyyyMMdd HH:mm:ss" ).format( new Date() ) );
536         System.out.println( "Start time                               : "
537                 + new SimpleDateFormat( "yyyyMMdd HH:mm:ss" ).format( new Date() ) );
538         log_writer.println( "Gene tree file                           : " + gene_tree_file.getCanonicalPath() );
539         System.out.println( "Gene tree file                           : " + gene_tree_file.getCanonicalPath() );
540         log_writer.println( "Gene tree name                           : "
541                 + ( ForesterUtil.isEmpty( gene_tree.getName() ) ? "" : gene_tree.getName() ) );
542         System.out.println( "Gene tree name                           : "
543                 + ( ForesterUtil.isEmpty( gene_tree.getName() ) ? "" : gene_tree.getName() ) );
544         log_writer.println( "Species tree file                        : " + species_tree_file.getCanonicalPath() );
545         System.out.println( "Species tree file                        : " + species_tree_file.getCanonicalPath() );
546         log_writer.println( "Species tree name                        : "
547                 + ( ForesterUtil.isEmpty( species_tree.getName() ) ? "" : gene_tree.getName() ) );
548         System.out.println( "Species tree name                        : "
549                 + ( ForesterUtil.isEmpty( species_tree.getName() ) ? "" : gene_tree.getName() ) );
550         System.out.println( "Transfer taxonomy                        : " + transfer_taxonomy );
551         GSDII gsdii = null;
552         final long start_time = new Date().getTime();
553         try {
554             if ( base_algorithm == ALGORITHM.GSDI ) {
555                 System.out.println( "Algorithm                                : GSDI" );
556                 log_writer.println( "Algorithm                                : GSDI" );
557             }
558             else if ( base_algorithm == ALGORITHM.GSDIR ) {
559                 System.out.println( "Algorithm                                : GSDIR" );
560                 log_writer.println( "Algorithm                                : GSDIR" );
561             }
562             System.out.println( "Use most parsimonous duplication model   : " + most_parsimonous_duplication_model );
563             System.out.println( "Allow stripping of gene tree nodes       : " + allow_stripping_of_gene_tree );
564             log_writer.println( "Use most parsimonous duplication model   : " + most_parsimonous_duplication_model );
565             log_writer.println( "Allow stripping of gene tree nodes       : " + allow_stripping_of_gene_tree );
566             log_writer.flush();
567             if ( base_algorithm == ALGORITHM.GSDI ) {
568                 gsdii = new GSDI( gene_tree,
569                                   species_tree,
570                                   most_parsimonous_duplication_model,
571                                   allow_stripping_of_gene_tree,
572                                   true,
573                                   transfer_taxonomy );
574             }
575             else if ( base_algorithm == ALGORITHM.GSDIR ) {
576                 gsdii = new GSDIR( gene_tree, species_tree, allow_stripping_of_gene_tree, true, transfer_taxonomy );
577             }
578         }
579         catch ( final SDIException e ) {
580             fatalError( "user error", e.getLocalizedMessage(), log_writer );
581         }
582         catch ( final IOException e ) {
583             fatalError( "error", e.toString(), log_writer );
584         }
585         catch ( final OutOfMemoryError e ) {
586             ForesterUtil.outOfMemoryError( e );
587         }
588         catch ( final Exception e ) {
589             e.printStackTrace();
590             fatalError( "unexpected error", e.toString(), log_writer );
591         }
592         System.out.println( "Running time (excluding I/O)             : " + ( new Date().getTime() - start_time )
593                 + "ms" );
594         log_writer.println( "Running time (excluding I/O)             : " + ( new Date().getTime() - start_time )
595                 + "ms" );
596         System.out.println( "Mapping based on                         : " + gsdii.getTaxCompBase() );
597         log_writer.println( "Mapping based on                         : " + gsdii.getTaxCompBase() );
598         try {
599             final PhylogenyWriter writer = new PhylogenyWriter();
600             if ( base_algorithm == ALGORITHM.GSDIR ) {
601                 final Phylogeny gt = ( ( GSDIR ) gsdii ).getMinDuplicationsSumGeneTree();
602                 gt.setRerootable( false );
603                 writer.toPhyloXML( out_file, gt, 0 );
604             }
605             else {
606                 gene_tree.setRerootable( false );
607                 writer.toPhyloXML( out_file, gene_tree, 0 );
608             }
609         }
610         catch ( final IOException e ) {
611             ForesterUtil.fatalError( PRG_NAME,
612                                      "Failed to write to [" + out_file.getCanonicalPath() + "]: " + e.getMessage() );
613         }
614         System.out.println( "Wrote resulting gene tree to             : " + out_file.getCanonicalPath() );
615         log_writer.println( "Wrote resulting gene tree to             : " + out_file.getCanonicalPath() );
616         final File species_tree_used_file = new File( ForesterUtil.removeSuffix( out_file.toString() )
617                 + SUFFIX_FOR_SPECIES_TREE_USED );
618         try {
619             final PhylogenyWriter writer = new PhylogenyWriter();
620             writer.toPhyloXML( species_tree_used_file, species_tree, 0 );
621         }
622         catch ( final IOException e ) {
623             ForesterUtil.fatalError( PRG_NAME,
624                                      "Failed to write to [" + species_tree_used_file.getCanonicalPath() + "]: "
625                                              + e.getMessage() );
626         }
627         System.out.println( "Wrote (stripped) species tree to         : " + species_tree_used_file.getCanonicalPath() );
628         log_writer.println( "Wrote (stripped) species tree to         : " + species_tree_used_file.getCanonicalPath() );
629         if ( ( gsdii.getReMappedScientificNamesFromGeneTree() != null )
630                 && !gsdii.getReMappedScientificNamesFromGeneTree().isEmpty() ) {
631             System.out.println( "Number of gene tree species remapped     : "
632                     + gsdii.getReMappedScientificNamesFromGeneTree().size() );
633             log_writer.println( "Number of gene tree species remapped     : "
634                     + gsdii.getReMappedScientificNamesFromGeneTree().size() );
635             writeToRemappedFile( out_file, gsdii.getReMappedScientificNamesFromGeneTree(), log_writer );
636         }
637         System.out.println( "Number of external nodes in gene tree    : " + gene_tree.getNumberOfExternalNodes() );
638         log_writer.println( "Number of external nodes in gene tree    : " + gene_tree.getNumberOfExternalNodes() );
639         System.out.println( "Number of external nodes in species tree : " + species_tree.getNumberOfExternalNodes() );
640         log_writer.println( "Number of external nodes in species tree : " + species_tree.getNumberOfExternalNodes() );
641         final int poly = PhylogenyMethods.countNumberOfPolytomies( species_tree );
642         System.out.println( "Number of polytomies in species tree     : " + poly );
643         log_writer.println( "Number of polytomies in species tree     : " + poly );
644         System.out.println( "External nodes stripped from gene tree   : "
645                 + gsdii.getStrippedExternalGeneTreeNodes().size() );
646         log_writer.println( "External nodes stripped from gene tree   : "
647                 + gsdii.getStrippedExternalGeneTreeNodes().size() );
648         System.out
649                 .println( "External nodes stripped from species tree: " + gsdii.getStrippedSpeciesTreeNodes().size() );
650         log_writer
651                 .println( "External nodes stripped from species tree: " + gsdii.getStrippedSpeciesTreeNodes().size() );
652         System.out.println();
653         System.out.println( "Number of speciations                    : " + gsdii.getSpeciationsSum() );
654         log_writer.println( "Number of speciations                    : " + gsdii.getSpeciationsSum() );
655         if ( ( base_algorithm == ALGORITHM.GSDIR ) ) {
656             final GSDIR gsdir = ( GSDIR ) gsdii;
657             System.out.println( "Minimal number of duplications           : " + gsdir.getMinDuplicationsSum() );
658             log_writer.println( "Minimal number of duplications           : " + gsdir.getMinDuplicationsSum() );
659         }
660         else if ( ( base_algorithm == ALGORITHM.GSDI ) ) {
661             final GSDI gsdi = ( GSDI ) gsdii;
662             System.out.println( "Number of duplications                   : " + gsdi.getDuplicationsSum() );
663             log_writer.println( "Number of duplications                   : " + gsdi.getDuplicationsSum() );
664             if ( !most_parsimonous_duplication_model ) {
665                 final int u = gsdi.getSpeciationOrDuplicationEventsSum();
666                 System.out.println( "Number of potential duplications         : " + u );
667                 log_writer.println( "Number of potential duplications         : " + u );
668             }
669         }
670         log_writer.println();
671         printMappedNodesToLog( log_writer, gsdii );
672         log_writer.println();
673         printStrippedGeneTreeNodesToLog( log_writer, gsdii );
674         System.out.println();
675         System.out.println( "Wrote log to                             : " + log_file.getCanonicalPath() );
676         System.out.println();
677         log_writer.close();
678     }
679
680     private final static void fatalError( final String type, final String msg, final EasyWriter log_writer ) {
681         try {
682             log_writer.flush();
683             log_writer.println();
684             log_writer.print( type.toUpperCase() + ": " );
685             log_writer.println( msg );
686             log_writer.close();
687         }
688         catch ( final IOException e ) {
689             e.printStackTrace();
690         }
691         ForesterUtil.fatalError( gsdi.PRG_NAME, msg );
692     }
693
694     private final static void print_help() {
695         System.out.println( "Usage: " + PRG_NAME
696                 + " [-options] <gene tree file, or gene trees in-directory> <species tree> <outfile, or out-directory>" );
697         System.out.println();
698         System.out.println( "Options:" );
699         System.out.println( " -" + ALLOW_STRIPPING_OF_GENE_TREE_OPTION
700                 + "         : to allow stripping of gene tree nodes without a matching species" );
701         System.out.println( " -" + MOST_PARSIMONIOUS_OPTION
702                 + "         : use most parimonious duplication model for GSDI: " );
703         System.out.println( "              assign nodes as speciations which would otherwise be assiged" );
704         System.out.println( "              as potential duplications due to polytomies in the species tree" );
705         System.out.println( " -" + GUESS_FORMAT_OF_SPECIES_TREE
706                 + "         : to allow species tree in other formats than phyloXML (i.e. Newick, NHX, Nexus)" );
707         System.out.println( " -" + GSDIR_OPTION
708                 + "         : to use GSDIR algorithm instead of GSDI algorithm (re-rooting)" );
709         System.out.println( " -" + TRANSFER_TAXONOMY_OPTION
710                 + "         : to transfer taxonomic data from species tree to gene tree" );
711         System.out.println( " -" + SUFFIX_FOR_DIR_OPTION
712                 + "=<suffix>: suffix for gene trees for analyzing entire directory of trees" );
713         System.out.println();
714         System.out.println();
715         System.out.println( "Gene tree(s):" );
716         System.out.println( " in phyloXM format, with taxonomy and sequence data in appropriate fields" );
717         System.out.println();
718         System.out.println( "Species tree:" );
719         System.out.println( " in phyloXML format (unless option -" + GUESS_FORMAT_OF_SPECIES_TREE + " is used)" );
720         System.out.println();
721         System.out.println( "Examples: gsdi -" + ALLOW_STRIPPING_OF_GENE_TREE_OPTION
722                 + " gene_tree.xml tree_of_life.xml out.xml" );
723         System.out.println( "          gsdi -" + ALLOW_STRIPPING_OF_GENE_TREE_OPTION + " -" + SUFFIX_FOR_DIR_OPTION
724                 + "=.xml" + " gene_tree_dir tree_of_life.xml out_dir" );
725         System.out.println( "          gsdi -" + ALLOW_STRIPPING_OF_GENE_TREE_OPTION + " -" + MOST_PARSIMONIOUS_OPTION
726                 + " -" + GSDIR_OPTION + " -" + TRANSFER_TAXONOMY_OPTION + " -" + SUFFIX_FOR_DIR_OPTION + "=.xml"
727                 + " gene_tree_dir tree_of_life.xml out_dir" );
728         System.out.println();
729     }
730
731     private final static void printMappedNodesToLog( final EasyWriter log_writer, final GSDII gsdi )
732             throws IOException {
733         final SortedSet<String> ss = new TreeSet<String>();
734         for( final PhylogenyNode n : gsdi.getMappedExternalSpeciesTreeNodes() ) {
735             ss.add( n.toString() );
736         }
737         log_writer.println( "The following " + ss.size() + " species were used: " );
738         for( final String s : ss ) {
739             log_writer.println( "  " + s );
740         }
741     }
742
743     private final static void printStrippedGeneTreeNodesToLog( final EasyWriter log_writer, final GSDII gsdi )
744             throws IOException {
745         final SortedMap<String, Integer> sm = new TreeMap<String, Integer>();
746         for( final PhylogenyNode n : gsdi.getStrippedExternalGeneTreeNodes() ) {
747             final String s = n.toString();
748             if ( sm.containsKey( s ) ) {
749                 sm.put( s, sm.get( s ) + 1 );
750             }
751             else {
752                 sm.put( s, 1 );
753             }
754         }
755         log_writer.println( "The following " + sm.size() + " nodes were stripped from the gene tree: " );
756         for( final String s : sm.keySet() ) {
757             final int count = sm.get( s );
758             if ( count == 1 ) {
759                 log_writer.println( "  " + s );
760             }
761             else {
762                 log_writer.println( "  " + s + " [" + count + "]" );
763             }
764         }
765     }
766
767     private final static void writeToRemappedFile( final File out_file,
768                                                    final SortedSet<String> remapped,
769                                                    final EasyWriter log_writer )
770             throws IOException {
771         final File file = new File( ForesterUtil.removeSuffix( out_file.toString() ) + REMAPPED_SUFFIX );
772         final EasyWriter remapped_writer = ForesterUtil.createEasyWriter( file );
773         for( final String s : remapped ) {
774             remapped_writer.println( s );
775         }
776         remapped_writer.close();
777         System.out.println( "Wrote remapped gene tree species to      : " + file.getCanonicalPath() );
778         log_writer.println( "Wrote remapped gene tree species to      : " + file.getCanonicalPath() );
779     }
780 }