in progress...
[jalview.git] / forester / java / src / org / forester / application / gsdi.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // All rights reserved
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
25
26 package org.forester.application;
27
28 import java.io.File;
29 import java.io.FilenameFilter;
30 import java.io.IOException;
31 import java.text.SimpleDateFormat;
32 import java.util.ArrayList;
33 import java.util.Date;
34 import java.util.List;
35 import java.util.SortedMap;
36 import java.util.SortedSet;
37 import java.util.TreeMap;
38 import java.util.TreeSet;
39
40 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
41 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
42 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
43 import org.forester.io.writers.PhylogenyWriter;
44 import org.forester.phylogeny.Phylogeny;
45 import org.forester.phylogeny.PhylogenyMethods;
46 import org.forester.phylogeny.PhylogenyNode;
47 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
48 import org.forester.phylogeny.factories.PhylogenyFactory;
49 import org.forester.sdi.GSDI;
50 import org.forester.sdi.GSDII;
51 import org.forester.sdi.GSDIR;
52 import org.forester.sdi.SDIException;
53 import org.forester.sdi.SDIutil;
54 import org.forester.sdi.SDIutil.ALGORITHM;
55 import org.forester.util.CommandLineArguments;
56 import org.forester.util.EasyWriter;
57 import org.forester.util.ForesterConstants;
58 import org.forester.util.ForesterUtil;
59
60 public final class gsdi {
61
62     final static public boolean REPLACE_UNDERSCORES_IN_NH_SPECIES_TREE = true;
63     final static private String ALLOW_STRIPPING_OF_GENE_TREE_OPTION    = "g";
64     final static private String GSDIR_OPTION                           = "r";
65     final static private String MOST_PARSIMONIOUS_OPTION               = "m";
66     final static private String SUFFIX_FOR_DIR_OPTION                  = "s";
67     final static private String GUESS_FORMAT_OF_SPECIES_TREE           = "q";
68     final static private String TRANSFER_TAXONOMY_OPTION               = "t";
69     final static private String HELP_OPTION_1                          = "help";
70     final static private String HELP_OPTION_2                          = "h";
71     final static private String SUFFIX_FOR_SPECIES_TREE_USED           = "_species_tree_used.xml";
72     final static private String OUTTREE_SUFFIX                         = "_gsdir.xml";
73     final static private String LOGFILE_NAME                           = "00_gsdi_log.txt";
74     final static private String REMAPPED_SUFFIX                        = "_gsdi_remapped.txt";
75     final static private String PRG_NAME                               = "gsdi";
76     final static private String PRG_VERSION                            = "1.100";
77     final static private String PRG_DATE                               = "170331";
78     final static private String PRG_DESC                               = "general speciation duplication inference";
79     final static private String E_MAIL                                 = "phyloxml@gmail.com";
80     final static private String WWW                                    = "https://sites.google.com/site/cmzmasek/home/software/forester";
81
82     public static void main( final String args[] ) {
83         try {
84             ForesterUtil.printProgramInformation( PRG_NAME,
85                                                   PRG_DESC,
86                                                   PRG_VERSION,
87                                                   PRG_DATE,
88                                                   E_MAIL,
89                                                   WWW,
90                                                   ForesterUtil.getForesterLibraryInformation() );
91             CommandLineArguments cla = null;
92             try {
93                 cla = new CommandLineArguments( args );
94             }
95             catch ( final Exception e ) {
96                 ForesterUtil.fatalError( PRG_NAME, e.getMessage() );
97             }
98             if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) ) {
99                 System.out.println();
100                 gsdi.print_help();
101                 System.exit( 0 );
102             }
103             else if ( ( args.length < 2 ) || ( cla.getNumberOfNames() != 3 ) ) {
104                 System.out.println();
105                 System.out.println( "Wrong number of arguments." );
106                 System.out.println();
107                 gsdi.print_help();
108                 System.exit( -1 );
109             }
110             final List<String> allowed_options = new ArrayList<String>();
111             allowed_options.add( GSDIR_OPTION );
112             allowed_options.add( GUESS_FORMAT_OF_SPECIES_TREE );
113             allowed_options.add( MOST_PARSIMONIOUS_OPTION );
114             allowed_options.add( ALLOW_STRIPPING_OF_GENE_TREE_OPTION );
115             allowed_options.add( TRANSFER_TAXONOMY_OPTION );
116             allowed_options.add( SUFFIX_FOR_DIR_OPTION );
117             final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
118             if ( dissallowed_options.length() > 0 ) {
119                 ForesterUtil.fatalError( PRG_NAME, "unknown option(s): " + dissallowed_options );
120             }
121             execute( cla );
122         }
123         catch ( final IOException e ) {
124             ForesterUtil.fatalError( gsdi.PRG_NAME, e.getMessage() );
125         }
126     }
127
128     private static void execute( final CommandLineArguments cla ) throws IOException {
129         ALGORITHM base_algorithm = ALGORITHM.GSDI;
130         boolean most_parsimonous_duplication_model = false;
131         boolean allow_stripping_of_gene_tree = false;
132         if ( cla.isOptionSet( GSDIR_OPTION ) ) {
133             base_algorithm = ALGORITHM.GSDIR;
134         }
135         if ( cla.isOptionSet( MOST_PARSIMONIOUS_OPTION ) ) {
136             if ( base_algorithm == ALGORITHM.SDI ) {
137                 ForesterUtil.fatalError( PRG_NAME, "Cannot use most parsimonious duplication mode with SDI" );
138             }
139             most_parsimonous_duplication_model = true;
140         }
141         if ( cla.isOptionSet( ALLOW_STRIPPING_OF_GENE_TREE_OPTION ) ) {
142             if ( base_algorithm == ALGORITHM.SDI ) {
143                 ForesterUtil.fatalError( PRG_NAME, "Cannot allow stripping of gene tree with SDI" );
144             }
145             allow_stripping_of_gene_tree = true;
146         }
147         boolean transfer_taxonomy = false;
148         if ( cla.isOptionSet( TRANSFER_TAXONOMY_OPTION ) ) {
149             transfer_taxonomy = true;
150         }
151         boolean use_gene_tree_dir = false;
152         final String gene_tree_suffix;
153         if ( cla.isOptionSet( SUFFIX_FOR_DIR_OPTION ) ) {
154             gene_tree_suffix = cla.getOptionValue( SUFFIX_FOR_DIR_OPTION );
155             use_gene_tree_dir = true;
156         }
157         else {
158             gene_tree_suffix = null;
159         }
160         File gene_tree_file = null;
161         File species_tree_file = null;
162         File out_file = null;
163         File log_file = null;
164         File out_dir = null;
165         try {
166             gene_tree_file = cla.getFile( 0 );
167             species_tree_file = cla.getFile( 1 );
168             if ( use_gene_tree_dir ) {
169                 out_dir = cla.getFile( 2 );
170                 if ( out_dir.exists() ) {
171                     if ( !out_dir.isDirectory() ) {
172                         ForesterUtil
173                                 .fatalError( gsdi.PRG_NAME,
174                                              "out-directory [" + out_dir + "] already exists but is not a directory" );
175                     }
176                 }
177                 else {
178                     final boolean success = out_dir.mkdirs();
179                     if ( !success ) {
180                         ForesterUtil.fatalError( gsdi.PRG_NAME, "could not create out-directory [" + out_dir + "]" );
181                     }
182                 }
183             }
184             else {
185                 out_file = cla.getFile( 2 );
186             }
187         }
188         catch ( final IllegalArgumentException e ) {
189             ForesterUtil.fatalError( PRG_NAME, "error in command line: " + e.getMessage() );
190         }
191         if ( use_gene_tree_dir ) {
192             final File indir = new File( gene_tree_file.toString() );
193             if ( !indir.exists() ) {
194                 ForesterUtil.fatalError( gsdi.PRG_NAME, "in-directory [" + indir + "] does not exist" );
195             }
196             if ( !indir.isDirectory() ) {
197                 ForesterUtil.fatalError( gsdi.PRG_NAME, "in-directory [" + indir + "] is not a directory" );
198             }
199             final String species_tree_file_name = species_tree_file.getName();
200             final File gene_tree_files[] = indir.listFiles( new FilenameFilter() {
201
202                 @Override
203                 public boolean accept( final File dir, final String name ) {
204                     return (( name.endsWith( gene_tree_suffix) ) && !(name.equals( species_tree_file_name)));
205                 }
206             } );
207             
208             if ( gene_tree_files.length < 1 ) {
209                 ForesterUtil.fatalError( gsdi.PRG_NAME,
210                                          "in-directory [" + indir + "] does not contain any gene tree files with suffix "
211                                                  + gene_tree_suffix );
212             }
213           
214             execute( base_algorithm,
215                      most_parsimonous_duplication_model,
216                      allow_stripping_of_gene_tree,
217                      transfer_taxonomy,
218                      gene_tree_files,
219                      species_tree_file,
220                      out_dir );
221         }
222         else {
223             execute( base_algorithm,
224                      most_parsimonous_duplication_model,
225                      allow_stripping_of_gene_tree,
226                      transfer_taxonomy,
227                      gene_tree_file,
228                      species_tree_file,
229                      out_file,
230                      log_file );
231         }
232     }
233
234     private static void execute( final ALGORITHM base_algorithm,
235                                  final boolean most_parsimonous_duplication_model,
236                                  final boolean allow_stripping_of_gene_tree,
237                                  final boolean transfer_taxonomy,
238                                  final File gene_tree_files[],
239                                  final File species_tree_file,
240                                  final File outdir )
241             throws IOException {
242         final File log_file = new File( outdir, LOGFILE_NAME );
243         if ( ForesterUtil.isWritableFile( log_file ) != null ) {
244             ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isWritableFile( log_file ) );
245         }
246         EasyWriter log_writer = null;
247         try {
248             log_writer = ForesterUtil.createEasyWriter( log_file );
249         }
250         catch ( final IOException e ) {
251             ForesterUtil.fatalError( gsdi.PRG_NAME, "Failed to create [" + log_file + "]: " + e.getMessage() );
252         }
253         log_writer.println( "# " + PRG_NAME );
254         log_writer.println( "# Version\t" + PRG_VERSION );
255         log_writer.println( "# Date\t" + PRG_DATE );
256         log_writer.println( "# Forester version\t" + ForesterConstants.FORESTER_VERSION );
257         log_writer.println( "# Species tree\t" + species_tree_file.getCanonicalPath() );
258         if ( base_algorithm == ALGORITHM.GSDI ) {
259             log_writer.println( "# Algorithm\tGSDI" );
260         }
261         else if ( base_algorithm == ALGORITHM.GSDIR ) {
262             log_writer.println( "# Algorithm\tGSDIR" );
263         }
264         log_writer.println( "# Use most parsimonous duplication model\t" + most_parsimonous_duplication_model );
265         log_writer.println( "# Allow stripping of gene tree nodes\t" + allow_stripping_of_gene_tree );
266         log_writer.println( "# Start time\t" + new SimpleDateFormat( "yyyyMMdd HH:mm:ss" ).format( new Date() ) );
267         int counter = 0;
268         for( final File gene_tree_file : gene_tree_files ) {
269            
270             String outname = gene_tree_file.getName();
271             if ( outname.indexOf( "." ) > 0 ) {
272                 outname = outname.substring( 0, outname.lastIndexOf( "." ) );
273             }
274             outname = outname + OUTTREE_SUFFIX;
275             execute( base_algorithm,
276                      most_parsimonous_duplication_model,
277                      allow_stripping_of_gene_tree,
278                      transfer_taxonomy,
279                      gene_tree_file,
280                      species_tree_file,
281                      new File( outdir, outname ),
282                      log_writer );
283             System.out.print( "\r" + counter++ );
284             
285         }
286         System.out.print( "\r" );
287         log_writer.close();
288         System.out.println("Analyzed " + counter + " gene trees");
289         System.out.println();
290         System.out.println( "Wrote log to: " + log_file.getCanonicalPath() );
291         System.out.println();
292     }
293
294     private static void execute( final ALGORITHM base_algorithm,
295                                  final boolean most_parsimonous_duplication_model,
296                                  final boolean allow_stripping_of_gene_tree,
297                                  final boolean transfer_taxonomy,
298                                  final File gene_tree_file,
299                                  final File species_tree_file,
300                                  final File out_file,
301                                  final EasyWriter log_writer )
302             throws IOException {
303         if ( ForesterUtil.isReadableFile( gene_tree_file ) != null ) {
304             ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isReadableFile( gene_tree_file ) );
305         }
306         if ( ForesterUtil.isReadableFile( species_tree_file ) != null ) {
307             ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isReadableFile( species_tree_file ) );
308         }
309         if ( ForesterUtil.isWritableFile( out_file ) != null ) {
310             ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isWritableFile( out_file ) );
311         }
312         Phylogeny species_tree = null;
313         Phylogeny gene_tree = null;
314         try {
315             final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
316             gene_tree = factory.create( gene_tree_file, PhyloXmlParser.createPhyloXmlParserXsdValidating() )[ 0 ];
317         }
318         catch ( final IOException e ) {
319             fatalError( "error",
320                         "failed to read gene tree from [" + gene_tree_file + "]: " + e.getMessage(),
321                         log_writer );
322         }
323         try {
324             species_tree = SDIutil.parseSpeciesTree( gene_tree,
325                                                      species_tree_file,
326                                                      REPLACE_UNDERSCORES_IN_NH_SPECIES_TREE,
327                                                      true,
328                                                      TAXONOMY_EXTRACTION.NO );
329         }
330         catch ( final PhyloXmlDataFormatException e ) {
331             fatalError( "user error",
332                         "failed to transfer general node name, in [" + species_tree_file + "]: " + e.getMessage(),
333                         log_writer );
334         }
335         catch ( final SDIException e ) {
336             fatalError( "user error", e.getMessage(), log_writer );
337         }
338         catch ( final IOException e ) {
339             fatalError( "error",
340                         "Failed to read species tree from [" + species_tree_file + "]: " + e.getMessage(),
341                         log_writer );
342         }
343         gene_tree.setRooted( true );
344         species_tree.setRooted( true );
345         if ( !gene_tree.isCompletelyBinary() ) {
346             fatalError( "user error", "gene tree [" + gene_tree_file + "] is not completely binary", log_writer );
347         }
348         if ( base_algorithm == ALGORITHM.SDI ) {
349             if ( !species_tree.isCompletelyBinary() ) {
350                 fatalError( "user error",
351                             "species tree is not completely binary, use GSDI or GSDIR instead",
352                             log_writer );
353             }
354         }
355         log_writer.print( gene_tree_file.getName() );
356         log_writer.print( "\t" );
357         log_writer.print( ( ForesterUtil.isEmpty( gene_tree.getName() ) ? "" : gene_tree.getName() ) );
358         log_writer.print( "\t" );
359         GSDII gsdii = null;
360         try {
361             log_writer.flush();
362             if ( base_algorithm == ALGORITHM.GSDI ) {
363                 gsdii = new GSDI( gene_tree,
364                                   species_tree,
365                                   most_parsimonous_duplication_model,
366                                   allow_stripping_of_gene_tree,
367                                   true,
368                                   transfer_taxonomy );
369             }
370             else if ( base_algorithm == ALGORITHM.GSDIR ) {
371                 gsdii = new GSDIR( gene_tree, species_tree, allow_stripping_of_gene_tree, true, transfer_taxonomy );
372             }
373         }
374         catch ( final SDIException e ) {
375             fatalError( "user error", e.getLocalizedMessage(), log_writer );
376         }
377         catch ( final IOException e ) {
378             fatalError( "error", e.toString(), log_writer );
379         }
380         catch ( final OutOfMemoryError e ) {
381             ForesterUtil.outOfMemoryError( e );
382         }
383         catch ( final Exception e ) {
384             e.printStackTrace();
385             fatalError( "unexpected error", e.toString(), log_writer );
386         }
387         try {
388             final PhylogenyWriter writer = new PhylogenyWriter();
389             if ( base_algorithm == ALGORITHM.GSDIR ) {
390                 writer.toPhyloXML( out_file, ( ( GSDIR ) gsdii ).getMinDuplicationsSumGeneTree(), 0 );
391             }
392             else {
393                 writer.toPhyloXML( out_file, gene_tree, 0 );
394             }
395         }
396         catch ( final IOException e ) {
397             ForesterUtil.fatalError( PRG_NAME,
398                                      "Failed to write to [" + out_file.getCanonicalPath() + "]: " + e.getMessage() );
399         }
400         log_writer.print( gene_tree.getNumberOfExternalNodes() + "\t" );
401         log_writer.print( gsdii.getSpeciationsSum() + "\t" );
402         if ( ( base_algorithm == ALGORITHM.GSDIR ) ) {
403             final GSDIR gsdir = ( GSDIR ) gsdii;
404             log_writer.print( gsdir.getMinDuplicationsSum() + "\t" );
405         }
406         else if ( ( base_algorithm == ALGORITHM.GSDI ) ) {
407             final GSDI gsdi = ( GSDI ) gsdii;
408             log_writer.print( gsdi.getDuplicationsSum() + "\t" );
409             if ( !most_parsimonous_duplication_model ) {
410                 log_writer.print( gsdi.getSpeciationOrDuplicationEventsSum() + "\t" );
411             }
412         }
413         log_writer.print( gsdii.getTaxCompBase() + "\t" );
414         log_writer.println();
415     }
416
417     private static void execute( final ALGORITHM base_algorithm,
418                                  final boolean most_parsimonous_duplication_model,
419                                  final boolean allow_stripping_of_gene_tree,
420                                  final boolean transfer_taxonomy,
421                                  final File gene_tree_file,
422                                  final File species_tree_file,
423                                  final File out_file,
424                                  final File log_file )
425             throws IOException {
426         if ( ForesterUtil.isReadableFile( gene_tree_file ) != null ) {
427             ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isReadableFile( gene_tree_file ) );
428         }
429         if ( ForesterUtil.isReadableFile( species_tree_file ) != null ) {
430             ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isReadableFile( species_tree_file ) );
431         }
432         if ( ForesterUtil.isWritableFile( out_file ) != null ) {
433             ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isWritableFile( out_file ) );
434         }
435         if ( ForesterUtil.isWritableFile( log_file ) != null ) {
436             ForesterUtil.fatalError( gsdi.PRG_NAME, ForesterUtil.isWritableFile( log_file ) );
437         }
438         EasyWriter log_writer = null;
439         try {
440             log_writer = ForesterUtil.createEasyWriter( log_file );
441         }
442         catch ( final IOException e ) {
443             ForesterUtil.fatalError( gsdi.PRG_NAME, "Failed to create [" + log_file + "]: " + e.getMessage() );
444         }
445         Phylogeny species_tree = null;
446         Phylogeny gene_tree = null;
447         try {
448             final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
449             gene_tree = factory.create( gene_tree_file, PhyloXmlParser.createPhyloXmlParserXsdValidating() )[ 0 ];
450         }
451         catch ( final IOException e ) {
452             fatalError( "error",
453                         "failed to read gene tree from [" + gene_tree_file + "]: " + e.getMessage(),
454                         log_writer );
455         }
456         try {
457             species_tree = SDIutil.parseSpeciesTree( gene_tree,
458                                                      species_tree_file,
459                                                      REPLACE_UNDERSCORES_IN_NH_SPECIES_TREE,
460                                                      true,
461                                                      TAXONOMY_EXTRACTION.NO );
462         }
463         catch ( final PhyloXmlDataFormatException e ) {
464             fatalError( "user error",
465                         "failed to transfer general node name, in [" + species_tree_file + "]: " + e.getMessage(),
466                         log_writer );
467         }
468         catch ( final SDIException e ) {
469             fatalError( "user error", e.getMessage(), log_writer );
470         }
471         catch ( final IOException e ) {
472             fatalError( "error",
473                         "Failed to read species tree from [" + species_tree_file + "]: " + e.getMessage(),
474                         log_writer );
475         }
476         gene_tree.setRooted( true );
477         species_tree.setRooted( true );
478         if ( !gene_tree.isCompletelyBinary() ) {
479             fatalError( "user error", "gene tree [" + gene_tree_file + "] is not completely binary", log_writer );
480         }
481         if ( base_algorithm == ALGORITHM.SDI ) {
482             if ( !species_tree.isCompletelyBinary() ) {
483                 fatalError( "user error",
484                             "species tree is not completely binary, use GSDI or GSDIR instead",
485                             log_writer );
486             }
487         }
488         log_writer.println( PRG_NAME + " - " + PRG_DESC );
489         log_writer.println( "  version         : " + PRG_VERSION );
490         log_writer.println( "  date            : " + PRG_DATE );
491         log_writer.println( "  forester version: " + ForesterConstants.FORESTER_VERSION );
492         log_writer.println();
493         log_writer.println( "Start time                               : "
494                 + new SimpleDateFormat( "yyyyMMdd HH:mm:ss" ).format( new Date() ) );
495         System.out.println( "Start time                               : "
496                 + new SimpleDateFormat( "yyyyMMdd HH:mm:ss" ).format( new Date() ) );
497         log_writer.println( "Gene tree file                           : " + gene_tree_file.getCanonicalPath() );
498         System.out.println( "Gene tree file                           : " + gene_tree_file.getCanonicalPath() );
499         log_writer.println( "Gene tree name                           : "
500                 + ( ForesterUtil.isEmpty( gene_tree.getName() ) ? "" : gene_tree.getName() ) );
501         System.out.println( "Gene tree name                           : "
502                 + ( ForesterUtil.isEmpty( gene_tree.getName() ) ? "" : gene_tree.getName() ) );
503         log_writer.println( "Species tree file                        : " + species_tree_file.getCanonicalPath() );
504         System.out.println( "Species tree file                        : " + species_tree_file.getCanonicalPath() );
505         log_writer.println( "Species tree name                        : "
506                 + ( ForesterUtil.isEmpty( species_tree.getName() ) ? "" : gene_tree.getName() ) );
507         System.out.println( "Species tree name                        : "
508                 + ( ForesterUtil.isEmpty( species_tree.getName() ) ? "" : gene_tree.getName() ) );
509         System.out.println( "Transfer taxonomy                        : " + transfer_taxonomy );
510         GSDII gsdii = null;
511         final long start_time = new Date().getTime();
512         try {
513             if ( base_algorithm == ALGORITHM.GSDI ) {
514                 System.out.println( "Algorithm                                : GSDI" );
515                 log_writer.println( "Algorithm                                : GSDI" );
516             }
517             else if ( base_algorithm == ALGORITHM.GSDIR ) {
518                 System.out.println( "Algorithm                                : GSDIR" );
519                 log_writer.println( "Algorithm                                : GSDIR" );
520             }
521             System.out.println( "Use most parsimonous duplication model   : " + most_parsimonous_duplication_model );
522             System.out.println( "Allow stripping of gene tree nodes       : " + allow_stripping_of_gene_tree );
523             log_writer.println( "Use most parsimonous duplication model   : " + most_parsimonous_duplication_model );
524             log_writer.println( "Allow stripping of gene tree nodes       : " + allow_stripping_of_gene_tree );
525             log_writer.flush();
526             if ( base_algorithm == ALGORITHM.GSDI ) {
527                 gsdii = new GSDI( gene_tree,
528                                   species_tree,
529                                   most_parsimonous_duplication_model,
530                                   allow_stripping_of_gene_tree,
531                                   true,
532                                   transfer_taxonomy );
533             }
534             else if ( base_algorithm == ALGORITHM.GSDIR ) {
535                 gsdii = new GSDIR( gene_tree, species_tree, allow_stripping_of_gene_tree, true, transfer_taxonomy );
536             }
537         }
538         catch ( final SDIException e ) {
539             fatalError( "user error", e.getLocalizedMessage(), log_writer );
540         }
541         catch ( final IOException e ) {
542             fatalError( "error", e.toString(), log_writer );
543         }
544         catch ( final OutOfMemoryError e ) {
545             ForesterUtil.outOfMemoryError( e );
546         }
547         catch ( final Exception e ) {
548             e.printStackTrace();
549             fatalError( "unexpected error", e.toString(), log_writer );
550         }
551         System.out.println( "Running time (excluding I/O)             : " + ( new Date().getTime() - start_time )
552                 + "ms" );
553         log_writer.println( "Running time (excluding I/O)             : " + ( new Date().getTime() - start_time )
554                 + "ms" );
555         System.out.println( "Mapping based on                         : " + gsdii.getTaxCompBase() );
556         log_writer.println( "Mapping based on                         : " + gsdii.getTaxCompBase() );
557         try {
558             final PhylogenyWriter writer = new PhylogenyWriter();
559             if ( base_algorithm == ALGORITHM.GSDIR ) {
560                 writer.toPhyloXML( out_file, ( ( GSDIR ) gsdii ).getMinDuplicationsSumGeneTree(), 0 );
561             }
562             else {
563                 writer.toPhyloXML( out_file, gene_tree, 0 );
564             }
565         }
566         catch ( final IOException e ) {
567             ForesterUtil.fatalError( PRG_NAME,
568                                      "Failed to write to [" + out_file.getCanonicalPath() + "]: " + e.getMessage() );
569         }
570         System.out.println( "Wrote resulting gene tree to             : " + out_file.getCanonicalPath() );
571         log_writer.println( "Wrote resulting gene tree to             : " + out_file.getCanonicalPath() );
572         final File species_tree_used_file = new File( ForesterUtil.removeSuffix( out_file.toString() )
573                 + SUFFIX_FOR_SPECIES_TREE_USED );
574         try {
575             final PhylogenyWriter writer = new PhylogenyWriter();
576             writer.toPhyloXML( species_tree_used_file, species_tree, 0 );
577         }
578         catch ( final IOException e ) {
579             ForesterUtil.fatalError( PRG_NAME,
580                                      "Failed to write to [" + species_tree_used_file.getCanonicalPath() + "]: "
581                                              + e.getMessage() );
582         }
583         System.out.println( "Wrote (stripped) species tree to         : " + species_tree_used_file.getCanonicalPath() );
584         log_writer.println( "Wrote (stripped) species tree to         : " + species_tree_used_file.getCanonicalPath() );
585         if ( ( gsdii.getReMappedScientificNamesFromGeneTree() != null )
586                 && !gsdii.getReMappedScientificNamesFromGeneTree().isEmpty() ) {
587             System.out.println( "Number of gene tree species remapped     : "
588                     + gsdii.getReMappedScientificNamesFromGeneTree().size() );
589             log_writer.println( "Number of gene tree species remapped     : "
590                     + gsdii.getReMappedScientificNamesFromGeneTree().size() );
591             writeToRemappedFile( out_file, gsdii.getReMappedScientificNamesFromGeneTree(), log_writer );
592         }
593         System.out.println( "Number of external nodes in gene tree    : " + gene_tree.getNumberOfExternalNodes() );
594         log_writer.println( "Number of external nodes in gene tree    : " + gene_tree.getNumberOfExternalNodes() );
595         System.out.println( "Number of external nodes in species tree : " + species_tree.getNumberOfExternalNodes() );
596         log_writer.println( "Number of external nodes in species tree : " + species_tree.getNumberOfExternalNodes() );
597         final int poly = PhylogenyMethods.countNumberOfPolytomies( species_tree );
598         System.out.println( "Number of polytomies in species tree     : " + poly );
599         log_writer.println( "Number of polytomies in species tree     : " + poly );
600         System.out.println( "External nodes stripped from gene tree   : "
601                 + gsdii.getStrippedExternalGeneTreeNodes().size() );
602         log_writer.println( "External nodes stripped from gene tree   : "
603                 + gsdii.getStrippedExternalGeneTreeNodes().size() );
604         System.out
605                 .println( "External nodes stripped from species tree: " + gsdii.getStrippedSpeciesTreeNodes().size() );
606         log_writer
607                 .println( "External nodes stripped from species tree: " + gsdii.getStrippedSpeciesTreeNodes().size() );
608         System.out.println();
609         System.out.println( "Number of speciations                    : " + gsdii.getSpeciationsSum() );
610         log_writer.println( "Number of speciations                    : " + gsdii.getSpeciationsSum() );
611         if ( ( base_algorithm == ALGORITHM.GSDIR ) ) {
612             final GSDIR gsdir = ( GSDIR ) gsdii;
613             System.out.println( "Minimal number of duplications           : " + gsdir.getMinDuplicationsSum() );
614             log_writer.println( "Minimal number of duplications           : " + gsdir.getMinDuplicationsSum() );
615         }
616         else if ( ( base_algorithm == ALGORITHM.GSDI ) ) {
617             final GSDI gsdi = ( GSDI ) gsdii;
618             System.out.println( "Number of duplications                   : " + gsdi.getDuplicationsSum() );
619             log_writer.println( "Number of duplications                   : " + gsdi.getDuplicationsSum() );
620             if ( !most_parsimonous_duplication_model ) {
621                 final int u = gsdi.getSpeciationOrDuplicationEventsSum();
622                 System.out.println( "Number of potential duplications         : " + u );
623                 log_writer.println( "Number of potential duplications         : " + u );
624             }
625         }
626         log_writer.println();
627         printMappedNodesToLog( log_writer, gsdii );
628         log_writer.println();
629         printStrippedGeneTreeNodesToLog( log_writer, gsdii );
630         System.out.println();
631         System.out.println( "Wrote log to                             : " + log_file.getCanonicalPath() );
632         System.out.println();
633         log_writer.close();
634     }
635
636     private static void fatalError( final String type, final String msg, final EasyWriter log_writer ) {
637         try {
638             log_writer.flush();
639             log_writer.println();
640             log_writer.print( type.toUpperCase() + ": " );
641             log_writer.println( msg );
642             log_writer.close();
643         }
644         catch ( final IOException e ) {
645             e.printStackTrace();
646         }
647         ForesterUtil.fatalError( gsdi.PRG_NAME, msg );
648     }
649
650     private static void print_help() {
651         System.out.println( "Usage: " + PRG_NAME
652                 + " [-options] <gene tree file, or gene trees in-directory> <species tree> <outfile, or out-directory>" );
653         System.out.println();
654         System.out.println( "Options:" );
655         System.out.println( " -" + ALLOW_STRIPPING_OF_GENE_TREE_OPTION
656                 + "         : to allow stripping of gene tree nodes without a matching species" );
657         System.out.println( " -" + MOST_PARSIMONIOUS_OPTION
658                 + "         : use most parimonious duplication model for GSDI: " );
659         System.out.println( "              assign nodes as speciations which would otherwise be assiged" );
660         System.out.println( "              as potential duplications due to polytomies in the species tree" );
661         System.out.println( " -" + GUESS_FORMAT_OF_SPECIES_TREE
662                 + "         : to allow species tree in other formats than phyloXML (i.e. Newick, NHX, Nexus)" );
663         System.out.println( " -" + GSDIR_OPTION
664                 + "         : to use GSDIR algorithm instead of GSDI algorithm (re-rooting)" );
665         System.out.println( " -" + TRANSFER_TAXONOMY_OPTION
666                 + "         : to transfer taxonomic data from species tree to gene tree" );
667         System.out.println( " -" + SUFFIX_FOR_DIR_OPTION
668                 + "=<suffix>: suffix for gene trees for analyzing entire directory of trees" );
669         System.out.println();
670         System.out.println();
671         System.out.println( "Gene tree(s):" );
672         System.out.println( " in phyloXM format, with taxonomy and sequence data in appropriate fields" );
673         System.out.println();
674         System.out.println( "Species tree:" );
675         System.out.println( " in phyloXML format (unless option -" + GUESS_FORMAT_OF_SPECIES_TREE + " is used)" );
676         System.out.println();
677         System.out.println( "Examples: gsdi -" + ALLOW_STRIPPING_OF_GENE_TREE_OPTION
678                 + " gene_tree.xml tree_of_life.xml out.xml" );
679         System.out.println( "          gsdi -" + SUFFIX_FOR_DIR_OPTION + " -" + SUFFIX_FOR_DIR_OPTION + "=.xml"
680                 + " gene_tree_dir tree_of_life.xml out_dir" );
681         System.out.println();
682     }
683
684     private static void printMappedNodesToLog( final EasyWriter log_writer, final GSDII gsdi ) throws IOException {
685         final SortedSet<String> ss = new TreeSet<String>();
686         for( final PhylogenyNode n : gsdi.getMappedExternalSpeciesTreeNodes() ) {
687             ss.add( n.toString() );
688         }
689         log_writer.println( "The following " + ss.size() + " species were used: " );
690         for( final String s : ss ) {
691             log_writer.println( "  " + s );
692         }
693     }
694
695     private static void printStrippedGeneTreeNodesToLog( final EasyWriter log_writer, final GSDII gsdi )
696             throws IOException {
697         final SortedMap<String, Integer> sm = new TreeMap<String, Integer>();
698         for( final PhylogenyNode n : gsdi.getStrippedExternalGeneTreeNodes() ) {
699             final String s = n.toString();
700             if ( sm.containsKey( s ) ) {
701                 sm.put( s, sm.get( s ) + 1 );
702             }
703             else {
704                 sm.put( s, 1 );
705             }
706         }
707         log_writer.println( "The following " + sm.size() + " nodes were stripped from the gene tree: " );
708         for( final String s : sm.keySet() ) {
709             final int count = sm.get( s );
710             if ( count == 1 ) {
711                 log_writer.println( "  " + s );
712             }
713             else {
714                 log_writer.println( "  " + s + " [" + count + "]" );
715             }
716         }
717     }
718
719     private static void writeToRemappedFile( final File out_file,
720                                              final SortedSet<String> remapped,
721                                              final EasyWriter log_writer )
722             throws IOException {
723         final File file = new File( ForesterUtil.removeSuffix( out_file.toString() ) + REMAPPED_SUFFIX );
724         final EasyWriter remapped_writer = ForesterUtil.createEasyWriter( file );
725         for( final String s : remapped ) {
726             remapped_writer.println( s );
727         }
728         remapped_writer.close();
729         System.out.println( "Wrote remapped gene tree species to      : " + file.getCanonicalPath() );
730         log_writer.println( "Wrote remapped gene tree species to      : " + file.getCanonicalPath() );
731     }
732 }