in progress...
[jalview.git] / forester / java / src / org / forester / application / rio.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // Copyright (C) 2000-2001 Washington University School of Medicine
8 // and Howard Hughes Medical Institute
9 // All rights reserved
10 //
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
15 //
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
20 //
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 //
25 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26
27 package org.forester.application;
28
29 import java.io.File;
30 import java.io.FilenameFilter;
31 import java.io.IOException;
32 import java.math.RoundingMode;
33 import java.util.ArrayList;
34 import java.util.Arrays;
35 import java.util.List;
36
37 import org.forester.datastructures.IntMatrix;
38 import org.forester.io.parsers.IteratingPhylogenyParser;
39 import org.forester.io.parsers.PhylogenyParser;
40 import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
41 import org.forester.io.parsers.nhx.NHXParser;
42 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
43 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
44 import org.forester.io.parsers.util.ParserUtils;
45 import org.forester.io.writers.PhylogenyWriter;
46 import org.forester.phylogeny.Phylogeny;
47 import org.forester.rio.RIO;
48 import org.forester.rio.RIO.REROOTING;
49 import org.forester.rio.RIOException;
50 import org.forester.sdi.SDIException;
51 import org.forester.sdi.SDIutil.ALGORITHM;
52 import org.forester.util.BasicDescriptiveStatistics;
53 import org.forester.util.CommandLineArguments;
54 import org.forester.util.EasyWriter;
55 import org.forester.util.ForesterUtil;
56
57 public class rio {
58
59     final static private String PRG_NAME                     = "rio";
60     final static private String PRG_VERSION                  = "4.000 beta 11";
61     final static private String PRG_DATE                     = "170410";
62     final static private String E_MAIL                       = "phyloxml@gmail.com";
63     final static private String WWW                          = "https://sites.google.com/site/cmzmasek/home/software/forester";
64     final static private String HELP_OPTION_1                = "help";
65     final static private String LOGFILE_SUFFIX               = "_RIO_log.tsv";
66     final static private String STRIPPED_SPECIES_TREE_SUFFIX = "_RIO_sst.xml";
67     final static private String ORTHO_OUTTABLE_SUFFIX        = "_RIO_o_table.tsv";
68     final static private String OUT_GENE_TREE_SUFFIX         = "_RIO_gene_tree.xml";
69     final static private String HELP_OPTION_2                = "h";
70     final static private String GT_FIRST                     = "f";
71     final static private String GT_LAST                      = "l";
72     final static private String REROOTING_OPT                = "r";
73     final static private String OUTGROUP                     = "o";
74     final static private String RETURN_SPECIES_TREE          = "s";
75     final static private String RETURN_BEST_GENE_TREE        = "g";
76     final static private String USE_SDIR                     = "b";
77     final static private String TRANSFER_TAXONOMY_OPTION     = "t";
78     final static private String GENE_TREES_SUFFIX_OPTION     = "u";
79
80     public static void main( final String[] args ) {
81         ForesterUtil.printProgramInformation( PRG_NAME,
82                                               "resampled inference of orthologs",
83                                               PRG_VERSION,
84                                               PRG_DATE,
85                                               E_MAIL,
86                                               WWW,
87                                               ForesterUtil.getForesterLibraryInformation() );
88         CommandLineArguments cla = null;
89         try {
90             cla = new CommandLineArguments( args );
91         }
92         catch ( final Exception e ) {
93             ForesterUtil.fatalError( e.getMessage() );
94         }
95         if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) || ( args.length == 0 ) ) {
96             printHelp();
97         }
98         if ( ( args.length < 3 ) || ( args.length > 11 ) || ( cla.getNumberOfNames() < 3 ) ) {
99             System.out.println();
100             System.out.println( "error: incorrect number of arguments" );
101             System.out.println();
102             printHelp();
103         }
104         final List<String> allowed_options = new ArrayList<String>();
105         allowed_options.add( GT_FIRST );
106         allowed_options.add( GT_LAST );
107         allowed_options.add( REROOTING_OPT );
108         allowed_options.add( OUTGROUP );
109         allowed_options.add( USE_SDIR );
110         allowed_options.add( RETURN_SPECIES_TREE );
111         allowed_options.add( RETURN_BEST_GENE_TREE );
112         allowed_options.add( TRANSFER_TAXONOMY_OPTION );
113         allowed_options.add( GENE_TREES_SUFFIX_OPTION );
114         final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
115         if ( dissallowed_options.length() > 0 ) {
116             ForesterUtil.fatalError( "unknown option(s): " + dissallowed_options );
117         }
118         final File gene_trees_file = cla.getFile( 0 );
119         final boolean use_dir;
120         File indir = null;
121         File outdir = null;
122         if ( gene_trees_file.isDirectory() ) {
123             if ( !gene_trees_file.exists() ) {
124                 ForesterUtil.fatalError( "gene trees directory \"" + gene_trees_file + "\" does not exist" );
125             }
126             use_dir = true;
127             indir = gene_trees_file;
128         }
129         else {
130             use_dir = false;
131         }
132         final File species_tree_file = cla.getFile( 1 );
133         File orthology_outtable = null;
134         if ( use_dir ) {
135             outdir = cla.getFile( 2 );
136         }
137         else {
138             orthology_outtable = cla.getFile( 2 );
139         }
140         File logfile;
141         if ( use_dir ) {
142             if ( ( cla.getNumberOfNames() < 4 ) ) {
143                 System.out.println();
144                 System.out.println( "error: incorrect number of arguments" );
145                 System.out.println();
146                 printHelp();
147             }
148             logfile = cla.getFile( 3 );
149             if ( logfile.exists() ) {
150                 ForesterUtil.fatalError( "\"" + logfile + "\" already exists" );
151             }
152         }
153         else {
154             if ( cla.getNumberOfNames() > 3 ) {
155                 logfile = cla.getFile( 3 );
156                 if ( logfile.exists() ) {
157                     ForesterUtil.fatalError( "\"" + logfile + "\" already exists" );
158                 }
159             }
160             else {
161                 logfile = null;
162             }
163         }
164         boolean sdir = false;
165         if ( cla.isOptionSet( USE_SDIR ) ) {
166             if ( cla.isOptionHasAValue( USE_SDIR ) ) {
167                 ForesterUtil.fatalError( "no value allowed for -" + USE_SDIR );
168             }
169             sdir = true;
170             if ( logfile != null ) {
171                 ForesterUtil.fatalError( "no logfile output for SDIR algorithm" );
172             }
173         }
174         String outgroup = null;
175         if ( cla.isOptionSet( OUTGROUP ) ) {
176             if ( sdir ) {
177                 ForesterUtil.fatalError( "no outgroup option for SDIR algorithm" );
178             }
179             if ( use_dir ) {
180                 ForesterUtil.fatalError( "no outgroup option for operating on gene trees directory" );
181             }
182             if ( !cla.isOptionHasAValue( OUTGROUP ) ) {
183                 ForesterUtil.fatalError( "no value for -" + OUTGROUP );
184             }
185             outgroup = cla.getOptionValueAsCleanString( OUTGROUP );
186         }
187         REROOTING rerooting = REROOTING.BY_ALGORITHM;
188         if ( cla.isOptionSet( REROOTING_OPT ) ) {
189             if ( !cla.isOptionHasAValue( REROOTING_OPT ) ) {
190                 ForesterUtil.fatalError( "no value for -" + REROOTING_OPT );
191             }
192             if ( sdir ) {
193                 ForesterUtil.fatalError( "no re-rooting option for SDIR algorithm" );
194             }
195             final String rerooting_str = cla.getOptionValueAsCleanString( REROOTING_OPT ).toLowerCase();
196             if ( rerooting_str.equals( "none" ) ) {
197                 rerooting = REROOTING.NONE;
198             }
199             else if ( rerooting_str.equals( "midpoint" ) ) {
200                 rerooting = REROOTING.MIDPOINT;
201             }
202             else if ( rerooting_str.equals( "outgroup" ) ) {
203                 if ( use_dir ) {
204                     ForesterUtil.fatalError( "no outgroup option for operating on gene trees directory" );
205                 }
206                 rerooting = REROOTING.OUTGROUP;
207             }
208             else {
209                 ForesterUtil
210                         .fatalError( "values for re-rooting are: 'none', 'midpoint', or 'outgroup' (minizming duplications is default)" );
211             }
212         }
213         if ( ForesterUtil.isEmpty( outgroup ) && ( rerooting == REROOTING.OUTGROUP ) ) {
214             ForesterUtil.fatalError( "selected re-rooting by outgroup, but outgroup not set" );
215         }
216         if ( !ForesterUtil.isEmpty( outgroup ) && ( rerooting != REROOTING.OUTGROUP ) ) {
217             ForesterUtil.fatalError( "outgroup set, but selected re-rooting by other approach" );
218         }
219         int gt_first = RIO.DEFAULT_RANGE;
220         int gt_last = RIO.DEFAULT_RANGE;
221         if ( cla.isOptionSet( GT_FIRST ) ) {
222             if ( !cla.isOptionHasAValue( GT_FIRST ) ) {
223                 ForesterUtil.fatalError( "no value for -" + GT_FIRST );
224             }
225             if ( sdir ) {
226                 ForesterUtil.fatalError( "no gene tree range option for SDIR algorithm" );
227             }
228             try {
229                 gt_first = cla.getOptionValueAsInt( GT_FIRST );
230             }
231             catch ( final IOException e ) {
232                 ForesterUtil.fatalError( "could not parse integer for -" + GT_FIRST + " option" );
233             }
234             if ( gt_first < 0 ) {
235                 ForesterUtil.fatalError( "attempt to set index of first tree to analyze to: " + gt_first );
236             }
237         }
238         if ( cla.isOptionSet( GT_LAST ) ) {
239             if ( !cla.isOptionHasAValue( GT_LAST ) ) {
240                 ForesterUtil.fatalError( "no value for -" + GT_LAST );
241             }
242             if ( sdir ) {
243                 ForesterUtil.fatalError( "no gene tree range option for SDIR algorithm" );
244             }
245             try {
246                 gt_last = cla.getOptionValueAsInt( GT_LAST );
247             }
248             catch ( final IOException e ) {
249                 ForesterUtil.fatalError( "could not parse integer for -" + GT_LAST + " option" );
250             }
251             if ( gt_last < 0 ) {
252                 ForesterUtil.fatalError( "attempt to set index of last tree to analyze to: " + gt_last );
253             }
254         }
255         if ( ( ( gt_last != RIO.DEFAULT_RANGE ) && ( gt_first != RIO.DEFAULT_RANGE ) ) && ( ( gt_last < gt_first ) ) ) {
256             ForesterUtil.fatalError( "attempt to set range (0-based) of gene to analyze to: from " + gt_first + " to "
257                     + gt_last );
258         }
259         File return_species_tree = null;
260         if ( !sdir && cla.isOptionSet( RETURN_SPECIES_TREE ) ) {
261             if ( use_dir ) {
262                 ForesterUtil.fatalError( "no return species tree option when operating on gene trees directory" );
263             }
264             if ( !cla.isOptionHasAValue( RETURN_SPECIES_TREE ) ) {
265                 ForesterUtil.fatalError( "no value for -" + RETURN_SPECIES_TREE );
266             }
267             final String s = cla.getOptionValueAsCleanString( RETURN_SPECIES_TREE );
268             return_species_tree = new File( s );
269             if ( return_species_tree.exists() ) {
270                 ForesterUtil.fatalError( "\"" + return_species_tree + "\" already exists" );
271             }
272         }
273         File return_gene_tree = null;
274         if ( !sdir && cla.isOptionSet( RETURN_BEST_GENE_TREE ) ) {
275             if ( use_dir ) {
276                 ForesterUtil.fatalError( "no best gene tree return option when operating on gene trees directory" );
277             }
278             if ( !cla.isOptionHasAValue( RETURN_BEST_GENE_TREE ) ) {
279                 ForesterUtil.fatalError( "no value for -" + RETURN_BEST_GENE_TREE );
280             }
281             final String s = cla.getOptionValueAsCleanString( RETURN_BEST_GENE_TREE );
282             return_gene_tree = new File( s );
283             if ( return_gene_tree.exists() ) {
284                 ForesterUtil.fatalError( "\"" + return_gene_tree + "\" already exists" );
285             }
286         }
287         boolean transfer_taxonomy = false;
288         if ( !sdir && cla.isOptionSet( TRANSFER_TAXONOMY_OPTION ) ) {
289             if ( use_dir ) {
290                 ForesterUtil.fatalError( "no transferring taxonomy option when operating on gene trees directory" );
291             }
292             if ( return_gene_tree == null ) {
293                 ForesterUtil.fatalError( "no point in transferring taxonomy data without returning best gene tree" );
294             }
295             transfer_taxonomy = true;
296         }
297         if ( !use_dir ) {
298             ForesterUtil.fatalErrorIfFileNotReadable( gene_trees_file );
299         }
300         else {
301             transfer_taxonomy = true;
302         }
303         final String gene_trees_suffix;
304         if ( cla.isOptionSet( GENE_TREES_SUFFIX_OPTION ) ) {
305             if ( !use_dir ) {
306                 ForesterUtil.fatalError( "no gene tree suffix option when operating on indivual gene trees" );
307             }
308             if ( !cla.isOptionHasAValue( GENE_TREES_SUFFIX_OPTION ) ) {
309                 ForesterUtil.fatalError( "no value for -" + GENE_TREES_SUFFIX_OPTION );
310             }
311             gene_trees_suffix = cla.getOptionValueAsCleanString( GENE_TREES_SUFFIX_OPTION );
312         }
313         else {
314             gene_trees_suffix = ".mlt";
315         }
316         ForesterUtil.fatalErrorIfFileNotReadable( species_tree_file );
317         if ( !use_dir && orthology_outtable.exists() ) {
318             ForesterUtil.fatalError( "\"" + orthology_outtable + "\" already exists" );
319         }
320         long time = 0;
321         try {
322             if ( use_dir ) {
323                 System.out.println( "Gene trees in-dir                   :\t" + indir.getCanonicalPath() );
324                 System.out.println( "Gene trees suffix                   :\t" + gene_trees_suffix );
325             }
326             else {
327                 System.out.println( "Gene trees                          :\t" + gene_trees_file.getCanonicalPath() );
328             }
329             System.out.println( "Species tree                        :\t" + species_tree_file.getCanonicalPath() );
330         }
331         catch ( final IOException e ) {
332             ForesterUtil.fatalError( e.getLocalizedMessage() );
333         }
334         if ( use_dir ) {
335             System.out.println( "Out-dir                             :\t" + outdir );
336         }
337         else {
338             System.out.println( "All vs all orthology results table  :\t" + orthology_outtable );
339         }
340         if ( logfile != null ) {
341             System.out.println( "Logfile                             :\t" + logfile );
342         }
343         if ( gt_first != RIO.DEFAULT_RANGE ) {
344             System.out.println( "First gene tree to analyze          :\t" + gt_first );
345         }
346         if ( gt_last != RIO.DEFAULT_RANGE ) {
347             System.out.println( "Last gene tree to analyze           :\t" + gt_last );
348         }
349         String rerooting_str = "";
350         switch ( rerooting ) {
351             case BY_ALGORITHM: {
352                 rerooting_str = "by minimizing duplications";
353                 break;
354             }
355             case MIDPOINT: {
356                 rerooting_str = "by midpoint method";
357                 break;
358             }
359             case OUTGROUP: {
360                 rerooting_str = "by outgroup: " + outgroup;
361                 break;
362             }
363             case NONE: {
364                 rerooting_str = "none";
365                 break;
366             }
367         }
368         System.out.println( "Re-rooting                          : \t" + rerooting_str );
369         if ( !sdir ) {
370             System.out.println( "Non binary species tree             :\tallowed" );
371         }
372         else {
373             System.out.println( "Non binary species tree             :\tdisallowed" );
374         }
375         if ( return_species_tree != null ) {
376             System.out.println( "Write used species tree to          :\t" + return_species_tree );
377         }
378         if ( return_gene_tree != null ) {
379             System.out.println( "Write best gene tree to             :\t" + return_gene_tree );
380             System.out.println( "Transfer taxonomic data             :\t" + transfer_taxonomy );
381         }
382         time = System.currentTimeMillis();
383         final ALGORITHM algorithm;
384         if ( sdir ) {
385             algorithm = ALGORITHM.SDIR;
386         }
387         else {
388             algorithm = ALGORITHM.GSDIR;
389         }
390         EasyWriter log = null;
391         if ( use_dir ) {
392             if ( outdir.exists() ) {
393                 if ( !outdir.isDirectory() ) {
394                     ForesterUtil.fatalError( PRG_NAME,
395                                              "out-directory [" + outdir + "] already exists but is not a directory" );
396                 }
397             }
398             else {
399                 final boolean success = outdir.mkdirs();
400                 if ( !success ) {
401                     ForesterUtil.fatalError( PRG_NAME, "could not create out-directory [" + outdir + "]" );
402                 }
403             }
404             final String species_tree_file_name = species_tree_file.getName();
405             final File gene_trees_files[] = indir.listFiles( new FilenameFilter() {
406
407                 @Override
408                 public boolean accept( final File dir, final String name ) {
409                     return ( ( name.endsWith( gene_trees_suffix ) ) && !( name.equals( species_tree_file_name ) ) );
410                 }
411             } );
412             if ( gene_trees_files.length < 1 ) {
413                 ForesterUtil.fatalError( PRG_NAME,
414                                          "in-directory [" + indir
415                                                  + "] does not contain any gene tree files with suffix "
416                                                  + gene_trees_suffix );
417             }
418             try {
419                 log = ForesterUtil.createEasyWriter( logfile );
420             }
421             catch ( final IOException e ) {
422                 ForesterUtil.fatalError( PRG_NAME, "could not create [" + logfile + "]" );
423             }
424             Arrays.sort( gene_trees_files );
425             try {
426                 log.print( "# program" );
427                 log.print( "\t" );
428                 log.print( PRG_NAME );
429                 log.println();
430                 log.print( "# version" );
431                 log.print( "\t" );
432                 log.print( PRG_VERSION );
433                 log.println();
434                 log.print( "# date" );
435                 log.print( "\t" );
436                 log.print( PRG_DATE );
437                 log.println();
438                 log.print( "# Algorithm " );
439                 log.print( "\t" );
440                 log.print( algorithm.toString() );
441                 log.println();
442                 log.print( "# Gene trees in-dir" );
443                 log.print( "\t" );
444                 log.print( indir.getCanonicalPath() );
445                 log.println();
446                 log.print( "# Gene trees suffix" );
447                 log.print( "\t" );
448                 log.print( gene_trees_suffix );
449                 log.println();
450                 log.print( "# Species tree" );
451                 log.print( "\t" );
452                 log.print( species_tree_file.getCanonicalPath() );
453                 log.println();
454                 log.print( "# Out-dir" );
455                 log.print( "\t" );
456                 log.print( outdir.getCanonicalPath() );
457                 log.println();
458                 log.print( "# Logfile" );
459                 log.print( "\t" );
460                 log.print( logfile.getCanonicalPath() );
461                 log.println();
462                 if ( gt_first != RIO.DEFAULT_RANGE ) {
463                     log.print( "# First gene tree to analyze" );
464                     log.print( "\t" );
465                     log.print( Integer.toString( gt_first ) );
466                     log.println();
467                 }
468                 if ( gt_last != RIO.DEFAULT_RANGE ) {
469                     log.print( "# Last gene tree to analyze" );
470                     log.print( "\t" );
471                     log.print( Integer.toString( gt_last ) );
472                     log.println();
473                 }
474                 log.print( "# Re-rooting" );
475                 log.print( "\t" );
476                 log.print( rerooting_str );
477                 log.println();
478                 log.print( "# Non binary species tree" );
479                 log.print( "\t" );
480                 if ( !sdir ) {
481                     log.print( "allowed" );
482                 }
483                 else {
484                     log.print( "disallowed" );
485                 }
486                 log.println();
487                 log.println();
488                 log.print( "NAME" );
489                 log.print( "\t" );
490                 log.print( "EXT NODES" );
491                 log.print( "\t" );
492                 log.print( "MEAN DUP" );
493                 log.print( "\t" );
494                 log.print( "MEAN DUP SD" );
495                 log.print( "\t" );
496                 log.print( "MEDIAN DUP" );
497                 log.print( "\t" );
498                 log.print( "MIN DUP" );
499                 log.print( "\t" );
500                 log.print( "MAX DUP" );
501                 log.print( "\t" );
502                 log.print( "REMOVED EXT NODES" );
503                 log.print( "\t" );
504                 log.print( "N" );
505                 log.println();
506             }
507             catch ( IOException e ) {
508                 ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() );
509             }
510             int counter = 1;
511             for( final File gf : gene_trees_files ) {
512                 String outname = gf.getName();
513                 System.out
514                         .print( "\r                                                                                            " );
515                 System.out.print( "\r" + counter + "/" + gene_trees_files.length + ": " + outname );
516                 counter++;
517                 if ( outname.indexOf( "." ) > 0 ) {
518                     outname = outname.substring( 0, outname.lastIndexOf( "." ) );
519                 }
520                 try {
521                     executeAnalysis( gf,
522                                      species_tree_file,
523                                      new File( outdir.getCanonicalFile() + "/" + outname + ORTHO_OUTTABLE_SUFFIX ),
524                                      new File( outdir.getCanonicalFile() + "/" + outname + LOGFILE_SUFFIX ),
525                                      outgroup,
526                                      rerooting,
527                                      gt_first,
528                                      gt_last,
529                                      new File( outdir.getCanonicalFile() + "/" + outname
530                                              + STRIPPED_SPECIES_TREE_SUFFIX ),
531                                      new File( outdir.getCanonicalFile() + "/" + outname + OUT_GENE_TREE_SUFFIX ),
532                                      transfer_taxonomy,
533                                      algorithm,
534                                      true,
535                                      log );
536                 }
537                 catch ( IOException e ) {
538                     ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() );
539                 }
540             }
541             System.out
542                     .print( "\r                                                                                        " );
543             System.out.println();
544         }
545         else {
546             executeAnalysis( gene_trees_file,
547                              species_tree_file,
548                              orthology_outtable,
549                              logfile,
550                              outgroup,
551                              rerooting,
552                              gt_first,
553                              gt_last,
554                              return_species_tree,
555                              return_gene_tree,
556                              transfer_taxonomy,
557                              algorithm,
558                              false,
559                              null );
560         }
561         if ( !use_dir ) {
562             time = System.currentTimeMillis() - time;
563             System.out.println( "Time                                :\t" + time + "ms" );
564         }
565         else {
566             try {
567                 log.close();
568             }
569             catch ( IOException e ) {
570                 ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() );
571             }
572             time = System.currentTimeMillis() - time;
573             System.out.println( "Time                                :\t" + time + "ms" );
574         }
575         System.exit( 0 );
576     }
577
578     private static final void executeAnalysis( final File gene_trees_file,
579                                                final File species_tree_file,
580                                                final File orthology_outtable,
581                                                final File logfile,
582                                                final String outgroup,
583                                                final REROOTING rerooting,
584                                                final int gt_first,
585                                                final int gt_last,
586                                                final File return_species_tree,
587                                                final File return_gene_tree,
588                                                final boolean transfer_taxonomy,
589                                                final ALGORITHM algorithm,
590                                                final boolean use_gene_trees_dir,
591                                                final EasyWriter log ) {
592         try {
593             final RIO rio;
594             boolean iterating = false;
595             final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
596             if ( p instanceof PhyloXmlParser ) {
597                 rio = RIO.executeAnalysis( gene_trees_file,
598                                            species_tree_file,
599                                            algorithm,
600                                            rerooting,
601                                            outgroup,
602                                            gt_first,
603                                            gt_last,
604                                            logfile != null,
605                                            true,
606                                            transfer_taxonomy );
607             }
608             else {
609                 iterating = true;
610                 if ( p instanceof NHXParser ) {
611                     final NHXParser nhx = ( NHXParser ) p;
612                     nhx.setReplaceUnderscores( false );
613                     nhx.setIgnoreQuotes( true );
614                     nhx.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
615                 }
616                 else if ( p instanceof NexusPhylogeniesParser ) {
617                     final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p;
618                     nex.setReplaceUnderscores( false );
619                     nex.setIgnoreQuotes( true );
620                     nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
621                 }
622                 else {
623                     throw new RuntimeException( "unknown parser type: " + p );
624                 }
625                 final IteratingPhylogenyParser ip = ( IteratingPhylogenyParser ) p;
626                 ip.setSource( gene_trees_file );
627                 rio = RIO.executeAnalysis( ip,
628                                            species_tree_file,
629                                            algorithm,
630                                            rerooting,
631                                            outgroup,
632                                            gt_first,
633                                            gt_last,
634                                            logfile != null,
635                                            !use_gene_trees_dir,
636                                            transfer_taxonomy );
637             }
638             if ( !use_gene_trees_dir ) {
639                 if ( algorithm == ALGORITHM.GSDIR ) {
640                     System.out.println( "Taxonomy linking based on           :\t" + rio.getGSDIRtaxCompBase() );
641                 }
642             }
643             final IntMatrix m;
644             if ( iterating ) {
645                 m = rio.getOrthologTable();
646             }
647             else {
648                 m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true );
649             }
650             final BasicDescriptiveStatistics stats = rio.getDuplicationsStatistics();
651             writeTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir );
652             if ( ( algorithm != ALGORITHM.SDIR ) && ( logfile != null ) ) {
653                 writeLogFile( logfile,
654                               rio,
655                               species_tree_file,
656                               gene_trees_file,
657                               orthology_outtable,
658                               PRG_NAME,
659                               PRG_VERSION,
660                               PRG_DATE,
661                               ForesterUtil.getForesterLibraryInformation(),
662                               !use_gene_trees_dir );
663             }
664             if ( return_species_tree != null ) {
665                 writeTree( rio.getSpeciesTree(),
666                            return_species_tree,
667                            use_gene_trees_dir ? null : "Wrote (stripped) species tree to    :\t" );
668             }
669             if ( return_gene_tree != null ) {
670                 writeTree( rio.getMinDuplicationsGeneTree(),
671                            return_gene_tree,
672                            use_gene_trees_dir ? null : "Wrote one min duplication gene tree :\t" );
673             }
674             final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.##" );
675             final int min = ( int ) stats.getMin();
676             final int max = ( int ) stats.getMax();
677             final int median = ( int ) stats.median();
678             int min_count = 0;
679             int max_count = 0;
680             int median_count = 0;
681             for( double d : stats.getData() ) {
682                 if ( ( ( int ) d ) == min ) {
683                     ++min_count;
684                 }
685                 if ( ( ( int ) d ) == max ) {
686                     ++max_count;
687                 }
688                 if ( ( ( int ) d ) == median ) {
689                     ++median_count;
690                 }
691             }
692             final double min_count_percentage = ( 100.0 * min_count ) / stats.getN();
693             final double max_count_percentage = ( 100.0 * max_count ) / stats.getN();
694             final double median_count_percentage = ( 100.0 * median_count ) / stats.getN();
695             if ( use_gene_trees_dir ) {
696                 String name = gene_trees_file.getName();
697                 if ( name.indexOf( "." ) > 0 ) {
698                     name = name.substring( 0, name.lastIndexOf( "." ) );
699                 }
700                 log.print( name );
701                 log.print( "\t" );
702                 log.print( Integer.toString( rio.getExtNodesOfAnalyzedGeneTrees() ) );
703                 log.print( "\t" );
704                 log.print( df.format( stats.arithmeticMean() ) );
705                 log.print( "\t" );
706                 log.print( df.format( stats.sampleStandardDeviation() ) );
707                 log.print( "\t" );
708                 if ( stats.getN() > 3 ) {
709                     log.print( df.format( median ) );
710                 }
711                 else {
712                     log.print( "" );
713                 }
714                 log.print( "\t" );
715                 log.print( Integer.toString( min ) );
716                 log.print( "\t" );
717                 log.print( Integer.toString( max ) );
718                 log.print( "\t" );
719                 log.print( Integer.toString( rio.getRemovedGeneTreeNodes().size() ) );
720                 log.print( "\t" );
721                 log.print( Integer.toString( stats.getN() ) );
722                 log.println();
723             }
724             else {
725                 System.out.println( "Gene tree internal nodes            :\t" + rio.getIntNodesOfAnalyzedGeneTrees() );
726                 System.out.println( "Gene tree external nodes            :\t" + rio.getExtNodesOfAnalyzedGeneTrees() );
727                 System.out.println( "Mean number of duplications         :\t" + df.format( stats.arithmeticMean() )
728                         + "\t" + df.format( ( 100.0 * stats.arithmeticMean() ) / rio.getIntNodesOfAnalyzedGeneTrees() )
729                         + "%\t(sd: " + df.format( stats.sampleStandardDeviation() ) + ")" );
730                 if ( stats.getN() > 3 ) {
731                     System.out.println( "Median number of duplications       :\t" + df.format( median ) + "\t"
732                             + df.format( ( 100.0 * median ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
733                 }
734                 System.out.println( "Minimum duplications                :\t" + min + "\t"
735                         + df.format( ( 100.0 * min ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
736                 System.out.println( "Maximum duplications                :\t" + ( int ) max + "\t"
737                         + df.format( ( 100.0 * max ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
738                 System.out.println( "Gene trees with median duplications :\t" + median_count + "\t"
739                         + df.format( median_count_percentage ) + "%" );
740                 System.out.println( "Gene trees with minimum duplications:\t" + min_count + "\t"
741                         + df.format( min_count_percentage ) + "%" );
742                 System.out.println( "Gene trees with maximum duplications:\t" + max_count + "\t"
743                         + df.format( max_count_percentage ) + "%" );
744                 System.out.println( "Removed ext gene tree nodes:\t" + rio.getRemovedGeneTreeNodes().size() );
745             }
746         }
747         catch ( final RIOException e ) {
748             ForesterUtil.fatalError( e.getLocalizedMessage() );
749         }
750         catch ( final SDIException e ) {
751             ForesterUtil.fatalError( e.getLocalizedMessage() );
752         }
753         catch ( final IOException e ) {
754             ForesterUtil.fatalError( e.getLocalizedMessage() );
755         }
756         catch ( final OutOfMemoryError e ) {
757             ForesterUtil.outOfMemoryError( e );
758         }
759         catch ( final Exception e ) {
760             ForesterUtil.unexpectedFatalError( e );
761         }
762         catch ( final Error e ) {
763             ForesterUtil.unexpectedFatalError( e );
764         }
765     }
766
767     private final static void printHelp() {
768         System.out.println( "Usage" );
769         System.out.println();
770         System.out.println( PRG_NAME
771                 + " [options] <gene trees infile> <species tree infile> <all vs all orthology table outfile> [logfile]" );
772         System.out.println();
773         System.out.println( PRG_NAME + " [options] <gene trees indir> <species tree infile> <outdir> <logfile>" );
774         System.out.println();
775         System.out.println();
776         System.out.println( " Options" );
777         System.out.println( "  -" + GT_FIRST + "=<first>     : first gene tree to analyze (0-based index)" );
778         System.out.println( "  -" + GT_LAST + "=<last>      : last gene tree to analyze (0-based index)" );
779         System.out.println( "  -" + REROOTING_OPT
780                 + "=<re-rooting>: re-rooting method for gene trees, possible values or 'none', 'midpoint'," );
781         System.out.println( "                   or 'outgroup' (default: by minizming duplications)" );
782         System.out.println( "  -" + OUTGROUP
783                 + "=<outgroup>  : for rooting by outgroup, name of outgroup (external gene tree node)" );
784         System.out
785                 .println( "  -" + RETURN_SPECIES_TREE + "=<outfile>   : to write the (stripped) species tree to file" );
786         System.out.println( "  -" + RETURN_BEST_GENE_TREE
787                 + "=<outfile>   : to write (one) minimal duplication gene tree to file" );
788         System.out.println( "  -" + TRANSFER_TAXONOMY_OPTION
789                 + "             : to transfer taxonomic data from species tree to returned minimal duplication gene tree\n"
790                 + "                   (if -" + RETURN_BEST_GENE_TREE + " option is used)" );
791         System.out.println( "  -" + USE_SDIR
792                 + "             : to use SDIR instead of GSDIR (faster, but non-binary species trees are" );
793         System.out.println( "                   disallowed, as are most options)" );
794         System.out.println();
795         System.out.println( " Formats" );
796         System.out
797                 .println( "  The gene trees, as well as the species tree, ideally are in phyloXML (www.phyloxml.org) format," );
798         System.out
799                 .println( "  but can also be in New Hamphshire (Newick) or Nexus format as long as species information can be" );
800         System.out
801                 .println( "  extracted from the gene names (e.g. \"HUMAN\" from \"BCL2_HUMAN\") and matched to a single species" );
802         System.out.println( "  in the species tree." );
803         System.out.println();
804         System.out.println( " Examples" );
805         System.out.println( "  rio gene_trees.nh species.xml outtable.tsv log.txt" );
806         System.out
807                 .println( "  rio -t -f=10 -l=100 -r=none -g=out_gene_tree.xml -s=stripped_species.xml gene_trees.xml species.xml outtable.tsv log.txt" );
808         System.out.println();
809         System.exit( -1 );
810     }
811
812     private static void writeLogFile( final File logfile,
813                                       final RIO rio,
814                                       final File species_tree_file,
815                                       final File gene_trees_file,
816                                       final File outtable,
817                                       final String prg_name,
818                                       final String prg_v,
819                                       final String prg_date,
820                                       final String f,
821                                       final boolean verbose )
822             throws IOException {
823         final EasyWriter out = ForesterUtil.createEasyWriter( logfile );
824         out.println( "# " + prg_name );
825         out.println( "# version : " + prg_v );
826         out.println( "# date    : " + prg_date );
827         out.println( "# based on: " + f );
828         out.println( "# ----------------------------------" );
829         out.println( "Gene trees                          :\t" + gene_trees_file.getCanonicalPath() );
830         out.println( "Species tree                        :\t" + species_tree_file.getCanonicalPath() );
831         out.println( "All vs all orthology table          :\t" + outtable.getCanonicalPath() );
832         out.flush();
833         out.println( rio.getLog().toString() );
834         out.close();
835         if ( verbose ) {
836             System.out.println( "Wrote log to                        :\t" + logfile.getCanonicalPath() );
837         }
838     }
839
840     private static void writeTable( final File table_outfile,
841                                     final int gene_trees_analyzed,
842                                     final IntMatrix m,
843                                     final boolean verbose )
844             throws IOException {
845         final EasyWriter w = ForesterUtil.createEasyWriter( table_outfile );
846         final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
847         df.setDecimalSeparatorAlwaysShown( false );
848         df.setRoundingMode( RoundingMode.HALF_UP );
849         for( int i = 0; i < m.size(); ++i ) {
850             w.print( "\t" );
851             w.print( m.getLabel( i ) );
852         }
853         w.println();
854         for( int x = 0; x < m.size(); ++x ) {
855             w.print( m.getLabel( x ) );
856             for( int y = 0; y < m.size(); ++y ) {
857                 w.print( "\t" );
858                 if ( x == y ) {
859                     if ( m.get( x, y ) != gene_trees_analyzed ) {
860                         ForesterUtil.unexpectedFatalError( "diagonal value is off" );
861                     }
862                     w.print( "-" );
863                 }
864                 else {
865                     w.print( df.format( ( ( double ) m.get( x, y ) ) / gene_trees_analyzed ) );
866                 }
867             }
868             w.println();
869         }
870         w.close();
871         if ( verbose ) {
872             System.out.println( "Wrote table to                      :\t" + table_outfile.getCanonicalPath() );
873         }
874     }
875
876     private static void writeTree( final Phylogeny p, final File f, final String comment ) throws IOException {
877         final PhylogenyWriter writer = new PhylogenyWriter();
878         writer.toPhyloXML( f, p, 0 );
879         if ( comment != null ) {
880             System.out.println( comment + f.getCanonicalPath() );
881         }
882     }
883 }