in progress...
[jalview.git] / forester / java / src / org / forester / application / rio.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2017 Christian M. Zmasek
6 // All rights reserved
7 //
8 // This library is free software; you can redistribute it and/or
9 // modify it under the terms of the GNU Lesser General Public
10 // License as published by the Free Software Foundation; either
11 // version 2.1 of the License, or (at your option) any later version.
12 //
13 // This library is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 // Lesser General Public License for more details.
17 //
18 // You should have received a copy of the GNU Lesser General Public
19 // License along with this library; if not, write to the Free Software
20 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
21 //
22 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
23
24 package org.forester.application;
25
26 import java.io.File;
27 import java.io.FilenameFilter;
28 import java.io.IOException;
29 import java.math.RoundingMode;
30 import java.util.ArrayList;
31 import java.util.Arrays;
32 import java.util.Iterator;
33 import java.util.List;
34 import java.util.SortedSet;
35 import java.util.TreeSet;
36
37 import org.forester.datastructures.IntMatrix;
38 import org.forester.io.parsers.IteratingPhylogenyParser;
39 import org.forester.io.parsers.PhylogenyParser;
40 import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
41 import org.forester.io.parsers.nhx.NHXParser;
42 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
43 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
44 import org.forester.io.parsers.util.ParserUtils;
45 import org.forester.io.writers.PhylogenyWriter;
46 import org.forester.phylogeny.Phylogeny;
47 import org.forester.rio.RIO;
48 import org.forester.rio.RIO.REROOTING;
49 import org.forester.rio.RIOException;
50 import org.forester.sdi.SDIException;
51 import org.forester.sdi.SDIutil.ALGORITHM;
52 import org.forester.util.BasicDescriptiveStatistics;
53 import org.forester.util.CommandLineArguments;
54 import org.forester.util.EasyWriter;
55 import org.forester.util.ForesterUtil;
56
57 public class rio {
58
59     final static private String PRG_NAME                       = "rio";
60     final static private String PRG_VERSION                    = "5.000";
61     final static private String PRG_DATE                       = "170411";
62     final static private String E_MAIL                         = "phyloxml@gmail.com";
63     final static private String WWW                            = "https://sites.google.com/site/cmzmasek/home/software/forester";
64     final static private String HELP_OPTION_1                  = "help";
65     final static private String LOGFILE_SUFFIX                 = "_RIO_log.tsv";
66     final static private String STRIPPED_SPECIES_TREE_SUFFIX   = "_RIO_sst.xml";
67     final static private String ORTHO_OUTTABLE_SUFFIX          = "_RIO_orthologies.tsv";
68     final static private String OUT_MIN_DUP_GENE_TREE_SUFFIX   = "_RIO_gene_tree_min_dup_";
69     final static private String OUT_MED_DUP_GENE_TREE_SUFFIX   = "_RIO_gene_tree_med_dup_";
70     final static private String ORTHOLOG_GROUPS_SUFFIX         = "_RIO_ortholog_groups.tsv";
71     final static private String HELP_OPTION_2                  = "h";
72     final static private String GT_FIRST                       = "f";
73     final static private String GT_LAST                        = "l";
74     final static private String REROOTING_OPT                  = "r";
75     final static private String OUTGROUP                       = "o";
76     final static private String USE_SDIR                       = "s";
77     final static private String GENE_TREES_SUFFIX_OPTION       = "g";
78     final static private String ORTHOLOG_GROUPS_CUTOFF_OPTION  = "c";
79     final static private double ORTHOLOG_GROUPS_CUTOFF_DEFAULT = 0.5;
80
81     public static void main( final String[] args ) {
82         ForesterUtil.printProgramInformation( PRG_NAME,
83                                               "resampled inference of orthologs",
84                                               PRG_VERSION,
85                                               PRG_DATE,
86                                               E_MAIL,
87                                               WWW,
88                                               ForesterUtil.getForesterLibraryInformation() );
89         CommandLineArguments cla = null;
90         try {
91             cla = new CommandLineArguments( args );
92         }
93         catch ( final Exception e ) {
94             ForesterUtil.fatalError( e.getMessage() );
95         }
96         if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) || ( args.length == 0 ) ) {
97             printHelp();
98         }
99         if ( ( args.length < 3 ) || ( args.length > 11 ) || ( cla.getNumberOfNames() < 3 ) ) {
100             System.out.println();
101             System.out.println( "error: incorrect number of arguments" );
102             System.out.println();
103             printHelp();
104         }
105         final List<String> allowed_options = new ArrayList<String>();
106         allowed_options.add( GT_FIRST );
107         allowed_options.add( GT_LAST );
108         allowed_options.add( REROOTING_OPT );
109         allowed_options.add( OUTGROUP );
110         allowed_options.add( USE_SDIR );
111         allowed_options.add( GENE_TREES_SUFFIX_OPTION );
112         allowed_options.add( ORTHOLOG_GROUPS_CUTOFF_OPTION );
113         final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
114         if ( dissallowed_options.length() > 0 ) {
115             ForesterUtil.fatalError( "unknown option(s): " + dissallowed_options );
116         }
117         final File gene_trees_file = cla.getFile( 0 );
118         final boolean use_dir;
119         File indir = null;
120         File outdir = null;
121         if ( gene_trees_file.isDirectory() ) {
122             if ( !gene_trees_file.exists() ) {
123                 ForesterUtil.fatalError( "gene trees directory \"" + gene_trees_file + "\" does not exist" );
124             }
125             use_dir = true;
126             indir = gene_trees_file;
127         }
128         else {
129             use_dir = false;
130         }
131         final File species_tree_file = cla.getFile( 1 );
132         File orthology_outtable = null;
133         if ( use_dir ) {
134             outdir = cla.getFile( 2 );
135         }
136         else {
137             orthology_outtable = cla.getFile( 2 );
138         }
139         File logfile;
140         if ( use_dir ) {
141             if ( ( cla.getNumberOfNames() < 4 ) ) {
142                 System.out.println();
143                 System.out.println( "error: incorrect number of arguments" );
144                 System.out.println();
145                 printHelp();
146             }
147             logfile = cla.getFile( 3 );
148             if ( logfile.exists() ) {
149                 ForesterUtil.fatalError( "\"" + logfile + "\" already exists" );
150             }
151         }
152         else {
153             if ( cla.getNumberOfNames() > 3 ) {
154                 logfile = cla.getFile( 3 );
155                 if ( logfile.exists() ) {
156                     ForesterUtil.fatalError( "\"" + logfile + "\" already exists" );
157                 }
158             }
159             else {
160                 logfile = null;
161             }
162         }
163         boolean sdir = false;
164         if ( cla.isOptionSet( USE_SDIR ) ) {
165             if ( cla.isOptionHasAValue( USE_SDIR ) ) {
166                 ForesterUtil.fatalError( "no value allowed for -" + USE_SDIR );
167             }
168             sdir = true;
169             if ( !use_dir && logfile != null ) {
170                 ForesterUtil.fatalError( "no logfile output for SDIR algorithm" );
171             }
172         }
173         String outgroup = null;
174         if ( cla.isOptionSet( OUTGROUP ) ) {
175             if ( sdir ) {
176                 ForesterUtil.fatalError( "no outgroup option for SDIR algorithm" );
177             }
178             if ( use_dir ) {
179                 ForesterUtil.fatalError( "no outgroup option for operating on gene trees directory" );
180             }
181             if ( !cla.isOptionHasAValue( OUTGROUP ) ) {
182                 ForesterUtil.fatalError( "no value for -" + OUTGROUP );
183             }
184             outgroup = cla.getOptionValueAsCleanString( OUTGROUP );
185         }
186         REROOTING rerooting = REROOTING.BY_ALGORITHM;
187         if ( cla.isOptionSet( REROOTING_OPT ) ) {
188             if ( !cla.isOptionHasAValue( REROOTING_OPT ) ) {
189                 ForesterUtil.fatalError( "no value for -" + REROOTING_OPT );
190             }
191             if ( sdir ) {
192                 ForesterUtil.fatalError( "no re-rooting option for SDIR algorithm" );
193             }
194             final String rerooting_str = cla.getOptionValueAsCleanString( REROOTING_OPT ).toLowerCase();
195             if ( rerooting_str.equals( "none" ) ) {
196                 rerooting = REROOTING.NONE;
197             }
198             else if ( rerooting_str.equals( "midpoint" ) ) {
199                 rerooting = REROOTING.MIDPOINT;
200             }
201             else if ( rerooting_str.equals( "outgroup" ) ) {
202                 if ( use_dir ) {
203                     ForesterUtil.fatalError( "no outgroup option for operating on gene trees directory" );
204                 }
205                 rerooting = REROOTING.OUTGROUP;
206             }
207             else {
208                 ForesterUtil
209                         .fatalError( "values for re-rooting are: 'none', 'midpoint', or 'outgroup' (minizming duplications is default)" );
210             }
211         }
212         if ( ForesterUtil.isEmpty( outgroup ) && ( rerooting == REROOTING.OUTGROUP ) ) {
213             ForesterUtil.fatalError( "selected re-rooting by outgroup, but outgroup not set" );
214         }
215         if ( !ForesterUtil.isEmpty( outgroup ) && ( rerooting != REROOTING.OUTGROUP ) ) {
216             ForesterUtil.fatalError( "outgroup set, but selected re-rooting by other approach" );
217         }
218         int gt_first = RIO.DEFAULT_RANGE;
219         int gt_last = RIO.DEFAULT_RANGE;
220         if ( cla.isOptionSet( GT_FIRST ) ) {
221             if ( sdir ) {
222                 ForesterUtil.fatalError( "no gene tree range option for SDIR algorithm" );
223             }
224             if ( !cla.isOptionHasAValue( GT_FIRST ) ) {
225                 ForesterUtil.fatalError( "no value for -" + GT_FIRST );
226             }
227             try {
228                 gt_first = cla.getOptionValueAsInt( GT_FIRST );
229             }
230             catch ( final IOException e ) {
231                 ForesterUtil.fatalError( "could not parse integer for -" + GT_FIRST + " option" );
232             }
233             if ( gt_first < 0 ) {
234                 ForesterUtil.fatalError( "attempt to set index of first tree to analyze to: " + gt_first );
235             }
236         }
237         if ( cla.isOptionSet( GT_LAST ) ) {
238             if ( sdir ) {
239                 ForesterUtil.fatalError( "no gene tree range option for SDIR algorithm" );
240             }
241             if ( !cla.isOptionHasAValue( GT_LAST ) ) {
242                 ForesterUtil.fatalError( "no value for -" + GT_LAST );
243             }
244             try {
245                 gt_last = cla.getOptionValueAsInt( GT_LAST );
246             }
247             catch ( final IOException e ) {
248                 ForesterUtil.fatalError( "could not parse integer for -" + GT_LAST + " option" );
249             }
250             if ( gt_last < 0 ) {
251                 ForesterUtil.fatalError( "attempt to set index of last tree to analyze to: " + gt_last );
252             }
253         }
254         if ( ( ( gt_last != RIO.DEFAULT_RANGE ) && ( gt_first != RIO.DEFAULT_RANGE ) ) && ( ( gt_last < gt_first ) ) ) {
255             ForesterUtil.fatalError( "attempt to set range (0-based) of gene to analyze to: from " + gt_first + " to "
256                     + gt_last );
257         }
258         double ortholog_group_cutoff = ORTHOLOG_GROUPS_CUTOFF_DEFAULT;
259         if ( cla.isOptionSet( ORTHOLOG_GROUPS_CUTOFF_OPTION ) ) {
260             if ( sdir ) {
261                 ForesterUtil.fatalError( "ortholog groups cutoff for SDIR algorithm" );
262             }
263             if ( !cla.isOptionHasAValue( ORTHOLOG_GROUPS_CUTOFF_OPTION ) ) {
264                 ForesterUtil.fatalError( "no value for -" + ORTHOLOG_GROUPS_CUTOFF_OPTION );
265             }
266             try {
267                 ortholog_group_cutoff = cla.getOptionValueAsDouble( ORTHOLOG_GROUPS_CUTOFF_OPTION );
268             }
269             catch ( final IOException e ) {
270                 ForesterUtil.fatalError( "could not parse double for -" + ORTHOLOG_GROUPS_CUTOFF_OPTION + " option" );
271             }
272             if ( ortholog_group_cutoff < 0 ) {
273                 ForesterUtil.fatalError( "attempt to set ortholog groups cutoff to: " + ortholog_group_cutoff );
274             }
275             if ( ortholog_group_cutoff > 1 ) {
276                 ForesterUtil.fatalError( "attempt to set ortholog groups cutoff to: " + ortholog_group_cutoff );
277             }
278         }
279         if ( !use_dir ) {
280             ForesterUtil.fatalErrorIfFileNotReadable( gene_trees_file );
281         }
282         final String gene_trees_suffix;
283         if ( cla.isOptionSet( GENE_TREES_SUFFIX_OPTION ) ) {
284             if ( !use_dir ) {
285                 ForesterUtil.fatalError( "no gene tree suffix option when operating on indivual gene trees" );
286             }
287             if ( !cla.isOptionHasAValue( GENE_TREES_SUFFIX_OPTION ) ) {
288                 ForesterUtil.fatalError( "no value for -" + GENE_TREES_SUFFIX_OPTION );
289             }
290             gene_trees_suffix = cla.getOptionValueAsCleanString( GENE_TREES_SUFFIX_OPTION );
291         }
292         else {
293             gene_trees_suffix = ".mlt";
294         }
295         ForesterUtil.fatalErrorIfFileNotReadable( species_tree_file );
296         if ( !use_dir && orthology_outtable.exists() ) {
297             ForesterUtil.fatalError( "\"" + orthology_outtable + "\" already exists" );
298         }
299         long time = 0;
300         try {
301             if ( use_dir ) {
302                 System.out.println( "Gene trees in-dir                   :\t" + indir.getCanonicalPath() );
303                 System.out.println( "Gene trees suffix                   :\t" + gene_trees_suffix );
304             }
305             else {
306                 System.out.println( "Gene trees                          :\t" + gene_trees_file.getCanonicalPath() );
307             }
308             System.out.println( "Species tree                        :\t" + species_tree_file.getCanonicalPath() );
309         }
310         catch ( final IOException e ) {
311             ForesterUtil.fatalError( e.getLocalizedMessage() );
312         }
313         if ( use_dir ) {
314             System.out.println( "Out-dir                             :\t" + outdir );
315         }
316         else {
317             System.out.println( "All vs all orthology results table  :\t" + orthology_outtable );
318         }
319         if ( logfile != null ) {
320             System.out.println( "Logfile                             :\t" + logfile );
321         }
322         System.out.println( "Ortholog groups cutoff              :\t" + ortholog_group_cutoff );
323         if ( gt_first != RIO.DEFAULT_RANGE ) {
324             System.out.println( "First gene tree to analyze          :\t" + gt_first );
325         }
326         if ( gt_last != RIO.DEFAULT_RANGE ) {
327             System.out.println( "Last gene tree to analyze           :\t" + gt_last );
328         }
329         String rerooting_str = "";
330         switch ( rerooting ) {
331             case BY_ALGORITHM: {
332                 rerooting_str = "by minimizing duplications";
333                 break;
334             }
335             case MIDPOINT: {
336                 rerooting_str = "by midpoint method";
337                 break;
338             }
339             case OUTGROUP: {
340                 rerooting_str = "by outgroup: " + outgroup;
341                 break;
342             }
343             case NONE: {
344                 rerooting_str = "none";
345                 break;
346             }
347         }
348         System.out.println( "Re-rooting                          : \t" + rerooting_str );
349         if ( !sdir ) {
350             System.out.println( "Non binary species tree             :\tallowed" );
351         }
352         else {
353             System.out.println( "Non binary species tree             :\tdisallowed" );
354         }
355         time = System.currentTimeMillis();
356         final ALGORITHM algorithm;
357         if ( sdir ) {
358             algorithm = ALGORITHM.SDIR;
359         }
360         else {
361             algorithm = ALGORITHM.GSDIR;
362         }
363         EasyWriter log = null;
364         if ( use_dir ) {
365             if ( outdir.exists() ) {
366                 if ( !outdir.isDirectory() ) {
367                     ForesterUtil.fatalError( PRG_NAME,
368                                              "out-directory [" + outdir + "] already exists but is not a directory" );
369                 }
370             }
371             else {
372                 final boolean success = outdir.mkdirs();
373                 if ( !success ) {
374                     ForesterUtil.fatalError( PRG_NAME, "could not create out-directory [" + outdir + "]" );
375                 }
376             }
377             final String species_tree_file_name = species_tree_file.getName();
378             final File gene_trees_files[] = indir.listFiles( new FilenameFilter() {
379
380                 @Override
381                 public boolean accept( final File dir, final String name ) {
382                     return ( ( name.endsWith( gene_trees_suffix ) ) && !( name.equals( species_tree_file_name ) ) );
383                 }
384             } );
385             if ( gene_trees_files.length < 1 ) {
386                 ForesterUtil.fatalError( PRG_NAME,
387                                          "in-directory [" + indir
388                                                  + "] does not contain any gene tree files with suffix "
389                                                  + gene_trees_suffix );
390             }
391             try {
392                 log = ForesterUtil.createEasyWriter( logfile );
393             }
394             catch ( final IOException e ) {
395                 ForesterUtil.fatalError( PRG_NAME, "could not create [" + logfile + "]" );
396             }
397             Arrays.sort( gene_trees_files );
398             try {
399                 log.print( "# program" );
400                 log.print( "\t" );
401                 log.print( PRG_NAME );
402                 log.println();
403                 log.print( "# version" );
404                 log.print( "\t" );
405                 log.print( PRG_VERSION );
406                 log.println();
407                 log.print( "# date" );
408                 log.print( "\t" );
409                 log.print( PRG_DATE );
410                 log.println();
411                 log.print( "# Algorithm " );
412                 log.print( "\t" );
413                 log.print( algorithm.toString() );
414                 log.println();
415                 log.print( "# Gene trees in-dir" );
416                 log.print( "\t" );
417                 log.print( indir.getCanonicalPath() );
418                 log.println();
419                 log.print( "# Gene trees suffix" );
420                 log.print( "\t" );
421                 log.print( gene_trees_suffix );
422                 log.println();
423                 log.print( "# Species tree" );
424                 log.print( "\t" );
425                 log.print( species_tree_file.getCanonicalPath() );
426                 log.println();
427                 log.print( "# Out-dir" );
428                 log.print( "\t" );
429                 log.print( outdir.getCanonicalPath() );
430                 log.println();
431                 log.print( "# Logfile" );
432                 log.print( "\t" );
433                 log.print( logfile.getCanonicalPath() );
434                 log.println();
435                 log.print( "# Ortholog groups cutoff" );
436                 log.print( "\t" );
437                 log.print( Double.toString( ortholog_group_cutoff ) );
438                 log.println();
439                 if ( gt_first != RIO.DEFAULT_RANGE ) {
440                     log.print( "# First gene tree to analyze" );
441                     log.print( "\t" );
442                     log.print( Integer.toString( gt_first ) );
443                     log.println();
444                 }
445                 if ( gt_last != RIO.DEFAULT_RANGE ) {
446                     log.print( "# Last gene tree to analyze" );
447                     log.print( "\t" );
448                     log.print( Integer.toString( gt_last ) );
449                     log.println();
450                 }
451                 log.print( "# Re-rooting" );
452                 log.print( "\t" );
453                 log.print( rerooting_str );
454                 log.println();
455                 log.print( "# Non binary species tree" );
456                 log.print( "\t" );
457                 if ( !sdir ) {
458                     log.print( "allowed" );
459                 }
460                 else {
461                     log.print( "disallowed" );
462                 }
463                 log.println();
464                 log.println();
465                 log.print( "NAME" );
466                 log.print( "\t" );
467                 log.print( "EXT NODES" );
468                 log.print( "\t" );
469                 log.print(  ortholog_group_cutoff + " O GROUPS" );
470                 log.print( "\t" );
471                 log.print( "0.05 O GROUPS" );
472                 log.print( "\t" );
473                 log.print( "0.25 O GROUPS" );
474                 log.print( "\t" );
475                 log.print( "0.5 O GROUPS" );
476                 log.print( "\t" );
477                 log.print( "0.75 O GROUPS" );
478                 log.print( "\t" );
479                 log.print( "0.95 O GROUPS" );
480                 log.print( "\t" );
481                 log.print( "MEDIAN DUP" );
482                 log.print( "\t" );
483                 log.print( "MEAN DUP" );
484                 log.print( "\t" );
485                 log.print( "MEAN DUP SD" );
486                 log.print( "\t" );
487                 log.print( "MIN DUP" );
488                 log.print( "\t" );
489                 log.print( "MAX DUP" );
490                 log.print( "\t" );
491                 log.print( "REMOVED EXT NODES" );
492                 log.print( "\t" );
493                 log.print( "N" );
494                 log.println();
495             }
496             catch ( IOException e ) {
497                 ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() );
498             }
499             int counter = 1;
500             for( final File gf : gene_trees_files ) {
501                 String outname = gf.getName();
502                 System.out
503                         .print( "\r                                                                                            " );
504                 System.out.print( "\r" + counter + "/" + gene_trees_files.length + ": " + outname );
505                 counter++;
506                 if ( outname.indexOf( "." ) > 0 ) {
507                     outname = outname.substring( 0, outname.lastIndexOf( "." ) );
508                 }
509                 try {
510                     executeAnalysis( gf,
511                                      species_tree_file,
512                                      new File( outdir.getCanonicalFile() + "/" + outname + ORTHO_OUTTABLE_SUFFIX ),
513                                      new File( outdir.getCanonicalFile() + "/" + outname + ORTHOLOG_GROUPS_SUFFIX ),
514                                      new File( outdir.getCanonicalFile() + "/" + outname + LOGFILE_SUFFIX ),
515                                      outgroup,
516                                      rerooting,
517                                      gt_first,
518                                      gt_last,
519                                      new File( outdir.getCanonicalFile() + "/" + outname
520                                              + STRIPPED_SPECIES_TREE_SUFFIX ),
521                                      new File( outdir.getCanonicalFile() + "/" + outname
522                                              + OUT_MIN_DUP_GENE_TREE_SUFFIX ),
523                                      new File( outdir.getCanonicalFile() + "/" + outname
524                                              + OUT_MED_DUP_GENE_TREE_SUFFIX ),
525                                      true,
526                                      algorithm,
527                                      true,
528                                      log,
529                                      ortholog_group_cutoff );
530                 }
531                 catch ( IOException e ) {
532                     ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() );
533                 }
534             }
535             System.out
536                     .print( "\r                                                                                        " );
537             System.out.println();
538         }
539         else {
540             String outname = orthology_outtable.toString();
541             if ( outname.indexOf( "." ) > 0 ) {
542                 outname = outname.substring( 0, outname.lastIndexOf( "." ) );
543             }
544             executeAnalysis( gene_trees_file,
545                              species_tree_file,
546                              orthology_outtable,
547                              new File( outname + ORTHOLOG_GROUPS_SUFFIX ),
548                              logfile,
549                              outgroup,
550                              rerooting,
551                              gt_first,
552                              gt_last,
553                              new File( outname + STRIPPED_SPECIES_TREE_SUFFIX ),
554                              new File( outname + OUT_MIN_DUP_GENE_TREE_SUFFIX ),
555                              new File( outname + OUT_MED_DUP_GENE_TREE_SUFFIX ),
556                              algorithm == ALGORITHM.GSDIR,
557                              algorithm,
558                              false,
559                              null,
560                              ortholog_group_cutoff );
561         }
562         if ( !use_dir ) {
563             time = System.currentTimeMillis() - time;
564             System.out.println( "Time                                :\t" + time + "ms" );
565         }
566         else {
567             try {
568                 log.close();
569             }
570             catch ( IOException e ) {
571                 ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() );
572             }
573             time = System.currentTimeMillis() - time;
574             System.out.println( "Time                                :\t" + time + "ms" );
575         }
576         System.exit( 0 );
577     }
578
579     private static final void executeAnalysis( final File gene_trees_file,
580                                                final File species_tree_file,
581                                                final File orthology_outtable,
582                                                final File orthology_groups_outfile,
583                                                final File logfile,
584                                                final String outgroup,
585                                                final REROOTING rerooting,
586                                                final int gt_first,
587                                                final int gt_last,
588                                                final File return_species_tree,
589                                                final File return_min_dup_gene_tree,
590                                                final File return_median_dup_gene_tree,
591                                                final boolean transfer_taxonomy,
592                                                final ALGORITHM algorithm,
593                                                final boolean use_gene_trees_dir,
594                                                final EasyWriter log,
595                                                final double ortholog_group_cutoff ) {
596         try {
597             final RIO rio;
598             boolean iterating = false;
599             final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
600             if ( p instanceof PhyloXmlParser ) {
601                 rio = RIO.executeAnalysis( gene_trees_file,
602                                            species_tree_file,
603                                            algorithm,
604                                            rerooting,
605                                            outgroup,
606                                            gt_first,
607                                            gt_last,
608                                            logfile != null,
609                                            true,
610                                            transfer_taxonomy );
611             }
612             else {
613                 iterating = true;
614                 if ( p instanceof NHXParser ) {
615                     final NHXParser nhx = ( NHXParser ) p;
616                     nhx.setReplaceUnderscores( false );
617                     nhx.setIgnoreQuotes( true );
618                     nhx.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
619                 }
620                 else if ( p instanceof NexusPhylogeniesParser ) {
621                     final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p;
622                     nex.setReplaceUnderscores( false );
623                     nex.setIgnoreQuotes( true );
624                     nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
625                 }
626                 else {
627                     throw new RuntimeException( "unknown parser type: " + p );
628                 }
629                 final IteratingPhylogenyParser ip = ( IteratingPhylogenyParser ) p;
630                 ip.setSource( gene_trees_file );
631                 rio = RIO.executeAnalysis( ip,
632                                            species_tree_file,
633                                            algorithm,
634                                            rerooting,
635                                            outgroup,
636                                            gt_first,
637                                            gt_last,
638                                            logfile != null,
639                                            !use_gene_trees_dir,
640                                            transfer_taxonomy );
641             }
642             if ( !use_gene_trees_dir ) {
643                 if ( algorithm == ALGORITHM.GSDIR ) {
644                     System.out.println( "Taxonomy linking based on           :\t" + rio.getGSDIRtaxCompBase() );
645                 }
646             }
647             final IntMatrix m;
648             if ( iterating ) {
649                 m = rio.getOrthologTable();
650             }
651             else {
652                 m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true );
653             }
654             final BasicDescriptiveStatistics stats = rio.getDuplicationsStatistics();
655             writeTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir );
656             final int ortholog_groups = writeOrtologGroups( orthology_groups_outfile,
657                                                             ortholog_group_cutoff,
658                                                             stats.getN(),
659                                                             m,
660                                                             !use_gene_trees_dir,
661                                                             false );
662             final int ortholog_groups_005 = writeOrtologGroups( null, 0.05, stats.getN(), m, false, true );
663             final int ortholog_groups_025 = writeOrtologGroups( null, 0.25, stats.getN(), m, false, true );
664             final int ortholog_groups_05 = writeOrtologGroups( null, 0.5, stats.getN(), m, false, true );
665             final int ortholog_groups_075 = writeOrtologGroups( null, 0.75, stats.getN(), m, false, true );
666             final int ortholog_groups_095 = writeOrtologGroups( null, 0.95, stats.getN(), m, false, true );
667             if ( ( algorithm != ALGORITHM.SDIR ) && ( logfile != null ) ) {
668                 writeLogFile( logfile,
669                               rio,
670                               species_tree_file,
671                               gene_trees_file,
672                               orthology_outtable,
673                               PRG_NAME,
674                               PRG_VERSION,
675                               PRG_DATE,
676                               ForesterUtil.getForesterLibraryInformation(),
677                               !use_gene_trees_dir );
678             }
679             if ( return_species_tree != null ) {
680                 writeTree( rio.getSpeciesTree(),
681                            return_species_tree,
682                            use_gene_trees_dir ? null : "Wrote (stripped) species tree to    :\t" );
683             }
684             if ( return_min_dup_gene_tree != null && rio.getMinDuplicationsGeneTree() != null ) {
685                 final int min = ( int ) rio.getDuplicationsStatistics().getMin();
686                 writeTree( rio.getMinDuplicationsGeneTree(),
687                            new File( return_min_dup_gene_tree.toString() + min + ".xml" ),
688                            use_gene_trees_dir ? null : "Wrote one min duplication gene tree :\t" );
689             }
690             if ( return_median_dup_gene_tree != null && rio.getDuplicationsToTreeMap() != null ) {
691                 final int med = ( int ) rio.getDuplicationsStatistics().median();
692                 writeTree( rio.getDuplicationsToTreeMap().get( med ),
693                            new File( return_median_dup_gene_tree.toString() + med + ".xml" ),
694                            use_gene_trees_dir ? null : "Wrote one med duplication gene tree :\t" );
695             }
696             final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.##" );
697             final int min = ( int ) stats.getMin();
698             final int max = ( int ) stats.getMax();
699             final int median = ( int ) stats.median();
700             int min_count = 0;
701             int max_count = 0;
702             int median_count = 0;
703             for( double d : stats.getData() ) {
704                 if ( ( ( int ) d ) == min ) {
705                     ++min_count;
706                 }
707                 if ( ( ( int ) d ) == max ) {
708                     ++max_count;
709                 }
710                 if ( ( ( int ) d ) == median ) {
711                     ++median_count;
712                 }
713             }
714             final double min_count_percentage = ( 100.0 * min_count ) / stats.getN();
715             final double max_count_percentage = ( 100.0 * max_count ) / stats.getN();
716             final double median_count_percentage = ( 100.0 * median_count ) / stats.getN();
717             if ( use_gene_trees_dir ) {
718                 String name = gene_trees_file.getName();
719                 if ( name.indexOf( "." ) > 0 ) {
720                     name = name.substring( 0, name.lastIndexOf( "." ) );
721                 }
722                 log.print( name );
723                 log.print( "\t" );
724                 log.print( Integer.toString( rio.getExtNodesOfAnalyzedGeneTrees() ) );
725                 log.print( "\t" );
726                 log.print( Integer.toString( ortholog_groups ) );
727                 //
728                 log.print( "\t" );
729                 log.print( Integer.toString( ortholog_groups_005 ) );
730                 log.print( "\t" );
731                 log.print( Integer.toString( ortholog_groups_025 ) );
732                 log.print( "\t" );
733                 log.print( Integer.toString( ortholog_groups_05 ) );
734                 log.print( "\t" );
735                 log.print( Integer.toString( ortholog_groups_075 ) );
736                 log.print( "\t" );
737                 log.print( Integer.toString( ortholog_groups_095 ) );
738                 //
739                 log.print( "\t" );
740                 if ( stats.getN() > 3 ) {
741                     log.print( df.format( median ) );
742                 }
743                 else {
744                     log.print( "" );
745                 }
746                 log.print( "\t" );
747                 log.print( df.format( stats.arithmeticMean() ) );
748                 log.print( "\t" );
749                 if ( stats.getN() > 3 ) {
750                     log.print( df.format( stats.sampleStandardDeviation() ) );
751                 }
752                 else {
753                     log.print( "" );
754                 }
755                 log.print( "\t" );
756                 log.print( Integer.toString( min ) );
757                 log.print( "\t" );
758                 log.print( Integer.toString( max ) );
759                 log.print( "\t" );
760                 log.print( Integer.toString( rio.getRemovedGeneTreeNodes().size() ) );
761                 log.print( "\t" );
762                 log.print( Integer.toString( stats.getN() ) );
763                 log.println();
764             }
765             else {
766                 System.out.println( "Gene tree internal nodes            :\t" + rio.getIntNodesOfAnalyzedGeneTrees() );
767                 System.out.println( "Gene tree external nodes            :\t" + rio.getExtNodesOfAnalyzedGeneTrees() );
768                 System.out.println( "Mean number of duplications         :\t" + df.format( stats.arithmeticMean() )
769                         + "\t" + df.format( ( 100.0 * stats.arithmeticMean() ) / rio.getIntNodesOfAnalyzedGeneTrees() )
770                         + "%\t(sd: " + df.format( stats.sampleStandardDeviation() ) + ")" );
771                 if ( stats.getN() > 3 ) {
772                     System.out.println( "Median number of duplications       :\t" + df.format( median ) + "\t"
773                             + df.format( ( 100.0 * median ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
774                 }
775                 System.out.println( "Minimum duplications                :\t" + min + "\t"
776                         + df.format( ( 100.0 * min ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
777                 System.out.println( "Maximum duplications                :\t" + ( int ) max + "\t"
778                         + df.format( ( 100.0 * max ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
779                 System.out.println( "Gene trees with median duplications :\t" + median_count + "\t"
780                         + df.format( median_count_percentage ) + "%" );
781                 System.out.println( "Gene trees with minimum duplications:\t" + min_count + "\t"
782                         + df.format( min_count_percentage ) + "%" );
783                 System.out.println( "Gene trees with maximum duplications:\t" + max_count + "\t"
784                         + df.format( max_count_percentage ) + "%" );
785                 if ( algorithm == ALGORITHM.GSDIR ) {
786                     System.out.println( "Removed ext gene tree nodes         :\t"
787                             + rio.getRemovedGeneTreeNodes().size() );
788                 }
789             }
790         }
791         catch ( final RIOException e ) {
792             ForesterUtil.fatalError( e.getLocalizedMessage() );
793         }
794         catch ( final SDIException e ) {
795             ForesterUtil.fatalError( e.getLocalizedMessage() );
796         }
797         catch ( final IOException e ) {
798             ForesterUtil.fatalError( e.getLocalizedMessage() );
799         }
800         catch ( final OutOfMemoryError e ) {
801             ForesterUtil.outOfMemoryError( e );
802         }
803         catch ( final Exception e ) {
804             ForesterUtil.unexpectedFatalError( e );
805         }
806         catch ( final Error e ) {
807             ForesterUtil.unexpectedFatalError( e );
808         }
809     }
810
811     private final static void printHelp() {
812         System.out.println( "Usage" );
813         System.out.println();
814         System.out.println( PRG_NAME
815                 + " [options] <gene trees infile> <species tree infile> <all vs all orthology table outfile> [logfile]" );
816         System.out.println();
817         System.out.println( PRG_NAME + " [options] <gene trees indir> <species tree infile> <outdir> <logfile>" );
818         System.out.println();
819         System.out.println();
820         System.out.println( " Options" );
821         System.out.println( "  -" + GT_FIRST + "=<first>     : first gene tree to analyze (0-based index)" );
822         System.out.println( "  -" + GT_LAST + "=<last>      : last gene tree to analyze (0-based index)" );
823         System.out.println( "  -" + ORTHOLOG_GROUPS_CUTOFF_OPTION
824                 + "=<cutoff>    : cutoff value for ortholog groups (default: " + ORTHOLOG_GROUPS_CUTOFF_DEFAULT + ")" );
825         System.out.println( "  -" + REROOTING_OPT
826                 + "=<re-rooting>: re-rooting method for gene trees, possible values or 'none', 'midpoint'," );
827         System.out.println( "                   or 'outgroup' (default: by minizming duplications)" );
828         System.out.println( "  -" + OUTGROUP
829                 + "=<outgroup>  : for rooting by outgroup, name of outgroup (external gene tree node)" );
830         System.out.println( "  -" + USE_SDIR
831                 + "             : to use SDIR instead of GSDIR (faster, but non-binary species trees are" );
832         System.out.println( "                   disallowed, as are most options)" );
833         System.out.println( "  -" + GENE_TREES_SUFFIX_OPTION
834                 + "=<suffix>    : suffix for gene trees when operating on gene tree directories (default: .mlt)" );
835         System.out.println();
836         System.out.println( " Formats" );
837         System.out
838                 .println( "  The gene trees, as well as the species tree, ideally are in phyloXML (www.phyloxml.org) format," );
839         System.out
840                 .println( "  but can also be in New Hamphshire (Newick) or Nexus format as long as species information can be" );
841         System.out
842                 .println( "  extracted from the gene names (e.g. \"HUMAN\" from \"BCL2_HUMAN\") and matched to a single species" );
843         System.out.println( "  in the species tree." );
844         System.out.println();
845         System.out.println( " Examples" );
846         System.out.println( "  rio -s gene_trees.nh species.xml outtable.tsv" );
847         System.out.println( "  rio gene_trees.nh species.xml outtable.tsv log.txt" );
848         System.out.println( "  rio -c=0.9 -f=10 -l=100 -r=none gene_trees.xml species.xml outtable.tsv log.txt" );
849         System.out.println( "  rio -g=.xml gene_trees_dir species.xml out_dir log.tsv" );
850         System.out.println();
851         System.exit( -1 );
852     }
853
854     private static void writeLogFile( final File logfile,
855                                       final RIO rio,
856                                       final File species_tree_file,
857                                       final File gene_trees_file,
858                                       final File outtable,
859                                       final String prg_name,
860                                       final String prg_v,
861                                       final String prg_date,
862                                       final String f,
863                                       final boolean verbose )
864             throws IOException {
865         final EasyWriter out = ForesterUtil.createEasyWriter( logfile );
866         out.println( "# " + prg_name );
867         out.println( "# version : " + prg_v );
868         out.println( "# date    : " + prg_date );
869         out.println( "# based on: " + f );
870         out.println( "# ----------------------------------" );
871         out.println( "Gene trees                          :\t" + gene_trees_file.getCanonicalPath() );
872         out.println( "Species tree                        :\t" + species_tree_file.getCanonicalPath() );
873         out.println( "All vs all orthology table          :\t" + outtable.getCanonicalPath() );
874         out.flush();
875         out.println( rio.getLog().toString() );
876         out.close();
877         if ( verbose ) {
878             System.out.println( "Wrote log to                        :\t" + logfile.getCanonicalPath() );
879         }
880     }
881
882     private static final void writeTable( final File table_outfile,
883                                           final int gene_trees_analyzed,
884                                           final IntMatrix m,
885                                           final boolean verbose )
886             throws IOException {
887         final EasyWriter w = ForesterUtil.createEasyWriter( table_outfile );
888         final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
889         df.setDecimalSeparatorAlwaysShown( false );
890         df.setRoundingMode( RoundingMode.HALF_UP );
891         for( int i = 0; i < m.size(); ++i ) {
892             w.print( "\t" );
893             w.print( m.getLabel( i ) );
894         }
895         w.println();
896         for( int x = 0; x < m.size(); ++x ) {
897             w.print( m.getLabel( x ) );
898             for( int y = 0; y < m.size(); ++y ) {
899                 w.print( "\t" );
900                 if ( x == y ) {
901                     if ( m.get( x, y ) != gene_trees_analyzed ) {
902                         ForesterUtil.unexpectedFatalError( "diagonal value is off" );
903                     }
904                     w.print( "-" );
905                 }
906                 else {
907                     w.print( df.format( ( ( double ) m.get( x, y ) ) / gene_trees_analyzed ) );
908                 }
909             }
910             w.println();
911         }
912         w.close();
913         if ( verbose ) {
914             System.out.println( "Wrote table to                      :\t" + table_outfile.getCanonicalPath() );
915         }
916     }
917
918     private static final int writeOrtologGroups( final File outfile,
919                                                  final double cutoff,
920                                                  final int gene_trees_analyzed,
921                                                  final IntMatrix m,
922                                                  final boolean verbose,
923                                                  final boolean calc_conly )
924             throws IOException {
925         List<SortedSet<String>> groups = new ArrayList<SortedSet<String>>();
926         BasicDescriptiveStatistics stats = new BasicDescriptiveStatistics();
927         int below_075 = 0;
928         int below_05 = 0;
929         int below_025 = 0;
930         for( int x = 1; x < m.size(); ++x ) {
931             final String a = m.getLabel( x );
932             for( int y = 0; y < x; ++y ) {
933                 final String b = m.getLabel( y );
934                 final double s = ( ( double ) m.get( x, y ) ) / gene_trees_analyzed;
935                 stats.addValue( s );
936                 if ( s < 0.75 ) {
937                     below_075++;
938                     if ( s < 0.5 ) {
939                         below_05++;
940                         if ( s < 0.25 ) {
941                             below_025++;
942                         }
943                     }
944                 }
945                 if ( s >= cutoff ) {
946                     boolean found = false;
947                     for( final SortedSet<String> group : groups ) {
948                         if ( group.contains( a ) ) {
949                             group.add( b );
950                             found = true;
951                         }
952                         if ( group.contains( b ) ) {
953                             group.add( a );
954                             found = true;
955                         }
956                     }
957                     if ( !found ) {
958                         final SortedSet<String> new_group = new TreeSet<String>();
959                         new_group.add( a );
960                         new_group.add( b );
961                         groups.add( new_group );
962                     }
963                 }
964             }
965         }
966         //Deal with singlets:
967         for( int x = 0; x < m.size(); ++x ) {
968             final String a = m.getLabel( x );
969             boolean found = false;
970             for( final SortedSet<String> group : groups ) {
971                 if ( group.contains( a ) ) {
972                     found = true;
973                     break;
974                 }
975             }
976             if ( !found ) {
977                 final SortedSet<String> new_group = new TreeSet<String>();
978                 new_group.add( a );
979                 groups.add( new_group );
980             }
981         }
982         if ( calc_conly ) {
983             return groups.size();
984         }
985         final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
986         df.setDecimalSeparatorAlwaysShown( false );
987         df.setRoundingMode( RoundingMode.HALF_UP );
988         final EasyWriter w = ForesterUtil.createEasyWriter( outfile );
989         int counter = 1;
990         for( final SortedSet<String> group : groups ) {
991             w.print( Integer.toString( counter++ ) );
992             for( final String s : group ) {
993                 w.print( "\t" );
994                 w.print( s );
995             }
996             w.println();
997         }
998         w.println();
999         w.println( "# Cutoff\t" + df.format( cutoff ) );
1000         w.println();
1001         w.println( "# Orthology support statistics:" );
1002         if ( stats.getN() > 3 ) {
1003             w.println( "# Median\t" + df.format( stats.median() ) );
1004         }
1005         w.println( "# Mean\t" + df.format( stats.arithmeticMean() ) );
1006         if ( stats.getN() > 3 ) {
1007             w.println( "# SD\t" + df.format( stats.sampleStandardDeviation() ) );
1008         }
1009         w.println( "# Min\t" + df.format( stats.getMin() ) );
1010         w.println( "# Max\t" + df.format( stats.getMax() ) );
1011         w.println( "# Total\t" + df.format( stats.getN() ) );
1012         w.println( "# Below 0.75\t" + below_075 + "\t" + df.format( ( 100.0 * below_075 / stats.getN() ) ) + "%" );
1013         w.println( "# Below 0.5\t" + below_05 + "\t" + df.format( ( 100.0 * below_05 / stats.getN() ) ) + "%" );
1014         w.println( "# Below 0.25\t" + below_025 + "\t" + df.format( ( 100.0 * below_025 / stats.getN() ) ) + "%" );
1015         w.close();
1016         if ( verbose ) {
1017             System.out.println( "Number of ortholog groups           :\t" + groups.size() );
1018             System.out.println( "Wrote orthologs groups table to     :\t" + outfile.getCanonicalPath() );
1019         }
1020         return groups.size();
1021     }
1022
1023     private static void writeTree( final Phylogeny p, final File f, final String comment ) throws IOException {
1024         final PhylogenyWriter writer = new PhylogenyWriter();
1025         writer.toPhyloXML( f, p, 0 );
1026         if ( comment != null ) {
1027             System.out.println( comment + f.getCanonicalPath() );
1028         }
1029     }
1030 }