2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2017 Christian M. Zmasek
8 // This library is free software; you can redistribute it and/or
9 // modify it under the terms of the GNU Lesser General Public
10 // License as published by the Free Software Foundation; either
11 // version 2.1 of the License, or (at your option) any later version.
13 // This library is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 // Lesser General Public License for more details.
18 // You should have received a copy of the GNU Lesser General Public
19 // License along with this library; if not, write to the Free Software
20 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
24 package org.forester.application;
27 import java.io.FilenameFilter;
28 import java.io.IOException;
29 import java.math.RoundingMode;
30 import java.util.ArrayList;
31 import java.util.Arrays;
32 import java.util.Iterator;
33 import java.util.List;
34 import java.util.SortedSet;
35 import java.util.TreeSet;
37 import org.forester.datastructures.IntMatrix;
38 import org.forester.io.parsers.IteratingPhylogenyParser;
39 import org.forester.io.parsers.PhylogenyParser;
40 import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
41 import org.forester.io.parsers.nhx.NHXParser;
42 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
43 import org.forester.io.parsers.phyloxml.PhyloXmlParser;
44 import org.forester.io.parsers.util.ParserUtils;
45 import org.forester.io.writers.PhylogenyWriter;
46 import org.forester.phylogeny.Phylogeny;
47 import org.forester.rio.RIO;
48 import org.forester.rio.RIO.REROOTING;
49 import org.forester.rio.RIOException;
50 import org.forester.sdi.SDIException;
51 import org.forester.sdi.SDIutil.ALGORITHM;
52 import org.forester.util.BasicDescriptiveStatistics;
53 import org.forester.util.CommandLineArguments;
54 import org.forester.util.EasyWriter;
55 import org.forester.util.ForesterUtil;
59 final static private String PRG_NAME = "rio";
60 final static private String PRG_VERSION = "5.000";
61 final static private String PRG_DATE = "170411";
62 final static private String E_MAIL = "phyloxml@gmail.com";
63 final static private String WWW = "https://sites.google.com/site/cmzmasek/home/software/forester";
64 final static private String HELP_OPTION_1 = "help";
65 final static private String LOGFILE_SUFFIX = "_RIO_log.tsv";
66 final static private String STRIPPED_SPECIES_TREE_SUFFIX = "_RIO_sst.xml";
67 final static private String ORTHO_OUTTABLE_SUFFIX = "_RIO_orthologies.tsv";
68 final static private String OUT_MIN_DUP_GENE_TREE_SUFFIX = "_RIO_gene_tree_min_dup_";
69 final static private String OUT_MED_DUP_GENE_TREE_SUFFIX = "_RIO_gene_tree_med_dup_";
70 final static private String ORTHOLOG_GROUPS_SUFFIX = "_RIO_ortholog_groups.tsv";
71 final static private String HELP_OPTION_2 = "h";
72 final static private String GT_FIRST = "f";
73 final static private String GT_LAST = "l";
74 final static private String REROOTING_OPT = "r";
75 final static private String OUTGROUP = "o";
76 final static private String USE_SDIR = "s";
77 final static private String GENE_TREES_SUFFIX_OPTION = "g";
78 final static private String ORTHOLOG_GROUPS_CUTOFF_OPTION = "c";
79 final static private double ORTHOLOG_GROUPS_CUTOFF_DEFAULT = 0.5;
81 public static void main( final String[] args ) {
82 ForesterUtil.printProgramInformation( PRG_NAME,
83 "resampled inference of orthologs",
88 ForesterUtil.getForesterLibraryInformation() );
89 CommandLineArguments cla = null;
91 cla = new CommandLineArguments( args );
93 catch ( final Exception e ) {
94 ForesterUtil.fatalError( e.getMessage() );
96 if ( cla.isOptionSet( HELP_OPTION_1 ) || cla.isOptionSet( HELP_OPTION_2 ) || ( args.length == 0 ) ) {
99 if ( ( args.length < 3 ) || ( args.length > 11 ) || ( cla.getNumberOfNames() < 3 ) ) {
100 System.out.println();
101 System.out.println( "error: incorrect number of arguments" );
102 System.out.println();
105 final List<String> allowed_options = new ArrayList<String>();
106 allowed_options.add( GT_FIRST );
107 allowed_options.add( GT_LAST );
108 allowed_options.add( REROOTING_OPT );
109 allowed_options.add( OUTGROUP );
110 allowed_options.add( USE_SDIR );
111 allowed_options.add( GENE_TREES_SUFFIX_OPTION );
112 allowed_options.add( ORTHOLOG_GROUPS_CUTOFF_OPTION );
113 final String dissallowed_options = cla.validateAllowedOptionsAsString( allowed_options );
114 if ( dissallowed_options.length() > 0 ) {
115 ForesterUtil.fatalError( "unknown option(s): " + dissallowed_options );
117 final File gene_trees_file = cla.getFile( 0 );
118 final boolean use_dir;
121 if ( gene_trees_file.isDirectory() ) {
122 if ( !gene_trees_file.exists() ) {
123 ForesterUtil.fatalError( "gene trees directory \"" + gene_trees_file + "\" does not exist" );
126 indir = gene_trees_file;
131 final File species_tree_file = cla.getFile( 1 );
132 File orthology_outtable = null;
134 outdir = cla.getFile( 2 );
137 orthology_outtable = cla.getFile( 2 );
141 if ( ( cla.getNumberOfNames() < 4 ) ) {
142 System.out.println();
143 System.out.println( "error: incorrect number of arguments" );
144 System.out.println();
147 logfile = cla.getFile( 3 );
148 if ( logfile.exists() ) {
149 ForesterUtil.fatalError( "\"" + logfile + "\" already exists" );
153 if ( cla.getNumberOfNames() > 3 ) {
154 logfile = cla.getFile( 3 );
155 if ( logfile.exists() ) {
156 ForesterUtil.fatalError( "\"" + logfile + "\" already exists" );
163 boolean sdir = false;
164 if ( cla.isOptionSet( USE_SDIR ) ) {
165 if ( cla.isOptionHasAValue( USE_SDIR ) ) {
166 ForesterUtil.fatalError( "no value allowed for -" + USE_SDIR );
169 if ( !use_dir && logfile != null ) {
170 ForesterUtil.fatalError( "no logfile output for SDIR algorithm" );
173 String outgroup = null;
174 if ( cla.isOptionSet( OUTGROUP ) ) {
176 ForesterUtil.fatalError( "no outgroup option for SDIR algorithm" );
179 ForesterUtil.fatalError( "no outgroup option for operating on gene trees directory" );
181 if ( !cla.isOptionHasAValue( OUTGROUP ) ) {
182 ForesterUtil.fatalError( "no value for -" + OUTGROUP );
184 outgroup = cla.getOptionValueAsCleanString( OUTGROUP );
186 REROOTING rerooting = REROOTING.BY_ALGORITHM;
187 if ( cla.isOptionSet( REROOTING_OPT ) ) {
188 if ( !cla.isOptionHasAValue( REROOTING_OPT ) ) {
189 ForesterUtil.fatalError( "no value for -" + REROOTING_OPT );
192 ForesterUtil.fatalError( "no re-rooting option for SDIR algorithm" );
194 final String rerooting_str = cla.getOptionValueAsCleanString( REROOTING_OPT ).toLowerCase();
195 if ( rerooting_str.equals( "none" ) ) {
196 rerooting = REROOTING.NONE;
198 else if ( rerooting_str.equals( "midpoint" ) ) {
199 rerooting = REROOTING.MIDPOINT;
201 else if ( rerooting_str.equals( "outgroup" ) ) {
203 ForesterUtil.fatalError( "no outgroup option for operating on gene trees directory" );
205 rerooting = REROOTING.OUTGROUP;
209 .fatalError( "values for re-rooting are: 'none', 'midpoint', or 'outgroup' (minizming duplications is default)" );
212 if ( ForesterUtil.isEmpty( outgroup ) && ( rerooting == REROOTING.OUTGROUP ) ) {
213 ForesterUtil.fatalError( "selected re-rooting by outgroup, but outgroup not set" );
215 if ( !ForesterUtil.isEmpty( outgroup ) && ( rerooting != REROOTING.OUTGROUP ) ) {
216 ForesterUtil.fatalError( "outgroup set, but selected re-rooting by other approach" );
218 int gt_first = RIO.DEFAULT_RANGE;
219 int gt_last = RIO.DEFAULT_RANGE;
220 if ( cla.isOptionSet( GT_FIRST ) ) {
222 ForesterUtil.fatalError( "no gene tree range option for SDIR algorithm" );
224 if ( !cla.isOptionHasAValue( GT_FIRST ) ) {
225 ForesterUtil.fatalError( "no value for -" + GT_FIRST );
228 gt_first = cla.getOptionValueAsInt( GT_FIRST );
230 catch ( final IOException e ) {
231 ForesterUtil.fatalError( "could not parse integer for -" + GT_FIRST + " option" );
233 if ( gt_first < 0 ) {
234 ForesterUtil.fatalError( "attempt to set index of first tree to analyze to: " + gt_first );
237 if ( cla.isOptionSet( GT_LAST ) ) {
239 ForesterUtil.fatalError( "no gene tree range option for SDIR algorithm" );
241 if ( !cla.isOptionHasAValue( GT_LAST ) ) {
242 ForesterUtil.fatalError( "no value for -" + GT_LAST );
245 gt_last = cla.getOptionValueAsInt( GT_LAST );
247 catch ( final IOException e ) {
248 ForesterUtil.fatalError( "could not parse integer for -" + GT_LAST + " option" );
251 ForesterUtil.fatalError( "attempt to set index of last tree to analyze to: " + gt_last );
254 if ( ( ( gt_last != RIO.DEFAULT_RANGE ) && ( gt_first != RIO.DEFAULT_RANGE ) ) && ( ( gt_last < gt_first ) ) ) {
255 ForesterUtil.fatalError( "attempt to set range (0-based) of gene to analyze to: from " + gt_first + " to "
258 double ortholog_group_cutoff = ORTHOLOG_GROUPS_CUTOFF_DEFAULT;
259 if ( cla.isOptionSet( ORTHOLOG_GROUPS_CUTOFF_OPTION ) ) {
261 ForesterUtil.fatalError( "ortholog groups cutoff for SDIR algorithm" );
263 if ( !cla.isOptionHasAValue( ORTHOLOG_GROUPS_CUTOFF_OPTION ) ) {
264 ForesterUtil.fatalError( "no value for -" + ORTHOLOG_GROUPS_CUTOFF_OPTION );
267 ortholog_group_cutoff = cla.getOptionValueAsDouble( ORTHOLOG_GROUPS_CUTOFF_OPTION );
269 catch ( final IOException e ) {
270 ForesterUtil.fatalError( "could not parse double for -" + ORTHOLOG_GROUPS_CUTOFF_OPTION + " option" );
272 if ( ortholog_group_cutoff < 0 ) {
273 ForesterUtil.fatalError( "attempt to set ortholog groups cutoff to: " + ortholog_group_cutoff );
275 if ( ortholog_group_cutoff > 1 ) {
276 ForesterUtil.fatalError( "attempt to set ortholog groups cutoff to: " + ortholog_group_cutoff );
280 ForesterUtil.fatalErrorIfFileNotReadable( gene_trees_file );
282 final String gene_trees_suffix;
283 if ( cla.isOptionSet( GENE_TREES_SUFFIX_OPTION ) ) {
285 ForesterUtil.fatalError( "no gene tree suffix option when operating on indivual gene trees" );
287 if ( !cla.isOptionHasAValue( GENE_TREES_SUFFIX_OPTION ) ) {
288 ForesterUtil.fatalError( "no value for -" + GENE_TREES_SUFFIX_OPTION );
290 gene_trees_suffix = cla.getOptionValueAsCleanString( GENE_TREES_SUFFIX_OPTION );
293 gene_trees_suffix = ".mlt";
295 ForesterUtil.fatalErrorIfFileNotReadable( species_tree_file );
296 if ( !use_dir && orthology_outtable.exists() ) {
297 ForesterUtil.fatalError( "\"" + orthology_outtable + "\" already exists" );
302 System.out.println( "Gene trees in-dir :\t" + indir.getCanonicalPath() );
303 System.out.println( "Gene trees suffix :\t" + gene_trees_suffix );
306 System.out.println( "Gene trees :\t" + gene_trees_file.getCanonicalPath() );
308 System.out.println( "Species tree :\t" + species_tree_file.getCanonicalPath() );
310 catch ( final IOException e ) {
311 ForesterUtil.fatalError( e.getLocalizedMessage() );
314 System.out.println( "Out-dir :\t" + outdir );
317 System.out.println( "All vs all orthology results table :\t" + orthology_outtable );
319 if ( logfile != null ) {
320 System.out.println( "Logfile :\t" + logfile );
322 System.out.println( "Ortholog groups cutoff :\t" + ortholog_group_cutoff );
323 if ( gt_first != RIO.DEFAULT_RANGE ) {
324 System.out.println( "First gene tree to analyze :\t" + gt_first );
326 if ( gt_last != RIO.DEFAULT_RANGE ) {
327 System.out.println( "Last gene tree to analyze :\t" + gt_last );
329 String rerooting_str = "";
330 switch ( rerooting ) {
332 rerooting_str = "by minimizing duplications";
336 rerooting_str = "by midpoint method";
340 rerooting_str = "by outgroup: " + outgroup;
344 rerooting_str = "none";
348 System.out.println( "Re-rooting : \t" + rerooting_str );
350 System.out.println( "Non binary species tree :\tallowed" );
353 System.out.println( "Non binary species tree :\tdisallowed" );
355 time = System.currentTimeMillis();
356 final ALGORITHM algorithm;
358 algorithm = ALGORITHM.SDIR;
361 algorithm = ALGORITHM.GSDIR;
363 EasyWriter log = null;
365 if ( outdir.exists() ) {
366 if ( !outdir.isDirectory() ) {
367 ForesterUtil.fatalError( PRG_NAME,
368 "out-directory [" + outdir + "] already exists but is not a directory" );
372 final boolean success = outdir.mkdirs();
374 ForesterUtil.fatalError( PRG_NAME, "could not create out-directory [" + outdir + "]" );
377 final String species_tree_file_name = species_tree_file.getName();
378 final File gene_trees_files[] = indir.listFiles( new FilenameFilter() {
381 public boolean accept( final File dir, final String name ) {
382 return ( ( name.endsWith( gene_trees_suffix ) ) && !( name.equals( species_tree_file_name ) ) );
385 if ( gene_trees_files.length < 1 ) {
386 ForesterUtil.fatalError( PRG_NAME,
387 "in-directory [" + indir
388 + "] does not contain any gene tree files with suffix "
389 + gene_trees_suffix );
392 log = ForesterUtil.createEasyWriter( logfile );
394 catch ( final IOException e ) {
395 ForesterUtil.fatalError( PRG_NAME, "could not create [" + logfile + "]" );
397 Arrays.sort( gene_trees_files );
399 log.print( "# program" );
401 log.print( PRG_NAME );
403 log.print( "# version" );
405 log.print( PRG_VERSION );
407 log.print( "# date" );
409 log.print( PRG_DATE );
411 log.print( "# Algorithm " );
413 log.print( algorithm.toString() );
415 log.print( "# Gene trees in-dir" );
417 log.print( indir.getCanonicalPath() );
419 log.print( "# Gene trees suffix" );
421 log.print( gene_trees_suffix );
423 log.print( "# Species tree" );
425 log.print( species_tree_file.getCanonicalPath() );
427 log.print( "# Out-dir" );
429 log.print( outdir.getCanonicalPath() );
431 log.print( "# Logfile" );
433 log.print( logfile.getCanonicalPath() );
435 log.print( "# Ortholog groups cutoff" );
437 log.print( Double.toString( ortholog_group_cutoff ) );
439 if ( gt_first != RIO.DEFAULT_RANGE ) {
440 log.print( "# First gene tree to analyze" );
442 log.print( Integer.toString( gt_first ) );
445 if ( gt_last != RIO.DEFAULT_RANGE ) {
446 log.print( "# Last gene tree to analyze" );
448 log.print( Integer.toString( gt_last ) );
451 log.print( "# Re-rooting" );
453 log.print( rerooting_str );
455 log.print( "# Non binary species tree" );
458 log.print( "allowed" );
461 log.print( "disallowed" );
467 log.print( "EXT NODES" );
469 log.print( ortholog_group_cutoff + " O GROUPS" );
471 log.print( "0.05 O GROUPS" );
473 log.print( "0.25 O GROUPS" );
475 log.print( "0.5 O GROUPS" );
477 log.print( "0.75 O GROUPS" );
479 log.print( "0.95 O GROUPS" );
481 log.print( "MEDIAN DUP" );
483 log.print( "MEAN DUP" );
485 log.print( "MEAN DUP SD" );
487 log.print( "MIN DUP" );
489 log.print( "MAX DUP" );
491 log.print( "REMOVED EXT NODES" );
496 catch ( IOException e ) {
497 ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() );
500 for( final File gf : gene_trees_files ) {
501 String outname = gf.getName();
504 System.out.print( "\r" + counter + "/" + gene_trees_files.length + ": " + outname );
506 if ( outname.indexOf( "." ) > 0 ) {
507 outname = outname.substring( 0, outname.lastIndexOf( "." ) );
512 new File( outdir.getCanonicalFile() + "/" + outname + ORTHO_OUTTABLE_SUFFIX ),
513 new File( outdir.getCanonicalFile() + "/" + outname + ORTHOLOG_GROUPS_SUFFIX ),
514 new File( outdir.getCanonicalFile() + "/" + outname + LOGFILE_SUFFIX ),
519 new File( outdir.getCanonicalFile() + "/" + outname
520 + STRIPPED_SPECIES_TREE_SUFFIX ),
521 new File( outdir.getCanonicalFile() + "/" + outname
522 + OUT_MIN_DUP_GENE_TREE_SUFFIX ),
523 new File( outdir.getCanonicalFile() + "/" + outname
524 + OUT_MED_DUP_GENE_TREE_SUFFIX ),
529 ortholog_group_cutoff );
531 catch ( IOException e ) {
532 ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() );
537 System.out.println();
540 String outname = orthology_outtable.toString();
541 if ( outname.indexOf( "." ) > 0 ) {
542 outname = outname.substring( 0, outname.lastIndexOf( "." ) );
544 executeAnalysis( gene_trees_file,
547 new File( outname + ORTHOLOG_GROUPS_SUFFIX ),
553 new File( outname + STRIPPED_SPECIES_TREE_SUFFIX ),
554 new File( outname + OUT_MIN_DUP_GENE_TREE_SUFFIX ),
555 new File( outname + OUT_MED_DUP_GENE_TREE_SUFFIX ),
556 algorithm == ALGORITHM.GSDIR,
560 ortholog_group_cutoff );
563 time = System.currentTimeMillis() - time;
564 System.out.println( "Time :\t" + time + "ms" );
570 catch ( IOException e ) {
571 ForesterUtil.fatalError( PRG_NAME, e.getLocalizedMessage() );
573 time = System.currentTimeMillis() - time;
574 System.out.println( "Time :\t" + time + "ms" );
579 private static final void executeAnalysis( final File gene_trees_file,
580 final File species_tree_file,
581 final File orthology_outtable,
582 final File orthology_groups_outfile,
584 final String outgroup,
585 final REROOTING rerooting,
588 final File return_species_tree,
589 final File return_min_dup_gene_tree,
590 final File return_median_dup_gene_tree,
591 final boolean transfer_taxonomy,
592 final ALGORITHM algorithm,
593 final boolean use_gene_trees_dir,
594 final EasyWriter log,
595 final double ortholog_group_cutoff ) {
598 boolean iterating = false;
599 final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
600 if ( p instanceof PhyloXmlParser ) {
601 rio = RIO.executeAnalysis( gene_trees_file,
614 if ( p instanceof NHXParser ) {
615 final NHXParser nhx = ( NHXParser ) p;
616 nhx.setReplaceUnderscores( false );
617 nhx.setIgnoreQuotes( true );
618 nhx.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
620 else if ( p instanceof NexusPhylogeniesParser ) {
621 final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p;
622 nex.setReplaceUnderscores( false );
623 nex.setIgnoreQuotes( true );
624 nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
627 throw new RuntimeException( "unknown parser type: " + p );
629 final IteratingPhylogenyParser ip = ( IteratingPhylogenyParser ) p;
630 ip.setSource( gene_trees_file );
631 rio = RIO.executeAnalysis( ip,
642 if ( !use_gene_trees_dir ) {
643 if ( algorithm == ALGORITHM.GSDIR ) {
644 System.out.println( "Taxonomy linking based on :\t" + rio.getGSDIRtaxCompBase() );
649 m = rio.getOrthologTable();
652 m = RIO.calculateOrthologTable( rio.getAnalyzedGeneTrees(), true );
654 final BasicDescriptiveStatistics stats = rio.getDuplicationsStatistics();
655 writeTable( orthology_outtable, stats.getN(), m, !use_gene_trees_dir );
656 final int ortholog_groups = writeOrtologGroups( orthology_groups_outfile,
657 ortholog_group_cutoff,
662 final int ortholog_groups_005 = writeOrtologGroups( null, 0.05, stats.getN(), m, false, true );
663 final int ortholog_groups_025 = writeOrtologGroups( null, 0.25, stats.getN(), m, false, true );
664 final int ortholog_groups_05 = writeOrtologGroups( null, 0.5, stats.getN(), m, false, true );
665 final int ortholog_groups_075 = writeOrtologGroups( null, 0.75, stats.getN(), m, false, true );
666 final int ortholog_groups_095 = writeOrtologGroups( null, 0.95, stats.getN(), m, false, true );
667 if ( ( algorithm != ALGORITHM.SDIR ) && ( logfile != null ) ) {
668 writeLogFile( logfile,
676 ForesterUtil.getForesterLibraryInformation(),
677 !use_gene_trees_dir );
679 if ( return_species_tree != null ) {
680 writeTree( rio.getSpeciesTree(),
682 use_gene_trees_dir ? null : "Wrote (stripped) species tree to :\t" );
684 if ( return_min_dup_gene_tree != null && rio.getMinDuplicationsGeneTree() != null ) {
685 final int min = ( int ) rio.getDuplicationsStatistics().getMin();
686 writeTree( rio.getMinDuplicationsGeneTree(),
687 new File( return_min_dup_gene_tree.toString() + min + ".xml" ),
688 use_gene_trees_dir ? null : "Wrote one min duplication gene tree :\t" );
690 if ( return_median_dup_gene_tree != null && rio.getDuplicationsToTreeMap() != null ) {
691 final int med = ( int ) rio.getDuplicationsStatistics().median();
692 writeTree( rio.getDuplicationsToTreeMap().get( med ),
693 new File( return_median_dup_gene_tree.toString() + med + ".xml" ),
694 use_gene_trees_dir ? null : "Wrote one med duplication gene tree :\t" );
696 final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.##" );
697 final int min = ( int ) stats.getMin();
698 final int max = ( int ) stats.getMax();
699 final int median = ( int ) stats.median();
702 int median_count = 0;
703 for( double d : stats.getData() ) {
704 if ( ( ( int ) d ) == min ) {
707 if ( ( ( int ) d ) == max ) {
710 if ( ( ( int ) d ) == median ) {
714 final double min_count_percentage = ( 100.0 * min_count ) / stats.getN();
715 final double max_count_percentage = ( 100.0 * max_count ) / stats.getN();
716 final double median_count_percentage = ( 100.0 * median_count ) / stats.getN();
717 if ( use_gene_trees_dir ) {
718 String name = gene_trees_file.getName();
719 if ( name.indexOf( "." ) > 0 ) {
720 name = name.substring( 0, name.lastIndexOf( "." ) );
724 log.print( Integer.toString( rio.getExtNodesOfAnalyzedGeneTrees() ) );
726 log.print( Integer.toString( ortholog_groups ) );
729 log.print( Integer.toString( ortholog_groups_005 ) );
731 log.print( Integer.toString( ortholog_groups_025 ) );
733 log.print( Integer.toString( ortholog_groups_05 ) );
735 log.print( Integer.toString( ortholog_groups_075 ) );
737 log.print( Integer.toString( ortholog_groups_095 ) );
740 if ( stats.getN() > 3 ) {
741 log.print( df.format( median ) );
747 log.print( df.format( stats.arithmeticMean() ) );
749 if ( stats.getN() > 3 ) {
750 log.print( df.format( stats.sampleStandardDeviation() ) );
756 log.print( Integer.toString( min ) );
758 log.print( Integer.toString( max ) );
760 log.print( Integer.toString( rio.getRemovedGeneTreeNodes().size() ) );
762 log.print( Integer.toString( stats.getN() ) );
766 System.out.println( "Gene tree internal nodes :\t" + rio.getIntNodesOfAnalyzedGeneTrees() );
767 System.out.println( "Gene tree external nodes :\t" + rio.getExtNodesOfAnalyzedGeneTrees() );
768 System.out.println( "Mean number of duplications :\t" + df.format( stats.arithmeticMean() )
769 + "\t" + df.format( ( 100.0 * stats.arithmeticMean() ) / rio.getIntNodesOfAnalyzedGeneTrees() )
770 + "%\t(sd: " + df.format( stats.sampleStandardDeviation() ) + ")" );
771 if ( stats.getN() > 3 ) {
772 System.out.println( "Median number of duplications :\t" + df.format( median ) + "\t"
773 + df.format( ( 100.0 * median ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
775 System.out.println( "Minimum duplications :\t" + min + "\t"
776 + df.format( ( 100.0 * min ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
777 System.out.println( "Maximum duplications :\t" + ( int ) max + "\t"
778 + df.format( ( 100.0 * max ) / rio.getIntNodesOfAnalyzedGeneTrees() ) + "%" );
779 System.out.println( "Gene trees with median duplications :\t" + median_count + "\t"
780 + df.format( median_count_percentage ) + "%" );
781 System.out.println( "Gene trees with minimum duplications:\t" + min_count + "\t"
782 + df.format( min_count_percentage ) + "%" );
783 System.out.println( "Gene trees with maximum duplications:\t" + max_count + "\t"
784 + df.format( max_count_percentage ) + "%" );
785 if ( algorithm == ALGORITHM.GSDIR ) {
786 System.out.println( "Removed ext gene tree nodes :\t"
787 + rio.getRemovedGeneTreeNodes().size() );
791 catch ( final RIOException e ) {
792 ForesterUtil.fatalError( e.getLocalizedMessage() );
794 catch ( final SDIException e ) {
795 ForesterUtil.fatalError( e.getLocalizedMessage() );
797 catch ( final IOException e ) {
798 ForesterUtil.fatalError( e.getLocalizedMessage() );
800 catch ( final OutOfMemoryError e ) {
801 ForesterUtil.outOfMemoryError( e );
803 catch ( final Exception e ) {
804 ForesterUtil.unexpectedFatalError( e );
806 catch ( final Error e ) {
807 ForesterUtil.unexpectedFatalError( e );
811 private final static void printHelp() {
812 System.out.println( "Usage" );
813 System.out.println();
814 System.out.println( PRG_NAME
815 + " [options] <gene trees infile> <species tree infile> <all vs all orthology table outfile> [logfile]" );
816 System.out.println();
817 System.out.println( PRG_NAME + " [options] <gene trees indir> <species tree infile> <outdir> <logfile>" );
818 System.out.println();
819 System.out.println();
820 System.out.println( " Options" );
821 System.out.println( " -" + GT_FIRST + "=<first> : first gene tree to analyze (0-based index)" );
822 System.out.println( " -" + GT_LAST + "=<last> : last gene tree to analyze (0-based index)" );
823 System.out.println( " -" + ORTHOLOG_GROUPS_CUTOFF_OPTION
824 + "=<cutoff> : cutoff value for ortholog groups (default: " + ORTHOLOG_GROUPS_CUTOFF_DEFAULT + ")" );
825 System.out.println( " -" + REROOTING_OPT
826 + "=<re-rooting>: re-rooting method for gene trees, possible values or 'none', 'midpoint'," );
827 System.out.println( " or 'outgroup' (default: by minizming duplications)" );
828 System.out.println( " -" + OUTGROUP
829 + "=<outgroup> : for rooting by outgroup, name of outgroup (external gene tree node)" );
830 System.out.println( " -" + USE_SDIR
831 + " : to use SDIR instead of GSDIR (faster, but non-binary species trees are" );
832 System.out.println( " disallowed, as are most options)" );
833 System.out.println( " -" + GENE_TREES_SUFFIX_OPTION
834 + "=<suffix> : suffix for gene trees when operating on gene tree directories (default: .mlt)" );
835 System.out.println();
836 System.out.println( " Formats" );
838 .println( " The gene trees, as well as the species tree, ideally are in phyloXML (www.phyloxml.org) format," );
840 .println( " but can also be in New Hamphshire (Newick) or Nexus format as long as species information can be" );
842 .println( " extracted from the gene names (e.g. \"HUMAN\" from \"BCL2_HUMAN\") and matched to a single species" );
843 System.out.println( " in the species tree." );
844 System.out.println();
845 System.out.println( " Examples" );
846 System.out.println( " rio -s gene_trees.nh species.xml outtable.tsv" );
847 System.out.println( " rio gene_trees.nh species.xml outtable.tsv log.txt" );
848 System.out.println( " rio -c=0.9 -f=10 -l=100 -r=none gene_trees.xml species.xml outtable.tsv log.txt" );
849 System.out.println( " rio -g=.xml gene_trees_dir species.xml out_dir log.tsv" );
850 System.out.println();
854 private static void writeLogFile( final File logfile,
856 final File species_tree_file,
857 final File gene_trees_file,
859 final String prg_name,
861 final String prg_date,
863 final boolean verbose )
865 final EasyWriter out = ForesterUtil.createEasyWriter( logfile );
866 out.println( "# " + prg_name );
867 out.println( "# version : " + prg_v );
868 out.println( "# date : " + prg_date );
869 out.println( "# based on: " + f );
870 out.println( "# ----------------------------------" );
871 out.println( "Gene trees :\t" + gene_trees_file.getCanonicalPath() );
872 out.println( "Species tree :\t" + species_tree_file.getCanonicalPath() );
873 out.println( "All vs all orthology table :\t" + outtable.getCanonicalPath() );
875 out.println( rio.getLog().toString() );
878 System.out.println( "Wrote log to :\t" + logfile.getCanonicalPath() );
882 private static final void writeTable( final File table_outfile,
883 final int gene_trees_analyzed,
885 final boolean verbose )
887 final EasyWriter w = ForesterUtil.createEasyWriter( table_outfile );
888 final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
889 df.setDecimalSeparatorAlwaysShown( false );
890 df.setRoundingMode( RoundingMode.HALF_UP );
891 for( int i = 0; i < m.size(); ++i ) {
893 w.print( m.getLabel( i ) );
896 for( int x = 0; x < m.size(); ++x ) {
897 w.print( m.getLabel( x ) );
898 for( int y = 0; y < m.size(); ++y ) {
901 if ( m.get( x, y ) != gene_trees_analyzed ) {
902 ForesterUtil.unexpectedFatalError( "diagonal value is off" );
907 w.print( df.format( ( ( double ) m.get( x, y ) ) / gene_trees_analyzed ) );
914 System.out.println( "Wrote table to :\t" + table_outfile.getCanonicalPath() );
918 private static final int writeOrtologGroups( final File outfile,
920 final int gene_trees_analyzed,
922 final boolean verbose,
923 final boolean calc_conly )
925 List<SortedSet<String>> groups = new ArrayList<SortedSet<String>>();
926 BasicDescriptiveStatistics stats = new BasicDescriptiveStatistics();
930 for( int x = 1; x < m.size(); ++x ) {
931 final String a = m.getLabel( x );
932 for( int y = 0; y < x; ++y ) {
933 final String b = m.getLabel( y );
934 final double s = ( ( double ) m.get( x, y ) ) / gene_trees_analyzed;
946 boolean found = false;
947 for( final SortedSet<String> group : groups ) {
948 if ( group.contains( a ) ) {
952 if ( group.contains( b ) ) {
958 final SortedSet<String> new_group = new TreeSet<String>();
961 groups.add( new_group );
966 //Deal with singlets:
967 for( int x = 0; x < m.size(); ++x ) {
968 final String a = m.getLabel( x );
969 boolean found = false;
970 for( final SortedSet<String> group : groups ) {
971 if ( group.contains( a ) ) {
977 final SortedSet<String> new_group = new TreeSet<String>();
979 groups.add( new_group );
983 return groups.size();
985 final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.####" );
986 df.setDecimalSeparatorAlwaysShown( false );
987 df.setRoundingMode( RoundingMode.HALF_UP );
988 final EasyWriter w = ForesterUtil.createEasyWriter( outfile );
990 for( final SortedSet<String> group : groups ) {
991 w.print( Integer.toString( counter++ ) );
992 for( final String s : group ) {
999 w.println( "# Cutoff\t" + df.format( cutoff ) );
1001 w.println( "# Orthology support statistics:" );
1002 if ( stats.getN() > 3 ) {
1003 w.println( "# Median\t" + df.format( stats.median() ) );
1005 w.println( "# Mean\t" + df.format( stats.arithmeticMean() ) );
1006 if ( stats.getN() > 3 ) {
1007 w.println( "# SD\t" + df.format( stats.sampleStandardDeviation() ) );
1009 w.println( "# Min\t" + df.format( stats.getMin() ) );
1010 w.println( "# Max\t" + df.format( stats.getMax() ) );
1011 w.println( "# Total\t" + df.format( stats.getN() ) );
1012 w.println( "# Below 0.75\t" + below_075 + "\t" + df.format( ( 100.0 * below_075 / stats.getN() ) ) + "%" );
1013 w.println( "# Below 0.5\t" + below_05 + "\t" + df.format( ( 100.0 * below_05 / stats.getN() ) ) + "%" );
1014 w.println( "# Below 0.25\t" + below_025 + "\t" + df.format( ( 100.0 * below_025 / stats.getN() ) ) + "%" );
1017 System.out.println( "Number of ortholog groups :\t" + groups.size() );
1018 System.out.println( "Wrote orthologs groups table to :\t" + outfile.getCanonicalPath() );
1020 return groups.size();
1023 private static void writeTree( final Phylogeny p, final File f, final String comment ) throws IOException {
1024 final PhylogenyWriter writer = new PhylogenyWriter();
1025 writer.toPhyloXML( f, p, 0 );
1026 if ( comment != null ) {
1027 System.out.println( comment + f.getCanonicalPath() );