2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // Copyright (C) 2000-2001 Washington University School of Medicine
8 // and Howard Hughes Medical Institute
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
25 // Contact: phylosoft @ gmail . com
26 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
28 package org.forester.rio;
31 import java.io.FileNotFoundException;
32 import java.io.IOException;
33 import java.util.ArrayList;
34 import java.util.Collections;
35 import java.util.HashMap;
36 import java.util.HashSet;
37 import java.util.List;
39 import java.util.SortedSet;
40 import java.util.TreeSet;
42 import org.forester.datastructures.IntMatrix;
43 import org.forester.io.parsers.IteratingPhylogenyParser;
44 import org.forester.io.parsers.PhylogenyParser;
45 import org.forester.io.parsers.nexus.NexusPhylogeniesParser;
46 import org.forester.io.parsers.nhx.NHXParser;
47 import org.forester.io.parsers.nhx.NHXParser.TAXONOMY_EXTRACTION;
48 import org.forester.io.parsers.util.ParserUtils;
49 import org.forester.phylogeny.Phylogeny;
50 import org.forester.phylogeny.PhylogenyMethods;
51 import org.forester.phylogeny.PhylogenyNode;
52 import org.forester.phylogeny.data.Taxonomy;
53 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
54 import org.forester.phylogeny.factories.PhylogenyFactory;
55 import org.forester.sdi.GSDI;
56 import org.forester.sdi.GSDIR;
57 import org.forester.sdi.SDIException;
58 import org.forester.sdi.SDIR;
59 import org.forester.sdi.SDIutil;
60 import org.forester.sdi.SDIutil.ALGORITHM;
61 import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
62 import org.forester.util.BasicDescriptiveStatistics;
63 import org.forester.util.ForesterUtil;
65 public final class RIO {
67 public static final int DEFAULT_RANGE = -1;
68 private static final int END_OF_GT = Integer.MAX_VALUE;
69 private static IntMatrix _m;
70 private Phylogeny[] _analyzed_gene_trees;
71 private List<PhylogenyNode> _removed_gene_tree_nodes;
72 private int _ext_nodes;
73 private int _int_nodes;
74 private TaxonomyComparisonBase _gsdir_tax_comp_base;
75 private final StringBuilder _log;
76 private final BasicDescriptiveStatistics _duplications_stats;
77 private final boolean _produce_log;
78 private final boolean _verbose;
79 private final REROOTING _rerooting;
80 private final Phylogeny _species_tree;
81 private Phylogeny _min_dub_gene_tree;
83 private RIO( final IteratingPhylogenyParser p,
84 final Phylogeny species_tree,
85 final ALGORITHM algorithm,
86 final REROOTING rerooting,
87 final String outgroup,
90 final boolean produce_log,
91 final boolean verbose,
92 final boolean transfer_taxonomy )
93 throws IOException, SDIException, RIOException {
94 if ( ( last == DEFAULT_RANGE ) && ( first >= 0 ) ) {
97 else if ( ( first == DEFAULT_RANGE ) && ( last >= 0 ) ) {
100 removeSingleDescendentsNodes( species_tree, verbose );
102 checkPreconditions( p, species_tree, rerooting, outgroup, first, last );
103 _produce_log = produce_log;
105 _rerooting = rerooting;
108 _log = new StringBuilder();
109 _gsdir_tax_comp_base = null;
110 _analyzed_gene_trees = null;
111 _removed_gene_tree_nodes = null;
112 _duplications_stats = new BasicDescriptiveStatistics();
114 inferOrthologs( p, species_tree, algorithm, outgroup, first, last, transfer_taxonomy );
115 _species_tree = species_tree;
118 private RIO( final Phylogeny[] gene_trees,
119 final Phylogeny species_tree,
120 final ALGORITHM algorithm,
121 final REROOTING rerooting,
122 final String outgroup,
125 final boolean produce_log,
126 final boolean verbose,
127 final boolean transfer_taxonomy )
128 throws IOException, SDIException, RIOException {
129 if ( ( last == DEFAULT_RANGE ) && ( first >= 0 ) ) {
130 last = gene_trees.length - 1;
132 else if ( ( first == DEFAULT_RANGE ) && ( last >= 0 ) ) {
135 removeSingleDescendentsNodes( species_tree, verbose );
136 checkPreconditions( gene_trees, species_tree, rerooting, outgroup, first, last );
137 _produce_log = produce_log;
139 _rerooting = rerooting;
142 _log = new StringBuilder();
143 _gsdir_tax_comp_base = null;
144 _analyzed_gene_trees = null;
145 _removed_gene_tree_nodes = null;
146 _duplications_stats = new BasicDescriptiveStatistics();
147 inferOrthologs( gene_trees, species_tree, algorithm, outgroup, first, last, transfer_taxonomy );
148 _species_tree = species_tree;
151 public final Phylogeny[] getAnalyzedGeneTrees() {
152 return _analyzed_gene_trees;
155 public final BasicDescriptiveStatistics getDuplicationsStatistics() {
156 return _duplications_stats;
160 * Returns the numbers of number of ext nodes in gene trees analyzed (after
163 * @return number of ext nodes in gene trees analyzed (after stripping)
165 public final int getExtNodesOfAnalyzedGeneTrees() {
169 public final TaxonomyComparisonBase getGSDIRtaxCompBase() {
170 return _gsdir_tax_comp_base;
174 * Returns the numbers of number of int nodes in gene trees analyzed (after
177 * @return number of int nodes in gene trees analyzed (after stripping)
179 public final int getIntNodesOfAnalyzedGeneTrees() {
183 public final StringBuilder getLog() {
187 final public Phylogeny getMinDuplicationsGeneTree() {
188 return _min_dub_gene_tree;
191 public final IntMatrix getOrthologTable() {
195 public final List<PhylogenyNode> getRemovedGeneTreeNodes() {
196 return _removed_gene_tree_nodes;
199 public final Phylogeny getSpeciesTree() {
200 return _species_tree;
203 private final void inferOrthologs( final IteratingPhylogenyParser parser,
204 final Phylogeny species_tree,
205 final ALGORITHM algorithm,
206 final String outgroup,
209 final boolean transfer_taxonomy )
210 throws SDIException, RIOException, FileNotFoundException, IOException {
211 if ( !parser.hasNext() ) {
212 throw new RIOException( "no gene trees to analyze" );
215 preLog( -1, species_tree, algorithm, outgroup );
218 System.out.println();
220 int gene_tree_ext_nodes = 0;
223 final boolean no_range = ( first < 0 ) || ( last < first );
224 while ( parser.hasNext() ) {
225 final Phylogeny gt = parser.next();
226 if ( no_range || ( ( i >= first ) && ( i <= last ) ) ) {
227 if ( gt.isEmpty() ) {
228 throw new RIOException( "gene tree #" + i + " is empty" );
230 if ( gt.getNumberOfExternalNodes() == 1 ) {
231 throw new RIOException( "gene tree #" + i + " has only one external node" );
234 System.out.print( "\r" + i );
236 if ( counter == 0 ) {
237 if ( algorithm == ALGORITHM.SDIR ) {
238 // Removes from species_tree all species not found in gene_tree.
239 PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gt, species_tree );
240 if ( species_tree.isEmpty() ) {
241 throw new RIOException( "failed to establish species based mapping between gene and species trees" );
244 gene_tree_ext_nodes = gt.getNumberOfExternalNodes();
246 else if ( gene_tree_ext_nodes != gt.getNumberOfExternalNodes() ) {
247 throw new RIOException( "gene tree #" + i + " has a different number of external nodes ("
248 + gt.getNumberOfExternalNodes() + ") than the preceding gene tree(s) ("
249 + gene_tree_ext_nodes + ")" );
251 if ( algorithm == ALGORITHM.SDIR ) {
252 // Removes from gene_tree all species not found in species_tree.
253 PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt );
254 if ( gt.isEmpty() ) {
255 throw new RIOException( "failed to establish species based mapping between gene and species trees" );
258 final Phylogeny analyzed_gt = performOrthologInference( gt,
264 RIO.calculateOrthologTable( analyzed_gt, true, counter );
270 System.out.print( "\rGene trees analyzed :\t" + counter );
272 if ( ( first >= 0 ) && ( counter == 0 ) && ( i > 0 ) ) {
273 throw new RIOException( "attempt to analyze first gene tree #" + first + " in a set of " + i );
279 postLog( species_tree, first, ( first + counter ) - 1 );
282 System.out.println();
283 System.out.println();
287 private final void inferOrthologs( final Phylogeny[] gene_trees,
288 final Phylogeny species_tree,
289 final ALGORITHM algorithm,
290 final String outgroup,
293 final boolean transfer_taxonomy )
294 throws SDIException, RIOException, FileNotFoundException, IOException {
295 if ( algorithm == ALGORITHM.SDIR ) {
296 // Removes from species_tree all species not found in gene_tree.
297 PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gene_trees[ 0 ], species_tree );
298 if ( species_tree.isEmpty() ) {
299 throw new RIOException( "failed to establish species based mapping between gene and species trees" );
302 final Phylogeny[] my_gene_trees;
303 if ( ( first >= 0 ) && ( last >= first ) && ( last < gene_trees.length ) ) {
304 my_gene_trees = new Phylogeny[ ( 1 + last ) - first ];
306 for( int i = first; i <= last; ++i ) {
307 my_gene_trees[ c++ ] = gene_trees[ i ];
311 my_gene_trees = gene_trees;
314 preLog( gene_trees.length, species_tree, algorithm, outgroup );
316 if ( _verbose && ( my_gene_trees.length >= 4 ) ) {
317 System.out.println();
319 _analyzed_gene_trees = new Phylogeny[ my_gene_trees.length ];
320 int gene_tree_ext_nodes = 0;
321 for( int i = 0; i < my_gene_trees.length; ++i ) {
322 final Phylogeny gt = my_gene_trees[ i ];
323 if ( gt.isEmpty() ) {
324 throw new RIOException( "gene tree #" + i + " is empty" );
326 if ( gt.getNumberOfExternalNodes() == 1 ) {
327 throw new RIOException( "gene tree #" + i + " has only one external node" );
329 if ( _verbose && ( my_gene_trees.length > 4 ) ) {
330 ForesterUtil.updateProgress( ( ( double ) i ) / my_gene_trees.length );
333 gene_tree_ext_nodes = gt.getNumberOfExternalNodes();
335 else if ( gene_tree_ext_nodes != gt.getNumberOfExternalNodes() ) {
336 throw new RIOException( "gene tree #" + i + " has a different number of external nodes ("
337 + gt.getNumberOfExternalNodes() + ") than the preceding gene tree(s) (" + gene_tree_ext_nodes
340 if ( algorithm == ALGORITHM.SDIR ) {
341 // Removes from gene_tree all species not found in species_tree.
342 PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt );
343 if ( gt.isEmpty() ) {
344 throw new RIOException( "failed to establish species based mapping between gene and species trees" );
347 _analyzed_gene_trees[ i ] = performOrthologInference( gt,
355 postLog( species_tree, first, last );
357 if ( _verbose && ( my_gene_trees.length > 4 ) ) {
358 System.out.println();
359 System.out.println();
363 private final boolean log() {
367 private final void log( final String s ) {
369 _log.append( ForesterUtil.LINE_SEPARATOR );
372 private final void logRemovedGeneTreeNodes() {
373 final SortedSet<String> rn = new TreeSet<String>();
374 for( final PhylogenyNode n : getRemovedGeneTreeNodes() ) {
375 final Taxonomy t = n.getNodeData().getTaxonomy();
376 switch ( getGSDIRtaxCompBase() ) {
378 rn.add( t.getTaxonomyCode() );
382 rn.add( t.getIdentifier().toString() );
385 case SCIENTIFIC_NAME: {
386 rn.add( t.getScientificName() );
391 final StringBuilder sb = new StringBuilder();
392 for( final String s : rn ) {
396 log( "Species stripped from gene trees :" + sb );
399 private final Phylogeny performOrthologInference( final Phylogeny gene_tree,
400 final Phylogeny species_tree,
401 final ALGORITHM algorithm,
402 final String outgroup,
404 final boolean transfer_taxonomy )
405 throws SDIException, RIOException {
406 final Phylogeny assigned_tree;
407 switch ( algorithm ) {
409 assigned_tree = performOrthologInferenceBySDI( gene_tree, species_tree );
413 assigned_tree = performOrthologInferenceByGSDI( gene_tree,
421 throw new IllegalArgumentException( "illegal algorithm: " + algorithm );
425 _ext_nodes = assigned_tree.getNumberOfExternalNodes();
426 _int_nodes = assigned_tree.getNumberOfInternalNodes();
428 else if ( _ext_nodes != assigned_tree.getNumberOfExternalNodes() ) {
429 throw new RIOException( "after stripping gene tree #" + i + " has a different number of external nodes ("
430 + assigned_tree.getNumberOfExternalNodes() + ") than the preceding gene tree(s) (" + _ext_nodes
433 return assigned_tree;
436 private final Phylogeny performOrthologInferenceByGSDI( final Phylogeny gene_tree,
437 final Phylogeny species_tree,
438 final String outgroup,
440 final boolean transfer_taxonomy )
441 throws SDIException, RIOException {
442 final Phylogeny assigned_tree;
444 if ( _rerooting == REROOTING.BY_ALGORITHM ) {
445 final GSDIR gsdir = new GSDIR( gene_tree, species_tree, true, i == 0, transfer_taxonomy );
446 assigned_tree = gsdir.getMinDuplicationsSumGeneTree();
448 _removed_gene_tree_nodes = gsdir.getStrippedExternalGeneTreeNodes();
449 for( final PhylogenyNode r : _removed_gene_tree_nodes ) {
450 if ( !r.getNodeData().isHasTaxonomy() ) {
451 throw new RIOException( "node with no (appropriate) taxonomic information found in gene tree #"
452 + i + ": " + r.toString() );
457 _gsdir_tax_comp_base = gsdir.getTaxCompBase();
459 dups = gsdir.getMinDuplicationsSum();
462 if ( _rerooting == REROOTING.MIDPOINT ) {
463 PhylogenyMethods.midpointRoot( gene_tree );
465 else if ( _rerooting == REROOTING.OUTGROUP ) {
466 final PhylogenyNode n = gene_tree.getNode( outgroup );
467 gene_tree.reRoot( n );
469 final GSDI gsdi = new GSDI( gene_tree, species_tree, true, true, true, transfer_taxonomy );
470 _removed_gene_tree_nodes = gsdi.getStrippedExternalGeneTreeNodes();
471 for( final PhylogenyNode r : _removed_gene_tree_nodes ) {
472 if ( !r.getNodeData().isHasTaxonomy() ) {
473 throw new RIOException( "node with no (appropriate) taxonomic information found in gene tree #" + i
474 + ": " + r.toString() );
477 assigned_tree = gene_tree;
479 _gsdir_tax_comp_base = gsdi.getTaxCompBase();
481 dups = gsdi.getDuplicationsSum();
483 if ( ( i == 0 ) || ( dups < _duplications_stats.getMin() ) ) {
484 _min_dub_gene_tree = assigned_tree;
485 _min_dub_gene_tree.setRerootable( false );
487 _duplications_stats.addValue( dups );
488 return assigned_tree;
491 private final Phylogeny performOrthologInferenceBySDI( final Phylogeny gene_tree, final Phylogeny species_tree )
492 throws SDIException {
493 final SDIR sdir = new SDIR();
494 return sdir.infer( gene_tree, species_tree, false, true, true, true, 1 )[ 0 ];
497 private final void postLog( final Phylogeny species_tree, final int first, final int last ) {
498 final java.text.DecimalFormat df = new java.text.DecimalFormat( "0.##" );
499 final int min = ( int ) getDuplicationsStatistics().getMin();
500 final int max = ( int ) getDuplicationsStatistics().getMax();
501 final int median = ( int ) getDuplicationsStatistics().median();
504 int median_count = 0;
505 for( double d : getDuplicationsStatistics().getData() ) {
506 if ( ( ( int ) d ) == min ) {
509 if ( ( ( int ) d ) == max ) {
512 if ( ( ( int ) d ) == median ) {
516 final double min_count_percentage = ( 100.0 * min_count ) / getDuplicationsStatistics().getN();
517 final double max_count_percentage = ( 100.0 * max_count ) / getDuplicationsStatistics().getN();
518 final double median_count_percentage = ( 100.0 * median_count ) / getDuplicationsStatistics().getN();
519 if ( ( getRemovedGeneTreeNodes() != null ) && ( getRemovedGeneTreeNodes().size() > 0 ) ) {
520 logRemovedGeneTreeNodes();
522 log( "Gene trees analyzed :\t" + getDuplicationsStatistics().getN() );
523 if ( ( first >= 0 ) && ( last >= 0 ) ) {
524 log( "Gene trees analyzed range :\t" + first + "-" + last );
526 log( "Gene tree internal nodes :\t" + getIntNodesOfAnalyzedGeneTrees() );
527 log( "Gene tree external nodes :\t" + getExtNodesOfAnalyzedGeneTrees() );
528 log( "Removed ext gene tree nodes :\t" + getRemovedGeneTreeNodes().size() );
529 log( "Spec tree ext nodes (after strip) :\t" + species_tree.getNumberOfExternalNodes() );
530 log( "Spec tree polytomies (after strip) :\t" + PhylogenyMethods.countNumberOfPolytomies( species_tree ) );
531 log( "Taxonomy linking based on :\t" + getGSDIRtaxCompBase() );
532 log( "Mean number of duplications :\t" + df.format( getDuplicationsStatistics().arithmeticMean() )
534 + df.format( ( 100.0 * getDuplicationsStatistics().arithmeticMean() )
535 / getIntNodesOfAnalyzedGeneTrees() )
536 + "%\t(sd: " + df.format( getDuplicationsStatistics().sampleStandardDeviation() ) + ")" );
537 if ( getDuplicationsStatistics().getN() > 3 ) {
538 log( "Median number of duplications :\t" + df.format( median ) + "\t"
539 + df.format( ( 100.0 * median ) / getIntNodesOfAnalyzedGeneTrees() ) + "%" );
541 log( "Minimum duplications :\t" + min + "\t"
542 + df.format( ( 100.0 * min ) / getIntNodesOfAnalyzedGeneTrees() ) + "%" );
543 log( "Maximum duplications :\t" + ( int ) max + "\t"
544 + df.format( ( 100.0 * max ) / getIntNodesOfAnalyzedGeneTrees() ) + "%" );
545 log( "Gene trees with median duplications :\t" + median_count + "\t" + df.format( median_count_percentage )
547 log( "Gene trees with minimum duplications:\t" + min_count + "\t" + df.format( min_count_percentage ) + "%" );
548 log( "Gene trees with maximum duplications:\t" + max_count + "\t" + df.format( max_count_percentage ) + "%" );
551 private final void preLog( final int gene_trees,
552 final Phylogeny species_tree,
553 final ALGORITHM algorithm,
554 final String outgroup ) {
555 if ( gene_trees > 0 ) {
556 log( "Number of gene trees (total) :\t" + gene_trees );
558 log( "Algorithm :\t" + algorithm );
559 log( "Spec tree ext nodes (prior strip) :\t" + species_tree.getNumberOfExternalNodes() );
560 log( "Spec tree polytomies (prior strip) :\t" + PhylogenyMethods.countNumberOfPolytomies( species_tree ) );
562 switch ( _rerooting ) {
564 rs = "minimizing duplications";
572 rs = "outgroup: " + outgroup;
580 log( "Re-rooting :\t" + rs );
583 public final static IntMatrix calculateOrthologTable( final Phylogeny[] analyzed_gene_trees, final boolean sort )
584 throws RIOException {
585 final List<String> labels = new ArrayList<String>();
586 final Set<String> labels_set = new HashSet<String>();
587 for( final PhylogenyNode n : analyzed_gene_trees[ 0 ].getExternalNodes() ) {
588 final String label = obtainLabel( labels_set, n );
589 labels_set.add( label );
593 Collections.sort( labels );
595 final IntMatrix m = new IntMatrix( labels );
597 for( final Phylogeny gt : analyzed_gene_trees ) {
599 updateCounts( m, counter, gt );
604 public final static RIO executeAnalysis( final File gene_trees_file,
605 final File species_tree_file,
606 final ALGORITHM algorithm,
607 final REROOTING rerooting,
608 final String outgroup,
611 final boolean produce_log,
612 final boolean verbose,
613 final boolean transfer_taxonomy )
614 throws IOException, SDIException, RIOException {
615 final Phylogeny[] gene_trees = parseGeneTrees( gene_trees_file );
616 if ( gene_trees.length < 1 ) {
617 throw new RIOException( "\"" + gene_trees_file + "\" is devoid of appropriate gene trees" );
619 final Phylogeny species_tree = SDIutil
620 .parseSpeciesTree( gene_trees[ 0 ], species_tree_file, false, true, TAXONOMY_EXTRACTION.NO );
621 return new RIO( gene_trees,
633 public final static RIO executeAnalysis( final File gene_trees_file,
634 final Phylogeny species_tree,
635 final ALGORITHM algorithm,
636 final REROOTING rerooting,
637 final String outgroup,
638 final boolean produce_log,
639 final boolean verbose,
640 final boolean transfer_taxonomy )
641 throws IOException, SDIException, RIOException {
642 return new RIO( parseGeneTrees( gene_trees_file ),
654 public final static RIO executeAnalysis( final File gene_trees_file,
655 final Phylogeny species_tree,
656 final ALGORITHM algorithm,
657 final REROOTING rerooting,
658 final String outgroup,
661 final boolean produce_log,
662 final boolean verbose,
663 final boolean transfer_taxonomy )
664 throws IOException, SDIException, RIOException {
665 return new RIO( parseGeneTrees( gene_trees_file ),
677 public final static RIO executeAnalysis( final IteratingPhylogenyParser p,
678 final File species_tree_file,
679 final ALGORITHM algorithm,
680 final REROOTING rerooting,
681 final String outgroup,
684 final boolean produce_log,
685 final boolean verbose,
686 final boolean transfer_taxonomy )
687 throws IOException, SDIException, RIOException {
688 final Phylogeny g0 = p.next();
689 if ( ( g0 == null ) || g0.isEmpty() || ( g0.getNumberOfExternalNodes() < 2 ) ) {
690 throw new RIOException( "input file does not seem to contain any gene trees" );
692 final Phylogeny species_tree = SDIutil
693 .parseSpeciesTree( g0, species_tree_file, false, true, TAXONOMY_EXTRACTION.NO );
707 public final static RIO executeAnalysis( final IteratingPhylogenyParser p,
708 final Phylogeny species_tree,
709 final ALGORITHM algorithm,
710 final REROOTING rerooting,
711 final String outgroup,
712 final boolean produce_log,
713 final boolean verbose,
714 final boolean transfer_taxonomy )
715 throws IOException, SDIException, RIOException {
728 public final static RIO executeAnalysis( final IteratingPhylogenyParser p,
729 final Phylogeny species_tree,
730 final ALGORITHM algorithm,
731 final REROOTING rerooting,
732 final String outgroup,
735 final boolean produce_log,
736 final boolean verbose,
737 final boolean transfer_taxonomy )
738 throws IOException, SDIException, RIOException {
751 public final static RIO executeAnalysis( final Phylogeny[] gene_trees, final Phylogeny species_tree )
752 throws IOException, SDIException, RIOException {
753 return new RIO( gene_trees,
756 REROOTING.BY_ALGORITHM,
765 public final static RIO executeAnalysis( final Phylogeny[] gene_trees,
766 final Phylogeny species_tree,
767 final ALGORITHM algorithm,
768 final REROOTING rerooting,
769 final String outgroup,
770 final boolean produce_log,
771 final boolean verbose,
772 final boolean transfer_taxonomy )
773 throws IOException, SDIException, RIOException {
774 return new RIO( gene_trees,
786 public final static RIO executeAnalysis( final Phylogeny[] gene_trees,
787 final Phylogeny species_tree,
788 final ALGORITHM algorithm,
789 final REROOTING rerooting,
790 final String outgroup,
793 final boolean produce_log,
794 final boolean verbose,
795 final boolean transfer_taxonomy )
796 throws IOException, SDIException, RIOException {
797 return new RIO( gene_trees,
809 private final static void calculateOrthologTable( final Phylogeny g, final boolean sort, final int counter )
810 throws RIOException {
811 if ( counter == 0 ) {
812 final List<String> labels = new ArrayList<String>();
813 final Set<String> labels_set = new HashSet<String>();
814 for( final PhylogenyNode n : g.getExternalNodes() ) {
815 final String label = obtainLabel( labels_set, n );
816 labels_set.add( label );
820 Collections.sort( labels );
822 _m = new IntMatrix( labels );
824 updateCounts( _m, counter, g );
827 private final static void checkPreconditions( final IteratingPhylogenyParser p,
828 final Phylogeny species_tree,
829 final REROOTING rerooting,
830 final String outgroup,
833 throws RIOException, IOException {
834 final Phylogeny g0 = p.next();
835 if ( ( g0 == null ) || g0.isEmpty() ) {
836 throw new RIOException( "input file does not seem to contain any gene trees" );
838 if ( g0.getNumberOfExternalNodes() < 2 ) {
839 throw new RIOException( "input file does not seem to contain any useable gene trees" );
841 if ( !species_tree.isRooted() ) {
842 throw new RIOException( "species tree is not rooted" );
844 if ( !( ( last == DEFAULT_RANGE ) && ( first == DEFAULT_RANGE ) )
845 && ( ( last < first ) || ( last < 0 ) || ( first < 0 ) ) ) {
846 throw new RIOException( "attempt to set range (0-based) of gene to analyze to: from " + first + " to "
849 if ( ( rerooting == REROOTING.OUTGROUP ) && ForesterUtil.isEmpty( outgroup ) ) {
850 throw new RIOException( "outgroup not set for midpoint rooting" );
852 if ( ( rerooting != REROOTING.OUTGROUP ) && !ForesterUtil.isEmpty( outgroup ) ) {
853 throw new RIOException( "outgroup only used for midpoint rooting" );
855 if ( ( rerooting == REROOTING.MIDPOINT ) && ( PhylogenyMethods.calculateMaxDistanceToRoot( g0 ) <= 0 ) ) {
856 throw new RIOException( "attempt to use midpoint rooting on gene trees which seem to have no (positive) branch lengths (cladograms)" );
858 if ( rerooting == REROOTING.OUTGROUP ) {
860 g0.getNode( outgroup );
862 catch ( final IllegalArgumentException e ) {
863 throw new RIOException( "cannot perform re-rooting by outgroup: " + e.getLocalizedMessage() );
868 private final static void checkPreconditions( final Phylogeny[] gene_trees,
869 final Phylogeny species_tree,
870 final REROOTING rerooting,
871 final String outgroup,
874 throws RIOException {
875 if ( !species_tree.isRooted() ) {
876 throw new RIOException( "species tree is not rooted" );
878 if ( !( ( last == DEFAULT_RANGE ) && ( first == DEFAULT_RANGE ) )
879 && ( ( last < first ) || ( last >= gene_trees.length ) || ( last < 0 ) || ( first < 0 ) ) ) {
880 throw new RIOException( "attempt to set range (0-based) of gene to analyze to: from " + first + " to "
881 + last + " (out of " + gene_trees.length + ")" );
883 if ( ( rerooting == REROOTING.OUTGROUP ) && ForesterUtil.isEmpty( outgroup ) ) {
884 throw new RIOException( "outgroup not set for midpoint rooting" );
886 if ( ( rerooting != REROOTING.OUTGROUP ) && !ForesterUtil.isEmpty( outgroup ) ) {
887 throw new RIOException( "outgroup only used for midpoint rooting" );
889 if ( ( rerooting == REROOTING.MIDPOINT )
890 && ( PhylogenyMethods.calculateMaxDistanceToRoot( gene_trees[ 0 ] ) <= 0 ) ) {
891 throw new RIOException( "attempt to use midpoint rooting on gene trees which seem to have no (positive) branch lengths (cladograms)" );
893 if ( rerooting == REROOTING.OUTGROUP ) {
895 gene_trees[ 0 ].getNode( outgroup );
897 catch ( final IllegalArgumentException e ) {
898 throw new RIOException( "cannot perform re-rooting by outgroup: " + e.getLocalizedMessage() );
903 private final static String obtainLabel( final Set<String> labels_set, final PhylogenyNode n ) throws RIOException {
905 if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getName() ) ) {
906 label = n.getNodeData().getSequence().getName();
908 else if ( n.getNodeData().isHasSequence()
909 && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getSymbol() ) ) {
910 label = n.getNodeData().getSequence().getSymbol();
912 else if ( n.getNodeData().isHasSequence()
913 && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getGeneName() ) ) {
914 label = n.getNodeData().getSequence().getGeneName();
916 else if ( !ForesterUtil.isEmpty( n.getName() ) ) {
920 throw new RIOException( "node " + n + " has no appropriate label" );
922 if ( labels_set.contains( label ) ) {
923 throw new RIOException( "label " + label + " is not unique" );
928 private final static Phylogeny[] parseGeneTrees( final File gene_trees_file )
929 throws FileNotFoundException, IOException {
930 final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
931 final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
932 if ( p instanceof NHXParser ) {
933 final NHXParser nhx = ( NHXParser ) p;
934 nhx.setReplaceUnderscores( false );
935 nhx.setIgnoreQuotes( true );
936 nhx.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
938 else if ( p instanceof NexusPhylogeniesParser ) {
939 final NexusPhylogeniesParser nex = ( NexusPhylogeniesParser ) p;
940 nex.setReplaceUnderscores( false );
941 nex.setIgnoreQuotes( true );
942 nex.setTaxonomyExtraction( TAXONOMY_EXTRACTION.AGGRESSIVE );
944 return factory.create( gene_trees_file, p );
947 private final static void removeSingleDescendentsNodes( final Phylogeny species_tree, final boolean verbose ) {
948 final int o = PhylogenyMethods.countNumberOfOneDescendantNodes( species_tree );
951 System.out.println( "warning: species tree has " + o
952 + " internal nodes with only one descendent which are therefore going to be removed" );
954 PhylogenyMethods.deleteInternalNodesWithOnlyOneDescendent( species_tree );
958 private final static void updateCounts( final IntMatrix m, final int counter, final Phylogeny g )
959 throws RIOException {
960 PhylogenyMethods.preOrderReId( g );
961 final HashMap<String, PhylogenyNode> map = PhylogenyMethods.createNameToExtNodeMap( g );
962 for( int x = 0; x < m.size(); ++x ) {
963 final String mx = m.getLabel( x );
964 final PhylogenyNode nx = map.get( mx );
966 throw new RIOException( "node \"" + mx + "\" not present in gene tree #" + counter );
970 for( int y = 0; y < m.size(); ++y ) {
971 my = m.getLabel( y );
974 throw new RIOException( "node \"" + my + "\" not present in gene tree #" + counter );
976 if ( !PhylogenyMethods.calculateLCAonTreeWithIdsInPreOrder( nx, ny ).isDuplication() ) {
977 m.inreaseByOne( x, y );
983 public enum REROOTING {