2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // Copyright (C) 2000-2001 Washington University School of Medicine
8 // and Howard Hughes Medical Institute
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
25 // Contact: phylosoft @ gmail . com
26 // WWW: www.phylosoft.org/forester
28 package org.forester.rio;
31 import java.io.FileNotFoundException;
32 import java.io.IOException;
33 import java.util.ArrayList;
34 import java.util.Collections;
35 import java.util.HashMap;
36 import java.util.HashSet;
37 import java.util.List;
40 import org.forester.datastructures.IntMatrix;
41 import org.forester.io.parsers.PhylogenyParser;
42 import org.forester.io.parsers.nhx.NHXParser;
43 import org.forester.io.parsers.util.ParserUtils;
44 import org.forester.phylogeny.Phylogeny;
45 import org.forester.phylogeny.PhylogenyMethods;
46 import org.forester.phylogeny.PhylogenyNode;
47 import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
48 import org.forester.phylogeny.factories.PhylogenyFactory;
49 import org.forester.sdi.GSDI;
50 import org.forester.sdi.GSDIR;
51 import org.forester.sdi.SDIException;
52 import org.forester.sdi.SDIR;
53 import org.forester.sdi.SDIutil.ALGORITHM;
54 import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
55 import org.forester.util.BasicDescriptiveStatistics;
56 import org.forester.util.ForesterUtil;
58 public final class RIO {
60 public enum REROOTING {
61 NONE, BY_ALGORITHM, MIDPOINT, OUTGROUP;
63 private Phylogeny[] _analyzed_gene_trees;
64 private List<PhylogenyNode> _removed_gene_tree_nodes;
65 private int _ext_nodes;
66 private TaxonomyComparisonBase _gsdir_tax_comp_base;
67 private StringBuilder _log;
68 private BasicDescriptiveStatistics _duplications_stats;
69 private boolean _produce_log;
70 private boolean _verbose;
71 private REROOTING _rerooting;
73 public RIO( final File gene_trees_file,
74 final Phylogeny species_tree,
75 final ALGORITHM algorithm,
76 final REROOTING rerooting,
77 final String outgroup,
78 final boolean produce_log,
79 final boolean verbose ) throws IOException, SDIException, RIOException {
80 checkReRooting( rerooting, outgroup );
81 init( produce_log, verbose, rerooting );
82 inferOrthologs( gene_trees_file, species_tree, algorithm, outgroup );
85 public RIO( final Phylogeny[] gene_trees,
86 final Phylogeny species_tree,
87 final ALGORITHM algorithm,
88 final REROOTING rerooting,
89 final String outgroup,
90 final boolean produce_log,
91 final boolean verbose ) throws IOException, SDIException, RIOException {
92 checkReRooting( rerooting, outgroup );
93 init( produce_log, verbose, rerooting );
94 inferOrthologs( gene_trees, species_tree, algorithm, outgroup );
97 public final Phylogeny[] getAnalyzedGeneTrees() {
98 return _analyzed_gene_trees;
102 * Returns the numbers of number of ext nodes in gene trees analyzed (after
105 * @return number of ext nodes in gene trees analyzed (after stripping)
107 public final int getExtNodesOfAnalyzedGeneTrees() {
111 public final TaxonomyComparisonBase getGSDIRtaxCompBase() {
112 return _gsdir_tax_comp_base;
115 public final StringBuilder getLog() {
119 public final List<PhylogenyNode> getRemovedGeneTreeNodes() {
120 return _removed_gene_tree_nodes;
123 private final void inferOrthologs( final File gene_trees_file,
124 final Phylogeny species_tree,
125 final ALGORITHM algorithm,
126 final String outgroup ) throws SDIException, RIOException,
127 FileNotFoundException, IOException {
128 final PhylogenyFactory factory = ParserBasedPhylogenyFactory.getInstance();
129 final PhylogenyParser p = ParserUtils.createParserDependingOnFileType( gene_trees_file, true );
130 if ( p instanceof NHXParser ) {
131 final NHXParser nhx = ( NHXParser ) p;
132 nhx.setReplaceUnderscores( false );
133 nhx.setIgnoreQuotes( true );
134 nhx.setTaxonomyExtraction( NHXParser.TAXONOMY_EXTRACTION.YES );
136 final Phylogeny[] gene_trees = factory.create( gene_trees_file, p );
137 inferOrthologs( gene_trees, species_tree, algorithm, outgroup );
140 private final void inferOrthologs( final Phylogeny[] gene_trees,
141 final Phylogeny species_tree,
142 final ALGORITHM algorithm,
143 final String outgroup ) throws SDIException, RIOException,
144 FileNotFoundException, IOException {
145 if ( algorithm == ALGORITHM.SDIR ) {
146 // Removes from species_tree all species not found in gene_tree.
147 PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( gene_trees[ 0 ], species_tree );
148 if ( species_tree.isEmpty() ) {
149 throw new RIOException( "failed to establish species based mapping between gene and species trees" );
152 if ( _produce_log ) {
153 _log = new StringBuilder();
154 if ( _rerooting == REROOTING.BY_ALGORITHM ) {
158 _analyzed_gene_trees = new Phylogeny[ gene_trees.length ];
159 int gene_tree_ext_nodes = 0;
161 System.out.println();
163 for( int i = 0; i < gene_trees.length; ++i ) {
164 final Phylogeny gt = gene_trees[ i ];
166 ForesterUtil.updateProgress( ( double ) i / gene_trees.length );
169 gene_tree_ext_nodes = gt.getNumberOfExternalNodes();
171 else if ( gene_tree_ext_nodes != gt.getNumberOfExternalNodes() ) {
172 throw new RIOException( "gene tree #" + ( i + 1 ) + " has a different number of external nodes ("
173 + gt.getNumberOfExternalNodes() + ") than the preceding gene trees (" + gene_tree_ext_nodes
176 if ( algorithm == ALGORITHM.SDIR ) {
177 // Removes from gene_tree all species not found in species_tree.
178 PhylogenyMethods.taxonomyBasedDeletionOfExternalNodes( species_tree, gt );
179 if ( gt.isEmpty() ) {
180 throw new RIOException( "failed to establish species based mapping between gene and species trees" );
183 _analyzed_gene_trees[ i ] = performOrthologInference( gt, species_tree, algorithm, outgroup, i );
186 System.out.println();
187 System.out.println();
191 private final void init( final boolean produce_log, final boolean verbose, final REROOTING rerooting ) {
192 _produce_log = produce_log;
194 _rerooting = rerooting;
197 _gsdir_tax_comp_base = null;
198 _analyzed_gene_trees = null;
199 _removed_gene_tree_nodes = null;
200 _duplications_stats = new BasicDescriptiveStatistics();
203 private final Phylogeny performOrthologInference( final Phylogeny gene_tree,
204 final Phylogeny species_tree,
205 final ALGORITHM algorithm,
206 final String outgroup,
207 final int i ) throws SDIException, RIOException {
208 final Phylogeny assigned_tree;
209 switch ( algorithm ) {
211 assigned_tree = performOrthologInferenceBySDI( gene_tree, species_tree );
215 assigned_tree = performOrthologInferenceByGSDI( gene_tree, species_tree, outgroup, i );
219 throw new IllegalArgumentException( "illegal algorithm: " + algorithm );
223 _ext_nodes = assigned_tree.getNumberOfExternalNodes();
225 else if ( _ext_nodes != assigned_tree.getNumberOfExternalNodes() ) {
226 throw new RIOException( "after stripping gene tree #" + ( i + 1 )
227 + " has a different number of external nodes (" + assigned_tree.getNumberOfExternalNodes()
228 + ") than the preceding gene trees (" + _ext_nodes + ")" );
230 return assigned_tree;
233 private final Phylogeny performOrthologInferenceByGSDI( final Phylogeny gene_tree,
234 final Phylogeny species_tree,
235 final String outgroup,
236 final int i ) throws SDIException, RIOException {
237 final Phylogeny assigned_tree;
238 if ( _rerooting == REROOTING.BY_ALGORITHM ) {
239 final GSDIR gsdir = new GSDIR( gene_tree, species_tree, true, i == 0 );
240 final List<Phylogeny> assigned_trees = gsdir.getMinDuplicationsSumGeneTrees();
242 _removed_gene_tree_nodes = gsdir.getStrippedExternalGeneTreeNodes();
243 for( final PhylogenyNode r : _removed_gene_tree_nodes ) {
244 if ( !r.getNodeData().isHasTaxonomy() ) {
245 throw new RIOException( "node with no (appropriate) taxonomic information found in gene tree #1: "
250 final List<Integer> shortests = GSDIR.getIndexesOfShortestTree( assigned_trees );
251 assigned_tree = assigned_trees.get( shortests.get( 0 ) );
252 if ( _produce_log ) {
253 writeStatsToLog( i, gsdir, shortests );
256 _gsdir_tax_comp_base = gsdir.getTaxCompBase();
258 _duplications_stats.addValue( gsdir.getMinDuplicationsSum() );
261 if ( _rerooting == REROOTING.MIDPOINT ) {
262 PhylogenyMethods.midpointRoot( gene_tree );
264 else if ( _rerooting == REROOTING.OUTGROUP ) {
267 n = gene_tree.getNode( outgroup );
269 catch ( IllegalArgumentException e ) {
270 throw new RIOException( "failed to perform re-rooting by outgroup: " + e.getLocalizedMessage() );
272 gene_tree.reRoot( n );
274 final GSDI gsdi = new GSDI( gene_tree, species_tree, true, true, true );
275 _removed_gene_tree_nodes = gsdi.getStrippedExternalGeneTreeNodes();
276 for( final PhylogenyNode r : _removed_gene_tree_nodes ) {
277 if ( !r.getNodeData().isHasTaxonomy() ) {
278 throw new RIOException( "node with no (appropriate) taxonomic information found in gene tree #1: "
282 assigned_tree = gene_tree;
284 _gsdir_tax_comp_base = gsdi.getTaxCompBase();
286 _duplications_stats.addValue( gsdi.getDuplicationsSum() );
288 return assigned_tree;
291 private final Phylogeny performOrthologInferenceBySDI( final Phylogeny gene_tree, final Phylogeny species_tree )
292 throws SDIException {
293 final SDIR sdir = new SDIR();
294 return sdir.infer( gene_tree, species_tree, false, true, true, true, 1 )[ 0 ];
297 private void writeLogSubHeader() {
300 _log.append( "with minimal number of duplications" );
302 _log.append( "root placements" );
303 _log.append( "\t[" );
304 _log.append( "min" );
306 _log.append( "max" );
307 _log.append( "]\t<" );
308 _log.append( "shortest" );
310 _log.append( ForesterUtil.LINE_SEPARATOR );
313 private final void writeStatsToLog( final int i, final GSDIR gsdir, final List<Integer> shortests ) {
314 final BasicDescriptiveStatistics stats = gsdir.getDuplicationsSumStats();
317 _log.append( gsdir.getMinDuplicationsSumGeneTrees().size() );
319 _log.append( stats.getN() );
320 _log.append( "\t[" );
321 _log.append( ( int ) stats.getMin() );
323 _log.append( ( int ) stats.getMax() );
324 _log.append( "]\t<" );
325 _log.append( shortests.size() );
327 _log.append( ForesterUtil.LINE_SEPARATOR );
330 public final static IntMatrix calculateOrthologTable( final Phylogeny[] analyzed_gene_trees, final boolean sort )
331 throws RIOException {
332 final List<String> labels = new ArrayList<String>();
333 final Set<String> labels_set = new HashSet<String>();
335 for( final PhylogenyNode n : analyzed_gene_trees[ 0 ].getExternalNodes() ) {
336 if ( n.getNodeData().isHasSequence() && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getName() ) ) {
337 label = n.getNodeData().getSequence().getName();
339 else if ( n.getNodeData().isHasSequence()
340 && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getSymbol() ) ) {
341 label = n.getNodeData().getSequence().getSymbol();
343 else if ( !ForesterUtil.isEmpty( n.getName() ) ) {
347 throw new RIOException( "node " + n + " has no appropriate label" );
349 if ( labels_set.contains( label ) ) {
350 throw new RIOException( "label " + label + " is not unique" );
352 labels_set.add( label );
356 Collections.sort( labels );
358 final IntMatrix m = new IntMatrix( labels );
360 for( final Phylogeny gt : analyzed_gene_trees ) {
362 PhylogenyMethods.preOrderReId( gt );
363 final HashMap<String, PhylogenyNode> map = PhylogenyMethods.createNameToExtNodeMap( gt );
364 for( int x = 0; x < m.size(); ++x ) {
365 final String mx = m.getLabel( x );
366 final PhylogenyNode nx = map.get( mx );
368 throw new RIOException( "node \"" + mx + "\" not present in gene tree #" + counter );
372 for( int y = 0; y < m.size(); ++y ) {
373 my = m.getLabel( y );
376 throw new RIOException( "node \"" + my + "\" not present in gene tree #" + counter );
378 if ( !PhylogenyMethods.calculateLCAonTreeWithIdsInPreOrder( nx, ny ).isDuplication() ) {
379 m.inreaseByOne( x, y );
387 private final static void checkReRooting( final REROOTING rerooting, final String outgroup ) {
388 if ( !ForesterUtil.isEmpty( outgroup ) && rerooting != REROOTING.OUTGROUP ) {
389 throw new IllegalArgumentException( "can only use outgroup when re-rooting in this manner" );
393 public BasicDescriptiveStatistics getDuplicationsStatistics() {
394 return _duplications_stats;