2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // Copyright (C) 2000-2001 Washington University School of Medicine
8 // and Howard Hughes Medical Institute
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
25 // Contact: phylosoft @ gmail . com
26 // WWW: www.phylosoft.org/forester
28 package org.forester.sdi;
30 import java.util.ArrayList;
31 import java.util.HashSet;
32 import java.util.List;
35 import org.forester.phylogeny.Phylogeny;
36 import org.forester.phylogeny.PhylogenyBranch;
37 import org.forester.phylogeny.PhylogenyNode;
38 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
41 * Allows to infer duplications - speciations on a unrooted gene tree. It
42 * reroots the gene trees on each of its branches and performs SDIse on each of
43 * the resulting trees. Trees which minimize a certain criterion are returned as
44 * the "correctly" rooted ones. The criterions are: <ul> <li>Sum of duplications
45 * <li>Mapping cost L <li>Phylogeny height - which is the largest distance from
46 * root to external node (minimizing of which is the same as "midpoint rooting")
53 * @author Christian M. Zmasek
57 private final static double ZERO_DIFF = 1.0E-6; // Due to inaccurate
60 // everything that should
64 private int _min_cost;
65 private double _min_height;
66 private double _min_diff;
67 private long _time_sdi;
70 * Default contructor which creates an "empty" object..
77 * Returns the number of differently rooted trees which minimize the
78 * (rooting) "criterion" - as determined by method "infer".
80 * @see #infer(Phylogeny,Phylogeny,boolean,boolean,boolean,boolean,int,boolean)
81 * @return number of differently rooted trees which minimized the criterion
83 public int getCount() {
88 * Returns the (absolue value of the) minimal difference in tree heights of
89 * the two subtrees at the root (of the (re)rooted gene tree) - as
90 * determined by method "infer" - if minimize_height is set to true.
92 * If a tree is midpoint rooted this number is zero.
94 * <B>IMPORTANT </B>: If minimize_mapping_cost or minimize_sum_of_dup are
95 * also set to true, then this returns the minimal difference in tree
96 * heights of the trees which minimize the first criterion, and is therefore
97 * not necessarily zero.
99 * (Last modified: 01/22/00)
101 * @see #infer(Phylogeny,Phylogeny,boolean,boolean,boolean,boolean,int,boolean)
102 * @return the minimal difference in tree heights -- IF calculated by
105 public double getMinimalDiffInSubTreeHeights() {
110 * Returns the minimal number of duplications - as determined by method
113 * <B>IMPORTANT </B>: If the tree is not rooted by minimizing the sum of
114 * duplications or the mapping cost L, then this number is NOT NECESSARILY
115 * the MINIMAL number of duplications.
117 * @see #infer(Phylogeny,Phylogeny,boolean,boolean,boolean,boolean,int,boolean)
118 * @return (minimal) number of duplications
120 public int getMinimalDuplications() {
125 * Returns the minimal mapping cost L - as determined by method "infer" - if
126 * minimize_mapping_cost is set to true.
128 * (Last modified: 11/07/00)
130 * @see #infer(Phylogeny,Phylogeny,boolean,boolean,boolean,boolean,int,boolean)
131 * @return the minimal mapping cost "L" -- IF calculated by "infer"
133 public int getMinimalMappingCost() {
138 * Returns the minimal tree height - as determined by method "infer" - if
139 * minimize_height is set to true. <B>IMPORTANT </B>: If
140 * minimize_mapping_cost or minimize_sum_of_dup are also set to true, then
141 * this returns the minimal tree height of the trees which minimize the
144 * (Last modified: 01/12/00)
146 * @see #infer(Phylogeny,Phylogeny,boolean,boolean,boolean,boolean,int,boolean)
147 * @return the minimal tree height -- IF calculated by "infer"
149 public double getMinimalTreeHeight() {
154 * Returns the sum of times (in ms) needed to run method infer of class SDI.
155 * Final variable TIME needs to be set to true.
157 * @return sum of times (in ms) needed to run method infer of class SDI
159 public long getTimeSumSDI() {
164 * Infers gene duplications on a possibly unrooted gene Phylogeny gene_tree.
165 * The tree is rooted be minimizing either the sum of duplications, the
166 * mapping cost L, or the tree height (or combinations thereof). If
167 * return_trees is set to true, it returns an array of possibly more than
168 * one differently rooted Trees. <br>
169 * The maximal number of returned trees is set with max_trees_to_return.
171 * Phylogeny species_tree is a species Phylogeny to which the gene Phylogeny
172 * gene_tree is compared to. <br>
173 * If both minimize_sum_of_dup and minimize_mapping_cost are true, the tree
174 * is rooted by minimizing the mapping cost L.<br>
175 * If minimize_sum_of_dup, minimize_mapping_cost, and minimize_height are
176 * false tree gene_tree is assumed to be alreadty rooted and no attempts at
177 * rooting are made, and only one tree is returned. <br>
182 * <li>Both Trees must be completely binary (except deepest node of gene
184 * <li>The species Phylogeny must be rooted
185 * <li>Both Trees must have species names in the species name fields of
187 * <li>Both Trees must not have any collapses nodes
190 * (Last modified: 10/01/01)
193 * a binary (except deepest node) gene Phylogeny
194 * @param species_tree
195 * a rooted binary species Phylogeny
196 * @param minimize_mapping_cost
197 * set to true to root by minimizing the mapping cost L (and also
198 * the sum of duplications)
199 * @param minimize_sum_of_dup
200 * set to true to root by minimizing the sum of duplications
201 * @param minimize_height
202 * set to true to root by minimizing the tree height - if
203 * minimize_mapping_cost is set to true or minimize_sum_of_dup is
204 * set to true, then out of the resulting trees with minimal
205 * mapping cost or minimal number of duplications the tree with
206 * the minimal height is chosen
207 * @param return_trees
208 * set to true to return Array of Trees, otherwise null is
210 * @param max_trees_to_return
211 * maximal number of Trees to return (=maximal size of returned
212 * Array) must be no lower than 1
213 * @return array of rooted Trees with duplication vs. speciation assigned if
214 * return_trees is set to true, null otherwise
215 * @throws SDIException
217 public Phylogeny[] infer( final Phylogeny gene_tree,
218 final Phylogeny species_tree,
219 final boolean minimize_mapping_cost,
220 boolean minimize_sum_of_dup,
221 final boolean minimize_height,
222 final boolean return_trees,
223 int max_trees_to_return ) throws SDIException {
226 final ArrayList<Phylogeny> trees = new ArrayList<Phylogeny>();
227 Phylogeny[] tree_array = null;
228 List<PhylogenyBranch> branches = null;
230 PhylogenyNode prev_root = null;
231 PhylogenyNode prev_root_c1 = null;
232 PhylogenyNode prev_root_c2 = null;
233 int duplications = 0;
236 int min_duplications = Integer.MAX_VALUE;
237 int min_cost = Integer.MAX_VALUE;
241 double min_height = Double.MAX_VALUE;
242 double min_diff = 0.0;
243 double[] height__diff = new double[ 2 ];
244 boolean smaller = false;
245 boolean equal = false;
246 boolean prev_root_was_dup = false;
247 if ( max_trees_to_return < 1 ) {
248 max_trees_to_return = 1;
250 if ( minimize_mapping_cost && minimize_sum_of_dup ) {
251 minimize_sum_of_dup = false;
253 if ( !minimize_mapping_cost && !minimize_sum_of_dup && !minimize_height ) {
254 throw new IllegalArgumentException( "parameter to minimize not given for rooting of phylogeny" );
256 g = gene_tree.copy();
257 if ( g.getNumberOfExternalNodes() <= 1 ) {
259 setMinimalDuplications( 0 );
260 setMinimalTreeHeight( 0.0 );
261 tree_array = new Phylogeny[ 1 ];
265 for( final PhylogenyNodeIterator iter = g.iteratorPostorder(); iter.hasNext(); ) {
266 final PhylogenyNode n = iter.next();
268 if ( ( n.getNumberOfDescendants() != 2 ) && ( n.getNumberOfDescendants() != 3 ) ) {
269 throw new IllegalArgumentException( "attempt to run SDI on gene tree with "
270 + n.getNumberOfDescendants() + " child nodes at its root" );
273 else if ( !n.isExternal() && ( n.getNumberOfDescendants() != 2 ) ) {
274 throw new IllegalArgumentException( "attempt to run SDI on gene tree which is not completely binary [found node with "
275 + n.getNumberOfDescendants() + " child nodes]" );
278 for( final PhylogenyNodeIterator iter = species_tree.iteratorPostorder(); iter.hasNext(); ) {
279 final PhylogenyNode n = iter.next();
280 if ( !n.isExternal() && ( n.getNumberOfDescendants() != 2 ) ) {
281 throw new IllegalArgumentException( "attempt to run SDI with a species tree which is not completely binary (after stripping) [found node with "
282 + n.getNumberOfDescendants() + " child nodes]" );
285 g.reRoot( g.getFirstExternalNode() );
286 branches = SDIR.getBranchesInPreorder( g );
287 if ( minimize_mapping_cost || minimize_sum_of_dup ) {
288 sdise = new SDIse( g, species_tree );
289 duplications = sdise.getDuplicationsSum();
291 final Set<PhylogenyBranch> used_root_placements = new HashSet<PhylogenyBranch>();
292 F: for( j = 0; j < branches.size(); ++j ) {
293 prev_root = g.getRoot();
294 prev_root_c1 = prev_root.getChildNode1();
295 prev_root_c2 = prev_root.getChildNode2();
296 prev_root_was_dup = prev_root.isDuplication();
297 final PhylogenyBranch current_branch = branches.get( j );
298 g.reRoot( current_branch );
299 if ( minimize_mapping_cost || minimize_sum_of_dup ) {
300 duplications = sdise.updateM( prev_root_was_dup, prev_root_c1, prev_root_c2 );
302 if ( !used_root_placements.contains( current_branch ) ) {
303 if ( minimize_mapping_cost ) {
304 cost = sdise.computeMappingCostL();
305 if ( minimize_height && ( cost <= min_cost ) ) {
306 height__diff = SDIR.moveRootOnBranchToMinHeight( g );
307 height = height__diff[ 0 ];
308 diff = height__diff[ 1 ];
310 if ( cost == min_cost ) {
311 if ( minimize_height ) {
312 smaller = equal = false;
313 if ( height < min_height ) {
318 else if ( height == min_height ) {
322 if ( Math.abs( diff ) < min_diff ) {
323 min_diff = Math.abs( diff );
326 if ( return_trees ) {
327 if ( minimize_height ) {
330 trees.add( g.copy() );
332 else if ( equal && ( trees.size() < max_trees_to_return ) ) {
333 trees.add( g.copy() );
338 if ( trees.size() < max_trees_to_return ) {
339 trees.add( g.copy() );
343 else if ( !minimize_height ) {
347 else if ( cost < min_cost ) {
348 if ( minimize_height ) {
350 min_diff = Math.abs( diff );
352 if ( return_trees ) {
354 trees.add( g.copy() );
359 if ( duplications < min_duplications ) {
360 min_duplications = duplications;
363 else if ( minimize_sum_of_dup ) {
364 if ( minimize_height && ( duplications <= min_duplications ) ) {
365 height__diff = SDIR.moveRootOnBranchToMinHeight( g );
366 height = height__diff[ 0 ];
367 diff = height__diff[ 1 ];
369 if ( duplications == min_duplications ) {
370 if ( minimize_height ) {
371 smaller = equal = false;
372 if ( height < min_height ) {
377 else if ( height == min_height ) {
381 if ( Math.abs( diff ) < min_diff ) {
382 min_diff = Math.abs( diff );
385 if ( return_trees ) {
386 if ( minimize_height ) {
389 trees.add( g.copy() );
391 else if ( equal && ( trees.size() < max_trees_to_return ) ) {
392 trees.add( g.copy() );
397 if ( trees.size() < max_trees_to_return ) {
398 trees.add( g.copy() );
402 else if ( !minimize_height ) {
406 else if ( duplications < min_duplications ) {
407 if ( minimize_height ) {
409 min_diff = Math.abs( diff );
411 if ( return_trees ) {
413 trees.add( g.copy() );
416 min_duplications = duplications;
419 else if ( minimize_height ) {
420 height__diff = SDIR.moveRootOnBranchToMinHeight( g );
421 height = height__diff[ 0 ];
422 diff = height__diff[ 1 ];
423 if ( Math.abs( diff ) < SDIR.ZERO_DIFF ) {
424 sdise = new SDIse( g, species_tree );
425 min_duplications = sdise.getDuplicationsSum();
427 min_diff = Math.abs( diff );
429 if ( return_trees ) {
430 trees.add( g.copy() );
435 } // if ( used_root_placements.containsKey( current_branch ) )
436 used_root_placements.add( current_branch );
437 } // End of huge for loop "F".
438 if ( return_trees ) {
440 tree_array = new Phylogeny[ trees.size() ];
441 for( int i = 0; i < trees.size(); ++i ) {
442 tree_array[ i ] = trees.get( i );
443 tree_array[ i ].recalculateNumberOfExternalDescendants( false );
447 setMinimalDuplications( min_duplications );
448 setMinimalMappingCost( min_cost );
449 setMinimalTreeHeight( min_height );
450 setMinimalDiffInSubTreeHeights( Math.abs( min_diff ) );
454 private void init() {
456 _min_dup = Integer.MAX_VALUE;
457 _min_cost = Integer.MAX_VALUE;
458 _min_height = Double.MAX_VALUE;
459 _min_diff = Double.MAX_VALUE;
463 private void setCount( final int i ) {
467 private void setMinimalDiffInSubTreeHeights( final double d ) {
471 private void setMinimalDuplications( final int i ) {
475 private void setMinimalMappingCost( final int i ) {
479 private void setMinimalTreeHeight( final double d ) {
483 // This was totally changed on 2006/10/03.
484 // Places references to all Branches of Phylogeny t into a List.
485 // The order is preorder.
486 // Trees are treated as if they were unrooted (i.e. child 1 and
487 // child 2 of the root are treated as if they were connected
489 // The resulting List allows to visit all branches without ever
490 // traversing more than one node at a time.
491 public static List<PhylogenyBranch> getBranchesInPreorder( final Phylogeny t ) {
492 final ArrayList<PhylogenyBranch> branches = new ArrayList<PhylogenyBranch>();
493 if ( t.isEmpty() || ( t.getNumberOfExternalNodes() <= 1 ) ) {
496 if ( t.getNumberOfExternalNodes() == 2 ) {
497 branches.add( new PhylogenyBranch( t.getRoot().getChildNode1(), t.getRoot().getChildNode2() ) );
500 final Set<Integer> one = new HashSet<Integer>();
501 final Set<Integer> two = new HashSet<Integer>();
502 PhylogenyNode node = t.getRoot();
503 while ( !node.isRoot() || !two.contains( node.getId() ) ) {
504 if ( !node.isExternal() && !two.contains( node.getId() ) ) {
505 if ( !one.contains( node.getId() ) && !two.contains( node.getId() ) ) {
506 one.add( node.getId() );
507 node = node.getChildNode1();
510 two.add( node.getId() );
511 node = node.getChildNode2();
513 if ( !node.getParent().isRoot() ) {
514 branches.add( new PhylogenyBranch( node, node.getParent() ) );
516 else if ( !node.isExternal() ) {
517 branches.add( new PhylogenyBranch( t.getRoot().getChildNode1(), t.getRoot().getChildNode2() ) );
521 if ( !node.getParent().isRoot() && !node.isExternal() ) {
522 branches.add( new PhylogenyBranch( node, node.getParent() ) );
524 node = node.getParent();
530 // This places the root of t on its branch in such a way that it
531 // minimizes the tree height as good as possible.
532 // Returns the height and the difference in heights of the resulting
533 // modified Phylogeny t.
534 private static double[] moveRootOnBranchToMinHeight( final Phylogeny t ) {
535 final PhylogenyNode root = t.getRoot();
536 if ( root.getNumberOfDescendants() != 2 ) {
537 throw new IllegalArgumentException( "attempt to move root to minimize height on root where number of child nodes does not equal two" );
539 final PhylogenyNode child0 = root.getChildNode( 0 );
540 final PhylogenyNode child1 = root.getChildNode( 1 );
541 final double newdist = 0.5 * ( ( child0.getDistanceToParent() > 0 ? child0.getDistanceToParent() : 0 ) + ( child1
542 .getDistanceToParent() > 0 ? child1.getDistanceToParent() : 0 ) );
543 child0.setDistanceToParent( newdist );
544 child1.setDistanceToParent( newdist );
545 final double d = child0.getDistanceToParent();
548 final double[] height_diff = new double[ 2 ];
549 final double l0 = t.calculateSubtreeHeight( t.getRoot().getChildNode( 0 ) );
550 final double l1 = t.calculateSubtreeHeight( t.getRoot().getChildNode( 1 ) );
552 height = t.getHeight();
554 if ( ( 2 * d ) > Math.abs( diff ) ) {
555 child0.setDistanceToParent( d - ( diff / 2.0 ) );
556 child1.setDistanceToParent( d + ( diff / 2.0 ) );
557 height_diff[ 0 ] = height - Math.abs( diff / 2 );
558 height_diff[ 1 ] = 0.0;
562 child0.setDistanceToParent( 0.0 );
563 child1.setDistanceToParent( 2 * d );
564 height_diff[ 1 ] = diff - ( 2 * d );
567 child0.setDistanceToParent( 2 * d );
568 child1.setDistanceToParent( 0.0 );
569 height_diff[ 1 ] = diff + ( 2 * d );
571 height_diff[ 0 ] = height - d;
575 height_diff[ 0 ] = height;
576 height_diff[ 1 ] = diff;