2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: www.phylosoft.org/forester
26 package org.forester.sdi;
28 import java.util.HashMap;
29 import java.util.HashSet;
32 import org.forester.phylogeny.Phylogeny;
33 import org.forester.phylogeny.PhylogenyNode;
34 import org.forester.phylogeny.data.Event;
35 import org.forester.phylogeny.data.Taxonomy;
36 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
37 import org.forester.util.ForesterUtil;
40 * Implements our algorithm for speciation - duplication inference (SDI). <p>
41 * The initialization is accomplished by: </p> <ul> <li>method
42 * "linkExtNodesOfG()" of class SDI: setting the links for the external nodes of
43 * the gene tree <li>"preorderReID(int)" from class Phylogeny: numbering of
44 * nodes of the species tree in preorder <li>the optional stripping of the
45 * species tree is accomplished by method "stripTree(Phylogeny,Phylogeny)" of
46 * class Phylogeny </ul> <p> The recursion part is accomplished by this class'
47 * method "geneTreePostOrderTraversal(PhylogenyNode)". <p> Requires JDK 1.5 or
50 * @see SDI#linkNodesOfG()
52 * @see Phylogeny#preorderReID(int)
55 * PhylogenyMethods#taxonomyBasedDeletionOfExternalNodes(Phylogeny,Phylogeny)
57 * @see #geneTreePostOrderTraversal(PhylogenyNode)
59 * @author Christian M. Zmasek
61 public final class GSDI extends SDI {
63 private final HashMap<PhylogenyNode, Integer> _transversal_counts;
64 private final boolean _most_parsimonious_duplication_model;
65 private final boolean _strip_gene_tree;
66 private int _speciation_or_duplication_events_sum;
67 private int _speciations_sum;
70 * Constructor which sets the gene tree and the species tree to be compared.
71 * species_tree is the species tree to which the gene tree gene_tree will be
72 * compared to - with method "infer(boolean)". Both Trees must be completely
73 * binary and rooted. The actual inference is accomplished with method
74 * "infer(boolean)". The mapping cost L can then be calculated with method
75 * "computeMappingCost()".
78 * @see #infer(boolean)
79 * @see SDI#computeMappingCostL()
81 * reference to a rooted gene tree to which assign duplication vs
82 * speciation, must have species names in the species name fields
83 * for all external nodes
85 * reference to a rooted binary species tree which might get
86 * stripped in the process, must have species names in the
87 * species name fields for all external nodes
89 * @param most_parsimonious_duplication_model
90 * set to true to assign nodes as speciations which would
91 * otherwise be assiged as unknown because of polytomies in the
95 public GSDI( final Phylogeny gene_tree,
96 final Phylogeny species_tree,
97 final boolean most_parsimonious_duplication_model,
98 final boolean strip_gene_tree ) {
99 super( gene_tree, species_tree );
100 _speciation_or_duplication_events_sum = 0;
101 _speciations_sum = 0;
102 _most_parsimonious_duplication_model = most_parsimonious_duplication_model;
103 _transversal_counts = new HashMap<PhylogenyNode, Integer>();
104 _duplications_sum = 0;
105 _strip_gene_tree = strip_gene_tree;
106 getSpeciesTree().preOrderReId();
108 geneTreePostOrderTraversal( getGeneTree().getRoot() );
111 public GSDI( final Phylogeny gene_tree,
112 final Phylogeny species_tree,
113 final boolean most_parsimonious_duplication_model ) {
114 this( gene_tree, species_tree, most_parsimonious_duplication_model, false );
117 private final Event createDuplicationEvent() {
118 final Event event = Event.createSingleDuplicationEvent();
123 private final Event createSingleSpeciationOrDuplicationEvent() {
124 final Event event = Event.createSingleSpeciationOrDuplicationEvent();
125 ++_speciation_or_duplication_events_sum;
129 private final Event createSpeciationEvent() {
130 final Event event = Event.createSingleSpeciationEvent();
135 // s is the node on the species tree g maps to.
136 private final void determineEvent( final PhylogenyNode s, final PhylogenyNode g ) {
138 // Determine how many children map to same node as parent.
139 int sum_g_childs_mapping_to_s = 0;
140 for( final PhylogenyNodeIterator iter = g.iterateChildNodesForward(); iter.hasNext(); ) {
141 if ( iter.next().getLink() == s ) {
142 ++sum_g_childs_mapping_to_s;
145 // Determine the sum of traversals.
146 int traversals_sum = 0;
147 int max_traversals = 0;
148 PhylogenyNode max_traversals_node = null;
149 if ( !s.isExternal() ) {
150 for( final PhylogenyNodeIterator iter = s.iterateChildNodesForward(); iter.hasNext(); ) {
151 final PhylogenyNode current_node = iter.next();
152 final int traversals = getTraversalCount( current_node );
153 traversals_sum += traversals;
154 if ( traversals > max_traversals ) {
155 max_traversals = traversals;
156 max_traversals_node = current_node;
160 // System.out.println( " sum=" + traversals_sum );
161 // System.out.println( " max=" + max_traversals );
162 // System.out.println( " m=" + sum_g_childs_mapping_to_s );
163 if ( sum_g_childs_mapping_to_s > 0 ) {
164 if ( traversals_sum == 2 ) {
165 event = createDuplicationEvent();
167 else if ( traversals_sum > 2 ) {
168 if ( max_traversals <= 1 ) {
169 if ( _most_parsimonious_duplication_model ) {
170 event = createSpeciationEvent();
173 event = createSingleSpeciationOrDuplicationEvent();
177 event = createDuplicationEvent();
178 _transversal_counts.put( max_traversals_node, 1 );
182 event = createDuplicationEvent();
186 event = createSpeciationEvent();
188 g.getNodeData().setEvent( event );
192 * Traverses the subtree of PhylogenyNode g in postorder, calculating the
193 * mapping function M, and determines which nodes represent speciation
194 * events and which ones duplication events.
196 * Preconditions: Mapping M for external nodes must have been calculated and
197 * the species tree must be labeled in preorder.
202 * starting node of a gene tree - normally the root
204 final void geneTreePostOrderTraversal( final PhylogenyNode g ) {
205 if ( !g.isExternal() ) {
206 for( final PhylogenyNodeIterator iter = g.iterateChildNodesForward(); iter.hasNext(); ) {
207 geneTreePostOrderTraversal( iter.next() );
209 final PhylogenyNode[] linked_nodes = new PhylogenyNode[ g.getNumberOfDescendants() ];
210 for( int i = 0; i < linked_nodes.length; ++i ) {
211 linked_nodes[ i ] = g.getChildNode( i ).getLink();
213 final int[] min_max = obtainMinMaxIdIndices( linked_nodes );
214 int min_i = min_max[ 0 ];
215 int max_i = min_max[ 1 ];
216 // initTransversalCounts();
217 while ( linked_nodes[ min_i ] != linked_nodes[ max_i ] ) {
218 increaseTraversalCount( linked_nodes[ max_i ] );
219 linked_nodes[ max_i ] = linked_nodes[ max_i ].getParent();
220 final int[] min_max_ = obtainMinMaxIdIndices( linked_nodes );
221 min_i = min_max_[ 0 ];
222 max_i = min_max_[ 1 ];
224 final PhylogenyNode s = linked_nodes[ max_i ];
226 // Determines whether dup. or spec.
227 determineEvent( s, g );
228 // _transversal_counts.clear();
232 public final int getSpeciationOrDuplicationEventsSum() {
233 return _speciation_or_duplication_events_sum;
236 public final int getSpeciationsSum() {
237 return _speciations_sum;
240 private final int getTraversalCount( final PhylogenyNode node ) {
241 if ( _transversal_counts.containsKey( node ) ) {
242 return _transversal_counts.get( node );
247 private final void increaseTraversalCount( final PhylogenyNode node ) {
248 if ( _transversal_counts.containsKey( node ) ) {
249 _transversal_counts.put( node, _transversal_counts.get( node ) + 1 );
252 _transversal_counts.put( node, 1 );
254 // System.out.println( "count for node " + node.getID() + " is now "
255 // + getTraversalCount( node ) );
259 * This allows for linking of internal nodes of the species tree (as opposed
260 * to just external nodes, as in the method it overrides.
264 final void linkNodesOfG() {
265 final HashMap<Taxonomy, PhylogenyNode> speciestree_ext_nodes = new HashMap<Taxonomy, PhylogenyNode>();
266 for( final PhylogenyNodeIterator iter = _species_tree.iteratorLevelOrder(); iter.hasNext(); ) {
267 final PhylogenyNode n = iter.next();
268 if ( n.getNodeData().isHasTaxonomy() ) {
269 if ( speciestree_ext_nodes.containsKey( n.getNodeData().getTaxonomy() ) ) {
270 throw new IllegalArgumentException( "taxonomy [" + n.getNodeData().getTaxonomy()
271 + "] is not unique in species phylogeny" );
273 speciestree_ext_nodes.put( n.getNodeData().getTaxonomy(), n );
276 if ( _strip_gene_tree ) {
277 final Set<PhylogenyNode> to_delete = new HashSet<PhylogenyNode>();
278 for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
279 final PhylogenyNode g = iter.next();
280 if ( !g.getNodeData().isHasTaxonomy() ) {
281 throw new IllegalArgumentException( "gene tree node " + g + " has no taxonomic data" );
283 final PhylogenyNode s = speciestree_ext_nodes.get( g.getNodeData().getTaxonomy() );
285 // throw new IllegalArgumentException( "species " + g.getNodeData().getTaxonomy()
286 // + " not present in species tree" );
290 for( final PhylogenyNode n : to_delete ) {
291 _gene_tree.deleteSubtree( n, true );
292 System.out.println( "deleted" + n );
295 // Retrieve the reference to the PhylogenyNode with a matching species.
296 for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
297 final PhylogenyNode g = iter.next();
298 if ( !g.getNodeData().isHasTaxonomy() ) {
299 throw new IllegalArgumentException( "gene tree node " + g + " has no taxonomic data" );
301 final PhylogenyNode s = speciestree_ext_nodes.get( g.getNodeData().getTaxonomy() );
303 throw new IllegalArgumentException( "species " + g.getNodeData().getTaxonomy()
304 + " not present in species tree" );
311 public final String toString() {
312 final StringBuffer sb = new StringBuffer();
313 sb.append( "Most parsimonious duplication model: " + _most_parsimonious_duplication_model );
314 sb.append( ForesterUtil.getLineSeparator() );
315 sb.append( "Speciations sum : " + getSpeciationsSum() );
316 sb.append( ForesterUtil.getLineSeparator() );
317 sb.append( "Duplications sum : " + getDuplicationsSum() );
318 sb.append( ForesterUtil.getLineSeparator() );
319 if ( !_most_parsimonious_duplication_model ) {
320 sb.append( "Speciation or duplications sum : " + getSpeciationOrDuplicationEventsSum() );
321 sb.append( ForesterUtil.getLineSeparator() );
323 sb.append( "mapping cost L : " + computeMappingCostL() );
324 return sb.toString();
327 static final int[] obtainMinMaxIdIndices( final PhylogenyNode[] linked_nodes ) {
330 int max_i_id = -Integer.MAX_VALUE;
331 int min_i_id = Integer.MAX_VALUE;
332 for( int i = 0; i < linked_nodes.length; ++i ) {
333 final int id_i = linked_nodes[ i ].getId();
334 if ( id_i > max_i_id ) {
336 max_i_id = linked_nodes[ max_i ].getId();
338 if ( id_i < min_i_id ) {
340 min_i_id = linked_nodes[ min_i ].getId();
343 return new int[] { min_i, max_i };
346 * Updates the mapping function M after the root of the gene tree has been
347 * moved by one branch. It calculates M for the root of the gene tree and
348 * one of its two children.
350 * To be used ONLY by method "SDIunrooted.fastInfer(Phylogeny,Phylogeny)".
354 * @param prev_root_was_dup
355 * true if the previous root was a duplication, false otherwise
356 * @param prev_root_c1
357 * child 1 of the previous root
358 * @param prev_root_c2
359 * child 2 of the previous root
360 * @return number of duplications which have been assigned in gene tree
362 // int updateM( final boolean prev_root_was_dup,
363 // final PhylogenyNode prev_root_c1, final PhylogenyNode prev_root_c2 ) {
364 // final PhylogenyNode root = getGeneTree().getRoot();
365 // if ( ( root.getChildNode1() == prev_root_c1 )
366 // || ( root.getChildNode2() == prev_root_c1 ) ) {
367 // calculateMforNode( prev_root_c1 );
370 // calculateMforNode( prev_root_c2 );
372 // Event event = null;
373 // if ( prev_root_was_dup ) {
374 // event = Event.createSingleDuplicationEvent();
377 // event = Event.createSingleSpeciationEvent();
379 // root.getPhylogenyNodeData().setEvent( event );
380 // calculateMforNode( root );
381 // return getDuplications();
382 // } // updateM( boolean, PhylogenyNode, PhylogenyNode )
383 // Helper method for updateM( boolean, PhylogenyNode, PhylogenyNode )
384 // Calculates M for PhylogenyNode n, given that M for the two children
385 // of n has been calculated.
386 // (Last modified: 10/02/01)
387 // private void calculateMforNode( final PhylogenyNode n ) {
388 // if ( !n.isExternal() ) {
389 // boolean was_duplication = n.isDuplication();
390 // PhylogenyNode a = n.getChildNode1().getLink(), b = n
391 // .getChildNode2().getLink();
392 // while ( a != b ) {
393 // if ( a.getID() > b.getID() ) {
394 // a = a.getParent();
397 // b = b.getParent();
401 // Event event = null;
402 // if ( ( a == n.getChildNode1().getLink() )
403 // || ( a == n.getChildNode2().getLink() ) ) {
404 // event = Event.createSingleDuplicationEvent();
405 // if ( !was_duplication ) {
406 // increaseDuplications();
410 // event = Event.createSingleSpeciationEvent();
411 // if ( was_duplication ) {
412 // decreaseDuplications();
415 // n.getPhylogenyNodeData().setEvent( event );
417 // } // calculateMforNode( PhylogenyNode )