2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: www.phylosoft.org/forester
26 package org.forester.sdi;
28 import java.util.HashMap;
29 import java.util.HashSet;
33 import org.forester.phylogeny.Phylogeny;
34 import org.forester.phylogeny.PhylogenyNode;
35 import org.forester.phylogeny.data.Event;
36 import org.forester.phylogeny.data.Taxonomy;
37 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
38 import org.forester.util.ForesterUtil;
41 * Implements our algorithm for speciation - duplication inference (SDI). <p>
42 * The initialization is accomplished by: </p> <ul> <li>method
43 * "linkExtNodesOfG()" of class SDI: setting the links for the external nodes of
44 * the gene tree <li>"preorderReID(int)" from class Phylogeny: numbering of
45 * nodes of the species tree in preorder <li>the optional stripping of the
46 * species tree is accomplished by method "stripTree(Phylogeny,Phylogeny)" of
47 * class Phylogeny </ul> <p> The recursion part is accomplished by this class'
48 * method "geneTreePostOrderTraversal(PhylogenyNode)". <p> Requires JDK 1.5 or
51 * @see SDI#linkNodesOfG()
53 * @see Phylogeny#preorderReID(int)
56 * PhylogenyMethods#taxonomyBasedDeletionOfExternalNodes(Phylogeny,Phylogeny)
58 * @see #geneTreePostOrderTraversal(PhylogenyNode)
60 * @author Christian M. Zmasek
62 public final class GSDI extends SDI {
64 private final HashMap<PhylogenyNode, Integer> _transversal_counts;
65 private final boolean _most_parsimonious_duplication_model;
66 private final boolean _strip_gene_tree;
67 private int _speciation_or_duplication_events_sum;
68 private int _speciations_sum;
69 private final Set<PhylogenyNode> _stripped_gene_tree_nodes;
72 * Constructor which sets the gene tree and the species tree to be compared.
73 * species_tree is the species tree to which the gene tree gene_tree will be
74 * compared to - with method "infer(boolean)". Both Trees must be completely
75 * binary and rooted. The actual inference is accomplished with method
76 * "infer(boolean)". The mapping cost L can then be calculated with method
77 * "computeMappingCost()".
80 * @see #infer(boolean)
81 * @see SDI#computeMappingCostL()
83 * reference to a rooted gene tree to which assign duplication vs
84 * speciation, must have species names in the species name fields
85 * for all external nodes
87 * reference to a rooted binary species tree which might get
88 * stripped in the process, must have species names in the
89 * species name fields for all external nodes
91 * @param most_parsimonious_duplication_model
92 * set to true to assign nodes as speciations which would
93 * otherwise be assiged as unknown because of polytomies in the
97 public GSDI( final Phylogeny gene_tree,
98 final Phylogeny species_tree,
99 final boolean most_parsimonious_duplication_model,
100 final boolean strip_gene_tree ) {
101 super( gene_tree, species_tree );
102 _speciation_or_duplication_events_sum = 0;
103 _speciations_sum = 0;
104 _most_parsimonious_duplication_model = most_parsimonious_duplication_model;
105 _transversal_counts = new HashMap<PhylogenyNode, Integer>();
106 _duplications_sum = 0;
107 _strip_gene_tree = strip_gene_tree;
108 _stripped_gene_tree_nodes = new HashSet<PhylogenyNode>();
109 getSpeciesTree().preOrderReId();
111 geneTreePostOrderTraversal( getGeneTree().getRoot() );
114 public GSDI( final Phylogeny gene_tree,
115 final Phylogeny species_tree,
116 final boolean most_parsimonious_duplication_model ) {
117 this( gene_tree, species_tree, most_parsimonious_duplication_model, false );
120 private final Event createDuplicationEvent() {
121 final Event event = Event.createSingleDuplicationEvent();
126 private final Event createSingleSpeciationOrDuplicationEvent() {
127 final Event event = Event.createSingleSpeciationOrDuplicationEvent();
128 ++_speciation_or_duplication_events_sum;
132 private final Event createSpeciationEvent() {
133 final Event event = Event.createSingleSpeciationEvent();
138 // s is the node on the species tree g maps to.
139 private final void determineEvent( final PhylogenyNode s, final PhylogenyNode g ) {
141 // Determine how many children map to same node as parent.
142 int sum_g_childs_mapping_to_s = 0;
143 for( final PhylogenyNodeIterator iter = g.iterateChildNodesForward(); iter.hasNext(); ) {
144 if ( iter.next().getLink() == s ) {
145 ++sum_g_childs_mapping_to_s;
148 // Determine the sum of traversals.
149 int traversals_sum = 0;
150 int max_traversals = 0;
151 PhylogenyNode max_traversals_node = null;
152 if ( !s.isExternal() ) {
153 for( final PhylogenyNodeIterator iter = s.iterateChildNodesForward(); iter.hasNext(); ) {
154 final PhylogenyNode current_node = iter.next();
155 final int traversals = getTraversalCount( current_node );
156 traversals_sum += traversals;
157 if ( traversals > max_traversals ) {
158 max_traversals = traversals;
159 max_traversals_node = current_node;
163 // System.out.println( " sum=" + traversals_sum );
164 // System.out.println( " max=" + max_traversals );
165 // System.out.println( " m=" + sum_g_childs_mapping_to_s );
166 if ( sum_g_childs_mapping_to_s > 0 ) {
167 if ( traversals_sum == 2 ) {
168 event = createDuplicationEvent();
170 else if ( traversals_sum > 2 ) {
171 if ( max_traversals <= 1 ) {
172 if ( _most_parsimonious_duplication_model ) {
173 event = createSpeciationEvent();
176 event = createSingleSpeciationOrDuplicationEvent();
180 event = createDuplicationEvent();
181 _transversal_counts.put( max_traversals_node, 1 );
185 event = createDuplicationEvent();
189 event = createSpeciationEvent();
191 g.getNodeData().setEvent( event );
195 * Traverses the subtree of PhylogenyNode g in postorder, calculating the
196 * mapping function M, and determines which nodes represent speciation
197 * events and which ones duplication events.
199 * Preconditions: Mapping M for external nodes must have been calculated and
200 * the species tree must be labeled in preorder.
205 * starting node of a gene tree - normally the root
207 final void geneTreePostOrderTraversal( final PhylogenyNode g ) {
208 if ( !g.isExternal() ) {
209 for( final PhylogenyNodeIterator iter = g.iterateChildNodesForward(); iter.hasNext(); ) {
210 geneTreePostOrderTraversal( iter.next() );
212 final PhylogenyNode[] linked_nodes = new PhylogenyNode[ g.getNumberOfDescendants() ];
213 for( int i = 0; i < linked_nodes.length; ++i ) {
214 linked_nodes[ i ] = g.getChildNode( i ).getLink();
216 final int[] min_max = obtainMinMaxIdIndices( linked_nodes );
217 int min_i = min_max[ 0 ];
218 int max_i = min_max[ 1 ];
219 // initTransversalCounts();
220 while ( linked_nodes[ min_i ] != linked_nodes[ max_i ] ) {
221 increaseTraversalCount( linked_nodes[ max_i ] );
222 linked_nodes[ max_i ] = linked_nodes[ max_i ].getParent();
223 final int[] min_max_ = obtainMinMaxIdIndices( linked_nodes );
224 min_i = min_max_[ 0 ];
225 max_i = min_max_[ 1 ];
227 final PhylogenyNode s = linked_nodes[ max_i ];
229 // Determines whether dup. or spec.
230 determineEvent( s, g );
231 // _transversal_counts.clear();
235 public final int getSpeciationOrDuplicationEventsSum() {
236 return _speciation_or_duplication_events_sum;
239 public final int getSpeciationsSum() {
240 return _speciations_sum;
243 private final int getTraversalCount( final PhylogenyNode node ) {
244 if ( _transversal_counts.containsKey( node ) ) {
245 return _transversal_counts.get( node );
250 private final void increaseTraversalCount( final PhylogenyNode node ) {
251 if ( _transversal_counts.containsKey( node ) ) {
252 _transversal_counts.put( node, _transversal_counts.get( node ) + 1 );
255 _transversal_counts.put( node, 1 );
257 // System.out.println( "count for node " + node.getID() + " is now "
258 // + getTraversalCount( node ) );
262 * This allows for linking of internal nodes of the species tree (as opposed
263 * to just external nodes, as in the method it overrides.
267 // final void linkNodesOfG() {
268 // final HashMap<Taxonomy, PhylogenyNode> speciestree_ext_nodes = createTaxonomyToNodeMap();
269 // if ( _strip_gene_tree ) {
270 // stripGeneTree( speciestree_ext_nodes );
271 // if ( ( _gene_tree == null ) || ( _gene_tree.getNumberOfExternalNodes() < 2 ) ) {
272 // throw new IllegalArgumentException( "species tree does not contain any"
273 // + " nodes matching species in the gene tree" );
276 // // Retrieve the reference to the PhylogenyNode with a matching species.
277 // for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
278 // final PhylogenyNode g = iter.next();
279 // if ( !g.getNodeData().isHasTaxonomy() ) {
280 // throw new IllegalArgumentException( "gene tree node " + g + " has no taxonomic data" );
282 // final PhylogenyNode s = speciestree_ext_nodes.get( g.getNodeData().getTaxonomy() );
283 // if ( s == null ) {
284 // throw new IllegalArgumentException( "species " + g.getNodeData().getTaxonomy()
285 // + " not present in species tree" );
290 final void linkNodesOfG() {
291 // final HashMap<Taxonomy, PhylogenyNode> speciestree_ext_nodes = createTaxonomyToNodeMap();
292 // if ( _strip_gene_tree ) {
293 // stripGeneTree( speciestree_ext_nodes );
294 // if ( ( _gene_tree == null ) || ( _gene_tree.getNumberOfExternalNodes() < 2 ) ) {
295 // throw new IllegalArgumentException( "species tree does not contain any"
296 // + " nodes matching species in the gene tree" );
299 // // Retrieve the reference to the PhylogenyNode with a matching species.
300 // for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
301 // final PhylogenyNode g = iter.next();
302 // if ( !g.getNodeData().isHasTaxonomy() ) {
303 // throw new IllegalArgumentException( "gene tree node " + g + " has no taxonomic data" );
305 // final PhylogenyNode s = speciestree_ext_nodes.get( g.getNodeData().getTaxonomy() );
306 // if ( s == null ) {
307 // throw new IllegalArgumentException( "species " + g.getNodeData().getTaxonomy()
308 // + " not present in species tree" );
313 final Map<String, PhylogenyNode> speciestree_ext_nodes = new HashMap<String, PhylogenyNode>();
314 final TaxonomyComparisonBase tax_comp_base = determineTaxonomyComparisonBase( _gene_tree );
315 System.out.println( "comp base is: " + tax_comp_base );
316 // if ( _strip_gene_tree ) {
317 // stripGeneTree2( speciestree_ext_nodes );
318 // if ( ( _gene_tree == null ) || ( _gene_tree.getNumberOfExternalNodes() < 2 ) ) {
319 // throw new IllegalArgumentException( "species tree does not contain any"
320 // + " nodes matching species in the gene tree" );
323 // Put references to all external nodes of the species tree into a map.
324 // Stringyfied taxonomy is the key, node is the value.
325 for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
326 final PhylogenyNode s = iter.next();
327 final String tax_str = taxonomyToString( s, tax_comp_base );
328 if ( speciestree_ext_nodes.containsKey( tax_str ) ) {
329 throw new IllegalArgumentException( "taxonomy [" + s + "] is not unique in species phylogeny" );
331 speciestree_ext_nodes.put( tax_str, s );
333 // Retrieve the reference to the node with a matching stringyfied taxonomy.
334 for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
335 final PhylogenyNode g = iter.next();
336 if ( !g.getNodeData().isHasTaxonomy() ) {
337 if ( _strip_gene_tree ) {
338 _stripped_gene_tree_nodes.add( g );
342 throw new IllegalArgumentException( "gene tree node [" + g + "] has no taxonomic data" );
345 final String tax_str = taxonomyToString( g, tax_comp_base );
346 if ( ForesterUtil.isEmpty( tax_str ) ) {
347 if ( _strip_gene_tree ) {
348 _stripped_gene_tree_nodes.add( g );
352 throw new IllegalArgumentException( "gene tree node [" + g + "] has no appropriate taxonomic data" );
355 final PhylogenyNode s = speciestree_ext_nodes.get( tax_str );
357 if ( _strip_gene_tree ) {
358 _stripped_gene_tree_nodes.add( g );
361 throw new IllegalArgumentException( "taxonomy [" + g.getNodeData().getTaxonomy()
362 + "] not present in species tree" );
367 System.out.println( "setting link of " + g + " to " + s );
369 if ( _strip_gene_tree ) {
370 for( final PhylogenyNode n : _stripped_gene_tree_nodes ) {
371 if ( _gene_tree.getNode( n.getId() ) != null ) {
372 _gene_tree.deleteSubtree( _gene_tree.getNode( n.getId() ), true );
379 final private HashMap<Taxonomy, PhylogenyNode> createTaxonomyToNodeMap() {
380 final HashMap<Taxonomy, PhylogenyNode> speciestree_ext_nodes = new HashMap<Taxonomy, PhylogenyNode>();
381 for( final PhylogenyNodeIterator iter = _species_tree.iteratorLevelOrder(); iter.hasNext(); ) {
382 final PhylogenyNode n = iter.next();
383 if ( n.getNodeData().isHasTaxonomy() ) {
384 if ( speciestree_ext_nodes.containsKey( n.getNodeData().getTaxonomy() ) ) {
385 throw new IllegalArgumentException( "taxonomy [" + n.getNodeData().getTaxonomy()
386 + "] is not unique in species phylogeny" );
388 speciestree_ext_nodes.put( n.getNodeData().getTaxonomy(), n );
391 return speciestree_ext_nodes;
394 private final void stripGeneTree( final HashMap<Taxonomy, PhylogenyNode> speciestree_ext_nodes ) {
395 // final Set<PhylogenyNode> to_delete = new HashSet<PhylogenyNode>();
396 for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
397 final PhylogenyNode g = iter.next();
398 if ( !g.getNodeData().isHasTaxonomy() ) {
399 throw new IllegalArgumentException( "gene tree node " + g + " has no taxonomic data" );
401 if ( !speciestree_ext_nodes.containsKey( g.getNodeData().getTaxonomy() ) ) {
402 _stripped_gene_tree_nodes.add( g );
405 for( final PhylogenyNode n : _stripped_gene_tree_nodes ) {
406 _gene_tree.deleteSubtree( n, true );
410 private final void stripGeneTree2( final HashMap<Taxonomy, PhylogenyNode> speciestree_ext_nodes ) {
411 // final Set<PhylogenyNode> to_delete = new HashSet<PhylogenyNode>();
412 for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
413 final PhylogenyNode g = iter.next();
414 if ( !g.getNodeData().isHasTaxonomy() ) {
415 _stripped_gene_tree_nodes.add( g );
418 if ( !speciestree_ext_nodes.containsKey( g.getNodeData().getTaxonomy() ) ) {
419 _stripped_gene_tree_nodes.add( g );
423 for( final PhylogenyNode n : _stripped_gene_tree_nodes ) {
424 _gene_tree.deleteSubtree( n, true );
428 public static TaxonomyComparisonBase determineTaxonomyComparisonBase( final Phylogeny gene_tree ) {
429 int with_id_count = 0;
430 int with_code_count = 0;
431 int with_sn_count = 0;
433 for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
434 final PhylogenyNode g = iter.next();
435 if ( g.getNodeData().isHasTaxonomy() ) {
436 final Taxonomy tax = g.getNodeData().getTaxonomy();
437 if ( ( tax.getIdentifier() != null ) && !ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) {
438 if ( ++with_id_count > max ) {
442 if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
443 if ( ++with_code_count > max ) {
444 max = with_code_count;
447 if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) {
448 if ( ++with_sn_count > max ) {
455 throw new IllegalArgumentException( "gene tree has no taxonomic data" );
457 else if ( max == 1 ) {
458 throw new IllegalArgumentException( "gene tree has only one node with taxonomic data" );
460 else if ( max == with_sn_count ) {
461 return SDI.TaxonomyComparisonBase.SCIENTIFIC_NAME;
463 else if ( max == with_id_count ) {
464 return SDI.TaxonomyComparisonBase.ID;
467 return SDI.TaxonomyComparisonBase.CODE;
471 public Set<PhylogenyNode> getStrippedExternalGeneTreeNodes() {
472 return _stripped_gene_tree_nodes;
476 public final String toString() {
477 final StringBuffer sb = new StringBuffer();
478 sb.append( "Most parsimonious duplication model: " + _most_parsimonious_duplication_model );
479 sb.append( ForesterUtil.getLineSeparator() );
480 sb.append( "Speciations sum : " + getSpeciationsSum() );
481 sb.append( ForesterUtil.getLineSeparator() );
482 sb.append( "Duplications sum : " + getDuplicationsSum() );
483 sb.append( ForesterUtil.getLineSeparator() );
484 if ( !_most_parsimonious_duplication_model ) {
485 sb.append( "Speciation or duplications sum : " + getSpeciationOrDuplicationEventsSum() );
486 sb.append( ForesterUtil.getLineSeparator() );
488 sb.append( "mapping cost L : " + computeMappingCostL() );
489 return sb.toString();
492 static final int[] obtainMinMaxIdIndices( final PhylogenyNode[] linked_nodes ) {
495 int max_i_id = -Integer.MAX_VALUE;
496 int min_i_id = Integer.MAX_VALUE;
497 for( int i = 0; i < linked_nodes.length; ++i ) {
498 final int id_i = linked_nodes[ i ].getId();
499 if ( id_i > max_i_id ) {
501 max_i_id = linked_nodes[ max_i ].getId();
503 if ( id_i < min_i_id ) {
505 min_i_id = linked_nodes[ min_i ].getId();
508 return new int[] { min_i, max_i };
511 * Updates the mapping function M after the root of the gene tree has been
512 * moved by one branch. It calculates M for the root of the gene tree and
513 * one of its two children.
515 * To be used ONLY by method "SDIunrooted.fastInfer(Phylogeny,Phylogeny)".
519 * @param prev_root_was_dup
520 * true if the previous root was a duplication, false otherwise
521 * @param prev_root_c1
522 * child 1 of the previous root
523 * @param prev_root_c2
524 * child 2 of the previous root
525 * @return number of duplications which have been assigned in gene tree
527 // int updateM( final boolean prev_root_was_dup,
528 // final PhylogenyNode prev_root_c1, final PhylogenyNode prev_root_c2 ) {
529 // final PhylogenyNode root = getGeneTree().getRoot();
530 // if ( ( root.getChildNode1() == prev_root_c1 )
531 // || ( root.getChildNode2() == prev_root_c1 ) ) {
532 // calculateMforNode( prev_root_c1 );
535 // calculateMforNode( prev_root_c2 );
537 // Event event = null;
538 // if ( prev_root_was_dup ) {
539 // event = Event.createSingleDuplicationEvent();
542 // event = Event.createSingleSpeciationEvent();
544 // root.getPhylogenyNodeData().setEvent( event );
545 // calculateMforNode( root );
546 // return getDuplications();
547 // } // updateM( boolean, PhylogenyNode, PhylogenyNode )
548 // Helper method for updateM( boolean, PhylogenyNode, PhylogenyNode )
549 // Calculates M for PhylogenyNode n, given that M for the two children
550 // of n has been calculated.
551 // (Last modified: 10/02/01)
552 // private void calculateMforNode( final PhylogenyNode n ) {
553 // if ( !n.isExternal() ) {
554 // boolean was_duplication = n.isDuplication();
555 // PhylogenyNode a = n.getChildNode1().getLink(), b = n
556 // .getChildNode2().getLink();
557 // while ( a != b ) {
558 // if ( a.getID() > b.getID() ) {
559 // a = a.getParent();
562 // b = b.getParent();
566 // Event event = null;
567 // if ( ( a == n.getChildNode1().getLink() )
568 // || ( a == n.getChildNode2().getLink() ) ) {
569 // event = Event.createSingleDuplicationEvent();
570 // if ( !was_duplication ) {
571 // increaseDuplications();
575 // event = Event.createSingleSpeciationEvent();
576 // if ( was_duplication ) {
577 // decreaseDuplications();
580 // n.getPhylogenyNodeData().setEvent( event );
582 // } // calculateMforNode( PhylogenyNode )