2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // Copyright (C) 2000-2001 Washington University School of Medicine
8 // and Howard Hughes Medical Institute
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
25 // Contact: phylosoft @ gmail . com
26 // WWW: www.phylosoft.org/forester
28 package org.forester.sdi;
30 import java.util.HashMap;
33 import org.forester.phylogeny.Phylogeny;
34 import org.forester.phylogeny.PhylogenyMethods;
35 import org.forester.phylogeny.PhylogenyNode;
36 import org.forester.phylogeny.data.Event;
37 import org.forester.phylogeny.data.Taxonomy;
38 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
39 import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
40 import org.forester.util.ForesterUtil;
43 * Implements our algorithm for speciation - duplication inference (SDI). <p>
44 * Reference: </p> <ul> <li>Zmasek, C.M. and Eddy, S.R. (2001) "A simple
45 * algorithm to infer gene duplication and speciation events on a gene tree".
46 * Bioinformatics, in press. </ul> <p> The initialization is accomplished by:
47 * </p> <ul> <li>method "linkExtNodesOfG()" of class SDI: setting the links for
48 * the external nodes of the gene tree <li>"preorderReID(int)" from class
49 * Phylogeny: numbering of nodes of the species tree in preorder <li>the
50 * optional stripping of the species tree is accomplished by method
51 * "stripTree(Phylogeny,Phylogeny)" of class Phylogeny </ul> <p> The recursion
52 * part is accomplished by this class' method
53 * "geneTreePostOrderTraversal(PhylogenyNode)". <p> Requires JDK 1.2 or greater.
55 * @see SDI#linkNodesOfG()
57 * @see Phylogeny#preorderReID(int)
60 * PhylogenyMethods#taxonomyBasedDeletionOfExternalNodes(Phylogeny,Phylogeny)
62 * @see #geneTreePostOrderTraversal(PhylogenyNode)
64 * @author Christian M. Zmasek
66 * @version 1.102 -- last modified: 10/02/01
70 final Phylogeny _gene_tree;
71 final Phylogeny _species_tree;
72 int _duplications_sum; // Sum of duplications.
73 int _mapping_cost; // Mapping cost "L".
76 * Constructor which sets the gene tree and the species tree to be compared.
77 * species_tree is the species tree to which the gene tree gene_tree will be
78 * compared to - with method "infer(boolean)". Both Trees must be completely
79 * binary and rooted. The actual inference is accomplished with method
80 * "infer(boolean)". The mapping cost L can then be calculated with method
81 * "computeMappingCost()".
83 * (Last modified: 01/11/01)
85 * @see #infer(boolean)
86 * @see SDI#computeMappingCostL()
88 * reference to a rooted binary gene Phylogeny to which assign
89 * duplication vs speciation, must have species names in the
90 * species name fields for all external nodes
92 * reference to a rooted binary species Phylogeny which might get
93 * stripped in the process, must have species names in the
94 * species name fields for all external nodes
95 * @throws SDIException
97 public SDI( final Phylogeny gene_tree, final Phylogeny species_tree ) throws SDIException {
98 if ( species_tree.isEmpty() || gene_tree.isEmpty() ) {
99 throw new IllegalArgumentException( "attempt to infer duplications using empty tree(s)" );
101 if ( !species_tree.isRooted() ) {
102 throw new IllegalArgumentException( "attempt to infer duplications on unrooted species tree" );
104 _gene_tree = gene_tree;
105 _species_tree = species_tree;
107 _duplications_sum = 0;
108 PhylogenyMethods.preOrderReId( getSpeciesTree() );
110 geneTreePostOrderTraversal( getGeneTree().getRoot() );
114 * Computes the cost of mapping the gene tree gene_tree onto the species
115 * tree species_tree. Before this method can be called, the mapping has to
116 * be calculated with method "infer(boolean)".
118 * Reference. Zhang, L. (1997) On a Mirkin-Muchnik-Smith Conjecture for
119 * Comparing Molecular Phylogenies. Journal of Computational Biology 4
122 * @return the mapping cost "L"
124 public int computeMappingCostL() {
125 _species_tree.levelOrderReID();
127 computeMappingCostHelper( _gene_tree.getRoot() );
128 return _mapping_cost;
132 * Returns the number of duplications.
134 * @return number of duplications
136 public int getDuplicationsSum() {
137 return _duplications_sum;
141 * Returns the gene tree.
145 public Phylogeny getGeneTree() {
150 * Returns the species tree.
152 * @return species tree
154 public Phylogeny getSpeciesTree() {
155 return _species_tree;
159 public String toString() {
160 final StringBuffer sb = new StringBuffer();
161 sb.append( getClass() );
162 sb.append( ForesterUtil.LINE_SEPARATOR );
163 sb.append( "Duplications sum : " + getDuplicationsSum() );
164 sb.append( ForesterUtil.LINE_SEPARATOR );
165 sb.append( "mapping cost L : " + computeMappingCostL() );
166 return sb.toString();
170 * Traverses the subtree of PhylogenyNode g in postorder, calculating the
171 * mapping function M, and determines which nodes represent speciation
172 * events and which ones duplication events.
174 * Preconditions: Mapping M for external nodes must have been calculated and
175 * the species tree must be labelled in preorder.
177 * (Last modified: 01/11/01)
180 * starting node of a gene tree - normally the root
182 void geneTreePostOrderTraversal( final PhylogenyNode g ) {
184 if ( !g.isExternal() ) {
185 geneTreePostOrderTraversal( g.getChildNode( 0 ) );
186 geneTreePostOrderTraversal( g.getChildNode( 1 ) );
187 a = g.getChildNode( 0 ).getLink();
188 b = g.getChildNode( 1 ).getLink();
190 if ( a.getId() > b.getId() ) {
198 // Determines whether dup. or spec.
200 if ( ( a == g.getChildNode( 0 ).getLink() ) || ( a == g.getChildNode( 1 ).getLink() ) ) {
201 event = Event.createSingleDuplicationEvent();
205 event = Event.createSingleSpeciationEvent();
207 g.getNodeData().setEvent( event );
209 } // geneTreePostOrderTraversal( PhylogenyNode )
212 * Calculates the mapping function for the external nodes of the gene tree:
213 * links (sets the field "link" of PhylogenyNode) each external
214 * PhylogenyNode of gene_tree to the external PhylogenyNode of species_tree
215 * which has the same species name.
216 * @throws SDIException
218 final void linkNodesOfG() throws SDIException {
219 final Map<String, PhylogenyNode> speciestree_ext_nodes = new HashMap<String, PhylogenyNode>();
220 final TaxonomyComparisonBase tax_comp_base = determineTaxonomyComparisonBase();
221 // Put references to all external nodes of the species tree into a map.
222 // Stringyfied taxonomy is the key, node is the value.
223 for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
224 final PhylogenyNode s = iter.next();
225 final String tax_str = SDIutil.taxonomyToString( s, tax_comp_base );
226 if ( speciestree_ext_nodes.containsKey( tax_str ) ) {
227 throw new IllegalArgumentException( "taxonomy [" + s.getNodeData().getTaxonomy()
228 + "] is not unique in species phylogeny" );
230 speciestree_ext_nodes.put( tax_str, s );
232 // Retrieve the reference to the node with a matching stringyfied taxonomy.
233 for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
234 final PhylogenyNode g = iter.next();
235 final String tax_str = SDIutil.taxonomyToString( g, tax_comp_base );
236 final PhylogenyNode s = speciestree_ext_nodes.get( tax_str );
238 throw new IllegalArgumentException( "taxonomy [" + g.getNodeData().getTaxonomy()
239 + "] not present in species tree" );
246 * Updates the mapping function M after the root of the gene tree has been
247 * moved by one branch. It calculates M for the root of the gene tree and
248 * one of its two children.
250 * To be used ONLY by method "SDIunrooted.fastInfer(Phylogeny,Phylogeny)".
252 * (Last modfied: 10/02/01)
254 * @param prev_root_was_dup
255 * true if the previous root was a duplication, false otherwise
256 * @param prev_root_c1
257 * child 1 of the previous root
258 * @param prev_root_c2
259 * child 2 of the previous root
260 * @return number of duplications which have been assigned in gene tree
262 int updateM( final boolean prev_root_was_dup, final PhylogenyNode prev_root_c1, final PhylogenyNode prev_root_c2 ) {
263 final PhylogenyNode root = getGeneTree().getRoot();
264 if ( ( root.getChildNode1() == prev_root_c1 ) || ( root.getChildNode2() == prev_root_c1 ) ) {
265 calculateMforNode( prev_root_c1 );
268 calculateMforNode( prev_root_c2 );
271 if ( prev_root_was_dup ) {
272 event = Event.createSingleDuplicationEvent();
275 event = Event.createSingleSpeciationEvent();
277 root.getNodeData().setEvent( event );
278 calculateMforNode( root );
279 return getDuplicationsSum();
280 } // updateM( boolean, PhylogenyNode, PhylogenyNode )
282 // Helper method for updateM( boolean, PhylogenyNode, PhylogenyNode )
283 // Calculates M for PhylogenyNode n, given that M for the two children
284 // of n has been calculated.
285 // (Last modified: 10/02/01)
286 private void calculateMforNode( final PhylogenyNode n ) {
287 if ( !n.isExternal() ) {
288 final boolean was_duplication = n.isDuplication();
289 PhylogenyNode a = n.getChildNode1().getLink();
290 PhylogenyNode b = n.getChildNode2().getLink();
292 if ( a.getId() > b.getId() ) {
301 if ( ( a == n.getChildNode1().getLink() ) || ( a == n.getChildNode2().getLink() ) ) {
302 event = Event.createSingleDuplicationEvent();
303 if ( !was_duplication ) {
308 event = Event.createSingleSpeciationEvent();
309 if ( was_duplication ) {
313 n.getNodeData().setEvent( event );
315 } // calculateMforNode( PhylogenyNode )
317 // Helper method for "computeMappingCost()".
318 private void computeMappingCostHelper( final PhylogenyNode g ) {
319 if ( !g.isExternal() ) {
320 computeMappingCostHelper( g.getChildNode1() );
321 computeMappingCostHelper( g.getChildNode2() );
322 if ( ( g.getLink() != g.getChildNode1().getLink() ) && ( g.getLink() != g.getChildNode2().getLink() ) ) {
323 _mapping_cost += ( ( g.getChildNode1().getLink().getId() + g.getChildNode2().getLink().getId() )
324 - ( 2 * g.getLink().getId() ) - 2 );
326 else if ( ( g.getLink() != g.getChildNode1().getLink() ) && ( g.getLink() == g.getChildNode2().getLink() ) ) {
327 _mapping_cost += ( ( g.getChildNode1().getLink().getId() - g.getLink().getId() ) + 1 );
329 else if ( ( g.getLink() == g.getChildNode1().getLink() ) && ( g.getLink() != g.getChildNode2().getLink() ) ) {
330 _mapping_cost += ( ( g.getChildNode2().getLink().getId() - g.getLink().getId() ) + 1 );
338 private TaxonomyComparisonBase determineTaxonomyComparisonBase() {
339 TaxonomyComparisonBase base = null;
340 boolean all_have_id = true;
341 boolean all_have_code = true;
342 boolean all_have_sn = true;
343 for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
344 final PhylogenyNode n = iter.next();
345 if ( n.getNodeData().isHasTaxonomy() ) {
346 final Taxonomy tax = n.getNodeData().getTaxonomy();
347 if ( ( tax.getIdentifier() == null ) || ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) {
350 if ( ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
351 all_have_code = false;
353 if ( ForesterUtil.isEmpty( tax.getScientificName() ) ) {
358 throw new IllegalArgumentException( "species tree node [" + n + "] has no taxonomic data" );
361 for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
362 final PhylogenyNode n = iter.next();
363 if ( n.getNodeData().isHasTaxonomy() ) {
364 final Taxonomy tax = n.getNodeData().getTaxonomy();
365 if ( ( tax.getIdentifier() == null ) || ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) {
368 if ( ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
369 all_have_code = false;
371 if ( ForesterUtil.isEmpty( tax.getScientificName() ) ) {
376 throw new IllegalArgumentException( "gene tree node [" + n + "] has no taxonomic data" );
380 base = TaxonomyComparisonBase.ID;
382 else if ( all_have_code ) {
383 base = TaxonomyComparisonBase.CODE;
385 else if ( all_have_sn ) {
386 base = TaxonomyComparisonBase.SCIENTIFIC_NAME;
389 throw new IllegalArgumentException( "gene tree and species tree have incomparable taxonomies" );
395 * Calculates the mapping function for the external nodes of the gene tree:
396 * links (sets the field "link" of PhylogenyNode) each external by taxonomy
398 * PhylogenyNode of gene_tree to the external PhylogenyNode of species_tree
399 * which has the same species name.
400 * Olivier CHABROL : olivier.chabrol@univ-provence.fr
402 private final void linkNodesOfGByTaxonomyIdentifier() {
403 final HashMap<String, PhylogenyNode> speciestree_ext_nodes = new HashMap<String, PhylogenyNode>();
404 if ( _species_tree.getFirstExternalNode().isRoot() ) {
405 speciestree_ext_nodes.put( _species_tree.getFirstExternalNode().getNodeData().getTaxonomy().getIdentifier()
406 .getValue(), _species_tree.getFirstExternalNode() );
409 for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
410 final PhylogenyNode s = iter.next();
411 speciestree_ext_nodes.put( s.getNodeData().getTaxonomy().getIdentifier().getValue(), s );
414 for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
415 final PhylogenyNode g = iter.next();
416 final PhylogenyNode s = speciestree_ext_nodes
417 .get( g.getNodeData().getTaxonomy().getIdentifier().getValue() );
419 String message = "species [" + g.getNodeData().getTaxonomy().getIdentifier().getValue();
420 message += "] not present in species tree";
421 throw new IllegalArgumentException( message );
426 } // End of class SDIse.