2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // Copyright (C) 2000-2001 Washington University School of Medicine
8 // and Howard Hughes Medical Institute
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
25 // Contact: phylosoft @ gmail . com
26 // WWW: www.phylosoft.org
28 package org.forester.sdi;
30 import java.util.HashMap;
33 import org.forester.phylogeny.Phylogeny;
34 import org.forester.phylogeny.PhylogenyNode;
35 import org.forester.phylogeny.data.Taxonomy;
36 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
37 import org.forester.util.ForesterUtil;
39 public abstract class SDI {
41 final Phylogeny _gene_tree;
42 final Phylogeny _species_tree;
43 int _duplications_sum; // Sum of duplications.
44 int _mapping_cost; // Mapping cost "L".
47 * Constructor which sets the gene tree and the species tree to be compared.
48 * species_tree is the species tree to which the gene tree gene_tree will be
50 * Infers for each PhylogenyNode of gene_tree whether
51 * it represents a speciation or duplication event by calculating and
52 * interpreting the mapping function M. The most parsimonious sequence of
53 * speciation and duplication events is assumed.
54 * The mapping cost L can be
55 * calculated with method "computeMappingCost()".
60 * <li>Both Trees must be rooted
61 * <li>Both Trees must have species names in the species name fields of all
62 * their external nodes
66 * reference to a rooted binary gene Phylogeny to which assign
67 * duplication vs speciation, must have species names in the
68 * species name fields for all external nodes
70 * reference to a rooted binary species Phylogeny which might get
71 * stripped in the process, must have species names in the
72 * species name fields for all external nodes
74 public SDI( final Phylogeny gene_tree, final Phylogeny species_tree ) {
75 if ( species_tree.isEmpty() || gene_tree.isEmpty() ) {
76 throw new IllegalArgumentException( "attempt to infer duplications using empty tree(s)" );
78 if ( !gene_tree.isRooted() ) {
79 throw new IllegalArgumentException( "attempt to infer duplications on unrooted gene tree" );
81 if ( !species_tree.isRooted() ) {
82 throw new IllegalArgumentException( "attempt to infer duplications on unrooted species tree" );
84 _gene_tree = gene_tree;
85 _species_tree = species_tree;
86 _duplications_sum = 0;
90 // Helper method for "computeMappingCost()".
91 private void computeMappingCostHelper( final PhylogenyNode g ) {
92 if ( !g.isExternal() ) {
93 computeMappingCostHelper( g.getChildNode1() );
94 computeMappingCostHelper( g.getChildNode2() );
95 if ( ( g.getLink() != g.getChildNode1().getLink() ) && ( g.getLink() != g.getChildNode2().getLink() ) ) {
96 _mapping_cost += ( g.getChildNode1().getLink().getId() + g.getChildNode2().getLink().getId()
97 - ( 2 * g.getLink().getId() ) - 2 );
99 else if ( ( g.getLink() != g.getChildNode1().getLink() ) && ( g.getLink() == g.getChildNode2().getLink() ) ) {
100 _mapping_cost += ( g.getChildNode1().getLink().getId() - g.getLink().getId() + 1 );
102 else if ( ( g.getLink() == g.getChildNode1().getLink() ) && ( g.getLink() != g.getChildNode2().getLink() ) ) {
103 _mapping_cost += ( g.getChildNode2().getLink().getId() - g.getLink().getId() + 1 );
112 * Computes the cost of mapping the gene tree gene_tree onto the species
113 * tree species_tree. Before this method can be called, the mapping has to
114 * be calculated with method "infer(boolean)".
116 * Reference. Zhang, L. (1997) On a Mirkin-Muchnik-Smith Conjecture for
117 * Comparing Molecular Phylogenies. Journal of Computational Biology 4
120 * @return the mapping cost "L"
122 public int computeMappingCostL() {
123 _species_tree.levelOrderReID();
125 computeMappingCostHelper( _gene_tree.getRoot() );
126 return _mapping_cost;
129 private TaxonomyComparisonBase determineTaxonomyComparisonBase() {
130 TaxonomyComparisonBase base = null;
131 boolean all_have_id = true;
132 boolean all_have_code = true;
133 boolean all_have_sn = true;
134 boolean all_have_cn = true;
135 for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
136 final PhylogenyNode n = iter.next();
137 if ( n.getNodeData().isHasTaxonomy() ) {
138 final Taxonomy tax = n.getNodeData().getTaxonomy();
139 if ( ( tax.getIdentifier() == null ) || ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) {
142 if ( ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
143 all_have_code = false;
145 if ( ForesterUtil.isEmpty( tax.getScientificName() ) ) {
148 if ( ForesterUtil.isEmpty( tax.getCommonName() ) ) {
153 throw new IllegalArgumentException( "species tree node [" + n + "] has no taxonomic data" );
156 for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
157 final PhylogenyNode n = iter.next();
158 if ( n.getNodeData().isHasTaxonomy() ) {
159 final Taxonomy tax = n.getNodeData().getTaxonomy();
160 if ( ( tax.getIdentifier() == null ) || ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) {
163 if ( ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
164 all_have_code = false;
166 if ( ForesterUtil.isEmpty( tax.getScientificName() ) ) {
169 if ( ForesterUtil.isEmpty( tax.getCommonName() ) ) {
174 throw new IllegalArgumentException( "gene tree node [" + n + "] has no taxonomic data" );
178 base = TaxonomyComparisonBase.ID;
180 else if ( all_have_code ) {
181 base = TaxonomyComparisonBase.CODE;
183 else if ( all_have_sn ) {
184 base = TaxonomyComparisonBase.SCIENTIFIC_NAME;
186 else if ( all_have_cn ) {
187 base = TaxonomyComparisonBase.COMMON_NAME;
190 throw new IllegalArgumentException( "gene tree and species tree have incomparable taxonomies" );
196 * Returns the number of duplications.
198 * @return number of duplications
200 public int getDuplicationsSum() {
201 return _duplications_sum;
205 * Returns the gene tree.
209 public Phylogeny getGeneTree() {
214 * Returns the species tree.
216 * @return species tree
218 public Phylogeny getSpeciesTree() {
219 return _species_tree;
223 * Calculates the mapping function for the external nodes of the gene tree:
224 * links (sets the field "link" of PhylogenyNode) each external
225 * PhylogenyNode of gene_tree to the external PhylogenyNode of species_tree
226 * which has the same species name.
228 void linkNodesOfG() {
229 final Map<String, PhylogenyNode> speciestree_ext_nodes = new HashMap<String, PhylogenyNode>();
230 final TaxonomyComparisonBase tax_comp_base = determineTaxonomyComparisonBase();
231 // Put references to all external nodes of the species tree into a map.
232 // Stringyfied taxonomy is the key, node is the value.
233 for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
234 final PhylogenyNode s = iter.next();
235 final String tax_str = taxonomyToString( s, tax_comp_base );
236 if ( speciestree_ext_nodes.containsKey( tax_str ) ) {
237 throw new IllegalArgumentException( "taxonomy [" + s.getNodeData().getTaxonomy()
238 + "] is not unique in species phylogeny" );
240 speciestree_ext_nodes.put( tax_str, s );
242 // Retrieve the reference to the node with a matching stringyfied taxonomy.
243 for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
244 final PhylogenyNode g = iter.next();
245 final String tax_str = taxonomyToString( g, tax_comp_base );
246 final PhylogenyNode s = speciestree_ext_nodes.get( tax_str );
248 throw new IllegalArgumentException( "taxonomy [" + g.getNodeData().getTaxonomy()
249 + "] not present in species tree" );
256 * Calculates the mapping function for the external nodes of the gene tree:
257 * links (sets the field "link" of PhylogenyNode) each external by taxonomy
259 * PhylogenyNode of gene_tree to the external PhylogenyNode of species_tree
260 * which has the same species name.
261 * Olivier CHABROL : olivier.chabrol@univ-provence.fr
263 void linkNodesOfGByTaxonomyIdentifier() {
264 final HashMap<String, PhylogenyNode> speciestree_ext_nodes = new HashMap<String, PhylogenyNode>();
265 if ( _species_tree.getFirstExternalNode().isRoot() ) {
266 speciestree_ext_nodes.put( _species_tree.getFirstExternalNode().getNodeData().getTaxonomy().getIdentifier()
267 .getValue(), _species_tree.getFirstExternalNode() );
270 for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
271 final PhylogenyNode s = iter.next();
272 speciestree_ext_nodes.put( s.getNodeData().getTaxonomy().getIdentifier().getValue(), s );
275 for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
276 final PhylogenyNode g = iter.next();
277 final PhylogenyNode s = speciestree_ext_nodes
278 .get( g.getNodeData().getTaxonomy().getIdentifier().getValue() );
280 String message = "species [" + g.getNodeData().getTaxonomy().getIdentifier().getValue();
281 message += "] not present in species tree";
282 throw new IllegalArgumentException( message );
289 public String toString() {
290 final StringBuffer sb = new StringBuffer();
291 sb.append( getClass() );
292 sb.append( ForesterUtil.LINE_SEPARATOR );
293 sb.append( "Duplications sum : " + getDuplicationsSum() );
294 sb.append( ForesterUtil.LINE_SEPARATOR );
295 sb.append( "mapping cost L : " + computeMappingCostL() );
296 return sb.toString();
299 private static String taxonomyToString( final PhylogenyNode n, final TaxonomyComparisonBase base ) {
300 final Taxonomy tax = n.getNodeData().getTaxonomy();
303 return tax.getIdentifier().getValue();
305 return tax.getTaxonomyCode();
306 case SCIENTIFIC_NAME:
307 return tax.getScientificName();
309 return tax.getCommonName();
311 throw new IllegalArgumentException( "unknown comparison base for taxonomies: " + base );
315 enum TaxonomyComparisonBase {
316 ID, CODE, SCIENTIFIC_NAME, COMMON_NAME;