2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // Copyright (C) 2000-2001 Washington University School of Medicine
8 // and Howard Hughes Medical Institute
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
25 // Contact: phylosoft @ gmail . com
26 // WWW: www.phylosoft.org
28 package org.forester.sdi;
30 import java.util.HashMap;
33 import org.forester.phylogeny.Phylogeny;
34 import org.forester.phylogeny.PhylogenyNode;
35 import org.forester.phylogeny.data.Taxonomy;
36 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
37 import org.forester.util.ForesterUtil;
39 public abstract class SDI {
41 final Phylogeny _gene_tree;
42 final Phylogeny _species_tree;
43 int _duplications_sum; // Sum of duplications.
44 int _mapping_cost; // Mapping cost "L".
47 * Constructor which sets the gene tree and the species tree to be compared.
48 * species_tree is the species tree to which the gene tree gene_tree will be
50 * Infers for each PhylogenyNode of gene_tree whether
51 * it represents a speciation or duplication event by calculating and
52 * interpreting the mapping function M. The most parsimonious sequence of
53 * speciation and duplication events is assumed.
54 * The mapping cost L can be
55 * calculated with method "computeMappingCost()".
60 * <li>Both Trees must be rooted
61 * <li>Both Trees must have species names in the species name fields of all
62 * their external nodes
66 * reference to a rooted binary gene Phylogeny to which assign
67 * duplication vs speciation, must have species names in the
68 * species name fields for all external nodes
70 * reference to a rooted binary species Phylogeny which might get
71 * stripped in the process, must have species names in the
72 * species name fields for all external nodes
74 public SDI( final Phylogeny gene_tree, final Phylogeny species_tree ) {
75 if ( species_tree.isEmpty() || gene_tree.isEmpty() ) {
76 throw new IllegalArgumentException( "attempt to infer duplications using empty tree(s)" );
78 if ( !gene_tree.isRooted() ) {
79 throw new IllegalArgumentException( "attempt to infer duplications on unrooted gene tree" );
81 if ( !species_tree.isRooted() ) {
82 throw new IllegalArgumentException( "attempt to infer duplications on unrooted species tree" );
84 _gene_tree = gene_tree;
85 _species_tree = species_tree;
86 _duplications_sum = 0;
90 // Helper method for "computeMappingCost()".
91 private void computeMappingCostHelper( final PhylogenyNode g ) {
92 if ( !g.isExternal() ) {
93 computeMappingCostHelper( g.getChildNode1() );
94 computeMappingCostHelper( g.getChildNode2() );
95 if ( ( g.getLink() != g.getChildNode1().getLink() ) && ( g.getLink() != g.getChildNode2().getLink() ) ) {
96 _mapping_cost += ( g.getChildNode1().getLink().getId() + g.getChildNode2().getLink().getId()
97 - ( 2 * g.getLink().getId() ) - 2 );
99 else if ( ( g.getLink() != g.getChildNode1().getLink() ) && ( g.getLink() == g.getChildNode2().getLink() ) ) {
100 _mapping_cost += ( g.getChildNode1().getLink().getId() - g.getLink().getId() + 1 );
102 else if ( ( g.getLink() == g.getChildNode1().getLink() ) && ( g.getLink() != g.getChildNode2().getLink() ) ) {
103 _mapping_cost += ( g.getChildNode2().getLink().getId() - g.getLink().getId() + 1 );
112 * Computes the cost of mapping the gene tree gene_tree onto the species
113 * tree species_tree. Before this method can be called, the mapping has to
114 * be calculated with method "infer(boolean)".
116 * Reference. Zhang, L. (1997) On a Mirkin-Muchnik-Smith Conjecture for
117 * Comparing Molecular Phylogenies. Journal of Computational Biology 4
120 * @return the mapping cost "L"
122 public int computeMappingCostL() {
123 _species_tree.levelOrderReID();
125 computeMappingCostHelper( _gene_tree.getRoot() );
126 return _mapping_cost;
129 private TaxonomyComparisonBase determineTaxonomyComparisonBase() {
130 TaxonomyComparisonBase base = null;
131 boolean all_have_id = true;
132 boolean all_have_code = true;
133 boolean all_have_sn = true;
134 for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
135 final PhylogenyNode n = iter.next();
136 if ( n.getNodeData().isHasTaxonomy() ) {
137 final Taxonomy tax = n.getNodeData().getTaxonomy();
138 if ( ( tax.getIdentifier() == null ) || ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) {
141 if ( ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
142 all_have_code = false;
144 if ( ForesterUtil.isEmpty( tax.getScientificName() ) ) {
149 throw new IllegalArgumentException( "species tree node [" + n + "] has no taxonomic data" );
152 for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
153 final PhylogenyNode n = iter.next();
154 if ( n.getNodeData().isHasTaxonomy() ) {
155 final Taxonomy tax = n.getNodeData().getTaxonomy();
156 if ( ( tax.getIdentifier() == null ) || ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) {
159 if ( ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
160 all_have_code = false;
162 if ( ForesterUtil.isEmpty( tax.getScientificName() ) ) {
167 throw new IllegalArgumentException( "gene tree node [" + n + "] has no taxonomic data" );
171 base = TaxonomyComparisonBase.ID;
173 else if ( all_have_code ) {
174 base = TaxonomyComparisonBase.CODE;
176 else if ( all_have_sn ) {
177 base = TaxonomyComparisonBase.SCIENTIFIC_NAME;
180 throw new IllegalArgumentException( "gene tree and species tree have incomparable taxonomies" );
186 * Returns the number of duplications.
188 * @return number of duplications
190 public int getDuplicationsSum() {
191 return _duplications_sum;
195 * Returns the gene tree.
199 public Phylogeny getGeneTree() {
204 * Returns the species tree.
206 * @return species tree
208 public Phylogeny getSpeciesTree() {
209 return _species_tree;
213 * Calculates the mapping function for the external nodes of the gene tree:
214 * links (sets the field "link" of PhylogenyNode) each external
215 * PhylogenyNode of gene_tree to the external PhylogenyNode of species_tree
216 * which has the same species name.
217 * @throws SDIException
219 void linkNodesOfG() throws SDIException {
220 final Map<String, PhylogenyNode> speciestree_ext_nodes = new HashMap<String, PhylogenyNode>();
221 final TaxonomyComparisonBase tax_comp_base = determineTaxonomyComparisonBase();
222 // Put references to all external nodes of the species tree into a map.
223 // Stringyfied taxonomy is the key, node is the value.
224 for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
225 final PhylogenyNode s = iter.next();
226 final String tax_str = taxonomyToString( s, tax_comp_base );
227 if ( speciestree_ext_nodes.containsKey( tax_str ) ) {
228 throw new IllegalArgumentException( "taxonomy [" + s.getNodeData().getTaxonomy()
229 + "] is not unique in species phylogeny" );
231 speciestree_ext_nodes.put( tax_str, s );
233 // Retrieve the reference to the node with a matching stringyfied taxonomy.
234 for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
235 final PhylogenyNode g = iter.next();
236 final String tax_str = taxonomyToString( g, tax_comp_base );
237 final PhylogenyNode s = speciestree_ext_nodes.get( tax_str );
239 throw new IllegalArgumentException( "taxonomy [" + g.getNodeData().getTaxonomy()
240 + "] not present in species tree" );
247 * Calculates the mapping function for the external nodes of the gene tree:
248 * links (sets the field "link" of PhylogenyNode) each external by taxonomy
250 * PhylogenyNode of gene_tree to the external PhylogenyNode of species_tree
251 * which has the same species name.
252 * Olivier CHABROL : olivier.chabrol@univ-provence.fr
254 void linkNodesOfGByTaxonomyIdentifier() {
255 final HashMap<String, PhylogenyNode> speciestree_ext_nodes = new HashMap<String, PhylogenyNode>();
256 if ( _species_tree.getFirstExternalNode().isRoot() ) {
257 speciestree_ext_nodes.put( _species_tree.getFirstExternalNode().getNodeData().getTaxonomy().getIdentifier()
258 .getValue(), _species_tree.getFirstExternalNode() );
261 for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
262 final PhylogenyNode s = iter.next();
263 speciestree_ext_nodes.put( s.getNodeData().getTaxonomy().getIdentifier().getValue(), s );
266 for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
267 final PhylogenyNode g = iter.next();
268 final PhylogenyNode s = speciestree_ext_nodes
269 .get( g.getNodeData().getTaxonomy().getIdentifier().getValue() );
271 String message = "species [" + g.getNodeData().getTaxonomy().getIdentifier().getValue();
272 message += "] not present in species tree";
273 throw new IllegalArgumentException( message );
280 public String toString() {
281 final StringBuffer sb = new StringBuffer();
282 sb.append( getClass() );
283 sb.append( ForesterUtil.LINE_SEPARATOR );
284 sb.append( "Duplications sum : " + getDuplicationsSum() );
285 sb.append( ForesterUtil.LINE_SEPARATOR );
286 sb.append( "mapping cost L : " + computeMappingCostL() );
287 return sb.toString();
290 static String taxonomyToString( final PhylogenyNode n, final TaxonomyComparisonBase base ) {
291 final Taxonomy tax = n.getNodeData().getTaxonomy();
294 return tax.getIdentifier().getValue();
296 return tax.getTaxonomyCode();
297 case SCIENTIFIC_NAME:
298 return tax.getScientificName();
300 throw new IllegalArgumentException( "unknown comparison base for taxonomies: " + base );
304 public enum TaxonomyComparisonBase {
305 ID, CODE, SCIENTIFIC_NAME