2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // Copyright (C) 2000-2001 Washington University School of Medicine
8 // and Howard Hughes Medical Institute
11 // This library is free software; you can redistribute it and/or
12 // modify it under the terms of the GNU Lesser General Public
13 // License as published by the Free Software Foundation; either
14 // version 2.1 of the License, or (at your option) any later version.
16 // This library is distributed in the hope that it will be useful,
17 // but WITHOUT ANY WARRANTY; without even the implied warranty of
18 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 // Lesser General Public License for more details.
21 // You should have received a copy of the GNU Lesser General Public
22 // License along with this library; if not, write to the Free Software
23 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
25 // Contact: phylosoft @ gmail . com
26 // WWW: www.phylosoft.org
28 package org.forester.sdi;
30 import java.util.HashMap;
33 import org.forester.phylogeny.Phylogeny;
34 import org.forester.phylogeny.PhylogenyNode;
35 import org.forester.phylogeny.data.Identifier;
36 import org.forester.phylogeny.data.Taxonomy;
37 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
38 import org.forester.util.ForesterUtil;
40 public abstract class SDI {
42 final Phylogeny _gene_tree;
43 final Phylogeny _species_tree;
44 int _duplications_sum; // Sum of duplications.
45 int _mapping_cost; // Mapping cost "L".
48 * Constructor which sets the gene tree and the species tree to be compared.
49 * species_tree is the species tree to which the gene tree gene_tree will be
51 * Infers for each PhylogenyNode of gene_tree whether
52 * it represents a speciation or duplication event by calculating and
53 * interpreting the mapping function M. The most parsimonious sequence of
54 * speciation and duplication events is assumed.
55 * The mapping cost L can be
56 * calculated with method "computeMappingCost()".
61 * <li>Both Trees must be rooted
62 * <li>Both Trees must have species names in the species name fields of all
63 * their external nodes
67 * reference to a rooted binary gene Phylogeny to which assign
68 * duplication vs speciation, must have species names in the
69 * species name fields for all external nodes
71 * reference to a rooted binary species Phylogeny which might get
72 * stripped in the process, must have species names in the
73 * species name fields for all external nodes
75 public SDI( final Phylogeny gene_tree, final Phylogeny species_tree ) {
76 if ( species_tree.isEmpty() || gene_tree.isEmpty() ) {
77 throw new IllegalArgumentException( "attempt to infer duplications using empty tree(s)" );
79 if ( !gene_tree.isRooted() ) {
80 throw new IllegalArgumentException( "attempt to infer duplications on unrooted gene tree" );
82 if ( !species_tree.isRooted() ) {
83 throw new IllegalArgumentException( "attempt to infer duplications on unrooted species tree" );
85 _gene_tree = gene_tree;
86 _species_tree = species_tree;
87 _duplications_sum = 0;
91 // Helper method for "computeMappingCost()".
92 private void computeMappingCostHelper( final PhylogenyNode g ) {
93 if ( !g.isExternal() ) {
94 computeMappingCostHelper( g.getChildNode1() );
95 computeMappingCostHelper( g.getChildNode2() );
96 if ( ( g.getLink() != g.getChildNode1().getLink() ) && ( g.getLink() != g.getChildNode2().getLink() ) ) {
97 _mapping_cost += ( ( g.getChildNode1().getLink().getId() + g.getChildNode2().getLink().getId() )
98 - ( 2 * g.getLink().getId() ) - 2 );
100 else if ( ( g.getLink() != g.getChildNode1().getLink() ) && ( g.getLink() == g.getChildNode2().getLink() ) ) {
101 _mapping_cost += ( ( g.getChildNode1().getLink().getId() - g.getLink().getId() ) + 1 );
103 else if ( ( g.getLink() == g.getChildNode1().getLink() ) && ( g.getLink() != g.getChildNode2().getLink() ) ) {
104 _mapping_cost += ( ( g.getChildNode2().getLink().getId() - g.getLink().getId() ) + 1 );
113 * Computes the cost of mapping the gene tree gene_tree onto the species
114 * tree species_tree. Before this method can be called, the mapping has to
115 * be calculated with method "infer(boolean)".
117 * Reference. Zhang, L. (1997) On a Mirkin-Muchnik-Smith Conjecture for
118 * Comparing Molecular Phylogenies. Journal of Computational Biology 4
121 * @return the mapping cost "L"
123 public int computeMappingCostL() {
124 _species_tree.levelOrderReID();
126 computeMappingCostHelper( _gene_tree.getRoot() );
127 return _mapping_cost;
130 private TaxonomyComparisonBase determineTaxonomyComparisonBase() {
131 TaxonomyComparisonBase base = null;
132 boolean all_have_id = true;
133 boolean all_have_code = true;
134 boolean all_have_sn = true;
135 for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
136 final PhylogenyNode n = iter.next();
137 if ( n.getNodeData().isHasTaxonomy() ) {
138 final Taxonomy tax = n.getNodeData().getTaxonomy();
139 if ( ( tax.getIdentifier() == null ) || ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) {
142 if ( ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
143 all_have_code = false;
145 if ( ForesterUtil.isEmpty( tax.getScientificName() ) ) {
150 throw new IllegalArgumentException( "species tree node [" + n + "] has no taxonomic data" );
153 for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
154 final PhylogenyNode n = iter.next();
155 if ( n.getNodeData().isHasTaxonomy() ) {
156 final Taxonomy tax = n.getNodeData().getTaxonomy();
157 if ( ( tax.getIdentifier() == null ) || ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) {
160 if ( ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
161 all_have_code = false;
163 if ( ForesterUtil.isEmpty( tax.getScientificName() ) ) {
168 throw new IllegalArgumentException( "gene tree node [" + n + "] has no taxonomic data" );
172 base = TaxonomyComparisonBase.ID;
174 else if ( all_have_code ) {
175 base = TaxonomyComparisonBase.CODE;
177 else if ( all_have_sn ) {
178 base = TaxonomyComparisonBase.SCIENTIFIC_NAME;
181 throw new IllegalArgumentException( "gene tree and species tree have incomparable taxonomies" );
187 * Returns the number of duplications.
189 * @return number of duplications
191 public int getDuplicationsSum() {
192 return _duplications_sum;
196 * Returns the gene tree.
200 public Phylogeny getGeneTree() {
205 * Returns the species tree.
207 * @return species tree
209 public Phylogeny getSpeciesTree() {
210 return _species_tree;
214 * Calculates the mapping function for the external nodes of the gene tree:
215 * links (sets the field "link" of PhylogenyNode) each external
216 * PhylogenyNode of gene_tree to the external PhylogenyNode of species_tree
217 * which has the same species name.
218 * @throws SDIException
220 void linkNodesOfG() throws SDIException {
221 final Map<String, PhylogenyNode> speciestree_ext_nodes = new HashMap<String, PhylogenyNode>();
222 final TaxonomyComparisonBase tax_comp_base = determineTaxonomyComparisonBase();
223 // Put references to all external nodes of the species tree into a map.
224 // Stringyfied taxonomy is the key, node is the value.
225 for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
226 final PhylogenyNode s = iter.next();
227 final String tax_str = taxonomyToString( s, tax_comp_base );
228 if ( speciestree_ext_nodes.containsKey( tax_str ) ) {
229 throw new IllegalArgumentException( "taxonomy [" + s.getNodeData().getTaxonomy()
230 + "] is not unique in species phylogeny" );
232 speciestree_ext_nodes.put( tax_str, s );
234 // Retrieve the reference to the node with a matching stringyfied taxonomy.
235 for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
236 final PhylogenyNode g = iter.next();
237 final String tax_str = taxonomyToString( g, tax_comp_base );
238 final PhylogenyNode s = speciestree_ext_nodes.get( tax_str );
240 throw new IllegalArgumentException( "taxonomy [" + g.getNodeData().getTaxonomy()
241 + "] not present in species tree" );
248 * Calculates the mapping function for the external nodes of the gene tree:
249 * links (sets the field "link" of PhylogenyNode) each external by taxonomy
251 * PhylogenyNode of gene_tree to the external PhylogenyNode of species_tree
252 * which has the same species name.
253 * Olivier CHABROL : olivier.chabrol@univ-provence.fr
255 void linkNodesOfGByTaxonomyIdentifier() {
256 final HashMap<String, PhylogenyNode> speciestree_ext_nodes = new HashMap<String, PhylogenyNode>();
257 if ( _species_tree.getFirstExternalNode().isRoot() ) {
258 speciestree_ext_nodes.put( _species_tree.getFirstExternalNode().getNodeData().getTaxonomy().getIdentifier()
259 .getValue(), _species_tree.getFirstExternalNode() );
262 for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
263 final PhylogenyNode s = iter.next();
264 speciestree_ext_nodes.put( s.getNodeData().getTaxonomy().getIdentifier().getValue(), s );
267 for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
268 final PhylogenyNode g = iter.next();
269 final PhylogenyNode s = speciestree_ext_nodes
270 .get( g.getNodeData().getTaxonomy().getIdentifier().getValue() );
272 String message = "species [" + g.getNodeData().getTaxonomy().getIdentifier().getValue();
273 message += "] not present in species tree";
274 throw new IllegalArgumentException( message );
281 public String toString() {
282 final StringBuffer sb = new StringBuffer();
283 sb.append( getClass() );
284 sb.append( ForesterUtil.LINE_SEPARATOR );
285 sb.append( "Duplications sum : " + getDuplicationsSum() );
286 sb.append( ForesterUtil.LINE_SEPARATOR );
287 sb.append( "mapping cost L : " + computeMappingCostL() );
288 return sb.toString();
291 static String taxonomyToString( final PhylogenyNode n, final TaxonomyComparisonBase base ) {
294 final Identifier id = n.getNodeData().getTaxonomy().getIdentifier();
298 return id.getValuePlusProvider();
300 return n.getNodeData().getTaxonomy().getTaxonomyCode();
301 case SCIENTIFIC_NAME:
302 return n.getNodeData().getTaxonomy().getScientificName();
304 throw new IllegalArgumentException( "unknown comparison base for taxonomies: " + base );
308 public enum TaxonomyComparisonBase {
312 public String toString() {
313 return "taxonomy id";
319 public String toString() {
320 return "taxonomy code/mnemonic";
326 public String toString() {
327 return "scientific name";