2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: www.phylosoft.org/forester
26 package org.forester.sdi;
28 import java.util.ArrayList;
29 import java.util.HashMap;
30 import java.util.HashSet;
31 import java.util.List;
35 import org.forester.phylogeny.Phylogeny;
36 import org.forester.phylogeny.PhylogenyNode;
37 import org.forester.phylogeny.data.Event;
38 import org.forester.phylogeny.data.Taxonomy;
39 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
40 import org.forester.util.ForesterUtil;
43 * Implements our algorithm for speciation - duplication inference (SDI). <p>
44 * The initialization is accomplished by: </p> <ul> <li>method
45 * "linkExtNodesOfG()" of class SDI: setting the links for the external nodes of
46 * the gene tree <li>"preorderReID(int)" from class Phylogeny: numbering of
47 * nodes of the species tree in preorder <li>the optional stripping of the
48 * species tree is accomplished by method "stripTree(Phylogeny,Phylogeny)" of
49 * class Phylogeny </ul> <p> The recursion part is accomplished by this class'
50 * method "geneTreePostOrderTraversal(PhylogenyNode)". <p> Requires JDK 1.5 or
53 * @see SDI#linkNodesOfG()
55 * @see Phylogeny#preorderReID(int)
58 * PhylogenyMethods#taxonomyBasedDeletionOfExternalNodes(Phylogeny,Phylogeny)
60 * @see #geneTreePostOrderTraversal(PhylogenyNode)
62 * @author Christian M. Zmasek
64 public final class GSDI extends SDI {
66 private final boolean _most_parsimonious_duplication_model;
67 private final boolean _strip_gene_tree;
68 private final boolean _strip_species_tree;
69 private int _speciation_or_duplication_events_sum;
70 private int _speciations_sum;
71 private final List<PhylogenyNode> _stripped_gene_tree_nodes;
72 private final List<PhylogenyNode> _stripped_species_tree_nodes;
73 private final Set<PhylogenyNode> _mapped_species_tree_nodes;
75 public GSDI( final Phylogeny gene_tree,
76 final Phylogeny species_tree,
77 final boolean most_parsimonious_duplication_model,
78 final boolean strip_gene_tree,
79 final boolean strip_species_tree ) throws SDIException {
80 super( gene_tree, species_tree );
81 _speciation_or_duplication_events_sum = 0;
83 _most_parsimonious_duplication_model = most_parsimonious_duplication_model;
84 _duplications_sum = 0;
85 _strip_gene_tree = strip_gene_tree;
86 _strip_species_tree = strip_species_tree;
87 _stripped_gene_tree_nodes = new ArrayList<PhylogenyNode>();
88 _stripped_species_tree_nodes = new ArrayList<PhylogenyNode>();
89 _mapped_species_tree_nodes = new HashSet<PhylogenyNode>();
90 getSpeciesTree().preOrderReId();
92 geneTreePostOrderTraversal();
95 GSDI( final Phylogeny gene_tree, final Phylogeny species_tree, final boolean most_parsimonious_duplication_model )
97 this( gene_tree, species_tree, most_parsimonious_duplication_model, false, false );
100 // s is the node on the species tree g maps to.
101 private final void determineEvent( final PhylogenyNode s, final PhylogenyNode g ) {
102 boolean oyako = false;
103 if ( ( g.getChildNode1().getLink() == s ) || ( g.getChildNode2().getLink() == s ) ) {
106 if ( g.getLink().getNumberOfDescendants() == 2 ) {
108 g.getNodeData().setEvent( createDuplicationEvent() );
111 g.getNodeData().setEvent( createSpeciationEvent() );
116 final Set<PhylogenyNode> set = new HashSet<PhylogenyNode>();
117 for( PhylogenyNode n : g.getChildNode1().getAllExternalDescendants() ) {
119 while ( n.getParent() != s ) {
127 boolean multiple = false;
128 for( PhylogenyNode n : g.getChildNode2().getAllExternalDescendants() ) {
130 while ( n.getParent() != s ) {
136 if ( set.contains( n ) ) {
142 g.getNodeData().setEvent( createDuplicationEvent() );
145 g.getNodeData().setEvent( createSingleSpeciationOrDuplicationEvent() );
149 g.getNodeData().setEvent( createSpeciationEvent() );
155 * Traverses the subtree of PhylogenyNode g in postorder, calculating the
156 * mapping function M, and determines which nodes represent speciation
157 * events and which ones duplication events.
159 * Preconditions: Mapping M for external nodes must have been calculated and
160 * the species tree must be labeled in preorder.
164 final void geneTreePostOrderTraversal() {
165 for( final PhylogenyNodeIterator it = getGeneTree().iteratorPostorder(); it.hasNext(); ) {
166 final PhylogenyNode g = it.next();
167 if ( g.isInternal() ) {
168 PhylogenyNode s1 = g.getChildNode1().getLink();
169 PhylogenyNode s2 = g.getChildNode2().getLink();
171 if ( s1.getId() > s2.getId() ) {
179 determineEvent( s1, g );
184 private final Event createDuplicationEvent() {
185 final Event event = Event.createSingleDuplicationEvent();
190 private final Event createSingleSpeciationOrDuplicationEvent() {
191 final Event event = Event.createSingleSpeciationOrDuplicationEvent();
192 ++_speciation_or_duplication_events_sum;
196 private final Event createSpeciationEvent() {
197 final Event event = Event.createSingleSpeciationEvent();
202 public final int getSpeciationOrDuplicationEventsSum() {
203 return _speciation_or_duplication_events_sum;
206 public final int getSpeciationsSum() {
207 return _speciations_sum;
211 * This allows for linking of internal nodes of the species tree (as opposed
212 * to just external nodes, as in the method it overrides.
213 * @throws SDIException
217 final void linkNodesOfG() throws SDIException {
218 final Map<String, PhylogenyNode> species_to_node_map = new HashMap<String, PhylogenyNode>();
219 final List<PhylogenyNode> species_tree_ext_nodes = new ArrayList<PhylogenyNode>();
220 final TaxonomyComparisonBase tax_comp_base = determineTaxonomyComparisonBase( _gene_tree );
221 // System.out.println( "comp base is: " + tax_comp_base );
222 // Stringyfied taxonomy is the key, node is the value.
223 for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
224 final PhylogenyNode s = iter.next();
225 species_tree_ext_nodes.add( s );
226 final String tax_str = taxonomyToString( s, tax_comp_base );
227 if ( !ForesterUtil.isEmpty( tax_str ) ) {
228 if ( species_to_node_map.containsKey( tax_str ) ) {
229 throw new SDIException( "taxonomy \"" + s + "\" is not unique in species tree" );
231 species_to_node_map.put( tax_str, s );
234 // Retrieve the reference to the node with a matching stringyfied taxonomy.
235 for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
236 final PhylogenyNode g = iter.next();
237 if ( !g.getNodeData().isHasTaxonomy() ) {
238 if ( _strip_gene_tree ) {
239 _stripped_gene_tree_nodes.add( g );
242 throw new SDIException( "gene tree node \"" + g + "\" has no taxonomic data" );
246 final String tax_str = taxonomyToString( g, tax_comp_base );
247 if ( ForesterUtil.isEmpty( tax_str ) ) {
248 if ( _strip_gene_tree ) {
249 _stripped_gene_tree_nodes.add( g );
252 throw new SDIException( "gene tree node \"" + g + "\" has no appropriate taxonomic data" );
256 final PhylogenyNode s = species_to_node_map.get( tax_str );
258 if ( _strip_gene_tree ) {
259 _stripped_gene_tree_nodes.add( g );
262 throw new SDIException( "taxonomy \"" + g.getNodeData().getTaxonomy()
263 + "\" not present in species tree" );
268 _mapped_species_tree_nodes.add( s );
269 // System.out.println( "setting link of " + g + " to " + s );
274 if ( _strip_gene_tree ) {
275 for( final PhylogenyNode g : _stripped_gene_tree_nodes ) {
276 _gene_tree.deleteSubtree( g, true );
279 if ( _strip_species_tree ) {
280 for( final PhylogenyNode s : species_tree_ext_nodes ) {
281 if ( !_mapped_species_tree_nodes.contains( s ) ) {
282 _species_tree.deleteSubtree( s, true );
288 public Set<PhylogenyNode> getMappedExternalSpeciesTreeNodes() {
289 return _mapped_species_tree_nodes;
292 public static TaxonomyComparisonBase determineTaxonomyComparisonBase( final Phylogeny gene_tree ) {
293 int with_id_count = 0;
294 int with_code_count = 0;
295 int with_sn_count = 0;
297 for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
298 final PhylogenyNode g = iter.next();
299 if ( g.getNodeData().isHasTaxonomy() ) {
300 final Taxonomy tax = g.getNodeData().getTaxonomy();
301 if ( ( tax.getIdentifier() != null ) && !ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) {
302 if ( ++with_id_count > max ) {
306 if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
307 if ( ++with_code_count > max ) {
308 max = with_code_count;
311 if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) {
312 if ( ++with_sn_count > max ) {
319 throw new IllegalArgumentException( "gene tree has no taxonomic data" );
321 else if ( max == 1 ) {
322 throw new IllegalArgumentException( "gene tree has only one node with taxonomic data" );
324 else if ( max == with_sn_count ) {
325 return SDI.TaxonomyComparisonBase.SCIENTIFIC_NAME;
327 else if ( max == with_id_count ) {
328 return SDI.TaxonomyComparisonBase.ID;
331 return SDI.TaxonomyComparisonBase.CODE;
335 public List<PhylogenyNode> getStrippedExternalGeneTreeNodes() {
336 return _stripped_gene_tree_nodes;
340 public final String toString() {
341 final StringBuffer sb = new StringBuffer();
342 sb.append( "Most parsimonious duplication model: " + _most_parsimonious_duplication_model );
343 sb.append( ForesterUtil.getLineSeparator() );
344 sb.append( "Speciations sum : " + getSpeciationsSum() );
345 sb.append( ForesterUtil.getLineSeparator() );
346 sb.append( "Duplications sum : " + getDuplicationsSum() );
347 sb.append( ForesterUtil.getLineSeparator() );
348 if ( !_most_parsimonious_duplication_model ) {
349 sb.append( "Speciation or duplications sum : " + getSpeciationOrDuplicationEventsSum() );
350 sb.append( ForesterUtil.getLineSeparator() );
352 sb.append( "mapping cost L : " + computeMappingCostL() );
353 return sb.toString();