2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: www.phylosoft.org/forester
26 package org.forester.sdi;
28 import java.util.ArrayList;
29 import java.util.HashMap;
30 import java.util.HashSet;
31 import java.util.List;
34 import java.util.SortedSet;
35 import java.util.TreeSet;
37 import org.forester.phylogeny.Phylogeny;
38 import org.forester.phylogeny.PhylogenyMethods;
39 import org.forester.phylogeny.PhylogenyNode;
40 import org.forester.phylogeny.data.Event;
41 import org.forester.phylogeny.data.Taxonomy;
42 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
43 import org.forester.util.ForesterUtil;
46 * Implements our algorithm for speciation - duplication inference (SDI). <p>
47 * The initialization is accomplished by: </p> <ul> <li>method
48 * "linkExtNodesOfG()" of class SDI: setting the links for the external nodes of
49 * the gene tree <li>"preorderReID(int)" from class Phylogeny: numbering of
50 * nodes of the species tree in preorder <li>the optional stripping of the
51 * species tree is accomplished by method "stripTree(Phylogeny,Phylogeny)" of
52 * class Phylogeny </ul> <p> The recursion part is accomplished by this class'
53 * method "geneTreePostOrderTraversal(PhylogenyNode)". <p> Requires JDK 1.5 or
56 * @see SDI#linkNodesOfG()
58 * @see Phylogeny#preorderReID(int)
61 * PhylogenyMethods#taxonomyBasedDeletionOfExternalNodes(Phylogeny,Phylogeny)
63 * @see #geneTreePostOrderTraversal(PhylogenyNode)
65 * @author Christian M. Zmasek
67 public class GSDI extends SDI {
69 private final boolean _most_parsimonious_duplication_model;
70 private final boolean _strip_gene_tree;
71 private final boolean _strip_species_tree;
72 protected int _speciation_or_duplication_events_sum;
73 protected int _speciations_sum;
74 private final List<PhylogenyNode> _stripped_gene_tree_nodes;
75 private final List<PhylogenyNode> _stripped_species_tree_nodes;
76 private final Set<PhylogenyNode> _mapped_species_tree_nodes;
77 private TaxonomyComparisonBase _tax_comp_base;
78 private final SortedSet<String> _scientific_names_mapped_to_reduced_specificity;
80 public GSDI( final Phylogeny gene_tree,
81 final Phylogeny species_tree,
82 final boolean most_parsimonious_duplication_model,
83 final boolean strip_gene_tree,
84 final boolean strip_species_tree ) throws SDIException {
85 super( gene_tree, species_tree );
86 _speciation_or_duplication_events_sum = 0;
88 _most_parsimonious_duplication_model = most_parsimonious_duplication_model;
89 _duplications_sum = 0;
90 _strip_gene_tree = strip_gene_tree;
91 _strip_species_tree = strip_species_tree;
92 _stripped_gene_tree_nodes = new ArrayList<PhylogenyNode>();
93 _stripped_species_tree_nodes = new ArrayList<PhylogenyNode>();
94 _mapped_species_tree_nodes = new HashSet<PhylogenyNode>();
95 _scientific_names_mapped_to_reduced_specificity = new TreeSet<String>();
97 PhylogenyMethods.preOrderReId( getSpeciesTree() );
98 geneTreePostOrderTraversal();
101 GSDI( final Phylogeny gene_tree, final Phylogeny species_tree, final boolean most_parsimonious_duplication_model )
102 throws SDIException {
103 this( gene_tree, species_tree, most_parsimonious_duplication_model, false, false );
106 public GSDI( final Phylogeny gene_tree,
107 final Phylogeny species_tree,
108 final boolean most_parsimonious_duplication_model,
109 final boolean strip_gene_tree,
110 final boolean strip_species_tree,
111 final int x ) throws SDIException {
112 super( gene_tree, species_tree );
113 _speciation_or_duplication_events_sum = 0;
114 _speciations_sum = 0;
115 _most_parsimonious_duplication_model = most_parsimonious_duplication_model;
116 _duplications_sum = 0;
117 _strip_gene_tree = strip_gene_tree;
118 _strip_species_tree = strip_species_tree;
119 _stripped_gene_tree_nodes = new ArrayList<PhylogenyNode>();
120 _stripped_species_tree_nodes = new ArrayList<PhylogenyNode>();
121 _mapped_species_tree_nodes = new HashSet<PhylogenyNode>();
122 _scientific_names_mapped_to_reduced_specificity = new TreeSet<String>();
125 // s is the node on the species tree g maps to.
126 private final void determineEvent( final PhylogenyNode s, final PhylogenyNode g ) {
127 boolean oyako = false;
128 if ( ( g.getChildNode1().getLink() == s ) || ( g.getChildNode2().getLink() == s ) ) {
131 if ( g.getLink().getNumberOfDescendants() == 2 ) {
133 g.getNodeData().setEvent( createDuplicationEvent() );
136 g.getNodeData().setEvent( createSpeciationEvent() );
141 final Set<PhylogenyNode> set = new HashSet<PhylogenyNode>();
142 for( PhylogenyNode n : g.getChildNode1().getAllExternalDescendants() ) {
144 while ( n.getParent() != s ) {
152 boolean multiple = false;
153 for( PhylogenyNode n : g.getChildNode2().getAllExternalDescendants() ) {
155 while ( n.getParent() != s ) {
161 if ( set.contains( n ) ) {
167 g.getNodeData().setEvent( createDuplicationEvent() );
170 if ( _most_parsimonious_duplication_model ) {
171 g.getNodeData().setEvent( createSpeciationEvent() );
174 g.getNodeData().setEvent( createSingleSpeciationOrDuplicationEvent() );
179 g.getNodeData().setEvent( createSpeciationEvent() );
185 * Traverses the subtree of PhylogenyNode g in postorder, calculating the
186 * mapping function M, and determines which nodes represent speciation
187 * events and which ones duplication events.
189 * Preconditions: Mapping M for external nodes must have been calculated and
190 * the species tree must be labeled in preorder.
194 final void geneTreePostOrderTraversal() {
195 for( final PhylogenyNodeIterator it = getGeneTree().iteratorPostorder(); it.hasNext(); ) {
196 final PhylogenyNode g = it.next();
197 if ( g.isInternal() ) {
198 PhylogenyNode s1 = g.getChildNode1().getLink();
199 PhylogenyNode s2 = g.getChildNode2().getLink();
201 if ( s1.getId() > s2.getId() ) {
209 determineEvent( s1, g );
214 private final Event createDuplicationEvent() {
215 final Event event = Event.createSingleDuplicationEvent();
220 private final Event createSingleSpeciationOrDuplicationEvent() {
221 final Event event = Event.createSingleSpeciationOrDuplicationEvent();
222 ++_speciation_or_duplication_events_sum;
226 private final Event createSpeciationEvent() {
227 final Event event = Event.createSingleSpeciationEvent();
232 public final int getSpeciationOrDuplicationEventsSum() {
233 return _speciation_or_duplication_events_sum;
236 public final int getSpeciationsSum() {
237 return _speciations_sum;
241 * This allows for linking of internal nodes of the species tree (as opposed
242 * to just external nodes, as in the method it overrides.
243 * @throws SDIException
247 final void linkNodesOfG() throws SDIException {
248 final Map<String, PhylogenyNode> species_to_node_map = new HashMap<String, PhylogenyNode>();
249 final List<PhylogenyNode> species_tree_ext_nodes = new ArrayList<PhylogenyNode>();
250 _tax_comp_base = determineTaxonomyComparisonBase( _gene_tree );
251 // Stringyfied taxonomy is the key, node is the value.
252 for( final PhylogenyNodeIterator iter = _species_tree.iteratorExternalForward(); iter.hasNext(); ) {
253 final PhylogenyNode s = iter.next();
254 species_tree_ext_nodes.add( s );
255 if ( s.getNodeData().isHasTaxonomy() ) {
256 final String tax_str = taxonomyToString( s, _tax_comp_base );
257 if ( !ForesterUtil.isEmpty( tax_str ) ) {
258 if ( species_to_node_map.containsKey( tax_str ) ) {
259 throw new SDIException( "taxonomy \"" + s + "\" is not unique in species tree" );
261 species_to_node_map.put( tax_str, s );
265 // Retrieve the reference to the node with a matching stringyfied taxonomy.
266 for( final PhylogenyNodeIterator iter = _gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
267 final PhylogenyNode g = iter.next();
268 if ( !g.getNodeData().isHasTaxonomy() ) {
269 if ( _strip_gene_tree ) {
270 _stripped_gene_tree_nodes.add( g );
273 throw new SDIException( "gene tree node \"" + g + "\" has no taxonomic data" );
277 final String tax_str = taxonomyToString( g, _tax_comp_base );
278 if ( ForesterUtil.isEmpty( tax_str ) ) {
279 if ( _strip_gene_tree ) {
280 _stripped_gene_tree_nodes.add( g );
283 throw new SDIException( "gene tree node \"" + g + "\" has no appropriate taxonomic data" );
287 PhylogenyNode s = species_to_node_map.get( tax_str );
288 if ( ( _tax_comp_base == TaxonomyComparisonBase.SCIENTIFIC_NAME ) && ( s == null )
289 && ( ForesterUtil.countChars( tax_str, ' ' ) > 1 ) ) {
290 s = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str );
293 if ( _strip_gene_tree ) {
294 _stripped_gene_tree_nodes.add( g );
297 throw new SDIException( "taxonomy \"" + g.getNodeData().getTaxonomy()
298 + "\" not present in species tree" );
303 _mapped_species_tree_nodes.add( s );
308 if ( _strip_gene_tree ) {
310 if ( getGeneTree().isEmpty() || ( getGeneTree().getNumberOfExternalNodes() < 2 ) ) {
311 throw new SDIException( "species could not be mapped between gene tree and species tree" );
314 if ( _strip_species_tree ) {
315 stripSpeciesTree( species_tree_ext_nodes );
319 private final PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
320 final String tax_str ) {
321 PhylogenyNode s = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str, " (" );
323 if ( ForesterUtil.countChars( tax_str, ' ' ) == 2 ) {
324 final String new_tax_str = tax_str.substring( 0, tax_str.lastIndexOf( ' ' ) ).trim();
325 s = species_to_node_map.get( new_tax_str );
327 addScientificNamesMappedToReducedSpecificity( tax_str, new_tax_str );
332 for( final String t : new String[] { " subspecies ", " strain ", " variety ", " varietas ", " subvariety ",
333 " form ", " subform ", " cultivar ", " section ", " subsection " } ) {
334 s = tryMapByRemovingOverlySpecificData( species_to_node_map, tax_str, t );
343 private final PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
344 final String tax_str,
345 final String term ) {
346 final int i = tax_str.indexOf( term );
348 final String new_tax_str = tax_str.substring( 0, i ).trim();
349 final PhylogenyNode s = species_to_node_map.get( new_tax_str );
351 addScientificNamesMappedToReducedSpecificity( tax_str, new_tax_str );
358 private final void addScientificNamesMappedToReducedSpecificity( final String s1, final String s2 ) {
359 _scientific_names_mapped_to_reduced_specificity.add( s1 + " -> " + s2 );
362 public final SortedSet<String> getReMappedScientificNamesFromGeneTree() {
363 return _scientific_names_mapped_to_reduced_specificity;
366 public TaxonomyComparisonBase getTaxCompBase() {
367 return _tax_comp_base;
370 private void stripSpeciesTree( final List<PhylogenyNode> species_tree_ext_nodes ) {
371 for( final PhylogenyNode s : species_tree_ext_nodes ) {
372 if ( !_mapped_species_tree_nodes.contains( s ) ) {
373 _species_tree.deleteSubtree( s, true );
374 _stripped_species_tree_nodes.add( s );
377 _species_tree.clearHashIdToNodeMap();
378 _species_tree.externalNodesHaveChanged();
381 public List<PhylogenyNode> getStrippedSpeciesTreeNodes() {
382 return _stripped_species_tree_nodes;
385 private void stripGeneTree() {
386 for( final PhylogenyNode g : _stripped_gene_tree_nodes ) {
387 _gene_tree.deleteSubtree( g, true );
389 _gene_tree.clearHashIdToNodeMap();
390 _gene_tree.externalNodesHaveChanged();
393 public Set<PhylogenyNode> getMappedExternalSpeciesTreeNodes() {
394 return _mapped_species_tree_nodes;
397 public static TaxonomyComparisonBase determineTaxonomyComparisonBase( final Phylogeny gene_tree ) {
398 int with_id_count = 0;
399 int with_code_count = 0;
400 int with_sn_count = 0;
402 for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
403 final PhylogenyNode g = iter.next();
404 if ( g.getNodeData().isHasTaxonomy() ) {
405 final Taxonomy tax = g.getNodeData().getTaxonomy();
406 if ( ( tax.getIdentifier() != null ) && !ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) {
407 if ( ++with_id_count > max ) {
411 if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
412 if ( ++with_code_count > max ) {
413 max = with_code_count;
416 if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) {
417 if ( ++with_sn_count > max ) {
424 throw new IllegalArgumentException( "gene tree has no taxonomic data" );
426 else if ( max == 1 ) {
427 throw new IllegalArgumentException( "gene tree has only one node with taxonomic data" );
429 else if ( max == with_id_count ) {
430 return SDI.TaxonomyComparisonBase.ID;
432 else if ( max == with_sn_count ) {
433 return SDI.TaxonomyComparisonBase.SCIENTIFIC_NAME;
436 return SDI.TaxonomyComparisonBase.CODE;
440 public List<PhylogenyNode> getStrippedExternalGeneTreeNodes() {
441 return _stripped_gene_tree_nodes;
445 public final String toString() {
446 final StringBuffer sb = new StringBuffer();
447 sb.append( "Most parsimonious duplication model: " + _most_parsimonious_duplication_model );
448 sb.append( ForesterUtil.getLineSeparator() );
449 sb.append( "Speciations sum : " + getSpeciationsSum() );
450 sb.append( ForesterUtil.getLineSeparator() );
451 sb.append( "Duplications sum : " + getDuplicationsSum() );
452 sb.append( ForesterUtil.getLineSeparator() );
453 if ( !_most_parsimonious_duplication_model ) {
454 sb.append( "Speciation or duplications sum : " + getSpeciationOrDuplicationEventsSum() );
455 sb.append( ForesterUtil.getLineSeparator() );
457 sb.append( "mapping cost L : " + computeMappingCostL() );
458 return sb.toString();