2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: www.phylosoft.org/forester
26 package org.forester.sdi;
28 import java.util.ArrayList;
29 import java.util.HashMap;
30 import java.util.HashSet;
31 import java.util.List;
34 import java.util.SortedSet;
36 import org.forester.phylogeny.Phylogeny;
37 import org.forester.phylogeny.PhylogenyMethods;
38 import org.forester.phylogeny.PhylogenyNode;
39 import org.forester.phylogeny.data.Event;
40 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
41 import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
42 import org.forester.util.ForesterUtil;
44 public final class GSDI {
46 private final boolean _most_parsimonious_duplication_model;
47 private final int _speciation_or_duplication_events_sum;
48 private final int _speciations_sum;
49 private final int _duplications_sum;
50 private final List<PhylogenyNode> _stripped_gene_tree_nodes;
51 private final List<PhylogenyNode> _stripped_species_tree_nodes;
52 private final Set<PhylogenyNode> _mapped_species_tree_nodes;
53 private final TaxonomyComparisonBase _tax_comp_base;
54 private final SortedSet<String> _scientific_names_mapped_to_reduced_specificity;
56 public GSDI( final Phylogeny gene_tree,
57 final Phylogeny species_tree,
58 final boolean most_parsimonious_duplication_model,
59 final boolean strip_gene_tree,
60 final boolean strip_species_tree ) throws SDIException {
61 _most_parsimonious_duplication_model = most_parsimonious_duplication_model;
62 final NodesLinkingResult nodes_linking_result = linkNodesOfG( gene_tree,
67 _stripped_gene_tree_nodes = nodes_linking_result.getStrippedGeneTreeNodes();
68 _stripped_species_tree_nodes = nodes_linking_result.getStrippedSpeciesTreeNodes();
69 _mapped_species_tree_nodes = nodes_linking_result.getMappedSpeciesTreeNodes();
70 _scientific_names_mapped_to_reduced_specificity = nodes_linking_result
71 .getScientificNamesMappedToReducedSpecificity();
72 _tax_comp_base = nodes_linking_result.getTaxCompBase();
73 PhylogenyMethods.preOrderReId( species_tree );
74 final GSDIsummaryResult gsdi_summary_result = new GSDIsummaryResult();
75 geneTreePostOrderTraversal( gene_tree, _most_parsimonious_duplication_model, gsdi_summary_result );
76 _speciation_or_duplication_events_sum = gsdi_summary_result.getSpeciationOrDuplicationEventsSum();
77 _speciations_sum = gsdi_summary_result.getSpeciationsSum();
78 _duplications_sum = gsdi_summary_result.getDuplicationsSum();
81 public int getDuplicationsSum() {
82 return _duplications_sum;
85 public Set<PhylogenyNode> getMappedExternalSpeciesTreeNodes() {
86 return _mapped_species_tree_nodes;
89 public final SortedSet<String> getReMappedScientificNamesFromGeneTree() {
90 return _scientific_names_mapped_to_reduced_specificity;
93 public final int getSpeciationOrDuplicationEventsSum() {
94 return _speciation_or_duplication_events_sum;
97 public final int getSpeciationsSum() {
98 return _speciations_sum;
101 public List<PhylogenyNode> getStrippedExternalGeneTreeNodes() {
102 return _stripped_gene_tree_nodes;
105 public List<PhylogenyNode> getStrippedSpeciesTreeNodes() {
106 return _stripped_species_tree_nodes;
109 public TaxonomyComparisonBase getTaxCompBase() {
110 return _tax_comp_base;
114 public final String toString() {
115 final StringBuffer sb = new StringBuffer();
116 sb.append( "Most parsimonious duplication model: " + _most_parsimonious_duplication_model );
117 sb.append( ForesterUtil.getLineSeparator() );
118 sb.append( "Speciations sum : " + getSpeciationsSum() );
119 sb.append( ForesterUtil.getLineSeparator() );
120 sb.append( "Duplications sum : " + getDuplicationsSum() );
121 sb.append( ForesterUtil.getLineSeparator() );
122 if ( !_most_parsimonious_duplication_model ) {
123 sb.append( "Speciation or duplications sum : " + getSpeciationOrDuplicationEventsSum() );
124 sb.append( ForesterUtil.getLineSeparator() );
126 return sb.toString();
130 * Traverses the subtree of PhylogenyNode g in postorder, calculating the
131 * mapping function M, and determines which nodes represent speciation
132 * events and which ones duplication events.
134 * Preconditions: Mapping M for external nodes must have been calculated and
135 * the species tree must be labeled in preorder.
139 final static void geneTreePostOrderTraversal( final Phylogeny gene_tree,
140 final boolean most_parsimonious_duplication_model,
141 final GSDIsummaryResult res ) {
142 for( final PhylogenyNodeIterator it = gene_tree.iteratorPostorder(); it.hasNext(); ) {
143 final PhylogenyNode g = it.next();
144 if ( g.isInternal() ) {
145 PhylogenyNode s1 = g.getChildNode1().getLink();
146 PhylogenyNode s2 = g.getChildNode2().getLink();
148 if ( s1.getId() > s2.getId() ) {
156 determineEvent( s1, g, most_parsimonious_duplication_model, res );
162 * This allows for linking of internal nodes of the species tree (as opposed
163 * to just external nodes, as in the method it overrides.
164 * If TaxonomyComparisonBase is null, it will try to determine it.
165 * @throws SDIException
168 final static NodesLinkingResult linkNodesOfG( final Phylogeny gene_tree,
169 final Phylogeny species_tree,
170 final TaxonomyComparisonBase tax_comp_base,
171 final boolean strip_gene_tree,
172 final boolean strip_species_tree ) throws SDIException {
173 final Map<String, PhylogenyNode> species_to_node_map = new HashMap<String, PhylogenyNode>();
174 final List<PhylogenyNode> species_tree_ext_nodes = new ArrayList<PhylogenyNode>();
175 final NodesLinkingResult res = new NodesLinkingResult();
176 if ( tax_comp_base == null ) {
177 res.setTaxCompBase( SDIutil.determineTaxonomyComparisonBase( gene_tree ) );
180 res.setTaxCompBase( tax_comp_base );
182 // Stringyfied taxonomy is the key, node is the value.
183 for( final PhylogenyNodeIterator iter = species_tree.iteratorExternalForward(); iter.hasNext(); ) {
184 final PhylogenyNode s = iter.next();
185 species_tree_ext_nodes.add( s );
186 if ( s.getNodeData().isHasTaxonomy() ) {
187 final String tax_str = SDIutil.taxonomyToString( s, res.getTaxCompBase() );
188 if ( !ForesterUtil.isEmpty( tax_str ) ) {
189 if ( species_to_node_map.containsKey( tax_str ) ) {
190 throw new SDIException( "taxonomy \"" + s + "\" is not unique in species tree" );
192 species_to_node_map.put( tax_str, s );
196 // Retrieve the reference to the node with a matching stringyfied taxonomy.
197 for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
198 final PhylogenyNode g = iter.next();
199 if ( !g.getNodeData().isHasTaxonomy() ) {
200 if ( strip_gene_tree ) {
201 res.getStrippedGeneTreeNodes().add( g );
204 throw new SDIException( "gene tree node \"" + g + "\" has no taxonomic data" );
208 final String tax_str = SDIutil.taxonomyToString( g, res.getTaxCompBase() );
209 if ( ForesterUtil.isEmpty( tax_str ) ) {
210 if ( strip_gene_tree ) {
211 res.getStrippedGeneTreeNodes().add( g );
214 throw new SDIException( "gene tree node \"" + g + "\" has no appropriate taxonomic data" );
218 PhylogenyNode s = species_to_node_map.get( tax_str );
219 if ( ( res.getTaxCompBase() == TaxonomyComparisonBase.SCIENTIFIC_NAME ) && ( s == null )
220 && ( ForesterUtil.countChars( tax_str, ' ' ) > 1 ) ) {
221 s = tryMapByRemovingOverlySpecificData( species_to_node_map,
223 res.getScientificNamesMappedToReducedSpecificity() );
226 if ( strip_gene_tree ) {
227 res.getStrippedGeneTreeNodes().add( g );
230 throw new SDIException( "taxonomy \"" + g.getNodeData().getTaxonomy()
231 + "\" not present in species tree" );
236 res.getMappedSpeciesTreeNodes().add( s );
241 if ( strip_gene_tree ) {
242 stripTree( gene_tree, res.getStrippedGeneTreeNodes() );
243 if ( gene_tree.isEmpty() || ( gene_tree.getNumberOfExternalNodes() < 2 ) ) {
244 throw new SDIException( "species could not be mapped between gene tree and species tree" );
247 if ( strip_species_tree ) {
248 stripSpeciesTree( species_tree, species_tree_ext_nodes, res.getMappedSpeciesTreeNodes() );
253 private final static void addScientificNamesMappedToReducedSpecificity( final String s1,
255 final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
256 scientific_names_mapped_to_reduced_specificity.add( s1 + " -> " + s2 );
260 // protected GSDI( final Phylogeny gene_tree, final Phylogeny species_tree, final boolean strip_gene_tree )
261 // throws SDIException {
262 // super( gene_tree, species_tree );
263 // _speciation_or_duplication_events_sum = -1;
264 // _speciations_sum = 0;
265 // _most_parsimonious_duplication_model = true;
266 // _duplications_sum = 0;
267 // _stripped_gene_tree_nodes = new ArrayList<PhylogenyNode>();
268 // _stripped_species_tree_nodes = new ArrayList<PhylogenyNode>();
269 // _mapped_species_tree_nodes = new HashSet<PhylogenyNode>();
270 // _scientific_names_mapped_to_reduced_specificity = new TreeSet<String>();
272 // s is the node on the species tree g maps to.
273 private final static void determineEvent( final PhylogenyNode s,
274 final PhylogenyNode g,
275 final boolean most_parsimonious_duplication_model,
276 final GSDIsummaryResult res ) {
277 boolean oyako = false;
278 if ( ( g.getChildNode1().getLink() == s ) || ( g.getChildNode2().getLink() == s ) ) {
281 if ( g.getLink().getNumberOfDescendants() == 2 ) {
283 g.getNodeData().setEvent( Event.createSingleDuplicationEvent() );
284 res.increaseDuplicationsSum();
287 g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
288 res.increaseSpeciationsSum();
293 final Set<PhylogenyNode> set = new HashSet<PhylogenyNode>();
294 for( PhylogenyNode n : g.getChildNode1().getAllExternalDescendants() ) {
296 while ( n.getParent() != s ) {
304 boolean multiple = false;
305 for( PhylogenyNode n : g.getChildNode2().getAllExternalDescendants() ) {
307 while ( n.getParent() != s ) {
313 if ( set.contains( n ) ) {
319 g.getNodeData().setEvent( Event.createSingleDuplicationEvent() );
320 res.increaseDuplicationsSum();
323 if ( most_parsimonious_duplication_model ) {
324 g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
325 res.increaseSpeciationsSum();
328 g.getNodeData().setEvent( Event.createSingleSpeciationOrDuplicationEvent() );
329 res.increaseSpeciationOrDuplicationEventsSum();
334 g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
335 res.increaseSpeciationsSum();
340 private final static List<PhylogenyNode> stripSpeciesTree( final Phylogeny species_tree,
341 final List<PhylogenyNode> species_tree_ext_nodes,
342 final Set<PhylogenyNode> keep ) {
343 final List<PhylogenyNode> stripped_species_tree_nodes = new ArrayList<PhylogenyNode>();
344 for( final PhylogenyNode s : species_tree_ext_nodes ) {
345 if ( !keep.contains( s ) ) {
346 species_tree.deleteSubtree( s, true );
347 stripped_species_tree_nodes.add( s );
350 species_tree.clearHashIdToNodeMap();
351 species_tree.externalNodesHaveChanged();
352 return stripped_species_tree_nodes;
355 private final static void stripTree( final Phylogeny phy, final List<PhylogenyNode> strip_nodes ) {
356 for( final PhylogenyNode g : strip_nodes ) {
357 phy.deleteSubtree( g, true );
359 phy.clearHashIdToNodeMap();
360 phy.externalNodesHaveChanged();
363 private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
364 final String tax_str,
365 final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
366 PhylogenyNode s = tryMapByRemovingOverlySpecificData( species_to_node_map,
369 scientific_names_mapped_to_reduced_specificity );
371 if ( ForesterUtil.countChars( tax_str, ' ' ) == 2 ) {
372 final String new_tax_str = tax_str.substring( 0, tax_str.lastIndexOf( ' ' ) ).trim();
373 s = species_to_node_map.get( new_tax_str );
375 addScientificNamesMappedToReducedSpecificity( tax_str,
377 scientific_names_mapped_to_reduced_specificity );
382 for( final String t : new String[] { " subspecies ", " strain ", " variety ", " varietas ", " subvariety ",
383 " form ", " subform ", " cultivar ", " section ", " subsection " } ) {
384 s = tryMapByRemovingOverlySpecificData( species_to_node_map,
387 scientific_names_mapped_to_reduced_specificity );
396 private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
397 final String tax_str,
399 final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
400 final int i = tax_str.indexOf( term );
402 final String new_tax_str = tax_str.substring( 0, i ).trim();
403 final PhylogenyNode s = species_to_node_map.get( new_tax_str );
405 addScientificNamesMappedToReducedSpecificity( tax_str,
407 scientific_names_mapped_to_reduced_specificity );