2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
26 package org.forester.sdi;
28 import java.util.ArrayList;
29 import java.util.HashMap;
30 import java.util.HashSet;
31 import java.util.List;
34 import java.util.SortedSet;
36 import org.forester.phylogeny.Phylogeny;
37 import org.forester.phylogeny.PhylogenyMethods;
38 import org.forester.phylogeny.PhylogenyNode;
39 import org.forester.phylogeny.data.Event;
40 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
41 import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
42 import org.forester.util.ForesterUtil;
44 public final class GSDI implements GSDII {
46 private final boolean _most_parsimonious_duplication_model;
47 private final int _speciation_or_duplication_events_sum;
48 private final int _speciations_sum;
49 private final int _duplications_sum;
50 private final List<PhylogenyNode> _stripped_gene_tree_nodes;
51 private final List<PhylogenyNode> _stripped_species_tree_nodes;
52 private final Set<PhylogenyNode> _mapped_species_tree_nodes;
53 private final TaxonomyComparisonBase _tax_comp_base;
54 private final SortedSet<String> _scientific_names_mapped_to_reduced_specificity;
56 public GSDI( final Phylogeny gene_tree,
57 final Phylogeny species_tree,
58 final boolean most_parsimonious_duplication_model,
59 final boolean strip_gene_tree,
60 final boolean strip_species_tree ) throws SDIException {
61 this( gene_tree, species_tree, most_parsimonious_duplication_model, strip_gene_tree, strip_species_tree, true );
64 public GSDI( final Phylogeny gene_tree,
65 final Phylogeny species_tree,
66 final boolean most_parsimonious_duplication_model,
67 final boolean strip_gene_tree,
68 final boolean strip_species_tree,
69 final boolean transfer_taxonomy ) throws SDIException {
70 _most_parsimonious_duplication_model = most_parsimonious_duplication_model;
71 if ( gene_tree.getRoot().getNumberOfDescendants() == 3 ) {
72 gene_tree.reRoot( gene_tree.getRoot().getChildNode( 2 ) );
74 final NodesLinkingResult nodes_linking_result = linkNodesOfG( gene_tree,
78 _stripped_gene_tree_nodes = nodes_linking_result.getStrippedGeneTreeNodes();
79 _stripped_species_tree_nodes = nodes_linking_result.getStrippedSpeciesTreeNodes();
80 _mapped_species_tree_nodes = nodes_linking_result.getMappedSpeciesTreeNodes();
81 _scientific_names_mapped_to_reduced_specificity = nodes_linking_result
82 .getScientificNamesMappedToReducedSpecificity();
83 _tax_comp_base = nodes_linking_result.getTaxCompBase();
84 PhylogenyMethods.preOrderReId( species_tree );
85 final GSDIsummaryResult gsdi_summary_result = geneTreePostOrderTraversal( gene_tree,
86 _most_parsimonious_duplication_model,
88 _speciation_or_duplication_events_sum = gsdi_summary_result.getSpeciationOrDuplicationEventsSum();
89 _speciations_sum = gsdi_summary_result.getSpeciationsSum();
90 _duplications_sum = gsdi_summary_result.getDuplicationsSum();
93 public int getDuplicationsSum() {
94 return _duplications_sum;
98 public Set<PhylogenyNode> getMappedExternalSpeciesTreeNodes() {
99 return _mapped_species_tree_nodes;
103 public final SortedSet<String> getReMappedScientificNamesFromGeneTree() {
104 return _scientific_names_mapped_to_reduced_specificity;
107 public final int getSpeciationOrDuplicationEventsSum() {
108 return _speciation_or_duplication_events_sum;
112 public final int getSpeciationsSum() {
113 return _speciations_sum;
117 public List<PhylogenyNode> getStrippedExternalGeneTreeNodes() {
118 return _stripped_gene_tree_nodes;
122 public List<PhylogenyNode> getStrippedSpeciesTreeNodes() {
123 return _stripped_species_tree_nodes;
127 public TaxonomyComparisonBase getTaxCompBase() {
128 return _tax_comp_base;
132 public final String toString() {
133 final StringBuffer sb = new StringBuffer();
134 sb.append( "Most parsimonious duplication model: " + _most_parsimonious_duplication_model );
135 sb.append( ForesterUtil.getLineSeparator() );
136 sb.append( "Speciations sum : " + getSpeciationsSum() );
137 sb.append( ForesterUtil.getLineSeparator() );
138 sb.append( "Duplications sum : " + getDuplicationsSum() );
139 sb.append( ForesterUtil.getLineSeparator() );
140 if ( !_most_parsimonious_duplication_model ) {
141 sb.append( "Speciation or duplications sum : " + getSpeciationOrDuplicationEventsSum() );
142 sb.append( ForesterUtil.getLineSeparator() );
144 return sb.toString();
148 * Traverses the subtree of PhylogenyNode g in postorder, calculating the
149 * mapping function M, and determines which nodes represent speciation
150 * events and which ones duplication events.
152 * Preconditions: Mapping M for external nodes must have been calculated and
153 * the species tree must be labeled in preorder.
155 * @param transfer_taxonomy
157 * @throws SDIException
160 final static GSDIsummaryResult geneTreePostOrderTraversal( final Phylogeny gene_tree,
161 final boolean most_parsimonious_duplication_model,
162 final boolean transfer_taxonomy ) throws SDIException {
163 final GSDIsummaryResult res = new GSDIsummaryResult();
164 for( final PhylogenyNodeIterator it = gene_tree.iteratorPostorder(); it.hasNext(); ) {
165 final PhylogenyNode g = it.next();
166 if ( g.isInternal() ) {
167 if ( g.getNumberOfDescendants() != 2 ) {
168 throw new SDIException( "gene tree contains internal node with " + g.getNumberOfDescendants()
171 PhylogenyNode s1 = g.getChildNode1().getLink();
172 PhylogenyNode s2 = g.getChildNode2().getLink();
174 if ( s1.getId() > s2.getId() ) {
182 determineEvent( s1, g, most_parsimonious_duplication_model, res );
184 if ( transfer_taxonomy ) {
185 transferTaxonomy( g );
191 final static GSDIsummaryResult geneTreePostOrderTraversal( final Phylogeny gene_tree,
192 final boolean most_parsimonious_duplication_model,
193 final int min_duplications ) throws SDIException {
194 final GSDIsummaryResult res = new GSDIsummaryResult();
195 for( final PhylogenyNodeIterator it = gene_tree.iteratorPostorder(); it.hasNext(); ) {
196 final PhylogenyNode g = it.next();
197 if ( g.isInternal() ) {
198 if ( g.getNumberOfDescendants() != 2 ) {
199 throw new SDIException( "gene tree contains internal node with " + g.getNumberOfDescendants()
202 PhylogenyNode s1 = g.getChildNode1().getLink();
203 PhylogenyNode s2 = g.getChildNode2().getLink();
205 if ( s1.getId() > s2.getId() ) {
213 determineEvent( s1, g, most_parsimonious_duplication_model, res );
214 if ( res.getDuplicationsSum() > min_duplications ) {
222 final static NodesLinkingResult linkNodesOfG( final Phylogeny gene_tree,
223 final Phylogeny species_tree,
224 final boolean strip_gene_tree,
225 final boolean strip_species_tree ) throws SDIException {
226 final TaxonomyComparisonBase tax_comp_base = SDIutil.determineTaxonomyComparisonBase( gene_tree );
227 if ( tax_comp_base == null ) {
228 throw new RuntimeException( "failed to establish taxonomy linking base (taxonomy linking base is null)" );
230 return linkNodesOfG( gene_tree, species_tree, tax_comp_base, strip_gene_tree, strip_species_tree );
234 * This allows for linking of internal nodes of the species tree (as opposed
235 * to just external nodes, as in the method it overrides.
236 * If TaxonomyComparisonBase is null, it will try to determine it.
237 * @throws SDIException
240 final static NodesLinkingResult linkNodesOfG( final Phylogeny gene_tree,
241 final Phylogeny species_tree,
242 final TaxonomyComparisonBase tax_comp_base,
243 final boolean strip_gene_tree,
244 final boolean strip_species_tree ) throws SDIException {
245 if ( tax_comp_base == null ) {
246 throw new IllegalArgumentException( "taxonomy linking base is null" );
248 final Map<String, PhylogenyNode> species_to_node_map = new HashMap<String, PhylogenyNode>();
249 final List<PhylogenyNode> species_tree_ext_nodes = new ArrayList<PhylogenyNode>();
250 final NodesLinkingResult res = new NodesLinkingResult();
251 res.setTaxCompBase( tax_comp_base );
252 // Stringyfied taxonomy is the key, node is the value.
253 for( final PhylogenyNodeIterator iter = species_tree.iteratorExternalForward(); iter.hasNext(); ) {
254 final PhylogenyNode s = iter.next();
255 species_tree_ext_nodes.add( s );
256 if ( s.getNodeData().isHasTaxonomy() ) {
257 final String tax_str = SDIutil.taxonomyToString( s, res.getTaxCompBase() );
258 if ( !ForesterUtil.isEmpty( tax_str ) ) {
259 if ( species_to_node_map.containsKey( tax_str ) ) {
260 throw new SDIException( "taxonomy \"" + tax_str + "\" is not unique in species tree (using "
261 + res.getTaxCompBase() + " for linking to gene tree)" );
263 species_to_node_map.put( tax_str, s );
267 // Retrieve the reference to the node with a matching stringyfied taxonomy.
268 for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
269 final PhylogenyNode g = iter.next();
270 if ( !g.getNodeData().isHasTaxonomy() ) {
271 if ( strip_gene_tree ) {
272 res.getStrippedGeneTreeNodes().add( g );
275 throw new SDIException( "gene tree node \"" + g + "\" has no taxonomic data" );
279 final String tax_str = SDIutil.taxonomyToString( g, res.getTaxCompBase() );
280 if ( ForesterUtil.isEmpty( tax_str ) ) {
281 if ( strip_gene_tree ) {
282 res.getStrippedGeneTreeNodes().add( g );
285 throw new SDIException( "gene tree node \"" + g + "\" has no appropriate taxonomic data" );
289 PhylogenyNode s = species_to_node_map.get( tax_str );
290 if ( ( res.getTaxCompBase() == TaxonomyComparisonBase.SCIENTIFIC_NAME ) && ( s == null )
291 && ( ForesterUtil.countChars( tax_str, ' ' ) > 1 ) ) {
292 s = tryMapByRemovingOverlySpecificData( species_to_node_map,
294 res.getScientificNamesMappedToReducedSpecificity() );
297 if ( strip_gene_tree ) {
298 res.getStrippedGeneTreeNodes().add( g );
301 throw new SDIException( "taxonomy \"" + g.getNodeData().getTaxonomy()
302 + "\" not present in species tree" );
307 res.getMappedSpeciesTreeNodes().add( s );
312 if ( strip_gene_tree ) {
313 stripTree( gene_tree, res.getStrippedGeneTreeNodes() );
314 if ( gene_tree.isEmpty() || ( gene_tree.getNumberOfExternalNodes() < 2 ) ) {
315 throw new SDIException( "species could not be mapped between gene tree and species tree (based on "
316 + res.getTaxCompBase() + ")" );
319 if ( strip_species_tree ) {
320 stripSpeciesTree( species_tree, species_tree_ext_nodes, res );
325 static final void transferTaxonomy( final PhylogenyNode g ) {
327 throw new IllegalArgumentException( "gene tree node is null" );
329 final PhylogenyNode s = g.getLink();
331 throw new IllegalArgumentException( "mapped species tree node is null" );
333 if ( s.getNodeData().isHasTaxonomy() ) {
334 g.getNodeData().setTaxonomy( s.getNodeData().getTaxonomy() );
335 if ( g.isInternal() ) {
336 if ( g.getChildNode1().isInternal() && g.getChildNode1().getNodeData().isHasTaxonomy()
337 && ( g.getChildNode1().getNodeData().getTaxonomy() == s.getNodeData().getTaxonomy() ) ) {
338 g.getChildNode1().getNodeData().setTaxonomy( null );
340 if ( g.getChildNode2().isInternal() && g.getChildNode2().getNodeData().isHasTaxonomy()
341 && ( g.getChildNode2().getNodeData().getTaxonomy() == s.getNodeData().getTaxonomy() ) ) {
342 g.getChildNode2().getNodeData().setTaxonomy( null );
346 else if ( ForesterUtil.isEmpty( g.getName() ) && !ForesterUtil.isEmpty( s.getName() ) ) {
347 g.setName( s.getName() );
348 if ( g.isInternal() ) {
349 if ( g.getChildNode1().isInternal() && ( g.getChildNode1().getName() == s.getName() ) ) {
350 g.getChildNode1().setName( "" );
352 if ( g.getChildNode2().isInternal() && ( g.getChildNode2().getName() == s.getName() ) ) {
353 g.getChildNode2().setName( "" );
359 private final static void addScientificNamesMappedToReducedSpecificity( final String s1,
361 final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
362 scientific_names_mapped_to_reduced_specificity.add( s1 + " -> " + s2 );
365 private final static void determineEvent( final PhylogenyNode s,
366 final PhylogenyNode g,
367 final boolean most_parsimonious_duplication_model,
368 final GSDIsummaryResult res ) {
369 boolean oyako = false;
370 if ( ( g.getChildNode1().getLink() == s ) || ( g.getChildNode2().getLink() == s ) ) {
373 if ( g.getLink().getNumberOfDescendants() == 2 ) {
375 g.getNodeData().setEvent( Event.createSingleDuplicationEvent() );
376 res.increaseDuplicationsSum();
379 g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
380 res.increaseSpeciationsSum();
385 final Set<PhylogenyNode> set = new HashSet<PhylogenyNode>();
386 for( PhylogenyNode n : g.getChildNode1().getAllExternalDescendants() ) {
388 while ( ( n.getParent() != s ) && ( n.getParent() != null ) ) {
396 boolean multiple = false;
397 for( PhylogenyNode n : g.getChildNode2().getAllExternalDescendants() ) {
399 while ( ( n.getParent() != s ) && ( n.getParent() != null ) ) {
405 if ( set.contains( n ) ) {
411 g.getNodeData().setEvent( Event.createSingleDuplicationEvent() );
412 res.increaseDuplicationsSum();
415 if ( most_parsimonious_duplication_model ) {
416 g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
417 res.increaseSpeciationsSum();
420 g.getNodeData().setEvent( Event.createSingleSpeciationOrDuplicationEvent() );
421 res.increaseSpeciationOrDuplicationEventsSum();
426 g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
427 res.increaseSpeciationsSum();
432 private final static void stripSpeciesTree( final Phylogeny species_tree,
433 final List<PhylogenyNode> species_tree_ext_nodes,
434 final NodesLinkingResult res ) {
435 for( final PhylogenyNode s : species_tree_ext_nodes ) {
436 if ( !res.getMappedSpeciesTreeNodes().contains( s ) ) {
437 species_tree.deleteSubtree( s, true );
438 res.getStrippedSpeciesTreeNodes().add( s );
441 species_tree.clearHashIdToNodeMap();
442 species_tree.externalNodesHaveChanged();
445 private final static void stripTree( final Phylogeny phy, final List<PhylogenyNode> strip_nodes ) {
446 for( final PhylogenyNode g : strip_nodes ) {
447 phy.deleteSubtree( g, true );
449 phy.clearHashIdToNodeMap();
450 phy.externalNodesHaveChanged();
453 private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
454 final String tax_str,
455 final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
456 PhylogenyNode s = tryMapByRemovingOverlySpecificData( species_to_node_map,
459 scientific_names_mapped_to_reduced_specificity );
461 if ( ForesterUtil.countChars( tax_str, ' ' ) == 2 ) {
462 final String new_tax_str = tax_str.substring( 0, tax_str.lastIndexOf( ' ' ) ).trim();
463 s = species_to_node_map.get( new_tax_str );
465 addScientificNamesMappedToReducedSpecificity( tax_str,
467 scientific_names_mapped_to_reduced_specificity );
472 for( final String t : new String[] { " subspecies ", " strain ", " variety ", " varietas ", " subvariety ",
473 " form ", " subform ", " cultivar ", " section ", " subsection " } ) {
474 s = tryMapByRemovingOverlySpecificData( species_to_node_map,
477 scientific_names_mapped_to_reduced_specificity );
486 private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
487 final String tax_str,
489 final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
490 final int i = tax_str.indexOf( term );
492 final String new_tax_str = tax_str.substring( 0, i ).trim();
493 final PhylogenyNode s = species_to_node_map.get( new_tax_str );
495 addScientificNamesMappedToReducedSpecificity( tax_str,
497 scientific_names_mapped_to_reduced_specificity );