d6e6fb6f9a13a600983ceb0e5a7abde025133f52
[jalview.git] / forester / java / src / org / forester / sdi / GSDI.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // All rights reserved
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: www.phylosoft.org/forester
25
26 package org.forester.sdi;
27
28 import java.util.ArrayList;
29 import java.util.HashMap;
30 import java.util.HashSet;
31 import java.util.List;
32 import java.util.Map;
33 import java.util.Set;
34 import java.util.SortedSet;
35
36 import org.forester.phylogeny.Phylogeny;
37 import org.forester.phylogeny.PhylogenyMethods;
38 import org.forester.phylogeny.PhylogenyNode;
39 import org.forester.phylogeny.data.Event;
40 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
41 import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
42 import org.forester.util.ForesterUtil;
43
44 public final class GSDI {
45
46     private final boolean                _most_parsimonious_duplication_model;
47     private final int                    _speciation_or_duplication_events_sum;
48     private final int                    _speciations_sum;
49     private final int                    _duplications_sum;
50     private final List<PhylogenyNode>    _stripped_gene_tree_nodes;
51     private final List<PhylogenyNode>    _stripped_species_tree_nodes;
52     private final Set<PhylogenyNode>     _mapped_species_tree_nodes;
53     private final TaxonomyComparisonBase _tax_comp_base;
54     private final SortedSet<String>      _scientific_names_mapped_to_reduced_specificity;
55
56     public GSDI( final Phylogeny gene_tree,
57                  final Phylogeny species_tree,
58                  final boolean most_parsimonious_duplication_model,
59                  final boolean strip_gene_tree,
60                  final boolean strip_species_tree ) throws SDIException {
61         _most_parsimonious_duplication_model = most_parsimonious_duplication_model;
62         final NodesLinkingResult nodes_linking_result = linkNodesOfG( gene_tree,
63                                                                       species_tree,
64                                                                       null,
65                                                                       strip_gene_tree,
66                                                                       strip_species_tree );
67         _stripped_gene_tree_nodes = nodes_linking_result.getStrippedGeneTreeNodes();
68         _stripped_species_tree_nodes = nodes_linking_result.getStrippedSpeciesTreeNodes();
69         _mapped_species_tree_nodes = nodes_linking_result.getMappedSpeciesTreeNodes();
70         _scientific_names_mapped_to_reduced_specificity = nodes_linking_result
71                 .getScientificNamesMappedToReducedSpecificity();
72         _tax_comp_base = nodes_linking_result.getTaxCompBase();
73         PhylogenyMethods.preOrderReId( species_tree );
74         final GSDIsummaryResult gsdi_summary_result = new GSDIsummaryResult();
75         geneTreePostOrderTraversal( gene_tree, _most_parsimonious_duplication_model, gsdi_summary_result );
76         _speciation_or_duplication_events_sum = gsdi_summary_result.getSpeciationOrDuplicationEventsSum();
77         _speciations_sum = gsdi_summary_result.getSpeciationsSum();
78         _duplications_sum = gsdi_summary_result.getDuplicationsSum();
79     }
80
81     public int getDuplicationsSum() {
82         return _duplications_sum;
83     }
84
85     public Set<PhylogenyNode> getMappedExternalSpeciesTreeNodes() {
86         return _mapped_species_tree_nodes;
87     }
88
89     public final SortedSet<String> getReMappedScientificNamesFromGeneTree() {
90         return _scientific_names_mapped_to_reduced_specificity;
91     }
92
93     public final int getSpeciationOrDuplicationEventsSum() {
94         return _speciation_or_duplication_events_sum;
95     }
96
97     public final int getSpeciationsSum() {
98         return _speciations_sum;
99     }
100
101     public List<PhylogenyNode> getStrippedExternalGeneTreeNodes() {
102         return _stripped_gene_tree_nodes;
103     }
104
105     public List<PhylogenyNode> getStrippedSpeciesTreeNodes() {
106         return _stripped_species_tree_nodes;
107     }
108
109     public TaxonomyComparisonBase getTaxCompBase() {
110         return _tax_comp_base;
111     }
112
113     @Override
114     public final String toString() {
115         final StringBuffer sb = new StringBuffer();
116         sb.append( "Most parsimonious duplication model: " + _most_parsimonious_duplication_model );
117         sb.append( ForesterUtil.getLineSeparator() );
118         sb.append( "Speciations sum                    : " + getSpeciationsSum() );
119         sb.append( ForesterUtil.getLineSeparator() );
120         sb.append( "Duplications sum                   : " + getDuplicationsSum() );
121         sb.append( ForesterUtil.getLineSeparator() );
122         if ( !_most_parsimonious_duplication_model ) {
123             sb.append( "Speciation or duplications sum     : " + getSpeciationOrDuplicationEventsSum() );
124             sb.append( ForesterUtil.getLineSeparator() );
125         }
126         return sb.toString();
127     }
128
129     /**
130      * Traverses the subtree of PhylogenyNode g in postorder, calculating the
131      * mapping function M, and determines which nodes represent speciation
132      * events and which ones duplication events.
133      * <p>
134      * Preconditions: Mapping M for external nodes must have been calculated and
135      * the species tree must be labeled in preorder.
136      * <p>
137      * 
138      */
139     final static void geneTreePostOrderTraversal( final Phylogeny gene_tree,
140                                                   final boolean most_parsimonious_duplication_model,
141                                                   final GSDIsummaryResult res ) {
142         for( final PhylogenyNodeIterator it = gene_tree.iteratorPostorder(); it.hasNext(); ) {
143             final PhylogenyNode g = it.next();
144             if ( g.isInternal() ) {
145                 PhylogenyNode s1 = g.getChildNode1().getLink();
146                 PhylogenyNode s2 = g.getChildNode2().getLink();
147                 while ( s1 != s2 ) {
148                     if ( s1.getId() > s2.getId() ) {
149                         s1 = s1.getParent();
150                     }
151                     else {
152                         s2 = s2.getParent();
153                     }
154                 }
155                 g.setLink( s1 );
156                 determineEvent( s1, g, most_parsimonious_duplication_model, res );
157             }
158         }
159     }
160
161     /**
162      * This allows for linking of internal nodes of the species tree (as opposed
163      * to just external nodes, as in the method it overrides.
164      * If TaxonomyComparisonBase is null, it will try to determine it.
165      * @throws SDIException 
166      * 
167      */
168     final static NodesLinkingResult linkNodesOfG( final Phylogeny gene_tree,
169                                                   final Phylogeny species_tree,
170                                                   final TaxonomyComparisonBase tax_comp_base,
171                                                   final boolean strip_gene_tree,
172                                                   final boolean strip_species_tree ) throws SDIException {
173         final Map<String, PhylogenyNode> species_to_node_map = new HashMap<String, PhylogenyNode>();
174         final List<PhylogenyNode> species_tree_ext_nodes = new ArrayList<PhylogenyNode>();
175         final NodesLinkingResult res = new NodesLinkingResult();
176         if ( tax_comp_base == null ) {
177             res.setTaxCompBase( SDIutil.determineTaxonomyComparisonBase( gene_tree ) );
178         }
179         else {
180             res.setTaxCompBase( tax_comp_base );
181         }
182         // Stringyfied taxonomy is the key, node is the value.
183         for( final PhylogenyNodeIterator iter = species_tree.iteratorExternalForward(); iter.hasNext(); ) {
184             final PhylogenyNode s = iter.next();
185             species_tree_ext_nodes.add( s );
186             if ( s.getNodeData().isHasTaxonomy() ) {
187                 final String tax_str = SDIutil.taxonomyToString( s, res.getTaxCompBase() );
188                 if ( !ForesterUtil.isEmpty( tax_str ) ) {
189                     if ( species_to_node_map.containsKey( tax_str ) ) {
190                         throw new SDIException( "taxonomy \"" + s + "\" is not unique in species tree" );
191                     }
192                     species_to_node_map.put( tax_str, s );
193                 }
194             }
195         }
196         // Retrieve the reference to the node with a matching stringyfied taxonomy.
197         for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
198             final PhylogenyNode g = iter.next();
199             if ( !g.getNodeData().isHasTaxonomy() ) {
200                 if ( strip_gene_tree ) {
201                     res.getStrippedGeneTreeNodes().add( g );
202                 }
203                 else {
204                     throw new SDIException( "gene tree node \"" + g + "\" has no taxonomic data" );
205                 }
206             }
207             else {
208                 final String tax_str = SDIutil.taxonomyToString( g, res.getTaxCompBase() );
209                 if ( ForesterUtil.isEmpty( tax_str ) ) {
210                     if ( strip_gene_tree ) {
211                         res.getStrippedGeneTreeNodes().add( g );
212                     }
213                     else {
214                         throw new SDIException( "gene tree node \"" + g + "\" has no appropriate taxonomic data" );
215                     }
216                 }
217                 else {
218                     PhylogenyNode s = species_to_node_map.get( tax_str );
219                     if ( ( res.getTaxCompBase() == TaxonomyComparisonBase.SCIENTIFIC_NAME ) && ( s == null )
220                             && ( ForesterUtil.countChars( tax_str, ' ' ) > 1 ) ) {
221                         s = tryMapByRemovingOverlySpecificData( species_to_node_map,
222                                                                 tax_str,
223                                                                 res.getScientificNamesMappedToReducedSpecificity() );
224                     }
225                     if ( s == null ) {
226                         if ( strip_gene_tree ) {
227                             res.getStrippedGeneTreeNodes().add( g );
228                         }
229                         else {
230                             throw new SDIException( "taxonomy \"" + g.getNodeData().getTaxonomy()
231                                     + "\" not present in species tree" );
232                         }
233                     }
234                     else {
235                         g.setLink( s );
236                         res.getMappedSpeciesTreeNodes().add( s );
237                     }
238                 }
239             }
240         } // for loop
241         if ( strip_gene_tree ) {
242             stripTree( gene_tree, res.getStrippedGeneTreeNodes() );
243             if ( gene_tree.isEmpty() || ( gene_tree.getNumberOfExternalNodes() < 2 ) ) {
244                 throw new SDIException( "species could not be mapped between gene tree and species tree" );
245             }
246         }
247         if ( strip_species_tree ) {
248             stripSpeciesTree( species_tree, species_tree_ext_nodes, res.getMappedSpeciesTreeNodes() );
249         }
250         return res;
251     }
252
253     private final static void addScientificNamesMappedToReducedSpecificity( final String s1,
254                                                                             final String s2,
255                                                                             final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
256         scientific_names_mapped_to_reduced_specificity.add( s1 + " -> " + s2 );
257     }
258
259     // Used by GSDIR
260     //    protected GSDI( final Phylogeny gene_tree, final Phylogeny species_tree, final boolean strip_gene_tree )
261     //            throws SDIException {
262     //        super( gene_tree, species_tree );
263     //        _speciation_or_duplication_events_sum = -1;
264     //        _speciations_sum = 0;
265     //        _most_parsimonious_duplication_model = true;
266     //        _duplications_sum = 0;
267     //        _stripped_gene_tree_nodes = new ArrayList<PhylogenyNode>();
268     //        _stripped_species_tree_nodes = new ArrayList<PhylogenyNode>();
269     //        _mapped_species_tree_nodes = new HashSet<PhylogenyNode>();
270     //        _scientific_names_mapped_to_reduced_specificity = new TreeSet<String>();
271     //    }
272     // s is the node on the species tree g maps to.
273     private final static void determineEvent( final PhylogenyNode s,
274                                               final PhylogenyNode g,
275                                               final boolean most_parsimonious_duplication_model,
276                                               final GSDIsummaryResult res ) {
277         boolean oyako = false;
278         if ( ( g.getChildNode1().getLink() == s ) || ( g.getChildNode2().getLink() == s ) ) {
279             oyako = true;
280         }
281         if ( g.getLink().getNumberOfDescendants() == 2 ) {
282             if ( oyako ) {
283                 g.getNodeData().setEvent( Event.createSingleDuplicationEvent() );
284                 res.increaseDuplicationsSum();
285             }
286             else {
287                 g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
288                 res.increaseSpeciationsSum();
289             }
290         }
291         else {
292             if ( oyako ) {
293                 final Set<PhylogenyNode> set = new HashSet<PhylogenyNode>();
294                 for( PhylogenyNode n : g.getChildNode1().getAllExternalDescendants() ) {
295                     n = n.getLink();
296                     while ( n.getParent() != s ) {
297                         n = n.getParent();
298                         if ( n.isRoot() ) {
299                             break;
300                         }
301                     }
302                     set.add( n );
303                 }
304                 boolean multiple = false;
305                 for( PhylogenyNode n : g.getChildNode2().getAllExternalDescendants() ) {
306                     n = n.getLink();
307                     while ( n.getParent() != s ) {
308                         n = n.getParent();
309                         if ( n.isRoot() ) {
310                             break;
311                         }
312                     }
313                     if ( set.contains( n ) ) {
314                         multiple = true;
315                         break;
316                     }
317                 }
318                 if ( multiple ) {
319                     g.getNodeData().setEvent( Event.createSingleDuplicationEvent() );
320                     res.increaseDuplicationsSum();
321                 }
322                 else {
323                     if ( most_parsimonious_duplication_model ) {
324                         g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
325                         res.increaseSpeciationsSum();
326                     }
327                     else {
328                         g.getNodeData().setEvent( Event.createSingleSpeciationOrDuplicationEvent() );
329                         res.increaseSpeciationOrDuplicationEventsSum();
330                     }
331                 }
332             }
333             else {
334                 g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
335                 res.increaseSpeciationsSum();
336             }
337         }
338     }
339
340     private final static List<PhylogenyNode> stripSpeciesTree( final Phylogeny species_tree,
341                                                                final List<PhylogenyNode> species_tree_ext_nodes,
342                                                                final Set<PhylogenyNode> keep ) {
343         final List<PhylogenyNode> stripped_species_tree_nodes = new ArrayList<PhylogenyNode>();
344         for( final PhylogenyNode s : species_tree_ext_nodes ) {
345             if ( !keep.contains( s ) ) {
346                 species_tree.deleteSubtree( s, true );
347                 stripped_species_tree_nodes.add( s );
348             }
349         }
350         species_tree.clearHashIdToNodeMap();
351         species_tree.externalNodesHaveChanged();
352         return stripped_species_tree_nodes;
353     }
354
355     private final static void stripTree( final Phylogeny phy, final List<PhylogenyNode> strip_nodes ) {
356         for( final PhylogenyNode g : strip_nodes ) {
357             phy.deleteSubtree( g, true );
358         }
359         phy.clearHashIdToNodeMap();
360         phy.externalNodesHaveChanged();
361     }
362
363     private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
364                                                                            final String tax_str,
365                                                                            final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
366         PhylogenyNode s = tryMapByRemovingOverlySpecificData( species_to_node_map,
367                                                               tax_str,
368                                                               " (",
369                                                               scientific_names_mapped_to_reduced_specificity );
370         if ( s == null ) {
371             if ( ForesterUtil.countChars( tax_str, ' ' ) == 2 ) {
372                 final String new_tax_str = tax_str.substring( 0, tax_str.lastIndexOf( ' ' ) ).trim();
373                 s = species_to_node_map.get( new_tax_str );
374                 if ( s != null ) {
375                     addScientificNamesMappedToReducedSpecificity( tax_str,
376                                                                   new_tax_str,
377                                                                   scientific_names_mapped_to_reduced_specificity );
378                 }
379             }
380         }
381         if ( s == null ) {
382             for( final String t : new String[] { " subspecies ", " strain ", " variety ", " varietas ", " subvariety ",
383                     " form ", " subform ", " cultivar ", " section ", " subsection " } ) {
384                 s = tryMapByRemovingOverlySpecificData( species_to_node_map,
385                                                         tax_str,
386                                                         t,
387                                                         scientific_names_mapped_to_reduced_specificity );
388                 if ( s != null ) {
389                     break;
390                 }
391             }
392         }
393         return s;
394     }
395
396     private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
397                                                                            final String tax_str,
398                                                                            final String term,
399                                                                            final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
400         final int i = tax_str.indexOf( term );
401         if ( i > 4 ) {
402             final String new_tax_str = tax_str.substring( 0, i ).trim();
403             final PhylogenyNode s = species_to_node_map.get( new_tax_str );
404             if ( s != null ) {
405                 addScientificNamesMappedToReducedSpecificity( tax_str,
406                                                               new_tax_str,
407                                                               scientific_names_mapped_to_reduced_specificity );
408             }
409             return s;
410         }
411         return null;
412     }
413 }