f9cc14ea35799ccbcb66a9f4dcbb3f28ae2c0c78
[jalview.git] / forester / java / src / org / forester / sdi / GSDI.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // All rights reserved
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: www.phylosoft.org/forester
25
26 package org.forester.sdi;
27
28 import java.util.ArrayList;
29 import java.util.HashMap;
30 import java.util.HashSet;
31 import java.util.List;
32 import java.util.Map;
33 import java.util.Set;
34 import java.util.SortedSet;
35
36 import org.forester.phylogeny.Phylogeny;
37 import org.forester.phylogeny.PhylogenyMethods;
38 import org.forester.phylogeny.PhylogenyNode;
39 import org.forester.phylogeny.data.Event;
40 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
41 import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
42 import org.forester.util.ForesterUtil;
43
44 public final class GSDI implements GSDII {
45
46     private final boolean                _most_parsimonious_duplication_model;
47     private final int                    _speciation_or_duplication_events_sum;
48     private final int                    _speciations_sum;
49     private final int                    _duplications_sum;
50     private final List<PhylogenyNode>    _stripped_gene_tree_nodes;
51     private final List<PhylogenyNode>    _stripped_species_tree_nodes;
52     private final Set<PhylogenyNode>     _mapped_species_tree_nodes;
53     private final TaxonomyComparisonBase _tax_comp_base;
54     private final SortedSet<String>      _scientific_names_mapped_to_reduced_specificity;
55
56     public GSDI( final Phylogeny gene_tree,
57                  final Phylogeny species_tree,
58                  final boolean most_parsimonious_duplication_model,
59                  final boolean strip_gene_tree,
60                  final boolean strip_species_tree ) throws SDIException {
61         _most_parsimonious_duplication_model = most_parsimonious_duplication_model;
62         final NodesLinkingResult nodes_linking_result = linkNodesOfG( gene_tree,
63                                                                       species_tree,
64                                                                       null,
65                                                                       strip_gene_tree,
66                                                                       strip_species_tree );
67         _stripped_gene_tree_nodes = nodes_linking_result.getStrippedGeneTreeNodes();
68         _stripped_species_tree_nodes = nodes_linking_result.getStrippedSpeciesTreeNodes();
69         _mapped_species_tree_nodes = nodes_linking_result.getMappedSpeciesTreeNodes();
70         _scientific_names_mapped_to_reduced_specificity = nodes_linking_result
71                 .getScientificNamesMappedToReducedSpecificity();
72         _tax_comp_base = nodes_linking_result.getTaxCompBase();
73         PhylogenyMethods.preOrderReId( species_tree );
74         final GSDIsummaryResult gsdi_summary_result = geneTreePostOrderTraversal( gene_tree,
75                                                                                   _most_parsimonious_duplication_model );
76         _speciation_or_duplication_events_sum = gsdi_summary_result.getSpeciationOrDuplicationEventsSum();
77         _speciations_sum = gsdi_summary_result.getSpeciationsSum();
78         _duplications_sum = gsdi_summary_result.getDuplicationsSum();
79     }
80
81     public int getDuplicationsSum() {
82         return _duplications_sum;
83     }
84
85     @Override
86     public Set<PhylogenyNode> getMappedExternalSpeciesTreeNodes() {
87         return _mapped_species_tree_nodes;
88     }
89
90     @Override
91     public final SortedSet<String> getReMappedScientificNamesFromGeneTree() {
92         return _scientific_names_mapped_to_reduced_specificity;
93     }
94
95     public final int getSpeciationOrDuplicationEventsSum() {
96         return _speciation_or_duplication_events_sum;
97     }
98
99     @Override
100     public final int getSpeciationsSum() {
101         return _speciations_sum;
102     }
103
104     @Override
105     public List<PhylogenyNode> getStrippedExternalGeneTreeNodes() {
106         return _stripped_gene_tree_nodes;
107     }
108
109     @Override
110     public List<PhylogenyNode> getStrippedSpeciesTreeNodes() {
111         return _stripped_species_tree_nodes;
112     }
113
114     @Override
115     public TaxonomyComparisonBase getTaxCompBase() {
116         return _tax_comp_base;
117     }
118
119     @Override
120     public final String toString() {
121         final StringBuffer sb = new StringBuffer();
122         sb.append( "Most parsimonious duplication model: " + _most_parsimonious_duplication_model );
123         sb.append( ForesterUtil.getLineSeparator() );
124         sb.append( "Speciations sum                    : " + getSpeciationsSum() );
125         sb.append( ForesterUtil.getLineSeparator() );
126         sb.append( "Duplications sum                   : " + getDuplicationsSum() );
127         sb.append( ForesterUtil.getLineSeparator() );
128         if ( !_most_parsimonious_duplication_model ) {
129             sb.append( "Speciation or duplications sum     : " + getSpeciationOrDuplicationEventsSum() );
130             sb.append( ForesterUtil.getLineSeparator() );
131         }
132         return sb.toString();
133     }
134
135     /**
136      * Traverses the subtree of PhylogenyNode g in postorder, calculating the
137      * mapping function M, and determines which nodes represent speciation
138      * events and which ones duplication events.
139      * <p>
140      * Preconditions: Mapping M for external nodes must have been calculated and
141      * the species tree must be labeled in preorder.
142      * <p>
143      * @return 
144      * 
145      */
146     final static GSDIsummaryResult geneTreePostOrderTraversal( final Phylogeny gene_tree,
147                                                                final boolean most_parsimonious_duplication_model ) {
148         final GSDIsummaryResult res = new GSDIsummaryResult();
149         for( final PhylogenyNodeIterator it = gene_tree.iteratorPostorder(); it.hasNext(); ) {
150             final PhylogenyNode g = it.next();
151             if ( g.isInternal() ) {
152                 PhylogenyNode s1 = g.getChildNode1().getLink();
153                 PhylogenyNode s2 = g.getChildNode2().getLink();
154                 while ( s1 != s2 ) {
155                     if ( s1.getId() > s2.getId() ) {
156                         s1 = s1.getParent();
157                     }
158                     else {
159                         s2 = s2.getParent();
160                     }
161                 }
162                 g.setLink( s1 );
163                 determineEvent( s1, g, most_parsimonious_duplication_model, res );
164             }
165         }
166         return res;
167     }
168
169     /**
170      * This allows for linking of internal nodes of the species tree (as opposed
171      * to just external nodes, as in the method it overrides.
172      * If TaxonomyComparisonBase is null, it will try to determine it.
173      * @throws SDIException 
174      * 
175      */
176     final static NodesLinkingResult linkNodesOfG( final Phylogeny gene_tree,
177                                                   final Phylogeny species_tree,
178                                                   final TaxonomyComparisonBase tax_comp_base,
179                                                   final boolean strip_gene_tree,
180                                                   final boolean strip_species_tree ) throws SDIException {
181         final Map<String, PhylogenyNode> species_to_node_map = new HashMap<String, PhylogenyNode>();
182         final List<PhylogenyNode> species_tree_ext_nodes = new ArrayList<PhylogenyNode>();
183         final NodesLinkingResult res = new NodesLinkingResult();
184         if ( tax_comp_base == null ) {
185             res.setTaxCompBase( SDIutil.determineTaxonomyComparisonBase( gene_tree ) );
186         }
187         else {
188             res.setTaxCompBase( tax_comp_base );
189         }
190         // Stringyfied taxonomy is the key, node is the value.
191         for( final PhylogenyNodeIterator iter = species_tree.iteratorExternalForward(); iter.hasNext(); ) {
192             final PhylogenyNode s = iter.next();
193             species_tree_ext_nodes.add( s );
194             if ( s.getNodeData().isHasTaxonomy() ) {
195                 final String tax_str = SDIutil.taxonomyToString( s, res.getTaxCompBase() );
196                 if ( !ForesterUtil.isEmpty( tax_str ) ) {
197                     if ( species_to_node_map.containsKey( tax_str ) ) {
198                         throw new SDIException( "taxonomy \"" + tax_str + "\" is not unique in species tree (using "
199                                 + res.getTaxCompBase() + " for linking to gene tree)" );
200                     }
201                     species_to_node_map.put( tax_str, s );
202                 }
203             }
204         }
205         // Retrieve the reference to the node with a matching stringyfied taxonomy.
206         for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
207             final PhylogenyNode g = iter.next();
208             if ( !g.getNodeData().isHasTaxonomy() ) {
209                 if ( strip_gene_tree ) {
210                     res.getStrippedGeneTreeNodes().add( g );
211                 }
212                 else {
213                     throw new SDIException( "gene tree node \"" + g + "\" has no taxonomic data" );
214                 }
215             }
216             else {
217                 final String tax_str = SDIutil.taxonomyToString( g, res.getTaxCompBase() );
218                 if ( ForesterUtil.isEmpty( tax_str ) ) {
219                     if ( strip_gene_tree ) {
220                         res.getStrippedGeneTreeNodes().add( g );
221                     }
222                     else {
223                         throw new SDIException( "gene tree node \"" + g + "\" has no appropriate taxonomic data" );
224                     }
225                 }
226                 else {
227                     PhylogenyNode s = species_to_node_map.get( tax_str );
228                     if ( ( res.getTaxCompBase() == TaxonomyComparisonBase.SCIENTIFIC_NAME ) && ( s == null )
229                             && ( ForesterUtil.countChars( tax_str, ' ' ) > 1 ) ) {
230                         s = tryMapByRemovingOverlySpecificData( species_to_node_map,
231                                                                 tax_str,
232                                                                 res.getScientificNamesMappedToReducedSpecificity() );
233                     }
234                     if ( s == null ) {
235                         if ( strip_gene_tree ) {
236                             res.getStrippedGeneTreeNodes().add( g );
237                         }
238                         else {
239                             throw new SDIException( "taxonomy \"" + g.getNodeData().getTaxonomy()
240                                     + "\" not present in species tree" );
241                         }
242                     }
243                     else {
244                         g.setLink( s );
245                         res.getMappedSpeciesTreeNodes().add( s );
246                     }
247                 }
248             }
249         } // for loop
250         if ( strip_gene_tree ) {
251             stripTree( gene_tree, res.getStrippedGeneTreeNodes() );
252             if ( gene_tree.isEmpty() || ( gene_tree.getNumberOfExternalNodes() < 2 ) ) {
253                 throw new SDIException( "species could not be mapped between gene tree and species tree" );
254             }
255         }
256         if ( strip_species_tree ) {
257             stripSpeciesTree( species_tree, species_tree_ext_nodes, res );
258         }
259         return res;
260     }
261
262     private final static void addScientificNamesMappedToReducedSpecificity( final String s1,
263                                                                             final String s2,
264                                                                             final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
265         scientific_names_mapped_to_reduced_specificity.add( s1 + " -> " + s2 );
266     }
267
268     private final static void determineEvent( final PhylogenyNode s,
269                                               final PhylogenyNode g,
270                                               final boolean most_parsimonious_duplication_model,
271                                               final GSDIsummaryResult res ) {
272         boolean oyako = false;
273         if ( ( g.getChildNode1().getLink() == s ) || ( g.getChildNode2().getLink() == s ) ) {
274             oyako = true;
275         }
276         if ( g.getLink().getNumberOfDescendants() == 2 ) {
277             if ( oyako ) {
278                 g.getNodeData().setEvent( Event.createSingleDuplicationEvent() );
279                 res.increaseDuplicationsSum();
280             }
281             else {
282                 g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
283                 res.increaseSpeciationsSum();
284             }
285         }
286         else {
287             if ( oyako ) {
288                 final Set<PhylogenyNode> set = new HashSet<PhylogenyNode>();
289                 for( PhylogenyNode n : g.getChildNode1().getAllExternalDescendants() ) {
290                     n = n.getLink();
291                     while ( n.getParent() != s ) {
292                         n = n.getParent();
293                         if ( n.isRoot() ) {
294                             break;
295                         }
296                     }
297                     set.add( n );
298                 }
299                 boolean multiple = false;
300                 for( PhylogenyNode n : g.getChildNode2().getAllExternalDescendants() ) {
301                     n = n.getLink();
302                     while ( n.getParent() != s ) {
303                         n = n.getParent();
304                         if ( n.isRoot() ) {
305                             break;
306                         }
307                     }
308                     if ( set.contains( n ) ) {
309                         multiple = true;
310                         break;
311                     }
312                 }
313                 if ( multiple ) {
314                     g.getNodeData().setEvent( Event.createSingleDuplicationEvent() );
315                     res.increaseDuplicationsSum();
316                 }
317                 else {
318                     if ( most_parsimonious_duplication_model ) {
319                         g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
320                         res.increaseSpeciationsSum();
321                     }
322                     else {
323                         g.getNodeData().setEvent( Event.createSingleSpeciationOrDuplicationEvent() );
324                         res.increaseSpeciationOrDuplicationEventsSum();
325                     }
326                 }
327             }
328             else {
329                 g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
330                 res.increaseSpeciationsSum();
331             }
332         }
333     }
334
335     private final static void stripSpeciesTree( final Phylogeny species_tree,
336                                                 final List<PhylogenyNode> species_tree_ext_nodes,
337                                                 final NodesLinkingResult res ) {
338         for( final PhylogenyNode s : species_tree_ext_nodes ) {
339             if ( !res.getMappedSpeciesTreeNodes().contains( s ) ) {
340                 species_tree.deleteSubtree( s, true );
341                 res.getStrippedSpeciesTreeNodes().add( s );
342             }
343         }
344         species_tree.clearHashIdToNodeMap();
345         species_tree.externalNodesHaveChanged();
346     }
347
348     private final static void stripTree( final Phylogeny phy, final List<PhylogenyNode> strip_nodes ) {
349         for( final PhylogenyNode g : strip_nodes ) {
350             phy.deleteSubtree( g, true );
351         }
352         phy.clearHashIdToNodeMap();
353         phy.externalNodesHaveChanged();
354     }
355
356     private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
357                                                                            final String tax_str,
358                                                                            final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
359         PhylogenyNode s = tryMapByRemovingOverlySpecificData( species_to_node_map,
360                                                               tax_str,
361                                                               " (",
362                                                               scientific_names_mapped_to_reduced_specificity );
363         if ( s == null ) {
364             if ( ForesterUtil.countChars( tax_str, ' ' ) == 2 ) {
365                 final String new_tax_str = tax_str.substring( 0, tax_str.lastIndexOf( ' ' ) ).trim();
366                 s = species_to_node_map.get( new_tax_str );
367                 if ( s != null ) {
368                     addScientificNamesMappedToReducedSpecificity( tax_str,
369                                                                   new_tax_str,
370                                                                   scientific_names_mapped_to_reduced_specificity );
371                 }
372             }
373         }
374         if ( s == null ) {
375             for( final String t : new String[] { " subspecies ", " strain ", " variety ", " varietas ", " subvariety ",
376                     " form ", " subform ", " cultivar ", " section ", " subsection " } ) {
377                 s = tryMapByRemovingOverlySpecificData( species_to_node_map,
378                                                         tax_str,
379                                                         t,
380                                                         scientific_names_mapped_to_reduced_specificity );
381                 if ( s != null ) {
382                     break;
383                 }
384             }
385         }
386         return s;
387     }
388
389     private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
390                                                                            final String tax_str,
391                                                                            final String term,
392                                                                            final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
393         final int i = tax_str.indexOf( term );
394         if ( i > 4 ) {
395             final String new_tax_str = tax_str.substring( 0, i ).trim();
396             final PhylogenyNode s = species_to_node_map.get( new_tax_str );
397             if ( s != null ) {
398                 addScientificNamesMappedToReducedSpecificity( tax_str,
399                                                               new_tax_str,
400                                                               scientific_names_mapped_to_reduced_specificity );
401             }
402             return s;
403         }
404         return null;
405     }
406 }