added transfer of taxonomy for GSDI
[jalview.git] / forester / java / src / org / forester / sdi / GSDI.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // All rights reserved
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
25
26 package org.forester.sdi;
27
28 import java.util.ArrayList;
29 import java.util.HashMap;
30 import java.util.HashSet;
31 import java.util.List;
32 import java.util.Map;
33 import java.util.Set;
34 import java.util.SortedSet;
35
36 import org.forester.phylogeny.Phylogeny;
37 import org.forester.phylogeny.PhylogenyMethods;
38 import org.forester.phylogeny.PhylogenyNode;
39 import org.forester.phylogeny.data.Event;
40 import org.forester.phylogeny.data.Taxonomy;
41 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
42 import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
43 import org.forester.util.ForesterUtil;
44
45 public final class GSDI implements GSDII {
46
47     private final boolean                _most_parsimonious_duplication_model;
48     private final int                    _speciation_or_duplication_events_sum;
49     private final int                    _speciations_sum;
50     private final int                    _duplications_sum;
51     private final List<PhylogenyNode>    _stripped_gene_tree_nodes;
52     private final List<PhylogenyNode>    _stripped_species_tree_nodes;
53     private final Set<PhylogenyNode>     _mapped_species_tree_nodes;
54     private final TaxonomyComparisonBase _tax_comp_base;
55     private final SortedSet<String>      _scientific_names_mapped_to_reduced_specificity;
56
57     public GSDI( final Phylogeny gene_tree,
58                  final Phylogeny species_tree,
59                  final boolean most_parsimonious_duplication_model,
60                  final boolean strip_gene_tree,
61                  final boolean strip_species_tree ) throws SDIException {
62         _most_parsimonious_duplication_model = most_parsimonious_duplication_model;
63         if ( gene_tree.getRoot().getNumberOfDescendants() == 3 ) {
64             gene_tree.reRoot( gene_tree.getRoot().getChildNode( 2 ) );
65         }
66         final NodesLinkingResult nodes_linking_result = linkNodesOfG( gene_tree,
67                                                                       species_tree,
68                                                                       strip_gene_tree,
69                                                                       strip_species_tree );
70         _stripped_gene_tree_nodes = nodes_linking_result.getStrippedGeneTreeNodes();
71         _stripped_species_tree_nodes = nodes_linking_result.getStrippedSpeciesTreeNodes();
72         _mapped_species_tree_nodes = nodes_linking_result.getMappedSpeciesTreeNodes();
73         _scientific_names_mapped_to_reduced_specificity = nodes_linking_result
74                 .getScientificNamesMappedToReducedSpecificity();
75         _tax_comp_base = nodes_linking_result.getTaxCompBase();
76         PhylogenyMethods.preOrderReId( species_tree );
77         final GSDIsummaryResult gsdi_summary_result = geneTreePostOrderTraversal( gene_tree,
78                                                                                   _most_parsimonious_duplication_model,
79                                                                                   true );
80         _speciation_or_duplication_events_sum = gsdi_summary_result.getSpeciationOrDuplicationEventsSum();
81         _speciations_sum = gsdi_summary_result.getSpeciationsSum();
82         _duplications_sum = gsdi_summary_result.getDuplicationsSum();
83     }
84
85     public int getDuplicationsSum() {
86         return _duplications_sum;
87     }
88
89     @Override
90     public Set<PhylogenyNode> getMappedExternalSpeciesTreeNodes() {
91         return _mapped_species_tree_nodes;
92     }
93
94     @Override
95     public final SortedSet<String> getReMappedScientificNamesFromGeneTree() {
96         return _scientific_names_mapped_to_reduced_specificity;
97     }
98
99     public final int getSpeciationOrDuplicationEventsSum() {
100         return _speciation_or_duplication_events_sum;
101     }
102
103     @Override
104     public final int getSpeciationsSum() {
105         return _speciations_sum;
106     }
107
108     @Override
109     public List<PhylogenyNode> getStrippedExternalGeneTreeNodes() {
110         return _stripped_gene_tree_nodes;
111     }
112
113     @Override
114     public List<PhylogenyNode> getStrippedSpeciesTreeNodes() {
115         return _stripped_species_tree_nodes;
116     }
117
118     @Override
119     public TaxonomyComparisonBase getTaxCompBase() {
120         return _tax_comp_base;
121     }
122
123     @Override
124     public final String toString() {
125         final StringBuffer sb = new StringBuffer();
126         sb.append( "Most parsimonious duplication model: " + _most_parsimonious_duplication_model );
127         sb.append( ForesterUtil.getLineSeparator() );
128         sb.append( "Speciations sum                    : " + getSpeciationsSum() );
129         sb.append( ForesterUtil.getLineSeparator() );
130         sb.append( "Duplications sum                   : " + getDuplicationsSum() );
131         sb.append( ForesterUtil.getLineSeparator() );
132         if ( !_most_parsimonious_duplication_model ) {
133             sb.append( "Speciation or duplications sum     : " + getSpeciationOrDuplicationEventsSum() );
134             sb.append( ForesterUtil.getLineSeparator() );
135         }
136         return sb.toString();
137     }
138
139     /**
140      * Traverses the subtree of PhylogenyNode g in postorder, calculating the
141      * mapping function M, and determines which nodes represent speciation
142      * events and which ones duplication events.
143      * <p>
144      * Preconditions: Mapping M for external nodes must have been calculated and
145      * the species tree must be labeled in preorder.
146      * <p>
147      * @param transfer_taxonomy 
148      * @return 
149      * @throws SDIException 
150      * 
151      */
152     final static GSDIsummaryResult geneTreePostOrderTraversal( final Phylogeny gene_tree,
153                                                                final boolean most_parsimonious_duplication_model,
154                                                                final boolean transfer_taxonomy ) throws SDIException {
155         final GSDIsummaryResult res = new GSDIsummaryResult();
156         for( final PhylogenyNodeIterator it = gene_tree.iteratorPostorder(); it.hasNext(); ) {
157             final PhylogenyNode g = it.next();
158             if ( g.isInternal() ) {
159                 if ( g.getNumberOfDescendants() != 2 ) {
160                     throw new SDIException( "gene tree contains internal node with " + g.getNumberOfDescendants()
161                             + " descendents" );
162                 }
163                 PhylogenyNode s1 = g.getChildNode1().getLink();
164                 PhylogenyNode s2 = g.getChildNode2().getLink();
165                 while ( s1 != s2 ) {
166                     if ( s1.getId() > s2.getId() ) {
167                         s1 = s1.getParent();
168                     }
169                     else {
170                         s2 = s2.getParent();
171                     }
172                 }
173                 g.setLink( s1 );
174                 if ( transfer_taxonomy ) {
175                     transferTaxonomy( g, s1 );
176                 }
177                 determineEvent( s1, g, most_parsimonious_duplication_model, res );
178             }
179         }
180         return res;
181     }
182
183     private static final void transferTaxonomy( final PhylogenyNode g, final PhylogenyNode s ) {
184         if ( s.getNodeData().isHasTaxonomy() ) {
185             g.getNodeData().setTaxonomy( s.getNodeData().getTaxonomy() );
186             if ( g.isInternal() ) {
187                 if ( g.getChildNode1().isInternal() ) {
188                     if ( g.getChildNode1().getNodeData().isHasTaxonomy() && g.getChildNode1().getNodeData().getTaxonomy() == s.getNodeData().getTaxonomy() ) {
189                         g.getChildNode1().getNodeData().setTaxonomy( null );
190                     }
191                 }
192                 if ( g.getChildNode2().isInternal() ) {
193                     if ( g.getChildNode2().getNodeData().isHasTaxonomy() && g.getChildNode2().getNodeData().getTaxonomy() == s.getNodeData().getTaxonomy() ) {
194                         g.getChildNode2().getNodeData().setTaxonomy( null );
195                     }
196                 }
197             }
198         }
199         else if ( ForesterUtil.isEmpty( g.getName() ) && !ForesterUtil.isEmpty( s.getName() ) ) {
200             g.setName( s.getName() );
201             if ( g.isInternal() ) {
202                 if ( g.getChildNode1().isInternal() ) {
203                     if ( g.getChildNode1().getName() == s.getName() ) {
204                         g.getChildNode1().setName( "" );
205                     }
206                 }
207                 if ( g.getChildNode2().isInternal() ) {
208                     if ( g.getChildNode2().getName() == s.getName() ) {
209                         g.getChildNode2().setName( "" );
210                     }
211                 }
212             }
213         }
214     }
215
216     final static GSDIsummaryResult geneTreePostOrderTraversal( final Phylogeny gene_tree,
217                                                                final boolean most_parsimonious_duplication_model,
218                                                                final int min_duplications ) throws SDIException {
219         final GSDIsummaryResult res = new GSDIsummaryResult();
220         for( final PhylogenyNodeIterator it = gene_tree.iteratorPostorder(); it.hasNext(); ) {
221             final PhylogenyNode g = it.next();
222             if ( g.isInternal() ) {
223                 if ( g.getNumberOfDescendants() != 2 ) {
224                     throw new SDIException( "gene tree contains internal node with " + g.getNumberOfDescendants()
225                             + " descendents" );
226                 }
227                 PhylogenyNode s1 = g.getChildNode1().getLink();
228                 PhylogenyNode s2 = g.getChildNode2().getLink();
229                 while ( s1 != s2 ) {
230                     if ( s1.getId() > s2.getId() ) {
231                         s1 = s1.getParent();
232                     }
233                     else {
234                         s2 = s2.getParent();
235                     }
236                 }
237                 g.setLink( s1 );
238                 determineEvent( s1, g, most_parsimonious_duplication_model, res );
239                 if ( res.getDuplicationsSum() > min_duplications ) {
240                     return null;
241                 }
242             }
243         }
244         return res;
245     }
246
247     final static NodesLinkingResult linkNodesOfG( final Phylogeny gene_tree,
248                                                   final Phylogeny species_tree,
249                                                   final boolean strip_gene_tree,
250                                                   final boolean strip_species_tree ) throws SDIException {
251         final TaxonomyComparisonBase tax_comp_base = SDIutil.determineTaxonomyComparisonBase( gene_tree );
252         if ( tax_comp_base == null ) {
253             throw new RuntimeException( "failed to establish taxonomy linking base (taxonomy linking base is null)" );
254         }
255         return linkNodesOfG( gene_tree, species_tree, tax_comp_base, strip_gene_tree, strip_species_tree );
256     }
257
258     /**
259      * This allows for linking of internal nodes of the species tree (as opposed
260      * to just external nodes, as in the method it overrides.
261      * If TaxonomyComparisonBase is null, it will try to determine it.
262      * @throws SDIException 
263      * 
264      */
265     final static NodesLinkingResult linkNodesOfG( final Phylogeny gene_tree,
266                                                   final Phylogeny species_tree,
267                                                   final TaxonomyComparisonBase tax_comp_base,
268                                                   final boolean strip_gene_tree,
269                                                   final boolean strip_species_tree ) throws SDIException {
270         if ( tax_comp_base == null ) {
271             throw new IllegalArgumentException( "taxonomy linking base is null" );
272         }
273         final Map<String, PhylogenyNode> species_to_node_map = new HashMap<String, PhylogenyNode>();
274         final List<PhylogenyNode> species_tree_ext_nodes = new ArrayList<PhylogenyNode>();
275         final NodesLinkingResult res = new NodesLinkingResult();
276         res.setTaxCompBase( tax_comp_base );
277         // Stringyfied taxonomy is the key, node is the value.
278         for( final PhylogenyNodeIterator iter = species_tree.iteratorExternalForward(); iter.hasNext(); ) {
279             final PhylogenyNode s = iter.next();
280             species_tree_ext_nodes.add( s );
281             if ( s.getNodeData().isHasTaxonomy() ) {
282                 final String tax_str = SDIutil.taxonomyToString( s, res.getTaxCompBase() );
283                 if ( !ForesterUtil.isEmpty( tax_str ) ) {
284                     if ( species_to_node_map.containsKey( tax_str ) ) {
285                         throw new SDIException( "taxonomy \"" + tax_str + "\" is not unique in species tree (using "
286                                 + res.getTaxCompBase() + " for linking to gene tree)" );
287                     }
288                     species_to_node_map.put( tax_str, s );
289                 }
290             }
291         }
292         // Retrieve the reference to the node with a matching stringyfied taxonomy.
293         for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
294             final PhylogenyNode g = iter.next();
295             if ( !g.getNodeData().isHasTaxonomy() ) {
296                 if ( strip_gene_tree ) {
297                     res.getStrippedGeneTreeNodes().add( g );
298                 }
299                 else {
300                     throw new SDIException( "gene tree node \"" + g + "\" has no taxonomic data" );
301                 }
302             }
303             else {
304                 final String tax_str = SDIutil.taxonomyToString( g, res.getTaxCompBase() );
305                 if ( ForesterUtil.isEmpty( tax_str ) ) {
306                     if ( strip_gene_tree ) {
307                         res.getStrippedGeneTreeNodes().add( g );
308                     }
309                     else {
310                         throw new SDIException( "gene tree node \"" + g + "\" has no appropriate taxonomic data" );
311                     }
312                 }
313                 else {
314                     PhylogenyNode s = species_to_node_map.get( tax_str );
315                     if ( ( res.getTaxCompBase() == TaxonomyComparisonBase.SCIENTIFIC_NAME ) && ( s == null )
316                             && ( ForesterUtil.countChars( tax_str, ' ' ) > 1 ) ) {
317                         s = tryMapByRemovingOverlySpecificData( species_to_node_map,
318                                                                 tax_str,
319                                                                 res.getScientificNamesMappedToReducedSpecificity() );
320                     }
321                     if ( s == null ) {
322                         if ( strip_gene_tree ) {
323                             res.getStrippedGeneTreeNodes().add( g );
324                         }
325                         else {
326                             throw new SDIException( "taxonomy \"" + g.getNodeData().getTaxonomy()
327                                     + "\" not present in species tree" );
328                         }
329                     }
330                     else {
331                         g.setLink( s );
332                         res.getMappedSpeciesTreeNodes().add( s );
333                     }
334                 }
335             }
336         } // for loop
337         if ( strip_gene_tree ) {
338             stripTree( gene_tree, res.getStrippedGeneTreeNodes() );
339             if ( gene_tree.isEmpty() || ( gene_tree.getNumberOfExternalNodes() < 2 ) ) {
340                 throw new SDIException( "species could not be mapped between gene tree and species tree (based on "
341                         + res.getTaxCompBase() + ")" );
342             }
343         }
344         if ( strip_species_tree ) {
345             stripSpeciesTree( species_tree, species_tree_ext_nodes, res );
346         }
347         return res;
348     }
349
350     private final static void addScientificNamesMappedToReducedSpecificity( final String s1,
351                                                                             final String s2,
352                                                                             final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
353         scientific_names_mapped_to_reduced_specificity.add( s1 + " -> " + s2 );
354     }
355
356     private final static void determineEvent( final PhylogenyNode s,
357                                               final PhylogenyNode g,
358                                               final boolean most_parsimonious_duplication_model,
359                                               final GSDIsummaryResult res ) {
360         boolean oyako = false;
361         if ( ( g.getChildNode1().getLink() == s ) || ( g.getChildNode2().getLink() == s ) ) {
362             oyako = true;
363         }
364         if ( g.getLink().getNumberOfDescendants() == 2 ) {
365             if ( oyako ) {
366                 g.getNodeData().setEvent( Event.createSingleDuplicationEvent() );
367                 res.increaseDuplicationsSum();
368             }
369             else {
370                 g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
371                 res.increaseSpeciationsSum();
372             }
373         }
374         else {
375             if ( oyako ) {
376                 final Set<PhylogenyNode> set = new HashSet<PhylogenyNode>();
377                 for( PhylogenyNode n : g.getChildNode1().getAllExternalDescendants() ) {
378                     n = n.getLink();
379                     while ( ( n.getParent() != s ) && ( n.getParent() != null ) ) {
380                         n = n.getParent();
381                         if ( n.isRoot() ) {
382                             break;
383                         }
384                     }
385                     set.add( n );
386                 }
387                 boolean multiple = false;
388                 for( PhylogenyNode n : g.getChildNode2().getAllExternalDescendants() ) {
389                     n = n.getLink();
390                     while ( ( n.getParent() != s ) && ( n.getParent() != null ) ) {
391                         n = n.getParent();
392                         if ( n.isRoot() ) {
393                             break;
394                         }
395                     }
396                     if ( set.contains( n ) ) {
397                         multiple = true;
398                         break;
399                     }
400                 }
401                 if ( multiple ) {
402                     g.getNodeData().setEvent( Event.createSingleDuplicationEvent() );
403                     res.increaseDuplicationsSum();
404                 }
405                 else {
406                     if ( most_parsimonious_duplication_model ) {
407                         g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
408                         res.increaseSpeciationsSum();
409                     }
410                     else {
411                         g.getNodeData().setEvent( Event.createSingleSpeciationOrDuplicationEvent() );
412                         res.increaseSpeciationOrDuplicationEventsSum();
413                     }
414                 }
415             }
416             else {
417                 g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
418                 res.increaseSpeciationsSum();
419             }
420         }
421     }
422
423     private final static void stripSpeciesTree( final Phylogeny species_tree,
424                                                 final List<PhylogenyNode> species_tree_ext_nodes,
425                                                 final NodesLinkingResult res ) {
426         for( final PhylogenyNode s : species_tree_ext_nodes ) {
427             if ( !res.getMappedSpeciesTreeNodes().contains( s ) ) {
428                 species_tree.deleteSubtree( s, true );
429                 res.getStrippedSpeciesTreeNodes().add( s );
430             }
431         }
432         species_tree.clearHashIdToNodeMap();
433         species_tree.externalNodesHaveChanged();
434     }
435
436     private final static void stripTree( final Phylogeny phy, final List<PhylogenyNode> strip_nodes ) {
437         for( final PhylogenyNode g : strip_nodes ) {
438             phy.deleteSubtree( g, true );
439         }
440         phy.clearHashIdToNodeMap();
441         phy.externalNodesHaveChanged();
442     }
443
444     private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
445                                                                            final String tax_str,
446                                                                            final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
447         PhylogenyNode s = tryMapByRemovingOverlySpecificData( species_to_node_map,
448                                                               tax_str,
449                                                               " (",
450                                                               scientific_names_mapped_to_reduced_specificity );
451         if ( s == null ) {
452             if ( ForesterUtil.countChars( tax_str, ' ' ) == 2 ) {
453                 final String new_tax_str = tax_str.substring( 0, tax_str.lastIndexOf( ' ' ) ).trim();
454                 s = species_to_node_map.get( new_tax_str );
455                 if ( s != null ) {
456                     addScientificNamesMappedToReducedSpecificity( tax_str,
457                                                                   new_tax_str,
458                                                                   scientific_names_mapped_to_reduced_specificity );
459                 }
460             }
461         }
462         if ( s == null ) {
463             for( final String t : new String[] { " subspecies ", " strain ", " variety ", " varietas ", " subvariety ",
464                     " form ", " subform ", " cultivar ", " section ", " subsection " } ) {
465                 s = tryMapByRemovingOverlySpecificData( species_to_node_map,
466                                                         tax_str,
467                                                         t,
468                                                         scientific_names_mapped_to_reduced_specificity );
469                 if ( s != null ) {
470                     break;
471                 }
472             }
473         }
474         return s;
475     }
476
477     private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
478                                                                            final String tax_str,
479                                                                            final String term,
480                                                                            final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
481         final int i = tax_str.indexOf( term );
482         if ( i > 4 ) {
483             final String new_tax_str = tax_str.substring( 0, i ).trim();
484             final PhylogenyNode s = species_to_node_map.get( new_tax_str );
485             if ( s != null ) {
486                 addScientificNamesMappedToReducedSpecificity( tax_str,
487                                                               new_tax_str,
488                                                               scientific_names_mapped_to_reduced_specificity );
489             }
490             return s;
491         }
492         return null;
493     }
494 }