fixed 2nd rio bug
[jalview.git] / forester / java / src / org / forester / sdi / GSDI.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // All rights reserved
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
25
26 package org.forester.sdi;
27
28 import java.util.ArrayList;
29 import java.util.HashMap;
30 import java.util.HashSet;
31 import java.util.List;
32 import java.util.Map;
33 import java.util.Set;
34 import java.util.SortedSet;
35
36 import org.forester.phylogeny.Phylogeny;
37 import org.forester.phylogeny.PhylogenyMethods;
38 import org.forester.phylogeny.PhylogenyNode;
39 import org.forester.phylogeny.data.Event;
40 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
41 import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
42 import org.forester.util.ForesterUtil;
43
44 public final class GSDI implements GSDII {
45
46     private final boolean                _most_parsimonious_duplication_model;
47     private final int                    _speciation_or_duplication_events_sum;
48     private final int                    _speciations_sum;
49     private final int                    _duplications_sum;
50     private final List<PhylogenyNode>    _stripped_gene_tree_nodes;
51     private final List<PhylogenyNode>    _stripped_species_tree_nodes;
52     private final Set<PhylogenyNode>     _mapped_species_tree_nodes;
53     private final TaxonomyComparisonBase _tax_comp_base;
54     private final SortedSet<String>      _scientific_names_mapped_to_reduced_specificity;
55
56     public GSDI( final Phylogeny gene_tree,
57                  final Phylogeny species_tree,
58                  final boolean most_parsimonious_duplication_model,
59                  final boolean strip_gene_tree,
60                  final boolean strip_species_tree ) throws SDIException {
61         _most_parsimonious_duplication_model = most_parsimonious_duplication_model;
62         if ( gene_tree.getRoot().getNumberOfDescendants() == 3 ) {
63             gene_tree.reRoot( gene_tree.getRoot().getChildNode( 2 ) );
64         }
65         final NodesLinkingResult nodes_linking_result = linkNodesOfG( gene_tree,
66                                                                       species_tree,
67                                                                       strip_gene_tree,
68                                                                       strip_species_tree );
69         _stripped_gene_tree_nodes = nodes_linking_result.getStrippedGeneTreeNodes();
70         _stripped_species_tree_nodes = nodes_linking_result.getStrippedSpeciesTreeNodes();
71         _mapped_species_tree_nodes = nodes_linking_result.getMappedSpeciesTreeNodes();
72         _scientific_names_mapped_to_reduced_specificity = nodes_linking_result
73                 .getScientificNamesMappedToReducedSpecificity();
74         _tax_comp_base = nodes_linking_result.getTaxCompBase();
75         PhylogenyMethods.preOrderReId( species_tree );
76         final GSDIsummaryResult gsdi_summary_result = geneTreePostOrderTraversal( gene_tree,
77                                                                                   _most_parsimonious_duplication_model );
78         _speciation_or_duplication_events_sum = gsdi_summary_result.getSpeciationOrDuplicationEventsSum();
79         _speciations_sum = gsdi_summary_result.getSpeciationsSum();
80         _duplications_sum = gsdi_summary_result.getDuplicationsSum();
81     }
82
83     public int getDuplicationsSum() {
84         return _duplications_sum;
85     }
86
87     @Override
88     public Set<PhylogenyNode> getMappedExternalSpeciesTreeNodes() {
89         return _mapped_species_tree_nodes;
90     }
91
92     @Override
93     public final SortedSet<String> getReMappedScientificNamesFromGeneTree() {
94         return _scientific_names_mapped_to_reduced_specificity;
95     }
96
97     public final int getSpeciationOrDuplicationEventsSum() {
98         return _speciation_or_duplication_events_sum;
99     }
100
101     @Override
102     public final int getSpeciationsSum() {
103         return _speciations_sum;
104     }
105
106     @Override
107     public List<PhylogenyNode> getStrippedExternalGeneTreeNodes() {
108         return _stripped_gene_tree_nodes;
109     }
110
111     @Override
112     public List<PhylogenyNode> getStrippedSpeciesTreeNodes() {
113         return _stripped_species_tree_nodes;
114     }
115
116     @Override
117     public TaxonomyComparisonBase getTaxCompBase() {
118         return _tax_comp_base;
119     }
120
121     @Override
122     public final String toString() {
123         final StringBuffer sb = new StringBuffer();
124         sb.append( "Most parsimonious duplication model: " + _most_parsimonious_duplication_model );
125         sb.append( ForesterUtil.getLineSeparator() );
126         sb.append( "Speciations sum                    : " + getSpeciationsSum() );
127         sb.append( ForesterUtil.getLineSeparator() );
128         sb.append( "Duplications sum                   : " + getDuplicationsSum() );
129         sb.append( ForesterUtil.getLineSeparator() );
130         if ( !_most_parsimonious_duplication_model ) {
131             sb.append( "Speciation or duplications sum     : " + getSpeciationOrDuplicationEventsSum() );
132             sb.append( ForesterUtil.getLineSeparator() );
133         }
134         return sb.toString();
135     }
136
137     /**
138      * Traverses the subtree of PhylogenyNode g in postorder, calculating the
139      * mapping function M, and determines which nodes represent speciation
140      * events and which ones duplication events.
141      * <p>
142      * Preconditions: Mapping M for external nodes must have been calculated and
143      * the species tree must be labeled in preorder.
144      * <p>
145      * @return 
146      * @throws SDIException 
147      * 
148      */
149     final static GSDIsummaryResult geneTreePostOrderTraversal( final Phylogeny gene_tree,
150                                                                final boolean most_parsimonious_duplication_model )
151             throws SDIException {
152         final GSDIsummaryResult res = new GSDIsummaryResult();
153         for( final PhylogenyNodeIterator it = gene_tree.iteratorPostorder(); it.hasNext(); ) {
154             final PhylogenyNode g = it.next();
155             if ( g.isInternal() ) {
156                 if ( g.getNumberOfDescendants() != 2 ) {
157                     throw new SDIException( "gene tree contains internal node with " + g.getNumberOfDescendants()
158                             + " descendents" );
159                 }
160                 PhylogenyNode s1 = g.getChildNode1().getLink();
161                 PhylogenyNode s2 = g.getChildNode2().getLink();
162                 while ( s1 != s2 ) {
163                     if ( s1.getId() > s2.getId() ) {
164                         s1 = s1.getParent();
165                     }
166                     else {
167                         s2 = s2.getParent();
168                     }
169                 }
170                 g.setLink( s1 );
171                 determineEvent( s1, g, most_parsimonious_duplication_model, res );
172             }
173         }
174         return res;
175     }
176
177     final static GSDIsummaryResult geneTreePostOrderTraversal( final Phylogeny gene_tree,
178                                                                final boolean most_parsimonious_duplication_model,
179                                                                final int min_duplications ) throws SDIException {
180         final GSDIsummaryResult res = new GSDIsummaryResult();
181         for( final PhylogenyNodeIterator it = gene_tree.iteratorPostorder(); it.hasNext(); ) {
182             final PhylogenyNode g = it.next();
183             if ( g.isInternal() ) {
184                 if ( g.getNumberOfDescendants() != 2 ) {
185                     throw new SDIException( "gene tree contains internal node with " + g.getNumberOfDescendants()
186                             + " descendents" );
187                 }
188                 PhylogenyNode s1 = g.getChildNode1().getLink();
189                 PhylogenyNode s2 = g.getChildNode2().getLink();
190                 while ( s1 != s2 ) {
191                     if ( s1.getId() > s2.getId() ) {
192                         s1 = s1.getParent();
193                     }
194                     else {
195                         s2 = s2.getParent();
196                     }
197                 }
198                 g.setLink( s1 );
199                 determineEvent( s1, g, most_parsimonious_duplication_model, res );
200                 if ( res.getDuplicationsSum() > min_duplications ) {
201                     return null;
202                 }
203             }
204         }
205         return res;
206     }
207
208     final static NodesLinkingResult linkNodesOfG( final Phylogeny gene_tree,
209                                                   final Phylogeny species_tree,
210                                                   final boolean strip_gene_tree,
211                                                   final boolean strip_species_tree ) throws SDIException {
212         final TaxonomyComparisonBase tax_comp_base = SDIutil.determineTaxonomyComparisonBase( gene_tree );
213         if ( tax_comp_base == null ) {
214             throw new RuntimeException( "failed to establish taxonomy linking base (taxonomy linking base is null)" );
215         }
216         return linkNodesOfG( gene_tree, species_tree, tax_comp_base, strip_gene_tree, strip_species_tree );
217     }
218
219     /**
220      * This allows for linking of internal nodes of the species tree (as opposed
221      * to just external nodes, as in the method it overrides.
222      * If TaxonomyComparisonBase is null, it will try to determine it.
223      * @throws SDIException 
224      * 
225      */
226     final static NodesLinkingResult linkNodesOfG( final Phylogeny gene_tree,
227                                                   final Phylogeny species_tree,
228                                                   final TaxonomyComparisonBase tax_comp_base,
229                                                   final boolean strip_gene_tree,
230                                                   final boolean strip_species_tree ) throws SDIException {
231         if ( tax_comp_base == null ) {
232             throw new IllegalArgumentException( "taxonomy linking base is null" );
233         }
234         final Map<String, PhylogenyNode> species_to_node_map = new HashMap<String, PhylogenyNode>();
235         final List<PhylogenyNode> species_tree_ext_nodes = new ArrayList<PhylogenyNode>();
236         final NodesLinkingResult res = new NodesLinkingResult();
237         res.setTaxCompBase( tax_comp_base );
238         // Stringyfied taxonomy is the key, node is the value.
239         for( final PhylogenyNodeIterator iter = species_tree.iteratorExternalForward(); iter.hasNext(); ) {
240             final PhylogenyNode s = iter.next();
241             species_tree_ext_nodes.add( s );
242             if ( s.getNodeData().isHasTaxonomy() ) {
243                 final String tax_str = SDIutil.taxonomyToString( s, res.getTaxCompBase() );
244                 if ( !ForesterUtil.isEmpty( tax_str ) ) {
245                     if ( species_to_node_map.containsKey( tax_str ) ) {
246                         throw new SDIException( "taxonomy \"" + tax_str + "\" is not unique in species tree (using "
247                                 + res.getTaxCompBase() + " for linking to gene tree)" );
248                     }
249                     species_to_node_map.put( tax_str, s );
250                 }
251             }
252         }
253         // Retrieve the reference to the node with a matching stringyfied taxonomy.
254         for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
255             final PhylogenyNode g = iter.next();
256             if ( !g.getNodeData().isHasTaxonomy() ) {
257                 if ( strip_gene_tree ) {
258                     res.getStrippedGeneTreeNodes().add( g );
259                 }
260                 else {
261                     throw new SDIException( "gene tree node \"" + g + "\" has no taxonomic data" );
262                 }
263             }
264             else {
265                 final String tax_str = SDIutil.taxonomyToString( g, res.getTaxCompBase() );
266                 if ( ForesterUtil.isEmpty( tax_str ) ) {
267                     if ( strip_gene_tree ) {
268                         res.getStrippedGeneTreeNodes().add( g );
269                     }
270                     else {
271                         throw new SDIException( "gene tree node \"" + g + "\" has no appropriate taxonomic data" );
272                     }
273                 }
274                 else {
275                     PhylogenyNode s = species_to_node_map.get( tax_str );
276                     if ( ( res.getTaxCompBase() == TaxonomyComparisonBase.SCIENTIFIC_NAME ) && ( s == null )
277                             && ( ForesterUtil.countChars( tax_str, ' ' ) > 1 ) ) {
278                         s = tryMapByRemovingOverlySpecificData( species_to_node_map,
279                                                                 tax_str,
280                                                                 res.getScientificNamesMappedToReducedSpecificity() );
281                     }
282                     if ( s == null ) {
283                         if ( strip_gene_tree ) {
284                             res.getStrippedGeneTreeNodes().add( g );
285                         }
286                         else {
287                             throw new SDIException( "taxonomy \"" + g.getNodeData().getTaxonomy()
288                                     + "\" not present in species tree" );
289                         }
290                     }
291                     else {
292                         g.setLink( s );
293                         res.getMappedSpeciesTreeNodes().add( s );
294                     }
295                 }
296             }
297         } // for loop
298         if ( strip_gene_tree ) {
299             stripTree( gene_tree, res.getStrippedGeneTreeNodes() );
300             if ( gene_tree.isEmpty() || ( gene_tree.getNumberOfExternalNodes() < 2 ) ) {
301                 throw new SDIException( "species could not be mapped between gene tree and species tree (based on "
302                         + res.getTaxCompBase() + ")" );
303             }
304         }
305         if ( strip_species_tree ) {
306             stripSpeciesTree( species_tree, species_tree_ext_nodes, res );
307         }
308         return res;
309     }
310
311     private final static void addScientificNamesMappedToReducedSpecificity( final String s1,
312                                                                             final String s2,
313                                                                             final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
314         scientific_names_mapped_to_reduced_specificity.add( s1 + " -> " + s2 );
315     }
316
317     private final static void determineEvent( final PhylogenyNode s,
318                                               final PhylogenyNode g,
319                                               final boolean most_parsimonious_duplication_model,
320                                               final GSDIsummaryResult res ) {
321         boolean oyako = false;
322         if ( ( g.getChildNode1().getLink() == s ) || ( g.getChildNode2().getLink() == s ) ) {
323             oyako = true;
324         }
325         if ( g.getLink().getNumberOfDescendants() == 2 ) {
326             if ( oyako ) {
327                 g.getNodeData().setEvent( Event.createSingleDuplicationEvent() );
328                 res.increaseDuplicationsSum();
329             }
330             else {
331                 g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
332                 res.increaseSpeciationsSum();
333             }
334         }
335         else {
336             if ( oyako ) {
337                 final Set<PhylogenyNode> set = new HashSet<PhylogenyNode>();
338                 for( PhylogenyNode n : g.getChildNode1().getAllExternalDescendants() ) {
339                     n = n.getLink();
340                     while ( ( n.getParent() != s ) && ( n.getParent() != null ) ) {
341                         n = n.getParent();
342                         if ( n.isRoot() ) {
343                             break;
344                         }
345                     }
346                     set.add( n );
347                 }
348                 boolean multiple = false;
349                 for( PhylogenyNode n : g.getChildNode2().getAllExternalDescendants() ) {
350                     n = n.getLink();
351                     while ( ( n.getParent() != s ) && ( n.getParent() != null ) ) {
352                         n = n.getParent();
353                         if ( n.isRoot() ) {
354                             break;
355                         }
356                     }
357                     if ( set.contains( n ) ) {
358                         multiple = true;
359                         break;
360                     }
361                 }
362                 if ( multiple ) {
363                     g.getNodeData().setEvent( Event.createSingleDuplicationEvent() );
364                     res.increaseDuplicationsSum();
365                 }
366                 else {
367                     if ( most_parsimonious_duplication_model ) {
368                         g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
369                         res.increaseSpeciationsSum();
370                     }
371                     else {
372                         g.getNodeData().setEvent( Event.createSingleSpeciationOrDuplicationEvent() );
373                         res.increaseSpeciationOrDuplicationEventsSum();
374                     }
375                 }
376             }
377             else {
378                 g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
379                 res.increaseSpeciationsSum();
380             }
381         }
382     }
383
384     private final static void stripSpeciesTree( final Phylogeny species_tree,
385                                                 final List<PhylogenyNode> species_tree_ext_nodes,
386                                                 final NodesLinkingResult res ) {
387         for( final PhylogenyNode s : species_tree_ext_nodes ) {
388             if ( !res.getMappedSpeciesTreeNodes().contains( s ) ) {
389                 species_tree.deleteSubtree( s, true );
390                 res.getStrippedSpeciesTreeNodes().add( s );
391             }
392         }
393         species_tree.clearHashIdToNodeMap();
394         species_tree.externalNodesHaveChanged();
395     }
396
397     private final static void stripTree( final Phylogeny phy, final List<PhylogenyNode> strip_nodes ) {
398         for( final PhylogenyNode g : strip_nodes ) {
399             phy.deleteSubtree( g, true );
400         }
401         phy.clearHashIdToNodeMap();
402         phy.externalNodesHaveChanged();
403     }
404
405     private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
406                                                                            final String tax_str,
407                                                                            final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
408         PhylogenyNode s = tryMapByRemovingOverlySpecificData( species_to_node_map,
409                                                               tax_str,
410                                                               " (",
411                                                               scientific_names_mapped_to_reduced_specificity );
412         if ( s == null ) {
413             if ( ForesterUtil.countChars( tax_str, ' ' ) == 2 ) {
414                 final String new_tax_str = tax_str.substring( 0, tax_str.lastIndexOf( ' ' ) ).trim();
415                 s = species_to_node_map.get( new_tax_str );
416                 if ( s != null ) {
417                     addScientificNamesMappedToReducedSpecificity( tax_str,
418                                                                   new_tax_str,
419                                                                   scientific_names_mapped_to_reduced_specificity );
420                 }
421             }
422         }
423         if ( s == null ) {
424             for( final String t : new String[] { " subspecies ", " strain ", " variety ", " varietas ", " subvariety ",
425                     " form ", " subform ", " cultivar ", " section ", " subsection " } ) {
426                 s = tryMapByRemovingOverlySpecificData( species_to_node_map,
427                                                         tax_str,
428                                                         t,
429                                                         scientific_names_mapped_to_reduced_specificity );
430                 if ( s != null ) {
431                     break;
432                 }
433             }
434         }
435         return s;
436     }
437
438     private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
439                                                                            final String tax_str,
440                                                                            final String term,
441                                                                            final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
442         final int i = tax_str.indexOf( term );
443         if ( i > 4 ) {
444             final String new_tax_str = tax_str.substring( 0, i ).trim();
445             final PhylogenyNode s = species_to_node_map.get( new_tax_str );
446             if ( s != null ) {
447                 addScientificNamesMappedToReducedSpecificity( tax_str,
448                                                               new_tax_str,
449                                                               scientific_names_mapped_to_reduced_specificity );
450             }
451             return s;
452         }
453         return null;
454     }
455 }