remove single node root
[jalview.git] / forester / java / src / org / forester / sdi / GSDI.java
1 // $Id:
2 // FORESTER -- software libraries and applications
3 // for evolutionary biology research and applications.
4 //
5 // Copyright (C) 2008-2009 Christian M. Zmasek
6 // Copyright (C) 2008-2009 Burnham Institute for Medical Research
7 // All rights reserved
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 //
23 // Contact: phylosoft @ gmail . com
24 // WWW: www.phylosoft.org/forester
25
26 package org.forester.sdi;
27
28 import java.util.ArrayList;
29 import java.util.HashMap;
30 import java.util.HashSet;
31 import java.util.List;
32 import java.util.Map;
33 import java.util.Set;
34 import java.util.SortedSet;
35
36 import org.forester.phylogeny.Phylogeny;
37 import org.forester.phylogeny.PhylogenyMethods;
38 import org.forester.phylogeny.PhylogenyNode;
39 import org.forester.phylogeny.data.Event;
40 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
41 import org.forester.sdi.SDIutil.TaxonomyComparisonBase;
42 import org.forester.util.ForesterUtil;
43
44 public final class GSDI implements GSDII {
45
46     private final boolean                _most_parsimonious_duplication_model;
47     private final int                    _speciation_or_duplication_events_sum;
48     private final int                    _speciations_sum;
49     private final int                    _duplications_sum;
50     private final List<PhylogenyNode>    _stripped_gene_tree_nodes;
51     private final List<PhylogenyNode>    _stripped_species_tree_nodes;
52     private final Set<PhylogenyNode>     _mapped_species_tree_nodes;
53     private final TaxonomyComparisonBase _tax_comp_base;
54     private final SortedSet<String>      _scientific_names_mapped_to_reduced_specificity;
55
56     public GSDI( final Phylogeny gene_tree,
57                  final Phylogeny species_tree,
58                  final boolean most_parsimonious_duplication_model,
59                  final boolean strip_gene_tree,
60                  final boolean strip_species_tree ) throws SDIException {
61         _most_parsimonious_duplication_model = most_parsimonious_duplication_model;
62         if ( gene_tree.getRoot().getNumberOfDescendants() == 3 ) {
63             gene_tree.reRoot( gene_tree.getRoot().getChildNode( 2 ) );
64         }
65         final NodesLinkingResult nodes_linking_result = linkNodesOfG( gene_tree,
66                                                                       species_tree,
67                                                                       null,
68                                                                       strip_gene_tree,
69                                                                       strip_species_tree );
70         _stripped_gene_tree_nodes = nodes_linking_result.getStrippedGeneTreeNodes();
71         _stripped_species_tree_nodes = nodes_linking_result.getStrippedSpeciesTreeNodes();
72         _mapped_species_tree_nodes = nodes_linking_result.getMappedSpeciesTreeNodes();
73         _scientific_names_mapped_to_reduced_specificity = nodes_linking_result
74                 .getScientificNamesMappedToReducedSpecificity();
75         _tax_comp_base = nodes_linking_result.getTaxCompBase();
76         PhylogenyMethods.preOrderReId( species_tree );
77         final GSDIsummaryResult gsdi_summary_result = geneTreePostOrderTraversal( gene_tree,
78                                                                                   _most_parsimonious_duplication_model );
79         _speciation_or_duplication_events_sum = gsdi_summary_result.getSpeciationOrDuplicationEventsSum();
80         _speciations_sum = gsdi_summary_result.getSpeciationsSum();
81         _duplications_sum = gsdi_summary_result.getDuplicationsSum();
82     }
83
84     public int getDuplicationsSum() {
85         return _duplications_sum;
86     }
87
88     @Override
89     public Set<PhylogenyNode> getMappedExternalSpeciesTreeNodes() {
90         return _mapped_species_tree_nodes;
91     }
92
93     @Override
94     public final SortedSet<String> getReMappedScientificNamesFromGeneTree() {
95         return _scientific_names_mapped_to_reduced_specificity;
96     }
97
98     public final int getSpeciationOrDuplicationEventsSum() {
99         return _speciation_or_duplication_events_sum;
100     }
101
102     @Override
103     public final int getSpeciationsSum() {
104         return _speciations_sum;
105     }
106
107     @Override
108     public List<PhylogenyNode> getStrippedExternalGeneTreeNodes() {
109         return _stripped_gene_tree_nodes;
110     }
111
112     @Override
113     public List<PhylogenyNode> getStrippedSpeciesTreeNodes() {
114         return _stripped_species_tree_nodes;
115     }
116
117     @Override
118     public TaxonomyComparisonBase getTaxCompBase() {
119         return _tax_comp_base;
120     }
121
122     @Override
123     public final String toString() {
124         final StringBuffer sb = new StringBuffer();
125         sb.append( "Most parsimonious duplication model: " + _most_parsimonious_duplication_model );
126         sb.append( ForesterUtil.getLineSeparator() );
127         sb.append( "Speciations sum                    : " + getSpeciationsSum() );
128         sb.append( ForesterUtil.getLineSeparator() );
129         sb.append( "Duplications sum                   : " + getDuplicationsSum() );
130         sb.append( ForesterUtil.getLineSeparator() );
131         if ( !_most_parsimonious_duplication_model ) {
132             sb.append( "Speciation or duplications sum     : " + getSpeciationOrDuplicationEventsSum() );
133             sb.append( ForesterUtil.getLineSeparator() );
134         }
135         return sb.toString();
136     }
137
138     /**
139      * Traverses the subtree of PhylogenyNode g in postorder, calculating the
140      * mapping function M, and determines which nodes represent speciation
141      * events and which ones duplication events.
142      * <p>
143      * Preconditions: Mapping M for external nodes must have been calculated and
144      * the species tree must be labeled in preorder.
145      * <p>
146      * @return 
147      * @throws SDIException 
148      * 
149      */
150     final static GSDIsummaryResult geneTreePostOrderTraversal( final Phylogeny gene_tree,
151                                                                final boolean most_parsimonious_duplication_model )
152             throws SDIException {
153         final GSDIsummaryResult res = new GSDIsummaryResult();
154         for( final PhylogenyNodeIterator it = gene_tree.iteratorPostorder(); it.hasNext(); ) {
155             final PhylogenyNode g = it.next();
156             if ( g.isInternal() ) {
157                 if ( g.getNumberOfDescendants() != 2 ) {
158                     throw new SDIException( "gene tree contains internal node with " + g.getNumberOfDescendants()
159                             + " descendents" );
160                 }
161                 PhylogenyNode s1 = g.getChildNode1().getLink();
162                 PhylogenyNode s2 = g.getChildNode2().getLink();
163                 while ( s1 != s2 ) {
164                     if ( s1.getId() > s2.getId() ) {
165                         s1 = s1.getParent();
166                     }
167                     else {
168                         s2 = s2.getParent();
169                     }
170                 }
171                 g.setLink( s1 );
172                 determineEvent( s1, g, most_parsimonious_duplication_model, res );
173             }
174         }
175         return res;
176     }
177
178     /**
179      * This allows for linking of internal nodes of the species tree (as opposed
180      * to just external nodes, as in the method it overrides.
181      * If TaxonomyComparisonBase is null, it will try to determine it.
182      * @throws SDIException 
183      * 
184      */
185     final static NodesLinkingResult linkNodesOfG( final Phylogeny gene_tree,
186                                                   final Phylogeny species_tree,
187                                                   final TaxonomyComparisonBase tax_comp_base,
188                                                   final boolean strip_gene_tree,
189                                                   final boolean strip_species_tree ) throws SDIException {
190         final Map<String, PhylogenyNode> species_to_node_map = new HashMap<String, PhylogenyNode>();
191         final List<PhylogenyNode> species_tree_ext_nodes = new ArrayList<PhylogenyNode>();
192         final NodesLinkingResult res = new NodesLinkingResult();
193         if ( tax_comp_base == null ) {
194             res.setTaxCompBase( SDIutil.determineTaxonomyComparisonBase( gene_tree ) );
195         }
196         else {
197             res.setTaxCompBase( tax_comp_base );
198         }
199         // Stringyfied taxonomy is the key, node is the value.
200         for( final PhylogenyNodeIterator iter = species_tree.iteratorExternalForward(); iter.hasNext(); ) {
201             final PhylogenyNode s = iter.next();
202             species_tree_ext_nodes.add( s );
203             if ( s.getNodeData().isHasTaxonomy() ) {
204                 final String tax_str = SDIutil.taxonomyToString( s, res.getTaxCompBase() );
205                 if ( !ForesterUtil.isEmpty( tax_str ) ) {
206                     if ( species_to_node_map.containsKey( tax_str ) ) {
207                         throw new SDIException( "taxonomy \"" + tax_str + "\" is not unique in species tree (using "
208                                 + res.getTaxCompBase() + " for linking to gene tree)" );
209                     }
210                     species_to_node_map.put( tax_str, s );
211                 }
212             }
213         }
214         // Retrieve the reference to the node with a matching stringyfied taxonomy.
215         for( final PhylogenyNodeIterator iter = gene_tree.iteratorExternalForward(); iter.hasNext(); ) {
216             final PhylogenyNode g = iter.next();
217             if ( !g.getNodeData().isHasTaxonomy() ) {
218                 if ( strip_gene_tree ) {
219                     res.getStrippedGeneTreeNodes().add( g );
220                 }
221                 else {
222                     throw new SDIException( "gene tree node \"" + g + "\" has no taxonomic data" );
223                 }
224             }
225             else {
226                 final String tax_str = SDIutil.taxonomyToString( g, res.getTaxCompBase() );
227                 if ( ForesterUtil.isEmpty( tax_str ) ) {
228                     if ( strip_gene_tree ) {
229                         res.getStrippedGeneTreeNodes().add( g );
230                     }
231                     else {
232                         throw new SDIException( "gene tree node \"" + g + "\" has no appropriate taxonomic data" );
233                     }
234                 }
235                 else {
236                     PhylogenyNode s = species_to_node_map.get( tax_str );
237                     if ( ( res.getTaxCompBase() == TaxonomyComparisonBase.SCIENTIFIC_NAME ) && ( s == null )
238                             && ( ForesterUtil.countChars( tax_str, ' ' ) > 1 ) ) {
239                         s = tryMapByRemovingOverlySpecificData( species_to_node_map,
240                                                                 tax_str,
241                                                                 res.getScientificNamesMappedToReducedSpecificity() );
242                     }
243                     if ( s == null ) {
244                         if ( strip_gene_tree ) {
245                             res.getStrippedGeneTreeNodes().add( g );
246                         }
247                         else {
248                             throw new SDIException( "taxonomy \"" + g.getNodeData().getTaxonomy()
249                                     + "\" not present in species tree" );
250                         }
251                     }
252                     else {
253                         g.setLink( s );
254                         res.getMappedSpeciesTreeNodes().add( s );
255                     }
256                 }
257             }
258         } // for loop
259         if ( strip_gene_tree ) {
260             stripTree( gene_tree, res.getStrippedGeneTreeNodes() );
261             if ( gene_tree.isEmpty() || ( gene_tree.getNumberOfExternalNodes() < 2 ) ) {
262                 throw new SDIException( "species could not be mapped between gene tree and species tree" );
263             }
264         }
265         if ( strip_species_tree ) {
266             stripSpeciesTree( species_tree, species_tree_ext_nodes, res );
267         }
268         return res;
269     }
270
271     private final static void addScientificNamesMappedToReducedSpecificity( final String s1,
272                                                                             final String s2,
273                                                                             final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
274         scientific_names_mapped_to_reduced_specificity.add( s1 + " -> " + s2 );
275     }
276
277     private final static void determineEvent( final PhylogenyNode s,
278                                               final PhylogenyNode g,
279                                               final boolean most_parsimonious_duplication_model,
280                                               final GSDIsummaryResult res ) {
281         boolean oyako = false;
282         if ( ( g.getChildNode1().getLink() == s ) || ( g.getChildNode2().getLink() == s ) ) {
283             oyako = true;
284         }
285         if ( g.getLink().getNumberOfDescendants() == 2 ) {
286             if ( oyako ) {
287                 g.getNodeData().setEvent( Event.createSingleDuplicationEvent() );
288                 res.increaseDuplicationsSum();
289             }
290             else {
291                 g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
292                 res.increaseSpeciationsSum();
293             }
294         }
295         else {
296             if ( oyako ) {
297                 final Set<PhylogenyNode> set = new HashSet<PhylogenyNode>();
298                 for( PhylogenyNode n : g.getChildNode1().getAllExternalDescendants() ) {
299                     n = n.getLink();
300                     while ( n.getParent() != s ) {
301                         n = n.getParent();
302                         if ( n.isRoot() ) {
303                             break;
304                         }
305                     }
306                     set.add( n );
307                 }
308                 boolean multiple = false;
309                 for( PhylogenyNode n : g.getChildNode2().getAllExternalDescendants() ) {
310                     n = n.getLink();
311                     while ( n.getParent() != s ) {
312                         n = n.getParent();
313                         if ( n.isRoot() ) {
314                             break;
315                         }
316                     }
317                     if ( set.contains( n ) ) {
318                         multiple = true;
319                         break;
320                     }
321                 }
322                 if ( multiple ) {
323                     g.getNodeData().setEvent( Event.createSingleDuplicationEvent() );
324                     res.increaseDuplicationsSum();
325                 }
326                 else {
327                     if ( most_parsimonious_duplication_model ) {
328                         g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
329                         res.increaseSpeciationsSum();
330                     }
331                     else {
332                         g.getNodeData().setEvent( Event.createSingleSpeciationOrDuplicationEvent() );
333                         res.increaseSpeciationOrDuplicationEventsSum();
334                     }
335                 }
336             }
337             else {
338                 g.getNodeData().setEvent( Event.createSingleSpeciationEvent() );
339                 res.increaseSpeciationsSum();
340             }
341         }
342     }
343
344     private final static void stripSpeciesTree( final Phylogeny species_tree,
345                                                 final List<PhylogenyNode> species_tree_ext_nodes,
346                                                 final NodesLinkingResult res ) {
347         for( final PhylogenyNode s : species_tree_ext_nodes ) {
348             if ( !res.getMappedSpeciesTreeNodes().contains( s ) ) {
349                 species_tree.deleteSubtree( s, true );
350                 res.getStrippedSpeciesTreeNodes().add( s );
351             }
352         }
353         species_tree.clearHashIdToNodeMap();
354         species_tree.externalNodesHaveChanged();
355     }
356
357     private final static void stripTree( final Phylogeny phy, final List<PhylogenyNode> strip_nodes ) {
358         for( final PhylogenyNode g : strip_nodes ) {
359             phy.deleteSubtree( g, true );
360         }
361         phy.clearHashIdToNodeMap();
362         phy.externalNodesHaveChanged();
363     }
364
365     private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
366                                                                            final String tax_str,
367                                                                            final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
368         PhylogenyNode s = tryMapByRemovingOverlySpecificData( species_to_node_map,
369                                                               tax_str,
370                                                               " (",
371                                                               scientific_names_mapped_to_reduced_specificity );
372         if ( s == null ) {
373             if ( ForesterUtil.countChars( tax_str, ' ' ) == 2 ) {
374                 final String new_tax_str = tax_str.substring( 0, tax_str.lastIndexOf( ' ' ) ).trim();
375                 s = species_to_node_map.get( new_tax_str );
376                 if ( s != null ) {
377                     addScientificNamesMappedToReducedSpecificity( tax_str,
378                                                                   new_tax_str,
379                                                                   scientific_names_mapped_to_reduced_specificity );
380                 }
381             }
382         }
383         if ( s == null ) {
384             for( final String t : new String[] { " subspecies ", " strain ", " variety ", " varietas ", " subvariety ",
385                     " form ", " subform ", " cultivar ", " section ", " subsection " } ) {
386                 s = tryMapByRemovingOverlySpecificData( species_to_node_map,
387                                                         tax_str,
388                                                         t,
389                                                         scientific_names_mapped_to_reduced_specificity );
390                 if ( s != null ) {
391                     break;
392                 }
393             }
394         }
395         return s;
396     }
397
398     private final static PhylogenyNode tryMapByRemovingOverlySpecificData( final Map<String, PhylogenyNode> species_to_node_map,
399                                                                            final String tax_str,
400                                                                            final String term,
401                                                                            final SortedSet<String> scientific_names_mapped_to_reduced_specificity ) {
402         final int i = tax_str.indexOf( term );
403         if ( i > 4 ) {
404             final String new_tax_str = tax_str.substring( 0, i ).trim();
405             final PhylogenyNode s = species_to_node_map.get( new_tax_str );
406             if ( s != null ) {
407                 addScientificNamesMappedToReducedSpecificity( tax_str,
408                                                               new_tax_str,
409                                                               scientific_names_mapped_to_reduced_specificity );
410             }
411             return s;
412         }
413         return null;
414     }
415 }