initial commit
[jalview.git] / forester / java / src / org / forester / analysis / AncestralTaxonomyInference.java
1 // forester -- software libraries and applications
2 // for genomics and evolutionary biology research.
3 //
4 // Copyright (C) 2010 Christian M Zmasek
5 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
6 // All rights reserved
7 // 
8 // This library is free software; you can redistribute it and/or
9 // modify it under the terms of the GNU Lesser General Public
10 // License as published by the Free Software Foundation; either
11 // version 2.1 of the License, or (at your option) any later version.
12 //
13 // This library is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 // Lesser General Public License for more details.
17 // 
18 // You should have received a copy of the GNU Lesser General Public
19 // License along with this library; if not, write to the Free Software
20 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
21 //
22 // Contact: phylosoft @ gmail . com
23 // WWW: www.phylosoft.org/forester
24
25 package org.forester.analysis;
26
27 import java.io.IOException;
28 import java.util.ArrayList;
29 import java.util.HashMap;
30 import java.util.List;
31 import java.util.SortedSet;
32 import java.util.TreeSet;
33
34 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
35 import org.forester.phylogeny.Phylogeny;
36 import org.forester.phylogeny.PhylogenyNode;
37 import org.forester.phylogeny.data.Identifier;
38 import org.forester.phylogeny.data.Taxonomy;
39 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
40 import org.forester.util.ForesterUtil;
41 import org.forester.ws.uniprot.UniProtTaxonomy;
42 import org.forester.ws.uniprot.UniProtWsTools;
43
44 public final class AncestralTaxonomyInference {
45
46         private static final int MAX_CACHE_SIZE = 100000;
47         private static final int MAX_TAXONOMIES_TO_RETURN = 100;
48         private static final HashMap<String, UniProtTaxonomy> _sn_up_cache_map = new HashMap<String, UniProtTaxonomy>();
49         private static final HashMap<String, UniProtTaxonomy> _code_up_cache_map = new HashMap<String, UniProtTaxonomy>();
50         private static final HashMap<String, UniProtTaxonomy> _cn_up_cache_map = new HashMap<String, UniProtTaxonomy>();
51         private static final HashMap<String, UniProtTaxonomy> _id_up_cache_map = new HashMap<String, UniProtTaxonomy>();
52
53         synchronized private static void clearCachesIfTooLarge() {
54                 if (getSnTaxCacheMap().size() > MAX_CACHE_SIZE) {
55                         getSnTaxCacheMap().clear();
56                 }
57                 if (getCnTaxCacheMap().size() > MAX_CACHE_SIZE) {
58                         getCnTaxCacheMap().clear();
59                 }
60                 if (getCodeTaxCacheMap().size() > MAX_CACHE_SIZE) {
61                         getCodeTaxCacheMap().clear();
62                 }
63                 if (getIdTaxCacheMap().size() > MAX_CACHE_SIZE) {
64                         getIdTaxCacheMap().clear();
65                 }
66         }
67
68         synchronized private static HashMap<String, UniProtTaxonomy> getCnTaxCacheMap() {
69                 return _cn_up_cache_map;
70         }
71
72         synchronized private static HashMap<String, UniProtTaxonomy> getCodeTaxCacheMap() {
73                 return _code_up_cache_map;
74         }
75
76         synchronized private static HashMap<String, UniProtTaxonomy> getIdTaxCacheMap() {
77                 return _id_up_cache_map;
78         }
79
80         synchronized private static HashMap<String, UniProtTaxonomy> getSnTaxCacheMap() {
81                 return _sn_up_cache_map;
82         }
83
84         synchronized private static UniProtTaxonomy getTaxonomies(
85                         final HashMap<String, UniProtTaxonomy> cache, final String query,
86                         final QUERY_TYPE qt) throws IOException {
87                 if (cache.containsKey(query)) {
88                         return cache.get(query).copy();
89                 } else {
90                         List<UniProtTaxonomy> up_taxonomies = null;
91                         switch (qt) {
92                         case ID:
93                                 up_taxonomies = getTaxonomiesFromId(query);
94                                 break;
95                         case CODE:
96                                 up_taxonomies = getTaxonomiesFromTaxonomyCode(query);
97                                 break;
98                         case SN:
99                                 up_taxonomies = getTaxonomiesFromScientificName(query);
100                                 break;
101                         case CN:
102                                 up_taxonomies = getTaxonomiesFromCommonName(query);
103                                 break;
104                         default:
105                                 throw new RuntimeException();
106                         }
107                         if ((up_taxonomies != null) && (up_taxonomies.size() == 1)) {
108                                 final UniProtTaxonomy up_tax = up_taxonomies.get(0);
109                                 if (!ForesterUtil.isEmpty(up_tax.getScientificName())) {
110                                         getSnTaxCacheMap().put(up_tax.getScientificName(), up_tax);
111                                 }
112                                 if (!ForesterUtil.isEmpty(up_tax.getCode())) {
113                                         getCodeTaxCacheMap().put(up_tax.getCode(), up_tax);
114                                 }
115                                 if (!ForesterUtil.isEmpty(up_tax.getCommonName())) {
116                                         getCnTaxCacheMap().put(up_tax.getCommonName(), up_tax);
117                                 }
118                                 if (!ForesterUtil.isEmpty(up_tax.getId())) {
119                                         getIdTaxCacheMap().put(up_tax.getId(), up_tax);
120                                 }
121                                 return up_tax;
122                         } else {
123                                 return null;
124                         }
125                 }
126         }
127
128         synchronized private static List<UniProtTaxonomy> getTaxonomiesFromCommonName(
129                         final String query) throws IOException {
130                 return UniProtWsTools.getTaxonomiesFromCommonNameStrict(query,
131                                 MAX_TAXONOMIES_TO_RETURN);
132         }
133
134         synchronized private static List<UniProtTaxonomy> getTaxonomiesFromId(
135                         final String query) throws IOException {
136                 return UniProtWsTools.getTaxonomiesFromId(query,
137                                 MAX_TAXONOMIES_TO_RETURN);
138         }
139
140         synchronized private static List<UniProtTaxonomy> getTaxonomiesFromScientificName(
141                         final String query) throws IOException {
142                 return UniProtWsTools.getTaxonomiesFromScientificNameStrict(query,
143                                 MAX_TAXONOMIES_TO_RETURN);
144         }
145
146         synchronized private static List<UniProtTaxonomy> getTaxonomiesFromTaxonomyCode(
147                         final String query) throws IOException {
148                 return UniProtWsTools.getTaxonomiesFromTaxonomyCode(query,
149                                 MAX_TAXONOMIES_TO_RETURN);
150         }
151
152         synchronized public static SortedSet<String> inferTaxonomyFromDescendents(
153                         final Phylogeny phy) throws IOException {
154                 clearCachesIfTooLarge();
155                 final SortedSet<String> not_found = new TreeSet<String>();
156                 for (final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter
157                                 .hasNext();) {
158                         final PhylogenyNode node = iter.next();
159                         // final QUERY_TYPE qt = null;
160                         // Taxonomy tax = null;
161                         // if ( node.getNodeData().isHasTaxonomy() ) {
162                         // tax = node.getNodeData().getTaxonomy();
163                         // }
164                         // UniProtTaxonomy up_tax = null;
165                         // if ( ( tax != null )
166                         // && ( isHasAppropriateId( tax ) || !ForesterUtil.isEmpty(
167                         // tax.getScientificName() )
168                         // || !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ||
169                         // !ForesterUtil.isEmpty( tax
170                         // .getCommonName() ) ) ) {
171                         // final String query = null;
172                         // up_tax = obtainUniProtTaxonomy( tax, query, qt );
173                         // if ( up_tax == null ) {
174                         // not_found.add( query );
175                         // }
176                         // else {
177                         // updateTaxonomy( qt, node, tax, up_tax );
178                         // }
179                         // }
180                         if (!node.isExternal()) {
181                                 inferTaxonomyFromDescendents(node, not_found);
182                         }
183                 }
184                 return not_found;
185         }
186
187         synchronized private static void inferTaxonomyFromDescendents(
188                         final PhylogenyNode n, final SortedSet<String> not_found)
189                         throws IOException {
190                 if (n.isExternal()) {
191                         throw new IllegalArgumentException(
192                                         "attempt to infer taxonomy from descendants of external node");
193                 }
194                 n.getNodeData().setTaxonomy(null);
195                 final List<PhylogenyNode> descs = n.getDescendants();
196                 final List<String[]> lineages = new ArrayList<String[]>();
197                 int shortest_lin_length = Integer.MAX_VALUE;
198                 for (final PhylogenyNode desc : descs) {
199                         if (desc.getNodeData().isHasTaxonomy()
200                                         && (isHasAppropriateId(desc.getNodeData().getTaxonomy())
201                                                         || !ForesterUtil.isEmpty(desc.getNodeData()
202                                                                         .getTaxonomy().getScientificName())
203                                                         || !ForesterUtil.isEmpty(desc.getNodeData()
204                                                                         .getTaxonomy().getTaxonomyCode()) || !ForesterUtil
205                                                         .isEmpty(desc.getNodeData().getTaxonomy()
206                                                                         .getCommonName()))) {
207                                 final QUERY_TYPE qt = null;
208                                 final String query = null;
209                                 final UniProtTaxonomy up_tax = obtainUniProtTaxonomy(desc
210                                                 .getNodeData().getTaxonomy(), query, qt);
211                                 String[] lineage = null;
212                                 if (up_tax != null) {
213                                         lineage = obtainLineagePlusOwnScientificName(up_tax);
214                                 }
215                                 if ((lineage == null) || (lineage.length < 1)) {
216                                         not_found.add(desc.getNodeData().getTaxonomy().asText()
217                                                         .toString());
218                                         return;
219                                 }
220                                 if (lineage.length < shortest_lin_length) {
221                                         shortest_lin_length = lineage.length;
222                                 }
223                                 lineages.add(lineage);
224                         } else {
225                                 String msg = "Node(s) with no or inappropriate taxonomic information found";
226                                 if (!ForesterUtil.isEmpty(desc.getName())) {
227                                         msg = "Node " + desc.getName()
228                                                         + " has no or inappropriate taxonomic information";
229                                 }
230                                 throw new IllegalArgumentException(msg);
231                         }
232                 }
233                 String last_common_lineage = null;
234                 if (shortest_lin_length > 0) {
235                         I: for (int i = 0; i < shortest_lin_length; ++i) {
236                                 final String lineage_0 = lineages.get(0)[i];
237                                 for (int j = 1; j < lineages.size(); ++j) {
238                                         if (!lineage_0.equals(lineages.get(j)[i])) {
239                                                 break I;
240                                         }
241                                 }
242                                 last_common_lineage = lineage_0;
243                         }
244                 }
245                 if (last_common_lineage == null) {
246                         return;
247                 }
248                 // if ( !n.getNodeData().isHasTaxonomy() ) {
249                 // n.getNodeData().setTaxonomy( new Taxonomy() );
250                 // }
251                 final Taxonomy tax = new Taxonomy();
252                 n.getNodeData().setTaxonomy(tax);
253                 tax.setScientificName(last_common_lineage);
254                 final UniProtTaxonomy up_tax = obtainUniProtTaxonomyFromSn(last_common_lineage);
255                 if (up_tax != null) {
256                         if (!ForesterUtil.isEmpty(up_tax.getRank())) {
257                                 try {
258                                         tax.setRank(up_tax.getRank().toLowerCase());
259                                 } catch (final PhyloXmlDataFormatException ex) {
260                                         tax.setRank("");
261                                 }
262                         }
263                         if (!ForesterUtil.isEmpty(up_tax.getId())) {
264                                 tax.setIdentifier(new Identifier(up_tax.getId(), "uniprot"));
265                         }
266                         if (!ForesterUtil.isEmpty(up_tax.getCommonName())) {
267                                 tax.setCommonName(up_tax.getCommonName());
268                         }
269                         if (!ForesterUtil.isEmpty(up_tax.getSynonym())
270                                         && !tax.getSynonyms().contains(up_tax.getSynonym())) {
271                                 tax.getSynonyms().add(up_tax.getSynonym());
272                         }
273                 }
274                 for (final PhylogenyNode desc : descs) {
275                         if (!desc.isExternal() && desc.getNodeData().isHasTaxonomy()
276                                         && desc.getNodeData().getTaxonomy().isEqual(tax)) {
277                                 desc.getNodeData().setTaxonomy(null);
278                         }
279                 }
280         }
281
282         synchronized private static boolean isHasAppropriateId(final Taxonomy tax) {
283                 return ((tax.getIdentifier() != null) && (!ForesterUtil.isEmpty(tax
284                                 .getIdentifier().getValue()) && (tax.getIdentifier()
285                                 .getProvider().equalsIgnoreCase("ncbi")
286                                 || tax.getIdentifier().getProvider()
287                                                 .equalsIgnoreCase("uniprot") || tax.getIdentifier()
288                                 .getProvider().equalsIgnoreCase("uniprotkb"))));
289         }
290
291         synchronized public static SortedSet<String> obtainDetailedTaxonomicInformation(
292                         final Phylogeny phy) throws IOException {
293                 clearCachesIfTooLarge();
294                 final SortedSet<String> not_found = new TreeSet<String>();
295                 for (final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter
296                                 .hasNext();) {
297                         final PhylogenyNode node = iter.next();
298                         final QUERY_TYPE qt = null;
299                         Taxonomy tax = null;
300                         if (node.getNodeData().isHasTaxonomy()) {
301                                 tax = node.getNodeData().getTaxonomy();
302                         } else if (node.isExternal()) {
303                                 if (!ForesterUtil.isEmpty(node.getName())) {
304                                         not_found.add(node.getName());
305                                 } else {
306                                         not_found.add(node.toString());
307                                 }
308                         }
309                         UniProtTaxonomy up_tax = null;
310                         if ((tax != null)
311                                         && (isHasAppropriateId(tax)
312                                                         || !ForesterUtil.isEmpty(tax.getScientificName())
313                                                         || !ForesterUtil.isEmpty(tax.getTaxonomyCode()) || !ForesterUtil
314                                                         .isEmpty(tax.getCommonName()))) {
315                                 up_tax = obtainUniProtTaxonomy(tax, null, qt);
316                                 if (up_tax != null) {
317                                         updateTaxonomy(qt, node, tax, up_tax);
318                                 } else {
319                                         not_found.add(tax.toString());
320                                 }
321                         }
322                 }
323                 return not_found;
324         }
325
326         synchronized private static String[] obtainLineagePlusOwnScientificName(
327                         final UniProtTaxonomy up_tax) {
328                 final String[] lineage = up_tax.getLineage();
329                 final String[] lin_plus_self = new String[lineage.length + 1];
330                 for (int i = 0; i < lineage.length; ++i) {
331                         lin_plus_self[i] = lineage[i];
332                 }
333                 lin_plus_self[lineage.length] = up_tax.getScientificName();
334                 return lin_plus_self;
335         }
336
337         synchronized private static UniProtTaxonomy obtainUniProtTaxonomy(
338                         final Taxonomy tax, String query, QUERY_TYPE qt) throws IOException {
339                 if (isHasAppropriateId(tax)) {
340                         query = tax.getIdentifier().getValue();
341                         qt = QUERY_TYPE.ID;
342                         return getTaxonomies(getIdTaxCacheMap(), query, qt);
343                 } else if (!ForesterUtil.isEmpty(tax.getScientificName())) {
344                         query = tax.getScientificName();
345                         qt = QUERY_TYPE.SN;
346                         return getTaxonomies(getSnTaxCacheMap(), query, qt);
347                 } else if (!ForesterUtil.isEmpty(tax.getTaxonomyCode())) {
348                         query = tax.getTaxonomyCode();
349                         qt = QUERY_TYPE.CODE;
350                         return getTaxonomies(getCodeTaxCacheMap(), query, qt);
351                 } else {
352                         query = tax.getCommonName();
353                         qt = QUERY_TYPE.CN;
354                         return getTaxonomies(getCnTaxCacheMap(), query, qt);
355                 }
356         }
357
358         synchronized private static UniProtTaxonomy obtainUniProtTaxonomyFromSn(
359                         final String sn) throws IOException {
360                 UniProtTaxonomy up_tax = null;
361                 if (getSnTaxCacheMap().containsKey(sn)) {
362                         up_tax = getSnTaxCacheMap().get(sn).copy();
363                 } else {
364                         final List<UniProtTaxonomy> up_taxonomies = getTaxonomiesFromScientificName(sn);
365                         if ((up_taxonomies != null) && (up_taxonomies.size() == 1)) {
366                                 up_tax = up_taxonomies.get(0);
367                                 getSnTaxCacheMap().put(sn, up_tax);
368                                 if (!ForesterUtil.isEmpty(up_tax.getCode())) {
369                                         getCodeTaxCacheMap().put(up_tax.getCode(), up_tax);
370                                 }
371                                 if (!ForesterUtil.isEmpty(up_tax.getCommonName())) {
372                                         getCnTaxCacheMap().put(up_tax.getCommonName(), up_tax);
373                                 }
374                                 if (!ForesterUtil.isEmpty(up_tax.getId())) {
375                                         getIdTaxCacheMap().put(up_tax.getId(), up_tax);
376                                 }
377                         }
378                 }
379                 return up_tax;
380         }
381
382         synchronized private static void updateTaxonomy(final QUERY_TYPE qt,
383                         final PhylogenyNode node, final Taxonomy tax,
384                         final UniProtTaxonomy up_tax) {
385                 if ((qt != QUERY_TYPE.SN)
386                                 && !ForesterUtil.isEmpty(up_tax.getScientificName())
387                                 && ForesterUtil.isEmpty(tax.getScientificName())) {
388                         tax.setScientificName(up_tax.getScientificName());
389                 }
390                 if (node.isExternal()
391                                 && ((qt != QUERY_TYPE.CODE)
392                                                 && !ForesterUtil.isEmpty(up_tax.getCode()) && ForesterUtil
393                                                 .isEmpty(tax.getTaxonomyCode()))) {
394                         tax.setTaxonomyCode(up_tax.getCode());
395                 }
396                 if ((qt != QUERY_TYPE.CN)
397                                 && !ForesterUtil.isEmpty(up_tax.getCommonName())
398                                 && ForesterUtil.isEmpty(tax.getCommonName())) {
399                         tax.setCommonName(up_tax.getCommonName());
400                 }
401                 if (!ForesterUtil.isEmpty(up_tax.getSynonym())
402                                 && !tax.getSynonyms().contains(up_tax.getSynonym())) {
403                         tax.getSynonyms().add(up_tax.getSynonym());
404                 }
405                 if (!ForesterUtil.isEmpty(up_tax.getRank())
406                                 && ForesterUtil.isEmpty(tax.getRank())) {
407                         try {
408                                 tax.setRank(up_tax.getRank().toLowerCase());
409                         } catch (final PhyloXmlDataFormatException ex) {
410                                 tax.setRank("");
411                         }
412                 }
413                 if ((qt != QUERY_TYPE.ID) && !ForesterUtil.isEmpty(up_tax.getId())
414                                 && (tax.getIdentifier() == null)) {
415                         tax.setIdentifier(new Identifier(up_tax.getId(), "uniprot"));
416                 }
417         }
418
419         private enum QUERY_TYPE {
420                 CODE, SN, CN, ID;
421         }
422 }