1 // forester -- software libraries and applications
2 // for genomics and evolutionary biology research.
4 // Copyright (C) 2010 Christian M Zmasek
5 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
8 // This library is free software; you can redistribute it and/or
9 // modify it under the terms of the GNU Lesser General Public
10 // License as published by the Free Software Foundation; either
11 // version 2.1 of the License, or (at your option) any later version.
13 // This library is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 // Lesser General Public License for more details.
18 // You should have received a copy of the GNU Lesser General Public
19 // License along with this library; if not, write to the Free Software
20 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 // Contact: phylosoft @ gmail . com
23 // WWW: www.phylosoft.org/forester
25 package org.forester.analysis;
27 import java.io.IOException;
28 import java.util.ArrayList;
29 import java.util.HashMap;
30 import java.util.List;
31 import java.util.SortedSet;
32 import java.util.TreeSet;
34 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
35 import org.forester.phylogeny.Phylogeny;
36 import org.forester.phylogeny.PhylogenyNode;
37 import org.forester.phylogeny.data.Identifier;
38 import org.forester.phylogeny.data.Taxonomy;
39 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
40 import org.forester.util.ForesterUtil;
41 import org.forester.ws.uniprot.UniProtTaxonomy;
42 import org.forester.ws.uniprot.UniProtWsTools;
44 public final class AncestralTaxonomyInference {
46 private static final int MAX_CACHE_SIZE = 100000;
47 private static final int MAX_TAXONOMIES_TO_RETURN = 100;
48 private static final HashMap<String, UniProtTaxonomy> _sn_up_cache_map = new HashMap<String, UniProtTaxonomy>();
49 private static final HashMap<String, UniProtTaxonomy> _code_up_cache_map = new HashMap<String, UniProtTaxonomy>();
50 private static final HashMap<String, UniProtTaxonomy> _cn_up_cache_map = new HashMap<String, UniProtTaxonomy>();
51 private static final HashMap<String, UniProtTaxonomy> _id_up_cache_map = new HashMap<String, UniProtTaxonomy>();
53 synchronized private static void clearCachesIfTooLarge() {
54 if (getSnTaxCacheMap().size() > MAX_CACHE_SIZE) {
55 getSnTaxCacheMap().clear();
57 if (getCnTaxCacheMap().size() > MAX_CACHE_SIZE) {
58 getCnTaxCacheMap().clear();
60 if (getCodeTaxCacheMap().size() > MAX_CACHE_SIZE) {
61 getCodeTaxCacheMap().clear();
63 if (getIdTaxCacheMap().size() > MAX_CACHE_SIZE) {
64 getIdTaxCacheMap().clear();
68 synchronized private static HashMap<String, UniProtTaxonomy> getCnTaxCacheMap() {
69 return _cn_up_cache_map;
72 synchronized private static HashMap<String, UniProtTaxonomy> getCodeTaxCacheMap() {
73 return _code_up_cache_map;
76 synchronized private static HashMap<String, UniProtTaxonomy> getIdTaxCacheMap() {
77 return _id_up_cache_map;
80 synchronized private static HashMap<String, UniProtTaxonomy> getSnTaxCacheMap() {
81 return _sn_up_cache_map;
84 synchronized private static UniProtTaxonomy getTaxonomies(
85 final HashMap<String, UniProtTaxonomy> cache, final String query,
86 final QUERY_TYPE qt) throws IOException {
87 if (cache.containsKey(query)) {
88 return cache.get(query).copy();
90 List<UniProtTaxonomy> up_taxonomies = null;
93 up_taxonomies = getTaxonomiesFromId(query);
96 up_taxonomies = getTaxonomiesFromTaxonomyCode(query);
99 up_taxonomies = getTaxonomiesFromScientificName(query);
102 up_taxonomies = getTaxonomiesFromCommonName(query);
105 throw new RuntimeException();
107 if ((up_taxonomies != null) && (up_taxonomies.size() == 1)) {
108 final UniProtTaxonomy up_tax = up_taxonomies.get(0);
109 if (!ForesterUtil.isEmpty(up_tax.getScientificName())) {
110 getSnTaxCacheMap().put(up_tax.getScientificName(), up_tax);
112 if (!ForesterUtil.isEmpty(up_tax.getCode())) {
113 getCodeTaxCacheMap().put(up_tax.getCode(), up_tax);
115 if (!ForesterUtil.isEmpty(up_tax.getCommonName())) {
116 getCnTaxCacheMap().put(up_tax.getCommonName(), up_tax);
118 if (!ForesterUtil.isEmpty(up_tax.getId())) {
119 getIdTaxCacheMap().put(up_tax.getId(), up_tax);
128 synchronized private static List<UniProtTaxonomy> getTaxonomiesFromCommonName(
129 final String query) throws IOException {
130 return UniProtWsTools.getTaxonomiesFromCommonNameStrict(query,
131 MAX_TAXONOMIES_TO_RETURN);
134 synchronized private static List<UniProtTaxonomy> getTaxonomiesFromId(
135 final String query) throws IOException {
136 return UniProtWsTools.getTaxonomiesFromId(query,
137 MAX_TAXONOMIES_TO_RETURN);
140 synchronized private static List<UniProtTaxonomy> getTaxonomiesFromScientificName(
141 final String query) throws IOException {
142 return UniProtWsTools.getTaxonomiesFromScientificNameStrict(query,
143 MAX_TAXONOMIES_TO_RETURN);
146 synchronized private static List<UniProtTaxonomy> getTaxonomiesFromTaxonomyCode(
147 final String query) throws IOException {
148 return UniProtWsTools.getTaxonomiesFromTaxonomyCode(query,
149 MAX_TAXONOMIES_TO_RETURN);
152 synchronized public static SortedSet<String> inferTaxonomyFromDescendents(
153 final Phylogeny phy) throws IOException {
154 clearCachesIfTooLarge();
155 final SortedSet<String> not_found = new TreeSet<String>();
156 for (final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter
158 final PhylogenyNode node = iter.next();
159 // final QUERY_TYPE qt = null;
160 // Taxonomy tax = null;
161 // if ( node.getNodeData().isHasTaxonomy() ) {
162 // tax = node.getNodeData().getTaxonomy();
164 // UniProtTaxonomy up_tax = null;
165 // if ( ( tax != null )
166 // && ( isHasAppropriateId( tax ) || !ForesterUtil.isEmpty(
167 // tax.getScientificName() )
168 // || !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ||
169 // !ForesterUtil.isEmpty( tax
170 // .getCommonName() ) ) ) {
171 // final String query = null;
172 // up_tax = obtainUniProtTaxonomy( tax, query, qt );
173 // if ( up_tax == null ) {
174 // not_found.add( query );
177 // updateTaxonomy( qt, node, tax, up_tax );
180 if (!node.isExternal()) {
181 inferTaxonomyFromDescendents(node, not_found);
187 synchronized private static void inferTaxonomyFromDescendents(
188 final PhylogenyNode n, final SortedSet<String> not_found)
190 if (n.isExternal()) {
191 throw new IllegalArgumentException(
192 "attempt to infer taxonomy from descendants of external node");
194 n.getNodeData().setTaxonomy(null);
195 final List<PhylogenyNode> descs = n.getDescendants();
196 final List<String[]> lineages = new ArrayList<String[]>();
197 int shortest_lin_length = Integer.MAX_VALUE;
198 for (final PhylogenyNode desc : descs) {
199 if (desc.getNodeData().isHasTaxonomy()
200 && (isHasAppropriateId(desc.getNodeData().getTaxonomy())
201 || !ForesterUtil.isEmpty(desc.getNodeData()
202 .getTaxonomy().getScientificName())
203 || !ForesterUtil.isEmpty(desc.getNodeData()
204 .getTaxonomy().getTaxonomyCode()) || !ForesterUtil
205 .isEmpty(desc.getNodeData().getTaxonomy()
206 .getCommonName()))) {
207 final QUERY_TYPE qt = null;
208 final String query = null;
209 final UniProtTaxonomy up_tax = obtainUniProtTaxonomy(desc
210 .getNodeData().getTaxonomy(), query, qt);
211 String[] lineage = null;
212 if (up_tax != null) {
213 lineage = obtainLineagePlusOwnScientificName(up_tax);
215 if ((lineage == null) || (lineage.length < 1)) {
216 not_found.add(desc.getNodeData().getTaxonomy().asText()
220 if (lineage.length < shortest_lin_length) {
221 shortest_lin_length = lineage.length;
223 lineages.add(lineage);
225 String msg = "Node(s) with no or inappropriate taxonomic information found";
226 if (!ForesterUtil.isEmpty(desc.getName())) {
227 msg = "Node " + desc.getName()
228 + " has no or inappropriate taxonomic information";
230 throw new IllegalArgumentException(msg);
233 String last_common_lineage = null;
234 if (shortest_lin_length > 0) {
235 I: for (int i = 0; i < shortest_lin_length; ++i) {
236 final String lineage_0 = lineages.get(0)[i];
237 for (int j = 1; j < lineages.size(); ++j) {
238 if (!lineage_0.equals(lineages.get(j)[i])) {
242 last_common_lineage = lineage_0;
245 if (last_common_lineage == null) {
248 // if ( !n.getNodeData().isHasTaxonomy() ) {
249 // n.getNodeData().setTaxonomy( new Taxonomy() );
251 final Taxonomy tax = new Taxonomy();
252 n.getNodeData().setTaxonomy(tax);
253 tax.setScientificName(last_common_lineage);
254 final UniProtTaxonomy up_tax = obtainUniProtTaxonomyFromSn(last_common_lineage);
255 if (up_tax != null) {
256 if (!ForesterUtil.isEmpty(up_tax.getRank())) {
258 tax.setRank(up_tax.getRank().toLowerCase());
259 } catch (final PhyloXmlDataFormatException ex) {
263 if (!ForesterUtil.isEmpty(up_tax.getId())) {
264 tax.setIdentifier(new Identifier(up_tax.getId(), "uniprot"));
266 if (!ForesterUtil.isEmpty(up_tax.getCommonName())) {
267 tax.setCommonName(up_tax.getCommonName());
269 if (!ForesterUtil.isEmpty(up_tax.getSynonym())
270 && !tax.getSynonyms().contains(up_tax.getSynonym())) {
271 tax.getSynonyms().add(up_tax.getSynonym());
274 for (final PhylogenyNode desc : descs) {
275 if (!desc.isExternal() && desc.getNodeData().isHasTaxonomy()
276 && desc.getNodeData().getTaxonomy().isEqual(tax)) {
277 desc.getNodeData().setTaxonomy(null);
282 synchronized private static boolean isHasAppropriateId(final Taxonomy tax) {
283 return ((tax.getIdentifier() != null) && (!ForesterUtil.isEmpty(tax
284 .getIdentifier().getValue()) && (tax.getIdentifier()
285 .getProvider().equalsIgnoreCase("ncbi")
286 || tax.getIdentifier().getProvider()
287 .equalsIgnoreCase("uniprot") || tax.getIdentifier()
288 .getProvider().equalsIgnoreCase("uniprotkb"))));
291 synchronized public static SortedSet<String> obtainDetailedTaxonomicInformation(
292 final Phylogeny phy) throws IOException {
293 clearCachesIfTooLarge();
294 final SortedSet<String> not_found = new TreeSet<String>();
295 for (final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter
297 final PhylogenyNode node = iter.next();
298 final QUERY_TYPE qt = null;
300 if (node.getNodeData().isHasTaxonomy()) {
301 tax = node.getNodeData().getTaxonomy();
302 } else if (node.isExternal()) {
303 if (!ForesterUtil.isEmpty(node.getName())) {
304 not_found.add(node.getName());
306 not_found.add(node.toString());
309 UniProtTaxonomy up_tax = null;
311 && (isHasAppropriateId(tax)
312 || !ForesterUtil.isEmpty(tax.getScientificName())
313 || !ForesterUtil.isEmpty(tax.getTaxonomyCode()) || !ForesterUtil
314 .isEmpty(tax.getCommonName()))) {
315 up_tax = obtainUniProtTaxonomy(tax, null, qt);
316 if (up_tax != null) {
317 updateTaxonomy(qt, node, tax, up_tax);
319 not_found.add(tax.toString());
326 synchronized private static String[] obtainLineagePlusOwnScientificName(
327 final UniProtTaxonomy up_tax) {
328 final String[] lineage = up_tax.getLineage();
329 final String[] lin_plus_self = new String[lineage.length + 1];
330 for (int i = 0; i < lineage.length; ++i) {
331 lin_plus_self[i] = lineage[i];
333 lin_plus_self[lineage.length] = up_tax.getScientificName();
334 return lin_plus_self;
337 synchronized private static UniProtTaxonomy obtainUniProtTaxonomy(
338 final Taxonomy tax, String query, QUERY_TYPE qt) throws IOException {
339 if (isHasAppropriateId(tax)) {
340 query = tax.getIdentifier().getValue();
342 return getTaxonomies(getIdTaxCacheMap(), query, qt);
343 } else if (!ForesterUtil.isEmpty(tax.getScientificName())) {
344 query = tax.getScientificName();
346 return getTaxonomies(getSnTaxCacheMap(), query, qt);
347 } else if (!ForesterUtil.isEmpty(tax.getTaxonomyCode())) {
348 query = tax.getTaxonomyCode();
349 qt = QUERY_TYPE.CODE;
350 return getTaxonomies(getCodeTaxCacheMap(), query, qt);
352 query = tax.getCommonName();
354 return getTaxonomies(getCnTaxCacheMap(), query, qt);
358 synchronized private static UniProtTaxonomy obtainUniProtTaxonomyFromSn(
359 final String sn) throws IOException {
360 UniProtTaxonomy up_tax = null;
361 if (getSnTaxCacheMap().containsKey(sn)) {
362 up_tax = getSnTaxCacheMap().get(sn).copy();
364 final List<UniProtTaxonomy> up_taxonomies = getTaxonomiesFromScientificName(sn);
365 if ((up_taxonomies != null) && (up_taxonomies.size() == 1)) {
366 up_tax = up_taxonomies.get(0);
367 getSnTaxCacheMap().put(sn, up_tax);
368 if (!ForesterUtil.isEmpty(up_tax.getCode())) {
369 getCodeTaxCacheMap().put(up_tax.getCode(), up_tax);
371 if (!ForesterUtil.isEmpty(up_tax.getCommonName())) {
372 getCnTaxCacheMap().put(up_tax.getCommonName(), up_tax);
374 if (!ForesterUtil.isEmpty(up_tax.getId())) {
375 getIdTaxCacheMap().put(up_tax.getId(), up_tax);
382 synchronized private static void updateTaxonomy(final QUERY_TYPE qt,
383 final PhylogenyNode node, final Taxonomy tax,
384 final UniProtTaxonomy up_tax) {
385 if ((qt != QUERY_TYPE.SN)
386 && !ForesterUtil.isEmpty(up_tax.getScientificName())
387 && ForesterUtil.isEmpty(tax.getScientificName())) {
388 tax.setScientificName(up_tax.getScientificName());
390 if (node.isExternal()
391 && ((qt != QUERY_TYPE.CODE)
392 && !ForesterUtil.isEmpty(up_tax.getCode()) && ForesterUtil
393 .isEmpty(tax.getTaxonomyCode()))) {
394 tax.setTaxonomyCode(up_tax.getCode());
396 if ((qt != QUERY_TYPE.CN)
397 && !ForesterUtil.isEmpty(up_tax.getCommonName())
398 && ForesterUtil.isEmpty(tax.getCommonName())) {
399 tax.setCommonName(up_tax.getCommonName());
401 if (!ForesterUtil.isEmpty(up_tax.getSynonym())
402 && !tax.getSynonyms().contains(up_tax.getSynonym())) {
403 tax.getSynonyms().add(up_tax.getSynonym());
405 if (!ForesterUtil.isEmpty(up_tax.getRank())
406 && ForesterUtil.isEmpty(tax.getRank())) {
408 tax.setRank(up_tax.getRank().toLowerCase());
409 } catch (final PhyloXmlDataFormatException ex) {
413 if ((qt != QUERY_TYPE.ID) && !ForesterUtil.isEmpty(up_tax.getId())
414 && (tax.getIdentifier() == null)) {
415 tax.setIdentifier(new Identifier(up_tax.getId(), "uniprot"));
419 private enum QUERY_TYPE {