// forester -- software libraries and applications
// for genomics and evolutionary biology research.
//
// Copyright (C) 2010 Christian M Zmasek
// Copyright (C) 2010 Sanford-Burnham Medical Research Institute
// All rights reserved
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
//
// Contact: phylosoft @ gmail . com
// WWW: www.phylosoft.org/forester
//
// Provenance: recovered from git patch 2343f209dd616cf44434e3750dfcd4e334446d46
// ("initial commit", cmzmasek@gmail.com, Wed, 9 Feb 2011 01:05:46 +0000), which created
// forester/java/src/org/forester/analysis/AncestralTaxonomyInference.java

package org.forester.analysis;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;

import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyNode;
import org.forester.phylogeny.data.Identifier;
import org.forester.phylogeny.data.Taxonomy;
import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
import org.forester.util.ForesterUtil;
import org.forester.ws.uniprot.UniProtTaxonomy;
import org.forester.ws.uniprot.UniProtWsTools;

/**
 * Static helpers that annotate the nodes of a {@link Phylogeny} with taxonomic
 * data looked up through {@link UniProtWsTools}.
 * <p>
 * Two entry points are offered:
 * <ul>
 * <li>{@link #inferTaxonomyFromDescendents(Phylogeny)} assigns to each internal
 * node the last lineage element shared by all of its direct descendants;</li>
 * <li>{@link #obtainDetailedTaxonomicInformation(Phylogeny)} fills in empty
 * taxonomy fields of nodes that already carry some taxonomic information.</li>
 * </ul>
 * Look-up results are kept in four static caches (keyed by scientific name,
 * taxonomy code, common name and id). Every method is {@code static synchronized},
 * i.e. all access is serialized on the class monitor.
 */
public final class AncestralTaxonomyInference {

    /** A cache holding more entries than this is emptied by {@link #clearCachesIfTooLarge()}. */
    private static final int                              MAX_CACHE_SIZE           = 100000;
    /** Result limit handed to every {@link UniProtWsTools} query. */
    private static final int                              MAX_TAXONOMIES_TO_RETURN = 100;
    private static final HashMap<String, UniProtTaxonomy> _sn_up_cache_map         = new HashMap<String, UniProtTaxonomy>();
    private static final HashMap<String, UniProtTaxonomy> _code_up_cache_map       = new HashMap<String, UniProtTaxonomy>();
    private static final HashMap<String, UniProtTaxonomy> _cn_up_cache_map         = new HashMap<String, UniProtTaxonomy>();
    private static final HashMap<String, UniProtTaxonomy> _id_up_cache_map         = new HashMap<String, UniProtTaxonomy>();

    /**
     * Empties each cache whose size exceeds {@link #MAX_CACHE_SIZE}. Each cache
     * is checked independently; it is called at the start of both public entry points.
     */
    synchronized private static void clearCachesIfTooLarge() {
        if ( getSnTaxCacheMap().size() > MAX_CACHE_SIZE ) {
            getSnTaxCacheMap().clear();
        }
        if ( getCnTaxCacheMap().size() > MAX_CACHE_SIZE ) {
            getCnTaxCacheMap().clear();
        }
        if ( getCodeTaxCacheMap().size() > MAX_CACHE_SIZE ) {
            getCodeTaxCacheMap().clear();
        }
        if ( getIdTaxCacheMap().size() > MAX_CACHE_SIZE ) {
            getIdTaxCacheMap().clear();
        }
    }

    /** @return the cache keyed by common name */
    synchronized private static HashMap<String, UniProtTaxonomy> getCnTaxCacheMap() {
        return _cn_up_cache_map;
    }

    /** @return the cache keyed by taxonomy code */
    synchronized private static HashMap<String, UniProtTaxonomy> getCodeTaxCacheMap() {
        return _code_up_cache_map;
    }

    /** @return the cache keyed by taxonomy id */
    synchronized private static HashMap<String, UniProtTaxonomy> getIdTaxCacheMap() {
        return _id_up_cache_map;
    }

    /** @return the cache keyed by scientific name */
    synchronized private static HashMap<String, UniProtTaxonomy> getSnTaxCacheMap() {
        return _sn_up_cache_map;
    }

    /**
     * Stores {@code up_tax} in the code, common-name and id caches, skipping
     * every key that is empty.
     */
    synchronized private static void cacheByCodeCommonNameAndId( final UniProtTaxonomy up_tax ) {
        if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) {
            getCodeTaxCacheMap().put( up_tax.getCode(), up_tax );
        }
        if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
            getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax );
        }
        if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
            getIdTaxCacheMap().put( up_tax.getId(), up_tax );
        }
    }

    /**
     * Resolves {@code query} to a single taxonomy, consulting {@code cache} first.
     *
     * @param cache the cache matching {@code qt}
     * @param query the value to look up
     * @param qt    selects which {@link UniProtWsTools} query is issued on a cache miss
     * @return a copy of the cached entry on a hit; on a miss the freshly fetched
     *         taxonomy (which is also placed in all four caches); {@code null}
     *         if the query did not yield exactly one result
     * @throws IOException propagated from the underlying query
     */
    synchronized private static UniProtTaxonomy getTaxonomies( final HashMap<String, UniProtTaxonomy> cache,
                                                               final String query,
                                                               final QUERY_TYPE qt ) throws IOException {
        if ( cache.containsKey( query ) ) {
            return cache.get( query ).copy();
        }
        final List<UniProtTaxonomy> up_taxonomies;
        switch ( qt ) {
            case ID:
                up_taxonomies = getTaxonomiesFromId( query );
                break;
            case CODE:
                up_taxonomies = getTaxonomiesFromTaxonomyCode( query );
                break;
            case SN:
                up_taxonomies = getTaxonomiesFromScientificName( query );
                break;
            case CN:
                up_taxonomies = getTaxonomiesFromCommonName( query );
                break;
            default:
                throw new RuntimeException( "unknown query type: " + qt );
        }
        // Ambiguous (more than one hit) and empty results are both treated as "not found".
        if ( ( up_taxonomies == null ) || ( up_taxonomies.size() != 1 ) ) {
            return null;
        }
        final UniProtTaxonomy up_tax = up_taxonomies.get( 0 );
        if ( !ForesterUtil.isEmpty( up_tax.getScientificName() ) ) {
            getSnTaxCacheMap().put( up_tax.getScientificName(), up_tax );
        }
        cacheByCodeCommonNameAndId( up_tax );
        return up_tax;
    }

    synchronized private static List<UniProtTaxonomy> getTaxonomiesFromCommonName( final String query )
            throws IOException {
        return UniProtWsTools.getTaxonomiesFromCommonNameStrict( query, MAX_TAXONOMIES_TO_RETURN );
    }

    synchronized private static List<UniProtTaxonomy> getTaxonomiesFromId( final String query ) throws IOException {
        return UniProtWsTools.getTaxonomiesFromId( query, MAX_TAXONOMIES_TO_RETURN );
    }

    synchronized private static List<UniProtTaxonomy> getTaxonomiesFromScientificName( final String query )
            throws IOException {
        return UniProtWsTools.getTaxonomiesFromScientificNameStrict( query, MAX_TAXONOMIES_TO_RETURN );
    }

    synchronized private static List<UniProtTaxonomy> getTaxonomiesFromTaxonomyCode( final String query )
            throws IOException {
        return UniProtWsTools.getTaxonomiesFromTaxonomyCode( query, MAX_TAXONOMIES_TO_RETURN );
    }

    /**
     * Visits {@code phy} in postorder and, for every internal node, replaces its
     * taxonomy with one inferred from its direct descendants.
     *
     * @param phy the phylogeny to annotate (modified in place)
     * @return the textual form of each descendant taxonomy for which no lineage
     *         could be obtained
     * @throws IOException              propagated from the taxonomy look-up
     * @throws IllegalArgumentException if a descendant carries no usable taxonomic
     *                                  information
     */
    synchronized public static SortedSet<String> inferTaxonomyFromDescendents( final Phylogeny phy )
            throws IOException {
        clearCachesIfTooLarge();
        final SortedSet<String> not_found = new TreeSet<String>();
        for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) {
            final PhylogenyNode node = iter.next();
            if ( !node.isExternal() ) {
                inferTaxonomyFromDescendents( node, not_found );
            }
        }
        return not_found;
    }

    /**
     * Infers the taxonomy of the internal node {@code n} as the last lineage
     * element common to all of its direct descendants.
     * <p>
     * The node's taxonomy is first set to {@code null}. If a descendant cannot
     * be resolved, its taxonomy text is added to {@code not_found} and the
     * method returns, leaving {@code n} without taxonomy. After a successful
     * inference, the taxonomy of every internal descendant that equals the
     * inferred one is removed.
     *
     * @param n         an internal node
     * @param not_found collects the taxonomies that could not be resolved
     * @throws IOException              propagated from the taxonomy look-up
     * @throws IllegalArgumentException if {@code n} is external, or a descendant
     *                                  has no usable taxonomic information
     */
    synchronized private static void inferTaxonomyFromDescendents( final PhylogenyNode n,
                                                                   final SortedSet<String> not_found )
            throws IOException {
        if ( n.isExternal() ) {
            throw new IllegalArgumentException( "attempt to infer taxonomy from descendants of external node" );
        }
        n.getNodeData().setTaxonomy( null );
        final List<PhylogenyNode> descs = n.getDescendants();
        final List<String[]> lineages = new ArrayList<String[]>();
        int shortest_lin_length = Integer.MAX_VALUE;
        for( final PhylogenyNode desc : descs ) {
            if ( !desc.getNodeData().isHasTaxonomy()
                    || !isHasQueryableInformation( desc.getNodeData().getTaxonomy() ) ) {
                String msg = "Node(s) with no or inappropriate taxonomic information found";
                if ( !ForesterUtil.isEmpty( desc.getName() ) ) {
                    msg = "Node " + desc.getName() + " has no or inappropriate taxonomic information";
                }
                throw new IllegalArgumentException( msg );
            }
            final Taxonomy desc_tax = desc.getNodeData().getTaxonomy();
            final UniProtTaxonomy desc_up_tax = obtainUniProtTaxonomy( desc_tax );
            String[] lineage = null;
            if ( desc_up_tax != null ) {
                lineage = obtainLineagePlusOwnScientificName( desc_up_tax );
            }
            if ( ( lineage == null ) || ( lineage.length < 1 ) ) {
                not_found.add( desc_tax.asText().toString() );
                return;
            }
            if ( lineage.length < shortest_lin_length ) {
                shortest_lin_length = lineage.length;
            }
            lineages.add( lineage );
        }
        final String last_common_lineage = findLastCommonLineageElement( lineages, shortest_lin_length );
        if ( last_common_lineage == null ) {
            return;
        }
        final Taxonomy tax = new Taxonomy();
        n.getNodeData().setTaxonomy( tax );
        tax.setScientificName( last_common_lineage );
        // Enrich the inferred taxonomy with whatever is known about that scientific name.
        final UniProtTaxonomy up_tax = obtainUniProtTaxonomyFromSn( last_common_lineage );
        if ( up_tax != null ) {
            if ( !ForesterUtil.isEmpty( up_tax.getRank() ) ) {
                try {
                    tax.setRank( up_tax.getRank().toLowerCase() );
                }
                catch ( final PhyloXmlDataFormatException ex ) {
                    // Rank not accepted by Taxonomy: fall back to an empty rank.
                    tax.setRank( "" );
                }
            }
            if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
                tax.setIdentifier( new Identifier( up_tax.getId(), "uniprot" ) );
            }
            if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
                tax.setCommonName( up_tax.getCommonName() );
            }
            if ( !ForesterUtil.isEmpty( up_tax.getSynonym() ) && !tax.getSynonyms().contains( up_tax.getSynonym() ) ) {
                tax.getSynonyms().add( up_tax.getSynonym() );
            }
        }
        // An internal descendant whose taxonomy equals the one just inferred is redundant.
        for( final PhylogenyNode desc : descs ) {
            if ( !desc.isExternal() && desc.getNodeData().isHasTaxonomy()
                    && desc.getNodeData().getTaxonomy().isEqual( tax ) ) {
                desc.getNodeData().setTaxonomy( null );
            }
        }
    }

    /**
     * Walks all lineages in parallel from index 0 and returns the element at the
     * last position on which every lineage agrees.
     *
     * @param lineages            the lineages to compare
     * @param shortest_lin_length length of the shortest lineage; positions at or
     *                            beyond it are not compared
     * @return the last common element, or {@code null} if the lineages differ at
     *         index 0 or {@code lineages} is empty
     */
    private static String findLastCommonLineageElement( final List<String[]> lineages,
                                                        final int shortest_lin_length ) {
        if ( lineages.isEmpty() ) {
            return null;
        }
        final String[] first = lineages.get( 0 );
        String last_common = null;
        for( int i = 0; i < shortest_lin_length; ++i ) {
            final String candidate = first[ i ];
            for( int j = 1; j < lineages.size(); ++j ) {
                if ( !candidate.equals( lineages.get( j )[ i ] ) ) {
                    return last_common;
                }
            }
            last_common = candidate;
        }
        return last_common;
    }

    /**
     * @return {@code true} if {@code tax} has an identifier with a non-empty
     *         value whose provider is "ncbi", "uniprot" or "uniprotkb"
     *         (case-insensitive); a {@code null} provider yields {@code false}
     */
    synchronized private static boolean isHasAppropriateId( final Taxonomy tax ) {
        final Identifier id = tax.getIdentifier();
        if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) ) {
            return false;
        }
        final String provider = id.getProvider();
        // Literal on the left so that a null provider cannot cause a NullPointerException.
        return "ncbi".equalsIgnoreCase( provider ) || "uniprot".equalsIgnoreCase( provider )
                || "uniprotkb".equalsIgnoreCase( provider );
    }

    /**
     * @return {@code true} if {@code tax} offers at least one value usable as a
     *         query: an appropriate id, a scientific name, a taxonomy code or a
     *         common name
     */
    synchronized private static boolean isHasQueryableInformation( final Taxonomy tax ) {
        return isHasAppropriateId( tax ) || !ForesterUtil.isEmpty( tax.getScientificName() )
                || !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) || !ForesterUtil.isEmpty( tax.getCommonName() );
    }

    /**
     * Looks up every node of {@code phy} that carries queryable taxonomic
     * information and fills the empty fields of its taxonomy from the result.
     *
     * @param phy the phylogeny to annotate (modified in place)
     * @return the names (or string forms) of external nodes without taxonomy,
     *         plus the string form of each taxonomy whose look-up failed
     * @throws IOException propagated from the taxonomy look-up
     */
    synchronized public static SortedSet<String> obtainDetailedTaxonomicInformation( final Phylogeny phy )
            throws IOException {
        clearCachesIfTooLarge();
        final SortedSet<String> not_found = new TreeSet<String>();
        for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) {
            final PhylogenyNode node = iter.next();
            Taxonomy tax = null;
            if ( node.getNodeData().isHasTaxonomy() ) {
                tax = node.getNodeData().getTaxonomy();
            }
            else if ( node.isExternal() ) {
                if ( !ForesterUtil.isEmpty( node.getName() ) ) {
                    not_found.add( node.getName() );
                }
                else {
                    not_found.add( node.toString() );
                }
            }
            if ( ( tax != null ) && isHasQueryableInformation( tax ) ) {
                final QUERY_TYPE qt = determineQueryType( tax );
                final UniProtTaxonomy up_tax = obtainUniProtTaxonomy( tax );
                if ( up_tax != null ) {
                    updateTaxonomy( qt, node, tax, up_tax );
                }
                else {
                    not_found.add( tax.toString() );
                }
            }
        }
        return not_found;
    }

    /**
     * @return a new array holding the lineage of {@code up_tax} followed by its
     *         own scientific name as the last element
     */
    synchronized private static String[] obtainLineagePlusOwnScientificName( final UniProtTaxonomy up_tax ) {
        final String[] lineage = up_tax.getLineage();
        final String[] lin_plus_self = new String[ lineage.length + 1 ];
        System.arraycopy( lineage, 0, lin_plus_self, 0, lineage.length );
        lin_plus_self[ lineage.length ] = up_tax.getScientificName();
        return lin_plus_self;
    }

    /**
     * Chooses the field of {@code tax} to query by, in order of preference:
     * appropriate id, scientific name, taxonomy code, common name.
     *
     * @return the query type; {@link QUERY_TYPE#CN} when none of the first three applies
     */
    synchronized private static QUERY_TYPE determineQueryType( final Taxonomy tax ) {
        if ( isHasAppropriateId( tax ) ) {
            return QUERY_TYPE.ID;
        }
        if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) {
            return QUERY_TYPE.SN;
        }
        if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
            return QUERY_TYPE.CODE;
        }
        return QUERY_TYPE.CN;
    }

    /**
     * Resolves {@code tax} using the field selected by
     * {@link #determineQueryType(Taxonomy)}.
     *
     * @return the matching taxonomy, or {@code null} if the look-up did not
     *         yield exactly one result
     * @throws IOException propagated from the underlying query
     */
    synchronized private static UniProtTaxonomy obtainUniProtTaxonomy( final Taxonomy tax ) throws IOException {
        final QUERY_TYPE qt = determineQueryType( tax );
        switch ( qt ) {
            case ID:
                return getTaxonomies( getIdTaxCacheMap(), tax.getIdentifier().getValue(), qt );
            case SN:
                return getTaxonomies( getSnTaxCacheMap(), tax.getScientificName(), qt );
            case CODE:
                return getTaxonomies( getCodeTaxCacheMap(), tax.getTaxonomyCode(), qt );
            default:
                return getTaxonomies( getCnTaxCacheMap(), tax.getCommonName(), qt );
        }
    }

    /**
     * Resolves the scientific name {@code sn}, consulting the scientific-name
     * cache first.
     *
     * @return a copy of the cached entry on a hit; on a miss the freshly fetched
     *         taxonomy (cached under {@code sn} and its code, common name and id);
     *         {@code null} if the query did not yield exactly one result
     * @throws IOException propagated from the underlying query
     */
    synchronized private static UniProtTaxonomy obtainUniProtTaxonomyFromSn( final String sn ) throws IOException {
        if ( getSnTaxCacheMap().containsKey( sn ) ) {
            return getSnTaxCacheMap().get( sn ).copy();
        }
        final List<UniProtTaxonomy> up_taxonomies = getTaxonomiesFromScientificName( sn );
        if ( ( up_taxonomies == null ) || ( up_taxonomies.size() != 1 ) ) {
            return null;
        }
        final UniProtTaxonomy up_tax = up_taxonomies.get( 0 );
        getSnTaxCacheMap().put( sn, up_tax );
        cacheByCodeCommonNameAndId( up_tax );
        return up_tax;
    }

    /**
     * Copies values from {@code up_tax} into the fields of {@code tax} that are
     * still empty. The field that served as the query (per {@code qt}) is never
     * overwritten; the taxonomy code is only set on external nodes; a synonym is
     * appended unless already present.
     *
     * @param qt     the query type that produced {@code up_tax}
     * @param node   the node owning {@code tax}
     * @param tax    the taxonomy to complete (modified in place)
     * @param up_tax the look-up result to copy from
     */
    synchronized private static void updateTaxonomy( final QUERY_TYPE qt,
                                                     final PhylogenyNode node,
                                                     final Taxonomy tax,
                                                     final UniProtTaxonomy up_tax ) {
        if ( ( qt != QUERY_TYPE.SN ) && !ForesterUtil.isEmpty( up_tax.getScientificName() )
                && ForesterUtil.isEmpty( tax.getScientificName() ) ) {
            tax.setScientificName( up_tax.getScientificName() );
        }
        if ( node.isExternal()
                && ( ( qt != QUERY_TYPE.CODE ) && !ForesterUtil.isEmpty( up_tax.getCode() ) && ForesterUtil
                        .isEmpty( tax.getTaxonomyCode() ) ) ) {
            tax.setTaxonomyCode( up_tax.getCode() );
        }
        if ( ( qt != QUERY_TYPE.CN ) && !ForesterUtil.isEmpty( up_tax.getCommonName() )
                && ForesterUtil.isEmpty( tax.getCommonName() ) ) {
            tax.setCommonName( up_tax.getCommonName() );
        }
        if ( !ForesterUtil.isEmpty( up_tax.getSynonym() ) && !tax.getSynonyms().contains( up_tax.getSynonym() ) ) {
            tax.getSynonyms().add( up_tax.getSynonym() );
        }
        if ( !ForesterUtil.isEmpty( up_tax.getRank() ) && ForesterUtil.isEmpty( tax.getRank() ) ) {
            try {
                tax.setRank( up_tax.getRank().toLowerCase() );
            }
            catch ( final PhyloXmlDataFormatException ex ) {
                // Rank not accepted by Taxonomy: fall back to an empty rank.
                tax.setRank( "" );
            }
        }
        if ( ( qt != QUERY_TYPE.ID ) && !ForesterUtil.isEmpty( up_tax.getId() ) && ( tax.getIdentifier() == null ) ) {
            tax.setIdentifier( new Identifier( up_tax.getId(), "uniprot" ) );
        }
    }

    /** The taxonomy field a look-up is keyed on: taxonomy code, scientific name, common name or id. */
    private enum QUERY_TYPE {
        CODE, SN, CN, ID;
    }
}