X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fanalysis%2FAncestralTaxonomyInference.java;h=94453ff5834713b749ea89852981a135a932e406;hb=refs%2Fheads%2Fkjvdh%2Ffeatures%2Fforester;hp=b9e6b464d3c18a01838c5f0ee7c021461dfddc0c;hpb=2343f209dd616cf44434e3750dfcd4e334446d46;p=jalview.git diff --git a/forester/java/src/org/forester/analysis/AncestralTaxonomyInference.java b/forester/java/src/org/forester/analysis/AncestralTaxonomyInference.java index b9e6b46..94453ff 100644 --- a/forester/java/src/org/forester/analysis/AncestralTaxonomyInference.java +++ b/forester/java/src/org/forester/analysis/AncestralTaxonomyInference.java @@ -4,7 +4,7 @@ // Copyright (C) 2010 Christian M Zmasek // Copyright (C) 2010 Sanford-Burnham Medical Research Institute // All rights reserved -// +// // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either @@ -14,22 +14,19 @@ // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. -// +// // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA // // Contact: phylosoft @ gmail . com -// WWW: www.phylosoft.org/forester +// WWW: https://sites.google.com/site/cmzmasek/home/software/forester package org.forester.analysis; import java.io.IOException; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.SortedSet; -import java.util.TreeSet; import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException; import org.forester.phylogeny.Phylogeny; @@ -38,385 +35,174 @@ import org.forester.phylogeny.data.Identifier; import org.forester.phylogeny.data.Taxonomy; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.util.ForesterUtil; -import org.forester.ws.uniprot.UniProtTaxonomy; -import org.forester.ws.uniprot.UniProtWsTools; +import org.forester.ws.seqdb.UniProtTaxonomy; public final class AncestralTaxonomyInference { - private static final int MAX_CACHE_SIZE = 100000; - private static final int MAX_TAXONOMIES_TO_RETURN = 100; - private static final HashMap _sn_up_cache_map = new HashMap(); - private static final HashMap _code_up_cache_map = new HashMap(); - private static final HashMap _cn_up_cache_map = new HashMap(); - private static final HashMap _id_up_cache_map = new HashMap(); - - synchronized private static void clearCachesIfTooLarge() { - if (getSnTaxCacheMap().size() > MAX_CACHE_SIZE) { - getSnTaxCacheMap().clear(); - } - if (getCnTaxCacheMap().size() > MAX_CACHE_SIZE) { - getCnTaxCacheMap().clear(); - } - if (getCodeTaxCacheMap().size() > MAX_CACHE_SIZE) { - getCodeTaxCacheMap().clear(); - } - if (getIdTaxCacheMap().size() > MAX_CACHE_SIZE) { - getIdTaxCacheMap().clear(); - } - } - - synchronized private static HashMap getCnTaxCacheMap() { - return _cn_up_cache_map; - } - - synchronized private static HashMap getCodeTaxCacheMap() { - return _code_up_cache_map; - } - - synchronized private static HashMap getIdTaxCacheMap() { - return _id_up_cache_map; - } - - synchronized private static HashMap getSnTaxCacheMap() { - return _sn_up_cache_map; - } - - synchronized private static UniProtTaxonomy getTaxonomies( - final HashMap cache, final String query, - final QUERY_TYPE qt) throws IOException { - if (cache.containsKey(query)) { - return cache.get(query).copy(); - } else { - List up_taxonomies = null; - switch (qt) { - case ID: - up_taxonomies = getTaxonomiesFromId(query); - break; - case CODE: - up_taxonomies = getTaxonomiesFromTaxonomyCode(query); - break; - case SN: - up_taxonomies = getTaxonomiesFromScientificName(query); - break; - case CN: - up_taxonomies = getTaxonomiesFromCommonName(query); - break; - default: - throw new RuntimeException(); - } - if ((up_taxonomies != null) && (up_taxonomies.size() == 1)) { - final UniProtTaxonomy up_tax = up_taxonomies.get(0); - if (!ForesterUtil.isEmpty(up_tax.getScientificName())) { - getSnTaxCacheMap().put(up_tax.getScientificName(), up_tax); - } - if (!ForesterUtil.isEmpty(up_tax.getCode())) { - getCodeTaxCacheMap().put(up_tax.getCode(), up_tax); - } - if (!ForesterUtil.isEmpty(up_tax.getCommonName())) { - getCnTaxCacheMap().put(up_tax.getCommonName(), up_tax); - } - if (!ForesterUtil.isEmpty(up_tax.getId())) { - getIdTaxCacheMap().put(up_tax.getId(), up_tax); - } - return up_tax; - } else { - return null; - } - } - } - - synchronized private static List getTaxonomiesFromCommonName( - final String query) throws IOException { - return UniProtWsTools.getTaxonomiesFromCommonNameStrict(query, - MAX_TAXONOMIES_TO_RETURN); - } - - synchronized private static List getTaxonomiesFromId( - final String query) throws IOException { - return UniProtWsTools.getTaxonomiesFromId(query, - MAX_TAXONOMIES_TO_RETURN); - } - - synchronized private static List getTaxonomiesFromScientificName( - final String query) throws IOException { - return UniProtWsTools.getTaxonomiesFromScientificNameStrict(query, - MAX_TAXONOMIES_TO_RETURN); - } - - synchronized private static List getTaxonomiesFromTaxonomyCode( - final String query) throws IOException { - return UniProtWsTools.getTaxonomiesFromTaxonomyCode(query, - MAX_TAXONOMIES_TO_RETURN); - } - - synchronized public static SortedSet inferTaxonomyFromDescendents( - final Phylogeny phy) throws IOException { - clearCachesIfTooLarge(); - final SortedSet not_found = new TreeSet(); - for (final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter - .hasNext();) { - final PhylogenyNode node = iter.next(); - // final QUERY_TYPE qt = null; - // Taxonomy tax = null; - // if ( node.getNodeData().isHasTaxonomy() ) { - // tax = node.getNodeData().getTaxonomy(); - // } - // UniProtTaxonomy up_tax = null; - // if ( ( tax != null ) - // && ( isHasAppropriateId( tax ) || !ForesterUtil.isEmpty( - // tax.getScientificName() ) - // || !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) || - // !ForesterUtil.isEmpty( tax - // .getCommonName() ) ) ) { - // final String query = null; - // up_tax = obtainUniProtTaxonomy( tax, query, qt ); - // if ( up_tax == null ) { - // not_found.add( query ); - // } - // else { - // updateTaxonomy( qt, node, tax, up_tax ); - // } - // } - if (!node.isExternal()) { - inferTaxonomyFromDescendents(node, not_found); - } - } - return not_found; - } - - synchronized private static void inferTaxonomyFromDescendents( - final PhylogenyNode n, final SortedSet not_found) - throws IOException { - if (n.isExternal()) { - throw new IllegalArgumentException( - "attempt to infer taxonomy from descendants of external node"); - } - n.getNodeData().setTaxonomy(null); - final List descs = n.getDescendants(); - final List lineages = new ArrayList(); - int shortest_lin_length = Integer.MAX_VALUE; - for (final PhylogenyNode desc : descs) { - if (desc.getNodeData().isHasTaxonomy() - && (isHasAppropriateId(desc.getNodeData().getTaxonomy()) - || !ForesterUtil.isEmpty(desc.getNodeData() - .getTaxonomy().getScientificName()) - || !ForesterUtil.isEmpty(desc.getNodeData() - .getTaxonomy().getTaxonomyCode()) || !ForesterUtil - .isEmpty(desc.getNodeData().getTaxonomy() - .getCommonName()))) { - final QUERY_TYPE qt = null; - final String query = null; - final UniProtTaxonomy up_tax = obtainUniProtTaxonomy(desc - .getNodeData().getTaxonomy(), query, qt); - String[] lineage = null; - if (up_tax != null) { - lineage = obtainLineagePlusOwnScientificName(up_tax); - } - if ((lineage == null) || (lineage.length < 1)) { - not_found.add(desc.getNodeData().getTaxonomy().asText() - .toString()); - return; - } - if (lineage.length < shortest_lin_length) { - shortest_lin_length = lineage.length; - } - lineages.add(lineage); - } else { - String msg = "Node(s) with no or inappropriate taxonomic information found"; - if (!ForesterUtil.isEmpty(desc.getName())) { - msg = "Node " + desc.getName() - + " has no or inappropriate taxonomic information"; - } - throw new IllegalArgumentException(msg); - } - } - String last_common_lineage = null; - if (shortest_lin_length > 0) { - I: for (int i = 0; i < shortest_lin_length; ++i) { - final String lineage_0 = lineages.get(0)[i]; - for (int j = 1; j < lineages.size(); ++j) { - if (!lineage_0.equals(lineages.get(j)[i])) { - break I; - } - } - last_common_lineage = lineage_0; - } - } - if (last_common_lineage == null) { - return; - } - // if ( !n.getNodeData().isHasTaxonomy() ) { - // n.getNodeData().setTaxonomy( new Taxonomy() ); - // } - final Taxonomy tax = new Taxonomy(); - n.getNodeData().setTaxonomy(tax); - tax.setScientificName(last_common_lineage); - final UniProtTaxonomy up_tax = obtainUniProtTaxonomyFromSn(last_common_lineage); - if (up_tax != null) { - if (!ForesterUtil.isEmpty(up_tax.getRank())) { - try { - tax.setRank(up_tax.getRank().toLowerCase()); - } catch (final PhyloXmlDataFormatException ex) { - tax.setRank(""); - } - } - if (!ForesterUtil.isEmpty(up_tax.getId())) { - tax.setIdentifier(new Identifier(up_tax.getId(), "uniprot")); - } - if (!ForesterUtil.isEmpty(up_tax.getCommonName())) { - tax.setCommonName(up_tax.getCommonName()); - } - if (!ForesterUtil.isEmpty(up_tax.getSynonym()) - && !tax.getSynonyms().contains(up_tax.getSynonym())) { - tax.getSynonyms().add(up_tax.getSynonym()); - } - } - for (final PhylogenyNode desc : descs) { - if (!desc.isExternal() && desc.getNodeData().isHasTaxonomy() - && desc.getNodeData().getTaxonomy().isEqual(tax)) { - desc.getNodeData().setTaxonomy(null); - } - } - } - - synchronized private static boolean isHasAppropriateId(final Taxonomy tax) { - return ((tax.getIdentifier() != null) && (!ForesterUtil.isEmpty(tax - .getIdentifier().getValue()) && (tax.getIdentifier() - .getProvider().equalsIgnoreCase("ncbi") - || tax.getIdentifier().getProvider() - .equalsIgnoreCase("uniprot") || tax.getIdentifier() - .getProvider().equalsIgnoreCase("uniprotkb")))); - } - - synchronized public static SortedSet obtainDetailedTaxonomicInformation( - final Phylogeny phy) throws IOException { - clearCachesIfTooLarge(); - final SortedSet not_found = new TreeSet(); - for (final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter - .hasNext();) { - final PhylogenyNode node = iter.next(); - final QUERY_TYPE qt = null; - Taxonomy tax = null; - if (node.getNodeData().isHasTaxonomy()) { - tax = node.getNodeData().getTaxonomy(); - } else if (node.isExternal()) { - if (!ForesterUtil.isEmpty(node.getName())) { - not_found.add(node.getName()); - } else { - not_found.add(node.toString()); - } - } - UniProtTaxonomy up_tax = null; - if ((tax != null) - && (isHasAppropriateId(tax) - || !ForesterUtil.isEmpty(tax.getScientificName()) - || !ForesterUtil.isEmpty(tax.getTaxonomyCode()) || !ForesterUtil - .isEmpty(tax.getCommonName()))) { - up_tax = obtainUniProtTaxonomy(tax, null, qt); - if (up_tax != null) { - updateTaxonomy(qt, node, tax, up_tax); - } else { - not_found.add(tax.toString()); - } - } - } - return not_found; - } - - synchronized private static String[] obtainLineagePlusOwnScientificName( - final UniProtTaxonomy up_tax) { - final String[] lineage = up_tax.getLineage(); - final String[] lin_plus_self = new String[lineage.length + 1]; - for (int i = 0; i < lineage.length; ++i) { - lin_plus_self[i] = lineage[i]; - } - lin_plus_self[lineage.length] = up_tax.getScientificName(); - return lin_plus_self; - } - - synchronized private static UniProtTaxonomy obtainUniProtTaxonomy( - final Taxonomy tax, String query, QUERY_TYPE qt) throws IOException { - if (isHasAppropriateId(tax)) { - query = tax.getIdentifier().getValue(); - qt = QUERY_TYPE.ID; - return getTaxonomies(getIdTaxCacheMap(), query, qt); - } else if (!ForesterUtil.isEmpty(tax.getScientificName())) { - query = tax.getScientificName(); - qt = QUERY_TYPE.SN; - return getTaxonomies(getSnTaxCacheMap(), query, qt); - } else if (!ForesterUtil.isEmpty(tax.getTaxonomyCode())) { - query = tax.getTaxonomyCode(); - qt = QUERY_TYPE.CODE; - return getTaxonomies(getCodeTaxCacheMap(), query, qt); - } else { - query = tax.getCommonName(); - qt = QUERY_TYPE.CN; - return getTaxonomies(getCnTaxCacheMap(), query, qt); - } - } - - synchronized private static UniProtTaxonomy obtainUniProtTaxonomyFromSn( - final String sn) throws IOException { - UniProtTaxonomy up_tax = null; - if (getSnTaxCacheMap().containsKey(sn)) { - up_tax = getSnTaxCacheMap().get(sn).copy(); - } else { - final List up_taxonomies = getTaxonomiesFromScientificName(sn); - if ((up_taxonomies != null) && (up_taxonomies.size() == 1)) { - up_tax = up_taxonomies.get(0); - getSnTaxCacheMap().put(sn, up_tax); - if (!ForesterUtil.isEmpty(up_tax.getCode())) { - getCodeTaxCacheMap().put(up_tax.getCode(), up_tax); - } - if (!ForesterUtil.isEmpty(up_tax.getCommonName())) { - getCnTaxCacheMap().put(up_tax.getCommonName(), up_tax); - } - if (!ForesterUtil.isEmpty(up_tax.getId())) { - getIdTaxCacheMap().put(up_tax.getId(), up_tax); - } - } - } - return up_tax; - } - - synchronized private static void updateTaxonomy(final QUERY_TYPE qt, - final PhylogenyNode node, final Taxonomy tax, - final UniProtTaxonomy up_tax) { - if ((qt != QUERY_TYPE.SN) - && !ForesterUtil.isEmpty(up_tax.getScientificName()) - && ForesterUtil.isEmpty(tax.getScientificName())) { - tax.setScientificName(up_tax.getScientificName()); - } - if (node.isExternal() - && ((qt != QUERY_TYPE.CODE) - && !ForesterUtil.isEmpty(up_tax.getCode()) && ForesterUtil - .isEmpty(tax.getTaxonomyCode()))) { - tax.setTaxonomyCode(up_tax.getCode()); - } - if ((qt != QUERY_TYPE.CN) - && !ForesterUtil.isEmpty(up_tax.getCommonName()) - && ForesterUtil.isEmpty(tax.getCommonName())) { - tax.setCommonName(up_tax.getCommonName()); - } - if (!ForesterUtil.isEmpty(up_tax.getSynonym()) - && !tax.getSynonyms().contains(up_tax.getSynonym())) { - tax.getSynonyms().add(up_tax.getSynonym()); - } - if (!ForesterUtil.isEmpty(up_tax.getRank()) - && ForesterUtil.isEmpty(tax.getRank())) { - try { - tax.setRank(up_tax.getRank().toLowerCase()); - } catch (final PhyloXmlDataFormatException ex) { - tax.setRank(""); - } - } - if ((qt != QUERY_TYPE.ID) && !ForesterUtil.isEmpty(up_tax.getId()) - && (tax.getIdentifier() == null)) { - tax.setIdentifier(new Identifier(up_tax.getId(), "uniprot")); - } - } - - private enum QUERY_TYPE { - CODE, SN, CN, ID; - } + public static void inferTaxonomyFromDescendents( final Phylogeny phy ) throws IOException, + AncestralTaxonomyInferenceException { + TaxonomyDataManager.clearCachesIfTooLarge(); + for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) { + final PhylogenyNode node = iter.next(); + if ( !node.isExternal() ) { + inferTaxonomyFromDescendents( node ); + } + } + } + + private static void inferTaxonomyFromDescendents( final PhylogenyNode n ) throws IOException, + AncestralTaxonomyInferenceException { + if ( n.isExternal() ) { + throw new IllegalArgumentException( "attempt to infer taxonomy from descendants of external node" ); + } + n.getNodeData().setTaxonomy( null ); + final List descs = n.getDescendants(); + final List lineages = new ArrayList(); + int shortest_lin_length = Integer.MAX_VALUE; + for( final PhylogenyNode desc : descs ) { + if ( desc.getNodeData().isHasTaxonomy() + && ( TaxonomyDataManager.isHasAppropriateId( desc.getNodeData().getTaxonomy() ) + || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getScientificName() ) + || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getLineage() ) + || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getTaxonomyCode() ) || !ForesterUtil + .isEmpty( desc.getNodeData().getTaxonomy().getCommonName() ) ) ) { + final UniProtTaxonomy up_tax = TaxonomyDataManager.obtainUniProtTaxonomy( desc.getNodeData() + .getTaxonomy(), null, null ); + if ( ( up_tax == null ) && ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getLineage() ) ) { + String desc_str = ""; + if ( !ForesterUtil.isEmpty( desc.getName() ) ) { + desc_str = "\"" + desc.getName() + "\""; + } + else { + desc_str = "[" + desc.getId() + "]"; + } + System.out.println( desc.getNodeData().getTaxonomy().toString() ); + System.out.println( ForesterUtil.stringListToString( desc.getNodeData().getTaxonomy().getLineage(), + " > " ) ); + throw new AncestralTaxonomyInferenceException( "a taxonomy for node " + desc_str + + " could not be established from the database" ); + } + String[] lineage = ForesterUtil.stringListToArray( desc.getNodeData().getTaxonomy().getLineage() ); + if ( ( lineage == null ) || ( lineage.length < 1 ) ) { + lineage = ForesterUtil.stringListToArray( up_tax.getLineage() ); + } + if ( ( lineage == null ) || ( lineage.length < 1 ) ) { + throw new AncestralTaxonomyInferenceException( "a taxonomic lineage for node \"" + + desc.getNodeData().getTaxonomy().toString() + "\" could not be established" ); + } + if ( lineage.length < shortest_lin_length ) { + shortest_lin_length = lineage.length; + } + lineages.add( lineage ); + } + else { + String node = ""; + if ( !ForesterUtil.isEmpty( desc.getName() ) ) { + node = "\"" + desc.getName() + "\""; + } + else { + node = "[" + desc.getId() + "]"; + } + throw new AncestralTaxonomyInferenceException( "node " + node + + " has no or inappropriate taxonomic information" ); + } + } + final List last_common_lineage = new ArrayList(); + String last_common = null; + if ( shortest_lin_length > 0 ) { + I: for( int i = 0; i < shortest_lin_length; ++i ) { + final String lineage_0 = lineages.get( 0 )[ i ]; + for( int j = 1; j < lineages.size(); ++j ) { + if ( !lineage_0.equals( lineages.get( j )[ i ] ) ) { + break I; + } + } + last_common_lineage.add( lineage_0 ); + last_common = lineage_0; + } + } + if ( last_common_lineage.isEmpty() ) { + boolean saw_viruses = false; + boolean saw_cellular_organism = false; + boolean saw_x = false; + for( final String[] lineage : lineages ) { + if ( lineage.length > 0 ) { + if ( lineage[ 0 ].equalsIgnoreCase( UniProtTaxonomy.VIRUSES ) ) { + saw_viruses = true; + } + else if ( lineage[ 0 ].equalsIgnoreCase( UniProtTaxonomy.CELLULAR_ORGANISMS ) ) { + saw_cellular_organism = true; + } + else if ( lineage[ 0 ].equalsIgnoreCase( UniProtTaxonomy.X ) ) { + saw_x = true; + } + if ( ( saw_cellular_organism && saw_viruses ) || saw_x ) { + break; + } + } + } + if ( ( saw_cellular_organism && saw_viruses ) || saw_x ) { + last_common_lineage.add( UniProtTaxonomy.X ); + last_common = UniProtTaxonomy.X; + } + else { + String msg = "no common lineage for:\n"; + int counter = 0; + for( final String[] strings : lineages ) { + msg += counter + ": "; + ++counter; + for( final String string : strings ) { + msg += string + " "; + } + msg += "\n"; + } + throw new AncestralTaxonomyInferenceException( msg ); + } + } + final Taxonomy tax = new Taxonomy(); + n.getNodeData().setTaxonomy( tax ); + tax.setScientificName( last_common ); + final UniProtTaxonomy up_tax = TaxonomyDataManager.obtainUniProtTaxonomyFromLineage( last_common_lineage ); + if ( up_tax != null ) { + if ( !ForesterUtil.isEmpty( up_tax.getRank() ) ) { + try { + tax.setRank( up_tax.getRank().toLowerCase() ); + } + catch ( final PhyloXmlDataFormatException ex ) { + tax.setRank( "" ); + } + } + if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) { + tax.setIdentifier( new Identifier( up_tax.getId(), "uniprot" ) ); + } + if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) { + tax.setCommonName( up_tax.getCommonName() ); + } + if ( !ForesterUtil.isEmpty( up_tax.getSynonym() ) && !tax.getSynonyms().contains( up_tax.getSynonym() ) ) { + tax.getSynonyms().add( up_tax.getSynonym() ); + } + if ( up_tax.getLineage() != null ) { + tax.setLineage( new ArrayList() ); + for( final String lin : up_tax.getLineage() ) { + if ( !ForesterUtil.isEmpty( lin ) ) { + tax.getLineage().add( lin ); + } + } + } + } + if ( ForesterUtil.isEmpty( tax.getLineage() ) ) { + tax.setLineage( new ArrayList() ); + for( final String lin : last_common_lineage ) { + if ( !ForesterUtil.isEmpty( lin ) ) { + tax.getLineage().add( lin ); + } + } + } + for( final PhylogenyNode desc : descs ) { + if ( !desc.isExternal() && desc.getNodeData().isHasTaxonomy() + && desc.getNodeData().getTaxonomy().isEqual( tax ) ) { + desc.getNodeData().setTaxonomy( null ); + } + } + } }