// forester -- software libraries and applications // for genomics and evolutionary biology research. // // Copyright (C) 2010 Christian M Zmasek // Copyright (C) 2010 Sanford-Burnham Medical Research Institute // All rights reserved // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA // // Contact: phylosoft @ gmail . com // WWW: www.phylosoft.org/forester package org.forester.analysis; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.SortedSet; import java.util.TreeSet; import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.data.Identifier; import org.forester.phylogeny.data.Taxonomy; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.util.ForesterUtil; import org.forester.ws.uniprot.UniProtTaxonomy; import org.forester.ws.uniprot.UniProtWsTools; public final class AncestralTaxonomyInference { private static final int MAX_CACHE_SIZE = 100000; private static final int MAX_TAXONOMIES_TO_RETURN = 100; private static final HashMap _sn_up_cache_map = new HashMap(); private static final HashMap _code_up_cache_map = new HashMap(); private static final HashMap _cn_up_cache_map = new HashMap(); private static final HashMap _id_up_cache_map = new HashMap(); synchronized private static void clearCachesIfTooLarge() { if ( getSnTaxCacheMap().size() > MAX_CACHE_SIZE ) { getSnTaxCacheMap().clear(); } if ( getCnTaxCacheMap().size() > MAX_CACHE_SIZE ) { getCnTaxCacheMap().clear(); } if ( getCodeTaxCacheMap().size() > MAX_CACHE_SIZE ) { getCodeTaxCacheMap().clear(); } if ( getIdTaxCacheMap().size() > MAX_CACHE_SIZE ) { getIdTaxCacheMap().clear(); } } synchronized private static HashMap getCnTaxCacheMap() { return _cn_up_cache_map; } synchronized private static HashMap getCodeTaxCacheMap() { return _code_up_cache_map; } synchronized private static HashMap getIdTaxCacheMap() { return _id_up_cache_map; } synchronized private static HashMap getSnTaxCacheMap() { return _sn_up_cache_map; } synchronized private static UniProtTaxonomy getTaxonomies( final HashMap cache, final String query, final QUERY_TYPE qt ) throws IOException { if ( cache.containsKey( query ) ) { return cache.get( query ).copy(); } else { List up_taxonomies = null; switch ( qt ) { case ID: up_taxonomies = getTaxonomiesFromId( query ); break; case CODE: up_taxonomies = getTaxonomiesFromTaxonomyCode( query ); break; case SN: up_taxonomies = getTaxonomiesFromScientificName( query ); break; case CN: up_taxonomies = getTaxonomiesFromCommonName( query ); break; default: throw new RuntimeException(); } if ( ( up_taxonomies != null ) && ( up_taxonomies.size() == 1 ) ) { final UniProtTaxonomy up_tax = up_taxonomies.get( 0 ); if ( !ForesterUtil.isEmpty( up_tax.getScientificName() ) ) { getSnTaxCacheMap().put( up_tax.getScientificName(), up_tax ); } if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) { getCodeTaxCacheMap().put( up_tax.getCode(), up_tax ); } if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) { getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax ); } if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) { getIdTaxCacheMap().put( up_tax.getId(), up_tax ); } return up_tax; } else { return null; } } } synchronized private static List getTaxonomiesFromCommonName( final String query ) throws IOException { return UniProtWsTools.getTaxonomiesFromCommonNameStrict( query, MAX_TAXONOMIES_TO_RETURN ); } synchronized private static List getTaxonomiesFromId( final String query ) throws IOException { return UniProtWsTools.getTaxonomiesFromId( query, MAX_TAXONOMIES_TO_RETURN ); } synchronized private static List getTaxonomiesFromScientificName( final String query ) throws IOException { return UniProtWsTools.getTaxonomiesFromScientificNameStrict( query, MAX_TAXONOMIES_TO_RETURN ); } synchronized private static List getTaxonomiesFromTaxonomyCode( final String query ) throws IOException { return UniProtWsTools.getTaxonomiesFromTaxonomyCode( query, MAX_TAXONOMIES_TO_RETURN ); } synchronized public static SortedSet inferTaxonomyFromDescendents( final Phylogeny phy ) throws IOException { clearCachesIfTooLarge(); final SortedSet not_found = new TreeSet(); for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) { final PhylogenyNode node = iter.next(); // final QUERY_TYPE qt = null; // Taxonomy tax = null; // if ( node.getNodeData().isHasTaxonomy() ) { // tax = node.getNodeData().getTaxonomy(); // } // UniProtTaxonomy up_tax = null; // if ( ( tax != null ) // && ( isHasAppropriateId( tax ) || !ForesterUtil.isEmpty( // tax.getScientificName() ) // || !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) || // !ForesterUtil.isEmpty( tax // .getCommonName() ) ) ) { // final String query = null; // up_tax = obtainUniProtTaxonomy( tax, query, qt ); // if ( up_tax == null ) { // not_found.add( query ); // } // else { // updateTaxonomy( qt, node, tax, up_tax ); // } // } if ( !node.isExternal() ) { inferTaxonomyFromDescendents( node, not_found ); } } return not_found; } synchronized private static void inferTaxonomyFromDescendents( final PhylogenyNode n, final SortedSet not_found ) throws IOException { if ( n.isExternal() ) { throw new IllegalArgumentException( "attempt to infer taxonomy from descendants of external node" ); } n.getNodeData().setTaxonomy( null ); final List descs = n.getDescendants(); final List lineages = new ArrayList(); int shortest_lin_length = Integer.MAX_VALUE; for( final PhylogenyNode desc : descs ) { if ( desc.getNodeData().isHasTaxonomy() && ( isHasAppropriateId( desc.getNodeData().getTaxonomy() ) || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getScientificName() ) || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getTaxonomyCode() ) || !ForesterUtil .isEmpty( desc.getNodeData().getTaxonomy().getCommonName() ) ) ) { final QUERY_TYPE qt = null; final String query = null; final UniProtTaxonomy up_tax = obtainUniProtTaxonomy( desc.getNodeData().getTaxonomy(), query, qt ); String[] lineage = null; if ( up_tax != null ) { lineage = obtainLineagePlusOwnScientificName( up_tax ); } if ( ( lineage == null ) || ( lineage.length < 1 ) ) { not_found.add( desc.getNodeData().getTaxonomy().asText().toString() ); return; } if ( lineage.length < shortest_lin_length ) { shortest_lin_length = lineage.length; } lineages.add( lineage ); } else { String msg = "Node(s) with no or inappropriate taxonomic information found"; if ( !ForesterUtil.isEmpty( desc.getName() ) ) { msg = "Node " + desc.getName() + " has no or inappropriate taxonomic information"; } throw new IllegalArgumentException( msg ); } } String last_common_lineage = null; if ( shortest_lin_length > 0 ) { I: for( int i = 0; i < shortest_lin_length; ++i ) { final String lineage_0 = lineages.get( 0 )[ i ]; for( int j = 1; j < lineages.size(); ++j ) { if ( !lineage_0.equals( lineages.get( j )[ i ] ) ) { break I; } } last_common_lineage = lineage_0; } } if ( last_common_lineage == null ) { return; } // if ( !n.getNodeData().isHasTaxonomy() ) { // n.getNodeData().setTaxonomy( new Taxonomy() ); // } final Taxonomy tax = new Taxonomy(); n.getNodeData().setTaxonomy( tax ); tax.setScientificName( last_common_lineage ); final UniProtTaxonomy up_tax = obtainUniProtTaxonomyFromSn( last_common_lineage ); if ( up_tax != null ) { if ( !ForesterUtil.isEmpty( up_tax.getRank() ) ) { try { tax.setRank( up_tax.getRank().toLowerCase() ); } catch ( final PhyloXmlDataFormatException ex ) { tax.setRank( "" ); } } if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) { tax.setIdentifier( new Identifier( up_tax.getId(), "uniprot" ) ); } if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) { tax.setCommonName( up_tax.getCommonName() ); } if ( !ForesterUtil.isEmpty( up_tax.getSynonym() ) && !tax.getSynonyms().contains( up_tax.getSynonym() ) ) { tax.getSynonyms().add( up_tax.getSynonym() ); } } for( final PhylogenyNode desc : descs ) { if ( !desc.isExternal() && desc.getNodeData().isHasTaxonomy() && desc.getNodeData().getTaxonomy().isEqual( tax ) ) { desc.getNodeData().setTaxonomy( null ); } } } synchronized private static boolean isHasAppropriateId( final Taxonomy tax ) { return ( ( tax.getIdentifier() != null ) && ( !ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) && ( tax .getIdentifier().getProvider().equalsIgnoreCase( "ncbi" ) || tax.getIdentifier().getProvider().equalsIgnoreCase( "uniprot" ) || tax.getIdentifier().getProvider() .equalsIgnoreCase( "uniprotkb" ) ) ) ); } synchronized public static SortedSet obtainDetailedTaxonomicInformation( final Phylogeny phy ) throws IOException { clearCachesIfTooLarge(); final SortedSet not_found = new TreeSet(); for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) { final PhylogenyNode node = iter.next(); final QUERY_TYPE qt = null; Taxonomy tax = null; if ( node.getNodeData().isHasTaxonomy() ) { tax = node.getNodeData().getTaxonomy(); } else if ( node.isExternal() ) { if ( !ForesterUtil.isEmpty( node.getName() ) ) { not_found.add( node.getName() ); } else { not_found.add( node.toString() ); } } UniProtTaxonomy up_tax = null; if ( ( tax != null ) && ( isHasAppropriateId( tax ) || !ForesterUtil.isEmpty( tax.getScientificName() ) || !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) || !ForesterUtil.isEmpty( tax .getCommonName() ) ) ) { up_tax = obtainUniProtTaxonomy( tax, null, qt ); if ( up_tax != null ) { updateTaxonomy( qt, node, tax, up_tax ); } else { not_found.add( tax.toString() ); } } } return not_found; } synchronized private static String[] obtainLineagePlusOwnScientificName( final UniProtTaxonomy up_tax ) { final String[] lineage = up_tax.getLineage(); final String[] lin_plus_self = new String[ lineage.length + 1 ]; for( int i = 0; i < lineage.length; ++i ) { lin_plus_self[ i ] = lineage[ i ]; } lin_plus_self[ lineage.length ] = up_tax.getScientificName(); return lin_plus_self; } synchronized private static UniProtTaxonomy obtainUniProtTaxonomy( final Taxonomy tax, String query, QUERY_TYPE qt ) throws IOException { if ( isHasAppropriateId( tax ) ) { query = tax.getIdentifier().getValue(); qt = QUERY_TYPE.ID; return getTaxonomies( getIdTaxCacheMap(), query, qt ); } else if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) { query = tax.getScientificName(); qt = QUERY_TYPE.SN; return getTaxonomies( getSnTaxCacheMap(), query, qt ); } else if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) { query = tax.getTaxonomyCode(); qt = QUERY_TYPE.CODE; return getTaxonomies( getCodeTaxCacheMap(), query, qt ); } else { query = tax.getCommonName(); qt = QUERY_TYPE.CN; return getTaxonomies( getCnTaxCacheMap(), query, qt ); } } synchronized private static UniProtTaxonomy obtainUniProtTaxonomyFromSn( final String sn ) throws IOException { UniProtTaxonomy up_tax = null; if ( getSnTaxCacheMap().containsKey( sn ) ) { up_tax = getSnTaxCacheMap().get( sn ).copy(); } else { final List up_taxonomies = getTaxonomiesFromScientificName( sn ); if ( ( up_taxonomies != null ) && ( up_taxonomies.size() == 1 ) ) { up_tax = up_taxonomies.get( 0 ); getSnTaxCacheMap().put( sn, up_tax ); if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) { getCodeTaxCacheMap().put( up_tax.getCode(), up_tax ); } if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) { getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax ); } if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) { getIdTaxCacheMap().put( up_tax.getId(), up_tax ); } } } return up_tax; } synchronized private static void updateTaxonomy( final QUERY_TYPE qt, final PhylogenyNode node, final Taxonomy tax, final UniProtTaxonomy up_tax ) { if ( ( qt != QUERY_TYPE.SN ) && !ForesterUtil.isEmpty( up_tax.getScientificName() ) && ForesterUtil.isEmpty( tax.getScientificName() ) ) { tax.setScientificName( up_tax.getScientificName() ); } if ( node.isExternal() && ( ( qt != QUERY_TYPE.CODE ) && !ForesterUtil.isEmpty( up_tax.getCode() ) && ForesterUtil .isEmpty( tax.getTaxonomyCode() ) ) ) { tax.setTaxonomyCode( up_tax.getCode() ); } if ( ( qt != QUERY_TYPE.CN ) && !ForesterUtil.isEmpty( up_tax.getCommonName() ) && ForesterUtil.isEmpty( tax.getCommonName() ) ) { tax.setCommonName( up_tax.getCommonName() ); } if ( !ForesterUtil.isEmpty( up_tax.getSynonym() ) && !tax.getSynonyms().contains( up_tax.getSynonym() ) ) { tax.getSynonyms().add( up_tax.getSynonym() ); } if ( !ForesterUtil.isEmpty( up_tax.getRank() ) && ForesterUtil.isEmpty( tax.getRank() ) ) { try { tax.setRank( up_tax.getRank().toLowerCase() ); } catch ( final PhyloXmlDataFormatException ex ) { tax.setRank( "" ); } } if ( ( qt != QUERY_TYPE.ID ) && !ForesterUtil.isEmpty( up_tax.getId() ) && ( tax.getIdentifier() == null ) ) { tax.setIdentifier( new Identifier( up_tax.getId(), "uniprot" ) ); } } private enum QUERY_TYPE { CODE, SN, CN, ID; } }