X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fanalysis%2FAncestralTaxonomyInference.java;h=94453ff5834713b749ea89852981a135a932e406;hb=refs%2Fheads%2Fkjvdh%2Ffeatures%2Fforester;hp=0d1465a1ed51ea14d2130f359d3e5b513b541520;hpb=c4f9dc6343e1fee8846c893b968065d9d9178655;p=jalview.git diff --git a/forester/java/src/org/forester/analysis/AncestralTaxonomyInference.java b/forester/java/src/org/forester/analysis/AncestralTaxonomyInference.java index 0d1465a..94453ff 100644 --- a/forester/java/src/org/forester/analysis/AncestralTaxonomyInference.java +++ b/forester/java/src/org/forester/analysis/AncestralTaxonomyInference.java @@ -20,18 +20,14 @@ // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA // // Contact: phylosoft @ gmail . com -// WWW: www.phylosoft.org/forester +// WWW: https://sites.google.com/site/cmzmasek/home/software/forester package org.forester.analysis; import java.io.IOException; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.SortedSet; -import java.util.TreeSet; -import org.forester.archaeopteryx.tools.AncestralTaxonomyInferenceException; import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyNode; @@ -39,117 +35,13 @@ import org.forester.phylogeny.data.Identifier; import org.forester.phylogeny.data.Taxonomy; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.util.ForesterUtil; -import org.forester.ws.uniprot.UniProtTaxonomy; -import org.forester.ws.uniprot.UniProtWsTools; +import org.forester.ws.seqdb.UniProtTaxonomy; public final class AncestralTaxonomyInference { - private static final int MAX_CACHE_SIZE = 100000; - private static final int MAX_TAXONOMIES_TO_RETURN = 100; - private static final HashMap _sn_up_cache_map = new HashMap(); - private static final HashMap _code_up_cache_map = new HashMap(); - private static final HashMap _cn_up_cache_map = new HashMap(); - private static final HashMap _id_up_cache_map = new HashMap(); - - synchronized private static void clearCachesIfTooLarge() { - if ( getSnTaxCacheMap().size() > MAX_CACHE_SIZE ) { - getSnTaxCacheMap().clear(); - } - if ( getCnTaxCacheMap().size() > MAX_CACHE_SIZE ) { - getCnTaxCacheMap().clear(); - } - if ( getCodeTaxCacheMap().size() > MAX_CACHE_SIZE ) { - getCodeTaxCacheMap().clear(); - } - if ( getIdTaxCacheMap().size() > MAX_CACHE_SIZE ) { - getIdTaxCacheMap().clear(); - } - } - - synchronized private static HashMap getCnTaxCacheMap() { - return _cn_up_cache_map; - } - - synchronized private static HashMap getCodeTaxCacheMap() { - return _code_up_cache_map; - } - - synchronized private static HashMap getIdTaxCacheMap() { - return _id_up_cache_map; - } - - synchronized private static HashMap getSnTaxCacheMap() { - return _sn_up_cache_map; - } - - synchronized private static UniProtTaxonomy getTaxonomies( final HashMap cache, - final String query, - final QUERY_TYPE qt ) throws IOException { - if ( cache.containsKey( query ) ) { - return cache.get( query ).copy(); - } - else { - List up_taxonomies = null; - switch ( qt ) { - case ID: - up_taxonomies = getTaxonomiesFromId( query ); - break; - case CODE: - up_taxonomies = getTaxonomiesFromTaxonomyCode( query ); - break; - case SN: - up_taxonomies = getTaxonomiesFromScientificName( query ); - break; - case CN: - up_taxonomies = getTaxonomiesFromCommonName( query ); - break; - default: - throw new RuntimeException(); - } - if ( ( up_taxonomies != null ) && ( up_taxonomies.size() == 1 ) ) { - final UniProtTaxonomy up_tax = up_taxonomies.get( 0 ); - if ( !ForesterUtil.isEmpty( up_tax.getScientificName() ) ) { - getSnTaxCacheMap().put( up_tax.getScientificName(), up_tax ); - } - if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) { - getCodeTaxCacheMap().put( up_tax.getCode(), up_tax ); - } - if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) { - getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax ); - } - if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) { - getIdTaxCacheMap().put( up_tax.getId(), up_tax ); - } - return up_tax; - } - else { - return null; - } - } - } - - synchronized private static List getTaxonomiesFromCommonName( final String query ) - throws IOException { - return UniProtWsTools.getTaxonomiesFromCommonNameStrict( query, MAX_TAXONOMIES_TO_RETURN ); - } - - synchronized private static List getTaxonomiesFromId( final String query ) throws IOException { - return UniProtWsTools.getTaxonomiesFromId( query, MAX_TAXONOMIES_TO_RETURN ); - } - - synchronized private static List getTaxonomiesFromScientificName( final String query ) - throws IOException { - return UniProtWsTools.getTaxonomiesFromScientificNameStrict( query, MAX_TAXONOMIES_TO_RETURN ); - } - - synchronized private static List getTaxonomiesFromTaxonomyCode( final String query ) - throws IOException { - return UniProtWsTools.getTaxonomiesFromTaxonomyCode( query, MAX_TAXONOMIES_TO_RETURN ); - } - - synchronized public static void inferTaxonomyFromDescendents( final Phylogeny phy ) throws IOException, - AncestralTaxonomyInferenceException { - clearCachesIfTooLarge(); + public static void inferTaxonomyFromDescendents( final Phylogeny phy ) throws IOException, + AncestralTaxonomyInferenceException { + TaxonomyDataManager.clearCachesIfTooLarge(); for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) { final PhylogenyNode node = iter.next(); if ( !node.isExternal() ) { @@ -158,8 +50,8 @@ public final class AncestralTaxonomyInference { } } - synchronized private static void inferTaxonomyFromDescendents( final PhylogenyNode n ) throws IOException, - AncestralTaxonomyInferenceException { + private static void inferTaxonomyFromDescendents( final PhylogenyNode n ) throws IOException, + AncestralTaxonomyInferenceException { if ( n.isExternal() ) { throw new IllegalArgumentException( "attempt to infer taxonomy from descendants of external node" ); } @@ -169,18 +61,34 @@ public final class AncestralTaxonomyInference { int shortest_lin_length = Integer.MAX_VALUE; for( final PhylogenyNode desc : descs ) { if ( desc.getNodeData().isHasTaxonomy() - && ( isHasAppropriateId( desc.getNodeData().getTaxonomy() ) + && ( TaxonomyDataManager.isHasAppropriateId( desc.getNodeData().getTaxonomy() ) || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getScientificName() ) + || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getLineage() ) || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getTaxonomyCode() ) || !ForesterUtil .isEmpty( desc.getNodeData().getTaxonomy().getCommonName() ) ) ) { - final UniProtTaxonomy up_tax = obtainUniProtTaxonomy( desc.getNodeData().getTaxonomy(), null, null ); - String[] lineage = null; - if ( up_tax != null ) { - lineage = up_tax.getLineageAsArray(); + final UniProtTaxonomy up_tax = TaxonomyDataManager.obtainUniProtTaxonomy( desc.getNodeData() + .getTaxonomy(), null, null ); + if ( ( up_tax == null ) && ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getLineage() ) ) { + String desc_str = ""; + if ( !ForesterUtil.isEmpty( desc.getName() ) ) { + desc_str = "\"" + desc.getName() + "\""; + } + else { + desc_str = "[" + desc.getId() + "]"; + } + System.out.println( desc.getNodeData().getTaxonomy().toString() ); + System.out.println( ForesterUtil.stringListToString( desc.getNodeData().getTaxonomy().getLineage(), + " > " ) ); + throw new AncestralTaxonomyInferenceException( "a taxonomy for node " + desc_str + + " could not be established from the database" ); + } + String[] lineage = ForesterUtil.stringListToArray( desc.getNodeData().getTaxonomy().getLineage() ); + if ( ( lineage == null ) || ( lineage.length < 1 ) ) { + lineage = ForesterUtil.stringListToArray( up_tax.getLineage() ); } if ( ( lineage == null ) || ( lineage.length < 1 ) ) { throw new AncestralTaxonomyInferenceException( "a taxonomic lineage for node \"" - + desc.getNodeData().getTaxonomy().toString() + "\" could not be found" ); + + desc.getNodeData().getTaxonomy().toString() + "\" could not be established" ); } if ( lineage.length < shortest_lin_length ) { shortest_lin_length = lineage.length; @@ -195,19 +103,8 @@ public final class AncestralTaxonomyInference { else { node = "[" + desc.getId() + "]"; } - // final List e = desc.getAllExternalDescendants(); - //TODO remove me! - // System.out.println(); - // int x = 0; - // for( final PhylogenyNode object : e ) { - // System.out.println( x + ":" ); - // System.out.println( object.getName() + " " ); - // x++; - // } - // System.out.println(); - // throw new AncestralTaxonomyInferenceException( "node " + node - + " has no or inappropriate taxonomic information" ); + + " has no or inappropriate taxonomic information" ); } } final List last_common_lineage = new ArrayList(); @@ -220,29 +117,52 @@ public final class AncestralTaxonomyInference { break I; } } - // last_common_lineage = lineage_0; last_common_lineage.add( lineage_0 ); last_common = lineage_0; } } - // if ( last_common_lineage == null ) { if ( last_common_lineage.isEmpty() ) { - String msg = "no common lineage for:\n"; - int counter = 0; - for( final String[] strings : lineages ) { - msg += counter + ": "; - ++counter; - for( final String string : strings ) { - msg += string + " "; + boolean saw_viruses = false; + boolean saw_cellular_organism = false; + boolean saw_x = false; + for( final String[] lineage : lineages ) { + if ( lineage.length > 0 ) { + if ( lineage[ 0 ].equalsIgnoreCase( UniProtTaxonomy.VIRUSES ) ) { + saw_viruses = true; + } + else if ( lineage[ 0 ].equalsIgnoreCase( UniProtTaxonomy.CELLULAR_ORGANISMS ) ) { + saw_cellular_organism = true; + } + else if ( lineage[ 0 ].equalsIgnoreCase( UniProtTaxonomy.X ) ) { + saw_x = true; + } + if ( ( saw_cellular_organism && saw_viruses ) || saw_x ) { + break; + } + } + } + if ( ( saw_cellular_organism && saw_viruses ) || saw_x ) { + last_common_lineage.add( UniProtTaxonomy.X ); + last_common = UniProtTaxonomy.X; + } + else { + String msg = "no common lineage for:\n"; + int counter = 0; + for( final String[] strings : lineages ) { + msg += counter + ": "; + ++counter; + for( final String string : strings ) { + msg += string + " "; + } + msg += "\n"; } - msg += "\n"; + throw new AncestralTaxonomyInferenceException( msg ); } - throw new AncestralTaxonomyInferenceException( msg ); } final Taxonomy tax = new Taxonomy(); n.getNodeData().setTaxonomy( tax ); tax.setScientificName( last_common ); - final UniProtTaxonomy up_tax = obtainUniProtTaxonomyFromCommonLineage( last_common_lineage ); + final UniProtTaxonomy up_tax = TaxonomyDataManager.obtainUniProtTaxonomyFromLineage( last_common_lineage ); if ( up_tax != null ) { if ( !ForesterUtil.isEmpty( up_tax.getRank() ) ) { try { @@ -270,224 +190,19 @@ public final class AncestralTaxonomyInference { } } } - for( final PhylogenyNode desc : descs ) { - if ( !desc.isExternal() && desc.getNodeData().isHasTaxonomy() - && desc.getNodeData().getTaxonomy().isEqual( tax ) ) { - desc.getNodeData().setTaxonomy( null ); - } - } - } - - synchronized private static boolean isHasAppropriateId( final Taxonomy tax ) { - return ( ( tax.getIdentifier() != null ) && ( !ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) && ( tax - .getIdentifier().getProvider().equalsIgnoreCase( "ncbi" ) - || tax.getIdentifier().getProvider().equalsIgnoreCase( "uniprot" ) || tax.getIdentifier().getProvider() - .equalsIgnoreCase( "uniprotkb" ) ) ) ); - } - - synchronized public static SortedSet obtainDetailedTaxonomicInformation( final Phylogeny phy, - final boolean delete ) - throws IOException { - clearCachesIfTooLarge(); - final SortedSet not_found = new TreeSet(); - List not_found_external_nodes = null; - if ( delete ) { - not_found_external_nodes = new ArrayList(); - } - for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) { - final PhylogenyNode node = iter.next(); - final QUERY_TYPE qt = null; - Taxonomy tax = null; - if ( node.getNodeData().isHasTaxonomy() ) { - tax = node.getNodeData().getTaxonomy(); - } - else if ( node.isExternal() ) { - if ( !ForesterUtil.isEmpty( node.getName() ) ) { - not_found.add( node.getName() ); - } - else { - not_found.add( node.toString() ); - } - if ( delete ) { - not_found_external_nodes.add( node ); - } - } - UniProtTaxonomy uniprot_tax = null; - if ( ( tax != null ) - && ( isHasAppropriateId( tax ) || !ForesterUtil.isEmpty( tax.getScientificName() ) - || !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) || !ForesterUtil.isEmpty( tax - .getCommonName() ) ) ) { - uniprot_tax = obtainUniProtTaxonomy( tax, null, qt ); - if ( uniprot_tax != null ) { - updateTaxonomy( qt, node, tax, uniprot_tax ); - } - else { - not_found.add( tax.toString() ); - if ( delete && node.isExternal() ) { - not_found_external_nodes.add( node ); - } - } - } - } - if ( delete ) { - for( final PhylogenyNode node : not_found_external_nodes ) { - phy.deleteSubtree( node, true ); - } - phy.externalNodesHaveChanged(); - phy.hashIDs(); - phy.recalculateNumberOfExternalDescendants( true ); - } - return not_found; - } - - // TODO this might not be needed anymore - // synchronized private static String[] obtainLineagePlusOwnScientificName( final UniProtTaxonomy up_tax ) { - // final String[] lineage = up_tax.getLineageAsArray(); - // final String[] lin_plus_self = new String[ lineage.length + 1 ]; - // for( int i = 0; i < lineage.length; ++i ) { - // lin_plus_self[ i ] = lineage[ i ]; - // } - // lin_plus_self[ lineage.length ] = up_tax.getScientificName(); - // return lin_plus_self; - // } - synchronized private static UniProtTaxonomy obtainUniProtTaxonomy( final Taxonomy tax, String query, QUERY_TYPE qt ) - throws IOException { - if ( isHasAppropriateId( tax ) ) { - query = tax.getIdentifier().getValue(); - qt = QUERY_TYPE.ID; - System.out.println( "query by id: " + query ); - return getTaxonomies( getIdTaxCacheMap(), query, qt ); - } - else if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) { - query = tax.getScientificName(); - qt = QUERY_TYPE.SN; - System.out.println( "query by sn: " + query ); - return getTaxonomies( getSnTaxCacheMap(), query, qt ); - } - else if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) { - query = tax.getTaxonomyCode(); - qt = QUERY_TYPE.CODE; - return getTaxonomies( getCodeTaxCacheMap(), query, qt ); - } - else { - query = tax.getCommonName(); - qt = QUERY_TYPE.CN; - return getTaxonomies( getCnTaxCacheMap(), query, qt ); - } - } - - synchronized private static UniProtTaxonomy obtainUniProtTaxonomyFromSn( final String sn ) throws IOException { - UniProtTaxonomy up_tax = null; - if ( getSnTaxCacheMap().containsKey( sn ) ) { - up_tax = getSnTaxCacheMap().get( sn ).copy(); - } - else { - final List up_taxonomies = getTaxonomiesFromScientificName( sn ); - if ( ( up_taxonomies != null ) && ( up_taxonomies.size() == 1 ) ) { - up_tax = up_taxonomies.get( 0 ); - getSnTaxCacheMap().put( sn, up_tax ); - if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) { - getCodeTaxCacheMap().put( up_tax.getCode(), up_tax ); - } - if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) { - getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax ); - } - if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) { - getIdTaxCacheMap().put( up_tax.getId(), up_tax ); - } - } - } - return up_tax; - } - - synchronized private static UniProtTaxonomy obtainUniProtTaxonomyFromCommonLineage( final List lineage ) - throws AncestralTaxonomyInferenceException, IOException { - UniProtTaxonomy up_tax = null; - // -- if ( getSnTaxCacheMap().containsKey( sn ) ) { - // -- up_tax = getSnTaxCacheMap().get( sn ).copy(); - // -- } - // else { - final List up_taxonomies = getTaxonomiesFromScientificName( lineage.get( lineage.size() - 1 ) ); - //-- if ( ( up_taxonomies != null ) && ( up_taxonomies.size() == 1 ) ) { - if ( ( up_taxonomies != null ) && ( up_taxonomies.size() > 0 ) ) { - for( final UniProtTaxonomy up_taxonomy : up_taxonomies ) { - boolean match = true; - I: for( int i = 0; i < lineage.size(); ++i ) { - if ( !lineage.get( i ).equalsIgnoreCase( up_taxonomy.getLineage().get( i ) ) ) { - match = false; - break I; - } - } - if ( match ) { - if ( up_tax != null ) { - throw new AncestralTaxonomyInferenceException( "lineage \"" - + ForesterUtil.stringListToString( lineage, " > " ) + "\" is not unique" ); - } - up_tax = up_taxonomy; - } - } - if ( up_tax == null ) { - throw new AncestralTaxonomyInferenceException( "lineage \"" - + ForesterUtil.stringListToString( lineage, " > " ) + "\" not found" ); - } - //-- up_tax = up_taxonomies.get( 0 ); - //-- getSnTaxCacheMap().put( sn, up_tax ); - if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) { - getCodeTaxCacheMap().put( up_tax.getCode(), up_tax ); - } - if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) { - getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax ); - } - if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) { - getIdTaxCacheMap().put( up_tax.getId(), up_tax ); - } - } - // } - return up_tax; - } - - synchronized private static void updateTaxonomy( final QUERY_TYPE qt, - final PhylogenyNode node, - final Taxonomy tax, - final UniProtTaxonomy up_tax ) { - if ( ( qt != QUERY_TYPE.SN ) && !ForesterUtil.isEmpty( up_tax.getScientificName() ) - && ForesterUtil.isEmpty( tax.getScientificName() ) ) { - tax.setScientificName( up_tax.getScientificName() ); - } - // if ( node.isExternal() - if ( ( qt != QUERY_TYPE.CODE ) && !ForesterUtil.isEmpty( up_tax.getCode() ) - && ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) { - tax.setTaxonomyCode( up_tax.getCode() ); - } - if ( ( qt != QUERY_TYPE.CN ) && !ForesterUtil.isEmpty( up_tax.getCommonName() ) - && ForesterUtil.isEmpty( tax.getCommonName() ) ) { - tax.setCommonName( up_tax.getCommonName() ); - } - if ( !ForesterUtil.isEmpty( up_tax.getSynonym() ) && !tax.getSynonyms().contains( up_tax.getSynonym() ) ) { - tax.getSynonyms().add( up_tax.getSynonym() ); - } - if ( !ForesterUtil.isEmpty( up_tax.getRank() ) && ForesterUtil.isEmpty( tax.getRank() ) ) { - try { - tax.setRank( up_tax.getRank().toLowerCase() ); - } - catch ( final PhyloXmlDataFormatException ex ) { - tax.setRank( "" ); - } - } - if ( ( qt != QUERY_TYPE.ID ) && !ForesterUtil.isEmpty( up_tax.getId() ) && ( tax.getIdentifier() == null ) ) { - tax.setIdentifier( new Identifier( up_tax.getId(), "uniprot" ) ); - } - if ( up_tax.getLineage() != null ) { + if ( ForesterUtil.isEmpty( tax.getLineage() ) ) { tax.setLineage( new ArrayList() ); - for( final String lin : up_tax.getLineage() ) { + for( final String lin : last_common_lineage ) { if ( !ForesterUtil.isEmpty( lin ) ) { tax.getLineage().add( lin ); } } } - } - - private enum QUERY_TYPE { - CODE, SN, CN, ID; + for( final PhylogenyNode desc : descs ) { + if ( !desc.isExternal() && desc.getNodeData().isHasTaxonomy() + && desc.getNodeData().getTaxonomy().isEqual( tax ) ) { + desc.getNodeData().setTaxonomy( null ); + } + } } }