X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fanalysis%2FTaxonomyDataManager.java;h=d78720cf4dcfc660171f46c48fe6f24b43089c4b;hb=06b38f91bc061d8ab1dfea3b6238c94c95a30d26;hp=0e48458870538bedf48274e4d8e8d45680802902;hpb=c3ef647b344999a7989bc61589a2286ceeeb2f7f;p=jalview.git diff --git a/forester/java/src/org/forester/analysis/TaxonomyDataManager.java b/forester/java/src/org/forester/analysis/TaxonomyDataManager.java index 0e48458..d78720c 100644 --- a/forester/java/src/org/forester/analysis/TaxonomyDataManager.java +++ b/forester/java/src/org/forester/analysis/TaxonomyDataManager.java @@ -22,7 +22,7 @@ // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA // // Contact: phylosoft @ gmail . com -// WWW: www.phylosoft.org/forester +// WWW: https://sites.google.com/site/cmzmasek/home/software/forester package org.forester.analysis; @@ -33,21 +33,26 @@ import java.util.HashMap; import java.util.List; import java.util.SortedSet; import java.util.TreeSet; +import java.util.regex.Matcher; import javax.swing.JOptionPane; import org.forester.archaeopteryx.MainFrameApplication; import org.forester.archaeopteryx.TreePanel; +import org.forester.archaeopteryx.tools.AncestralTaxonomyInferrer; import org.forester.archaeopteryx.tools.RunnableProcess; +import org.forester.io.parsers.nhx.NHXParser; import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException; +import org.forester.io.parsers.util.ParserUtils; import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.data.Identifier; import org.forester.phylogeny.data.Taxonomy; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.util.ForesterUtil; -import org.forester.ws.uniprot.UniProtTaxonomy; -import org.forester.ws.uniprot.UniProtWsTools; +import org.forester.util.TaxonomyUtil; +import org.forester.ws.seqdb.SequenceDbWsTools; +import org.forester.ws.seqdb.UniProtTaxonomy; public final class TaxonomyDataManager extends RunnableProcess { @@ -55,18 +60,17 @@ public final class TaxonomyDataManager extends RunnableProcess { CODE, SN, CN, ID, LIN; } private static final int MAX_CACHE_SIZE = 100000; - private static final int MAX_TAXONOMIES_TO_RETURN = 10; + private static final int MAX_TAXONOMIES_TO_RETURN = 2000; private static final HashMap _sn_up_cache_map = new HashMap(); private static final HashMap _lineage_up_cache_map = new HashMap(); private static final HashMap _code_up_cache_map = new HashMap(); private static final HashMap _cn_up_cache_map = new HashMap(); private static final HashMap _id_up_cache_map = new HashMap(); - - private final Phylogeny _phy; - private final MainFrameApplication _mf; - private final TreePanel _treepanel; - private final boolean _delete; - private final boolean _allow_simple_names; + private final Phylogeny _phy; + private final MainFrameApplication _mf; + private final TreePanel _treepanel; + private final boolean _delete; + private final boolean _allow_simple_names; public TaxonomyDataManager( final MainFrameApplication mf, final TreePanel treepanel, final Phylogeny phy ) { _phy = phy; @@ -77,10 +81,10 @@ public final class TaxonomyDataManager extends RunnableProcess { } public TaxonomyDataManager( final MainFrameApplication mf, - final TreePanel treepanel, - final Phylogeny phy, - final boolean delete, - final boolean allow_simple_name ) { + final TreePanel treepanel, + final Phylogeny phy, + final boolean delete, + final boolean allow_simple_name ) { _phy = phy; _mf = mf; _treepanel = treepanel; @@ -88,7 +92,6 @@ public final class TaxonomyDataManager extends RunnableProcess { _allow_simple_names = allow_simple_name; } - synchronized static void clearCachesIfTooLarge() { if ( getSnTaxCacheMap().size() > MAX_CACHE_SIZE ) { getSnTaxCacheMap().clear(); @@ -127,9 +130,9 @@ public final class TaxonomyDataManager extends RunnableProcess { return _sn_up_cache_map; } - private final static UniProtTaxonomy getTaxonomies( final HashMap cache, - final Object query, - final QUERY_TYPE qt ) throws IOException, + private final static UniProtTaxonomy obtainTaxonomy( final HashMap cache, + final Object query, + final QUERY_TYPE qt ) throws IOException, AncestralTaxonomyInferenceException { if ( cache.containsKey( query ) ) { return cache.get( query ).copy(); @@ -177,21 +180,33 @@ public final class TaxonomyDataManager extends RunnableProcess { } private final static List getTaxonomiesFromCommonName( final String query ) throws IOException { - return UniProtWsTools.getTaxonomiesFromCommonNameStrict( query, MAX_TAXONOMIES_TO_RETURN ); + return SequenceDbWsTools.getTaxonomiesFromCommonNameStrict( query, MAX_TAXONOMIES_TO_RETURN ); } private final static List getTaxonomiesFromId( final String query ) throws IOException { - return UniProtWsTools.getTaxonomiesFromId( query, MAX_TAXONOMIES_TO_RETURN ); + return SequenceDbWsTools.getTaxonomiesFromId( query, MAX_TAXONOMIES_TO_RETURN ); } - private final static List getTaxonomiesFromScientificName( final String query ) throws IOException { - return UniProtWsTools.getTaxonomiesFromScientificNameStrict( query, MAX_TAXONOMIES_TO_RETURN ); + if ( query.equalsIgnoreCase( UniProtTaxonomy.BACTERIA ) || query.equalsIgnoreCase( UniProtTaxonomy.ARCHAEA ) + || query.equalsIgnoreCase( UniProtTaxonomy.VIRUSES ) + || query.equalsIgnoreCase( UniProtTaxonomy.EUKARYOTA ) || query.equalsIgnoreCase( UniProtTaxonomy.X ) ) { + final List l = new ArrayList(); + l.add( UniProtTaxonomy.createSpecialFromScientificName( query ) ); + return l; + } + return SequenceDbWsTools.getTaxonomiesFromScientificNameStrict( query, MAX_TAXONOMIES_TO_RETURN ); } - private final static List getTaxonomiesFromTaxonomyCode( final String query ) throws IOException { - return UniProtWsTools.getTaxonomiesFromTaxonomyCode( query, MAX_TAXONOMIES_TO_RETURN ); + //FIXME fix "SPHAR" issue + if ( ( ( query.indexOf( "XX" ) == 3 ) && TaxonomyUtil.isHasTaxIdFromFakeTaxCode( query ) ) + || query.equals( "SPHAR" ) /* TODO remove me, is same as Sphingomonas aromaticivorans */ + ) { + final int id = TaxonomyUtil.getTaxIdFromFakeTaxCode( query ); + return SequenceDbWsTools.getTaxonomiesFromId( String.valueOf( id ), MAX_TAXONOMIES_TO_RETURN ); + } + return SequenceDbWsTools.getTaxonomiesFromTaxonomyCode( query, MAX_TAXONOMIES_TO_RETURN ); } static final boolean isHasAppropriateId( final Taxonomy tax ) { @@ -202,8 +217,8 @@ public final class TaxonomyDataManager extends RunnableProcess { } synchronized final private static SortedSet obtainDetailedTaxonomicInformation( final Phylogeny phy, - final boolean delete, - final boolean allow_to_use_basic_node_names ) + final boolean delete, + final boolean allow_to_use_basic_node_names ) throws IOException, AncestralTaxonomyInferenceException { clearCachesIfTooLarge(); final SortedSet not_found = new TreeSet(); @@ -233,17 +248,32 @@ public final class TaxonomyDataManager extends RunnableProcess { } } UniProtTaxonomy uniprot_tax = null; - if ( ( ( tax != null ) - && ( isHasAppropriateId( tax ) || !ForesterUtil.isEmpty( tax.getScientificName() ) - || !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) || !ForesterUtil.isEmpty( tax - .getCommonName() ) ) ) || - ( allow_to_use_basic_node_names && !ForesterUtil.isEmpty( node.getName() ) ) ) { - uniprot_tax = obtainUniProtTaxonomy( tax, null, qt ); + if ( ( ( tax != null ) && ( isHasAppropriateId( tax ) || !ForesterUtil.isEmpty( tax.getScientificName() ) + || !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) || !ForesterUtil.isEmpty( tax.getCommonName() ) ) ) + || ( allow_to_use_basic_node_names && !ForesterUtil.isEmpty( node.getName() ) ) ) { + if ( ( ( tax != null ) && ( isHasAppropriateId( tax ) + || !ForesterUtil.isEmpty( tax.getScientificName() ) + || !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) || !ForesterUtil + .isEmpty( tax.getCommonName() ) ) ) ) { + uniprot_tax = obtainUniProtTaxonomy( tax, null, qt ); + } + else { + uniprot_tax = obtainUniProtTaxonomy( node.getName(), qt ); + } if ( uniprot_tax != null ) { + if ( tax == null ) { + tax = new Taxonomy(); + node.getNodeData().addTaxonomy( tax ); + } updateTaxonomy( qt, node, tax, uniprot_tax ); } else { - not_found.add( tax.toString() ); + if ( tax != null ) { + not_found.add( tax.toString() ); + } + else { + not_found.add( node.getName() ); + } if ( delete && node.isExternal() ) { not_found_external_nodes.add( node ); } @@ -255,7 +285,7 @@ public final class TaxonomyDataManager extends RunnableProcess { phy.deleteSubtree( node, true ); } phy.externalNodesHaveChanged(); - phy.hashIDs(); + phy.clearHashIdToNodeMap(); phy.recalculateNumberOfExternalDescendants( true ); } return not_found; @@ -264,92 +294,147 @@ public final class TaxonomyDataManager extends RunnableProcess { public final static UniProtTaxonomy obtainUniProtTaxonomy( final Taxonomy tax, Object query, QUERY_TYPE qt ) throws IOException, AncestralTaxonomyInferenceException { if ( tax == null ) { - throw new IllegalArgumentException( "illegal attempt to use empty taxonomy object"); + throw new IllegalArgumentException( "illegal attempt to use empty taxonomy object" ); } - - if ( TaxonomyDataManager.isHasAppropriateId( tax ) ) { query = tax.getIdentifier().getValue(); qt = QUERY_TYPE.ID; - return getTaxonomies( TaxonomyDataManager.getIdTaxCacheMap(), query, qt ); + return obtainTaxonomy( TaxonomyDataManager.getIdTaxCacheMap(), query, qt ); } else if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) { if ( !ForesterUtil.isEmpty( tax.getLineage() ) ) { query = tax.getLineage(); qt = QUERY_TYPE.LIN; - return getTaxonomies( TaxonomyDataManager.getLineageTaxCacheMap(), query, qt ); + return obtainTaxonomy( TaxonomyDataManager.getLineageTaxCacheMap(), query, qt ); } else { query = tax.getScientificName(); qt = QUERY_TYPE.SN; - return getTaxonomies( TaxonomyDataManager.getSnTaxCacheMap(), query, qt ); + return obtainTaxonomy( TaxonomyDataManager.getSnTaxCacheMap(), query, qt ); } } else if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) { query = tax.getTaxonomyCode(); qt = QUERY_TYPE.CODE; - return getTaxonomies( TaxonomyDataManager.getCodeTaxCacheMap(), query, qt ); + return obtainTaxonomy( TaxonomyDataManager.getCodeTaxCacheMap(), query, qt ); } else { query = tax.getCommonName(); qt = QUERY_TYPE.CN; - return getTaxonomies( TaxonomyDataManager.getCnTaxCacheMap(), query, qt ); + return obtainTaxonomy( TaxonomyDataManager.getCnTaxCacheMap(), query, qt ); + } + } + + public final static UniProtTaxonomy obtainUniProtTaxonomy( final String simple_name, QUERY_TYPE qt ) + throws IOException, AncestralTaxonomyInferenceException { + if ( ForesterUtil.isEmpty( simple_name ) ) { + throw new IllegalArgumentException( "illegal attempt to use empty simple name" ); + } + UniProtTaxonomy ut = null; + final String code = ParserUtils.extractTaxonomyCodeFromNodeName( simple_name, + NHXParser.TAXONOMY_EXTRACTION.AGGRESSIVE ); + if ( !ForesterUtil.isEmpty( code ) ) { + qt = QUERY_TYPE.CODE; + ut = obtainTaxonomy( TaxonomyDataManager.getCodeTaxCacheMap(), code, qt ); + } + if ( ut == null ) { + final String sn = ParserUtils.extractScientificNameFromNodeName( simple_name ); + if ( !ForesterUtil.isEmpty( sn ) ) { + qt = QUERY_TYPE.SN; + ut = obtainTaxonomy( TaxonomyDataManager.getSnTaxCacheMap(), sn, qt ); + } + } + if ( ut == null ) { + final String id = ParserUtils + .extractUniprotTaxonomyIdFromNodeName( simple_name, + NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ); + if ( !ForesterUtil.isEmpty( id ) ) { + qt = QUERY_TYPE.ID; + ut = obtainTaxonomy( TaxonomyDataManager.getIdTaxCacheMap(), id, qt ); + } } + if ( ut == null ) { + String sn = ""; + final Matcher m = ParserUtils.TAXOMONY_SN_PATTERN_GENUS.matcher( simple_name ); + if ( m.matches() ) { + sn = m.group( 1 ); + } + if ( !ForesterUtil.isEmpty( sn ) ) { + qt = QUERY_TYPE.SN; + ut = obtainTaxonomy( TaxonomyDataManager.getSnTaxCacheMap(), sn, qt ); + } + } + return ut; } static final UniProtTaxonomy obtainUniProtTaxonomyFromLineage( final List lineage ) throws AncestralTaxonomyInferenceException, IOException { final String lineage_str = ForesterUtil.stringListToString( lineage, ">" ); - UniProtTaxonomy up_tax = null; if ( TaxonomyDataManager.getLineageTaxCacheMap().containsKey( lineage_str ) ) { - up_tax = TaxonomyDataManager.getLineageTaxCacheMap().get( lineage_str ).copy(); + return TaxonomyDataManager.getLineageTaxCacheMap().get( lineage_str ).copy(); } else { + final List matching_taxonomies = new ArrayList(); final List up_taxonomies = getTaxonomiesFromScientificName( lineage .get( lineage.size() - 1 ) ); if ( ( up_taxonomies != null ) && ( up_taxonomies.size() > 0 ) ) { for( final UniProtTaxonomy up_taxonomy : up_taxonomies ) { boolean match = true; I: for( int i = 0; i < lineage.size(); ++i ) { - if ( !lineage.get( i ).equalsIgnoreCase( up_taxonomy.getLineage().get( i ) ) ) { + if ( ( i == up_taxonomy.getLineage().size() ) + || !lineage.get( i ).equalsIgnoreCase( up_taxonomy.getLineage().get( i ) ) ) { match = false; break I; } } if ( match ) { - if ( up_tax != null ) { - throw new AncestralTaxonomyInferenceException( "lineage \"" - + ForesterUtil.stringListToString( lineage, " > " ) + "\" is not unique" ); - } - up_tax = up_taxonomy; + matching_taxonomies.add( up_taxonomy ); } } - if ( up_tax == null ) { + if ( matching_taxonomies.isEmpty() ) { throw new AncestralTaxonomyInferenceException( "lineage \"" + ForesterUtil.stringListToString( lineage, " > " ) + "\" not found" ); } - TaxonomyDataManager.getLineageTaxCacheMap().put( lineage_str, up_tax ); - if ( !ForesterUtil.isEmpty( up_tax.getScientificName() ) ) { - TaxonomyDataManager.getSnTaxCacheMap().put( up_tax.getScientificName(), up_tax ); + //in case of more than one (e.g. "Xenopus" Genus and Subgenus), keep shorter, less specific one: + int shortest = Integer.MAX_VALUE; + UniProtTaxonomy least_specific_up_tax = null; + for( final UniProtTaxonomy m : matching_taxonomies ) { + final int s = m.getLineage().size(); + if ( s < shortest ) { + shortest = s; + least_specific_up_tax = m; + } } - if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) { - TaxonomyDataManager.getCodeTaxCacheMap().put( up_tax.getCode(), up_tax ); + TaxonomyDataManager.getLineageTaxCacheMap().put( lineage_str, least_specific_up_tax ); + if ( !ForesterUtil.isEmpty( least_specific_up_tax.getScientificName() ) ) { + TaxonomyDataManager.getSnTaxCacheMap().put( least_specific_up_tax.getScientificName(), + least_specific_up_tax ); } - if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) { - TaxonomyDataManager.getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax ); + if ( !ForesterUtil.isEmpty( least_specific_up_tax.getCode() ) ) { + TaxonomyDataManager.getCodeTaxCacheMap().put( least_specific_up_tax.getCode(), + least_specific_up_tax ); } - if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) { - TaxonomyDataManager.getIdTaxCacheMap().put( up_tax.getId(), up_tax ); + if ( !ForesterUtil.isEmpty( least_specific_up_tax.getCommonName() ) ) { + TaxonomyDataManager.getCnTaxCacheMap().put( least_specific_up_tax.getCommonName(), + least_specific_up_tax ); } + if ( !ForesterUtil.isEmpty( least_specific_up_tax.getId() ) ) { + TaxonomyDataManager.getIdTaxCacheMap().put( least_specific_up_tax.getId(), least_specific_up_tax ); + } + return least_specific_up_tax; + } + else { + throw new AncestralTaxonomyInferenceException( "taxonomy \"" + ( lineage.get( lineage.size() - 1 ) ) + + "\" not found" ); } } - return up_tax; } synchronized final private static void updateTaxonomy( final QUERY_TYPE qt, - final PhylogenyNode node, - final Taxonomy tax, - final UniProtTaxonomy up_tax ) { + final PhylogenyNode node, + final Taxonomy tax, + final UniProtTaxonomy up_tax ) + throws PhyloXmlDataFormatException { if ( ( qt != QUERY_TYPE.SN ) && !ForesterUtil.isEmpty( up_tax.getScientificName() ) && ForesterUtil.isEmpty( tax.getScientificName() ) ) { tax.setScientificName( up_tax.getScientificName() ); @@ -386,7 +471,7 @@ public final class TaxonomyDataManager extends RunnableProcess { } } } - + private final void execute() { start( _mf, "taxonomy data" ); SortedSet not_found = null; @@ -496,7 +581,7 @@ public final class TaxonomyDataManager extends RunnableProcess { } private final String getBaseUrl() { - return UniProtWsTools.BASE_URL; + return AncestralTaxonomyInferrer.getBaseUrl(); } @Override