3 // forester -- software libraries and applications
4 // for genomics and evolutionary biology research.
6 // Copyright (C) 2010 Christian M Zmasek
7 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 // Contact: phylosoft @ gmail . com
25 // WWW: www.phylosoft.org/forester
27 package org.forester.analysis;
29 import java.io.IOException;
30 import java.net.UnknownHostException;
31 import java.util.ArrayList;
32 import java.util.HashMap;
33 import java.util.List;
34 import java.util.SortedSet;
35 import java.util.TreeSet;
37 import javax.swing.JOptionPane;
39 import org.forester.archaeopteryx.MainFrameApplication;
40 import org.forester.archaeopteryx.TreePanel;
41 import org.forester.archaeopteryx.tools.RunnableProcess;
42 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
43 import org.forester.phylogeny.Phylogeny;
44 import org.forester.phylogeny.PhylogenyNode;
45 import org.forester.phylogeny.data.Identifier;
46 import org.forester.phylogeny.data.Taxonomy;
47 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
48 import org.forester.util.ForesterUtil;
49 import org.forester.ws.uniprot.UniProtTaxonomy;
50 import org.forester.ws.uniprot.UniProtWsTools;
52 public final class TaxonomyDataManager extends RunnableProcess {
// Presumably the constants of the QUERY_TYPE enum (its header line is not visible in
// this view). CODE, SN, CN and ID are referenced as QUERY_TYPE members elsewhere in
// this file; LIN is not referenced by name on any visible line.
55 CODE, SN, CN, ID, LIN;
// Size above which clearCachesIfTooLarge() empties a cache map.
57 private static final int MAX_CACHE_SIZE = 100000;
// Result limit passed as second argument to the UniProtWsTools lookups in this file.
58 private static final int MAX_TAXONOMIES_TO_RETURN = 10;
// Class-wide caches of UniProt results. Judging by the put() calls in this file they are
// keyed by scientific name, lineage string, taxonomy code, common name and id, in that order.
// NOTE(review): these are plain HashMaps shared by all instances; the synchronized accessors
// only guard handing out the reference, not later reads and writes on the map.
59 private static final HashMap<String, UniProtTaxonomy> _sn_up_cache_map = new HashMap<String, UniProtTaxonomy>();
60 private static final HashMap<String, UniProtTaxonomy> _lineage_up_cache_map = new HashMap<String, UniProtTaxonomy>();
61 private static final HashMap<String, UniProtTaxonomy> _code_up_cache_map = new HashMap<String, UniProtTaxonomy>();
62 private static final HashMap<String, UniProtTaxonomy> _cn_up_cache_map = new HashMap<String, UniProtTaxonomy>();
63 private static final HashMap<String, UniProtTaxonomy> _id_up_cache_map = new HashMap<String, UniProtTaxonomy>();
// Per-instance state, all final.
// _phy: phylogeny handed to obtainDetailedTaxonomicInformation and then to the tree panel.
64 private final Phylogeny _phy;
// _mf: frame used as parent of the message dialogs and passed to start().
65 private final MainFrameApplication _mf;
// _treepanel: panel that receives the processed tree and is flagged as edited.
66 private final TreePanel _treepanel;
// _delete: forwarded as the delete argument of obtainDetailedTaxonomicInformation.
67 private final boolean _delete;
// _allow_simple_names: forwarded as allow_to_use_basic_node_names.
68 private final boolean _allow_simple_names;
// Creates a manager for the given frame, tree panel and phylogeny.
// This variant never uses plain node names: _allow_simple_names is set to false.
// NOTE(review): the embedded numbering skips 71, 72 and 74, so the remaining field
// assignments (presumably _phy, _mf and _delete) are not visible in this view.
70 public TaxonomyDataManager( final MainFrameApplication mf, final TreePanel treepanel, final Phylogeny phy ) {
73 _treepanel = treepanel;
75 _allow_simple_names = false;
// Creates a manager with an explicit simple-name policy: allow_simple_name is stored
// in _allow_simple_names and treepanel in _treepanel.
// NOTE(review): the embedded numbering skips 80, 81, 83, 84 and 86, so further
// parameters and field assignments of this constructor are not visible in this view.
78 public TaxonomyDataManager( final MainFrameApplication mf,
79 final TreePanel treepanel,
82 final boolean allow_simple_name ) {
85 _treepanel = treepanel;
87 _allow_simple_names = allow_simple_name;
// Empties any of the five static taxonomy caches whose entry count exceeds MAX_CACHE_SIZE.
// A cache that is too large is cleared completely; there is no partial eviction.
// Declared static and synchronized, so the call runs while holding the class-level lock.
// NOTE(review): the closing lines of the if blocks are not visible in this view; the
// five checks appear to be independent of each other.
90 synchronized static void clearCachesIfTooLarge() {
// Scientific-name cache.
91 if ( getSnTaxCacheMap().size() > MAX_CACHE_SIZE ) {
92 getSnTaxCacheMap().clear();
// Lineage cache.
94 if ( getLineageTaxCacheMap().size() > MAX_CACHE_SIZE ) {
95 getLineageTaxCacheMap().clear();
// Common-name cache.
97 if ( getCnTaxCacheMap().size() > MAX_CACHE_SIZE ) {
98 getCnTaxCacheMap().clear();
// Taxonomy-code cache.
100 if ( getCodeTaxCacheMap().size() > MAX_CACHE_SIZE ) {
101 getCodeTaxCacheMap().clear();
// Id cache.
103 if ( getIdTaxCacheMap().size() > MAX_CACHE_SIZE ) {
104 getIdTaxCacheMap().clear();
// Returns the shared static cache _cn_up_cache_map (common-name cache) itself, not a copy.
// The synchronized modifier covers only this call; callers use the returned map outside the lock.
108 synchronized final static HashMap<String, UniProtTaxonomy> getCnTaxCacheMap() {
109 return _cn_up_cache_map;
// Returns the shared static cache _code_up_cache_map (taxonomy-code cache) itself, not a copy.
// The synchronized modifier covers only this call; callers use the returned map outside the lock.
112 synchronized final static HashMap<String, UniProtTaxonomy> getCodeTaxCacheMap() {
113 return _code_up_cache_map;
// Returns the shared static cache _id_up_cache_map (id cache) itself, not a copy.
// The synchronized modifier covers only this call; callers use the returned map outside the lock.
116 synchronized final static HashMap<String, UniProtTaxonomy> getIdTaxCacheMap() {
117 return _id_up_cache_map;
// Returns the shared static cache _lineage_up_cache_map (lineage cache) itself, not a copy.
// The synchronized modifier covers only this call; callers use the returned map outside the lock.
120 synchronized final static HashMap<String, UniProtTaxonomy> getLineageTaxCacheMap() {
121 return _lineage_up_cache_map;
// Returns the shared static cache _sn_up_cache_map (scientific-name cache) itself, not a copy.
// The synchronized modifier covers only this call; callers use the returned map outside the lock.
124 synchronized final static HashMap<String, UniProtTaxonomy> getSnTaxCacheMap() {
125 return _sn_up_cache_map;
// Resolves one query to a UniProtTaxonomy, consulting the supplied cache first.
// A cache hit returns a copy of the cached entry. Otherwise a UniProt lookup is made
// (by id, taxonomy code, scientific name or common name), or the query is delegated to
// obtainUniProtTaxonomyFromLineage, or a RuntimeException is thrown. When a lookup yields
// exactly one taxonomy, that object is stored in the scientific-name, code, common-name
// and id caches under each of its non-empty values.
// Throws IOException and AncestralTaxonomyInferenceException as declared.
// NOTE(review): the declaration of the query parameter, the branching on qt and the
// final return statement(s) are on lines missing from this view. Presumably the lookup
// branches are switch cases on qt and the throw is the default case - confirm in the full file.
// NOTE(review): the lineage cache is filled with String keys elsewhere in this file, while
// the lineage branch of obtainUniProtTaxonomy passes a list as query, so the containsKey
// check below presumably never hits for lineage queries; the lineage helper does its own
// cache lookup by string - verify.
128 private final static UniProtTaxonomy obtainTaxonomy( final HashMap<String, UniProtTaxonomy> cache,
130 final QUERY_TYPE qt ) throws IOException,
131 AncestralTaxonomyInferenceException {
132 if ( cache.containsKey( query ) ) {
// Hand out a copy, not the cached instance.
133 return cache.get( query ).copy();
136 List<UniProtTaxonomy> up_taxonomies = null;
// Lookup by id; query is cast to String.
139 up_taxonomies = getTaxonomiesFromId( ( String ) query );
// Lookup by taxonomy code.
142 up_taxonomies = getTaxonomiesFromTaxonomyCode( ( String ) query );
// Lookup by scientific name.
145 up_taxonomies = getTaxonomiesFromScientificName( ( String ) query );
// Lookup by common name.
148 up_taxonomies = getTaxonomiesFromCommonName( ( String ) query );
// Lineage queries return directly from the lineage helper; the cast to a list of strings is unchecked.
151 return obtainUniProtTaxonomyFromLineage( ( List<String> ) query );
// Reached for a query type none of the branches above handles (presumably the default branch).
153 throw new RuntimeException();
// Only an unambiguous result (exactly one taxonomy) is cached.
155 if ( ( up_taxonomies != null ) && ( up_taxonomies.size() == 1 ) ) {
156 final UniProtTaxonomy up_tax = up_taxonomies.get( 0 );
// The same up_tax object is registered in every cache for which it has a non-empty key.
157 if ( !ForesterUtil.isEmpty( up_tax.getScientificName() ) ) {
158 TaxonomyDataManager.getSnTaxCacheMap().put( up_tax.getScientificName(), up_tax );
160 if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) {
161 TaxonomyDataManager.getCodeTaxCacheMap().put( up_tax.getCode(), up_tax );
163 if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
164 TaxonomyDataManager.getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax );
166 if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
167 TaxonomyDataManager.getIdTaxCacheMap().put( up_tax.getId(), up_tax );
// Delegates to UniProtWsTools.getTaxonomiesFromCommonNameStrict with query and the
// MAX_TAXONOMIES_TO_RETURN limit, and returns its result unchanged.
// Declared to throw IOException.
177 private final static List<UniProtTaxonomy> getTaxonomiesFromCommonName( final String query ) throws IOException {
178 return UniProtWsTools.getTaxonomiesFromCommonNameStrict( query, MAX_TAXONOMIES_TO_RETURN );
// Delegates to UniProtWsTools.getTaxonomiesFromId with query and the
// MAX_TAXONOMIES_TO_RETURN limit, and returns its result unchanged.
// Declared to throw IOException.
181 private final static List<UniProtTaxonomy> getTaxonomiesFromId( final String query ) throws IOException {
182 return UniProtWsTools.getTaxonomiesFromId( query, MAX_TAXONOMIES_TO_RETURN );
// Delegates to UniProtWsTools.getTaxonomiesFromScientificNameStrict with query and the
// MAX_TAXONOMIES_TO_RETURN limit, and returns its result unchanged.
// Declared to throw IOException.
185 private final static List<UniProtTaxonomy> getTaxonomiesFromScientificName( final String query ) throws IOException {
186 return UniProtWsTools.getTaxonomiesFromScientificNameStrict( query, MAX_TAXONOMIES_TO_RETURN );
// Delegates to UniProtWsTools.getTaxonomiesFromTaxonomyCode with query and the
// MAX_TAXONOMIES_TO_RETURN limit, and returns its result unchanged.
// Declared to throw IOException.
189 private final static List<UniProtTaxonomy> getTaxonomiesFromTaxonomyCode( final String query ) throws IOException {
190 return UniProtWsTools.getTaxonomiesFromTaxonomyCode( query, MAX_TAXONOMIES_TO_RETURN );
// Tells whether tax carries an identifier this class can use for an id lookup:
// the identifier must be non-null, its value non-empty, and its provider must equal
// ncbi, uniprot or uniprotkb (compared ignoring case).
// NOTE(review): getProvider() is dereferenced without a null check; this would throw a
// NullPointerException if an identifier can have a null provider - confirm against Identifier.
// NOTE(review): tax itself is dereferenced unconditionally, so callers must pass non-null.
193 static final boolean isHasAppropriateId( final Taxonomy tax ) {
194 return ( ( tax.getIdentifier() != null ) && ( !ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) && ( tax
195 .getIdentifier().getProvider().equalsIgnoreCase( "ncbi" )
196 || tax.getIdentifier().getProvider().equalsIgnoreCase( "uniprot" ) || tax.getIdentifier().getProvider()
197 .equalsIgnoreCase( "uniprotkb" ) ) ) );
// Walks phy in postorder and tries to enrich each node with taxonomy data obtained from
// UniProt (via the obtainUniProtTaxonomy overloads and updateTaxonomy).
// Labels of nodes that could not be resolved are collected in a sorted set; nodes gathered
// in not_found_external_nodes are removed from phy after the walk.
// Static and synchronized, so the whole walk, including the remote lookups it triggers,
// runs while holding the class-level lock.
// NOTE(review): many lines of this method (several conditions, else branches, the
// declaration of tax, the return statement) are missing from this view. Comments below
// describe only what the visible lines show; presumably the method returns not_found.
200 synchronized final private static SortedSet<String> obtainDetailedTaxonomicInformation( final Phylogeny phy,
201 final boolean delete,
202 final boolean allow_to_use_basic_node_names )
203 throws IOException, AncestralTaxonomyInferenceException {
// Keep the static caches bounded before starting a new run.
204 clearCachesIfTooLarge();
205 final SortedSet<String> not_found = new TreeSet<String>();
206 List<PhylogenyNode> not_found_external_nodes = null;
// NOTE(review): the guard around this allocation is not visible; presumably it depends on delete.
208 not_found_external_nodes = new ArrayList<PhylogenyNode>();
210 for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) {
211 final PhylogenyNode node = iter.next();
// NOTE(review): qt is final and null, so every call below receives null as query type.
// Callees that assign to their own qt parameter cannot change this variable, which
// means updateTaxonomy is always invoked with a null query type from here.
212 final QUERY_TYPE qt = null;
// Use the taxonomy already attached to the node when there is one.
214 if ( node.getNodeData().isHasTaxonomy() ) {
215 tax = node.getNodeData().getTaxonomy();
// No taxonomy, but the plain node name may serve as query later on.
217 else if ( allow_to_use_basic_node_names && !ForesterUtil.isEmpty( node.getName() ) ) {
218 // Nothing to be done.
// External node with nothing to query by: record it as not found, by name when it has
// one, otherwise (presumably in an else branch) by its string form.
220 else if ( node.isExternal() ) {
221 if ( !ForesterUtil.isEmpty( node.getName() ) ) {
222 not_found.add( node.getName() );
225 not_found.add( node.toString() );
// NOTE(review): the guard for this add is not visible; presumably it is conditional on delete.
228 not_found_external_nodes.add( node );
231 UniProtTaxonomy uniprot_tax = null;
// Query UniProt only when there is something to query by: a usable id, scientific name,
// taxonomy code or common name on tax, or a permitted non-empty node name.
232 if ( ( ( tax != null ) && ( isHasAppropriateId( tax ) || !ForesterUtil.isEmpty( tax.getScientificName() )
233 || !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) || !ForesterUtil.isEmpty( tax.getCommonName() ) ) )
234 || ( allow_to_use_basic_node_names && !ForesterUtil.isEmpty( node.getName() ) ) ) {
// Lookup driven by the taxonomy object (presumably when tax is non-null).
236 uniprot_tax = obtainUniProtTaxonomy( tax, null, qt );
// Lookup driven by the plain node name (presumably the alternative branch).
239 uniprot_tax = obtainUniProtTaxonomy( node.getName(), qt );
241 if ( uniprot_tax != null ) {
// A fresh Taxonomy is attached to the node (presumably only when tax was null).
243 tax = new Taxonomy();
244 node.getNodeData().addTaxonomy( tax );
// Copy the missing fields from the UniProt result onto the node taxonomy.
247 updateTaxonomy( qt, node, tax, uniprot_tax );
// Lookup failed: record the taxonomy string form or the node name (selecting
// conditions not visible).
251 not_found.add( tax.toString() );
254 not_found.add(node.getName() );
// Unresolved external nodes are scheduled for removal when delete is set.
256 if ( delete && node.isExternal() ) {
257 not_found_external_nodes.add( node );
// Remove the scheduled nodes and refresh the derived data of the phylogeny.
// NOTE(review): the guard around this loop is not visible; not_found_external_nodes is
// initialised to null above, so presumably the loop only runs when it was allocated.
263 for( final PhylogenyNode node : not_found_external_nodes ) {
264 phy.deleteSubtree( node, true );
266 phy.externalNodesHaveChanged();
268 phy.recalculateNumberOfExternalDescendants( true );
// Looks up the UniProt taxonomy matching tax, choosing the query by the first usable
// field in this order: appropriate identifier, scientific name (via the lineage when
// tax has one, otherwise the name itself), taxonomy code, and finally common name.
// Each branch delegates to obtainTaxonomy with the cache that matches the query.
// Throws IllegalArgumentException with the message shown below; the guarding condition
// is not visible (presumably tax being null).
// Throws IOException and AncestralTaxonomyInferenceException as declared.
// NOTE(review): query is overwritten in every visible branch, so the value passed in by
// the caller appears to be ignored. Assignments to qt only affect this method's own
// parameter copy and are not seen by the caller.
// NOTE(review): only the qt assignment of the code branch is visible; the other branches
// presumably set qt on lines missing from this view.
273 public final static UniProtTaxonomy obtainUniProtTaxonomy( final Taxonomy tax, Object query, QUERY_TYPE qt )
274 throws IOException, AncestralTaxonomyInferenceException {
276 throw new IllegalArgumentException( "illegal attempt to use empty taxonomy object" );
// 1. A recognised identifier wins over everything else.
278 if ( TaxonomyDataManager.isHasAppropriateId( tax ) ) {
279 query = tax.getIdentifier().getValue();
281 return obtainTaxonomy( TaxonomyDataManager.getIdTaxCacheMap(), query, qt );
// 2. Scientific name; when a lineage is present the lineage is used as query.
283 else if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) {
284 if ( !ForesterUtil.isEmpty( tax.getLineage() ) ) {
285 query = tax.getLineage();
287 return obtainTaxonomy( TaxonomyDataManager.getLineageTaxCacheMap(), query, qt );
290 query = tax.getScientificName();
292 return obtainTaxonomy( TaxonomyDataManager.getSnTaxCacheMap(), query, qt );
// 3. Taxonomy code.
295 else if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
296 query = tax.getTaxonomyCode();
297 qt = QUERY_TYPE.CODE;
298 return obtainTaxonomy( TaxonomyDataManager.getCodeTaxCacheMap(), query, qt );
// 4. Fallback: common name (presumably the final else branch).
301 query = tax.getCommonName();
303 return obtainTaxonomy( TaxonomyDataManager.getCnTaxCacheMap(), query, qt );
// Looks up a UniProt taxonomy from a plain name, calling obtainTaxonomy with the
// scientific-name cache, then the code cache, then the common-name cache.
// Throws IllegalArgumentException when simple_name is empty (as judged by ForesterUtil.isEmpty).
// Throws IOException and AncestralTaxonomyInferenceException as declared.
// NOTE(review): the conditions between the three attempts and the return statement are
// missing from this view; presumably each later attempt only runs when the previous one
// produced null, and ut is returned at the end. Only the qt assignment for the code
// attempt is visible.
307 public final static UniProtTaxonomy obtainUniProtTaxonomy( final String simple_name, QUERY_TYPE qt )
308 throws IOException, AncestralTaxonomyInferenceException {
309 if ( ForesterUtil.isEmpty( simple_name ) ) {
310 throw new IllegalArgumentException( "illegal attempt to use empty simple name" );
// First attempt: scientific-name cache.
313 UniProtTaxonomy ut = obtainTaxonomy( TaxonomyDataManager.getSnTaxCacheMap(), simple_name, qt );
// Second attempt: taxonomy-code cache.
315 qt = QUERY_TYPE.CODE;
316 ut = obtainTaxonomy( TaxonomyDataManager.getCodeTaxCacheMap(), simple_name, qt );
// Third attempt: common-name cache.
320 ut = obtainTaxonomy( TaxonomyDataManager.getCnTaxCacheMap(), simple_name, qt );
// Resolves a lineage (list of names) to a single UniProtTaxonomy.
// The lineage joined with the separator > is the cache key. A cached entry is returned
// as a copy. Otherwise UniProt is queried by scientific name with the last lineage
// element, and the candidates are compared element by element (ignoring case) against
// the requested lineage.
// Throws AncestralTaxonomyInferenceException when the lineage is not unique or not found
// (see the two messages below), and IOException as declared.
// On success the result is stored in the lineage cache and, under its non-empty values,
// in the scientific-name, code, common-name and id caches.
// NOTE(review): the else branch header, the mismatch handling inside the labelled loop,
// the use of the match flag and the return statement are on lines missing from this view.
// NOTE(review): lineage.get( lineage.size() - 1 ) fails for an empty lineage list, and
// up_taxonomy.getLineage().get( i ) is indexed up to lineage.size() without a length
// check, so a candidate with a shorter lineage would presumably raise an
// IndexOutOfBoundsException - confirm and guard if needed.
325 static final UniProtTaxonomy obtainUniProtTaxonomyFromLineage( final List<String> lineage )
326 throws AncestralTaxonomyInferenceException, IOException {
327 final String lineage_str = ForesterUtil.stringListToString( lineage, ">" );
328 UniProtTaxonomy up_tax = null;
// Cache hit: use a copy of the stored entry.
329 if ( TaxonomyDataManager.getLineageTaxCacheMap().containsKey( lineage_str ) ) {
330 up_tax = TaxonomyDataManager.getLineageTaxCacheMap().get( lineage_str ).copy();
// Cache miss: query by the last lineage element.
333 final List<UniProtTaxonomy> up_taxonomies = getTaxonomiesFromScientificName( lineage
334 .get( lineage.size() - 1 ) );
335 if ( ( up_taxonomies != null ) && ( up_taxonomies.size() > 0 ) ) {
336 for( final UniProtTaxonomy up_taxonomy : up_taxonomies ) {
337 boolean match = true;
// Position-wise, case-insensitive comparison of requested and candidate lineage.
338 I: for( int i = 0; i < lineage.size(); ++i ) {
339 if ( !lineage.get( i ).equalsIgnoreCase( up_taxonomy.getLineage().get( i ) ) ) {
// A second accepted candidate makes the lineage ambiguous.
345 if ( up_tax != null ) {
346 throw new AncestralTaxonomyInferenceException( "lineage \""
347 + ForesterUtil.stringListToString( lineage, " > " ) + "\" is not unique" );
349 up_tax = up_taxonomy;
// No candidate accepted.
352 if ( up_tax == null ) {
353 throw new AncestralTaxonomyInferenceException( "lineage \""
354 + ForesterUtil.stringListToString( lineage, " > " ) + "\" not found" );
// Register the result in every cache for which it has a non-empty key.
356 TaxonomyDataManager.getLineageTaxCacheMap().put( lineage_str, up_tax );
357 if ( !ForesterUtil.isEmpty( up_tax.getScientificName() ) ) {
358 TaxonomyDataManager.getSnTaxCacheMap().put( up_tax.getScientificName(), up_tax );
360 if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) {
361 TaxonomyDataManager.getCodeTaxCacheMap().put( up_tax.getCode(), up_tax );
363 if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
364 TaxonomyDataManager.getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax );
366 if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
367 TaxonomyDataManager.getIdTaxCacheMap().put( up_tax.getId(), up_tax );
// Copies data from the UniProt result up_tax onto the node taxonomy tax.
// Scientific name, taxonomy code, common name, rank and identifier are only filled in
// when tax does not have them yet and up_tax does; the field that served as query
// (according to qt) is skipped. The synonym is added when not already present, and
// the lineage is replaced whenever up_tax has one.
// NOTE(review): the declaration of the tax parameter, the try block header and the body
// of the catch block are on lines missing from this view.
// NOTE(review): the only visible caller passes a qt that is always null, in which case
// the qt comparisons below are always true.
374 synchronized final private static void updateTaxonomy( final QUERY_TYPE qt,
375 final PhylogenyNode node,
377 final UniProtTaxonomy up_tax ) {
// Scientific name.
378 if ( ( qt != QUERY_TYPE.SN ) && !ForesterUtil.isEmpty( up_tax.getScientificName() )
379 && ForesterUtil.isEmpty( tax.getScientificName() ) ) {
380 tax.setScientificName( up_tax.getScientificName() );
// Taxonomy code; set on external nodes only.
382 if ( node.isExternal() && ( qt != QUERY_TYPE.CODE ) && !ForesterUtil.isEmpty( up_tax.getCode() )
383 && ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
384 tax.setTaxonomyCode( up_tax.getCode() );
// Common name.
386 if ( ( qt != QUERY_TYPE.CN ) && !ForesterUtil.isEmpty( up_tax.getCommonName() )
387 && ForesterUtil.isEmpty( tax.getCommonName() ) ) {
388 tax.setCommonName( up_tax.getCommonName() );
// Synonym; appended unless already listed.
390 if ( !ForesterUtil.isEmpty( up_tax.getSynonym() ) && !tax.getSynonyms().contains( up_tax.getSynonym() ) ) {
391 tax.getSynonyms().add( up_tax.getSynonym() );
// Rank; stored in lower case. setRank can raise PhyloXmlDataFormatException, caught below.
393 if ( !ForesterUtil.isEmpty( up_tax.getRank() ) && ForesterUtil.isEmpty( tax.getRank() ) ) {
395 tax.setRank( up_tax.getRank().toLowerCase() );
397 catch ( final PhyloXmlDataFormatException ex ) {
// Identifier; created with provider uniprot when tax has no identifier value yet.
401 if ( ( qt != QUERY_TYPE.ID ) && !ForesterUtil.isEmpty( up_tax.getId() )
402 && ( ( tax.getIdentifier() == null ) || ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) ) {
403 tax.setIdentifier( new Identifier( up_tax.getId(), "uniprot" ) );
// Lineage; any existing lineage on tax is discarded and rebuilt from the non-empty entries of up_tax.
405 if ( up_tax.getLineage() != null ) {
406 tax.setLineage( new ArrayList<String>() );
407 for( final String lin : up_tax.getLineage() ) {
408 if ( !ForesterUtil.isEmpty( lin ) ) {
409 tax.getLineage().add( lin );
// Runs the taxonomy tool for this instance: calls start() with the main frame, obtains
// taxonomic information for _phy, reports failures in message dialogs, hands the tree
// to the tree panel and finally shows a summary dialog.
// NOTE(review): this method is heavily fragmented in this view. Try block headers, the
// message arguments of some dialogs, early returns, the truncation logic around max and
// more, and the loop body that builds the report are on missing lines. Comments below
// describe only the visible lines.
415 private final void execute() {
416 start( _mf, "taxonomy data" );
417 SortedSet<String> not_found = null;
419 not_found = obtainDetailedTaxonomicInformation( _phy, _delete, _allow_simple_names );
// Host could not be resolved: report it as a network error, naming the base URL.
421 catch ( final UnknownHostException e ) {
422 JOptionPane.showMessageDialog( _mf,
423 "Could not connect to \"" + getBaseUrl() + "\"",
424 "Network error during taxonomic information gathering",
425 JOptionPane.ERROR_MESSAGE );
// Any other I/O failure (message argument not visible).
428 catch ( final IOException e ) {
430 JOptionPane.showMessageDialog( _mf,
432 "Failed to obtain taxonomic information",
433 JOptionPane.ERROR_MESSAGE );
// Lineage could not be resolved unambiguously (message argument not visible).
436 catch ( final AncestralTaxonomyInferenceException e ) {
438 JOptionPane.showMessageDialog( _mf,
440 "Failed to obtain taxonomic information",
441 JOptionPane.ERROR_MESSAGE );
// Nothing left in the tree: warn that no external node taxonomy could be resolved.
447 if ( ( _phy == null ) || _phy.isEmpty() ) {
449 JOptionPane.showMessageDialog( _mf,
450 "None of the external node taxonomies could be resolved",
451 "Taxonomy Tool Failed",
452 JOptionPane.WARNING_MESSAGE );
454 catch ( final Exception e ) {
455 // Not important if this fails, do nothing.
// Publish the tree to the panel and flag it as edited.
459 _treepanel.setTree( _phy );
461 _treepanel.setEdited( true );
// Some taxonomies were not found: build a report listing them.
462 if ( ( not_found != null ) && ( not_found.size() > 0 ) ) {
463 int max = not_found.size();
464 boolean more = false;
469 final StringBuffer sb = new StringBuffer();
470 sb.append( "Not all taxonomies could be resolved.\n" );
// Singular wording for exactly one entry; one variant mentions deletion (selecting
// condition not visible, presumably _delete).
471 if ( not_found.size() == 1 ) {
473 sb.append( "The following taxonomy was not found and deleted (if external):\n" );
476 sb.append( "The following taxonomy was not found:\n" );
// Plural wording including the total count.
481 sb.append( "The following taxonomies were not found and deleted (if external) (total: "
482 + not_found.size() + "):\n" );
485 sb.append( "The following taxonomies were not found (total: " + not_found.size() + "):\n" );
// Iterates over the unresolved labels (loop body not visible).
489 for( final String string : not_found ) {
501 JOptionPane.showMessageDialog( _mf,
503 "Taxonomy Tool Completed",
504 JOptionPane.WARNING_MESSAGE );
506 catch ( final Exception e ) {
507 // Not important if this fails, do nothing.
// Success dialog (presumably the branch taken when nothing is missing).
512 JOptionPane.showMessageDialog( _mf,
513 "Taxonomy tool successfully completed",
514 "Taxonomy Tool Completed",
515 JOptionPane.INFORMATION_MESSAGE );
517 catch ( final Exception e ) {
518 // Not important if this fails, do nothing.
// Returns UniProtWsTools.BASE_URL; used in this file to name the host in the
// could-not-connect error dialog.
523 private final String getBaseUrl() {
524 return UniProtWsTools.BASE_URL;