3 // forester -- software libraries and applications
4 // for genomics and evolutionary biology research.
6 // Copyright (C) 2010 Christian M Zmasek
7 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 // Contact: phylosoft @ gmail . com
25 // WWW: https://sites.google.com/site/cmzmasek/home/software/forester
27 package org.forester.analysis;
29 import java.io.IOException;
30 import java.net.UnknownHostException;
31 import java.util.ArrayList;
32 import java.util.HashMap;
33 import java.util.List;
34 import java.util.SortedSet;
35 import java.util.TreeSet;
36 import java.util.regex.Matcher;
38 import javax.swing.JOptionPane;
40 import org.forester.archaeopteryx.MainFrameApplication;
41 import org.forester.archaeopteryx.TreePanel;
42 import org.forester.archaeopteryx.tools.AncestralTaxonomyInferrer;
43 import org.forester.archaeopteryx.tools.RunnableProcess;
44 import org.forester.io.parsers.nhx.NHXParser;
45 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
46 import org.forester.io.parsers.util.ParserUtils;
47 import org.forester.phylogeny.Phylogeny;
48 import org.forester.phylogeny.PhylogenyNode;
49 import org.forester.phylogeny.data.Identifier;
50 import org.forester.phylogeny.data.Taxonomy;
51 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
52 import org.forester.util.ForesterUtil;
53 import org.forester.util.TaxonomyUtil;
54 import org.forester.ws.seqdb.SequenceDbWsTools;
55 import org.forester.ws.seqdb.UniProtTaxonomy;
/**
 * Process that resolves taxonomy data for the nodes of a phylogeny and keeps
 * class-wide caches of the UniProtTaxonomy objects it has already obtained.
 */
57 public final class TaxonomyDataManager extends RunnableProcess {
// Constants referenced elsewhere in this class as QUERY_TYPE values (e.g. QUERY_TYPE.CODE, QUERY_TYPE.SN).
// NOTE(review): presumably CODE = taxonomy code, SN = scientific name, CN = common name,
// ID = identifier, LIN = lineage -- confirm.
60 CODE, SN, CN, ID, LIN;
// A cache is emptied by clearCachesIfTooLarge() once it holds more entries than this.
62 private static final int MAX_CACHE_SIZE = 100000;
// Passed as the second argument to every SequenceDbWsTools lookup in this class.
// NOTE(review): presumably an upper bound on the number of results -- confirm.
63 private static final int MAX_TAXONOMIES_TO_RETURN = 2000;
// Class-wide caches of resolved taxonomies. A resolved taxonomy is stored under each non-empty key it has.
// Keyed by scientific name.
64 private static final HashMap<String, UniProtTaxonomy> _sn_up_cache_map = new HashMap<String, UniProtTaxonomy>();
// Keyed by the lineage entries joined with ">".
65 private static final HashMap<String, UniProtTaxonomy> _lineage_up_cache_map = new HashMap<String, UniProtTaxonomy>();
// Keyed by taxonomy code.
66 private static final HashMap<String, UniProtTaxonomy> _code_up_cache_map = new HashMap<String, UniProtTaxonomy>();
// Keyed by common name.
67 private static final HashMap<String, UniProtTaxonomy> _cn_up_cache_map = new HashMap<String, UniProtTaxonomy>();
// Keyed by taxonomy id.
68 private static final HashMap<String, UniProtTaxonomy> _id_up_cache_map = new HashMap<String, UniProtTaxonomy>();
// Phylogeny handed to obtainDetailedTaxonomicInformation() and afterwards set on the tree panel.
69 private final Phylogeny _phy;
// Main frame; passed to start() and used as the parent component of all message dialogs.
70 private final MainFrameApplication _mf;
// Tree panel that receives the phylogeny via setTree() and is marked as edited.
71 private final TreePanel _treepanel;
// Forwarded as the 'delete' argument: when set, external nodes whose taxonomy could not be
// resolved are collected and deleted from the phylogeny.
72 private final boolean _delete;
// Forwarded as 'allow_to_use_basic_node_names': lets a non-empty plain node name be used for the lookup.
73 private final boolean _allow_simple_names;
/**
 * Creates a manager for the given tree panel with lookups by plain node name disabled.
 * NOTE(review): only the _treepanel and _allow_simple_names assignments are visible here;
 * the remaining final fields are presumably initialised from mf and phy as well -- confirm.
 *
 * @param mf the main application frame
 * @param treepanel the tree panel to update
 * @param phy the phylogeny to process
 */
75 public TaxonomyDataManager( final MainFrameApplication mf, final TreePanel treepanel, final Phylogeny phy ) {
78 _treepanel = treepanel;
// Plain node names are never used for lookups with this constructor.
80 _allow_simple_names = false;
/**
 * Creates a manager for the given tree panel, letting the caller decide whether plain
 * node names may be used for lookups.
 * NOTE(review): part of the parameter list and the other field assignments are not
 * visible here; presumably the phylogeny and the delete flag are passed in too -- confirm.
 *
 * @param mf the main application frame
 * @param treepanel the tree panel to update
 * @param allow_simple_name stored in _allow_simple_names
 */
83 public TaxonomyDataManager( final MainFrameApplication mf,
84 final TreePanel treepanel,
87 final boolean allow_simple_name ) {
90 _treepanel = treepanel;
92 _allow_simple_names = allow_simple_name;
95 synchronized static void clearCachesIfTooLarge() {
96 if ( getSnTaxCacheMap().size() > MAX_CACHE_SIZE ) {
97 getSnTaxCacheMap().clear();
99 if ( getLineageTaxCacheMap().size() > MAX_CACHE_SIZE ) {
100 getLineageTaxCacheMap().clear();
102 if ( getCnTaxCacheMap().size() > MAX_CACHE_SIZE ) {
103 getCnTaxCacheMap().clear();
105 if ( getCodeTaxCacheMap().size() > MAX_CACHE_SIZE ) {
106 getCodeTaxCacheMap().clear();
108 if ( getIdTaxCacheMap().size() > MAX_CACHE_SIZE ) {
109 getIdTaxCacheMap().clear();
113 synchronized final static HashMap<String, UniProtTaxonomy> getCnTaxCacheMap() {
114 return _cn_up_cache_map;
117 synchronized final static HashMap<String, UniProtTaxonomy> getCodeTaxCacheMap() {
118 return _code_up_cache_map;
121 synchronized final static HashMap<String, UniProtTaxonomy> getIdTaxCacheMap() {
122 return _id_up_cache_map;
125 synchronized final static HashMap<String, UniProtTaxonomy> getLineageTaxCacheMap() {
126 return _lineage_up_cache_map;
129 synchronized final static HashMap<String, UniProtTaxonomy> getSnTaxCacheMap() {
130 return _sn_up_cache_map;
/**
 * Resolves a query to a UniProtTaxonomy, consulting the supplied cache first.
 * On a cache hit a copy of the cached entry is returned. Otherwise the query is
 * looked up by id, taxonomy code, scientific name or common name (query cast to
 * String), or delegated to obtainUniProtTaxonomyFromLineage (query cast to
 * List of String). When a lookup yields exactly one taxonomy, that taxonomy is
 * stored in the scientific-name, code, common-name and id caches under each of
 * its non-empty keys.
 *
 * NOTE(review): presumably qt selects which lookup is used and the single match
 * is returned (null when there is not exactly one) -- confirm.
 * NOTE(review): callers pass a List as query together with the lineage cache, whose
 * keys are joined strings, so containsKey() cannot match for lineage queries; the
 * effective lineage cache check is done in obtainUniProtTaxonomyFromLineage.
 *
 * @param cache the cache to check for the query before any lookup is made
 * @param qt the kind of query
 * @throws IOException declared by the lookup helpers
 * @throws AncestralTaxonomyInferenceException declared by the lineage lookup
 */
134 @SuppressWarnings("unchecked")
135 private final static UniProtTaxonomy obtainTaxonomy( final HashMap<String, UniProtTaxonomy> cache,
137 final QUERY_TYPE qt ) throws IOException,
138 AncestralTaxonomyInferenceException {
// Cache hit: hand back a copy of the cached entry.
139 if ( cache.containsKey( query ) ) {
140 return cache.get( query ).copy();
143 List<UniProtTaxonomy> up_taxonomies = null;
// Lookup by taxonomy id.
146 up_taxonomies = getTaxonomiesFromId( ( String ) query );
// Lookup by taxonomy code.
149 up_taxonomies = getTaxonomiesFromTaxonomyCode( ( String ) query );
// Lookup by scientific name.
152 up_taxonomies = getTaxonomiesFromScientificName( ( String ) query );
// Lookup by common name.
155 up_taxonomies = getTaxonomiesFromCommonName( ( String ) query );
// Lineage queries are resolved (and cached) entirely by the lineage helper.
158 return obtainUniProtTaxonomyFromLineage( ( List<String> ) query );
// NOTE(review): presumably reached only for an unhandled query type -- confirm.
161 throw new RuntimeException();
// Only an unambiguous result (exactly one taxonomy) is cached, under every non-empty key it has.
163 if ( ( up_taxonomies != null ) && ( up_taxonomies.size() == 1 ) ) {
164 final UniProtTaxonomy up_tax = up_taxonomies.get( 0 );
165 if ( !ForesterUtil.isEmpty( up_tax.getScientificName() ) ) {
166 TaxonomyDataManager.getSnTaxCacheMap().put( up_tax.getScientificName(), up_tax );
168 if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) {
169 TaxonomyDataManager.getCodeTaxCacheMap().put( up_tax.getCode(), up_tax );
171 if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
172 TaxonomyDataManager.getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax );
174 if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
175 TaxonomyDataManager.getIdTaxCacheMap().put( up_tax.getId(), up_tax );
185 private final static List<UniProtTaxonomy> getTaxonomiesFromCommonName( final String query ) throws IOException {
186 return SequenceDbWsTools.getTaxonomiesFromCommonNameStrict( query, MAX_TAXONOMIES_TO_RETURN );
189 private final static List<UniProtTaxonomy> getTaxonomiesFromId( final String query ) throws IOException {
190 return SequenceDbWsTools.getTaxonomiesFromId( query, MAX_TAXONOMIES_TO_RETURN );
193 private final static List<UniProtTaxonomy> getTaxonomiesFromScientificName( final String query ) throws IOException {
194 if ( query.equalsIgnoreCase( UniProtTaxonomy.BACTERIA ) || query.equalsIgnoreCase( UniProtTaxonomy.ARCHAEA )
195 || query.equalsIgnoreCase( UniProtTaxonomy.VIRUSES )
196 || query.equalsIgnoreCase( UniProtTaxonomy.EUKARYOTA ) || query.equalsIgnoreCase( UniProtTaxonomy.X ) ) {
197 final List<UniProtTaxonomy> l = new ArrayList<UniProtTaxonomy>();
198 l.add( UniProtTaxonomy.createSpecialFromScientificName( query ) );
201 return SequenceDbWsTools.getTaxonomiesFromScientificNameStrict( query, MAX_TAXONOMIES_TO_RETURN );
204 private final static List<UniProtTaxonomy> getTaxonomiesFromTaxonomyCode( final String query ) throws IOException {
205 //FIXME fix "SPHAR" issue
206 if ( ( ( query.indexOf( "XX" ) == 3 ) && TaxonomyUtil.isHasTaxIdFromFakeTaxCode( query ) )
207 || query.equals( "SPHAR" ) /* TODO remove me, is same as Sphingomonas aromaticivorans */
209 final int id = TaxonomyUtil.getTaxIdFromFakeTaxCode( query );
210 return SequenceDbWsTools.getTaxonomiesFromId( String.valueOf( id ), MAX_TAXONOMIES_TO_RETURN );
212 return SequenceDbWsTools.getTaxonomiesFromTaxonomyCode( query, MAX_TAXONOMIES_TO_RETURN );
215 static final boolean isHasAppropriateId( final Taxonomy tax ) {
216 return ( ( tax.getIdentifier() != null ) && ( !ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) && ( tax
217 .getIdentifier().getProvider().equalsIgnoreCase( "ncbi" )
218 || tax.getIdentifier().getProvider().equalsIgnoreCase( "uniprot" ) || tax.getIdentifier().getProvider()
219 .equalsIgnoreCase( "uniprotkb" ) ) ) );
/**
 * Walks the phylogeny in post-order and, for each node, tries to obtain a
 * UniProtTaxonomy (from the node's taxonomy data or, if allowed, from its plain
 * name) and merges the result into the node via updateTaxonomy(). Names and
 * taxonomies that could not be resolved are collected in a sorted set; the
 * external nodes collected along the way are deleted from the phylogeny.
 * NOTE(review): presumably returns the not_found set -- confirm.
 *
 * @param phy the phylogeny to annotate; it is modified in place
 * @param delete when true, external nodes that could not be resolved are collected for deletion
 * @param allow_to_use_basic_node_names when true, a non-empty node name may be used for the lookup
 * @throws IOException declared by the taxonomy lookups
 * @throws AncestralTaxonomyInferenceException declared by the taxonomy lookups
 */
222 synchronized final private static SortedSet<String> obtainDetailedTaxonomicInformation( final Phylogeny phy,
223 final boolean delete,
224 final boolean allow_to_use_basic_node_names )
225 throws IOException, AncestralTaxonomyInferenceException {
// Keep the static caches bounded before starting a new run.
226 clearCachesIfTooLarge();
227 final SortedSet<String> not_found = new TreeSet<String>();
228 List<PhylogenyNode> not_found_external_nodes = null;
230 not_found_external_nodes = new ArrayList<PhylogenyNode>();
232 for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) {
233 final PhylogenyNode node = iter.next();
// NOTE(review): qt is final and null, so every call below receives a null query type;
// reassignments of the qt parameter inside obtainUniProtTaxonomy() do not reach this variable.
234 final QUERY_TYPE qt = null;
// Use the node's own taxonomy when it has one.
236 if ( node.getNodeData().isHasTaxonomy() ) {
237 tax = node.getNodeData().getTaxonomy();
239 else if ( allow_to_use_basic_node_names && !ForesterUtil.isEmpty( node.getName() ) ) {
240 // Nothing to be done.
// External node with neither taxonomy data nor a usable name: record it as not found.
242 else if ( node.isExternal() ) {
243 if ( !ForesterUtil.isEmpty( node.getName() ) ) {
244 not_found.add( node.getName() );
247 not_found.add( node.toString() );
250 not_found_external_nodes.add( node );
253 UniProtTaxonomy uniprot_tax = null;
// A lookup is attempted when the taxonomy has an appropriate id, scientific name, code or
// common name, or when the plain node name may be used.
254 if ( ( ( tax != null ) && ( isHasAppropriateId( tax ) || !ForesterUtil.isEmpty( tax.getScientificName() )
255 || !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) || !ForesterUtil.isEmpty( tax.getCommonName() ) ) )
256 || ( allow_to_use_basic_node_names && !ForesterUtil.isEmpty( node.getName() ) ) ) {
// Lookup driven by the taxonomy's own data.
257 if ( ( ( tax != null ) && ( isHasAppropriateId( tax )
258 || !ForesterUtil.isEmpty( tax.getScientificName() )
259 || !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) || !ForesterUtil
260 .isEmpty( tax.getCommonName() ) ) ) ) {
261 uniprot_tax = obtainUniProtTaxonomy( tax, null, qt );
// Lookup driven by the plain node name.
264 uniprot_tax = obtainUniProtTaxonomy( node.getName(), qt );
266 if ( uniprot_tax != null ) {
// NOTE(review): presumably only when the node had no taxonomy yet -- confirm.
268 tax = new Taxonomy();
269 node.getNodeData().addTaxonomy( tax );
// Copy the looked-up data into the node's taxonomy.
271 updateTaxonomy( qt, node, tax, uniprot_tax );
// Unresolved: remember what could not be found.
275 not_found.add( tax.toString() );
278 not_found.add( node.getName() );
280 if ( delete && node.isExternal() ) {
281 not_found_external_nodes.add( node );
// Remove every collected external node from the tree.
287 for( final PhylogenyNode node : not_found_external_nodes ) {
288 phy.deleteSubtree( node, true );
// Notify the phylogeny of the structural change (presumably refreshes its derived data -- confirm).
290 phy.externalNodesHaveChanged();
291 phy.clearHashIdToNodeMap();
292 phy.recalculateNumberOfExternalDescendants( true );
/**
 * Resolves a Taxonomy to a UniProtTaxonomy using the first usable piece of data it
 * carries, checked in this order: an appropriate identifier (id cache), a scientific
 * name together with a non-empty lineage (lineage cache), a scientific name alone
 * (scientific-name cache), a taxonomy code (code cache), and finally the common name
 * (common-name cache).
 *
 * @param tax the taxonomy to resolve
 * @param query overwritten inside this method with the value taken from tax
 * @param qt reassigned inside this method (visibly to QUERY_TYPE.CODE in the code branch)
 *           NOTE(review): presumably set in the other branches as well -- confirm.
 * @return the result of obtainTaxonomy for the chosen cache and query
 * @throws IllegalArgumentException NOTE(review): presumably when tax is null -- confirm
 * @throws IOException declared by obtainTaxonomy
 * @throws AncestralTaxonomyInferenceException declared by obtainTaxonomy
 */
297 public final static UniProtTaxonomy obtainUniProtTaxonomy( final Taxonomy tax, Object query, QUERY_TYPE qt )
298 throws IOException, AncestralTaxonomyInferenceException {
300 throw new IllegalArgumentException( "illegal attempt to use empty taxonomy object" );
// 1. Identifier from an accepted provider.
302 if ( TaxonomyDataManager.isHasAppropriateId( tax ) ) {
303 query = tax.getIdentifier().getValue();
305 return obtainTaxonomy( TaxonomyDataManager.getIdTaxCacheMap(), query, qt );
// 2. Scientific name, using the lineage when one is available.
307 else if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) {
308 if ( !ForesterUtil.isEmpty( tax.getLineage() ) ) {
309 query = tax.getLineage();
311 return obtainTaxonomy( TaxonomyDataManager.getLineageTaxCacheMap(), query, qt );
314 query = tax.getScientificName();
316 return obtainTaxonomy( TaxonomyDataManager.getSnTaxCacheMap(), query, qt );
// 3. Taxonomy code.
319 else if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
320 query = tax.getTaxonomyCode();
321 qt = QUERY_TYPE.CODE;
322 return obtainTaxonomy( TaxonomyDataManager.getCodeTaxCacheMap(), query, qt );
// 4. Common name.
325 query = tax.getCommonName();
327 return obtainTaxonomy( TaxonomyDataManager.getCnTaxCacheMap(), query, qt );
/**
 * Resolves a plain node name to a UniProtTaxonomy by extracting, from the name, a
 * taxonomy code, a scientific name, a UniProt taxonomy id, or a match of
 * ParserUtils.TAXOMONY_SN_PATTERN_GENUS, and querying the matching cache/lookup.
 * NOTE(review): presumably each later extraction is attempted only when the earlier
 * ones produced nothing, and ut is returned at the end -- confirm.
 *
 * @param simple_name the node name to interpret; must not be empty
 * @param qt reassigned inside this method (visibly to QUERY_TYPE.CODE for a code query)
 * @throws IllegalArgumentException if simple_name is empty
 * @throws IOException declared by obtainTaxonomy
 * @throws AncestralTaxonomyInferenceException declared by obtainTaxonomy
 */
331 public final static UniProtTaxonomy obtainUniProtTaxonomy( final String simple_name, QUERY_TYPE qt )
332 throws IOException, AncestralTaxonomyInferenceException {
333 if ( ForesterUtil.isEmpty( simple_name ) ) {
334 throw new IllegalArgumentException( "illegal attempt to use empty simple name" );
336 UniProtTaxonomy ut = null;
// Attempt: taxonomy code embedded in the name (aggressive extraction mode).
337 final String code = ParserUtils.extractTaxonomyCodeFromNodeName( simple_name,
338 NHXParser.TAXONOMY_EXTRACTION.AGGRESSIVE );
339 if ( !ForesterUtil.isEmpty( code ) ) {
340 qt = QUERY_TYPE.CODE;
341 ut = obtainTaxonomy( TaxonomyDataManager.getCodeTaxCacheMap(), code, qt );
// Attempt: scientific name embedded in the name.
344 final String sn = ParserUtils.extractScientificNameFromNodeName( simple_name );
345 if ( !ForesterUtil.isEmpty( sn ) ) {
347 ut = obtainTaxonomy( TaxonomyDataManager.getSnTaxCacheMap(), sn, qt );
// Attempt: UniProt taxonomy id embedded in the name (relaxed Pfam-style extraction mode).
351 final String id = ParserUtils
352 .extractUniprotTaxonomyIdFromNodeName( simple_name,
353 NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED );
354 if ( !ForesterUtil.isEmpty( id ) ) {
356 ut = obtainTaxonomy( TaxonomyDataManager.getIdTaxCacheMap(), id, qt );
// Attempt: genus pattern applied to the name, queried as a scientific name.
361 final Matcher m = ParserUtils.TAXOMONY_SN_PATTERN_GENUS.matcher( simple_name );
365 if ( !ForesterUtil.isEmpty( sn ) ) {
367 ut = obtainTaxonomy( TaxonomyDataManager.getSnTaxCacheMap(), sn, qt );
/**
 * Resolves a lineage (list of names, last element being the taxon itself) to a
 * UniProtTaxonomy. The lineage joined with ">" is the cache key; on a hit a copy of
 * the cached entry is returned. Otherwise the last lineage element is looked up by
 * scientific name and the candidates are compared, element by element and ignoring
 * case, against the requested lineage. Among the matching candidates the one with
 * the shortest lineage is kept, cached under the lineage key and under each of its
 * non-empty scientific name, code, common name and id, and returned.
 *
 * @param lineage the lineage to resolve
 * @return a copy of the cached taxonomy on a cache hit, otherwise the selected match
 * @throws AncestralTaxonomyInferenceException if no candidate matches the lineage, or
 *         (final throw) NOTE(review): presumably when the name lookup returns nothing -- confirm
 * @throws IOException declared by the scientific-name lookup
 */
373 static final UniProtTaxonomy obtainUniProtTaxonomyFromLineage( final List<String> lineage )
374 throws AncestralTaxonomyInferenceException, IOException {
375 final String lineage_str = ForesterUtil.stringListToString( lineage, ">" );
// Cache hit: return a copy of the cached entry.
376 if ( TaxonomyDataManager.getLineageTaxCacheMap().containsKey( lineage_str ) ) {
377 return TaxonomyDataManager.getLineageTaxCacheMap().get( lineage_str ).copy();
380 final List<UniProtTaxonomy> matching_taxonomies = new ArrayList<UniProtTaxonomy>();
// Candidates are found by the scientific name at the end of the lineage.
381 final List<UniProtTaxonomy> up_taxonomies = getTaxonomiesFromScientificName( lineage
382 .get( lineage.size() - 1 ) );
383 if ( ( up_taxonomies != null ) && ( up_taxonomies.size() > 0 ) ) {
384 for( final UniProtTaxonomy up_taxonomy : up_taxonomies ) {
385 boolean match = true;
// Compare the requested lineage with the candidate's lineage position by position;
// a candidate lineage that is too short or differs at any position is tested for here.
386 I: for( int i = 0; i < lineage.size(); ++i ) {
387 if ( ( i == up_taxonomy.getLineage().size() )
388 || !lineage.get( i ).equalsIgnoreCase( up_taxonomy.getLineage().get( i ) ) ) {
394 matching_taxonomies.add( up_taxonomy );
397 if ( matching_taxonomies.isEmpty() ) {
398 throw new AncestralTaxonomyInferenceException( "lineage \""
399 + ForesterUtil.stringListToString( lineage, " > " ) + "\" not found" );
401 //in case of more than one (e.g. "Xenopus" Genus and Subgenus), keep shorter, less specific one:
402 int shortest = Integer.MAX_VALUE;
403 UniProtTaxonomy least_specific_up_tax = null;
404 for( final UniProtTaxonomy m : matching_taxonomies ) {
405 final int s = m.getLineage().size();
406 if ( s < shortest ) {
408 least_specific_up_tax = m;
// Cache the selected taxonomy under the lineage key and under every non-empty key it has.
411 TaxonomyDataManager.getLineageTaxCacheMap().put( lineage_str, least_specific_up_tax );
412 if ( !ForesterUtil.isEmpty( least_specific_up_tax.getScientificName() ) ) {
413 TaxonomyDataManager.getSnTaxCacheMap().put( least_specific_up_tax.getScientificName(),
414 least_specific_up_tax );
416 if ( !ForesterUtil.isEmpty( least_specific_up_tax.getCode() ) ) {
417 TaxonomyDataManager.getCodeTaxCacheMap().put( least_specific_up_tax.getCode(),
418 least_specific_up_tax );
420 if ( !ForesterUtil.isEmpty( least_specific_up_tax.getCommonName() ) ) {
421 TaxonomyDataManager.getCnTaxCacheMap().put( least_specific_up_tax.getCommonName(),
422 least_specific_up_tax );
424 if ( !ForesterUtil.isEmpty( least_specific_up_tax.getId() ) ) {
425 TaxonomyDataManager.getIdTaxCacheMap().put( least_specific_up_tax.getId(), least_specific_up_tax );
427 return least_specific_up_tax;
430 throw new AncestralTaxonomyInferenceException( "taxonomy \"" + ( lineage.get( lineage.size() - 1 ) )
/**
 * Copies data from a looked-up UniProtTaxonomy into a node's taxonomy without
 * overwriting values the taxonomy already has: scientific name, taxonomy code
 * (external nodes only), common name, rank (lower-cased) and identifier (provider
 * "uniprot") are set only when currently empty and when the field was not itself
 * the query key; the synonym is added if not yet present; the lineage is replaced
 * by the non-empty entries of the looked-up lineage.
 *
 * @param qt the kind of query that produced up_tax; the field matching it is not overwritten
 * @param node the node owning the taxonomy; only its external/internal status is read here
 * @param up_tax the looked-up taxonomy to copy from
 * @throws PhyloXmlDataFormatException declared by the taxonomy setters
 */
436 synchronized final private static void updateTaxonomy( final QUERY_TYPE qt,
437 final PhylogenyNode node,
439 final UniProtTaxonomy up_tax )
440 throws PhyloXmlDataFormatException {
441 if ( ( qt != QUERY_TYPE.SN ) && !ForesterUtil.isEmpty( up_tax.getScientificName() )
442 && ForesterUtil.isEmpty( tax.getScientificName() ) ) {
443 tax.setScientificName( up_tax.getScientificName() );
// Taxonomy codes are only filled in for external nodes.
445 if ( node.isExternal() && ( qt != QUERY_TYPE.CODE ) && !ForesterUtil.isEmpty( up_tax.getCode() )
446 && ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
447 tax.setTaxonomyCode( up_tax.getCode() );
449 if ( ( qt != QUERY_TYPE.CN ) && !ForesterUtil.isEmpty( up_tax.getCommonName() )
450 && ForesterUtil.isEmpty( tax.getCommonName() ) ) {
451 tax.setCommonName( up_tax.getCommonName() );
453 if ( !ForesterUtil.isEmpty( up_tax.getSynonym() ) && !tax.getSynonyms().contains( up_tax.getSynonym() ) ) {
454 tax.getSynonyms().add( up_tax.getSynonym() );
456 if ( !ForesterUtil.isEmpty( up_tax.getRank() ) && ForesterUtil.isEmpty( tax.getRank() ) ) {
// The rank is stored lower-cased.
// NOTE(review): a PhyloXmlDataFormatException from setRank() is caught below; presumably ignored -- confirm.
458 tax.setRank( up_tax.getRank().toLowerCase() );
460 catch ( final PhyloXmlDataFormatException ex ) {
464 if ( ( qt != QUERY_TYPE.ID ) && !ForesterUtil.isEmpty( up_tax.getId() )
465 && ( ( tax.getIdentifier() == null ) || ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) ) {
466 tax.setIdentifier( new Identifier( up_tax.getId(), "uniprot" ) );
// The lineage is always replaced, keeping only the non-empty entries.
468 if ( up_tax.getLineage() != null ) {
469 tax.setLineage( new ArrayList<String>() );
470 for( final String lin : up_tax.getLineage() ) {
471 if ( !ForesterUtil.isEmpty( lin ) ) {
472 tax.getLineage().add( lin );
/**
 * Runs the taxonomy lookup for the phylogeny and reports the outcome to the user:
 * starts the process display, calls obtainDetailedTaxonomicInformation(), shows an
 * error dialog for each kind of failure, warns if the phylogeny is null or empty
 * afterwards, otherwise sets the phylogeny on the tree panel, marks the panel as
 * edited, and shows either a warning listing the taxonomies that were not found or
 * a success message.
 * NOTE(review): presumably each failure branch also ends the process display and
 * returns early -- confirm.
 */
478 private final void execute() {
479 start( _mf, "taxonomy data" );
480 SortedSet<String> not_found = null;
482 not_found = obtainDetailedTaxonomicInformation( _phy, _delete, _allow_simple_names );
// The taxonomy service host could not be resolved.
484 catch ( final UnknownHostException e ) {
485 JOptionPane.showMessageDialog( _mf,
486 "Could not connect to \"" + getBaseUrl() + "\"",
487 "Network error during taxonomic information gathering",
488 JOptionPane.ERROR_MESSAGE );
// Any other I/O failure during the lookup.
491 catch ( final IOException e ) {
493 JOptionPane.showMessageDialog( _mf,
495 "Failed to obtain taxonomic information",
496 JOptionPane.ERROR_MESSAGE );
// A lineage or taxonomy could not be resolved.
499 catch ( final AncestralTaxonomyInferenceException e ) {
501 JOptionPane.showMessageDialog( _mf,
503 "Failed to obtain taxonomic information",
504 JOptionPane.ERROR_MESSAGE );
// Nothing is left of the phylogeny: warn the user.
510 if ( ( _phy == null ) || _phy.isEmpty() ) {
512 JOptionPane.showMessageDialog( _mf,
513 "None of the external node taxonomies could be resolved",
514 "Taxonomy Tool Failed",
515 JOptionPane.WARNING_MESSAGE );
517 catch ( final Exception e ) {
518 // Not important if this fails, do nothing.
// Show the updated phylogeny and mark the panel as edited.
522 _treepanel.setTree( _phy );
524 _treepanel.setEdited( true );
// Some taxonomies were not found: build a message listing them.
525 if ( ( not_found != null ) && ( not_found.size() > 0 ) ) {
526 int max = not_found.size();
527 boolean more = false;
532 final StringBuffer sb = new StringBuffer();
533 sb.append( "Not all taxonomies could be resolved.\n" );
// Singular wording for exactly one entry, plural (with total) otherwise.
// NOTE(review): the "deleted (if external)" variants are presumably chosen when _delete is set -- confirm.
534 if ( not_found.size() == 1 ) {
536 sb.append( "The following taxonomy was not found and deleted (if external):\n" );
539 sb.append( "The following taxonomy was not found:\n" );
544 sb.append( "The following taxonomies were not found and deleted (if external) (total: "
545 + not_found.size() + "):\n" );
548 sb.append( "The following taxonomies were not found (total: " + not_found.size() + "):\n" );
552 for( final String string : not_found ) {
564 JOptionPane.showMessageDialog( _mf,
566 "Taxonomy Tool Completed",
567 JOptionPane.WARNING_MESSAGE );
569 catch ( final Exception e ) {
570 // Not important if this fails, do nothing.
// Everything was resolved: report success.
575 JOptionPane.showMessageDialog( _mf,
576 "Taxonomy tool successfully completed",
577 "Taxonomy Tool Completed",
578 JOptionPane.INFORMATION_MESSAGE );
580 catch ( final Exception e ) {
581 // Not important if this fails, do nothing.
586 private final String getBaseUrl() {
587 return AncestralTaxonomyInferrer.getBaseUrl();