1 // forester -- software libraries and applications
2 // for genomics and evolutionary biology research.
4 // Copyright (C) 2010 Christian M Zmasek
5 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
8 // This library is free software; you can redistribute it and/or
9 // modify it under the terms of the GNU Lesser General Public
10 // License as published by the Free Software Foundation; either
11 // version 2.1 of the License, or (at your option) any later version.
13 // This library is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 // Lesser General Public License for more details.
18 // You should have received a copy of the GNU Lesser General Public
19 // License along with this library; if not, write to the Free Software
20 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 // Contact: phylosoft @ gmail . com
23 // WWW: www.phylosoft.org/forester
25 package org.forester.analysis;
27 import java.io.IOException;
28 import java.util.ArrayList;
29 import java.util.HashMap;
30 import java.util.List;
31 import java.util.SortedSet;
32 import java.util.TreeSet;
34 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
35 import org.forester.phylogeny.Phylogeny;
36 import org.forester.phylogeny.PhylogenyNode;
37 import org.forester.phylogeny.data.Identifier;
38 import org.forester.phylogeny.data.Taxonomy;
39 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
40 import org.forester.util.ForesterUtil;
41 import org.forester.ws.uniprot.UniProtTaxonomy;
42 import org.forester.ws.uniprot.UniProtWsTools;
44 public final class AncestralTaxonomyInference {
46 private static final int MAX_CACHE_SIZE = 100000;
47 private static final int MAX_TAXONOMIES_TO_RETURN = 10;
48 private static final HashMap<String, UniProtTaxonomy> _sn_up_cache_map = new HashMap<String, UniProtTaxonomy>();
49 private static final HashMap<String, UniProtTaxonomy> _lineage_up_cache_map = new HashMap<String, UniProtTaxonomy>();
50 private static final HashMap<String, UniProtTaxonomy> _code_up_cache_map = new HashMap<String, UniProtTaxonomy>();
51 private static final HashMap<String, UniProtTaxonomy> _cn_up_cache_map = new HashMap<String, UniProtTaxonomy>();
52 private static final HashMap<String, UniProtTaxonomy> _id_up_cache_map = new HashMap<String, UniProtTaxonomy>();
54 synchronized private static void clearCachesIfTooLarge() {
55 if ( getSnTaxCacheMap().size() > MAX_CACHE_SIZE ) {
56 getSnTaxCacheMap().clear();
58 if ( getLineageTaxCacheMap().size() > MAX_CACHE_SIZE ) {
59 getLineageTaxCacheMap().clear();
61 if ( getCnTaxCacheMap().size() > MAX_CACHE_SIZE ) {
62 getCnTaxCacheMap().clear();
64 if ( getCodeTaxCacheMap().size() > MAX_CACHE_SIZE ) {
65 getCodeTaxCacheMap().clear();
67 if ( getIdTaxCacheMap().size() > MAX_CACHE_SIZE ) {
68 getIdTaxCacheMap().clear();
72 synchronized private static HashMap<String, UniProtTaxonomy> getCnTaxCacheMap() {
73 return _cn_up_cache_map;
76 synchronized private static HashMap<String, UniProtTaxonomy> getCodeTaxCacheMap() {
77 return _code_up_cache_map;
80 synchronized private static HashMap<String, UniProtTaxonomy> getIdTaxCacheMap() {
81 return _id_up_cache_map;
84 synchronized private static HashMap<String, UniProtTaxonomy> getSnTaxCacheMap() {
85 return _sn_up_cache_map;
88 synchronized private static HashMap<String, UniProtTaxonomy> getLineageTaxCacheMap() {
89 return _lineage_up_cache_map;
92 synchronized private static UniProtTaxonomy getTaxonomies( final HashMap<String, UniProtTaxonomy> cache,
94 final QUERY_TYPE qt ) throws IOException,
95 AncestralTaxonomyInferenceException {
96 if ( cache.containsKey( query ) ) {
97 return cache.get( query ).copy();
100 List<UniProtTaxonomy> up_taxonomies = null;
103 up_taxonomies = getTaxonomiesFromId( ( String ) query );
106 up_taxonomies = getTaxonomiesFromTaxonomyCode( ( String ) query );
109 up_taxonomies = getTaxonomiesFromScientificName( ( String ) query );
112 up_taxonomies = getTaxonomiesFromCommonName( ( String ) query );
115 return obtainUniProtTaxonomyFromLineage( ( List<String> ) query );
117 throw new RuntimeException();
119 if ( ( up_taxonomies != null ) && ( up_taxonomies.size() == 1 ) ) {
120 final UniProtTaxonomy up_tax = up_taxonomies.get( 0 );
121 if ( !ForesterUtil.isEmpty( up_tax.getScientificName() ) ) {
122 getSnTaxCacheMap().put( up_tax.getScientificName(), up_tax );
124 if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) {
125 getCodeTaxCacheMap().put( up_tax.getCode(), up_tax );
127 if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
128 getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax );
130 if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
131 getIdTaxCacheMap().put( up_tax.getId(), up_tax );
141 synchronized private static List<UniProtTaxonomy> getTaxonomiesFromCommonName( final String query )
143 return UniProtWsTools.getTaxonomiesFromCommonNameStrict( query, MAX_TAXONOMIES_TO_RETURN );
146 synchronized private static List<UniProtTaxonomy> getTaxonomiesFromId( final String query ) throws IOException {
147 return UniProtWsTools.getTaxonomiesFromId( query, MAX_TAXONOMIES_TO_RETURN );
150 synchronized private static List<UniProtTaxonomy> getTaxonomiesFromScientificName( final String query )
152 return UniProtWsTools.getTaxonomiesFromScientificNameStrict( query, MAX_TAXONOMIES_TO_RETURN );
155 synchronized private static List<UniProtTaxonomy> getTaxonomiesFromTaxonomyCode( final String query )
157 return UniProtWsTools.getTaxonomiesFromTaxonomyCode( query, MAX_TAXONOMIES_TO_RETURN );
160 synchronized public static void inferTaxonomyFromDescendents( final Phylogeny phy ) throws IOException,
161 AncestralTaxonomyInferenceException {
162 clearCachesIfTooLarge();
163 for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) {
164 final PhylogenyNode node = iter.next();
165 if ( !node.isExternal() ) {
166 inferTaxonomyFromDescendents( node );
171 synchronized private static void inferTaxonomyFromDescendents( final PhylogenyNode n ) throws IOException,
172 AncestralTaxonomyInferenceException {
173 if ( n.isExternal() ) {
174 throw new IllegalArgumentException( "attempt to infer taxonomy from descendants of external node" );
176 n.getNodeData().setTaxonomy( null );
177 final List<PhylogenyNode> descs = n.getDescendants();
178 final List<String[]> lineages = new ArrayList<String[]>();
179 int shortest_lin_length = Integer.MAX_VALUE;
180 for( final PhylogenyNode desc : descs ) {
181 if ( desc.getNodeData().isHasTaxonomy()
182 && ( isHasAppropriateId( desc.getNodeData().getTaxonomy() )
183 || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getScientificName() )
184 || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getLineage() )
185 || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getTaxonomyCode() ) || !ForesterUtil
186 .isEmpty( desc.getNodeData().getTaxonomy().getCommonName() ) ) ) {
187 final UniProtTaxonomy up_tax = obtainUniProtTaxonomy( desc.getNodeData().getTaxonomy(), null, null );
188 if ( ( up_tax == null ) && ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getLineage() ) ) {
189 String desc_str = "";
190 if ( !ForesterUtil.isEmpty( desc.getName() ) ) {
191 desc_str = "\"" + desc.getName() + "\"";
194 desc_str = "[" + desc.getId() + "]";
196 System.out.println( desc.getNodeData().getTaxonomy().toString() );
197 System.out.println( ForesterUtil.stringListToString( desc.getNodeData().getTaxonomy().getLineage(),
199 throw new AncestralTaxonomyInferenceException( "a taxonomy for node " + desc_str
200 + " could not be established from the database" );
202 String[] lineage = ForesterUtil.stringListToArray( desc.getNodeData().getTaxonomy().getLineage() );
203 if ( ( lineage == null ) || ( lineage.length < 1 ) ) {
204 lineage = ForesterUtil.stringListToArray( up_tax.getLineage() );
206 if ( ( lineage == null ) || ( lineage.length < 1 ) ) {
207 throw new AncestralTaxonomyInferenceException( "a taxonomic lineage for node \""
208 + desc.getNodeData().getTaxonomy().toString() + "\" could not be established" );
210 if ( lineage.length < shortest_lin_length ) {
211 shortest_lin_length = lineage.length;
213 lineages.add( lineage );
217 if ( !ForesterUtil.isEmpty( desc.getName() ) ) {
218 node = "\"" + desc.getName() + "\"";
221 node = "[" + desc.getId() + "]";
223 // final List<PhylogenyNode> e = desc.getAllExternalDescendants();
225 // System.out.println();
227 // for( final PhylogenyNode object : e ) {
228 // System.out.println( x + ":" );
229 // System.out.println( object.getName() + " " );
232 // System.out.println();
234 throw new AncestralTaxonomyInferenceException( "node " + node
235 + " has no or inappropriate taxonomic information" );
238 final List<String> last_common_lineage = new ArrayList<String>();
239 String last_common = null;
240 if ( shortest_lin_length > 0 ) {
241 I: for( int i = 0; i < shortest_lin_length; ++i ) {
242 final String lineage_0 = lineages.get( 0 )[ i ];
243 for( int j = 1; j < lineages.size(); ++j ) {
244 if ( !lineage_0.equals( lineages.get( j )[ i ] ) ) {
248 last_common_lineage.add( lineage_0 );
249 last_common = lineage_0;
252 if ( last_common_lineage.isEmpty() ) {
253 boolean saw_viruses = false;
254 boolean saw_cellular_organism = false;
255 for( final String[] lineage : lineages ) {
256 if ( lineage.length > 0 ) {
257 if ( lineage[ 0 ].equalsIgnoreCase( UniProtTaxonomy.VIRUSES ) ) {
260 else if ( lineage[ 0 ].equalsIgnoreCase( UniProtTaxonomy.CELLULAR_ORGANISMS ) ) {
261 saw_cellular_organism = true;
263 if ( saw_cellular_organism && saw_viruses ) {
268 if ( saw_cellular_organism && saw_viruses ) {
269 last_common_lineage.add( UniProtTaxonomy.CELLULAR_ORGANISMS );
270 last_common = UniProtTaxonomy.CELLULAR_ORGANISMS;
273 String msg = "no common lineage for:\n";
275 for( final String[] strings : lineages ) {
276 msg += counter + ": ";
278 for( final String string : strings ) {
283 throw new AncestralTaxonomyInferenceException( msg );
286 final Taxonomy tax = new Taxonomy();
287 n.getNodeData().setTaxonomy( tax );
288 tax.setScientificName( last_common );
289 final UniProtTaxonomy up_tax = obtainUniProtTaxonomyFromLineage( last_common_lineage );
290 if ( up_tax != null ) {
291 if ( !ForesterUtil.isEmpty( up_tax.getRank() ) ) {
293 tax.setRank( up_tax.getRank().toLowerCase() );
295 catch ( final PhyloXmlDataFormatException ex ) {
299 if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
300 tax.setIdentifier( new Identifier( up_tax.getId(), "uniprot" ) );
302 if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
303 tax.setCommonName( up_tax.getCommonName() );
305 if ( !ForesterUtil.isEmpty( up_tax.getSynonym() ) && !tax.getSynonyms().contains( up_tax.getSynonym() ) ) {
306 tax.getSynonyms().add( up_tax.getSynonym() );
308 if ( up_tax.getLineage() != null ) {
309 tax.setLineage( new ArrayList<String>() );
310 for( final String lin : up_tax.getLineage() ) {
311 if ( !ForesterUtil.isEmpty( lin ) ) {
312 tax.getLineage().add( lin );
317 if ( ForesterUtil.isEmpty( tax.getLineage() ) ) {
318 tax.setLineage( new ArrayList<String>() );
319 for( final String lin : last_common_lineage ) {
320 if ( !ForesterUtil.isEmpty( lin ) ) {
321 tax.getLineage().add( lin );
325 for( final PhylogenyNode desc : descs ) {
326 if ( !desc.isExternal() && desc.getNodeData().isHasTaxonomy()
327 && desc.getNodeData().getTaxonomy().isEqual( tax ) ) {
328 desc.getNodeData().setTaxonomy( null );
333 synchronized private static boolean isHasAppropriateId( final Taxonomy tax ) {
334 return ( ( tax.getIdentifier() != null ) && ( !ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) && ( tax
335 .getIdentifier().getProvider().equalsIgnoreCase( "ncbi" )
336 || tax.getIdentifier().getProvider().equalsIgnoreCase( "uniprot" ) || tax.getIdentifier().getProvider()
337 .equalsIgnoreCase( "uniprotkb" ) ) ) );
340 synchronized public static SortedSet<String> obtainDetailedTaxonomicInformation( final Phylogeny phy,
341 final boolean delete )
342 throws IOException, AncestralTaxonomyInferenceException {
343 clearCachesIfTooLarge();
344 final SortedSet<String> not_found = new TreeSet<String>();
345 List<PhylogenyNode> not_found_external_nodes = null;
347 not_found_external_nodes = new ArrayList<PhylogenyNode>();
349 for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) {
350 final PhylogenyNode node = iter.next();
351 final QUERY_TYPE qt = null;
353 if ( node.getNodeData().isHasTaxonomy() ) {
354 tax = node.getNodeData().getTaxonomy();
356 else if ( node.isExternal() ) {
357 if ( !ForesterUtil.isEmpty( node.getName() ) ) {
358 not_found.add( node.getName() );
361 not_found.add( node.toString() );
364 not_found_external_nodes.add( node );
367 UniProtTaxonomy uniprot_tax = null;
369 && ( isHasAppropriateId( tax ) || !ForesterUtil.isEmpty( tax.getScientificName() )
370 || !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) || !ForesterUtil.isEmpty( tax
371 .getCommonName() ) ) ) {
372 uniprot_tax = obtainUniProtTaxonomy( tax, null, qt );
373 if ( uniprot_tax != null ) {
374 updateTaxonomy( qt, node, tax, uniprot_tax );
377 not_found.add( tax.toString() );
378 if ( delete && node.isExternal() ) {
379 not_found_external_nodes.add( node );
385 for( final PhylogenyNode node : not_found_external_nodes ) {
386 phy.deleteSubtree( node, true );
388 phy.externalNodesHaveChanged();
390 phy.recalculateNumberOfExternalDescendants( true );
395 synchronized public static UniProtTaxonomy obtainUniProtTaxonomy( final Taxonomy tax, Object query, QUERY_TYPE qt )
396 throws IOException, AncestralTaxonomyInferenceException {
397 if ( isHasAppropriateId( tax ) ) {
398 query = tax.getIdentifier().getValue();
400 return getTaxonomies( getIdTaxCacheMap(), query, qt );
402 else if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) {
403 if ( !ForesterUtil.isEmpty( tax.getLineage() ) ) {
404 query = tax.getLineage();
406 return getTaxonomies( getLineageTaxCacheMap(), query, qt );
409 query = tax.getScientificName();
411 return getTaxonomies( getSnTaxCacheMap(), query, qt );
414 else if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
415 query = tax.getTaxonomyCode();
416 qt = QUERY_TYPE.CODE;
417 return getTaxonomies( getCodeTaxCacheMap(), query, qt );
420 query = tax.getCommonName();
422 return getTaxonomies( getCnTaxCacheMap(), query, qt );
426 synchronized private static UniProtTaxonomy obtainUniProtTaxonomyFromLineage( final List<String> lineage )
427 throws AncestralTaxonomyInferenceException, IOException {
428 final String lineage_str = ForesterUtil.stringListToString( lineage, ">" );
429 UniProtTaxonomy up_tax = null;
430 if ( getLineageTaxCacheMap().containsKey( lineage_str ) ) {
431 up_tax = getLineageTaxCacheMap().get( lineage_str ).copy();
434 final List<UniProtTaxonomy> up_taxonomies = getTaxonomiesFromScientificName( lineage
435 .get( lineage.size() - 1 ) );
436 if ( ( up_taxonomies != null ) && ( up_taxonomies.size() > 0 ) ) {
437 for( final UniProtTaxonomy up_taxonomy : up_taxonomies ) {
438 boolean match = true;
439 I: for( int i = 0; i < lineage.size(); ++i ) {
440 if ( !lineage.get( i ).equalsIgnoreCase( up_taxonomy.getLineage().get( i ) ) ) {
446 if ( up_tax != null ) {
447 throw new AncestralTaxonomyInferenceException( "lineage \""
448 + ForesterUtil.stringListToString( lineage, " > " ) + "\" is not unique" );
450 up_tax = up_taxonomy;
453 if ( up_tax == null ) {
454 throw new AncestralTaxonomyInferenceException( "lineage \""
455 + ForesterUtil.stringListToString( lineage, " > " ) + "\" not found" );
457 getLineageTaxCacheMap().put( lineage_str, up_tax );
458 if ( !ForesterUtil.isEmpty( up_tax.getScientificName() ) ) {
459 getSnTaxCacheMap().put( up_tax.getScientificName(), up_tax );
461 if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) {
462 getCodeTaxCacheMap().put( up_tax.getCode(), up_tax );
464 if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
465 getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax );
467 if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
468 getIdTaxCacheMap().put( up_tax.getId(), up_tax );
475 synchronized private static void updateTaxonomy( final QUERY_TYPE qt,
476 final PhylogenyNode node,
478 final UniProtTaxonomy up_tax ) {
479 if ( ( qt != QUERY_TYPE.SN ) && !ForesterUtil.isEmpty( up_tax.getScientificName() )
480 && ForesterUtil.isEmpty( tax.getScientificName() ) ) {
481 tax.setScientificName( up_tax.getScientificName() );
483 if ( node.isExternal() && ( qt != QUERY_TYPE.CODE ) && !ForesterUtil.isEmpty( up_tax.getCode() )
484 && ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
485 tax.setTaxonomyCode( up_tax.getCode() );
487 if ( ( qt != QUERY_TYPE.CN ) && !ForesterUtil.isEmpty( up_tax.getCommonName() )
488 && ForesterUtil.isEmpty( tax.getCommonName() ) ) {
489 tax.setCommonName( up_tax.getCommonName() );
491 if ( !ForesterUtil.isEmpty( up_tax.getSynonym() ) && !tax.getSynonyms().contains( up_tax.getSynonym() ) ) {
492 tax.getSynonyms().add( up_tax.getSynonym() );
494 if ( !ForesterUtil.isEmpty( up_tax.getRank() ) && ForesterUtil.isEmpty( tax.getRank() ) ) {
496 tax.setRank( up_tax.getRank().toLowerCase() );
498 catch ( final PhyloXmlDataFormatException ex ) {
502 if ( ( qt != QUERY_TYPE.ID ) && !ForesterUtil.isEmpty( up_tax.getId() )
503 && ( ( tax.getIdentifier() == null ) || ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) ) {
504 tax.setIdentifier( new Identifier( up_tax.getId(), "uniprot" ) );
506 if ( up_tax.getLineage() != null ) {
507 tax.setLineage( new ArrayList<String>() );
508 for( final String lin : up_tax.getLineage() ) {
509 if ( !ForesterUtil.isEmpty( lin ) ) {
510 tax.getLineage().add( lin );
516 private enum QUERY_TYPE {
517 CODE, SN, CN, ID, LIN;