1 // forester -- software libraries and applications
2 // for genomics and evolutionary biology research.
4 // Copyright (C) 2010 Christian M Zmasek
5 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
8 // This library is free software; you can redistribute it and/or
9 // modify it under the terms of the GNU Lesser General Public
10 // License as published by the Free Software Foundation; either
11 // version 2.1 of the License, or (at your option) any later version.
13 // This library is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 // Lesser General Public License for more details.
18 // You should have received a copy of the GNU Lesser General Public
19 // License along with this library; if not, write to the Free Software
20 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 // Contact: phylosoft @ gmail . com
23 // WWW: www.phylosoft.org/forester
25 package org.forester.analysis;
27 import java.io.IOException;
28 import java.util.ArrayList;
29 import java.util.HashMap;
30 import java.util.List;
31 import java.util.SortedSet;
32 import java.util.TreeSet;
34 import org.forester.archaeopteryx.tools.AncestralTaxonomyInferenceException;
35 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
36 import org.forester.phylogeny.Phylogeny;
37 import org.forester.phylogeny.PhylogenyNode;
38 import org.forester.phylogeny.data.Identifier;
39 import org.forester.phylogeny.data.Taxonomy;
40 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
41 import org.forester.util.ForesterUtil;
42 import org.forester.ws.uniprot.UniProtTaxonomy;
43 import org.forester.ws.uniprot.UniProtWsTools;
45 public final class AncestralTaxonomyInference {
47 private static final int MAX_CACHE_SIZE = 100000;
48 private static final int MAX_TAXONOMIES_TO_RETURN = 100;
49 private static final HashMap<String, UniProtTaxonomy> _sn_up_cache_map = new HashMap<String, UniProtTaxonomy>();
50 private static final HashMap<String, UniProtTaxonomy> _code_up_cache_map = new HashMap<String, UniProtTaxonomy>();
51 private static final HashMap<String, UniProtTaxonomy> _cn_up_cache_map = new HashMap<String, UniProtTaxonomy>();
52 private static final HashMap<String, UniProtTaxonomy> _id_up_cache_map = new HashMap<String, UniProtTaxonomy>();
54 synchronized private static void clearCachesIfTooLarge() {
55 if ( getSnTaxCacheMap().size() > MAX_CACHE_SIZE ) {
56 getSnTaxCacheMap().clear();
58 if ( getCnTaxCacheMap().size() > MAX_CACHE_SIZE ) {
59 getCnTaxCacheMap().clear();
61 if ( getCodeTaxCacheMap().size() > MAX_CACHE_SIZE ) {
62 getCodeTaxCacheMap().clear();
64 if ( getIdTaxCacheMap().size() > MAX_CACHE_SIZE ) {
65 getIdTaxCacheMap().clear();
69 synchronized private static HashMap<String, UniProtTaxonomy> getCnTaxCacheMap() {
70 return _cn_up_cache_map;
73 synchronized private static HashMap<String, UniProtTaxonomy> getCodeTaxCacheMap() {
74 return _code_up_cache_map;
77 synchronized private static HashMap<String, UniProtTaxonomy> getIdTaxCacheMap() {
78 return _id_up_cache_map;
81 synchronized private static HashMap<String, UniProtTaxonomy> getSnTaxCacheMap() {
82 return _sn_up_cache_map;
85 synchronized private static UniProtTaxonomy getTaxonomies( final HashMap<String, UniProtTaxonomy> cache,
87 final QUERY_TYPE qt ) throws IOException {
88 if ( cache.containsKey( query ) ) {
89 return cache.get( query ).copy();
92 List<UniProtTaxonomy> up_taxonomies = null;
95 up_taxonomies = getTaxonomiesFromId( query );
98 up_taxonomies = getTaxonomiesFromTaxonomyCode( query );
101 up_taxonomies = getTaxonomiesFromScientificName( query );
104 up_taxonomies = getTaxonomiesFromCommonName( query );
107 throw new RuntimeException();
109 if ( ( up_taxonomies != null ) && ( up_taxonomies.size() == 1 ) ) {
110 final UniProtTaxonomy up_tax = up_taxonomies.get( 0 );
111 if ( !ForesterUtil.isEmpty( up_tax.getScientificName() ) ) {
112 getSnTaxCacheMap().put( up_tax.getScientificName(), up_tax );
114 if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) {
115 getCodeTaxCacheMap().put( up_tax.getCode(), up_tax );
117 if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
118 getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax );
120 if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
121 getIdTaxCacheMap().put( up_tax.getId(), up_tax );
131 synchronized private static List<UniProtTaxonomy> getTaxonomiesFromCommonName( final String query )
133 return UniProtWsTools.getTaxonomiesFromCommonNameStrict( query, MAX_TAXONOMIES_TO_RETURN );
136 synchronized private static List<UniProtTaxonomy> getTaxonomiesFromId( final String query ) throws IOException {
137 return UniProtWsTools.getTaxonomiesFromId( query, MAX_TAXONOMIES_TO_RETURN );
140 synchronized private static List<UniProtTaxonomy> getTaxonomiesFromScientificName( final String query )
142 return UniProtWsTools.getTaxonomiesFromScientificNameStrict( query, MAX_TAXONOMIES_TO_RETURN );
145 synchronized private static List<UniProtTaxonomy> getTaxonomiesFromTaxonomyCode( final String query )
147 return UniProtWsTools.getTaxonomiesFromTaxonomyCode( query, MAX_TAXONOMIES_TO_RETURN );
150 synchronized public static void inferTaxonomyFromDescendents( final Phylogeny phy ) throws IOException,
151 AncestralTaxonomyInferenceException {
152 clearCachesIfTooLarge();
153 for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) {
154 final PhylogenyNode node = iter.next();
155 if ( !node.isExternal() ) {
156 inferTaxonomyFromDescendents( node );
161 synchronized private static void inferTaxonomyFromDescendents( final PhylogenyNode n ) throws IOException,
162 AncestralTaxonomyInferenceException {
163 if ( n.isExternal() ) {
164 throw new IllegalArgumentException( "attempt to infer taxonomy from descendants of external node" );
166 n.getNodeData().setTaxonomy( null );
167 final List<PhylogenyNode> descs = n.getDescendants();
168 final List<String[]> lineages = new ArrayList<String[]>();
169 int shortest_lin_length = Integer.MAX_VALUE;
170 for( final PhylogenyNode desc : descs ) {
171 if ( desc.getNodeData().isHasTaxonomy()
172 && ( isHasAppropriateId( desc.getNodeData().getTaxonomy() )
173 || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getScientificName() )
174 || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getTaxonomyCode() ) || !ForesterUtil
175 .isEmpty( desc.getNodeData().getTaxonomy().getCommonName() ) ) ) {
176 final UniProtTaxonomy up_tax = obtainUniProtTaxonomy( desc.getNodeData().getTaxonomy(), null, null );
177 String[] lineage = null;
178 if ( up_tax != null ) {
179 lineage = up_tax.getLineageAsArray();
181 if ( ( lineage == null ) || ( lineage.length < 1 ) ) {
182 throw new AncestralTaxonomyInferenceException( "a taxonomic lineage for node \""
183 + desc.getNodeData().getTaxonomy().toString() + "\" could not be found" );
185 if ( lineage.length < shortest_lin_length ) {
186 shortest_lin_length = lineage.length;
188 lineages.add( lineage );
192 if ( !ForesterUtil.isEmpty( desc.getName() ) ) {
193 node = "\"" + desc.getName() + "\"";
196 node = "[" + desc.getId() + "]";
198 // final List<PhylogenyNode> e = desc.getAllExternalDescendants();
200 // System.out.println();
202 // for( final PhylogenyNode object : e ) {
203 // System.out.println( x + ":" );
204 // System.out.println( object.getName() + " " );
207 // System.out.println();
209 throw new AncestralTaxonomyInferenceException( "node " + node
210 + " has no or inappropriate taxonomic information" );
213 final List<String> last_common_lineage = new ArrayList<String>();
214 String last_common = null;
215 if ( shortest_lin_length > 0 ) {
216 I: for( int i = 0; i < shortest_lin_length; ++i ) {
217 final String lineage_0 = lineages.get( 0 )[ i ];
218 for( int j = 1; j < lineages.size(); ++j ) {
219 if ( !lineage_0.equals( lineages.get( j )[ i ] ) ) {
223 // last_common_lineage = lineage_0;
224 last_common_lineage.add( lineage_0 );
225 last_common = lineage_0;
228 // if ( last_common_lineage == null ) {
229 if ( last_common_lineage.isEmpty() ) {
230 String msg = "no common lineage for:\n";
232 for( final String[] strings : lineages ) {
233 msg += counter + ": ";
235 for( final String string : strings ) {
240 throw new AncestralTaxonomyInferenceException( msg );
242 final Taxonomy tax = new Taxonomy();
243 n.getNodeData().setTaxonomy( tax );
244 tax.setScientificName( last_common );
245 final UniProtTaxonomy up_tax = obtainUniProtTaxonomyFromCommonLineage( last_common_lineage );
246 if ( up_tax != null ) {
247 if ( !ForesterUtil.isEmpty( up_tax.getRank() ) ) {
249 tax.setRank( up_tax.getRank().toLowerCase() );
251 catch ( final PhyloXmlDataFormatException ex ) {
255 if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
256 tax.setIdentifier( new Identifier( up_tax.getId(), "uniprot" ) );
258 if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
259 tax.setCommonName( up_tax.getCommonName() );
261 if ( !ForesterUtil.isEmpty( up_tax.getSynonym() ) && !tax.getSynonyms().contains( up_tax.getSynonym() ) ) {
262 tax.getSynonyms().add( up_tax.getSynonym() );
264 if ( up_tax.getLineage() != null ) {
265 tax.setLineage( new ArrayList<String>() );
266 for( final String lin : up_tax.getLineage() ) {
267 if ( !ForesterUtil.isEmpty( lin ) ) {
268 tax.getLineage().add( lin );
273 for( final PhylogenyNode desc : descs ) {
274 if ( !desc.isExternal() && desc.getNodeData().isHasTaxonomy()
275 && desc.getNodeData().getTaxonomy().isEqual( tax ) ) {
276 desc.getNodeData().setTaxonomy( null );
281 synchronized private static boolean isHasAppropriateId( final Taxonomy tax ) {
282 return ( ( tax.getIdentifier() != null ) && ( !ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) && ( tax
283 .getIdentifier().getProvider().equalsIgnoreCase( "ncbi" )
284 || tax.getIdentifier().getProvider().equalsIgnoreCase( "uniprot" ) || tax.getIdentifier().getProvider()
285 .equalsIgnoreCase( "uniprotkb" ) ) ) );
288 synchronized public static SortedSet<String> obtainDetailedTaxonomicInformation( final Phylogeny phy,
289 final boolean delete )
291 clearCachesIfTooLarge();
292 final SortedSet<String> not_found = new TreeSet<String>();
293 List<PhylogenyNode> not_found_external_nodes = null;
295 not_found_external_nodes = new ArrayList<PhylogenyNode>();
297 for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) {
298 final PhylogenyNode node = iter.next();
299 final QUERY_TYPE qt = null;
301 if ( node.getNodeData().isHasTaxonomy() ) {
302 tax = node.getNodeData().getTaxonomy();
304 else if ( node.isExternal() ) {
305 if ( !ForesterUtil.isEmpty( node.getName() ) ) {
306 not_found.add( node.getName() );
309 not_found.add( node.toString() );
312 not_found_external_nodes.add( node );
315 UniProtTaxonomy uniprot_tax = null;
317 && ( isHasAppropriateId( tax ) || !ForesterUtil.isEmpty( tax.getScientificName() )
318 || !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) || !ForesterUtil.isEmpty( tax
319 .getCommonName() ) ) ) {
320 uniprot_tax = obtainUniProtTaxonomy( tax, null, qt );
321 if ( uniprot_tax != null ) {
322 updateTaxonomy( qt, node, tax, uniprot_tax );
325 not_found.add( tax.toString() );
326 if ( delete && node.isExternal() ) {
327 not_found_external_nodes.add( node );
333 for( final PhylogenyNode node : not_found_external_nodes ) {
334 phy.deleteSubtree( node, true );
336 phy.externalNodesHaveChanged();
338 phy.recalculateNumberOfExternalDescendants( true );
343 // TODO this might not be needed anymore
344 // synchronized private static String[] obtainLineagePlusOwnScientificName( final UniProtTaxonomy up_tax ) {
345 // final String[] lineage = up_tax.getLineageAsArray();
346 // final String[] lin_plus_self = new String[ lineage.length + 1 ];
347 // for( int i = 0; i < lineage.length; ++i ) {
348 // lin_plus_self[ i ] = lineage[ i ];
350 // lin_plus_self[ lineage.length ] = up_tax.getScientificName();
351 // return lin_plus_self;
353 synchronized private static UniProtTaxonomy obtainUniProtTaxonomy( final Taxonomy tax, String query, QUERY_TYPE qt )
355 if ( isHasAppropriateId( tax ) ) {
356 query = tax.getIdentifier().getValue();
358 System.out.println( "query by id: " + query );
359 return getTaxonomies( getIdTaxCacheMap(), query, qt );
361 else if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) {
362 query = tax.getScientificName();
364 System.out.println( "query by sn: " + query );
365 return getTaxonomies( getSnTaxCacheMap(), query, qt );
367 else if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
368 query = tax.getTaxonomyCode();
369 qt = QUERY_TYPE.CODE;
370 return getTaxonomies( getCodeTaxCacheMap(), query, qt );
373 query = tax.getCommonName();
375 return getTaxonomies( getCnTaxCacheMap(), query, qt );
379 synchronized private static UniProtTaxonomy obtainUniProtTaxonomyFromSn( final String sn ) throws IOException {
380 UniProtTaxonomy up_tax = null;
381 if ( getSnTaxCacheMap().containsKey( sn ) ) {
382 up_tax = getSnTaxCacheMap().get( sn ).copy();
385 final List<UniProtTaxonomy> up_taxonomies = getTaxonomiesFromScientificName( sn );
386 if ( ( up_taxonomies != null ) && ( up_taxonomies.size() == 1 ) ) {
387 up_tax = up_taxonomies.get( 0 );
388 getSnTaxCacheMap().put( sn, up_tax );
389 if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) {
390 getCodeTaxCacheMap().put( up_tax.getCode(), up_tax );
392 if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
393 getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax );
395 if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
396 getIdTaxCacheMap().put( up_tax.getId(), up_tax );
403 synchronized private static UniProtTaxonomy obtainUniProtTaxonomyFromCommonLineage( final List<String> lineage )
404 throws AncestralTaxonomyInferenceException, IOException {
405 UniProtTaxonomy up_tax = null;
406 // -- if ( getSnTaxCacheMap().containsKey( sn ) ) {
407 // -- up_tax = getSnTaxCacheMap().get( sn ).copy();
410 final List<UniProtTaxonomy> up_taxonomies = getTaxonomiesFromScientificName( lineage.get( lineage.size() - 1 ) );
411 //-- if ( ( up_taxonomies != null ) && ( up_taxonomies.size() == 1 ) ) {
412 if ( ( up_taxonomies != null ) && ( up_taxonomies.size() > 0 ) ) {
413 for( final UniProtTaxonomy up_taxonomy : up_taxonomies ) {
414 boolean match = true;
415 I: for( int i = 0; i < lineage.size(); ++i ) {
416 if ( !lineage.get( i ).equalsIgnoreCase( up_taxonomy.getLineage().get( i ) ) ) {
422 if ( up_tax != null ) {
423 throw new AncestralTaxonomyInferenceException( "lineage \""
424 + ForesterUtil.stringListToString( lineage, " > " ) + "\" is not unique" );
426 up_tax = up_taxonomy;
429 if ( up_tax == null ) {
430 throw new AncestralTaxonomyInferenceException( "lineage \""
431 + ForesterUtil.stringListToString( lineage, " > " ) + "\" not found" );
433 //-- up_tax = up_taxonomies.get( 0 );
434 //-- getSnTaxCacheMap().put( sn, up_tax );
435 if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) {
436 getCodeTaxCacheMap().put( up_tax.getCode(), up_tax );
438 if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
439 getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax );
441 if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
442 getIdTaxCacheMap().put( up_tax.getId(), up_tax );
449 synchronized private static void updateTaxonomy( final QUERY_TYPE qt,
450 final PhylogenyNode node,
452 final UniProtTaxonomy up_tax ) {
453 if ( ( qt != QUERY_TYPE.SN ) && !ForesterUtil.isEmpty( up_tax.getScientificName() )
454 && ForesterUtil.isEmpty( tax.getScientificName() ) ) {
455 tax.setScientificName( up_tax.getScientificName() );
457 // if ( node.isExternal()
458 if ( ( qt != QUERY_TYPE.CODE ) && !ForesterUtil.isEmpty( up_tax.getCode() )
459 && ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
460 tax.setTaxonomyCode( up_tax.getCode() );
462 if ( ( qt != QUERY_TYPE.CN ) && !ForesterUtil.isEmpty( up_tax.getCommonName() )
463 && ForesterUtil.isEmpty( tax.getCommonName() ) ) {
464 tax.setCommonName( up_tax.getCommonName() );
466 if ( !ForesterUtil.isEmpty( up_tax.getSynonym() ) && !tax.getSynonyms().contains( up_tax.getSynonym() ) ) {
467 tax.getSynonyms().add( up_tax.getSynonym() );
469 if ( !ForesterUtil.isEmpty( up_tax.getRank() ) && ForesterUtil.isEmpty( tax.getRank() ) ) {
471 tax.setRank( up_tax.getRank().toLowerCase() );
473 catch ( final PhyloXmlDataFormatException ex ) {
477 if ( ( qt != QUERY_TYPE.ID ) && !ForesterUtil.isEmpty( up_tax.getId() ) && ( tax.getIdentifier() == null ) ) {
478 tax.setIdentifier( new Identifier( up_tax.getId(), "uniprot" ) );
480 if ( up_tax.getLineage() != null ) {
481 tax.setLineage( new ArrayList<String>() );
482 for( final String lin : up_tax.getLineage() ) {
483 if ( !ForesterUtil.isEmpty( lin ) ) {
484 tax.getLineage().add( lin );
490 private enum QUERY_TYPE {