1 // forester -- software libraries and applications
2 // for genomics and evolutionary biology research.
4 // Copyright (C) 2010 Christian M Zmasek
5 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
8 // This library is free software; you can redistribute it and/or
9 // modify it under the terms of the GNU Lesser General Public
10 // License as published by the Free Software Foundation; either
11 // version 2.1 of the License, or (at your option) any later version.
13 // This library is distributed in the hope that it will be useful,
14 // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 // Lesser General Public License for more details.
18 // You should have received a copy of the GNU Lesser General Public
19 // License along with this library; if not, write to the Free Software
20 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 // Contact: phylosoft @ gmail . com
23 // WWW: www.phylosoft.org/forester
25 package org.forester.analysis;
27 import java.io.IOException;
28 import java.util.ArrayList;
29 import java.util.HashMap;
30 import java.util.List;
31 import java.util.SortedSet;
32 import java.util.TreeSet;
34 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
35 import org.forester.phylogeny.Phylogeny;
36 import org.forester.phylogeny.PhylogenyNode;
37 import org.forester.phylogeny.data.Identifier;
38 import org.forester.phylogeny.data.Taxonomy;
39 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
40 import org.forester.util.ForesterUtil;
41 import org.forester.ws.uniprot.UniProtTaxonomy;
42 import org.forester.ws.uniprot.UniProtWsTools;
44 public final class AncestralTaxonomyInference {
46 private static final int MAX_CACHE_SIZE = 100000;
47 private static final int MAX_TAXONOMIES_TO_RETURN = 100;
48 private static final HashMap<String, UniProtTaxonomy> _sn_up_cache_map = new HashMap<String, UniProtTaxonomy>();
49 private static final HashMap<String, UniProtTaxonomy> _code_up_cache_map = new HashMap<String, UniProtTaxonomy>();
50 private static final HashMap<String, UniProtTaxonomy> _cn_up_cache_map = new HashMap<String, UniProtTaxonomy>();
51 private static final HashMap<String, UniProtTaxonomy> _id_up_cache_map = new HashMap<String, UniProtTaxonomy>();
53 synchronized private static void clearCachesIfTooLarge() {
54 if ( getSnTaxCacheMap().size() > MAX_CACHE_SIZE ) {
55 getSnTaxCacheMap().clear();
57 if ( getCnTaxCacheMap().size() > MAX_CACHE_SIZE ) {
58 getCnTaxCacheMap().clear();
60 if ( getCodeTaxCacheMap().size() > MAX_CACHE_SIZE ) {
61 getCodeTaxCacheMap().clear();
63 if ( getIdTaxCacheMap().size() > MAX_CACHE_SIZE ) {
64 getIdTaxCacheMap().clear();
68 synchronized private static HashMap<String, UniProtTaxonomy> getCnTaxCacheMap() {
69 return _cn_up_cache_map;
72 synchronized private static HashMap<String, UniProtTaxonomy> getCodeTaxCacheMap() {
73 return _code_up_cache_map;
76 synchronized private static HashMap<String, UniProtTaxonomy> getIdTaxCacheMap() {
77 return _id_up_cache_map;
80 synchronized private static HashMap<String, UniProtTaxonomy> getSnTaxCacheMap() {
81 return _sn_up_cache_map;
84 synchronized private static UniProtTaxonomy getTaxonomies( final HashMap<String, UniProtTaxonomy> cache,
86 final QUERY_TYPE qt ) throws IOException {
87 if ( cache.containsKey( query ) ) {
88 return cache.get( query ).copy();
91 List<UniProtTaxonomy> up_taxonomies = null;
94 up_taxonomies = getTaxonomiesFromId( query );
97 up_taxonomies = getTaxonomiesFromTaxonomyCode( query );
100 up_taxonomies = getTaxonomiesFromScientificName( query );
103 up_taxonomies = getTaxonomiesFromCommonName( query );
106 throw new RuntimeException();
108 if ( ( up_taxonomies != null ) && ( up_taxonomies.size() == 1 ) ) {
109 final UniProtTaxonomy up_tax = up_taxonomies.get( 0 );
110 if ( !ForesterUtil.isEmpty( up_tax.getScientificName() ) ) {
111 getSnTaxCacheMap().put( up_tax.getScientificName(), up_tax );
113 if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) {
114 getCodeTaxCacheMap().put( up_tax.getCode(), up_tax );
116 if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
117 getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax );
119 if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
120 getIdTaxCacheMap().put( up_tax.getId(), up_tax );
130 synchronized private static List<UniProtTaxonomy> getTaxonomiesFromCommonName( final String query )
132 return UniProtWsTools.getTaxonomiesFromCommonNameStrict( query, MAX_TAXONOMIES_TO_RETURN );
135 synchronized private static List<UniProtTaxonomy> getTaxonomiesFromId( final String query ) throws IOException {
136 return UniProtWsTools.getTaxonomiesFromId( query, MAX_TAXONOMIES_TO_RETURN );
139 synchronized private static List<UniProtTaxonomy> getTaxonomiesFromScientificName( final String query )
141 return UniProtWsTools.getTaxonomiesFromScientificNameStrict( query, MAX_TAXONOMIES_TO_RETURN );
144 synchronized private static List<UniProtTaxonomy> getTaxonomiesFromTaxonomyCode( final String query )
146 return UniProtWsTools.getTaxonomiesFromTaxonomyCode( query, MAX_TAXONOMIES_TO_RETURN );
149 synchronized public static SortedSet<String> inferTaxonomyFromDescendents( final Phylogeny phy ) throws IOException {
150 clearCachesIfTooLarge();
151 final SortedSet<String> not_found = new TreeSet<String>();
152 for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) {
153 final PhylogenyNode node = iter.next();
154 if ( !node.isExternal() ) {
155 inferTaxonomyFromDescendents( node, not_found );
161 synchronized private static void inferTaxonomyFromDescendents( final PhylogenyNode n,
162 final SortedSet<String> not_found )
164 if ( n.isExternal() ) {
165 throw new IllegalArgumentException( "attempt to infer taxonomy from descendants of external node" );
167 n.getNodeData().setTaxonomy( null );
168 final List<PhylogenyNode> descs = n.getDescendants();
169 final List<String[]> lineages = new ArrayList<String[]>();
170 int shortest_lin_length = Integer.MAX_VALUE;
171 for( final PhylogenyNode desc : descs ) {
172 if ( desc.getNodeData().isHasTaxonomy()
173 && ( isHasAppropriateId( desc.getNodeData().getTaxonomy() )
174 || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getScientificName() )
175 || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getTaxonomyCode() ) || !ForesterUtil
176 .isEmpty( desc.getNodeData().getTaxonomy().getCommonName() ) ) ) {
178 final UniProtTaxonomy up_tax = obtainUniProtTaxonomy( desc.getNodeData().getTaxonomy(), null, null );
179 String[] lineage = null;
180 if ( up_tax != null ) {
181 //lineage = obtainLineagePlusOwnScientificName( up_tax );
182 lineage = up_tax.getLineageAsArray();
184 if ( ( lineage == null ) || ( lineage.length < 1 ) ) {
186 System.out.println( "node " + desc.getNodeData().getTaxonomy().toString() + " has no lineage!" );
187 not_found.add( desc.getNodeData().getTaxonomy().asText().toString() );
190 if ( lineage.length < shortest_lin_length ) {
191 shortest_lin_length = lineage.length;
193 lineages.add( lineage );
196 String msg = "Node(s) with no or inappropriate taxonomic information found";
198 if ( !ForesterUtil.isEmpty( desc.getName() ) ) {
199 node = "\"" + desc.getName() + "\"";
202 node = "[" + desc.getId() + "]";
204 msg = "Node " + node + " has no or inappropriate taxonomic information";
205 // final List<PhylogenyNode> e = desc.getAllExternalDescendants();
207 // System.out.println();
209 // for( final PhylogenyNode object : e ) {
210 // System.out.println( x + ":" );
211 // System.out.println( object.getName() + " " );
214 // System.out.println();
216 throw new IllegalArgumentException( msg );
219 List<String> last_common_lineage = new ArrayList<String>();
220 String last_common = null;
221 if ( shortest_lin_length > 0 ) {
222 I: for( int i = 0; i < shortest_lin_length; ++i ) {
223 final String lineage_0 = lineages.get( 0 )[ i ];
224 for( int j = 1; j < lineages.size(); ++j ) {
225 if ( !lineage_0.equals( lineages.get( j )[ i ] ) ) {
229 // last_common_lineage = lineage_0;
230 last_common_lineage.add( lineage_0 ) ;
231 last_common =lineage_0;
234 // if ( last_common_lineage == null ) {
235 if ( last_common_lineage.isEmpty() ) {
236 System.out.println( "No common lineage for:" );
238 for( final String[] strings : lineages ) {
239 System.out.print( counter + ": " );
241 for( final String string : strings ) {
242 System.out.print( string + " " );
244 System.out.println();
248 final Taxonomy tax = new Taxonomy();
249 n.getNodeData().setTaxonomy( tax );
250 tax.setScientificName( last_common );
251 final UniProtTaxonomy up_tax = obtainUniProtTaxonomyFromCommonLineage( last_common_lineage );
252 if ( up_tax != null ) {
253 if ( !ForesterUtil.isEmpty( up_tax.getRank() ) ) {
255 tax.setRank( up_tax.getRank().toLowerCase() );
257 catch ( final PhyloXmlDataFormatException ex ) {
261 if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
262 tax.setIdentifier( new Identifier( up_tax.getId(), "uniprot" ) );
264 if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
265 tax.setCommonName( up_tax.getCommonName() );
267 if ( !ForesterUtil.isEmpty( up_tax.getSynonym() ) && !tax.getSynonyms().contains( up_tax.getSynonym() ) ) {
268 tax.getSynonyms().add( up_tax.getSynonym() );
270 if ( up_tax.getLineage() != null ) {
271 tax.setLineage( new ArrayList<String>() );
272 for( final String lin : up_tax.getLineage() ) {
273 if ( !ForesterUtil.isEmpty( lin ) ) {
274 tax.getLineage().add( lin );
280 for( final PhylogenyNode desc : descs ) {
281 if ( !desc.isExternal() && desc.getNodeData().isHasTaxonomy()
282 && desc.getNodeData().getTaxonomy().isEqual( tax ) ) {
283 desc.getNodeData().setTaxonomy( null );
288 synchronized private static boolean isHasAppropriateId( final Taxonomy tax ) {
289 return ( ( tax.getIdentifier() != null ) && ( !ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) && ( tax
290 .getIdentifier().getProvider().equalsIgnoreCase( "ncbi" )
291 || tax.getIdentifier().getProvider().equalsIgnoreCase( "uniprot" ) || tax.getIdentifier().getProvider()
292 .equalsIgnoreCase( "uniprotkb" ) ) ) );
295 synchronized public static SortedSet<String> obtainDetailedTaxonomicInformation( final Phylogeny phy,
296 final boolean delete )
298 clearCachesIfTooLarge();
299 final SortedSet<String> not_found = new TreeSet<String>();
300 List<PhylogenyNode> not_found_external_nodes = null;
302 not_found_external_nodes = new ArrayList<PhylogenyNode>();
304 for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) {
305 final PhylogenyNode node = iter.next();
306 final QUERY_TYPE qt = null;
308 if ( node.getNodeData().isHasTaxonomy() ) {
309 tax = node.getNodeData().getTaxonomy();
311 else if ( node.isExternal() ) {
312 if ( !ForesterUtil.isEmpty( node.getName() ) ) {
313 not_found.add( node.getName() );
316 not_found.add( node.toString() );
319 not_found_external_nodes.add( node );
322 UniProtTaxonomy uniprot_tax = null;
324 && ( isHasAppropriateId( tax ) || !ForesterUtil.isEmpty( tax.getScientificName() )
325 || !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) || !ForesterUtil.isEmpty( tax
326 .getCommonName() ) ) ) {
327 uniprot_tax = obtainUniProtTaxonomy( tax, null, qt );
328 if ( uniprot_tax != null ) {
329 updateTaxonomy( qt, node, tax, uniprot_tax );
332 not_found.add( tax.toString() );
333 if ( delete && node.isExternal() ) {
334 not_found_external_nodes.add( node );
340 for( final PhylogenyNode node : not_found_external_nodes ) {
341 phy.deleteSubtree( node, true );
343 phy.externalNodesHaveChanged();
345 phy.recalculateNumberOfExternalDescendants( true );
350 // TODO this might not be needed anymore
351 // synchronized private static String[] obtainLineagePlusOwnScientificName( final UniProtTaxonomy up_tax ) {
352 // final String[] lineage = up_tax.getLineageAsArray();
353 // final String[] lin_plus_self = new String[ lineage.length + 1 ];
354 // for( int i = 0; i < lineage.length; ++i ) {
355 // lin_plus_self[ i ] = lineage[ i ];
357 // lin_plus_self[ lineage.length ] = up_tax.getScientificName();
358 // return lin_plus_self;
360 synchronized private static UniProtTaxonomy obtainUniProtTaxonomy( final Taxonomy tax, String query, QUERY_TYPE qt )
362 if ( isHasAppropriateId( tax ) ) {
363 query = tax.getIdentifier().getValue();
365 System.out.println( "query by id: " + query);
366 return getTaxonomies( getIdTaxCacheMap(), query, qt );
368 else if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) {
369 query = tax.getScientificName();
371 System.out.println( "query by sn: " + query);
372 return getTaxonomies( getSnTaxCacheMap(), query, qt );
374 else if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
375 query = tax.getTaxonomyCode();
376 qt = QUERY_TYPE.CODE;
377 return getTaxonomies( getCodeTaxCacheMap(), query, qt );
380 query = tax.getCommonName();
382 return getTaxonomies( getCnTaxCacheMap(), query, qt );
386 synchronized private static UniProtTaxonomy obtainUniProtTaxonomyFromSn( final String sn) throws IOException {
387 UniProtTaxonomy up_tax = null;
388 if ( getSnTaxCacheMap().containsKey( sn ) ) {
389 up_tax = getSnTaxCacheMap().get( sn ).copy();
392 final List<UniProtTaxonomy> up_taxonomies = getTaxonomiesFromScientificName( sn );
393 if ( ( up_taxonomies != null ) && ( up_taxonomies.size() == 1 ) ) {
394 up_tax = up_taxonomies.get( 0 );
395 getSnTaxCacheMap().put( sn, up_tax );
396 if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) {
397 getCodeTaxCacheMap().put( up_tax.getCode(), up_tax );
399 if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
400 getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax );
402 if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
403 getIdTaxCacheMap().put( up_tax.getId(), up_tax );
411 synchronized private static UniProtTaxonomy obtainUniProtTaxonomyFromCommonLineage( List<String> lineage ) throws IOException {
412 UniProtTaxonomy up_tax = null;
413 // -- if ( getSnTaxCacheMap().containsKey( sn ) ) {
414 // -- up_tax = getSnTaxCacheMap().get( sn ).copy();
417 final List<UniProtTaxonomy> up_taxonomies = getTaxonomiesFromScientificName( lineage.get(lineage.size() -1 ) );
418 //-- if ( ( up_taxonomies != null ) && ( up_taxonomies.size() == 1 ) ) {
420 if ( ( up_taxonomies != null ) && ( up_taxonomies.size() > 0 ) ) {
421 for( UniProtTaxonomy up_taxonomy : up_taxonomies ) {
422 boolean match = true;
424 I: for( int i = 0; i < lineage.size(); ++i ) {
425 if ( !lineage.get( i ).equalsIgnoreCase( up_taxonomy.getLineage().get( i ) ) ) {
431 if ( up_tax != null ) {
432 throw new IOException( "not unique!");
434 up_tax = up_taxonomy;
438 if ( up_tax == null ) {
439 throw new IOException( "not found!");
441 //-- up_tax = up_taxonomies.get( 0 );
442 //-- getSnTaxCacheMap().put( sn, up_tax );
443 if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) {
444 getCodeTaxCacheMap().put( up_tax.getCode(), up_tax );
446 if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
447 getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax );
449 if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
450 getIdTaxCacheMap().put( up_tax.getId(), up_tax );
458 synchronized private static void updateTaxonomy( final QUERY_TYPE qt,
459 final PhylogenyNode node,
461 final UniProtTaxonomy up_tax ) {
462 if ( ( qt != QUERY_TYPE.SN ) && !ForesterUtil.isEmpty( up_tax.getScientificName() )
463 && ForesterUtil.isEmpty( tax.getScientificName() ) ) {
464 tax.setScientificName( up_tax.getScientificName() );
466 // if ( node.isExternal()
467 if ( ( qt != QUERY_TYPE.CODE ) && !ForesterUtil.isEmpty( up_tax.getCode() )
468 && ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
469 tax.setTaxonomyCode( up_tax.getCode() );
471 if ( ( qt != QUERY_TYPE.CN ) && !ForesterUtil.isEmpty( up_tax.getCommonName() )
472 && ForesterUtil.isEmpty( tax.getCommonName() ) ) {
473 tax.setCommonName( up_tax.getCommonName() );
475 if ( !ForesterUtil.isEmpty( up_tax.getSynonym() ) && !tax.getSynonyms().contains( up_tax.getSynonym() ) ) {
476 tax.getSynonyms().add( up_tax.getSynonym() );
478 if ( !ForesterUtil.isEmpty( up_tax.getRank() ) && ForesterUtil.isEmpty( tax.getRank() ) ) {
480 tax.setRank( up_tax.getRank().toLowerCase() );
482 catch ( final PhyloXmlDataFormatException ex ) {
486 if ( ( qt != QUERY_TYPE.ID ) && !ForesterUtil.isEmpty( up_tax.getId() ) && ( tax.getIdentifier() == null ) ) {
487 tax.setIdentifier( new Identifier( up_tax.getId(), "uniprot" ) );
489 if ( up_tax.getLineage() != null ) {
490 tax.setLineage( new ArrayList<String>() );
491 for( final String lin : up_tax.getLineage() ) {
492 if ( !ForesterUtil.isEmpty( lin ) ) {
493 tax.getLineage().add( lin );
500 private enum QUERY_TYPE {