X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fanalysis%2FAncestralTaxonomyInference.java;h=73af2c142c86be1e23ec5858599e7b0db661a8fe;hb=8cb65713b89737f529cedce7bcd39f2b9f9fc8a1;hp=fb34a14eaa199b61dd3c4f56a7c957a301242d43;hpb=9bb791edf52887de31d1a49ff9606f85724a09a8;p=jalview.git diff --git a/forester/java/src/org/forester/analysis/AncestralTaxonomyInference.java b/forester/java/src/org/forester/analysis/AncestralTaxonomyInference.java index fb34a14..73af2c1 100644 --- a/forester/java/src/org/forester/analysis/AncestralTaxonomyInference.java +++ b/forester/java/src/org/forester/analysis/AncestralTaxonomyInference.java @@ -44,8 +44,9 @@ import org.forester.ws.uniprot.UniProtWsTools; public final class AncestralTaxonomyInference { private static final int MAX_CACHE_SIZE = 100000; - private static final int MAX_TAXONOMIES_TO_RETURN = 100; + private static final int MAX_TAXONOMIES_TO_RETURN = 10; private static final HashMap _sn_up_cache_map = new HashMap(); + private static final HashMap _lineage_up_cache_map = new HashMap(); private static final HashMap _code_up_cache_map = new HashMap(); private static final HashMap _cn_up_cache_map = new HashMap(); private static final HashMap _id_up_cache_map = new HashMap(); @@ -54,6 +55,9 @@ public final class AncestralTaxonomyInference { if ( getSnTaxCacheMap().size() > MAX_CACHE_SIZE ) { getSnTaxCacheMap().clear(); } + if ( getLineageTaxCacheMap().size() > MAX_CACHE_SIZE ) { + getLineageTaxCacheMap().clear(); + } if ( getCnTaxCacheMap().size() > MAX_CACHE_SIZE ) { getCnTaxCacheMap().clear(); } @@ -81,9 +85,14 @@ public final class AncestralTaxonomyInference { return _sn_up_cache_map; } + synchronized private static HashMap getLineageTaxCacheMap() { + return _lineage_up_cache_map; + } + synchronized private static UniProtTaxonomy getTaxonomies( final HashMap cache, - final String query, - final QUERY_TYPE qt ) throws IOException { + final Object query, + final QUERY_TYPE qt ) throws IOException, + AncestralTaxonomyInferenceException { if ( cache.containsKey( query ) ) { return cache.get( query ).copy(); } @@ -91,17 +100,19 @@ public final class AncestralTaxonomyInference { List up_taxonomies = null; switch ( qt ) { case ID: - up_taxonomies = getTaxonomiesFromId( query ); + up_taxonomies = getTaxonomiesFromId( ( String ) query ); break; case CODE: - up_taxonomies = getTaxonomiesFromTaxonomyCode( query ); + up_taxonomies = getTaxonomiesFromTaxonomyCode( ( String ) query ); break; case SN: - up_taxonomies = getTaxonomiesFromScientificName( query ); + up_taxonomies = getTaxonomiesFromScientificName( ( String ) query ); break; case CN: - up_taxonomies = getTaxonomiesFromCommonName( query ); + up_taxonomies = getTaxonomiesFromCommonName( ( String ) query ); break; + case LIN: + return obtainUniProtTaxonomyFromLineage( ( List ) query ); default: throw new RuntimeException(); } @@ -146,21 +157,19 @@ public final class AncestralTaxonomyInference { return UniProtWsTools.getTaxonomiesFromTaxonomyCode( query, MAX_TAXONOMIES_TO_RETURN ); } - synchronized public static SortedSet inferTaxonomyFromDescendents( final Phylogeny phy ) throws IOException { + synchronized public static void inferTaxonomyFromDescendents( final Phylogeny phy ) throws IOException, + AncestralTaxonomyInferenceException { clearCachesIfTooLarge(); - final SortedSet not_found = new TreeSet(); for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) { final PhylogenyNode node = iter.next(); if ( !node.isExternal() ) { - inferTaxonomyFromDescendents( node, not_found ); + inferTaxonomyFromDescendents( node ); } } - return not_found; } - synchronized private static void inferTaxonomyFromDescendents( final PhylogenyNode n, - final SortedSet not_found ) - throws IOException { + synchronized private static void inferTaxonomyFromDescendents( final PhylogenyNode n ) throws IOException, + AncestralTaxonomyInferenceException { if ( n.isExternal() ) { throw new IllegalArgumentException( "attempt to infer taxonomy from descendants of external node" ); } @@ -172,21 +181,31 @@ public final class AncestralTaxonomyInference { if ( desc.getNodeData().isHasTaxonomy() && ( isHasAppropriateId( desc.getNodeData().getTaxonomy() ) || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getScientificName() ) + || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getLineage() ) || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getTaxonomyCode() ) || !ForesterUtil .isEmpty( desc.getNodeData().getTaxonomy().getCommonName() ) ) ) { - final QUERY_TYPE qt = null; - final String query = null; - final UniProtTaxonomy up_tax = obtainUniProtTaxonomy( desc.getNodeData().getTaxonomy(), query, qt ); - String[] lineage = null; - if ( up_tax != null ) { - //lineage = obtainLineagePlusOwnScientificName( up_tax ); - lineage = up_tax.getLineageAsArray(); + final UniProtTaxonomy up_tax = obtainUniProtTaxonomy( desc.getNodeData().getTaxonomy(), null, null ); + if ( ( up_tax == null ) && ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getLineage() ) ) { + String desc_str = ""; + if ( !ForesterUtil.isEmpty( desc.getName() ) ) { + desc_str = "\"" + desc.getName() + "\""; + } + else { + desc_str = "[" + desc.getId() + "]"; + } + System.out.println( desc.getNodeData().getTaxonomy().toString() ); + System.out.println( ForesterUtil.stringListToString( desc.getNodeData().getTaxonomy().getLineage(), + " > " ) ); + throw new AncestralTaxonomyInferenceException( "a taxonomy for node " + desc_str + + " could not be established from the database" ); } + String[] lineage = ForesterUtil.stringListToArray( desc.getNodeData().getTaxonomy().getLineage() ); if ( ( lineage == null ) || ( lineage.length < 1 ) ) { - //TODO remove me - System.out.println( "node " + desc.getNodeData().getTaxonomy().toString() + " has no lineage!" ); - not_found.add( desc.getNodeData().getTaxonomy().asText().toString() ); - return; + lineage = ForesterUtil.stringListToArray( up_tax.getLineage() ); + } + if ( ( lineage == null ) || ( lineage.length < 1 ) ) { + throw new AncestralTaxonomyInferenceException( "a taxonomic lineage for node \"" + + desc.getNodeData().getTaxonomy().toString() + "\" could not be established" ); } if ( lineage.length < shortest_lin_length ) { shortest_lin_length = lineage.length; @@ -194,7 +213,6 @@ public final class AncestralTaxonomyInference { lineages.add( lineage ); } else { - String msg = "Node(s) with no or inappropriate taxonomic information found"; String node = ""; if ( !ForesterUtil.isEmpty( desc.getName() ) ) { node = "\"" + desc.getName() + "\""; @@ -202,22 +220,23 @@ public final class AncestralTaxonomyInference { else { node = "[" + desc.getId() + "]"; } - msg = "Node " + node + " has no or inappropriate taxonomic information"; - List e = desc.getAllExternalDescendants(); + // final List e = desc.getAllExternalDescendants(); //TODO remove me! - System.out.println(); - int x = 0; - for( PhylogenyNode object : e ) { - System.out.println( x + ":" ); - System.out.println( object.getName() + " " ); - x++; - } - System.out.println(); + // System.out.println(); + // int x = 0; + // for( final PhylogenyNode object : e ) { + // System.out.println( x + ":" ); + // System.out.println( object.getName() + " " ); + // x++; + // } + // System.out.println(); // - throw new IllegalArgumentException( msg ); + throw new AncestralTaxonomyInferenceException( "node " + node + + " has no or inappropriate taxonomic information" ); } } - String last_common_lineage = null; + final List last_common_lineage = new ArrayList(); + String last_common = null; if ( shortest_lin_length > 0 ) { I: for( int i = 0; i < shortest_lin_length; ++i ) { final String lineage_0 = lineages.get( 0 )[ i ]; @@ -226,29 +245,48 @@ public final class AncestralTaxonomyInference { break I; } } - last_common_lineage = lineage_0; + last_common_lineage.add( lineage_0 ); + last_common = lineage_0; } } - if ( last_common_lineage == null ) { - System.out.println( "No common lineage for:" ); - int counter = 0; - for( String[] strings : lineages ) { - System.out.print( counter + ": " ); - ++counter; - for( String string : strings ) { - System.out.print( string + " " ); + if ( last_common_lineage.isEmpty() ) { + boolean saw_viruses = false; + boolean saw_cellular_organism = false; + for( final String[] lineage : lineages ) { + if ( lineage.length > 0 ) { + if ( lineage[ 0 ].equalsIgnoreCase( UniProtTaxonomy.VIRUSES ) ) { + saw_viruses = true; + } + else if ( lineage[ 0 ].equalsIgnoreCase( UniProtTaxonomy.CELLULAR_ORGANISMS ) ) { + saw_cellular_organism = true; + } + if ( saw_cellular_organism && saw_viruses ) { + break; + } + } + } + if ( saw_cellular_organism && saw_viruses ) { + last_common_lineage.add( UniProtTaxonomy.CELLULAR_ORGANISMS ); + last_common = UniProtTaxonomy.CELLULAR_ORGANISMS; + } + else { + String msg = "no common lineage for:\n"; + int counter = 0; + for( final String[] strings : lineages ) { + msg += counter + ": "; + ++counter; + for( final String string : strings ) { + msg += string + " "; + } + msg += "\n"; } - System.out.println(); + throw new AncestralTaxonomyInferenceException( msg ); } - return; } - // if ( !n.getNodeData().isHasTaxonomy() ) { - // n.getNodeData().setTaxonomy( new Taxonomy() ); - // } final Taxonomy tax = new Taxonomy(); n.getNodeData().setTaxonomy( tax ); - tax.setScientificName( last_common_lineage ); - final UniProtTaxonomy up_tax = obtainUniProtTaxonomyFromSn( last_common_lineage ); + tax.setScientificName( last_common ); + final UniProtTaxonomy up_tax = obtainUniProtTaxonomyFromLineage( last_common_lineage ); if ( up_tax != null ) { if ( !ForesterUtil.isEmpty( up_tax.getRank() ) ) { try { @@ -267,6 +305,22 @@ public final class AncestralTaxonomyInference { if ( !ForesterUtil.isEmpty( up_tax.getSynonym() ) && !tax.getSynonyms().contains( up_tax.getSynonym() ) ) { tax.getSynonyms().add( up_tax.getSynonym() ); } + if ( up_tax.getLineage() != null ) { + tax.setLineage( new ArrayList() ); + for( final String lin : up_tax.getLineage() ) { + if ( !ForesterUtil.isEmpty( lin ) ) { + tax.getLineage().add( lin ); + } + } + } + } + if ( ForesterUtil.isEmpty( tax.getLineage() ) ) { + tax.setLineage( new ArrayList() ); + for( final String lin : last_common_lineage ) { + if ( !ForesterUtil.isEmpty( lin ) ) { + tax.getLineage().add( lin ); + } + } } for( final PhylogenyNode desc : descs ) { if ( !desc.isExternal() && desc.getNodeData().isHasTaxonomy() @@ -285,7 +339,7 @@ public final class AncestralTaxonomyInference { synchronized public static SortedSet obtainDetailedTaxonomicInformation( final Phylogeny phy, final boolean delete ) - throws IOException { + throws IOException, AncestralTaxonomyInferenceException { clearCachesIfTooLarge(); final SortedSet not_found = new TreeSet(); List not_found_external_nodes = null; @@ -310,14 +364,14 @@ public final class AncestralTaxonomyInference { not_found_external_nodes.add( node ); } } - UniProtTaxonomy up_tax = null; + UniProtTaxonomy uniprot_tax = null; if ( ( tax != null ) && ( isHasAppropriateId( tax ) || !ForesterUtil.isEmpty( tax.getScientificName() ) || !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) || !ForesterUtil.isEmpty( tax .getCommonName() ) ) ) { - up_tax = obtainUniProtTaxonomy( tax, null, qt ); - if ( up_tax != null ) { - updateTaxonomy( qt, node, tax, up_tax ); + uniprot_tax = obtainUniProtTaxonomy( tax, null, qt ); + if ( uniprot_tax != null ) { + updateTaxonomy( qt, node, tax, uniprot_tax ); } else { not_found.add( tax.toString() ); @@ -328,35 +382,34 @@ public final class AncestralTaxonomyInference { } } if ( delete ) { - for( PhylogenyNode node : not_found_external_nodes ) { - phy.deleteSubtree( node, false ); + for( final PhylogenyNode node : not_found_external_nodes ) { + phy.deleteSubtree( node, true ); } + phy.externalNodesHaveChanged(); + phy.hashIDs(); phy.recalculateNumberOfExternalDescendants( true ); } return not_found; } - // TODO this might not be needed anymore - // synchronized private static String[] obtainLineagePlusOwnScientificName( final UniProtTaxonomy up_tax ) { - // final String[] lineage = up_tax.getLineageAsArray(); - // final String[] lin_plus_self = new String[ lineage.length + 1 ]; - // for( int i = 0; i < lineage.length; ++i ) { - // lin_plus_self[ i ] = lineage[ i ]; - // } - // lin_plus_self[ lineage.length ] = up_tax.getScientificName(); - // return lin_plus_self; - // } - synchronized private static UniProtTaxonomy obtainUniProtTaxonomy( final Taxonomy tax, String query, QUERY_TYPE qt ) - throws IOException { + synchronized public static UniProtTaxonomy obtainUniProtTaxonomy( final Taxonomy tax, Object query, QUERY_TYPE qt ) + throws IOException, AncestralTaxonomyInferenceException { if ( isHasAppropriateId( tax ) ) { query = tax.getIdentifier().getValue(); qt = QUERY_TYPE.ID; return getTaxonomies( getIdTaxCacheMap(), query, qt ); } else if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) { - query = tax.getScientificName(); - qt = QUERY_TYPE.SN; - return getTaxonomies( getSnTaxCacheMap(), query, qt ); + if ( !ForesterUtil.isEmpty( tax.getLineage() ) ) { + query = tax.getLineage(); + qt = QUERY_TYPE.LIN; + return getTaxonomies( getLineageTaxCacheMap(), query, qt ); + } + else { + query = tax.getScientificName(); + qt = QUERY_TYPE.SN; + return getTaxonomies( getSnTaxCacheMap(), query, qt ); + } } else if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) { query = tax.getTaxonomyCode(); @@ -370,16 +423,41 @@ public final class AncestralTaxonomyInference { } } - synchronized private static UniProtTaxonomy obtainUniProtTaxonomyFromSn( final String sn ) throws IOException { + synchronized private static UniProtTaxonomy obtainUniProtTaxonomyFromLineage( final List lineage ) + throws AncestralTaxonomyInferenceException, IOException { + final String lineage_str = ForesterUtil.stringListToString( lineage, ">" ); UniProtTaxonomy up_tax = null; - if ( getSnTaxCacheMap().containsKey( sn ) ) { - up_tax = getSnTaxCacheMap().get( sn ).copy(); + if ( getLineageTaxCacheMap().containsKey( lineage_str ) ) { + up_tax = getLineageTaxCacheMap().get( lineage_str ).copy(); } else { - final List up_taxonomies = getTaxonomiesFromScientificName( sn ); - if ( ( up_taxonomies != null ) && ( up_taxonomies.size() == 1 ) ) { - up_tax = up_taxonomies.get( 0 ); - getSnTaxCacheMap().put( sn, up_tax ); + final List up_taxonomies = getTaxonomiesFromScientificName( lineage + .get( lineage.size() - 1 ) ); + if ( ( up_taxonomies != null ) && ( up_taxonomies.size() > 0 ) ) { + for( final UniProtTaxonomy up_taxonomy : up_taxonomies ) { + boolean match = true; + I: for( int i = 0; i < lineage.size(); ++i ) { + if ( !lineage.get( i ).equalsIgnoreCase( up_taxonomy.getLineage().get( i ) ) ) { + match = false; + break I; + } + } + if ( match ) { + if ( up_tax != null ) { + throw new AncestralTaxonomyInferenceException( "lineage \"" + + ForesterUtil.stringListToString( lineage, " > " ) + "\" is not unique" ); + } + up_tax = up_taxonomy; + } + } + if ( up_tax == null ) { + throw new AncestralTaxonomyInferenceException( "lineage \"" + + ForesterUtil.stringListToString( lineage, " > " ) + "\" not found" ); + } + getLineageTaxCacheMap().put( lineage_str, up_tax ); + if ( !ForesterUtil.isEmpty( up_tax.getScientificName() ) ) { + getSnTaxCacheMap().put( up_tax.getScientificName(), up_tax ); + } if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) { getCodeTaxCacheMap().put( up_tax.getCode(), up_tax ); } @@ -402,9 +480,8 @@ public final class AncestralTaxonomyInference { && ForesterUtil.isEmpty( tax.getScientificName() ) ) { tax.setScientificName( up_tax.getScientificName() ); } - if ( node.isExternal() - && ( ( qt != QUERY_TYPE.CODE ) && !ForesterUtil.isEmpty( up_tax.getCode() ) && ForesterUtil - .isEmpty( tax.getTaxonomyCode() ) ) ) { + if ( node.isExternal() && ( qt != QUERY_TYPE.CODE ) && !ForesterUtil.isEmpty( up_tax.getCode() ) + && ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) { tax.setTaxonomyCode( up_tax.getCode() ); } if ( ( qt != QUERY_TYPE.CN ) && !ForesterUtil.isEmpty( up_tax.getCommonName() ) @@ -422,12 +499,21 @@ public final class AncestralTaxonomyInference { tax.setRank( "" ); } } - if ( ( qt != QUERY_TYPE.ID ) && !ForesterUtil.isEmpty( up_tax.getId() ) && ( tax.getIdentifier() == null ) ) { + if ( ( qt != QUERY_TYPE.ID ) && !ForesterUtil.isEmpty( up_tax.getId() ) + && ( ( tax.getIdentifier() == null ) || ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) ) ) { tax.setIdentifier( new Identifier( up_tax.getId(), "uniprot" ) ); } + if ( up_tax.getLineage() != null ) { + tax.setLineage( new ArrayList() ); + for( final String lin : up_tax.getLineage() ) { + if ( !ForesterUtil.isEmpty( lin ) ) { + tax.getLineage().add( lin ); + } + } + } } private enum QUERY_TYPE { - CODE, SN, CN, ID; + CODE, SN, CN, ID, LIN; } }