in progress
[jalview.git] / forester / java / src / org / forester / analysis / AncestralTaxonomyInference.java
index b9e6b46..dec0a63 100644 (file)
@@ -4,7 +4,7 @@
 // Copyright (C) 2010 Christian M Zmasek
 // Copyright (C) 2010 Sanford-Burnham Medical Research Institute
 // All rights reserved
-// 
+//
 // This library is free software; you can redistribute it and/or
 // modify it under the terms of the GNU Lesser General Public
 // License as published by the Free Software Foundation; either
@@ -14,7 +14,7 @@
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 // Lesser General Public License for more details.
-// 
+//
 // You should have received a copy of the GNU Lesser General Public
 // License along with this library; if not, write to the Free Software
 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
@@ -43,380 +43,410 @@ import org.forester.ws.uniprot.UniProtWsTools;
 
 public final class AncestralTaxonomyInference {
 
-       private static final int MAX_CACHE_SIZE = 100000;
-       private static final int MAX_TAXONOMIES_TO_RETURN = 100;
-       private static final HashMap<String, UniProtTaxonomy> _sn_up_cache_map = new HashMap<String, UniProtTaxonomy>();
-       private static final HashMap<String, UniProtTaxonomy> _code_up_cache_map = new HashMap<String, UniProtTaxonomy>();
-       private static final HashMap<String, UniProtTaxonomy> _cn_up_cache_map = new HashMap<String, UniProtTaxonomy>();
-       private static final HashMap<String, UniProtTaxonomy> _id_up_cache_map = new HashMap<String, UniProtTaxonomy>();
-
-       synchronized private static void clearCachesIfTooLarge() {
-               if (getSnTaxCacheMap().size() > MAX_CACHE_SIZE) {
-                       getSnTaxCacheMap().clear();
-               }
-               if (getCnTaxCacheMap().size() > MAX_CACHE_SIZE) {
-                       getCnTaxCacheMap().clear();
-               }
-               if (getCodeTaxCacheMap().size() > MAX_CACHE_SIZE) {
-                       getCodeTaxCacheMap().clear();
-               }
-               if (getIdTaxCacheMap().size() > MAX_CACHE_SIZE) {
-                       getIdTaxCacheMap().clear();
-               }
-       }
+    private static final int                              MAX_CACHE_SIZE           = 100000;
+    private static final int                              MAX_TAXONOMIES_TO_RETURN = 100;
+    private static final HashMap<String, UniProtTaxonomy> _sn_up_cache_map         = new HashMap<String, UniProtTaxonomy>();
+    private static final HashMap<String, UniProtTaxonomy> _code_up_cache_map       = new HashMap<String, UniProtTaxonomy>();
+    private static final HashMap<String, UniProtTaxonomy> _cn_up_cache_map         = new HashMap<String, UniProtTaxonomy>();
+    private static final HashMap<String, UniProtTaxonomy> _id_up_cache_map         = new HashMap<String, UniProtTaxonomy>();
 
-       synchronized private static HashMap<String, UniProtTaxonomy> getCnTaxCacheMap() {
-               return _cn_up_cache_map;
-       }
+    synchronized private static void clearCachesIfTooLarge() {
+        if ( getSnTaxCacheMap().size() > MAX_CACHE_SIZE ) {
+            getSnTaxCacheMap().clear();
+        }
+        if ( getCnTaxCacheMap().size() > MAX_CACHE_SIZE ) {
+            getCnTaxCacheMap().clear();
+        }
+        if ( getCodeTaxCacheMap().size() > MAX_CACHE_SIZE ) {
+            getCodeTaxCacheMap().clear();
+        }
+        if ( getIdTaxCacheMap().size() > MAX_CACHE_SIZE ) {
+            getIdTaxCacheMap().clear();
+        }
+    }
 
-       synchronized private static HashMap<String, UniProtTaxonomy> getCodeTaxCacheMap() {
-               return _code_up_cache_map;
-       }
+    synchronized private static HashMap<String, UniProtTaxonomy> getCnTaxCacheMap() {
+        return _cn_up_cache_map;
+    }
 
-       synchronized private static HashMap<String, UniProtTaxonomy> getIdTaxCacheMap() {
-               return _id_up_cache_map;
-       }
+    synchronized private static HashMap<String, UniProtTaxonomy> getCodeTaxCacheMap() {
+        return _code_up_cache_map;
+    }
 
-       synchronized private static HashMap<String, UniProtTaxonomy> getSnTaxCacheMap() {
-               return _sn_up_cache_map;
-       }
+    synchronized private static HashMap<String, UniProtTaxonomy> getIdTaxCacheMap() {
+        return _id_up_cache_map;
+    }
 
-       synchronized private static UniProtTaxonomy getTaxonomies(
-                       final HashMap<String, UniProtTaxonomy> cache, final String query,
-                       final QUERY_TYPE qt) throws IOException {
-               if (cache.containsKey(query)) {
-                       return cache.get(query).copy();
-               } else {
-                       List<UniProtTaxonomy> up_taxonomies = null;
-                       switch (qt) {
-                       case ID:
-                               up_taxonomies = getTaxonomiesFromId(query);
-                               break;
-                       case CODE:
-                               up_taxonomies = getTaxonomiesFromTaxonomyCode(query);
-                               break;
-                       case SN:
-                               up_taxonomies = getTaxonomiesFromScientificName(query);
-                               break;
-                       case CN:
-                               up_taxonomies = getTaxonomiesFromCommonName(query);
-                               break;
-                       default:
-                               throw new RuntimeException();
-                       }
-                       if ((up_taxonomies != null) && (up_taxonomies.size() == 1)) {
-                               final UniProtTaxonomy up_tax = up_taxonomies.get(0);
-                               if (!ForesterUtil.isEmpty(up_tax.getScientificName())) {
-                                       getSnTaxCacheMap().put(up_tax.getScientificName(), up_tax);
-                               }
-                               if (!ForesterUtil.isEmpty(up_tax.getCode())) {
-                                       getCodeTaxCacheMap().put(up_tax.getCode(), up_tax);
-                               }
-                               if (!ForesterUtil.isEmpty(up_tax.getCommonName())) {
-                                       getCnTaxCacheMap().put(up_tax.getCommonName(), up_tax);
-                               }
-                               if (!ForesterUtil.isEmpty(up_tax.getId())) {
-                                       getIdTaxCacheMap().put(up_tax.getId(), up_tax);
-                               }
-                               return up_tax;
-                       } else {
-                               return null;
-                       }
-               }
-       }
+    synchronized private static HashMap<String, UniProtTaxonomy> getSnTaxCacheMap() {
+        return _sn_up_cache_map;
+    }
 
-       synchronized private static List<UniProtTaxonomy> getTaxonomiesFromCommonName(
-                       final String query) throws IOException {
-               return UniProtWsTools.getTaxonomiesFromCommonNameStrict(query,
-                               MAX_TAXONOMIES_TO_RETURN);
-       }
+    synchronized private static UniProtTaxonomy getTaxonomies( final HashMap<String, UniProtTaxonomy> cache,
+                                                               final String query,
+                                                               final QUERY_TYPE qt ) throws IOException {
+        if ( cache.containsKey( query ) ) {
+            return cache.get( query ).copy();
+        }
+        else {
+            List<UniProtTaxonomy> up_taxonomies = null;
+            switch ( qt ) {
+                case ID:
+                    up_taxonomies = getTaxonomiesFromId( query );
+                    break;
+                case CODE:
+                    up_taxonomies = getTaxonomiesFromTaxonomyCode( query );
+                    break;
+                case SN:
+                    up_taxonomies = getTaxonomiesFromScientificName( query );
+                    break;
+                case CN:
+                    up_taxonomies = getTaxonomiesFromCommonName( query );
+                    break;
+                default:
+                    throw new RuntimeException();
+            }
+            if ( ( up_taxonomies != null ) && ( up_taxonomies.size() == 1 ) ) {
+                final UniProtTaxonomy up_tax = up_taxonomies.get( 0 );
+                if ( !ForesterUtil.isEmpty( up_tax.getScientificName() ) ) {
+                    getSnTaxCacheMap().put( up_tax.getScientificName(), up_tax );
+                }
+                if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) {
+                    getCodeTaxCacheMap().put( up_tax.getCode(), up_tax );
+                }
+                if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
+                    getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax );
+                }
+                if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
+                    getIdTaxCacheMap().put( up_tax.getId(), up_tax );
+                }
+                return up_tax;
+            }
+            else {
+                return null;
+            }
+        }
+    }
 
-       synchronized private static List<UniProtTaxonomy> getTaxonomiesFromId(
-                       final String query) throws IOException {
-               return UniProtWsTools.getTaxonomiesFromId(query,
-                               MAX_TAXONOMIES_TO_RETURN);
-       }
+    synchronized private static List<UniProtTaxonomy> getTaxonomiesFromCommonName( final String query )
+            throws IOException {
+        return UniProtWsTools.getTaxonomiesFromCommonNameStrict( query, MAX_TAXONOMIES_TO_RETURN );
+    }
 
-       synchronized private static List<UniProtTaxonomy> getTaxonomiesFromScientificName(
-                       final String query) throws IOException {
-               return UniProtWsTools.getTaxonomiesFromScientificNameStrict(query,
-                               MAX_TAXONOMIES_TO_RETURN);
-       }
+    synchronized private static List<UniProtTaxonomy> getTaxonomiesFromId( final String query ) throws IOException {
+        return UniProtWsTools.getTaxonomiesFromId( query, MAX_TAXONOMIES_TO_RETURN );
+    }
 
-       synchronized private static List<UniProtTaxonomy> getTaxonomiesFromTaxonomyCode(
-                       final String query) throws IOException {
-               return UniProtWsTools.getTaxonomiesFromTaxonomyCode(query,
-                               MAX_TAXONOMIES_TO_RETURN);
-       }
+    synchronized private static List<UniProtTaxonomy> getTaxonomiesFromScientificName( final String query )
+            throws IOException {
+        return UniProtWsTools.getTaxonomiesFromScientificNameStrict( query, MAX_TAXONOMIES_TO_RETURN );
+    }
 
-       synchronized public static SortedSet<String> inferTaxonomyFromDescendents(
-                       final Phylogeny phy) throws IOException {
-               clearCachesIfTooLarge();
-               final SortedSet<String> not_found = new TreeSet<String>();
-               for (final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter
-                               .hasNext();) {
-                       final PhylogenyNode node = iter.next();
-                       // final QUERY_TYPE qt = null;
-                       // Taxonomy tax = null;
-                       // if ( node.getNodeData().isHasTaxonomy() ) {
-                       // tax = node.getNodeData().getTaxonomy();
-                       // }
-                       // UniProtTaxonomy up_tax = null;
-                       // if ( ( tax != null )
-                       // && ( isHasAppropriateId( tax ) || !ForesterUtil.isEmpty(
-                       // tax.getScientificName() )
-                       // || !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ||
-                       // !ForesterUtil.isEmpty( tax
-                       // .getCommonName() ) ) ) {
-                       // final String query = null;
-                       // up_tax = obtainUniProtTaxonomy( tax, query, qt );
-                       // if ( up_tax == null ) {
-                       // not_found.add( query );
-                       // }
-                       // else {
-                       // updateTaxonomy( qt, node, tax, up_tax );
-                       // }
-                       // }
-                       if (!node.isExternal()) {
-                               inferTaxonomyFromDescendents(node, not_found);
-                       }
-               }
-               return not_found;
-       }
+    synchronized private static List<UniProtTaxonomy> getTaxonomiesFromTaxonomyCode( final String query )
+            throws IOException {
+        return UniProtWsTools.getTaxonomiesFromTaxonomyCode( query, MAX_TAXONOMIES_TO_RETURN );
+    }
 
-       synchronized private static void inferTaxonomyFromDescendents(
-                       final PhylogenyNode n, final SortedSet<String> not_found)
-                       throws IOException {
-               if (n.isExternal()) {
-                       throw new IllegalArgumentException(
-                                       "attempt to infer taxonomy from descendants of external node");
-               }
-               n.getNodeData().setTaxonomy(null);
-               final List<PhylogenyNode> descs = n.getDescendants();
-               final List<String[]> lineages = new ArrayList<String[]>();
-               int shortest_lin_length = Integer.MAX_VALUE;
-               for (final PhylogenyNode desc : descs) {
-                       if (desc.getNodeData().isHasTaxonomy()
-                                       && (isHasAppropriateId(desc.getNodeData().getTaxonomy())
-                                                       || !ForesterUtil.isEmpty(desc.getNodeData()
-                                                                       .getTaxonomy().getScientificName())
-                                                       || !ForesterUtil.isEmpty(desc.getNodeData()
-                                                                       .getTaxonomy().getTaxonomyCode()) || !ForesterUtil
-                                                       .isEmpty(desc.getNodeData().getTaxonomy()
-                                                                       .getCommonName()))) {
-                               final QUERY_TYPE qt = null;
-                               final String query = null;
-                               final UniProtTaxonomy up_tax = obtainUniProtTaxonomy(desc
-                                               .getNodeData().getTaxonomy(), query, qt);
-                               String[] lineage = null;
-                               if (up_tax != null) {
-                                       lineage = obtainLineagePlusOwnScientificName(up_tax);
-                               }
-                               if ((lineage == null) || (lineage.length < 1)) {
-                                       not_found.add(desc.getNodeData().getTaxonomy().asText()
-                                                       .toString());
-                                       return;
-                               }
-                               if (lineage.length < shortest_lin_length) {
-                                       shortest_lin_length = lineage.length;
-                               }
-                               lineages.add(lineage);
-                       } else {
-                               String msg = "Node(s) with no or inappropriate taxonomic information found";
-                               if (!ForesterUtil.isEmpty(desc.getName())) {
-                                       msg = "Node " + desc.getName()
-                                                       + " has no or inappropriate taxonomic information";
-                               }
-                               throw new IllegalArgumentException(msg);
-                       }
-               }
-               String last_common_lineage = null;
-               if (shortest_lin_length > 0) {
-                       I: for (int i = 0; i < shortest_lin_length; ++i) {
-                               final String lineage_0 = lineages.get(0)[i];
-                               for (int j = 1; j < lineages.size(); ++j) {
-                                       if (!lineage_0.equals(lineages.get(j)[i])) {
-                                               break I;
-                                       }
-                               }
-                               last_common_lineage = lineage_0;
-                       }
-               }
-               if (last_common_lineage == null) {
-                       return;
-               }
-               // if ( !n.getNodeData().isHasTaxonomy() ) {
-               // n.getNodeData().setTaxonomy( new Taxonomy() );
-               // }
-               final Taxonomy tax = new Taxonomy();
-               n.getNodeData().setTaxonomy(tax);
-               tax.setScientificName(last_common_lineage);
-               final UniProtTaxonomy up_tax = obtainUniProtTaxonomyFromSn(last_common_lineage);
-               if (up_tax != null) {
-                       if (!ForesterUtil.isEmpty(up_tax.getRank())) {
-                               try {
-                                       tax.setRank(up_tax.getRank().toLowerCase());
-                               } catch (final PhyloXmlDataFormatException ex) {
-                                       tax.setRank("");
-                               }
-                       }
-                       if (!ForesterUtil.isEmpty(up_tax.getId())) {
-                               tax.setIdentifier(new Identifier(up_tax.getId(), "uniprot"));
-                       }
-                       if (!ForesterUtil.isEmpty(up_tax.getCommonName())) {
-                               tax.setCommonName(up_tax.getCommonName());
-                       }
-                       if (!ForesterUtil.isEmpty(up_tax.getSynonym())
-                                       && !tax.getSynonyms().contains(up_tax.getSynonym())) {
-                               tax.getSynonyms().add(up_tax.getSynonym());
-                       }
-               }
-               for (final PhylogenyNode desc : descs) {
-                       if (!desc.isExternal() && desc.getNodeData().isHasTaxonomy()
-                                       && desc.getNodeData().getTaxonomy().isEqual(tax)) {
-                               desc.getNodeData().setTaxonomy(null);
-                       }
-               }
-       }
+    synchronized public static SortedSet<String> inferTaxonomyFromDescendents( final Phylogeny phy ) throws IOException {
+        clearCachesIfTooLarge();
+        final SortedSet<String> not_found = new TreeSet<String>();
+        for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) {
+            final PhylogenyNode node = iter.next();
+            if ( !node.isExternal() ) {
+                inferTaxonomyFromDescendents( node, not_found );
+            }
+        }
+        return not_found;
+    }
 
-       synchronized private static boolean isHasAppropriateId(final Taxonomy tax) {
-               return ((tax.getIdentifier() != null) && (!ForesterUtil.isEmpty(tax
-                               .getIdentifier().getValue()) && (tax.getIdentifier()
-                               .getProvider().equalsIgnoreCase("ncbi")
-                               || tax.getIdentifier().getProvider()
-                                               .equalsIgnoreCase("uniprot") || tax.getIdentifier()
-                               .getProvider().equalsIgnoreCase("uniprotkb"))));
-       }
+    synchronized private static void inferTaxonomyFromDescendents( final PhylogenyNode n,
+                                                                   final SortedSet<String> not_found )
+            throws IOException {
+        if ( n.isExternal() ) {
+            throw new IllegalArgumentException( "attempt to infer taxonomy from descendants of external node" );
+        }
+        n.getNodeData().setTaxonomy( null );
+        final List<PhylogenyNode> descs = n.getDescendants();
+        final List<String[]> lineages = new ArrayList<String[]>();
+        int shortest_lin_length = Integer.MAX_VALUE;
+        for( final PhylogenyNode desc : descs ) {
+            if ( desc.getNodeData().isHasTaxonomy()
+                    && ( isHasAppropriateId( desc.getNodeData().getTaxonomy() )
+                            || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getScientificName() )
+                            || !ForesterUtil.isEmpty( desc.getNodeData().getTaxonomy().getTaxonomyCode() ) || !ForesterUtil
+                            .isEmpty( desc.getNodeData().getTaxonomy().getCommonName() ) ) ) {
+             
+                final UniProtTaxonomy up_tax = obtainUniProtTaxonomy( desc.getNodeData().getTaxonomy(), null, null );
+                String[] lineage = null;
+                if ( up_tax != null ) {
+                    //lineage = obtainLineagePlusOwnScientificName( up_tax );
+                    lineage = up_tax.getLineageAsArray();
+                }
+                if ( ( lineage == null ) || ( lineage.length < 1 ) ) {
+                    //TODO remove me
+                    System.out.println( "node " + desc.getNodeData().getTaxonomy().toString() + " has no lineage!" );
+                    not_found.add( desc.getNodeData().getTaxonomy().asText().toString() );
+                    return;
+                }
+                if ( lineage.length < shortest_lin_length ) {
+                    shortest_lin_length = lineage.length;
+                }
+                lineages.add( lineage );
+            }
+            else {
+                String msg = "Node(s) with no or inappropriate taxonomic information found";
+                String node = "";
+                if ( !ForesterUtil.isEmpty( desc.getName() ) ) {
+                    node = "\"" + desc.getName() + "\"";
+                }
+                else {
+                    node = "[" + desc.getId() + "]";
+                }
+                msg = "Node " + node + " has no or inappropriate taxonomic information";
+             //   final List<PhylogenyNode> e = desc.getAllExternalDescendants();
+                //TODO remove me!
+//                System.out.println();
+//                int x = 0;
+//                for( final PhylogenyNode object : e ) {
+//                    System.out.println( x + ":" );
+//                    System.out.println( object.getName() + "  " );
+//                    x++;
+//                }
+//                System.out.println();
+                //
+                throw new IllegalArgumentException( msg );
+            }
+        }
+        String last_common_lineage = null;
+        if ( shortest_lin_length > 0 ) {
+            I: for( int i = 0; i < shortest_lin_length; ++i ) {
+                final String lineage_0 = lineages.get( 0 )[ i ];
+                for( int j = 1; j < lineages.size(); ++j ) {
+                    if ( !lineage_0.equals( lineages.get( j )[ i ] ) ) {
+                        break I;
+                    }
+                }
+                last_common_lineage = lineage_0;
+            }
+        }
+        if ( last_common_lineage == null ) {
+            System.out.println( "No common lineage for:" );
+            int counter = 0;
+            for( final String[] strings : lineages ) {
+                System.out.print( counter + ": " );
+                ++counter;
+                for( final String string : strings ) {
+                    System.out.print( string + " " );
+                }
+                System.out.println();
+            }
+            return;
+        }
+        final Taxonomy tax = new Taxonomy();
+        n.getNodeData().setTaxonomy( tax );
+        tax.setScientificName( last_common_lineage );
+        final UniProtTaxonomy up_tax = obtainUniProtTaxonomyFromSn( last_common_lineage, lineage );
+        if ( up_tax != null ) {
+            if ( !ForesterUtil.isEmpty( up_tax.getRank() ) ) {
+                try {
+                    tax.setRank( up_tax.getRank().toLowerCase() );
+                }
+                catch ( final PhyloXmlDataFormatException ex ) {
+                    tax.setRank( "" );
+                }
+            }
+            if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
+                tax.setIdentifier( new Identifier( up_tax.getId(), "uniprot" ) );
+            }
+            if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
+                tax.setCommonName( up_tax.getCommonName() );
+            }
+            if ( !ForesterUtil.isEmpty( up_tax.getSynonym() ) && !tax.getSynonyms().contains( up_tax.getSynonym() ) ) {
+                tax.getSynonyms().add( up_tax.getSynonym() );
+            }
+            if ( up_tax.getLineage() != null ) {
+                tax.setLineage( new ArrayList<String>() );
+                for( final String lin : up_tax.getLineage() ) {
+                    if ( !ForesterUtil.isEmpty( lin ) ) {
+                        tax.getLineage().add( lin );
+                    }
+                }
+            }
+            
+        }
+        for( final PhylogenyNode desc : descs ) {
+            if ( !desc.isExternal() && desc.getNodeData().isHasTaxonomy()
+                    && desc.getNodeData().getTaxonomy().isEqual( tax ) ) {
+                desc.getNodeData().setTaxonomy( null );
+            }
+        }
+    }
 
-       synchronized public static SortedSet<String> obtainDetailedTaxonomicInformation(
-                       final Phylogeny phy) throws IOException {
-               clearCachesIfTooLarge();
-               final SortedSet<String> not_found = new TreeSet<String>();
-               for (final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter
-                               .hasNext();) {
-                       final PhylogenyNode node = iter.next();
-                       final QUERY_TYPE qt = null;
-                       Taxonomy tax = null;
-                       if (node.getNodeData().isHasTaxonomy()) {
-                               tax = node.getNodeData().getTaxonomy();
-                       } else if (node.isExternal()) {
-                               if (!ForesterUtil.isEmpty(node.getName())) {
-                                       not_found.add(node.getName());
-                               } else {
-                                       not_found.add(node.toString());
-                               }
-                       }
-                       UniProtTaxonomy up_tax = null;
-                       if ((tax != null)
-                                       && (isHasAppropriateId(tax)
-                                                       || !ForesterUtil.isEmpty(tax.getScientificName())
-                                                       || !ForesterUtil.isEmpty(tax.getTaxonomyCode()) || !ForesterUtil
-                                                       .isEmpty(tax.getCommonName()))) {
-                               up_tax = obtainUniProtTaxonomy(tax, null, qt);
-                               if (up_tax != null) {
-                                       updateTaxonomy(qt, node, tax, up_tax);
-                               } else {
-                                       not_found.add(tax.toString());
-                               }
-                       }
-               }
-               return not_found;
-       }
+    synchronized private static boolean isHasAppropriateId( final Taxonomy tax ) {
+        return ( ( tax.getIdentifier() != null ) && ( !ForesterUtil.isEmpty( tax.getIdentifier().getValue() ) && ( tax
+                .getIdentifier().getProvider().equalsIgnoreCase( "ncbi" )
+                || tax.getIdentifier().getProvider().equalsIgnoreCase( "uniprot" ) || tax.getIdentifier().getProvider()
+                .equalsIgnoreCase( "uniprotkb" ) ) ) );
+    }
 
-       synchronized private static String[] obtainLineagePlusOwnScientificName(
-                       final UniProtTaxonomy up_tax) {
-               final String[] lineage = up_tax.getLineage();
-               final String[] lin_plus_self = new String[lineage.length + 1];
-               for (int i = 0; i < lineage.length; ++i) {
-                       lin_plus_self[i] = lineage[i];
-               }
-               lin_plus_self[lineage.length] = up_tax.getScientificName();
-               return lin_plus_self;
-       }
+    synchronized public static SortedSet<String> obtainDetailedTaxonomicInformation( final Phylogeny phy,
+                                                                                     final boolean delete )
+            throws IOException {
+        clearCachesIfTooLarge();
+        final SortedSet<String> not_found = new TreeSet<String>();
+        List<PhylogenyNode> not_found_external_nodes = null;
+        if ( delete ) {
+            not_found_external_nodes = new ArrayList<PhylogenyNode>();
+        }
+        for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) {
+            final PhylogenyNode node = iter.next();
+            final QUERY_TYPE qt = null;
+            Taxonomy tax = null;
+            if ( node.getNodeData().isHasTaxonomy() ) {
+                tax = node.getNodeData().getTaxonomy();
+            }
+            else if ( node.isExternal() ) {
+                if ( !ForesterUtil.isEmpty( node.getName() ) ) {
+                    not_found.add( node.getName() );
+                }
+                else {
+                    not_found.add( node.toString() );
+                }
+                if ( delete ) {
+                    not_found_external_nodes.add( node );
+                }
+            }
+            UniProtTaxonomy uniprot_tax = null;
+            if ( ( tax != null )
+                    && ( isHasAppropriateId( tax ) || !ForesterUtil.isEmpty( tax.getScientificName() )
+                            || !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) || !ForesterUtil.isEmpty( tax
+                            .getCommonName() ) ) ) {
+                uniprot_tax = obtainUniProtTaxonomy( tax, null, qt );
+                if ( uniprot_tax != null ) {
+                    updateTaxonomy( qt, node, tax, uniprot_tax );
+                }
+                else {
+                    not_found.add( tax.toString() );
+                    if ( delete && node.isExternal() ) {
+                        not_found_external_nodes.add( node );
+                    }
+                }
+            }
+        }
+        if ( delete ) {
+            for( final PhylogenyNode node : not_found_external_nodes ) {
+                phy.deleteSubtree( node, true );
+            }
+            phy.externalNodesHaveChanged();
+            phy.hashIDs();
+            phy.recalculateNumberOfExternalDescendants( true );
+        }
+        return not_found;
+    }
 
-       synchronized private static UniProtTaxonomy obtainUniProtTaxonomy(
-                       final Taxonomy tax, String query, QUERY_TYPE qt) throws IOException {
-               if (isHasAppropriateId(tax)) {
-                       query = tax.getIdentifier().getValue();
-                       qt = QUERY_TYPE.ID;
-                       return getTaxonomies(getIdTaxCacheMap(), query, qt);
-               } else if (!ForesterUtil.isEmpty(tax.getScientificName())) {
-                       query = tax.getScientificName();
-                       qt = QUERY_TYPE.SN;
-                       return getTaxonomies(getSnTaxCacheMap(), query, qt);
-               } else if (!ForesterUtil.isEmpty(tax.getTaxonomyCode())) {
-                       query = tax.getTaxonomyCode();
-                       qt = QUERY_TYPE.CODE;
-                       return getTaxonomies(getCodeTaxCacheMap(), query, qt);
-               } else {
-                       query = tax.getCommonName();
-                       qt = QUERY_TYPE.CN;
-                       return getTaxonomies(getCnTaxCacheMap(), query, qt);
-               }
-       }
+    // TODO this might not be needed anymore
+    //  synchronized private static String[] obtainLineagePlusOwnScientificName( final UniProtTaxonomy up_tax ) {
+    //      final String[] lineage = up_tax.getLineageAsArray();
+    //      final String[] lin_plus_self = new String[ lineage.length + 1 ];
+    //      for( int i = 0; i < lineage.length; ++i ) {
+    //          lin_plus_self[ i ] = lineage[ i ];
+    //      }
+    //      lin_plus_self[ lineage.length ] = up_tax.getScientificName();
+    //      return lin_plus_self;
+    //  }
+    synchronized private static UniProtTaxonomy obtainUniProtTaxonomy( final Taxonomy tax, String query, QUERY_TYPE qt )
+            throws IOException {
+        if ( isHasAppropriateId( tax ) ) {
+            query = tax.getIdentifier().getValue();
+            qt = QUERY_TYPE.ID;
+            System.out.println( "query by id: " + query);
+            return getTaxonomies( getIdTaxCacheMap(), query, qt );
+        }
+        else if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) {
+            query = tax.getScientificName();
+            qt = QUERY_TYPE.SN;
+            System.out.println( "query by sn: " + query);
+            return getTaxonomies( getSnTaxCacheMap(), query, qt );
+        }
+        else if ( !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
+            query = tax.getTaxonomyCode();
+            qt = QUERY_TYPE.CODE;
+            return getTaxonomies( getCodeTaxCacheMap(), query, qt );
+        }
+        else {
+            query = tax.getCommonName();
+            qt = QUERY_TYPE.CN;
+            return getTaxonomies( getCnTaxCacheMap(), query, qt );
+        }
+    }
 
-       synchronized private static UniProtTaxonomy obtainUniProtTaxonomyFromSn(
-                       final String sn) throws IOException {
-               UniProtTaxonomy up_tax = null;
-               if (getSnTaxCacheMap().containsKey(sn)) {
-                       up_tax = getSnTaxCacheMap().get(sn).copy();
-               } else {
-                       final List<UniProtTaxonomy> up_taxonomies = getTaxonomiesFromScientificName(sn);
-                       if ((up_taxonomies != null) && (up_taxonomies.size() == 1)) {
-                               up_tax = up_taxonomies.get(0);
-                               getSnTaxCacheMap().put(sn, up_tax);
-                               if (!ForesterUtil.isEmpty(up_tax.getCode())) {
-                                       getCodeTaxCacheMap().put(up_tax.getCode(), up_tax);
-                               }
-                               if (!ForesterUtil.isEmpty(up_tax.getCommonName())) {
-                                       getCnTaxCacheMap().put(up_tax.getCommonName(), up_tax);
-                               }
-                               if (!ForesterUtil.isEmpty(up_tax.getId())) {
-                                       getIdTaxCacheMap().put(up_tax.getId(), up_tax);
-                               }
-                       }
-               }
-               return up_tax;
-       }
+    synchronized private static UniProtTaxonomy obtainUniProtTaxonomyFromSn( final String sn, List<String> lineage ) throws IOException {
+        UniProtTaxonomy up_tax = null;
+        if ( getSnTaxCacheMap().containsKey( sn ) ) {
+            up_tax = getSnTaxCacheMap().get( sn ).copy();
+        }
+        else {
+            final List<UniProtTaxonomy> up_taxonomies = getTaxonomiesFromScientificName( sn );
+            if ( ( up_taxonomies != null ) && ( up_taxonomies.size() == 1 ) ) {
+                up_tax = up_taxonomies.get( 0 );
+                getSnTaxCacheMap().put( sn, up_tax );
+                if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) {
+                    getCodeTaxCacheMap().put( up_tax.getCode(), up_tax );
+                }
+                if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
+                    getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax );
+                }
+                if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
+                    getIdTaxCacheMap().put( up_tax.getId(), up_tax );
+                }
+                
+            }
+        }
+        return up_tax;
+    }
 
-       synchronized private static void updateTaxonomy(final QUERY_TYPE qt,
-                       final PhylogenyNode node, final Taxonomy tax,
-                       final UniProtTaxonomy up_tax) {
-               if ((qt != QUERY_TYPE.SN)
-                               && !ForesterUtil.isEmpty(up_tax.getScientificName())
-                               && ForesterUtil.isEmpty(tax.getScientificName())) {
-                       tax.setScientificName(up_tax.getScientificName());
-               }
-               if (node.isExternal()
-                               && ((qt != QUERY_TYPE.CODE)
-                                               && !ForesterUtil.isEmpty(up_tax.getCode()) && ForesterUtil
-                                               .isEmpty(tax.getTaxonomyCode()))) {
-                       tax.setTaxonomyCode(up_tax.getCode());
-               }
-               if ((qt != QUERY_TYPE.CN)
-                               && !ForesterUtil.isEmpty(up_tax.getCommonName())
-                               && ForesterUtil.isEmpty(tax.getCommonName())) {
-                       tax.setCommonName(up_tax.getCommonName());
-               }
-               if (!ForesterUtil.isEmpty(up_tax.getSynonym())
-                               && !tax.getSynonyms().contains(up_tax.getSynonym())) {
-                       tax.getSynonyms().add(up_tax.getSynonym());
-               }
-               if (!ForesterUtil.isEmpty(up_tax.getRank())
-                               && ForesterUtil.isEmpty(tax.getRank())) {
-                       try {
-                               tax.setRank(up_tax.getRank().toLowerCase());
-                       } catch (final PhyloXmlDataFormatException ex) {
-                               tax.setRank("");
-                       }
-               }
-               if ((qt != QUERY_TYPE.ID) && !ForesterUtil.isEmpty(up_tax.getId())
-                               && (tax.getIdentifier() == null)) {
-                       tax.setIdentifier(new Identifier(up_tax.getId(), "uniprot"));
-               }
-       }
+    synchronized private static void updateTaxonomy( final QUERY_TYPE qt,
+                                                     final PhylogenyNode node,
+                                                     final Taxonomy tax,
+                                                     final UniProtTaxonomy up_tax ) {
+        if ( ( qt != QUERY_TYPE.SN ) && !ForesterUtil.isEmpty( up_tax.getScientificName() )
+                && ForesterUtil.isEmpty( tax.getScientificName() ) ) {
+            tax.setScientificName( up_tax.getScientificName() );
+        }
+        //  if ( node.isExternal()
+        if ( ( qt != QUERY_TYPE.CODE ) && !ForesterUtil.isEmpty( up_tax.getCode() )
+                && ForesterUtil.isEmpty( tax.getTaxonomyCode() ) ) {
+            tax.setTaxonomyCode( up_tax.getCode() );
+        }
+        if ( ( qt != QUERY_TYPE.CN ) && !ForesterUtil.isEmpty( up_tax.getCommonName() )
+                && ForesterUtil.isEmpty( tax.getCommonName() ) ) {
+            tax.setCommonName( up_tax.getCommonName() );
+        }
+        if ( !ForesterUtil.isEmpty( up_tax.getSynonym() ) && !tax.getSynonyms().contains( up_tax.getSynonym() ) ) {
+            tax.getSynonyms().add( up_tax.getSynonym() );
+        }
+        if ( !ForesterUtil.isEmpty( up_tax.getRank() ) && ForesterUtil.isEmpty( tax.getRank() ) ) {
+            try {
+                tax.setRank( up_tax.getRank().toLowerCase() );
+            }
+            catch ( final PhyloXmlDataFormatException ex ) {
+                tax.setRank( "" );
+            }
+        }
+        if ( ( qt != QUERY_TYPE.ID ) && !ForesterUtil.isEmpty( up_tax.getId() ) && ( tax.getIdentifier() == null ) ) {
+            tax.setIdentifier( new Identifier( up_tax.getId(), "uniprot" ) );
+        }
+        if ( up_tax.getLineage() != null ) {
+            tax.setLineage( new ArrayList<String>() );
+            for( final String lin : up_tax.getLineage() ) {
+                if ( !ForesterUtil.isEmpty( lin ) ) {
+                    tax.getLineage().add( lin );
+                }
+            }
+        }
+        
+    }
 
-       private enum QUERY_TYPE {
-               CODE, SN, CN, ID;
-       }
+    private enum QUERY_TYPE {
+        CODE, SN, CN, ID;
+    }
 }