JAL-2797 added constructor including embedded/standalone boolean
[jalview.git] / forester / java / src / org / forester / analysis / TaxonomyDataManager.java
index cddaa4f..f99dc7f 100644 (file)
@@ -33,6 +33,7 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.SortedSet;
 import java.util.TreeSet;
+import java.util.regex.Matcher;
 
 import javax.swing.JOptionPane;
 
@@ -40,13 +41,16 @@ import org.forester.archaeopteryx.MainFrameApplication;
 import org.forester.archaeopteryx.TreePanel;
 import org.forester.archaeopteryx.tools.AncestralTaxonomyInferrer;
 import org.forester.archaeopteryx.tools.RunnableProcess;
+import org.forester.io.parsers.nhx.NHXParser;
 import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
+import org.forester.io.parsers.util.ParserUtils;
 import org.forester.phylogeny.Phylogeny;
 import org.forester.phylogeny.PhylogenyNode;
 import org.forester.phylogeny.data.Identifier;
 import org.forester.phylogeny.data.Taxonomy;
 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
 import org.forester.util.ForesterUtil;
+import org.forester.util.TaxonomyUtil;
 import org.forester.ws.seqdb.SequenceDbWsTools;
 import org.forester.ws.seqdb.UniProtTaxonomy;
 
@@ -56,7 +60,7 @@ public final class TaxonomyDataManager extends RunnableProcess {
         CODE, SN, CN, ID, LIN;
     }
     private static final int                              MAX_CACHE_SIZE           = 100000;
-    private static final int                              MAX_TAXONOMIES_TO_RETURN = 10;
+    private static final int                              MAX_TAXONOMIES_TO_RETURN = 2000;
     private static final HashMap<String, UniProtTaxonomy> _sn_up_cache_map         = new HashMap<String, UniProtTaxonomy>();
     private static final HashMap<String, UniProtTaxonomy> _lineage_up_cache_map    = new HashMap<String, UniProtTaxonomy>();
     private static final HashMap<String, UniProtTaxonomy> _code_up_cache_map       = new HashMap<String, UniProtTaxonomy>();
@@ -126,10 +130,12 @@ public final class TaxonomyDataManager extends RunnableProcess {
         return _sn_up_cache_map;
     }
 
+    
+    @SuppressWarnings("unchecked")
     private final static UniProtTaxonomy obtainTaxonomy( final HashMap<String, UniProtTaxonomy> cache,
                                                          final Object query,
                                                          final QUERY_TYPE qt ) throws IOException,
-            AncestralTaxonomyInferenceException {
+                                                         AncestralTaxonomyInferenceException {
         if ( cache.containsKey( query ) ) {
             return cache.get( query ).copy();
         }
@@ -150,6 +156,7 @@ public final class TaxonomyDataManager extends RunnableProcess {
                     break;
                 case LIN:
                     return obtainUniProtTaxonomyFromLineage( ( List<String> ) query );
+                    
                 default:
                     throw new RuntimeException();
             }
@@ -184,10 +191,24 @@ public final class TaxonomyDataManager extends RunnableProcess {
     }
 
     private final static List<UniProtTaxonomy> getTaxonomiesFromScientificName( final String query ) throws IOException {
+        if ( query.equalsIgnoreCase( UniProtTaxonomy.BACTERIA ) || query.equalsIgnoreCase( UniProtTaxonomy.ARCHAEA )
+                || query.equalsIgnoreCase( UniProtTaxonomy.VIRUSES )
+                || query.equalsIgnoreCase( UniProtTaxonomy.EUKARYOTA ) || query.equalsIgnoreCase( UniProtTaxonomy.X ) ) { // case-insensitive match against five UniProtTaxonomy name constants
+            final List<UniProtTaxonomy> l = new ArrayList<UniProtTaxonomy>();
+            l.add( UniProtTaxonomy.createSpecialFromScientificName( query ) ); // single entry, created from the query string itself
+            return l; // this branch returns without calling SequenceDbWsTools
+        }
         return SequenceDbWsTools.getTaxonomiesFromScientificNameStrict( query, MAX_TAXONOMIES_TO_RETURN );
     }
 
     private final static List<UniProtTaxonomy> getTaxonomiesFromTaxonomyCode( final String query ) throws IOException {
+        // FIXME: "SPHAR" is hard-coded into the condition below as a temporary special case; fix the underlying issue and remove it.
+        if ( ( ( query.indexOf( "XX" ) == 3 ) && TaxonomyUtil.isHasTaxIdFromFakeTaxCode( query ) )
+                || query.equals( "SPHAR" ) /* TODO remove me, is same as Sphingomonas aromaticivorans */
+                ) { // taken when "XX" first occurs at index 3 and isHasTaxIdFromFakeTaxCode is true, or when query is exactly "SPHAR"
+            final int id = TaxonomyUtil.getTaxIdFromFakeTaxCode( query ); // NOTE(review): for "SPHAR" this is reached without the isHasTaxIdFromFakeTaxCode check -- confirm the helper handles it
+            return SequenceDbWsTools.getTaxonomiesFromId( String.valueOf( id ), MAX_TAXONOMIES_TO_RETURN ); // look up by numeric id instead of by taxonomy code
+        }
         return SequenceDbWsTools.getTaxonomiesFromTaxonomyCode( query, MAX_TAXONOMIES_TO_RETURN );
     }
 
@@ -201,7 +222,7 @@ public final class TaxonomyDataManager extends RunnableProcess {
     synchronized final private static SortedSet<String> obtainDetailedTaxonomicInformation( final Phylogeny phy,
                                                                                             final boolean delete,
                                                                                             final boolean allow_to_use_basic_node_names )
-            throws IOException, AncestralTaxonomyInferenceException {
+                                                                                                    throws IOException, AncestralTaxonomyInferenceException {
         clearCachesIfTooLarge();
         final SortedSet<String> not_found = new TreeSet<String>();
         List<PhylogenyNode> not_found_external_nodes = null;
@@ -233,7 +254,10 @@ public final class TaxonomyDataManager extends RunnableProcess {
             if ( ( ( tax != null ) && ( isHasAppropriateId( tax ) || !ForesterUtil.isEmpty( tax.getScientificName() )
                     || !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) || !ForesterUtil.isEmpty( tax.getCommonName() ) ) )
                     || ( allow_to_use_basic_node_names && !ForesterUtil.isEmpty( node.getName() ) ) ) {
-                if ( tax != null ) {
+                if ( ( ( tax != null ) && ( isHasAppropriateId( tax )
+                        || !ForesterUtil.isEmpty( tax.getScientificName() )
+                        || !ForesterUtil.isEmpty( tax.getTaxonomyCode() ) || !ForesterUtil
+                        .isEmpty( tax.getCommonName() ) ) ) ) {
                     uniprot_tax = obtainUniProtTaxonomy( tax, null, qt );
                 }
                 else {
@@ -243,7 +267,6 @@ public final class TaxonomyDataManager extends RunnableProcess {
                     if ( tax == null ) {
                         tax = new Taxonomy();
                         node.getNodeData().addTaxonomy( tax );
-                        node.setName( "" );
                     }
                     updateTaxonomy( qt, node, tax, uniprot_tax );
                 }
@@ -310,15 +333,39 @@ public final class TaxonomyDataManager extends RunnableProcess {
         if ( ForesterUtil.isEmpty( simple_name ) ) {
             throw new IllegalArgumentException( "illegal attempt to use empty simple name" );
         }
-        qt = QUERY_TYPE.SN;
-        UniProtTaxonomy ut = obtainTaxonomy( TaxonomyDataManager.getSnTaxCacheMap(), simple_name, qt );
-        if ( ut == null ) {
+        UniProtTaxonomy ut = null;
+        final String code = ParserUtils.extractTaxonomyCodeFromNodeName( simple_name,
+                                                                         NHXParser.TAXONOMY_EXTRACTION.AGGRESSIVE );
+        if ( !ForesterUtil.isEmpty( code ) ) {
             qt = QUERY_TYPE.CODE;
-            ut = obtainTaxonomy( TaxonomyDataManager.getCodeTaxCacheMap(), simple_name, qt );
+            ut = obtainTaxonomy( TaxonomyDataManager.getCodeTaxCacheMap(), code, qt );
         }
         if ( ut == null ) {
-            qt = QUERY_TYPE.CN;
-            ut = obtainTaxonomy( TaxonomyDataManager.getCnTaxCacheMap(), simple_name, qt );
+            final String sn = ParserUtils.extractScientificNameFromNodeName( simple_name );
+            if ( !ForesterUtil.isEmpty( sn ) ) {
+                qt = QUERY_TYPE.SN;
+                ut = obtainTaxonomy( TaxonomyDataManager.getSnTaxCacheMap(), sn, qt );
+            }
+        }
+        if ( ut == null ) {
+            final String id = ParserUtils
+                    .extractUniprotTaxonomyIdFromNodeName( simple_name,
+                                                           NHXParser.TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED );
+            if ( !ForesterUtil.isEmpty( id ) ) {
+                qt = QUERY_TYPE.ID;
+                ut = obtainTaxonomy( TaxonomyDataManager.getIdTaxCacheMap(), id, qt );
+            }
+        }
+        if ( ut == null ) {
+            String sn = "";
+            final Matcher m = ParserUtils.TAXOMONY_SN_PATTERN_GENUS.matcher( simple_name );
+            if ( m.matches() ) {
+                sn = m.group( 1 );
+            }
+            if ( !ForesterUtil.isEmpty( sn ) ) {
+                qt = QUERY_TYPE.SN;
+                ut = obtainTaxonomy( TaxonomyDataManager.getSnTaxCacheMap(), sn, qt );
+            }
         }
         return ut;
     }
@@ -326,13 +373,13 @@ public final class TaxonomyDataManager extends RunnableProcess {
     static final UniProtTaxonomy obtainUniProtTaxonomyFromLineage( final List<String> lineage )
             throws AncestralTaxonomyInferenceException, IOException {
         final String lineage_str = ForesterUtil.stringListToString( lineage, ">" );
-        UniProtTaxonomy up_tax = null;
         if ( TaxonomyDataManager.getLineageTaxCacheMap().containsKey( lineage_str ) ) {
-            up_tax = TaxonomyDataManager.getLineageTaxCacheMap().get( lineage_str ).copy();
+            return TaxonomyDataManager.getLineageTaxCacheMap().get( lineage_str ).copy();
         }
         else {
+            final List<UniProtTaxonomy> matching_taxonomies = new ArrayList<UniProtTaxonomy>();
             final List<UniProtTaxonomy> up_taxonomies = getTaxonomiesFromScientificName( lineage
-                    .get( lineage.size() - 1 ) );
+                                                                                         .get( lineage.size() - 1 ) );
             if ( ( up_taxonomies != null ) && ( up_taxonomies.size() > 0 ) ) {
                 for( final UniProtTaxonomy up_taxonomy : up_taxonomies ) {
                     boolean match = true;
@@ -344,41 +391,53 @@ public final class TaxonomyDataManager extends RunnableProcess {
                         }
                     }
                     if ( match ) {
-                        if ( up_tax != null ) {
-                            //TODO this is dead code?!
-                            throw new AncestralTaxonomyInferenceException( "lineage \""
-                                    + ForesterUtil.stringListToString( lineage, " > " ) + "\" is not unique" );
-                        }
-                        up_tax = up_taxonomy;
+                        matching_taxonomies.add( up_taxonomy );
                     }
                 }
-                if ( up_tax == null ) {
+                if ( matching_taxonomies.isEmpty() ) {
                     throw new AncestralTaxonomyInferenceException( "lineage \""
                             + ForesterUtil.stringListToString( lineage, " > " ) + "\" not found" );
                 }
-                TaxonomyDataManager.getLineageTaxCacheMap().put( lineage_str, up_tax );
-                if ( !ForesterUtil.isEmpty( up_tax.getScientificName() ) ) {
-                    TaxonomyDataManager.getSnTaxCacheMap().put( up_tax.getScientificName(), up_tax );
+                // In case of more than one match (e.g. "Xenopus" genus and subgenus), keep the one with the shortest, least specific lineage:
+                int shortest = Integer.MAX_VALUE;
+                UniProtTaxonomy least_specific_up_tax = null;
+                for( final UniProtTaxonomy m : matching_taxonomies ) {
+                    final int s = m.getLineage().size();
+                    if ( s < shortest ) {
+                        shortest = s;
+                        least_specific_up_tax = m;
+                    }
                 }
-                if ( !ForesterUtil.isEmpty( up_tax.getCode() ) ) {
-                    TaxonomyDataManager.getCodeTaxCacheMap().put( up_tax.getCode(), up_tax );
+                TaxonomyDataManager.getLineageTaxCacheMap().put( lineage_str, least_specific_up_tax );
+                if ( !ForesterUtil.isEmpty( least_specific_up_tax.getScientificName() ) ) {
+                    TaxonomyDataManager.getSnTaxCacheMap().put( least_specific_up_tax.getScientificName(),
+                                                                least_specific_up_tax );
                 }
-                if ( !ForesterUtil.isEmpty( up_tax.getCommonName() ) ) {
-                    TaxonomyDataManager.getCnTaxCacheMap().put( up_tax.getCommonName(), up_tax );
+                if ( !ForesterUtil.isEmpty( least_specific_up_tax.getCode() ) ) {
+                    TaxonomyDataManager.getCodeTaxCacheMap().put( least_specific_up_tax.getCode(),
+                                                                  least_specific_up_tax );
                 }
-                if ( !ForesterUtil.isEmpty( up_tax.getId() ) ) {
-                    TaxonomyDataManager.getIdTaxCacheMap().put( up_tax.getId(), up_tax );
+                if ( !ForesterUtil.isEmpty( least_specific_up_tax.getCommonName() ) ) {
+                    TaxonomyDataManager.getCnTaxCacheMap().put( least_specific_up_tax.getCommonName(),
+                                                                least_specific_up_tax );
                 }
+                if ( !ForesterUtil.isEmpty( least_specific_up_tax.getId() ) ) {
+                    TaxonomyDataManager.getIdTaxCacheMap().put( least_specific_up_tax.getId(), least_specific_up_tax );
+                }
+                return least_specific_up_tax;
+            }
+            else {
+                throw new AncestralTaxonomyInferenceException( "taxonomy \"" + ( lineage.get( lineage.size() - 1 ) )
+                                                               + "\" not found" );
             }
         }
-        return up_tax;
     }
 
     synchronized final private static void updateTaxonomy( final QUERY_TYPE qt,
                                                            final PhylogenyNode node,
                                                            final Taxonomy tax,
                                                            final UniProtTaxonomy up_tax )
-            throws PhyloXmlDataFormatException {
+                                                                   throws PhyloXmlDataFormatException {
         if ( ( qt != QUERY_TYPE.SN ) && !ForesterUtil.isEmpty( up_tax.getScientificName() )
                 && ForesterUtil.isEmpty( tax.getScientificName() ) ) {
             tax.setScientificName( up_tax.getScientificName() );
@@ -423,7 +482,7 @@ public final class TaxonomyDataManager extends RunnableProcess {
             not_found = obtainDetailedTaxonomicInformation( _phy, _delete, _allow_simple_names );
         }
         catch ( final UnknownHostException e ) {
-            JOptionPane.showMessageDialog( _mf,
+            JOptionPane.showMessageDialog( _mf.getThisFrame(),
                                            "Could not connect to \"" + getBaseUrl() + "\"",
                                            "Network error during taxonomic information gathering",
                                            JOptionPane.ERROR_MESSAGE );
@@ -431,7 +490,7 @@ public final class TaxonomyDataManager extends RunnableProcess {
         }
         catch ( final IOException e ) {
             e.printStackTrace();
-            JOptionPane.showMessageDialog( _mf,
+            JOptionPane.showMessageDialog( _mf.getThisFrame(),
                                            e.toString(),
                                            "Failed to obtain taxonomic information",
                                            JOptionPane.ERROR_MESSAGE );
@@ -439,7 +498,7 @@ public final class TaxonomyDataManager extends RunnableProcess {
         }
         catch ( final AncestralTaxonomyInferenceException e ) {
             e.printStackTrace();
-            JOptionPane.showMessageDialog( _mf,
+            JOptionPane.showMessageDialog( _mf.getThisFrame(),
                                            e.toString(),
                                            "Failed to obtain taxonomic information",
                                            JOptionPane.ERROR_MESSAGE );
@@ -450,13 +509,13 @@ public final class TaxonomyDataManager extends RunnableProcess {
         }
         if ( ( _phy == null ) || _phy.isEmpty() ) {
             try {
-                JOptionPane.showMessageDialog( _mf,
+                JOptionPane.showMessageDialog( _mf.getThisFrame(),
                                                "None of the external node taxonomies could be resolved",
                                                "Taxonomy Tool Failed",
                                                JOptionPane.WARNING_MESSAGE );
             }
             catch ( final Exception e ) {
-                // Not important if this fails, do nothing. 
+                // Not important if this fails, do nothing.
             }
             return;
         }
@@ -502,18 +561,18 @@ public final class TaxonomyDataManager extends RunnableProcess {
                 sb.append( "..." );
             }
             try {
-                JOptionPane.showMessageDialog( _mf,
+                JOptionPane.showMessageDialog( _mf.getThisFrame(),
                                                sb.toString(),
                                                "Taxonomy Tool Completed",
                                                JOptionPane.WARNING_MESSAGE );
             }
             catch ( final Exception e ) {
-                // Not important if this fails, do nothing. 
+                // Not important if this fails, do nothing.
             }
         }
         else {
             try {
-                JOptionPane.showMessageDialog( _mf,
+                JOptionPane.showMessageDialog( _mf.getThisFrame(),
                                                "Taxonomy tool successfully completed",
                                                "Taxonomy Tool Completed",
                                                JOptionPane.INFORMATION_MESSAGE );