JAL-2051 better checking of retrieved / duplicated accession ids
author gmungoc <g.m.carstairs@dundee.ac.uk>
Fri, 29 Apr 2016 14:59:23 +0000 (15:59 +0100)
committer gmungoc <g.m.carstairs@dundee.ac.uk>
Fri, 29 Apr 2016 14:59:23 +0000 (15:59 +0100)
src/jalview/analysis/CrossRef.java
src/jalview/ext/ensembl/EnsemblGene.java
src/jalview/ext/ensembl/EnsemblRestClient.java
src/jalview/gui/SequenceFetcher.java

index 3563eba..7e77fc1 100644 (file)
@@ -327,8 +327,8 @@ public class CrossRef
           }
           if (l > 0)
           {
-            System.out
-                    .println("Attempting to retrieve cross referenced sequences.");
+            // System.out
+            // .println("Attempting to retrieve cross referenced sequences.");
             DBRefEntry[] t = new DBRefEntry[l];
             l = 0;
             for (int r = 0; r < xrfs.length; r++)
index 84b5dcf..4dd1bba 100644 (file)
@@ -11,7 +11,6 @@ import jalview.io.gff.SequenceOntologyI;
 import jalview.schemes.FeatureColourAdapter;
 import jalview.schemes.FeatureSettingsAdapter;
 import jalview.util.MapList;
-import jalview.util.StringUtils;
 
 import java.awt.Color;
 import java.io.UnsupportedEncodingException;
@@ -108,47 +107,84 @@ public class EnsemblGene extends EnsemblSeqProxy
   public AlignmentI getSequenceRecords(String query) throws Exception
   {
     /*
-     * if given a transcript id, look up its gene parent
+     * convert to a non-duplicated list of gene identifiers
      */
-    if (isTranscriptIdentifier(query))
+    List<String> geneIds = getGeneIds(query);
+
+    AlignmentI al = null;
+    for (String geneId : geneIds)
     {
-      query = new EnsemblLookup(getDomain()).getParent(query);
-      if (query == null)
+      /*
+       * fetch the gene sequence(s) with features and xrefs
+       */
+      AlignmentI geneAlignment = super.getSequenceRecords(geneId);
+
+      if (geneAlignment.getHeight() == 1)
       {
-        return null;
+        getTranscripts(geneAlignment, geneId);
+      }
+      if (al == null)
+      {
+        al = geneAlignment;
+      }
+      else
+      {
+        al.append(geneAlignment);
       }
     }
+    return al;
+  }
 
-    /*
-     * if given a gene or other external name, lookup and fetch 
-     * the corresponding gene for all model organisms 
-     */
-    if (!isGeneIdentifier(query))
+  /**
+   * Converts a query, which may contain one or more gene or transcript
+   * identifiers, into a non-redundant list of gene identifiers.
+   * 
+   * @param accessions
+   * @return
+   */
+  List<String> getGeneIds(String accessions)
+  {
+    List<String> geneIds = new ArrayList<String>();
+
+    for (String acc : accessions.split(getAccessionSeparator()))
     {
-      List<String> geneIds = new EnsemblSymbol(getDomain()).getIds(query);
-      if (geneIds.isEmpty())
+      if (isGeneIdentifier(acc))
       {
-        return null;
+        if (!geneIds.contains(acc))
+        {
+          geneIds.add(acc);
+        }
       }
-      String theIds = StringUtils.listToDelimitedString(geneIds,
-              getAccessionSeparator());
-      return getSequenceRecords(theIds);
-    }
 
-    /*
-     * fetch the gene sequence(s) with features and xrefs
-     */
-    AlignmentI al = super.getSequenceRecords(query);
+      /*
+       * if given a transcript id, look up its gene parent
+       */
+      else if (isTranscriptIdentifier(acc))
+      {
+        String geneId = new EnsemblLookup(getDomain()).getParent(acc);
+        if (geneId != null && !geneIds.contains(geneId))
+        {
+          geneIds.add(geneId);
+        }
+      }
 
-    /*
-     * if we retrieved a single gene, get its transcripts as well
-     */
-    if (al.getHeight() == 1)
-    {
-      getTranscripts(al, query);
+      /*
+       * if given a gene or other external name, lookup and fetch 
+       * the corresponding gene for all model organisms 
+       */
+      else
+      {
+        List<String> ids = new EnsemblSymbol(getDomain()).getIds(acc);
+        for (String geneId : ids)
+        {
+          if (!geneIds.contains(geneId))
+          {
+            geneIds.add(geneId);
+          }
+        }
+      }
     }
-
-    return al;
+    return geneIds;
   }
 
   /**
@@ -551,10 +587,4 @@ public class EnsemblGene extends EnsemblSeqProxy
     };
   }
 
-  @Override
-  public int getMaximumQueryCount()
-  {
-    return 1;
-  }
-
 }
index 441ec7c..6a564f1 100644 (file)
@@ -85,11 +85,25 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher
     domain = d;
   }
 
+  /**
+   * Answers true if the query matches the regular expression pattern for an
+   * Ensembl transcript stable identifier; a null query answers false
+   * 
+   * @param query the identifier to test (may be null)
+   * @return the result of TRANSCRIPT_REGEX.search(query), or false if query is null
+   */
   public boolean isTranscriptIdentifier(String query)
   {
     return query == null ? false : TRANSCRIPT_REGEX.search(query);
   }
 
+  /**
+   * Answers true if the query matches the regular expression pattern for an
+   * Ensembl gene stable identifier
+   * 
+   * @param query
+   * @return
+   */
   public boolean isGeneIdentifier(String query)
   {
     return query == null ? false : GENE_REGEX.search(query);
index 7dcd2b9..812bf76 100755 (executable)
@@ -539,30 +539,31 @@ public class SequenceFetcher extends JPanel implements Runnable
     // TODO: Refactor to GUI independent code and write tests.
     // indicate if successive sources should be merged into one alignment.
     boolean addToLast = false;
-    ArrayList<String> aresultq = new ArrayList<String>(), presultTitle = new ArrayList<String>();
-    ArrayList<AlignmentI> presult = new ArrayList<AlignmentI>(), aresult = new ArrayList<AlignmentI>();
+    List<String> aresultq = new ArrayList<String>();
+    List<String> presultTitle = new ArrayList<String>();
+    List<AlignmentI> presult = new ArrayList<AlignmentI>();
+    List<AlignmentI> aresult = new ArrayList<AlignmentI>();
     Iterator<DbSourceProxy> proxies = database.getSelectedSources()
             .iterator();
     String[] qries;
-    List<String> nextfetch = Arrays.asList(qries = textArea.getText()
+    List<String> nextFetch = Arrays.asList(qries = textArea.getText()
             .split(";"));
     Iterator<String> en = Arrays.asList(new String[0]).iterator();
     int nqueries = qries.length;
 
     FeatureSettingsModelI preferredFeatureColours = null;
-    while (proxies.hasNext() && (en.hasNext() || nextfetch.size() > 0))
+    while (proxies.hasNext() && (en.hasNext() || nextFetch.size() > 0))
     {
-      if (!en.hasNext() && nextfetch.size() > 0)
+      if (!en.hasNext() && nextFetch.size() > 0)
       {
-        en = nextfetch.iterator();
-        nqueries = nextfetch.size();
+        en = nextFetch.iterator();
+        nqueries = nextFetch.size();
         // save the remaining queries in the original array
-        qries = nextfetch.toArray(new String[nqueries]);
-        nextfetch = new ArrayList<String>();
+        qries = nextFetch.toArray(new String[nqueries]);
+        nextFetch = new ArrayList<String>();
       }
 
       DbSourceProxy proxy = proxies.next();
-      boolean isAliSource = false;
       try
       {
         // update status
@@ -573,122 +574,27 @@ public class SequenceFetcher extends JPanel implements Runnable
                             Integer.valueOf(nqueries).toString(),
                             proxy.getDbName() }), Thread.currentThread()
                         .hashCode());
-        isAliSource = proxy.isAlignmentSource();
         if (proxy.getMaximumQueryCount() == 1)
         {
+          /*
+           * proxy only handles one accession id at a time
+           */
           while (en.hasNext())
           {
-            String item = en.next();
-            try
+            String acc = en.next();
+            if (!fetchSingleAccession(proxy, acc, aresultq, aresult))
             {
-              if (aresult != null)
-              {
-                try
-                {
-                  // give the server a chance to breathe
-                  Thread.sleep(5);
-                } catch (Exception e)
-                {
-                  //
-                }
-
-              }
-
-              AlignmentI indres = null;
-              try
-              {
-                indres = proxy.getSequenceRecords(item);
-              } catch (OutOfMemoryError oome)
-              {
-                new OOMWarning("fetching " + item + " from "
-                        + proxy.getDbName(), oome, this);
-              }
-              if (indres != null)
-              {
-                aresultq.add(item);
-                aresult.add(indres);
-              }
-              else
-              {
-                nextfetch.add(item);
-              }
-            } catch (Exception e)
-            {
-              Cache.log.info(
-                      "Error retrieving " + item
-                      + " from " + proxy.getDbName(), e);
-              nextfetch.add(item);
+              nextFetch.add(acc);
             }
           }
         }
         else
         {
-          StringBuffer multiacc = new StringBuffer();
-          ArrayList<String> tosend = new ArrayList<String>();
-          while (en.hasNext())
-          {
-            String nel = en.next();
-            tosend.add(nel);
-            multiacc.append(nel);
-            if (en.hasNext())
-            {
-              multiacc.append(proxy.getAccessionSeparator());
-            }
-          }
-          try
-          {
-            AlignmentI rslt;
-            SequenceI[] rs;
-            List<String> nores = new ArrayList<String>();
-            rslt = proxy.getSequenceRecords(multiacc.toString());
-            if (rslt == null || rslt.getHeight() == 0)
-            {
-              // no results - pass on all queries to next source
-              nextfetch.addAll(tosend);
-            }
-            else
-            {
-              aresultq.add(multiacc.toString());
-              aresult.add(rslt);
-
-              rs = rslt.getSequencesArray();
-              // search for each query in the dbrefs associated with each
-              // sequence
-              // returned.
-              // ones we do not find will be used to query next source (if any)
-              for (String q : tosend)
-              {
-                DBRefEntry dbr = new DBRefEntry(), found[] = null;
-                dbr.setSource(proxy.getDbSource());
-                dbr.setVersion(null);
-                String accId = proxy.getAccessionIdFromQuery(q);
-                dbr.setAccessionId(accId);
-                boolean rfound = false;
-                for (int r = 0; r < rs.length; r++)
-                {
-                  if (rs[r] != null)
-                  {
-                    found = DBRefUtils.searchRefs(rs[r].getDBRefs(), accId);
-                    if (found != null && found.length > 0)
-                    {
-                      rfound = true;
-                      rs[r] = null;
-                    }
-                  }
-                }
-                if (!rfound)
-                {
-                  nextfetch.add(q);
-                }
-              }
-            }
-          } catch (OutOfMemoryError oome)
-          {
-            new OOMWarning("fetching " + multiacc + " from "
-                    + database.getSelectedItem(), oome, this);
-          }
+          /*
+           * proxy can fetch multiple accessions at one time
+           */
+          fetchMultipleAccessions(proxy, en, aresultq, aresult, nextFetch);
         }
-
       } catch (Exception e)
       {
         showErrorMessage("Error retrieving " + textArea.getText()
@@ -701,7 +607,6 @@ public class SequenceFetcher extends JPanel implements Runnable
         e.printStackTrace();
       } catch (OutOfMemoryError e)
       {
-        // resets dialog box - so we don't use OOMwarning here.
         showErrorMessage("Out of Memory when retrieving "
                 + textArea.getText()
                 + " from "
@@ -714,6 +619,7 @@ public class SequenceFetcher extends JPanel implements Runnable
                 + " from " + database.getSelectedItem());
         e.printStackTrace();
       }
+
       // Stack results ready for opening in alignment windows
       if (aresult != null && aresult.size() > 0)
       {
@@ -725,7 +631,7 @@ public class SequenceFetcher extends JPanel implements Runnable
         }
 
         AlignmentI ar = null;
-        if (isAliSource)
+        if (proxy.isAlignmentSource())
         {
           addToLast = false;
           // new window for each result
@@ -755,7 +661,6 @@ public class SequenceFetcher extends JPanel implements Runnable
             {
               ar.append(aresult.remove(0));
             }
-            ;
           }
           addToLast = true;
           presult.add(ar);
@@ -779,14 +684,14 @@ public class SequenceFetcher extends JPanel implements Runnable
     }
     // only remove visual delay after we finished parsing.
     guiWindow.setProgressBar(null, Thread.currentThread().hashCode());
-    if (nextfetch.size() > 0)
+    if (nextFetch.size() > 0)
     {
       StringBuffer sb = new StringBuffer();
       sb.append("Didn't retrieve the following "
-              + (nextfetch.size() == 1 ? "query" : nextfetch.size()
+              + (nextFetch.size() == 1 ? "query" : nextFetch.size()
                       + " queries") + ": \n");
       int l = sb.length(), lr = 0;
-      for (String s : nextfetch)
+      for (String s : nextFetch)
       {
         if (l != sb.length())
         {
@@ -804,6 +709,161 @@ public class SequenceFetcher extends JPanel implements Runnable
   }
 
   /**
+   * Tries to fetch one or more accession ids from the database proxy
+   * 
+   * @param proxy
+   * @param accessions
+   *          the queries to fetch
+   * @param aresultq
+   *          a successful queries list to add to
+   * @param aresult
+   *          a list of retrieved alignments to add to
+   * @param nextFetch
+   *          failed queries are added to this list
+   * @throws Exception
+   */
+  void fetchMultipleAccessions(DbSourceProxy proxy,
+          Iterator<String> accessions, List<String> aresultq,
+          List<AlignmentI> aresult, List<String> nextFetch)
+          throws Exception
+  {
+    StringBuilder multiacc = new StringBuilder();
+    List<String> tosend = new ArrayList<String>();
+    while (accessions.hasNext())
+    {
+      String nel = accessions.next();
+      tosend.add(nel);
+      multiacc.append(nel);
+      if (accessions.hasNext())
+      {
+        multiacc.append(proxy.getAccessionSeparator());
+      }
+    }
+
+    try
+    {
+      String query = multiacc.toString();
+      AlignmentI rslt = proxy.getSequenceRecords(query);
+      if (rslt == null || rslt.getHeight() == 0)
+      {
+        // no results - pass on all queries to next source
+        nextFetch.addAll(tosend);
+      }
+      else
+      {
+        aresultq.add(query);
+        aresult.add(rslt);
+        if (tosend.size() > 1)
+        {
+          checkResultForQueries(rslt, tosend, nextFetch, proxy);
+        }
+      }
+    } catch (OutOfMemoryError oome)
+    {
+      new OOMWarning("fetching " + multiacc + " from "
+              + database.getSelectedItem(), oome, this);
+    }
+  }
+
+  /**
+   * Query for a single accession id via the database proxy. On success the
+   * accession and its alignment are added to the supplied lists. An Exception
+   * raised while fetching is logged, and an OutOfMemoryError from the proxy
+   * is caught and an OOMWarning created; in both cases false is returned.
+   * 
+   * @param proxy
+   *          the database source to query
+   * @param accession
+   *          the accession id to fetch
+   * @param aresultq
+   *          a list of successful queries to add to
+   * @param aresult
+   *          a list of retrieved alignments to add to
+   * @return true if the fetch was successful, else false
+   */
+  boolean fetchSingleAccession(DbSourceProxy proxy, String accession,
+          List<String> aresultq, List<AlignmentI> aresult)
+  {
+    boolean success = false;
+    try
+    {
+      if (aresult != null)
+      {
+        try
+        {
+          // give the server a chance to breathe
+          Thread.sleep(5);
+        } catch (InterruptedException e)
+        {
+          // restore the interrupt status instead of discarding it
+          Thread.currentThread().interrupt();
+        }
+      }
+
+      AlignmentI indres = null;
+      try
+      {
+        indres = proxy.getSequenceRecords(accession);
+      } catch (OutOfMemoryError oome)
+      {
+        new OOMWarning("fetching " + accession + " from "
+                + proxy.getDbName(), oome, this);
+      }
+      if (indres != null)
+      {
+        aresultq.add(accession);
+        aresult.add(indres);
+        success = true;
+      }
+    } catch (Exception e)
+    {
+      Cache.log.info(
+              "Error retrieving " + accession
+              + " from " + proxy.getDbName(), e);
+    }
+
+    /*
+     * return after the try block, not from a finally clause: returning from
+     * finally would silently discard any pending Error
+     */
+    return success;
+  }
+
+  /**
+   * Checks which of the queries were successfully retrieved by searching the
+   * DBRefs of the retrieved sequences for a match. Any not found are added to
+   * the 'nextFetch' list.
+   * 
+   * @param rslt
+   *          the retrieved alignment whose sequences are searched
+   * @param queries
+   *          the queries that were sent to the proxy
+   * @param nextFetch
+   *          list to which each query with no match is added
+   * @param proxy
+   *          the source used to convert each query to an accession id
+   */
+  void checkResultForQueries(AlignmentI rslt, List<String> queries,
+          List<String> nextFetch, DbSourceProxy proxy)
+  {
+    SequenceI[] rs = rslt.getSequencesArray();
+
+    for (String q : queries)
+    {
+      String accId = proxy.getAccessionIdFromQuery(q);
+      boolean rfound = false;
+      for (SequenceI seq : rs)
+      {
+        if (seq == null)
+        {
+          continue;
+        }
+        DBRefEntry[] found = DBRefUtils.searchRefs(seq.getDBRefs(), accId);
+        if (found != null && found.length > 0)
+        {
+          // one matching sequence is enough to count the query as retrieved
+          rfound = true;
+          break;
+        }
+      }
+      if (!rfound)
+      {
+        nextFetch.add(q);
+      }
+    }
+  }
+
+  /**
    * 
    * @return a standard title for any results retrieved using the currently
    *         selected source and settings