JAL-1705 refactored cross-reference fetching (CCDS, Uniprot, PDB)
author gmungoc <g.m.carstairs@dundee.ac.uk>
Sat, 30 Jan 2016 06:33:01 +0000 (06:33 +0000)
committer gmungoc <g.m.carstairs@dundee.ac.uk>
Sat, 30 Jan 2016 06:33:01 +0000 (06:33 +0000)
src/jalview/ext/ensembl/EnsemblCdna.java
src/jalview/ext/ensembl/EnsemblGene.java
src/jalview/ext/ensembl/EnsemblProtein.java
src/jalview/ext/ensembl/EnsemblSeqProxy.java
src/jalview/ext/ensembl/EnsemblXref.java

index a2ecfcd..139e44f 100644 (file)
@@ -3,6 +3,8 @@ package jalview.ext.ensembl;
 import jalview.datamodel.SequenceFeature;
 import jalview.io.gff.SequenceOntology;
 
+import java.util.List;
+
 import com.stevesoft.pat.Regex;
 
 public class EnsemblCdna extends EnsemblSeqProxy
@@ -78,4 +80,12 @@ public class EnsemblCdna extends EnsemblSeqProxy
     return false;
   }
 
+  @Override
+  protected List<String> getCrossReferenceDatabases()
+  {
+    return super.getCrossReferenceDatabases();
+    // 30/01/16 also found Vega_transcript, OTTT, ENS_LRG_transcript, UCSC,
+    // HGNC_trans_name, RefSeq_mRNA, RefSeq_mRNA_predicted
+  }
+
 }
index b5ea686..1325bec 100644 (file)
@@ -290,4 +290,13 @@ public class EnsemblGene extends EnsemblSeqProxy
     return false;
   }
 
+  @Override
+  protected List<String> getCrossReferenceDatabases()
+  {
+    // found these for ENSG00000157764 on 30/01/2016 (not requested here):
+    // "Vega_gene", "OTTG", "ENS_LRG_gene", "ArrayExpress",
+    // "EntrezGene", "HGNC", "MIM_GENE", "MIM_MORBID", "WikiGene"
+    return super.getCrossReferenceDatabases();
+  }
+
 }
index 5238f98..c40fdd0 100644 (file)
@@ -3,9 +3,15 @@ package jalview.ext.ensembl;
 import jalview.datamodel.AlignmentI;
 import jalview.datamodel.SequenceFeature;
 
+import java.util.Arrays;
+import java.util.List;
+
 public class EnsemblProtein extends EnsemblSeqProxy
 {
 
+  private static final List<String> CROSSREFS = Arrays.asList(new String[] {
+      "PDB", "Uniprot/SPTREMBL", "Uniprot/SWISSPROT" });
+
   public EnsemblProtein()
   {
     super();
@@ -64,4 +70,10 @@ public class EnsemblProtein extends EnsemblSeqProxy
     return false;
   }
 
+  @Override
+  protected List<String> getCrossReferenceDatabases()
+  {
+    return CROSSREFS;
+  }
+
 }
index 8698b78..0bfeda1 100644 (file)
@@ -36,6 +36,9 @@ import java.util.Map.Entry;
  */
 public abstract class EnsemblSeqProxy extends EnsemblRestClient
 {
+  private static final List<String> CROSS_REFERENCES = Arrays
+          .asList(new String[] { "CCDS" });
+
   protected static final String CONSEQUENCE_TYPE = "consequence_type";
 
   protected static final String PARENT = "Parent";
@@ -173,7 +176,11 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
       addFeaturesAndProduct(accId, alignment);
     }
 
-    inProgress = false;
+    for (SequenceI seq : alignment.getSequences())
+    {
+      getCrossReferences(seq);
+    }
+
     System.out.println(getClass().getName() + " took "
             + (System.currentTimeMillis() - now) + "ms to fetch");
     return alignment;
@@ -265,8 +272,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
       proteinSeq.createDatasetSequence();
       querySeq.createDatasetSequence();
 
-      getProteinCrossReferences(proteinSeq);
-
       MapList mapList = mapCdsToProtein(querySeq, proteinSeq);
       if (mapList != null)
       {
@@ -293,26 +298,35 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
    * Get Uniprot and PDB xrefs from Ensembl, and attach them to the protein
    * sequence
    * 
-   * @param proteinSeq
+   * @param seq
    */
-  protected void getProteinCrossReferences(SequenceI proteinSeq)
+  protected void getCrossReferences(SequenceI seq)
   {
-    while (proteinSeq.getDatasetSequence() != null)
+    while (seq.getDatasetSequence() != null)
     {
-      proteinSeq = proteinSeq.getDatasetSequence();
+      seq = seq.getDatasetSequence();
     }
 
     EnsemblXref xrefFetcher = new EnsemblXref();
-    List<DBRefEntry> xrefs = xrefFetcher.getCrossReferences(
-            proteinSeq.getName(), "PDB", "Uniprot/SPTREMBL",
-            "Uniprot/SWISSPROT");
+    List<DBRefEntry> xrefs = xrefFetcher.getCrossReferences(seq.getName(),
+            getCrossReferenceDatabases());
     for (DBRefEntry xref : xrefs)
     {
-      proteinSeq.addDBRef(xref);
+      seq.addDBRef(xref);
     }
   }
 
   /**
+   * Returns a list of database names to be used when fetching cross-references.
+   * 
+   * @return
+   */
+  protected List<String> getCrossReferenceDatabases()
+  {
+    return CROSS_REFERENCES;
+  }
+
+  /**
    * Returns a mapping from dna to protein by inspecting sequence features of
    * type "CDS" on the dna.
    * 
index 6a4f369..36bd7c5 100644 (file)
@@ -60,15 +60,16 @@ public class EnsemblXref extends EnsemblRestClient
   /**
    * Calls the Ensembl xrefs REST endpoint and retrieves any cross-references
    * ("primary_id") for the given identifier (Ensembl accession id) and database
-   * name. The "dbname" returned by Ensembl is canonicalised to Jalview's
-   * standard version, and a DBRefEntry constructed.
+   * names. The "dbname" returned by Ensembl is canonicalised to Jalview's
+   * standard version, and a DBRefEntry constructed. If the list of databases
+   * is null or empty, all available cross-references are retrieved.
    * 
    * @param identifier
-   * @param database
+   * @param databases
    * @return
    */
   public List<DBRefEntry> getCrossReferences(String identifier,
-          String... database)
+          List<String> databases)
   {
     List<DBRefEntry> result = new ArrayList<DBRefEntry>();
     List<String> ids = new ArrayList<String>();
@@ -77,22 +78,12 @@ public class EnsemblXref extends EnsemblRestClient
     BufferedReader br = null;
     try
     {
-      for (String db : database)
-      {
-        URL url = getUrl(identifier, db);
+      URL url = getUrl(identifier);
         if (url != null)
         {
           br = getHttpResponse(url, ids);
         }
-        for (DBRefEntry xref : parseResponse(br))
-        {
-          if (!result.contains(xref))
-          {
-            result.add(xref);
-          }
-        }
-        br.close();
-      }
+      return (parseResponse(br, databases));
     } catch (IOException e)
     {
       // ignore
@@ -114,14 +105,17 @@ public class EnsemblXref extends EnsemblRestClient
   }
 
   /**
-   * Parses "primary_id" and "dbname" values from the JSON response and returns
-   * a list of DBRefEntry constructed.
+   * Parses "primary_id" and "dbname" values from the JSON response and
+   * constructs a DBRefEntry for each whose dbname is in the list supplied (or
+   * for all of them if the list is null or empty). Returns the list created.
    * 
    * @param br
+   * @param databases
    * @return
    * @throws IOException
    */
-  protected List<DBRefEntry> parseResponse(BufferedReader br)
+  protected List<DBRefEntry> parseResponse(BufferedReader br,
+          List<String> databases)
           throws IOException
   {
     JSONParser jp = new JSONParser();
@@ -134,6 +128,11 @@ public class EnsemblXref extends EnsemblRestClient
       {
         JSONObject val = (JSONObject) rvals.next();
         String dbName = val.get("dbname").toString();
+        if (databases != null && !databases.isEmpty()
+                && !databases.contains(dbName))
+        {
+          continue;
+        }
         String id = val.get("primary_id").toString();
         if (dbName != null && id != null)
         {
@@ -149,10 +148,18 @@ public class EnsemblXref extends EnsemblRestClient
     return result;
   }
 
-  protected URL getUrl(String identifier, String db)
+  /**
+   * Returns the URL for the REST endpoint to fetch all cross-references for an
+   * identifier. Note the response may include protein cross-references for a
+   * nucleotide identifier; filter the parsed cross-references as required.
+   * 
+   * @param identifier
+   * @return
+   */
+  protected URL getUrl(String identifier)
   {
     String url = ENSEMBL_REST + "/xrefs/id/" + identifier
-            + "?content-type=application/json&external_db=" + db;
+            + "?content-type=application/json&all_levels=1";
     try
     {
       return new URL(url);