JAL-1705 additional tests, validation regexp tweaks, javadoc
author gmungoc <g.m.carstairs@dundee.ac.uk>
Thu, 4 Feb 2016 10:29:09 +0000 (10:29 +0000)
committer gmungoc <g.m.carstairs@dundee.ac.uk>
Thu, 4 Feb 2016 10:29:09 +0000 (10:29 +0000)
src/jalview/ext/ensembl/EnsemblCdna.java
src/jalview/ext/ensembl/EnsemblFeatures.java [new file with mode: 0644]
src/jalview/ext/ensembl/EnsemblGene.java
src/jalview/ext/ensembl/EnsemblGenome.java
src/jalview/ext/ensembl/EnsemblProtein.java
src/jalview/ext/ensembl/EnsemblSeqProxy.java
src/jalview/ext/ensembl/EnsemblSequenceFetcher.java
src/jalview/ext/ensembl/EnsemblXref.java
test/jalview/datamodel/SequenceTest.java
test/jalview/ext/ensembl/EnsemblProteinTest.java [new file with mode: 0644]
test/jalview/ext/ensembl/EnsemblSeqProxyTest.java

index 373286f..d4d1c08 100644 (file)
@@ -10,8 +10,11 @@ import com.stevesoft.pat.Regex;
 
 public class EnsemblCdna extends EnsemblSeqProxy
 {
+  // TODO modify to accept other species e.g. ENSMUSTnnn
+  private static final Regex ACCESSION_REGEX = new Regex("((ENST|ENSG|CCDS)[0-9.]{3,})");
+  
   /*
-   * fetch exon features on genomic sequence (to identify the cdnaregions)
+   * fetch exon features on genomic sequence (to identify the cdna regions)
    * and cds and variation features (to retain)
    */
   private static final EnsemblFeatureType[] FEATURES_TO_FETCH = {
@@ -38,7 +41,7 @@ public class EnsemblCdna extends EnsemblSeqProxy
   @Override
   public Regex getAccessionValidator()
   {
-    return new Regex("((ENST|ENSG|CCDS)[0-9.]{3,})");
+    return ACCESSION_REGEX;
   }
 
   @Override
diff --git a/src/jalview/ext/ensembl/EnsemblFeatures.java b/src/jalview/ext/ensembl/EnsemblFeatures.java
new file mode 100644 (file)
index 0000000..22faba9
--- /dev/null
@@ -0,0 +1,123 @@
+package jalview.ext.ensembl;
+
+import jalview.datamodel.Alignment;
+import jalview.datamodel.AlignmentI;
+import jalview.io.FeaturesFile;
+import jalview.io.FileParse;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A client for fetching and processing Ensembl feature data in GFF format by
+ * calling the overlap REST service
+ * 
+ * @author gmcarstairs
+ * @see <a
+ *      href="http://rest.ensembl.org/documentation/info/overlap_id">Ensembl
+ *      REST overlap/id documentation</a>
+ */
+class EnsemblFeatures extends EnsemblRestClient
+{
+  /*
+   * The features to request from Ensembl; initialised to a default set, and
+   * replaced by a call to getSequenceRecords(String, EnsemblFeatureType[])
+   */
+  private EnsemblFeatureType[] featuresWanted = { EnsemblFeatureType.cds,
+      EnsemblFeatureType.exon, EnsemblFeatureType.variation };
+
+  /**
+   * Returns the display name of this data source
+   */
+  @Override
+  public String getDbName()
+  {
+    return "ENSEMBL (features)";
+  }
+
+  /**
+   * Makes a query to the REST overlap endpoint for the given sequence
+   * identifier. This returns an 'alignment' consisting of one 'dummy sequence'
+   * (the genomic sequence for which overlap features are returned by the
+   * service). This sequence will have on it sequence features which are the
+   * real information of interest, such as CDS regions or sequence variations.
+   * 
+   * @param query
+   *          the identifier to fetch overlap features for
+   * @return an alignment built from the sequences parsed from the response
+   * @throws IOException
+   *           if fetching or parsing the response fails
+   */
+  @Override
+  public AlignmentI getSequenceRecords(String query) throws IOException
+  {
+    // TODO: use a vararg String... for getSequenceRecords instead?
+    List<String> queries = new ArrayList<String>();
+    queries.add(query);
+    FileParse fp = getSequenceReader(queries);
+    FeaturesFile fr = new FeaturesFile(fp);
+    return new Alignment(fr.getSeqsAsArray());
+  }
+
+  /**
+   * Returns a URL for the REST overlap endpoint. Only the first identifier in
+   * the list is used. The URL requests GFF3 output and carries one 'feature'
+   * parameter for each feature type currently wanted.
+   * 
+   * @param ids
+   *          the identifiers to query; must hold at least one entry
+   * @return the overlap query URL for the first identifier
+   * @throws MalformedURLException
+   *           if no identifier is supplied, or the URL cannot be constructed
+   */
+  @Override
+  protected URL getUrl(List<String> ids) throws MalformedURLException
+  {
+    /*
+     * fail with the declared exception type rather than an unchecked
+     * NullPointerException / IndexOutOfBoundsException from ids.get(0)
+     */
+    if (ids == null || ids.isEmpty())
+    {
+      throw new MalformedURLException(
+              "No identifier supplied for Ensembl overlap query");
+    }
+
+    // method-local buffer, so the unsynchronised StringBuilder is sufficient
+    StringBuilder urlstring = new StringBuilder(128);
+    urlstring.append(ENSEMBL_REST).append("/overlap/id/")
+            .append(ids.get(0));
+
+    // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats
+    urlstring.append("?content-type=text/x-gff3");
+
+    /*
+     * specify  features to retrieve
+     * @see http://rest.ensembl.org/documentation/info/overlap_id
+     * could make the list a configurable entry in jalview.properties
+     */
+    for (EnsemblFeatureType feature : featuresWanted)
+    {
+      urlstring.append("&feature=").append(feature.name());
+    }
+
+    return new URL(urlstring.toString());
+  }
+
+  /**
+   * Answers true, i.e. this client uses GET requests
+   */
+  @Override
+  protected boolean useGetRequest()
+  {
+    return true;
+  }
+
+  /**
+   * Returns the MIME type for GFF3. For GET requests the Content-type header
+   * describes the required encoding of the response.
+   * 
+   * @param multipleIds
+   *          not used by this implementation
+   */
+  @Override
+  protected String getRequestMimeType(boolean multipleIds)
+  {
+    return "text/x-gff3";
+  }
+
+  /**
+   * Returns the MIME type for GFF3.
+   */
+  @Override
+  protected String getResponseMimeType()
+  {
+    return "text/x-gff3";
+  }
+
+  /**
+   * Overloaded method that allows a list of features to retrieve to be
+   * specified. The supplied list replaces the features wanted on this
+   * instance, so it also applies to any later query made through the same
+   * object. A null list leaves the current selection unchanged.
+   * 
+   * @param accId
+   *          the identifier to fetch overlap features for
+   * @param features
+   *          the feature types to request, or null to keep the current ones
+   * @return an alignment built from the sequences parsed from the response
+   * @throws IOException
+   *           if fetching or parsing the response fails
+   */
+  protected AlignmentI getSequenceRecords(String accId,
+          EnsemblFeatureType[] features) throws IOException
+  {
+    // storing null would fail later with a NullPointerException in getUrl
+    if (features != null)
+    {
+      featuresWanted = features;
+    }
+    return getSequenceRecords(accId);
+  }
+}
index df246f8..dc28796 100644 (file)
@@ -12,6 +12,8 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 
+import com.stevesoft.pat.Regex;
+
 /**
  * A class that fetches genomic sequence and all transcripts for an Ensembl gene
  * 
@@ -19,6 +21,10 @@ import java.util.List;
  */
 public class EnsemblGene extends EnsemblSeqProxy
 {
+  // TODO modify to accept other species e.g. ENSMUSGnnn
+  private static final Regex ACCESSION_REGEX = new Regex(
+          "((ENSG)[0-9]{11})");
+
   private static final EnsemblFeatureType[] FEATURES_TO_FETCH = {
       EnsemblFeatureType.gene, EnsemblFeatureType.transcript,
       EnsemblFeatureType.exon, EnsemblFeatureType.cds,
@@ -309,4 +315,10 @@ public class EnsemblGene extends EnsemblSeqProxy
   {
   }
 
+  @Override
+  public Regex getAccessionValidator()
+  {
+    return ACCESSION_REGEX;
+  }
+
 }
index 6bbc3e9..e977e62 100644 (file)
@@ -20,7 +20,7 @@ public class EnsemblGenome extends EnsemblSeqProxy
   @Override
   public String getDbName()
   {
-    return "ENSEMBL (Genome)";
+    return "ENSEMBL (Genomic)";
   }
 
   @Override
index c40fdd0..8f23984 100644 (file)
@@ -6,8 +6,13 @@ import jalview.datamodel.SequenceFeature;
 import java.util.Arrays;
 import java.util.List;
 
+import com.stevesoft.pat.Regex;
+
 public class EnsemblProtein extends EnsemblSeqProxy
 {
+  // TODO modify to accept other species e.g. ENSMUSPnnn
+  private static final Regex ACCESSION_REGEX = new Regex(
+          "((ENSP|CCDS)[0-9.]{3,})");
 
   private static final List<String> CROSSREFS = Arrays.asList(new String[] {
       "PDB", "Uniprot/SPTREMBL", "Uniprot/SWISSPROT" });
@@ -76,4 +81,10 @@ public class EnsemblProtein extends EnsemblSeqProxy
     return CROSSREFS;
   }
 
+  @Override
+  public Regex getAccessionValidator()
+  {
+    return ACCESSION_REGEX;
+  }
+
 }
index b2804f2..e77051d 100644 (file)
@@ -48,25 +48,28 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
 
   protected static final String NAME = "Name";
 
+  /*
+   * enum for 'type' parameter to the /sequence REST service
+   */
   public enum EnsemblSeqType
   {
     /**
-     * type=genomic for the full dna including introns
+     * type=genomic to fetch full dna including introns
      */
     GENOMIC("genomic"),
 
     /**
-     * type=cdna for transcribed dna including UTRs
+     * type=cdna to fetch dna including UTRs
      */
     CDNA("cdna"),
 
     /**
-     * type=cds for coding dna excluding UTRs
+     * type=cds to fetch coding dna excluding UTRs
      */
     CDS("cds"),
 
     /**
-     * type=protein for the peptide product sequence
+     * type=protein to fetch peptide product sequence
      */
     PROTEIN("protein");
 
@@ -201,7 +204,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
        * get 'dummy' genomic sequence with exon, cds and variation features
        */
       SequenceI genomicSequence = null;
-      EnsemblOverlap gffFetcher = new EnsemblOverlap();
+      EnsemblFeatures gffFetcher = new EnsemblFeatures();
       EnsemblFeatureType[] features = getFeaturesToFetch();
       AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId,
               features);
@@ -268,7 +271,10 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
       MapList mapList = mapCdsToProtein(querySeq, proteinSeq);
       if (mapList != null)
       {
-        Mapping map = new Mapping(proteinSeq.getDatasetSequence(), mapList);
+        // clunky: ensure Uniprot xref if we have one is on mapped sequence
+        SequenceI ds = proteinSeq.getDatasetSequence();
+        ds.setSourceDBRef(proteinSeq.getSourceDBRef());
+        Mapping map = new Mapping(ds, mapList);
         DBRefEntry dbr = new DBRefEntry(getDbSource(), getDbVersion(),
                 accId, map);
         querySeq.getDatasetSequence().addDBRef(dbr);
@@ -919,6 +925,22 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient
         count++;
       }
     }
+
+    /*
+     * ugly sort to get sequence features in start position order
+     * - would be better to store in Sequence as a TreeSet instead?
+     */
+    Arrays.sort(peptide.getSequenceFeatures(),
+            new Comparator<SequenceFeature>()
+            {
+              @Override
+              public int compare(SequenceFeature o1, SequenceFeature o2)
+              {
+                int c = Integer.compare(o1.getBegin(), o2.getBegin());
+                return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd())
+                        : c;
+              }
+            });
     return count;
   }
 
index f1b96e2..2e32bd2 100644 (file)
@@ -9,13 +9,15 @@ import com.stevesoft.pat.Regex;
  * A base class for Ensembl sequence fetchers
  * 
  * @author gmcarstairs
- *
  */
-public abstract class EnsemblSequenceFetcher extends DbSourceProxyImpl
+abstract class EnsemblSequenceFetcher extends DbSourceProxyImpl
 {
+  // TODO modify to accept other species e.g. ENSMUSTnnn
+  private static final Regex ACCESSION_REGEX = new Regex("((ENSP|ENST|ENSG|CCDS)[0-9.]{3,})");
+
   /*
-   * possible values for the 'feature' parameter of the REST overlap endpoint
-   * @see 
+   * possible values for the 'feature' parameter of the /overlap REST service
+   * @see http://rest.ensembl.org/documentation/info/overlap_id
    */
   protected enum EnsemblFeatureType
   {
@@ -43,10 +45,16 @@ public abstract class EnsemblSequenceFetcher extends DbSourceProxyImpl
     return " ";
   }
 
+  /**
+   * Ensembl accessions are ENST + 11 digits for a human transcript, or ENSG for
+   * a human gene. Other species insert 3 letters e.g. ENSMUST..., ENSMUSG...
+   * 
+   * @see http://www.ensembl.org/Help/View?id=151
+   */
   @Override
   public Regex getAccessionValidator()
   {
-    return new Regex("((ENSP|ENST|ENSG|CCDS)[0-9.]{3,})");
+    return ACCESSION_REGEX;
   }
 
   @Override
index 36bd7c5..d4c5b18 100644 (file)
@@ -17,7 +17,14 @@ import org.json.simple.JSONObject;
 import org.json.simple.parser.JSONParser;
 import org.json.simple.parser.ParseException;
 
-public class EnsemblXref extends EnsemblRestClient
+/**
+ * A class to fetch cross-references from Ensembl by calling the /xrefs REST
+ * service
+ * 
+ * @author gmcarstairs
+ *
+ */
+class EnsemblXref extends EnsemblRestClient
 {
 
   @Override
index dcc8ef7..0d40037 100644 (file)
@@ -384,11 +384,18 @@ public class SequenceTest
   {
     SequenceI sq = new Sequence("Seq1", "CD");
     sq.setDatasetSequence(new Sequence("Seq1", "ABCDEF"));
+    sq.getDatasetSequence().addSequenceFeature(
+            new SequenceFeature("", "", 1, 2, 0f, null));
     sq.setStart(3);
     sq.setEnd(4);
     SequenceI derived = sq.deriveSequence();
     assertEquals("CD", derived.getSequenceAsString());
     assertSame(sq.getDatasetSequence(), derived.getDatasetSequence());
+
+    assertNull(((Sequence) seq).sequenceFeatures);
+    assertNull(((Sequence) derived).sequenceFeatures);
+    assertNotNull(seq.getSequenceFeatures());
+    assertSame(seq.getSequenceFeatures(), derived.getSequenceFeatures());
   }
 
   /**
diff --git a/test/jalview/ext/ensembl/EnsemblProteinTest.java b/test/jalview/ext/ensembl/EnsemblProteinTest.java
new file mode 100644 (file)
index 0000000..bd0e7b3
--- /dev/null
@@ -0,0 +1,19 @@
+package jalview.ext.ensembl;
+
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+public class EnsemblProteinTest
+{
+
+  /**
+   * Checks accession validation as applied by EnsemblProtein: the CCDS and
+   * ENSP identifiers should be accepted, the ENST and ENSG ones rejected
+   */
+  @Test(groups = "Functional")
+  public void testIsValidReference() throws Exception
+  {
+    EnsemblSequenceFetcher fetcher = new EnsemblProtein();
+
+    String[] accepted = { "CCDS5863.1", "ENSP00000288602" };
+    for (String accession : accepted)
+    {
+      Assert.assertTrue(fetcher.isValidReference(accession));
+    }
+
+    String[] rejected = { "ENST00000288602", "ENSG00000288602" };
+    for (String accession : rejected)
+    {
+      Assert.assertFalse(fetcher.isValidReference(accession));
+    }
+  }
+
+}
index ed936d5..a6694eb 100644 (file)
@@ -120,22 +120,6 @@ public class EnsemblSeqProxyTest
     SequenceOntologyFactory.setInstance(null);
   }
 
-  @DataProvider(name = "queries")
-  public Object[][] createQueryData(Method m)
-  {
-    return new Object[][] { { "CCDS5863.1" }, { "ENSP00000288602" } };
-  }
-
-  @Test(dataProvider = "queries")
-  public void testIsValidReference(String query) throws Exception
-  {
-    EnsemblSequenceFetcher esq = new EnsemblProtein();
-    Assert.assertTrue(esq.isValidReference(query),
-            "Expected reference string " + query
-                    + " to be valid for regex "
-                    + esq.getAccessionValidator().toString());
-  }
-
   @DataProvider(name = "ens_seqs")
   public Object[][] createData(Method m)
   {