public class EnsemblCdna extends EnsemblSeqProxy
{
+ // TODO modify to accept other species e.g. ENSMUSTnnn
+ private static final Regex ACCESSION_REGEX = new Regex("((ENST|ENSG|CCDS)[0-9.]{3,})");
+
/*
- * fetch exon features on genomic sequence (to identify the cdnaregions)
+ * fetch exon features on genomic sequence (to identify the cdna regions)
* and cds and variation features (to retain)
*/
private static final EnsemblFeatureType[] FEATURES_TO_FETCH = {
@Override
public Regex getAccessionValidator()
{
- return new Regex("((ENST|ENSG|CCDS)[0-9.]{3,})");
+ return ACCESSION_REGEX;
}
@Override
--- /dev/null
+package jalview.ext.ensembl;
+
+import jalview.datamodel.Alignment;
+import jalview.datamodel.AlignmentI;
+import jalview.io.FeaturesFile;
+import jalview.io.FileParse;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A client for fetching and processing Ensembl feature data in GFF format by
+ * calling the overlap REST service
+ * 
+ * @author gmcarstairs
+ * @see <a href="http://rest.ensembl.org/documentation/info/overlap_id">Ensembl
+ *      REST overlap/id documentation</a>
+ */
+class EnsemblFeatures extends EnsemblRestClient
+{
+  /*
+   * The features to request from Ensembl. Starts as a default set (cds, exon,
+   * variation); it is replaced - and stays replaced for later calls - when
+   * getSequenceRecords(String, EnsemblFeatureType[]) is given a non-null array
+   */
+  private EnsemblFeatureType[] featuresWanted = { EnsemblFeatureType.cds,
+      EnsemblFeatureType.exon, EnsemblFeatureType.variation };
+
+  /**
+   * Returns the display name of this data source
+   */
+  @Override
+  public String getDbName()
+  {
+    return "ENSEMBL (features)";
+  }
+
+  /**
+   * Makes a query to the REST overlap endpoint for the given sequence
+   * identifier. This returns an 'alignment' consisting of one 'dummy sequence'
+   * (the genomic sequence for which overlap features are returned by the
+   * service). This sequence will have on it sequence features which are the
+   * real information of interest, such as CDS regions or sequence variations.
+   * 
+   * @param query
+   *          the sequence identifier to fetch overlap features for
+   * @return an alignment built from the sequences read from the GFF response
+   * @throws IOException
+   *           if the request or the parsing of the response fails
+   */
+  @Override
+  public AlignmentI getSequenceRecords(String query) throws IOException
+  {
+    // TODO: use a vararg String... for getSequenceRecords instead?
+    List<String> queries = new ArrayList<String>();
+    queries.add(query);
+    FileParse fp = getSequenceReader(queries);
+    FeaturesFile fr = new FeaturesFile(fp);
+    return new Alignment(fr.getSeqsAsArray());
+  }
+
+  /**
+   * Returns a URL for the REST overlap endpoint. Only the first identifier in
+   * the list is used; the response format (GFF3) and one 'feature' parameter
+   * per wanted feature type are added as query parameters.
+   * 
+   * @param ids
+   *          the identifiers to query; must hold at least one entry
+   * @return the URL to call
+   * @throws MalformedURLException
+   *           if no identifier is supplied, or the URL cannot be constructed
+   */
+  @Override
+  protected URL getUrl(List<String> ids) throws MalformedURLException
+  {
+    /*
+     * fail with the declared exception type rather than an unchecked
+     * NullPointerException / IndexOutOfBoundsException
+     */
+    if (ids == null || ids.isEmpty())
+    {
+      throw new MalformedURLException(
+              "No identifier supplied for Ensembl overlap query");
+    }
+
+    // local, single-threaded buffer: no need for a synchronized StringBuffer
+    StringBuilder urlstring = new StringBuilder(128);
+    urlstring.append(ENSEMBL_REST).append("/overlap/id/")
+            .append(ids.get(0));
+
+    // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats
+    urlstring.append("?content-type=text/x-gff3");
+
+    /*
+     * specify features to retrieve
+     * @see http://rest.ensembl.org/documentation/info/overlap_id
+     * could make the list a configurable entry in jalview.properties
+     */
+    for (EnsemblFeatureType feature : featuresWanted)
+    {
+      urlstring.append("&feature=").append(feature.name());
+    }
+
+    return new URL(urlstring.toString());
+  }
+
+  /**
+   * Always returns true
+   */
+  @Override
+  protected boolean useGetRequest()
+  {
+    return true;
+  }
+
+  /**
+   * Returns the MIME type for GFF3. For GET requests the Content-type header
+   * describes the required encoding of the response.
+   * 
+   * @param multipleIds
+   *          not used here; the same MIME type is returned in either case
+   */
+  @Override
+  protected String getRequestMimeType(boolean multipleIds)
+  {
+    return "text/x-gff3";
+  }
+
+  /**
+   * Returns the MIME type for GFF3.
+   */
+  @Override
+  protected String getResponseMimeType()
+  {
+    return "text/x-gff3";
+  }
+
+  /**
+   * Overloaded method that allows a list of features to retrieve to be
+   * specified. A non-null list replaces the current selection (and remains in
+   * force for subsequent calls on this object); a null list leaves the current
+   * selection unchanged.
+   * 
+   * @param accId
+   *          the sequence identifier to fetch overlap features for
+   * @param features
+   *          the feature types to request, or null to keep the current ones
+   * @return an alignment built from the sequences read from the GFF response
+   * @throws IOException
+   *           if the request or the parsing of the response fails
+   */
+  protected AlignmentI getSequenceRecords(String accId,
+          EnsemblFeatureType[] features) throws IOException
+  {
+    // a null array would otherwise cause a NullPointerException in getUrl
+    if (features != null)
+    {
+      featuresWanted = features;
+    }
+    return getSequenceRecords(accId);
+  }
+}
import java.util.Arrays;
import java.util.List;
+import com.stevesoft.pat.Regex;
+
/**
* A class that fetches genomic sequence and all transcripts for an Ensembl gene
*
*/
public class EnsemblGene extends EnsemblSeqProxy
{
+ // TODO modify to accept other species e.g. ENSMUSGnnn
+ private static final Regex ACCESSION_REGEX = new Regex(
+ "((ENSG)[0-9]{11})");
+
private static final EnsemblFeatureType[] FEATURES_TO_FETCH = {
EnsemblFeatureType.gene, EnsemblFeatureType.transcript,
EnsemblFeatureType.exon, EnsemblFeatureType.cds,
{
}
+ /**
+ * Returns the accession validator for this class: the shared ACCESSION_REGEX
+ * constant (pattern 'ENSG' followed by 11 digits), rather than a Regex newly
+ * constructed on each call
+ */
+ @Override
+ public Regex getAccessionValidator()
+ {
+ return ACCESSION_REGEX;
+ }
+
}
@Override
public String getDbName()
{
- return "ENSEMBL (Genome)";
+ return "ENSEMBL (Genomic)";
}
@Override
import java.util.Arrays;
import java.util.List;
+import com.stevesoft.pat.Regex;
+
public class EnsemblProtein extends EnsemblSeqProxy
{
+ // TODO modify to accept other species e.g. ENSMUSPnnn
+ private static final Regex ACCESSION_REGEX = new Regex(
+ "((ENSP|CCDS)[0-9.]{3,})");
private static final List<String> CROSSREFS = Arrays.asList(new String[] {
"PDB", "Uniprot/SPTREMBL", "Uniprot/SWISSPROT" });
return CROSSREFS;
}
+ /**
+ * Returns the accession validator for this class: the shared ACCESSION_REGEX
+ * constant (pattern 'ENSP' or 'CCDS' followed by three or more digits or
+ * dots), rather than a Regex newly constructed on each call
+ */
+ @Override
+ public Regex getAccessionValidator()
+ {
+ return ACCESSION_REGEX;
+ }
+
}
protected static final String NAME = "Name";
+ /*
+ * enum for 'type' parameter to the /sequence REST service
+ */
public enum EnsemblSeqType
{
/**
- * type=genomic for the full dna including introns
+ * type=genomic to fetch full dna including introns
*/
GENOMIC("genomic"),
/**
- * type=cdna for transcribed dna including UTRs
+ * type=cdna to fetch dna including UTRs
*/
CDNA("cdna"),
/**
- * type=cds for coding dna excluding UTRs
+ * type=cds to fetch coding dna excluding UTRs
*/
CDS("cds"),
/**
- * type=protein for the peptide product sequence
+ * type=protein to fetch peptide product sequence
*/
PROTEIN("protein");
* get 'dummy' genomic sequence with exon, cds and variation features
*/
SequenceI genomicSequence = null;
- EnsemblOverlap gffFetcher = new EnsemblOverlap();
+ EnsemblFeatures gffFetcher = new EnsemblFeatures();
EnsemblFeatureType[] features = getFeaturesToFetch();
AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId,
features);
MapList mapList = mapCdsToProtein(querySeq, proteinSeq);
if (mapList != null)
{
- Mapping map = new Mapping(proteinSeq.getDatasetSequence(), mapList);
+ // clunky: ensure Uniprot xref if we have one is on mapped sequence
+ SequenceI ds = proteinSeq.getDatasetSequence();
+ ds.setSourceDBRef(proteinSeq.getSourceDBRef());
+ Mapping map = new Mapping(ds, mapList);
DBRefEntry dbr = new DBRefEntry(getDbSource(), getDbVersion(),
accId, map);
querySeq.getDatasetSequence().addDBRef(dbr);
count++;
}
}
+
+ /*
+ * ugly sort to get sequence features in start position order
+ * - would be better to store in Sequence as a TreeSet instead?
+ */
+ Arrays.sort(peptide.getSequenceFeatures(),
+ new Comparator<SequenceFeature>()
+ {
+ @Override
+ public int compare(SequenceFeature o1, SequenceFeature o2)
+ {
+ int c = Integer.compare(o1.getBegin(), o2.getBegin());
+ return c == 0 ? Integer.compare(o1.getEnd(), o2.getEnd())
+ : c;
+ }
+ });
return count;
}
* A base class for Ensembl sequence fetchers
*
* @author gmcarstairs
- *
*/
-public abstract class EnsemblSequenceFetcher extends DbSourceProxyImpl
+abstract class EnsemblSequenceFetcher extends DbSourceProxyImpl
{
+ // TODO modify to accept other species e.g. ENSMUSTnnn
+ private static final Regex ACCESSION_REGEX = new Regex("((ENSP|ENST|ENSG|CCDS)[0-9.]{3,})");
+
/*
- * possible values for the 'feature' parameter of the REST overlap endpoint
- * @see
+ * possible values for the 'feature' parameter of the /overlap REST service
+ * @see http://rest.ensembl.org/documentation/info/overlap_id
*/
protected enum EnsemblFeatureType
{
return " ";
}
+ /**
+ * Ensembl accessions are ENST + 11 digits for a human transcript, ENSG for a
+ * human gene. Other species insert 3 letters e.g. ENSMUST..., ENSMUSG...
+ *
+ * @see http://www.ensembl.org/Help/View?id=151
+ */
@Override
public Regex getAccessionValidator()
{
- return new Regex("((ENSP|ENST|ENSG|CCDS)[0-9.]{3,})");
+ return ACCESSION_REGEX;
}
@Override
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
-public class EnsemblXref extends EnsemblRestClient
+/**
+ * A class to fetch cross-references from Ensembl by calling the /xrefs REST
+ * service
+ *
+ * @author gmcarstairs
+ *
+ */
+class EnsemblXref extends EnsemblRestClient
{
@Override
{
SequenceI sq = new Sequence("Seq1", "CD");
sq.setDatasetSequence(new Sequence("Seq1", "ABCDEF"));
+ sq.getDatasetSequence().addSequenceFeature(
+ new SequenceFeature("", "", 1, 2, 0f, null));
sq.setStart(3);
sq.setEnd(4);
SequenceI derived = sq.deriveSequence();
assertEquals("CD", derived.getSequenceAsString());
assertSame(sq.getDatasetSequence(), derived.getDatasetSequence());
+
+ assertNull(((Sequence) seq).sequenceFeatures);
+ assertNull(((Sequence) derived).sequenceFeatures);
+ assertNotNull(seq.getSequenceFeatures());
+ assertSame(seq.getSequenceFeatures(), derived.getSequenceFeatures());
}
/**
--- /dev/null
+package jalview.ext.ensembl;
+
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+/**
+ * Checks which accession strings the EnsemblProtein fetcher accepts as valid
+ * references
+ */
+public class EnsemblProteinTest
+{
+
+  /**
+   * The CCDS and ENSP identifiers should be accepted, the ENST and ENSG
+   * identifiers rejected
+   */
+  @Test(groups = "Functional")
+  public void testIsValidReference() throws Exception
+  {
+    EnsemblSequenceFetcher fetcher = new EnsemblProtein();
+    String[] accepted = { "CCDS5863.1", "ENSP00000288602" };
+    String[] rejected = { "ENST00000288602", "ENSG00000288602" };
+
+    for (String accession : accepted)
+    {
+      Assert.assertTrue(fetcher.isValidReference(accession));
+    }
+    for (String accession : rejected)
+    {
+      Assert.assertFalse(fetcher.isValidReference(accession));
+    }
+  }
+
+}
SequenceOntologyFactory.setInstance(null);
}
- @DataProvider(name = "queries")
- public Object[][] createQueryData(Method m)
- {
- return new Object[][] { { "CCDS5863.1" }, { "ENSP00000288602" } };
- }
-
- @Test(dataProvider = "queries")
- public void testIsValidReference(String query) throws Exception
- {
- EnsemblSequenceFetcher esq = new EnsemblProtein();
- Assert.assertTrue(esq.isValidReference(query),
- "Expected reference string " + query
- + " to be valid for regex "
- + esq.getAccessionValidator().toString());
- }
-
@DataProvider(name = "ens_seqs")
public Object[][] createData(Method m)
{