package jalview.ext.ensembl; import jalview.datamodel.DBRefSource; import jalview.ws.seqfetcher.DbSourceProxyImpl; import com.stevesoft.pat.Regex; /** * A base class for Ensembl sequence fetchers * * @author gmcarstairs */ abstract class EnsemblSequenceFetcher extends DbSourceProxyImpl { /* * accepts ENSG/T/E/P with 11 digits * or ENSMUSP or similar for other species * or CCDSnnnnn.nn with at least 3 digits */ private static final Regex ACCESSION_REGEX = new Regex( "(ENS([A-Z]{3}|)[GTEP]{1}[0-9]{11}$)" + "|" + "(CCDS[0-9.]{3,}$)"); /* * possible values for the 'feature' parameter of the /overlap REST service * @see http://rest.ensembl.org/documentation/info/overlap_id */ protected enum EnsemblFeatureType { gene, transcript, cds, exon, repeat, simple, misc, variation, somatic_variation, structural_variation, somatic_structural_variation, constrained, regulatory } @Override public String getDbSource() { // NB ensure Uniprot xrefs are canonicalised from "Ensembl" to "ENSEMBL" return DBRefSource.ENSEMBL; // "ENSEMBL" } @Override public String getDbVersion() { return "0"; } @Override public String getAccessionSeparator() { return " "; } /** * Ensembl accession are ENST + 11 digits for human transcript, ENSG for human * gene. Other species insert 3 letters e.g. ENSMUST..., ENSMUSG... * * @see http://www.ensembl.org/Help/View?id=151 */ @Override public Regex getAccessionValidator() { return ACCESSION_REGEX; } @Override public boolean isValidReference(String accession) { return getAccessionValidator().search(accession); } @Override public int getTier() { return 0; } /** * Default test query is a transcript */ @Override public String getTestQuery() { // has CDS on reverse strand: return "ENST00000288602"; // ENST00000461457 // forward strand } @Override public boolean isDnaCoding() { return true; } }