package jalview.ext.ensembl;

import jalview.datamodel.AlignmentI;
import jalview.datamodel.SequenceFeature;

import java.util.Arrays;
import java.util.List;

import com.stevesoft.pat.Regex;

/**
 * A client to fetch protein translated sequence for an Ensembl identifier
 *
 * @author gmcarstairs
 */
public class EnsemblProtein extends EnsemblSeqProxy {

  /*
   * number of digits that terminate an Ensembl-style identifier, as required
   * by ACCESSION_REGEX (e.g. ENSP00000288602)
   */
  private static final int STABLE_ID_DIGITS = 11;

  /*
   * accepts ENSP followed by 11 digits,
   * or ENS + three upper case letters + P + 11 digits (e.g. ENSMUSP...),
   * or CCDS followed by at least 3 characters that are digits or '.'
   */
  private static final Regex ACCESSION_REGEX = new Regex(
      "(ENS([A-Z]{3}|)P[0-9]{11}$)" + "|" + "(CCDS[0-9.]{3,}$)");

  /*
   * names of the databases returned by getCrossReferenceDatabases()
   */
  private static final List<String> CROSSREFS = Arrays.asList(
      "PDB", "Uniprot/SPTREMBL", "Uniprot/SWISSPROT");

  /**
   * Default constructor (to use rest.ensembl.org)
   */
  public EnsemblProtein() {
    super();
  }

  /**
   * Constructor given the target domain to fetch data from
   *
   * @param d
   *          the target domain, passed unchanged to the superclass constructor
   */
  public EnsemblProtein(String d) {
    super(d);
  }

  /**
   * Returns the display name of this data source.
   *
   * @return the fixed string "ENSEMBL (Protein)"
   */
  @Override
  public String getDbName() {
    return "ENSEMBL (Protein)";
  }

  /**
   * Returns the type of sequence this client retrieves.
   *
   * @return {@code EnsemblSeqType.PROTEIN}
   */
  @Override
  protected EnsemblSeqType getSourceEnsemblType() {
    return EnsemblSeqType.PROTEIN;
  }

  /**
   * Returns false, as this fetcher does not retrieve DNA sequences.
   */
  @Override
  public boolean isDnaCoding() {
    return false;
  }

  /**
   * Test query is to the protein translation of transcript ENST00000288602
   */
  @Override
  public String getTestQuery() {
    return "ENSP00000288602";
  }

  /**
   * Overrides base class method to do nothing - genomic features are not
   * applicable to the protein product sequence
   */
  @Override
  protected void addFeaturesAndProduct(String accId, AlignmentI alignment) {
    // deliberately empty
  }

  /**
   * Not applicable - genomic features can't be fetched for a protein sequence.
   *
   * @return always null
   */
  @Override
  protected EnsemblFeatureType[] getFeaturesToFetch() {
    return null;
  }

  /**
   * Not applicable - protein sequence is not a 'subset' of genomic sequence.
   *
   * @return always false
   */
  @Override
  protected boolean identifiesSequence(SequenceFeature sf, String accId) {
    return false;
  }

  /**
   * Returns the names of the databases for which cross-references are wanted.
   *
   * @return a fixed list: PDB, Uniprot/SPTREMBL, Uniprot/SWISSPROT
   */
  @Override
  protected List<String> getCrossReferenceDatabases() {
    return CROSSREFS;
  }

  /**
   * Returns the regular expression used to validate accession ids for this
   * source (Ensembl protein ids or CCDS ids).
   */
  @Override
  public Regex getAccessionValidator() {
    return ACCESSION_REGEX;
  }

  /**
   * Returns an accession id for a query, including conversion of ENST* to
   * ENSP*. This supports querying for the protein sequence for a transcript
   * (ENST identifier) and returning the ENSP identifier.
   * <p>
   * Only identifiers that end in a letter followed by exactly
   * {@value #STABLE_ID_DIGITS} digits are rewritten; anything else (for
   * example a CCDS accession such as CCDS12345.12, or null) is returned as
   * supplied by the superclass.
   *
   * @param query
   *          the query, passed to the superclass to extract the accession id
   * @return the accession id, with its type letter forced to 'P' where
   *         applicable
   */
  @Override
  public String getAccessionIdFromQuery(String query) {
    String accId = super.getAccessionIdFromQuery(query);
    if (accId == null || !hasTypeLetterBeforeDigits(accId)) {
      return accId;
    }

    /*
     * ensure last character before (11) digits is P
     * ENST00000288602 -> ENSP00000288602
     * ENSMUST00000288602 -> ENSMUSP00000288602
     */
    char[] chars = accId.toCharArray();
    chars[chars.length - STABLE_ID_DIGITS - 1] = 'P';
    return new String(chars);
  }

  /**
   * Answers true if the id ends with a letter followed by exactly
   * {@value #STABLE_ID_DIGITS} digits and is not a CCDS accession, i.e. it has
   * a type letter that may safely be replaced.
   */
  private static boolean hasTypeLetterBeforeDigits(String accId) {
    int typePos = accId.length() - STABLE_ID_DIGITS - 1;
    if (typePos < 0 || accId.startsWith("CCDS")
        || !Character.isLetter(accId.charAt(typePos))) {
      return false;
    }
    for (int i = typePos + 1; i < accId.length(); i++) {
      char c = accId.charAt(i);
      if (c < '0' || c > '9') {
        // e.g. the '.' in a versioned CCDS-style accession
        return false;
      }
    }
    return true;
  }
}