package jalview.ext.ensembl; import jalview.api.FeatureColourI; import jalview.api.FeatureSettingsI; import jalview.datamodel.AlignmentI; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.io.gff.SequenceOntologyFactory; import jalview.io.gff.SequenceOntologyI; import jalview.schemes.FeatureColourAdapter; import jalview.schemes.FeatureSettingsAdapter; import jalview.util.MapList; import jalview.util.StringUtils; import java.awt.Color; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import com.stevesoft.pat.Regex; /** * A class that fetches genomic sequence and all transcripts for an Ensembl gene * * @author gmcarstairs */ public class EnsemblGene extends EnsemblSeqProxy { private static final String GENE_PREFIX = "gene:"; /* * accepts anything as we will attempt lookup of gene or * transcript id or gene name */ private static final Regex ACCESSION_REGEX = new Regex(".*"); private static final EnsemblFeatureType[] FEATURES_TO_FETCH = { EnsemblFeatureType.gene, EnsemblFeatureType.transcript, EnsemblFeatureType.exon, EnsemblFeatureType.cds, EnsemblFeatureType.variation }; @Override public String getDbName() { return "ENSEMBL (GENE)"; } @Override protected EnsemblFeatureType[] getFeaturesToFetch() { return FEATURES_TO_FETCH; } @Override protected EnsemblSeqType getSourceEnsemblType() { return EnsemblSeqType.GENOMIC; } /** * Returns an alignment containing the gene(s) for the given gene or * transcript identifier, or external identifier (e.g. Uniprot id). If given a * gene name or external identifier, returns any related gene sequences found * for model organisms. If only a single gene is queried for, then its * transcripts are also retrieved and added to the alignment.
* Method: * * * @param query * one or more identifiers separated by a space * @return an alignment containing one or more genes, and possibly * transcripts, or null */ @Override public AlignmentI getSequenceRecords(String query) throws Exception { // todo: tidy up handling of one or multiple accession ids String[] queries = query.split(getAccessionSeparator()); /* * if given a transcript id, look up its gene parent */ if (isTranscriptIdentifier(query)) { // we are assuming all transcripts have the same gene parent here query = new EnsemblLookup().getParent(queries[0]); if (query == null) { return null; } } /* * if given a gene or other external name, lookup and fetch * the corresponding gene for all model organisms */ if (!isGeneIdentifier(query)) { List geneIds = new EnsemblSymbol().getIds(query); if (geneIds.isEmpty()) { return null; } String theIds = StringUtils.listToDelimitedString(geneIds, getAccessionSeparator()); return getSequenceRecords(theIds); } /* * fetch the gene sequence(s) with features and xrefs */ AlignmentI al = super.getSequenceRecords(query); /* * if we retrieved a single gene, get its transcripts as well */ if (al.getHeight() == 1) { getTranscripts(al, query); } return al; } /** * Attempts to get Ensembl stable identifiers for model organisms for a gene * name by calling the xrefs symbol REST service to resolve the gene name. * * @param query * @return */ protected String getGeneIdentifiersForName(String query) { List ids = new EnsemblSymbol().getIds(query); if (ids != null) { for (String id : ids) { if (isGeneIdentifier(id)) { return id; } } } return null; } /** * Constructs all transcripts for the gene, as identified by "transcript" * features whose Parent is the requested gene. The coding transcript * sequences (i.e. with introns omitted) are added to the alignment. * * @param al * @param accId * @throws Exception */ protected void getTranscripts(AlignmentI al, String accId) throws Exception { SequenceI gene = al.getSequenceAt(0); List transcriptFeatures = getTranscriptFeatures(accId, gene); for (SequenceFeature transcriptFeature : transcriptFeatures) { makeTranscript(transcriptFeature, al, gene); } clearGeneFeatures(gene); } /** * Remove unwanted features (transcript, exon, CDS) from the gene sequence * after we have used them to derive transcripts and transfer features * * @param gene */ protected void clearGeneFeatures(SequenceI gene) { SequenceFeature[] sfs = gene.getSequenceFeatures(); if (sfs != null) { SequenceOntologyI so = SequenceOntologyFactory.getInstance(); List filtered = new ArrayList(); for (SequenceFeature sf : sfs) { String type = sf.getType(); if (!isTranscript(type) && !so.isA(type, SequenceOntologyI.EXON) && !so.isA(type, SequenceOntologyI.CDS)) { filtered.add(sf); } } gene.setSequenceFeatures(filtered .toArray(new SequenceFeature[filtered .size()])); } } /** * Constructs a spliced transcript sequence by finding 'exon' features for the * given id (or failing that 'CDS'). Copies features on to the new sequence. * 'Aligns' the new sequence against the gene sequence by padding with gaps, * and adds it to the alignment. * * @param transcriptFeature * @param al * the alignment to which to add the new sequence * @param gene * the parent gene sequence, with features * @return */ SequenceI makeTranscript(SequenceFeature transcriptFeature, AlignmentI al, SequenceI gene) { String accId = getTranscriptId(transcriptFeature); if (accId == null) { return null; } /* * NB we are mapping from gene sequence (not genome), so do not * need to check for reverse strand (gene and transcript sequences * are in forward sense) */ /* * make a gene-length sequence filled with gaps * we will fill in the bases for transcript regions */ char[] seqChars = new char[gene.getLength()]; Arrays.fill(seqChars, al.getGapCharacter()); /* * look for exon features of the transcript, failing that for CDS * (for example ENSG00000124610 has 1 CDS but no exon features) */ String parentId = "transcript:" + accId; List splices = findFeatures(gene, SequenceOntologyI.EXON, parentId); if (splices.isEmpty()) { splices = findFeatures(gene, SequenceOntologyI.CDS, parentId); } int transcriptLength = 0; final char[] geneChars = gene.getSequence(); int offset = gene.getStart(); // to convert to 0-based positions List mappedFrom = new ArrayList(); for (SequenceFeature sf : splices) { int start = sf.getBegin() - offset; int end = sf.getEnd() - offset; int spliceLength = end - start + 1; System.arraycopy(geneChars, start, seqChars, start, spliceLength); transcriptLength += spliceLength; mappedFrom.add(new int[] { sf.getBegin(), sf.getEnd() }); } Sequence transcript = new Sequence(accId, seqChars, 1, transcriptLength); String geneName = (String) transcriptFeature.getValue(NAME); if (geneName != null) { transcript.setDescription(geneName); } transcript.createDatasetSequence(); al.addSequence(transcript); /* * transfer features to the new sequence; we use EnsemblCdna to do this, * to filter out unwanted features types (see method retainFeature) */ List mapTo = new ArrayList(); mapTo.add(new int[] { 1, transcriptLength }); MapList mapping = new MapList(mappedFrom, mapTo, 1, 1); new EnsemblCdna().transferFeatures(gene.getSequenceFeatures(), transcript.getDatasetSequence(), mapping, parentId); /* * fetch and save cross-references */ super.getCrossReferences(transcript); /* * and finally fetch the protein product and save as a cross-reference */ new EnsemblCdna().addProteinProduct(transcript); return transcript; } /** * Returns the 'transcript_id' property of the sequence feature (or null) * * @param feature * @return */ protected String getTranscriptId(SequenceFeature feature) { return (String) feature.getValue("transcript_id"); } /** * Returns a list of the transcript features on the sequence whose Parent is * the gene for the accession id. * * @param accId * @param geneSequence * @return */ protected List getTranscriptFeatures(String accId, SequenceI geneSequence) { List transcriptFeatures = new ArrayList(); String parentIdentifier = GENE_PREFIX + accId; SequenceFeature[] sfs = geneSequence.getSequenceFeatures(); if (sfs != null) { for (SequenceFeature sf : sfs) { if (isTranscript(sf.getType())) { String parent = (String) sf.getValue(PARENT); if (parentIdentifier.equals(parent)) { transcriptFeatures.add(sf); } } } } return transcriptFeatures; } @Override public String getDescription() { return "Fetches all transcripts and variant features for a gene or transcript"; } /** * Default test query is a gene id (can also enter a transcript id) */ @Override public String getTestQuery() { return "ENSG00000157764"; // BRAF, 5 transcripts, reverse strand // ENSG00000090266 // NDUFB2, 15 transcripts, forward strand // ENSG00000101812 // H2BFM histone, 3 transcripts, forward strand // ENSG00000123569 // H2BFWT histone, 2 transcripts, reverse strand } /** * Answers true for a feature of type 'gene' (or a sub-type of gene in the * Sequence Ontology), whose ID is the accession we are retrieving */ @Override protected boolean identifiesSequence(SequenceFeature sf, String accId) { if (SequenceOntologyFactory.getInstance().isA(sf.getType(), SequenceOntologyI.GENE)) { String id = (String) sf.getValue(ID); if ((GENE_PREFIX + accId).equals(id)) { return true; } } return false; } /** * Answers true unless feature type is 'gene', or 'transcript' with a parent * which is a different gene. We need the gene features to identify the range, * but it is redundant information on the gene sequence. Checking the parent * allows us to drop transcript features which belong to different * (overlapping) genes. */ @Override protected boolean retainFeature(SequenceFeature sf, String accessionId) { SequenceOntologyI so = SequenceOntologyFactory.getInstance(); String type = sf.getType(); if (so.isA(type, SequenceOntologyI.GENE)) { return false; } if (isTranscript(type)) { String parent = (String) sf.getValue(PARENT); if (!(GENE_PREFIX + accessionId).equals(parent)) { return false; } } return true; } /** * Answers false. This allows an optimisation - a single 'gene' feature is all * that is needed to identify the positions of the gene on the genomic * sequence. */ @Override protected boolean isSpliceable() { return false; } @Override protected List getCrossReferenceDatabases() { // found these for ENSG00000157764 on 30/01/2016: // return new String[] {"Vega_gene", "OTTG", "ENS_LRG_gene", "ArrayExpress", // "EntrezGene", "HGNC", "MIM_GENE", "MIM_MORBID", "WikiGene"}; return super.getCrossReferenceDatabases(); } /** * Override to do nothing as Ensembl doesn't return a protein sequence for a * gene identifier */ @Override protected void addProteinProduct(SequenceI querySeq) { } @Override public Regex getAccessionValidator() { return ACCESSION_REGEX; } @Override public FeatureSettingsI getFeatureColourScheme() { return new FeatureSettingsAdapter() { SequenceOntologyI so = SequenceOntologyFactory.getInstance(); @Override public boolean isFeatureDisplayed(String type) { return (so.isA(type, SequenceOntologyI.EXON) || so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT)); } @Override public FeatureColourI getFeatureColour(String type) { if (so.isA(type, SequenceOntologyI.EXON)) { return new FeatureColourAdapter() { @Override public boolean isColourByLabel() { return true; } }; } if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT)) { return new FeatureColourAdapter() { @Override public Color getColour() { return Color.RED; } }; } return null; } /** * order to render sequence_variant after exon after the rest */ @Override public int compare(String feature1, String feature2) { if (so.isA(feature1, SequenceOntologyI.SEQUENCE_VARIANT)) { return +1; } if (so.isA(feature2, SequenceOntologyI.SEQUENCE_VARIANT)) { return -1; } if (so.isA(feature1, SequenceOntologyI.EXON)) { return +1; } if (so.isA(feature2, SequenceOntologyI.EXON)) { return -1; } return 0; } }; } }