package jalview.ext.ensembl; import jalview.datamodel.AlignmentI; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.io.gff.SequenceOntology; import jalview.util.MapList; import java.util.ArrayList; import java.util.Arrays; import java.util.List; /** * A class that fetches genomic sequence and all transcripts for an Ensembl gene * * @author gmcarstairs */ public class EnsemblGene extends EnsemblSeqProxy { private static final EnsemblFeatureType[] FEATURES_TO_FETCH = { EnsemblFeatureType.gene, EnsemblFeatureType.transcript, EnsemblFeatureType.exon, EnsemblFeatureType.cds, EnsemblFeatureType.variation }; @Override public String getDbName() { return "ENSEMBL (GENE)"; } @Override protected EnsemblFeatureType[] getFeaturesToFetch() { return FEATURES_TO_FETCH; } @Override protected EnsemblSeqType getSourceEnsemblType() { return EnsemblSeqType.GENOMIC; } /** * Builds an alignment of all transcripts for the requested gene: * */ @Override public AlignmentI getSequenceRecords(String query) throws Exception { // TODO ? if an ENST identifier is supplied, convert to ENSG? AlignmentI al = super.getSequenceRecords(query); if (al.getHeight() > 0) { getTranscripts(al, query); } return al; } /** * Constructs all transcripts for the gene, as identified by "transcript" * features whose Parent is the requested gene. The coding transcript * sequences (i.e. with introns omitted) are added to the alignment. * * @param al * @param accId * @throws Exception */ protected void getTranscripts(AlignmentI al, String accId) throws Exception { SequenceI gene = al.getSequenceAt(0); List transcriptFeatures = getTranscriptFeatures(accId, gene); for (SequenceFeature transcriptFeature : transcriptFeatures) { makeTranscript(transcriptFeature, al, gene); } } /** * Constructs a spliced transcript sequence by finding 'exon' features for the * given id (or failing that 'CDS'). Copies features on to the new sequence. * 'Aligns' the new sequence against the gene sequence by padding with gaps, * and adds it to the alignment. * * @param transcriptFeature * @param al * the alignment to which to add the new sequence * @param gene * the parent gene sequence, with features * @return */ SequenceI makeTranscript(SequenceFeature transcriptFeature, AlignmentI al, SequenceI gene) { String accId = (String) transcriptFeature.getValue("transcript_id"); if (accId == null) { return null; } /* * NB we are mapping from gene sequence (not genome), so do not * need to check for reverse strand (gene and transcript sequences * are in forward sense) */ /* * make a gene-length sequence filled with gaps * we will fill in the bases for transcript regions */ char[] seqChars = new char[gene.getLength()]; Arrays.fill(seqChars, al.getGapCharacter()); /* * look for exon features of the transcript, failing that for CDS * (for example ENSG00000124610 has 1 CDS but no exon features) */ String parentId = "transcript:" + accId; List splices = findFeatures(gene, SequenceOntology.EXON, parentId); if (splices.isEmpty()) { splices = findFeatures(gene, SequenceOntology.CDS, parentId); } int transcriptLength = 0; final char[] geneChars = gene.getSequence(); int offset = gene.getStart(); // to convert to 0-based positions List mappedFrom = new ArrayList(); for (SequenceFeature sf : splices) { int start = sf.getBegin() - offset; int end = sf.getEnd() - offset; int spliceLength = end - start + 1; System.arraycopy(geneChars, start, seqChars, start, spliceLength); transcriptLength += spliceLength; mappedFrom.add(new int[] { sf.getBegin(), sf.getEnd() }); } Sequence transcript = new Sequence(accId, seqChars, 1, transcriptLength); String geneName = (String) transcriptFeature.getValue(NAME); if (geneName != null) { transcript.setDescription(geneName); } transcript.createDatasetSequence(); al.addSequence(transcript); /* * transfer features to the new sequence; we use EnsemblCdna to do this, * to filter out unwanted features types (see method retainFeature) */ List mapTo = new ArrayList(); mapTo.add(new int[] { 1, transcriptLength }); MapList mapping = new MapList(mappedFrom, mapTo, 1, 1); new EnsemblCdna().transferFeatures(gene.getSequenceFeatures(), transcript.getDatasetSequence(), mapping, parentId); /* * and finally fetch the protein product and save as a cross-reference */ addProteinProduct(transcript); return transcript; } /** * Returns a list of the transcript features on the sequence whose Parent is * the gene for the accession id. * * @param accId * @param geneSequence * @return */ protected List getTranscriptFeatures(String accId, SequenceI geneSequence) { List transcriptFeatures = new ArrayList(); String parentIdentifier = "gene:" + accId; SequenceFeature[] sfs = geneSequence.getSequenceFeatures(); if (sfs != null) { for (SequenceFeature sf : sfs) { if (isTranscript(sf.getType())) { String parent = (String) sf.getValue(PARENT); if (parentIdentifier.equals(parent)) { transcriptFeatures.add(sf); } } } } return transcriptFeatures; } @Override public String getDescription() { return "Fetches all transcripts and variant features for a gene"; } /** * Default test query is a transcript */ @Override public String getTestQuery() { return "ENSG00000157764"; // BRAF, 5 transcripts, reverse strand // ENSG00000090266 // NDUFB2, 15 transcripts, forward strand // ENSG00000101812 // H2BFM histone, 3 transcripts, forward strand // ENSG00000123569 // H2BFWT histone, 2 transcripts, reverse strand } /** * Answers true for a feature of type 'gene' (or a sub-type of gene in the * Sequence Ontology), whose ID is the accession we are retrieving */ @Override protected boolean identifiesSequence(SequenceFeature sf, String accId) { if (SequenceOntology.getInstance().isA(sf.getType(), SequenceOntology.GENE)) { String id = (String) sf.getValue(ID); if (("gene:" + accId).equals(id)) { return true; } } return false; } /** * Answers true unless feature type is 'gene', or 'transcript' with a parent * which is a different gene. We need the gene features to identify the range, * but it is redundant information on the gene sequence. Checking the parent * allows us to drop transcript features which belong to different * (overlapping) genes. */ @Override protected boolean retainFeature(SequenceFeature sf, String accessionId) { if (SequenceOntology.getInstance().isA(sf.getType(), SequenceOntology.GENE)) { return false; } if (isTranscript(sf.getType())) { String parent = (String) sf.getValue(PARENT); if (!("gene:" + accessionId).equals(parent)) { return false; } } return true; } /** * Answers false. This allows an optimisation - a single 'gene' feature is all * that is needed to identify the positions of the gene on the genomic * sequence. */ @Override protected boolean isSpliceable() { return false; } }