X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fext%2Fensembl%2FEnsemblGene.java;h=5b57e5e135135b99ea498947b2934f5cee86c25e;hb=0b8abe58b934e03c34575b06d532a99f0ba70196;hp=bbdab55e316ef518dce4bf86106e8949f019d348;hpb=f9b80711054b61e8c2257488a1637e15616cb9c9;p=jalview.git diff --git a/src/jalview/ext/ensembl/EnsemblGene.java b/src/jalview/ext/ensembl/EnsemblGene.java index bbdab55..5b57e5e 100644 --- a/src/jalview/ext/ensembl/EnsemblGene.java +++ b/src/jalview/ext/ensembl/EnsemblGene.java @@ -1,13 +1,14 @@ package jalview.ext.ensembl; -import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.AlignmentI; +import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.io.gff.SequenceOntology; import jalview.util.MapList; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; /** @@ -58,6 +59,7 @@ public class EnsemblGene extends EnsemblSeqProxy @Override public AlignmentI getSequenceRecords(String query) throws Exception { + // TODO ? if an ENST identifier is supplied, convert to ENSG? AlignmentI al = super.getSequenceRecords(query); if (al.getHeight() > 0) { @@ -68,8 +70,9 @@ public class EnsemblGene extends EnsemblSeqProxy } /** - * Find and fetch all transcripts for the gene, as identified by "transcript" - * features whose Parent is the requested gene + * Constructs all transcripts for the gene, as identified by "transcript" + * features whose Parent is the requested gene. The coding transcript + * sequences (i.e. with introns omitted) are added to the alignment. * * @param al * @param accId @@ -79,90 +82,141 @@ public class EnsemblGene extends EnsemblSeqProxy throws Exception { SequenceI gene = al.getSequenceAt(0); - List transcriptIds = getTranscriptIds(accId, gene); + List transcriptFeatures = getTranscriptFeatures(accId, + gene); - // TODO: could just use features and genomic sequence - // to generate the transcript sequences - faster - // could also grab "Name" as transcript description (gene name) - for (String transcriptId : transcriptIds) + for (SequenceFeature transcriptFeature : transcriptFeatures) { - /* - * fetch and map the transcript sequence; we can pass in the gene - * sequence with features marked to save fetching it again - */ - EnsemblCdna cdnaFetcher = new EnsemblCdna(); - AlignmentI al2 = cdnaFetcher.getSequenceRecords(transcriptId, - gene); - for (SequenceI seq : al2.getSequences()) - { - /* - * build mapping from gene sequence to transcript - */ - MapList mapping = cdnaFetcher.getGenomicRanges(gene, transcriptId, - seq.getStart()); - - /* - * align the transcript to the gene - */ - AlignedCodonFrame acf = new AlignedCodonFrame(); - acf.addMap(gene, seq, mapping); - char gap = al.getGapCharacter(); - // AlignmentUtils.alignSequenceAs(seq, gene, acf, String.valueOf(gap), - // gap, false, false); - - al.addSequence(seq); - } + makeTranscript(transcriptFeature, al, gene); } } /** - * Returns a list of the ids of transcript features on the sequence whose - * Parent is the gene for the accession id + * Constructs a spliced transcript sequence by finding 'exon' features for the + * given id (or failing that 'CDS'). Copies features on to the new sequence. + * 'Aligns' the new sequence against the gene sequence by padding with gaps, + * and adds it to the alignment. * - * @param accId - * @param geneSequence + * @param transcriptFeature + * @param al + * the alignment to which to add the new sequence + * @param gene + * the parent gene sequence, with features * @return */ - protected List getTranscriptIds(String accId, SequenceI geneSequence) + SequenceI makeTranscript(SequenceFeature transcriptFeature, + AlignmentI al, SequenceI gene) { - SequenceOntology so = SequenceOntology.getInstance(); - List transcriptIds = new ArrayList(); + String accId = (String) transcriptFeature.getValue("transcript_id"); + if (accId == null) + { + return null; + } /* - * scan for transcript features belonging to our gene; - * also remove any which belong to other genes + * NB we are mapping from gene sequence (not genome), so do not + * need to check for reverse strand (gene and transcript sequences + * are in forward sense) */ - SequenceFeature[] sfs = geneSequence.getSequenceFeatures(); + + /* + * make a gene-length sequence filled with gaps + * we will fill in the bases for transcript regions + */ + char[] seqChars = new char[gene.getLength()]; + Arrays.fill(seqChars, al.getGapCharacter()); + + /* + * look for exon features of the transcript, failing that for CDS + * (for example ENSG00000124610 has 1 CDS but no exon features) + */ + String parentId = "transcript:" + accId; + List splices = findFeatures(gene, + SequenceOntology.EXON, parentId); + if (splices.isEmpty()) + { + splices = findFeatures(gene, SequenceOntology.CDS, parentId); + } + + int transcriptLength = 0; + final char[] geneChars = gene.getSequence(); + int offset = gene.getStart(); // to convert to 0-based positions + List mappedFrom = new ArrayList(); + + for (SequenceFeature sf : splices) + { + int start = sf.getBegin() - offset; + int end = sf.getEnd() - offset; + int spliceLength = end - start + 1; + System.arraycopy(geneChars, start, seqChars, start, spliceLength); + transcriptLength += spliceLength; + mappedFrom.add(new int[] { sf.getBegin(), sf.getEnd() }); + } + + Sequence transcript = new Sequence(accId, seqChars, 1, transcriptLength); + transcript.createDatasetSequence(); + + al.addSequence(transcript); + + /* + * transfer features to the new sequence; we use EnsemblCdna to do this, + * to filter out unwanted features types (see method retainFeature) + */ + List mapTo = new ArrayList(); + mapTo.add(new int[] { 1, transcriptLength }); + MapList mapping = new MapList(mappedFrom, mapTo, 1, 1); + new EnsemblCdna().transferFeatures(gene.getSequenceFeatures(), + transcript.getDatasetSequence(), mapping, parentId); + + /* + * and finally fetch the protein product and save as a cross-reference + */ + addProteinProduct(transcript); + + return transcript; + } + + /** + * Returns a list of the transcript features on the sequence whose Parent is + * the gene for the accession id. Also removes all transcript features from + * the gene sequence, as we have no further need for them and they obscure + * more useful features on the display. + * + * @param accId + * @param geneSequence + * @return + */ + protected List getTranscriptFeatures(String accId, + SequenceI geneSequence) + { + List transcriptFeatures = new ArrayList(); + List keptFeatures = new ArrayList(); - boolean featureDropped = false; String parentIdentifier = "gene:" + accId; - for (SequenceFeature sf : sfs) + SequenceFeature[] sfs = geneSequence.getSequenceFeatures(); + + if (sfs != null) { - if (so.isA(sf.getType(), SequenceOntology.TRANSCRIPT)) + for (SequenceFeature sf : sfs) { - String parent = (String) sf.getValue(PARENT); - if (parentIdentifier.equals(parent)) + if (isTranscript(sf.getType())) { - transcriptIds.add((String) sf.getValue("transcript_id")); - keptFeatures.add(sf); + String parent = (String) sf.getValue(PARENT); + if (parentIdentifier.equals(parent)) + { + transcriptFeatures.add(sf); + } } else { - featureDropped = true; + keptFeatures.add(sf); } } - else - { - keptFeatures.add(sf); - } - } - if (featureDropped) - { - geneSequence.getDatasetSequence().setSequenceFeatures( - keptFeatures.toArray(new SequenceFeature[keptFeatures - .size()])); } - return transcriptIds; + SequenceFeature[] featuresRetained = keptFeatures.toArray(new SequenceFeature[keptFeatures.size()]); + geneSequence.getDatasetSequence().setSequenceFeatures(featuresRetained); + + return transcriptFeatures; } @Override @@ -177,8 +231,10 @@ public class EnsemblGene extends EnsemblSeqProxy @Override public String getTestQuery() { - return "ENSG00000157764"; // reverse strand - // ENSG00000090266 // forward strand + return "ENSG00000157764"; // BRAF, 5 transcripts, reverse strand + // ENSG00000090266 // NDUFB2, 15 transcripts, forward strand + // ENSG00000101812 // H2BFM histone, 3 transcripts, forward strand + // ENSG00000123569 // H2BFWT histone, 2 transcripts, reverse strand } /** @@ -201,14 +257,41 @@ public class EnsemblGene extends EnsemblSeqProxy } /** - * Answers true unless feature type is 'gene'. We need the gene features to - * identify the range, but it is redundant information on the gene sequence. + * Answers true unless feature type is 'gene', or 'transcript' with a parent + * which is a different gene. We need the gene features to identify the range, + * but it is redundant information on the gene sequence. Checking the parent + * allows us to drop transcript features which belong to different + * (overlapping) genes. */ @Override protected boolean retainFeature(SequenceFeature sf, String accessionId) { - return !SequenceOntology.getInstance().isA(sf.getType(), - SequenceOntology.GENE); + if (SequenceOntology.getInstance().isA(sf.getType(), + SequenceOntology.GENE)) + { + return false; + } + + if (isTranscript(sf.getType())) + { + String parent = (String) sf.getValue(PARENT); + if (!("gene:" + accessionId).equals(parent)) + { + return false; + } + } + return true; + } + + /** + * Answers false. This allows an optimisation - a single 'gene' feature is all + * that is needed to identify the positions of the gene on the genomic + * sequence. + */ + @Override + protected boolean isSpliceable() + { + return false; } }