From 0b8abe58b934e03c34575b06d532a99f0ba70196 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Tue, 26 Jan 2016 16:36:03 +0000 Subject: [PATCH] JAL-1705 derive transcript sequences from gene sequence/GFF; handle CDS phase != 0 --- src/jalview/datamodel/SequenceFeature.java | 25 ++- src/jalview/ext/ensembl/EnsemblCdna.java | 16 +- src/jalview/ext/ensembl/EnsemblCds.java | 2 +- src/jalview/ext/ensembl/EnsemblGene.java | 221 +++++++++++++++++-------- src/jalview/ext/ensembl/EnsemblGenome.java | 11 +- src/jalview/ext/ensembl/EnsemblProtein.java | 4 +- src/jalview/ext/ensembl/EnsemblSeqProxy.java | 223 ++++++++++++++++++++------ src/jalview/io/FeaturesFile.java | 5 +- src/jalview/io/gff/Gff3Helper.java | 4 +- src/jalview/io/gff/GffHelperBase.java | 2 + 10 files changed, 373 insertions(+), 140 deletions(-) diff --git a/src/jalview/datamodel/SequenceFeature.java b/src/jalview/datamodel/SequenceFeature.java index 6f514f7..252f46c 100755 --- a/src/jalview/datamodel/SequenceFeature.java +++ b/src/jalview/datamodel/SequenceFeature.java @@ -36,6 +36,9 @@ public class SequenceFeature private static final String STRAND = "STRAND"; + // private key for Phase designed not to conflict with real GFF data + private static final String PHASE = "!Phase"; + private static final String ATTRIBUTES = "ATTRIBUTES"; public int begin; @@ -237,7 +240,7 @@ public class SequenceFeature } /** - * Used for getting values which are not in the basic set. eg STRAND, FRAME + * Used for getting values which are not in the basic set. eg STRAND, PHASE * for GFF file * * @param key @@ -355,4 +358,24 @@ public class SequenceFeature setValue(STRAND, strand); } + public void setPhase(String phase) + { + setValue(PHASE, phase); + } + + public String getPhase() + { + return (String) getValue(PHASE); + } + + /** + * Readable representation, for debug only, not guaranteed not to change + * between versions + */ + @Override + public String toString() + { + return String.format("%d %d %s %s", getBegin(), getEnd(), getType(), + getDescription()); + } } diff --git a/src/jalview/ext/ensembl/EnsemblCdna.java b/src/jalview/ext/ensembl/EnsemblCdna.java index e4eb873..0a97425 100644 --- a/src/jalview/ext/ensembl/EnsemblCdna.java +++ b/src/jalview/ext/ensembl/EnsemblCdna.java @@ -45,20 +45,22 @@ public class EnsemblCdna extends EnsemblSeqProxy } /** - * Answers true unless the feature type is 'exon' (or a sub-type of exon in - * the Sequence Ontology). Exon features are only retrieved in order to - * identify the exon sequence loci, and are redundant information on the exon - * sequence itself. + * Answers true unless the feature type is 'transcript' or 'exon' (or a + * sub-type in the Sequence Ontology). These features are only retrieved in + * order to identify the exon sequence loci, and are redundant information on + * the exon sequence itself. */ @Override protected boolean retainFeature(SequenceFeature sf, String accessionId) { - if (SequenceOntology.getInstance().isA(sf.getType(), - SequenceOntology.EXON)) + SequenceOntology so = SequenceOntology.getInstance(); + String type = sf.getType(); + + if (isTranscript(type) || so.isA(type, SequenceOntology.EXON)) { return false; } - return super.retainFeature(sf, accessionId); + return featureMayBelong(sf, accessionId); } /** diff --git a/src/jalview/ext/ensembl/EnsemblCds.java b/src/jalview/ext/ensembl/EnsemblCds.java index 58cf8fa..1f875a7 100644 --- a/src/jalview/ext/ensembl/EnsemblCds.java +++ b/src/jalview/ext/ensembl/EnsemblCds.java @@ -52,7 +52,7 @@ public class EnsemblCds extends EnsemblSeqProxy { return false; } - return super.retainFeature(sf, accessionId); + return featureMayBelong(sf, accessionId); } /** diff --git a/src/jalview/ext/ensembl/EnsemblGene.java b/src/jalview/ext/ensembl/EnsemblGene.java index bbdab55..5b57e5e 100644 --- a/src/jalview/ext/ensembl/EnsemblGene.java +++ b/src/jalview/ext/ensembl/EnsemblGene.java @@ -1,13 +1,14 @@ package jalview.ext.ensembl; -import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.AlignmentI; +import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.io.gff.SequenceOntology; import jalview.util.MapList; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; /** @@ -58,6 +59,7 @@ public class EnsemblGene extends EnsemblSeqProxy @Override public AlignmentI getSequenceRecords(String query) throws Exception { + // TODO ? if an ENST identifier is supplied, convert to ENSG? AlignmentI al = super.getSequenceRecords(query); if (al.getHeight() > 0) { @@ -68,8 +70,9 @@ public class EnsemblGene extends EnsemblSeqProxy } /** - * Find and fetch all transcripts for the gene, as identified by "transcript" - * features whose Parent is the requested gene + * Constructs all transcripts for the gene, as identified by "transcript" + * features whose Parent is the requested gene. The coding transcript + * sequences (i.e. with introns omitted) are added to the alignment. * * @param al * @param accId @@ -79,90 +82,141 @@ public class EnsemblGene extends EnsemblSeqProxy throws Exception { SequenceI gene = al.getSequenceAt(0); - List transcriptIds = getTranscriptIds(accId, gene); + List transcriptFeatures = getTranscriptFeatures(accId, + gene); - // TODO: could just use features and genomic sequence - // to generate the transcript sequences - faster - // could also grab "Name" as transcript description (gene name) - for (String transcriptId : transcriptIds) + for (SequenceFeature transcriptFeature : transcriptFeatures) { - /* - * fetch and map the transcript sequence; we can pass in the gene - * sequence with features marked to save fetching it again - */ - EnsemblCdna cdnaFetcher = new EnsemblCdna(); - AlignmentI al2 = cdnaFetcher.getSequenceRecords(transcriptId, - gene); - for (SequenceI seq : al2.getSequences()) - { - /* - * build mapping from gene sequence to transcript - */ - MapList mapping = cdnaFetcher.getGenomicRanges(gene, transcriptId, - seq.getStart()); - - /* - * align the transcript to the gene - */ - AlignedCodonFrame acf = new AlignedCodonFrame(); - acf.addMap(gene, seq, mapping); - char gap = al.getGapCharacter(); - // AlignmentUtils.alignSequenceAs(seq, gene, acf, String.valueOf(gap), - // gap, false, false); - - al.addSequence(seq); - } + makeTranscript(transcriptFeature, al, gene); } } /** - * Returns a list of the ids of transcript features on the sequence whose - * Parent is the gene for the accession id + * Constructs a spliced transcript sequence by finding 'exon' features for the + * given id (or failing that 'CDS'). Copies features on to the new sequence. + * 'Aligns' the new sequence against the gene sequence by padding with gaps, + * and adds it to the alignment. * - * @param accId - * @param geneSequence + * @param transcriptFeature + * @param al + * the alignment to which to add the new sequence + * @param gene + * the parent gene sequence, with features * @return */ - protected List getTranscriptIds(String accId, SequenceI geneSequence) + SequenceI makeTranscript(SequenceFeature transcriptFeature, + AlignmentI al, SequenceI gene) { - SequenceOntology so = SequenceOntology.getInstance(); - List transcriptIds = new ArrayList(); + String accId = (String) transcriptFeature.getValue("transcript_id"); + if (accId == null) + { + return null; + } /* - * scan for transcript features belonging to our gene; - * also remove any which belong to other genes + * NB we are mapping from gene sequence (not genome), so do not + * need to check for reverse strand (gene and transcript sequences + * are in forward sense) */ - SequenceFeature[] sfs = geneSequence.getSequenceFeatures(); + + /* + * make a gene-length sequence filled with gaps + * we will fill in the bases for transcript regions + */ + char[] seqChars = new char[gene.getLength()]; + Arrays.fill(seqChars, al.getGapCharacter()); + + /* + * look for exon features of the transcript, failing that for CDS + * (for example ENSG00000124610 has 1 CDS but no exon features) + */ + String parentId = "transcript:" + accId; + List splices = findFeatures(gene, + SequenceOntology.EXON, parentId); + if (splices.isEmpty()) + { + splices = findFeatures(gene, SequenceOntology.CDS, parentId); + } + + int transcriptLength = 0; + final char[] geneChars = gene.getSequence(); + int offset = gene.getStart(); // to convert to 0-based positions + List mappedFrom = new ArrayList(); + + for (SequenceFeature sf : splices) + { + int start = sf.getBegin() - offset; + int end = sf.getEnd() - offset; + int spliceLength = end - start + 1; + System.arraycopy(geneChars, start, seqChars, start, spliceLength); + transcriptLength += spliceLength; + mappedFrom.add(new int[] { sf.getBegin(), sf.getEnd() }); + } + + Sequence transcript = new Sequence(accId, seqChars, 1, transcriptLength); + transcript.createDatasetSequence(); + + al.addSequence(transcript); + + /* + * transfer features to the new sequence; we use EnsemblCdna to do this, + * to filter out unwanted features types (see method retainFeature) + */ + List mapTo = new ArrayList(); + mapTo.add(new int[] { 1, transcriptLength }); + MapList mapping = new MapList(mappedFrom, mapTo, 1, 1); + new EnsemblCdna().transferFeatures(gene.getSequenceFeatures(), + transcript.getDatasetSequence(), mapping, parentId); + + /* + * and finally fetch the protein product and save as a cross-reference + */ + addProteinProduct(transcript); + + return transcript; + } + + /** + * Returns a list of the transcript features on the sequence whose Parent is + * the gene for the accession id. Also removes all transcript features from + * the gene sequence, as we have no further need for them and they obscure + * more useful features on the display. + * + * @param accId + * @param geneSequence + * @return + */ + protected List getTranscriptFeatures(String accId, + SequenceI geneSequence) + { + List transcriptFeatures = new ArrayList(); + List keptFeatures = new ArrayList(); - boolean featureDropped = false; String parentIdentifier = "gene:" + accId; - for (SequenceFeature sf : sfs) + SequenceFeature[] sfs = geneSequence.getSequenceFeatures(); + + if (sfs != null) { - if (so.isA(sf.getType(), SequenceOntology.TRANSCRIPT)) + for (SequenceFeature sf : sfs) { - String parent = (String) sf.getValue(PARENT); - if (parentIdentifier.equals(parent)) + if (isTranscript(sf.getType())) { - transcriptIds.add((String) sf.getValue("transcript_id")); - keptFeatures.add(sf); + String parent = (String) sf.getValue(PARENT); + if (parentIdentifier.equals(parent)) + { + transcriptFeatures.add(sf); + } } else { - featureDropped = true; + keptFeatures.add(sf); } } - else - { - keptFeatures.add(sf); - } - } - if (featureDropped) - { - geneSequence.getDatasetSequence().setSequenceFeatures( - keptFeatures.toArray(new SequenceFeature[keptFeatures - .size()])); } - return transcriptIds; + SequenceFeature[] featuresRetained = keptFeatures.toArray(new SequenceFeature[keptFeatures.size()]); + geneSequence.getDatasetSequence().setSequenceFeatures(featuresRetained); + + return transcriptFeatures; } @Override @@ -177,8 +231,10 @@ public class EnsemblGene extends EnsemblSeqProxy @Override public String getTestQuery() { - return "ENSG00000157764"; // reverse strand - // ENSG00000090266 // forward strand + return "ENSG00000157764"; // BRAF, 5 transcripts, reverse strand + // ENSG00000090266 // NDUFB2, 15 transcripts, forward strand + // ENSG00000101812 // H2BFM histone, 3 transcripts, forward strand + // ENSG00000123569 // H2BFWT histone, 2 transcripts, reverse strand } /** @@ -201,14 +257,41 @@ public class EnsemblGene extends EnsemblSeqProxy } /** - * Answers true unless feature type is 'gene'. We need the gene features to - * identify the range, but it is redundant information on the gene sequence. + * Answers true unless feature type is 'gene', or 'transcript' with a parent + * which is a different gene. We need the gene features to identify the range, + * but it is redundant information on the gene sequence. Checking the parent + * allows us to drop transcript features which belong to different + * (overlapping) genes. */ @Override protected boolean retainFeature(SequenceFeature sf, String accessionId) { - return !SequenceOntology.getInstance().isA(sf.getType(), - SequenceOntology.GENE); + if (SequenceOntology.getInstance().isA(sf.getType(), + SequenceOntology.GENE)) + { + return false; + } + + if (isTranscript(sf.getType())) + { + String parent = (String) sf.getValue(PARENT); + if (!("gene:" + accessionId).equals(parent)) + { + return false; + } + } + return true; + } + + /** + * Answers false. This allows an optimisation - a single 'gene' feature is all + * that is needed to identify the positions of the gene on the genomic + * sequence. + */ + @Override + protected boolean isSpliceable() + { + return false; } } diff --git a/src/jalview/ext/ensembl/EnsemblGenome.java b/src/jalview/ext/ensembl/EnsemblGenome.java index b9fbbdf..8238da0 100644 --- a/src/jalview/ext/ensembl/EnsemblGenome.java +++ b/src/jalview/ext/ensembl/EnsemblGenome.java @@ -1,7 +1,6 @@ package jalview.ext.ensembl; import jalview.datamodel.SequenceFeature; -import jalview.io.gff.SequenceOntology; public class EnsemblGenome extends EnsemblSeqProxy { @@ -11,7 +10,7 @@ public class EnsemblGenome extends EnsemblSeqProxy */ private static final EnsemblFeatureType[] FEATURES_TO_FETCH = { EnsemblFeatureType.transcript, EnsemblFeatureType.exon, - EnsemblFeatureType.cds, EnsemblFeatureType.variation }; + EnsemblFeatureType.cds /*, EnsemblFeatureType.variation */}; public EnsemblGenome() { @@ -45,12 +44,11 @@ public class EnsemblGenome extends EnsemblSeqProxy @Override protected boolean retainFeature(SequenceFeature sf, String accessionId) { - if (SequenceOntology.getInstance().isA(sf.getType(), - SequenceOntology.TRANSCRIPT)) + if (isTranscript(sf.getType())) { return false; } - return super.retainFeature(sf, accessionId); + return featureMayBelong(sf, accessionId); } /** @@ -61,8 +59,7 @@ public class EnsemblGenome extends EnsemblSeqProxy @Override protected boolean identifiesSequence(SequenceFeature sf, String accId) { - if (SequenceOntology.getInstance().isA(sf.getType(), - SequenceOntology.TRANSCRIPT)) + if (isTranscript(sf.getType())) { String id = (String) sf.getValue(ID); if (("transcript:" + accId).equals(id)) diff --git a/src/jalview/ext/ensembl/EnsemblProtein.java b/src/jalview/ext/ensembl/EnsemblProtein.java index 5238f98..8a57a16 100644 --- a/src/jalview/ext/ensembl/EnsemblProtein.java +++ b/src/jalview/ext/ensembl/EnsemblProtein.java @@ -2,6 +2,7 @@ package jalview.ext.ensembl; import jalview.datamodel.AlignmentI; import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; public class EnsemblProtein extends EnsemblSeqProxy { @@ -46,7 +47,8 @@ public class EnsemblProtein extends EnsemblSeqProxy * applicable to the protein product sequence */ @Override - protected void addFeaturesAndProduct(String accId, AlignmentI alignment) + protected void addFeaturesAndProduct(String accId, AlignmentI alignment, + SequenceI genomicSequence) { } diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index 7153a9e..d7811b9 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -30,10 +30,18 @@ import java.util.List; */ public abstract class EnsemblSeqProxy extends EnsemblRestClient { + protected static final String CONSEQUENCE_TYPE = "consequence_type"; + protected static final String PARENT = "Parent"; protected static final String ID = "ID"; + /* + * this needs special handling, as it isA sequence_variant in the + * Sequence Ontology, but behaves in Ensembl as if it isA transcript + */ + protected static final String NMD_VARIANT = "NMD_transcript_variant"; + public enum EnsemblSeqType { /** @@ -94,11 +102,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } - /* - * genomic sequence, with features retrieved from the REST overlap service - */ - private SequenceI genomicSequence; - /** * Constructor */ @@ -108,17 +111,20 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient /** * Makes the sequence queries to Ensembl's REST service and returns an - * alignment consisting of the returned sequences + * alignment consisting of the returned sequences. This overloaded method + * allows the genomic sequence (with features) to be passed in if it has + * already been retrieved, to avoid repeat calls to fetch it. */ - @Override - public AlignmentI getSequenceRecords(String query) throws Exception + public AlignmentI getSequenceRecords(String query, + SequenceI genomicSequence) throws Exception { long now = System.currentTimeMillis(); // TODO use a String... query vararg instead? // danger: accession separator used as a regex here, a string elsewhere // in this case it is ok (it is just a space), but (e.g.) '\' would not be - List allIds = Arrays.asList(query.split(getAccessionSeparator())); + List allIds = Arrays.asList(query + .split(getAccessionSeparator())); AlignmentI alignment = null; inProgress = true; @@ -157,7 +163,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ for (String accId : allIds) { - addFeaturesAndProduct(accId, alignment); + addFeaturesAndProduct(accId, alignment, genomicSequence); } inProgress = false; @@ -175,7 +181,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * @param accId * @param alignment */ - protected void addFeaturesAndProduct(String accId, AlignmentI alignment) + protected void addFeaturesAndProduct(String accId, AlignmentI alignment, + SequenceI genomicSequence) { try { @@ -190,14 +197,14 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient features); if (geneFeatures.getHeight() > 0) { - /* - * transfer features to the query sequence - */ genomicSequence = geneFeatures.getSequenceAt(0); } } if (genomicSequence != null) { + /* + * transfer features to the query sequence + */ SequenceI querySeq = alignment.findName(accId); if (transferFeatures(accId, genomicSequence, querySeq)) { @@ -291,6 +298,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * dna here since the retrieved sequence is as transcribed (reverse * complement for reverse strand), i.e in the same sense as the peptide. */ + boolean fivePrimeIncomplete = false; for (SequenceFeature sf : sfs) { /* @@ -298,19 +306,48 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ if (so.isA(sf.getType(), SequenceOntology.CDS)) { - ranges.add(new int[] { sf.getBegin(), sf.getEnd() }); - mappedDnaLength += Math.abs(sf.getEnd() - sf.getBegin()) + 1; + int phase = 0; + try { + phase = Integer.parseInt(sf.getPhase()); + } catch (NumberFormatException e) + { + // ignore + } + /* + * phase > 0 on first codon means 5' incomplete - skip to the start + * of the next codon; example ENST00000496384 + */ + int begin = sf.getBegin(); + int end = sf.getEnd(); + if (ranges.isEmpty() && phase > 0) + { + fivePrimeIncomplete = true; + begin += phase; + if (begin > end) + { + continue; // shouldn't happen? + } + } + ranges.add(new int[] { begin, end }); + mappedDnaLength += Math.abs(end - begin) + 1; } } int proteinLength = proteinSeq.getLength(); List proteinRange = new ArrayList(); - proteinRange.add(new int[] { 1, proteinLength }); + int proteinStart = 1; + if (fivePrimeIncomplete && proteinSeq.getCharAt(0) == 'X') + { + proteinStart = 2; + proteinLength--; + } + proteinRange.add(new int[] { proteinStart, proteinLength }); /* - * dna length should map to protein (or protein minus stop codon) + * dna length should map to protein (or protein plus stop codon) */ - if (mappedDnaLength == 3 * proteinLength - || mappedDnaLength == 3 * (proteinLength + 1)) + int codesForResidues = mappedDnaLength / 3; + if (codesForResidues == proteinLength + || codesForResidues == (proteinLength + 1)) { return new MapList(ranges, proteinRange, 3, 1); } @@ -470,11 +507,10 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /* - * generously size for initial number of cds regions + * generously initial size for number of cds regions * (worst case titin Q8WZ42 has c. 313 exons) */ List regions = new ArrayList(100); - int sourceLength = sourceSequence.getLength(); int mappedLength = 0; int direction = 1; // forward boolean directionSet = false; @@ -512,11 +548,11 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1); - if (mappedLength >= sourceLength) + if (!isSpliceable()) { /* - * break for the case of matching gene features v gene sequence - * - only need to locate the 'gene' feature for accId + * 'gene' sequence is contiguous so we can stop as soon as its + * identifying feature has been found */ break; } @@ -543,6 +579,15 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** + * Answers true if the sequence being retrieved may occupy discontiguous + * regions on the genomic sequence. + */ + protected boolean isSpliceable() + { + return true; + } + + /** * Returns true if the sequence feature marks positions of the genomic * sequence feature which are within the sequence being retrieved. For * example, an 'exon' feature whose parent is the target transcript marks the @@ -556,21 +601,22 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient String accId); /** - * Transfers the sequence feature to the target sequence, adjusting its start - * and end range based on the 'overlap' ranges. Features which do not overlap - * the target sequence are ignored, as are features with a parent other than - * the target sequence id. + * Transfers the sequence feature to the target sequence, locating its start + * and end range based on the mapping. Features which do not overlap the + * target sequence are ignored. * * @param sf * @param targetSequence - * @param overlap + * @param mapping + * mapping from the sequence feature's coordinates to the target + * sequence */ protected void transferFeature(SequenceFeature sf, - SequenceI targetSequence, MapList overlap) + SequenceI targetSequence, MapList mapping) { int start = sf.getBegin(); int end = sf.getEnd(); - int[] mappedRange = overlap.locateInTo(start, end); + int[] mappedRange = mapping.locateInTo(start, end); if (mappedRange != null) { @@ -584,7 +630,7 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient */ if (SequenceOntology.getInstance().isSequenceVariant(sf.getType())) { - String consequence = (String) sf.getValue("consequence_type"); + String consequence = (String) sf.getValue(CONSEQUENCE_TYPE); if (consequence != null) { SequenceFeature sf2 = new SequenceFeature("consequence", @@ -594,7 +640,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } } } - } /** @@ -614,20 +659,37 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); - MapList overlap = getGenomicRanges(sourceSequence, accessionId, + MapList mapping = getGenomicRanges(sourceSequence, accessionId, targetSequence.getStart()); - if (overlap == null) + if (mapping == null) { return false; } - final boolean forwardStrand = overlap.isFromForwardStrand(); + return transferFeatures(sfs, targetSequence, mapping, accessionId); + } + + /** + * Transfer features to the target sequence. The start/end positions are + * converted using the mapping. Features which do not overlap are ignored. + * Features whose parent is not the specified identifier are also ignored. + * + * @param features + * @param targetSequence + * @param mapping + * @param parentId + * @return + */ + protected boolean transferFeatures(SequenceFeature[] features, + SequenceI targetSequence, MapList mapping, String parentId) + { + final boolean forwardStrand = mapping.isFromForwardStrand(); /* * sort features by start position (descending if reverse strand) * before transferring (in forwards order) to the target sequence */ - Arrays.sort(sfs, new Comparator() + Arrays.sort(features, new Comparator() { @Override public int compare(SequenceFeature o1, SequenceFeature o2) @@ -638,11 +700,11 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient }); boolean transferred = false; - for (SequenceFeature sf : sfs) + for (SequenceFeature sf : features) { - if (retainFeature(sf, accessionId)) + if (retainFeature(sf, parentId)) { - transferFeature(sf, targetSequence, overlap); + transferFeature(sf, targetSequence, mapping); transferred = true; } } @@ -650,15 +712,31 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } /** - * Answers true if the feature is one to attach to the retrieved sequence + * Answers true if the feature type is one we want to keep for the sequence. + * Some features are only retrieved in order to identify the sequence range, + * and may then be discarded as redundant information (e.g. "CDS" feature for + * a CDS sequence). + */ + @SuppressWarnings("unused") + protected boolean retainFeature(SequenceFeature sf, String accessionId) + { + return true; // override as required + } + + /** + * Answers true if the feature has a Parent which refers to the given + * accession id, or if the feature has no parent. Answers false if the + * feature's Parent is for a different accession id. * - * @param type + * @param sf + * @param identifier * @return */ - protected boolean retainFeature(SequenceFeature sf, String accessionId) + protected boolean featureMayBelong(SequenceFeature sf, String identifier) { String parent = (String) sf.getValue(PARENT); - if (parent != null && !parent.contains(accessionId)) + // using contains to allow for prefix "gene:", "transcript:" etc + if (parent != null && !parent.contains(identifier)) { // this genomic feature belongs to a different transcript return false; @@ -673,10 +751,57 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient + " sequence with variant features"; } - public AlignmentI getSequenceRecords(String transcriptId, - SequenceI geneSeq) throws Exception + @Override + public AlignmentI getSequenceRecords(String identifier) throws Exception + { + return getSequenceRecords(identifier, null); + } + + /** + * Returns a (possibly empty) list of features on the sequence which have the + * specified sequence ontology type (or a sub-type of it), and the given + * identifier as parent + * + * @param sequence + * @param type + * @param parentId + * @return + */ + protected List findFeatures(SequenceI sequence, + String type, String parentId) + { + List result = new ArrayList(); + + SequenceFeature[] sfs = sequence.getSequenceFeatures(); + if (sfs != null) { + SequenceOntology so = SequenceOntology.getInstance(); + for (SequenceFeature sf :sfs) { + if (so.isA(sf.getType(), type)) + { + String parent = (String) sf.getValue(PARENT); + if (parent.equals(parentId)) + { + result.add(sf); + } + } + } + } + return result; + } + + /** + * Answers true if the feature type is either 'NMD_transcript_variant' or + * 'transcript' or one of its sub-types in the Sequence Ontology. This is + * needed because NMD_transcript_variant behaves like 'transcript' in Ensembl + * although strictly speaking it is not (it is a sub-type of + * sequence_variant). + * + * @param featureType + * @return + */ + public static boolean isTranscript(String featureType) { - this.genomicSequence = geneSeq; - return getSequenceRecords(transcriptId); + return NMD_VARIANT.equals(featureType) + || SequenceOntology.getInstance().isA(featureType, SequenceOntology.TRANSCRIPT); } } diff --git a/src/jalview/io/FeaturesFile.java b/src/jalview/io/FeaturesFile.java index 2d91a08..2dd5f26 100755 --- a/src/jalview/io/FeaturesFile.java +++ b/src/jalview/io/FeaturesFile.java @@ -74,8 +74,6 @@ public class FeaturesFile extends AlignFile implements FeaturesSourceI private static final String NOTE = "Note"; - protected static final String FRAME = "FRAME"; - protected static final String TAB = "\t"; protected static final String GFF_VERSION = "##gff-version"; @@ -1078,7 +1076,8 @@ public class FeaturesFile extends AlignFile implements FeaturesSourceI out.append(strand == 1 ? "+" : (strand == -1 ? "-" : ".")); out.append(TAB); - out.append(sf.getValue(FRAME, ".")); + String phase = sf.getPhase(); + out.append(phase == null ? "." : phase); // miscellaneous key-values (GFF column 9) String attributes = sf.getAttributes(); diff --git a/src/jalview/io/gff/Gff3Helper.java b/src/jalview/io/gff/Gff3Helper.java index 1ffe27cb..4a2a50e 100644 --- a/src/jalview/io/gff/Gff3Helper.java +++ b/src/jalview/io/gff/Gff3Helper.java @@ -5,6 +5,7 @@ import jalview.datamodel.AlignmentI; import jalview.datamodel.MappingType; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; +import jalview.ext.ensembl.EnsemblSeqProxy; import jalview.util.MapList; import jalview.util.StringUtils; @@ -383,8 +384,7 @@ public class Gff3Helper extends GffHelperBase /* * Ensembl returns gene name as 'Name' for a transcript */ - if (SequenceOntology.getInstance().isA(sf.getType(), - SequenceOntology.TRANSCRIPT)) + if (EnsemblSeqProxy.isTranscript(sf.getType())) { desc = StringUtils.listToDelimitedString(attributes.get("Name"), ","); } diff --git a/src/jalview/io/gff/GffHelperBase.java b/src/jalview/io/gff/GffHelperBase.java index d4a5358..499fa7b 100644 --- a/src/jalview/io/gff/GffHelperBase.java +++ b/src/jalview/io/gff/GffHelperBase.java @@ -335,6 +335,8 @@ public abstract class GffHelperBase implements GffHelperI sf.setStrand(gff[STRAND_COL]); + sf.setPhase(gff[PHASE_COL]); + if (attributes != null) { /* -- 1.7.10.2