From f9b80711054b61e8c2257488a1637e15616cb9c9 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Mon, 25 Jan 2016 10:45:02 +0000 Subject: [PATCH] JAL-1705 EnsemblGene added, and related refactoring --- src/jalview/ext/ensembl/EnsemblCdna.java | 11 +- src/jalview/ext/ensembl/EnsemblCds.java | 9 +- src/jalview/ext/ensembl/EnsemblGene.java | 214 ++++++++++++++++++++ src/jalview/ext/ensembl/EnsemblGenome.java | 10 +- src/jalview/ext/ensembl/EnsemblOverlap.java | 11 +- src/jalview/ext/ensembl/EnsemblRestClient.java | 10 +- src/jalview/ext/ensembl/EnsemblSeqProxy.java | 150 ++++++++++---- src/jalview/io/gff/Gff3Helper.java | 9 + src/jalview/io/gff/SequenceOntology.java | 3 + src/jalview/ws/SequenceFetcher.java | 2 + .../jalview/ext/ensembl/EnsemblRestClientTest.java | 6 +- test/jalview/ext/ensembl/EnsemblSeqProxyTest.java | 7 +- 12 files changed, 373 insertions(+), 69 deletions(-) create mode 100644 src/jalview/ext/ensembl/EnsemblGene.java diff --git a/src/jalview/ext/ensembl/EnsemblCdna.java b/src/jalview/ext/ensembl/EnsemblCdna.java index 2ac4956..e4eb873 100644 --- a/src/jalview/ext/ensembl/EnsemblCdna.java +++ b/src/jalview/ext/ensembl/EnsemblCdna.java @@ -47,13 +47,18 @@ public class EnsemblCdna extends EnsemblSeqProxy /** * Answers true unless the feature type is 'exon' (or a sub-type of exon in * the Sequence Ontology). Exon features are only retrieved in order to - * identify the exon sequence range, and are redundant information on the exon + * identify the exon sequence loci, and are redundant information on the exon * sequence itself. */ @Override - protected boolean retainFeature(String type) + protected boolean retainFeature(SequenceFeature sf, String accessionId) { - return !SequenceOntology.getInstance().isA(type, SequenceOntology.EXON); + if (SequenceOntology.getInstance().isA(sf.getType(), + SequenceOntology.EXON)) + { + return false; + } + return super.retainFeature(sf, accessionId); } /** diff --git a/src/jalview/ext/ensembl/EnsemblCds.java b/src/jalview/ext/ensembl/EnsemblCds.java index e366569..58cf8fa 100644 --- a/src/jalview/ext/ensembl/EnsemblCds.java +++ b/src/jalview/ext/ensembl/EnsemblCds.java @@ -45,9 +45,14 @@ public class EnsemblCds extends EnsemblSeqProxy * itself. */ @Override - protected boolean retainFeature(String type) + protected boolean retainFeature(SequenceFeature sf, String accessionId) { - return !SequenceOntology.getInstance().isA(type, SequenceOntology.CDS); + if (SequenceOntology.getInstance().isA(sf.getType(), + SequenceOntology.CDS)) + { + return false; + } + return super.retainFeature(sf, accessionId); } /** diff --git a/src/jalview/ext/ensembl/EnsemblGene.java b/src/jalview/ext/ensembl/EnsemblGene.java new file mode 100644 index 0000000..bbdab55 --- /dev/null +++ b/src/jalview/ext/ensembl/EnsemblGene.java @@ -0,0 +1,214 @@ +package jalview.ext.ensembl; + +import jalview.datamodel.AlignedCodonFrame; +import jalview.datamodel.AlignmentI; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; +import jalview.io.gff.SequenceOntology; +import jalview.util.MapList; + +import java.util.ArrayList; +import java.util.List; + +/** + * A class that fetches genomic sequence and all transcripts for an Ensembl gene + * + * @author gmcarstairs + */ +public class EnsemblGene extends EnsemblSeqProxy +{ + private static final EnsemblFeatureType[] FEATURES_TO_FETCH = { + EnsemblFeatureType.gene, EnsemblFeatureType.transcript, + EnsemblFeatureType.exon, EnsemblFeatureType.cds, + EnsemblFeatureType.variation }; + + @Override + public String getDbName() + { + return "ENSEMBL (GENE)"; + } + + @Override + protected EnsemblFeatureType[] getFeaturesToFetch() + { + return FEATURES_TO_FETCH; + } + + @Override + protected EnsemblSeqType getSourceEnsemblType() + { + return EnsemblSeqType.GENOMIC; + } + + /** + * Builds an alignment of all transcripts for the requested gene: + * + */ + @Override + public AlignmentI getSequenceRecords(String query) throws Exception + { + AlignmentI al = super.getSequenceRecords(query); + if (al.getHeight() > 0) + { + getTranscripts(al, query); + } + + return al; + } + + /** + * Find and fetch all transcripts for the gene, as identified by "transcript" + * features whose Parent is the requested gene + * + * @param al + * @param accId + * @throws Exception + */ + protected void getTranscripts(AlignmentI al, String accId) + throws Exception + { + SequenceI gene = al.getSequenceAt(0); + List transcriptIds = getTranscriptIds(accId, gene); + + // TODO: could just use features and genomic sequence + // to generate the transcript sequences - faster + // could also grab "Name" as transcript description (gene name) + for (String transcriptId : transcriptIds) + { + /* + * fetch and map the transcript sequence; we can pass in the gene + * sequence with features marked to save fetching it again + */ + EnsemblCdna cdnaFetcher = new EnsemblCdna(); + AlignmentI al2 = cdnaFetcher.getSequenceRecords(transcriptId, + gene); + for (SequenceI seq : al2.getSequences()) + { + /* + * build mapping from gene sequence to transcript + */ + MapList mapping = cdnaFetcher.getGenomicRanges(gene, transcriptId, + seq.getStart()); + + /* + * align the transcript to the gene + */ + AlignedCodonFrame acf = new AlignedCodonFrame(); + acf.addMap(gene, seq, mapping); + char gap = al.getGapCharacter(); + // AlignmentUtils.alignSequenceAs(seq, gene, acf, String.valueOf(gap), + // gap, false, false); + + al.addSequence(seq); + } + } + } + + /** + * Returns a list of the ids of transcript features on the sequence whose + * Parent is the gene for the accession id + * + * @param accId + * @param geneSequence + * @return + */ + protected List getTranscriptIds(String accId, SequenceI geneSequence) + { + SequenceOntology so = SequenceOntology.getInstance(); + List transcriptIds = new ArrayList(); + + /* + * scan for transcript features belonging to our gene; + * also remove any which belong to other genes + */ + SequenceFeature[] sfs = geneSequence.getSequenceFeatures(); + List keptFeatures = new ArrayList(); + boolean featureDropped = false; + String parentIdentifier = "gene:" + accId; + for (SequenceFeature sf : sfs) + { + if (so.isA(sf.getType(), SequenceOntology.TRANSCRIPT)) + { + String parent = (String) sf.getValue(PARENT); + if (parentIdentifier.equals(parent)) + { + transcriptIds.add((String) sf.getValue("transcript_id")); + keptFeatures.add(sf); + } + else + { + featureDropped = true; + } + } + else + { + keptFeatures.add(sf); + } + } + if (featureDropped) + { + geneSequence.getDatasetSequence().setSequenceFeatures( + keptFeatures.toArray(new SequenceFeature[keptFeatures + .size()])); + } + return transcriptIds; + } + + @Override + public String getDescription() + { + return "Fetches all transcripts and variant features for a gene"; + } + + /** + * Default test query is a transcript + */ + @Override + public String getTestQuery() + { + return "ENSG00000157764"; // reverse strand + // ENSG00000090266 // forward strand + } + + /** + * Answers true for a feature of type 'gene' (or a sub-type of gene in the + * Sequence Ontology), whose ID is the accession we are retrieving + */ + @Override + protected boolean identifiesSequence(SequenceFeature sf, String accId) + { + if (SequenceOntology.getInstance().isA(sf.getType(), + SequenceOntology.GENE)) + { + String id = (String) sf.getValue(ID); + if (("gene:" + accId).equals(id)) + { + return true; + } + } + return false; + } + + /** + * Answers true unless feature type is 'gene'. We need the gene features to + * identify the range, but it is redundant information on the gene sequence. + */ + @Override + protected boolean retainFeature(SequenceFeature sf, String accessionId) + { + return !SequenceOntology.getInstance().isA(sf.getType(), + SequenceOntology.GENE); + } + +} diff --git a/src/jalview/ext/ensembl/EnsemblGenome.java b/src/jalview/ext/ensembl/EnsemblGenome.java index 60f8c46..b9fbbdf 100644 --- a/src/jalview/ext/ensembl/EnsemblGenome.java +++ b/src/jalview/ext/ensembl/EnsemblGenome.java @@ -43,10 +43,14 @@ public class EnsemblGenome extends EnsemblSeqProxy * redundant information on the transcript sequence itself. */ @Override - protected boolean retainFeature(String type) + protected boolean retainFeature(SequenceFeature sf, String accessionId) { - return !SequenceOntology.getInstance().isA(type, - SequenceOntology.TRANSCRIPT); + if (SequenceOntology.getInstance().isA(sf.getType(), + SequenceOntology.TRANSCRIPT)) + { + return false; + } + return super.retainFeature(sf, accessionId); } /** diff --git a/src/jalview/ext/ensembl/EnsemblOverlap.java b/src/jalview/ext/ensembl/EnsemblOverlap.java index 2ec4664..1663d0d 100644 --- a/src/jalview/ext/ensembl/EnsemblOverlap.java +++ b/src/jalview/ext/ensembl/EnsemblOverlap.java @@ -83,7 +83,7 @@ public class EnsemblOverlap extends EnsemblRestClient } @Override - public boolean useGetRequest() + protected boolean useGetRequest() { return true; } @@ -93,7 +93,7 @@ public class EnsemblOverlap extends EnsemblRestClient * describes the required encoding of the response. */ @Override - public String getRequestMimeType() + protected String getRequestMimeType() { return "text/x-gff3"; } @@ -102,7 +102,7 @@ public class EnsemblOverlap extends EnsemblRestClient * Returns the MIME type for GFF3. */ @Override - public String getResponseMimeType() + protected String getResponseMimeType() { return "text/x-gff3"; } @@ -116,9 +116,8 @@ public class EnsemblOverlap extends EnsemblRestClient * @return * @throws IOException */ - public AlignmentI getSequenceRecords(String accId, - EnsemblFeatureType[] features) - throws IOException + protected AlignmentI getSequenceRecords(String accId, + EnsemblFeatureType[] features) throws IOException { featuresWanted = features; return getSequenceRecords(accId); diff --git a/src/jalview/ext/ensembl/EnsemblRestClient.java b/src/jalview/ext/ensembl/EnsemblRestClient.java index 52993e9..02b13ef 100644 --- a/src/jalview/ext/ensembl/EnsemblRestClient.java +++ b/src/jalview/ext/ensembl/EnsemblRestClient.java @@ -64,7 +64,7 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher * * @return */ - public abstract boolean useGetRequest(); + protected abstract boolean useGetRequest(); /** * Return the desired value for the Content-Type request header @@ -72,7 +72,7 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher * @return * @see https://github.com/Ensembl/ensembl-rest/wiki/HTTP-Headers */ - public abstract String getRequestMimeType(); + protected abstract String getRequestMimeType(); /** * Return the desired value for the Accept request header @@ -80,7 +80,7 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher * @return * @see https://github.com/Ensembl/ensembl-rest/wiki/HTTP-Headers */ - public abstract String getResponseMimeType(); + protected abstract String getResponseMimeType(); /** * Tries to connect to Ensembl's REST 'ping' endpoint, and returns true if @@ -115,7 +115,7 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher * @return * @throws IOException */ - public FileParse getSequenceReader(List ids) + protected FileParse getSequenceReader(List ids) throws IOException { URL url = getUrl(ids); @@ -163,7 +163,7 @@ abstract class EnsemblRestClient extends EnsemblSequenceFetcher * * @return */ - public boolean isEnsemblAvailable() + protected boolean isEnsemblAvailable() { long now = System.currentTimeMillis(); boolean retest = now - lastCheck > RETEST_INTERVAL; diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index b805417..7153a9e 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -92,7 +92,12 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient return (forwards ? 1 : -1) * Integer.compare(o1[0], o2[0]); } - }; + } + + /* + * genomic sequence, with features retrieved from the REST overlap service + */ + private SequenceI genomicSequence; /** * Constructor @@ -177,24 +182,32 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient /* * get 'dummy' genomic sequence with exon, cds and variation features */ - EnsemblOverlap gffFetcher = new EnsemblOverlap(); - EnsemblFeatureType[] features = getFeaturesToFetch(); - AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId, - features); - if (geneFeatures.getHeight() > 0) + if (genomicSequence == null) + { + EnsemblOverlap gffFetcher = new EnsemblOverlap(); + EnsemblFeatureType[] features = getFeaturesToFetch(); + AlignmentI geneFeatures = gffFetcher.getSequenceRecords(accId, + features); + if (geneFeatures.getHeight() > 0) + { + /* + * transfer features to the query sequence + */ + genomicSequence = geneFeatures.getSequenceAt(0); + } + } + if (genomicSequence != null) { - /* - * transfer features to the query sequence - */ - SequenceI genomicSequence = geneFeatures.getSequenceAt(0); SequenceI querySeq = alignment.findName(accId); - transferFeatures(accId, genomicSequence, querySeq); + if (transferFeatures(accId, genomicSequence, querySeq)) + { - /* - * fetch and map protein product, and add it as a cross-reference - * of the retrieved sequence - */ - addProteinProduct(querySeq); + /* + * fetch and map protein product, and add it as a cross-reference + * of the retrieved sequence + */ + addProteinProduct(querySeq); + } } } catch (IOException e) { @@ -398,19 +411,19 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } @Override - public boolean useGetRequest() + protected boolean useGetRequest() { return false; } @Override - public String getRequestMimeType() + protected String getRequestMimeType() { return "application/json"; } @Override - public String getResponseMimeType() + protected String getResponseMimeType() { return "text/x-fasta"; } @@ -441,17 +454,27 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * backwards (for negative strand). Aborts and returns null if both positive * and negative strand are found (this should not normally happen). * - * @param sfs + * @param sourceSequence * @param accId + * @param start + * the start position of the sequence we are mapping to * @return */ - protected MapList getGenomicRanges(SequenceFeature[] sfs, String accId) + protected MapList getGenomicRanges(SequenceI sourceSequence, + String accId, int start) { + SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); + if (sfs == null) + { + return null; + } + /* * generously size for initial number of cds regions * (worst case titin Q8WZ42 has c. 313 exons) */ List regions = new ArrayList(100); + int sourceLength = sourceSequence.getLength(); int mappedLength = 0; int direction = 1; // forward boolean directionSet = false; @@ -485,12 +508,28 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } else { - regions.add(new int[] { sf.getBegin(), sf.getEnd() }); - } - mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1); + regions.add(new int[] { sf.getBegin(), sf.getEnd() }); } + mappedLength += Math.abs(sf.getEnd() - sf.getBegin() + 1); + + if (mappedLength >= sourceLength) + { + /* + * break for the case of matching gene features v gene sequence + * - only need to locate the 'gene' feature for accId + */ + break; + } + } } + if (regions.isEmpty()) + { + System.out.println("Failed to identify target sequence for " + accId + + " from genomic features"); + return null; + } + /* * a final sort is needed since Ensembl returns CDS sorted within source * (havana / ensembl_havana) @@ -498,14 +537,16 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient Collections.sort(regions, new RangeSorter(direction == 1)); List to = new ArrayList(); - to.add(new int[] { 1, mappedLength }); + to.add(new int[] { start, start + mappedLength - 1 }); return new MapList(regions, to, 1, 1); } /** - * Returns true if the sequence feature identifies positions of the genomic - * sequence feature which are within the sequence being retrieved. + * Returns true if the sequence feature marks positions of the genomic + * sequence feature which are within the sequence being retrieved. For + * example, an 'exon' feature whose parent is the target transcript marks the + * cdna positions of the transcript. * * @param sf * @param accId @@ -527,13 +568,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient protected void transferFeature(SequenceFeature sf, SequenceI targetSequence, MapList overlap) { - String parent = (String) sf.getValue(PARENT); - if (parent != null && !parent.contains(targetSequence.getName())) - { - // this genomic feature belongs to a different transcript - return; - } - int start = sf.getBegin(); int end = sf.getEnd(); int[] mappedRange = overlap.locateInTo(start, end); @@ -541,9 +575,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient if (mappedRange != null) { SequenceFeature copy = new SequenceFeature(sf); - int offset = targetSequence.getStart() - 1; - copy.setBegin(offset + Math.min(mappedRange[0], mappedRange[1])); - copy.setEnd(offset + Math.max(mappedRange[0], mappedRange[1])); + copy.setBegin(Math.min(mappedRange[0], mappedRange[1])); + copy.setEnd(Math.max(mappedRange[0], mappedRange[1])); targetSequence.addSequenceFeature(copy); /* @@ -570,17 +603,23 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient * @param accessionId * @param sourceSequence * @param targetSequence + * @return true if any features were transferred, else false */ - protected void transferFeatures(String accessionId, + protected boolean transferFeatures(String accessionId, SequenceI sourceSequence, SequenceI targetSequence) { if (sourceSequence == null || targetSequence == null) { - return; + return false; } SequenceFeature[] sfs = sourceSequence.getSequenceFeatures(); - MapList overlap = getGenomicRanges(sfs, accessionId); + MapList overlap = getGenomicRanges(sourceSequence, accessionId, + targetSequence.getStart()); + if (overlap == null) + { + return false; + } final boolean forwardStrand = overlap.isFromForwardStrand(); @@ -598,23 +637,46 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient } }); + boolean transferred = false; for (SequenceFeature sf : sfs) { - if (retainFeature(sf.getType())) + if (retainFeature(sf, accessionId)) { transferFeature(sf, targetSequence, overlap); + transferred = true; } } + return transferred; } /** - * Answers true if the feature type is one to attach to the retrieved sequence + * Answers true if the feature is one to attach to the retrieved sequence * * @param type * @return */ - protected boolean retainFeature(@SuppressWarnings("unused") String type) + protected boolean retainFeature(SequenceFeature sf, String accessionId) + { + String parent = (String) sf.getValue(PARENT); + if (parent != null && !parent.contains(accessionId)) + { + // this genomic feature belongs to a different transcript + return false; + } + return true; + } + + @Override + public String getDescription() + { + return "Ensembl " + getSourceEnsemblType().getType() + + " sequence with variant features"; + } + + public AlignmentI getSequenceRecords(String transcriptId, + SequenceI geneSeq) throws Exception { - return true; // default is to keep all + this.genomicSequence = geneSeq; + return getSequenceRecords(transcriptId); } } diff --git a/src/jalview/io/gff/Gff3Helper.java b/src/jalview/io/gff/Gff3Helper.java index d878f64..1ffe27cb 100644 --- a/src/jalview/io/gff/Gff3Helper.java +++ b/src/jalview/io/gff/Gff3Helper.java @@ -379,6 +379,15 @@ public class Gff3Helper extends GffHelperBase desc = StringUtils.listToDelimitedString( attributes.get("alleles"), ","); } + + /* + * Ensembl returns gene name as 'Name' for a transcript + */ + if (SequenceOntology.getInstance().isA(sf.getType(), + SequenceOntology.TRANSCRIPT)) + { + desc = StringUtils.listToDelimitedString(attributes.get("Name"), ","); + } return desc; } } diff --git a/src/jalview/io/gff/SequenceOntology.java b/src/jalview/io/gff/SequenceOntology.java index 441d296..685b83e 100644 --- a/src/jalview/io/gff/SequenceOntology.java +++ b/src/jalview/io/gff/SequenceOntology.java @@ -43,6 +43,9 @@ public class SequenceOntology // SO:0000673 public static final String TRANSCRIPT = "transcript"; + // SO:0000704 + public static final String GENE = "gene"; + /* * singleton instance of this class */ diff --git a/src/jalview/ws/SequenceFetcher.java b/src/jalview/ws/SequenceFetcher.java index a5300db..909f515 100644 --- a/src/jalview/ws/SequenceFetcher.java +++ b/src/jalview/ws/SequenceFetcher.java @@ -22,6 +22,7 @@ package jalview.ws; import jalview.ext.ensembl.EnsemblCdna; import jalview.ext.ensembl.EnsemblCds; +import jalview.ext.ensembl.EnsemblGene; import jalview.ext.ensembl.EnsemblGenome; import jalview.ext.ensembl.EnsemblProtein; import jalview.ws.dbsources.EmblCdsSource; @@ -64,6 +65,7 @@ public class SequenceFetcher extends ASequenceFetcher addDBRefSourceImpl(EnsemblProtein.class); addDBRefSourceImpl(EnsemblCds.class); addDBRefSourceImpl(EnsemblGenome.class); + addDBRefSourceImpl(EnsemblGene.class); addDBRefSourceImpl(EnsemblCdna.class); addDBRefSourceImpl(EmblSource.class); addDBRefSourceImpl(EmblCdsSource.class); diff --git a/test/jalview/ext/ensembl/EnsemblRestClientTest.java b/test/jalview/ext/ensembl/EnsemblRestClientTest.java index 086adbb..c7b3397 100644 --- a/test/jalview/ext/ensembl/EnsemblRestClientTest.java +++ b/test/jalview/ext/ensembl/EnsemblRestClientTest.java @@ -36,19 +36,19 @@ public class EnsemblRestClientTest } @Override - public boolean useGetRequest() + protected boolean useGetRequest() { return false; } @Override - public String getRequestMimeType() + protected String getRequestMimeType() { return null; } @Override - public String getResponseMimeType() + protected String getResponseMimeType() { return null; } diff --git a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java index 3ca74b0..a06465f 100644 --- a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java +++ b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java @@ -182,21 +182,21 @@ public class EnsemblSeqProxyTest } @Override - public boolean useGetRequest() + protected boolean useGetRequest() { // TODO Auto-generated method stub return false; } @Override - public String getRequestMimeType() + protected String getRequestMimeType() { // TODO Auto-generated method stub return null; } @Override - public String getResponseMimeType() + protected String getResponseMimeType() { // TODO Auto-generated method stub return null; @@ -208,4 +208,5 @@ public class EnsemblSeqProxyTest + (isAvailable ? "UP!" : "DOWN or unreachable ******************* BAD!")); } + // todo lots of tests } \ No newline at end of file -- 1.7.10.2