From: gmungoc Date: Thu, 25 Oct 2018 13:56:01 +0000 (+0100) Subject: JAL-3143 fetch Ensembl(Genomes) features as JSON not GFF X-Git-Tag: Release_2_11_1_0~58^2~5 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=cd268aa5e06c4327489110c59c987e3e071eb038;p=jalview.git JAL-3143 fetch Ensembl(Genomes) features as JSON not GFF --- diff --git a/src/jalview/ext/ensembl/EnsemblCdna.java b/src/jalview/ext/ensembl/EnsemblCdna.java index 7384327..e01ad17 100644 --- a/src/jalview/ext/ensembl/EnsemblCdna.java +++ b/src/jalview/ext/ensembl/EnsemblCdna.java @@ -22,7 +22,6 @@ package jalview.ext.ensembl; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; -import jalview.io.gff.SequenceOntologyFactory; import jalview.io.gff.SequenceOntologyI; import java.util.ArrayList; @@ -127,7 +126,7 @@ public class EnsemblCdna extends EnsemblSeqProxy for (SequenceFeature sf : sfs) { String parentFeature = (String) sf.getValue(PARENT); - if (("transcript:" + accId).equals(parentFeature)) + if (accId.equals(parentFeature)) { result.add(sf); } diff --git a/src/jalview/ext/ensembl/EnsemblCds.java b/src/jalview/ext/ensembl/EnsemblCds.java index 8a71b64..8f13d99 100644 --- a/src/jalview/ext/ensembl/EnsemblCds.java +++ b/src/jalview/ext/ensembl/EnsemblCds.java @@ -116,7 +116,7 @@ public class EnsemblCds extends EnsemblSeqProxy for (SequenceFeature sf : sfs) { String parentFeature = (String) sf.getValue(PARENT); - if (("transcript:" + accId).equals(parentFeature)) + if ( accId.equals(parentFeature)) { result.add(sf); } diff --git a/src/jalview/ext/ensembl/EnsemblFeatures.java b/src/jalview/ext/ensembl/EnsemblFeatures.java index 582eac6..a133381 100644 --- a/src/jalview/ext/ensembl/EnsemblFeatures.java +++ b/src/jalview/ext/ensembl/EnsemblFeatures.java @@ -22,17 +22,24 @@ package jalview.ext.ensembl; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; -import jalview.io.DataSourceType; -import jalview.io.FeaturesFile; -import jalview.io.FileParse; +import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; +import jalview.io.gff.SequenceOntologyI; import java.io.BufferedReader; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; +import java.util.Iterator; import java.util.List; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.json.simple.parser.ParseException; + /** * A client for fetching and processing Ensembl feature data in GFF format by * calling the overlap REST service @@ -91,9 +98,128 @@ class EnsemblFeatures extends EnsemblRestClient { return null; } - FeaturesFile fr = new FeaturesFile( - new FileParse(fp, null, DataSourceType.URL)); - return new Alignment(fr.getSeqsAsArray()); + + SequenceI seq = parseFeaturesJson(fp); + return new Alignment(new SequenceI[] { seq }); + } + + /** + * Parses the JSON response into Jalview sequence features and attaches them + * to a dummy sequence + * + * @param br + * @return + */ + private SequenceI parseFeaturesJson(BufferedReader br) + { + SequenceI seq = new Sequence("Dummy", ""); + + JSONParser jp = new JSONParser(); + try + { + JSONArray responses = (JSONArray) jp.parse(br); + Iterator rvals = responses.iterator(); + while (rvals.hasNext()) + { + try + { + JSONObject obj = (JSONObject) rvals.next(); + String type = obj.get("feature_type").toString(); + int start = Integer.parseInt(obj.get("start").toString()); + int end = Integer.parseInt(obj.get("end").toString()); + String source = obj.get("source").toString(); + String strand = obj.get("strand").toString(); + Object value = obj.get("consequence_type"); + value = obj.get("alleles"); + JSONArray allelesArray = (JSONArray) value; + String alleles = allelesArray == null ? null + : allelesArray.toString(); // todo need as a List? + value = obj.get("clinical_significance"); + JSONArray clinSigArray = (JSONArray) value; + String clinSig = clinSigArray == null ? null + : clinSigArray.toString(); + + /* + * convert 'variation' to 'sequence_variant', and 'cds' to 'CDS' + * so as to have a valid SO term for the feature type + * ('gene', 'exon', 'transcript' don't need any conversion) + */ + if ("variation".equals(type)) + { + type = SequenceOntologyI.SEQUENCE_VARIANT; + } + else if (SequenceOntologyI.CDS.equalsIgnoreCase((type))) + { + type = SequenceOntologyI.CDS; + } + + String desc = getFirstNotNull(obj, "alleles", "external_name", + JSON_ID); + SequenceFeature sf = new SequenceFeature(type, desc, start, end, + source); + sf.setStrand("1".equals(strand) ? "+" : "-"); + setFeatureAttribute(sf, obj, "id"); + setFeatureAttribute(sf, obj, "Parent"); + setFeatureAttribute(sf, obj, "consequence_type"); + sf.setValue("alleles", alleles); + sf.setValue("clinical_significance", clinSig); + + seq.addSequenceFeature(sf); + } catch (Throwable t) + { + // ignore - keep trying other features + } + } + } catch (ParseException | IOException e) + { + // ignore + } + + return seq; + } + + /** + * Returns the first non-null attribute found (if any) as a string + * + * @param obj + * @param keys + * @return + */ + protected String getFirstNotNull(JSONObject obj, String... keys) + { + String desc = null; + + for (String key : keys) + { + Object val = obj.get(key); + if (val != null) + { + String s = val.toString(); + if (!s.isEmpty()) + { + return s; + } + } + } + return desc; + } + + /** + * A helper method that reads the 'key' entry in the JSON object, and if not + * null, sets its string value as an attribute on the sequence feature + * + * @param sf + * @param obj + * @param key + */ + protected void setFeatureAttribute(SequenceFeature sf, JSONObject obj, + String key) + { + Object object = obj.get(key); + if (object != null) + { + sf.setValue(key, object.toString()); + } } /** @@ -109,7 +235,7 @@ class EnsemblFeatures extends EnsemblRestClient urlstring.append(getDomain()).append("/overlap/id/").append(ids.get(0)); // @see https://github.com/Ensembl/ensembl-rest/wiki/Output-formats - urlstring.append("?content-type=text/x-gff3"); + urlstring.append("?content-type=" + getResponseMimeType()); /* * specify object_type=gene in case is shared by transcript and/or protein; @@ -145,16 +271,16 @@ class EnsemblFeatures extends EnsemblRestClient @Override protected String getRequestMimeType() { - return "text/x-gff3"; + return "application/json"; } /** - * Returns the MIME type for GFF3 + * Returns the MIME type wanted for the response */ @Override protected String getResponseMimeType() { - return "text/x-gff3"; + return "application/json"; } /** diff --git a/src/jalview/ext/ensembl/EnsemblGene.java b/src/jalview/ext/ensembl/EnsemblGene.java index 36b19e2..7648536 100644 --- a/src/jalview/ext/ensembl/EnsemblGene.java +++ b/src/jalview/ext/ensembl/EnsemblGene.java @@ -51,8 +51,6 @@ import com.stevesoft.pat.Regex; */ public class EnsemblGene extends EnsemblSeqProxy { - private static final String GENE_PREFIX = "gene:"; - /* * accepts anything as we will attempt lookup of gene or * transcript id or gene name @@ -368,7 +366,7 @@ public class EnsemblGene extends EnsemblSeqProxy * look for exon features of the transcript, failing that for CDS * (for example ENSG00000124610 has 1 CDS but no exon features) */ - String parentId = "transcript:" + accId; + String parentId = accId; List splices = findFeatures(gene, SequenceOntologyI.EXON, parentId); if (splices.isEmpty()) @@ -399,7 +397,7 @@ public class EnsemblGene extends EnsemblSeqProxy * Ensembl has gene name as transcript Name * EnsemblGenomes doesn't, but has a url-encoded description field */ - String description = (String) transcriptFeature.getValue(NAME); + String description = transcriptFeature.getDescription(); if (description == null) { description = (String) transcriptFeature.getValue(DESCRIPTION); @@ -488,7 +486,7 @@ public class EnsemblGene extends EnsemblSeqProxy */ protected String getTranscriptId(SequenceFeature feature) { - return (String) feature.getValue("transcript_id"); + return (String) feature.getValue(JSON_ID); } /** @@ -510,7 +508,7 @@ public class EnsemblGene extends EnsemblSeqProxy { List transcriptFeatures = new ArrayList<>(); - String parentIdentifier = GENE_PREFIX + accId; + String parentIdentifier = accId; List sfs = geneSequence.getFeatures() .getFeaturesByOntology(SequenceOntologyI.TRANSCRIPT); @@ -561,9 +559,8 @@ public class EnsemblGene extends EnsemblSeqProxy .getFeaturesByOntology(SequenceOntologyI.GENE); for (SequenceFeature sf : sfs) { - // NB features as gff use 'ID'; rest services return as 'id' - String id = (String) sf.getValue("ID"); - if ((GENE_PREFIX + accId).equalsIgnoreCase(id)) + String id = (String) sf.getValue(JSON_ID); + if (accId.equalsIgnoreCase(id)) { result.add(sf); } @@ -590,7 +587,7 @@ public class EnsemblGene extends EnsemblSeqProxy if (isTranscript(type)) { String parent = (String) sf.getValue(PARENT); - if (!(GENE_PREFIX + accessionId).equalsIgnoreCase(parent)) + if (!accessionId.equalsIgnoreCase(parent)) { return false; } diff --git a/src/jalview/ext/ensembl/EnsemblGenome.java b/src/jalview/ext/ensembl/EnsemblGenome.java index 6684e20..4f59bc5 100644 --- a/src/jalview/ext/ensembl/EnsemblGenome.java +++ b/src/jalview/ext/ensembl/EnsemblGenome.java @@ -117,9 +117,8 @@ public class EnsemblGenome extends EnsemblSeqProxy SequenceOntologyI.NMD_TRANSCRIPT_VARIANT); for (SequenceFeature sf : sfs) { - // NB features as gff use 'ID'; rest services return as 'id' - String id = (String) sf.getValue("ID"); - if (("transcript:" + accId).equals(id)) + String id = (String) sf.getValue(JSON_ID); + if (accId.equals(id)) { result.add(sf); } diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index 7b448fd..7a37c8a 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -62,8 +62,6 @@ import org.json.simple.parser.ParseException; */ public abstract class EnsemblSeqProxy extends EnsemblRestClient { - protected static final String NAME = "Name"; - protected static final String DESCRIPTION = "description"; /* @@ -867,9 +865,8 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient protected boolean featureMayBelong(SequenceFeature sf, String identifier) { String parent = (String) sf.getValue(PARENT); - // using contains to allow for prefix "gene:", "transcript:" etc if (parent != null - && !parent.toUpperCase().contains(identifier.toUpperCase())) + && !parent.equalsIgnoreCase(identifier)) { // this genomic feature belongs to a different transcript return false; @@ -877,6 +874,9 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient return true; } + /** + * Answers a short description of the sequence fetcher + */ @Override public String getDescription() {