From 5e1b1391f58f31578c436e5ed4e571b0ceef8c9d Mon Sep 17 00:00:00 2001 From: gmungoc Date: Fri, 22 Sep 2017 11:58:48 +0100 Subject: [PATCH] JAL-2738 lookup GRCh38/37 gene mapping --- src/jalview/ext/ensembl/EnsemblMap.java | 152 ++++++++++++++++++++++++++ src/jalview/io/vcf/VCFLoader.java | 181 +++++++++++++++++++++++++++---- 2 files changed, 312 insertions(+), 21 deletions(-) create mode 100644 src/jalview/ext/ensembl/EnsemblMap.java diff --git a/src/jalview/ext/ensembl/EnsemblMap.java b/src/jalview/ext/ensembl/EnsemblMap.java new file mode 100644 index 0000000..d522ea8 --- /dev/null +++ b/src/jalview/ext/ensembl/EnsemblMap.java @@ -0,0 +1,152 @@ +package jalview.ext.ensembl; + +import jalview.datamodel.AlignmentI; +import jalview.datamodel.DBRefSource; + +import java.io.BufferedReader; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Iterator; +import java.util.List; + +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.json.simple.parser.ParseException; + +public class EnsemblMap extends EnsemblRestClient +{ + + /** + * Default constructor (to use rest.ensembl.org) + */ + public EnsemblMap() + { + super(); + } + + /** + * Constructor given the target domain to fetch data from + * + * @param + */ + public EnsemblMap(String domain) + { + super(domain); + } + + @Override + public String getDbName() + { + return DBRefSource.ENSEMBL; + } + + @Override + public AlignmentI getSequenceRecords(String queries) throws Exception + { + return null; // not used + } + + protected URL getUrl(String species, String chromosome, String fromRef, + String toRef, int startPos, int endPos) + throws MalformedURLException + { + String url = getDomain() + "/map/" + species + "/" + fromRef + "/" + + chromosome + ":" + startPos + ".." + endPos + ":1/" + toRef + + "?content-type=application/json"; + try + { + return new URL(url); + } catch (MalformedURLException e) + { + return null; + } + } + + @Override + protected boolean useGetRequest() + { + return true; + } + + @Override + protected String getRequestMimeType(boolean multipleIds) + { + return "application/json"; + } + + @Override + protected String getResponseMimeType() + { + return "application/json"; + } + + @Override + protected URL getUrl(List ids) throws MalformedURLException + { + return null; // not used + } + + public int[] getMapping(String species, String chromosome, + String fromRef, String toRef, int[] queryRange) + { + URL url = null; + BufferedReader br = null; + + try + { + url = getUrl(species, chromosome, fromRef, toRef, queryRange[0], + queryRange[1]); + br = getHttpResponse(url, null); + return (parseResponse(br)); + } catch (Throwable t) + { + System.out.println("Error calling " + url + ": " + t.getMessage()); + return null; + } + } + + /** + * Parses the JSON response from the /map REST service. The format is (with + * some fields omitted) + * + *
+   *  {"mappings": 
+   *    [{
+   *       "original": {"end":45109016,"start":45051610},
+   *       "mapped"  : {"end":43186384,"start":43128978} 
+   *  }] }
+   * 
+ * + * @param br + * @return + */ + protected int[] parseResponse(BufferedReader br) + { + int[] result = null; + JSONParser jp = new JSONParser(); + + try + { + JSONObject parsed = (JSONObject) jp.parse(br); + JSONArray mappings = (JSONArray) parsed.get("mappings"); + + Iterator rvals = mappings.iterator(); + while (rvals.hasNext()) + { + // todo check for "mapped" + JSONObject val = (JSONObject) rvals.next(); + JSONObject mapped = (JSONObject) val.get("mapped"); + String start = mapped.get("start").toString(); + String end = mapped.get("end").toString(); + result = new int[] { Integer.parseInt(start), Integer.parseInt(end) }; + } + } catch (IOException | ParseException | NumberFormatException e) + { + // ignore + } + return result; + } + +} diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index 0b13143..48f55d4 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -14,17 +14,37 @@ import jalview.datamodel.Mapping; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; +import jalview.ext.ensembl.EnsemblMap; import jalview.ext.htsjdk.VCFReader; import jalview.io.gff.SequenceOntologyI; import jalview.util.MapList; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; +/** + * A class to read VCF data (using the htsjdk) and add variants as sequence + * features on dna and any related protein product sequences + * + * @author gmcarstairs + */ public class VCFLoader { - AlignmentI al; + private static final String EXCL = "!"; + + /* + * the alignment we are associated VCF data with + */ + private AlignmentI al; + + /* + * mappings between VCF and sequence reference assembly regions, as + * key = "species!chromosome!fromAssembly!toAssembly + * value = Map{fromRange, toRange} + */ + private Map> assemblyMappings; /** * Constructor given an alignment context @@ -34,11 +54,17 @@ public class VCFLoader public VCFLoader(AlignmentI alignment) { al = alignment; + + // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange} + assemblyMappings = new HashMap>(); } /** * Loads VCF on to an alignment - provided it can be related to one or more - * sequence's chromosomal coordinates + * sequence's chromosomal coordinates. + *

+ * This method is not thread safe - concurrent threads should use separate + * instances of this class. * * @param filePath */ @@ -128,7 +154,8 @@ public class VCFLoader * @param isVcfRefGrch37 * @return */ - protected int loadVCF(SequenceI seq, VCFReader reader, boolean isVcfRefGrch37) + protected int loadVCF(SequenceI seq, VCFReader reader, + boolean isVcfRefGrch37) { int count = 0; GeneLoci seqCoords = ((Sequence) seq).getGeneLoci(); @@ -169,17 +196,32 @@ public class VCFLoader String chromosome = seqCoords.getChromosome(); String seqRef = seqCoords.getReference(); String species = seqCoords.getSpecies(); - + + // TODO handle species properly + if ("".equals(species)) + { + species = "human"; + } + /* * map chromosomal coordinates from GRCh38 (sequence) to * GRCh37 (VCF) if necessary */ + // TODO generalise for other assemblies and species int offset = 0; - if ("GRCh38".equalsIgnoreCase(seqRef) && isVcfRefGrch37) + String fromRef = "GRCh38"; + if (fromRef.equalsIgnoreCase(seqRef) && isVcfRefGrch37) { + String toRef = "GRCh37"; int[] newRange = mapReferenceRange(range, chromosome, species, - "GRCh38", - "GRCh37"); + fromRef, toRef); + if (newRange == null) + { + System.err.println(String.format( + "Failed to map %s:%s:%s:%d:%d to %s", species, chromosome, + fromRef, range[0], range[1], toRef)); + return 0; + } offset = newRange[0] - range[0]; range = newRange; } @@ -203,12 +245,14 @@ public class VCFLoader int end = variant.getEnd() - offset; /* - * get location in sequence coordinates + * convert chromosomal location to sequence coordinates + * - null if a partially overlapping feature */ int[] seqLocation = mapping.locateInFrom(start, end); - int featureStart = seqLocation[0]; - int featureEnd = seqLocation[1]; - addVariantFeatures(seq, variant, featureStart, featureEnd); + if (seqLocation != null) + { + addVariantFeatures(seq, variant, seqLocation[0], seqLocation[1]); + } } variants.close(); @@ -274,24 +318,119 @@ public class VCFLoader } /** - * Call the Ensembl REST service that maps from one assembly reference's - * coordinates to another's + * Determines the location of the query range (chromosome positions) in a + * different reference assembly. + *

+ * If the range is just a subregion of one for which we already have a mapping + * (for example, an exon sub-region of a gene), then the mapping is just + * computed arithmetically. + *

+ * Otherwise, calls the Ensembl REST service that maps from one assembly + * reference's coordinates to another's * - * @param range + * @param queryRange + * start-end chromosomal range in 'fromRef' coordinates * @param chromosome * @param species * @param fromRef + * assembly reference for the query coordinates * @param toRef - * @return + * assembly reference we wish to translate to + * @return the start-end range in 'toRef' coordinates */ - protected int[] mapReferenceRange(int[] range, String chromosome, + protected int[] mapReferenceRange(int[] queryRange, String chromosome, String species, String fromRef, String toRef) { - // TODO call - // http://rest.ensembl.org/map/species/fromRef/chromosome:range[0]..range[1]:1/toRef?content-type=application/json - // and parse the JSON response + /* + * first try shorcut of computing the mapping as a subregion of one + * we already have (e.g. for an exon, if we have the gene mapping) + */ + int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome, + species, fromRef, toRef); + if (mappedRange != null) + { + return mappedRange; + } + + /* + * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37 + */ + EnsemblMap mapper = new EnsemblMap(); + int[] mapping = mapper.getMapping(species, chromosome, fromRef, toRef, + queryRange); + + if (mapping == null) + { + // mapping service failure + return null; + } - // 1922632 is the difference between 37 and 38 for chromosome 17 - return new int[] { range[0] - 1922632, range[1] - 1922632 }; + /* + * save mapping for possible future re-use + */ + String key = species + EXCL + chromosome + EXCL + fromRef + EXCL + + toRef; + if (!assemblyMappings.containsKey(key)) + { + assemblyMappings.put(key, new HashMap()); + } + + assemblyMappings.get(key).put(queryRange, mapping); + + return mapping; + } + + /** + * If we already have a 1:1 contiguous mapping which subsumes the given query + * range, this method just calculates and returns the subset of that mapping, + * else it returns null. In practical terms, if a gene has a contiguous + * mapping between (for example) GRCh37 and GRCh38, then we assume that its + * subsidiary exons occupy unchanged relative positions, and just compute + * these as offsets, rather than do another lookup of the mapping. + *

+ * If in future these assumptions prove invalid (e.g. for bacterial dna?!), + * simply remove this method or let it always return null. + *

+ * Warning: many rapid calls to the /map service map result in a 429 overload + * error response + * + * @param queryRange + * @param chromosome + * @param species + * @param fromRef + * @param toRef + * @return + */ + protected int[] findSubsumedRangeMapping(int[] queryRange, String chromosome, + String species, String fromRef, String toRef) + { + String key = species + EXCL + chromosome + EXCL + fromRef + EXCL + + toRef; + if (assemblyMappings.containsKey(key)) + { + Map mappedRanges = assemblyMappings.get(key); + for (Entry mappedRange : mappedRanges.entrySet()) + { + int[] fromRange = mappedRange.getKey(); + int[] toRange = mappedRange.getValue(); + if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0]) + { + /* + * mapping is 1:1 in length, so we trust it to have no discontinuities + */ + if (fromRange[0] <= queryRange[0] && fromRange[1] >= queryRange[1]) + { + /* + * fromRange subsumes our query range + */ + int offset = queryRange[0] - fromRange[0]; + int mappedRangeFrom = toRange[0] + offset; + int mappedRangeTo = mappedRangeFrom + (queryRange[1] - queryRange[0]); + return new int[] { mappedRangeFrom, mappedRangeTo }; + } + } + } + } + return null; } } -- 1.7.10.2