--- /dev/null
+package jalview.ext.ensembl;
+
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.DBRefSource;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Iterator;
+import java.util.List;
+
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.json.simple.parser.ParseException;
+
+public class EnsemblMap extends EnsemblRestClient
+{
+
+ /**
+ * Default constructor (to use rest.ensembl.org)
+ */
+ public EnsemblMap()
+ {
+ super();
+ }
+
+ /**
+ * Constructor given the target domain to fetch data from
+ *
+ * @param
+ */
+ public EnsemblMap(String domain)
+ {
+ super(domain);
+ }
+
+ @Override
+ public String getDbName()
+ {
+ return DBRefSource.ENSEMBL;
+ }
+
+ @Override
+ public AlignmentI getSequenceRecords(String queries) throws Exception
+ {
+ return null; // not used
+ }
+
+ protected URL getUrl(String species, String chromosome, String fromRef,
+ String toRef, int startPos, int endPos)
+ throws MalformedURLException
+ {
+ String url = getDomain() + "/map/" + species + "/" + fromRef + "/"
+ + chromosome + ":" + startPos + ".." + endPos + ":1/" + toRef
+ + "?content-type=application/json";
+ try
+ {
+ return new URL(url);
+ } catch (MalformedURLException e)
+ {
+ return null;
+ }
+ }
+
+ @Override
+ protected boolean useGetRequest()
+ {
+ return true;
+ }
+
+ @Override
+ protected String getRequestMimeType(boolean multipleIds)
+ {
+ return "application/json";
+ }
+
+ @Override
+ protected String getResponseMimeType()
+ {
+ return "application/json";
+ }
+
+ @Override
+ protected URL getUrl(List<String> ids) throws MalformedURLException
+ {
+ return null; // not used
+ }
+
+ public int[] getMapping(String species, String chromosome,
+ String fromRef, String toRef, int[] queryRange)
+ {
+ URL url = null;
+ BufferedReader br = null;
+
+ try
+ {
+ url = getUrl(species, chromosome, fromRef, toRef, queryRange[0],
+ queryRange[1]);
+ br = getHttpResponse(url, null);
+ return (parseResponse(br));
+ } catch (Throwable t)
+ {
+ System.out.println("Error calling " + url + ": " + t.getMessage());
+ return null;
+ }
+ }
+
+ /**
+ * Parses the JSON response from the /map REST service. The format is (with
+ * some fields omitted)
+ *
+ * <pre>
+ * {"mappings":
+ * [{
+ * "original": {"end":45109016,"start":45051610},
+ * "mapped" : {"end":43186384,"start":43128978}
+ * }] }
+ * </pre>
+ *
+ * @param br
+ * @return
+ */
+ protected int[] parseResponse(BufferedReader br)
+ {
+ int[] result = null;
+ JSONParser jp = new JSONParser();
+
+ try
+ {
+ JSONObject parsed = (JSONObject) jp.parse(br);
+ JSONArray mappings = (JSONArray) parsed.get("mappings");
+
+ Iterator rvals = mappings.iterator();
+ while (rvals.hasNext())
+ {
+ // todo check for "mapped"
+ JSONObject val = (JSONObject) rvals.next();
+ JSONObject mapped = (JSONObject) val.get("mapped");
+ String start = mapped.get("start").toString();
+ String end = mapped.get("end").toString();
+ result = new int[] { Integer.parseInt(start), Integer.parseInt(end) };
+ }
+ } catch (IOException | ParseException | NumberFormatException e)
+ {
+ // ignore
+ }
+ return result;
+ }
+
+}
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
+import jalview.ext.ensembl.EnsemblMap;
import jalview.ext.htsjdk.VCFReader;
import jalview.io.gff.SequenceOntologyI;
import jalview.util.MapList;
+import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
+/**
+ * A class to read VCF data (using the htsjdk) and add variants as sequence
+ * features on dna and any related protein product sequences
+ *
+ * @author gmcarstairs
+ */
public class VCFLoader
{
- AlignmentI al;
+ private static final String EXCL = "!";
+
+ /*
+ * the alignment we are associated VCF data with
+ */
+ private AlignmentI al;
+
+ /*
+ * mappings between VCF and sequence reference assembly regions, as
+ * key = "species!chromosome!fromAssembly!toAssembly
+ * value = Map{fromRange, toRange}
+ */
+ private Map<String, Map<int[], int[]>> assemblyMappings;
/**
* Constructor given an alignment context
public VCFLoader(AlignmentI alignment)
{
al = alignment;
+
+ // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange}
+ assemblyMappings = new HashMap<String, Map<int[], int[]>>();
}
/**
* Loads VCF on to an alignment - provided it can be related to one or more
- * sequence's chromosomal coordinates
+ * sequence's chromosomal coordinates.
+ * <p>
+ * This method is not thread safe - concurrent threads should use separate
+ * instances of this class.
*
* @param filePath
*/
* @param isVcfRefGrch37
* @return
*/
- protected int loadVCF(SequenceI seq, VCFReader reader, boolean isVcfRefGrch37)
+ protected int loadVCF(SequenceI seq, VCFReader reader,
+ boolean isVcfRefGrch37)
{
int count = 0;
GeneLoci seqCoords = ((Sequence) seq).getGeneLoci();
String chromosome = seqCoords.getChromosome();
String seqRef = seqCoords.getReference();
String species = seqCoords.getSpecies();
-
+
+ // TODO handle species properly
+ if ("".equals(species))
+ {
+ species = "human";
+ }
+
/*
* map chromosomal coordinates from GRCh38 (sequence) to
* GRCh37 (VCF) if necessary
*/
+ // TODO generalise for other assemblies and species
int offset = 0;
- if ("GRCh38".equalsIgnoreCase(seqRef) && isVcfRefGrch37)
+ String fromRef = "GRCh38";
+ if (fromRef.equalsIgnoreCase(seqRef) && isVcfRefGrch37)
{
+ String toRef = "GRCh37";
int[] newRange = mapReferenceRange(range, chromosome, species,
- "GRCh38",
- "GRCh37");
+ fromRef, toRef);
+ if (newRange == null)
+ {
+ System.err.println(String.format(
+ "Failed to map %s:%s:%s:%d:%d to %s", species, chromosome,
+ fromRef, range[0], range[1], toRef));
+ return 0;
+ }
offset = newRange[0] - range[0];
range = newRange;
}
int end = variant.getEnd() - offset;
/*
- * get location in sequence coordinates
+ * convert chromosomal location to sequence coordinates
+ * - null if a partially overlapping feature
*/
int[] seqLocation = mapping.locateInFrom(start, end);
- int featureStart = seqLocation[0];
- int featureEnd = seqLocation[1];
- addVariantFeatures(seq, variant, featureStart, featureEnd);
+ if (seqLocation != null)
+ {
+ addVariantFeatures(seq, variant, seqLocation[0], seqLocation[1]);
+ }
}
variants.close();
}
/**
- * Call the Ensembl REST service that maps from one assembly reference's
- * coordinates to another's
+ * Determines the location of the query range (chromosome positions) in a
+ * different reference assembly.
+ * <p>
+ * If the range is just a subregion of one for which we already have a mapping
+ * (for example, an exon sub-region of a gene), then the mapping is just
+ * computed arithmetically.
+ * <p>
+ * Otherwise, calls the Ensembl REST service that maps from one assembly
+ * reference's coordinates to another's
*
- * @param range
+ * @param queryRange
+ * start-end chromosomal range in 'fromRef' coordinates
* @param chromosome
* @param species
* @param fromRef
+ * assembly reference for the query coordinates
* @param toRef
- * @return
+ * assembly reference we wish to translate to
+ * @return the start-end range in 'toRef' coordinates
*/
- protected int[] mapReferenceRange(int[] range, String chromosome,
+ protected int[] mapReferenceRange(int[] queryRange, String chromosome,
String species, String fromRef, String toRef)
{
- // TODO call
- // http://rest.ensembl.org/map/species/fromRef/chromosome:range[0]..range[1]:1/toRef?content-type=application/json
- // and parse the JSON response
+ /*
+ * first try shorcut of computing the mapping as a subregion of one
+ * we already have (e.g. for an exon, if we have the gene mapping)
+ */
+ int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome,
+ species, fromRef, toRef);
+ if (mappedRange != null)
+ {
+ return mappedRange;
+ }
+
+ /*
+ * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37
+ */
+ EnsemblMap mapper = new EnsemblMap();
+ int[] mapping = mapper.getMapping(species, chromosome, fromRef, toRef,
+ queryRange);
+
+ if (mapping == null)
+ {
+ // mapping service failure
+ return null;
+ }
- // 1922632 is the difference between 37 and 38 for chromosome 17
- return new int[] { range[0] - 1922632, range[1] - 1922632 };
+ /*
+ * save mapping for possible future re-use
+ */
+ String key = species + EXCL + chromosome + EXCL + fromRef + EXCL
+ + toRef;
+ if (!assemblyMappings.containsKey(key))
+ {
+ assemblyMappings.put(key, new HashMap<int[], int[]>());
+ }
+
+ assemblyMappings.get(key).put(queryRange, mapping);
+
+ return mapping;
+ }
+
+ /**
+ * If we already have a 1:1 contiguous mapping which subsumes the given query
+ * range, this method just calculates and returns the subset of that mapping,
+ * else it returns null. In practical terms, if a gene has a contiguous
+ * mapping between (for example) GRCh37 and GRCh38, then we assume that its
+ * subsidiary exons occupy unchanged relative positions, and just compute
+ * these as offsets, rather than do another lookup of the mapping.
+ * <p>
+ * If in future these assumptions prove invalid (e.g. for bacterial dna?!),
+ * simply remove this method or let it always return null.
+ * <p>
+ * Warning: many rapid calls to the /map service map result in a 429 overload
+ * error response
+ *
+ * @param queryRange
+ * @param chromosome
+ * @param species
+ * @param fromRef
+ * @param toRef
+ * @return
+ */
+ protected int[] findSubsumedRangeMapping(int[] queryRange, String chromosome,
+ String species, String fromRef, String toRef)
+ {
+ String key = species + EXCL + chromosome + EXCL + fromRef + EXCL
+ + toRef;
+ if (assemblyMappings.containsKey(key))
+ {
+ Map<int[], int[]> mappedRanges = assemblyMappings.get(key);
+ for (Entry<int[], int[]> mappedRange : mappedRanges.entrySet())
+ {
+ int[] fromRange = mappedRange.getKey();
+ int[] toRange = mappedRange.getValue();
+ if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0])
+ {
+ /*
+ * mapping is 1:1 in length, so we trust it to have no discontinuities
+ */
+ if (fromRange[0] <= queryRange[0] && fromRange[1] >= queryRange[1])
+ {
+ /*
+ * fromRange subsumes our query range
+ */
+ int offset = queryRange[0] - fromRange[0];
+ int mappedRangeFrom = toRange[0] + offset;
+ int mappedRangeTo = mappedRangeFrom + (queryRange[1] - queryRange[0]);
+ return new int[] { mappedRangeFrom, mappedRangeTo };
+ }
+ }
+ }
+ }
+ return null;
}
}