1 package jalview.io.vcf;
3 import htsjdk.samtools.util.CloseableIterator;
4 import htsjdk.variant.variantcontext.Allele;
5 import htsjdk.variant.variantcontext.VariantContext;
6 import htsjdk.variant.vcf.VCFHeader;
7 import htsjdk.variant.vcf.VCFHeaderLine;
9 import jalview.analysis.AlignmentUtils;
10 import jalview.datamodel.AlignmentI;
11 import jalview.datamodel.DBRefEntry;
12 import jalview.datamodel.GeneLoci;
13 import jalview.datamodel.Mapping;
14 import jalview.datamodel.Sequence;
15 import jalview.datamodel.SequenceFeature;
16 import jalview.datamodel.SequenceI;
17 import jalview.ext.ensembl.EnsemblMap;
18 import jalview.ext.htsjdk.VCFReader;
19 import jalview.io.gff.SequenceOntologyI;
20 import jalview.util.MapList;
22 import java.util.HashMap;
23 import java.util.List;
25 import java.util.Map.Entry;
28 * A class to read VCF data (using the htsjdk) and add variants as sequence
29 * features on dna and any related protein product sequences
33 public class VCFLoader
35 private static final String EXCL = "!";
38 * the alignment we are associated VCF data with
40 private AlignmentI al;
43 * mappings between VCF and sequence reference assembly regions, as
44 * key = "species!chromosome!fromAssembly!toAssembly
45 * value = Map{fromRange, toRange}
47 private Map<String, Map<int[], int[]>> assemblyMappings;
50 * Constructor given an alignment context
54 public VCFLoader(AlignmentI alignment)
58 // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange}
59 assemblyMappings = new HashMap<String, Map<int[], int[]>>();
63 * Loads VCF on to an alignment - provided it can be related to one or more
64 * sequence's chromosomal coordinates.
66 * This method is not thread safe - concurrent threads should use separate
67 * instances of this class.
71 public void loadVCF(String filePath)
73 VCFReader reader = null;
77 long start = System.currentTimeMillis();
78 reader = new VCFReader(filePath);
80 VCFHeader header = reader.getFileHeader();
81 VCFHeaderLine ref = header
82 .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
83 // check if reference is wrt assembly19 (GRCh37)
84 // todo may need to allow user to specify reference assembly?
85 boolean isRefGrch37 = (ref != null && ref.getValue().contains(
92 * query for VCF overlapping each sequence in turn
94 for (SequenceI seq : al.getSequences())
96 int added = loadVCF(seq, reader, isRefGrch37);
101 computePeptideVariants(seq);
104 long elapsed = System.currentTimeMillis() - start;
105 System.out.println(String.format(
106 "Added %d VCF variants to %d sequence(s) (%dms)", varCount,
110 } catch (Throwable e)
112 System.err.println("Error processing VCF: " + e.getMessage());
118 * (Re-)computes peptide variants from dna variants, for any protein sequence
119 * to which the dna sequence has a mapping. Note that although duplicate
120 * features may get computed, they will not be added, since duplicate sequence
121 * features are ignored in Sequence.addSequenceFeature.
125 protected void computePeptideVariants(SequenceI dnaSeq)
127 DBRefEntry[] dbrefs = dnaSeq.getDBRefs();
132 for (DBRefEntry dbref : dbrefs)
134 Mapping mapping = dbref.getMap();
135 if (mapping == null || mapping.getTo() == null
136 || mapping.getMap().getFromRatio() != 3)
140 AlignmentUtils.computeProteinFeatures(dnaSeq, mapping.getTo(),
146 * Tries to add overlapping variants read from a VCF file to the given
147 * sequence, and returns the number of overlapping variants found. Note that
148 * this requires the sequence to hold information as to its chromosomal
149 * positions and reference, in order to be able to map the VCF variants to the
154 * @param isVcfRefGrch37
157 protected int loadVCF(SequenceI seq, VCFReader reader,
158 boolean isVcfRefGrch37)
161 GeneLoci seqCoords = ((Sequence) seq).getGeneLoci();
162 if (seqCoords == null)
167 MapList mapping = seqCoords.getMapping();
168 List<int[]> seqChromosomalContigs = mapping.getToRanges();
169 for (int[] range : seqChromosomalContigs)
171 count += addVcfVariants(seq, reader, range, isVcfRefGrch37);
178 * Queries the VCF reader for any variants that overlap the given chromosome
179 * region of the sequence, and adds as variant features. Returns the number of
180 * overlapping variants found.
185 * start-end range of a sequence region in its chromosomal
187 * @param isVcfRefGrch37
188 * true if the VCF is with reference to GRCh37
191 protected int addVcfVariants(SequenceI seq, VCFReader reader,
192 int[] range, boolean isVcfRefGrch37)
194 GeneLoci seqCoords = ((Sequence) seq).getGeneLoci();
196 String chromosome = seqCoords.getChromosome();
197 String seqRef = seqCoords.getReference();
198 String species = seqCoords.getSpecies();
200 // TODO handle species properly
201 if ("".equals(species))
207 * map chromosomal coordinates from GRCh38 (sequence) to
208 * GRCh37 (VCF) if necessary
210 // TODO generalise for other assemblies and species
212 String fromRef = "GRCh38";
213 if (fromRef.equalsIgnoreCase(seqRef) && isVcfRefGrch37)
215 String toRef = "GRCh37";
216 int[] newRange = mapReferenceRange(range, chromosome, species,
218 if (newRange == null)
220 System.err.println(String.format(
221 "Failed to map %s:%s:%s:%d:%d to %s", species, chromosome,
222 fromRef, range[0], range[1], toRef));
225 offset = newRange[0] - range[0];
230 * query the VCF for overlaps
233 MapList mapping = seqCoords.getMapping();
235 CloseableIterator<VariantContext> variants = reader.query(chromosome,
237 while (variants.hasNext())
240 * get variant location in sequence chromosomal coordinates
242 VariantContext variant = variants.next();
244 int start = variant.getStart() - offset;
245 int end = variant.getEnd() - offset;
248 * convert chromosomal location to sequence coordinates
249 * - null if a partially overlapping feature
251 int[] seqLocation = mapping.locateInFrom(start, end);
252 if (seqLocation != null)
254 addVariantFeatures(seq, variant, seqLocation[0], seqLocation[1]);
264 * Inspects the VCF variant record, and adds variant features to the sequence
268 * @param featureStart
271 protected void addVariantFeatures(SequenceI seq, VariantContext variant,
272 int featureStart, int featureEnd)
274 StringBuilder sb = new StringBuilder();
275 sb.append(variant.getReference().getBaseString());
278 for (Allele allele : variant.getAlleles())
280 if (!allele.isReference())
282 sb.append(",").append(allele.getBaseString());
286 String alleles = sb.toString(); // e.g. G,A,C
288 String type = SequenceOntologyI.SEQUENCE_VARIANT;
290 if (alleleCount == 1)
294 score = (float) variant.getAttributeAsDouble("AF", 0d);
295 } catch (NumberFormatException e)
300 SequenceFeature sf = new SequenceFeature(type, alleles, featureStart,
301 featureEnd, score, "VCF");
304 * only add 'alleles' property if a SNP, as we can
305 * only handle SNPs when computing peptide variants
309 sf.setValue("alleles", alleles);
312 Map<String, Object> atts = variant.getAttributes();
313 for (Entry<String, Object> att : atts.entrySet())
315 sf.setValue(att.getKey(), att.getValue());
317 seq.addSequenceFeature(sf);
321 * Determines the location of the query range (chromosome positions) in a
322 * different reference assembly.
324 * If the range is just a subregion of one for which we already have a mapping
325 * (for example, an exon sub-region of a gene), then the mapping is just
326 * computed arithmetically.
328 * Otherwise, calls the Ensembl REST service that maps from one assembly
329 * reference's coordinates to another's
332 * start-end chromosomal range in 'fromRef' coordinates
336 * assembly reference for the query coordinates
338 * assembly reference we wish to translate to
339 * @return the start-end range in 'toRef' coordinates
341 protected int[] mapReferenceRange(int[] queryRange, String chromosome,
342 String species, String fromRef, String toRef)
345 * first try shorcut of computing the mapping as a subregion of one
346 * we already have (e.g. for an exon, if we have the gene mapping)
348 int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome,
349 species, fromRef, toRef);
350 if (mappedRange != null)
356 * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37
358 EnsemblMap mapper = new EnsemblMap();
359 int[] mapping = mapper.getMapping(species, chromosome, fromRef, toRef,
364 // mapping service failure
369 * save mapping for possible future re-use
371 String key = species + EXCL + chromosome + EXCL + fromRef + EXCL
373 if (!assemblyMappings.containsKey(key))
375 assemblyMappings.put(key, new HashMap<int[], int[]>());
378 assemblyMappings.get(key).put(queryRange, mapping);
384 * If we already have a 1:1 contiguous mapping which subsumes the given query
385 * range, this method just calculates and returns the subset of that mapping,
386 * else it returns null. In practical terms, if a gene has a contiguous
387 * mapping between (for example) GRCh37 and GRCh38, then we assume that its
388 * subsidiary exons occupy unchanged relative positions, and just compute
389 * these as offsets, rather than do another lookup of the mapping.
391 * If in future these assumptions prove invalid (e.g. for bacterial dna?!),
392 * simply remove this method or let it always return null.
394 * Warning: many rapid calls to the /map service map result in a 429 overload
404 protected int[] findSubsumedRangeMapping(int[] queryRange, String chromosome,
405 String species, String fromRef, String toRef)
407 String key = species + EXCL + chromosome + EXCL + fromRef + EXCL
409 if (assemblyMappings.containsKey(key))
411 Map<int[], int[]> mappedRanges = assemblyMappings.get(key);
412 for (Entry<int[], int[]> mappedRange : mappedRanges.entrySet())
414 int[] fromRange = mappedRange.getKey();
415 int[] toRange = mappedRange.getValue();
416 if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0])
419 * mapping is 1:1 in length, so we trust it to have no discontinuities
421 if (fromRange[0] <= queryRange[0] && fromRange[1] >= queryRange[1])
424 * fromRange subsumes our query range
426 int offset = queryRange[0] - fromRange[0];
427 int mappedRangeFrom = toRange[0] + offset;
428 int mappedRangeTo = mappedRangeFrom + (queryRange[1] - queryRange[0]);
429 return new int[] { mappedRangeFrom, mappedRangeTo };