1 package jalview.io.vcf;
3 import htsjdk.samtools.util.CloseableIterator;
4 import htsjdk.variant.variantcontext.Allele;
5 import htsjdk.variant.variantcontext.VariantContext;
6 import htsjdk.variant.vcf.VCFHeader;
7 import htsjdk.variant.vcf.VCFHeaderLine;
9 import jalview.analysis.AlignmentUtils;
10 import jalview.analysis.Dna;
11 import jalview.api.AlignViewControllerGuiI;
12 import jalview.datamodel.AlignmentI;
13 import jalview.datamodel.DBRefEntry;
14 import jalview.datamodel.GeneLoci;
15 import jalview.datamodel.Mapping;
16 import jalview.datamodel.SequenceFeature;
17 import jalview.datamodel.SequenceI;
18 import jalview.ext.ensembl.EnsemblMap;
19 import jalview.ext.htsjdk.VCFReader;
20 import jalview.io.gff.Gff3Helper;
21 import jalview.io.gff.SequenceOntologyI;
22 import jalview.util.MapList;
23 import jalview.util.MappingUtils;
25 import java.io.IOException;
26 import java.util.HashMap;
27 import java.util.List;
29 import java.util.Map.Entry;
32 * A class to read VCF data (using the htsjdk) and add variants as sequence
33 * features on dna and any related protein product sequences
37 public class VCFLoader
39 private static final String EXCL = "!";
42 * the alignment we are associated VCF data with
44 private AlignmentI al;
47 * mappings between VCF and sequence reference assembly regions, as
48 * key = "species!chromosome!fromAssembly!toAssembly
49 * value = Map{fromRange, toRange}
51 private Map<String, Map<int[], int[]>> assemblyMappings;
54 * Constructor given an alignment context
58 public VCFLoader(AlignmentI alignment)
62 // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange}
63 assemblyMappings = new HashMap<String, Map<int[], int[]>>();
67 * Loads VCF on to an alignment - provided it can be related to one or more
68 * sequence's chromosomal coordinates.
70 * This method is not thread safe - concurrent threads should use separate
71 * instances of this class.
76 public void loadVCF(String filePath, AlignViewControllerGuiI status)
78 VCFReader reader = null;
82 // long start = System.currentTimeMillis();
83 reader = new VCFReader(filePath);
85 VCFHeader header = reader.getFileHeader();
86 VCFHeaderLine ref = header
87 .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
88 // check if reference is wrt assembly19 (GRCh37)
89 // todo may need to allow user to specify reference assembly?
90 boolean isRefGrch37 = (ref != null && ref.getValue().contains(
97 * query for VCF overlapping each sequence in turn
99 for (SequenceI seq : al.getSequences())
101 int added = loadVCF(seq, reader, isRefGrch37);
106 computePeptideVariants(seq);
109 // long elapsed = System.currentTimeMillis() - start;
110 String msg = String.format("Added %d VCF variants to %d sequence(s)",
114 status.setStatus(msg);
116 } catch (Throwable e)
118 System.err.println("Error processing VCF: " + e.getMessage());
127 } catch (IOException e)
136 * (Re-)computes peptide variants from dna variants, for any protein sequence
137 * to which the dna sequence has a mapping. Note that although duplicate
138 * features may get computed, they will not be added, since duplicate sequence
139 * features are ignored in Sequence.addSequenceFeature.
143 protected void computePeptideVariants(SequenceI dnaSeq)
145 DBRefEntry[] dbrefs = dnaSeq.getDBRefs();
150 for (DBRefEntry dbref : dbrefs)
152 Mapping mapping = dbref.getMap();
153 if (mapping == null || mapping.getTo() == null
154 || mapping.getMap().getFromRatio() != 3)
158 AlignmentUtils.computeProteinFeatures(dnaSeq, mapping.getTo(),
164 * Tries to add overlapping variants read from a VCF file to the given
165 * sequence, and returns the number of overlapping variants found. Note that
166 * this requires the sequence to hold information as to its chromosomal
167 * positions and reference, in order to be able to map the VCF variants to the
172 * @param isVcfRefGrch37
175 protected int loadVCF(SequenceI seq, VCFReader reader,
176 boolean isVcfRefGrch37)
179 GeneLoci seqCoords = seq.getGeneLoci();
180 if (seqCoords == null)
185 List<int[]> seqChromosomalContigs = seqCoords.mapping.getToRanges();
186 for (int[] range : seqChromosomalContigs)
188 count += addVcfVariants(seq, reader, range, isVcfRefGrch37);
195 * Queries the VCF reader for any variants that overlap the given chromosome
196 * region of the sequence, and adds as variant features. Returns the number of
197 * overlapping variants found.
202 * start-end range of a sequence region in its chromosomal
204 * @param isVcfRefGrch37
205 * true if the VCF is with reference to GRCh37
208 protected int addVcfVariants(SequenceI seq, VCFReader reader,
209 int[] range, boolean isVcfRefGrch37)
211 GeneLoci seqCoords = seq.getGeneLoci();
213 String chromosome = seqCoords.chromosome;
214 String seqRef = seqCoords.assembly;
215 String species = seqCoords.species;
217 // TODO handle species properly
218 if ("".equals(species))
224 * map chromosomal coordinates from GRCh38 (sequence) to
225 * GRCh37 (VCF) if necessary
227 // TODO generalise for other assemblies and species
229 String fromRef = "GRCh38";
230 if (fromRef.equalsIgnoreCase(seqRef) && isVcfRefGrch37)
232 String toRef = "GRCh37";
233 int[] newRange = mapReferenceRange(range, chromosome, species,
235 if (newRange == null)
237 System.err.println(String.format(
238 "Failed to map %s:%s:%s:%d:%d to %s", species, chromosome,
239 fromRef, range[0], range[1], toRef));
242 offset = newRange[0] - range[0];
246 boolean forwardStrand = range[0] <= range[1];
249 * query the VCF for overlaps
250 * (convert a reverse strand range to forwards)
253 MapList mapping = seqCoords.mapping;
255 int fromLocus = Math.min(range[0], range[1]);
256 int toLocus = Math.max(range[0], range[1]);
257 CloseableIterator<VariantContext> variants = reader.query(chromosome,
259 while (variants.hasNext())
262 * get variant location in sequence chromosomal coordinates
264 VariantContext variant = variants.next();
267 * we can only process SNP variants (which can be reported
268 * as part of a MIXED variant record
270 if (!variant.isSNP() && !variant.isMixed())
276 int start = variant.getStart() - offset;
277 int end = variant.getEnd() - offset;
280 * convert chromosomal location to sequence coordinates
281 * - null if a partially overlapping feature
283 int[] seqLocation = mapping.locateInFrom(start, end);
284 if (seqLocation != null)
286 addVariantFeatures(seq, variant, seqLocation[0], seqLocation[1],
297 * Inspects the VCF variant record, and adds variant features to the sequence.
298 * Only SNP variants are added, not INDELs.
300 * If the sequence maps to the reverse strand of the chromosome, reference and
301 * variant bases are recorded as their complements (C/G, A/T).
305 * @param featureStart
307 * @param forwardStrand
309 protected void addVariantFeatures(SequenceI seq, VariantContext variant,
310 int featureStart, int featureEnd, boolean forwardStrand)
312 byte[] reference = variant.getReference().getBases();
313 if (reference.length != 1)
316 * sorry, we don't handle INDEL variants
322 * for now we extract allele frequency as feature score; note
323 * this attribute is String for a simple SNP, but List<String> if
324 * multiple alleles at the locus; we extract for the simple case only
326 Object af = variant.getAttribute("AF");
328 if (af instanceof String)
332 score = Float.parseFloat((String) af);
333 } catch (NumberFormatException e)
339 StringBuilder sb = new StringBuilder();
340 sb.append(forwardStrand ? (char) reference[0] : complement(reference));
343 * inspect alleles and record SNP variants (as the variant
344 * record could be MIXED and include INDEL and SNP alleles)
345 * warning: getAlleles gives no guarantee as to the order
346 * in which they are returned
348 for (Allele allele : variant.getAlleles())
350 if (!allele.isReference())
352 byte[] alleleBase = allele.getBases();
353 if (alleleBase.length == 1)
355 sb.append(",").append(
356 forwardStrand ? (char) alleleBase[0]
357 : complement(alleleBase));
361 String alleles = sb.toString(); // e.g. G,A,C
363 String type = SequenceOntologyI.SEQUENCE_VARIANT;
365 SequenceFeature sf = new SequenceFeature(type, alleles, featureStart,
366 featureEnd, score, "VCF");
368 sf.setValue(Gff3Helper.ALLELES, alleles);
370 Map<String, Object> atts = variant.getAttributes();
371 for (Entry<String, Object> att : atts.entrySet())
373 sf.setValue(att.getKey(), att.getValue());
375 seq.addSequenceFeature(sf);
379 * A convenience method to complement a dna base and return the string value
385 protected String complement(byte[] reference)
387 return String.valueOf(Dna.getComplement((char) reference[0]));
391 * Determines the location of the query range (chromosome positions) in a
392 * different reference assembly.
394 * If the range is just a subregion of one for which we already have a mapping
395 * (for example, an exon sub-region of a gene), then the mapping is just
396 * computed arithmetically.
398 * Otherwise, calls the Ensembl REST service that maps from one assembly
399 * reference's coordinates to another's
402 * start-end chromosomal range in 'fromRef' coordinates
406 * assembly reference for the query coordinates
408 * assembly reference we wish to translate to
409 * @return the start-end range in 'toRef' coordinates
411 protected int[] mapReferenceRange(int[] queryRange, String chromosome,
412 String species, String fromRef, String toRef)
415 * first try shorcut of computing the mapping as a subregion of one
416 * we already have (e.g. for an exon, if we have the gene mapping)
418 int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome,
419 species, fromRef, toRef);
420 if (mappedRange != null)
426 * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37
428 EnsemblMap mapper = new EnsemblMap();
429 int[] mapping = mapper.getMapping(species, chromosome, fromRef, toRef,
434 // mapping service failure
439 * save mapping for possible future re-use
441 String key = makeRangesKey(chromosome, species, fromRef, toRef);
442 if (!assemblyMappings.containsKey(key))
444 assemblyMappings.put(key, new HashMap<int[], int[]>());
447 assemblyMappings.get(key).put(queryRange, mapping);
453 * If we already have a 1:1 contiguous mapping which subsumes the given query
454 * range, this method just calculates and returns the subset of that mapping,
455 * else it returns null. In practical terms, if a gene has a contiguous
456 * mapping between (for example) GRCh37 and GRCh38, then we assume that its
457 * subsidiary exons occupy unchanged relative positions, and just compute
458 * these as offsets, rather than do another lookup of the mapping.
460 * If in future these assumptions prove invalid (e.g. for bacterial dna?!),
461 * simply remove this method or let it always return null.
463 * Warning: many rapid calls to the /map service map result in a 429 overload
473 protected int[] findSubsumedRangeMapping(int[] queryRange, String chromosome,
474 String species, String fromRef, String toRef)
476 String key = makeRangesKey(chromosome, species, fromRef, toRef);
477 if (assemblyMappings.containsKey(key))
479 Map<int[], int[]> mappedRanges = assemblyMappings.get(key);
480 for (Entry<int[], int[]> mappedRange : mappedRanges.entrySet())
482 int[] fromRange = mappedRange.getKey();
483 int[] toRange = mappedRange.getValue();
484 if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0])
487 * mapping is 1:1 in length, so we trust it to have no discontinuities
489 if (MappingUtils.rangeContains(fromRange, queryRange))
492 * fromRange subsumes our query range
494 int offset = queryRange[0] - fromRange[0];
495 int mappedRangeFrom = toRange[0] + offset;
496 int mappedRangeTo = mappedRangeFrom + (queryRange[1] - queryRange[0]);
497 return new int[] { mappedRangeFrom, mappedRangeTo };
506 * Formats a ranges map lookup key
514 protected static String makeRangesKey(String chromosome, String species,
515 String fromRef, String toRef)
517 return species + EXCL + chromosome + EXCL + fromRef + EXCL