1 package jalview.io.vcf;
3 import htsjdk.samtools.util.CloseableIterator;
4 import htsjdk.variant.variantcontext.Allele;
5 import htsjdk.variant.variantcontext.VariantContext;
6 import htsjdk.variant.vcf.VCFHeader;
7 import htsjdk.variant.vcf.VCFHeaderLine;
9 import jalview.analysis.AlignmentUtils;
10 import jalview.analysis.Dna;
11 import jalview.api.AlignViewControllerGuiI;
12 import jalview.datamodel.AlignmentI;
13 import jalview.datamodel.DBRefEntry;
14 import jalview.datamodel.GeneLociI;
15 import jalview.datamodel.Mapping;
16 import jalview.datamodel.SequenceFeature;
17 import jalview.datamodel.SequenceI;
18 import jalview.ext.ensembl.EnsemblMap;
19 import jalview.ext.htsjdk.VCFReader;
20 import jalview.io.gff.Gff3Helper;
21 import jalview.io.gff.SequenceOntologyI;
22 import jalview.util.MapList;
23 import jalview.util.MappingUtils;
24 import jalview.util.MessageManager;
26 import java.io.IOException;
27 import java.util.HashMap;
28 import java.util.List;
30 import java.util.Map.Entry;
33 * A class to read VCF data (using the htsjdk) and add variants as sequence
34 * features on dna and any related protein product sequences
38 public class VCFLoader
40 private static final String FEATURE_GROUP_VCF = "VCF";
42 private static final String EXCL = "!";
45 * the alignment we are associated VCF data with
47 private AlignmentI al;
50 * mappings between VCF and sequence reference assembly regions, as
51 * key = "species!chromosome!fromAssembly!toAssembly
52 * value = Map{fromRange, toRange}
54 private Map<String, Map<int[], int[]>> assemblyMappings;
57 * Constructor given an alignment context
61 public VCFLoader(AlignmentI alignment)
65 // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange}
66 assemblyMappings = new HashMap<String, Map<int[], int[]>>();
70 * Starts a new thread to query and load VCF variant data on to the alignment
72 * This method is not thread safe - concurrent threads should use separate
73 * instances of this class.
78 public void loadVCF(final String filePath,
79 final AlignViewControllerGuiI gui)
83 gui.setStatus(MessageManager.getString("label.searching_vcf"));
92 VCFLoader.this.doLoad(filePath, gui);
99 * Loads VCF on to an alignment - provided it can be related to one or more
100 * sequence's chromosomal coordinates.
105 protected void doLoad(String filePath, AlignViewControllerGuiI gui)
107 VCFReader reader = null;
110 // long start = System.currentTimeMillis();
111 reader = new VCFReader(filePath);
113 VCFHeader header = reader.getFileHeader();
114 VCFHeaderLine ref = header
115 .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
116 // check if reference is wrt assembly19 (GRCh37)
117 // todo may need to allow user to specify reference assembly?
118 boolean isRefGrch37 = (ref != null && ref.getValue().contains(
125 * query for VCF overlapping each sequence in turn
127 for (SequenceI seq : al.getSequences())
129 int added = loadVCF(seq, reader, isRefGrch37);
134 transferAddedFeatures(seq);
139 // long elapsed = System.currentTimeMillis() - start;
140 String msg = MessageManager.formatMessage("label.added_vcf",
143 gui.getFeatureSettingsUI().discoverAllFeatureData();
145 } catch (Throwable e)
147 System.err.println("Error processing VCF: " + e.getMessage());
151 gui.setStatus("Error occurred - see console for details");
160 } catch (IOException e)
169 * Transfers VCF features to sequences to which this sequence has a mapping.
170 * If the mapping is 1:3, computes peptide variants from nucleotide variants.
174 protected void transferAddedFeatures(SequenceI seq)
176 DBRefEntry[] dbrefs = seq.getDBRefs();
181 for (DBRefEntry dbref : dbrefs)
183 Mapping mapping = dbref.getMap();
184 if (mapping == null || mapping.getTo() == null)
189 SequenceI mapTo = mapping.getTo();
190 MapList map = mapping.getMap();
191 if (map.getFromRatio() == 3)
194 * dna-to-peptide product mapping
196 AlignmentUtils.computeProteinFeatures(seq, mapTo, map);
201 * nucleotide-to-nucleotide mapping e.g. transcript to CDS
203 // TODO no DBRef to CDS is added to transcripts
204 List<SequenceFeature> features = seq.getFeatures()
205 .getPositionalFeatures(SequenceOntologyI.SEQUENCE_VARIANT);
206 for (SequenceFeature sf : features)
208 if (FEATURE_GROUP_VCF.equals(sf.getFeatureGroup()))
210 transferFeature(sf, mapTo, map);
218 * Tries to add overlapping variants read from a VCF file to the given
219 * sequence, and returns the number of overlapping variants found. Note that
220 * this requires the sequence to hold information as to its chromosomal
221 * positions and reference, in order to be able to map the VCF variants to the
226 * @param isVcfRefGrch37
229 protected int loadVCF(SequenceI seq, VCFReader reader,
230 boolean isVcfRefGrch37)
233 GeneLociI seqCoords = seq.getGeneLoci();
234 if (seqCoords == null)
239 List<int[]> seqChromosomalContigs = seqCoords.getMap().getToRanges();
240 for (int[] range : seqChromosomalContigs)
242 count += addVcfVariants(seq, reader, range, isVcfRefGrch37);
249 * Queries the VCF reader for any variants that overlap the given chromosome
250 * region of the sequence, and adds as variant features. Returns the number of
251 * overlapping variants found.
256 * start-end range of a sequence region in its chromosomal
258 * @param isVcfRefGrch37
259 * true if the VCF is with reference to GRCh37
262 protected int addVcfVariants(SequenceI seq, VCFReader reader,
263 int[] range, boolean isVcfRefGrch37)
265 GeneLociI seqCoords = seq.getGeneLoci();
267 String chromosome = seqCoords.getChromosomeId();
268 String seqRef = seqCoords.getAssemblyId();
269 String species = seqCoords.getSpeciesId();
271 // TODO handle species properly
272 if ("".equals(species))
278 * map chromosomal coordinates from GRCh38 (sequence) to
279 * GRCh37 (VCF) if necessary
281 // TODO generalise for other assemblies and species
283 String fromRef = "GRCh38";
284 if (fromRef.equalsIgnoreCase(seqRef) && isVcfRefGrch37)
286 String toRef = "GRCh37";
287 int[] newRange = mapReferenceRange(range, chromosome, species,
289 if (newRange == null)
291 System.err.println(String.format(
292 "Failed to map %s:%s:%s:%d:%d to %s", species, chromosome,
293 fromRef, range[0], range[1], toRef));
296 offset = newRange[0] - range[0];
300 boolean forwardStrand = range[0] <= range[1];
303 * query the VCF for overlaps
304 * (convert a reverse strand range to forwards)
307 MapList mapping = seqCoords.getMap();
309 int fromLocus = Math.min(range[0], range[1]);
310 int toLocus = Math.max(range[0], range[1]);
311 CloseableIterator<VariantContext> variants = reader.query(chromosome,
313 while (variants.hasNext())
316 * get variant location in sequence chromosomal coordinates
318 VariantContext variant = variants.next();
321 * we can only process SNP variants (which can be reported
322 * as part of a MIXED variant record
324 if (!variant.isSNP() && !variant.isMixed())
330 int start = variant.getStart() - offset;
331 int end = variant.getEnd() - offset;
334 * convert chromosomal location to sequence coordinates
335 * - null if a partially overlapping feature
337 int[] seqLocation = mapping.locateInFrom(start, end);
338 if (seqLocation != null)
340 addVariantFeatures(seq, variant, seqLocation[0], seqLocation[1],
351 * Inspects the VCF variant record, and adds variant features to the sequence.
352 * Only SNP variants are added, not INDELs.
354 * If the sequence maps to the reverse strand of the chromosome, reference and
355 * variant bases are recorded as their complements (C/G, A/T).
359 * @param featureStart
361 * @param forwardStrand
363 protected void addVariantFeatures(SequenceI seq, VariantContext variant,
364 int featureStart, int featureEnd, boolean forwardStrand)
366 byte[] reference = variant.getReference().getBases();
367 if (reference.length != 1)
370 * sorry, we don't handle INDEL variants
376 * for now we extract allele frequency as feature score; note
377 * this attribute is String for a simple SNP, but List<String> if
378 * multiple alleles at the locus; we extract for the simple case only
380 Object af = variant.getAttribute("AF");
382 if (af instanceof String)
386 score = Float.parseFloat((String) af);
387 } catch (NumberFormatException e)
393 StringBuilder sb = new StringBuilder();
394 sb.append(forwardStrand ? (char) reference[0] : complement(reference));
397 * inspect alleles and record SNP variants (as the variant
398 * record could be MIXED and include INDEL and SNP alleles)
399 * warning: getAlleles gives no guarantee as to the order
400 * in which they are returned
402 for (Allele allele : variant.getAlleles())
404 if (!allele.isReference())
406 byte[] alleleBase = allele.getBases();
407 if (alleleBase.length == 1)
409 sb.append(",").append(
410 forwardStrand ? (char) alleleBase[0]
411 : complement(alleleBase));
415 String alleles = sb.toString(); // e.g. G,A,C
417 String type = SequenceOntologyI.SEQUENCE_VARIANT;
419 SequenceFeature sf = new SequenceFeature(type, alleles, featureStart,
420 featureEnd, score, FEATURE_GROUP_VCF);
422 sf.setValue(Gff3Helper.ALLELES, alleles);
424 Map<String, Object> atts = variant.getAttributes();
425 for (Entry<String, Object> att : atts.entrySet())
427 sf.setValue(att.getKey(), att.getValue());
429 seq.addSequenceFeature(sf);
433 * A convenience method to complement a dna base and return the string value
439 protected String complement(byte[] reference)
441 return String.valueOf(Dna.getComplement((char) reference[0]));
445 * Determines the location of the query range (chromosome positions) in a
446 * different reference assembly.
448 * If the range is just a subregion of one for which we already have a mapping
449 * (for example, an exon sub-region of a gene), then the mapping is just
450 * computed arithmetically.
452 * Otherwise, calls the Ensembl REST service that maps from one assembly
453 * reference's coordinates to another's
456 * start-end chromosomal range in 'fromRef' coordinates
460 * assembly reference for the query coordinates
462 * assembly reference we wish to translate to
463 * @return the start-end range in 'toRef' coordinates
465 protected int[] mapReferenceRange(int[] queryRange, String chromosome,
466 String species, String fromRef, String toRef)
469 * first try shorcut of computing the mapping as a subregion of one
470 * we already have (e.g. for an exon, if we have the gene mapping)
472 int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome,
473 species, fromRef, toRef);
474 if (mappedRange != null)
480 * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37
482 EnsemblMap mapper = new EnsemblMap();
483 int[] mapping = mapper.getMapping(species, chromosome, fromRef, toRef,
488 // mapping service failure
493 * save mapping for possible future re-use
495 String key = makeRangesKey(chromosome, species, fromRef, toRef);
496 if (!assemblyMappings.containsKey(key))
498 assemblyMappings.put(key, new HashMap<int[], int[]>());
501 assemblyMappings.get(key).put(queryRange, mapping);
507 * If we already have a 1:1 contiguous mapping which subsumes the given query
508 * range, this method just calculates and returns the subset of that mapping,
509 * else it returns null. In practical terms, if a gene has a contiguous
510 * mapping between (for example) GRCh37 and GRCh38, then we assume that its
511 * subsidiary exons occupy unchanged relative positions, and just compute
512 * these as offsets, rather than do another lookup of the mapping.
514 * If in future these assumptions prove invalid (e.g. for bacterial dna?!),
515 * simply remove this method or let it always return null.
517 * Warning: many rapid calls to the /map service map result in a 429 overload
527 protected int[] findSubsumedRangeMapping(int[] queryRange, String chromosome,
528 String species, String fromRef, String toRef)
530 String key = makeRangesKey(chromosome, species, fromRef, toRef);
531 if (assemblyMappings.containsKey(key))
533 Map<int[], int[]> mappedRanges = assemblyMappings.get(key);
534 for (Entry<int[], int[]> mappedRange : mappedRanges.entrySet())
536 int[] fromRange = mappedRange.getKey();
537 int[] toRange = mappedRange.getValue();
538 if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0])
541 * mapping is 1:1 in length, so we trust it to have no discontinuities
543 if (MappingUtils.rangeContains(fromRange, queryRange))
546 * fromRange subsumes our query range
548 int offset = queryRange[0] - fromRange[0];
549 int mappedRangeFrom = toRange[0] + offset;
550 int mappedRangeTo = mappedRangeFrom + (queryRange[1] - queryRange[0]);
551 return new int[] { mappedRangeFrom, mappedRangeTo };
560 * Transfers the sequence feature to the target sequence, locating its start
561 * and end range based on the mapping. Features which do not overlap the
562 * target sequence are ignored.
565 * @param targetSequence
567 * mapping from the feature's coordinates to the target sequence
569 protected void transferFeature(SequenceFeature sf,
570 SequenceI targetSequence, MapList mapping)
572 int[] mappedRange = mapping.locateInTo(sf.getBegin(), sf.getEnd());
574 if (mappedRange != null)
576 String group = sf.getFeatureGroup();
577 int newBegin = Math.min(mappedRange[0], mappedRange[1]);
578 int newEnd = Math.max(mappedRange[0], mappedRange[1]);
579 SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
580 group, sf.getScore());
581 targetSequence.addSequenceFeature(copy);
586 * Formats a ranges map lookup key
594 protected static String makeRangesKey(String chromosome, String species,
595 String fromRef, String toRef)
597 return species + EXCL + chromosome + EXCL + fromRef + EXCL