1 package jalview.io.vcf;
3 import htsjdk.samtools.util.CloseableIterator;
4 import htsjdk.variant.variantcontext.Allele;
5 import htsjdk.variant.variantcontext.VariantContext;
6 import htsjdk.variant.vcf.VCFHeader;
7 import htsjdk.variant.vcf.VCFHeaderLine;
8 import htsjdk.variant.vcf.VCFHeaderLineCount;
10 import jalview.analysis.AlignmentUtils;
11 import jalview.analysis.Dna;
12 import jalview.api.AlignViewControllerGuiI;
13 import jalview.datamodel.AlignmentI;
14 import jalview.datamodel.DBRefEntry;
15 import jalview.datamodel.GeneLociI;
16 import jalview.datamodel.Mapping;
17 import jalview.datamodel.SequenceFeature;
18 import jalview.datamodel.SequenceI;
19 import jalview.ext.ensembl.EnsemblMap;
20 import jalview.ext.htsjdk.VCFReader;
21 import jalview.io.gff.Gff3Helper;
22 import jalview.io.gff.SequenceOntologyI;
23 import jalview.util.MapList;
24 import jalview.util.MappingUtils;
25 import jalview.util.MessageManager;
27 import java.io.IOException;
28 import java.util.ArrayList;
29 import java.util.HashMap;
30 import java.util.List;
32 import java.util.Map.Entry;
35 * A class to read VCF data (using the htsjdk) and add variants as sequence
36 * features on dna and any related protein product sequences
40 public class VCFLoader
42 private static final String ALLELE_FREQUENCY_KEY = "AF";
44 private static final String COMMA = ",";
46 private static final boolean FEATURE_PER_ALLELE = true;
48 private static final String FEATURE_GROUP_VCF = "VCF";
50 private static final String EXCL = "!";
53 * the alignment we are associated VCF data with
55 private AlignmentI al;
58 * mappings between VCF and sequence reference assembly regions, as
59 * key = "species!chromosome!fromAssembly!toAssembly
60 * value = Map{fromRange, toRange}
62 private Map<String, Map<int[], int[]>> assemblyMappings;
65 * holds details of the VCF header lines (metadata)
67 private VCFHeader header;
70 * Constructor given an alignment context
74 public VCFLoader(AlignmentI alignment)
78 // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange}
79 assemblyMappings = new HashMap<String, Map<int[], int[]>>();
83 * Starts a new thread to query and load VCF variant data on to the alignment
85 * This method is not thread safe - concurrent threads should use separate
86 * instances of this class.
91 public void loadVCF(final String filePath,
92 final AlignViewControllerGuiI gui)
96 gui.setStatus(MessageManager.getString("label.searching_vcf"));
105 VCFLoader.this.doLoad(filePath, gui);
112 * Loads VCF on to an alignment - provided it can be related to one or more
113 * sequence's chromosomal coordinates
117 * optional callback handler for messages
119 protected void doLoad(String filePath, AlignViewControllerGuiI gui)
121 VCFReader reader = null;
124 // long start = System.currentTimeMillis();
125 reader = new VCFReader(filePath);
127 header = reader.getFileHeader();
128 VCFHeaderLine ref = header
129 .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
130 // check if reference is wrt assembly19 (GRCh37)
131 // todo may need to allow user to specify reference assembly?
132 boolean isRefGrch37 = (ref != null && ref.getValue().contains(
139 * query for VCF overlapping each sequence in turn
141 for (SequenceI seq : al.getSequences())
143 int added = loadVCF(seq, reader, isRefGrch37);
148 transferAddedFeatures(seq);
153 // long elapsed = System.currentTimeMillis() - start;
154 String msg = MessageManager.formatMessage("label.added_vcf",
157 if (gui.getFeatureSettingsUI() != null)
159 gui.getFeatureSettingsUI().discoverAllFeatureData();
162 } catch (Throwable e)
164 System.err.println("Error processing VCF: " + e.getMessage());
168 gui.setStatus("Error occurred - see console for details");
177 } catch (IOException e)
186 * Transfers VCF features to sequences to which this sequence has a mapping.
187 * If the mapping is 1:3, computes peptide variants from nucleotide variants.
191 protected void transferAddedFeatures(SequenceI seq)
193 DBRefEntry[] dbrefs = seq.getDBRefs();
198 for (DBRefEntry dbref : dbrefs)
200 Mapping mapping = dbref.getMap();
201 if (mapping == null || mapping.getTo() == null)
206 SequenceI mapTo = mapping.getTo();
207 MapList map = mapping.getMap();
208 if (map.getFromRatio() == 3)
211 * dna-to-peptide product mapping
213 AlignmentUtils.computeProteinFeatures(seq, mapTo, map);
218 * nucleotide-to-nucleotide mapping e.g. transcript to CDS
220 // TODO no DBRef to CDS is added to transcripts
221 List<SequenceFeature> features = seq.getFeatures()
222 .getPositionalFeatures(SequenceOntologyI.SEQUENCE_VARIANT);
223 for (SequenceFeature sf : features)
225 if (FEATURE_GROUP_VCF.equals(sf.getFeatureGroup()))
227 transferFeature(sf, mapTo, map);
235 * Tries to add overlapping variants read from a VCF file to the given
236 * sequence, and returns the number of variant features added. Note that this
237 * requires the sequence to hold information as to its chromosomal positions
238 * and reference, in order to be able to map the VCF variants to the sequence.
242 * @param isVcfRefGrch37
245 protected int loadVCF(SequenceI seq, VCFReader reader,
246 boolean isVcfRefGrch37)
249 GeneLociI seqCoords = seq.getGeneLoci();
250 if (seqCoords == null)
255 List<int[]> seqChromosomalContigs = seqCoords.getMap().getToRanges();
256 for (int[] range : seqChromosomalContigs)
258 count += addVcfVariants(seq, reader, range, isVcfRefGrch37);
265 * Queries the VCF reader for any variants that overlap the given chromosome
266 * region of the sequence, and adds as variant features. Returns the number of
267 * overlapping variants found.
272 * start-end range of a sequence region in its chromosomal
274 * @param isVcfRefGrch37
275 * true if the VCF is with reference to GRCh37
278 protected int addVcfVariants(SequenceI seq, VCFReader reader,
279 int[] range, boolean isVcfRefGrch37)
281 GeneLociI seqCoords = seq.getGeneLoci();
283 String chromosome = seqCoords.getChromosomeId();
284 String seqRef = seqCoords.getAssemblyId();
285 String species = seqCoords.getSpeciesId();
288 * map chromosomal coordinates from GRCh38 (sequence) to
289 * GRCh37 (VCF) if necessary
291 // TODO generalise for other assemblies and species
293 String fromRef = "GRCh38";
294 if (fromRef.equalsIgnoreCase(seqRef) && isVcfRefGrch37)
296 String toRef = "GRCh37";
297 int[] newRange = mapReferenceRange(range, chromosome, "human",
299 if (newRange == null)
301 System.err.println(String.format(
302 "Failed to map %s:%s:%s:%d:%d to %s", species, chromosome,
303 fromRef, range[0], range[1], toRef));
306 offset = newRange[0] - range[0];
310 boolean forwardStrand = range[0] <= range[1];
313 * query the VCF for overlaps
314 * (convert a reverse strand range to forwards)
317 MapList mapping = seqCoords.getMap();
319 int fromLocus = Math.min(range[0], range[1]);
320 int toLocus = Math.max(range[0], range[1]);
321 CloseableIterator<VariantContext> variants = reader.query(chromosome,
323 while (variants.hasNext())
326 * get variant location in sequence chromosomal coordinates
328 VariantContext variant = variants.next();
331 * we can only process SNP variants (which can be reported
332 * as part of a MIXED variant record
334 if (!variant.isSNP() && !variant.isMixed())
339 int start = variant.getStart() - offset;
340 int end = variant.getEnd() - offset;
343 * convert chromosomal location to sequence coordinates
344 * - null if a partially overlapping feature
346 int[] seqLocation = mapping.locateInFrom(start, end);
347 if (seqLocation != null)
349 count += addVariantFeature(seq, variant, seqLocation[0],
350 seqLocation[1], forwardStrand);
360 * Inspects the VCF variant record, and adds variant features to the sequence.
361 * Only SNP variants are added, not INDELs. Returns the number of features
364 * If the sequence maps to the reverse strand of the chromosome, reference and
365 * variant bases are recorded as their complements (C/G, A/T).
369 * @param featureStart
371 * @param forwardStrand
373 protected int addVariantFeature(SequenceI seq, VariantContext variant,
374 int featureStart, int featureEnd, boolean forwardStrand)
376 byte[] reference = variant.getReference().getBases();
377 if (reference.length != 1)
380 * sorry, we don't handle INDEL variants
385 if (FEATURE_PER_ALLELE)
387 return addAlleleFeatures(seq, variant, featureStart, featureEnd,
392 * for now we extract allele frequency as feature score; note
393 * this attribute is String for a simple SNP, but List<String> if
394 * multiple alleles at the locus; we extract for the simple case only
396 float score = getAlleleFrequency(variant, 0);
398 StringBuilder sb = new StringBuilder();
399 sb.append(forwardStrand ? (char) reference[0] : complement(reference));
402 * inspect alleles and record SNP variants (as the variant
403 * record could be MIXED and include INDEL and SNP alleles)
404 * warning: getAlleles gives no guarantee as to the order
405 * in which they are returned
407 for (Allele allele : variant.getAlleles())
409 if (!allele.isReference())
411 byte[] alleleBase = allele.getBases();
412 if (alleleBase.length == 1)
414 sb.append(COMMA).append(
415 forwardStrand ? (char) alleleBase[0]
416 : complement(alleleBase));
420 String alleles = sb.toString(); // e.g. G,A,C
422 String type = SequenceOntologyI.SEQUENCE_VARIANT;
424 SequenceFeature sf = new SequenceFeature(type, alleles, featureStart,
425 featureEnd, score, FEATURE_GROUP_VCF);
427 sf.setValue(Gff3Helper.ALLELES, alleles);
429 Map<String, Object> atts = variant.getAttributes();
430 for (Entry<String, Object> att : atts.entrySet())
432 sf.setValue(att.getKey(), att.getValue());
434 seq.addSequenceFeature(sf);
440 * A convenience method to get the AF value for the given alternate allele
447 protected float getAlleleFrequency(VariantContext variant, int alleleIndex)
450 String attributeValue = getAttributeValue(variant,
451 ALLELE_FREQUENCY_KEY, alleleIndex);
452 if (attributeValue != null)
456 score = Float.parseFloat(attributeValue);
457 } catch (NumberFormatException e)
467 * A convenience method to get an attribute value for an alternate allele
470 * @param attributeName
474 protected String getAttributeValue(VariantContext variant,
475 String attributeName, int alleleIndex)
477 Object att = variant.getAttribute(attributeName);
479 if (att instanceof String)
483 else if (att instanceof ArrayList)
485 return ((List<String>) att).get(alleleIndex);
492 * Adds one variant feature for each SNP allele in the VCF variant record, and
493 * returns the number of features added.
497 * @param featureStart
499 * @param forwardStrand
502 protected int addAlleleFeatures(SequenceI seq, VariantContext variant,
503 int featureStart, int featureEnd, boolean forwardStrand)
508 * Javadoc says getAlternateAlleles() imposes no order on the list returned
509 * so we proceed defensively to get them in strict order
511 int altAlleleCount = variant.getAlternateAlleles().size();
512 for (int i = 0; i < altAlleleCount; i++)
514 added += addAlleleFeature(seq, variant, i, featureStart, featureEnd,
521 * Inspects one allele and attempts to add a variant feature for it to the
522 * sequence. Only SNP variants are added as features. We extract as much as
523 * possible of the additional data associated with this allele to store in the
524 * feature's key-value map. Answers the number of features added (0 or 1).
528 * @param altAlleleIndex
529 * @param featureStart
531 * @param forwardStrand
534 protected int addAlleleFeature(SequenceI seq, VariantContext variant,
535 int altAlleleIndex, int featureStart, int featureEnd,
536 boolean forwardStrand)
538 byte[] reference = variant.getReference().getBases();
539 Allele alt = variant.getAlternateAllele(altAlleleIndex);
540 byte[] allele = alt.getBases();
541 if (allele.length != 1)
550 * build the ref,alt allele description e.g. "G,A"
552 StringBuilder sb = new StringBuilder();
553 sb.append(forwardStrand ? (char) reference[0] : complement(reference));
555 sb.append(forwardStrand ? (char) allele[0] : complement(allele));
556 String alleles = sb.toString(); // e.g. G,A
558 String type = SequenceOntologyI.SEQUENCE_VARIANT;
559 float score = getAlleleFrequency(variant, altAlleleIndex);
561 SequenceFeature sf = new SequenceFeature(type, alleles, featureStart,
562 featureEnd, score, FEATURE_GROUP_VCF);
564 sf.setValue(Gff3Helper.ALLELES, alleles);
566 addAlleleProperties(variant, sf, altAlleleIndex);
568 seq.addSequenceFeature(sf);
574 * Add any allele-specific VCF key-value data to the sequence feature
578 * @param altAlelleIndex
580 protected void addAlleleProperties(VariantContext variant,
581 SequenceFeature sf, final int altAlelleIndex)
583 Map<String, Object> atts = variant.getAttributes();
586 * process variant data, extracting values which are allele-specific
587 * these may be per alternate allele (INFO[key].Number = 'A')
588 * or per allele including reference (INFO[key].Number = 'R')
590 for (Entry<String, Object> att : atts.entrySet())
592 String key = att.getKey();
593 VCFHeaderLineCount number = header.getInfoHeaderLine(key)
595 int index = altAlelleIndex;
596 if (number == VCFHeaderLineCount.R)
599 * one value per allele including reference, so bump index
600 * e.g. the 3rd value is for the 2nd alternate allele
605 * CSQ behaves as if Number=A but declares as Number=.
606 * so give it special treatment
608 else if (!"CSQ".equals(key) && number != VCFHeaderLineCount.A)
611 * don't save other values as not allele-related
617 * take the index'th value
619 String value = getAttributeValue(variant, key, index);
622 sf.setValue(key, value);
628 * A convenience method to complement a dna base and return the string value
634 protected String complement(byte[] reference)
636 return String.valueOf(Dna.getComplement((char) reference[0]));
640 * Determines the location of the query range (chromosome positions) in a
641 * different reference assembly.
643 * If the range is just a subregion of one for which we already have a mapping
644 * (for example, an exon sub-region of a gene), then the mapping is just
645 * computed arithmetically.
647 * Otherwise, calls the Ensembl REST service that maps from one assembly
648 * reference's coordinates to another's
651 * start-end chromosomal range in 'fromRef' coordinates
655 * assembly reference for the query coordinates
657 * assembly reference we wish to translate to
658 * @return the start-end range in 'toRef' coordinates
660 protected int[] mapReferenceRange(int[] queryRange, String chromosome,
661 String species, String fromRef, String toRef)
664 * first try shorcut of computing the mapping as a subregion of one
665 * we already have (e.g. for an exon, if we have the gene mapping)
667 int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome,
668 species, fromRef, toRef);
669 if (mappedRange != null)
675 * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37
677 EnsemblMap mapper = new EnsemblMap();
678 int[] mapping = mapper.getMapping(species, chromosome, fromRef, toRef,
683 // mapping service failure
688 * save mapping for possible future re-use
690 String key = makeRangesKey(chromosome, species, fromRef, toRef);
691 if (!assemblyMappings.containsKey(key))
693 assemblyMappings.put(key, new HashMap<int[], int[]>());
696 assemblyMappings.get(key).put(queryRange, mapping);
702 * If we already have a 1:1 contiguous mapping which subsumes the given query
703 * range, this method just calculates and returns the subset of that mapping,
704 * else it returns null. In practical terms, if a gene has a contiguous
705 * mapping between (for example) GRCh37 and GRCh38, then we assume that its
706 * subsidiary exons occupy unchanged relative positions, and just compute
707 * these as offsets, rather than do another lookup of the mapping.
709 * If in future these assumptions prove invalid (e.g. for bacterial dna?!),
710 * simply remove this method or let it always return null.
712 * Warning: many rapid calls to the /map service map result in a 429 overload
722 protected int[] findSubsumedRangeMapping(int[] queryRange, String chromosome,
723 String species, String fromRef, String toRef)
725 String key = makeRangesKey(chromosome, species, fromRef, toRef);
726 if (assemblyMappings.containsKey(key))
728 Map<int[], int[]> mappedRanges = assemblyMappings.get(key);
729 for (Entry<int[], int[]> mappedRange : mappedRanges.entrySet())
731 int[] fromRange = mappedRange.getKey();
732 int[] toRange = mappedRange.getValue();
733 if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0])
736 * mapping is 1:1 in length, so we trust it to have no discontinuities
738 if (MappingUtils.rangeContains(fromRange, queryRange))
741 * fromRange subsumes our query range
743 int offset = queryRange[0] - fromRange[0];
744 int mappedRangeFrom = toRange[0] + offset;
745 int mappedRangeTo = mappedRangeFrom + (queryRange[1] - queryRange[0]);
746 return new int[] { mappedRangeFrom, mappedRangeTo };
755 * Transfers the sequence feature to the target sequence, locating its start
756 * and end range based on the mapping. Features which do not overlap the
757 * target sequence are ignored.
760 * @param targetSequence
762 * mapping from the feature's coordinates to the target sequence
764 protected void transferFeature(SequenceFeature sf,
765 SequenceI targetSequence, MapList mapping)
767 int[] mappedRange = mapping.locateInTo(sf.getBegin(), sf.getEnd());
769 if (mappedRange != null)
771 String group = sf.getFeatureGroup();
772 int newBegin = Math.min(mappedRange[0], mappedRange[1]);
773 int newEnd = Math.max(mappedRange[0], mappedRange[1]);
774 SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd,
775 group, sf.getScore());
776 targetSequence.addSequenceFeature(copy);
781 * Formats a ranges map lookup key
789 protected static String makeRangesKey(String chromosome, String species,
790 String fromRef, String toRef)
792 return species + EXCL + chromosome + EXCL + fromRef + EXCL