1 package jalview.io.vcf;
3 import htsjdk.samtools.util.CloseableIterator;
4 import htsjdk.variant.variantcontext.Allele;
5 import htsjdk.variant.variantcontext.VariantContext;
6 import htsjdk.variant.vcf.VCFHeader;
7 import htsjdk.variant.vcf.VCFHeaderLine;
9 import jalview.analysis.AlignmentUtils;
10 import jalview.datamodel.AlignmentI;
11 import jalview.datamodel.DBRefEntry;
12 import jalview.datamodel.GeneLoci;
13 import jalview.datamodel.Mapping;
14 import jalview.datamodel.Sequence;
15 import jalview.datamodel.SequenceFeature;
16 import jalview.datamodel.SequenceI;
17 import jalview.ext.htsjdk.VCFReader;
18 import jalview.io.gff.SequenceOntologyI;
19 import jalview.util.MapList;
21 import java.util.List;
23 import java.util.Map.Entry;
25 public class VCFLoader
30 * Constructor given an alignment context
34 public VCFLoader(AlignmentI alignment)
40 * Loads VCF on to an alignment - provided it can be related to one or more
41 * sequence's chromosomal coordinates
45 public void loadVCF(String filePath)
47 VCFReader reader = null;
51 long start = System.currentTimeMillis();
52 reader = new VCFReader(filePath);
54 VCFHeader header = reader.getFileHeader();
55 VCFHeaderLine ref = header
56 .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
57 // check if reference is wrt assembly19 (GRCh37)
58 // todo may need to allow user to specify reference assembly?
59 boolean isRefGrch37 = (ref != null && ref.getValue().contains(
66 * query for VCF overlapping each sequence in turn
68 for (SequenceI seq : al.getSequences())
70 int added = loadVCF(seq, reader, isRefGrch37);
75 computePeptideVariants(seq);
78 long elapsed = System.currentTimeMillis() - start;
79 System.out.println(String.format(
80 "Added %d VCF variants to %d sequence(s) (%dms)", varCount,
86 System.err.println("Error processing VCF: " + e.getMessage());
92 * (Re-)computes peptide variants from dna variants, for any protein sequence
93 * to which the dna sequence has a mapping. Note that although duplicate
94 * features may get computed, they will not be added, since duplicate sequence
95 * features are ignored in Sequence.addSequenceFeature.
99 protected void computePeptideVariants(SequenceI dnaSeq)
101 DBRefEntry[] dbrefs = dnaSeq.getDBRefs();
106 for (DBRefEntry dbref : dbrefs)
108 Mapping mapping = dbref.getMap();
109 if (mapping == null || mapping.getTo() == null
110 || mapping.getMap().getFromRatio() != 3)
114 AlignmentUtils.computeProteinFeatures(dnaSeq, mapping.getTo(),
120 * Tries to add overlapping variants read from a VCF file to the given
121 * sequence, and returns the number of overlapping variants found. Note that
122 * this requires the sequence to hold information as to its chromosomal
123 * positions and reference, in order to be able to map the VCF variants to the
128 * @param isVcfRefGrch37
131 protected int loadVCF(SequenceI seq, VCFReader reader, boolean isVcfRefGrch37)
134 GeneLoci seqCoords = ((Sequence) seq).getGeneLoci();
135 if (seqCoords == null)
140 MapList mapping = seqCoords.getMapping();
141 List<int[]> seqChromosomalContigs = mapping.getToRanges();
142 for (int[] range : seqChromosomalContigs)
144 count += addVcfVariants(seq, reader, range, isVcfRefGrch37);
151 * Queries the VCF reader for any variants that overlap the given chromosome
152 * region of the sequence, and adds as variant features. Returns the number of
153 * overlapping variants found.
158 * start-end range of a sequence region in its chromosomal
160 * @param isVcfRefGrch37
161 * true if the VCF is with reference to GRCh37
164 protected int addVcfVariants(SequenceI seq, VCFReader reader,
165 int[] range, boolean isVcfRefGrch37)
167 GeneLoci seqCoords = ((Sequence) seq).getGeneLoci();
169 String chromosome = seqCoords.getChromosome();
170 String seqRef = seqCoords.getReference();
171 String species = seqCoords.getSpecies();
174 * map chromosomal coordinates from GRCh38 (sequence) to
175 * GRCh37 (VCF) if necessary
178 if ("GRCh38".equalsIgnoreCase(seqRef) && isVcfRefGrch37)
180 int[] newRange = mapReferenceRange(range, chromosome, species,
183 offset = newRange[0] - range[0];
188 * query the VCF for overlaps
191 MapList mapping = seqCoords.getMapping();
193 CloseableIterator<VariantContext> variants = reader.query(chromosome,
195 while (variants.hasNext())
198 * get variant location in sequence chromosomal coordinates
200 VariantContext variant = variants.next();
202 int start = variant.getStart() - offset;
203 int end = variant.getEnd() - offset;
206 * get location in sequence coordinates
208 int[] seqLocation = mapping.locateInFrom(start, end);
209 int featureStart = seqLocation[0];
210 int featureEnd = seqLocation[1];
211 addVariantFeatures(seq, variant, featureStart, featureEnd);
220 * Inspects the VCF variant record, and adds variant features to the sequence
224 * @param featureStart
227 protected void addVariantFeatures(SequenceI seq, VariantContext variant,
228 int featureStart, int featureEnd)
230 StringBuilder sb = new StringBuilder();
231 sb.append(variant.getReference().getBaseString());
234 for (Allele allele : variant.getAlleles())
236 if (!allele.isReference())
238 sb.append(",").append(allele.getBaseString());
242 String alleles = sb.toString(); // e.g. G,A,C
244 String type = SequenceOntologyI.SEQUENCE_VARIANT;
246 if (alleleCount == 1)
250 score = (float) variant.getAttributeAsDouble("AF", 0d);
251 } catch (NumberFormatException e)
256 SequenceFeature sf = new SequenceFeature(type, alleles, featureStart,
257 featureEnd, score, "VCF");
260 * only add 'alleles' property if a SNP, as we can
261 * only handle SNPs when computing peptide variants
265 sf.setValue("alleles", alleles);
268 Map<String, Object> atts = variant.getAttributes();
269 for (Entry<String, Object> att : atts.entrySet())
271 sf.setValue(att.getKey(), att.getValue());
273 seq.addSequenceFeature(sf);
277 * Call the Ensembl REST service that maps from one assembly reference's
278 * coordinates to another's
287 protected int[] mapReferenceRange(int[] range, String chromosome,
288 String species, String fromRef, String toRef)
291 // http://rest.ensembl.org/map/species/fromRef/chromosome:range[0]..range[1]:1/toRef?content-type=application/json
292 // and parse the JSON response
294 // 1922632 is the difference between 37 and 38 for chromosome 17
295 return new int[] { range[0] - 1922632, range[1] - 1922632 };