1 package jalview.io.vcf;
3 import htsjdk.samtools.util.CloseableIterator;
4 import htsjdk.variant.variantcontext.Allele;
5 import htsjdk.variant.variantcontext.VariantContext;
6 import htsjdk.variant.vcf.VCFHeader;
7 import htsjdk.variant.vcf.VCFHeaderLine;
9 import jalview.datamodel.AlignmentI;
10 import jalview.datamodel.GeneLoci;
11 import jalview.datamodel.Sequence;
12 import jalview.datamodel.SequenceFeature;
13 import jalview.datamodel.SequenceI;
14 import jalview.ext.htsjdk.VCFReader;
15 import jalview.io.gff.SequenceOntologyI;
16 import jalview.util.MapList;
18 import java.util.List;
20 import java.util.Map.Entry;
22 public class VCFLoader
27 * Constructor given an alignment context
31 public VCFLoader(AlignmentI alignment)
37 * Loads VCF on to an alignment - provided it can be related to one or more
38 * sequence's chromosomal coordinates
42 public void loadVCF(String filePath)
44 VCFReader reader = null;
48 reader = new VCFReader(filePath);
50 VCFHeader header = reader.getFileHeader();
51 VCFHeaderLine ref = header
52 .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
53 // check if reference is wrt assembly19 (GRCh37)
54 // todo may need to allow user to specify reference assembly?
55 boolean isRefGrch37 = (ref != null && ref.getValue().contains(
61 for (SequenceI seq : al.getSequences())
63 int added = loadVCF(seq, reader, isRefGrch37);
70 System.out.println(String.format(
71 "Added %d VCF variants to %d sequence(s)", varCount,
77 System.err.println("Error processing VCF: " + e.getMessage());
83 * Tries to add overlapping variants read from a VCF file to the given
84 * sequence, and returns the number of overlapping variants found. Note that
85 * this requires the sequence to hold information as to its chromosomal
86 * positions and reference, in order to be able to map the VCF variants to the
91 * @param isVcfRefGrch37
94 protected int loadVCF(SequenceI seq, VCFReader reader, boolean isVcfRefGrch37)
97 GeneLoci seqCoords = ((Sequence) seq).getGeneLoci();
98 if (seqCoords == null)
104 * fudge - ensure chromosomal mapping from range is sequence start/end
105 * (in case end == 0 when the mapping is first created)
107 MapList mapping = seqCoords.getMapping();
108 if (mapping.getFromRanges().get(0)[1] == 0)
110 mapping.getFromRanges().get(0)[1] = seq.getEnd();
113 List<int[]> seqChromosomalContigs = mapping.getToRanges();
114 for (int[] range : seqChromosomalContigs)
116 count += addVcfVariants(seq, reader, range, isVcfRefGrch37);
123 * Queries the VCF reader for any variants that overlap the given chromosome
124 * region of the sequence, and adds as variant features. Returns the number of
125 * overlapping variants found.
130 * start-end range of a sequence region in its chromosomal
132 * @param isVcfRefGrch37
133 * true if the VCF is with reference to GRCh37
136 protected int addVcfVariants(SequenceI seq, VCFReader reader,
137 int[] range, boolean isVcfRefGrch37)
139 GeneLoci seqCoords = ((Sequence) seq).getGeneLoci();
141 String chromosome = seqCoords.getChromosome();
142 String seqRef = seqCoords.getReference();
143 String species = seqCoords.getSpecies();
146 * map chromosomal coordinates from GRCh38 (sequence) to
147 * GRCh37 (VCF) if necessary
150 if ("GRCh38".equalsIgnoreCase(seqRef) && isVcfRefGrch37)
152 int[] newRange = mapReferenceRange(range, chromosome, species,
155 offset = newRange[0] - range[0];
160 * query the VCF for overlaps
163 MapList mapping = seqCoords.getMapping();
165 CloseableIterator<VariantContext> variants = reader.query(chromosome,
167 while (variants.hasNext())
170 * get variant location in sequence chromosomal coordinates
172 VariantContext variant = variants.next();
174 int start = variant.getStart() - offset;
175 int end = variant.getEnd() - offset;
178 * get location in sequence coordinates
180 int[] seqLocation = mapping.locateInFrom(start, end);
181 int featureStart = seqLocation[0];
182 int featureEnd = seqLocation[1];
183 addVariantFeatures(seq, variant, featureStart, featureEnd);
192 * Inspects the VCF variant record, and adds variant features to the sequence
196 * @param featureStart
199 protected void addVariantFeatures(SequenceI seq, VariantContext variant,
200 int featureStart, int featureEnd)
202 StringBuilder sb = new StringBuilder();
203 sb.append(variant.getReference().getBaseString());
205 for (Allele allele : variant.getAlleles())
207 if (!allele.isReference())
209 sb.append(",").append(allele.getBaseString());
212 String alleles = sb.toString(); // e.g. G,A,C
214 String type = SequenceOntologyI.SEQUENCE_VARIANT;
215 SequenceFeature sf = new SequenceFeature(type, alleles, featureStart,
219 * only add 'alleles' property if a SNP, as we can
220 * only handle SNPs when computing peptide variants
224 sf.setValue("alleles", alleles);
227 Map<String, Object> atts = variant.getAttributes();
228 for (Entry<String, Object> att : atts.entrySet())
230 sf.setValue(att.getKey(), att.getValue());
232 seq.addSequenceFeature(sf);
236 * Call the Ensembl REST service that maps from one assembly reference's
237 * coordinates to another's
246 protected int[] mapReferenceRange(int[] range, String chromosome,
247 String species, String fromRef, String toRef)
250 // http://rest.ensembl.org/map/species/fromRef/chromosome:range[0]..range[1]:1/toRef?content-type=application/json
251 // and parse the JSON response
253 // 1922632 is the difference between 37 and 38 for chromosome 17
254 return new int[] { range[0] - 1922632, range[1] - 1922632 };