package jalview.io.vcf; import htsjdk.samtools.util.CloseableIterator; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import jalview.analysis.AlignmentUtils; import jalview.api.AlignViewControllerGuiI; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.GeneLoci; import jalview.datamodel.Mapping; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.ext.ensembl.EnsemblMap; import jalview.ext.htsjdk.VCFReader; import jalview.io.gff.SequenceOntologyI; import jalview.util.MapList; import java.io.IOException; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; /** * A class to read VCF data (using the htsjdk) and add variants as sequence * features on dna and any related protein product sequences * * @author gmcarstairs */ public class VCFLoader { private static final String EXCL = "!"; /* * the alignment we are associated VCF data with */ private AlignmentI al; /* * mappings between VCF and sequence reference assembly regions, as * key = "species!chromosome!fromAssembly!toAssembly * value = Map{fromRange, toRange} */ private Map> assemblyMappings; /** * Constructor given an alignment context * * @param alignment */ public VCFLoader(AlignmentI alignment) { al = alignment; // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange} assemblyMappings = new HashMap>(); } /** * Loads VCF on to an alignment - provided it can be related to one or more * sequence's chromosomal coordinates. *

* This method is not thread safe - concurrent threads should use separate * instances of this class. * * @param filePath * @param status */ public void loadVCF(String filePath, AlignViewControllerGuiI status) { VCFReader reader = null; try { // long start = System.currentTimeMillis(); reader = new VCFReader(filePath); VCFHeader header = reader.getFileHeader(); VCFHeaderLine ref = header .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); // check if reference is wrt assembly19 (GRCh37) // todo may need to allow user to specify reference assembly? boolean isRefGrch37 = (ref != null && ref.getValue().contains( "assembly19")); int varCount = 0; int seqCount = 0; /* * query for VCF overlapping each sequence in turn */ for (SequenceI seq : al.getSequences()) { int added = loadVCF(seq, reader, isRefGrch37); if (added > 0) { seqCount++; varCount += added; computePeptideVariants(seq); } } // long elapsed = System.currentTimeMillis() - start; String msg = String.format("Added %d VCF variants to %d sequence(s)", varCount, seqCount); if (status != null) { status.setStatus(msg); } } catch (Throwable e) { System.err.println("Error processing VCF: " + e.getMessage()); e.printStackTrace(); } finally { if (reader != null) { try { reader.close(); } catch (IOException e) { // ignore } } } } /** * (Re-)computes peptide variants from dna variants, for any protein sequence * to which the dna sequence has a mapping. Note that although duplicate * features may get computed, they will not be added, since duplicate sequence * features are ignored in Sequence.addSequenceFeature. * * @param dnaSeq */ protected void computePeptideVariants(SequenceI dnaSeq) { DBRefEntry[] dbrefs = dnaSeq.getDBRefs(); if (dbrefs == null) { return; } for (DBRefEntry dbref : dbrefs) { Mapping mapping = dbref.getMap(); if (mapping == null || mapping.getTo() == null || mapping.getMap().getFromRatio() != 3) { continue; } AlignmentUtils.computeProteinFeatures(dnaSeq, mapping.getTo(), mapping.getMap()); } } /** * Tries to add overlapping variants read from a VCF file to the given * sequence, and returns the number of overlapping variants found. Note that * this requires the sequence to hold information as to its chromosomal * positions and reference, in order to be able to map the VCF variants to the * sequence. * * @param seq * @param reader * @param isVcfRefGrch37 * @return */ protected int loadVCF(SequenceI seq, VCFReader reader, boolean isVcfRefGrch37) { int count = 0; GeneLoci seqCoords = ((Sequence) seq).getGeneLoci(); if (seqCoords == null) { return 0; } List seqChromosomalContigs = seqCoords.mapping.getToRanges(); for (int[] range : seqChromosomalContigs) { count += addVcfVariants(seq, reader, range, isVcfRefGrch37); } return count; } /** * Queries the VCF reader for any variants that overlap the given chromosome * region of the sequence, and adds as variant features. Returns the number of * overlapping variants found. * * @param seq * @param reader * @param range * start-end range of a sequence region in its chromosomal * coordinates * @param isVcfRefGrch37 * true if the VCF is with reference to GRCh37 * @return */ protected int addVcfVariants(SequenceI seq, VCFReader reader, int[] range, boolean isVcfRefGrch37) { GeneLoci seqCoords = ((Sequence) seq).getGeneLoci(); String chromosome = seqCoords.chromosome; String seqRef = seqCoords.assembly; String species = seqCoords.species; // TODO handle species properly if ("".equals(species)) { species = "human"; } /* * map chromosomal coordinates from GRCh38 (sequence) to * GRCh37 (VCF) if necessary */ // TODO generalise for other assemblies and species int offset = 0; String fromRef = "GRCh38"; if (fromRef.equalsIgnoreCase(seqRef) && isVcfRefGrch37) { String toRef = "GRCh37"; int[] newRange = mapReferenceRange(range, chromosome, species, fromRef, toRef); if (newRange == null) { System.err.println(String.format( "Failed to map %s:%s:%s:%d:%d to %s", species, chromosome, fromRef, range[0], range[1], toRef)); return 0; } offset = newRange[0] - range[0]; range = newRange; } /* * query the VCF for overlaps * (convert a reverse strand range to forwards) */ int count = 0; MapList mapping = seqCoords.mapping; int fromLocus = Math.min(range[0], range[1]); int toLocus = Math.max(range[0], range[1]); CloseableIterator variants = reader.query(chromosome, fromLocus, toLocus); while (variants.hasNext()) { /* * get variant location in sequence chromosomal coordinates */ VariantContext variant = variants.next(); count++; int start = variant.getStart() - offset; int end = variant.getEnd() - offset; /* * convert chromosomal location to sequence coordinates * - null if a partially overlapping feature */ int[] seqLocation = mapping.locateInFrom(start, end); if (seqLocation != null) { addVariantFeatures(seq, variant, seqLocation[0], seqLocation[1]); } } variants.close(); return count; } /** * Inspects the VCF variant record, and adds variant features to the sequence * * @param seq * @param variant * @param featureStart * @param featureEnd */ protected void addVariantFeatures(SequenceI seq, VariantContext variant, int featureStart, int featureEnd) { StringBuilder sb = new StringBuilder(); sb.append(variant.getReference().getBaseString()); int alleleCount = 0; for (Allele allele : variant.getAlleles()) { if (!allele.isReference()) { sb.append(",").append(allele.getBaseString()); alleleCount++; } } String alleles = sb.toString(); // e.g. G,A,C String type = SequenceOntologyI.SEQUENCE_VARIANT; float score = 0f; if (alleleCount == 1) { try { score = (float) variant.getAttributeAsDouble("AF", 0d); } catch (NumberFormatException e) { // leave score as 0 } } SequenceFeature sf = new SequenceFeature(type, alleles, featureStart, featureEnd, score, "VCF"); /* * only add 'alleles' property if a SNP, as we can * only handle SNPs when computing peptide variants */ if (variant.isSNP()) { sf.setValue("alleles", alleles); } Map atts = variant.getAttributes(); for (Entry att : atts.entrySet()) { sf.setValue(att.getKey(), att.getValue()); } seq.addSequenceFeature(sf); } /** * Determines the location of the query range (chromosome positions) in a * different reference assembly. *

* If the range is just a subregion of one for which we already have a mapping * (for example, an exon sub-region of a gene), then the mapping is just * computed arithmetically. *

* Otherwise, calls the Ensembl REST service that maps from one assembly * reference's coordinates to another's * * @param queryRange * start-end chromosomal range in 'fromRef' coordinates * @param chromosome * @param species * @param fromRef * assembly reference for the query coordinates * @param toRef * assembly reference we wish to translate to * @return the start-end range in 'toRef' coordinates */ protected int[] mapReferenceRange(int[] queryRange, String chromosome, String species, String fromRef, String toRef) { /* * first try shorcut of computing the mapping as a subregion of one * we already have (e.g. for an exon, if we have the gene mapping) */ int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome, species, fromRef, toRef); if (mappedRange != null) { return mappedRange; } /* * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37 */ EnsemblMap mapper = new EnsemblMap(); int[] mapping = mapper.getMapping(species, chromosome, fromRef, toRef, queryRange); if (mapping == null) { // mapping service failure return null; } /* * save mapping for possible future re-use */ String key = makeRangesKey(chromosome, species, fromRef, toRef); if (!assemblyMappings.containsKey(key)) { assemblyMappings.put(key, new HashMap()); } assemblyMappings.get(key).put(queryRange, mapping); return mapping; } /** * If we already have a 1:1 contiguous mapping which subsumes the given query * range, this method just calculates and returns the subset of that mapping, * else it returns null. In practical terms, if a gene has a contiguous * mapping between (for example) GRCh37 and GRCh38, then we assume that its * subsidiary exons occupy unchanged relative positions, and just compute * these as offsets, rather than do another lookup of the mapping. *

* If in future these assumptions prove invalid (e.g. for bacterial dna?!), * simply remove this method or let it always return null. *

* Warning: many rapid calls to the /map service map result in a 429 overload * error response * * @param queryRange * @param chromosome * @param species * @param fromRef * @param toRef * @return */ protected int[] findSubsumedRangeMapping(int[] queryRange, String chromosome, String species, String fromRef, String toRef) { String key = makeRangesKey(chromosome, species, fromRef, toRef); if (assemblyMappings.containsKey(key)) { Map mappedRanges = assemblyMappings.get(key); for (Entry mappedRange : mappedRanges.entrySet()) { int[] fromRange = mappedRange.getKey(); int[] toRange = mappedRange.getValue(); if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0]) { /* * mapping is 1:1 in length, so we trust it to have no discontinuities */ if (rangeContains(fromRange, queryRange)) { /* * fromRange subsumes our query range */ int offset = queryRange[0] - fromRange[0]; int mappedRangeFrom = toRange[0] + offset; int mappedRangeTo = mappedRangeFrom + (queryRange[1] - queryRange[0]); return new int[] { mappedRangeFrom, mappedRangeTo }; } } } } return null; } /** * Answers true if range's start-end positions include those of queryRange, * where either range might be in reverse direction, else false * * @param range * @param queryRange * @return */ protected static boolean rangeContains(int[] range, int[] queryRange) { int min = Math.min(range[0], range[1]); int max = Math.max(range[0], range[1]); return (min <= queryRange[0] && max >= queryRange[0] && min <= queryRange[1] && max >= queryRange[1]); } /** * Formats a ranges map lookup key * * @param chromosome * @param species * @param fromRef * @param toRef * @return */ protected static String makeRangesKey(String chromosome, String species, String fromRef, String toRef) { return species + EXCL + chromosome + EXCL + fromRef + EXCL + toRef; } }