package jalview.io.vcf; import htsjdk.samtools.util.CloseableIterator; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineCount; import htsjdk.variant.vcf.VCFInfoHeaderLine; import jalview.analysis.AlignmentUtils; import jalview.analysis.Dna; import jalview.api.AlignViewControllerGuiI; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.GeneLociI; import jalview.datamodel.Mapping; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import jalview.ext.ensembl.EnsemblMap; import jalview.ext.htsjdk.VCFReader; import jalview.io.gff.Gff3Helper; import jalview.io.gff.SequenceOntologyI; import jalview.util.MapList; import jalview.util.MappingUtils; import jalview.util.MessageManager; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; /** * A class to read VCF data (using the htsjdk) and add variants as sequence * features on dna and any related protein product sequences * * @author gmcarstairs */ public class VCFLoader { /* * keys to fields of VEP CSQ consequence data * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html */ private static final String ALLELE_KEY = "Allele"; private static final String ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1... private static final String FEATURE_KEY = "Feature"; // Ensembl stable id /* * what comes before column headings in CSQ Description field */ private static final String FORMAT = "Format: "; /* * default VCF INFO key for VEP consequence data * NB this can be overridden running VEP with --vcf_info_field * - we don't handle this case (require CSQ identifier) */ private static final String CSQ = "CSQ"; /* * separator for fields in consequence data */ private static final String PIPE = "|"; private static final String PIPE_REGEX = "\\" + PIPE; /* * key for Allele Frequency output by VEP * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html */ private static final String ALLELE_FREQUENCY_KEY = "AF"; /* * delimiter that separates multiple consequence data blocks */ private static final String COMMA = ","; /* * the feature group assigned to a VCF variant in Jalview */ private static final String FEATURE_GROUP_VCF = "VCF"; /* * internal delimiter used to build keys for assemblyMappings * */ private static final String EXCL = "!"; /* * the alignment we are associating VCF data with */ private AlignmentI al; /* * mappings between VCF and sequence reference assembly regions, as * key = "species!chromosome!fromAssembly!toAssembly * value = Map{fromRange, toRange} */ private Map> assemblyMappings; /* * holds details of the VCF header lines (metadata) */ private VCFHeader header; /* * the position (0...) of field in each block of * CSQ (consequence) data (if declared in the VCF INFO header for CSQ) * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html */ private int csqAlleleFieldIndex = -1; private int csqAlleleNumberFieldIndex = -1; private int csqFeatureFieldIndex = -1; /** * Constructor given an alignment context * * @param alignment */ public VCFLoader(AlignmentI alignment) { al = alignment; // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange} assemblyMappings = new HashMap>(); } /** * Starts a new thread to query and load VCF variant data on to the alignment *

* This method is not thread safe - concurrent threads should use separate * instances of this class. * * @param filePath * @param gui */ public void loadVCF(final String filePath, final AlignViewControllerGuiI gui) { if (gui != null) { gui.setStatus(MessageManager.getString("label.searching_vcf")); } new Thread() { @Override public void run() { VCFLoader.this.doLoad(filePath, gui); } }.start(); } /** * Loads VCF on to an alignment - provided it can be related to one or more * sequence's chromosomal coordinates * * @param filePath * @param gui * optional callback handler for messages */ protected void doLoad(String filePath, AlignViewControllerGuiI gui) { VCFReader reader = null; try { // long start = System.currentTimeMillis(); reader = new VCFReader(filePath); header = reader.getFileHeader(); VCFHeaderLine ref = header .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); /* * get offset of CSQ ALLELE_NUM and Feature if declared */ locateCsqFields(); // check if reference is wrt assembly19 (GRCh37) // todo may need to allow user to specify reference assembly? boolean isRefGrch37 = (ref != null && ref.getValue().contains( "assembly19")); int varCount = 0; int seqCount = 0; /* * query for VCF overlapping each sequence in turn */ for (SequenceI seq : al.getSequences()) { int added = loadSequenceVCF(seq, reader, isRefGrch37); if (added > 0) { seqCount++; varCount += added; transferAddedFeatures(seq); } } if (gui != null) { // long elapsed = System.currentTimeMillis() - start; String msg = MessageManager.formatMessage("label.added_vcf", varCount, seqCount); gui.setStatus(msg); if (gui.getFeatureSettingsUI() != null) { gui.getFeatureSettingsUI().discoverAllFeatureData(); } } } catch (Throwable e) { System.err.println("Error processing VCF: " + e.getMessage()); e.printStackTrace(); if (gui != null) { gui.setStatus("Error occurred - see console for details"); } } finally { if (reader != null) { try { reader.close(); } catch (IOException e) { // ignore } } } } /** * Records the position of selected fields defined in the CSQ INFO header (if * there is one). CSQ fields are declared in the CSQ INFO Description e.g. *

* Description="Consequence ...from ... VEP. Format: Allele|Consequence|... */ protected void locateCsqFields() { VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ); if (csqInfo == null) { return; } String desc = csqInfo.getDescription(); int formatPos = desc.indexOf(FORMAT); if (formatPos == -1) { System.err.println("Parse error, failed to find " + FORMAT + " in " + desc); return; } desc = desc.substring(formatPos + FORMAT.length()); if (desc != null) { String[] format = desc.split(PIPE_REGEX); int index = 0; for (String field : format) { if (ALLELE_NUM_KEY.equals(field)) { csqAlleleNumberFieldIndex = index; } if (ALLELE_KEY.equals(field)) { csqAlleleFieldIndex = index; } if (FEATURE_KEY.equals(field)) { csqFeatureFieldIndex = index; } index++; } } } /** * Transfers VCF features to sequences to which this sequence has a mapping. * If the mapping is 1:3, computes peptide variants from nucleotide variants. * * @param seq */ protected void transferAddedFeatures(SequenceI seq) { DBRefEntry[] dbrefs = seq.getDBRefs(); if (dbrefs == null) { return; } for (DBRefEntry dbref : dbrefs) { Mapping mapping = dbref.getMap(); if (mapping == null || mapping.getTo() == null) { continue; } SequenceI mapTo = mapping.getTo(); MapList map = mapping.getMap(); if (map.getFromRatio() == 3) { /* * dna-to-peptide product mapping */ AlignmentUtils.computeProteinFeatures(seq, mapTo, map); } else { /* * nucleotide-to-nucleotide mapping e.g. transcript to CDS */ List features = seq.getFeatures() .getPositionalFeatures(SequenceOntologyI.SEQUENCE_VARIANT); for (SequenceFeature sf : features) { if (FEATURE_GROUP_VCF.equals(sf.getFeatureGroup())) { transferFeature(sf, mapTo, map); } } } } } /** * Tries to add overlapping variants read from a VCF file to the given * sequence, and returns the number of variant features added. Note that this * requires the sequence to hold information as to its chromosomal positions * and reference, in order to be able to map the VCF variants to the sequence. * * @param seq * @param reader * @param isVcfRefGrch37 * @return */ protected int loadSequenceVCF(SequenceI seq, VCFReader reader, boolean isVcfRefGrch37) { int count = 0; GeneLociI seqCoords = seq.getGeneLoci(); if (seqCoords == null) { System.out.println(String.format( "Can't query VCF for %s as chromosome coordinates not known", seq.getName())); return 0; } List seqChromosomalContigs = seqCoords.getMap().getToRanges(); for (int[] range : seqChromosomalContigs) { count += addVcfVariants(seq, reader, range, isVcfRefGrch37); } return count; } /** * Queries the VCF reader for any variants that overlap the given chromosome * region of the sequence, and adds as variant features. Returns the number of * overlapping variants found. * * @param seq * @param reader * @param range * start-end range of a sequence region in its chromosomal * coordinates * @param isVcfRefGrch37 * true if the VCF is with reference to GRCh37 * @return */ protected int addVcfVariants(SequenceI seq, VCFReader reader, int[] range, boolean isVcfRefGrch37) { GeneLociI seqCoords = seq.getGeneLoci(); String chromosome = seqCoords.getChromosomeId(); String seqRef = seqCoords.getAssemblyId(); String species = seqCoords.getSpeciesId(); /* * map chromosomal coordinates from GRCh38 (sequence) to * GRCh37 (VCF) if necessary */ // TODO generalise for other assemblies and species int offset = 0; String fromRef = "GRCh38"; if (fromRef.equalsIgnoreCase(seqRef) && isVcfRefGrch37) { String toRef = "GRCh37"; int[] newRange = mapReferenceRange(range, chromosome, "human", fromRef, toRef); if (newRange == null) { System.err.println(String.format( "Failed to map %s:%s:%s:%d:%d to %s", species, chromosome, fromRef, range[0], range[1], toRef)); return 0; } offset = newRange[0] - range[0]; range = newRange; } boolean forwardStrand = range[0] <= range[1]; /* * query the VCF for overlaps * (convert a reverse strand range to forwards) */ int count = 0; MapList mapping = seqCoords.getMap(); int fromLocus = Math.min(range[0], range[1]); int toLocus = Math.max(range[0], range[1]); CloseableIterator variants = reader.query(chromosome, fromLocus, toLocus); while (variants.hasNext()) { /* * get variant location in sequence chromosomal coordinates */ VariantContext variant = variants.next(); /* * we can only process SNP variants (which can be reported * as part of a MIXED variant record */ if (!variant.isSNP() && !variant.isMixed()) { // continue; } int start = variant.getStart() - offset; int end = variant.getEnd() - offset; /* * convert chromosomal location to sequence coordinates * - null if a partially overlapping feature */ int[] seqLocation = mapping.locateInFrom(start, end); if (seqLocation != null) { count += addAlleleFeatures(seq, variant, seqLocation[0], seqLocation[1], forwardStrand); } } variants.close(); return count; } /** * A convenience method to get the AF value for the given alternate allele * index * * @param variant * @param alleleIndex * @return */ protected float getAlleleFrequency(VariantContext variant, int alleleIndex) { float score = 0f; String attributeValue = getAttributeValue(variant, ALLELE_FREQUENCY_KEY, alleleIndex); if (attributeValue != null) { try { score = Float.parseFloat(attributeValue); } catch (NumberFormatException e) { // leave as 0 } } return score; } /** * A convenience method to get an attribute value for an alternate allele * * @param variant * @param attributeName * @param alleleIndex * @return */ protected String getAttributeValue(VariantContext variant, String attributeName, int alleleIndex) { Object att = variant.getAttribute(attributeName); if (att instanceof String) { return (String) att; } else if (att instanceof ArrayList) { return ((List) att).get(alleleIndex); } return null; } /** * Adds one variant feature for each allele in the VCF variant record, and * returns the number of features added. * * @param seq * @param variant * @param featureStart * @param featureEnd * @param forwardStrand * @return */ protected int addAlleleFeatures(SequenceI seq, VariantContext variant, int featureStart, int featureEnd, boolean forwardStrand) { int added = 0; /* * Javadoc says getAlternateAlleles() imposes no order on the list returned * so we proceed defensively to get them in strict order */ int altAlleleCount = variant.getAlternateAlleles().size(); for (int i = 0; i < altAlleleCount; i++) { added += addAlleleFeature(seq, variant, i, featureStart, featureEnd, forwardStrand); } return added; } /** * Inspects one allele and attempts to add a variant feature for it to the * sequence. We extract as much as possible of the additional data associated * with this allele to store in the feature's key-value map. Answers the * number of features added (0 or 1). * * @param seq * @param variant * @param altAlleleIndex * (0, 1..) * @param featureStart * @param featureEnd * @param forwardStrand * @return */ protected int addAlleleFeature(SequenceI seq, VariantContext variant, int altAlleleIndex, int featureStart, int featureEnd, boolean forwardStrand) { String reference = variant.getReference().getBaseString(); Allele alt = variant.getAlternateAllele(altAlleleIndex); String allele = alt.getBaseString(); if (allele.length() != 1) { /* * not a SNP variant */ // return 0; } /* * build the ref,alt allele description e.g. "G,A", using the base * complement if the sequence is on the reverse strand */ // TODO check how structural variants are shown on reverse strand StringBuilder sb = new StringBuilder(); sb.append(forwardStrand ? reference : Dna.reverseComplement(reference)); sb.append(COMMA); sb.append(forwardStrand ? allele : Dna.reverseComplement(allele)); String alleles = sb.toString(); // e.g. G,A String type = SequenceOntologyI.SEQUENCE_VARIANT; float score = getAlleleFrequency(variant, altAlleleIndex); SequenceFeature sf = new SequenceFeature(type, alleles, featureStart, featureEnd, score, FEATURE_GROUP_VCF); sf.setValue(Gff3Helper.ALLELES, alleles); addAlleleProperties(variant, seq, sf, altAlleleIndex); seq.addSequenceFeature(sf); return 1; } /** * Add any allele-specific VCF key-value data to the sequence feature * * @param variant * @param seq * @param sf * @param altAlelleIndex * (0, 1..) */ protected void addAlleleProperties(VariantContext variant, SequenceI seq, SequenceFeature sf, final int altAlelleIndex) { Map atts = variant.getAttributes(); for (Entry att : atts.entrySet()) { String key = att.getKey(); /* * extract Consequence data (if present) that we are able to * associated with the allele for this variant feature */ if (CSQ.equals(key)) { addConsequences(variant, seq, sf, altAlelleIndex); continue; } /* * we extract values for other data which are allele-specific; * these may be per alternate allele (INFO[key].Number = 'A') * or per allele including reference (INFO[key].Number = 'R') */ VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(key); if (infoHeader == null) { /* * can't be sure what data belongs to this allele, so * play safe and don't take any */ continue; } VCFHeaderLineCount number = infoHeader.getCountType(); int index = altAlelleIndex; if (number == VCFHeaderLineCount.R) { /* * one value per allele including reference, so bump index * e.g. the 3rd value is for the 2nd alternate allele */ index++; } else if (number != VCFHeaderLineCount.A) { /* * don't save other values as not allele-related */ continue; } /* * take the index'th value */ String value = getAttributeValue(variant, key, index); if (value != null) { sf.setValue(key, value); } } } /** * Inspects CSQ data blocks (consequences) and adds attributes on the sequence * feature for the current allele (and transcript if applicable) *

* Allele matching: if field ALLELE_NUM is present, it must match * altAlleleIndex. If not present, then field Allele value must match the VCF * Allele. *

* Transcript matching: if sequence name can be identified to at least one of * the consequences' Feature values, then select only consequences that match * the value (i.e. consequences for the current transcript sequence). If not, * take all consequences (this is the case when adding features to the gene * sequence). * * @param variant * @param seq * @param sf * @param altAlelleIndex * (0, 1..) */ protected void addConsequences(VariantContext variant, SequenceI seq, SequenceFeature sf, int altAlelleIndex) { Object value = variant.getAttribute(CSQ); if (value == null || !(value instanceof ArrayList)) { return; } List consequences = (List) value; /* * if CSQ data includes 'Feature', and any value matches the sequence name, * then restrict consequence data to only the matching value (transcript) * i.e. just pick out consequences for the transcript the variant feature is on */ String seqName = seq.getName()== null ? "" : seq.getName().toLowerCase(); String matchFeature = null; if (csqFeatureFieldIndex > -1) { for (String consequence : consequences) { String[] csqFields = consequence.split(PIPE_REGEX); if (csqFields.length > csqFeatureFieldIndex) { String featureIdentifier = csqFields[csqFeatureFieldIndex]; if (featureIdentifier.length() > 4 && seqName.indexOf(featureIdentifier.toLowerCase()) > -1) { matchFeature = featureIdentifier; } } } } StringBuilder sb = new StringBuilder(128); boolean found = false; for (String consequence : consequences) { String[] csqFields = consequence.split(PIPE_REGEX); if (includeConsequence(csqFields, matchFeature, variant, altAlelleIndex)) { if (found) { sb.append(COMMA); } found = true; sb.append(consequence); } } if (found) { sf.setValue(CSQ, sb.toString()); } } /** * Answers true if we want to associate this block of consequence data with * the specified alternate allele of the VCF variant. *

* If consequence data includes the ALLELE_NUM field, then this has to match * altAlleleIndex. Otherwise the Allele field of the consequence data has to * match the allele value. *

* Optionally (if matchFeature is not null), restrict to only include * consequences whose Feature value matches. This allows us to attach * consequences to their respective transcripts. * * @param csqFields * @param matchFeature * @param variant * @param altAlelleIndex * (0, 1..) * @return */ protected boolean includeConsequence(String[] csqFields, String matchFeature, VariantContext variant, int altAlelleIndex) { /* * check consequence is for the current transcript */ if (matchFeature != null) { if (csqFields.length <= csqFeatureFieldIndex) { return false; } String featureIdentifier = csqFields[csqFeatureFieldIndex]; if (!featureIdentifier.equals(matchFeature)) { return false; // consequence is for a different transcript } } /* * if ALLELE_NUM is present, it must match altAlleleIndex * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex */ if (csqAlleleNumberFieldIndex > -1) { if (csqFields.length <= csqAlleleNumberFieldIndex) { return false; } String alleleNum = csqFields[csqAlleleNumberFieldIndex]; return String.valueOf(altAlelleIndex + 1).equals(alleleNum); } /* * else consequence allele must match variant allele */ if (csqAlleleFieldIndex > -1 && csqFields.length > csqAlleleFieldIndex) { String csqAllele = csqFields[csqAlleleFieldIndex]; String vcfAllele = variant.getAlternateAllele(altAlelleIndex) .getBaseString(); return csqAllele.equals(vcfAllele); } return false; } /** * A convenience method to complement a dna base and return the string value * of its complement * * @param reference * @return */ protected String complement(byte[] reference) { return String.valueOf(Dna.getComplement((char) reference[0])); } /** * Determines the location of the query range (chromosome positions) in a * different reference assembly. *

* If the range is just a subregion of one for which we already have a mapping * (for example, an exon sub-region of a gene), then the mapping is just * computed arithmetically. *

* Otherwise, calls the Ensembl REST service that maps from one assembly * reference's coordinates to another's * * @param queryRange * start-end chromosomal range in 'fromRef' coordinates * @param chromosome * @param species * @param fromRef * assembly reference for the query coordinates * @param toRef * assembly reference we wish to translate to * @return the start-end range in 'toRef' coordinates */ protected int[] mapReferenceRange(int[] queryRange, String chromosome, String species, String fromRef, String toRef) { /* * first try shorcut of computing the mapping as a subregion of one * we already have (e.g. for an exon, if we have the gene mapping) */ int[] mappedRange = findSubsumedRangeMapping(queryRange, chromosome, species, fromRef, toRef); if (mappedRange != null) { return mappedRange; } /* * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37 */ EnsemblMap mapper = new EnsemblMap(); int[] mapping = mapper.getMapping(species, chromosome, fromRef, toRef, queryRange); if (mapping == null) { // mapping service failure return null; } /* * save mapping for possible future re-use */ String key = makeRangesKey(chromosome, species, fromRef, toRef); if (!assemblyMappings.containsKey(key)) { assemblyMappings.put(key, new HashMap()); } assemblyMappings.get(key).put(queryRange, mapping); return mapping; } /** * If we already have a 1:1 contiguous mapping which subsumes the given query * range, this method just calculates and returns the subset of that mapping, * else it returns null. In practical terms, if a gene has a contiguous * mapping between (for example) GRCh37 and GRCh38, then we assume that its * subsidiary exons occupy unchanged relative positions, and just compute * these as offsets, rather than do another lookup of the mapping. *

* If in future these assumptions prove invalid (e.g. for bacterial dna?!), * simply remove this method or let it always return null. *

* Warning: many rapid calls to the /map service map result in a 429 overload * error response * * @param queryRange * @param chromosome * @param species * @param fromRef * @param toRef * @return */ protected int[] findSubsumedRangeMapping(int[] queryRange, String chromosome, String species, String fromRef, String toRef) { String key = makeRangesKey(chromosome, species, fromRef, toRef); if (assemblyMappings.containsKey(key)) { Map mappedRanges = assemblyMappings.get(key); for (Entry mappedRange : mappedRanges.entrySet()) { int[] fromRange = mappedRange.getKey(); int[] toRange = mappedRange.getValue(); if (fromRange[1] - fromRange[0] == toRange[1] - toRange[0]) { /* * mapping is 1:1 in length, so we trust it to have no discontinuities */ if (MappingUtils.rangeContains(fromRange, queryRange)) { /* * fromRange subsumes our query range */ int offset = queryRange[0] - fromRange[0]; int mappedRangeFrom = toRange[0] + offset; int mappedRangeTo = mappedRangeFrom + (queryRange[1] - queryRange[0]); return new int[] { mappedRangeFrom, mappedRangeTo }; } } } } return null; } /** * Transfers the sequence feature to the target sequence, locating its start * and end range based on the mapping. Features which do not overlap the * target sequence are ignored. * * @param sf * @param targetSequence * @param mapping * mapping from the feature's coordinates to the target sequence */ protected void transferFeature(SequenceFeature sf, SequenceI targetSequence, MapList mapping) { int[] mappedRange = mapping.locateInTo(sf.getBegin(), sf.getEnd()); if (mappedRange != null) { String group = sf.getFeatureGroup(); int newBegin = Math.min(mappedRange[0], mappedRange[1]); int newEnd = Math.max(mappedRange[0], mappedRange[1]); SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd, group, sf.getScore()); targetSequence.addSequenceFeature(copy); } } /** * Formats a ranges map lookup key * * @param chromosome * @param species * @param fromRef * @param toRef * @return */ protected static String makeRangesKey(String chromosome, String species, String fromRef, String toRef) { return species + EXCL + chromosome + EXCL + fromRef + EXCL + toRef; } }