X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2Fvcf%2FVCFLoader.java;h=d4618113a22c73b97dd6a93ba42459515bc777b2;hb=3569b83486d410891bf61fe4157ac48062491e3a;hp=44e8e16531dacaaefa36e2f1cca4fc4c85b9815f;hpb=f5ddcb62310a9f32545231d0c5e1d7c364bfbb53;p=jalview.git diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index 44e8e16..d461811 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -1,32 +1,48 @@ package jalview.io.vcf; -import htsjdk.samtools.util.CloseableIterator; -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.vcf.VCFHeader; -import htsjdk.variant.vcf.VCFHeaderLine; - -import jalview.analysis.AlignmentUtils; import jalview.analysis.Dna; import jalview.api.AlignViewControllerGuiI; -import jalview.datamodel.AlignmentI; +import jalview.bin.Cache; import jalview.datamodel.DBRefEntry; -import jalview.datamodel.GeneLoci; +import jalview.datamodel.GeneLociI; import jalview.datamodel.Mapping; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; +import jalview.datamodel.features.FeatureAttributeType; +import jalview.datamodel.features.FeatureSource; +import jalview.datamodel.features.FeatureSources; import jalview.ext.ensembl.EnsemblMap; +import jalview.ext.htsjdk.HtsContigDb; import jalview.ext.htsjdk.VCFReader; import jalview.io.gff.Gff3Helper; import jalview.io.gff.SequenceOntologyI; import jalview.util.MapList; import jalview.util.MappingUtils; +import jalview.util.MessageManager; +import java.io.File; import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; +import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import htsjdk.samtools.SAMException; +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.util.CloseableIterator; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFHeaderLineCount; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; /** * A class to read VCF data (using the htsjdk) and add variants as sequence @@ -36,344 +52,1282 @@ import java.util.Map.Entry; */ public class VCFLoader { + private static final String UTF_8 = "UTF-8"; + + private static final String DEFAULT_SPECIES = "homo_sapiens"; + + /** + * A class to model the mapping from sequence to VCF coordinates. Cases include + * + */ + class VCFMap + { + final String chromosome; + + final MapList map; + + VCFMap(String chr, MapList m) + { + chromosome = chr; + map = m; + } + + @Override + public String toString() + { + return chromosome + ":" + map.toString(); + } + } + + /* + * Lookup keys, and default values, for Preference entries that describe + * patterns for VCF and VEP fields to capture + */ + private static final String VEP_FIELDS_PREF = "VEP_FIELDS"; + + private static final String VCF_FIELDS_PREF = "VCF_FIELDS"; + + private static final String DEFAULT_VCF_FIELDS = ".*"; + + private static final String DEFAULT_VEP_FIELDS = ".*";// "Allele,Consequence,IMPACT,SWISSPROT,SIFT,PolyPhen,CLIN_SIG"; + + /* + * Lookup keys, and default values, for Preference entries that give + * mappings from tokens in the 'reference' header to species or assembly + */ + private static final String VCF_ASSEMBLY = "VCF_ASSEMBLY"; + + private static final String DEFAULT_VCF_ASSEMBLY = "assembly19=GRCh37,hs37=GRCh37,grch37=GRCh37,grch38=GRCh38"; + + private static final String VCF_SPECIES = "VCF_SPECIES"; // default is human + + private static final String DEFAULT_REFERENCE = "grch37"; // fallback default is human GRCh37 + + /* + * keys to fields of VEP CSQ consequence data + * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html + */ + private static final String CSQ_CONSEQUENCE_KEY = "Consequence"; + private static final String CSQ_ALLELE_KEY = "Allele"; + private static final String CSQ_ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1... + private static final String CSQ_FEATURE_KEY = "Feature"; // Ensembl stable id + + /* + * default VCF INFO key for VEP consequence data + * NB this can be overridden running VEP with --vcf_info_field + * - we don't handle this case (require identifier to be CSQ) + */ + private static final String CSQ_FIELD = "CSQ"; + + /* + * separator for fields in consequence data is '|' + */ + private static final String PIPE_REGEX = "\\|"; + + /* + * delimiter that separates multiple consequence data blocks + */ + private static final String COMMA = ","; + + /* + * the feature group assigned to a VCF variant in Jalview + */ + private static final String FEATURE_GROUP_VCF = "VCF"; + + /* + * internal delimiter used to build keys for assemblyMappings + * + */ private static final String EXCL = "!"; /* - * the alignment we are associated VCF data with + * the VCF file we are processing + */ + protected String vcfFilePath; + + /* + * mappings between VCF and sequence reference assembly regions, as + * key = "species!chromosome!fromAssembly!toAssembly + * value = Map{fromRange, toRange} + */ + private Map> assemblyMappings; + + private VCFReader reader; + + /* + * holds details of the VCF header lines (metadata) + */ + private VCFHeader header; + + /* + * species (as a valid Ensembl term) the VCF is for + */ + private String vcfSpecies; + + /* + * genome assembly version (as a valid Ensembl identifier) the VCF is for + */ + private String vcfAssembly; + + /* + * a Dictionary of contigs (if present) referenced in the VCF file + */ + private SAMSequenceDictionary dictionary; + + /* + * the position (0...) of field in each block of + * CSQ (consequence) data (if declared in the VCF INFO header for CSQ) + * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html + */ + private int csqConsequenceFieldIndex = -1; + private int csqAlleleFieldIndex = -1; + private int csqAlleleNumberFieldIndex = -1; + private int csqFeatureFieldIndex = -1; + + // todo the same fields for SnpEff ANN data if wanted + // see http://snpeff.sourceforge.net/SnpEff_manual.html#input + + /* + * a unique identifier under which to save metadata about feature + * attributes (selected INFO field data) + */ + private String sourceId; + + /* + * The INFO IDs of data that is both present in the VCF file, and + * also matched by any filters for data of interest + */ + List vcfFieldsOfInterest; + + /* + * The field offsets and identifiers for VEP (CSQ) data that is both present + * in the VCF file, and also matched by any filters for data of interest + * for example 0 -> Allele, 1 -> Consequence, ..., 36 -> SIFT, ... + */ + Map vepFieldsOfInterest; + + /** + * Constructor given a VCF file + * + * @param alignment + */ + public VCFLoader(String vcfFile) + { + try + { + initialise(vcfFile); + } catch (IOException e) + { + System.err.println("Error opening VCF file: " + e.getMessage()); + } + + // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange} + assemblyMappings = new HashMap<>(); + } + + /** + * Starts a new thread to query and load VCF variant data on to the given + * sequences + *

+ * This method is not thread safe - concurrent threads should use separate + * instances of this class. + * + * @param seqs + * @param gui + */ + public void loadVCF(SequenceI[] seqs, final AlignViewControllerGuiI gui) + { + if (gui != null) + { + gui.setStatus(MessageManager.getString("label.searching_vcf")); + } + + new Thread() + { + @Override + public void run() + { + VCFLoader.this.doLoad(seqs, gui); + } + }.start(); + } + + /** + * Reads the specified contig sequence and adds its VCF variants to it + * + * @param contig + * the id of a single sequence (contig) to load + * @return */ - private AlignmentI al; + public SequenceI loadVCFContig(String contig) + { + VCFHeaderLine headerLine = header.getOtherHeaderLine(VCFHeader.REFERENCE_KEY); + if (headerLine == null) + { + Cache.log.error("VCF reference header not found"); + return null; + } + String ref = headerLine.getValue(); + if (ref.startsWith("file://")) + { + ref = ref.substring(7); + } + setSpeciesAndAssembly(ref); + + SequenceI seq = null; + File dbFile = new File(ref); + + if (dbFile.exists()) + { + HtsContigDb db = new HtsContigDb("", dbFile); + seq = db.getSequenceProxy(contig); + loadSequenceVCF(seq); + db.close(); + } + else + { + Cache.log.error("VCF reference not found: " + ref); + } + + return seq; + } + + /** + * Loads VCF on to one or more sequences + * + * @param seqs + * @param gui + * optional callback handler for messages + */ + protected void doLoad(SequenceI[] seqs, AlignViewControllerGuiI gui) + { + try + { + VCFHeaderLine ref = header + .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); + String reference = ref == null ? null : ref.getValue(); + + setSpeciesAndAssembly(reference); + + int varCount = 0; + int seqCount = 0; + + /* + * query for VCF overlapping each sequence in turn + */ + for (SequenceI seq : seqs) + { + int added = loadSequenceVCF(seq); + if (added > 0) + { + seqCount++; + varCount += added; + transferAddedFeatures(seq); + } + } + if (gui != null) + { + String msg = MessageManager.formatMessage("label.added_vcf", + varCount, seqCount); + gui.setStatus(msg); + if (gui.getFeatureSettingsUI() != null) + { + gui.getFeatureSettingsUI().discoverAllFeatureData(); + } + } + } catch (Throwable e) + { + System.err.println("Error processing VCF: " + e.getMessage()); + e.printStackTrace(); + if (gui != null) + { + gui.setStatus("Error occurred - see console for details"); + } + } finally + { + if (reader != null) + { + try + { + reader.close(); + } catch (IOException e) + { + // ignore + } + } + header = null; + dictionary = null; + } + } + + /** + * Attempts to determine and save the species and genome assembly version to + * which the VCF data applies. This may be done by parsing the {@code reference} + * header line, configured in a property file, or (potentially) confirmed + * interactively by the user. + *

+ * The saved values should be identifiers valid for Ensembl's REST service + * {@code map} endpoint, so they can be used (if necessary) to retrieve the + * mapping between VCF coordinates and sequence coordinates. + * + * @param reference + * @see https://rest.ensembl.org/documentation/info/assembly_map + * @see https://rest.ensembl.org/info/assembly/human?content-type=text/xml + * @see https://rest.ensembl.org/info/species?content-type=text/xml + */ + protected void setSpeciesAndAssembly(String reference) + { + if (reference == null) + { + Cache.log.error("No VCF ##reference found, defaulting to " + + DEFAULT_REFERENCE + ":" + DEFAULT_SPECIES); + reference = DEFAULT_REFERENCE; // default to GRCh37 if not specified + } + reference = reference.toLowerCase(); + + /* + * for a non-human species, or other assembly identifier, + * specify as a Jalview property file entry e.g. + * VCF_ASSEMBLY = hs37=GRCh37,assembly19=GRCh37 + * VCF_SPECIES = c_elegans=celegans + * to map a token in the reference header to a value + */ + String prop = Cache.getDefault(VCF_ASSEMBLY, DEFAULT_VCF_ASSEMBLY); + for (String token : prop.split(",")) + { + String[] tokens = token.split("="); + if (tokens.length == 2) + { + if (reference.contains(tokens[0].trim().toLowerCase())) + { + vcfAssembly = tokens[1].trim(); + break; + } + } + } + + vcfSpecies = DEFAULT_SPECIES; + prop = Cache.getProperty(VCF_SPECIES); + if (prop != null) + { + for (String token : prop.split(",")) + { + String[] tokens = token.split("="); + if (tokens.length == 2) + { + if (reference.contains(tokens[0].trim().toLowerCase())) + { + vcfSpecies = tokens[1].trim(); + break; + } + } + } + } + } + + /** + * Opens the VCF file and parses header data + * + * @param filePath + * @throws IOException + */ + private void initialise(String filePath) throws IOException + { + vcfFilePath = filePath; + + reader = new VCFReader(filePath); + + header = reader.getFileHeader(); + + try + { + dictionary = header.getSequenceDictionary(); + } catch (SAMException e) + { + // ignore - thrown if any contig line lacks length info + } + + sourceId = filePath; + + saveMetadata(sourceId); + + /* + * get offset of CSQ ALLELE_NUM and Feature if declared + */ + parseCsqHeader(); + } + + /** + * Reads metadata (such as INFO field descriptions and datatypes) and saves + * them for future reference + * + * @param theSourceId + */ + void saveMetadata(String theSourceId) + { + List vcfFieldPatterns = getFieldMatchers(VCF_FIELDS_PREF, + DEFAULT_VCF_FIELDS); + vcfFieldsOfInterest = new ArrayList<>(); + + FeatureSource metadata = new FeatureSource(theSourceId); + + for (VCFInfoHeaderLine info : header.getInfoHeaderLines()) + { + String attributeId = info.getID(); + String desc = info.getDescription(); + VCFHeaderLineType type = info.getType(); + FeatureAttributeType attType = null; + switch (type) + { + case Character: + attType = FeatureAttributeType.Character; + break; + case Flag: + attType = FeatureAttributeType.Flag; + break; + case Float: + attType = FeatureAttributeType.Float; + break; + case Integer: + attType = FeatureAttributeType.Integer; + break; + case String: + attType = FeatureAttributeType.String; + break; + } + metadata.setAttributeName(attributeId, desc); + metadata.setAttributeType(attributeId, attType); + + if (isFieldWanted(attributeId, vcfFieldPatterns)) + { + vcfFieldsOfInterest.add(attributeId); + } + } + + FeatureSources.getInstance().addSource(theSourceId, metadata); + } + + /** + * Answers true if the field id is matched by any of the filter patterns, else + * false. Matching is against regular expression patterns, and is not + * case-sensitive. + * + * @param id + * @param filters + * @return + */ + private boolean isFieldWanted(String id, List filters) + { + for (Pattern p : filters) + { + if (p.matcher(id.toUpperCase()).matches()) + { + return true; + } + } + return false; + } + + /** + * Records 'wanted' fields defined in the CSQ INFO header (if there is one). + * Also records the position of selected fields (Allele, ALLELE_NUM, Feature) + * required for processing. + *

+ * CSQ fields are declared in the CSQ INFO Description e.g. + *

+ * Description="Consequence ...from ... VEP. Format: Allele|Consequence|... + */ + protected void parseCsqHeader() + { + List vepFieldFilters = getFieldMatchers(VEP_FIELDS_PREF, + DEFAULT_VEP_FIELDS); + vepFieldsOfInterest = new HashMap<>(); + + VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ_FIELD); + if (csqInfo == null) + { + return; + } + + /* + * parse out the pipe-separated list of CSQ fields; we assume here that + * these form the last part of the description, and contain no spaces + */ + String desc = csqInfo.getDescription(); + int spacePos = desc.lastIndexOf(" "); + desc = desc.substring(spacePos + 1); + + if (desc != null) + { + String[] format = desc.split(PIPE_REGEX); + int index = 0; + for (String field : format) + { + if (CSQ_CONSEQUENCE_KEY.equals(field)) + { + csqConsequenceFieldIndex = index; + } + if (CSQ_ALLELE_NUM_KEY.equals(field)) + { + csqAlleleNumberFieldIndex = index; + } + if (CSQ_ALLELE_KEY.equals(field)) + { + csqAlleleFieldIndex = index; + } + if (CSQ_FEATURE_KEY.equals(field)) + { + csqFeatureFieldIndex = index; + } + + if (isFieldWanted(field, vepFieldFilters)) + { + vepFieldsOfInterest.put(index, field); + } + + index++; + } + } + } + + /** + * Reads the Preference value for the given key, with default specified if no + * preference set. The value is interpreted as a comma-separated list of + * regular expressions, and converted into a list of compiled patterns ready + * for matching. Patterns are forced to upper-case for non-case-sensitive + * matching. + *

+ * This supports user-defined filters for fields of interest to capture while + * processing data. For example, VCF_FIELDS = AF,AC* would mean that VCF INFO + * fields with an ID of AF, or starting with AC, would be matched. + * + * @param key + * @param def + * @return + */ + private List getFieldMatchers(String key, String def) + { + String pref = Cache.getDefault(key, def); + List patterns = new ArrayList<>(); + String[] tokens = pref.split(","); + for (String token : tokens) + { + try + { + patterns.add(Pattern.compile(token.toUpperCase())); + } catch (PatternSyntaxException e) + { + System.err.println("Invalid pattern ignored: " + token); + } + } + return patterns; + } + + /** + * Transfers VCF features to sequences to which this sequence has a mapping. + * If the mapping is 3:1, computes peptide variants from nucleotide variants. + * + * @param seq + */ + protected void transferAddedFeatures(SequenceI seq) + { + DBRefEntry[] dbrefs = seq.getDBRefs(); + if (dbrefs == null) + { + return; + } + for (DBRefEntry dbref : dbrefs) + { + Mapping mapping = dbref.getMap(); + if (mapping == null || mapping.getTo() == null) + { + continue; + } + + SequenceI mapTo = mapping.getTo(); + MapList map = mapping.getMap(); + if (map.getFromRatio() == 3) + { + /* + * dna-to-peptide product mapping + */ + // JAL-3187 render on the fly instead + // AlignmentUtils.computeProteinFeatures(seq, mapTo, map); + } + else + { + /* + * nucleotide-to-nucleotide mapping e.g. transcript to CDS + */ + List features = seq.getFeatures() + .getPositionalFeatures(SequenceOntologyI.SEQUENCE_VARIANT); + for (SequenceFeature sf : features) + { + if (FEATURE_GROUP_VCF.equals(sf.getFeatureGroup())) + { + transferFeature(sf, mapTo, map); + } + } + } + } + } + + /** + * Tries to add overlapping variants read from a VCF file to the given sequence, + * and returns the number of variant features added + * + * @param seq + * @return + */ + protected int loadSequenceVCF(SequenceI seq) + { + VCFMap vcfMap = getVcfMap(seq); + if (vcfMap == null) + { + return 0; + } + + /* + * work with the dataset sequence here + */ + SequenceI dss = seq.getDatasetSequence(); + if (dss == null) + { + dss = seq; + } + return addVcfVariants(dss, vcfMap); + } + + /** + * Answers a map from sequence coordinates to VCF chromosome ranges + * + * @param seq + * @return + */ + private VCFMap getVcfMap(SequenceI seq) + { + /* + * simplest case: sequence has id and length matching a VCF contig + */ + VCFMap vcfMap = null; + if (dictionary != null) + { + vcfMap = getContigMap(seq); + } + if (vcfMap != null) + { + return vcfMap; + } + + /* + * otherwise, map to VCF from chromosomal coordinates + * of the sequence (if known) + */ + GeneLociI seqCoords = seq.getGeneLoci(); + if (seqCoords == null) + { + Cache.log.warn(String.format( + "Can't query VCF for %s as chromosome coordinates not known", + seq.getName())); + return null; + } + + String species = seqCoords.getSpeciesId(); + String chromosome = seqCoords.getChromosomeId(); + String seqRef = seqCoords.getAssemblyId(); + MapList map = seqCoords.getMapping(); + + // note this requires the configured species to match that + // returned with the Ensembl sequence; todo: support aliases? + if (!vcfSpecies.equalsIgnoreCase(species)) + { + Cache.log.warn("No VCF loaded to " + seq.getName() + + " as species not matched"); + return null; + } + + if (seqRef.equalsIgnoreCase(vcfAssembly)) + { + return new VCFMap(chromosome, map); + } + + /* + * VCF data has a different reference assembly to the sequence: + * query Ensembl to map chromosomal coordinates from sequence to VCF + */ + List toVcfRanges = new ArrayList<>(); + List fromSequenceRanges = new ArrayList<>(); + + for (int[] range : map.getToRanges()) + { + int[] fromRange = map.locateInFrom(range[0], range[1]); + if (fromRange == null) + { + // corrupted map?!? + continue; + } + + int[] newRange = mapReferenceRange(range, chromosome, "human", seqRef, + vcfAssembly); + if (newRange == null) + { + Cache.log.error( + String.format("Failed to map %s:%s:%s:%d:%d to %s", species, + chromosome, seqRef, range[0], range[1], + vcfAssembly)); + continue; + } + else + { + toVcfRanges.add(newRange); + fromSequenceRanges.add(fromRange); + } + } - /* - * mappings between VCF and sequence reference assembly regions, as - * key = "species!chromosome!fromAssembly!toAssembly - * value = Map{fromRange, toRange} - */ - private Map> assemblyMappings; + return new VCFMap(chromosome, + new MapList(fromSequenceRanges, toVcfRanges, 1, 1)); + } /** - * Constructor given an alignment context + * If the sequence id matches a contig declared in the VCF file, and the + * sequence length matches the contig length, then returns a 1:1 map of the + * sequence to the contig, else returns null * - * @param alignment + * @param seq + * @return */ - public VCFLoader(AlignmentI alignment) + private VCFMap getContigMap(SequenceI seq) { - al = alignment; - - // map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange} - assemblyMappings = new HashMap>(); + String id = seq.getName(); + SAMSequenceRecord contig = dictionary.getSequence(id); + if (contig != null) + { + int len = seq.getLength(); + if (len == contig.getSequenceLength()) + { + MapList map = new MapList(new int[] { 1, len }, + new int[] + { 1, len }, 1, 1); + return new VCFMap(id, map); + } + } + return null; } /** - * Loads VCF on to an alignment - provided it can be related to one or more - * sequence's chromosomal coordinates. - *

- * This method is not thread safe - concurrent threads should use separate - * instances of this class. + * Queries the VCF reader for any variants that overlap the mapped chromosome + * ranges of the sequence, and adds as variant features. Returns the number of + * overlapping variants found. * - * @param filePath - * @param status + * @param seq + * @param map + * mapping from sequence to VCF coordinates + * @return */ - public void loadVCF(String filePath, AlignViewControllerGuiI status) + protected int addVcfVariants(SequenceI seq, VCFMap map) { - VCFReader reader = null; - - try - { - // long start = System.currentTimeMillis(); - reader = new VCFReader(filePath); - - VCFHeader header = reader.getFileHeader(); - VCFHeaderLine ref = header - .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); - // check if reference is wrt assembly19 (GRCh37) - // todo may need to allow user to specify reference assembly? - boolean isRefGrch37 = (ref != null && ref.getValue().contains( - "assembly19")); + boolean forwardStrand = map.map.isToForwardStrand(); - int varCount = 0; - int seqCount = 0; + /* + * query the VCF for overlaps of each contiguous chromosomal region + */ + int count = 0; - /* - * query for VCF overlapping each sequence in turn - */ - for (SequenceI seq : al.getSequences()) - { - int added = loadVCF(seq, reader, isRefGrch37); - if (added > 0) - { - seqCount++; - varCount += added; - computePeptideVariants(seq); - } - } - // long elapsed = System.currentTimeMillis() - start; - String msg = String.format("Added %d VCF variants to %d sequence(s)", - varCount, seqCount); - if (status != null) - { - status.setStatus(msg); - } - } catch (Throwable e) - { - System.err.println("Error processing VCF: " + e.getMessage()); - e.printStackTrace(); - } finally + for (int[] range : map.map.getToRanges()) { - if (reader != null) + int vcfStart = Math.min(range[0], range[1]); + int vcfEnd = Math.max(range[0], range[1]); + CloseableIterator variants = reader + .query(map.chromosome, vcfStart, vcfEnd); + while (variants.hasNext()) { - try - { - reader.close(); - } catch (IOException e) + VariantContext variant = variants.next(); + + int[] featureRange = map.map.locateInFrom(variant.getStart(), + variant.getEnd()); + + if (featureRange != null) { - // ignore + int featureStart = Math.min(featureRange[0], featureRange[1]); + int featureEnd = Math.max(featureRange[0], featureRange[1]); + count += addAlleleFeatures(seq, variant, featureStart, featureEnd, + forwardStrand); } } + variants.close(); } + + return count; } /** - * (Re-)computes peptide variants from dna variants, for any protein sequence - * to which the dna sequence has a mapping. Note that although duplicate - * features may get computed, they will not be added, since duplicate sequence - * features are ignored in Sequence.addSequenceFeature. + * A convenience method to get an attribute value for an alternate allele * - * @param dnaSeq + * @param variant + * @param attributeName + * @param alleleIndex + * @return */ - protected void computePeptideVariants(SequenceI dnaSeq) + protected String getAttributeValue(VariantContext variant, + String attributeName, int alleleIndex) { - DBRefEntry[] dbrefs = dnaSeq.getDBRefs(); - if (dbrefs == null) + Object att = variant.getAttribute(attributeName); + + if (att instanceof String) { - return; + return (String) att; } - for (DBRefEntry dbref : dbrefs) + else if (att instanceof ArrayList) { - Mapping mapping = dbref.getMap(); - if (mapping == null || mapping.getTo() == null - || mapping.getMap().getFromRatio() != 3) - { - continue; - } - AlignmentUtils.computeProteinFeatures(dnaSeq, mapping.getTo(), - mapping.getMap()); + return ((List) att).get(alleleIndex); } + + return null; } /** - * Tries to add overlapping variants read from a VCF file to the given - * sequence, and returns the number of overlapping variants found. Note that - * this requires the sequence to hold information as to its chromosomal - * positions and reference, in order to be able to map the VCF variants to the - * sequence. + * Adds one variant feature for each allele in the VCF variant record, and + * returns the number of features added. * * @param seq - * @param reader - * @param isVcfRefGrch37 + * @param variant + * @param featureStart + * @param featureEnd + * @param forwardStrand * @return */ - protected int loadVCF(SequenceI seq, VCFReader reader, - boolean isVcfRefGrch37) + protected int addAlleleFeatures(SequenceI seq, VariantContext variant, + int featureStart, int featureEnd, boolean forwardStrand) { - int count = 0; - GeneLoci seqCoords = seq.getGeneLoci(); - if (seqCoords == null) + int added = 0; + + /* + * Javadoc says getAlternateAlleles() imposes no order on the list returned + * so we proceed defensively to get them in strict order + */ + int altAlleleCount = variant.getAlternateAlleles().size(); + for (int i = 0; i < altAlleleCount; i++) { - return 0; + added += addAlleleFeature(seq, variant, i, featureStart, featureEnd, + forwardStrand); + } + return added; + } + + /** + * Inspects one allele and attempts to add a variant feature for it to the + * sequence. The additional data associated with this allele is extracted to + * store in the feature's key-value map. Answers the number of features added (0 + * or 1). + * + * @param seq + * @param variant + * @param altAlleleIndex + * (0, 1..) + * @param featureStart + * @param featureEnd + * @param forwardStrand + * @return + */ + protected int addAlleleFeature(SequenceI seq, VariantContext variant, + int altAlleleIndex, int featureStart, int featureEnd, + boolean forwardStrand) + { + String reference = variant.getReference().getBaseString(); + Allele alt = variant.getAlternateAllele(altAlleleIndex); + String allele = alt.getBaseString(); + + /* + * insertion after a genomic base, if on reverse strand, has to be + * converted to insertion of complement after the preceding position + */ + int referenceLength = reference.length(); + if (!forwardStrand && allele.length() > referenceLength + && allele.startsWith(reference)) + { + featureStart -= referenceLength; + featureEnd = featureStart; + char insertAfter = seq.getCharAt(featureStart - seq.getStart()); + reference = Dna.reverseComplement(String.valueOf(insertAfter)); + allele = allele.substring(referenceLength) + reference; } - List seqChromosomalContigs = seqCoords.mapping.getToRanges(); - for (int[] range : seqChromosomalContigs) + /* + * build the ref,alt allele description e.g. "G,A", using the base + * complement if the sequence is on the reverse strand + */ + StringBuilder sb = new StringBuilder(); + sb.append(forwardStrand ? reference : Dna.reverseComplement(reference)); + sb.append(COMMA); + sb.append(forwardStrand ? allele : Dna.reverseComplement(allele)); + String alleles = sb.toString(); // e.g. G,A + + /* + * pick out the consequence data (if any) that is for the current allele + * and feature (transcript) that matches the current sequence + */ + String consequence = getConsequenceForAlleleAndFeature(variant, CSQ_FIELD, + altAlleleIndex, csqAlleleFieldIndex, + csqAlleleNumberFieldIndex, seq.getName().toLowerCase(), + csqFeatureFieldIndex); + + /* + * pick out the ontology term for the consequence type + */ + String type = SequenceOntologyI.SEQUENCE_VARIANT; + if (consequence != null) { - count += addVcfVariants(seq, reader, range, isVcfRefGrch37); + type = getOntologyTerm(consequence); } - return count; + SequenceFeature sf = new SequenceFeature(type, alleles, featureStart, + featureEnd, FEATURE_GROUP_VCF); + sf.setSource(sourceId); + + sf.setValue(Gff3Helper.ALLELES, alleles); + + addAlleleProperties(variant, sf, altAlleleIndex, consequence); + + seq.addSequenceFeature(sf); + + return 1; } /** - * Queries the VCF reader for any variants that overlap the given chromosome - * region of the sequence, and adds as variant features. Returns the number of - * overlapping variants found. + * Determines the Sequence Ontology term to use for the variant feature type in + * Jalview. The default is 'sequence_variant', but a more specific term is used + * if: + *

    + *
  • VEP (or SnpEff) Consequence annotation is included in the VCF
  • + *
  • sequence id can be matched to VEP Feature (or SnpEff Feature_ID)
  • + *
* - * @param seq - * @param reader - * @param range - * start-end range of a sequence region in its chromosomal - * coordinates - * @param isVcfRefGrch37 - * true if the VCF is with reference to GRCh37 + * @param consequence * @return + * @see http://www.sequenceontology.org/browser/current_svn/term/SO:0001060 */ - protected int addVcfVariants(SequenceI seq, VCFReader reader, - int[] range, boolean isVcfRefGrch37) + String getOntologyTerm(String consequence) { - GeneLoci seqCoords = seq.getGeneLoci(); + String type = SequenceOntologyI.SEQUENCE_VARIANT; + + /* + * could we associate Consequence data with this allele and feature (transcript)? + * if so, prefer the consequence term from that data + */ + if (csqAlleleFieldIndex == -1) // && snpEffAlleleFieldIndex == -1 + { + /* + * no Consequence data so we can't refine the ontology term + */ + return type; + } + + if (consequence != null) + { + String[] csqFields = consequence.split(PIPE_REGEX); + if (csqFields.length > csqConsequenceFieldIndex) + { + type = csqFields[csqConsequenceFieldIndex]; + } + } + else + { + // todo the same for SnpEff consequence data matching if wanted + } + + /* + * if of the form (e.g.) missense_variant&splice_region_variant, + * just take the first ('most severe') consequence + */ + if (type != null) + { + int pos = type.indexOf('&'); + if (pos > 0) + { + type = type.substring(0, pos); + } + } + return type; + } - String chromosome = seqCoords.chromosome; - String seqRef = seqCoords.assembly; - String species = seqCoords.species; + /** + * Returns matched consequence data if it can be found, else null. + *
    + *
  • inspects the VCF data for key 'vcfInfoId'
  • + *
  • splits this on comma (to distinct consequences)
  • + *
  • returns the first consequence (if any) where
  • + *
      + *
    • the allele matches the altAlleleIndex'th allele of variant
    • + *
    • the feature matches the sequence name (e.g. transcript id)
    • + *
    + *
+ * If matched, the consequence is returned (as pipe-delimited fields). + * + * @param variant + * @param vcfInfoId + * @param altAlleleIndex + * @param alleleFieldIndex + * @param alleleNumberFieldIndex + * @param seqName + * @param featureFieldIndex + * @return + */ + private String getConsequenceForAlleleAndFeature(VariantContext variant, + String vcfInfoId, int altAlleleIndex, int alleleFieldIndex, + int alleleNumberFieldIndex, + String seqName, int featureFieldIndex) + { + if (alleleFieldIndex == -1 || featureFieldIndex == -1) + { + return null; + } + Object value = variant.getAttribute(vcfInfoId); - // TODO handle species properly - if ("".equals(species)) + if (value == null || !(value instanceof List)) { - species = "human"; + return null; } /* - * map chromosomal coordinates from GRCh38 (sequence) to - * GRCh37 (VCF) if necessary + * inspect each consequence in turn (comma-separated blocks + * extracted by htsjdk) */ - // TODO generalise for other assemblies and species - int offset = 0; - String fromRef = "GRCh38"; - if (fromRef.equalsIgnoreCase(seqRef) && isVcfRefGrch37) - { - String toRef = "GRCh37"; - int[] newRange = mapReferenceRange(range, chromosome, species, - fromRef, toRef); - if (newRange == null) + List consequences = (List) value; + + for (String consequence : consequences) + { + String[] csqFields = consequence.split(PIPE_REGEX); + if (csqFields.length > featureFieldIndex) { - System.err.println(String.format( - "Failed to map %s:%s:%s:%d:%d to %s", species, chromosome, - fromRef, range[0], range[1], toRef)); - return 0; + String featureIdentifier = csqFields[featureFieldIndex]; + if (featureIdentifier.length() > 4 + && seqName.indexOf(featureIdentifier.toLowerCase()) > -1) + { + /* + * feature (transcript) matched - now check for allele match + */ + if (matchAllele(variant, altAlleleIndex, csqFields, + alleleFieldIndex, alleleNumberFieldIndex)) + { + return consequence; + } + } } - offset = newRange[0] - range[0]; - range = newRange; } + return null; + } - boolean forwardStrand = range[0] <= range[1]; + private boolean matchAllele(VariantContext variant, int altAlleleIndex, + String[] csqFields, int alleleFieldIndex, + int alleleNumberFieldIndex) + { + /* + * if ALLELE_NUM is present, it must match altAlleleIndex + * NB first alternate allele is 1 for ALLELE_NUM, 0 for altAlleleIndex + */ + if (alleleNumberFieldIndex > -1) + { + if (csqFields.length <= alleleNumberFieldIndex) + { + return false; + } + String alleleNum = csqFields[alleleNumberFieldIndex]; + return String.valueOf(altAlleleIndex + 1).equals(alleleNum); + } /* - * query the VCF for overlaps - * (convert a reverse strand range to forwards) + * else consequence allele must match variant allele */ - int count = 0; - MapList mapping = seqCoords.mapping; + if (alleleFieldIndex > -1 && csqFields.length > alleleFieldIndex) + { + String csqAllele = csqFields[alleleFieldIndex]; + String vcfAllele = variant.getAlternateAllele(altAlleleIndex) + .getBaseString(); + return csqAllele.equals(vcfAllele); + } + return false; + } + + /** + * Add any allele-specific VCF key-value data to the sequence feature + * + * @param variant + * @param sf + * @param altAlelleIndex + * (0, 1..) + * @param consequence + * if not null, the consequence specific to this sequence (transcript + * feature) and allele + */ + protected void addAlleleProperties(VariantContext variant, + SequenceFeature sf, final int altAlelleIndex, String consequence) + { + Map atts = variant.getAttributes(); - int fromLocus = Math.min(range[0], range[1]); - int toLocus = Math.max(range[0], range[1]); - CloseableIterator variants = reader.query(chromosome, - fromLocus, toLocus); - while (variants.hasNext()) + for (Entry att : atts.entrySet()) { + String key = att.getKey(); + /* - * get variant location in sequence chromosomal coordinates + * extract Consequence data (if present) that we are able to + * associated with the allele for this variant feature */ - VariantContext variant = variants.next(); + if (CSQ_FIELD.equals(key)) + { + addConsequences(variant, sf, consequence); + continue; + } /* - * we can only process SNP variants (which can be reported - * as part of a MIXED variant record + * filter out fields we don't want to capture */ - if (!variant.isSNP() && !variant.isMixed()) + if (!vcfFieldsOfInterest.contains(key)) { continue; } - count++; - int start = variant.getStart() - offset; - int end = variant.getEnd() - offset; - /* - * convert chromosomal location to sequence coordinates - * - null if a partially overlapping feature + * we extract values for other data which are allele-specific; + * these may be per alternate allele (INFO[key].Number = 'A') + * or per allele including reference (INFO[key].Number = 'R') */ - int[] seqLocation = mapping.locateInFrom(start, end); - if (seqLocation != null) + VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(key); + if (infoHeader == null) { - addVariantFeatures(seq, variant, seqLocation[0], seqLocation[1], - forwardStrand); + /* + * can't be sure what data belongs to this allele, so + * play safe and don't take any + */ + continue; } - } - variants.close(); + VCFHeaderLineCount number = infoHeader.getCountType(); + int index = altAlelleIndex; + if (number == VCFHeaderLineCount.R) + { + /* + * one value per allele including reference, so bump index + * e.g. the 3rd value is for the 2nd alternate allele + */ + index++; + } + else if (number != VCFHeaderLineCount.A) + { + /* + * don't save other values as not allele-related + */ + continue; + } - return count; + /* + * take the index'th value + */ + String value = getAttributeValue(variant, key, index); + if (value != null) + { + /* + * VCF spec requires encoding of special characters e.g. '=' + * so decode them here before storing + */ + try + { + value = URLDecoder.decode(value, UTF_8); + } catch (UnsupportedEncodingException e) + { + } + sf.setValue(key, value); + } + } } /** - * Inspects the VCF variant record, and adds variant features to the sequence. - * Only SNP variants are added, not INDELs. + * Inspects CSQ data blocks (consequences) and adds attributes on the sequence + * feature. *

- * If the sequence maps to the reverse strand of the chromosome, reference and - * variant bases are recorded as their complements (C/G, A/T). + * If myConsequence is not null, then this is the specific + * consequence data (pipe-delimited fields) that is for the current allele and + * transcript (sequence) being processed) * - * @param seq * @param variant - * @param featureStart - * @param featureEnd - * @param forwardStrand + * @param sf + * @param myConsequence */ - protected void addVariantFeatures(SequenceI seq, VariantContext variant, - int featureStart, int featureEnd, boolean forwardStrand) + protected void addConsequences(VariantContext variant, SequenceFeature sf, + String myConsequence) { - byte[] reference = variant.getReference().getBases(); - if (reference.length != 1) - { - /* - * sorry, we don't handle INDEL variants - */ - return; - } + Object value = variant.getAttribute(CSQ_FIELD); - /* - * for now we extract allele frequency as feature score; note - * this attribute is String for a simple SNP, but List if - * multiple alleles at the locus; we extract for the simple case only - */ - Object af = variant.getAttribute("AF"); - float score = 0f; - if (af instanceof String) + if (value == null || !(value instanceof List)) { - try - { - score = Float.parseFloat((String) af); - } catch (NumberFormatException e) - { - // leave as 0 - } + return; } - StringBuilder sb = new StringBuilder(); - sb.append(forwardStrand ? reference : Dna - .getComplement((char) reference[0])); + List consequences = (List) value; /* - * inspect alleles and record SNP variants (as the variant - * record could be MIXED and include INDEL and SNP alleles) - * warning: getAlleles gives no guarantee as to the order - * in which they are returned + * inspect CSQ consequences; restrict to the consequence + * associated with the current transcript (Feature) */ - for (Allele allele : variant.getAlleles()) + Map csqValues = new HashMap<>(); + + for (String consequence : consequences) { - if (!allele.isReference()) + if (myConsequence == null || myConsequence.equals(consequence)) { - byte[] alleleBase = allele.getBases(); - if (alleleBase.length == 1) + String[] csqFields = consequence.split(PIPE_REGEX); + + /* + * inspect individual fields of this consequence, copying non-null + * values which are 'fields of interest' + */ + int i = 0; + for (String field : csqFields) { - sb.append(",").append( - forwardStrand ? alleleBase : Dna - .getComplement((char) alleleBase[0])); + if (field != null && field.length() > 0) + { + String id = vepFieldsOfInterest.get(i); + if (id != null) + { + /* + * VCF spec requires encoding of special characters e.g. '=' + * so decode them here before storing + */ + try + { + field = URLDecoder.decode(field, UTF_8); + } catch (UnsupportedEncodingException e) + { + } + csqValues.put(id, field); + } + } + i++; } } } - String alleles = sb.toString(); // e.g. G,A,C - - String type = SequenceOntologyI.SEQUENCE_VARIANT; - - SequenceFeature sf = new SequenceFeature(type, alleles, featureStart, - featureEnd, score, "VCF"); - - sf.setValue(Gff3Helper.ALLELES, alleles); - Map atts = variant.getAttributes(); - for (Entry att : atts.entrySet()) + if (!csqValues.isEmpty()) { - sf.setValue(att.getKey(), att.getValue()); + sf.setValue(CSQ_FIELD, csqValues); } - seq.addSequenceFeature(sf); + } + + /** + * A convenience method to complement a dna base and return the string value + * of its complement + * + * @param reference + * @return + */ + protected String complement(byte[] reference) + { + return String.valueOf(Dna.getComplement((char) reference[0])); } /** @@ -415,8 +1369,8 @@ public class VCFLoader * call (e.g.) http://rest.ensembl.org/map/human/GRCh38/17:45051610..45109016:1/GRCh37 */ EnsemblMap mapper = new EnsemblMap(); - int[] mapping = mapper.getMapping(species, chromosome, fromRef, toRef, - queryRange); + int[] mapping = mapper.getAssemblyMapping(species, chromosome, fromRef, + toRef, queryRange); if (mapping == null) { @@ -492,6 +1446,32 @@ public class VCFLoader } /** + * Transfers the sequence feature to the target sequence, locating its start + * and end range based on the mapping. Features which do not overlap the + * target sequence are ignored. + * + * @param sf + * @param targetSequence + * @param mapping + * mapping from the feature's coordinates to the target sequence + */ + protected void transferFeature(SequenceFeature sf, + SequenceI targetSequence, MapList mapping) + { + int[] mappedRange = mapping.locateInTo(sf.getBegin(), sf.getEnd()); + + if (mappedRange != null) + { + String group = sf.getFeatureGroup(); + int newBegin = Math.min(mappedRange[0], mappedRange[1]); + int newEnd = Math.max(mappedRange[0], mappedRange[1]); + SequenceFeature copy = new SequenceFeature(sf, newBegin, newEnd, + group, sf.getScore()); + targetSequence.addSequenceFeature(copy); + } + } + + /** * Formats a ranges map lookup key * * @param chromosome