From ecb4fe242314037f6bd1d748bcfac5bbe1276afa Mon Sep 17 00:00:00 2001 From: gmungoc Date: Mon, 2 Oct 2017 15:37:01 +0100 Subject: [PATCH] JAL-2738 filter VEP consequence data to that for current transcript --- src/jalview/io/vcf/VCFLoader.java | 220 +++++++++++++++++++++++++++++++++++-- 1 file changed, 208 insertions(+), 12 deletions(-) diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index 0acf49e..89b1d2a 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -6,6 +6,7 @@ import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineCount; +import htsjdk.variant.vcf.VCFInfoHeaderLine; import jalview.analysis.AlignmentUtils; import jalview.analysis.Dna; @@ -39,18 +40,58 @@ import java.util.Map.Entry; */ public class VCFLoader { + /* + * keys to fields of VEP CSQ consequence data + * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html + */ + private static final String ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1... + + private static final String FEATURE_KEY = "Feature"; // Ensembl stable id + + /* + * default VCF INFO key for VEP consequence data + * NB this can be overridden running VEP with --vcf_info_field + * - we don't handle this case (require CSQ identifier) + */ + private static final String CSQ = "CSQ"; + + /* + * separator for fields in consequence data + */ + private static final String PIPE = "|"; + + private static final String PIPE_REGEX = "\\" + PIPE; + + /* + * key for Allele Frequency output by VEP + * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html + */ private static final String ALLELE_FREQUENCY_KEY = "AF"; + /* + * delimiter that separates multiple consequence data blocks + */ private static final String COMMA = ","; + /* + * (temporary) flag that determines whether Jalview adds one feature + * per VCF record, or one per allele (preferred) + */ private static final boolean FEATURE_PER_ALLELE = true; + /* + * the feature group assigned to a VCF variant in Jalview + */ private static final String FEATURE_GROUP_VCF = "VCF"; + /* + * internal delimiter used to build keys for assemblyMappings + * + */ private static final String EXCL = "!"; /* - * the alignment we are associated VCF data with + * the alignment we are associating VCF data with */ private AlignmentI al; @@ -66,6 +107,15 @@ public class VCFLoader */ private VCFHeader header; + /* + * the position (0...) of the ALLELE_NUM field in each block of + * CSQ (consequence) data (if declared in the VCF INFO header for CSQ) + * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html + */ + private int csqAlleleNumberFieldIndex = -1; + + private int csqFeatureFieldIndex = -1; + /** * Constructor given an alignment context * @@ -127,6 +177,12 @@ public class VCFLoader header = reader.getFileHeader(); VCFHeaderLine ref = header .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); + + /* + * note offset of CSQ ALLELE_NUM field if it is declared + */ + findAlleleNumberFieldIndex(); + // check if reference is wrt assembly19 (GRCh37) // todo may need to allow user to specify reference assembly? boolean isRefGrch37 = (ref != null && ref.getValue().contains( @@ -183,6 +239,36 @@ public class VCFLoader } /** + * If the CSQ INFO header declares that ALLELE_NUM is included in the data, + * record its (pipe-delimited) offset in each (comma-delimited) consequence + * block; CSQ fields are declared in the CSQ INFO Description e.g. + *

+ * Description="Consequence ...from ... VEP. Format: Allele|Consequence|... + */ + protected void findAlleleNumberFieldIndex() + { + VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ); + String desc = csqInfo.getDescription(); + if (desc != null) + { + String[] format = desc.split(PIPE_REGEX); + int index = 0; + for (String field : format) + { + if (ALLELE_NUM_KEY.equals(field)) + { + csqAlleleNumberFieldIndex = index; + } + if (FEATURE_KEY.equals(field)) + { + csqFeatureFieldIndex = index; + } + index++; + } + } + } + + /** * Transfers VCF features to sequences to which this sequence has a mapping. * If the mapping is 1:3, computes peptide variants from nucleotide variants. * @@ -563,7 +649,7 @@ public class VCFLoader sf.setValue(Gff3Helper.ALLELES, alleles); - addAlleleProperties(variant, sf, altAlleleIndex); + addAlleleProperties(variant, seq, sf, altAlleleIndex); seq.addSequenceFeature(sf); @@ -574,22 +660,35 @@ public class VCFLoader * Add any allele-specific VCF key-value data to the sequence feature * * @param variant + * @param seq * @param sf * @param altAlelleIndex */ protected void addAlleleProperties(VariantContext variant, + SequenceI seq, SequenceFeature sf, final int altAlelleIndex) { Map atts = variant.getAttributes(); - /* - * process variant data, extracting values which are allele-specific - * these may be per alternate allele (INFO[key].Number = 'A') - * or per allele including reference (INFO[key].Number = 'R') - */ for (Entry att : atts.entrySet()) { String key = att.getKey(); + + /* + * extract Consequence data (if present) that we are able to + * associated with the allele for this variant feature + */ + if (CSQ.equals(key) && csqAlleleNumberFieldIndex > -1) + { + addConsequences(att.getValue(), seq, sf, altAlelleIndex + 1); + return; + } + + /* + * we extract values for other data which are allele-specific; + * these may be per alternate allele (INFO[key].Number = 'A') + * or per allele including reference (INFO[key].Number = 'R') + */ VCFHeaderLineCount number = header.getInfoHeaderLine(key) .getCountType(); int index = altAlelleIndex; @@ -601,11 +700,7 @@ public class VCFLoader */ index++; } - /* - * CSQ behaves as if Number=A but declares as Number=. - * so give it special treatment - */ - else if (!"CSQ".equals(key) && number != VCFHeaderLineCount.A) + else if (number != VCFHeaderLineCount.A) { /* * don't save other values as not allele-related @@ -625,6 +720,107 @@ public class VCFLoader } /** + * Inspects CSQ data blocks (consequences) and adds attributes on the sequence + * feature for the current allele (and transcript if applicable) + *

+ * Allele matching: we require field ALLELE_NUM to match altAlleleIndex. If + * the CSQ data does not include ALLELE_NUM values then no data is added to + * the variant feature. + *

+ * Transcript matching: if sequence name can be identified to at least one of + * the consequences' Feature values, then select only consequences that match + * the value (i.e. consequences for the current transcript sequence). If not, + * take all consequences (this is the case when adding features to the gene + * sequence). + * + * @param value + * @param seq + * @param sf + * @param altAlelleIndex + * (1=first alternative allele...) + */ + protected void addConsequences(Object value, SequenceI seq, + SequenceFeature sf, + int altAlelleIndex) + { + if (!(value instanceof ArrayList)) + { + return; + } + + List consequences = (List) value; + + /* + * if CSQ data includes 'Feature', and any value matches the sequence name, + * then restrict consequence data to the matching value (transcript) + * i.e. just pick out consequences for the transcript the variant feature is on + */ + String seqName = seq.getName()== null ? "" : seq.getName().toLowerCase(); + boolean matchFeature = false; + String matchFeatureValue = null; + if (csqFeatureFieldIndex > -1) + { + for (String consequence : consequences) + { + String[] csqFields = consequence.split(PIPE_REGEX); + if (csqFields.length > csqFeatureFieldIndex) + { + String featureIdentifier = csqFields[csqFeatureFieldIndex]; + if (featureIdentifier.length() > 4 + && seqName.indexOf(featureIdentifier.toLowerCase()) > -1) + { + matchFeature = true; + matchFeatureValue = featureIdentifier; + } + } + } + } + + StringBuilder sb = new StringBuilder(128); + boolean found = false; + + for (String consequence : consequences) + { + String[] csqFields = consequence.split(PIPE_REGEX); + + /* + * check consequence is for the current transcript + */ + if (matchFeature) + { + if (csqFields.length <= csqFeatureFieldIndex) + { + continue; + } + String featureIdentifier = csqFields[csqFeatureFieldIndex]; + if (!featureIdentifier.equals(matchFeatureValue)) + { + continue; // consequence is for a different transcript + } + } + + if (csqFields.length > csqAlleleNumberFieldIndex) + { + String alleleNum = csqFields[csqAlleleNumberFieldIndex]; + if (String.valueOf(altAlelleIndex).equals(alleleNum)) + { + if (found) + { + sb.append(COMMA); + } + found = true; + sb.append(consequence); + } + } + } + + if (found) + { + sf.setValue(CSQ, sb.toString()); + } + } + + /** * A convenience method to complement a dna base and return the string value * of its complement * -- 1.7.10.2