import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLineCount;
+import htsjdk.variant.vcf.VCFInfoHeaderLine;
import jalview.analysis.AlignmentUtils;
import jalview.analysis.Dna;
*/
public class VCFLoader
{
+ /*
+ * keys to fields of VEP CSQ consequence data
+ * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html
+ */
+ private static final String ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1...
+
+ private static final String FEATURE_KEY = "Feature"; // Ensembl stable id
+
+ /*
+ * default VCF INFO key for VEP consequence data
+ * NB this can be overridden running VEP with --vcf_info_field
+ * - we don't handle this case (require CSQ identifier)
+ */
+ private static final String CSQ = "CSQ";
+
+ /*
+ * separator for fields in consequence data
+ */
+ private static final String PIPE = "|";
+
+ private static final String PIPE_REGEX = "\\" + PIPE;
+
+ /*
+ * key for Allele Frequency output by VEP
+ * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
+ */
private static final String ALLELE_FREQUENCY_KEY = "AF";
+ /*
+ * delimiter that separates multiple consequence data blocks
+ */
private static final String COMMA = ",";
+ /*
+ * (temporary) flag that determines whether Jalview adds one feature
+ * per VCF record, or one per allele (preferred)
+ */
private static final boolean FEATURE_PER_ALLELE = true;
+ /*
+ * the feature group assigned to a VCF variant in Jalview
+ */
private static final String FEATURE_GROUP_VCF = "VCF";
+ /*
+ * internal delimiter used to build keys for assemblyMappings
+ *
+ */
private static final String EXCL = "!";
/*
- * the alignment we are associated VCF data with
+ * the alignment we are associating VCF data with
*/
private AlignmentI al;
*/
private VCFHeader header;
+ /*
+ * the position (0...) of the ALLELE_NUM field in each block of
+ * CSQ (consequence) data (if declared in the VCF INFO header for CSQ)
+ * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
+ */
+ private int csqAlleleNumberFieldIndex = -1;
+
+ private int csqFeatureFieldIndex = -1;
+
/**
* Constructor given an alignment context
*
header = reader.getFileHeader();
VCFHeaderLine ref = header
.getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
+
+ /*
+ * note offset of CSQ ALLELE_NUM field if it is declared
+ */
+ findAlleleNumberFieldIndex();
+
// check if reference is wrt assembly19 (GRCh37)
// todo may need to allow user to specify reference assembly?
boolean isRefGrch37 = (ref != null && ref.getValue().contains(
}
/**
+ * If the CSQ INFO header declares that ALLELE_NUM is included in the data,
+ * record its (pipe-delimited) offset in each (comma-delimited) consequence
+ * block; CSQ fields are declared in the CSQ INFO Description e.g.
+ * <p>
+ * Description="Consequence ...from ... VEP. Format: Allele|Consequence|...
+ */
+ protected void findAlleleNumberFieldIndex()
+ {
+ VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ);
+ String desc = csqInfo.getDescription();
+ if (desc != null)
+ {
+ String[] format = desc.split(PIPE_REGEX);
+ int index = 0;
+ for (String field : format)
+ {
+ if (ALLELE_NUM_KEY.equals(field))
+ {
+ csqAlleleNumberFieldIndex = index;
+ }
+ if (FEATURE_KEY.equals(field))
+ {
+ csqFeatureFieldIndex = index;
+ }
+ index++;
+ }
+ }
+ }
+
+ /**
* Transfers VCF features to sequences to which this sequence has a mapping.
* If the mapping is 1:3, computes peptide variants from nucleotide variants.
*
sf.setValue(Gff3Helper.ALLELES, alleles);
- addAlleleProperties(variant, sf, altAlleleIndex);
+ addAlleleProperties(variant, seq, sf, altAlleleIndex);
seq.addSequenceFeature(sf);
* Add any allele-specific VCF key-value data to the sequence feature
*
* @param variant
+ * @param seq
* @param sf
* @param altAlelleIndex
*/
protected void addAlleleProperties(VariantContext variant,
+ SequenceI seq,
SequenceFeature sf, final int altAlelleIndex)
{
Map<String, Object> atts = variant.getAttributes();
- /*
- * process variant data, extracting values which are allele-specific
- * these may be per alternate allele (INFO[key].Number = 'A')
- * or per allele including reference (INFO[key].Number = 'R')
- */
for (Entry<String, Object> att : atts.entrySet())
{
String key = att.getKey();
+
+ /*
+ * extract Consequence data (if present) that we are able to
+ * associated with the allele for this variant feature
+ */
+ if (CSQ.equals(key) && csqAlleleNumberFieldIndex > -1)
+ {
+ addConsequences(att.getValue(), seq, sf, altAlelleIndex + 1);
+ return;
+ }
+
+ /*
+ * we extract values for other data which are allele-specific;
+ * these may be per alternate allele (INFO[key].Number = 'A')
+ * or per allele including reference (INFO[key].Number = 'R')
+ */
VCFHeaderLineCount number = header.getInfoHeaderLine(key)
.getCountType();
int index = altAlelleIndex;
*/
index++;
}
- /*
- * CSQ behaves as if Number=A but declares as Number=.
- * so give it special treatment
- */
- else if (!"CSQ".equals(key) && number != VCFHeaderLineCount.A)
+ else if (number != VCFHeaderLineCount.A)
{
/*
* don't save other values as not allele-related
}
/**
+ * Inspects CSQ data blocks (consequences) and adds attributes on the sequence
+ * feature for the current allele (and transcript if applicable)
+ * <p>
+ * Allele matching: we require field ALLELE_NUM to match altAlleleIndex. If
+ * the CSQ data does not include ALLELE_NUM values then no data is added to
+ * the variant feature.
+ * <p>
+ * Transcript matching: if sequence name can be identified to at least one of
+ * the consequences' Feature values, then select only consequences that match
+ * the value (i.e. consequences for the current transcript sequence). If not,
+ * take all consequences (this is the case when adding features to the gene
+ * sequence).
+ *
+ * @param value
+ * @param seq
+ * @param sf
+ * @param altAlelleIndex
+ * (1=first alternative allele...)
+ */
+ protected void addConsequences(Object value, SequenceI seq,
+ SequenceFeature sf,
+ int altAlelleIndex)
+ {
+ if (!(value instanceof ArrayList<?>))
+ {
+ return;
+ }
+
+ List<String> consequences = (List<String>) value;
+
+ /*
+ * if CSQ data includes 'Feature', and any value matches the sequence name,
+ * then restrict consequence data to the matching value (transcript)
+ * i.e. just pick out consequences for the transcript the variant feature is on
+ */
+ String seqName = seq.getName()== null ? "" : seq.getName().toLowerCase();
+ boolean matchFeature = false;
+ String matchFeatureValue = null;
+ if (csqFeatureFieldIndex > -1)
+ {
+ for (String consequence : consequences)
+ {
+ String[] csqFields = consequence.split(PIPE_REGEX);
+ if (csqFields.length > csqFeatureFieldIndex)
+ {
+ String featureIdentifier = csqFields[csqFeatureFieldIndex];
+ if (featureIdentifier.length() > 4
+ && seqName.indexOf(featureIdentifier.toLowerCase()) > -1)
+ {
+ matchFeature = true;
+ matchFeatureValue = featureIdentifier;
+ }
+ }
+ }
+ }
+
+ StringBuilder sb = new StringBuilder(128);
+ boolean found = false;
+
+ for (String consequence : consequences)
+ {
+ String[] csqFields = consequence.split(PIPE_REGEX);
+
+ /*
+ * check consequence is for the current transcript
+ */
+ if (matchFeature)
+ {
+ if (csqFields.length <= csqFeatureFieldIndex)
+ {
+ continue;
+ }
+ String featureIdentifier = csqFields[csqFeatureFieldIndex];
+ if (!featureIdentifier.equals(matchFeatureValue))
+ {
+ continue; // consequence is for a different transcript
+ }
+ }
+
+ if (csqFields.length > csqAlleleNumberFieldIndex)
+ {
+ String alleleNum = csqFields[csqAlleleNumberFieldIndex];
+ if (String.valueOf(altAlelleIndex).equals(alleleNum))
+ {
+ if (found)
+ {
+ sb.append(COMMA);
+ }
+ found = true;
+ sb.append(consequence);
+ }
+ }
+ }
+
+ if (found)
+ {
+ sf.setValue(CSQ, sb.toString());
+ }
+ }
+
+ /**
* A convenience method to complement a dna base and return the string value
* of its complement
*