package jalview.io.vcf;
-import htsjdk.samtools.util.CloseableIterator;
-import htsjdk.variant.variantcontext.Allele;
-import htsjdk.variant.variantcontext.VariantContext;
-import htsjdk.variant.vcf.VCFHeader;
-import htsjdk.variant.vcf.VCFHeaderLine;
-import htsjdk.variant.vcf.VCFHeaderLineCount;
-import htsjdk.variant.vcf.VCFHeaderLineType;
-import htsjdk.variant.vcf.VCFInfoHeaderLine;
-
import jalview.analysis.AlignmentUtils;
import jalview.analysis.Dna;
import jalview.api.AlignViewControllerGuiI;
+import jalview.bin.Cache;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.DBRefEntry;
import jalview.datamodel.GeneLociI;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import htsjdk.samtools.util.CloseableIterator;
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.vcf.VCFHeader;
+import htsjdk.variant.vcf.VCFHeaderLine;
+import htsjdk.variant.vcf.VCFHeaderLineCount;
+import htsjdk.variant.vcf.VCFHeaderLineType;
+import htsjdk.variant.vcf.VCFInfoHeaderLine;
/**
* A class to read VCF data (using the htsjdk) and add variants as sequence
public class VCFLoader
{
/*
+ * Lookup keys, and default values, for Preference entries that describe
+ * patterns for VCF and VEP fields to capture
+ */
+ private static final String VEP_FIELDS_PREF = "VEP_FIELDS";
+
+ private static final String VCF_FIELDS_PREF = "VCF_FIELDS";
+
+ private static final String DEFAULT_VCF_FIELDS = "AF,AC*";
+
+ private static final String DEFAULT_VEP_FIELDS = ".*";// "Allele,Consequence,IMPACT,SWISSPROT,SIFT,PolyPhen,CLIN_SIG";
+
+ /*
* keys to fields of VEP CSQ consequence data
* see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html
*/
private static final String FEATURE_KEY = "Feature"; // Ensembl stable id
/*
- * what comes before column headings in CSQ Description field
- */
- private static final String FORMAT = "Format: ";
-
- /*
* default VCF INFO key for VEP consequence data
* NB this can be overridden running VEP with --vcf_info_field
- * - we don't handle this case (require CSQ identifier)
+ * - we don't handle this case (require identifier to be CSQ)
*/
- private static final String CSQ = "CSQ";
+ private static final String CSQ_FIELD = "CSQ";
/*
- * separator for fields in consequence data
+ * separator for fields in consequence data is '|'
*/
- private static final String PIPE = "|";
-
- private static final String PIPE_REGEX = "\\" + PIPE;
+ private static final String PIPE_REGEX = "\\|";
/*
* key for Allele Frequency output by VEP
*/
private String sourceId;
+ /*
+ * The INFO IDs of data that is both present in the VCF file, and
+ * also matched by any filters for data of interest
+ */
+ List<String> vcfFieldsOfInterest;
+
+ /*
+ * The field offsets and identifiers for VEP (CSQ) data that is both present
+ * in the VCF file, and also matched by any filters for data of interest
+ * for example 0 -> Allele, 1 -> Consequence, ..., 36 -> SIFT, ...
+ */
+ Map<Integer, String> vepFieldsOfInterest;
+
/**
* Constructor given an alignment context
*
al = alignment;
// map of species!chromosome!fromAssembly!toAssembly to {fromRange, toRange}
- assemblyMappings = new HashMap<String, Map<int[], int[]>>();
+ assemblyMappings = new HashMap<>();
}
/**
/*
* get offset of CSQ ALLELE_NUM and Feature if declared
*/
- locateCsqFields();
+ parseCsqHeader();
VCFHeaderLine ref = header
.getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
* Reads metadata (such as INFO field descriptions and datatypes) and saves
* them for future reference
*
- * @param sourceId
+ * @param theSourceId
*/
- void saveMetadata(String sourceId)
+ void saveMetadata(String theSourceId)
{
- FeatureSource metadata = new FeatureSource(sourceId);
+ List<Pattern> vcfFieldPatterns = getFieldMatchers(VCF_FIELDS_PREF,
+ DEFAULT_VCF_FIELDS);
+ vcfFieldsOfInterest = new ArrayList<>();
+
+ FeatureSource metadata = new FeatureSource(theSourceId);
for (VCFInfoHeaderLine info : header.getInfoHeaderLines())
{
}
metadata.setAttributeName(attributeId, desc);
metadata.setAttributeType(attributeId, attType);
+
+ if (isFieldWanted(attributeId, vcfFieldPatterns))
+ {
+ vcfFieldsOfInterest.add(attributeId);
+ }
}
- FeatureSources.getInstance().addSource(sourceId, metadata);
+ FeatureSources.getInstance().addSource(theSourceId, metadata);
}
/**
- * Records the position of selected fields defined in the CSQ INFO header (if
- * there is one). CSQ fields are declared in the CSQ INFO Description e.g.
+ * Answers true if the field id is matched by any of the filter patterns, else
+ * false. Matching is against regular expression patterns, and is not
+ * case-sensitive.
+ *
+ * @param id
+ * @param filters
+ * @return
+ */
+ private boolean isFieldWanted(String id, List<Pattern> filters)
+ {
+ for (Pattern p : filters)
+ {
+ if (p.matcher(id.toUpperCase()).matches())
+ {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Records 'wanted' fields defined in the CSQ INFO header (if there is one).
+ * Also records the position of selected fields (Allele, ALLELE_NUM, Feature)
+ * required for processing.
+ * <p>
+ * CSQ fields are declared in the CSQ INFO Description e.g.
* <p>
* Description="Consequence ...from ... VEP. Format: Allele|Consequence|...
*/
- protected void locateCsqFields()
+ protected void parseCsqHeader()
{
- VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ);
+ List<Pattern> vepFieldFilters = getFieldMatchers(VEP_FIELDS_PREF,
+ DEFAULT_VEP_FIELDS);
+ vepFieldsOfInterest = new HashMap<>();
+
+ VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ_FIELD);
if (csqInfo == null)
{
return;
}
+ /*
+ * parse out the pipe-separated list of CSQ fields; we assume here that
+ * these form the last part of the description, and contain no spaces
+ */
String desc = csqInfo.getDescription();
- int formatPos = desc.indexOf(FORMAT);
- if (formatPos == -1)
- {
- System.err.println("Parse error, failed to find " + FORMAT
- + " in " + desc);
- return;
- }
- desc = desc.substring(formatPos + FORMAT.length());
+ int spacePos = desc.lastIndexOf(" ");
+ desc = desc.substring(spacePos + 1);
if (desc != null)
{
{
csqFeatureFieldIndex = index;
}
+
+ if (isFieldWanted(field, vepFieldFilters))
+ {
+ vepFieldsOfInterest.put(index, field);
+ }
+
index++;
}
}
}
/**
+ * Reads the Preference value for the given key, with default specified if no
+ * preference set. The value is interpreted as a comma-separated list of
+ * regular expressions, and converted into a list of compiled patterns ready
+ * for matching. Patterns are forced to upper-case for non-case-sensitive
+ * matching.
+ * <p>
+ * This supports user-defined filters for fields of interest to capture while
+ * processing data. For example, VCF_FIELDS = AF,AC* would mean that VCF INFO
+ * fields with an ID of AF, or starting with AC, would be matched.
+ *
+ * @param key
+ * @param def
+ * @return
+ */
+ private List<Pattern> getFieldMatchers(String key, String def)
+ {
+ String pref = Cache.getDefault(key, def);
+ List<Pattern> patterns = new ArrayList<>();
+ String[] tokens = pref.split(",");
+ for (String token : tokens)
+ {
+ try
+ {
+ patterns.add(Pattern.compile(token.toUpperCase()));
+ } catch (PatternSyntaxException e)
+ {
+ System.err.println("Invalid pattern ignored: " + token);
+ }
+ }
+ return patterns;
+ }
+
+ /**
* Transfers VCF features to sequences to which this sequence has a mapping.
* If the mapping is 3:1, computes peptide variants from nucleotide variants.
*
* extract Consequence data (if present) that we are able to
* associated with the allele for this variant feature
*/
- if (CSQ.equals(key))
+ if (CSQ_FIELD.equals(key))
{
addConsequences(variant, seq, sf, altAlelleIndex);
continue;
}
/*
+ * filter out fields we don't want to capture
+ */
+ if (!vcfFieldsOfInterest.contains(key))
+ {
+ continue;
+ }
+
+ /*
* we extract values for other data which are allele-specific;
* these may be per alternate allele (INFO[key].Number = 'A')
* or per allele including reference (INFO[key].Number = 'R')
protected void addConsequences(VariantContext variant, SequenceI seq,
SequenceFeature sf, int altAlelleIndex)
{
- Object value = variant.getAttribute(CSQ);
+ Object value = variant.getAttribute(CSQ_FIELD);
if (value == null || !(value instanceof ArrayList<?>))
{
}
}
- StringBuilder sb = new StringBuilder(128);
- boolean found = false;
+ /*
+ * inspect CSQ consequences; where possible restrict to the consequence
+ * associated with the current transcript (Feature)
+ */
+ Map<String, String> csqValues = new HashMap<>();
for (String consequence : consequences)
{
if (includeConsequence(csqFields, matchFeature, variant,
altAlelleIndex))
{
- if (found)
+ /*
+ * inspect individual fields of this consequence, copying non-null
+ * values which are 'fields of interest'
+ */
+ int i = 0;
+ for (String field : csqFields)
{
- sb.append(COMMA);
+ if (field != null && field.length() > 0)
+ {
+ String id = vepFieldsOfInterest.get(i);
+ if (id != null)
+ {
+ csqValues.put(id, field);
+ }
+ }
+ i++;
}
- found = true;
- sb.append(consequence);
}
}
- if (found)
+ if (!csqValues.isEmpty())
{
- sf.setValue(CSQ, sb.toString());
+ sf.setValue(CSQ_FIELD, csqValues);
}
}