From 54393c08af2e0ee01bfa7204961dc0bc02d8b63a Mon Sep 17 00:00:00 2001 From: gmungoc Date: Wed, 15 Nov 2017 10:34:28 +0000 Subject: [PATCH] JAL-2835 add filters for VCF/VEP 'fields of interest' --- src/jalview/io/vcf/VCFLoader.java | 156 +++++++++++++++++++++++++------------ 1 file changed, 107 insertions(+), 49 deletions(-) diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index e7c5a34..960615a 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -12,6 +12,7 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine; import jalview.analysis.AlignmentUtils; import jalview.analysis.Dna; import jalview.api.AlignViewControllerGuiI; +import jalview.bin.Cache; import jalview.datamodel.AlignmentI; import jalview.datamodel.DBRefEntry; import jalview.datamodel.GeneLociI; @@ -35,6 +36,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.regex.Pattern; /** * A class to read VCF data (using the htsjdk) and add variants as sequence @@ -45,6 +47,18 @@ import java.util.Map.Entry; public class VCFLoader { /* + * Lookup keys, and default values, for Preference entries that describe + * patterns for VCF and VEP fields to capture + */ + private static final String VEP_FIELDS_PREF = "VEP_FIELDS"; + + private static final String VCF_FIELDS_PREF = "VCF_FIELDS"; + + private static final String DEFAULT_VCF_FIELDS = "AF,AC*"; + + private static final String DEFAULT_VEP_FIELDS = "Allele,Consequence,IMPACT,SWISSPROT,SIFT,PolyPhen,CLIN_SIG"; + + /* * keys to fields of VEP CSQ consequence data * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html */ @@ -54,11 +68,6 @@ public class VCFLoader private static final String FEATURE_KEY = "Feature"; // Ensembl stable id /* - * what comes before column headings in CSQ Description field - */ - private static final String FORMAT = "Format: "; - - /* * default VCF INFO key for VEP consequence data * NB this can be overridden running VEP with --vcf_info_field * - we don't handle this case (require identifier to be CSQ) @@ -124,9 +133,18 @@ public class VCFLoader */ private String sourceId; + /* + * The INFO IDs of data that is both present in the VCF file, and + * also matched by any filters for data of interest + */ List vcfFieldsOfInterest; - List vepFieldsOfInterest; + /* + * The field offsets and identifiers for VEP (CSQ) data that is both present + * in the VCF file, and also matched by any filters for data of interest + * for example 0 -> Allele, 1 -> Consequence, ..., 36 -> SIFT, ... + */ + Map vepFieldsOfInterest; /** * Constructor given an alignment context @@ -195,7 +213,7 @@ public class VCFLoader /* * get offset of CSQ ALLELE_NUM and Feature if declared */ - locateCsqFields(); + parseCsqHeader(); VCFHeaderLine ref = header .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); @@ -259,6 +277,8 @@ public class VCFLoader */ void saveMetadata(String theSourceId) { + List vcfFieldPatterns = getFieldMatchers(VCF_FIELDS_PREF, + DEFAULT_VCF_FIELDS); vcfFieldsOfInterest = new ArrayList<>(); FeatureSource metadata = new FeatureSource(theSourceId); @@ -290,7 +310,7 @@ public class VCFLoader metadata.setAttributeName(attributeId, desc); metadata.setAttributeType(attributeId, attType); - if (isVcfFieldWanted(attributeId)) + if (isFieldWanted(attributeId, vcfFieldPatterns)) { vcfFieldsOfInterest.add(attributeId); } @@ -300,39 +320,40 @@ public class VCFLoader } /** - * Answers true if the VCF id is one we wish to capture in Jalview, else false + * Answers true if the field id is matched by any of the filter patterns, else + * false. Matching is against regular expression patterns, and is not + * case-sensitive. * * @param id + * @param filters * @return */ - private boolean isVcfFieldWanted(String id) + private boolean isFieldWanted(String id, List filters) { - // TODO option to match patterns in a Preferences entry? - return true; - } - - /** - * Answers true if the VEP (CSQ) id is one we wish to capture in Jalview, else - * false - * - * @param id - * @return - */ - private boolean isVepFieldWanted(String id) - { - // TODO option to match patterns in a Preferences entry? - return true; + for (Pattern p : filters) + { + if (p.matcher(id.toUpperCase()).matches()) + { + return true; + } + } + return false; } /** - * Records the position of selected fields defined in the CSQ INFO header (if - * there is one). CSQ fields are declared in the CSQ INFO Description e.g. + * Records 'wanted' fields defined in the CSQ INFO header (if there is one). + * Also records the position of selected fields (Allele, ALLELE_NUM, Feature) + * required for processing. + *

+ * CSQ fields are declared in the CSQ INFO Description e.g. *

* Description="Consequence ...from ... VEP. Format: Allele|Consequence|... */ - protected void locateCsqFields() + protected void parseCsqHeader() { - vepFieldsOfInterest = new ArrayList<>(); + List vepFieldFilters = getFieldMatchers(VEP_FIELDS_PREF, + DEFAULT_VEP_FIELDS); + vepFieldsOfInterest = new HashMap<>(); VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ_FIELD); if (csqInfo == null) @@ -340,15 +361,13 @@ public class VCFLoader return; } + /* + * parse out the pipe-separated list of CSQ fields; we assume here that + * these form the last part of the description, and contain no spaces + */ String desc = csqInfo.getDescription(); - int formatPos = desc.indexOf(FORMAT); - if (formatPos == -1) - { - System.err.println("Parse error, failed to find " + FORMAT - + " in " + desc); - return; - } - desc = desc.substring(formatPos + FORMAT.length()); + int spacePos = desc.lastIndexOf(" "); + desc = desc.substring(spacePos + 1); if (desc != null) { @@ -369,9 +388,9 @@ public class VCFLoader csqFeatureFieldIndex = index; } - if (isVepFieldWanted(field)) + if (isFieldWanted(field, vepFieldFilters)) { - vepFieldsOfInterest.add(field); + vepFieldsOfInterest.put(index, field); } index++; @@ -380,6 +399,33 @@ public class VCFLoader } /** + * Reads the Preference value for the given key, with default specified if no + * preference set. The value is interpreted as a comma-separated list of + * regular expressions, and converted into a list of compiled patterns ready + * for matching. Patterns are forced to upper-case for non-case-sensitive + * matching. + *

+ * This supports user-defined filters for fields of interest to capture while + * processing data. For example, VCF_FIELDS = AF,AC* would mean that VCF INFO + * fields with an ID of AF, or starting with AC, would be matched. + * + * @param key + * @param def + * @return + */ + private List getFieldMatchers(String key, String def) + { + String pref = Cache.getDefault(key, def); + List patterns = new ArrayList<>(); + String[] tokens = pref.split(","); + for (String token : tokens) + { + patterns.add(Pattern.compile(token.toUpperCase())); + } + return patterns; + } + + /** * Transfers VCF features to sequences to which this sequence has a mapping. * If the mapping is 3:1, computes peptide variants from nucleotide variants. * @@ -858,10 +904,11 @@ public class VCFLoader } } - StringBuilder sb = new StringBuilder(128); - boolean found = false; - - // todo check against vepFieldsOfInterest as well somewhere + /* + * inspect CSQ consequences; where possible restrict to the consequence + * associated with the current transcript (Feature) + */ + Map csqValues = new HashMap<>(); for (String consequence : consequences) { @@ -870,18 +917,29 @@ public class VCFLoader if (includeConsequence(csqFields, matchFeature, variant, altAlelleIndex)) { - if (found) + /* + * inspect individual fields of this consequence, copying non-null + * values which are 'fields of interest' + */ + int i = 0; + for (String field : csqFields) { - sb.append(COMMA); + if (field != null && field.length() > 0) + { + String id = vepFieldsOfInterest.get(i); + if (id != null) + { + csqValues.put(id, field); + } + } + i++; } - found = true; - sb.append(consequence); } } - if (found) + if (!csqValues.isEmpty()) { - sf.setValue(CSQ_FIELD, sb.toString()); + sf.setValue(CSQ_FIELD, csqValues); } } -- 1.7.10.2