import jalview.analysis.AlignmentUtils;
import jalview.analysis.Dna;
import jalview.api.AlignViewControllerGuiI;
+import jalview.bin.Cache;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.DBRefEntry;
import jalview.datamodel.GeneLociI;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
+import java.util.regex.Pattern;
/**
* A class to read VCF data (using the htsjdk) and add variants as sequence
public class VCFLoader
{
/*
+ * Lookup keys, and default values, for Preference entries that describe
+ * patterns for VCF and VEP fields to capture
+ */
+ private static final String VEP_FIELDS_PREF = "VEP_FIELDS";
+
+ private static final String VCF_FIELDS_PREF = "VCF_FIELDS";
+
+ private static final String DEFAULT_VCF_FIELDS = "AF,AC*";
+
+ private static final String DEFAULT_VEP_FIELDS = "Allele,Consequence,IMPACT,SWISSPROT,SIFT,PolyPhen,CLIN_SIG";
+
+ /*
* keys to fields of VEP CSQ consequence data
* see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html
*/
private static final String FEATURE_KEY = "Feature"; // Ensembl stable id
/*
- * what comes before column headings in CSQ Description field
- */
- private static final String FORMAT = "Format: ";
-
- /*
* default VCF INFO key for VEP consequence data
* NB this can be overridden running VEP with --vcf_info_field
* - we don't handle this case (require identifier to be CSQ)
*/
private String sourceId;
+ /*
+ * The INFO IDs of data that is both present in the VCF file, and
+ * also matched by any filters for data of interest
+ */
List<String> vcfFieldsOfInterest;
- List<String> vepFieldsOfInterest;
+ /*
+ * The field offsets and identifiers for VEP (CSQ) data that is both present
+ * in the VCF file, and also matched by any filters for data of interest
+ * for example 0 -> Allele, 1 -> Consequence, ..., 36 -> SIFT, ...
+ */
+ Map<Integer, String> vepFieldsOfInterest;
/**
* Constructor given an alignment context
/*
* get offset of CSQ ALLELE_NUM and Feature if declared
*/
- locateCsqFields();
+ parseCsqHeader();
VCFHeaderLine ref = header
.getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
*/
void saveMetadata(String theSourceId)
{
+ List<Pattern> vcfFieldPatterns = getFieldMatchers(VCF_FIELDS_PREF,
+ DEFAULT_VCF_FIELDS);
vcfFieldsOfInterest = new ArrayList<>();
FeatureSource metadata = new FeatureSource(theSourceId);
metadata.setAttributeName(attributeId, desc);
metadata.setAttributeType(attributeId, attType);
- if (isVcfFieldWanted(attributeId))
+ if (isFieldWanted(attributeId, vcfFieldPatterns))
{
vcfFieldsOfInterest.add(attributeId);
}
}
/**
- * Answers true if the VCF id is one we wish to capture in Jalview, else false
+ * Answers true if the field id is matched by any of the filter patterns, else
+ * false. Matching is against regular expression patterns, and is not
+ * case-sensitive.
*
* @param id
+ * @param filters
* @return
*/
- private boolean isVcfFieldWanted(String id)
+ private boolean isFieldWanted(String id, List<Pattern> filters)
{
- // TODO option to match patterns in a Preferences entry?
- return true;
- }
-
- /**
- * Answers true if the VEP (CSQ) id is one we wish to capture in Jalview, else
- * false
- *
- * @param id
- * @return
- */
- private boolean isVepFieldWanted(String id)
- {
- // TODO option to match patterns in a Preferences entry?
- return true;
+ for (Pattern p : filters)
+ {
+ if (p.matcher(id.toUpperCase()).matches())
+ {
+ return true;
+ }
+ }
+ return false;
}
/**
- * Records the position of selected fields defined in the CSQ INFO header (if
- * there is one). CSQ fields are declared in the CSQ INFO Description e.g.
+ * Records 'wanted' fields defined in the CSQ INFO header (if there is one).
+ * Also records the position of selected fields (Allele, ALLELE_NUM, Feature)
+ * required for processing.
+ * <p>
+ * CSQ fields are declared in the CSQ INFO Description e.g.
* <p>
* Description="Consequence ...from ... VEP. Format: Allele|Consequence|...
*/
- protected void locateCsqFields()
+ protected void parseCsqHeader()
{
- vepFieldsOfInterest = new ArrayList<>();
+ List<Pattern> vepFieldFilters = getFieldMatchers(VEP_FIELDS_PREF,
+ DEFAULT_VEP_FIELDS);
+ vepFieldsOfInterest = new HashMap<>();
VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ_FIELD);
if (csqInfo == null)
return;
}
+ /*
+ * parse out the pipe-separated list of CSQ fields; we assume here that
+ * these form the last part of the description, and contain no spaces
+ */
String desc = csqInfo.getDescription();
- int formatPos = desc.indexOf(FORMAT);
- if (formatPos == -1)
- {
- System.err.println("Parse error, failed to find " + FORMAT
- + " in " + desc);
- return;
- }
- desc = desc.substring(formatPos + FORMAT.length());
+ int spacePos = desc.lastIndexOf(" ");
+ desc = desc.substring(spacePos + 1);
if (desc != null)
{
csqFeatureFieldIndex = index;
}
- if (isVepFieldWanted(field))
+ if (isFieldWanted(field, vepFieldFilters))
{
- vepFieldsOfInterest.add(field);
+ vepFieldsOfInterest.put(index, field);
}
index++;
}
/**
+ * Reads the Preference value for the given key, with default specified if no
+ * preference set. The value is interpreted as a comma-separated list of
+ * regular expressions, and converted into a list of compiled patterns ready
+ * for matching. Patterns are forced to upper-case for non-case-sensitive
+ * matching.
+ * <p>
+ * This supports user-defined filters for fields of interest to capture while
+ * processing data. For example, VCF_FIELDS = AF,AC* would mean that VCF INFO
+ * fields with an ID of AF, or starting with AC, would be matched.
+ *
+ * @param key
+ * @param def
+ * @return
+ */
+ private List<Pattern> getFieldMatchers(String key, String def)
+ {
+ String pref = Cache.getDefault(key, def);
+ List<Pattern> patterns = new ArrayList<>();
+ String[] tokens = pref.split(",");
+ for (String token : tokens)
+ {
+ patterns.add(Pattern.compile(token.toUpperCase()));
+ }
+ return patterns;
+ }
+
+ /**
* Transfers VCF features to sequences to which this sequence has a mapping.
* If the mapping is 3:1, computes peptide variants from nucleotide variants.
*
}
}
- StringBuilder sb = new StringBuilder(128);
- boolean found = false;
-
- // todo check against vepFieldsOfInterest as well somewhere
+ /*
+ * inspect CSQ consequences; where possible restrict to the consequence
+ * associated with the current transcript (Feature)
+ */
+ Map<String, String> csqValues = new HashMap<>();
for (String consequence : consequences)
{
if (includeConsequence(csqFields, matchFeature, variant,
altAlelleIndex))
{
- if (found)
+ /*
+ * inspect individual fields of this consequence, copying non-null
+ * values which are 'fields of interest'
+ */
+ int i = 0;
+ for (String field : csqFields)
{
- sb.append(COMMA);
+ if (field != null && field.length() > 0)
+ {
+ String id = vepFieldsOfInterest.get(i);
+ if (id != null)
+ {
+ csqValues.put(id, field);
+ }
+ }
+ i++;
}
- found = true;
- sb.append(consequence);
}
}
- if (found)
+ if (!csqValues.isEmpty())
{
- sf.setValue(CSQ_FIELD, sb.toString());
+ sf.setValue(CSQ_FIELD, csqValues);
}
}