JAL-2835 add filters for VCF/VEP 'fields of interest'

author gmungoc <g.m.carstairs@dundee.ac.uk>

Wed, 15 Nov 2017 10:34:28 +0000 (10:34 +0000)

committer gmungoc <g.m.carstairs@dundee.ac.uk>

Wed, 15 Nov 2017 10:34:28 +0000 (10:34 +0000)
author gmungoc <g.m.carstairs@dundee.ac.uk>
Wed, 15 Nov 2017 10:34:28 +0000 (10:34 +0000)
committer gmungoc <g.m.carstairs@dundee.ac.uk>
Wed, 15 Nov 2017 10:34:28 +0000 (10:34 +0000)
diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java

index e7c5a34..960615a 100644 (file)
--- a/src/jalview/io/vcf/VCFLoader.java
+++ b/src/jalview/io/vcf/VCFLoader.java
@@ -12,6 +12,7 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine;
  import jalview.analysis.AlignmentUtils;
  import jalview.analysis.Dna;
  import jalview.api.AlignViewControllerGuiI;
+import jalview.bin.Cache;
  import jalview.datamodel.AlignmentI;
  import jalview.datamodel.DBRefEntry;
  import jalview.datamodel.GeneLociI;
@@ -35,6 +36,7 @@ import java.util.HashMap;
  import java.util.List;
  import java.util.Map;
  import java.util.Map.Entry;
+import java.util.regex.Pattern;
  
  /**
   * A class to read VCF data (using the htsjdk) and add variants as sequence
@@ -45,6 +47,18 @@ import java.util.Map.Entry;
  public class VCFLoader
  {
    /*
+   * Lookup keys, and default values, for Preference entries that describe
+   * patterns for VCF and VEP fields to capture 
+   */
+  private static final String VEP_FIELDS_PREF = "VEP_FIELDS";
+
+  private static final String VCF_FIELDS_PREF = "VCF_FIELDS";
+
+  private static final String DEFAULT_VCF_FIELDS = "AF,AC*";
+
+  private static final String DEFAULT_VEP_FIELDS = "Allele,Consequence,IMPACT,SWISSPROT,SIFT,PolyPhen,CLIN_SIG";
+
+  /*
     * keys to fields of VEP CSQ consequence data
     * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html
     */
@@ -54,11 +68,6 @@ public class VCFLoader
    private static final String FEATURE_KEY = "Feature"; // Ensembl stable id
  
    /*
-   * what comes before column headings in CSQ Description field
-   */
-  private static final String FORMAT = "Format: ";
-
-  /*
     * default VCF INFO key for VEP consequence data
     * NB this can be overridden running VEP with --vcf_info_field
     * - we don't handle this case (require identifier to be CSQ)
@@ -124,9 +133,18 @@ public class VCFLoader
     */
    private String sourceId;
  
+  /*
+   * The INFO IDs of data that is both present in the VCF file, and
+   * also matched by any filters for data of interest
+   */
    List<String> vcfFieldsOfInterest;
  
-  List<String> vepFieldsOfInterest;
+  /*
+   * The field offsets and identifiers for VEP (CSQ) data that is both present
+   * in the VCF file, and also matched by any filters for data of interest
+   * for example 0 -> Allele, 1 -> Consequence, ..., 36 -> SIFT, ...
+   */
+  Map<Integer, String> vepFieldsOfInterest;
  
    /**
     * Constructor given an alignment context
@@ -195,7 +213,7 @@ public class VCFLoader
        /*
         * get offset of CSQ ALLELE_NUM and Feature if declared
         */
-      locateCsqFields();
+      parseCsqHeader();
  
        VCFHeaderLine ref = header
                .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
@@ -259,6 +277,8 @@ public class VCFLoader
     */
    void saveMetadata(String theSourceId)
    {
+    List<Pattern> vcfFieldPatterns = getFieldMatchers(VCF_FIELDS_PREF,
+            DEFAULT_VCF_FIELDS);
      vcfFieldsOfInterest = new ArrayList<>();
  
      FeatureSource metadata = new FeatureSource(theSourceId);
@@ -290,7 +310,7 @@ public class VCFLoader
        metadata.setAttributeName(attributeId, desc);
        metadata.setAttributeType(attributeId, attType);
  
-      if (isVcfFieldWanted(attributeId))
+      if (isFieldWanted(attributeId, vcfFieldPatterns))
        {
          vcfFieldsOfInterest.add(attributeId);
        }
@@ -300,39 +320,40 @@ public class VCFLoader
    }
  
    /**
-   * Answers true if the VCF id is one we wish to capture in Jalview, else false
+   * Answers true if the field id is matched by any of the filter patterns, else
+   * false. Matching is against regular expression patterns, and is not
+   * case-sensitive.
     * 
     * @param id
+   * @param filters
     * @return
     */
-  private boolean isVcfFieldWanted(String id)
+  private boolean isFieldWanted(String id, List<Pattern> filters)
    {
-    // TODO option to match patterns in a Preferences entry?
-    return true;
-  }
-
-  /**
-   * Answers true if the VEP (CSQ) id is one we wish to capture in Jalview, else
-   * false
-   * 
-   * @param id
-   * @return
-   */
-  private boolean isVepFieldWanted(String id)
-  {
-    // TODO option to match patterns in a Preferences entry?
-    return true;
+    for (Pattern p : filters)
+    {
+      if (p.matcher(id.toUpperCase()).matches())
+      {
+        return true;
+      }
+    }
+    return false;
    }
  
    /**
-   * Records the position of selected fields defined in the CSQ INFO header (if
-   * there is one). CSQ fields are declared in the CSQ INFO Description e.g.
+   * Records 'wanted' fields defined in the CSQ INFO header (if there is one).
+   * Also records the position of selected fields (Allele, ALLELE_NUM, Feature)
+   * required for processing.
+   * <p>
+   * CSQ fields are declared in the CSQ INFO Description e.g.
     * <p>
     * Description="Consequence ...from ... VEP. Format: Allele|Consequence|...
     */
-  protected void locateCsqFields()
+  protected void parseCsqHeader()
    {
-    vepFieldsOfInterest = new ArrayList<>();
+    List<Pattern> vepFieldFilters = getFieldMatchers(VEP_FIELDS_PREF,
+            DEFAULT_VEP_FIELDS);
+    vepFieldsOfInterest = new HashMap<>();
  
      VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ_FIELD);
      if (csqInfo == null)
@@ -340,15 +361,13 @@ public class VCFLoader
        return;
      }
  
+    /*
+     * parse out the pipe-separated list of CSQ fields; we assume here that
+     * these form the last part of the description, and contain no spaces
+     */
      String desc = csqInfo.getDescription();
-    int formatPos = desc.indexOf(FORMAT);
-    if (formatPos == -1)
-    {
-      System.err.println("Parse error, failed to find " + FORMAT
-              + " in " + desc);
-      return;
-    }
-    desc = desc.substring(formatPos + FORMAT.length());
+    int spacePos = desc.lastIndexOf(" ");
+    desc = desc.substring(spacePos + 1);
  
      if (desc != null)
      {
@@ -369,9 +388,9 @@ public class VCFLoader
            csqFeatureFieldIndex = index;
          }
  
-        if (isVepFieldWanted(field))
+        if (isFieldWanted(field, vepFieldFilters))
          {
-          vepFieldsOfInterest.add(field);
+          vepFieldsOfInterest.put(index, field);
          }
  
          index++;
@@ -380,6 +399,33 @@ public class VCFLoader
    }
  
    /**
+   * Reads the Preference value for the given key, with default specified if no
+   * preference set. The value is interpreted as a comma-separated list of
+   * regular expressions, and converted into a list of compiled patterns ready
+   * for matching. Patterns are forced to upper-case for non-case-sensitive
+   * matching.
+   * <p>
+   * This supports user-defined filters for fields of interest to capture while
+   * processing data. For example, VCF_FIELDS = AF,AC* would mean that VCF INFO
+   * fields with an ID of AF, or starting with AC, would be matched.
+   * 
+   * @param key
+   * @param def
+   * @return
+   */
+  private List<Pattern> getFieldMatchers(String key, String def)
+  {
+    String pref = Cache.getDefault(key, def);
+    List<Pattern> patterns = new ArrayList<>();
+    String[] tokens = pref.split(",");
+    for (String token : tokens)
+    {
+      patterns.add(Pattern.compile(token.toUpperCase()));
+    }
+    return patterns;
+  }
+
+  /**
     * Transfers VCF features to sequences to which this sequence has a mapping.
     * If the mapping is 3:1, computes peptide variants from nucleotide variants.
     * 
@@ -858,10 +904,11 @@ public class VCFLoader
        }
      }
  
-    StringBuilder sb = new StringBuilder(128);
-    boolean found = false;
-
-    // todo check against vepFieldsOfInterest as well somewhere
+    /*
+     * inspect CSQ consequences; where possible restrict to the consequence
+     * associated with the current transcript (Feature)
+     */
+    Map<String, String> csqValues = new HashMap<>();
  
      for (String consequence : consequences)
      {
@@ -870,18 +917,29 @@ public class VCFLoader
        if (includeConsequence(csqFields, matchFeature, variant,
                altAlelleIndex))
        {
-        if (found)
+        /*
+         * inspect individual fields of this consequence, copying non-null
+         * values which are 'fields of interest'
+         */
+        int i = 0;
+        for (String field : csqFields)
          {
-          sb.append(COMMA);
+          if (field != null && field.length() > 0)
+          {
+            String id = vepFieldsOfInterest.get(i);
+            if (id != null)
+            {
+              csqValues.put(id, field);
+            }
+          }
+          i++;
          }
-        found = true;
-        sb.append(consequence);
        }
      }
  
-    if (found)
+    if (!csqValues.isEmpty())
      {
-      sf.setValue(CSQ_FIELD, sb.toString());
+      sf.setValue(CSQ_FIELD, csqValues);
      }
    }
author	gmungoc <g.m.carstairs@dundee.ac.uk>
	Wed, 15 Nov 2017 10:34:28 +0000 (10:34 +0000)
committer	gmungoc <g.m.carstairs@dundee.ac.uk>
	Wed, 15 Nov 2017 10:34:28 +0000 (10:34 +0000)