JAL-2738 filter VEP consequence data to that for current transcript

author gmungoc <g.m.carstairs@dundee.ac.uk>

Mon, 2 Oct 2017 14:37:01 +0000 (15:37 +0100)

committer gmungoc <g.m.carstairs@dundee.ac.uk>

Mon, 2 Oct 2017 14:37:01 +0000 (15:37 +0100)
author gmungoc <g.m.carstairs@dundee.ac.uk>
Mon, 2 Oct 2017 14:37:01 +0000 (15:37 +0100)
committer gmungoc <g.m.carstairs@dundee.ac.uk>
Mon, 2 Oct 2017 14:37:01 +0000 (15:37 +0100)
diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java

index 0acf49e..89b1d2a 100644 (file)
--- a/src/jalview/io/vcf/VCFLoader.java
+++ b/src/jalview/io/vcf/VCFLoader.java
@@ -6,6 +6,7 @@ import htsjdk.variant.variantcontext.VariantContext;
  import htsjdk.variant.vcf.VCFHeader;
  import htsjdk.variant.vcf.VCFHeaderLine;
  import htsjdk.variant.vcf.VCFHeaderLineCount;
+import htsjdk.variant.vcf.VCFInfoHeaderLine;
  
  import jalview.analysis.AlignmentUtils;
  import jalview.analysis.Dna;
@@ -39,18 +40,58 @@ import java.util.Map.Entry;
   */
  public class VCFLoader
  {
+  /*
+   * keys to fields of VEP CSQ consequence data
+   * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html
+   */
+  private static final String ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1...
+
+  private static final String FEATURE_KEY = "Feature"; // Ensembl stable id
+
+  /*
+   * default VCF INFO key for VEP consequence data
+   * NB this can be overridden running VEP with --vcf_info_field
+   * - we don't handle this case (require CSQ identifier)
+   */
+  private static final String CSQ = "CSQ";
+
+  /*
+   * separator for fields in consequence data
+   */
+  private static final String PIPE = "|";
+
+  private static final String PIPE_REGEX = "\\" + PIPE;
+
+  /*
+   * key for Allele Frequency output by VEP
+   * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
+   */
    private static final String ALLELE_FREQUENCY_KEY = "AF";
  
+  /*
+   * delimiter that separates multiple consequence data blocks
+   */
    private static final String COMMA = ",";
  
+  /*
+   * (temporary) flag that determines whether Jalview adds one feature
+   * per VCF record, or one per allele (preferred)
+   */
    private static final boolean FEATURE_PER_ALLELE = true;
  
+  /*
+   * the feature group assigned to a VCF variant in Jalview
+   */
    private static final String FEATURE_GROUP_VCF = "VCF";
  
+  /*
+   * internal delimiter used to build keys for assemblyMappings
+   * 
+   */
    private static final String EXCL = "!";
  
    /*
-   * the alignment we are associated VCF data with
+   * the alignment we are associating VCF data with
     */
    private AlignmentI al;
  
@@ -66,6 +107,15 @@ public class VCFLoader
     */
    private VCFHeader header;
  
+  /*
+   * the position (0...) of the ALLELE_NUM field in each block of
+   * CSQ (consequence) data (if declared in the VCF INFO header for CSQ)
+   * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html
+   */
+  private int csqAlleleNumberFieldIndex = -1;
+
+  private int csqFeatureFieldIndex = -1;
+
    /**
     * Constructor given an alignment context
     * 
@@ -127,6 +177,12 @@ public class VCFLoader
        header = reader.getFileHeader();
        VCFHeaderLine ref = header
                .getOtherHeaderLine(VCFHeader.REFERENCE_KEY);
+
+      /*
+       * note offset of CSQ ALLELE_NUM field if it is declared
+       */
+      findAlleleNumberFieldIndex();
+
        // check if reference is wrt assembly19 (GRCh37)
        // todo may need to allow user to specify reference assembly?
        boolean isRefGrch37 = (ref != null && ref.getValue().contains(
@@ -183,6 +239,36 @@ public class VCFLoader
    }
  
    /**
+   * If the CSQ INFO header declares that ALLELE_NUM is included in the data,
+   * record its (pipe-delimited) offset in each (comma-delimited) consequence
+   * block; CSQ fields are declared in the CSQ INFO Description e.g.
+   * <p>
+   * Description="Consequence ...from ... VEP. Format: Allele|Consequence|...
+   */
+  protected void findAlleleNumberFieldIndex()
+  {
+    VCFInfoHeaderLine csqInfo = header.getInfoHeaderLine(CSQ);
+    String desc = csqInfo.getDescription();
+    if (desc != null)
+    {
+      String[] format = desc.split(PIPE_REGEX);
+      int index = 0;
+      for (String field : format)
+      {
+        if (ALLELE_NUM_KEY.equals(field))
+        {
+          csqAlleleNumberFieldIndex = index;
+        }
+        if (FEATURE_KEY.equals(field))
+        {
+          csqFeatureFieldIndex = index;
+        }
+        index++;
+      }
+    }
+  }
+
+  /**
     * Transfers VCF features to sequences to which this sequence has a mapping.
     * If the mapping is 1:3, computes peptide variants from nucleotide variants.
     * 
@@ -563,7 +649,7 @@ public class VCFLoader
  
      sf.setValue(Gff3Helper.ALLELES, alleles);
  
-    addAlleleProperties(variant, sf, altAlleleIndex);
+    addAlleleProperties(variant, seq, sf, altAlleleIndex);
  
      seq.addSequenceFeature(sf);
  
@@ -574,22 +660,35 @@ public class VCFLoader
     * Add any allele-specific VCF key-value data to the sequence feature
     * 
     * @param variant
+   * @param seq
     * @param sf
     * @param altAlelleIndex
     */
    protected void addAlleleProperties(VariantContext variant,
+ SequenceI seq,
            SequenceFeature sf, final int altAlelleIndex)
    {
      Map<String, Object> atts = variant.getAttributes();
  
-    /*
-     * process variant data, extracting values which are allele-specific 
-     * these may be per alternate allele (INFO[key].Number = 'A') 
-     * or per allele including reference (INFO[key].Number = 'R') 
-     */
      for (Entry<String, Object> att : atts.entrySet())
      {
        String key = att.getKey();
+
+      /*
+       * extract Consequence data (if present) that we are able to
+       * associated with the allele for this variant feature
+       */
+      if (CSQ.equals(key) && csqAlleleNumberFieldIndex > -1)
+      {
+        addConsequences(att.getValue(), seq, sf, altAlelleIndex + 1);
+        return;
+      }
+
+      /*
+       * we extract values for other data which are allele-specific; 
+       * these may be per alternate allele (INFO[key].Number = 'A') 
+       * or per allele including reference (INFO[key].Number = 'R') 
+       */
        VCFHeaderLineCount number = header.getInfoHeaderLine(key)
                .getCountType();
        int index = altAlelleIndex;
@@ -601,11 +700,7 @@ public class VCFLoader
           */
          index++;
        }
-      /*
-       * CSQ behaves as if Number=A but declares as Number=.
-       * so give it special treatment
-       */
-      else if (!"CSQ".equals(key) && number != VCFHeaderLineCount.A)
+      else if (number != VCFHeaderLineCount.A)
        {
          /*
           * don't save other values as not allele-related
@@ -625,6 +720,107 @@ public class VCFLoader
    }
  
    /**
+   * Inspects CSQ data blocks (consequences) and adds attributes on the sequence
+   * feature for the current allele (and transcript if applicable)
+   * <p>
+   * Allele matching: we require field ALLELE_NUM to match altAlleleIndex. If
+   * the CSQ data does not include ALLELE_NUM values then no data is added to
+   * the variant feature.
+   * <p>
+   * Transcript matching: if sequence name can be identified to at least one of
+   * the consequences' Feature values, then select only consequences that match
+   * the value (i.e. consequences for the current transcript sequence). If not,
+   * take all consequences (this is the case when adding features to the gene
+   * sequence).
+   * 
+   * @param value
+   * @param seq
+   * @param sf
+   * @param altAlelleIndex
+   *          (1=first alternative allele...)
+   */
+  protected void addConsequences(Object value, SequenceI seq,
+          SequenceFeature sf,
+          int altAlelleIndex)
+  {
+    if (!(value instanceof ArrayList<?>))
+    {
+      return;
+    }
+
+    List<String> consequences = (List<String>) value;
+
+    /*
+     * if CSQ data includes 'Feature', and any value matches the sequence name,
+     * then restrict consequence data to the matching value (transcript)
+     * i.e. just pick out consequences for the transcript the variant feature is on
+     */
+    String seqName = seq.getName()== null ? "" : seq.getName().toLowerCase();
+    boolean matchFeature = false;
+    String matchFeatureValue = null;
+    if (csqFeatureFieldIndex > -1)
+    {
+      for (String consequence : consequences)
+      {
+        String[] csqFields = consequence.split(PIPE_REGEX);
+        if (csqFields.length > csqFeatureFieldIndex)
+        {
+          String featureIdentifier = csqFields[csqFeatureFieldIndex];
+          if (featureIdentifier.length() > 4
+                  && seqName.indexOf(featureIdentifier.toLowerCase()) > -1)
+          {
+            matchFeature = true;
+            matchFeatureValue = featureIdentifier;
+          }
+        }
+      }
+    }
+
+    StringBuilder sb = new StringBuilder(128);
+    boolean found = false;
+
+    for (String consequence : consequences)
+    {
+      String[] csqFields = consequence.split(PIPE_REGEX);
+
+      /*
+       * check consequence is for the current transcript
+       */
+      if (matchFeature)
+      {
+        if (csqFields.length <= csqFeatureFieldIndex)
+        {
+          continue;
+        }
+        String featureIdentifier = csqFields[csqFeatureFieldIndex];
+        if (!featureIdentifier.equals(matchFeatureValue))
+        {
+          continue; // consequence is for a different transcript
+        }
+      }
+
+      if (csqFields.length > csqAlleleNumberFieldIndex)
+      {
+        String alleleNum = csqFields[csqAlleleNumberFieldIndex];
+        if (String.valueOf(altAlelleIndex).equals(alleleNum))
+        {
+          if (found)
+          {
+            sb.append(COMMA);
+          }
+          found = true;
+          sb.append(consequence);
+        }
+      }
+    }
+
+    if (found)
+    {
+      sf.setValue(CSQ, sb.toString());
+    }
+  }
+
+  /**
     * A convenience method to complement a dna base and return the string value
     * of its complement
     *
author	gmungoc <g.m.carstairs@dundee.ac.uk>
	Mon, 2 Oct 2017 14:37:01 +0000 (15:37 +0100)
committer	gmungoc <g.m.carstairs@dundee.ac.uk>
	Mon, 2 Oct 2017 14:37:01 +0000 (15:37 +0100)