JAL-1793 wrapper for plain or indexed VCF + simple tests
authorgmungoc <g.m.carstairs@dundee.ac.uk>
Tue, 19 Sep 2017 14:34:39 +0000 (15:34 +0100)
committergmungoc <g.m.carstairs@dundee.ac.uk>
Tue, 19 Sep 2017 14:34:39 +0000 (15:34 +0100)
src/jalview/ext/htsjdk/VCFReader.java [new file with mode: 0644]
test/jalview/ext/htsjdk/TabixFeatureReaderTest.java [deleted file]
test/jalview/ext/htsjdk/VCFReaderTest.java [moved from test/jalview/ext/htsjdk/VCFFileReaderTest.java with 65% similarity]

diff --git a/src/jalview/ext/htsjdk/VCFReader.java b/src/jalview/ext/htsjdk/VCFReader.java
new file mode 100644 (file)
index 0000000..8dfd7e2
--- /dev/null
@@ -0,0 +1,90 @@
+package jalview.ext.htsjdk;
+
+import htsjdk.samtools.util.CloseableIterator;
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.vcf.VCFFileReader;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+
+/**
+ * A thin wrapper for htsjdk classes to read either plain, or compressed, or
+ * compressed and indexed VCF files
+ */
+public class VCFReader implements Closeable, Iterable<VariantContext>
+{
+  private static final String GZ = "gz";
+
+  private static final String TBI_EXTENSION = ".tbi";
+
+  private boolean indexed;
+
+  private VCFFileReader reader;
+
+  /**
+   * Constructor given a raw or compressed VCF file or a (tabix) index file
+   * <p>
+   * For now, file type is inferred from its suffix: .gz or .bgz for compressed
+   * data, .tbi for an index file, anything else is assumed to be plain text
+   * VCF.
+   * 
+   * @param f
+   * @throws IOException
+   */
+  public VCFReader(String filePath) throws IOException
+  {
+    if (filePath.endsWith(GZ))
+    {
+      if (new File(filePath + TBI_EXTENSION).exists())
+      {
+        indexed = true;
+      }
+    }
+    else if (filePath.endsWith(TBI_EXTENSION))
+    {
+      indexed = true;
+      filePath = filePath.substring(0, filePath.length() - 4);
+    }
+
+    reader = new VCFFileReader(new File(filePath), indexed);
+  }
+
+  @Override
+  public void close() throws IOException
+  {
+    if (reader != null)
+    {
+      reader.close();
+    }
+  }
+
+  /**
+   * Returns an iterator over VCF variants in the file. The client should call
+   * close() on the iterator when finished with it.
+   */
+  @Override
+  public CloseableIterator<VariantContext> iterator()
+  {
+    return reader == null ? null : reader.iterator();
+  }
+
+  /**
+   * Queries for records overlapping the region specified. Note that this method
+   * requires a VCF file with an associated index. If no index exists a
+   * TribbleException will be thrown.
+   * 
+   * @param chrom
+   *          the chromosome to query
+   * @param start
+   *          query interval start
+   * @param end
+   *          query interval end
+   * @return
+   */
+  public CloseableIterator<VariantContext> query(final String chrom,
+          final int start, final int end)
+  {
+    return reader == null ? null : reader.query(chrom, start, end);
+  }
+}
diff --git a/test/jalview/ext/htsjdk/TabixFeatureReaderTest.java b/test/jalview/ext/htsjdk/TabixFeatureReaderTest.java
deleted file mode 100644 (file)
index 48a932e..0000000
+++ /dev/null
@@ -1,51 +0,0 @@
-package jalview.ext.htsjdk;
-
-import htsjdk.tribble.CloseableTribbleIterator;
-import htsjdk.tribble.TabixFeatureReader;
-import htsjdk.variant.variantcontext.VariantContext;
-import htsjdk.variant.vcf.VCFCodec;
-
-import java.io.IOException;
-
-import org.testng.annotations.Test;
-
-public class TabixFeatureReaderTest
-{
-  // gnomAD exome variant dataset
-  private static final String VCF_PATH = "/Volumes/gjb/smacgowan/NOBACK/resources/gnomad/gnomad.exomes.r2.0.1.sites.vcf.gz";
-
-  // "https://storage.cloud.google.com/gnomad-public/release/2.0.1/vcf/exomes/gnomad.exomes.r2.0.1.sites.vcf.gz";
-
-  /**
-   * A 'test' that demonstrates querying an indexed VCF file for features in a
-   * specified interval
-   * 
-   * @throws IOException
-   */
-  @Test
-  public void testQuery() throws IOException
-  {
-    /*
-     * if not specified, assumes index file is filename.tbi
-     */
-    TabixFeatureReader<VariantContext, VCFCodec> reader = new TabixFeatureReader<>(
-            VCF_PATH, VCF_PATH, new VCFCodec());
-
-    /*
-     * gene NMT1 (human) is on chromosome 17
-     * GCHR38 (Ensembl): 45051610-45109016
-     * GCHR37 (gnoMAD): 43128978-43186384
-     * CDS begins at offset 9720, first CDS variant at offset 9724
-     */
-    CloseableTribbleIterator<VariantContext> features = reader.query("17",
-            43128978 + 9724, 43128978 + 9734); // first 11 CDS positions
-    while (features.hasNext())
-    {
-      System.out.println(features.next().toString());
-    }
-
-    features.close();
-    reader.close();
-  }
-
-}
similarity index 65%
rename from test/jalview/ext/htsjdk/VCFFileReaderTest.java
rename to test/jalview/ext/htsjdk/VCFReaderTest.java
index 50147e7..42c655d 100644 (file)
@@ -6,7 +6,6 @@ import static org.testng.Assert.assertTrue;
 import htsjdk.samtools.util.CloseableIterator;
 import htsjdk.variant.variantcontext.Allele;
 import htsjdk.variant.variantcontext.VariantContext;
-import htsjdk.variant.vcf.VCFFileReader;
 
 import java.io.File;
 import java.io.IOException;
@@ -15,7 +14,7 @@ import java.util.List;
 
 import org.testng.annotations.Test;
 
-public class VCFFileReaderTest
+public class VCFReaderTest
 {
   private static final String[] VCF = new String[] {
       "##fileformat=VCFv4.2",
@@ -24,16 +23,19 @@ public class VCFFileReaderTest
       "20\t7\t.\tG\tGA\t.\tPASS\tDP=100", // insertion G/GA
       "18\t2\t.\tACG\tA\t.\tPASS\tDP=100" }; // deletion ACG/A
 
+  // gnomAD exome variant dataset
+  private static final String VCF_PATH = "/Volumes/gjb/smacgowan/NOBACK/resources/gnomad/gnomad.exomes.r2.0.1.sites.vcf.gz";
+
   /**
    * A test to exercise some basic functionality of the htsjdk VCF reader
    * 
    * @throws IOException
    */
   @Test(groups = "Functional")
-  public void testReadVcf() throws IOException
+  public void testReadVcf_plain() throws IOException
   {
     File f = writeVcfFile();
-    VCFFileReader reader = new VCFFileReader(f, false);
+    VCFReader reader = new VCFReader(f.getAbsolutePath());
     CloseableIterator<VariantContext> variants = reader.iterator();
 
     /*
@@ -103,4 +105,41 @@ public class VCFFileReaderTest
     pw.close();
     return f;
   }
+
+  // "https://storage.cloud.google.com/gnomad-public/release/2.0.1/vcf/exomes/gnomad.exomes.r2.0.1.sites.vcf.gz";
+  
+  /**
+   * A 'test' that demonstrates querying an indexed VCF file for features in a
+   * specified interval
+   * 
+   * @throws IOException
+   */
+  @Test
+  public void testQuery_indexed() throws IOException
+  {
+    /*
+     * if not specified, assumes index file is filename.tbi
+     */
+    VCFReader reader = new VCFReader(VCF_PATH);
+  
+    /*
+     * gene NMT1 (human) is on chromosome 17
+     * GCHR38 (Ensembl): 45051610-45109016
+     * GCHR37 (gnoMAD): 43128978-43186384
+     * CDS begins at offset 9720, first CDS variant at offset 9724
+     */
+    CloseableIterator<VariantContext> features = reader.query("17",
+            43128978 + 9724, 43128978 + 9734); // first 11 CDS positions
+
+    assertEquals(features.next().getStart(), 43138702);
+    assertEquals(features.next().getStart(), 43138704);
+    assertEquals(features.next().getStart(), 43138707);
+    assertEquals(features.next().getStart(), 43138708);
+    assertEquals(features.next().getStart(), 43138710);
+    assertEquals(features.next().getStart(), 43138711);
+    assertFalse(features.hasNext());
+
+    features.close();
+    reader.close();
+  }
 }