From f943e5b7a7a5ce2b819495f83dbad28028a9a956 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Tue, 19 Sep 2017 15:34:39 +0100 Subject: [PATCH] JAL-1793 wrapper for plain or indexed VCF + simple tests --- src/jalview/ext/htsjdk/VCFReader.java | 90 ++++++++++++++++++++ .../jalview/ext/htsjdk/TabixFeatureReaderTest.java | 51 ----------- .../{VCFFileReaderTest.java => VCFReaderTest.java} | 47 +++++++++- 3 files changed, 133 insertions(+), 55 deletions(-) create mode 100644 src/jalview/ext/htsjdk/VCFReader.java delete mode 100644 test/jalview/ext/htsjdk/TabixFeatureReaderTest.java rename test/jalview/ext/htsjdk/{VCFFileReaderTest.java => VCFReaderTest.java} (65%) diff --git a/src/jalview/ext/htsjdk/VCFReader.java b/src/jalview/ext/htsjdk/VCFReader.java new file mode 100644 index 0000000..8dfd7e2 --- /dev/null +++ b/src/jalview/ext/htsjdk/VCFReader.java @@ -0,0 +1,90 @@ +package jalview.ext.htsjdk; + +import htsjdk.samtools.util.CloseableIterator; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFFileReader; + +import java.io.Closeable; +import java.io.File; +import java.io.IOException; + +/** + * A thin wrapper for htsjdk classes to read either plain, or compressed, or + * compressed and indexed VCF files + */ +public class VCFReader implements Closeable, Iterable +{ + private static final String GZ = "gz"; + + private static final String TBI_EXTENSION = ".tbi"; + + private boolean indexed; + + private VCFFileReader reader; + + /** + * Constructor given a raw or compressed VCF file or a (tabix) index file + *

+ * For now, file type is inferred from its suffix: .gz or .bgz for compressed + * data, .tbi for an index file, anything else is assumed to be plain text + * VCF. + * + * @param f + * @throws IOException + */ + public VCFReader(String filePath) throws IOException + { + if (filePath.endsWith(GZ)) + { + if (new File(filePath + TBI_EXTENSION).exists()) + { + indexed = true; + } + } + else if (filePath.endsWith(TBI_EXTENSION)) + { + indexed = true; + filePath = filePath.substring(0, filePath.length() - 4); + } + + reader = new VCFFileReader(new File(filePath), indexed); + } + + @Override + public void close() throws IOException + { + if (reader != null) + { + reader.close(); + } + } + + /** + * Returns an iterator over VCF variants in the file. The client should call + * close() on the iterator when finished with it. + */ + @Override + public CloseableIterator iterator() + { + return reader == null ? null : reader.iterator(); + } + + /** + * Queries for records overlapping the region specified. Note that this method + * requires a VCF file with an associated index. If no index exists a + * TribbleException will be thrown. + * + * @param chrom + * the chromosome to query + * @param start + * query interval start + * @param end + * query interval end + * @return + */ + public CloseableIterator query(final String chrom, + final int start, final int end) + { + return reader == null ? null : reader.query(chrom, start, end); + } +} diff --git a/test/jalview/ext/htsjdk/TabixFeatureReaderTest.java b/test/jalview/ext/htsjdk/TabixFeatureReaderTest.java deleted file mode 100644 index 48a932e..0000000 --- a/test/jalview/ext/htsjdk/TabixFeatureReaderTest.java +++ /dev/null @@ -1,51 +0,0 @@ -package jalview.ext.htsjdk; - -import htsjdk.tribble.CloseableTribbleIterator; -import htsjdk.tribble.TabixFeatureReader; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.vcf.VCFCodec; - -import java.io.IOException; - -import org.testng.annotations.Test; - -public class TabixFeatureReaderTest -{ - // gnomAD exome variant dataset - private static final String VCF_PATH = "/Volumes/gjb/smacgowan/NOBACK/resources/gnomad/gnomad.exomes.r2.0.1.sites.vcf.gz"; - - // "https://storage.cloud.google.com/gnomad-public/release/2.0.1/vcf/exomes/gnomad.exomes.r2.0.1.sites.vcf.gz"; - - /** - * A 'test' that demonstrates querying an indexed VCF file for features in a - * specified interval - * - * @throws IOException - */ - @Test - public void testQuery() throws IOException - { - /* - * if not specified, assumes index file is filename.tbi - */ - TabixFeatureReader reader = new TabixFeatureReader<>( - VCF_PATH, VCF_PATH, new VCFCodec()); - - /* - * gene NMT1 (human) is on chromosome 17 - * GCHR38 (Ensembl): 45051610-45109016 - * GCHR37 (gnoMAD): 43128978-43186384 - * CDS begins at offset 9720, first CDS variant at offset 9724 - */ - CloseableTribbleIterator features = reader.query("17", - 43128978 + 9724, 43128978 + 9734); // first 11 CDS positions - while (features.hasNext()) - { - System.out.println(features.next().toString()); - } - - features.close(); - reader.close(); - } - -} diff --git a/test/jalview/ext/htsjdk/VCFFileReaderTest.java b/test/jalview/ext/htsjdk/VCFReaderTest.java similarity index 65% rename from test/jalview/ext/htsjdk/VCFFileReaderTest.java rename to test/jalview/ext/htsjdk/VCFReaderTest.java index 50147e7..42c655d 100644 --- a/test/jalview/ext/htsjdk/VCFFileReaderTest.java +++ b/test/jalview/ext/htsjdk/VCFReaderTest.java @@ -6,7 +6,6 @@ import static org.testng.Assert.assertTrue; import htsjdk.samtools.util.CloseableIterator; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.vcf.VCFFileReader; import java.io.File; import java.io.IOException; @@ -15,7 +14,7 @@ import java.util.List; import org.testng.annotations.Test; -public class VCFFileReaderTest +public class VCFReaderTest { private static final String[] VCF = new String[] { "##fileformat=VCFv4.2", @@ -24,16 +23,19 @@ public class VCFFileReaderTest "20\t7\t.\tG\tGA\t.\tPASS\tDP=100", // insertion G/GA "18\t2\t.\tACG\tA\t.\tPASS\tDP=100" }; // deletion ACG/A + // gnomAD exome variant dataset + private static final String VCF_PATH = "/Volumes/gjb/smacgowan/NOBACK/resources/gnomad/gnomad.exomes.r2.0.1.sites.vcf.gz"; + /** * A test to exercise some basic functionality of the htsjdk VCF reader * * @throws IOException */ @Test(groups = "Functional") - public void testReadVcf() throws IOException + public void testReadVcf_plain() throws IOException { File f = writeVcfFile(); - VCFFileReader reader = new VCFFileReader(f, false); + VCFReader reader = new VCFReader(f.getAbsolutePath()); CloseableIterator variants = reader.iterator(); /* @@ -103,4 +105,41 @@ public class VCFFileReaderTest pw.close(); return f; } + + // "https://storage.cloud.google.com/gnomad-public/release/2.0.1/vcf/exomes/gnomad.exomes.r2.0.1.sites.vcf.gz"; + + /** + * A 'test' that demonstrates querying an indexed VCF file for features in a + * specified interval + * + * @throws IOException + */ + @Test + public void testQuery_indexed() throws IOException + { + /* + * if not specified, assumes index file is filename.tbi + */ + VCFReader reader = new VCFReader(VCF_PATH); + + /* + * gene NMT1 (human) is on chromosome 17 + * GCHR38 (Ensembl): 45051610-45109016 + * GCHR37 (gnoMAD): 43128978-43186384 + * CDS begins at offset 9720, first CDS variant at offset 9724 + */ + CloseableIterator features = reader.query("17", + 43128978 + 9724, 43128978 + 9734); // first 11 CDS positions + + assertEquals(features.next().getStart(), 43138702); + assertEquals(features.next().getStart(), 43138704); + assertEquals(features.next().getStart(), 43138707); + assertEquals(features.next().getStart(), 43138708); + assertEquals(features.next().getStart(), 43138710); + assertEquals(features.next().getStart(), 43138711); + assertFalse(features.hasNext()); + + features.close(); + reader.close(); + } } -- 1.7.10.2