X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fext%2Fhtsjdk%2FVCFReader.java;h=2859e0f7ae297bfb6649393ace4c73769c7879bd;hb=3637bb169b4516a56818137e40cc6eff6cd4b969;hp=c5e09e041376e13ed1691bfe727b963f2b5c3d6b;hpb=9317cd655af4803461acc71581ecbdc0a6677069;p=jalview.git diff --git a/src/jalview/ext/htsjdk/VCFReader.java b/src/jalview/ext/htsjdk/VCFReader.java index c5e09e0..2859e0f 100644 --- a/src/jalview/ext/htsjdk/VCFReader.java +++ b/src/jalview/ext/htsjdk/VCFReader.java @@ -1,14 +1,16 @@ package jalview.ext.htsjdk; -import htsjdk.samtools.util.CloseableIterator; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.vcf.VCFFileReader; -import htsjdk.variant.vcf.VCFHeader; +import jalview.bin.Cache; import java.io.Closeable; import java.io.File; import java.io.IOException; +import htsjdk.samtools.util.CloseableIterator; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFFileReader; +import htsjdk.variant.vcf.VCFHeader; + /** * A thin wrapper for htsjdk classes to read either plain, or compressed, or * compressed and indexed VCF files @@ -19,36 +21,52 @@ public class VCFReader implements Closeable, Iterable private static final String TBI_EXTENSION = ".tbi"; + private static final String CSI_EXTENSION = ".csi"; + private boolean indexed; private VCFFileReader reader; /** - * Constructor given a raw or compressed VCF file or a (tabix) index file + * Constructor given a raw or compressed VCF file or a (csi or tabix) index file *

- * For now, file type is inferred from its suffix: .gz or .bgz for compressed - * data, .tbi for an index file, anything else is assumed to be plain text - * VCF. + * If the file path ends in ".tbi" or ".csi", or appending one of these + * extensions gives a valid file path, open as indexed, else as unindexed. * * @param f * @throws IOException */ public VCFReader(String filePath) throws IOException { - if (filePath.endsWith(GZ)) + indexed = false; + if (filePath.endsWith(TBI_EXTENSION) + || filePath.endsWith(CSI_EXTENSION)) { - if (new File(filePath + TBI_EXTENSION).exists()) - { - indexed = true; - } + indexed = true; + filePath = filePath.substring(0, filePath.length() - 4); } - else if (filePath.endsWith(TBI_EXTENSION)) + else if (new File(filePath + TBI_EXTENSION).exists()) + { + indexed = true; + } + else if (new File(filePath + CSI_EXTENSION).exists()) { indexed = true; - filePath = filePath.substring(0, filePath.length() - 4); } - reader = new VCFFileReader(new File(filePath), indexed); + /* + * we pass the name of the unindexed file to htsjdk, + * with a flag to assert whether it is indexed + */ + File file = new File(filePath); + if (file.exists()) + { + reader = new VCFFileReader(file, indexed); + } + else + { + Cache.log.error("File not found: " + filePath); + } } @Override @@ -72,9 +90,10 @@ public class VCFReader implements Closeable, Iterable /** * Queries for records overlapping the region specified. Note that this method - * requires a VCF file with an associated index. If no index exists a - * TribbleException will be thrown. Client code should call close() on the - * iterator when finished with it. + * is performant if the VCF file is indexed, and may be very slow if it is + * not. + *

+ * Client code should call close() on the iterator when finished with it. * * @param chrom * the chromosome to query @@ -87,7 +106,108 @@ public class VCFReader implements Closeable, Iterable public CloseableIterator query(final String chrom, final int start, final int end) { - return reader == null ? null : reader.query(chrom, start, end); + if (reader == null) + { + return null; + } + if (indexed) + { + return reader.query(chrom, start, end); + } + else + { + return queryUnindexed(chrom, start, end); + } + } + + /** + * Returns an iterator over variant records read from a flat file which + * overlap the specified chromosomal positions. Call close() on the iterator + * when finished with it! + * + * @param chrom + * @param start + * @param end + * @return + */ + protected CloseableIterator queryUnindexed( + final String chrom, final int start, final int end) + { + final CloseableIterator it = reader.iterator(); + + return new CloseableIterator() + { + boolean atEnd = false; + + // prime look-ahead buffer with next matching record + private VariantContext next = findNext(); + + private VariantContext findNext() + { + if (atEnd) + { + return null; + } + VariantContext variant = null; + while (it.hasNext()) + { + variant = it.next(); + int vstart = variant.getStart(); + + if (vstart > end) + { + atEnd = true; + close(); + return null; + } + + int vend = variant.getEnd(); + // todo what is the undeprecated way to get + // the chromosome for the variant? + if (chrom.equals(variant.getContig()) && (vstart <= end) + && (vend >= start)) + { + return variant; + } + } + return null; + } + + @Override + public boolean hasNext() + { + boolean hasNext = !atEnd && (next != null); + if (!hasNext) + { + close(); + } + return hasNext; + } + + @Override + public VariantContext next() + { + /* + * return the next match, and then re-prime + * it with the following one (if any) + */ + VariantContext temp = next; + next = findNext(); + return temp; + } + + @Override + public void remove() + { + // not implemented + } + + @Override + public void close() + { + it.close(); + } + }; } /** @@ -99,4 +219,15 @@ public class VCFReader implements Closeable, Iterable { return reader == null ? null : reader.getFileHeader(); } + + /** + * Answers true if we are processing a tab-indexed VCF file, false if it is a + * plain text (uncompressed) file. + * + * @return + */ + public boolean isIndex() + { + return indexed; + } }