From: gmungoc Date: Thu, 30 Nov 2017 15:20:19 +0000 (+0000) Subject: JAL-2850 improved use of htsjdk api to read contig info X-Git-Tag: Release_2_11_0~96 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=946a0619168de3fbe9858401d13bcd1981d0a0df;p=jalview.git JAL-2850 improved use of htsjdk api to read contig info --- diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index 2a3ddef..a85802a 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -32,11 +32,12 @@ import java.util.TreeMap; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; +import htsjdk.samtools.SAMException; +import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.CloseableIterator; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.vcf.VCFContigHeaderLine; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineCount; @@ -147,6 +148,11 @@ public class VCFLoader private VCFHeader header; /* + * a Dictionary of contigs (if present) referenced in the VCF file + */ + private SAMSequenceDictionary dictionary; + + /* * the position (0...) of field in each block of * CSQ (consequence) data (if declared in the VCF INFO header for CSQ) * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html @@ -234,6 +240,14 @@ public class VCFLoader header = reader.getFileHeader(); + try + { + dictionary = header.getSequenceDictionary(); + } catch (SAMException e) + { + // ignore - thrown if any contig line lacks length info + } + sourceId = filePath; saveMetadata(sourceId); @@ -294,6 +308,8 @@ public class VCFLoader // ignore } } + header = null; + dictionary = null; } } @@ -543,17 +559,27 @@ public class VCFLoader /* * simplest case: sequence has id and length matching a VCF contig */ + VCFMap vcfMap = null; + if (dictionary != null) + { + vcfMap = getContigMap(seq); + } + if (vcfMap != null) + { + return vcfMap; + } + + /* + * otherwise, map to VCF from chromosomal coordinates + * of the sequence (if known) + */ GeneLociI seqCoords = seq.getGeneLoci(); if (seqCoords == null) { - VCFMap map = getContigMap(seq); - if (map == null) - { - Cache.log.warn(String.format( - "Can't query VCF for %s as chromosome coordinates not known", - seq.getName())); - } - return map; + Cache.log.warn(String.format( + "Can't query VCF for %s as chromosome coordinates not known", + seq.getName())); + return null; } String species = seqCoords.getSpeciesId(); @@ -619,8 +645,8 @@ public class VCFLoader /** * If the sequence id matches a contig declared in the VCF file, and the - * sequence matches the contig length, then returns a 1:1 map of the sequence to - * the contig, else returns null + * sequence length matches the contig length, then returns a 1:1 map of the + * sequence to the contig, else returns null * * @param seq * @return @@ -628,25 +654,17 @@ public class VCFLoader private VCFMap getContigMap(SequenceI seq) { String id = seq.getName(); - for (VCFContigHeaderLine contig : header.getContigLines()) + SAMSequenceRecord contig = dictionary.getSequence(id); + if (contig != null) { - if (contig.getID().equals(id)) + int len = seq.getLength(); + if (len == contig.getSequenceLength()) { - /* - * have to construct a SAMSequenceRecord to - * read the contig 'length' field! - */ - int len = seq.getLength(); - SAMSequenceRecord ssr = contig.getSAMSequenceRecord(); - if (len == ssr.getSequenceLength()) - { - MapList map = new MapList(new int[] { 1, len }, - new int[] - { 1, len }, 1, 1); - return new VCFMap(id, map); - } + MapList map = new MapList(new int[] { 1, len }, + new int[] + { 1, len }, 1, 1); + return new VCFMap(id, map); } - } return null; }