From: gmungoc Date: Mon, 27 Nov 2017 15:21:33 +0000 (+0000) Subject: JAL-2850 VCFLoader refactored to load VCF to contig sequences X-Git-Tag: Release_2_11_0~100 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=7748647e978e341e3ecba1b54b4cea8598ff727b;p=jalview.git JAL-2850 VCFLoader refactored to load VCF to contig sequences --- diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index 155e773..2a3ddef 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -32,9 +32,11 @@ import java.util.TreeMap; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; +import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.CloseableIterator; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFContigHeaderLine; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineCount; @@ -49,6 +51,29 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine; */ public class VCFLoader { + /** + * A class to model the mapping from sequence to VCF coordinates. Cases include + * + */ + class VCFMap + { + final String chromosome; + + final MapList map; + + VCFMap(String chr, MapList m) + { + chromosome = chr; + map = m; + } + } + /* * Lookup keys, and default values, for Preference entries that describe * patterns for VCF and VEP fields to capture @@ -497,28 +522,155 @@ public class VCFLoader protected int loadSequenceVCF(SequenceI seq, VCFReader reader, String vcfAssembly) { - int count = 0; + VCFMap vcfMap = getVcfMap(seq, vcfAssembly); + if (vcfMap == null) + { + return 0; + } + + return addVcfVariants(seq, reader, vcfMap, vcfAssembly); + } + + /** + * Answers a map from sequence coordinates to VCF chromosome ranges + * + * @param seq + * @param vcfAssembly + * @return + */ + private VCFMap getVcfMap(SequenceI seq, String vcfAssembly) + { + /* + * simplest case: sequence has id and length matching a VCF contig + */ GeneLociI seqCoords = seq.getGeneLoci(); if (seqCoords == null) { - System.out.println(String.format( - "Can't query VCF for %s as chromosome coordinates not known", - seq.getName())); - return 0; + VCFMap map = getContigMap(seq); + if (map == null) + { + Cache.log.warn(String.format( + "Can't query VCF for %s as chromosome coordinates not known", + seq.getName())); + } + return map; + } + + String species = seqCoords.getSpeciesId(); + String chromosome = seqCoords.getChromosomeId(); + String seqRef = seqCoords.getAssemblyId(); + MapList map = seqCoords.getMap(); + + if (!vcfSpeciesMatchesSequence(vcfAssembly, species)) + { + return null; } - if (!vcfSpeciesMatchesSequence(vcfAssembly, seqCoords.getSpeciesId())) + if (vcfAssemblyMatchesSequence(vcfAssembly, seqRef)) { - return 0; + return new VCFMap(chromosome, map); } - List seqChromosomalContigs = seqCoords.getMap().getToRanges(); - for (int[] range : seqChromosomalContigs) + if (!"GRCh38".equalsIgnoreCase(seqRef) // Ensembl + || !vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD { - count += addVcfVariants(seq, reader, range, vcfAssembly); + return null; } - return count; + /* + * map chromosomal coordinates from sequence to VCF if the VCF + * data has a different reference assembly to the sequence + */ + // TODO generalise for cases other than GRCh38 -> GRCh37 ! + // - or get the user to choose in a dialog + + List toVcfRanges = new ArrayList<>(); + List fromSequenceRanges = new ArrayList<>(); + String toRef = "GRCh37"; + + for (int[] range : map.getToRanges()) + { + int[] fromRange = map.locateInFrom(range[0], range[1]); + if (fromRange == null) + { + // corrupted map?!? + continue; + } + + int[] newRange = mapReferenceRange(range, chromosome, "human", seqRef, + toRef); + if (newRange == null) + { + Cache.log.error( + String.format("Failed to map %s:%s:%s:%d:%d to %s", species, + chromosome, seqRef, range[0], range[1], toRef)); + continue; + } + else + { + toVcfRanges.add(newRange); + fromSequenceRanges.add(fromRange); + } + } + + return new VCFMap(chromosome, + new MapList(fromSequenceRanges, toVcfRanges, 1, 1)); + } + + /** + * If the sequence id matches a contig declared in the VCF file, and the + * sequence matches the contig length, then returns a 1:1 map of the sequence to + * the contig, else returns null + * + * @param seq + * @return + */ + private VCFMap getContigMap(SequenceI seq) + { + String id = seq.getName(); + for (VCFContigHeaderLine contig : header.getContigLines()) + { + if (contig.getID().equals(id)) + { + /* + * have to construct a SAMSequenceRecord to + * read the contig 'length' field! + */ + int len = seq.getLength(); + SAMSequenceRecord ssr = contig.getSAMSequenceRecord(); + if (len == ssr.getSequenceLength()) + { + MapList map = new MapList(new int[] { 1, len }, + new int[] + { 1, len }, 1, 1); + return new VCFMap(id, map); + } + } + + } + return null; + } + + /** + * Answers true if we determine that the VCF data uses the same reference + * assembly as the sequence, else false + * + * @param vcfAssembly + * @param seqRef + * @return + */ + private boolean vcfAssemblyMatchesSequence(String vcfAssembly, + String seqRef) + { + // TODO improve on this stub, which handles gnomAD and + // hopes for the best for other cases + + if ("GRCh38".equalsIgnoreCase(seqRef) // Ensembl + && vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD + { + return false; + } + return true; } /** @@ -556,93 +708,52 @@ public class VCFLoader } /** - * Queries the VCF reader for any variants that overlap the given chromosome - * region of the sequence, and adds as variant features. Returns the number of + * Queries the VCF reader for any variants that overlap the mapped chromosome + * ranges of the sequence, and adds as variant features. Returns the number of * overlapping variants found. * * @param seq * @param reader - * @param range - * start-end range of a sequence region in its chromosomal - * coordinates + * @param map + * mapping from sequence to VCF coordinates * @param vcfAssembly * the '##reference' identifier for the VCF reference assembly * @return */ protected int addVcfVariants(SequenceI seq, VCFReader reader, - int[] range, String vcfAssembly) + VCFMap map, String vcfAssembly) { - GeneLociI seqCoords = seq.getGeneLoci(); - - String chromosome = seqCoords.getChromosomeId(); - String seqRef = seqCoords.getAssemblyId(); - String species = seqCoords.getSpeciesId(); - - /* - * map chromosomal coordinates from sequence to VCF if the VCF - * data has a different reference assembly to the sequence - */ - // TODO generalise for non-human species - // - or get the user to choose in a dialog - - int offset = 0; - if ("GRCh38".equalsIgnoreCase(seqRef) // Ensembl - && vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD - { - String toRef = "GRCh37"; - int[] newRange = mapReferenceRange(range, chromosome, "human", - seqRef, toRef); - if (newRange == null) - { - System.err.println(String.format( - "Failed to map %s:%s:%s:%d:%d to %s", species, chromosome, - seqRef, range[0], range[1], toRef)); - return 0; - } - offset = newRange[0] - range[0]; - range = newRange; - } - - boolean forwardStrand = range[0] <= range[1]; + boolean forwardStrand = map.map.isToForwardStrand(); /* - * query the VCF for overlaps - * (convert a reverse strand range to forwards) + * query the VCF for overlaps of each contiguous chromosomal region */ int count = 0; - MapList mapping = seqCoords.getMap(); - int fromLocus = Math.min(range[0], range[1]); - int toLocus = Math.max(range[0], range[1]); - CloseableIterator variants = reader.query(chromosome, - fromLocus, toLocus); - while (variants.hasNext()) + for (int[] range : map.map.getToRanges()) { - /* - * get variant location in sequence chromosomal coordinates - */ - VariantContext variant = variants.next(); + int vcfStart = Math.min(range[0], range[1]); + int vcfEnd = Math.max(range[0], range[1]); + CloseableIterator variants = reader + .query(map.chromosome, vcfStart, vcfEnd); + while (variants.hasNext()) + { + VariantContext variant = variants.next(); - int start = variant.getStart() - offset; - int end = variant.getEnd() - offset; + int[] featureRange = map.map.locateInFrom(variant.getStart(), + variant.getEnd()); - /* - * convert chromosomal location to sequence coordinates - * - may be reverse strand (convert to forward for sequence feature) - * - null if a partially overlapping feature - */ - int[] seqLocation = mapping.locateInFrom(start, end); - if (seqLocation != null) - { - int featureStart = Math.min(seqLocation[0], seqLocation[1]); - int featureEnd = Math.max(seqLocation[0], seqLocation[1]); - count += addAlleleFeatures(seq, variant, featureStart, featureEnd, - forwardStrand); + if (featureRange != null) + { + int featureStart = Math.min(featureRange[0], featureRange[1]); + int featureEnd = Math.max(featureRange[0], featureRange[1]); + count += addAlleleFeatures(seq, variant, featureStart, featureEnd, + forwardStrand); + } } + variants.close(); } - variants.close(); - return count; }