From 195aaaebc7c27996d1db214494025edfd1505d63 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Fri, 26 Jul 2019 15:40:28 +0200 Subject: [PATCH] JAL-3375 format validate VCF Integer and Float fields --- src/jalview/io/vcf/VCFLoader.java | 117 ++++++++++++++++++++++++++++---- test/jalview/io/vcf/VCFLoaderTest.java | 15 +++- 2 files changed, 114 insertions(+), 18 deletions(-) diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index 053b52f..5544bd6 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -25,9 +25,11 @@ import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.Set; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; @@ -35,8 +37,10 @@ import htsjdk.samtools.SAMException; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.CloseableIterator; +import htsjdk.tribble.TribbleException; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineCount; @@ -51,7 +55,7 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine; */ public class VCFLoader { - private static final String NO_VALUE = "."; + private static final String NO_VALUE = VCFConstants.MISSING_VALUE_v4; // '.' 
private static final String DEFAULT_SPECIES = "homo_sapiens";
 
@@ -211,6 +215,12 @@ public class VCFLoader
    */
   Map vepFieldsOfInterest;
 
+  /*
+   * tokens of the form infoId:value for which invalid data has already been
+   * logged (the error is logged only once for each combination)
+   */
+  private Set<String> badData;
+
   /**
    * Constructor given a VCF file
    *
@@ -841,24 +851,35 @@ public class VCFLoader
     {
       int vcfStart = Math.min(range[0], range[1]);
       int vcfEnd = Math.max(range[0], range[1]);
-      CloseableIterator<VariantContext> variants = reader
-              .query(map.chromosome, vcfStart, vcfEnd);
-      while (variants.hasNext())
+      // try-with-resources: the iterator is closed even if the
+      // query or the iteration throws
+      try (CloseableIterator<VariantContext> variants = reader
+              .query(map.chromosome, vcfStart, vcfEnd))
       {
-        VariantContext variant = variants.next();
+        while (variants.hasNext())
+        {
+          VariantContext variant = variants.next();
 
-        int[] featureRange = map.map.locateInFrom(variant.getStart(),
-                variant.getEnd());
+          int[] featureRange = map.map.locateInFrom(variant.getStart(),
+                  variant.getEnd());
 
-        if (featureRange != null)
-        {
-          int featureStart = Math.min(featureRange[0], featureRange[1]);
-          int featureEnd = Math.max(featureRange[0], featureRange[1]);
-          count += addAlleleFeatures(seq, variant, featureStart, featureEnd,
-                  forwardStrand);
+          if (featureRange != null)
+          {
+            int featureStart = Math.min(featureRange[0], featureRange[1]);
+            int featureEnd = Math.max(featureRange[0], featureRange[1]);
+            count += addAlleleFeatures(seq, variant, featureStart,
+                    featureEnd, forwardStrand);
+          }
         }
+      } catch (TribbleException e)
+      {
+        /*
+         * RuntimeException throwable by htsjdk
+         */
+        String msg = String.format("Error reading VCF for %s:%d-%d: %s ",
+                map.chromosome, vcfStart, vcfEnd, e.getMessage());
+        Cache.log.error(msg);
       }
-      variants.close();
     }
 
     return count;
@@ -1235,7 +1256,7 @@ public class VCFLoader
        * take the index'th value
        */
       String value = getAttributeValue(variant, key, index);
-      if (value != null)
+      if (value != null && isValid(variant, key, value))
       {
         sf.setValue(key, value);
       }
@@ -1243,6 +1264,72 @@ public class VCFLoader
   }
 
   /**
+   * Answers true for '.', null, or an empty value, or if the INFO type is
+   * neither Integer nor Float. Answers false (and logs an error) if the field
+   * has no INFO header, or an Integer or Float value fails to parse.
+   *
+   * @param variant the variant being read (used only for error logging)
+   * @param infoId the ID of the INFO field
+   * @param value the field value to check
+   * @return true if the value is acceptable, else false
+   */
+  protected boolean isValid(VariantContext variant, String infoId,
+          String value)
+  {
+    if (value == null || value.isEmpty() || NO_VALUE.equals(value))
+    {
+      return true;
+    }
+    VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(infoId);
+    if (infoHeader == null)
+    {
+      Cache.log.error("Field " + infoId + " has no INFO header");
+      return false;
+    }
+    VCFHeaderLineType infoType = infoHeader.getType();
+    try
+    {
+      if (infoType == VCFHeaderLineType.Integer)
+      {
+        Integer.parseInt(value);
+      }
+      else if (infoType == VCFHeaderLineType.Float)
+      {
+        Float.parseFloat(value);
+      }
+    } catch (NumberFormatException e)
+    {
+      logInvalidValue(variant, infoId, value);
+      return false;
+    }
+    return true;
+  }
+
+  /**
+   * Logs an error message for malformed data; duplicate messages (same id and
+   * value) are not logged
+   *
+   * @param variant supplies the contig and start position for the message
+   * @param infoId the ID of the INFO field
+   * @param value the rejected value
+   */
+  private void logInvalidValue(VariantContext variant, String infoId,
+          String value)
+  {
+    if (badData == null)
+    {
+      badData = new HashSet<>();
+    }
+    String token = infoId + ":" + value;
+    // Set.add answers false if this id:value was already reported
+    if (badData.add(token))
+    {
+      Cache.log.error(String.format("Invalid VCF data at %s:%d %s=%s",
+              variant.getContig(), variant.getStart(), infoId, value));
+    }
+  }
+
+  /**
    * Inspects CSQ data blocks (consequences) and adds attributes on the sequence
    * feature.
    *

diff --git a/test/jalview/io/vcf/VCFLoaderTest.java b/test/jalview/io/vcf/VCFLoaderTest.java index 6ba6fbe..808fe86 100644 --- a/test/jalview/io/vcf/VCFLoaderTest.java +++ b/test/jalview/io/vcf/VCFLoaderTest.java @@ -64,11 +64,14 @@ public class VCFLoaderTest private static final String[] VCF = { "##fileformat=VCFv4.2", // fields other than AF are ignored when parsing as they have no INFO definition "##INFO=", + "##INFO=