X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2Fvcf%2FVCFLoader.java;h=ea5b8e06f723f3ac512a903c780508e2158184f5;hb=57738a1f3c19b1c3a00bd3ac5108f8cd0af32f99;hp=5ba5d93975afa387de2506c8384019ef772377b2;hpb=352ce69c8b9372aeaa0e87e9df29b90680a4ad59;p=jalview.git diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index 5ba5d93..ea5b8e0 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -1,33 +1,37 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see . + * The Jalview Authors are detailed in the 'AUTHORS' file. + */ package jalview.io.vcf; -import jalview.analysis.AlignmentUtils; -import jalview.analysis.Dna; -import jalview.api.AlignViewControllerGuiI; -import jalview.bin.Cache; -import jalview.datamodel.DBRefEntry; -import jalview.datamodel.GeneLociI; -import jalview.datamodel.Mapping; -import jalview.datamodel.SequenceFeature; -import jalview.datamodel.SequenceI; -import jalview.datamodel.features.FeatureAttributeType; -import jalview.datamodel.features.FeatureSource; -import jalview.datamodel.features.FeatureSources; -import jalview.ext.ensembl.EnsemblMap; -import jalview.ext.htsjdk.HtsContigDb; -import jalview.ext.htsjdk.VCFReader; -import jalview.io.gff.Gff3Helper; -import jalview.io.gff.SequenceOntologyI; -import jalview.util.MapList; -import jalview.util.MappingUtils; -import jalview.util.MessageManager; +import java.util.Locale; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.Set; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; @@ -35,13 +39,36 @@ import htsjdk.samtools.SAMException; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.CloseableIterator; +import htsjdk.tribble.TribbleException; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineCount; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; +import jalview.analysis.Dna; +import jalview.api.AlignViewControllerGuiI; +import jalview.bin.Cache; +import jalview.bin.Console; +import jalview.datamodel.DBRefEntry; +import jalview.datamodel.GeneLociI; +import jalview.datamodel.Mapping; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; +import jalview.datamodel.features.FeatureAttributeType; +import jalview.datamodel.features.FeatureSource; +import jalview.datamodel.features.FeatureSources; +import jalview.ext.ensembl.EnsemblMap; +import jalview.ext.htsjdk.HtsContigDb; +import jalview.ext.htsjdk.VCFReader; +import jalview.io.gff.Gff3Helper; +import jalview.io.gff.SequenceOntologyI; +import jalview.util.MapList; +import jalview.util.MappingUtils; +import jalview.util.MessageManager; +import jalview.util.StringUtils; /** * A class to read VCF data (using the htsjdk) and add variants as sequence @@ -51,14 +78,32 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine; */ public class VCFLoader { + private static final String VCF_ENCODABLE = ":;=%,"; + + /* + * Jalview feature attributes for VCF fixed column data + */ + private static final String VCF_POS = "POS"; + + private static final String VCF_ID = "ID"; + + private static final String VCF_QUAL = "QUAL"; + + private static final String VCF_FILTER = "FILTER"; + + private static final String NO_VALUE = VCFConstants.MISSING_VALUE_v4; // '.' + + private static final String DEFAULT_SPECIES = "homo_sapiens"; + /** - * A class to model the mapping from sequence to VCF coordinates. Cases include + * A class to model the mapping from sequence to VCF coordinates. Cases + * include * */ class VCFMap @@ -82,7 +127,7 @@ public class VCFLoader /* * Lookup keys, and default values, for Preference entries that describe - * patterns for VCF and VEP fields to capture + * patterns for VCF and VEP fields to capture */ private static final String VEP_FIELDS_PREF = "VEP_FIELDS"; @@ -93,12 +138,29 @@ public class VCFLoader private static final String DEFAULT_VEP_FIELDS = ".*";// "Allele,Consequence,IMPACT,SWISSPROT,SIFT,PolyPhen,CLIN_SIG"; /* + * Lookup keys, and default values, for Preference entries that give + * mappings from tokens in the 'reference' header to species or assembly + */ + private static final String VCF_ASSEMBLY = "VCF_ASSEMBLY"; + + private static final String DEFAULT_VCF_ASSEMBLY = "assembly19=GRCh37,hs37=GRCh37,grch37=GRCh37,grch38=GRCh38"; + + private static final String VCF_SPECIES = "VCF_SPECIES"; // default is human + + private static final String DEFAULT_REFERENCE = "grch37"; // fallback default + // is human GRCh37 + + /* * keys to fields of VEP CSQ consequence data * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html */ private static final String CSQ_CONSEQUENCE_KEY = "Consequence"; + private static final String CSQ_ALLELE_KEY = "Allele"; - private static final String CSQ_ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), 1... + + private static final String CSQ_ALLELE_NUM_KEY = "ALLELE_NUM"; // 0 (ref), + // 1... + private static final String CSQ_FEATURE_KEY = "Feature"; // Ensembl stable id /* @@ -114,12 +176,6 @@ public class VCFLoader private static final String PIPE_REGEX = "\\|"; /* - * key for Allele Frequency output by VEP - * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html - */ - private static final String ALLELE_FREQUENCY_KEY = "AF"; - - /* * delimiter that separates multiple consequence data blocks */ private static final String COMMA = ","; @@ -155,6 +211,16 @@ public class VCFLoader private VCFHeader header; /* + * species (as a valid Ensembl term) the VCF is for + */ + private String vcfSpecies; + + /* + * genome assembly version (as a valid Ensembl identifier) the VCF is for + */ + private String vcfAssembly; + + /* * a Dictionary of contigs (if present) referenced in the VCF file */ private SAMSequenceDictionary dictionary; @@ -165,8 +231,11 @@ public class VCFLoader * see http://www.ensembl.org/info/docs/tools/vep/vep_formats.html */ private int csqConsequenceFieldIndex = -1; + private int csqAlleleFieldIndex = -1; + private int csqAlleleNumberFieldIndex = -1; + private int csqFeatureFieldIndex = -1; // todo the same fields for SnpEff ANN data if wanted @@ -191,6 +260,12 @@ public class VCFLoader */ Map vepFieldsOfInterest; + /* + * key:value for which rejected data has been seen + * (the error is logged only once for each combination) + */ + private Set badData; + /** * Constructor given a VCF file * @@ -246,12 +321,19 @@ public class VCFLoader */ public SequenceI loadVCFContig(String contig) { - String ref = header.getOtherHeaderLine(VCFHeader.REFERENCE_KEY) - .getValue(); + VCFHeaderLine headerLine = header + .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); + if (headerLine == null) + { + Console.error("VCF reference header not found"); + return null; + } + String ref = headerLine.getValue(); if (ref.startsWith("file://")) { ref = ref.substring(7); } + setSpeciesAndAssembly(ref); SequenceI seq = null; File dbFile = new File(ref); @@ -260,12 +342,12 @@ public class VCFLoader { HtsContigDb db = new HtsContigDb("", dbFile); seq = db.getSequenceProxy(contig); - loadSequenceVCF(seq, ref); + loadSequenceVCF(seq); db.close(); } else { - System.err.println("VCF reference not found: " + ref); + Console.error("VCF reference not found: " + ref); } return seq; @@ -284,7 +366,9 @@ public class VCFLoader { VCFHeaderLine ref = header .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); - String vcfAssembly = ref.getValue(); + String reference = ref == null ? null : ref.getValue(); + + setSpeciesAndAssembly(reference); int varCount = 0; int seqCount = 0; @@ -294,7 +378,7 @@ public class VCFLoader */ for (SequenceI seq : seqs) { - int added = loadSequenceVCF(seq, vcfAssembly); + int added = loadSequenceVCF(seq); if (added > 0) { seqCount++; @@ -338,6 +422,71 @@ public class VCFLoader } /** + * Attempts to determine and save the species and genome assembly version to + * which the VCF data applies. This may be done by parsing the + * {@code reference} header line, configured in a property file, or + * (potentially) confirmed interactively by the user. + *

+ * The saved values should be identifiers valid for Ensembl's REST service + * {@code map} endpoint, so they can be used (if necessary) to retrieve the + * mapping between VCF coordinates and sequence coordinates. + * + * @param reference + * @see https://rest.ensembl.org/documentation/info/assembly_map + * @see https://rest.ensembl.org/info/assembly/human?content-type=text/xml + * @see https://rest.ensembl.org/info/species?content-type=text/xml + */ + protected void setSpeciesAndAssembly(String reference) + { + if (reference == null) + { + Console.error("No VCF ##reference found, defaulting to " + + DEFAULT_REFERENCE + ":" + DEFAULT_SPECIES); + reference = DEFAULT_REFERENCE; // default to GRCh37 if not specified + } + reference = reference.toLowerCase(Locale.ROOT); + + /* + * for a non-human species, or other assembly identifier, + * specify as a Jalview property file entry e.g. + * VCF_ASSEMBLY = hs37=GRCh37,assembly19=GRCh37 + * VCF_SPECIES = c_elegans=celegans + * to map a token in the reference header to a value + */ + String prop = Cache.getDefault(VCF_ASSEMBLY, DEFAULT_VCF_ASSEMBLY); + for (String token : prop.split(",")) + { + String[] tokens = token.split("="); + if (tokens.length == 2) + { + if (reference.contains(tokens[0].trim().toLowerCase(Locale.ROOT))) + { + vcfAssembly = tokens[1].trim(); + break; + } + } + } + + vcfSpecies = DEFAULT_SPECIES; + prop = Cache.getProperty(VCF_SPECIES); + if (prop != null) + { + for (String token : prop.split(",")) + { + String[] tokens = token.split("="); + if (tokens.length == 2) + { + if (reference.contains(tokens[0].trim().toLowerCase(Locale.ROOT))) + { + vcfSpecies = tokens[1].trim(); + break; + } + } + } + } + } + + /** * Opens the VCF file and parses header data * * @param filePath @@ -432,7 +581,7 @@ public class VCFLoader { for (Pattern p : filters) { - if (p.matcher(id.toUpperCase()).matches()) + if (p.matcher(id.toUpperCase(Locale.ROOT)).matches()) { return true; } @@ -526,7 +675,7 @@ public class VCFLoader { try { - patterns.add(Pattern.compile(token.toUpperCase())); + patterns.add(Pattern.compile(token.toUpperCase(Locale.ROOT))); } catch (PatternSyntaxException e) { System.err.println("Invalid pattern ignored: " + token); @@ -537,13 +686,12 @@ public class VCFLoader /** * Transfers VCF features to sequences to which this sequence has a mapping. - * If the mapping is 3:1, computes peptide variants from nucleotide variants. * * @param seq */ protected void transferAddedFeatures(SequenceI seq) { - DBRefEntry[] dbrefs = seq.getDBRefs(); + List dbrefs = seq.getDBRefs(); if (dbrefs == null) { return; @@ -563,7 +711,8 @@ public class VCFLoader /* * dna-to-peptide product mapping */ - AlignmentUtils.computeProteinFeatures(seq, mapTo, map); + // JAL-3187 render on the fly instead + // AlignmentUtils.computeProteinFeatures(seq, mapTo, map); } else { @@ -584,16 +733,15 @@ public class VCFLoader } /** - * Tries to add overlapping variants read from a VCF file to the given sequence, - * and returns the number of variant features added + * Tries to add overlapping variants read from a VCF file to the given + * sequence, and returns the number of variant features added * * @param seq - * @param vcfAssembly * @return */ - protected int loadSequenceVCF(SequenceI seq, String vcfAssembly) + protected int loadSequenceVCF(SequenceI seq) { - VCFMap vcfMap = getVcfMap(seq, vcfAssembly); + VCFMap vcfMap = getVcfMap(seq); if (vcfMap == null) { return 0; @@ -614,10 +762,9 @@ public class VCFLoader * Answers a map from sequence coordinates to VCF chromosome ranges * * @param seq - * @param vcfAssembly * @return */ - private VCFMap getVcfMap(SequenceI seq, String vcfAssembly) + private VCFMap getVcfMap(SequenceI seq) { /* * simplest case: sequence has id and length matching a VCF contig @@ -639,7 +786,7 @@ public class VCFLoader GeneLociI seqCoords = seq.getGeneLoci(); if (seqCoords == null) { - Cache.log.warn(String.format( + Console.warn(String.format( "Can't query VCF for %s as chromosome coordinates not known", seq.getName())); return null; @@ -648,34 +795,28 @@ public class VCFLoader String species = seqCoords.getSpeciesId(); String chromosome = seqCoords.getChromosomeId(); String seqRef = seqCoords.getAssemblyId(); - MapList map = seqCoords.getMap(); + MapList map = seqCoords.getMapping(); - if (!vcfSpeciesMatchesSequence(vcfAssembly, species)) + // note this requires the configured species to match that + // returned with the Ensembl sequence; todo: support aliases? + if (!vcfSpecies.equalsIgnoreCase(species)) { + Console.warn("No VCF loaded to " + seq.getName() + + " as species not matched"); return null; } - if (vcfAssemblyMatchesSequence(vcfAssembly, seqRef)) + if (seqRef.equalsIgnoreCase(vcfAssembly)) { return new VCFMap(chromosome, map); } - if (!"GRCh38".equalsIgnoreCase(seqRef) // Ensembl - || !vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD - { - return null; - } - /* - * map chromosomal coordinates from sequence to VCF if the VCF - * data has a different reference assembly to the sequence + * VCF data has a different reference assembly to the sequence: + * query Ensembl to map chromosomal coordinates from sequence to VCF */ - // TODO generalise for cases other than GRCh38 -> GRCh37 ! - // - or get the user to choose in a dialog - List toVcfRanges = new ArrayList<>(); List fromSequenceRanges = new ArrayList<>(); - String toRef = "GRCh37"; for (int[] range : map.getToRanges()) { @@ -687,12 +828,12 @@ public class VCFLoader } int[] newRange = mapReferenceRange(range, chromosome, "human", seqRef, - toRef); + vcfAssembly); if (newRange == null) { - Cache.log.error( - String.format("Failed to map %s:%s:%s:%d:%d to %s", species, - chromosome, seqRef, range[0], range[1], toRef)); + Console.error(String.format("Failed to map %s:%s:%s:%d:%d to %s", + species, chromosome, seqRef, range[0], range[1], + vcfAssembly)); continue; } else @@ -733,62 +874,6 @@ public class VCFLoader } /** - * Answers true if we determine that the VCF data uses the same reference - * assembly as the sequence, else false - * - * @param vcfAssembly - * @param seqRef - * @return - */ - private boolean vcfAssemblyMatchesSequence(String vcfAssembly, - String seqRef) - { - // TODO improve on this stub, which handles gnomAD and - // hopes for the best for other cases - - if ("GRCh38".equalsIgnoreCase(seqRef) // Ensembl - && vcfAssembly.contains("Homo_sapiens_assembly19")) // gnomAD - { - return false; - } - return true; - } - - /** - * Answers true if the species inferred from the VCF reference identifier - * matches that for the sequence - * - * @param vcfAssembly - * @param speciesId - * @return - */ - boolean vcfSpeciesMatchesSequence(String vcfAssembly, String speciesId) - { - // PROBLEM 1 - // there are many aliases for species - how to equate one with another? - // PROBLEM 2 - // VCF ##reference header is an unstructured URI - how to extract species? - // perhaps check if ref includes any (Ensembl) alias of speciesId?? - // TODO ask the user to confirm this?? - - if (vcfAssembly.contains("Homo_sapiens") // gnomAD exome data example - && "HOMO_SAPIENS".equals(speciesId)) // Ensembl species id - { - return true; - } - - if (vcfAssembly.contains("c_elegans") // VEP VCF response example - && "CAENORHABDITIS_ELEGANS".equals(speciesId)) // Ensembl - { - return true; - } - - // this is not a sustainable solution... - - return false; - } - - /** * Queries the VCF reader for any variants that overlap the mapped chromosome * ranges of the sequence, and adds as variant features. Returns the number of * overlapping variants found. @@ -811,24 +896,42 @@ public class VCFLoader { int vcfStart = Math.min(range[0], range[1]); int vcfEnd = Math.max(range[0], range[1]); - CloseableIterator variants = reader - .query(map.chromosome, vcfStart, vcfEnd); - while (variants.hasNext()) + try { - VariantContext variant = variants.next(); + CloseableIterator variants = reader + .query(map.chromosome, vcfStart, vcfEnd); + while (variants.hasNext()) + { + VariantContext variant = variants.next(); - int[] featureRange = map.map.locateInFrom(variant.getStart(), - variant.getEnd()); + int[] featureRange = map.map.locateInFrom(variant.getStart(), + variant.getEnd()); - if (featureRange != null) - { - int featureStart = Math.min(featureRange[0], featureRange[1]); - int featureEnd = Math.max(featureRange[0], featureRange[1]); - count += addAlleleFeatures(seq, variant, featureStart, featureEnd, - forwardStrand); + /* + * only take features whose range is fully mappable to sequence positions + */ + if (featureRange != null) + { + int featureStart = Math.min(featureRange[0], featureRange[1]); + int featureEnd = Math.max(featureRange[0], featureRange[1]); + if (featureEnd - featureStart == variant.getEnd() + - variant.getStart()) + { + count += addAlleleFeatures(seq, variant, featureStart, + featureEnd, forwardStrand); + } + } } + variants.close(); + } catch (TribbleException e) + { + /* + * RuntimeException throwable by htsjdk + */ + String msg = String.format("Error reading VCF for %s:%d-%d: %s ", + map.chromosome, vcfStart, vcfEnd, e.getLocalizedMessage()); + Console.error(msg); } - variants.close(); } return count; @@ -891,8 +994,8 @@ public class VCFLoader /** * Inspects one allele and attempts to add a variant feature for it to the * sequence. The additional data associated with this allele is extracted to - * store in the feature's key-value map. Answers the number of features added (0 - * or 1). + * store in the feature's key-value map. Answers the number of features added + * (0 or 1). * * @param seq * @param variant @@ -940,10 +1043,10 @@ public class VCFLoader * pick out the consequence data (if any) that is for the current allele * and feature (transcript) that matches the current sequence */ - String consequence = getConsequenceForAlleleAndFeature(variant, CSQ_FIELD, - altAlleleIndex, csqAlleleFieldIndex, - csqAlleleNumberFieldIndex, seq.getName().toLowerCase(), - csqFeatureFieldIndex); + String consequence = getConsequenceForAlleleAndFeature(variant, + CSQ_FIELD, altAlleleIndex, csqAlleleFieldIndex, + csqAlleleNumberFieldIndex, + seq.getName().toLowerCase(Locale.ROOT), csqFeatureFieldIndex); /* * pick out the ontology term for the consequence type @@ -958,7 +1061,20 @@ public class VCFLoader featureEnd, FEATURE_GROUP_VCF); sf.setSource(sourceId); - sf.setValue(Gff3Helper.ALLELES, alleles); + /* + * save the derived alleles as a named attribute; this will be + * needed when Jalview computes derived peptide variants + */ + addFeatureAttribute(sf, Gff3Helper.ALLELES, alleles); + + /* + * add selected VCF fixed column data as feature attributes + */ + addFeatureAttribute(sf, VCF_POS, String.valueOf(variant.getStart())); + addFeatureAttribute(sf, VCF_ID, variant.getID()); + addFeatureAttribute(sf, VCF_QUAL, + String.valueOf(variant.getPhredScaledQual())); + addFeatureAttribute(sf, VCF_FILTER, getFilter(variant)); addAlleleProperties(variant, sf, altAlleleIndex, consequence); @@ -968,9 +1084,56 @@ public class VCFLoader } /** - * Determines the Sequence Ontology term to use for the variant feature type in - * Jalview. The default is 'sequence_variant', but a more specific term is used - * if: + * Answers the VCF FILTER value for the variant - or an approximation to it. + * This field is either PASS, or a semi-colon separated list of filters not + * passed. htsjdk saves filters as a HashSet, so the order when reassembled + * into a list may be different. + * + * @param variant + * @return + */ + String getFilter(VariantContext variant) + { + Set filters = variant.getFilters(); + if (filters.isEmpty()) + { + return NO_VALUE; + } + Iterator iterator = filters.iterator(); + String first = iterator.next(); + if (filters.size() == 1) + { + return first; + } + + StringBuilder sb = new StringBuilder(first); + while (iterator.hasNext()) + { + sb.append(";").append(iterator.next()); + } + + return sb.toString(); + } + + /** + * Adds one feature attribute unless the value is null, empty or '.' + * + * @param sf + * @param key + * @param value + */ + void addFeatureAttribute(SequenceFeature sf, String key, String value) + { + if (value != null && !value.isEmpty() && !NO_VALUE.equals(value)) + { + sf.setValue(key, value); + } + } + + /** + * Determines the Sequence Ontology term to use for the variant feature type + * in Jalview. The default is 'sequence_variant', but a more specific term is + * used if: *

    *
  • VEP (or SnpEff) Consequence annotation is included in the VCF
  • *
  • sequence id can be matched to VEP Feature (or SnpEff Feature_ID)
  • @@ -1048,8 +1211,7 @@ public class VCFLoader */ private String getConsequenceForAlleleAndFeature(VariantContext variant, String vcfInfoId, int altAlleleIndex, int alleleFieldIndex, - int alleleNumberFieldIndex, - String seqName, int featureFieldIndex) + int alleleNumberFieldIndex, String seqName, int featureFieldIndex) { if (alleleFieldIndex == -1 || featureFieldIndex == -1) { @@ -1074,8 +1236,8 @@ public class VCFLoader if (csqFields.length > featureFieldIndex) { String featureIdentifier = csqFields[featureFieldIndex]; - if (featureIdentifier.length() > 4 - && seqName.indexOf(featureIdentifier.toLowerCase()) > -1) + if (featureIdentifier.length() > 4 && seqName + .indexOf(featureIdentifier.toLowerCase(Locale.ROOT)) > -1) { /* * feature (transcript) matched - now check for allele match @@ -1161,14 +1323,6 @@ public class VCFLoader } /* - * filter out fields we don't want to capture - */ - if (!vcfFieldsOfInterest.contains(key)) - { - continue; - } - - /* * we extract values for other data which are allele-specific; * these may be per alternate allele (INFO[key].Number = 'A') * or per allele including reference (INFO[key].Number = 'R') @@ -1205,10 +1359,81 @@ public class VCFLoader * take the index'th value */ String value = getAttributeValue(variant, key, index); - if (value != null) + if (value != null && isValid(variant, key, value)) + { + /* + * decode colon, semicolon, equals sign, percent sign, comma (only) + * as required by the VCF specification (para 1.2) + */ + value = StringUtils.urlDecode(value, VCF_ENCODABLE); + addFeatureAttribute(sf, key, value); + } + } + } + + /** + * Answers true for '.', null, or an empty value, or if the INFO type is + * String. If the INFO type is Integer or Float, answers false if the value is + * not in valid format. + * + * @param variant + * @param infoId + * @param value + * @return + */ + protected boolean isValid(VariantContext variant, String infoId, + String value) + { + if (value == null || value.isEmpty() || NO_VALUE.equals(value)) + { + return true; + } + VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(infoId); + if (infoHeader == null) + { + Console.error("Field " + infoId + " has no INFO header"); + return false; + } + VCFHeaderLineType infoType = infoHeader.getType(); + try + { + if (infoType == VCFHeaderLineType.Integer) { - sf.setValue(key, value); + Integer.parseInt(value); } + else if (infoType == VCFHeaderLineType.Float) + { + Float.parseFloat(value); + } + } catch (NumberFormatException e) + { + logInvalidValue(variant, infoId, value); + return false; + } + return true; + } + + /** + * Logs an error message for malformed data; duplicate messages (same id and + * value) are not logged + * + * @param variant + * @param infoId + * @param value + */ + private void logInvalidValue(VariantContext variant, String infoId, + String value) + { + if (badData == null) + { + badData = new HashSet<>(); + } + String token = infoId + ":" + value; + if (!badData.contains(token)) + { + badData.add(token); + Console.error(String.format("Invalid VCF data at %s:%d %s=%s", + variant.getContig(), variant.getStart(), infoId, value)); } } @@ -1260,6 +1485,11 @@ public class VCFLoader String id = vepFieldsOfInterest.get(i); if (id != null) { + /* + * VCF spec requires encoding of special characters e.g. '=' + * so decode them here before storing + */ + field = StringUtils.urlDecode(field, VCF_ENCODABLE); csqValues.put(id, field); } } @@ -1369,8 +1599,8 @@ public class VCFLoader * @param toRef * @return */ - protected int[] findSubsumedRangeMapping(int[] queryRange, String chromosome, - String species, String fromRef, String toRef) + protected int[] findSubsumedRangeMapping(int[] queryRange, + String chromosome, String species, String fromRef, String toRef) { String key = makeRangesKey(chromosome, species, fromRef, toRef); if (assemblyMappings.containsKey(key)) @@ -1392,7 +1622,8 @@ public class VCFLoader */ int offset = queryRange[0] - fromRange[0]; int mappedRangeFrom = toRange[0] + offset; - int mappedRangeTo = mappedRangeFrom + (queryRange[1] - queryRange[0]); + int mappedRangeTo = mappedRangeFrom + + (queryRange[1] - queryRange[0]); return new int[] { mappedRangeFrom, mappedRangeTo }; } } @@ -1415,7 +1646,7 @@ public class VCFLoader SequenceI targetSequence, MapList mapping) { int[] mappedRange = mapping.locateInTo(sf.getBegin(), sf.getEnd()); - + if (mappedRange != null) { String group = sf.getFeatureGroup(); @@ -1439,7 +1670,6 @@ public class VCFLoader protected static String makeRangesKey(String chromosome, String species, String fromRef, String toRef) { - return species + EXCL + chromosome + EXCL + fromRef + EXCL - + toRef; + return species + EXCL + chromosome + EXCL + fromRef + EXCL + toRef; } }