X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2Fvcf%2FVCFLoader.java;h=1abe638517229ba40c740a542987159edbd34866;hb=b03b0404e7e6ff3e6abf0285df7b61fea69319f0;hp=f4ce1a38492465b8558b7df437d6fe00533fd33e;hpb=873061987e0feabe7be4ead7c1767840f1e60723;p=jalview.git diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index f4ce1a3..1abe638 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -1,6 +1,25 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see <http://www.gnu.org/licenses/>. + * The Jalview Authors are detailed in the 'AUTHORS' file. 
+ */ package jalview.io.vcf; -import jalview.analysis.AlignmentUtils; import jalview.analysis.Dna; import jalview.api.AlignViewControllerGuiI; import jalview.bin.Cache; @@ -20,14 +39,19 @@ import jalview.io.gff.SequenceOntologyI; import jalview.util.MapList; import jalview.util.MappingUtils; import jalview.util.MessageManager; +import jalview.util.StringUtils; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Map.Entry; +import java.util.Set; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; @@ -35,8 +59,10 @@ import htsjdk.samtools.SAMException; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.CloseableIterator; +import htsjdk.tribble.TribbleException; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineCount; @@ -51,6 +77,21 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine; */ public class VCFLoader { + private static final String VCF_ENCODABLE = ":;=%,"; + + /* + * Jalview feature attributes for VCF fixed column data + */ + private static final String VCF_POS = "POS"; + + private static final String VCF_ID = "ID"; + + private static final String VCF_QUAL = "QUAL"; + + private static final String VCF_FILTER = "FILTER"; + + private static final String NO_VALUE = VCFConstants.MISSING_VALUE_v4; // '.' 
+ private static final String DEFAULT_SPECIES = "homo_sapiens"; /** @@ -100,10 +141,12 @@ public class VCFLoader */ private static final String VCF_ASSEMBLY = "VCF_ASSEMBLY"; - private static final String DEFAULT_VCF_ASSEMBLY = "assembly19=GRCh38,hs37=GRCh37,grch37=GRCh37,grch38=GRCh38"; + private static final String DEFAULT_VCF_ASSEMBLY = "assembly19=GRCh37,hs37=GRCh37,grch37=GRCh37,grch38=GRCh38"; private static final String VCF_SPECIES = "VCF_SPECIES"; // default is human + private static final String DEFAULT_REFERENCE = "grch37"; // fallback default is human GRCh37 + /* * keys to fields of VEP CSQ consequence data * see https://www.ensembl.org/info/docs/tools/vep/vep_formats.html @@ -207,10 +250,16 @@ public class VCFLoader */ Map vepFieldsOfInterest; + /* + * key:value for which rejected data has been seen + * (the error is logged only once for each combination) + */ + private Set badData; + /** - * Constructor given a VCF file + * Constructor given a path to a VCF file * - * @param alignment + * @param vcfFile */ public VCFLoader(String vcfFile) { @@ -262,8 +311,13 @@ public class VCFLoader */ public SequenceI loadVCFContig(String contig) { - String ref = header.getOtherHeaderLine(VCFHeader.REFERENCE_KEY) - .getValue(); + VCFHeaderLine headerLine = header.getOtherHeaderLine(VCFHeader.REFERENCE_KEY); + if (headerLine == null) + { + Cache.log.error("VCF reference header not found"); + return null; + } + String ref = headerLine.getValue(); if (ref.startsWith("file://")) { ref = ref.substring(7); @@ -282,7 +336,7 @@ public class VCFLoader } else { - System.err.println("VCF reference not found: " + ref); + Cache.log.error("VCF reference not found: " + ref); } return seq; @@ -301,7 +355,7 @@ public class VCFLoader { VCFHeaderLine ref = header .getOtherHeaderLine(VCFHeader.REFERENCE_KEY); - String reference = ref.getValue(); + String reference = ref == null ? 
null : ref.getValue(); setSpeciesAndAssembly(reference); @@ -373,7 +427,13 @@ public class VCFLoader */ protected void setSpeciesAndAssembly(String reference) { - vcfSpecies = DEFAULT_SPECIES; + if (reference == null) + { + Cache.log.error("No VCF ##reference found, defaulting to " + + DEFAULT_REFERENCE + ":" + DEFAULT_SPECIES); + reference = DEFAULT_REFERENCE; // default to GRCh37 if not specified + } + reference = reference.toLowerCase(); /* * for a non-human species, or other assembly identifier, @@ -396,6 +456,7 @@ public class VCFLoader } } + vcfSpecies = DEFAULT_SPECIES; prop = Cache.getProperty(VCF_SPECIES); if (prop != null) { @@ -509,7 +570,7 @@ public class VCFLoader { for (Pattern p : filters) { - if (p.matcher(id.toUpperCase()).matches()) + if (p.matcher(id.toUpperCase(Locale.ROOT)).matches()) { return true; } @@ -603,7 +664,7 @@ public class VCFLoader { try { - patterns.add(Pattern.compile(token.toUpperCase())); + patterns.add(Pattern.compile(token.toUpperCase(Locale.ROOT))); } catch (PatternSyntaxException e) { System.err.println("Invalid pattern ignored: " + token); @@ -640,7 +701,8 @@ public class VCFLoader /* * dna-to-peptide product mapping */ - AlignmentUtils.computeProteinFeatures(seq, mapTo, map); + // JAL-3187 render on the fly instead + // AlignmentUtils.computeProteinFeatures(seq, mapTo, map); } else { @@ -723,7 +785,7 @@ public class VCFLoader String species = seqCoords.getSpeciesId(); String chromosome = seqCoords.getChromosomeId(); String seqRef = seqCoords.getAssemblyId(); - MapList map = seqCoords.getMap(); + MapList map = seqCoords.getMapping(); // note this requires the configured species to match that // returned with the Ensembl sequence; todo: support aliases? 
@@ -803,40 +865,6 @@ public class VCFLoader } /** - * Answers true if the species inferred from the VCF reference identifier - * matches that for the sequence - * - * @param vcfAssembly - * @param speciesId - * @return - */ - boolean vcfSpeciesMatchesSequence(String vcfAssembly, String speciesId) - { - // PROBLEM 1 - // there are many aliases for species - how to equate one with another? - // PROBLEM 2 - // VCF ##reference header is an unstructured URI - how to extract species? - // perhaps check if ref includes any (Ensembl) alias of speciesId?? - // TODO ask the user to confirm this?? - - if (vcfAssembly.contains("Homo_sapiens") // gnomAD exome data example - && "HOMO_SAPIENS".equals(speciesId)) // Ensembl species id - { - return true; - } - - if (vcfAssembly.contains("c_elegans") // VEP VCF response example - && "CAENORHABDITIS_ELEGANS".equals(speciesId)) // Ensembl - { - return true; - } - - // this is not a sustainable solution... - - return false; - } - - /** * Queries the VCF reader for any variants that overlap the mapped chromosome * ranges of the sequence, and adds as variant features. Returns the number of * overlapping variants found. 
@@ -859,31 +887,46 @@ public class VCFLoader { int vcfStart = Math.min(range[0], range[1]); int vcfEnd = Math.max(range[0], range[1]); - CloseableIterator variants = reader - .query(map.chromosome, vcfStart, vcfEnd); - while (variants.hasNext()) + try { - VariantContext variant = variants.next(); + CloseableIterator variants = reader + .query(map.chromosome, vcfStart, vcfEnd); + while (variants.hasNext()) + { + VariantContext variant = variants.next(); - int[] featureRange = map.map.locateInFrom(variant.getStart(), - variant.getEnd()); + int[] featureRange = map.map.locateInFrom(variant.getStart(), + variant.getEnd()); - if (featureRange != null) - { - int featureStart = Math.min(featureRange[0], featureRange[1]); - int featureEnd = Math.max(featureRange[0], featureRange[1]); - count += addAlleleFeatures(seq, variant, featureStart, featureEnd, - forwardStrand); + if (featureRange != null) + { + int featureStart = Math.min(featureRange[0], featureRange[1]); + int featureEnd = Math.max(featureRange[0], featureRange[1]); + count += addAlleleFeatures(seq, variant, featureStart, + featureEnd, forwardStrand); + } } + variants.close(); + } catch (TribbleException e) + { + /* + * RuntimeException throwable by htsjdk + */ + String msg = String.format("Error reading VCF for %s:%d-%d: %s ", + map.chromosome, vcfStart, vcfEnd,e.getLocalizedMessage()); + Cache.log.error(msg); } - variants.close(); } return count; } /** - * A convenience method to get an attribute value for an alternate allele + * A convenience method to get an attribute value for an alternate allele. + * {@code alleleIndex} is the position in the list of values for the allele. + * If {@code alleleIndex == -1} then all values are concatenated (comma-separated). + * This is the case for fields declared with "Number=." i.e. values are not + * related to specific alleles. 
* * @param variant * @param attributeName @@ -895,16 +938,25 @@ public class VCFLoader { Object att = variant.getAttribute(attributeName); + String result = null; if (att instanceof String) { - return (String) att; + result = (String) att; } - else if (att instanceof ArrayList) + else if (att instanceof List) { - return ((List) att).get(alleleIndex); + List theList = (List) att; + if (alleleIndex == -1) + { + result = StringUtils.listToDelimitedString(theList, ","); + } + else + { + result = theList.get(alleleIndex); + } } - return null; + return result; } /** @@ -1006,7 +1058,20 @@ public class VCFLoader featureEnd, FEATURE_GROUP_VCF); sf.setSource(sourceId); - sf.setValue(Gff3Helper.ALLELES, alleles); + /* + * save the derived alleles as a named attribute; this will be + * needed when Jalview computes derived peptide variants + */ + addFeatureAttribute(sf, Gff3Helper.ALLELES, alleles); + + /* + * add selected VCF fixed column data as feature attributes + */ + addFeatureAttribute(sf, VCF_POS, String.valueOf(variant.getStart())); + addFeatureAttribute(sf, VCF_ID, variant.getID()); + addFeatureAttribute(sf, VCF_QUAL, + String.valueOf(variant.getPhredScaledQual())); + addFeatureAttribute(sf, VCF_FILTER, getFilter(variant)); addAlleleProperties(variant, sf, altAlleleIndex, consequence); @@ -1016,6 +1081,53 @@ public class VCFLoader } /** + * Answers the VCF FILTER value for the variant - or an approximation to it. + * This field is either PASS, or a semi-colon separated list of filters not + * passed. htsjdk saves filters as a HashSet, so the order when reassembled into + * a list may be different. 
+ * + * @param variant + * @return + */ + String getFilter(VariantContext variant) + { + Set filters = variant.getFilters(); + if (filters.isEmpty()) + { + return NO_VALUE; + } + Iterator iterator = filters.iterator(); + String first = iterator.next(); + if (filters.size() == 1) + { + return first; + } + + StringBuilder sb = new StringBuilder(first); + while (iterator.hasNext()) + { + sb.append(";").append(iterator.next()); + } + + return sb.toString(); + } + + /** + * Adds one feature attribute unless the value is null, empty or '.' + * + * @param sf + * @param key + * @param value + */ + void addFeatureAttribute(SequenceFeature sf, String key, String value) + { + if (value != null && !value.isEmpty() && !NO_VALUE.equals(value)) + { + sf.setValue(key, value); + } + } + + /** * Determines the Sequence Ontology term to use for the variant feature type in * Jalview. The default is 'sequence_variant', but a more specific term is used * if: @@ -1209,14 +1321,6 @@ public class VCFLoader } /* - * filter out fields we don't want to capture - */ - if (!vcfFieldsOfInterest.contains(key)) - { - continue; - } - - /* * we extract values for other data which are allele-specific; * these may be per alternate allele (INFO[key].Number = 'A') * or per allele including reference (INFO[key].Number = 'R') @@ -1241,6 +1345,10 @@ public class VCFLoader */ index++; } + else if (number == VCFHeaderLineCount.UNBOUNDED) // . 
+ { + index = -1; + } else if (number != VCFHeaderLineCount.A) { /* @@ -1253,14 +1361,85 @@ public class VCFLoader * take the index'th value */ String value = getAttributeValue(variant, key, index); - if (value != null) + if (value != null && isValid(variant, key, value)) { - sf.setValue(key, value); + /* + * decode colon, semicolon, equals sign, percent sign, comma (only) + * as required by the VCF specification (para 1.2) + */ + value = StringUtils.urlDecode(value, VCF_ENCODABLE); + addFeatureAttribute(sf, key, value); } } } /** + * Answers true for '.', null, or an empty value, or if the INFO type is String. + * If the INFO type is Integer or Float, answers false if the value is not in + * valid format. + * + * @param variant + * @param infoId + * @param value + * @return + */ + protected boolean isValid(VariantContext variant, String infoId, + String value) + { + if (value == null || value.isEmpty() || NO_VALUE.equals(value)) + { + return true; + } + VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(infoId); + if (infoHeader == null) + { + Cache.log.error("Field " + infoId + " has no INFO header"); + return false; + } + VCFHeaderLineType infoType = infoHeader.getType(); + try + { + if (infoType == VCFHeaderLineType.Integer) + { + Integer.parseInt(value); + } + else if (infoType == VCFHeaderLineType.Float) + { + Float.parseFloat(value); + } + } catch (NumberFormatException e) + { + logInvalidValue(variant, infoId, value); + return false; + } + return true; + } + + /** + * Logs an error message for malformed data; duplicate messages (same id and + * value) are not logged + * + * @param variant + * @param infoId + * @param value + */ + private void logInvalidValue(VariantContext variant, String infoId, + String value) + { + if (badData == null) + { + badData = new HashSet<>(); + } + String token = infoId + ":" + value; + if (!badData.contains(token)) + { + badData.add(token); + Cache.log.error(String.format("Invalid VCF data at %s:%d %s=%s", + 
variant.getContig(), variant.getStart(), infoId, value)); + } + } + + /** * Inspects CSQ data blocks (consequences) and adds attributes on the sequence * feature. *

@@ -1308,6 +1487,11 @@ public class VCFLoader String id = vepFieldsOfInterest.get(i); if (id != null) { + /* + * VCF spec requires encoding of special characters e.g. '=' + * so decode them here before storing + */ + field = StringUtils.urlDecode(field, VCF_ENCODABLE); csqValues.put(id, field); } }