X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2Fvcf%2FVCFLoader.java;h=b96de681829fe90765824c7d17e082a222f9c340;hb=a1984b1c8c273ed33c7ce9283039f4027dcae2de;hp=20e3ccd48192504ad0c39bfcdda69be153a16749;hpb=0a680b4ff1aaad7580d3b10941233307e2190be4;p=jalview.git diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index 20e3ccd..b96de68 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -1,32 +1,37 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see . + * The Jalview Authors are detailed in the 'AUTHORS' file. + */ package jalview.io.vcf; -import jalview.analysis.AlignmentUtils; -import jalview.analysis.Dna; -import jalview.api.AlignViewControllerGuiI; -import jalview.bin.Cache; -import jalview.datamodel.AlignmentI; -import jalview.datamodel.DBRefEntry; -import jalview.datamodel.GeneLociI; -import jalview.datamodel.Mapping; -import jalview.datamodel.SequenceFeature; -import jalview.datamodel.SequenceI; -import jalview.datamodel.features.FeatureAttributeType; -import jalview.datamodel.features.FeatureSource; -import jalview.datamodel.features.FeatureSources; -import jalview.ext.ensembl.EnsemblMap; -import jalview.ext.htsjdk.VCFReader; -import jalview.io.gff.Gff3Helper; -import jalview.io.gff.SequenceOntologyI; -import jalview.util.MapList; -import jalview.util.MappingUtils; -import jalview.util.MessageManager; +import java.util.Locale; +import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.Set; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; @@ -34,13 +39,35 @@ import htsjdk.samtools.SAMException; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.CloseableIterator; +import htsjdk.tribble.TribbleException; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineCount; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; +import jalview.analysis.Dna; +import jalview.api.AlignViewControllerGuiI; +import jalview.bin.Cache; +import jalview.datamodel.DBRefEntry; +import jalview.datamodel.GeneLociI; +import jalview.datamodel.Mapping; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; +import jalview.datamodel.features.FeatureAttributeType; +import jalview.datamodel.features.FeatureSource; +import jalview.datamodel.features.FeatureSources; +import jalview.ext.ensembl.EnsemblMap; +import jalview.ext.htsjdk.HtsContigDb; +import jalview.ext.htsjdk.VCFReader; +import jalview.io.gff.Gff3Helper; +import jalview.io.gff.SequenceOntologyI; +import jalview.util.MapList; +import jalview.util.MappingUtils; +import jalview.util.MessageManager; +import jalview.util.StringUtils; /** * A class to read VCF data (using the htsjdk) and add variants as sequence @@ -50,6 +77,23 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine; */ public class VCFLoader { + private static final String VCF_ENCODABLE = ":;=%,"; + + /* + * Jalview feature attributes for VCF fixed column data + */ + private static final String VCF_POS = "POS"; + + private static final String VCF_ID = "ID"; + + private static final String VCF_QUAL = "QUAL"; + + private static final String VCF_FILTER = "FILTER"; + + private static final String NO_VALUE = VCFConstants.MISSING_VALUE_v4; // '.' + + private static final String DEFAULT_SPECIES = "homo_sapiens"; + /** * A class to model the mapping from sequence to VCF coordinates. Cases include * * - * @param seq - * @param variant - * @param altAlleleIndex + * @param consequence * @return * @see http://www.sequenceontology.org/browser/current_svn/term/SO:0001060 */ - String getOntologyTerm(SequenceI seq, VariantContext variant, - int altAlleleIndex) + String getOntologyTerm(String consequence) { String type = SequenceOntologyI.SEQUENCE_VARIANT; + /* + * could we associate Consequence data with this allele and feature (transcript)? + * if so, prefer the consequence term from that data + */ if (csqAlleleFieldIndex == -1) // && snpEffAlleleFieldIndex == -1 { /* @@ -947,14 +1149,6 @@ public class VCFLoader return type; } - /* - * can we associate Consequence data with this allele and feature (transcript)? - * if so, prefer the consequence term from that data - */ - String consequence = getConsequenceForAlleleAndFeature(variant, - CSQ_FIELD, - altAlleleIndex, csqAlleleFieldIndex, csqAlleleNumberFieldIndex, - seq.getName().toLowerCase(), csqFeatureFieldIndex); if (consequence != null) { String[] csqFields = consequence.split(PIPE_REGEX); @@ -1034,7 +1228,7 @@ public class VCFLoader { String featureIdentifier = csqFields[featureFieldIndex]; if (featureIdentifier.length() > 4 - && seqName.indexOf(featureIdentifier.toLowerCase()) > -1) + && seqName.indexOf(featureIdentifier.toLowerCase(Locale.ROOT)) > -1) { /* * feature (transcript) matched - now check for allele match @@ -1085,13 +1279,15 @@ public class VCFLoader * Add any allele-specific VCF key-value data to the sequence feature * * @param variant - * @param seq * @param sf * @param altAlelleIndex * (0, 1..) + * @param consequence + * if not null, the consequence specific to this sequence (transcript + * feature) and allele */ - protected void addAlleleProperties(VariantContext variant, SequenceI seq, - SequenceFeature sf, final int altAlelleIndex) + protected void addAlleleProperties(VariantContext variant, + SequenceFeature sf, final int altAlelleIndex, String consequence) { Map atts = variant.getAttributes(); @@ -1105,7 +1301,7 @@ public class VCFLoader */ if (CSQ_FIELD.equals(key)) { - addConsequences(variant, seq, sf, altAlelleIndex); + addConsequences(variant, sf, consequence); continue; } @@ -1154,44 +1350,99 @@ public class VCFLoader * take the index'th value */ String value = getAttributeValue(variant, key, index); - if (value != null) + if (value != null && isValid(variant, key, value)) + { + /* + * decode colon, semicolon, equals sign, percent sign, comma (only) + * as required by the VCF specification (para 1.2) + */ + value = StringUtils.urlDecode(value, VCF_ENCODABLE); + addFeatureAttribute(sf, key, value); + } + } + } + + /** + * Answers true for '.', null, or an empty value, or if the INFO type is String. + * If the INFO type is Integer or Float, answers false if the value is not in + * valid format. + * + * @param variant + * @param infoId + * @param value + * @return + */ + protected boolean isValid(VariantContext variant, String infoId, + String value) + { + if (value == null || value.isEmpty() || NO_VALUE.equals(value)) + { + return true; + } + VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(infoId); + if (infoHeader == null) + { + Cache.error("Field " + infoId + " has no INFO header"); + return false; + } + VCFHeaderLineType infoType = infoHeader.getType(); + try + { + if (infoType == VCFHeaderLineType.Integer) + { + Integer.parseInt(value); + } + else if (infoType == VCFHeaderLineType.Float) { - sf.setValue(key, value); + Float.parseFloat(value); } + } catch (NumberFormatException e) + { + logInvalidValue(variant, infoId, value); + return false; + } + return true; + } + + /** + * Logs an error message for malformed data; duplicate messages (same id and + * value) are not logged + * + * @param variant + * @param infoId + * @param value + */ + private void logInvalidValue(VariantContext variant, String infoId, + String value) + { + if (badData == null) + { + badData = new HashSet<>(); + } + String token = infoId + ":" + value; + if (!badData.contains(token)) + { + badData.add(token); + Cache.error(String.format("Invalid VCF data at %s:%d %s=%s", + variant.getContig(), variant.getStart(), infoId, value)); } } /** * Inspects CSQ data blocks (consequences) and adds attributes on the sequence - * feature for the current allele (and transcript if applicable) - *

- * Allele matching: if field ALLELE_NUM is present, it must match - * altAlleleIndex. If not present, then field Allele value must match the VCF - * Allele. + * feature. *

- * Transcript matching: if sequence name can be identified to at least one of - * the consequences' Feature values, then select only consequences that match - * the value (i.e. consequences for the current transcript sequence). If not, - * take all consequences (this is the case when adding features to the gene - * sequence). + * If myConsequence is not null, then this is the specific + * consequence data (pipe-delimited fields) that is for the current allele and + * transcript (sequence) being processed) * * @param variant - * @param seq * @param sf - * @param altAlleleIndex - * (0, 1..) + * @param myConsequence */ - protected void addConsequences(VariantContext variant, SequenceI seq, - SequenceFeature sf, int altAlleleIndex) + protected void addConsequences(VariantContext variant, SequenceFeature sf, + String myConsequence) { - /* - * first try to identify the matching consequence - */ - String myConsequence = getConsequenceForAlleleAndFeature(variant, - CSQ_FIELD, altAlleleIndex, csqAlleleFieldIndex, - csqAlleleNumberFieldIndex, seq.getName().toLowerCase(), - csqFeatureFieldIndex); - Object value = variant.getAttribute(CSQ_FIELD); if (value == null || !(value instanceof List)) @@ -1225,6 +1476,11 @@ public class VCFLoader String id = vepFieldsOfInterest.get(i); if (id != null) { + /* + * VCF spec requires encoding of special characters e.g. '=' + * so decode them here before storing + */ + field = StringUtils.urlDecode(field, VCF_ENCODABLE); csqValues.put(id, field); } }