From: gmungoc Date: Fri, 26 Jul 2019 18:42:30 +0000 (+0200) Subject: Merge branch 'features/JAL-3375vcfValidation' into X-Git-Tag: Release_2_11_1_0~29 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=25da4ccb5679905ada8b21e4d21fd416c392ab76;p=jalview.git Merge branch 'features/JAL-3375vcfValidation' into feature/JAL-3187linkedFeatures Conflicts: src/jalview/io/vcf/VCFLoader.java test/jalview/io/vcf/VCFLoaderTest.java --- 25da4ccb5679905ada8b21e4d21fd416c392ab76 diff --cc src/jalview/io/vcf/VCFLoader.java index bb2948d,decff23..ac707d8 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@@ -22,10 -23,10 +22,12 @@@ import jalview.util.MessageManager import java.io.File; import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; import java.util.ArrayList; import java.util.HashMap; + import java.util.HashSet; + import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@@ -52,18 -56,19 +57,31 @@@ import htsjdk.variant.vcf.VCFInfoHeader */ public class VCFLoader { + private static final String ENCODED_COMMA = "%2C"; + + private static final String ENCODED_PERCENT = "%25"; + + private static final String ENCODED_EQUALS = "%3D"; + + private static final String ENCODED_SEMICOLON = "%3B"; + + private static final String ENCODED_COLON = "%3A"; + + private static final String UTF_8 = "UTF-8"; + + /* + * Jalview feature attributes for VCF fixed column data + */ + private static final String VCF_POS = "POS"; + + private static final String VCF_ID = "ID"; + + private static final String VCF_QUAL = "QUAL"; + + private static final String VCF_FILTER = "FILTER"; + + private static final String NO_VALUE = VCFConstants.MISSING_VALUE_v4; // '.' + private static final String DEFAULT_SPECIES = "homo_sapiens"; /** @@@ -1239,44 -1328,80 +1334,110 @@@ * take the index'th value */ String value = getAttributeValue(variant, key, index); - if (value != null) + if (value != null && isValid(variant, key, value)) { + value = decodeSpecialCharacters(value); - sf.setValue(key, value); + addFeatureAttribute(sf, key, value); } } } /** + * Decodes colon, semicolon, equals sign, percent sign, comma to their decoded + * form. The VCF specification (para 1.2) requires these to be encoded where not + * used with their special meaning in the VCF syntax. Note that general URL + * decoding should not be applied, since this would incorrectly decode (for + * example) a '+' sign. + * + * @param value + * @return + */ + protected static String decodeSpecialCharacters(String value) + { + /* + * avoid regex compilation if it is not needed! + */ + if (!value.contains(ENCODED_COLON) && !value.contains(ENCODED_SEMICOLON) + && !value.contains(ENCODED_EQUALS) + && !value.contains(ENCODED_PERCENT) + && !value.contains(ENCODED_COMMA)) + { + return value; + } + + value = value.replace(ENCODED_COLON, ":") + .replace(ENCODED_SEMICOLON, ";").replace(ENCODED_EQUALS, "=") + .replace(ENCODED_PERCENT, "%").replace(ENCODED_COMMA, ","); + return value; + } + + /** + * Answers true for '.', null, or an empty value, or if the INFO type is String. + * If the INFO type is Integer or Float, answers false if the value is not in + * valid format. + * + * @param variant + * @param infoId + * @param value + * @return + */ + protected boolean isValid(VariantContext variant, String infoId, + String value) + { + if (value == null || value.isEmpty() || NO_VALUE.equals(value)) + { + return true; + } + VCFInfoHeaderLine infoHeader = header.getInfoHeaderLine(infoId); + if (infoHeader == null) + { + Cache.log.error("Field " + infoId + " has no INFO header"); + return false; + } + VCFHeaderLineType infoType = infoHeader.getType(); + try + { + if (infoType == VCFHeaderLineType.Integer) + { + Integer.parseInt(value); + } + else if (infoType == VCFHeaderLineType.Float) + { + Float.parseFloat(value); + } + } catch (NumberFormatException e) + { + logInvalidValue(variant, infoId, value); + return false; + } + return true; + } + + /** + * Logs an error message for malformed data; duplicate messages (same id and + * value) are not logged + * + * @param variant + * @param infoId + * @param value + */ + private void logInvalidValue(VariantContext variant, String infoId, + String value) + { + if (badData == null) + { + badData = new HashSet<>(); + } + String token = infoId + ":" + value; + if (!badData.contains(token)) + { + badData.add(token); + Cache.log.error(String.format("Invalid VCF data at %s:%d %s=%s", + variant.getContig(), variant.getStart(), infoId, value)); + } + } + + /** * Inspects CSQ data blocks (consequences) and adds attributes on the sequence * feature. *

diff --cc test/jalview/io/vcf/VCFLoaderTest.java index 999fc6c,1e88665..87cf727 --- a/test/jalview/io/vcf/VCFLoaderTest.java +++ b/test/jalview/io/vcf/VCFLoaderTest.java @@@ -1,8 -1,9 +1,10 @@@ package jalview.io.vcf; + import static jalview.io.gff.SequenceOntologyI.SEQUENCE_VARIANT; import static org.testng.Assert.assertEquals; + import static org.testng.Assert.assertNull; import static org.testng.Assert.assertSame; +import static org.testng.Assert.assertTrue; import jalview.bin.Cache; import jalview.datamodel.AlignmentI; @@@ -11,12 -12,14 +13,12 @@@ import jalview.datamodel.Mapping import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; + import jalview.datamodel.features.FeatureAttributes; -import jalview.datamodel.features.FeatureAttributes.Datatype; import jalview.datamodel.features.SequenceFeatures; import jalview.gui.AlignFrame; import jalview.io.DataSourceType; import jalview.io.FileLoader; import jalview.io.gff.Gff3Helper; --import jalview.io.gff.SequenceOntologyI; import jalview.util.MapList; import java.io.File; @@@ -180,21 -221,27 +220,14 @@@ public class VCFLoaderTes } } List proteinFeatures = peptide.getSequenceFeatures(); - assertEquals(proteinFeatures.size(), 3); - sf = proteinFeatures.get(0); - assertEquals(sf.getFeatureGroup(), "VCF"); - assertEquals(sf.getBegin(), 1); - assertEquals(sf.getEnd(), 1); - assertEquals(sf.getType(), SequenceOntologyI.NONSYNONYMOUS_VARIANT); - assertEquals(sf.getDescription(), "p.Ser1Thr"); /* - * check that sequence_variant attribute AF has been clocked as - * numeric with correct min and max values - * (i.e. invalid values have been ignored - JAL-3375) + * JAL-3187 don't precompute protein features, do dynamically instead */ - FeatureAttributes fa = FeatureAttributes.getInstance(); - assertSame(fa.getDatatype(SEQUENCE_VARIANT, "AF"), Datatype.Number); - float[] minmax = fa.getMinMax(SEQUENCE_VARIANT, "AF"); - assertEquals(minmax[0], 0.002f); - assertEquals(minmax[1], 0.005f); + assertTrue(proteinFeatures.isEmpty()); - // assertEquals(proteinFeatures.size(), 1); - // sf = proteinFeatures.get(0); - // assertEquals(sf.getFeatureGroup(), "VCF"); - // assertEquals(sf.getBegin(), 1); - // assertEquals(sf.getEnd(), 1); - // assertEquals(sf.getType(), SequenceOntologyI.NONSYNONYMOUS_VARIANT); - // assertEquals(sf.getDescription(), "p.Ser1Thr"); } - private File makeVcf() throws IOException + private File makeVcfFile() throws IOException { File f = File.createTempFile("Test", ".vcf"); f.deleteOnExit(); @@@ -452,17 -500,13 +486,11 @@@ } } List proteinFeatures = peptide.getSequenceFeatures(); - assertEquals(proteinFeatures.size(), 3); - sf = proteinFeatures.get(0); - assertEquals(sf.getFeatureGroup(), "VCF"); - assertEquals(sf.getBegin(), 6); - assertEquals(sf.getEnd(), 6); - assertEquals(sf.getType(), SequenceOntologyI.NONSYNONYMOUS_VARIANT); - assertEquals(sf.getDescription(), "p.Ala6Gly"); ++ + /* + * JAL-3187 don't precompute protein features, do dynamically instead + */ + assertTrue(proteinFeatures.isEmpty()); - // assertEquals(proteinFeatures.size(), 1); - // sf = proteinFeatures.get(0); - // assertEquals(sf.getFeatureGroup(), "VCF"); - // assertEquals(sf.getBegin(), 6); - // assertEquals(sf.getEnd(), 6); - // assertEquals(sf.getType(), SequenceOntologyI.NONSYNONYMOUS_VARIANT); - // assertEquals(sf.getDescription(), "p.Ala6Gly"); } /**