From 3569b83486d410891bf61fe4157ac48062491e3a Mon Sep 17 00:00:00 2001 From: gmungoc Date: Fri, 5 Jul 2019 10:09:12 +0100 Subject: [PATCH] JAL-1793 apply URL decoding when saving VCF attribute values (%3D -> =) --- src/jalview/io/vcf/VCFLoader.java | 32 ++++++++++++++++++++++++-------- test/jalview/io/vcf/VCFLoaderTest.java | 2 ++ test/jalview/io/vcf/testVcf.dat | 13 ------------- test/jalview/io/vcf/testVcf.vcf | 2 +- 4 files changed, 27 insertions(+), 22 deletions(-) delete mode 100644 test/jalview/io/vcf/testVcf.dat diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index 1ac9ad7..d461811 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -22,6 +22,8 @@ import jalview.util.MessageManager; import java.io.File; import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -50,6 +52,8 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine; */ public class VCFLoader { + private static final String UTF_8 = "UTF-8"; + private static final String DEFAULT_SPECIES = "homo_sapiens"; /** @@ -1189,14 +1193,6 @@ public class VCFLoader } /* - * filter out fields we don't want to capture - */ - if (!vcfFieldsOfInterest.contains(key)) - { - continue; - } - - /* * we extract values for other data which are allele-specific; * these may be per alternate allele (INFO[key].Number = 'A') * or per allele including reference (INFO[key].Number = 'R') @@ -1235,6 +1231,16 @@ public class VCFLoader String value = getAttributeValue(variant, key, index); if (value != null) { + /* + * VCF spec requires encoding of special characters e.g. '=' + * so decode them here before storing + */ + try + { + value = URLDecoder.decode(value, UTF_8); + } catch (UnsupportedEncodingException e) + { + } sf.setValue(key, value); } } @@ -1288,6 +1294,16 @@ public class VCFLoader String id = vepFieldsOfInterest.get(i); if (id != null) { + /* + * VCF spec requires encoding of special characters e.g. '=' + * so decode them here before storing + */ + try + { + field = URLDecoder.decode(field, UTF_8); + } catch (UnsupportedEncodingException e) + { + } csqValues.put(id, field); } } diff --git a/test/jalview/io/vcf/VCFLoaderTest.java b/test/jalview/io/vcf/VCFLoaderTest.java index 20cabbd..fb7a4e4 100644 --- a/test/jalview/io/vcf/VCFLoaderTest.java +++ b/test/jalview/io/vcf/VCFLoaderTest.java @@ -504,6 +504,7 @@ public class VCFLoaderTest // gene features include Consequence for all transcripts Map map = (Map) sf.getValue("CSQ"); assertEquals(map.size(), 9); + assertEquals(map.get("PolyPhen"), "Bad"); sf = geneFeatures.get(1); assertEquals(sf.getBegin(), 5); @@ -513,6 +514,7 @@ public class VCFLoaderTest assertEquals(sf.getValue("alleles"), "C,T"); map = (Map) sf.getValue("CSQ"); assertEquals(map.size(), 9); + assertEquals(map.get("PolyPhen"), "Bad++"); // %3B%3B decoded sf = geneFeatures.get(2); assertEquals(sf.getBegin(), 9); diff --git a/test/jalview/io/vcf/testVcf.dat b/test/jalview/io/vcf/testVcf.dat deleted file mode 100644 index 77e070c..0000000 --- a/test/jalview/io/vcf/testVcf.dat +++ /dev/null @@ -1,13 +0,0 @@ -##fileformat=VCFv4.2 -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##reference=/Homo_sapiens/GRCh38 -#CHROM POS ID REF ALT QUAL FILTER INFO -5 45051610 . C A 81.96 RF;AC0 AC=1;AF=0.1;AN=0;AF_Female=2;AB_MEDIAN=6.00000e-01;CSQ=A|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,A|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad -5 45051614 . C T 1666.64 RF AC=1;AF=0.2;AN=0;AF_Female=2;AB_MEDIAN=6.00000e-01;CSQ=T|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,T|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad -5 45051618 . CGG C 41.94 AC0 AC=1;AF=0.3;AN=0;AF_Female=2;AB_MEDIAN=6.00000e-01;CSQ=C|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,C|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad,CSQ=CGT|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,CGT|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad -5 45051622 . C G,T 224.23 RF;AC0 AC=1,2;AF=0.4,0.5;AN=0;AF_Female=2;AB_MEDIAN=6.00000e-01;CSQ=G|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,G|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad,T|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,T|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad -5 45051626 . A AC,G 433.35 RF;AC0 AC=3,4;AF=0.6,0.7;AN=0;AF_Female=2;AB_MEDIAN=6.00000e-01;CSQ=G|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,G|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad,AC|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,AC|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad diff --git a/test/jalview/io/vcf/testVcf.vcf b/test/jalview/io/vcf/testVcf.vcf index 77e070c..8a16a90 100644 --- a/test/jalview/io/vcf/testVcf.vcf +++ b/test/jalview/io/vcf/testVcf.vcf @@ -7,7 +7,7 @@ ##reference=/Homo_sapiens/GRCh38 #CHROM POS ID REF ALT QUAL FILTER INFO 5 45051610 . C A 81.96 RF;AC0 AC=1;AF=0.1;AN=0;AF_Female=2;AB_MEDIAN=6.00000e-01;CSQ=A|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,A|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad -5 45051614 . C T 1666.64 RF AC=1;AF=0.2;AN=0;AF_Female=2;AB_MEDIAN=6.00000e-01;CSQ=T|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,T|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad +5 45051614 . C T 1666.64 RF AC=1;AF=0.2;AN=0;AF_Female=2;AB_MEDIAN=6.00000e-01;CSQ=T|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,T|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad%2B%2B 5 45051618 . CGG C 41.94 AC0 AC=1;AF=0.3;AN=0;AF_Female=2;AB_MEDIAN=6.00000e-01;CSQ=C|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,C|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad,CSQ=CGT|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,CGT|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad 5 45051622 . C G,T 224.23 RF;AC0 AC=1,2;AF=0.4,0.5;AN=0;AF_Female=2;AB_MEDIAN=6.00000e-01;CSQ=G|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,G|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad,T|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,T|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad 5 45051626 . A AC,G 433.35 RF;AC0 AC=3,4;AF=0.6,0.7;AN=0;AF_Female=2;AB_MEDIAN=6.00000e-01;CSQ=G|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,G|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad,AC|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,AC|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad -- 1.7.10.2