From 94fe1bfb6ca65c8a787a336ebffde44df6795803 Mon Sep 17 00:00:00 2001
From: gmungoc
Date: Fri, 26 Jul 2019 18:52:01 +0200
Subject: [PATCH] JAL-1793 only decode specific special characters in VCF data

---
 src/jalview/io/vcf/VCFLoader.java      | 50 +++++++++++++++++++++++++-------
 test/jalview/io/vcf/VCFLoaderTest.java | 13 +++++++++
 2 files changed, 53 insertions(+), 10 deletions(-)

diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java
index d461811..bb2948d 100644
--- a/src/jalview/io/vcf/VCFLoader.java
+++ b/src/jalview/io/vcf/VCFLoader.java
@@ -52,6 +52,16 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine;
  */
 public class VCFLoader
 {
+  private static final String ENCODED_COMMA = "%2C";
+
+  private static final String ENCODED_PERCENT = "%25";
+
+  private static final String ENCODED_EQUALS = "%3D";
+
+  private static final String ENCODED_SEMICOLON = "%3B";
+
+  private static final String ENCODED_COLON = "%3A";
+
   private static final String UTF_8 = "UTF-8";
 
   private static final String DEFAULT_SPECIES = "homo_sapiens";
@@ -1231,22 +1241,42 @@ public class VCFLoader
       String value = getAttributeValue(variant, key, index);
       if (value != null)
       {
-        /*
-         * VCF spec requires encoding of special characters e.g. '='
-         * so decode them here before storing
-         */
-        try
-        {
-          value = URLDecoder.decode(value, UTF_8);
-        } catch (UnsupportedEncodingException e)
-        {
-        }
+        value = decodeSpecialCharacters(value);
         sf.setValue(key, value);
       }
     }
   }
 
   /**
+   * Decodes the encoded forms of colon, semicolon, equals sign, percent sign
+   * and comma, which the VCF specification (para 1.2) requires to be encoded
+   * where not used with their special meaning. General URL decoding is not
+   * applied, as it would wrongly decode (for example) a '+' sign. Percent is
+   * decoded last, so an encoded literal like "%252C" gives "%2C", not ",".
+   * 
+   * @param value the attribute value as read from the VCF file (not null)
+   * @return the decoded value, or the same instance if nothing needed decoding
+   */
+  protected static String decodeSpecialCharacters(String value)
+  {
+    /*
+     * fast path - skip the replace() calls if there is nothing to decode
+     */
+    if (!value.contains(ENCODED_COLON) && !value.contains(ENCODED_SEMICOLON)
+            && !value.contains(ENCODED_EQUALS)
+            && !value.contains(ENCODED_PERCENT)
+            && !value.contains(ENCODED_COMMA))
+    {
+      return value;
+    }
+
+    // '%' must be decoded last, or the '%' it yields could be decoded again
+    return value.replace(ENCODED_COLON, ":")
+            .replace(ENCODED_SEMICOLON, ";").replace(ENCODED_EQUALS, "=")
+            .replace(ENCODED_COMMA, ",").replace(ENCODED_PERCENT, "%");
+  }
+
+  /**
    * Inspects CSQ data blocks (consequences) and adds attributes on the sequence
    * feature.
    * 

diff --git a/test/jalview/io/vcf/VCFLoaderTest.java b/test/jalview/io/vcf/VCFLoaderTest.java
index fb7a4e4..999fc6c 100644
--- a/test/jalview/io/vcf/VCFLoaderTest.java
+++ b/test/jalview/io/vcf/VCFLoaderTest.java
@@ -1,6 +1,7 @@
 package jalview.io.vcf;
 
 import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertSame;
 import static org.testng.Assert.assertTrue;
 
 import jalview.bin.Cache;
@@ -733,4 +734,16 @@ public class VCFLoaderTest
     assertEquals(sf.getEnd(), 15);
     assertEquals(sf.getDescription(), "T,C");
   }
+
+  @Test(groups = "Functional")
+  public void testDecodeSpecialCharacters() throws IOException
+  {
+    // input with nothing to decode comes back as the same instance
+    String plain = "hello world";
+    assertSame(plain, VCFLoader.decodeSpecialCharacters(plain));
+
+    String escaped = "ab%3Acd%3Bef%3Dgh%25ij%2Ckl%3A";
+    String unescaped = VCFLoader.decodeSpecialCharacters(escaped);
+    assertEquals(unescaped, "ab:cd;ef=gh%ij,kl:");
+  }
 }
\ No newline at end of file
-- 
1.7.10.2