From 3b3c59cfa50e942d2fa5b367b7117cade9459ce7 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Thu, 22 Aug 2019 15:48:38 +0100 Subject: [PATCH] JAL-3121 round trip GFF attributes including map-valued attributes --- src/jalview/datamodel/SequenceFeature.java | 45 ++---- src/jalview/ext/ensembl/EnsemblSeqProxy.java | 12 -- src/jalview/io/FeaturesFile.java | 153 +++++++++++++++------ src/jalview/io/gff/Gff3Helper.java | 5 + src/jalview/io/gff/GffHelperBase.java | 74 ++++++++-- src/jalview/io/gff/GffHelperI.java | 1 + src/jalview/io/vcf/VCFLoader.java | 57 ++------ src/jalview/project/Jalview2XML.java | 6 +- src/jalview/util/StringUtils.java | 126 ++++++++++++++++- src/jalview/ws/dbsources/EmblXmlSource.java | 12 -- test/jalview/ext/ensembl/EnsemblSeqProxyTest.java | 7 - test/jalview/io/FeaturesFileTest.java | 33 +++-- test/jalview/io/vcf/VCFLoaderTest.java | 13 -- test/jalview/util/StringUtilsTest.java | 64 ++++++++- 14 files changed, 410 insertions(+), 198 deletions(-) diff --git a/src/jalview/datamodel/SequenceFeature.java b/src/jalview/datamodel/SequenceFeature.java index c8a7def..2dd9cf0 100755 --- a/src/jalview/datamodel/SequenceFeature.java +++ b/src/jalview/datamodel/SequenceFeature.java @@ -28,7 +28,7 @@ import jalview.datamodel.features.FeatureSources; import jalview.util.StringUtils; import java.util.Comparator; -import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.Map; import java.util.Map.Entry; import java.util.SortedMap; @@ -50,10 +50,10 @@ public class SequenceFeature implements FeatureLocationI private static final String STATUS = "status"; - private static final String STRAND = "STRAND"; + public static final String STRAND = "STRAND"; - // private key for Phase designed not to conflict with real GFF data - private static final String PHASE = "!Phase"; + // key for Phase designed not to conflict with real GFF data + public static final String PHASE = "!Phase"; // private key for ENA location designed not to conflict with real GFF data private static final String LOCATION = "!Location"; @@ -61,12 +61,6 @@ public class SequenceFeature implements FeatureLocationI private static final String ROW_DATA = "%s%s%s"; /* - * ATTRIBUTES is reserved for the GFF 'column 9' data, formatted as - * name1=value1;name2=value2,value3;...etc - */ - private static final String ATTRIBUTES = "ATTRIBUTES"; - - /* * type, begin, end, featureGroup, score and contactFeature are final * to ensure that the integrity of SequenceFeatures data store * can't be broken by direct update of these fields @@ -174,19 +168,13 @@ public class SequenceFeature implements FeatureLocationI if (sf.otherDetails != null) { - otherDetails = new HashMap<>(); - for (Entry entry : sf.otherDetails.entrySet()) - { - otherDetails.put(entry.getKey(), entry.getValue()); - } + otherDetails = new LinkedHashMap<>(); + otherDetails.putAll(sf.otherDetails); } if (sf.links != null && sf.links.size() > 0) { links = new Vector<>(); - for (int i = 0, iSize = sf.links.size(); i < iSize; i++) - { - links.addElement(sf.links.elementAt(i)); - } + links.addAll(sf.links); } } @@ -440,7 +428,10 @@ public class SequenceFeature implements FeatureLocationI { if (otherDetails == null) { - otherDetails = new HashMap<>(); + /* + * LinkedHashMap preserves insertion order of attributes + */ + otherDetails = new LinkedHashMap<>(); } otherDetails.put(key, value); @@ -483,16 +474,6 @@ public class SequenceFeature implements FeatureLocationI return (String) getValue(STATUS); } - public void setAttributes(String attr) - { - setValue(ATTRIBUTES, attr); - } - - public String getAttributes() - { - return (String) getValue(ATTRIBUTES); - } - /** * Return 1 for forward strand ('+' in GFF), -1 for reverse strand ('-' in * GFF), and 0 for unknown or not (validly) specified @@ -643,10 +624,6 @@ public class SequenceFeature implements FeatureLocationI for (Entry entry : ordered.entrySet()) { String key = entry.getKey(); - if (ATTRIBUTES.equals(key)) - { - continue; // to avoid double reporting - } Object value = entry.getValue(); if (value instanceof Map) diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index 001e18e..b22b9c7 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -724,18 +724,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient String comp = complement.toString(); sf.setValue(Gff3Helper.ALLELES, comp); sf.setDescription(comp); - - /* - * replace value of "alleles=" in sf.ATTRIBUTES as well - * so 'output as GFF' shows reverse complement alleles - */ - String atts = sf.getAttributes(); - if (atts != null) - { - atts = atts.replace(Gff3Helper.ALLELES + "=" + alleles, - Gff3Helper.ALLELES + "=" + comp); - sf.setAttributes(atts); - } } /** diff --git a/src/jalview/io/FeaturesFile.java b/src/jalview/io/FeaturesFile.java index a69788b..9a4dc0e 100755 --- a/src/jalview/io/FeaturesFile.java +++ b/src/jalview/io/FeaturesFile.java @@ -36,7 +36,6 @@ import jalview.datamodel.SequenceI; import jalview.datamodel.features.FeatureMatcherSet; import jalview.datamodel.features.FeatureMatcherSetI; import jalview.gui.Desktop; -import jalview.io.gff.GffHelperBase; import jalview.io.gff.GffHelperFactory; import jalview.io.gff.GffHelperI; import jalview.schemes.FeatureColour; @@ -75,6 +74,12 @@ import java.util.TreeMap; */ public class FeaturesFile extends AlignFile implements FeaturesSourceI { + /* + * map-valued attributes are prefixed with this for output to GFF3; + * the prefix is removed if found on reading + */ + public static final String MAP_ATTRIBUTE_PREFIX = "jvmap_"; + private static final String TAB_REGEX = "\\t"; private static final String STARTGROUP = "STARTGROUP"; @@ -1126,12 +1131,112 @@ public class FeaturesFile extends AlignFile implements FeaturesSourceI String phase = sf.getPhase(); out.append(phase == null ? "." : phase); - // miscellaneous key-values (GFF column 9) - String attributes = sf.getAttributes(); - if (attributes != null) + if (sf.otherDetails != null && !sf.otherDetails.isEmpty()) + { + Map map = sf.otherDetails; + formatAttributes(out, map); + } + } + + /** + * A helper method that outputs attributes stored in the map as + * semicolon-delimited values e.g. + * + *
+   * AC_Male=0;AF_NFE=0.00000e 00;Hom_FIN=0;GQ_MEDIAN=9
+   * 
+ * + * A map-valued attribute is formatted as a comma-delimited list within braces, + * for example + * + *
+   * jvmap_CSQ={ALLELE_NUM=1,UNIPARC=UPI0002841053,Feature=ENST00000585561}
+   * 
+ * + * The {@code jvmap_} prefix designates a values map and is removed if the value + * is parsed when read in. (The GFF3 specification allows 'semi-structured data' + * to be represented provided the attribute name begins with a lower case + * letter.) + * + * @param sb + * @param map + * @see http://gmod.org/wiki/GFF3#GFF3_Format + */ + void formatAttributes(StringBuilder sb, Map map) + { + sb.append(TAB); + boolean first = true; + for (String key : map.keySet()) + { + if (SequenceFeature.STRAND.equals(key) + || SequenceFeature.PHASE.equals(key)) + { + /* + * values stashed in map but output to their own columns + */ + continue; + } + { + if (!first) + { + sb.append(";"); + } + } + first = false; + Object value = map.get(key); + if (value instanceof Map) + { + formatMapAttribute(sb, key, (Map) value); + } + else + { + String formatted = StringUtils.urlEncode(value.toString(), + GffHelperI.GFF_ENCODABLE); + sb.append(key).append("=").append(formatted); + } + } + } + + /** + * Formats the map entries as + * + *
+   * jvmap_key={key1=value1,key2=value2,...}
+   * 
+ * + * and appends this to the string buffer + * + * @param sb + * @param key + * @param map + */ + private void formatMapAttribute(StringBuilder sb, String key, + Map map) + { + if (map == null || map.isEmpty()) + { + return; + } + + /* + * AbstractMap.toString would be a shortcut here, but more reliable + * to code the required format in case toString changes in future + */ + sb.append(MAP_ATTRIBUTE_PREFIX).append(key).append("={"); + boolean first = true; + for (Entry entry : map.entrySet()) { - out.append(TAB).append(attributes); + if (!first) + { + sb.append(","); + } + first = false; + sb.append(entry.getKey().toString()).append("="); + String formatted = StringUtils.urlEncode(entry.getValue().toString(), + GffHelperI.GFF_ENCODABLE); + sb.append(formatted); } + sb.append("}"); } /** @@ -1139,11 +1244,11 @@ public class FeaturesFile extends AlignFile implements FeaturesSourceI * format) * * @param alignedRegions - * a list of "Align fromStart toStart fromCount" + * a list of "Align fromStart toStart fromCount" * @param mapIsFromCdna - * if true, 'from' is dna, else 'from' is protein + * if true, 'from' is dna, else 'from' is protein * @param strand - * either 1 (forward) or -1 (reverse) + * either 1 (forward) or -1 (reverse) * @return * @throws IOException */ @@ -1279,38 +1384,6 @@ public class FeaturesFile extends AlignFile implements FeaturesSourceI } /** - * Process the 'column 9' data of the GFF file. This is less formally defined, - * and its interpretation will vary depending on the tool that has generated - * it. - * - * @param attributes - * @param sf - */ - protected void processGffColumnNine(String attributes, SequenceFeature sf) - { - sf.setAttributes(attributes); - - /* - * Parse attributes in column 9 and add them to the sequence feature's - * 'otherData' table; use Note as a best proxy for description - */ - char nameValueSeparator = gffVersion == 3 ? '=' : ' '; - // TODO check we don't break GFF2 values which include commas here - Map> nameValues = GffHelperBase - .parseNameValuePairs(attributes, ";", nameValueSeparator, ","); - for (Entry> attr : nameValues.entrySet()) - { - String values = StringUtils.listToDelimitedString(attr.getValue(), - "; "); - sf.setValue(attr.getKey(), values); - if (NOTE.equals(attr.getKey())) - { - sf.setDescription(values); - } - } - } - - /** * After encountering ##fasta in a GFF3 file, process the remainder of the * file as FAST sequence data. Any placeholder sequences created during * feature parsing are updated with the actual sequences. diff --git a/src/jalview/io/gff/Gff3Helper.java b/src/jalview/io/gff/Gff3Helper.java index a25a014..56ec5ef 100644 --- a/src/jalview/io/gff/Gff3Helper.java +++ b/src/jalview/io/gff/Gff3Helper.java @@ -424,6 +424,11 @@ public class Gff3Helper extends GffHelperBase desc = (String) sf.getValue(ID); } + /* + * and decode comma, equals, semi-colon as required by GFF3 spec + */ + desc = StringUtils.urlDecode(desc, GFF_ENCODABLE); + return desc; } } diff --git a/src/jalview/io/gff/GffHelperBase.java b/src/jalview/io/gff/GffHelperBase.java index 1d4d3ac..d034c8d 100644 --- a/src/jalview/io/gff/GffHelperBase.java +++ b/src/jalview/io/gff/GffHelperBase.java @@ -27,6 +27,7 @@ import jalview.datamodel.MappingType; import jalview.datamodel.SequenceDummy; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; +import jalview.io.FeaturesFile; import jalview.util.MapList; import jalview.util.StringUtils; @@ -286,7 +287,7 @@ public abstract class GffHelperBase implements GffHelperI String namesDelimiter, char nameValueSeparator, String valuesDelimiter) { - Map> map = new HashMap>(); + Map> map = new HashMap<>(); if (text == null || text.trim().length() == 0) { return map; @@ -314,7 +315,7 @@ public abstract class GffHelperBase implements GffHelperI List vals = map.get(key); if (vals == null) { - vals = new ArrayList(); + vals = new ArrayList<>(); map.put(key, vals); } for (String val : values.split(valuesDelimiter)) @@ -357,8 +358,7 @@ public abstract class GffHelperBase implements GffHelperI int end = Integer.parseInt(gff[END_COL]); /* - * default 'score' is 0 rather than Float.NaN as the latter currently - * disables the 'graduated colour => colour by label' option + * default 'score' is 0 rather than Float.NaN - see JAL-2554 */ float score = 0f; try @@ -379,22 +379,66 @@ public abstract class GffHelperBase implements GffHelperI if (attributes != null) { /* - * save 'raw' column 9 to allow roundtrip output as input - */ - sf.setAttributes(gff[ATTRIBUTES_COL]); - - /* * Add attributes in column 9 to the sequence feature's - * 'otherData' table; use Note as a best proxy for description + * 'otherData' table; use Note as a best proxy for description; + * decode any encoded comma, equals, semi-colon as per GFF3 spec */ for (Entry> attr : attributes.entrySet()) { - String values = StringUtils.listToDelimitedString(attr.getValue(), - ","); - sf.setValue(attr.getKey(), values); - if (NOTE.equals(attr.getKey())) + String key = attr.getKey(); + List value = attr.getValue(); + if (key.startsWith(FeaturesFile.MAP_ATTRIBUTE_PREFIX)) + { + /* + * e.g. jvmap_CSQ={ALLELE_NUM=1,CDS_position=249,Codons=caG/caT} + */ + String trueKey = key + .substring(FeaturesFile.MAP_ATTRIBUTE_PREFIX.length()); + if (trueKey.isEmpty() || value.isEmpty() + || !value.get(0).startsWith("{") + || !value.get(value.size() - 1).endsWith("}")) + { + System.err.println("Malformed GFF data '" + value.toString() + + "' for " + key); + continue; + } + Map values = new HashMap<>(); + for (String entry : value) + { + if (entry.startsWith("{")) + { + entry = entry.substring(1); + } + if (entry.endsWith("}")) + { + entry = entry.substring(0, entry.length() - 1); + } + String[] fields = entry.split(","); + for (String field : fields) + { + String[] keyValue = field.split("="); + if (keyValue.length == 2) + { + String theKey = StringUtils.urlDecode(keyValue[0], + GFF_ENCODABLE); + String theValue = StringUtils.urlDecode(keyValue[1], + GFF_ENCODABLE); + values.put(theKey, theValue); + } + } + } + sf.setValue(trueKey, values); + } + else { - sf.setDescription(values); + String values = StringUtils + .listToDelimitedString(value, ","); + values = StringUtils.urlDecode(values, GFF_ENCODABLE); + sf.setValue(key, values); + if (NOTE.equals(key)) + { + sf.setDescription(values); + } } } } diff --git a/src/jalview/io/gff/GffHelperI.java b/src/jalview/io/gff/GffHelperI.java index 7fbcf5c..8b341ac 100644 --- a/src/jalview/io/gff/GffHelperI.java +++ b/src/jalview/io/gff/GffHelperI.java @@ -35,6 +35,7 @@ import java.util.List; */ public interface GffHelperI { + final String GFF_ENCODABLE = ",=;"; final String RENAME_TOKEN = "$RENAME_TO$"; diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index ac707d8..cbdd66c 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -19,11 +19,10 @@ import jalview.io.gff.SequenceOntologyI; import jalview.util.MapList; import jalview.util.MappingUtils; import jalview.util.MessageManager; +import jalview.util.StringUtils; import java.io.File; import java.io.IOException; -import java.io.UnsupportedEncodingException; -import java.net.URLDecoder; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; @@ -57,17 +56,7 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine; */ public class VCFLoader { - private static final String ENCODED_COMMA = "%2C"; - - private static final String ENCODED_PERCENT = "%25"; - - private static final String ENCODED_EQUALS = "%3D"; - - private static final String ENCODED_SEMICOLON = "%3B"; - - private static final String ENCODED_COLON = "%3A"; - - private static final String UTF_8 = "UTF-8"; + private static final String VCF_ENCODABLE = ":;=%,"; /* * Jalview feature attributes for VCF fixed column data @@ -1336,42 +1325,17 @@ public class VCFLoader String value = getAttributeValue(variant, key, index); if (value != null && isValid(variant, key, value)) { - value = decodeSpecialCharacters(value); + /* + * decode colon, semicolon, equals sign, percent sign, comma (only) + * as required by the VCF specification (para 1.2) + */ + value = StringUtils.urlDecode(value, VCF_ENCODABLE); addFeatureAttribute(sf, key, value); } } } /** - * Decodes colon, semicolon, equals sign, percent sign, comma to their decoded - * form. The VCF specification (para 1.2) requires these to be encoded where not - * used with their special meaning in the VCF syntax. Note that general URL - * decoding should not be applied, since this would incorrectly decode (for - * example) a '+' sign. - * - * @param value - * @return - */ - protected static String decodeSpecialCharacters(String value) - { - /* - * avoid regex compilation if it is not needed! - */ - if (!value.contains(ENCODED_COLON) && !value.contains(ENCODED_SEMICOLON) - && !value.contains(ENCODED_EQUALS) - && !value.contains(ENCODED_PERCENT) - && !value.contains(ENCODED_COMMA)) - { - return value; - } - - value = value.replace(ENCODED_COLON, ":") - .replace(ENCODED_SEMICOLON, ";").replace(ENCODED_EQUALS, "=") - .replace(ENCODED_PERCENT, "%").replace(ENCODED_COMMA, ","); - return value; - } - - /** * Answers true for '.', null, or an empty value, or if the INFO type is String. * If the INFO type is Integer or Float, answers false if the value is not in * valid format. @@ -1489,12 +1453,7 @@ public class VCFLoader * VCF spec requires encoding of special characters e.g. '=' * so decode them here before storing */ - try - { - field = URLDecoder.decode(field, UTF_8); - } catch (UnsupportedEncodingException e) - { - } + field = StringUtils.urlDecode(field, VCF_ENCODABLE); csqValues.put(id, field); } } diff --git a/src/jalview/project/Jalview2XML.java b/src/jalview/project/Jalview2XML.java index 2d8a4a6..ca0423b 100644 --- a/src/jalview/project/Jalview2XML.java +++ b/src/jalview/project/Jalview2XML.java @@ -3336,8 +3336,10 @@ public class Jalview2XML || tmpSeq.getEnd() != jseq.getEnd()) { System.err.println( - "Warning JAL-2154 regression: updating start/end for sequence " - + tmpSeq.toString() + " to " + jseq); + String.format("Warning JAL-2154 regression: updating start/end for sequence %s from %d/%d to %d/%d", + tmpSeq.getName(), tmpSeq.getStart(), + tmpSeq.getEnd(), jseq.getStart(), + jseq.getEnd())); } } else diff --git a/src/jalview/util/StringUtils.java b/src/jalview/util/StringUtils.java index 2e8ace8..1f114a8 100644 --- a/src/jalview/util/StringUtils.java +++ b/src/jalview/util/StringUtils.java @@ -20,6 +20,8 @@ */ package jalview.util; +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; @@ -29,8 +31,16 @@ public class StringUtils private static final Pattern DELIMITERS_PATTERN = Pattern .compile(".*='[^']*(?!')"); + private static final char PERCENT = '%'; + private static final boolean DEBUG = false; + /* + * URL encoded characters, indexed by char value + * e.g. urlEncodings['='] = urlEncodings[61] = "%3D" + */ + private static String[] urlEncodings = new String[255]; + /** * Returns a new character array, after inserting characters into the given * character array. @@ -146,7 +156,7 @@ public class StringUtils { return null; } - List jv = new ArrayList(); + List jv = new ArrayList<>(); int cp = 0, pos, escape; boolean wasescaped = false, wasquoted = false; String lstitem = null; @@ -444,4 +454,118 @@ public class StringUtils } return text; } + + /** + * Answers the input string with any occurrences of the 'encodeable' characters + * replaced by their URL encoding + * + * @param s + * @param encodable + * @return + */ + public static String urlEncode(String s, String encodable) + { + if (s == null || s.isEmpty()) + { + return s; + } + + /* + * do % encoding first, as otherwise it may double-encode! + */ + if (encodable.indexOf(PERCENT) != -1) + { + s = urlEncode(s, PERCENT); + } + + for (char c : encodable.toCharArray()) + { + if (c != PERCENT) + { + s = urlEncode(s, c); + } + } + return s; + } + + /** + * Answers the input string with any occurrences of {@code c} replaced with + * their url encoding. Answers the input string if it is unchanged. + * + * @param s + * @param c + * @return + */ + static String urlEncode(String s, char c) + { + String decoded = String.valueOf(c); + if (s.indexOf(decoded) != -1) + { + String encoded = getUrlEncoding(c); + if (!encoded.equals(decoded)) + { + s = s.replace(decoded, encoded); + } + } + return s; + } + + /** + * Answers the input string with any occurrences of the specified (unencoded) + * characters replaced by their URL decoding. + *

+ * Example: {@code urlDecode("a%3Db%3Bc", "-;=,")} should answer + * {@code "a=b;c"}. + * + * @param s + * @param encodable + * @return + */ + public static String urlDecode(String s, String encodable) + { + if (s == null || s.isEmpty()) + { + return s; + } + + for (char c : encodable.toCharArray()) + { + String encoded = getUrlEncoding(c); + if (s.indexOf(encoded) != -1) + { + String decoded = String.valueOf(c); + s = s.replace(encoded, decoded); + } + } + return s; + } + + /** + * Does a lazy lookup of the url encoding of the given character, saving the + * value for repeat lookups + * + * @param c + * @return + */ + private static String getUrlEncoding(char c) + { + if (c < 0 || c >= urlEncodings.length) + { + return String.valueOf(c); + } + + String enc = urlEncodings[c]; + if (enc == null) + { + try + { + enc = urlEncodings[c] = URLEncoder.encode(String.valueOf(c), + "UTF-8"); + } catch (UnsupportedEncodingException e) + { + enc = urlEncodings[c] = String.valueOf(c); + } + } + return enc; + } } diff --git a/src/jalview/ws/dbsources/EmblXmlSource.java b/src/jalview/ws/dbsources/EmblXmlSource.java index e114ea9..a420d9f 100644 --- a/src/jalview/ws/dbsources/EmblXmlSource.java +++ b/src/jalview/ws/dbsources/EmblXmlSource.java @@ -52,7 +52,6 @@ import java.util.Hashtable; import java.util.List; import java.util.Map; import java.util.Map.Entry; -import java.util.regex.Pattern; import javax.xml.bind.JAXBContext; import javax.xml.bind.JAXBException; @@ -68,8 +67,6 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy */ private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found."; - private static final Pattern SPACE_PATTERN = Pattern.compile(" "); - public EmblXmlSource() { super(); @@ -703,19 +700,10 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group); if (!vals.isEmpty()) { - StringBuilder sb = new StringBuilder(); - boolean first = true; for (Entry val : vals.entrySet()) { - if (!first) - { - sb.append(";"); - } - sb.append(val.getKey()).append("=").append(val.getValue()); - first = false; sf.setValue(val.getKey(), val.getValue()); } - sf.setAttributes(sb.toString()); } return sf; } diff --git a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java index 17e92c8..e17b4a6 100644 --- a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java +++ b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java @@ -21,12 +21,10 @@ package jalview.ext.ensembl; import static org.testng.AssertJUnit.assertEquals; -import static org.testng.AssertJUnit.assertSame; import jalview.datamodel.AlignmentI; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; -import jalview.datamodel.features.SequenceFeatures; import jalview.gui.JvOptionPane; import jalview.io.DataSourceType; import jalview.io.FastaFile; @@ -34,8 +32,6 @@ import jalview.io.gff.SequenceOntologyFactory; import jalview.io.gff.SequenceOntologyLite; import java.lang.reflect.Method; -import java.util.Arrays; -import java.util.List; import org.testng.Assert; import org.testng.annotations.AfterClass; @@ -223,7 +219,6 @@ public class EnsemblSeqProxyTest SequenceFeature sf = new SequenceFeature("sequence_variant", alleles, 1, 2, 0f, null); sf.setValue("alleles", alleles); - sf.setAttributes("x=y,z;alleles=" + alleles + ";a=b,c"); EnsemblSeqProxy.reverseComplementAlleles(sf); String revcomp = "G,C,GTA-,HGMD_MUTATION,gtc"; @@ -231,7 +226,5 @@ public class EnsemblSeqProxyTest assertEquals(revcomp, sf.getDescription()); // verify alleles attribute is updated with reverse complement assertEquals(revcomp, sf.getValue("alleles")); - // verify attributes string is updated with reverse complement - assertEquals("x=y,z;alleles=" + revcomp + ";a=b,c", sf.getAttributes()); } } diff --git a/test/jalview/io/FeaturesFileTest.java b/test/jalview/io/FeaturesFileTest.java index 090de6f..5154ef2 100644 --- a/test/jalview/io/FeaturesFileTest.java +++ b/test/jalview/io/FeaturesFileTest.java @@ -268,10 +268,11 @@ public class FeaturesFileTest AlignFrame af = new AlignFrame(al, 500, 500); Map colours = af.getFeatureRenderer() .getFeatureColours(); - // GFF3 uses '=' separator for name/value pairs in colum 9 + // GFF3 uses '=' separator for name/value pairs in column 9 + // comma (%2C) equals (%3D) or semi-colon (%3B) should be url-escaped in values String gffData = "##gff-version 3\n" + "FER_CAPAA\tuniprot\tMETAL\t39\t39\t0.0\t.\t.\t" - + "Note=Iron-sulfur (2Fe-2S);Note=another note;evidence=ECO:0000255|PROSITE-ProRule:PRU00465\n" + + "Note=Iron-sulfur (2Fe-2S);Note=another note;evidence=ECO%3B0000255%2CPROSITE%3DProRule:PRU00465\n" + "FER1_SOLLC\tuniprot\tPfam\t55\t130\t3.0\t.\t.\tID=$23"; FeaturesFile featuresFile = new FeaturesFile(gffData, DataSourceType.PASTE); @@ -289,9 +290,13 @@ public class FeaturesFileTest assertEquals(39, sf.end); assertEquals("uniprot", sf.featureGroup); assertEquals("METAL", sf.type); - assertEquals( - "Note=Iron-sulfur (2Fe-2S);Note=another note;evidence=ECO:0000255|PROSITE-ProRule:PRU00465", - sf.getValue("ATTRIBUTES")); + assertEquals(4, sf.otherDetails.size()); + assertEquals("ECO;0000255,PROSITE=ProRule:PRU00465", + sf.otherDetails.get("evidence")); + assertEquals("Iron-sulfur (2Fe-2S),another note", + sf.otherDetails.get("Note")); + assertEquals(".", sf.otherDetails.get("STRAND")); + assertEquals(".", sf.otherDetails.get("!Phase")); // verify feature on FER1_SOLLC1 sfs = al.getSequenceAt(2).getDatasetSequence().getSequenceFeatures(); @@ -593,9 +598,10 @@ public class FeaturesFileTest "s3dm")); SequenceFeature sf = new SequenceFeature("Pfam", "", 20, 20, 0f, "Uniprot"); - sf.setAttributes("x=y;black=white"); sf.setStrand("+"); sf.setPhase("2"); + sf.setValue("x", "y"); + sf.setValue("black", "white"); al.getSequenceAt(1).addSequenceFeature(sf); /* @@ -772,8 +778,8 @@ public class FeaturesFileTest String exported = featuresFile.printGffFormat(al.getSequencesArray(), fr, false, false); String expected = gffHeader - + "FER_CAPAA\tCath\tMETAL\t39\t39\t1.2\t.\t.\n" - + "FER_CAPAA\tCath\tMETAL\t41\t41\t0.6\t.\t.\n"; + + "FER_CAPAA\tCath\tMETAL\t39\t39\t1.2\t.\t.\tclin_sig=Likely Pathogenic;AF=24\n" + + "FER_CAPAA\tCath\tMETAL\t41\t41\t0.6\t.\t.\tclin_sig=Benign;AF=46\n"; assertEquals(expected, exported); /* @@ -786,7 +792,8 @@ public class FeaturesFileTest fr.setColour("METAL", fc); exported = featuresFile.printGffFormat(al.getSequencesArray(), fr, false, false); - expected = gffHeader + "FER_CAPAA\tCath\tMETAL\t39\t39\t1.2\t.\t.\n"; + expected = gffHeader + + "FER_CAPAA\tCath\tMETAL\t39\t39\t1.2\t.\t.\tclin_sig=Likely Pathogenic;AF=24\n"; assertEquals(expected, exported); /* @@ -795,8 +802,9 @@ public class FeaturesFileTest fc.setAboveThreshold(false); exported = featuresFile.printGffFormat(al.getSequencesArray(), fr, false, false); - expected = gffHeader + "FER_CAPAA\tCath\tMETAL\t39\t39\t1.2\t.\t.\n" - + "FER_CAPAA\tCath\tMETAL\t41\t41\t0.6\t.\t.\n"; + expected = gffHeader + + "FER_CAPAA\tCath\tMETAL\t39\t39\t1.2\t.\t.\tclin_sig=Likely Pathogenic;AF=24\n" + + "FER_CAPAA\tCath\tMETAL\t41\t41\t0.6\t.\t.\tclin_sig=Benign;AF=46\n"; assertEquals(expected, exported); /* @@ -808,7 +816,8 @@ public class FeaturesFileTest fr.setFeatureFilter("METAL", filter); exported = featuresFile.printGffFormat(al.getSequencesArray(), fr, false, false); - expected = gffHeader + "FER_CAPAA\tCath\tMETAL\t41\t41\t0.6\t.\t.\n"; + expected = gffHeader + + "FER_CAPAA\tCath\tMETAL\t41\t41\t0.6\t.\t.\tclin_sig=Benign;AF=46\n"; assertEquals(expected, exported); } diff --git a/test/jalview/io/vcf/VCFLoaderTest.java b/test/jalview/io/vcf/VCFLoaderTest.java index 87cf727..97b609d 100644 --- a/test/jalview/io/vcf/VCFLoaderTest.java +++ b/test/jalview/io/vcf/VCFLoaderTest.java @@ -3,7 +3,6 @@ package jalview.io.vcf; import static jalview.io.gff.SequenceOntologyI.SEQUENCE_VARIANT; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertNull; -import static org.testng.Assert.assertSame; import static org.testng.Assert.assertTrue; import jalview.bin.Cache; @@ -762,16 +761,4 @@ public class VCFLoaderTest assertEquals(sf.getEnd(), 15); assertEquals(sf.getDescription(), "T,C"); } - - @Test(groups = "Functional") - public void testDecodeSpecialCharacters() throws IOException - { - String encoded = "hello world"; - String decoded = VCFLoader.decodeSpecialCharacters(encoded); - assertSame(encoded, decoded); // no change needed - - encoded = "ab%3Acd%3Bef%3Dgh%25ij%2Ckl%3A"; - decoded = VCFLoader.decodeSpecialCharacters(encoded); - assertEquals(decoded, "ab:cd;ef=gh%ij,kl:"); - } } \ No newline at end of file diff --git a/test/jalview/util/StringUtilsTest.java b/test/jalview/util/StringUtilsTest.java index 084219a..9cc8d1c 100644 --- a/test/jalview/util/StringUtilsTest.java +++ b/test/jalview/util/StringUtilsTest.java @@ -145,7 +145,7 @@ public class StringUtilsTest public void testListToDelimitedString() { assertEquals("", StringUtils.listToDelimitedString(null, ";")); - List list = new ArrayList(); + List list = new ArrayList<>(); assertEquals("", StringUtils.listToDelimitedString(list, ";")); list.add("now"); assertEquals("now", StringUtils.listToDelimitedString(list, ";")); @@ -250,4 +250,66 @@ public class StringUtilsTest assertEquals("kdHydro < 12.53", StringUtils.stripHtmlTags("kdHydro < 12.53")); } + + @Test(groups = { "Functional" }) + public void testUrlEncode() + { + // degenerate cases + assertNull(StringUtils.urlEncode(null, ";,")); + assertEquals("", StringUtils.urlEncode("", "")); + assertEquals("", StringUtils.urlEncode("", ";,")); + + // sanity checks, see + // https://en.wikipedia.org/wiki/Percent-encoding#Percent-encoding_reserved_characters + assertEquals("+", StringUtils.urlEncode(" ", " ")); + assertEquals("%25", StringUtils.urlEncode("%", "%")); + assertEquals(".", StringUtils.urlEncode(".", ".")); // note . is not encoded + assertEquals("%3A", StringUtils.urlEncode(":", ":")); + assertEquals("%3B", StringUtils.urlEncode(";", ";")); + assertEquals("%3D", StringUtils.urlEncode("=", "=")); + assertEquals("%2C", StringUtils.urlEncode(",", ",")); + + // check % does not get recursively encoded! + assertEquals("a%25b%3Dc%3Bd%3Ae%2C%2C", + StringUtils.urlEncode("a%b=c;d:e,,", "=,;:%")); + + // = not in the list for encoding + assertEquals("a=b", StringUtils.urlEncode("a=b", ";,")); + + // encode = (as %3B) and ; (as %3D) + assertEquals("a%3Db.c%3B", StringUtils.urlEncode("a=b.c;", ";=,")); + + // . and space not in the list for encoding + assertEquals("a%3Db.c d", StringUtils.urlEncode("a=b.c d", ";=,")); + + // encode space also (as +) + assertEquals("a%3Db.c+d", StringUtils.urlEncode("a=b.c d", ";=, ")); + + // . does not get encoded even if requested - behaviour of URLEncoder + assertEquals("a%3Db.c+d.e%3Df", + StringUtils.urlEncode("a=b.c d.e=f", ";=,. ")); + } + + @Test(groups = { "Functional" }) + public void testUrlDecode() + { + // degenerate cases + assertNull(StringUtils.urlDecode(null, ";,")); + assertEquals("", StringUtils.urlDecode("", "")); + assertEquals("", StringUtils.urlDecode("", ";,")); + + // = not in the list for encoding + assertEquals("a%3Db", StringUtils.urlDecode("a%3Db", ";,")); + + // decode = and ; but not . + assertEquals("a=b%3Ec; d", + StringUtils.urlDecode("a%3Db%3Ec; d", ";=,")); + + // space not in the list for decoding + assertEquals("a=b;c+d", StringUtils.urlDecode("a%3Db%3Bc+d", ";=,")); + + // decode space also; %3E is not decoded to . + assertEquals("a=b%3Ec d=,", + StringUtils.urlDecode("a%3Db%3Ec+d%3D%2C", ";=, ")); + } } -- 1.7.10.2