From: gmungoc Date: Tue, 28 Jan 2020 12:30:07 +0000 (+0000) Subject: Merge branch 'feature/JAL-3121gffAttributeMap' into develop X-Git-Tag: Release_2_11_1_1~32 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=561472b407c20a9d0a49cd5e87b624e325407751;hp=89f9ad40a3e1d7f1765f3bf5369dcc3636a824ff;p=jalview.git Merge branch 'feature/JAL-3121gffAttributeMap' into develop --- diff --git a/help/help/html/features/featuresFormat.html b/help/help/html/features/featuresFormat.html index 0226175..4df0b0c 100755 --- a/help/help/html/features/featuresFormat.html +++ b/help/help/html/features/featuresFormat.html @@ -202,7 +202,11 @@ GFF data (this mixed format capability was added in Jalview 2.6).

- +

Feature attributes can be included as name=value pairs in GFF3 column 9, including (since Jalview 2.11.1.0) 'nested' sub-attributes, for example: +
alleles=G,A,C;AF=6;CSQ=SIFT=deleterious,tolerated,PolyPhen=possibly_damaging(0.907) +
where SIFT and PolyPhen are sub-attributes of CSQ. This data is preserved if features are exported in GFF format (but not, currently, + in Jalview format). +

Jalview's sequence feature format

diff --git a/src/jalview/datamodel/SequenceFeature.java b/src/jalview/datamodel/SequenceFeature.java index c8a7def..2dd9cf0 100755 --- a/src/jalview/datamodel/SequenceFeature.java +++ b/src/jalview/datamodel/SequenceFeature.java @@ -28,7 +28,7 @@ import jalview.datamodel.features.FeatureSources; import jalview.util.StringUtils; import java.util.Comparator; -import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.Map; import java.util.Map.Entry; import java.util.SortedMap; @@ -50,10 +50,10 @@ public class SequenceFeature implements FeatureLocationI private static final String STATUS = "status"; - private static final String STRAND = "STRAND"; + public static final String STRAND = "STRAND"; - // private key for Phase designed not to conflict with real GFF data - private static final String PHASE = "!Phase"; + // key for Phase designed not to conflict with real GFF data + public static final String PHASE = "!Phase"; // private key for ENA location designed not to conflict with real GFF data private static final String LOCATION = "!Location"; @@ -61,12 +61,6 @@ public class SequenceFeature implements FeatureLocationI private static final String ROW_DATA = "%s%s%s"; /* - * ATTRIBUTES is reserved for the GFF 'column 9' data, formatted as - * name1=value1;name2=value2,value3;...etc - */ - private static final String ATTRIBUTES = "ATTRIBUTES"; - - /* * type, begin, end, featureGroup, score and contactFeature are final * to ensure that the integrity of SequenceFeatures data store * can't be broken by direct update of these fields @@ -174,19 +168,13 @@ public class SequenceFeature implements FeatureLocationI if (sf.otherDetails != null) { - otherDetails = new HashMap<>(); - for (Entry entry : sf.otherDetails.entrySet()) - { - otherDetails.put(entry.getKey(), entry.getValue()); - } + otherDetails = new LinkedHashMap<>(); + otherDetails.putAll(sf.otherDetails); } if (sf.links != null && sf.links.size() > 0) { links = new Vector<>(); - for (int i = 0, iSize 
= sf.links.size(); i < iSize; i++) - { - links.addElement(sf.links.elementAt(i)); - } + links.addAll(sf.links); } } @@ -440,7 +428,10 @@ public class SequenceFeature implements FeatureLocationI { if (otherDetails == null) { - otherDetails = new HashMap<>(); + /* + * LinkedHashMap preserves insertion order of attributes + */ + otherDetails = new LinkedHashMap<>(); } otherDetails.put(key, value); @@ -483,16 +474,6 @@ public class SequenceFeature implements FeatureLocationI return (String) getValue(STATUS); } - public void setAttributes(String attr) - { - setValue(ATTRIBUTES, attr); - } - - public String getAttributes() - { - return (String) getValue(ATTRIBUTES); - } - /** * Return 1 for forward strand ('+' in GFF), -1 for reverse strand ('-' in * GFF), and 0 for unknown or not (validly) specified @@ -643,10 +624,6 @@ public class SequenceFeature implements FeatureLocationI for (Entry entry : ordered.entrySet()) { String key = entry.getKey(); - if (ATTRIBUTES.equals(key)) - { - continue; // to avoid double reporting - } Object value = entry.getValue(); if (value instanceof Map) diff --git a/src/jalview/ext/ensembl/EnsemblSeqProxy.java b/src/jalview/ext/ensembl/EnsemblSeqProxy.java index 001e18e..b22b9c7 100644 --- a/src/jalview/ext/ensembl/EnsemblSeqProxy.java +++ b/src/jalview/ext/ensembl/EnsemblSeqProxy.java @@ -724,18 +724,6 @@ public abstract class EnsemblSeqProxy extends EnsemblRestClient String comp = complement.toString(); sf.setValue(Gff3Helper.ALLELES, comp); sf.setDescription(comp); - - /* - * replace value of "alleles=" in sf.ATTRIBUTES as well - * so 'output as GFF' shows reverse complement alleles - */ - String atts = sf.getAttributes(); - if (atts != null) - { - atts = atts.replace(Gff3Helper.ALLELES + "=" + alleles, - Gff3Helper.ALLELES + "=" + comp); - sf.setAttributes(atts); - } } /** diff --git a/src/jalview/io/FeaturesFile.java b/src/jalview/io/FeaturesFile.java index a69788b..a8a3746 100755 --- a/src/jalview/io/FeaturesFile.java +++ 
b/src/jalview/io/FeaturesFile.java @@ -36,7 +36,6 @@ import jalview.datamodel.SequenceI; import jalview.datamodel.features.FeatureMatcherSet; import jalview.datamodel.features.FeatureMatcherSetI; import jalview.gui.Desktop; -import jalview.io.gff.GffHelperBase; import jalview.io.gff.GffHelperFactory; import jalview.io.gff.GffHelperI; import jalview.schemes.FeatureColour; @@ -75,6 +74,8 @@ import java.util.TreeMap; */ public class FeaturesFile extends AlignFile implements FeaturesSourceI { + private static final String EQUALS = "="; + private static final String TAB_REGEX = "\\t"; private static final String STARTGROUP = "STARTGROUP"; @@ -87,8 +88,6 @@ public class FeaturesFile extends AlignFile implements FeaturesSourceI private static final String ID_NOT_SPECIFIED = "ID_NOT_SPECIFIED"; - private static final String NOTE = "Note"; - protected static final String GFF_VERSION = "##gff-version"; private AlignmentI lastmatchedAl = null; @@ -1126,11 +1125,110 @@ public class FeaturesFile extends AlignFile implements FeaturesSourceI String phase = sf.getPhase(); out.append(phase == null ? "." : phase); - // miscellaneous key-values (GFF column 9) - String attributes = sf.getAttributes(); - if (attributes != null) + if (sf.otherDetails != null && !sf.otherDetails.isEmpty()) + { + Map map = sf.otherDetails; + formatAttributes(out, map); + } + } + + /** + * A helper method that outputs attributes stored in the map as + * semicolon-delimited values e.g. + * + *
+   * AC_Male=0;AF_NFE=0.00000e+00;Hom_FIN=0;GQ_MEDIAN=9
+   * 
+ * + * A map-valued attribute is formatted as a comma-delimited list within braces, + * for example + * + *
+   * jvmap_CSQ={ALLELE_NUM=1,UNIPARC=UPI0002841053,Feature=ENST00000585561}
+   * 
+ * + * The {@code jvmap_} prefix designates a values map and is removed if the value + * is parsed when read in. (The GFF3 specification allows 'semi-structured data' + * to be represented provided the attribute name begins with a lower case + * letter.) + * + * @param sb + * @param map + * @see http://gmod.org/wiki/GFF3#GFF3_Format + */ + void formatAttributes(StringBuilder sb, Map map) + { + sb.append(TAB); + boolean first = true; + for (String key : map.keySet()) + { + if (SequenceFeature.STRAND.equals(key) + || SequenceFeature.PHASE.equals(key)) + { + /* + * values stashed in map but output to their own columns + */ + continue; + } + { + if (!first) + { + sb.append(";"); + } + } + first = false; + Object value = map.get(key); + if (value instanceof Map) + { + formatMapAttribute(sb, key, (Map) value); + } + else + { + String formatted = StringUtils.urlEncode(value.toString(), + GffHelperI.GFF_ENCODABLE); + sb.append(key).append(EQUALS).append(formatted); + } + } + } + + /** + * Formats the map entries as + * + *
+   * key=key1=value1,key2=value2,...
+   * 
+ * + * and appends this to the string buffer + * + * @param sb + * @param key + * @param map + */ + private void formatMapAttribute(StringBuilder sb, String key, + Map map) + { + if (map == null || map.isEmpty()) + { + return; + } + + /* + * AbstractMap.toString would be a shortcut here, but more reliable + * to code the required format in case toString changes in future + */ + sb.append(key).append(EQUALS); + boolean first = true; + for (Entry entry : map.entrySet()) { - out.append(TAB).append(attributes); + if (!first) + { + sb.append(","); + } + first = false; + sb.append(entry.getKey().toString()).append(EQUALS); + String formatted = StringUtils.urlEncode(entry.getValue().toString(), + GffHelperI.GFF_ENCODABLE); + sb.append(formatted); } } @@ -1139,11 +1237,11 @@ public class FeaturesFile extends AlignFile implements FeaturesSourceI * format) * * @param alignedRegions - * a list of "Align fromStart toStart fromCount" + * a list of "Align fromStart toStart fromCount" * @param mapIsFromCdna - * if true, 'from' is dna, else 'from' is protein + * if true, 'from' is dna, else 'from' is protein * @param strand - * either 1 (forward) or -1 (reverse) + * either 1 (forward) or -1 (reverse) * @return * @throws IOException */ @@ -1279,38 +1377,6 @@ public class FeaturesFile extends AlignFile implements FeaturesSourceI } /** - * Process the 'column 9' data of the GFF file. This is less formally defined, - * and its interpretation will vary depending on the tool that has generated - * it. - * - * @param attributes - * @param sf - */ - protected void processGffColumnNine(String attributes, SequenceFeature sf) - { - sf.setAttributes(attributes); - - /* - * Parse attributes in column 9 and add them to the sequence feature's - * 'otherData' table; use Note as a best proxy for description - */ - char nameValueSeparator = gffVersion == 3 ? 
'=' : ' '; - // TODO check we don't break GFF2 values which include commas here - Map> nameValues = GffHelperBase - .parseNameValuePairs(attributes, ";", nameValueSeparator, ","); - for (Entry> attr : nameValues.entrySet()) - { - String values = StringUtils.listToDelimitedString(attr.getValue(), - "; "); - sf.setValue(attr.getKey(), values); - if (NOTE.equals(attr.getKey())) - { - sf.setDescription(values); - } - } - } - - /** * After encountering ##fasta in a GFF3 file, process the remainder of the * file as FAST sequence data. Any placeholder sequences created during * feature parsing are updated with the actual sequences. diff --git a/src/jalview/io/gff/Gff2Helper.java b/src/jalview/io/gff/Gff2Helper.java index 19045d5..a15a116 100644 --- a/src/jalview/io/gff/Gff2Helper.java +++ b/src/jalview/io/gff/Gff2Helper.java @@ -38,20 +38,10 @@ public class Gff2Helper extends GffHelperBase */ public static Map> parseNameValuePairs(String text) { - // TODO: can a value include a comma? if so it will be broken by this return parseNameValuePairs(text, ";", ' ', ","); } /** - * Return ' ' as the name-value separator used in column 9 attributes. - */ - @Override - protected char getNameValueSeparator() - { - return ' '; - } - - /** * Default processing if not overridden is just to construct a sequence * feature */ diff --git a/src/jalview/io/gff/Gff3Helper.java b/src/jalview/io/gff/Gff3Helper.java index a25a014..1ef8848 100644 --- a/src/jalview/io/gff/Gff3Helper.java +++ b/src/jalview/io/gff/Gff3Helper.java @@ -350,15 +350,6 @@ public class Gff3Helper extends GffHelperBase } /** - * Return '=' as the name-value separator used in column 9 attributes. 
- */ - @Override - protected char getNameValueSeparator() - { - return '='; - } - - /** * Modifies the default SequenceFeature in order to set the Target sequence id * as the description */ @@ -424,6 +415,11 @@ public class Gff3Helper extends GffHelperBase desc = (String) sf.getValue(ID); } + /* + * and decode comma, equals, semi-colon as required by GFF3 spec + */ + desc = StringUtils.urlDecode(desc, GFF_ENCODABLE); + return desc; } } diff --git a/src/jalview/io/gff/GffHelperBase.java b/src/jalview/io/gff/GffHelperBase.java index 1d4d3ac..3db1755 100644 --- a/src/jalview/io/gff/GffHelperBase.java +++ b/src/jalview/io/gff/GffHelperBase.java @@ -43,7 +43,13 @@ import java.util.Map.Entry; */ public abstract class GffHelperBase implements GffHelperI { - private static final String NOTE = "Note"; + private static final String INVALID_GFF_ATTRIBUTE_FORMAT = "Invalid GFF attribute format: "; + + protected static final String COMMA = ","; + + protected static final String EQUALS = "="; + + protected static final String NOTE = "Note"; /* * GFF columns 1-9 (zero-indexed): @@ -260,9 +266,12 @@ public abstract class GffHelperBase implements GffHelperI /** * Parses the input line to a map of name / value(s) pairs. For example the - * line
+ * line + * + *
    * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal
-   * 
+ *
+ * * if parsed with delimiter=";" and separators {' ', '='}
* would return a map with { Notes={Fe-S, Metal}, Method={manual curation, * prediction}, source={Pfam}}
@@ -272,57 +281,80 @@ public abstract class GffHelperBase implements GffHelperI * name), or GFF3 format (which uses '=' as the name/value delimiter, and * strictly does not allow repeat occurrences of the same name - but does * allow a comma-separated list of values). + *

+ * Returns a (possibly empty) map of lists of values by attribute name. * * @param text * @param namesDelimiter * the major delimiter between name-value pairs * @param nameValueSeparator - * one or more separators used between name and value + * separator used between name and value * @param valuesDelimiter * delimits a list of more than one value - * @return the name-values map (which may be empty but never null) + * @return */ public static Map> parseNameValuePairs(String text, String namesDelimiter, char nameValueSeparator, String valuesDelimiter) { - Map> map = new HashMap>(); + Map> map = new HashMap<>(); if (text == null || text.trim().length() == 0) { return map; } - for (String pair : text.trim().split(namesDelimiter)) + /* + * split by major delimiter (; for GFF3) + */ + for (String nameValuePair : text.trim().split(namesDelimiter)) { - pair = pair.trim(); - if (pair.length() == 0) + nameValuePair = nameValuePair.trim(); + if (nameValuePair.length() == 0) { continue; } - int sepPos = pair.indexOf(nameValueSeparator); + /* + * find name/value separator (= for GFF3) + */ + int sepPos = nameValuePair.indexOf(nameValueSeparator); if (sepPos == -1) { - // no name=value present + // no name=value found continue; } - String key = pair.substring(0, sepPos).trim(); - String values = pair.substring(sepPos + 1).trim(); - if (values.length() > 0) + String name = nameValuePair.substring(0, sepPos).trim(); + String values = nameValuePair.substring(sepPos + 1).trim(); + if (values.isEmpty()) + { + continue; + } + + List vals = map.get(name); + if (vals == null) + { + vals = new ArrayList<>(); + map.put(name, vals); + } + + /* + * if 'values' contains more name/value separators, parse as a map + * (nested sub-attribute values) + */ + if (values.indexOf(nameValueSeparator) != -1) + { + vals.add(values); + } + else { - List vals = map.get(key); - if (vals == null) - { - vals = new ArrayList(); - map.put(key, vals); - } for (String val : values.split(valuesDelimiter)) { 
vals.add(val); } } } + return map; } @@ -357,8 +389,7 @@ public abstract class GffHelperBase implements GffHelperI int end = Integer.parseInt(gff[END_COL]); /* - * default 'score' is 0 rather than Float.NaN as the latter currently - * disables the 'graduated colour => colour by label' option + * default 'score' is 0 rather than Float.NaN - see JAL-2554 */ float score = 0f; try @@ -379,22 +410,32 @@ public abstract class GffHelperBase implements GffHelperI if (attributes != null) { /* - * save 'raw' column 9 to allow roundtrip output as input - */ - sf.setAttributes(gff[ATTRIBUTES_COL]); - - /* * Add attributes in column 9 to the sequence feature's - * 'otherData' table; use Note as a best proxy for description + * 'otherData' table; use Note as a best proxy for description; + * decode any encoded comma, equals, semi-colon as per GFF3 spec */ for (Entry> attr : attributes.entrySet()) { - String values = StringUtils.listToDelimitedString(attr.getValue(), - ","); - sf.setValue(attr.getKey(), values); - if (NOTE.equals(attr.getKey())) + String key = attr.getKey(); + List values = attr.getValue(); + if (values.size() == 1 && values.get(0).contains(EQUALS)) + { + /* + * 'value' is actually nested subattributes as x=a,y=b,z=c + */ + Map valueMap = parseAttributeMap(values.get(0)); + sf.setValue(key, valueMap); + } + else { - sf.setDescription(values); + String csvValues = StringUtils.listToDelimitedString(values, + COMMA); + csvValues = StringUtils.urlDecode(csvValues, GFF_ENCODABLE); + sf.setValue(key, csvValues); + if (NOTE.equals(key)) + { + sf.setDescription(csvValues); + } } } } @@ -408,12 +449,102 @@ public abstract class GffHelperBase implements GffHelperI } /** - * Returns the character used to separate attributes names from values in GFF - * column 9. This is space for GFF2, '=' for GFF3. + * Parses a (GFF3 format) list of comma-separated key=value pairs into a Map + * of {@code key, + * value}
+ * An input string like {@code a=b,c,d=e,f=g,h} is parsed to + * + *

+   * a = "b,c"
+   * d = "e"
+   * f = "g,h"
+   * 
+ * + * @param s * * @return */ - protected abstract char getNameValueSeparator(); + protected static Map parseAttributeMap(String s) + { + Map map = new HashMap<>(); + String[] fields = s.split(EQUALS); + + /* + * format validation + */ + boolean valid = true; + if (fields.length < 2) + { + /* + * need at least A=B here + */ + valid = false; + } + else if (fields[0].isEmpty() || fields[0].contains(COMMA)) + { + /* + * A,B=C is not a valid start, nor is =C + */ + valid = false; + } + else + { + for (int i = 1; i < fields.length - 1; i++) + { + if (fields[i].isEmpty() || !fields[i].contains(COMMA)) + { + /* + * intermediate tokens must include value,name + */ + valid = false; + } + } + } + + if (!valid) + { + System.err.println(INVALID_GFF_ATTRIBUTE_FORMAT + s); + return map; + } + + int i = 0; + while (i < fields.length - 1) + { + boolean lastPair = i == fields.length - 2; + String before = fields[i]; + String after = fields[i + 1]; + + /* + * if 'key' looks like a,b,c then the last token is the + * key + */ + String theKey = before.contains(COMMA) + ? before.substring(before.lastIndexOf(COMMA) + 1) + : before; + + theKey = theKey.trim(); + if (theKey.isEmpty()) + { + System.err.println(INVALID_GFF_ATTRIBUTE_FORMAT + s); + map.clear(); + return map; + } + + /* + * if 'value' looks like a,b,c then all but the last token is the value, + * unless this is the last field (no more = to follow), in which case + * all of it makes up the value + */ + String theValue = after.contains(COMMA) && !lastPair + ? 
after.substring(0, after.lastIndexOf(COMMA)) + : after; + map.put(StringUtils.urlDecode(theKey, GFF_ENCODABLE), + StringUtils.urlDecode(theValue, GFF_ENCODABLE)); + i += 1; + } + + return map; + } /** * Returns any existing mapping held on the alignment between the given diff --git a/src/jalview/io/gff/GffHelperI.java b/src/jalview/io/gff/GffHelperI.java index 7fbcf5c..387ee60 100644 --- a/src/jalview/io/gff/GffHelperI.java +++ b/src/jalview/io/gff/GffHelperI.java @@ -35,6 +35,12 @@ import java.util.List; */ public interface GffHelperI { + /* + * GFF3 spec requires comma, equals, semi-colon, tab, percent characters to be + * encoded as %2C, %3D, %3B, %09, %25 respectively within data values + * see https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md + */ + final String GFF_ENCODABLE = ",=;\t%"; final String RENAME_TOKEN = "$RENAME_TO$"; diff --git a/src/jalview/io/vcf/VCFLoader.java b/src/jalview/io/vcf/VCFLoader.java index ac707d8..cbdd66c 100644 --- a/src/jalview/io/vcf/VCFLoader.java +++ b/src/jalview/io/vcf/VCFLoader.java @@ -19,11 +19,10 @@ import jalview.io.gff.SequenceOntologyI; import jalview.util.MapList; import jalview.util.MappingUtils; import jalview.util.MessageManager; +import jalview.util.StringUtils; import java.io.File; import java.io.IOException; -import java.io.UnsupportedEncodingException; -import java.net.URLDecoder; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; @@ -57,17 +56,7 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine; */ public class VCFLoader { - private static final String ENCODED_COMMA = "%2C"; - - private static final String ENCODED_PERCENT = "%25"; - - private static final String ENCODED_EQUALS = "%3D"; - - private static final String ENCODED_SEMICOLON = "%3B"; - - private static final String ENCODED_COLON = "%3A"; - - private static final String UTF_8 = "UTF-8"; + private static final String VCF_ENCODABLE = ":;=%,"; /* * Jalview feature attributes for VCF fixed column data 
@@ -1336,42 +1325,17 @@ public class VCFLoader String value = getAttributeValue(variant, key, index); if (value != null && isValid(variant, key, value)) { - value = decodeSpecialCharacters(value); + /* + * decode colon, semicolon, equals sign, percent sign, comma (only) + * as required by the VCF specification (para 1.2) + */ + value = StringUtils.urlDecode(value, VCF_ENCODABLE); addFeatureAttribute(sf, key, value); } } } /** - * Decodes colon, semicolon, equals sign, percent sign, comma to their decoded - * form. The VCF specification (para 1.2) requires these to be encoded where not - * used with their special meaning in the VCF syntax. Note that general URL - * decoding should not be applied, since this would incorrectly decode (for - * example) a '+' sign. - * - * @param value - * @return - */ - protected static String decodeSpecialCharacters(String value) - { - /* - * avoid regex compilation if it is not needed! - */ - if (!value.contains(ENCODED_COLON) && !value.contains(ENCODED_SEMICOLON) - && !value.contains(ENCODED_EQUALS) - && !value.contains(ENCODED_PERCENT) - && !value.contains(ENCODED_COMMA)) - { - return value; - } - - value = value.replace(ENCODED_COLON, ":") - .replace(ENCODED_SEMICOLON, ";").replace(ENCODED_EQUALS, "=") - .replace(ENCODED_PERCENT, "%").replace(ENCODED_COMMA, ","); - return value; - } - - /** * Answers true for '.', null, or an empty value, or if the INFO type is String. * If the INFO type is Integer or Float, answers false if the value is not in * valid format. @@ -1489,12 +1453,7 @@ public class VCFLoader * VCF spec requires encoding of special characters e.g. 
'=' * so decode them here before storing */ - try - { - field = URLDecoder.decode(field, UTF_8); - } catch (UnsupportedEncodingException e) - { - } + field = StringUtils.urlDecode(field, VCF_ENCODABLE); csqValues.put(id, field); } } diff --git a/src/jalview/project/Jalview2XML.java b/src/jalview/project/Jalview2XML.java index 2d8a4a6..ca0423b 100644 --- a/src/jalview/project/Jalview2XML.java +++ b/src/jalview/project/Jalview2XML.java @@ -3336,8 +3336,10 @@ public class Jalview2XML || tmpSeq.getEnd() != jseq.getEnd()) { System.err.println( - "Warning JAL-2154 regression: updating start/end for sequence " - + tmpSeq.toString() + " to " + jseq); + String.format("Warning JAL-2154 regression: updating start/end for sequence %s from %d/%d to %d/%d", + tmpSeq.getName(), tmpSeq.getStart(), + tmpSeq.getEnd(), jseq.getStart(), + jseq.getEnd())); } } else diff --git a/src/jalview/util/StringUtils.java b/src/jalview/util/StringUtils.java index 2e8ace8..1f114a8 100644 --- a/src/jalview/util/StringUtils.java +++ b/src/jalview/util/StringUtils.java @@ -20,6 +20,8 @@ */ package jalview.util; +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; @@ -29,8 +31,16 @@ public class StringUtils private static final Pattern DELIMITERS_PATTERN = Pattern .compile(".*='[^']*(?!')"); + private static final char PERCENT = '%'; + private static final boolean DEBUG = false; + /* + * URL encoded characters, indexed by char value + * e.g. urlEncodings['='] = urlEncodings[61] = "%3D" + */ + private static String[] urlEncodings = new String[255]; + /** * Returns a new character array, after inserting characters into the given * character array. 
@@ -146,7 +156,7 @@ public class StringUtils { return null; } - List jv = new ArrayList(); + List jv = new ArrayList<>(); int cp = 0, pos, escape; boolean wasescaped = false, wasquoted = false; String lstitem = null; @@ -444,4 +454,118 @@ public class StringUtils } return text; } + + /** + * Answers the input string with any occurrences of the 'encodeable' characters + * replaced by their URL encoding + * + * @param s + * @param encodable + * @return + */ + public static String urlEncode(String s, String encodable) + { + if (s == null || s.isEmpty()) + { + return s; + } + + /* + * do % encoding first, as otherwise it may double-encode! + */ + if (encodable.indexOf(PERCENT) != -1) + { + s = urlEncode(s, PERCENT); + } + + for (char c : encodable.toCharArray()) + { + if (c != PERCENT) + { + s = urlEncode(s, c); + } + } + return s; + } + + /** + * Answers the input string with any occurrences of {@code c} replaced with + * their url encoding. Answers the input string if it is unchanged. + * + * @param s + * @param c + * @return + */ + static String urlEncode(String s, char c) + { + String decoded = String.valueOf(c); + if (s.indexOf(decoded) != -1) + { + String encoded = getUrlEncoding(c); + if (!encoded.equals(decoded)) + { + s = s.replace(decoded, encoded); + } + } + return s; + } + + /** + * Answers the input string with any occurrences of the specified (unencoded) + * characters replaced by their URL decoding. + *

+ * Example: {@code urlDecode("a%3Db%3Bc", "-;=,")} should answer + * {@code "a=b;c"}. + * + * @param s + * @param encodable + * @return + */ + public static String urlDecode(String s, String encodable) + { + if (s == null || s.isEmpty()) + { + return s; + } + + for (char c : encodable.toCharArray()) + { + String encoded = getUrlEncoding(c); + if (s.indexOf(encoded) != -1) + { + String decoded = String.valueOf(c); + s = s.replace(encoded, decoded); + } + } + return s; + } + + /** + * Does a lazy lookup of the url encoding of the given character, saving the + * value for repeat lookups + * + * @param c + * @return + */ + private static String getUrlEncoding(char c) + { + if (c < 0 || c >= urlEncodings.length) + { + return String.valueOf(c); + } + + String enc = urlEncodings[c]; + if (enc == null) + { + try + { + enc = urlEncodings[c] = URLEncoder.encode(String.valueOf(c), + "UTF-8"); + } catch (UnsupportedEncodingException e) + { + enc = urlEncodings[c] = String.valueOf(c); + } + } + return enc; + } } diff --git a/src/jalview/ws/dbsources/EmblXmlSource.java b/src/jalview/ws/dbsources/EmblXmlSource.java index e114ea9..a420d9f 100644 --- a/src/jalview/ws/dbsources/EmblXmlSource.java +++ b/src/jalview/ws/dbsources/EmblXmlSource.java @@ -52,7 +52,6 @@ import java.util.Hashtable; import java.util.List; import java.util.Map; import java.util.Map.Entry; -import java.util.regex.Pattern; import javax.xml.bind.JAXBContext; import javax.xml.bind.JAXBException; @@ -68,8 +67,6 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy */ private static final String EMBL_NOT_FOUND_REPLY = "ERROR 12 No entries found."; - private static final Pattern SPACE_PATTERN = Pattern.compile(" "); - public EmblXmlSource() { super(); @@ -703,19 +700,10 @@ public abstract class EmblXmlSource extends EbiFileRetrievedProxy SequenceFeature sf = new SequenceFeature(type, desc, begin, end, group); if (!vals.isEmpty()) { - StringBuilder sb = new StringBuilder(); - boolean first = true; for 
(Entry val : vals.entrySet()) { - if (!first) - { - sb.append(";"); - } - sb.append(val.getKey()).append("=").append(val.getValue()); - first = false; sf.setValue(val.getKey(), val.getValue()); } - sf.setAttributes(sb.toString()); } return sf; } diff --git a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java index 17e92c8..e17b4a6 100644 --- a/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java +++ b/test/jalview/ext/ensembl/EnsemblSeqProxyTest.java @@ -21,12 +21,10 @@ package jalview.ext.ensembl; import static org.testng.AssertJUnit.assertEquals; -import static org.testng.AssertJUnit.assertSame; import jalview.datamodel.AlignmentI; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; -import jalview.datamodel.features.SequenceFeatures; import jalview.gui.JvOptionPane; import jalview.io.DataSourceType; import jalview.io.FastaFile; @@ -34,8 +32,6 @@ import jalview.io.gff.SequenceOntologyFactory; import jalview.io.gff.SequenceOntologyLite; import java.lang.reflect.Method; -import java.util.Arrays; -import java.util.List; import org.testng.Assert; import org.testng.annotations.AfterClass; @@ -223,7 +219,6 @@ public class EnsemblSeqProxyTest SequenceFeature sf = new SequenceFeature("sequence_variant", alleles, 1, 2, 0f, null); sf.setValue("alleles", alleles); - sf.setAttributes("x=y,z;alleles=" + alleles + ";a=b,c"); EnsemblSeqProxy.reverseComplementAlleles(sf); String revcomp = "G,C,GTA-,HGMD_MUTATION,gtc"; @@ -231,7 +226,5 @@ public class EnsemblSeqProxyTest assertEquals(revcomp, sf.getDescription()); // verify alleles attribute is updated with reverse complement assertEquals(revcomp, sf.getValue("alleles")); - // verify attributes string is updated with reverse complement - assertEquals("x=y,z;alleles=" + revcomp + ";a=b,c", sf.getAttributes()); } } diff --git a/test/jalview/io/FeaturesFileTest.java b/test/jalview/io/FeaturesFileTest.java index 090de6f..298ae6b 100644 --- 
a/test/jalview/io/FeaturesFileTest.java +++ b/test/jalview/io/FeaturesFileTest.java @@ -268,10 +268,12 @@ public class FeaturesFileTest AlignFrame af = new AlignFrame(al, 500, 500); Map colours = af.getFeatureRenderer() .getFeatureColours(); - // GFF3 uses '=' separator for name/value pairs in colum 9 + // GFF3 uses '=' separator for name/value pairs in column 9 + // comma (%2C) equals (%3D) or semi-colon (%3B) should be url-escaped in values String gffData = "##gff-version 3\n" + "FER_CAPAA\tuniprot\tMETAL\t39\t39\t0.0\t.\t.\t" - + "Note=Iron-sulfur (2Fe-2S);Note=another note;evidence=ECO:0000255|PROSITE-ProRule:PRU00465\n" + + "Note=Iron-sulfur (2Fe-2S);Note=another note,and another;evidence=ECO%3B0000255%2CPROSITE%3DProRule:PRU00465;" + + "CSQ=AF=21,POLYPHEN=benign,possibly_damaging,clin_sig=Benign%3Dgood\n" + "FER1_SOLLC\tuniprot\tPfam\t55\t130\t3.0\t.\t.\tID=$23"; FeaturesFile featuresFile = new FeaturesFile(gffData, DataSourceType.PASTE); @@ -284,14 +286,25 @@ public class FeaturesFileTest assertEquals(1, sfs.size()); SequenceFeature sf = sfs.get(0); // description parsed from Note attribute - assertEquals("Iron-sulfur (2Fe-2S),another note", sf.description); + assertEquals("Iron-sulfur (2Fe-2S),another note,and another", + sf.description); assertEquals(39, sf.begin); assertEquals(39, sf.end); assertEquals("uniprot", sf.featureGroup); assertEquals("METAL", sf.type); - assertEquals( - "Note=Iron-sulfur (2Fe-2S);Note=another note;evidence=ECO:0000255|PROSITE-ProRule:PRU00465", - sf.getValue("ATTRIBUTES")); + assertEquals(5, sf.otherDetails.size()); + assertEquals("ECO;0000255,PROSITE=ProRule:PRU00465", // url decoded + sf.getValue("evidence")); + assertEquals("Iron-sulfur (2Fe-2S),another note,and another", + sf.getValue("Note")); + assertEquals("21", sf.getValueAsString("CSQ", "AF")); + assertEquals("benign,possibly_damaging", + sf.getValueAsString("CSQ", "POLYPHEN")); + assertEquals("Benign=good", sf.getValueAsString("CSQ", "clin_sig")); // url decoded + // 
todo change STRAND and !Phase into fields of SequenceFeature instead + assertEquals(".", sf.otherDetails.get("STRAND")); + assertEquals(0, sf.getStrand()); + assertEquals(".", sf.getPhase()); // verify feature on FER1_SOLLC1 sfs = al.getSequenceAt(2).getDatasetSequence().getSequenceFeatures(); @@ -593,9 +606,14 @@ public class FeaturesFileTest "s3dm")); SequenceFeature sf = new SequenceFeature("Pfam", "", 20, 20, 0f, "Uniprot"); - sf.setAttributes("x=y;black=white"); sf.setStrand("+"); sf.setPhase("2"); + sf.setValue("x", "y"); + sf.setValue("black", "white"); + Map csq = new HashMap<>(); + csq.put("SIFT", "benign,mostly benign,cloudy, with meatballs"); + csq.put("consequence", "missense_variant"); + sf.setValue("CSQ", csq); al.getSequenceAt(1).addSequenceFeature(sf); /* @@ -660,7 +678,11 @@ public class FeaturesFileTest // Pfam feature columns include strand(+), phase(2), attributes expected = gffHeader + "FER_CAPAA\tCath\tMETAL\t39\t39\t1.2\t.\t.\n" - + "FER_CAPAN\tUniprot\tPfam\t20\t20\t0.0\t+\t2\tx=y;black=white\n" + // CSQ output as CSQ=att1=value1,att2=value2 + // note all commas are encoded here which is wrong - it should be + // SIFT=benign,mostly benign,cloudy%2C with meatballs + + "FER_CAPAN\tUniprot\tPfam\t20\t20\t0.0\t+\t2\tx=y;black=white;" + + "CSQ=SIFT=benign%2Cmostly benign%2Ccloudy%2C with meatballs,consequence=missense_variant\n" + "FER_CAPAN\ts3dm\tGAMMA-TURN\t36\t38\t2.1\t.\t.\n"; assertEquals(expected, exported); } @@ -772,8 +794,8 @@ public class FeaturesFileTest String exported = featuresFile.printGffFormat(al.getSequencesArray(), fr, false, false); String expected = gffHeader - + "FER_CAPAA\tCath\tMETAL\t39\t39\t1.2\t.\t.\n" - + "FER_CAPAA\tCath\tMETAL\t41\t41\t0.6\t.\t.\n"; + + "FER_CAPAA\tCath\tMETAL\t39\t39\t1.2\t.\t.\tclin_sig=Likely Pathogenic;AF=24\n" + + "FER_CAPAA\tCath\tMETAL\t41\t41\t0.6\t.\t.\tclin_sig=Benign;AF=46\n"; assertEquals(expected, exported); /* @@ -786,7 +808,8 @@ public class FeaturesFileTest fr.setColour("METAL", fc); 
exported = featuresFile.printGffFormat(al.getSequencesArray(), fr, false, false); - expected = gffHeader + "FER_CAPAA\tCath\tMETAL\t39\t39\t1.2\t.\t.\n"; + expected = gffHeader + + "FER_CAPAA\tCath\tMETAL\t39\t39\t1.2\t.\t.\tclin_sig=Likely Pathogenic;AF=24\n"; assertEquals(expected, exported); /* @@ -795,8 +818,9 @@ public class FeaturesFileTest fc.setAboveThreshold(false); exported = featuresFile.printGffFormat(al.getSequencesArray(), fr, false, false); - expected = gffHeader + "FER_CAPAA\tCath\tMETAL\t39\t39\t1.2\t.\t.\n" - + "FER_CAPAA\tCath\tMETAL\t41\t41\t0.6\t.\t.\n"; + expected = gffHeader + + "FER_CAPAA\tCath\tMETAL\t39\t39\t1.2\t.\t.\tclin_sig=Likely Pathogenic;AF=24\n" + + "FER_CAPAA\tCath\tMETAL\t41\t41\t0.6\t.\t.\tclin_sig=Benign;AF=46\n"; assertEquals(expected, exported); /* @@ -808,7 +832,8 @@ public class FeaturesFileTest fr.setFeatureFilter("METAL", filter); exported = featuresFile.printGffFormat(al.getSequencesArray(), fr, false, false); - expected = gffHeader + "FER_CAPAA\tCath\tMETAL\t41\t41\t0.6\t.\t.\n"; + expected = gffHeader + + "FER_CAPAA\tCath\tMETAL\t41\t41\t0.6\t.\t.\tclin_sig=Benign;AF=46\n"; assertEquals(expected, exported); } diff --git a/test/jalview/io/gff/GffHelperBaseTest.java b/test/jalview/io/gff/GffHelperBaseTest.java index 7fb716f..a23518d 100644 --- a/test/jalview/io/gff/GffHelperBaseTest.java +++ b/test/jalview/io/gff/GffHelperBaseTest.java @@ -20,9 +20,10 @@ */ package jalview.io.gff; -import static org.testng.AssertJUnit.assertEquals; -import static org.testng.AssertJUnit.assertFalse; -import static org.testng.AssertJUnit.assertTrue; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.fail; import jalview.gui.JvOptionPane; @@ -59,25 +60,38 @@ public class GffHelperBaseTest Map> map = GffHelperBase.parseNameValuePairs( "hello world", ";", ' ', ", "); - assertEquals(1, map.size()); - assertEquals(1, 
map.get("hello").size()); - assertEquals("world", map.get("hello").get(0)); + assertEquals(map.size(), 1); + assertEquals(map.get("hello").size(), 1); + assertEquals(map.get("hello").get(0), "world"); map = GffHelperBase .parseNameValuePairs( - "Method= manual curation ;nothing; Notes=F2 S ; Notes=Metal,Shiny; Type=", + "Method= manual curation ;nothing; Notes=F2 S ; Notes=Metal,Shiny%2Csmooth; Type=", ";", '=', ","); // Type is ignored as no value was supplied - assertEquals(2, map.size()); + assertEquals(map.size(), 2); - assertEquals(1, map.get("Method").size()); - assertEquals("manual curation", map.get("Method").get(0)); // trimmed + assertEquals(map.get("Method").size(), 1); + assertEquals(map.get("Method").get(0), "manual curation"); // trimmed - assertEquals(3, map.get("Notes").size()); - assertEquals("F2 S", map.get("Notes").get(0)); - assertEquals("Metal", map.get("Notes").get(1)); - assertEquals("Shiny", map.get("Notes").get(2)); + assertEquals(map.get("Notes").size(), 3); + assertEquals(map.get("Notes").get(0), "F2 S"); + assertEquals(map.get("Notes").get(1), "Metal"); + assertEquals(map.get("Notes").get(2), "Shiny%2Csmooth"); // not decoded here + + /* + * gff3 style with nested attribute values + */ + String csqValue = "POLYPHEN=possibly_damaging,probably_damaging,SIFT=tolerated%2Cdeleterious"; + map = GffHelperBase.parseNameValuePairs("hello=world;CSQ=" + csqValue, + ";", '=', ","); + assertEquals(map.size(), 2); // keys hello, CSQ + assertEquals(map.get("hello").size(), 1); + assertEquals(map.get("hello").get(0), "world"); + // CSQ values is read 'raw' here, and parsed further elsewhere + assertEquals(map.get("CSQ").size(), 1); + assertEquals(map.get("CSQ").get(0), csqValue); } /** @@ -89,110 +103,164 @@ public class GffHelperBaseTest int[] from = { 1, 12 }; int[] to = { 20, 31 }; assertTrue(GffHelperBase.trimMapping(from, to, 1, 1)); - assertEquals("[1, 12]", Arrays.toString(from)); // unchanged - assertEquals("[20, 31]", Arrays.toString(to)); // 
unchanged + assertEquals(Arrays.toString(from), "[1, 12]"); // unchanged + assertEquals(Arrays.toString(to), "[20, 31]"); // unchanged // from too long: from = new int[] { 1, 13 }; assertTrue(GffHelperBase.trimMapping(from, to, 1, 1)); - assertEquals("[1, 12]", Arrays.toString(from)); // trimmed - assertEquals("[20, 31]", Arrays.toString(to)); // unchanged + assertEquals(Arrays.toString(from), "[1, 12]"); // trimmed + assertEquals(Arrays.toString(to), "[20, 31]"); // unchanged // to too long: to = new int[] { 20, 33 }; assertTrue(GffHelperBase.trimMapping(from, to, 1, 1)); - assertEquals("[1, 12]", Arrays.toString(from)); // unchanged - assertEquals("[20, 31]", Arrays.toString(to)); // trimmed + assertEquals(Arrays.toString(from), "[1, 12]"); // unchanged + assertEquals(Arrays.toString(to), "[20, 31]"); // trimmed // from reversed: from = new int[] { 12, 1 }; assertTrue(GffHelperBase.trimMapping(from, to, 1, 1)); - assertEquals("[12, 1]", Arrays.toString(from)); // unchanged - assertEquals("[20, 31]", Arrays.toString(to)); // unchanged + assertEquals(Arrays.toString(from), "[12, 1]"); // unchanged + assertEquals(Arrays.toString(to), "[20, 31]"); // unchanged // to reversed: to = new int[] { 31, 20 }; assertTrue(GffHelperBase.trimMapping(from, to, 1, 1)); - assertEquals("[12, 1]", Arrays.toString(from)); // unchanged - assertEquals("[31, 20]", Arrays.toString(to)); // unchanged + assertEquals(Arrays.toString(from), "[12, 1]"); // unchanged + assertEquals(Arrays.toString(to), "[31, 20]"); // unchanged // from reversed and too long: from = new int[] { 14, 1 }; assertTrue(GffHelperBase.trimMapping(from, to, 1, 1)); - assertEquals("[14, 3]", Arrays.toString(from)); // end trimmed - assertEquals("[31, 20]", Arrays.toString(to)); // unchanged + assertEquals(Arrays.toString(from), "[14, 3]"); // end trimmed + assertEquals(Arrays.toString(to), "[31, 20]"); // unchanged // to reversed and too long: to = new int[] { 31, 10 }; assertTrue(GffHelperBase.trimMapping(from, to, 1, 
1)); - assertEquals("[14, 3]", Arrays.toString(from)); // unchanged - assertEquals("[31, 20]", Arrays.toString(to)); // end trimmed + assertEquals(Arrays.toString(from), "[14, 3]"); // unchanged + assertEquals(Arrays.toString(to), "[31, 20]"); // end trimmed // cdna to peptide (matching) from = new int[] { 1, 18 }; to = new int[] { 4, 9 }; assertTrue(GffHelperBase.trimMapping(from, to, 3, 1)); - assertEquals("[1, 18]", Arrays.toString(from)); // unchanged - assertEquals("[4, 9]", Arrays.toString(to)); // unchanged + assertEquals(Arrays.toString(from), "[1, 18]"); // unchanged + assertEquals(Arrays.toString(to), "[4, 9]"); // unchanged // overlong cdna to peptide from = new int[] { 1, 20 }; assertTrue(GffHelperBase.trimMapping(from, to, 3, 1)); - assertEquals("[1, 18]", Arrays.toString(from)); // end trimmed - assertEquals("[4, 9]", Arrays.toString(to)); // unchanged + assertEquals(Arrays.toString(from), "[1, 18]"); // end trimmed + assertEquals(Arrays.toString(to), "[4, 9]"); // unchanged // overlong cdna (reversed) to peptide from = new int[] { 20, 1 }; assertTrue(GffHelperBase.trimMapping(from, to, 3, 1)); - assertEquals("[20, 3]", Arrays.toString(from)); // end trimmed - assertEquals("[4, 9]", Arrays.toString(to)); // unchanged + assertEquals(Arrays.toString(from), "[20, 3]"); // end trimmed + assertEquals(Arrays.toString(to), "[4, 9]"); // unchanged // overlong cdna (reversed) to peptide (reversed) from = new int[] { 20, 1 }; to = new int[] { 9, 4 }; assertTrue(GffHelperBase.trimMapping(from, to, 3, 1)); - assertEquals("[20, 3]", Arrays.toString(from)); // end trimmed - assertEquals("[9, 4]", Arrays.toString(to)); // unchanged + assertEquals(Arrays.toString(from), "[20, 3]"); // end trimmed + assertEquals(Arrays.toString(to), "[9, 4]"); // unchanged // peptide to cdna (matching) from = new int[] { 4, 9 }; to = new int[] { 1, 18 }; assertTrue(GffHelperBase.trimMapping(from, to, 1, 3)); - assertEquals("[4, 9]", Arrays.toString(from)); // unchanged - 
assertEquals("[1, 18]", Arrays.toString(to)); // unchanged + assertEquals(Arrays.toString(from), "[4, 9]"); // unchanged + assertEquals(Arrays.toString(to), "[1, 18]"); // unchanged // peptide to overlong cdna to = new int[] { 1, 20 }; assertTrue(GffHelperBase.trimMapping(from, to, 1, 3)); - assertEquals("[4, 9]", Arrays.toString(from)); // unchanged - assertEquals("[1, 18]", Arrays.toString(to)); // end trimmed + assertEquals(Arrays.toString(from), "[4, 9]"); // unchanged + assertEquals(Arrays.toString(to), "[1, 18]"); // end trimmed // peptide to overlong cdna (reversed) to = new int[] { 20, 1 }; assertTrue(GffHelperBase.trimMapping(from, to, 1, 3)); - assertEquals("[4, 9]", Arrays.toString(from)); // unchanged - assertEquals("[20, 3]", Arrays.toString(to)); // end trimmed + assertEquals(Arrays.toString(from), "[4, 9]"); // unchanged + assertEquals(Arrays.toString(to), "[20, 3]"); // end trimmed // peptide (reversed) to overlong cdna (reversed) from = new int[] { 9, 4 }; to = new int[] { 20, 1 }; assertTrue(GffHelperBase.trimMapping(from, to, 1, 3)); - assertEquals("[9, 4]", Arrays.toString(from)); // unchanged - assertEquals("[20, 3]", Arrays.toString(to)); // end trimmed + assertEquals(Arrays.toString(from), "[9, 4]"); // unchanged + assertEquals(Arrays.toString(to), "[20, 3]"); // end trimmed // overlong peptide to word-length cdna from = new int[] { 4, 10 }; to = new int[] { 1, 18 }; assertTrue(GffHelperBase.trimMapping(from, to, 1, 3)); - assertEquals("[4, 9]", Arrays.toString(from)); // end trimmed - assertEquals("[1, 18]", Arrays.toString(to)); // unchanged + assertEquals(Arrays.toString(from), "[4, 9]"); // end trimmed + assertEquals(Arrays.toString(to), "[1, 18]"); // unchanged // overlong peptide to non-word-length cdna from = new int[] { 4, 10 }; to = new int[] { 1, 19 }; assertFalse(GffHelperBase.trimMapping(from, to, 1, 3)); - assertEquals("[4, 10]", Arrays.toString(from)); // unchanged - assertEquals("[1, 19]", Arrays.toString(to)); // unchanged + 
assertEquals(Arrays.toString(from), "[4, 10]"); // unchanged + assertEquals(Arrays.toString(to), "[1, 19]"); // unchanged + } + + @Test(groups = { "Functional" }) + public void testParseAttributeMap() + { + Map map = GffHelperBase + .parseAttributeMap("A=B,C%2C%3D%3B%09%25D,X=Y"); + assertEquals(map.size(), 2); + // value of A is everything up to and excluding ,X= + assertEquals(map.get("A"), "B,C,=;\t%D"); + assertEquals(map.get("X"), "Y"); + + /* + * malformed cases should result in an empty map + */ + map = GffHelperBase.parseAttributeMap("=B=Y"); + assertTrue(map.isEmpty()); + // first token should be an attribute name only, no commas + map = GffHelperBase.parseAttributeMap("A,B=C"); + assertTrue(map.isEmpty()); + // intermediate tokens need at least one comma (value,name=) + map = GffHelperBase.parseAttributeMap("A=B=C"); + assertTrue(map.isEmpty()); + // last token may have a comma or not + map = GffHelperBase.parseAttributeMap("A=B"); + assertEquals(map.get("A"), "B"); + map = GffHelperBase.parseAttributeMap("A=B,C"); + assertEquals(map.get("A"), "B,C"); + map = GffHelperBase.parseAttributeMap("A"); + assertTrue(map.isEmpty()); + map = GffHelperBase.parseAttributeMap("A="); + assertTrue(map.isEmpty()); + map = GffHelperBase.parseAttributeMap("A==C"); + assertTrue(map.isEmpty()); + map = GffHelperBase.parseAttributeMap("=A"); + assertTrue(map.isEmpty()); + map = GffHelperBase.parseAttributeMap("="); + assertTrue(map.isEmpty()); + map = GffHelperBase.parseAttributeMap(","); + assertTrue(map.isEmpty()); + map = GffHelperBase.parseAttributeMap(" "); + assertTrue(map.isEmpty()); + map = GffHelperBase.parseAttributeMap(""); + assertTrue(map.isEmpty()); + map = GffHelperBase.parseAttributeMap("A=B, =C"); + assertTrue(map.isEmpty()); + try + { + GffHelperBase.parseAttributeMap(null); + fail("expected exception"); + } catch (NullPointerException e) + { + // expected + } } } diff --git a/test/jalview/io/vcf/VCFLoaderTest.java b/test/jalview/io/vcf/VCFLoaderTest.java 
index 87cf727..b206f8c 100644 --- a/test/jalview/io/vcf/VCFLoaderTest.java +++ b/test/jalview/io/vcf/VCFLoaderTest.java @@ -3,7 +3,6 @@ package jalview.io.vcf; import static jalview.io.gff.SequenceOntologyI.SEQUENCE_VARIANT; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertNull; -import static org.testng.Assert.assertSame; import static org.testng.Assert.assertTrue; import jalview.bin.Cache; @@ -543,7 +542,7 @@ public class VCFLoaderTest assertEquals(sf.getValue("alleles"), "C,T"); map = (Map) sf.getValue("CSQ"); assertEquals(map.size(), 9); - assertEquals(map.get("PolyPhen"), "Bad++"); // %3B%3B decoded + assertEquals(map.get("PolyPhen"), "Bad;;"); // %3B%3B decoded sf = geneFeatures.get(2); assertEquals(sf.getBegin(), 9); @@ -762,16 +761,4 @@ public class VCFLoaderTest assertEquals(sf.getEnd(), 15); assertEquals(sf.getDescription(), "T,C"); } - - @Test(groups = "Functional") - public void testDecodeSpecialCharacters() throws IOException - { - String encoded = "hello world"; - String decoded = VCFLoader.decodeSpecialCharacters(encoded); - assertSame(encoded, decoded); // no change needed - - encoded = "ab%3Acd%3Bef%3Dgh%25ij%2Ckl%3A"; - decoded = VCFLoader.decodeSpecialCharacters(encoded); - assertEquals(decoded, "ab:cd;ef=gh%ij,kl:"); - } } \ No newline at end of file diff --git a/test/jalview/io/vcf/testVcf.vcf b/test/jalview/io/vcf/testVcf.vcf index 8a16a90..1956cbc 100644 --- a/test/jalview/io/vcf/testVcf.vcf +++ b/test/jalview/io/vcf/testVcf.vcf @@ -7,7 +7,7 @@ ##reference=/Homo_sapiens/GRCh38 #CHROM POS ID REF ALT QUAL FILTER INFO 5 45051610 . C A 81.96 RF;AC0 AC=1;AF=0.1;AN=0;AF_Female=2;AB_MEDIAN=6.00000e-01;CSQ=A|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,A|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad -5 45051614 . 
C T 1666.64 RF AC=1;AF=0.2;AN=0;AF_Female=2;AB_MEDIAN=6.00000e-01;CSQ=T|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,T|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad%2B%2B +5 45051614 . C T 1666.64 RF AC=1;AF=0.2;AN=0;AF_Female=2;AB_MEDIAN=6.00000e-01;CSQ=T|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,T|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad%3B%3B 5 45051618 . CGG C 41.94 AC0 AC=1;AF=0.3;AN=0;AF_Female=2;AB_MEDIAN=6.00000e-01;CSQ=C|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,C|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad,CSQ=CGT|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,CGT|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad 5 45051622 . C G,T 224.23 RF;AC0 AC=1,2;AF=0.4,0.5;AN=0;AF_Female=2;AB_MEDIAN=6.00000e-01;CSQ=G|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,G|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad,T|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,T|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad 5 45051626 . 
A AC,G 433.35 RF;AC0 AC=3,4;AF=0.6,0.7;AN=0;AF_Female=2;AB_MEDIAN=6.00000e-01;CSQ=G|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,G|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad,AC|missense_variant|MODIFIER|WASH7P|gene3|Transcript|transcript3|rna|Benign,AC|downstream_gene_variant|MODIFIER|WASH7P|gene3|Transcript|transcript4|mrna|Bad diff --git a/test/jalview/util/StringUtilsTest.java b/test/jalview/util/StringUtilsTest.java index 084219a..37506c0 100644 --- a/test/jalview/util/StringUtilsTest.java +++ b/test/jalview/util/StringUtilsTest.java @@ -145,7 +145,7 @@ public class StringUtilsTest public void testListToDelimitedString() { assertEquals("", StringUtils.listToDelimitedString(null, ";")); - List list = new ArrayList(); + List list = new ArrayList<>(); assertEquals("", StringUtils.listToDelimitedString(list, ";")); list.add("now"); assertEquals("now", StringUtils.listToDelimitedString(list, ";")); @@ -250,4 +250,70 @@ public class StringUtilsTest assertEquals("kdHydro < 12.53", StringUtils.stripHtmlTags("kdHydro < 12.53")); } + + @Test(groups = { "Functional" }) + public void testUrlEncode() + { + // degenerate cases + assertNull(StringUtils.urlEncode(null, ";,")); + assertEquals("", StringUtils.urlEncode("", "")); + assertEquals("", StringUtils.urlEncode("", ";,")); + + // sanity checks, see + // https://en.wikipedia.org/wiki/Percent-encoding#Percent-encoding_reserved_characters + assertEquals("+", StringUtils.urlEncode(" ", " ")); + assertEquals("%25", StringUtils.urlEncode("%", "%")); + assertEquals(".", StringUtils.urlEncode(".", ".")); // note . is not encoded + assertEquals("%3A", StringUtils.urlEncode(":", ":")); + assertEquals("%3B", StringUtils.urlEncode(";", ";")); + assertEquals("%3D", StringUtils.urlEncode("=", "=")); + assertEquals("%2C", StringUtils.urlEncode(",", ",")); + + // check % does not get recursively encoded! 
+ assertEquals("a%25b%3Dc%3Bd%3Ae%2C%2C", + StringUtils.urlEncode("a%b=c;d:e,,", "=,;:%")); + + // = not in the list for encoding + assertEquals("a=b", StringUtils.urlEncode("a=b", ";,")); + + // encode = (as %3D) and ; (as %3B) + assertEquals("a%3Db.c%3B", StringUtils.urlEncode("a=b.c;", ";=,")); + + // . and space not in the list for encoding + assertEquals("a%3Db.c d", StringUtils.urlEncode("a=b.c d", ";=,")); + + // encode space also (as +) + assertEquals("a%3Db.c+d", StringUtils.urlEncode("a=b.c d", ";=, ")); + + // . does not get encoded even if requested - behaviour of URLEncoder + assertEquals("a%3Db.c+d.e%3Df", + StringUtils.urlEncode("a=b.c d.e=f", ";=,. ")); + } + + @Test(groups = { "Functional" }) + public void testUrlDecode() + { + // degenerate cases + assertNull(StringUtils.urlDecode(null, ";,")); + assertEquals("", StringUtils.urlDecode("", "")); + assertEquals("", StringUtils.urlDecode("", ";,")); + + // = not in the list for decoding + assertEquals("a%3Db", StringUtils.urlDecode("a%3Db", ";,")); + + // decode = and ; but not %3E (which encodes >, not in the list) + assertEquals("a=b%3Ec; d", + StringUtils.urlDecode("a%3Db%3Ec; d", ";=,")); + + // space not in the list for decoding + assertEquals("a=b;c+d", StringUtils.urlDecode("a%3Db%3Bc+d", ";=,")); + + // decode space also; %3E (>) is still not decoded + assertEquals("a=b%3Ec d=,", + StringUtils.urlDecode("a%3Db%3Ec+d%3D%2C", ";=, ")); + + // decode encoded % (%25) + assertEquals("a,=;\t%z", + StringUtils.urlDecode("a%2C%3D%3B%09%25z", ";=,\t%")); + } }