X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2Fgff%2FGffHelperBase.java;h=3db175523db2505f5e9292a9fec2e6a3544f759d;hb=56a0c6ce1f8c909b60b875a371dd9a1296ca6fe1;hp=de9212f04b8833a249d94a3e86d00599be8c3243;hpb=d90461d46979dda9326255b1e1e85ce34c465ea3;p=jalview.git diff --git a/src/jalview/io/gff/GffHelperBase.java b/src/jalview/io/gff/GffHelperBase.java index de9212f..3db1755 100644 --- a/src/jalview/io/gff/GffHelperBase.java +++ b/src/jalview/io/gff/GffHelperBase.java @@ -20,8 +20,6 @@ */ package jalview.io.gff; -import static jalview.io.FeaturesFile.MAP_ATTRIBUTE_PREFIX; - import jalview.analysis.SequenceIdMatcher; import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.AlignmentI; @@ -29,7 +27,6 @@ import jalview.datamodel.MappingType; import jalview.datamodel.SequenceDummy; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; -import jalview.io.FeaturesFile; import jalview.util.MapList; import jalview.util.StringUtils; @@ -46,9 +43,13 @@ import java.util.Map.Entry; */ public abstract class GffHelperBase implements GffHelperI { - private static final String COMMA = ","; + private static final String INVALID_GFF_ATTRIBUTE_FORMAT = "Invalid GFF attribute format: "; + + protected static final String COMMA = ","; - private static final String NOTE = "Note"; + protected static final String EQUALS = "="; + + protected static final String NOTE = "Note"; /* * GFF columns 1-9 (zero-indexed): @@ -264,29 +265,32 @@ public abstract class GffHelperBase implements GffHelperI } /** - * Parses the input line to a map of name / value(s) pairs. For example the line - *
+ * Parses the input line to a map of name / value(s) pairs. For example the + * line + * + *
    * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal
-   * 
+ *
+ * * if parsed with delimiter=";" and separators {' ', '='}
* would return a map with { Notes={Fe-S, Metal}, Method={manual curation, * prediction}, source={Pfam}}
* * This method supports parsing of either GFF2 format (which uses space ' ' as - * the name/value delimiter, and allows multiple occurrences of the same name), - * or GFF3 format (which uses '=' as the name/value delimiter, and strictly does - * not allow repeat occurrences of the same name - but does allow a - * comma-separated list of values). + * the name/value delimiter, and allows multiple occurrences of the same + * name), or GFF3 format (which uses '=' as the name/value delimiter, and + * strictly does not allow repeat occurrences of the same name - but does + * allow a comma-separated list of values). *

* Returns a (possibly empty) map of lists of values by attribute name. * * @param text * @param namesDelimiter - * the major delimiter between name-value pairs + * the major delimiter between name-value pairs * @param nameValueSeparator - * separator used between name and value + * separator used between name and value * @param valuesDelimiter - * delimits a list of more than one value + * delimits a list of more than one value * @return */ public static Map> parseNameValuePairs(String text, @@ -299,60 +303,58 @@ public abstract class GffHelperBase implements GffHelperI return map; } - for (String pair : text.trim().split(namesDelimiter)) + /* + * split by major delimiter (; for GFF3) + */ + for (String nameValuePair : text.trim().split(namesDelimiter)) { - pair = pair.trim(); - if (pair.length() == 0) + nameValuePair = nameValuePair.trim(); + if (nameValuePair.length() == 0) { continue; } - int sepPos = pair.indexOf(nameValueSeparator); + /* + * find name/value separator (= for GFF3) + */ + int sepPos = nameValuePair.indexOf(nameValueSeparator); if (sepPos == -1) { // no name=value found continue; } - String key = pair.substring(0, sepPos).trim(); - String values = pair.substring(sepPos + 1).trim(); - if (values.length() > 0) + String name = nameValuePair.substring(0, sepPos).trim(); + String values = nameValuePair.substring(sepPos + 1).trim(); + if (values.isEmpty()) { - List vals = map.get(key); - if (vals == null) - { - vals = new ArrayList<>(); - map.put(key, vals); - } + continue; + } - /* - * special case: formatted as jvmap_AttName={a=b,c=d,...} - * save the value within { } for parsing at a later stage - */ - if (key.startsWith(MAP_ATTRIBUTE_PREFIX)) - { + List vals = map.get(name); + if (vals == null) + { + vals = new ArrayList<>(); + map.put(name, vals); + } - if (key.length() > MAP_ATTRIBUTE_PREFIX.length() - && values.startsWith("{") - && values.endsWith("}")) - { - vals.add(values.substring(1, values.length() - 1)); - } - else - { - 
System.err.println("Malformed GFF data '" + values.toString() - + "' for " + key); - } - } - else + /* + * if 'values' contains more name/value separators, parse as a map + * (nested sub-attribute values) + */ + if (values.indexOf(nameValueSeparator) != -1) + { + vals.add(values); + } + else + { + for (String val : values.split(valuesDelimiter)) { - for (String val : values.split(valuesDelimiter)) - { - vals.add(val); - } + vals.add(val); } } } + return map; } @@ -416,10 +418,12 @@ public abstract class GffHelperBase implements GffHelperI { String key = attr.getKey(); List values = attr.getValue(); - if (key.startsWith(FeaturesFile.MAP_ATTRIBUTE_PREFIX)) + if (values.size() == 1 && values.get(0).contains(EQUALS)) { - key = key.substring(FeaturesFile.MAP_ATTRIBUTE_PREFIX.length()); - Map valueMap = parseAttributeMap(values); + /* + * 'value' is actually nested subattributes as x=a,y=b,z=c + */ + Map valueMap = parseAttributeMap(values.get(0)); sf.setValue(key, valueMap); } else @@ -445,31 +449,100 @@ public abstract class GffHelperBase implements GffHelperI } /** - * Parses one or more list of comma-separated key=value pairs into a Map of - * {key, value} + * Parses a (GFF3 format) list of comma-separated key=value pairs into a Map + * of {@code key, + * value}
+ * An input string like {@code a=b,c,d=e,f=g,h} is parsed to + * + *

+   * a = "b,c"
+   * d = "e"
+   * f = "g,h"
+   * 
+ * + * @param s * - * @param values * @return */ - protected Map parseAttributeMap(List values) + protected static Map parseAttributeMap(String s) { Map map = new HashMap<>(); - for (String entry : values) + String[] fields = s.split(EQUALS); + + /* + * format validation + */ + boolean valid = true; + if (fields.length < 2) + { + /* + * need at least A=B here + */ + valid = false; + } + else if (fields[0].isEmpty() || fields[0].contains(COMMA)) + { + /* + * A,B=C is not a valid start, nor is =C + */ + valid = false; + } + else { - String[] fields = entry.split(COMMA); - for (String field : fields) + for (int i = 1; i < fields.length - 1; i++) { - String[] keyValue = field.split("="); - if (keyValue.length == 2) + if (fields[i].isEmpty() || !fields[i].contains(COMMA)) { - String theKey = StringUtils.urlDecode(keyValue[0], - GFF_ENCODABLE); - String theValue = StringUtils.urlDecode(keyValue[1], - GFF_ENCODABLE); - map.put(theKey, theValue); + /* + * intermediate tokens must include value,name + */ + valid = false; } } } + + if (!valid) + { + System.err.println(INVALID_GFF_ATTRIBUTE_FORMAT + s); + return map; + } + + int i = 0; + while (i < fields.length - 1) + { + boolean lastPair = i == fields.length - 2; + String before = fields[i]; + String after = fields[i + 1]; + + /* + * if 'key' looks like a,b,c then the last token is the + * key + */ + String theKey = before.contains(COMMA) + ? before.substring(before.lastIndexOf(COMMA) + 1) + : before; + + theKey = theKey.trim(); + if (theKey.isEmpty()) + { + System.err.println(INVALID_GFF_ATTRIBUTE_FORMAT + s); + map.clear(); + return map; + } + + /* + * if 'value' looks like a,b,c then all but the last token is the value, + * unless this is the last field (no more = to follow), in which case + * all of it makes up the value + */ + String theValue = after.contains(COMMA) && !lastPair + ? 
after.substring(0, after.lastIndexOf(COMMA)) + : after; + map.put(StringUtils.urlDecode(theKey, GFF_ENCODABLE), + StringUtils.urlDecode(theValue, GFF_ENCODABLE)); + i += 1; + } + return map; }