From d90461d46979dda9326255b1e1e85ce34c465ea3 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Thu, 22 Aug 2019 17:26:53 +0100 Subject: [PATCH] JAL-3121 slightly cleaner parsing of, and unit test for, map attributes --- src/jalview/io/gff/Gff2Helper.java | 10 --- src/jalview/io/gff/Gff3Helper.java | 9 --- src/jalview/io/gff/GffHelperBase.java | 136 ++++++++++++++++++--------------- test/jalview/io/FeaturesFileTest.java | 17 +++-- 4 files changed, 86 insertions(+), 86 deletions(-) diff --git a/src/jalview/io/gff/Gff2Helper.java b/src/jalview/io/gff/Gff2Helper.java index 19045d5..a15a116 100644 --- a/src/jalview/io/gff/Gff2Helper.java +++ b/src/jalview/io/gff/Gff2Helper.java @@ -38,20 +38,10 @@ public class Gff2Helper extends GffHelperBase */ public static Map> parseNameValuePairs(String text) { - // TODO: can a value include a comma? if so it will be broken by this return parseNameValuePairs(text, ";", ' ', ","); } /** - * Return ' ' as the name-value separator used in column 9 attributes. - */ - @Override - protected char getNameValueSeparator() - { - return ' '; - } - - /** * Default processing if not overridden is just to construct a sequence * feature */ diff --git a/src/jalview/io/gff/Gff3Helper.java b/src/jalview/io/gff/Gff3Helper.java index 56ec5ef..1ef8848 100644 --- a/src/jalview/io/gff/Gff3Helper.java +++ b/src/jalview/io/gff/Gff3Helper.java @@ -350,15 +350,6 @@ public class Gff3Helper extends GffHelperBase } /** - * Return '=' as the name-value separator used in column 9 attributes. - */ - @Override - protected char getNameValueSeparator() - { - return '='; - } - - /** * Modifies the default SequenceFeature in order to set the Target sequence id * as the description */ diff --git a/src/jalview/io/gff/GffHelperBase.java b/src/jalview/io/gff/GffHelperBase.java index d034c8d..de9212f 100644 --- a/src/jalview/io/gff/GffHelperBase.java +++ b/src/jalview/io/gff/GffHelperBase.java @@ -20,6 +20,8 @@ */ package jalview.io.gff; +import static jalview.io.FeaturesFile.MAP_ATTRIBUTE_PREFIX; + import jalview.analysis.SequenceIdMatcher; import jalview.datamodel.AlignedCodonFrame; import jalview.datamodel.AlignmentI; @@ -44,6 +46,8 @@ import java.util.Map.Entry; */ public abstract class GffHelperBase implements GffHelperI { + private static final String COMMA = ","; + private static final String NOTE = "Note"; /* @@ -260,8 +264,8 @@ public abstract class GffHelperBase implements GffHelperI } /** - * Parses the input line to a map of name / value(s) pairs. For example the - * line
+ * Parses the input line to a map of name / value(s) pairs. For example the line + *
* Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal *
* if parsed with delimiter=";" and separators {' ', '='}
@@ -269,19 +273,21 @@ public abstract class GffHelperBase implements GffHelperI * prediction}, source={Pfam}}
* * This method supports parsing of either GFF2 format (which uses space ' ' as - * the name/value delimiter, and allows multiple occurrences of the same - * name), or GFF3 format (which uses '=' as the name/value delimiter, and - * strictly does not allow repeat occurrences of the same name - but does - * allow a comma-separated list of values). + * the name/value delimiter, and allows multiple occurrences of the same name), + * or GFF3 format (which uses '=' as the name/value delimiter, and strictly does + * not allow repeat occurrences of the same name - but does allow a + * comma-separated list of values). + *

+ * Returns a (possibly empty) map of lists of values by attribute name. * * @param text * @param namesDelimiter - * the major delimiter between name-value pairs + * the major delimiter between name-value pairs * @param nameValueSeparator - * one or more separators used between name and value + * separator used between name and value * @param valuesDelimiter - * delimits a list of more than one value - * @return the name-values map (which may be empty but never null) + * delimits a list of more than one value + * @return */ public static Map> parseNameValuePairs(String text, String namesDelimiter, char nameValueSeparator, @@ -304,7 +310,7 @@ public abstract class GffHelperBase implements GffHelperI int sepPos = pair.indexOf(nameValueSeparator); if (sepPos == -1) { - // no name=value present + // no name=value found continue; } @@ -318,9 +324,32 @@ public abstract class GffHelperBase implements GffHelperI vals = new ArrayList<>(); map.put(key, vals); } - for (String val : values.split(valuesDelimiter)) + + /* + * special case: formatted as jvmap_AttName={a=b,c=d,...} + * save the value within { } for parsing at a later stage + */ + if (key.startsWith(MAP_ATTRIBUTE_PREFIX)) + { + + if (key.length() > MAP_ATTRIBUTE_PREFIX.length() + && values.startsWith("{") + && values.endsWith("}")) + { + vals.add(values.substring(1, values.length() - 1)); + } + else + { + System.err.println("Malformed GFF data '" + values.toString() + + "' for " + key); + } + } + else { - vals.add(val); + for (String val : values.split(valuesDelimiter)) + { + vals.add(val); + } } } } @@ -386,58 +415,22 @@ public abstract class GffHelperBase implements GffHelperI for (Entry> attr : attributes.entrySet()) { String key = attr.getKey(); - List value = attr.getValue(); + List values = attr.getValue(); if (key.startsWith(FeaturesFile.MAP_ATTRIBUTE_PREFIX)) { - /* - * e.g. jvmap_CSQ={ALLELE_NUM=1,CDS_position=249,Codons=caG/caT} - */ - String trueKey = key - .substring(FeaturesFile.MAP_ATTRIBUTE_PREFIX.length()); - if (trueKey.isEmpty() || value.isEmpty() - || !value.get(0).startsWith("{") - || !value.get(value.size() - 1).endsWith("}")) - { - System.err.println("Malformed GFF data '" + value.toString() - + "' for " + key); - continue; - } - Map values = new HashMap<>(); - for (String entry : value) - { - if (entry.startsWith("{")) - { - entry = entry.substring(1); - } - if (entry.endsWith("}")) - { - entry = entry.substring(0, entry.length() - 1); - } - String[] fields = entry.split(","); - for (String field : fields) - { - String[] keyValue = field.split("="); - if (keyValue.length == 2) - { - String theKey = StringUtils.urlDecode(keyValue[0], - GFF_ENCODABLE); - String theValue = StringUtils.urlDecode(keyValue[1], - GFF_ENCODABLE); - values.put(theKey, theValue); - } - } - } - sf.setValue(trueKey, values); + key = key.substring(FeaturesFile.MAP_ATTRIBUTE_PREFIX.length()); + Map valueMap = parseAttributeMap(values); + sf.setValue(key, valueMap); } else { - String values = StringUtils - .listToDelimitedString(value, ","); - values = StringUtils.urlDecode(values, GFF_ENCODABLE); - sf.setValue(key, values); + String csvValues = StringUtils.listToDelimitedString(values, + COMMA); + csvValues = StringUtils.urlDecode(csvValues, GFF_ENCODABLE); + sf.setValue(key, csvValues); if (NOTE.equals(key)) { - sf.setDescription(values); + sf.setDescription(csvValues); } } } @@ -452,12 +445,33 @@ public abstract class GffHelperBase implements GffHelperI } /** - * Returns the character used to separate attributes names from values in GFF - * column 9. This is space for GFF2, '=' for GFF3. + * Parses one or more list of comma-separated key=value pairs into a Map of + * {key, value} * + * @param values * @return */ - protected abstract char getNameValueSeparator(); + protected Map parseAttributeMap(List values) + { + Map map = new HashMap<>(); + for (String entry : values) + { + String[] fields = entry.split(COMMA); + for (String field : fields) + { + String[] keyValue = field.split("="); + if (keyValue.length == 2) + { + String theKey = StringUtils.urlDecode(keyValue[0], + GFF_ENCODABLE); + String theValue = StringUtils.urlDecode(keyValue[1], + GFF_ENCODABLE); + map.put(theKey, theValue); + } + } + } + return map; + } /** * Returns any existing mapping held on the alignment between the given diff --git a/test/jalview/io/FeaturesFileTest.java b/test/jalview/io/FeaturesFileTest.java index 5154ef2..959c413 100644 --- a/test/jalview/io/FeaturesFileTest.java +++ b/test/jalview/io/FeaturesFileTest.java @@ -272,7 +272,8 @@ public class FeaturesFileTest // comma (%2C) equals (%3D) or semi-colon (%3B) should be url-escaped in values String gffData = "##gff-version 3\n" + "FER_CAPAA\tuniprot\tMETAL\t39\t39\t0.0\t.\t.\t" - + "Note=Iron-sulfur (2Fe-2S);Note=another note;evidence=ECO%3B0000255%2CPROSITE%3DProRule:PRU00465\n" + + "Note=Iron-sulfur (2Fe-2S);Note=another note;evidence=ECO%3B0000255%2CPROSITE%3DProRule:PRU00465;" + + "jvmap_CSQ={AF=21,clin_sig=Benign%3Dgood}\n" + "FER1_SOLLC\tuniprot\tPfam\t55\t130\t3.0\t.\t.\tID=$23"; FeaturesFile featuresFile = new FeaturesFile(gffData, DataSourceType.PASTE); @@ -290,13 +291,17 @@ public class FeaturesFileTest assertEquals(39, sf.end); assertEquals("uniprot", sf.featureGroup); assertEquals("METAL", sf.type); - assertEquals(4, sf.otherDetails.size()); - assertEquals("ECO;0000255,PROSITE=ProRule:PRU00465", - sf.otherDetails.get("evidence")); + assertEquals(5, sf.otherDetails.size()); + assertEquals("ECO;0000255,PROSITE=ProRule:PRU00465", // url decoded + sf.getValue("evidence")); assertEquals("Iron-sulfur (2Fe-2S),another note", - sf.otherDetails.get("Note")); + sf.getValue("Note")); + assertEquals("21", sf.getValueAsString("CSQ", "AF")); + assertEquals("Benign=good", sf.getValueAsString("CSQ", "clin_sig")); // url decoded + // todo change STRAND and !Phase into fields of SequenceFeature instead assertEquals(".", sf.otherDetails.get("STRAND")); - assertEquals(".", sf.otherDetails.get("!Phase")); + assertEquals(0, sf.getStrand()); + assertEquals(".", sf.getPhase()); // verify feature on FER1_SOLLC1 sfs = al.getSequenceAt(2).getDatasetSequence().getSequenceFeatures(); -- 1.7.10.2