*/
package jalview.io.gff;
-import static jalview.io.FeaturesFile.MAP_ATTRIBUTE_PREFIX;
-
import jalview.analysis.SequenceIdMatcher;
import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.SequenceDummy;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
-import jalview.io.FeaturesFile;
import jalview.util.MapList;
import jalview.util.StringUtils;
*/
public abstract class GffHelperBase implements GffHelperI
{
- private static final String COMMA = ",";
+ protected static final String COMMA = ",";
+
+ protected static final String EQUALS = "=";
- private static final String NOTE = "Note";
+ protected static final String NOTE = "Note";
/*
* GFF columns 1-9 (zero-indexed):
}
/**
- * Parses the input line to a map of name / value(s) pairs. For example the line
- * <br>
+ * Parses the input line to a map of name / value(s) pairs. For example the
+ * line
+ *
+ * <pre>
* Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal
- * <br>
+ * </pre>
+ *
* if parsed with delimiter=";" and separators {' ', '='} <br>
* would return a map with { Notes={Fe=S, Metal}, Method={manual curation,
* prediction}, source={Pfam}} <br>
*
* This method supports parsing of either GFF2 format (which uses space ' ' as
- * the name/value delimiter, and allows multiple occurrences of the same name),
- * or GFF3 format (which uses '=' as the name/value delimiter, and strictly does
- * not allow repeat occurrences of the same name - but does allow a
- * comma-separated list of values).
+ * the name/value delimiter, and allows multiple occurrences of the same
+ * name), or GFF3 format (which uses '=' as the name/value delimiter, and
+ * strictly does not allow repeat occurrences of the same name - but does
+ * allow a comma-separated list of values).
* <p>
* Returns a (possibly empty) map of lists of values by attribute name.
*
* @param text
* @param namesDelimiter
- * the major delimiter between name-value pairs
+ * the major delimiter between name-value pairs
* @param nameValueSeparator
- * separator used between name and value
+ * separator used between name and value
* @param valuesDelimiter
- * delimits a list of more than one value
+ * delimits a list of more than one value
* @return
*/
public static Map<String, List<String>> parseNameValuePairs(String text,
return map;
}
- for (String pair : text.trim().split(namesDelimiter))
+ /*
+ * split by major delimiter (; for GFF3)
+ */
+ for (String nameValuePair : text.trim().split(namesDelimiter))
{
- pair = pair.trim();
- if (pair.length() == 0)
+ nameValuePair = nameValuePair.trim();
+ if (nameValuePair.length() == 0)
{
continue;
}
- int sepPos = pair.indexOf(nameValueSeparator);
+ /*
+ * find name/value separator (= for GFF3)
+ */
+ int sepPos = nameValuePair.indexOf(nameValueSeparator);
if (sepPos == -1)
{
// no name=value found
continue;
}
- String key = pair.substring(0, sepPos).trim();
- String values = pair.substring(sepPos + 1).trim();
- if (values.length() > 0)
+ String name = nameValuePair.substring(0, sepPos).trim();
+ String values = nameValuePair.substring(sepPos + 1).trim();
+ if (values.isEmpty())
{
- List<String> vals = map.get(key);
- if (vals == null)
- {
- vals = new ArrayList<>();
- map.put(key, vals);
- }
+ continue;
+ }
- /*
- * special case: formatted as jvmap_AttName={a=b,c=d,...}
- * save the value within { } for parsing at a later stage
- */
- if (key.startsWith(MAP_ATTRIBUTE_PREFIX))
- {
+ List<String> vals = map.get(name);
+ if (vals == null)
+ {
+ vals = new ArrayList<>();
+ map.put(name, vals);
+ }
- if (key.length() > MAP_ATTRIBUTE_PREFIX.length()
- && values.startsWith("{")
- && values.endsWith("}"))
- {
- vals.add(values.substring(1, values.length() - 1));
- }
- else
- {
- System.err.println("Malformed GFF data '" + values.toString()
- + "' for " + key);
- }
- }
- else
+ /*
+ * if 'values' contains more name/value separators, keep it as a single
+ * raw value (nested sub-attribute values, parsed as a map elsewhere)
+ */
+ if (values.indexOf(nameValueSeparator) != -1)
+ {
+ vals.add(values);
+ }
+ else
+ {
+ for (String val : values.split(valuesDelimiter))
{
- for (String val : values.split(valuesDelimiter))
- {
- vals.add(val);
- }
+ vals.add(val);
}
}
}
+
return map;
}
{
String key = attr.getKey();
List<String> values = attr.getValue();
- if (key.startsWith(FeaturesFile.MAP_ATTRIBUTE_PREFIX))
+ if (values.size() == 1 && values.get(0).contains(EQUALS))
{
- key = key.substring(FeaturesFile.MAP_ATTRIBUTE_PREFIX.length());
- Map<String, String> valueMap = parseAttributeMap(values);
+ /*
+ * 'value' is actually nested subattributes as x=a,y=b,z=c
+ */
+ Map<String, String> valueMap = parseAttributeMap(values.get(0));
sf.setValue(key, valueMap);
}
else
}
/**
- * Parses one or more list of comma-separated key=value pairs into a Map of
- * {key, value}
+ * Parses a (GFF3 format) list of comma-separated key=value pairs into a Map
+ * of {@code key, value}. The input is split on '=' and ',' first, and only
+ * then are each key and value passed through {@code StringUtils.urlDecode}
+ * with {@code GFF_ENCODABLE}. <br>
+ * An input string like {@code a=b,c,d=e,f=g,h} is parsed to
+ *
+ * <pre>
+ * a = "b,c"
+ * d = "e"
+ * f = "g,h"
+ * </pre>
+ *
+ * If a key occurs more than once, the last value parsed replaces any
+ * earlier one. An input containing no '=' gives an empty map.
+ *
+ * @param s
+ * the comma-separated key=value pairs; must not be null (a null
+ * value results in a NullPointerException)
*
- * @param values
* @return
+ * a (possibly empty) map of decoded key to decoded value
*/
- protected Map<String, String> parseAttributeMap(List<String> values)
+ protected static Map<String, String> parseAttributeMap(String s)
{
Map<String, String> map = new HashMap<>();
- for (String entry : values)
+ String[] fields = s.split(EQUALS);
+ int i = 0;
+ while (i < fields.length - 1)
{
- String[] fields = entry.split(COMMA);
- for (String field : fields)
- {
- String[] keyValue = field.split("=");
- if (keyValue.length == 2)
- {
- String theKey = StringUtils.urlDecode(keyValue[0],
- GFF_ENCODABLE);
- String theValue = StringUtils.urlDecode(keyValue[1],
- GFF_ENCODABLE);
- map.put(theKey, theValue);
- }
- }
+ boolean lastPair = i == fields.length - 2;
+ String before = fields[i];
+ String after = fields[i + 1];
+
+ /*
+ * splitting on '=' leaves each field between the first and the last
+ * holding the value of one pair followed by the key of the next,
+ * e.g. a=b,c,d=e gives the fields {a} {b,c,d} {e};
+ * so if 'key' looks like a,b,c then the last token is the key
+ */
+ String theKey = before.contains(COMMA)
+ ? before.substring(before.lastIndexOf(COMMA) + 1)
+ : before;
+
+ /*
+ * if 'value' looks like a,b,c then all but the last token make up the
+ * value (the last token being the key of the next pair), unless this
+ * is the last field (no more = to follow), in which case all of it
+ * makes up the value
+ */
+ String theValue = after.contains(COMMA) && !lastPair
+ ? after.substring(0, after.lastIndexOf(COMMA))
+ : after;
+ map.put(StringUtils.urlDecode(theKey, GFF_ENCODABLE),
+ StringUtils.urlDecode(theValue, GFF_ENCODABLE));
+ i += 1;
}
+
return map;
}
// comma (%2C) equals (%3D) or semi-colon (%3B) should be url-escaped in values
String gffData = "##gff-version 3\n"
+ "FER_CAPAA\tuniprot\tMETAL\t39\t39\t0.0\t.\t.\t"
- + "Note=Iron-sulfur (2Fe-2S);Note=another note;evidence=ECO%3B0000255%2CPROSITE%3DProRule:PRU00465;"
- + "jvmap_CSQ={AF=21,clin_sig=Benign%3Dgood}\n"
+ + "Note=Iron-sulfur (2Fe-2S);Note=another note,and another;evidence=ECO%3B0000255%2CPROSITE%3DProRule:PRU00465;"
+ + "CSQ=AF=21,POLYPHEN=benign,possibly_damaging,clin_sig=Benign%3Dgood\n"
+ "FER1_SOLLC\tuniprot\tPfam\t55\t130\t3.0\t.\t.\tID=$23";
FeaturesFile featuresFile = new FeaturesFile(gffData,
DataSourceType.PASTE);
assertEquals(1, sfs.size());
SequenceFeature sf = sfs.get(0);
// description parsed from Note attribute
- assertEquals("Iron-sulfur (2Fe-2S),another note", sf.description);
+ assertEquals("Iron-sulfur (2Fe-2S),another note,and another",
+ sf.description);
assertEquals(39, sf.begin);
assertEquals(39, sf.end);
assertEquals("uniprot", sf.featureGroup);
assertEquals(5, sf.otherDetails.size());
assertEquals("ECO;0000255,PROSITE=ProRule:PRU00465", // url decoded
sf.getValue("evidence"));
- assertEquals("Iron-sulfur (2Fe-2S),another note",
+ assertEquals("Iron-sulfur (2Fe-2S),another note,and another",
sf.getValue("Note"));
assertEquals("21", sf.getValueAsString("CSQ", "AF"));
+ assertEquals("benign,possibly_damaging",
+ sf.getValueAsString("CSQ", "POLYPHEN"));
assertEquals("Benign=good", sf.getValueAsString("CSQ", "clin_sig")); // url decoded
// todo change STRAND and !Phase into fields of SequenceFeature instead
assertEquals(".", sf.otherDetails.get("STRAND"));
sf.setPhase("2");
sf.setValue("x", "y");
sf.setValue("black", "white");
+ Map<String, String> csq = new HashMap<>();
+ csq.put("SIFT", "benign,mostly benign,cloudy, with meatballs");
+ csq.put("consequence", "missense_variant");
+ sf.setValue("CSQ", csq);
al.getSequenceAt(1).addSequenceFeature(sf);
/*
// Pfam feature columns include strand(+), phase(2), attributes
expected = gffHeader
+ "FER_CAPAA\tCath\tMETAL\t39\t39\t1.2\t.\t.\n"
- + "FER_CAPAN\tUniprot\tPfam\t20\t20\t0.0\t+\t2\tx=y;black=white\n"
+ // CSQ output as CSQ=att1=value1,att2=value2
+ // note all commas are encoded here which is wrong - it should be
+ // SIFT=benign,mostly benign,cloudy%2C with meatballs
+ + "FER_CAPAN\tUniprot\tPfam\t20\t20\t0.0\t+\t2\tx=y;black=white;"
+ + "CSQ=SIFT=benign%2Cmostly benign%2Ccloudy%2C with meatballs,consequence=missense_variant\n"
+ "FER_CAPAN\ts3dm\tGAMMA-TURN\t36\t38\t2.1\t.\t.\n";
assertEquals(expected, exported);
}
*/
package jalview.io.gff;
-import static org.testng.AssertJUnit.assertEquals;
-import static org.testng.AssertJUnit.assertFalse;
-import static org.testng.AssertJUnit.assertTrue;
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertFalse;
+import static org.testng.Assert.assertTrue;
+import static org.testng.Assert.fail;
import jalview.gui.JvOptionPane;
Map<String, List<String>> map = GffHelperBase.parseNameValuePairs(
"hello world", ";", ' ', ", ");
- assertEquals(1, map.size());
- assertEquals(1, map.get("hello").size());
- assertEquals("world", map.get("hello").get(0));
+ assertEquals(map.size(), 1);
+ assertEquals(map.get("hello").size(), 1);
+ assertEquals(map.get("hello").get(0), "world");
map = GffHelperBase
.parseNameValuePairs(
- "Method= manual curation ;nothing; Notes=F2 S ; Notes=Metal,Shiny; Type=",
+ "Method= manual curation ;nothing; Notes=F2 S ; Notes=Metal,Shiny%2Csmooth; Type=",
";", '=', ",");
// Type is ignored as no value was supplied
- assertEquals(2, map.size());
-
- assertEquals(1, map.get("Method").size());
- assertEquals("manual curation", map.get("Method").get(0)); // trimmed
-
- assertEquals(3, map.get("Notes").size());
- assertEquals("F2 S", map.get("Notes").get(0));
- assertEquals("Metal", map.get("Notes").get(1));
- assertEquals("Shiny", map.get("Notes").get(2));
+ assertEquals(map.size(), 2);
+
+ assertEquals(map.get("Method").size(), 1);
+ assertEquals(map.get("Method").get(0), "manual curation"); // trimmed
+
+ assertEquals(map.get("Notes").size(), 3);
+ assertEquals(map.get("Notes").get(0), "F2 S");
+ assertEquals(map.get("Notes").get(1), "Metal");
+ assertEquals(map.get("Notes").get(2), "Shiny%2Csmooth"); // not decoded here
+
+ /*
+ * gff3 style with nested attribute values
+ */
+ String csqValue = "POLYPHEN=possibly_damaging,probably_damaging,SIFT=tolerated%2Cdeleterious";
+ map = GffHelperBase.parseNameValuePairs("hello=world;CSQ=" + csqValue,
+ ";", '=', ",");
+ assertEquals(map.size(), 2); // keys hello, CSQ
+ assertEquals(map.get("hello").size(), 1);
+ assertEquals(map.get("hello").get(0), "world");
+ // the CSQ value is read 'raw' here, and parsed further elsewhere
+ assertEquals(map.get("CSQ").size(), 1);
+ assertEquals(map.get("CSQ").get(0), csqValue);
}
/**
int[] from = { 1, 12 };
int[] to = { 20, 31 };
assertTrue(GffHelperBase.trimMapping(from, to, 1, 1));
- assertEquals("[1, 12]", Arrays.toString(from)); // unchanged
- assertEquals("[20, 31]", Arrays.toString(to)); // unchanged
+ assertEquals(Arrays.toString(from), "[1, 12]"); // unchanged
+ assertEquals(Arrays.toString(to), "[20, 31]"); // unchanged
// from too long:
from = new int[] { 1, 13 };
assertTrue(GffHelperBase.trimMapping(from, to, 1, 1));
- assertEquals("[1, 12]", Arrays.toString(from)); // trimmed
- assertEquals("[20, 31]", Arrays.toString(to)); // unchanged
+ assertEquals(Arrays.toString(from), "[1, 12]"); // trimmed
+ assertEquals(Arrays.toString(to), "[20, 31]"); // unchanged
// to too long:
to = new int[] { 20, 33 };
assertTrue(GffHelperBase.trimMapping(from, to, 1, 1));
- assertEquals("[1, 12]", Arrays.toString(from)); // unchanged
- assertEquals("[20, 31]", Arrays.toString(to)); // trimmed
+ assertEquals(Arrays.toString(from), "[1, 12]"); // unchanged
+ assertEquals(Arrays.toString(to), "[20, 31]"); // trimmed
// from reversed:
from = new int[] { 12, 1 };
assertTrue(GffHelperBase.trimMapping(from, to, 1, 1));
- assertEquals("[12, 1]", Arrays.toString(from)); // unchanged
- assertEquals("[20, 31]", Arrays.toString(to)); // unchanged
+ assertEquals(Arrays.toString(from), "[12, 1]"); // unchanged
+ assertEquals(Arrays.toString(to), "[20, 31]"); // unchanged
// to reversed:
to = new int[] { 31, 20 };
assertTrue(GffHelperBase.trimMapping(from, to, 1, 1));
- assertEquals("[12, 1]", Arrays.toString(from)); // unchanged
- assertEquals("[31, 20]", Arrays.toString(to)); // unchanged
+ assertEquals(Arrays.toString(from), "[12, 1]"); // unchanged
+ assertEquals(Arrays.toString(to), "[31, 20]"); // unchanged
// from reversed and too long:
from = new int[] { 14, 1 };
assertTrue(GffHelperBase.trimMapping(from, to, 1, 1));
- assertEquals("[14, 3]", Arrays.toString(from)); // end trimmed
- assertEquals("[31, 20]", Arrays.toString(to)); // unchanged
+ assertEquals(Arrays.toString(from), "[14, 3]"); // end trimmed
+ assertEquals(Arrays.toString(to), "[31, 20]"); // unchanged
// to reversed and too long:
to = new int[] { 31, 10 };
assertTrue(GffHelperBase.trimMapping(from, to, 1, 1));
- assertEquals("[14, 3]", Arrays.toString(from)); // unchanged
- assertEquals("[31, 20]", Arrays.toString(to)); // end trimmed
+ assertEquals(Arrays.toString(from), "[14, 3]"); // unchanged
+ assertEquals(Arrays.toString(to), "[31, 20]"); // end trimmed
// cdna to peptide (matching)
from = new int[] { 1, 18 };
to = new int[] { 4, 9 };
assertTrue(GffHelperBase.trimMapping(from, to, 3, 1));
- assertEquals("[1, 18]", Arrays.toString(from)); // unchanged
- assertEquals("[4, 9]", Arrays.toString(to)); // unchanged
+ assertEquals(Arrays.toString(from), "[1, 18]"); // unchanged
+ assertEquals(Arrays.toString(to), "[4, 9]"); // unchanged
// overlong cdna to peptide
from = new int[] { 1, 20 };
assertTrue(GffHelperBase.trimMapping(from, to, 3, 1));
- assertEquals("[1, 18]", Arrays.toString(from)); // end trimmed
- assertEquals("[4, 9]", Arrays.toString(to)); // unchanged
+ assertEquals(Arrays.toString(from), "[1, 18]"); // end trimmed
+ assertEquals(Arrays.toString(to), "[4, 9]"); // unchanged
// overlong cdna (reversed) to peptide
from = new int[] { 20, 1 };
assertTrue(GffHelperBase.trimMapping(from, to, 3, 1));
- assertEquals("[20, 3]", Arrays.toString(from)); // end trimmed
- assertEquals("[4, 9]", Arrays.toString(to)); // unchanged
+ assertEquals(Arrays.toString(from), "[20, 3]"); // end trimmed
+ assertEquals(Arrays.toString(to), "[4, 9]"); // unchanged
// overlong cdna (reversed) to peptide (reversed)
from = new int[] { 20, 1 };
to = new int[] { 9, 4 };
assertTrue(GffHelperBase.trimMapping(from, to, 3, 1));
- assertEquals("[20, 3]", Arrays.toString(from)); // end trimmed
- assertEquals("[9, 4]", Arrays.toString(to)); // unchanged
+ assertEquals(Arrays.toString(from), "[20, 3]"); // end trimmed
+ assertEquals(Arrays.toString(to), "[9, 4]"); // unchanged
// peptide to cdna (matching)
from = new int[] { 4, 9 };
to = new int[] { 1, 18 };
assertTrue(GffHelperBase.trimMapping(from, to, 1, 3));
- assertEquals("[4, 9]", Arrays.toString(from)); // unchanged
- assertEquals("[1, 18]", Arrays.toString(to)); // unchanged
+ assertEquals(Arrays.toString(from), "[4, 9]"); // unchanged
+ assertEquals(Arrays.toString(to), "[1, 18]"); // unchanged
// peptide to overlong cdna
to = new int[] { 1, 20 };
assertTrue(GffHelperBase.trimMapping(from, to, 1, 3));
- assertEquals("[4, 9]", Arrays.toString(from)); // unchanged
- assertEquals("[1, 18]", Arrays.toString(to)); // end trimmed
+ assertEquals(Arrays.toString(from), "[4, 9]"); // unchanged
+ assertEquals(Arrays.toString(to), "[1, 18]"); // end trimmed
// peptide to overlong cdna (reversed)
to = new int[] { 20, 1 };
assertTrue(GffHelperBase.trimMapping(from, to, 1, 3));
- assertEquals("[4, 9]", Arrays.toString(from)); // unchanged
- assertEquals("[20, 3]", Arrays.toString(to)); // end trimmed
+ assertEquals(Arrays.toString(from), "[4, 9]"); // unchanged
+ assertEquals(Arrays.toString(to), "[20, 3]"); // end trimmed
// peptide (reversed) to overlong cdna (reversed)
from = new int[] { 9, 4 };
to = new int[] { 20, 1 };
assertTrue(GffHelperBase.trimMapping(from, to, 1, 3));
- assertEquals("[9, 4]", Arrays.toString(from)); // unchanged
- assertEquals("[20, 3]", Arrays.toString(to)); // end trimmed
+ assertEquals(Arrays.toString(from), "[9, 4]"); // unchanged
+ assertEquals(Arrays.toString(to), "[20, 3]"); // end trimmed
// overlong peptide to word-length cdna
from = new int[] { 4, 10 };
to = new int[] { 1, 18 };
assertTrue(GffHelperBase.trimMapping(from, to, 1, 3));
- assertEquals("[4, 9]", Arrays.toString(from)); // end trimmed
- assertEquals("[1, 18]", Arrays.toString(to)); // unchanged
+ assertEquals(Arrays.toString(from), "[4, 9]"); // end trimmed
+ assertEquals(Arrays.toString(to), "[1, 18]"); // unchanged
// overlong peptide to non-word-length cdna
from = new int[] { 4, 10 };
to = new int[] { 1, 19 };
assertFalse(GffHelperBase.trimMapping(from, to, 1, 3));
- assertEquals("[4, 10]", Arrays.toString(from)); // unchanged
- assertEquals("[1, 19]", Arrays.toString(to)); // unchanged
+ assertEquals(Arrays.toString(from), "[4, 10]"); // unchanged
+ assertEquals(Arrays.toString(to), "[1, 19]"); // unchanged
+ }
+ @Test(groups = { "Functional" })
+ public void testParseAttributeMap()
+ {
+ Map<String, String> map = GffHelperBase
+ .parseAttributeMap("A=B,C%2C%3D%3B%09%25D");
+ assertEquals(map.get("A"), "B,C,=;\t%D"); // literal comma kept, escapes decoded
+
+ try
+ {
+ GffHelperBase.parseAttributeMap(null);
+ fail("expected exception");
+ } catch (NullPointerException e)
+ {
+ // expected: null input is not supported
+ }
}
}