X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fio%2Fgff%2FGffHelperBase.java;h=009734316d490d06a4aa95d0d70e10dda025dcc9;hb=eb3e681d6e82ccdd5d312d1981dfb306e7f479f0;hp=fbde9d99c5e6887b0cedc62a71cf563bb6ccb151;hpb=8f920d337154e092f5f9056ffde3cdf2735eca43;p=jalview.git
diff --git a/src/jalview/io/gff/GffHelperBase.java b/src/jalview/io/gff/GffHelperBase.java
index fbde9d9..0097343 100644
--- a/src/jalview/io/gff/GffHelperBase.java
+++ b/src/jalview/io/gff/GffHelperBase.java
@@ -1,3 +1,23 @@
+/*
+ * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
+ * Copyright (C) $$Year-Rel$$ The Jalview Authors
+ *
+ * This file is part of Jalview.
+ *
+ * Jalview is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, either version 3
+ * of the License, or (at your option) any later version.
+ *
+ * Jalview is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
+ * The Jalview Authors are detailed in the 'AUTHORS' file.
+ */
package jalview.io.gff;
import jalview.analysis.SequenceIdMatcher;
@@ -23,7 +43,13 @@ import java.util.Map.Entry;
*/
public abstract class GffHelperBase implements GffHelperI
{
- private static final String NOTE = "Note";
+ private static final String INVALID_GFF_ATTRIBUTE_FORMAT = "Invalid GFF attribute format: ";
+
+ protected static final String COMMA = ",";
+
+ protected static final String EQUALS = "=";
+
+ protected static final String NOTE = "Note";
/*
* GFF columns 1-9 (zero-indexed):
@@ -88,8 +114,9 @@ public abstract class GffHelperBase implements GffHelperI
*/
if (!trimMapping(from, to, fromRatio, toRatio))
{
- System.err.println("Ignoring mapping from " + Arrays.toString(from)
- + " to " + Arrays.toString(to) + " as counts don't match!");
+ jalview.bin.Console.errPrintln(
+ "Ignoring mapping from " + Arrays.toString(from) + " to "
+ + Arrays.toString(to) + " as counts don't match!");
return null;
}
@@ -130,8 +157,8 @@ public abstract class GffHelperBase implements GffHelperI
* restrict from range to make them match up
* it's kind of arbitrary which end we truncate - here it is the end
*/
- System.err.print("Truncating mapping from " + Arrays.toString(from)
- + " to ");
+ System.err.print(
+ "Truncating mapping from " + Arrays.toString(from) + " to ");
if (from[1] > from[0])
{
from[1] -= fromOverlap / toRatio;
@@ -140,7 +167,7 @@ public abstract class GffHelperBase implements GffHelperI
{
from[1] += fromOverlap / toRatio;
}
- System.err.println(Arrays.toString(from));
+ jalview.bin.Console.errPrintln(Arrays.toString(from));
return true;
}
else if (fromOverlap < 0 && fromOverlap % fromRatio == 0)
@@ -149,8 +176,8 @@ public abstract class GffHelperBase implements GffHelperI
/*
* restrict to range to make them match up
*/
- System.err.print("Truncating mapping to " + Arrays.toString(to)
- + " to ");
+ System.err.print(
+ "Truncating mapping to " + Arrays.toString(to) + " to ");
if (to[1] > to[0])
{
to[1] -= fromOverlap / fromRatio;
@@ -159,7 +186,7 @@ public abstract class GffHelperBase implements GffHelperI
{
to[1] += fromOverlap / fromRatio;
}
- System.err.println(Arrays.toString(to));
+ jalview.bin.Console.errPrintln(Arrays.toString(to));
return true;
}
@@ -240,8 +267,12 @@ public abstract class GffHelperBase implements GffHelperI
/**
* Parses the input line to a map of name / value(s) pairs. For example the
- * line
- * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal
+ * line
+ *
+ * <pre>
+ * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal
+ * </pre>
+ *
* if parsed with delimiter=";" and separators {' ', '='}
* would return a map with { Notes={Fe=S, Metal}, Method={manual curation,
* prediction}, source={Pfam}}
@@ -251,57 +282,80 @@ public abstract class GffHelperBase implements GffHelperI
* name), or GFF3 format (which uses '=' as the name/value delimiter, and
* strictly does not allow repeat occurrences of the same name - but does
* allow a comma-separated list of values).
+ *
+ * Returns a (possibly empty) map of lists of values by attribute name.
*
* @param text
* @param namesDelimiter
* the major delimiter between name-value pairs
* @param nameValueSeparator
- * one or more separators used between name and value
+ * separator used between name and value
* @param valuesDelimiter
* delimits a list of more than one value
- * @return the name-values map (which may be empty but never null)
+ * @return the name-values map (which may be empty but never null)
*/
public static Map> parseNameValuePairs(String text,
String namesDelimiter, char nameValueSeparator,
String valuesDelimiter)
{
- Map> map = new HashMap>();
+ Map> map = new HashMap<>();
if (text == null || text.trim().length() == 0)
{
return map;
}
- for (String pair : text.trim().split(namesDelimiter))
+ /*
+ * split by major delimiter (; for GFF3)
+ */
+ for (String nameValuePair : text.trim().split(namesDelimiter))
{
- pair = pair.trim();
- if (pair.length() == 0)
+ nameValuePair = nameValuePair.trim();
+ if (nameValuePair.length() == 0)
{
continue;
}
- int sepPos = pair.indexOf(nameValueSeparator);
+ /*
+ * find name/value separator (= for GFF3)
+ */
+ int sepPos = nameValuePair.indexOf(nameValueSeparator);
if (sepPos == -1)
{
- // no name=value present
+ // no name=value found
continue;
}
- String key = pair.substring(0, sepPos).trim();
- String values = pair.substring(sepPos + 1).trim();
- if (values.length() > 0)
+ String name = nameValuePair.substring(0, sepPos).trim();
+ String values = nameValuePair.substring(sepPos + 1).trim();
+ if (values.isEmpty())
+ {
+ continue;
+ }
+
+ List vals = map.get(name);
+ if (vals == null)
+ {
+ vals = new ArrayList<>();
+ map.put(name, vals);
+ }
+
+ /*
+ * if 'values' contains more name/value separators, parse as a map
+ * (nested sub-attribute values)
+ */
+ if (values.indexOf(nameValueSeparator) != -1)
+ {
+ vals.add(values);
+ }
+ else
{
- List vals = map.get(key);
- if (vals == null)
- {
- vals = new ArrayList();
- map.put(key, vals);
- }
for (String val : values.split(valuesDelimiter))
{
vals.add(val);
}
}
}
+
return map;
}
@@ -317,41 +371,73 @@ public abstract class GffHelperBase implements GffHelperI
protected SequenceFeature buildSequenceFeature(String[] gff,
Map> attributes)
{
+ return buildSequenceFeature(gff, TYPE_COL, gff[SOURCE_COL], attributes);
+ }
+
+ /**
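+ * @param gff the column values of one GFF line
+ * @param typeColumn index into gff of the column used as the feature type
+ * @param group value set as the feature group (e.g. the GFF source column)
+ * @param attributes column 9 name/values map; may be null (none are added)
+ * @return the new feature, or null if start or end is not a valid integer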
+ */
+ protected SequenceFeature buildSequenceFeature(String[] gff,
+ int typeColumn, String group,
+ Map> attributes)
+ {
try
{
int start = Integer.parseInt(gff[START_COL]);
int end = Integer.parseInt(gff[END_COL]);
- float score = Float.NaN;
+
+ /*
+ * default 'score' is 0 rather than Float.NaN - see JAL-2554
+ */
+ float score = 0f;
try
{
score = Float.parseFloat(gff[SCORE_COL]);
} catch (NumberFormatException nfe)
{
- // e.g. '.' - leave as NaN to indicate no score
+ // e.g. '.' - leave as zero
}
- SequenceFeature sf = new SequenceFeature(gff[TYPE_COL],
- gff[SOURCE_COL], start, end, score, gff[SOURCE_COL]);
+ SequenceFeature sf = new SequenceFeature(gff[typeColumn],
+ gff[SOURCE_COL], start, end, score, group);
+
+ sf.setStrand(gff[STRAND_COL]);
+
+ sf.setPhase(gff[PHASE_COL]);
if (attributes != null)
{
/*
- * save 'raw' column 9 to allow roundtrip output as input
- */
- sf.setAttributes(gff[ATTRIBUTES_COL]);
-
- /*
* Add attributes in column 9 to the sequence feature's
- * 'otherData' table; use Note as a best proxy for description
+ * 'otherData' table; use Note as a best proxy for description;
+ * decode any encoded comma, equals, semi-colon as per GFF3 spec
*/
for (Entry> attr : attributes.entrySet())
{
- String values = StringUtils.listToDelimitedString(
- attr.getValue(), "; ");
- sf.setValue(attr.getKey(), values);
- if (NOTE.equals(attr.getKey()))
+ String key = attr.getKey();
+ List values = attr.getValue();
+ if (values.size() == 1 && values.get(0).contains(EQUALS))
+ {
+ /*
+ * 'value' is actually nested subattributes as x=a,y=b,z=c
+ */
+ Map valueMap = parseAttributeMap(values.get(0));
+ sf.setValue(key, valueMap);
+ }
+ else
{
- sf.setDescription(values);
+ String csvValues = StringUtils.listToDelimitedString(values,
+ COMMA);
+ csvValues = StringUtils.urlDecode(csvValues, GFF_ENCODABLE);
+ sf.setValue(key, csvValues);
+ if (NOTE.equals(key))
+ {
+ sf.setDescription(csvValues);
+ }
}
}
}
@@ -359,18 +445,109 @@ public abstract class GffHelperBase implements GffHelperI
return sf;
} catch (NumberFormatException nfe)
{
- System.err.println("Invalid number in gff: " + nfe.getMessage());
+ jalview.bin.Console
+ .errPrintln("Invalid number in gff: " + nfe.getMessage());
return null;
}
}
/**
- * Returns the character used to separate attributes names from values in GFF
- * column 9. This is space for GFF2, '=' for GFF3.
+ * Parses a (GFF3 format) list of comma-separated key=value pairs into a Map
+ * of {@code key, value}. A value may itself contain commas.
+ * <br>
+ * An input string like {@code a=b,c,d=e,f=g,h} is parsed to
+ *
+ * <pre>
+ * a = "b,c"
+ * d = "e"
+ * f = "g,h"
+ * </pre>
+ *
+ * @param s the text to parse; if badly formatted, an empty map is returned
*
* @return
*/
- protected abstract char getNameValueSeparator();
+ protected static Map parseAttributeMap(String s)
+ {
+ Map map = new HashMap<>();
+ String[] fields = s.split(EQUALS);
+
+ /*
+ * format validation
+ */
+ boolean valid = true;
+ if (fields.length < 2)
+ {
+ /*
+ * need at least A=B here
+ */
+ valid = false;
+ }
+ else if (fields[0].isEmpty() || fields[0].contains(COMMA))
+ {
+ /*
+ * A,B=C is not a valid start, nor is =C
+ */
+ valid = false;
+ }
+ else
+ {
+ for (int i = 1; i < fields.length - 1; i++)
+ {
+ if (fields[i].isEmpty() || !fields[i].contains(COMMA))
+ {
+ /*
+ * intermediate tokens must include value,name
+ */
+ valid = false;
+ }
+ }
+ }
+
+ if (!valid)
+ {
+ jalview.bin.Console.errPrintln(INVALID_GFF_ATTRIBUTE_FORMAT + s);
+ return map;
+ }
+
+ int i = 0;
+ while (i < fields.length - 1)
+ {
+ boolean lastPair = i == fields.length - 2;
+ String before = fields[i];
+ String after = fields[i + 1];
+
+ /*
+ * if 'key' looks like a,b,c then the last token is the
+ * key
+ */
+ String theKey = before.contains(COMMA)
+ ? before.substring(before.lastIndexOf(COMMA) + 1)
+ : before;
+
+ theKey = theKey.trim();
+ if (theKey.isEmpty())
+ {
+ jalview.bin.Console.errPrintln(INVALID_GFF_ATTRIBUTE_FORMAT + s);
+ map.clear();
+ return map;
+ }
+
+ /*
+ * if 'value' looks like a,b,c then all but the last token is the value,
+ * unless this is the last field (no more = to follow), in which case
+ * all of it makes up the value
+ */
+ String theValue = after.contains(COMMA) && !lastPair
+ ? after.substring(0, after.lastIndexOf(COMMA))
+ : after;
+ map.put(StringUtils.urlDecode(theKey, GFF_ENCODABLE),
+ StringUtils.urlDecode(theValue, GFF_ENCODABLE));
+ i += 1;
+ }
+
+ return map;
+ }
/**
* Returns any existing mapping held on the alignment between the given
@@ -383,7 +560,8 @@ public abstract class GffHelperBase implements GffHelperI
* @param toSeq
* @return
*/
- protected AlignedCodonFrame getMapping(AlignmentI align, SequenceI fromSeq, SequenceI toSeq)
+ protected AlignedCodonFrame getMapping(AlignmentI align,
+ SequenceI fromSeq, SequenceI toSeq)
{
AlignedCodonFrame acf = align.getMapping(fromSeq, toSeq);
if (acf == null)