{
private static final String QUOTE = "\"";
+ private static final String DOUBLED_QUOTE = QUOTE + QUOTE;
+
/**
* A data bean class to hold values parsed from one CDS Feature (FT)
*/
super(false, fp); // don't parse immediately
this.sourceDb = sourceId;
dbrefs = new ArrayList<>();
-
+
/*
* using TreeMap gives CDS sequences in alphabetical, so readable, order
*/
CdsData data = new CdsData();
data.cdsLocation = tokens[2];
+ // TODO location can be over >1 line e.g. EAW51554
line = nextLine();
while (line != null)
/*
* extract qualifier, e.g. FT /protein_id="CAA37824.1"
+ * - the value may extend over more than one line
+ * - if the value has enclosing quotes, these are removed
+ * - escaped double quotes ("") are reduced to a single character
*/
int slashPos = line.indexOf('/');
if (slashPos == -1)
{
Cache.log.error("Unexpected EMBL line ignored: " + line);
+ line = nextLine();
continue;
}
int eqPos = line.indexOf('=', slashPos + 1);
if (eqPos == -1)
{
// can happen, e.g. /ribosomal_slippage
-// Cache.log.error("Unexpected EMBL line ignored: " + line);
+ // Cache.log.error("Unexpected EMBL line ignored: " + line);
line = nextLine();
continue;
}
String qualifier = line.substring(slashPos + 1, eqPos);
String value = line.substring(eqPos + 1);
- if (value.startsWith(QUOTE) && value.endsWith(QUOTE))
- {
- value = value.substring(1, value.length() - 1);
- }
+ value = removeQuotes(value);
+ StringBuilder sb = new StringBuilder().append(value);
+ line = parseFeatureQualifier(sb, qualifier);
+ String featureValue = sb.toString();
if ("protein_id".equals(qualifier))
{
- data.proteinId = value;
- line = nextLine();
+ data.proteinId = featureValue;
}
else if ("codon_start".equals(qualifier))
{
try
{
- data.codonStart = Integer.parseInt(value.trim());
+ data.codonStart = Integer.parseInt(featureValue.trim());
} catch (NumberFormatException e)
{
Cache.log.error("Invalid codon_start in XML for " + this.accession
+ ": " + e.getMessage());
}
- line = nextLine();
}
else if ("db_xref".equals(qualifier))
{
- String[] parts = value.split(":");
+ String[] parts = featureValue.split(":");
if (parts.length == 2)
{
String db = parts[0].trim();
DBRefEntry dbref = new DBRefEntry(db, "0", parts[1].trim());
data.xrefs.add(dbref);
}
- line = nextLine();
}
else if ("product".equals(qualifier))
{
- // sometimes name is returned e.g. for V00488
- data.proteinName = value;
- line = nextLine();
+ data.proteinName = featureValue;
}
else if ("translation".equals(qualifier))
{
- line = parseTranslation(value, data);
+ data.translation = featureValue;
}
- else if (!"".equals(value))
+ else if (!"".equals(featureValue))
{
// throw anything else into the additional properties hash
- data.cdsProps.put(qualifier, value);
- line = nextLine();
+ data.cdsProps.put(qualifier, featureValue);
}
}
}
/**
- * Reads and returns the CDS translation from one or more lines of the file,
- * and returns the next line after that
+ * Removes an enclosing double quote (") from the start and/or the end of the
+ * value, and changes any 'escaped' (doubled) double quotes to single
+ * characters, as per the Feature Table specification for Qualifiers, Free
+ * Text. The value may be one fragment of a multi-line qualifier value, so it
+ * may have an enclosing quote at one end only, or at neither end.
+ * <p>
+ * A run of quotes at either end of the value is taken to include an enclosing
+ * quote only if its length is odd, so that <code>"say ""hi"""</code> gives
+ * <code>say "hi"</code>. An even-length run (including a value which is just
+ * two quotes) is treated as escaped quotes only.
*
* @param value
- * the first line of the translation (likely quoted)
- * @param data
+ * the text to process (may be null)
* @return
- * @throws IOException
+ * the text with enclosing quotes removed and doubled quotes reduced to
+ * one, or null if the value is null
*/
- String parseTranslation(String value, CdsData data) throws IOException
+ static String removeQuotes(String value)
{
- StringBuilder sb = new StringBuilder(this.length / 3 + 1);
- sb.append(value.replace(QUOTE, ""));
+ if (value == null)
+ {
+ return null;
+ }
+ final char quote = QUOTE.charAt(0);
+ final int length = value.length();
+
+ /*
+ * an odd-length run of quotes at the start is an enclosing quote
+ * followed by zero or more escaped (doubled) quotes
+ */
+ int leading = 0;
+ while (leading < length && value.charAt(leading) == quote)
+ {
+ leading++;
+ }
+ int start = leading % 2;
+
+ /*
+ * likewise at the end, but without re-counting a quote already
+ * identified as the enclosing quote at the start
+ */
+ int trailing = 0;
+ while (length - trailing > start
+ && value.charAt(length - 1 - trailing) == quote)
+ {
+ trailing++;
+ }
+ int end = length - (trailing % 2);
+
+ return value.substring(start, end).replace(DOUBLED_QUOTE, QUOTE);
+ }
+ /**
+ * Reads the rest of the value of a feature (FT) qualifier from any
+ * continuation lines of the file, and returns the next line after that (or
+ * null if there are no more lines). Text is appended to the string buffer,
+ * which should be already primed with the value read from the first line for
+ * the qualifier (with any leading double quote removed). All of the text on a
+ * continuation line is used, not just its first word. Enclosing double quotes
+ * are removed, and escaped (repeated) double quotes reduced to one only. For
+ * example for
+ *
+ * <pre>
+ * FT /note="gene_id=hCG28070.3
+ * FT ""foobar"" isoform=CRA_b"
+ * the buffer then holds
+ * gene_id=hCG28070.3 "foobar" isoform=CRA_b
+ * </pre>
+ *
+ * Reading stops at the first line which does not start with FT, or has fewer
+ * than two tokens (this is logged as an error), or whose second token starts
+ * with '/' (the next qualifier).
+ * <p>
+ * Note the side-effect of this method, to advance data reading to the next
+ * line after the feature qualifier.
+ * <p>
+ * NOTE(review): a line which starts a new feature (e.g. "FT gene ...") is not
+ * distinguished from a continuation line here - confirm whether this can
+ * follow a qualifier in the data being read.
+ *
+ * @param sb
+ * a string buffer primed with the first line of the value
+ * @param qualifierName
+ * the qualifier name without its leading '/'; for "translation" no
+ * space is inserted where lines are joined
+ * @return the line following the qualifier's value, or null if none
+ * @throws IOException
+ * declared for the call that reads the next line
+ */
+ String parseFeatureQualifier(StringBuilder sb, String qualifierName)
+ throws IOException
+ {
String line;
while ((line = nextLine()) != null)
{
+ if (!line.startsWith("FT"))
+ {
+ break; // no longer in the feature table
+ }
String[] tokens = line.split(WHITESPACE);
if (tokens.length < 2)
{
- Cache.log.error("Ignoring bad EMBL line: " + line);
+ Cache.log.error("Ignoring bad EMBL line for " + this.accession
+ + ": " + line);
break;
}
if (tokens[1].startsWith("/"))
{
break; // next feature qualifier
}
- sb.append(tokens[1].replace(QUOTE, ""));
- }
- data.translation = sb.toString();
+ /*
+ * take all of the text after the line code, not only the first
+ * token, so that multi-word continuation lines are not truncated;
+ * then remove any trailing " and unescape doubled ""
+ */
+ int textStart = line.indexOf(tokens[1], tokens[0].length());
+ String data = removeQuotes(line.substring(textStart).trim()).trim();
+ if (data.isEmpty())
+ {
+ continue; // e.g. a line holding only the closing quote
+ }
+
+ /*
+ * heuristic rule: most multi-line values (e.g. /product) are text,
+ * so add a space for word boundary at a new line; not for translation
+ */
+ if (!"translation".equals(qualifierName))
+ {
+ sb.append(" ");
+ }
+ sb.append(data);
+ }
return line;
}
*/
void buildSequence()
{
+ if (this.accession == null || this.sequenceString == null)
+ {
+ Cache.log.error("Failed to parse data from EMBL");
+ return;
+ }
+
String name = this.accession;
if (this.sourceDb != null)
{
map.setMappedFromId(data.proteinId);
dnaToEmblProteinRef.setMap(map);
dna.addDBRef(dnaToEmblProteinRef);
- }
+ }
/*
* comment brought forward from EmblXmlSource, lines 447-451:
public void testParse_noUniprotXref() throws IOException
{
// MN908947 cut down to 40BP, one CDS, length 5 peptide for test purposes
+ // plus an additional (invented) test case:
+ // - multi-line /product qualifier including escaped quotes
String data = "ID MN908947; SV 3; linear; genomic RNA; STD; VRL; 20 BP.\n"
+ "DE Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1,\n"
+ "FT CDS 3..17\n"
+ "FT /protein_id=\"QHD43415.1\"\n"
- + "FT /product=\"orf1ab polyprotein\"\n"
+ + "FT /product=\"orf1ab polyprotein\n"
+ + "FT \"\"foobar\"\" \"\n"
+ "FT /translation=\"MRKLD\n"
+ "SQ Sequence 7496 BP; 2450 A; 1290 C; 1434 G; 2322 T; 0 other;\n"
+ " ggatGcgtaa gttagacgaa attttgtctt tgcgcacaga 40\n";
mapping = dbref.getMap();
SequenceI mapTo = mapping.getTo();
assertEquals(mapTo.getName(), "QHD43415.1");
- assertEquals(mapTo.getDescription(), "orf1ab polyprotein");
+ // the /product qualifier transfers to protein product description
+ assertEquals(mapTo.getDescription(), "orf1ab polyprotein \"foobar\"");
assertEquals(mapTo.getSequenceAsString(), "MRKLD");
map = mapping.getMap();
assertEquals(map.getFromLowest(), 3);
truncated = EmblFlatFile.adjustForProteinLength(7, exons);
assertSame(exons, truncated);
}
+
+ /**
+ * Tests removal of enclosing double quotes and unescaping of doubled double
+ * quotes, for whole values and for fragments of a multi-line value
+ */
+ @Test(groups = "Functional")
+ public void testRemoveQuotes()
+ {
+ assertNull(EmblFlatFile.removeQuotes(null));
+ assertEquals(EmblFlatFile.removeQuotes(""), "");
+ assertEquals(EmblFlatFile.removeQuotes("No quotes here"), "No quotes here");
+ assertEquals(EmblFlatFile.removeQuotes("\"Enclosing quotes\""), "Enclosing quotes");
+ assertEquals(EmblFlatFile.removeQuotes("\"Escaped \"\"quotes\"\" example\""), "Escaped \"quotes\" example");
+
+ /*
+ * fragments of a multi-line value: an enclosing quote at one end only,
+ * or escaped quotes at both ends and no enclosing quote
+ */
+ assertEquals(EmblFlatFile.removeQuotes("\"first line"), "first line");
+ assertEquals(EmblFlatFile.removeQuotes("last line\""), "last line");
+ assertEquals(EmblFlatFile.removeQuotes("\"\"foobar\"\""), "\"foobar\"");
+ }
}