private static final String DOUBLED_QUOTE = QUOTE + QUOTE;
/**
+ * When true, the 'mol_type' qualifier of the 'source' feature is interpreted,
+ * and if it declares an RNA molecule the sequence is output as RNA (T replaced
+ * by U) rather than as the DNA given in the record
+ */
+ private boolean produceRna=true;
+ /**
* A data bean class to hold values parsed from one CDS Feature (FT)
*/
class CdsData
private List<DBRefEntry> dbrefs; // from DR
+ private boolean sequenceStringIsRNA=false;
private String sequenceString; // from SQ lines
/*
*
* @throws IOException
*/
+ @Override
public void parse() throws IOException
{
String line = nextLine();
String parseFT(String line) throws IOException
{
String[] tokens = line.split(WHITESPACE);
- if (tokens.length < 3 || !"CDS".equals(tokens[1]))
+ if (tokens.length < 3 || (!"CDS".equals(tokens[1]) && !"source".equals(tokens[1])))
{
return nextLine();
}
-
+
+ if (tokens[1].equals("source"))
+ {
+ return parseSourceQualifiers(tokens);
+ }
+ /*
+ * parse location - which may be over more than one line e.g. EAW51554
+ */
CdsData data = new CdsData();
- data.cdsLocation = tokens[2];
- // TODO location can be over >1 line e.g. EAW51554
+ StringBuilder sb = new StringBuilder().append(tokens[2]);
+ line = parseFeatureQualifier(sb, "CDS");
+ data.cdsLocation = sb.toString();
- line = nextLine();
while (line != null)
{
if (!line.startsWith("FT ")) // 4 spaces
String qualifier = line.substring(slashPos + 1, eqPos);
String value = line.substring(eqPos + 1);
value = removeQuotes(value);
- StringBuilder sb = new StringBuilder().append(value);
+ sb = new StringBuilder().append(value);
line = parseFeatureQualifier(sb, qualifier);
String featureValue = sb.toString();
}
/**
+ * Processes the qualifier lines of a 'source' feature, up to the next line that
+ * is not a qualifier line of this feature. Only the 'mol_type' qualifier is
+ * inspected: if its value contains "rna" (case-insensitive) the sequence is
+ * flagged as RNA, and if it contains "dna" the flag is cleared ("dna" takes
+ * precedence when both occur). All other qualifiers are read and ignored.
+ *
+ * @param tokens
+ *          the whitespace-split first line of the feature; tokens[1] must be
+ *          "source", tokens[2] is the start of the feature location
+ * @return the first line read that does not belong to this feature, or null
+ *         if the input is exhausted
+ * @throws IOException
+ *           if reading the next line fails
+ * @throws RuntimeException
+ *           if tokens[1] is not "source"
+ */
+ private String parseSourceQualifiers(String[] tokens) throws IOException
+ {
+   if (!"source".equals(tokens[1]))
+   {
+     throw (new RuntimeException("Not given a source qualifier"));
+   }
+
+   // location of the source feature (extent of sequence); continuation lines
+   // are accumulated here but the value is not used further
+   StringBuilder sb = new StringBuilder().append(tokens[2]);
+
+   String line = parseFeatureQualifier(sb, "source");
+   while (line != null)
+   {
+     if (!line.startsWith("FT    ")) // four spaces, end of this feature table
+     // entry
+     {
+       return line;
+     }
+
+     /*
+      * only the mol_type qualifier is of interest; qualifiers are introduced
+      * by a forward slash, and any other qualifier must not be tested for
+      * "rna"/"dna" (its free text could contain either by coincidence)
+      */
+     int p = line.indexOf("/mol_type");
+     if (p > -1)
+     {
+       int qs = line.indexOf('"', p);
+       int qe = (qs == -1) ? -1 : line.indexOf('"', qs + 1);
+       // tolerate a missing opening or closing quote rather than throwing
+       int valueStart = (qs == -1) ? p : qs + 1;
+       int valueEnd = (qe == -1) ? line.length() : qe;
+       String molType = line.substring(valueStart, valueEnd)
+               .toLowerCase(java.util.Locale.ROOT);
+       if (molType.contains("rna"))
+       {
+         sequenceStringIsRNA = true;
+       }
+       if (molType.contains("dna"))
+       {
+         sequenceStringIsRNA = false;
+       }
+     }
+     line = parseFeatureQualifier(sb, "source");
+   }
+   return line;
+ }
+
+ /**
* Removes leading or trailing double quotes (") unless doubled, and changes
* any 'escaped' (doubled) double quotes to single characters. As per the
* Feature Table specification for Qualifiers, Free Text.
*/
static String removeQuotes(String value)
{
- if (value == null)
+ if (value == null)
{
return null;
}
* heuristic rule: most multi-line value (e.g. /product) are text,
* so add a space for word boundary at a new line; not for translation
*/
- if (!"translation".equals(qualifierName))
+ if (!"translation".equals(qualifierName)
+ && !"CDS".equals(qualifierName))
{
sb.append(" ");
}
{
name = this.sourceDb + "|" + name;
}
+
+ if (produceRna && sequenceStringIsRNA)
+ {
+ sequenceString = sequenceString.replace('T', 'U').replace('t', 'u');
+ }
+
SequenceI seq = new Sequence(name, this.sequenceString);
seq.setDescription(this.description);
int exonLength = MappingUtils.getLength(Arrays.asList(exon));
/*
- * if exon length matches protein, or is shorter, or longer by the
- * length of a stop codon (3 bases), then leave it unchanged
+ * if exon length matches protein, or is shorter, then leave it unchanged
*/
- if (expectedCdsLength >= exonLength
- || expectedCdsLength == exonLength - 3)
+ if (expectedCdsLength >= exonLength)
{
return exon;
}