JAL-3821 rough and ready patch translates t/T to u/U when mol_type includes rna

[jalview.git] / src / jalview / io / EmblFlatFile.java
diff --git a/src/jalview/io/EmblFlatFile.java b/src/jalview/io/EmblFlatFile.java

index 900aef8..91bf780 100644 (file)
--- a/src/jalview/io/EmblFlatFile.java
+++ b/src/jalview/io/EmblFlatFile.java
@@ -50,6 +50,11 @@ public class EmblFlatFile extends AlignFile // FileParse
    private static final String DOUBLED_QUOTE = QUOTE + QUOTE;
  
    /**
+   * when true, interpret the mol_type 'source' feature attribute
+   * and generate an RNA sequence from the DNA record
+   */
+  private boolean produceRna=true;
+  /**
     * A data bean class to hold values parsed from one CDS Feature (FT)
     */
    class CdsData
@@ -86,6 +91,7 @@ public class EmblFlatFile extends AlignFile // FileParse
  
    private List<DBRefEntry> dbrefs; // from DR
  
+  private boolean sequenceStringIsRNA=false;
    private String sequenceString; // from SQ lines
  
    /*
@@ -118,6 +124,7 @@ public class EmblFlatFile extends AlignFile // FileParse
     * 
     * @throws IOException
     */
+  @Override
    public void parse() throws IOException
    {
      String line = nextLine();
@@ -317,16 +324,23 @@ public class EmblFlatFile extends AlignFile // FileParse
    String parseFT(String line) throws IOException
    {
      String[] tokens = line.split(WHITESPACE);
-    if (tokens.length < 3 || !"CDS".equals(tokens[1]))
+    if (tokens.length < 3 || (!"CDS".equals(tokens[1]) && !"source".equals(tokens[1])))
      {
        return nextLine();
      }
-
+    
+    if (tokens[1].equals("source"))
+    {
+      return parseSourceQualifiers(tokens);
+    }
+    /*
+     * parse location - which may be over more than one line e.g. EAW51554
+     */
      CdsData data = new CdsData();
-    data.cdsLocation = tokens[2];
-    // TODO location can be over >1 line e.g. EAW51554
+    StringBuilder sb = new StringBuilder().append(tokens[2]);
+    line = parseFeatureQualifier(sb, "CDS");
+    data.cdsLocation = sb.toString();
  
-    line = nextLine();
      while (line != null)
      {
        if (!line.startsWith("FT    ")) // 4 spaces
@@ -359,7 +373,7 @@ public class EmblFlatFile extends AlignFile // FileParse
        String qualifier = line.substring(slashPos + 1, eqPos);
        String value = line.substring(eqPos + 1);
        value = removeQuotes(value);
-      StringBuilder sb = new StringBuilder().append(value);
+      sb = new StringBuilder().append(value);
        line = parseFeatureQualifier(sb, qualifier);
        String featureValue = sb.toString();
  
@@ -418,6 +432,50 @@ public class EmblFlatFile extends AlignFile // FileParse
    }
  
    /**
+   * process attributes for 'source' until the next FT feature entry
+   * only interested in 'mol_type'
+   * @param tokens
+   * @return
+   * @throws IOException
+   */
+  private String parseSourceQualifiers(String[] tokens) throws IOException
+  {
+    if (!"source".equals(tokens[1]))
+    {
+      throw (new RuntimeException("Not given a source qualifier"));
+    }
+    // search for mol_type attribute
+
+    StringBuilder sb = new StringBuilder().append(tokens[2]); // extent of
+                                                              // sequence
+
+    String line = parseFeatureQualifier(sb, "source");
+    while (line != null)
+    {
+      if (!line.startsWith("FT    ")) // four spaces, end of this feature table
+                                      // entry
+      {
+        return line;
+      }
+
+      int p = line.indexOf("\\mol_type");
+      int qs = line.indexOf("\"", p);
+      int qe = line.indexOf("\"", qs + 1);
+      String qualifier=line.substring(qs,qe).toLowerCase();
+      if (qualifier.indexOf("rna") > -1)
+      {
+        sequenceStringIsRNA = true;
+      }
+      if (qualifier.indexOf("dna") > -1)
+      {
+        sequenceStringIsRNA = false;
+      }
+      line=parseFeatureQualifier(sb, "source");
+    }
+    return line;
+  }
+
+  /**
     * Removes leading or trailing double quotes (") unless doubled, and changes
     * any 'escaped' (doubled) double quotes to single characters. As per the
     * Feature Table specification for Qualifiers, Free Text.
@@ -427,7 +485,7 @@ public class EmblFlatFile extends AlignFile // FileParse
     */
    static String removeQuotes(String value)
    {
-    if (value == null) 
+    if (value == null)
      {
        return null;
      }
@@ -493,7 +551,8 @@ public class EmblFlatFile extends AlignFile // FileParse
         * heuristic rule: most multi-line value (e.g. /product) are text,
         * so add a space for word boundary at a new line; not for translation
         */
-      if (!"translation".equals(qualifierName))
+      if (!"translation".equals(qualifierName)
+              && !"CDS".equals(qualifierName))
        {
          sb.append(" ");
        }
@@ -524,6 +583,12 @@ public class EmblFlatFile extends AlignFile // FileParse
      {
        name = this.sourceDb + "|" + name;
      }
+    
+    if (produceRna && sequenceStringIsRNA)
+    {
+      sequenceString = sequenceString.replace('T', 'U').replace('t', 'u');
+    }
+    
      SequenceI seq = new Sequence(name, this.sequenceString);
      seq.setDescription(this.description);
  
@@ -836,11 +901,9 @@ public class EmblFlatFile extends AlignFile // FileParse
      int exonLength = MappingUtils.getLength(Arrays.asList(exon));
  
      /*
-     * if exon length matches protein, or is shorter, or longer by the 
-     * length of a stop codon (3 bases), then leave it unchanged
+     * if exon length matches protein, or is shorter, then leave it unchanged
       */
-    if (expectedCdsLength >= exonLength
-            || expectedCdsLength == exonLength - 3)
+    if (expectedCdsLength >= exonLength)
      {
        return exon;
      }