JAL-3532 TODO comment

[jalview.git] / src / jalview / io / StockholmFile.java
diff --git a/src/jalview/io/StockholmFile.java b/src/jalview/io/StockholmFile.java

index f5b5177..b9f1274 100644 (file)
--- a/src/jalview/io/StockholmFile.java
+++ b/src/jalview/io/StockholmFile.java
@@ -28,12 +28,14 @@ import jalview.datamodel.AlignmentAnnotation;
  import jalview.datamodel.AlignmentI;
  import jalview.datamodel.Annotation;
  import jalview.datamodel.DBRefEntry;
+import jalview.datamodel.DBRefSource;
  import jalview.datamodel.Mapping;
  import jalview.datamodel.Sequence;
  import jalview.datamodel.SequenceFeature;
  import jalview.datamodel.SequenceI;
  import jalview.schemes.ResidueProperties;
  import jalview.util.Comparison;
+import jalview.util.DBRefUtils;
  import jalview.util.Format;
  import jalview.util.MessageManager;
  
@@ -83,6 +85,14 @@ public class StockholmFile extends AlignFile
    public static final Regex DETECT_BRACKETS = new Regex(
            "(<|>|\\[|\\]|\\(|\\)|\\{|\\})");
  
+  // WUSS extended symbols. Avoid ambiguity with protein SS annotations by using NOT_RNASS first.
+  public static final String RNASS_BRACKETS = "<>[](){}AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz";
+
+  // matches a (whole) annotation line that is NOT an RNA SS: one containing only
+  // E,H,e,h and other non-brace/non-alpha chars (every other letter is excluded)
+  private static final Regex NOT_RNASS = new Regex(
+          "^[^<>[\\](){}A-DFGI-Za-dfgi-z]*$");
+
    StringBuffer out; // output buffer
  
    AlignmentI al;
@@ -197,7 +207,7 @@ public class StockholmFile extends AlignFile
      String version;
      // String id;
      Hashtable seqAnn = new Hashtable(); // Sequence related annotations
-    LinkedHashMap<String, String> seqs = new LinkedHashMap<String, String>();
+    LinkedHashMap<String, String> seqs = new LinkedHashMap<>();
      Regex p, r, rend, s, x;
      // Temporary line for processing RNA annotation
      // String RNAannot = "";
@@ -519,6 +529,9 @@ public class StockholmFile extends AlignFile
                treeName = an.stringMatched(2);
                treeString = new StringBuffer();
              }
+            // TODO: JAL-3532 - this is where GF comments and database references are
+            // lost. Suggest overriding this method for Stockholm files to catch and
+            // properly process CC, DR etc. into multivalued properties.
              setAlignmentProperty(an.stringMatched(1), an.stringMatched(2));
            }
          }
@@ -658,7 +671,7 @@ public class StockholmFile extends AlignFile
                strucAnn = new Hashtable();
              }
  
-            Vector<AlignmentAnnotation> newStruc = new Vector<AlignmentAnnotation>();
+            Vector<AlignmentAnnotation> newStruc = new Vector<>();
              parseAnnotationRow(newStruc, type, ns);
              for (AlignmentAnnotation alan : newStruc)
              {
@@ -710,7 +723,7 @@ public class StockholmFile extends AlignFile
    private void guessDatabaseFor(Sequence seqO, String dbr, String dbsource)
    {
      DBRefEntry dbrf = null;
-    List<DBRefEntry> dbrs = new ArrayList<DBRefEntry>();
+    List<DBRefEntry> dbrs = new ArrayList<>();
      String seqdb = "Unknown", sdbac = "" + dbr;
      int st = -1, en = -1, p;
      if ((st = sdbac.indexOf("/")) > -1)
@@ -824,9 +837,14 @@ public class StockholmFile extends AlignFile
      }
      boolean ss = false, posterior = false;
      type = id2type(type);
+
+    boolean isrnass = false;
      if (type.equalsIgnoreCase("secondary structure"))
      {
        ss = true;
+      isrnass = !NOT_RNASS.search(annots); // sorry about the double negative
+                                           // here (it's easier for dealing with
+                                           // other non-alpha-non-brace chars)
      }
      if (type.equalsIgnoreCase("posterior probability"))
      {
@@ -844,7 +862,7 @@ public class StockholmFile extends AlignFile
        {
          // if (" .-_".indexOf(pos) == -1)
          {
-          if (DETECT_BRACKETS.search(pos))
+          if (isrnass && RNASS_BRACKETS.indexOf(pos) >= 0)
            {
              ann.secondaryStructure = Rna.getRNASecStrucState(pos).charAt(0);
              ann.displayCharacter = "" + pos.charAt(0);
@@ -917,6 +935,11 @@ public class StockholmFile extends AlignFile
      return annot;
    }
  
+  private String dbref_to_ac_record(DBRefEntry ref) // formats ref as "<source> ; <accession>"
+  { // both getters are dereferenced below, so a null source or accession id throws here
+    return ref.getSource().toString() + " ; "
+            + ref.getAccessionId().toString();
+  }
    @Override
    public String print(SequenceI[] s, boolean jvSuffix)
    {
@@ -931,6 +954,7 @@ public class StockholmFile extends AlignFile
      Hashtable dataRef = null;
      while ((in < s.length) && (s[in] != null))
      {
+      boolean isAA = s[in].isProtein();
        String tmp = printId(s[in], jvSuffix);
        max = Math.max(max, s[in].getLength());
  
@@ -940,17 +964,33 @@ public class StockholmFile extends AlignFile
        }
        if (s[in].getDBRefs() != null)
        {
-        for (int idb = 0; idb < s[in].getDBRefs().length; idb++)
+        if (dataRef == null)
+        {
+          dataRef = new Hashtable();
+        }
+        List<DBRefEntry> primrefs = s[in].getPrimaryDBRefs();
+        if (primrefs.size() >= 1)
+        {
+          dataRef.put(tmp, dbref_to_ac_record(primrefs.get(0)));
+        }
+        else
          {
-          if (dataRef == null)
+          for (int idb = 0; idb < s[in].getDBRefs().length; idb++)
            {
-            dataRef = new Hashtable();
+            DBRefEntry dbref = s[in].getDBRefs()[idb];
+            dataRef.put(tmp, dbref_to_ac_record(dbref));
+            // if we put in a uniprot or EMBL record then we're done:
+            if (isAA && DBRefSource.UNIPROT
+                    .equals(DBRefUtils.getCanonicalName(dbref.getSource())))
+            {
+              break;
+            }
+            if (!isAA && DBRefSource.EMBL
+                    .equals(DBRefUtils.getCanonicalName(dbref.getSource())))
+            {
+              break;
+            }
            }
-
-          String datAs1 = s[in].getDBRefs()[idb].getSource().toString()
-                  + " ; "
-                  + s[in].getDBRefs()[idb].getAccessionId().toString();
-          dataRef.put(tmp, datAs1);
          }
        }
        in++;
@@ -1114,22 +1154,36 @@ public class StockholmFile extends AlignFile
      String ch = (annot == null)
              ? ((sequenceI == null) ? "-"
                      : Character.toString(sequenceI.getCharAt(k)))
-            : annot.displayCharacter;
+            : (annot.displayCharacter == null
+                    ? String.valueOf(annot.secondaryStructure)
+                    : annot.displayCharacter);
+    if (ch == null)
+    {
+      ch = " ";
+    }
      if (key != null && key.equals("SS"))
      {
+      char ssannotchar = ' ';
+      boolean charset = false;
        if (annot == null)
        {
          // sensible gap character
-        return ' ';
+        ssannotchar = ' ';
+        charset = true;
        }
        else
        {
          // valid secondary structure AND no alternative label (e.g. ' B')
          if (annot.secondaryStructure > ' ' && ch.length() < 2)
          {
-          return annot.secondaryStructure;
+          ssannotchar = annot.secondaryStructure;
+          charset = true;
          }
        }
+      if (charset)
+      {
+        return (ssannotchar == ' ' && isrna) ? '.' : ssannotchar;
+      }
      }
  
      if (ch.length() == 0)
@@ -1144,7 +1198,9 @@ public class StockholmFile extends AlignFile
      {
        seq = ch.charAt(1);
      }
-    return seq;
+
+    return (seq == ' ' && key != null && key.equals("SS") && isrna) ? '.'
+            : seq;
    }
  
    public String print()