JAL-2996 JAL-3053: the characters ~ | : [ ] { } ( ) are now treated as gap characters.

[jalview.git] / src / jalview / util / Comparison.java
diff --git a/src/jalview/util/Comparison.java b/src/jalview/util/Comparison.java

index 17d3a70..aa6bcd4 100644 (file)
--- a/src/jalview/util/Comparison.java
+++ b/src/jalview/util/Comparison.java
@@ -40,9 +40,29 @@ public class Comparison
  
    public static final char GAP_DASH = '-';
  
+  public static final char GAP_TILDE = '~';
+
+  public static final char GAP_PIPE = '|';
+
+  public static final char GAP_COLON = ':';
+
+  public static final char GAP_LPAREN = '(';
+
+  public static final char GAP_RPAREN = ')';
+
+  public static final char GAP_LSQBR = '[';
+
+  public static final char GAP_RSQBR = ']';
+
+  public static final char GAP_LBRACE = '{';
+
+  public static final char GAP_RBRACE = '}';
+
    public static final String GapChars = new String(
            new char[]
-          { GAP_SPACE, GAP_DOT, GAP_DASH });
+          { GAP_SPACE, GAP_DOT, GAP_DASH, GAP_TILDE, GAP_PIPE, GAP_COLON,
+              GAP_LPAREN,
+              GAP_RPAREN, GAP_LSQBR, GAP_RSQBR, GAP_LBRACE, GAP_RBRACE });
  
    /**
     * DOCUMENT ME!
@@ -256,7 +276,24 @@ public class Comparison
     */
    public static final boolean isGap(char c)
    {
-    return (c == GAP_DASH || c == GAP_DOT || c == GAP_SPACE) ? true : false;
+    switch (c)
+    {
+    case GAP_SPACE:
+    case GAP_DOT:
+    case GAP_DASH:
+    case GAP_TILDE:
+    case GAP_PIPE:
+    case GAP_COLON:
+    case GAP_LPAREN:
+    case GAP_RPAREN:
+    case GAP_LSQBR:
+    case GAP_RSQBR:
+    case GAP_LBRACE:
+    case GAP_RBRACE:
+      return true;
+    default:
+      return false;
+    }
    }
  
    /**
@@ -285,35 +322,10 @@ public class Comparison
      {
        return false;
      }
-    char[][] letters = new char[seqs.length][];
-    for (int i = 0; i < seqs.length; i++)
-    {
-      if (seqs[i] != null)
-      {
-        char[] sequence = seqs[i].getSequence();
-        if (sequence != null)
-        {
-          letters[i] = sequence;
-        }
-      }
-    }
-
-    return areNucleotide(letters);
-  }
  
-  /**
-   * Answers true if more than 85% of the sequence residues (ignoring gaps) are
-   * A, G, C, T or U, else false. This is just a heuristic guess and may give a
-   * wrong answer (as AGCT are also amino acid codes).
-   * 
-   * @param letters
-   * @return
-   */
-  static final boolean areNucleotide(char[][] letters)
-  {
      int ntCount = 0;
      int aaCount = 0;
-    for (char[] seq : letters)
+    for (SequenceI seq : seqs)
      {
        if (seq == null)
        {
@@ -321,8 +333,10 @@ public class Comparison
        }
        // TODO could possibly make an informed guess just from the first sequence
        // to save a lengthy calculation
-      for (char c : seq)
+      int len = seq.getLength();
+      for (int i = 0; i < len; i++)
        {
+        char c = seq.getCharAt(i);
          if (isNucleotide(c))
          {
            ntCount++;
@@ -414,7 +428,7 @@ public class Comparison
      {
        return false;
      }
-    List<SequenceI> flattened = new ArrayList<SequenceI>();
+    List<SequenceI> flattened = new ArrayList<>();
      for (SequenceI[] ss : seqs)
      {
        for (SequenceI s : ss)