JAL-1483 PIDModel potential replacement for PIDDistanceModel (and

author gmungoc <g.m.carstairs@dundee.ac.uk>

Fri, 24 Feb 2017 15:21:41 +0000 (15:21 +0000)

committer gmungoc <g.m.carstairs@dundee.ac.uk>

Fri, 24 Feb 2017 15:21:41 +0000 (15:21 +0000)
author gmungoc <g.m.carstairs@dundee.ac.uk>
Fri, 24 Feb 2017 15:21:41 +0000 (15:21 +0000)
committer gmungoc <g.m.carstairs@dundee.ac.uk>
Fri, 24 Feb 2017 15:21:41 +0000 (15:21 +0000)
diff --git a/src/jalview/analysis/scoremodels/PIDModel.java b/src/jalview/analysis/scoremodels/PIDModel.java

new file mode 100644 (file)

index 0000000..58667c0
--- /dev/null
+++ b/src/jalview/analysis/scoremodels/PIDModel.java
@@ -0,0 +1,184 @@
+package jalview.analysis.scoremodels;
+
+import jalview.api.analysis.PairwiseScoreModelI;
+import jalview.api.analysis.SimilarityParamsI;
+import jalview.api.analysis.SimilarityScoreModelI;
+import jalview.datamodel.AlignmentView;
+import jalview.math.Matrix;
+import jalview.math.MatrixI;
+import jalview.util.Comparison;
+
+/**
+ * A class to provide sequence pairwise similarity based on residue identity
+ */
+public class PIDModel implements SimilarityScoreModelI,
+        PairwiseScoreModelI
+{
+
+  @Override
+  public String getName()
+  {
+    return "% Identity (PID)";
+  }
+
+  @Override
+  public boolean isDNA()
+  {
+    return true;
+  }
+
+  @Override
+  public boolean isProtein()
+  {
+    return true;
+  }
+
+  /**
+   * Answers 1 if c and d are the same residue (ignoring case), and not gap
+   * characters. Answers 0 for non-matching or gap characters.
+   */
+  @Override
+  public float getPairwiseScore(char c, char d)
+  {
+    c = toUpper(c);
+    d = toUpper(d);
+    if (c == d && !Comparison.isGap(c))
+    {
+      return 1f;
+    }
+    return 0f;
+  }
+
+  /**
+   * @param c
+   */
+  protected static char toUpper(char c)
+  {
+    if ('a' <= c && c <= 'z')
+    {
+      c += 'A' - 'a';
+    }
+    return c;
+  }
+
+  @Override
+  public MatrixI findSimilarities(AlignmentView seqData)
+  {
+    // TODO reuse code in ScoreMatrix instead somehow
+    String[] seqs = seqData.getSequenceStrings(' ');
+    return findSimilarities(seqs, SimilarityParams.Jalview);
+  }
+
+  /**
+   * Compute percentage identity scores, using the gap treatment and
+   * normalisation specified by the options parameter
+   * 
+   * @param seqs
+   * @param options
+   * @return
+   */
+  protected MatrixI findSimilarities(String[] seqs,
+          SimilarityParamsI options)
+  {
+    double[][] values = new double[seqs.length][];
+    for (int row = 0; row < seqs.length; row++)
+    {
+      values[row] = new double[seqs.length];
+      for (int col = 0; col < seqs.length; col++)
+      {
+        double total = computePID(seqs[row], seqs[col], options);
+        values[row][col] = total;
+      }
+    }
+    return new Matrix(values);
+  }
+
+  /**
+   * Computes a percentage identity for two sequences, using the algorithm
+   * choices specified by the options parameter
+   * 
+   * @param seq1
+   * @param seq2
+   * @param options
+   * @return
+   */
+  protected double computePID(String seq1, String seq2,
+          SimilarityParamsI options)
+  {
+    int len1 = seq1.length();
+    int len2 = seq2.length();
+    int width = Math.max(len1, len2);
+    int total = 0;
+    int divideBy = 0;
+
+    for (int i = 0; i < width; i++)
+    {
+      if (i >= len1 || i >= len2)
+      {
+        /*
+         * off the end of one sequence; stop if we are only matching
+         * on the shorter sequence length, else treat as trailing gap
+         */
+        if (options.denominateByShortestLength())
+        {
+          break;
+        }
+        if (options.denominatorIncludesGaps())
+        {
+          divideBy++;
+        }
+        if (options.matchGaps())
+        {
+          total++;
+        }
+        continue;
+      }
+      char c1 = seq1.charAt(i);
+      char c2 = seq2.charAt(i);
+      boolean gap1 = Comparison.isGap(c1);
+      boolean gap2 = Comparison.isGap(c2);
+
+      if (gap1 && gap2)
+      {
+        /*
+         * gap-gap: include if options say so, if so
+         * have to score as identity; else ignore
+         */
+        if (options.includeGappedColumns())
+        {
+          divideBy++;
+          total++;
+        }
+        continue;
+      }
+
+      if (gap1 || gap2)
+      {
+        /*
+         * gap-residue: include if options say so, 
+         * count as match if options say so
+         */
+        if (options.denominatorIncludesGaps())
+        {
+          divideBy++;
+        }
+        if (options.matchGaps())
+        {
+          total++;
+        }
+        continue;
+      }
+
+      /*
+       * remaining case is gap-residue
+       */
+      if (toUpper(c1) == toUpper(c2))
+      {
+        total++;
+      }
+      divideBy++;
+    }
+
+    return divideBy == 0 ? 0D : 100D * total / divideBy;
+  }
+}
diff --git a/test/jalview/analysis/scoremodels/PIDModelTest.java b/test/jalview/analysis/scoremodels/PIDModelTest.java

new file mode 100644 (file)

index 0000000..24b44a3
--- /dev/null
+++ b/test/jalview/analysis/scoremodels/PIDModelTest.java
@@ -0,0 +1,83 @@
+package jalview.analysis.scoremodels;
+
+import static org.testng.Assert.assertEquals;
+
+import jalview.util.Comparison;
+
+import org.testng.annotations.Test;
+
+public class PIDModelTest
+{
+  private static final double DELTA = 0.00001D;
+
+  @Test(groups = "Functional")
+  public void testGetPairwiseScore()
+  {
+    PIDModel sm = new PIDModel();
+    assertEquals(sm.getPairwiseScore('A', 'A'), 1f);
+    assertEquals(sm.getPairwiseScore('A', 'a'), 1f);
+    assertEquals(sm.getPairwiseScore('a', 'A'), 1f);
+    assertEquals(sm.getPairwiseScore('A', 'B'), 0f);
+    assertEquals(sm.getPairwiseScore('A', ' '), 0f);
+    assertEquals(sm.getPairwiseScore(' ', ' '), 0f);
+    assertEquals(sm.getPairwiseScore('.', '.'), 0f);
+    assertEquals(sm.getPairwiseScore('-', '-'), 0f);
+  }
+
+  /**
+   * Regression test to verify that a (suitably configured) PIDModel computes
+   * the same percentage identities as the Comparison.PID method
+   */
+  @Test(groups = "Functional")
+  public void testComputePID_matchesComparisonPID()
+  {
+    /*
+     * same length, no gaps
+     */
+    String s1 = "ARFNQDWSGI";
+    String s2 = "ARKNQDQSGI";
+    double newScore = new PIDModel().computePID(s1, s2,
+            SimilarityParams.Jalview);
+    double oldScore = Comparison.PID(s1, s2);
+    assertEquals(newScore, oldScore, DELTA);
+
+    /*
+     * same length, with gaps
+     */
+    s1 = "-RFNQDWSGI";
+    s2 = "ARKNQ-QSGI";
+    newScore = new PIDModel().computePID(s1, s2,
+            SimilarityParams.Jalview);
+    oldScore = Comparison.PID(s1, s2);
+    assertEquals(newScore, oldScore, DELTA);
+
+    /*
+     * s2 longer than s1, with gaps
+     */
+    s1 = "ARK-";
+    s2 = "-RFNQ";
+    newScore = new PIDModel().computePID(s1, s2,
+            SimilarityParams.Jalview);
+    oldScore = Comparison.PID(s1, s2);
+    assertEquals(newScore, oldScore, DELTA);
+
+    /*
+     * s1 longer than s2, with gaps
+     */
+    s1 = "-RFNQ";
+    s2 = "ARK-";
+    newScore = new PIDModel().computePID(s1, s2,
+            SimilarityParams.Jalview);
+    oldScore = Comparison.PID(s1, s2);
+    assertEquals(newScore, oldScore, DELTA);
+
+    /*
+     * same but now also with gapped columns
+     */
+    s1 = "-R-F-NQ";
+    s2 = "AR-K--";
+    newScore = new PIDModel().computePID(s1, s2, SimilarityParams.Jalview);
+    oldScore = Comparison.PID(s1, s2);
+    assertEquals(newScore, oldScore, DELTA);
+  }
+}
author	gmungoc <g.m.carstairs@dundee.ac.uk>
	Fri, 24 Feb 2017 15:21:41 +0000 (15:21 +0000)
committer	gmungoc <g.m.carstairs@dundee.ac.uk>
	Fri, 24 Feb 2017 15:21:41 +0000 (15:21 +0000)
src/jalview/analysis/scoremodels/PIDModel.java	[new file with mode: 0644]	patch \| blob
test/jalview/analysis/scoremodels/PIDModelTest.java	[new file with mode: 0644]	patch \| blob