From: gmungoc Date: Fri, 24 Feb 2017 15:21:41 +0000 (+0000) Subject: JAL-1483 PIDModel potential replacement for PIDDistanceModel (and X-Git-Tag: Release_2_10_2~3^2~105^2~2^2~88 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=5346ea15df13baf5b342e467b70358c2212fd1ef;p=jalview.git JAL-1483 PIDModel potential replacement for PIDDistanceModel (and seqspace and Comparison.PID?) --- diff --git a/src/jalview/analysis/scoremodels/PIDModel.java b/src/jalview/analysis/scoremodels/PIDModel.java new file mode 100644 index 0000000..58667c0 --- /dev/null +++ b/src/jalview/analysis/scoremodels/PIDModel.java @@ -0,0 +1,184 @@ +package jalview.analysis.scoremodels; + +import jalview.api.analysis.PairwiseScoreModelI; +import jalview.api.analysis.SimilarityParamsI; +import jalview.api.analysis.SimilarityScoreModelI; +import jalview.datamodel.AlignmentView; +import jalview.math.Matrix; +import jalview.math.MatrixI; +import jalview.util.Comparison; + +/** + * A class to provide sequence pairwise similarity based on residue identity + */ +public class PIDModel implements SimilarityScoreModelI, + PairwiseScoreModelI +{ + + @Override + public String getName() + { + return "% Identity (PID)"; + } + + @Override + public boolean isDNA() + { + return true; + } + + @Override + public boolean isProtein() + { + return true; + } + + /** + * Answers 1 if c and d are the same residue (ignoring case), and not gap + * characters. Answers 0 for non-matching or gap characters. + */ + @Override + public float getPairwiseScore(char c, char d) + { + c = toUpper(c); + d = toUpper(d); + if (c == d && !Comparison.isGap(c)) + { + return 1f; + } + return 0f; + } + + /** + * @param c + */ + protected static char toUpper(char c) + { + if ('a' <= c && c <= 'z') + { + c += 'A' - 'a'; + } + return c; + } + + @Override + public MatrixI findSimilarities(AlignmentView seqData) + { + // TODO reuse code in ScoreMatrix instead somehow + String[] seqs = seqData.getSequenceStrings(' '); + return findSimilarities(seqs, SimilarityParams.Jalview); + } + + /** + * Compute percentage identity scores, using the gap treatment and + * normalisation specified by the options parameter + * + * @param seqs + * @param options + * @return + */ + protected MatrixI findSimilarities(String[] seqs, + SimilarityParamsI options) + { + double[][] values = new double[seqs.length][]; + for (int row = 0; row < seqs.length; row++) + { + values[row] = new double[seqs.length]; + for (int col = 0; col < seqs.length; col++) + { + double total = computePID(seqs[row], seqs[col], options); + values[row][col] = total; + } + } + return new Matrix(values); + } + + /** + * Computes a percentage identity for two sequences, using the algorithm + * choices specified by the options parameter + * + * @param seq1 + * @param seq2 + * @param options + * @return + */ + protected double computePID(String seq1, String seq2, + SimilarityParamsI options) + { + int len1 = seq1.length(); + int len2 = seq2.length(); + int width = Math.max(len1, len2); + int total = 0; + int divideBy = 0; + + for (int i = 0; i < width; i++) + { + if (i >= len1 || i >= len2) + { + /* + * off the end of one sequence; stop if we are only matching + * on the shorter sequence length, else treat as trailing gap + */ + if (options.denominateByShortestLength()) + { + break; + } + if (options.denominatorIncludesGaps()) + { + divideBy++; + } + if (options.matchGaps()) + { + total++; + } + continue; + } + char c1 = seq1.charAt(i); + char c2 = seq2.charAt(i); + boolean gap1 = Comparison.isGap(c1); + boolean gap2 = Comparison.isGap(c2); + + if (gap1 && gap2) + { + /* + * gap-gap: include if options say so, if so + * have to score as identity; else ignore + */ + if (options.includeGappedColumns()) + { + divideBy++; + total++; + } + continue; + } + + if (gap1 || gap2) + { + /* + * gap-residue: include if options say so, + * count as match if options say so + */ + if (options.denominatorIncludesGaps()) + { + divideBy++; + } + if (options.matchGaps()) + { + total++; + } + continue; + } + + /* + * remaining case is gap-residue + */ + if (toUpper(c1) == toUpper(c2)) + { + total++; + } + divideBy++; + } + + return divideBy == 0 ? 0D : 100D * total / divideBy; + } +} diff --git a/test/jalview/analysis/scoremodels/PIDModelTest.java b/test/jalview/analysis/scoremodels/PIDModelTest.java new file mode 100644 index 0000000..24b44a3 --- /dev/null +++ b/test/jalview/analysis/scoremodels/PIDModelTest.java @@ -0,0 +1,83 @@ +package jalview.analysis.scoremodels; + +import static org.testng.Assert.assertEquals; + +import jalview.util.Comparison; + +import org.testng.annotations.Test; + +public class PIDModelTest +{ + private static final double DELTA = 0.00001D; + + @Test(groups = "Functional") + public void testGetPairwiseScore() + { + PIDModel sm = new PIDModel(); + assertEquals(sm.getPairwiseScore('A', 'A'), 1f); + assertEquals(sm.getPairwiseScore('A', 'a'), 1f); + assertEquals(sm.getPairwiseScore('a', 'A'), 1f); + assertEquals(sm.getPairwiseScore('A', 'B'), 0f); + assertEquals(sm.getPairwiseScore('A', ' '), 0f); + assertEquals(sm.getPairwiseScore(' ', ' '), 0f); + assertEquals(sm.getPairwiseScore('.', '.'), 0f); + assertEquals(sm.getPairwiseScore('-', '-'), 0f); + } + + /** + * Regression test to verify that a (suitably configured) PIDModel computes + * the same percentage identities as the Comparison.PID method + */ + @Test(groups = "Functional") + public void testComputePID_matchesComparisonPID() + { + /* + * same length, no gaps + */ + String s1 = "ARFNQDWSGI"; + String s2 = "ARKNQDQSGI"; + double newScore = new PIDModel().computePID(s1, s2, + SimilarityParams.Jalview); + double oldScore = Comparison.PID(s1, s2); + assertEquals(newScore, oldScore, DELTA); + + /* + * same length, with gaps + */ + s1 = "-RFNQDWSGI"; + s2 = "ARKNQ-QSGI"; + newScore = new PIDModel().computePID(s1, s2, + SimilarityParams.Jalview); + oldScore = Comparison.PID(s1, s2); + assertEquals(newScore, oldScore, DELTA); + + /* + * s2 longer than s1, with gaps + */ + s1 = "ARK-"; + s2 = "-RFNQ"; + newScore = new PIDModel().computePID(s1, s2, + SimilarityParams.Jalview); + oldScore = Comparison.PID(s1, s2); + assertEquals(newScore, oldScore, DELTA); + + /* + * s1 longer than s2, with gaps + */ + s1 = "-RFNQ"; + s2 = "ARK-"; + newScore = new PIDModel().computePID(s1, s2, + SimilarityParams.Jalview); + oldScore = Comparison.PID(s1, s2); + assertEquals(newScore, oldScore, DELTA); + + /* + * same but now also with gapped columns + */ + s1 = "-R-F-NQ"; + s2 = "AR-K--"; + newScore = new PIDModel().computePID(s1, s2, SimilarityParams.Jalview); + oldScore = Comparison.PID(s1, s2); + assertEquals(newScore, oldScore, DELTA); + } +}