From baad2f0ba2b171dd3d52c17afa46ef800334ea5e Mon Sep 17 00:00:00 2001 From: gmungoc Date: Mon, 27 Feb 2017 18:55:26 +0000 Subject: [PATCH] JAL-838 ScoreMatrix now respects SimilarityParams! --- src/jalview/analysis/scoremodels/PIDModel.java | 4 +- src/jalview/analysis/scoremodels/ScoreMatrix.java | 81 ++++++++++-- .../analysis/scoremodels/SimilarityParams.java | 2 +- src/jalview/api/analysis/PairwiseScoreModelI.java | 22 +--- src/jalview/api/analysis/SimilarityParamsI.java | 12 +- .../analysis/scoremodels/ScoreMatrixTest.java | 131 ++++++++++++++++++++ 6 files changed, 211 insertions(+), 41 deletions(-) diff --git a/src/jalview/analysis/scoremodels/PIDModel.java b/src/jalview/analysis/scoremodels/PIDModel.java index 24e9a1c..50c4a71 100644 --- a/src/jalview/analysis/scoremodels/PIDModel.java +++ b/src/jalview/analysis/scoremodels/PIDModel.java @@ -124,7 +124,7 @@ public class PIDModel implements SimilarityScoreModelI, { break; } - if (options.denominatorIncludesGaps()) + if (options.includesGaps()) { divideBy++; } @@ -159,7 +159,7 @@ public class PIDModel implements SimilarityScoreModelI, * gap-residue: include if options say so, * count as match if options say so */ - if (options.denominatorIncludesGaps()) + if (options.includesGaps()) { divideBy++; } diff --git a/src/jalview/analysis/scoremodels/ScoreMatrix.java b/src/jalview/analysis/scoremodels/ScoreMatrix.java index 84835a4..41d7383 100644 --- a/src/jalview/analysis/scoremodels/ScoreMatrix.java +++ b/src/jalview/analysis/scoremodels/ScoreMatrix.java @@ -26,6 +26,7 @@ import jalview.api.analysis.SimilarityScoreModelI; import jalview.datamodel.AlignmentView; import jalview.math.Matrix; import jalview.math.MatrixI; +import jalview.util.Comparison; import java.util.Arrays; @@ -355,28 +356,22 @@ public class ScoreMatrix implements SimilarityScoreModelI, } /** + * Computes pairwise similarities of a set of sequences using the given + * parameters + * * @param seqs + * @param params * @return */ - protected MatrixI 
findSimilarities(String[] seqs, - SimilarityParamsI options) + protected MatrixI findSimilarities(String[] seqs, SimilarityParamsI params) { - // todo use options in calculation double[][] values = new double[seqs.length][]; for (int row = 0; row < seqs.length; row++) { values[row] = new double[seqs.length]; for (int col = 0; col < seqs.length; col++) { - int total = 0; - int width = Math.min(seqs[row].length(), seqs[col].length()); - for (int i = 0; i < width; i++) - { - char c1 = seqs[row].charAt(i); - char c2 = seqs[col].charAt(i); - float score = getPairwiseScore(c1, c2); - total += score; - } + double total = computeSimilarity(seqs[row], seqs[col], params); values[row][col] = total; } } @@ -384,6 +379,68 @@ public class ScoreMatrix implements SimilarityScoreModelI, } /** + * Calculates the pairwise similarity of two strings using the given + * calculation parameters + * + * @param seq1 + * @param seq2 + * @param params + * @return + */ + protected double computeSimilarity(String seq1, String seq2, + SimilarityParamsI params) + { + int len1 = seq1.length(); + int len2 = seq2.length(); + double total = 0; + + int width = Math.max(len1, len2); + for (int i = 0; i < width; i++) + { + if (i >= len1 || i >= len2) + { + /* + * off the end of one sequence; stop if we are only matching + * on the shorter sequence length, else treat as trailing gap + */ + if (params.denominateByShortestLength()) + { + break; + } + } + // Change GAP_SPACE to GAP_DASH if we adopt - for gap in matrices + char c1 = i >= len1 ? Comparison.GAP_SPACE : seq1.charAt(i); + char c2 = i >= len2 ? 
Comparison.GAP_SPACE : seq2.charAt(i); + boolean gap1 = Comparison.isGap(c1); + boolean gap2 = Comparison.isGap(c2); + + if (gap1 && gap2) + { + /* + * gap-gap: include if options say so, else ignore + */ + if (!params.includeGappedColumns()) + { + continue; + } + } + else if (gap1 || gap2) + { + /* + * gap-residue: score if options say so + */ + if (!params.includesGaps()) + { + continue; + } + } + float score = getPairwiseScore(c1, c2); + total += score; + } + return total; + } + + /** * Answers a hashcode computed from the symbol alphabet and the matrix score * values */ diff --git a/src/jalview/analysis/scoremodels/SimilarityParams.java b/src/jalview/analysis/scoremodels/SimilarityParams.java index b6f2ba2..556cdc1 100644 --- a/src/jalview/analysis/scoremodels/SimilarityParams.java +++ b/src/jalview/analysis/scoremodels/SimilarityParams.java @@ -75,7 +75,7 @@ public class SimilarityParams implements SimilarityParamsI } @Override - public boolean denominatorIncludesGaps() + public boolean includesGaps() { return denominatorIncludesGaps; } diff --git a/src/jalview/api/analysis/PairwiseScoreModelI.java b/src/jalview/api/analysis/PairwiseScoreModelI.java index 241348c..ecada36 100644 --- a/src/jalview/api/analysis/PairwiseScoreModelI.java +++ b/src/jalview/api/analysis/PairwiseScoreModelI.java @@ -17,26 +17,6 @@ public interface PairwiseScoreModelI * @return */ abstract public float getPairwiseScore(char c, char d); - - /** - * Returns a readable name for the model, suitable for display in menus - * - * @return - */ - String getName(); - - /** - * Answers true if the model is applicable to nucleotide data - * - * @return - */ - boolean isDNA(); - - /** - * Answers true if the model is applicable to peptide data - * - * @return - */ - boolean isProtein(); + // TODO make this static when Java 8 } diff --git a/src/jalview/api/analysis/SimilarityParamsI.java b/src/jalview/api/analysis/SimilarityParamsI.java index 9ec2151..7985e8b 100644 --- 
a/src/jalview/api/analysis/SimilarityParamsI.java +++ b/src/jalview/api/analysis/SimilarityParamsI.java @@ -19,17 +19,19 @@ public interface SimilarityParamsI * * @return */ + // TODO is this specific to a PID score only? + // score matrix will compute whatever is configured for gap-residue boolean matchGaps(); /** - * Answers true if the demoninator (normalisation factor) of the score count - * includes gap-residue positions, false if it only includes residue-residue - * aligned positions. Gap-gap positions are included if this and - * includeGappedColumns both answer true. + * Answers true if gaps are included in the calculation. This may affect the + * calculated score, the denominator (normalisation factor) of the score, or + * both. Gap-gap positions are included if this and includeGappedColumns both + * answer true. * * @return */ - boolean denominatorIncludesGaps(); + boolean includesGaps(); /** * Answers true if only the shortest sequence length is used to divide the diff --git a/test/jalview/analysis/scoremodels/ScoreMatrixTest.java b/test/jalview/analysis/scoremodels/ScoreMatrixTest.java index 1076d43..85be5b7 100644 --- a/test/jalview/analysis/scoremodels/ScoreMatrixTest.java +++ b/test/jalview/analysis/scoremodels/ScoreMatrixTest.java @@ -6,6 +6,7 @@ import static org.testng.Assert.assertNotSame; import static org.testng.Assert.assertTrue; import static org.testng.internal.junit.ArrayAsserts.assertArrayEquals; +import jalview.api.analysis.SimilarityParamsI; import jalview.io.DataSourceType; import jalview.io.FileParse; import jalview.io.ScoreMatrixFile; @@ -244,4 +245,134 @@ public class ScoreMatrixTest assertTrue(sm.equals(sm2)); assertEquals(sm.hashCode(), sm2.hashCode()); } + + /** + * Tests for score matrix similarity where the longer length of two + * sequences is used + */ + @Test(groups = "Functional") + public void testcomputeSimilarity_matchLongestSequence() + { + // TODO params.matchGaps() is not used for ScoreMatrix + // - includeGaps is 
sufficient (there is no denominator) + // ==> bespoke parameters only 3 booleans? + /* + * for now, using space for gap to match callers of + * AlignmentView.getSequenceStrings() + * may change this to '-' (with corresponding change to matrices) + */ + String s1 = "FR K S"; + String s2 = "FS L"; + ScoreMatrix blosum = ScoreModels.getInstance().getBlosum62(); + + /* + * score gap-gap and gap-char + * shorter sequence treated as if with trailing gaps + * score = F^F + R^S + -^- + K^- + -^L + S^- + * = 6 + -1 + 1 + -4 + -4 + -4 = -6 + */ + SimilarityParamsI params = new SimilarityParams(true, true, true, false); + assertEquals(blosum.computeSimilarity(s1, s2, params), -6d); + // matchGap (arg2) is ignored: + params = new SimilarityParams(true, false, true, false); + assertEquals(blosum.computeSimilarity(s1, s2, params), -6d); + + /* + * score gap-char but not gap-gap + * score = F^F + R^S + 0 + K^- + -^L + S^- + * = 6 + -1 + 0 + -4 + -4 + -4 = -7 + */ + params = new SimilarityParams(false, true, true, false); + assertEquals(blosum.computeSimilarity(s1, s2, params), -7d); + // matchGap (arg2) is ignored: + params = new SimilarityParams(false, false, true, false); + assertEquals(blosum.computeSimilarity(s1, s2, params), -7d); + + /* + * score gap-gap but not gap-char + * score = F^F + R^S + -^- + 0 + 0 + 0 + * = 6 + -1 + 1 = 6 + */ + params = new SimilarityParams(true, false, false, false); + assertEquals(blosum.computeSimilarity(s1, s2, params), 6d); + // matchGap (arg2) is ignored: + params = new SimilarityParams(true, true, false, false); + assertEquals(blosum.computeSimilarity(s1, s2, params), 6d); + + /* + * score neither gap-gap nor gap-char + * score = F^F + R^S + 0 + 0 + 0 + 0 + * = 6 + -1 = 5 + */ + params = new SimilarityParams(false, false, false, false); + assertEquals(blosum.computeSimilarity(s1, s2, params), 5d); + // matchGap (arg2) is ignored: + params = new SimilarityParams(false, true, false, false); + assertEquals(blosum.computeSimilarity(s1, s2, 
params), 5d); + } + + /** + * Tests for score matrix similarity where only the shorter length of two + * sequences is used + */ + @Test(groups = "Functional") + public void testcomputeSimilarity_matchShortestSequence() + { + // TODO params.matchGaps() is not used for ScoreMatrix + // - includeGaps is sufficient (there is no denominator) + // ==> bespoke parameters only 3 booleans? + /* + * for now, using space for gap to match callers of + * AlignmentView.getSequenceStrings() + * may change this to '-' (with corresponding change to matrices) + */ + String s1 = "FR K S"; + String s2 = "FS L"; + ScoreMatrix blosum = ScoreModels.getInstance().getBlosum62(); + + /* + * score gap-gap and gap-char + * comparison stops at the end of the shorter sequence + * score = F^F + R^S + -^- + K^- + -^L + * = 6 + -1 + 1 + -4 + -4 = -2 + */ + SimilarityParamsI params = new SimilarityParams(true, true, true, true); + assertEquals(blosum.computeSimilarity(s1, s2, params), -2d); + // matchGap (arg2) is ignored: + params = new SimilarityParams(true, false, true, true); + assertEquals(blosum.computeSimilarity(s1, s2, params), -2d); + + /* + * score gap-char but not gap-gap + * score = F^F + R^S + 0 + K^- + -^L + * = 6 + -1 + 0 + -4 + -4 = -3 + */ + params = new SimilarityParams(false, true, true, true); + assertEquals(blosum.computeSimilarity(s1, s2, params), -3d); + // matchGap (arg2) is ignored: + params = new SimilarityParams(false, false, true, true); + assertEquals(blosum.computeSimilarity(s1, s2, params), -3d); + + /* + * score gap-gap but not gap-char + * score = F^F + R^S + -^- + 0 + 0 + * = 6 + -1 + 1 = 6 + */ + params = new SimilarityParams(true, false, false, true); + assertEquals(blosum.computeSimilarity(s1, s2, params), 6d); + // matchGap (arg2) is ignored: + params = new SimilarityParams(true, true, false, true); + assertEquals(blosum.computeSimilarity(s1, s2, params), 6d); + + /* + * score neither gap-gap nor gap-char + * score = F^F + R^S + 0 + 0 + 0 + * = 6 + -1 = 5 + 
*/ + params = new SimilarityParams(false, false, false, true); + assertEquals(blosum.computeSimilarity(s1, s2, params), 5d); + // matchGap (arg2) is ignored: + params = new SimilarityParams(false, true, false, true); + assertEquals(blosum.computeSimilarity(s1, s2, params), 5d); + } } -- 1.7.10.2