From 1ee1c1730f3b52aee4d039b416f276a1c55257df Mon Sep 17 00:00:00 2001 From: gmungoc Date: Wed, 29 Mar 2017 12:38:18 +0100 Subject: [PATCH] JAL-2416 revert to '*' not '-' in score matrix, handle gaps explicitly in AlignSeq --- resources/scoreModel/blosum62.scm | 4 +- resources/scoreModel/pam250.scm | 4 +- src/jalview/analysis/AlignSeq.java | 46 ++++++++++---------- src/jalview/analysis/scoremodels/ScoreMatrix.java | 32 +++----------- .../analysis/scoremodels/ScoreMatrixTest.java | 15 +------ test/jalview/io/ScoreMatrixFileTest.java | 2 - 6 files changed, 35 insertions(+), 68 deletions(-) diff --git a/resources/scoreModel/blosum62.scm b/resources/scoreModel/blosum62.scm index 0d6ffab..b0e927d 100644 --- a/resources/scoreModel/blosum62.scm +++ b/resources/scoreModel/blosum62.scm @@ -7,7 +7,7 @@ ScoreMatrix BLOSUM62 # The 'guide symbol' at the start of each row of score values is optional # Values may be integer or floating point, delimited by tab, space, comma or combinations # - A R N D C Q E G H I L K M F P S T W Y V B Z X - + A R N D C Q E G H I L K M F P S T W Y V B Z X * A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4 R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4 N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4 @@ -31,4 +31,4 @@ V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4 B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4 Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4 -- -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 +* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 diff --git a/resources/scoreModel/pam250.scm b/resources/scoreModel/pam250.scm index 10a46ec..898c723 100644 --- a/resources/scoreModel/pam250.scm +++ b/resources/scoreModel/pam250.scm @@ -5,7 +5,7 @@ ScoreMatrix PAM250 # Scores are not case sensitive, unless column(s) are provided for lower case characters # Values may be integer or floating point, delimited by tab, space, comma or combinations # - A R N D C Q E G H I L K M F P S T W Y V B Z X - + A R N D C Q E G H I L K M F P S T W Y V B Z X * A 2 -2 0 0 -2 0 0 1 -1 -1 -2 -1 -1 -3 1 1 1 -6 -3 0 0 0 0 -8 R -2 6 0 -1 -4 1 -1 -3 2 -2 -3 3 0 -4 0 0 -1 2 -4 -2 -1 0 -1 -8 N 0 0 2 2 -4 1 1 0 2 -2 -3 1 -2 -3 0 1 0 -4 -2 -2 2 1 0 -8 @@ -29,4 +29,4 @@ V 0 -2 -2 -2 -2 -2 -2 -1 -2 4 2 -2 2 -1 -1 -1 0 -6 -2 4 -2 -2 -1 -8 B 0 -1 2 3 -4 1 3 0 1 -2 -3 1 -2 -4 -1 0 0 -5 -3 -2 3 2 -1 -8 Z 0 0 1 3 -5 3 3 0 2 -2 -3 0 -2 -5 0 0 -1 -6 -4 -2 2 3 -1 -8 X 0 -1 0 -1 -3 -1 -1 -1 -1 -1 -1 -1 -1 -2 -1 0 0 -4 -2 -1 -1 -1 -1 -8 -- -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 1 +* -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 1 diff --git a/src/jalview/analysis/AlignSeq.java b/src/jalview/analysis/AlignSeq.java index d04e897..6bf812c 100755 --- a/src/jalview/analysis/AlignSeq.java +++ b/src/jalview/analysis/AlignSeq.java @@ -112,16 +112,14 @@ public class AlignSeq int gapExtend = 20; - float[][] lookup; - - int gapIndex = 23; - StringBuffer output = new StringBuffer(); String type; // AlignSeq.PEP or AlignSeq.DNA private ScoreMatrix scoreModel; + private static final int GAP_INDEX = -1; + /** * Creates a new AlignSeq object. * @@ -322,10 +320,6 @@ public class AlignSeq return; } - seq1 = new int[s1str.length()]; - - seq2 = new int[s2str.length()]; - score = new float[s1str.length()][s2str.length()]; E = new float[s1str.length()][s2str.length()]; @@ -351,8 +345,6 @@ public class AlignSeq type = moleculeType; scoreModel = ScoreModels.getInstance().getDefaultModel( PEP.equals(type)); - lookup = scoreModel.getMatrix(); - gapIndex = scoreModel.getGapIndex(); } /** @@ -417,13 +409,13 @@ public class AlignSeq else if (trace == 1) { j--; - aseq1[count] = gapIndex; + aseq1[count] = GAP_INDEX; sb1.replace(sb1.length() - 1, sb1.length(), "-"); } else if (trace == -1) { i--; - aseq2[count] = gapIndex; + aseq2[count] = GAP_INDEX; sb2.replace(sb2.length() - 1, sb2.length(), "-"); } @@ -433,13 +425,13 @@ public class AlignSeq seq1start = i + 1; seq2start = j + 1; - if (aseq1[count] != gapIndex) + if (aseq1[count] != GAP_INDEX) { aseq1[count] = seq1[i]; sb1.append(s1str.charAt(i)); } - if (aseq2[count] != gapIndex) + if (aseq2[count] != GAP_INDEX) { aseq2[count] = seq2[j]; sb2.append(s2str.charAt(j)); @@ -596,7 +588,10 @@ public class AlignSeq public int findTrace(int i, int j) { int t = 0; - float max = score[i - 1][j - 1] + (lookup[seq1[i]][seq2[j]] * 10); + // float pairwiseScore = lookup[seq1[i]][seq2[j]]; + float pairwiseScore = scoreModel.getPairwiseScore(s1str.charAt(i), + s2str.charAt(j)); + float max = score[i - 1][j - 1] + (pairwiseScore * 10); if (F[i][j] > max) { @@ -640,7 +635,8 @@ public class AlignSeq int m = seq2.length; // top left hand element - score[0][0] = lookup[seq1[0]][seq2[0]] * 10; + score[0][0] = scoreModel.getPairwiseScore(s1str.charAt(0), + s2str.charAt(0)) * 10; E[0][0] = -gapExtend; F[0][0] = 0; @@ -651,7 +647,9 @@ public class AlignSeq E[0][j] = max(score[0][j - 1] - gapOpen, E[0][j - 1] - gapExtend); F[0][j] = -gapExtend; - score[0][j] = max(lookup[seq1[0]][seq2[j]] * 10, -gapOpen, -gapExtend); + float pairwiseScore = scoreModel.getPairwiseScore(s1str.charAt(0), + s2str.charAt(j)); + score[0][j] = max(pairwiseScore * 10, -gapOpen, -gapExtend); traceback[0][j] = 1; } @@ -662,7 +660,9 @@ public class AlignSeq E[i][0] = -gapOpen; F[i][0] = max(score[i - 1][0] - gapOpen, F[i - 1][0] - gapExtend); - score[i][0] = max(lookup[seq1[i]][seq2[0]] * 10, E[i][0], F[i][0]); + float pairwiseScore = scoreModel.getPairwiseScore(s1str.charAt(i), + s2str.charAt(0)); + score[i][0] = max(pairwiseScore * 10, E[i][0], F[i][0]); traceback[i][0] = -1; } @@ -674,8 +674,10 @@ public class AlignSeq E[i][j] = max(score[i][j - 1] - gapOpen, E[i][j - 1] - gapExtend); F[i][j] = max(score[i - 1][j] - gapOpen, F[i - 1][j] - gapExtend); + float pairwiseScore = scoreModel.getPairwiseScore(s1str.charAt(i), + s2str.charAt(j)); score[i][j] = max(score[i - 1][j - 1] - + (lookup[seq1[i]][seq2[j]] * 10), E[i][j], F[i][j]); + + (pairwiseScore * 10), E[i][j], F[i][j]); traceback[i][j] = findTrace(i, j); } } @@ -797,7 +799,7 @@ public class AlignSeq public static void displayMatrix(Graphics g, int[][] mat, int n, int m, int psize) { - // TODO method dosen't seem to be referenced anywhere delete?? + // TODO method doesn't seem to be referenced anywhere delete?? int max = -1000; int min = 1000; @@ -971,8 +973,8 @@ public class AlignSeq bestm = msq; } } - System.out.println("Best Score for " + (matches.size() + 1) + " :" - + bestscore); + // System.out.println("Best Score for " + (matches.size() + 1) + " :" + // + bestscore); matches.add(bestm); aligns.add(bestaseq); al.deleteSequence(bestm); diff --git a/src/jalview/analysis/scoremodels/ScoreMatrix.java b/src/jalview/analysis/scoremodels/ScoreMatrix.java index e2c14e9..8bc2c04 100644 --- a/src/jalview/analysis/scoremodels/ScoreMatrix.java +++ b/src/jalview/analysis/scoremodels/ScoreMatrix.java @@ -37,6 +37,8 @@ import java.util.Arrays; public class ScoreMatrix implements SimilarityScoreModelI, PairwiseScoreModelI { + private static final char GAP_CHARACTER = Comparison.GAP_DASH; + /* * an arbitrary score to assign for identity of an unknown symbol * (this is the value on the diagonal in the * column of the NCBI matrix) @@ -45,12 +47,6 @@ public class ScoreMatrix implements SimilarityScoreModelI, private static final int UNKNOWN_IDENTITY_SCORE = 1; /* - * this fields records which gap character (if any) is used in the alphabet; - * space, dash or dot are recognised as gap symbols - */ - private char gapCharacter = '0'; - - /* * Jalview 2.10.1 treated gaps as X (peptide) or N (nucleotide) * for pairwise scoring; 2.10.2 uses gap score (last column) in * score matrix (JAL-2397) @@ -192,11 +188,6 @@ public class ScoreMatrix implements SimilarityScoreModelI, short pos = 0; for (char c : alphabet) { - if (Comparison.isGap(c)) - { - gapCharacter = c; - } - if (c <= MAX_ASCII) { index[c] = pos; @@ -283,19 +274,6 @@ public class ScoreMatrix implements SimilarityScoreModelI, } /** - * Answers the matrix index for the gap character, or -1 if unmapped in the - * matrix. Use this method only if using getMatrix in order to - * compute scores directly (without symbol lookup) for efficiency. - * - * @return - * @see #getMatrix() - */ - public int getGapIndex() - { - return getMatrixIndex(gapCharacter); - } - - /** * Returns the pairwise score for substituting c with d. If either c or d is * an unexpected character, returns 1 for identity (c == d), else the minimum * score value in the matrix. @@ -439,7 +417,7 @@ public class ScoreMatrix implements SimilarityScoreModelI, SimilarityParamsI options) { char gapChar = scoreGapAsAny ? (seqstrings.isNa() ? 'N' : 'X') - : gapCharacter; + : GAP_CHARACTER; String[] seqs = seqstrings.getSequenceStrings(gapChar); return findSimilarities(seqs, options); } @@ -498,8 +476,8 @@ public class ScoreMatrix implements SimilarityScoreModelI, } } - char c1 = i >= len1 ? gapCharacter : seq1.charAt(i); - char c2 = i >= len2 ? gapCharacter : seq2.charAt(i); + char c1 = i >= len1 ? GAP_CHARACTER : seq1.charAt(i); + char c2 = i >= len2 ? GAP_CHARACTER : seq2.charAt(i); boolean gap1 = Comparison.isGap(c1); boolean gap2 = Comparison.isGap(c2); diff --git a/test/jalview/analysis/scoremodels/ScoreMatrixTest.java b/test/jalview/analysis/scoremodels/ScoreMatrixTest.java index 16d9504..da17000 100644 --- a/test/jalview/analysis/scoremodels/ScoreMatrixTest.java +++ b/test/jalview/analysis/scoremodels/ScoreMatrixTest.java @@ -46,8 +46,6 @@ public class ScoreMatrixTest assertEquals(sm.getPairwiseScore('D', 'A'), -4f); // unknown-to-self gets a score of 1 assertEquals(sm.getPairwiseScore('D', 'D'), 1f); - - assertEquals(sm.getGapIndex(), -1); // no gap symbol } @Test( @@ -166,8 +164,8 @@ public class ScoreMatrixTest assertEquals(sm.getMatrixIndex('D'), 3); assertEquals(sm.getMatrixIndex('X'), 22); assertEquals(sm.getMatrixIndex('x'), 22); - assertEquals(sm.getMatrixIndex('-'), 23); - assertEquals(sm.getMatrixIndex('*'), -1); + assertEquals(sm.getMatrixIndex('-'), -1); + assertEquals(sm.getMatrixIndex('*'), 23); assertEquals(sm.getMatrixIndex('.'), -1); assertEquals(sm.getMatrixIndex(' '), -1); assertEquals(sm.getMatrixIndex('?'), -1); @@ -175,13 +173,6 @@ public class ScoreMatrixTest } @Test(groups = "Functional") - public void testGetGapIndex() - { - ScoreMatrix sm = ScoreModels.getInstance().getBlosum62(); - assertEquals(sm.getGapIndex(), 23); - } - - @Test(groups = "Functional") public void testGetSize() { ScoreMatrix sm = ScoreModels.getInstance().getBlosum62(); @@ -534,8 +525,6 @@ public class ScoreMatrixTest assertEquals(sm.getMatrixIndex('-'), 1); assertEquals(sm.getMatrixIndex(' '), -1); assertEquals(sm.getMatrixIndex('.'), -1); - - assertEquals(sm.getGapIndex(), 1); } @Test(groups = "Functional") diff --git a/test/jalview/io/ScoreMatrixFileTest.java b/test/jalview/io/ScoreMatrixFileTest.java index a98b2d6..1aa191a 100644 --- a/test/jalview/io/ScoreMatrixFileTest.java +++ b/test/jalview/io/ScoreMatrixFileTest.java @@ -310,7 +310,6 @@ public class ScoreMatrixFileTest assertFalse(sm.isDNA()); assertTrue(sm.isProtein()); assertEquals(20, sm.getSize()); - assertEquals(sm.getGapIndex(), -1); assertEquals(sm.getPairwiseScore('A', 'A'), 7f); assertEquals(sm.getPairwiseScore('A', 'R'), -3f); @@ -342,7 +341,6 @@ public class ScoreMatrixFileTest assertNotNull(sm); assertEquals(sm.getSize(), 3); - assertEquals(sm.getGapIndex(), -1); assertEquals(sm.getName(), "MyTest"); assertEquals(sm.getDescription(), "My description"); assertEquals(sm.getPairwiseScore('A', 'A'), 1.0f); -- 1.7.10.2