From 6129931bfa23eec90e6556a4d0412ef34aff5759 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Mon, 13 Feb 2017 17:00:37 +0000 Subject: [PATCH] JAL-2416 allow alphabet symbol (optional) in first column of score table --- resources/scoreModel/blosum62.scm | 53 +++++++++---------- resources/scoreModel/pam250.scm | 2 +- src/jalview/analysis/scoremodels/ScoreMatrix.java | 38 ++++++++++++-- .../analysis/scoremodels/ScoreMatrixTest.java | 31 ++++++++--- .../analysis/scoremodels/ScoreModelsTest.java | 54 ++++++++++++++++++++ 5 files changed, 142 insertions(+), 36 deletions(-) create mode 100644 test/jalview/analysis/scoremodels/ScoreModelsTest.java diff --git a/resources/scoreModel/blosum62.scm b/resources/scoreModel/blosum62.scm index 3df8833..c7af6b0 100644 --- a/resources/scoreModel/blosum62.scm +++ b/resources/scoreModel/blosum62.scm @@ -5,38 +5,39 @@ ARNDCQEGHILKMFPSTWYVBZX * # The first line declares a ScoreMatrix with the name BLOSUM62 (shown in menus) # The second line gives the symbols for which scores are held in the matrix # These may include a space (but not as the first or last character) -# Scores are not case sensitive, unless column(s) are provided for lower case characters # +# Scores are not symbol case sensitive, unless column(s) are provided for lower case characters +# The 'guide symbol' at the start of each row of score values is optional # -# Comment line with symbols is provided as a guide +# Comment header line with symbols is provided as a guide # Values may be integer or floating point, delimited by tab, space, comma or combinations # # A R N D C Q E G H I L K M F P S T W Y V B Z X * # - 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4 -4 - -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4 -4 - -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4 -4 - -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4 -4 - 0 3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4 -4 - -1 1 0 0 -3 5 2 -2 
0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4 -4 - -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 -4 - 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4 -4 - -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4 -4 - -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4 -4 - -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4 -4 - -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4 -4 - -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4 -4 - -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4 -4 - -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4 -4 - 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4 -4 - 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4 -4 - -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4 -4 - -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4 -4 - 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4 -4 - -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4 -4 - -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 -4 - 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4 -4 - -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 1 +A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4 -4 +R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4 -4 +N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4 -4 +D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4 -4 +C 0 3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4 -4 +Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4 -4 +E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 -4 +G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4 -4 +H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4 -4 +I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 
-3 -2 -1 -3 -1 3 -3 -3 -1 -4 -4 +L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4 -4 +K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4 -4 +M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4 -4 +F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4 -4 +P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4 -4 +S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4 -4 +T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4 -4 +W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4 -4 +Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4 -4 +V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4 -4 +B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4 -4 +Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 -4 +X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 1 +* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 1 # # A R N D C Q E G H I L K M F P S T W Y V B Z X * diff --git a/resources/scoreModel/pam250.scm b/resources/scoreModel/pam250.scm index b57485e..8df39a1 100644 --- a/resources/scoreModel/pam250.scm +++ b/resources/scoreModel/pam250.scm @@ -11,7 +11,7 @@ ARNDCQEGHILKMFPSTWYVBZX * # Comment line with symbols is provided as a guide # Values may be integer or floating point, delimited by tab, space, comma or combinations # -# A R N D C Q E G H I L K M F P S T W Y V B Z X * +# A R N D C Q E G H I L K M F P S T W Y V B Z X * # 2 -2 0 0 -2 0 0 1 -1 -1 -2 -1 -1 -3 1 1 1 -6 -3 0 0 0 0 -8 -8 -2 6 0 -1 -4 1 -1 -3 2 -2 -3 3 0 -4 0 0 -1 2 -4 -2 -1 0 -1 -8 -8 diff --git a/src/jalview/analysis/scoremodels/ScoreMatrix.java b/src/jalview/analysis/scoremodels/ScoreMatrix.java index f0115bf..3e63209 100644 --- a/src/jalview/analysis/scoremodels/ScoreMatrix.java +++ 
b/src/jalview/analysis/scoremodels/ScoreMatrix.java @@ -254,6 +254,19 @@ public class ScoreMatrix extends PairwiseSeqScoreModel implements * Parse a score matrix from the given input stream and returns a ScoreMatrix * object. If parsing fails, error messages are written to syserr and null is * returned. It is the caller's responsibility to close the input stream. + * Expected format: + * + * <pre>
+   * ScoreMatrix displayName
+   * # comment lines begin with hash sign
+   * # symbol alphabet should be the next non-comment line
+   * ARNDCQEGHILKMFPSTWYVBZX *
+   * # scores matrix, with space, comma or tab delimited values
+   * # [i, j] = score for substituting symbol[i] with symbol[j]
+   * # first column in each row is optionally the 'substituted' symbol
+   * A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4 -4
+   * ..etc..
+   * </pre>
* * @param is * @return @@ -334,20 +347,39 @@ public class ScoreMatrix extends PairwiseSeqScoreModel implements /* * subsequent lines should be the symbol scores + * optionally with the symbol as the first column for readability */ StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS); + if (scoreLine.countTokens() == size + 1) + { + /* + * check 'guide' symbol is the row'th letter of the alphabet + */ + String symbol = scoreLine.nextToken(); + if (symbol.length() > 1 + || symbol.charAt(0) != alphabet.charAt(row)) + { + System.err + .println(String + .format("Error parsing score matrix at line %d, expected %s but found %s", + lineNo, alphabet.charAt(row), symbol)); + return null; + } + } if (scoreLine.countTokens() != size) { System.err.println(String.format( - "Expected %d tokens at line %d but found %d", size, + "Expected %d scores at line %d but found %d", size, lineNo, scoreLine.countTokens())); return null; } scores[row] = new float[size]; int col = 0; String value = null; - while (scoreLine.hasMoreTokens()) { - try { + while (scoreLine.hasMoreTokens()) + { + try + { value = scoreLine.nextToken(); scores[row][col] = Float.valueOf(value); col++; diff --git a/test/jalview/analysis/scoremodels/ScoreMatrixTest.java b/test/jalview/analysis/scoremodels/ScoreMatrixTest.java index 462edf2..7c62854 100644 --- a/test/jalview/analysis/scoremodels/ScoreMatrixTest.java +++ b/test/jalview/analysis/scoremodels/ScoreMatrixTest.java @@ -72,15 +72,16 @@ public class ScoreMatrixTest /* * some messy but valid input data, with comma, space * or tab (or combinations) as score value delimiters + * this example includes 'guide' symbols on score rows */ String data = "ScoreMatrix MyTest\n" + "ATU tx-\n" - + "1.1,1.2,1.3,1.4, 1.5, 1.6, 1.7\n" - + "2.1 2.2 2.3 2.4 2.5 2.6 2.7\n" - + "3.1\t3.2\t3.3\t3.4\t3.5\t3.6\t3.7\n" + + "A,1.1,1.2,1.3,1.4, 1.5, 1.6, 1.7\n" + + "T,2.1 2.2 2.3 2.4 2.5 2.6 2.7\n" + + "U\t3.1\t3.2\t3.3\t3.4\t3.5\t3.6\t3.7\n" + " 4.1 ,4.2,\t,4.3 ,\t4.4\t, 
\4.5,4.6 4.7\n" - + ", 5.1,5.3,5.3,5.4,5.5, 5.6, 5.7\n" - + "\t6.1, 6.2 6.3 6.4 6.5 6.6 6.7\n" - + ", \t7.1\t7.2 7.3, 7.4, 7.5\t,7.6,7.7\n"; + + "t, 5.1,5.3,5.3,5.4,5.5, 5.6, 5.7\n" + + "x\t6.1, 6.2 6.3 6.4 6.5 6.6 6.7\n" + + "-, \t7.1\t7.2 7.3, 7.4, 7.5\t,7.6,7.7\n"; ScoreMatrix sm = ScoreMatrix.parse(new ByteArrayInputStream(data .getBytes())); assertNotNull(sm); @@ -99,6 +100,18 @@ public class ScoreMatrixTest assertEquals(sm.getPairwiseScore('A', '.'), 0f); // . unmapped assertEquals(sm.getPairwiseScore('-', '-'), 7.7f); assertEquals(sm.getPairwiseScore('A', (char) 128), 0f); // out of range + + /* + * without guide symbols on score rows + */ + data = "ScoreMatrix MyTest\nXY\n1 2\n3 4\n"; + sm = ScoreMatrix.parse(new ByteArrayInputStream(data.getBytes())); + assertNotNull(sm); + assertEquals(sm.getPairwiseScore('X', 'X'), 1f); + assertEquals(sm.getPairwiseScore('X', 'y'), 2f); + assertEquals(sm.getPairwiseScore('y', 'x'), 3f); + assertEquals(sm.getPairwiseScore('y', 'Y'), 4f); + assertEquals(sm.getPairwiseScore('D', 'R'), 0f); } @Test(groups = "Functional") @@ -168,5 +181,11 @@ public class ScoreMatrixTest sm = ScoreMatrix.parse(new ByteArrayInputStream(data.getBytes())); assertNull(sm); + /* + * Bad guide character on scores row + */ + data = "ScoreMatrix MyTest\nXY\nX 1 2\ny 3 4\n"; + sm = ScoreMatrix.parse(new ByteArrayInputStream(data.getBytes())); + assertNull(sm); } } diff --git a/test/jalview/analysis/scoremodels/ScoreModelsTest.java b/test/jalview/analysis/scoremodels/ScoreModelsTest.java new file mode 100644 index 0000000..03c4b84 --- /dev/null +++ b/test/jalview/analysis/scoremodels/ScoreModelsTest.java @@ -0,0 +1,54 @@ +package jalview.analysis.scoremodels; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + +import jalview.api.analysis.ScoreModelI; + +import java.util.Iterator; + +import org.testng.annotations.Test; + +public class ScoreModelsTest +{ + /** + * 
Verify that the singleton constructor successfully loads Jalview's built-in + * score models + */ + @Test + public void testConstructor() + { + Iterator models = ScoreModels.getInstance().getModels() + .iterator(); + assertTrue(models.hasNext()); + + /* + * models are served in alphabetical order of name + * it so happens the 3 ScoreMatrix models precede the two + * others + */ + ScoreModelI sm = models.next(); + assertTrue(sm instanceof ScoreMatrix); + assertEquals(sm.getName(), "BLOSUM62"); + assertEquals(((ScoreMatrix) sm).getPairwiseScore('I', 'R'), -3f); + + sm = models.next(); + assertTrue(sm instanceof ScoreMatrix); + assertEquals(sm.getName(), "DNA"); + assertEquals(((ScoreMatrix) sm).getPairwiseScore('c', 'x'), 1f); + + sm = models.next(); + assertTrue(sm instanceof ScoreMatrix); + assertEquals(sm.getName(), "PAM250"); + assertEquals(((ScoreMatrix) sm).getPairwiseScore('R', 'C'), -4f); + + sm = models.next(); + assertFalse(sm instanceof ScoreMatrix); + assertEquals(sm.getName(), "PID"); + + sm = models.next(); + assertFalse(sm instanceof ScoreMatrix); + assertEquals(sm.getName(), "Sequence Feature Similarity"); + } +} -- 1.7.10.2