From a862a922bf20918fc3f5066ac92e4c69da07e105 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Tue, 28 Mar 2017 10:18:43 +0100 Subject: [PATCH] JAL-2416 more test coverage of matrix parser --- src/jalview/io/ScoreMatrixFile.java | 111 ++++++------- test/jalview/io/ScoreMatrixFileTest.java | 262 ++++++++++++++++++++++++++---- 2 files changed, 280 insertions(+), 93 deletions(-) diff --git a/src/jalview/io/ScoreMatrixFile.java b/src/jalview/io/ScoreMatrixFile.java index 4e89c3f..3a7ff4f 100644 --- a/src/jalview/io/ScoreMatrixFile.java +++ b/src/jalview/io/ScoreMatrixFile.java @@ -42,7 +42,15 @@ public class ScoreMatrixFile extends AlignFile implements private String matrixName; - boolean lowerDiagonalOnly; + /* + * aaindex format has scores for diagonal and below only + */ + boolean isLowerDiagonalOnly; + + /* + * ncbi format has symbols as first column on score rows + */ + boolean hasGuideColumn; /** * Constructor @@ -93,7 +101,7 @@ public class ScoreMatrixFile extends AlignFile implements int row = 0; String err = null; String data; - lowerDiagonalOnly = false; + isLowerDiagonalOnly = false; while ((data = nextLine()) != null) { @@ -111,8 +119,8 @@ public class ScoreMatrixFile extends AlignFile implements */ if (name != null) { - System.err - .println("Warning: 'ScoreMatrix' repeated in file at line " + throw new FileFormatException( + "Error: 'ScoreMatrix' repeated in file at line " + lineNo); } StringTokenizer nameLine = new StringTokenizer(data, DELIMITERS); @@ -244,17 +252,8 @@ public class ScoreMatrixFile extends AlignFile implements { parseValues(data, lineNo, scores, row, alphabet); row++; - if (row == size) - { - break; - } } } - if (data != null) - { - System.err.println("Warning: unexpected extra data in matrix file: " - + data); - } ScoreMatrix sm = new ScoreMatrix(name, alphabet, scores); sm.setDescription(description); @@ -289,7 +288,24 @@ public class ScoreMatrixFile extends AlignFile implements StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS); int tokenCount = scoreLine.countTokens(); - if (tokenCount == size + 1) + + /* + * inspect first row to see if it includes the symbol in the first column, + * and to see if it is lower diagonal values only (i.e. just one score) + */ + if (row == 0) + { + if (data.startsWith(String.valueOf(alphabet[0]))) + { + hasGuideColumn = true; + } + if (tokenCount == (hasGuideColumn ? 2 : 1)) + { + isLowerDiagonalOnly = true; + } + } + + if (hasGuideColumn) { /* * check 'guide' symbol is the row'th letter of the alphabet @@ -302,36 +318,32 @@ public class ScoreMatrixFile extends AlignFile implements lineNo, alphabet[row], symbol); throw new FileFormatException(err); } + tokenCount = scoreLine.countTokens(); // excluding guide symbol } - tokenCount = scoreLine.countTokens(); - /* - * AAIndex format only has the lower diagonal i.e. - * 1 score in row 0, 2 in row 1, etc - * check this in all but the last row (which is the same either way) + * check the right number of values (lower diagonal or full format) */ - if (row < size - 1) + if (isLowerDiagonalOnly && tokenCount != row + 1) { - boolean lowerDiagonal = tokenCount == row + 1; - if (lowerDiagonalOnly && !lowerDiagonal) - { - /* - * had detected lower diagonal form but now it isn't - error - */ - err = String.format("Unexpected number of tokens at line %d", - lineNo); + err = String.format( + "Expected %d scores at line %d: '%s' but found %d", row + 1, + lineNo, data, tokenCount); throw new FileFormatException(err); - } - lowerDiagonalOnly = lowerDiagonal; } - if (!lowerDiagonalOnly && tokenCount != size) + if (!isLowerDiagonalOnly && tokenCount != size) { - err = String.format("Expected %d scores at line %d but found %d", - size, lineNo, scoreLine.countTokens()); + err = String.format( + "Expected %d scores at line %d: '%s' but found %d", size, + lineNo, data, scoreLine.countTokens()); throw new FileFormatException(err); } + + /* + * parse and set the values, setting the symmetrical value + * as well if lower diagonal format data + */ scores[row] = new float[size]; int col = 0; String value = null; @@ -341,7 +353,7 @@ public class ScoreMatrixFile extends AlignFile implements { value = scoreLine.nextToken(); scores[row][col] = Float.valueOf(value); - if (lowerDiagonalOnly) + if (isLowerDiagonalOnly) { scores[col][row] = scores[row][col]; } @@ -415,37 +427,6 @@ public class ScoreMatrixFile extends AlignFile implements return false; } - /** - * Answers true if the data line consists of the alphabet characters, - * delimited (as to provide a heading row). Otherwise returns false (e.g. if - * the data is a row of score values). - * - * @param data - * @param alphabet - * @return - */ - private boolean isHeaderLine(String data, String alphabet) - { - StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS); - int i = 0; - while (scoreLine.hasMoreElements()) - { - /* - * skip over characters in the alphabet that are - * also a delimiter (e.g. space) - */ - char symbol = alphabet.charAt(i++); - if (!DELIMITERS.contains(String.valueOf(symbol))) - { - if (!String.valueOf(symbol).equals(scoreLine.nextToken())) - { - return false; - } - } - } - return true; - } - public String getMatrixName() { return matrixName; diff --git a/test/jalview/io/ScoreMatrixFileTest.java b/test/jalview/io/ScoreMatrixFileTest.java index cb30874..52ad735 100644 --- a/test/jalview/io/ScoreMatrixFileTest.java +++ b/test/jalview/io/ScoreMatrixFileTest.java @@ -8,6 +8,7 @@ import static org.testng.Assert.assertTrue; import static org.testng.Assert.fail; import jalview.analysis.scoremodels.ScoreMatrix; +import jalview.analysis.scoremodels.ScoreModels; import java.io.IOException; import java.net.MalformedURLException; @@ -24,7 +25,9 @@ public class ScoreMatrixFileTest * @throws MalformedURLException */ @Test(groups = "Functional") - public void testParse() throws MalformedURLException, IOException + public void testParseMatrix_ncbiMixedDelimiters() + throws MalformedURLException, + IOException { /* * some messy but valid input data, with comma, space @@ -62,7 +65,7 @@ public class ScoreMatrixFileTest } @Test(groups = "Functional") - public void testParse_headerMissing() + public void testParseMatrix_headerMissing() { String data; @@ -80,9 +83,9 @@ public class ScoreMatrixFileTest } @Test(groups = "Functional") - public void testParse_notEnoughRows() + public void testParseMatrix_ncbiNotEnoughRows() { - String data = "ScoreMatrix MyTest\nX Y\n1 2\n"; + String data = "ScoreMatrix MyTest\nX Y Z\n1 2 3\n4 5 6\n"; try { new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE)) @@ -91,14 +94,14 @@ public class ScoreMatrixFileTest } catch (IOException e) { assertEquals(e.getMessage(), - "Expected 2 rows of score data in score matrix but only found 1"); + "Expected 3 rows of score data in score matrix but only found 2"); } } @Test(groups = "Functional") - public void testParse_notEnoughColumns() + public void testParseMatrix_ncbiNotEnoughColumns() { - String data = "ScoreMatrix MyTest\nX Y\n1 2\n3\n"; + String data = "ScoreMatrix MyTest\nX Y Z\n1 2 3\n4 5\n7 8 9\n"; try { new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE)) @@ -107,17 +110,17 @@ public class ScoreMatrixFileTest } catch (IOException e) { assertEquals(e.getMessage(), - "Expected 2 scores at line 4 but found 1"); + "Expected 3 scores at line 4: '4 5' but found 2"); } } @Test(groups = "Functional") - public void testParse_tooManyColumns() + public void testParseMatrix_ncbiTooManyColumns() { /* * with two too many columns: */ - String data = "ScoreMatrix MyTest\nX\tY\n1 2\n3 4 5 6\n"; + String data = "ScoreMatrix MyTest\nX\tY\tZ\n1 2 3\n4 5 6 7\n8 9 10\n"; try { new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE)) @@ -126,7 +129,7 @@ public class ScoreMatrixFileTest } catch (IOException e) { assertEquals(e.getMessage(), - "Expected 2 scores at line 4 but found 4"); + "Expected 3 scores at line 4: '4 5 6 7' but found 4"); } /* @@ -141,12 +144,11 @@ public class ScoreMatrixFileTest } catch (IOException e) { assertEquals(e.getMessage(), - "Expected 2 scores at line 4 but found 4"); + "Expected 2 scores at line 4: 'Y 3 4 5' but found 3"); } /* - * with no guide character and one too many columns: - * parser guesses the first column is the guide character + * with no guide character and one too many columns */ data = "ScoreMatrix MyTest\nX Y\n1 2\n3 4 5\n"; try @@ -157,14 +159,14 @@ public class ScoreMatrixFileTest } catch (IOException e) { assertEquals(e.getMessage(), - "Error parsing score matrix at line 4, expected 'Y' but found '3'"); + "Expected 2 scores at line 4: '3 4 5' but found 3"); } } @Test(groups = "Functional") - public void testParse_tooManyRows() + public void testParseMatrix_ncbiTooManyRows() { - String data = "ScoreMatrix MyTest\n\tX\tY\n1 2\n3 4\n6 7"; + String data = "ScoreMatrix MyTest\n\tX\tY\tZ\n1 2 3\n4 5 6\n7 8 9\n10 11 12\n"; try { new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE)) @@ -173,12 +175,12 @@ public class ScoreMatrixFileTest } catch (IOException e) { assertEquals(e.getMessage(), - "Unexpected extra input line in score model file: '6 7'"); + "Unexpected extra input line in score model file: '10 11 12'"); } } @Test(groups = "Functional") - public void testParse_badDelimiter() + public void testParseMatrix_ncbiBadDelimiter() { String data = "ScoreMatrix MyTest\n X Y Z\n1|2|3\n4|5|6\n"; try @@ -194,9 +196,9 @@ public class ScoreMatrixFileTest } @Test(groups = "Functional") - public void testParse_badFloat() + public void testParseMatrix_ncbiBadFloat() { - String data = "ScoreMatrix MyTest\n\tX\tY\n1 2\n3 four\n"; + String data = "ScoreMatrix MyTest\n\tX\tY\tZ\n1 2 3\n4 five 6\n7 8 9\n"; try { new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE)) @@ -205,12 +207,12 @@ public class ScoreMatrixFileTest } catch (IOException e) { assertEquals(e.getMessage(), - "Invalid score value 'four' at line 4 column 1"); + "Invalid score value 'five' at line 4 column 1"); } } @Test(groups = "Functional") - public void testParse_badGuideCharacter() + public void testParseMatrix_ncbiBadGuideCharacter() { String data = "ScoreMatrix MyTest\n\tX Y\nX 1 2\ny 3 4\n"; try @@ -223,13 +225,25 @@ public class ScoreMatrixFileTest assertEquals(e.getMessage(), "Error parsing score matrix at line 4, expected 'Y' but found 'y'"); } + + data = "ScoreMatrix MyTest\n\tX Y\nXX 1 2\nY 3 4\n"; + try + { + new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE)) + .parseMatrix(); + fail("expected exception"); + } catch (IOException e) + { + assertEquals(e.getMessage(), + "Error parsing score matrix at line 3, expected 'X' but found 'XX'"); + } } @Test(groups = "Functional") - public void testParse_nameMissing() + public void testParseMatrix_ncbiNameMissing() { /* - * Name missing + * Name missing on ScoreMatrix header line */ String data = "ScoreMatrix\nX Y\n1 2\n3 4\n"; try @@ -252,10 +266,11 @@ public class ScoreMatrixFileTest * @throws MalformedURLException */ @Test(groups = "Functional") - public void testParse_ncbiFormat() throws MalformedURLException, + public void testParseMatrix_ncbiFormat() throws MalformedURLException, IOException { - String data = "ScoreMatrix MyTest\n" + "\tA\tB\tC\n" + // input including comment and blank lines + String data = "ScoreMatrix MyTest\n#comment\n\n" + "\tA\tB\tC\n" + "A\t1.0\t2.0\t3.0\n" + "B\t4.0\t5.0\t6.0\n" + "C\t7.0\t8.0\t9.0\n"; FileParse fp = new FileParse(data, DataSourceType.PASTE); @@ -264,6 +279,7 @@ public class ScoreMatrixFileTest assertNotNull(sm); assertEquals(sm.getName(), "MyTest"); + assertEquals(parser.getMatrixName(), "MyTest"); assertEquals(sm.getPairwiseScore('A', 'A'), 1.0f); assertEquals(sm.getPairwiseScore('B', 'c'), 6.0f); assertEquals(sm.getSize(), 3); @@ -276,7 +292,8 @@ public class ScoreMatrixFileTest * @throws MalformedURLException */ @Test(groups = "Functional") - public void testParse_aaIndexBlosum80() throws MalformedURLException, + public void testParseMatrix_aaIndexBlosum80() + throws MalformedURLException, IOException { FileParse fp = new FileParse("resources/scoreModel/blosum80.scm", @@ -297,4 +314,193 @@ public class ScoreMatrixFileTest assertEquals(sm.getPairwiseScore('A', 'R'), -3f); assertEquals(sm.getPairwiseScore('r', 'a'), -3f); // A/a equivalent } + + /** + * Test a successful parse of a (small) score matrix file + * + * @throws IOException + * @throws MalformedURLException + */ + @Test(groups = "Functional") + public void testParseMatrix_aaindexFormat() throws MalformedURLException, + IOException + { + /* + * aaindex format has scores for diagonal and below only + */ + String data = "H MyTest\n" + "D My description\n" + "R PMID:1438297\n" + + "A Authors, names\n" + "T Journal title\n" + + "J Journal reference\n" + "* matrix in 1/3 Bit Units\n" + + "M rows = ABC, cols = ABC\n" + "A\t1.0\n" + + "B\t4.0\t5.0\n" + + "C\t7.0\t8.0\t9.0\n"; + FileParse fp = new FileParse(data, DataSourceType.PASTE); + ScoreMatrixFile parser = new ScoreMatrixFile(fp); + ScoreMatrix sm = parser.parseMatrix(); + + assertNotNull(sm); + assertEquals(sm.getSize(), 3); + assertEquals(sm.getGapIndex(), -1); + assertEquals(sm.getName(), "MyTest"); + assertEquals(sm.getDescription(), "My description"); + assertEquals(sm.getPairwiseScore('A', 'A'), 1.0f); + assertEquals(sm.getPairwiseScore('A', 'B'), 4.0f); + assertEquals(sm.getPairwiseScore('A', 'C'), 7.0f); + assertEquals(sm.getPairwiseScore('B', 'A'), 4.0f); + assertEquals(sm.getPairwiseScore('B', 'B'), 5.0f); + assertEquals(sm.getPairwiseScore('B', 'C'), 8.0f); + assertEquals(sm.getPairwiseScore('C', 'C'), 9.0f); + assertEquals(sm.getPairwiseScore('C', 'B'), 8.0f); + assertEquals(sm.getPairwiseScore('C', 'A'), 7.0f); + } + + @Test(groups = "Functional") + public void testParseMatrix_aaindex_mMissing() + throws MalformedURLException, + IOException + { + /* + * aaindex format but M cols=, rows= is missing + */ + String data = "H MyTest\n" + "A\t1.0\n" + + "B\t4.0\t5.0\n" + + "C\t7.0\t8.0\t9.0\n"; + FileParse fp = new FileParse(data, DataSourceType.PASTE); + ScoreMatrixFile parser = new ScoreMatrixFile(fp); + try + { + parser.parseMatrix(); + fail("Expected exception"); + } catch (FileFormatException e) + { + assertEquals(e.getMessage(), "No alphabet specified in matrix file"); + } + } + + @Test(groups = "Functional") + public void testParseMatrix_aaindex_rowColMismatch() + throws MalformedURLException, + IOException + { + String data = "H MyTest\n" + "M rows=ABC, cols=ABD\n" + "A\t1.0\n" + + "B\t4.0\t5.0\n" + + "C\t7.0\t8.0\t9.0\n"; + FileParse fp = new FileParse(data, DataSourceType.PASTE); + ScoreMatrixFile parser = new ScoreMatrixFile(fp); + try + { + parser.parseMatrix(); + fail("Expected exception"); + } catch (FileFormatException e) + { + assertEquals( + e.getMessage(), + "Unexpected aaIndex score matrix data at line 2: M rows=ABC, cols=ABD rows != cols"); + } + } + + @Test(groups = "Functional") + public void testParseMatrix_ncbiHeaderRepeated() + { + String data = "ScoreMatrix BLOSUM\nScoreMatrix PAM250\nX Y\n1 2\n3 4\n"; + try + { + new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE)) + .parseMatrix(); + fail("expected exception"); + } catch (IOException e) + { + assertEquals(e.getMessage(), + "Error: 'ScoreMatrix' repeated in file at line 2"); + } + } + + @Test(groups = "Functional") + public void testParseMatrix_aaindex_tooManyRows() + throws MalformedURLException, + IOException + { + String data = "H MyTest\n" + "M rows=ABC, cols=ABC\n" + "A\t1.0\n" + + "B\t4.0\t5.0\n" + "C\t7.0\t8.0\t9.0\n" + "C\t7.0\t8.0\t9.0\n"; + FileParse fp = new FileParse(data, DataSourceType.PASTE); + ScoreMatrixFile parser = new ScoreMatrixFile(fp); + try + { + parser.parseMatrix(); + fail("Expected exception"); + } catch (FileFormatException e) + { + assertEquals(e.getMessage(), "Too many data rows in matrix file"); + } + } + + @Test(groups = "Functional") + public void testParseMatrix_aaindex_extraDataLines() + throws MalformedURLException, + IOException + { + String data = "H MyTest\n" + "M rows=ABC, cols=ABC\n" + "A\t1.0\n" + + "B\t4.0\t5.0\n" + "C\t7.0\t8.0\t9.0\n" + "something extra\n"; + FileParse fp = new FileParse(data, DataSourceType.PASTE); + ScoreMatrixFile parser = new ScoreMatrixFile(fp); + try + { + parser.parseMatrix(); + fail("Expected exception"); + } catch (FileFormatException e) + { + assertEquals(e.getMessage(), "Too many data rows in matrix file"); + } + } + + @Test(groups = "Functional") + public void testParseMatrix_aaindex_tooFewColumns() + throws MalformedURLException, + IOException + { + String data = "H MyTest\n" + "M rows=ABC, cols=ABC\n" + "A\t1.0\n" + + "B\t4.0\t5.0\n" + "C\t7.0\t8.0\n"; + FileParse fp = new FileParse(data, DataSourceType.PASTE); + ScoreMatrixFile parser = new ScoreMatrixFile(fp); + try + { + parser.parseMatrix(); + fail("Expected exception"); + } catch (FileFormatException e) + { + assertEquals( + e.getMessage(), + "Expected 3 scores at line 5: 'C\t7.0\t8.0' but found 2"); + } + } + + /** + * Test a successful parse and register of a score matrix file + * + * @throws IOException + * @throws MalformedURLException + */ + @Test(groups = "Functional") + public void testParse_ncbiFormat() throws MalformedURLException, + IOException + { + assertNull(ScoreModels.getInstance().forName("MyNewTest")); + + String data = "ScoreMatrix MyNewTest\n" + "\tA\tB\tC\n" + + "A\t1.0\t2.0\t3.0\n" + "B\t4.0\t5.0\t6.0\n" + + "C\t7.0\t8.0\t9.0\n"; + FileParse fp = new FileParse(data, DataSourceType.PASTE); + ScoreMatrixFile parser = new ScoreMatrixFile(fp); + + parser.parse(); + + ScoreMatrix sm = (ScoreMatrix) ScoreModels.getInstance().forName( + "MyNewTest"); + assertNotNull(sm); + assertEquals(sm.getName(), "MyNewTest"); + assertEquals(parser.getMatrixName(), "MyNewTest"); + assertEquals(sm.getPairwiseScore('A', 'A'), 1.0f); + assertEquals(sm.getPairwiseScore('B', 'c'), 6.0f); + assertEquals(sm.getSize(), 3); + } } -- 1.7.10.2