From: gmungoc Date: Mon, 27 Mar 2017 18:14:17 +0000 (+0100) Subject: JAL-2416 parse score matrices in either NCBI or AAindex format X-Git-Tag: Release_2_10_2~3^2~105^2~2^2~41 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=e65e612cabab4118364c44b6075302e0d2881744;p=jalview.git JAL-2416 parse score matrices in either NCBI or AAindex format --- diff --git a/src/jalview/analysis/scoremodels/ScoreMatrix.java b/src/jalview/analysis/scoremodels/ScoreMatrix.java index f7da9f3..c71658b 100644 --- a/src/jalview/analysis/scoremodels/ScoreMatrix.java +++ b/src/jalview/analysis/scoremodels/ScoreMatrix.java @@ -318,7 +318,6 @@ public class ScoreMatrix implements SimilarityScoreModelI, else { sb.append("ScoreMatrix ").append(getName()).append("\n"); - sb.append(symbols).append("\n"); } for (char sym : symbols) { @@ -524,7 +523,7 @@ public class ScoreMatrix implements SimilarityScoreModelI, * * @return */ - public String getSymbols() + String getSymbols() { return new String(symbols); } diff --git a/src/jalview/io/IdentifyFile.java b/src/jalview/io/IdentifyFile.java index 4b6f8e4..035c1fa 100755 --- a/src/jalview/io/IdentifyFile.java +++ b/src/jalview/io/IdentifyFile.java @@ -98,12 +98,15 @@ public class IdentifyFile boolean lineswereskipped = false; boolean isBinary = false; // true if length is non-zero and non-printable // characters are encountered + try { if (!closeSource) { source.mark(); } + boolean aaIndexHeaderRead = false; + while ((data = source.nextLine()) != null) { bytesRead += data.length(); @@ -146,6 +149,15 @@ public class IdentifyFile reply = FileFormat.ScoreMatrix; break; } + if (data.startsWith("H ") && !aaIndexHeaderRead) + { + aaIndexHeaderRead = true; + } + if (data.startsWith("D ") && aaIndexHeaderRead) + { + reply = FileFormat.ScoreMatrix; + break; + } if (data.startsWith("##GFF-VERSION")) { // GFF - possibly embedded in a Jalview features file! diff --git a/src/jalview/io/ScoreMatrixFile.java b/src/jalview/io/ScoreMatrixFile.java index a332846..4e89c3f 100644 --- a/src/jalview/io/ScoreMatrixFile.java +++ b/src/jalview/io/ScoreMatrixFile.java @@ -19,7 +19,7 @@ import java.util.StringTokenizer; * ScoreMatrix BLOSUM62 * * - * Also accepts 'aaindex' format (as described at + * Also accepts 'AAindex' format (as described at * http://www.genome.jp/aaindex/aaindex_help.html) with the minimum data * required being * @@ -42,6 +42,8 @@ public class ScoreMatrixFile extends AlignFile implements private String matrixName; + boolean lowerDiagonalOnly; + /** * Constructor * @@ -61,7 +63,8 @@ public class ScoreMatrixFile extends AlignFile implements /** * Parses the score matrix file, and if successful registers the matrix so it - * will be shown in Jalview menus. + * will be shown in Jalview menus. This method is not thread-safe (a separate + * instance of this class should be used by each thread). */ @Override public void parse() throws IOException @@ -84,12 +87,13 @@ public class ScoreMatrixFile extends AlignFile implements ScoreMatrix sm = null; int lineNo = 0; String name = null; - String alphabet = null; + char[] alphabet = null; float[][] scores = null; int size = 0; int row = 0; String err = null; String data; + lowerDiagonalOnly = false; while ((data = nextLine()) != null) { @@ -123,6 +127,13 @@ public class ScoreMatrixFile extends AlignFile implements name = data.substring(1).substring(data.substring(1).indexOf(name)); continue; } + else if (data.startsWith("H ") && name == null) + { + /* + * AAindex identifier + */ + return parseAAIndexFormat(lineNo, data); + } else if (name == null) { err = "Format error: 'ScoreMatrix ' should be the first non-comment line"; @@ -130,12 +141,20 @@ public class ScoreMatrixFile extends AlignFile implements } /* - * next line after ScoreMatrix should be the alphabet of scored symbols + * next non-comment line after ScoreMatrix should be the + * column header line with the alphabet of scored symbols */ if (alphabet == null) { - alphabet = data; - size = alphabet.length(); + StringTokenizer columnHeadings = new StringTokenizer(data, + DELIMITERS); + size = columnHeadings.countTokens(); + alphabet = new char[size]; + int col = 0; + while (columnHeadings.hasMoreTokens()) + { + alphabet[col++] = columnHeadings.nextToken().charAt(0); + } scores = new float[size][]; continue; } @@ -150,58 +169,7 @@ public class ScoreMatrixFile extends AlignFile implements throw new FileFormatException(err); } - /* - * permit an uncommented line with delimited residue headings - */ - if (isHeaderLine(data, alphabet)) - { - continue; - } - - /* - * subsequent lines should be the symbol scores - * optionally with the symbol as the first column for readability - */ - StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS); - int tokenCount = scoreLine.countTokens(); - if (tokenCount == size + 1) - { - /* - * check 'guide' symbol is the row'th letter of the alphabet - */ - String symbol = scoreLine.nextToken(); - if (symbol.length() > 1 || symbol.charAt(0) != alphabet.charAt(row)) - { - err = String - .format("Error parsing score matrix at line %d, expected '%s' but found '%s'", - lineNo, alphabet.charAt(row), symbol); - throw new FileFormatException(err); - } - } - if (scoreLine.countTokens() != size) - { - err = String.format("Expected %d scores at line %d but found %d", - size, lineNo, scoreLine.countTokens()); - throw new FileFormatException(err); - } - scores[row] = new float[size]; - int col = 0; - String value = null; - while (scoreLine.hasMoreTokens()) - { - try - { - value = scoreLine.nextToken(); - scores[row][col] = Float.valueOf(value); - col++; - } catch (NumberFormatException e) - { - err = String.format( - "Invalid score value '%s' at line %d column %d", value, - lineNo, col); - throw new FileFormatException(err); - } - } + parseValues(data, lineNo, scores, row, alphabet); row++; } @@ -219,13 +187,235 @@ public class ScoreMatrixFile extends AlignFile implements /* * If we get here, then name, alphabet and scores have been parsed successfully */ - sm = new ScoreMatrix(name, alphabet.toCharArray(), scores); + sm = new ScoreMatrix(name, alphabet, scores); matrixName = name; return sm; } /** + * Parse input as AAIndex format, starting from the header line with the + * accession id + * + * @param lineNo + * @param data + * @return + * @throws IOException + */ + protected ScoreMatrix parseAAIndexFormat(int lineNo, String data) + throws IOException + { + String name = data.substring(2).trim(); + String description = null; + + float[][] scores = null; + char[] alphabet = null; + int row = 0; + int size = 0; + + while ((data = nextLine()) != null) + { + lineNo++; + data = data.trim(); + if (skipAAindexLine(data)) + { + continue; + } + if (data.startsWith("D ")) + { + description = data.substring(2).trim(); + } + else if (data.startsWith("M ")) + { + alphabet = parseAAindexRowsColumns(lineNo, data); + size = alphabet.length; + scores = new float[size][size]; + } + else if (scores == null) + { + throw new FileFormatException( + "No alphabet specified in matrix file"); + } + else if (row >= size) + { + throw new FileFormatException("Too many data rows in matrix file"); + } + else + { + parseValues(data, lineNo, scores, row, alphabet); + row++; + if (row == size) + { + break; + } + } + } + if (data != null) + { + System.err.println("Warning: unexpected extra data in matrix file: " + + data); + } + + ScoreMatrix sm = new ScoreMatrix(name, alphabet, scores); + sm.setDescription(description); + matrixName = name; + + return sm; + } + + /** + * Parse one row of score values, delimited by whitespace or commas. The line + * may optionally include the symbol from which the scores are defined. Values + * may be present for all columns, or only up to the diagonal (in which case + * upper diagonal values are set symmetrically). + * + * @param data + * the line to be parsed + * @param lineNo + * @param scores + * the score matrix to add data to + * @param row + * the row number / alphabet index position + * @param alphabet + * @return + * @throws exception + * if invalid, or too few, or too many values + */ + protected void parseValues(String data, int lineNo, float[][] scores, + int row, char[] alphabet) throws FileFormatException + { + String err; + int size = alphabet.length; + StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS); + + int tokenCount = scoreLine.countTokens(); + if (tokenCount == size + 1) + { + /* + * check 'guide' symbol is the row'th letter of the alphabet + */ + String symbol = scoreLine.nextToken(); + if (symbol.length() > 1 || symbol.charAt(0) != alphabet[row]) + { + err = String + .format("Error parsing score matrix at line %d, expected '%s' but found '%s'", + lineNo, alphabet[row], symbol); + throw new FileFormatException(err); + } + } + + tokenCount = scoreLine.countTokens(); + + /* + * AAIndex format only has the lower diagonal i.e. + * 1 score in row 0, 2 in row 1, etc + * check this in all but the last row (which is the same either way) + */ + if (row < size - 1) + { + boolean lowerDiagonal = tokenCount == row + 1; + if (lowerDiagonalOnly && !lowerDiagonal) + { + /* + * had detected lower diagonal form but now it isn't - error + */ + err = String.format("Unexpected number of tokens at line %d", + lineNo); + throw new FileFormatException(err); + } + lowerDiagonalOnly = lowerDiagonal; + } + + if (!lowerDiagonalOnly && tokenCount != size) + { + err = String.format("Expected %d scores at line %d but found %d", + size, lineNo, scoreLine.countTokens()); + throw new FileFormatException(err); + } + scores[row] = new float[size]; + int col = 0; + String value = null; + while (scoreLine.hasMoreTokens()) + { + try + { + value = scoreLine.nextToken(); + scores[row][col] = Float.valueOf(value); + if (lowerDiagonalOnly) + { + scores[col][row] = scores[row][col]; + } + col++; + } catch (NumberFormatException e) + { + err = String.format( + "Invalid score value '%s' at line %d column %d", value, + lineNo, col); + throw new FileFormatException(err); + } + } + } + + /** + * Parse the line in an aaindex file that looks like + * + *
+   * M rows = ARNDCQEGHILKMFPSTWYV, cols = ARNDCQEGHILKMFPSTWYV
+   * 
+ * + * rejecting it if rows and cols do not match. Returns the string of + * characters in the row/cols alphabet. + * + * @param lineNo + * @param data + * @return + * @throws FileFormatException + */ + protected char[] parseAAindexRowsColumns(int lineNo, String data) + throws FileFormatException + { + String err = "Unexpected aaIndex score matrix data at line " + lineNo + + ": " + data; + + try + { + String[] toks = data.split(","); + String rowsAlphabet = toks[0].split("=")[1].trim(); + String colsAlphabet = toks[1].split("=")[1].trim(); + if (!rowsAlphabet.equals(colsAlphabet)) + { + throw new FileFormatException("rows != cols"); + } + return rowsAlphabet.toCharArray(); + } catch (Throwable t) + { + throw new FileFormatException(err + " " + t.getMessage()); + } + } + + /** + * Answers true if line is one we are not interested in from AAindex format + * file + * + * @param data + * @return + */ + protected boolean skipAAindexLine(String data) + { + if (data.startsWith(COMMENT_CHAR) || data.length() == 0) + { + return true; + } + if (data.startsWith("*") || data.startsWith("R ") + || data.startsWith("A ") || data.startsWith("T ") + || data.startsWith("J ") || data.startsWith("//")) + { + return true; + } + return false; + } + + /** * Answers true if the data line consists of the alphabet characters, * delimited (as to provide a heading row). Otherwise returns false (e.g. if * the data is a row of score values). diff --git a/test/jalview/io/IdentifyFileTest.java b/test/jalview/io/IdentifyFileTest.java index 2e4b9e0..dd4f6ba 100644 --- a/test/jalview/io/IdentifyFileTest.java +++ b/test/jalview/io/IdentifyFileTest.java @@ -110,7 +110,8 @@ public class IdentifyFileTest { "examples/testdata/cullpdb_pc25_res3.0_R0.3_d150729_chains9361.fasta.15316", FileFormat.Fasta }, - { "resources/scoreModel/pam250.scm", FileFormat.ScoreMatrix } + { "resources/scoreModel/pam250.scm", FileFormat.ScoreMatrix }, + { "resources/scoreModel/blosum80.scm", FileFormat.ScoreMatrix } // { "examples/testdata/test.amsa", "AMSA" }, // { "examples/test.jnet", "JnetFile" }, }; diff --git a/test/jalview/io/ScoreMatrixFileTest.java b/test/jalview/io/ScoreMatrixFileTest.java index 44bb8aa..cb30874 100644 --- a/test/jalview/io/ScoreMatrixFileTest.java +++ b/test/jalview/io/ScoreMatrixFileTest.java @@ -3,6 +3,7 @@ package jalview.io; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertNull; import static org.testng.Assert.assertTrue; import static org.testng.Assert.fail; @@ -30,48 +31,34 @@ public class ScoreMatrixFileTest * or tab (or combinations) as score value delimiters * this example includes 'guide' symbols on score rows */ - String data = "ScoreMatrix MyTest (example)\n" + "ATU tx-\n" - + "A,1.1,1.2,1.3,1.4, 1.5, 1.6, 1.7\n" - + "T,2.1 2.2 2.3 2.4 2.5 2.6 2.7\n" - + "U\t3.1\t3.2\t3.3\t3.4\t3.5\t3.6\t3.7\n" - + " 4.1 ,4.2,\t,4.3 ,\t4.4\t, \4.5,4.6 4.7\n" - + "t, 5.1,5.3,5.3,5.4,5.5, 5.6, 5.7\n" - + "x\t6.1, 6.2 6.3 6.4 6.5 6.6 6.7\n" - + "-, \t7.1\t7.2 7.3, 7.4, 7.5\t,7.6,7.7\n"; + String data = "ScoreMatrix MyTest (example)\n" + "A\tT\tU\tt\tx\t-\n" + + "A,1.1,1.2,1.3,1.4, 1.5, 1.6\n" + + "T,2.1 2.2 2.3 2.4 2.5 2.6\n" + + "U\t3.1\t3.2\t3.3\t3.4\t3.5\t3.6\t\n" + + "t, 5.1,5.3,5.3,5.4,5.5, 5.6\n" + + "x\t6.1, 6.2 6.3 6.4 6.5 6.6\n" + + "-, \t7.1\t7.2 7.3, 7.4, 7.5\t,7.6\n"; FileParse fp = new FileParse(data, DataSourceType.PASTE); ScoreMatrixFile parser = new ScoreMatrixFile(fp); ScoreMatrix sm = parser.parseMatrix(); assertNotNull(sm); assertEquals(sm.getName(), "MyTest (example)"); + assertEquals(sm.getSize(), 6); + assertNull(sm.getDescription()); assertTrue(sm.isDNA()); assertFalse(sm.isProtein()); assertEquals(sm.getPairwiseScore('A', 'A'), 1.1f); assertEquals(sm.getPairwiseScore('A', 'T'), 1.2f); assertEquals(sm.getPairwiseScore('a', 'T'), 1.2f); // A/a equivalent - assertEquals(sm.getPairwiseScore('A', 't'), 1.5f); // T/t not equivalent - assertEquals(sm.getPairwiseScore('a', 't'), 1.5f); - assertEquals(sm.getPairwiseScore('T', ' '), 2.4f); - assertEquals(sm.getPairwiseScore('U', 'x'), 3.6f); - assertEquals(sm.getPairwiseScore('u', 'x'), 3.6f); + assertEquals(sm.getPairwiseScore('A', 't'), 1.4f); // T/t not equivalent + assertEquals(sm.getPairwiseScore('a', 't'), 1.4f); + assertEquals(sm.getPairwiseScore('U', 'x'), 3.5f); + assertEquals(sm.getPairwiseScore('u', 'x'), 3.5f); assertEquals(sm.getPairwiseScore('U', 'X'), 0f); // X (upper) unmapped assertEquals(sm.getPairwiseScore('A', '.'), 0f); // . unmapped - assertEquals(sm.getPairwiseScore('-', '-'), 7.7f); + assertEquals(sm.getPairwiseScore('-', '-'), 7.6f); assertEquals(sm.getPairwiseScore('A', (char) 128), 0f); // out of range - - /* - * without guide symbols on score rows - */ - data = "ScoreMatrix MyTest\nXY\n1 2\n3 4\n"; - fp = new FileParse(data, DataSourceType.PASTE); - parser = new ScoreMatrixFile(fp); - sm = parser.parseMatrix(); - assertNotNull(sm); - assertEquals(sm.getPairwiseScore('X', 'X'), 1f); - assertEquals(sm.getPairwiseScore('X', 'y'), 2f); - assertEquals(sm.getPairwiseScore('y', 'x'), 3f); - assertEquals(sm.getPairwiseScore('y', 'Y'), 4f); - assertEquals(sm.getPairwiseScore('D', 'R'), 0f); } @Test(groups = "Functional") @@ -79,7 +66,7 @@ public class ScoreMatrixFileTest { String data; - data = "XY\n1 2\n3 4\n"; + data = "X Y\n1 2\n3 4\n"; try { new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE)) @@ -95,7 +82,7 @@ public class ScoreMatrixFileTest @Test(groups = "Functional") public void testParse_notEnoughRows() { - String data = "ScoreMatrix MyTest\nXY\n1 2\n"; + String data = "ScoreMatrix MyTest\nX Y\n1 2\n"; try { new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE)) @@ -111,7 +98,7 @@ public class ScoreMatrixFileTest @Test(groups = "Functional") public void testParse_notEnoughColumns() { - String data = "ScoreMatrix MyTest\nXY\n1 2\n3\n"; + String data = "ScoreMatrix MyTest\nX Y\n1 2\n3\n"; try { new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE)) @@ -130,7 +117,7 @@ public class ScoreMatrixFileTest /* * with two too many columns: */ - String data = "ScoreMatrix MyTest\nXY\n1 2\n3 4 5 6\n"; + String data = "ScoreMatrix MyTest\nX\tY\n1 2\n3 4 5 6\n"; try { new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE)) @@ -145,7 +132,7 @@ public class ScoreMatrixFileTest /* * with guide character and one too many columns: */ - data = "ScoreMatrix MyTest\nXY\nX 1 2\nY 3 4 5\n"; + data = "ScoreMatrix MyTest\nX Y\nX 1 2\nY 3 4 5\n"; try { new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE)) @@ -161,7 +148,7 @@ public class ScoreMatrixFileTest * with no guide character and one too many columns: * parser guesses the first column is the guide character */ - data = "ScoreMatrix MyTest\nXY\n1 2\n3 4 5\n"; + data = "ScoreMatrix MyTest\nX Y\n1 2\n3 4 5\n"; try { new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE)) @@ -177,7 +164,7 @@ public class ScoreMatrixFileTest @Test(groups = "Functional") public void testParse_tooManyRows() { - String data = "ScoreMatrix MyTest\nXY\n1 2\n3 4\n6 7"; + String data = "ScoreMatrix MyTest\n\tX\tY\n1 2\n3 4\n6 7"; try { new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE)) @@ -193,7 +180,7 @@ public class ScoreMatrixFileTest @Test(groups = "Functional") public void testParse_badDelimiter() { - String data = "ScoreMatrix MyTest\nXY\n1|2\n3|4\n"; + String data = "ScoreMatrix MyTest\n X Y Z\n1|2|3\n4|5|6\n"; try { new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE)) @@ -202,14 +189,14 @@ public class ScoreMatrixFileTest } catch (IOException e) { assertEquals(e.getMessage(), - "Expected 2 scores at line 3 but found 1"); + "Invalid score value '1|2|3' at line 3 column 0"); } } @Test(groups = "Functional") public void testParse_badFloat() { - String data = "ScoreMatrix MyTest\nXY\n1 2\n3 four\n"; + String data = "ScoreMatrix MyTest\n\tX\tY\n1 2\n3 four\n"; try { new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE)) @@ -225,7 +212,7 @@ public class ScoreMatrixFileTest @Test(groups = "Functional") public void testParse_badGuideCharacter() { - String data = "ScoreMatrix MyTest\nXY\nX 1 2\ny 3 4\n"; + String data = "ScoreMatrix MyTest\n\tX Y\nX 1 2\ny 3 4\n"; try { new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE)) @@ -244,7 +231,7 @@ public class ScoreMatrixFileTest /* * Name missing */ - String data = "ScoreMatrix\nXY\n1 2\n3 4\n"; + String data = "ScoreMatrix\nX Y\n1 2\n3 4\n"; try { new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE)) @@ -265,10 +252,10 @@ public class ScoreMatrixFileTest * @throws MalformedURLException */ @Test(groups = "Functional") - public void testParse_withResidueHeading() throws MalformedURLException, + public void testParse_ncbiFormat() throws MalformedURLException, IOException { - String data = "ScoreMatrix MyTest\n" + "ABC\n" + "\tA\tB\tC\n" + String data = "ScoreMatrix MyTest\n" + "\tA\tB\tC\n" + "A\t1.0\t2.0\t3.0\n" + "B\t4.0\t5.0\t6.0\n" + "C\t7.0\t8.0\t9.0\n"; FileParse fp = new FileParse(data, DataSourceType.PASTE); @@ -281,4 +268,33 @@ public class ScoreMatrixFileTest assertEquals(sm.getPairwiseScore('B', 'c'), 6.0f); assertEquals(sm.getSize(), 3); } + + /** + * Test a successful parse of a (small) score matrix file + * + * @throws IOException + * @throws MalformedURLException + */ + @Test(groups = "Functional") + public void testParse_aaIndexBlosum80() throws MalformedURLException, + IOException + { + FileParse fp = new FileParse("resources/scoreModel/blosum80.scm", + DataSourceType.FILE); + ScoreMatrixFile parser = new ScoreMatrixFile(fp); + ScoreMatrix sm = parser.parseMatrix(); + + assertNotNull(sm); + assertEquals(sm.getName(), "HENS920103"); + assertEquals(sm.getDescription(), + "BLOSUM80 substitution matrix (Henikoff-Henikoff, 1992)"); + assertFalse(sm.isDNA()); + assertTrue(sm.isProtein()); + assertEquals(20, sm.getSize()); + assertEquals(sm.getGapIndex(), -1); + + assertEquals(sm.getPairwiseScore('A', 'A'), 7f); + assertEquals(sm.getPairwiseScore('A', 'R'), -3f); + assertEquals(sm.getPairwiseScore('r', 'a'), -3f); // A/a equivalent + } }