else
{
sb.append("ScoreMatrix ").append(getName()).append("\n");
- sb.append(symbols).append("\n");
}
for (char sym : symbols)
{
*
* @return
*/
- public String getSymbols()
+ String getSymbols()
{
return new String(symbols);
}
boolean lineswereskipped = false;
boolean isBinary = false; // true if length is non-zero and non-printable
// characters are encountered
+
try
{
if (!closeSource)
{
source.mark();
}
+ boolean aaIndexHeaderRead = false;
+
while ((data = source.nextLine()) != null)
{
bytesRead += data.length();
reply = FileFormat.ScoreMatrix;
break;
}
+ if (data.startsWith("H ") && !aaIndexHeaderRead)
+ {
+ aaIndexHeaderRead = true;
+ }
+ if (data.startsWith("D ") && aaIndexHeaderRead)
+ {
+ reply = FileFormat.ScoreMatrix;
+ break;
+ }
if (data.startsWith("##GFF-VERSION"))
{
// GFF - possibly embedded in a Jalview features file!
* ScoreMatrix BLOSUM62
* </pre>
*
- * Also accepts 'aaindex' format (as described at
+ * Also accepts 'AAindex' format (as described at
* http://www.genome.jp/aaindex/aaindex_help.html) with the minimum data
* required being
*
private String matrixName;
+ boolean lowerDiagonalOnly;
+
/**
* Constructor
*
/**
* Parses the score matrix file, and if successful registers the matrix so it
- * will be shown in Jalview menus.
+ * will be shown in Jalview menus. This method is not thread-safe (a separate
+ * instance of this class should be used by each thread).
*/
@Override
public void parse() throws IOException
ScoreMatrix sm = null;
int lineNo = 0;
String name = null;
- String alphabet = null;
+ char[] alphabet = null;
float[][] scores = null;
int size = 0;
int row = 0;
String err = null;
String data;
+ lowerDiagonalOnly = false;
while ((data = nextLine()) != null)
{
name = data.substring(1).substring(data.substring(1).indexOf(name));
continue;
}
+ else if (data.startsWith("H ") && name == null)
+ {
+ /*
+ * AAindex identifier
+ */
+ return parseAAIndexFormat(lineNo, data);
+ }
else if (name == null)
{
err = "Format error: 'ScoreMatrix <name>' should be the first non-comment line";
}
/*
- * next line after ScoreMatrix should be the alphabet of scored symbols
+ * next non-comment line after ScoreMatrix should be the
+ * column header line with the alphabet of scored symbols
*/
if (alphabet == null)
{
- alphabet = data;
- size = alphabet.length();
+ StringTokenizer columnHeadings = new StringTokenizer(data,
+ DELIMITERS);
+ size = columnHeadings.countTokens();
+ alphabet = new char[size];
+ int col = 0;
+ while (columnHeadings.hasMoreTokens())
+ {
+ alphabet[col++] = columnHeadings.nextToken().charAt(0);
+ }
scores = new float[size][];
continue;
}
throw new FileFormatException(err);
}
- /*
- * permit an uncommented line with delimited residue headings
- */
- if (isHeaderLine(data, alphabet))
- {
- continue;
- }
-
- /*
- * subsequent lines should be the symbol scores
- * optionally with the symbol as the first column for readability
- */
- StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS);
- int tokenCount = scoreLine.countTokens();
- if (tokenCount == size + 1)
- {
- /*
- * check 'guide' symbol is the row'th letter of the alphabet
- */
- String symbol = scoreLine.nextToken();
- if (symbol.length() > 1 || symbol.charAt(0) != alphabet.charAt(row))
- {
- err = String
- .format("Error parsing score matrix at line %d, expected '%s' but found '%s'",
- lineNo, alphabet.charAt(row), symbol);
- throw new FileFormatException(err);
- }
- }
- if (scoreLine.countTokens() != size)
- {
- err = String.format("Expected %d scores at line %d but found %d",
- size, lineNo, scoreLine.countTokens());
- throw new FileFormatException(err);
- }
- scores[row] = new float[size];
- int col = 0;
- String value = null;
- while (scoreLine.hasMoreTokens())
- {
- try
- {
- value = scoreLine.nextToken();
- scores[row][col] = Float.valueOf(value);
- col++;
- } catch (NumberFormatException e)
- {
- err = String.format(
- "Invalid score value '%s' at line %d column %d", value,
- lineNo, col);
- throw new FileFormatException(err);
- }
- }
+ parseValues(data, lineNo, scores, row, alphabet);
row++;
}
/*
* If we get here, then name, alphabet and scores have been parsed successfully
*/
- sm = new ScoreMatrix(name, alphabet.toCharArray(), scores);
+ sm = new ScoreMatrix(name, alphabet, scores);
matrixName = name;
return sm;
}
/**
+   * Parses one score matrix entry in AAindex format. Parsing starts with the
+   * 'H' header line (which supplies the matrix name) already read, and reads
+   * further lines until all rows of the matrix have been read or input is
+   * exhausted. A 'D' line supplies the description and an 'M' line the
+   * row/column alphabet; lines accepted by skipAAindexLine are ignored; any
+   * other line is parsed as a row of score values.
+   * <p>
+   * If further non-skipped content follows the last matrix row, a warning is
+   * written to stderr for the first such line and reading stops.
+   *
+   * @param lineNo
+   *          the line number of the header line as counted by the caller;
+   *          incremented here for each line read and used in error messages
+   * @param data
+   *          the header line; the text after its first two characters is
+   *          used (trimmed) as the matrix name
+   * @return the parsed score matrix, with its description set (may be null)
+   * @throws IOException
+   *           on failure to read input, or (as FileFormatException) if data
+   *           rows precede the alphabet, the alphabet is missing, a row is
+   *           invalid, or there are too few data rows
+   */
+  protected ScoreMatrix parseAAIndexFormat(int lineNo, String data)
+          throws IOException
+  {
+    String name = data.substring(2).trim();
+    String description = null;
+
+    float[][] scores = null;
+    char[] alphabet = null;
+    int row = 0;
+    int size = 0;
+
+    /*
+     * set true once the last expected row has been parsed
+     */
+    boolean complete = false;
+
+    while ((data = nextLine()) != null)
+    {
+      lineNo++;
+      data = data.trim();
+      if (skipAAindexLine(data))
+      {
+        continue;
+      }
+      if (complete)
+      {
+        /*
+         * the matrix is already fully populated, so this line is surplus;
+         * report it once and stop reading (trailing comments, blank lines
+         * and the '//' terminator were skipped above and are not reported)
+         */
+        System.err.println("Warning: unexpected extra data in matrix file: "
+                + data);
+        break;
+      }
+      if (data.startsWith("D "))
+      {
+        description = data.substring(2).trim();
+      }
+      else if (data.startsWith("M "))
+      {
+        alphabet = parseAAindexRowsColumns(lineNo, data);
+        size = alphabet.length;
+        scores = new float[size][size];
+      }
+      else if (scores == null)
+      {
+        throw new FileFormatException(
+                "No alphabet specified in matrix file");
+      }
+      else if (row >= size)
+      {
+        throw new FileFormatException("Too many data rows in matrix file");
+      }
+      else
+      {
+        parseValues(data, lineNo, scores, row, alphabet);
+        row++;
+        complete = (row == size);
+      }
+    }
+
+    /*
+     * reject incomplete input rather than constructing a matrix with a
+     * null alphabet or with unread (zero-filled) rows
+     */
+    if (scores == null)
+    {
+      throw new FileFormatException("No alphabet specified in matrix file");
+    }
+    if (row < size)
+    {
+      throw new FileFormatException(String.format(
+              "Expected %d rows of score data in score matrix but only found %d",
+              size, row));
+    }
+
+    ScoreMatrix sm = new ScoreMatrix(name, alphabet, scores);
+    sm.setDescription(description);
+    matrixName = name;
+
+    return sm;
+  }
+
+  /**
+   * Parses one row of score values, delimited by any of the DELIMITERS
+   * characters, into the given row of the scores array. The line may
+   * optionally start with a 'guide' symbol, which must then be the symbol at
+   * position <code>row</code> of the alphabet.
+   * <p>
+   * Values may be present for all columns, or only up to and including the
+   * diagonal (row 0 has one value, row 1 has two, and so on), in which case
+   * the corresponding upper diagonal values are set symmetrically. Which of
+   * the two layouts applies is decided by the first row (row 0) and recorded
+   * in the lowerDiagonalOnly field; every later row must then conform to it.
+   *
+   * @param data
+   *          the line to be parsed
+   * @param lineNo
+   *          the line number, used only in error messages
+   * @param scores
+   *          the score matrix to add data to; row <code>row</code> is
+   *          (re)allocated here
+   * @param row
+   *          the row number / alphabet index position
+   * @param alphabet
+   *          the symbols scored by the matrix, in row (and column) order
+   * @throws FileFormatException
+   *           if the guide symbol is wrong, a value is not a valid number,
+   *           or there are too few or too many values
+   */
+  protected void parseValues(String data, int lineNo, float[][] scores,
+          int row, char[] alphabet) throws FileFormatException
+  {
+    String err;
+    int size = alphabet.length;
+    StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS);
+
+    int tokenCount = scoreLine.countTokens();
+    if (tokenCount == size + 1)
+    {
+      /*
+       * check 'guide' symbol is the row'th letter of the alphabet
+       */
+      String symbol = scoreLine.nextToken();
+      if (symbol.length() > 1 || symbol.charAt(0) != alphabet[row])
+      {
+        err = String
+                .format("Error parsing score matrix at line %d, expected '%s' but found '%s'",
+                        lineNo, alphabet[row], symbol);
+        throw new FileFormatException(err);
+      }
+    }
+
+    /*
+     * number of score values remaining after any guide symbol
+     */
+    tokenCount = scoreLine.countTokens();
+
+    if (row == 0)
+    {
+      /*
+       * AAIndex format only has the lower diagonal i.e.
+       * 1 score in row 0, 2 in row 1, etc; detect this from the first row
+       * (a 1x1 matrix is the same either way, so is treated as full)
+       */
+      lowerDiagonalOnly = size > 1 && tokenCount == 1;
+    }
+
+    /*
+     * every row, including the last, must have the number of values
+     * required by the layout detected in row 0
+     */
+    int expected = lowerDiagonalOnly ? row + 1 : size;
+    if (tokenCount != expected)
+    {
+      if (lowerDiagonalOnly)
+      {
+        err = String.format("Unexpected number of tokens at line %d",
+                lineNo);
+      }
+      else
+      {
+        err = String.format("Expected %d scores at line %d but found %d",
+                size, lineNo, tokenCount);
+      }
+      throw new FileFormatException(err);
+    }
+
+    scores[row] = new float[size];
+    int col = 0;
+    String value = null;
+    while (scoreLine.hasMoreTokens())
+    {
+      try
+      {
+        value = scoreLine.nextToken();
+        scores[row][col] = Float.parseFloat(value);
+        if (lowerDiagonalOnly)
+        {
+          /*
+           * mirror into the upper diagonal; col <= row here, so the target
+           * row has already been allocated
+           */
+          scores[col][row] = scores[row][col];
+        }
+        col++;
+      } catch (NumberFormatException e)
+      {
+        err = String.format(
+                "Invalid score value '%s' at line %d column %d", value,
+                lineNo, col);
+        throw new FileFormatException(err);
+      }
+    }
+  }
+
+  /**
+   * Parses the line in an AAindex file that looks like
+   *
+   * <pre>
+   * M rows = ARNDCQEGHILKMFPSTWYV, cols = ARNDCQEGHILKMFPSTWYV
+   * </pre>
+   *
+   * rejecting it if it does not have two comma-separated 'key = value'
+   * parts, or if the rows and cols values do not match. Returns the
+   * characters of the (shared) rows/cols alphabet.
+   *
+   * @param lineNo
+   *          the line number, used only in error messages
+   * @param data
+   *          the line to parse
+   * @return the alphabet as a character array
+   * @throws FileFormatException
+   *           if the line is null or malformed, or rows and cols differ
+   */
+  protected char[] parseAAindexRowsColumns(int lineNo, String data)
+          throws FileFormatException
+  {
+    String err = "Unexpected aaIndex score matrix data at line " + lineNo
+            + ": " + data;
+
+    if (data == null)
+    {
+      throw new FileFormatException(err + " no data");
+    }
+
+    String[] toks = data.split(",");
+    if (toks.length < 2)
+    {
+      throw new FileFormatException(err
+              + " expected 'rows = ..., cols = ...'");
+    }
+
+    /*
+     * extract the value following '=' in each of the first two parts
+     * (rows first, then cols)
+     */
+    String[] alphabets = new String[2];
+    for (int i = 0; i < alphabets.length; i++)
+    {
+      String[] keyValue = toks[i].split("=");
+      if (keyValue.length < 2)
+      {
+        throw new FileFormatException(err + " missing value after '='");
+      }
+      alphabets[i] = keyValue[1].trim();
+    }
+
+    if (!alphabets[0].equals(alphabets[1]))
+    {
+      throw new FileFormatException(err + " rows != cols");
+    }
+    return alphabets[0].toCharArray();
+  }
+
+  /**
+   * Answers true if the (trimmed) line from an AAindex format file should be
+   * ignored by the parser: that is, if it is empty, starts with the comment
+   * character, or starts with one of the prefixes "*", "R ", "A ", "T ",
+   * "J " or "//". Answers false for any other line.
+   *
+   * @param data
+   *          the line to test
+   * @return true if the line should be skipped, else false
+   */
+  protected boolean skipAAindexLine(String data)
+  {
+    if (data.length() == 0 || data.startsWith(COMMENT_CHAR))
+    {
+      return true;
+    }
+
+    /*
+     * line prefixes whose content the parser makes no use of
+     */
+    String[] ignoredPrefixes = { "*", "R ", "A ", "T ", "J ", "//" };
+    for (String prefix : ignoredPrefixes)
+    {
+      if (data.startsWith(prefix))
+      {
+        return true;
+      }
+    }
+    return false;
+  }
+
+ /**
* Answers true if the data line consists of the alphabet characters,
* delimited (as to provide a heading row). Otherwise returns false (e.g. if
* the data is a row of score values).
{
"examples/testdata/cullpdb_pc25_res3.0_R0.3_d150729_chains9361.fasta.15316",
FileFormat.Fasta },
- { "resources/scoreModel/pam250.scm", FileFormat.ScoreMatrix }
+ { "resources/scoreModel/pam250.scm", FileFormat.ScoreMatrix },
+ { "resources/scoreModel/blosum80.scm", FileFormat.ScoreMatrix }
// { "examples/testdata/test.amsa", "AMSA" },
// { "examples/test.jnet", "JnetFile" },
};
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertFalse;
import static org.testng.Assert.assertNotNull;
+import static org.testng.Assert.assertNull;
import static org.testng.Assert.assertTrue;
import static org.testng.Assert.fail;
* or tab (or combinations) as score value delimiters
* this example includes 'guide' symbols on score rows
*/
- String data = "ScoreMatrix MyTest (example)\n" + "ATU tx-\n"
- + "A,1.1,1.2,1.3,1.4, 1.5, 1.6, 1.7\n"
- + "T,2.1 2.2 2.3 2.4 2.5 2.6 2.7\n"
- + "U\t3.1\t3.2\t3.3\t3.4\t3.5\t3.6\t3.7\n"
- + " 4.1 ,4.2,\t,4.3 ,\t4.4\t, \4.5,4.6 4.7\n"
- + "t, 5.1,5.3,5.3,5.4,5.5, 5.6, 5.7\n"
- + "x\t6.1, 6.2 6.3 6.4 6.5 6.6 6.7\n"
- + "-, \t7.1\t7.2 7.3, 7.4, 7.5\t,7.6,7.7\n";
+ String data = "ScoreMatrix MyTest (example)\n" + "A\tT\tU\tt\tx\t-\n"
+ + "A,1.1,1.2,1.3,1.4, 1.5, 1.6\n"
+ + "T,2.1 2.2 2.3 2.4 2.5 2.6\n"
+ + "U\t3.1\t3.2\t3.3\t3.4\t3.5\t3.6\t\n"
+ + "t, 5.1,5.3,5.3,5.4,5.5, 5.6\n"
+ + "x\t6.1, 6.2 6.3 6.4 6.5 6.6\n"
+ + "-, \t7.1\t7.2 7.3, 7.4, 7.5\t,7.6\n";
FileParse fp = new FileParse(data, DataSourceType.PASTE);
ScoreMatrixFile parser = new ScoreMatrixFile(fp);
ScoreMatrix sm = parser.parseMatrix();
assertNotNull(sm);
assertEquals(sm.getName(), "MyTest (example)");
+ assertEquals(sm.getSize(), 6);
+ assertNull(sm.getDescription());
assertTrue(sm.isDNA());
assertFalse(sm.isProtein());
assertEquals(sm.getPairwiseScore('A', 'A'), 1.1f);
assertEquals(sm.getPairwiseScore('A', 'T'), 1.2f);
assertEquals(sm.getPairwiseScore('a', 'T'), 1.2f); // A/a equivalent
- assertEquals(sm.getPairwiseScore('A', 't'), 1.5f); // T/t not equivalent
- assertEquals(sm.getPairwiseScore('a', 't'), 1.5f);
- assertEquals(sm.getPairwiseScore('T', ' '), 2.4f);
- assertEquals(sm.getPairwiseScore('U', 'x'), 3.6f);
- assertEquals(sm.getPairwiseScore('u', 'x'), 3.6f);
+ assertEquals(sm.getPairwiseScore('A', 't'), 1.4f); // T/t not equivalent
+ assertEquals(sm.getPairwiseScore('a', 't'), 1.4f);
+ assertEquals(sm.getPairwiseScore('U', 'x'), 3.5f);
+ assertEquals(sm.getPairwiseScore('u', 'x'), 3.5f);
assertEquals(sm.getPairwiseScore('U', 'X'), 0f); // X (upper) unmapped
assertEquals(sm.getPairwiseScore('A', '.'), 0f); // . unmapped
- assertEquals(sm.getPairwiseScore('-', '-'), 7.7f);
+ assertEquals(sm.getPairwiseScore('-', '-'), 7.6f);
assertEquals(sm.getPairwiseScore('A', (char) 128), 0f); // out of range
-
- /*
- * without guide symbols on score rows
- */
- data = "ScoreMatrix MyTest\nXY\n1 2\n3 4\n";
- fp = new FileParse(data, DataSourceType.PASTE);
- parser = new ScoreMatrixFile(fp);
- sm = parser.parseMatrix();
- assertNotNull(sm);
- assertEquals(sm.getPairwiseScore('X', 'X'), 1f);
- assertEquals(sm.getPairwiseScore('X', 'y'), 2f);
- assertEquals(sm.getPairwiseScore('y', 'x'), 3f);
- assertEquals(sm.getPairwiseScore('y', 'Y'), 4f);
- assertEquals(sm.getPairwiseScore('D', 'R'), 0f);
}
@Test(groups = "Functional")
{
String data;
- data = "XY\n1 2\n3 4\n";
+ data = "X Y\n1 2\n3 4\n";
try
{
new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE))
@Test(groups = "Functional")
public void testParse_notEnoughRows()
{
- String data = "ScoreMatrix MyTest\nXY\n1 2\n";
+ String data = "ScoreMatrix MyTest\nX Y\n1 2\n";
try
{
new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE))
@Test(groups = "Functional")
public void testParse_notEnoughColumns()
{
- String data = "ScoreMatrix MyTest\nXY\n1 2\n3\n";
+ String data = "ScoreMatrix MyTest\nX Y\n1 2\n3\n";
try
{
new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE))
/*
* with two too many columns:
*/
- String data = "ScoreMatrix MyTest\nXY\n1 2\n3 4 5 6\n";
+ String data = "ScoreMatrix MyTest\nX\tY\n1 2\n3 4 5 6\n";
try
{
new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE))
/*
* with guide character and one too many columns:
*/
- data = "ScoreMatrix MyTest\nXY\nX 1 2\nY 3 4 5\n";
+ data = "ScoreMatrix MyTest\nX Y\nX 1 2\nY 3 4 5\n";
try
{
new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE))
* with no guide character and one too many columns:
* parser guesses the first column is the guide character
*/
- data = "ScoreMatrix MyTest\nXY\n1 2\n3 4 5\n";
+ data = "ScoreMatrix MyTest\nX Y\n1 2\n3 4 5\n";
try
{
new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE))
@Test(groups = "Functional")
public void testParse_tooManyRows()
{
- String data = "ScoreMatrix MyTest\nXY\n1 2\n3 4\n6 7";
+ String data = "ScoreMatrix MyTest\n\tX\tY\n1 2\n3 4\n6 7";
try
{
new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE))
@Test(groups = "Functional")
public void testParse_badDelimiter()
{
- String data = "ScoreMatrix MyTest\nXY\n1|2\n3|4\n";
+ String data = "ScoreMatrix MyTest\n X Y Z\n1|2|3\n4|5|6\n";
try
{
new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE))
} catch (IOException e)
{
assertEquals(e.getMessage(),
- "Expected 2 scores at line 3 but found 1");
+ "Invalid score value '1|2|3' at line 3 column 0");
}
}
@Test(groups = "Functional")
public void testParse_badFloat()
{
- String data = "ScoreMatrix MyTest\nXY\n1 2\n3 four\n";
+ String data = "ScoreMatrix MyTest\n\tX\tY\n1 2\n3 four\n";
try
{
new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE))
@Test(groups = "Functional")
public void testParse_badGuideCharacter()
{
- String data = "ScoreMatrix MyTest\nXY\nX 1 2\ny 3 4\n";
+ String data = "ScoreMatrix MyTest\n\tX Y\nX 1 2\ny 3 4\n";
try
{
new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE))
/*
* Name missing
*/
- String data = "ScoreMatrix\nXY\n1 2\n3 4\n";
+ String data = "ScoreMatrix\nX Y\n1 2\n3 4\n";
try
{
new ScoreMatrixFile(new FileParse(data, DataSourceType.PASTE))
* @throws MalformedURLException
*/
@Test(groups = "Functional")
- public void testParse_withResidueHeading() throws MalformedURLException,
+ public void testParse_ncbiFormat() throws MalformedURLException,
IOException
{
- String data = "ScoreMatrix MyTest\n" + "ABC\n" + "\tA\tB\tC\n"
+ String data = "ScoreMatrix MyTest\n" + "\tA\tB\tC\n"
+ "A\t1.0\t2.0\t3.0\n" + "B\t4.0\t5.0\t6.0\n"
+ "C\t7.0\t8.0\t9.0\n";
FileParse fp = new FileParse(data, DataSourceType.PASTE);
assertEquals(sm.getPairwiseScore('B', 'c'), 6.0f);
assertEquals(sm.getSize(), 3);
}
+
+  /**
+   * Test a successful parse of the BLOSUM80 score matrix resource file, which
+   * is expected to be parsed as AAindex format (name taken from the accession
+   * id, description from the 'D' line, 20 symbols and no gap symbol)
+   *
+   * @throws IOException
+   * @throws MalformedURLException
+   */
+  @Test(groups = "Functional")
+  public void testParse_aaIndexBlosum80() throws MalformedURLException,
+          IOException
+  {
+    FileParse fp = new FileParse("resources/scoreModel/blosum80.scm",
+            DataSourceType.FILE);
+    ScoreMatrixFile parser = new ScoreMatrixFile(fp);
+    ScoreMatrix sm = parser.parseMatrix();
+
+    assertNotNull(sm);
+    assertEquals(sm.getName(), "HENS920103");
+    assertEquals(sm.getDescription(),
+            "BLOSUM80 substitution matrix (Henikoff-Henikoff, 1992)");
+    assertFalse(sm.isDNA());
+    assertTrue(sm.isProtein());
+    // TestNG argument order is (actual, expected)
+    assertEquals(sm.getSize(), 20);
+    assertEquals(sm.getGapIndex(), -1);
+
+    assertEquals(sm.getPairwiseScore('A', 'A'), 7f);
+    assertEquals(sm.getPairwiseScore('A', 'R'), -3f);
+    assertEquals(sm.getPairwiseScore('r', 'a'), -3f); // A/a equivalent
+  }
}