/**
* A class that can parse a file containing a substitution matrix and register
* it for use in Jalview
+ * <p>
+ * Accepts 'NCBI' format (e.g.
+ * https://www.ncbi.nlm.nih.gov/Class/FieldGuide/BLOSUM62.txt), with the
+ * addition of a header line to provide a matrix name, e.g.
*
- * @author gmcarstairs
- *
+ * <pre>
+ * ScoreMatrix BLOSUM62
+ * </pre>
+ *
+ * Also accepts 'AAindex' format (as described at
+ * http://www.genome.jp/aaindex/aaindex_help.html) with the minimum data
+ * required being
+ *
+ * <pre>
+ * H accession number (used as score matrix identifier in Jalview)
+ * D description (used for tooltip in Jalview)
+ * M rows = symbolList
+ * and the substitution scores
+ * </pre>
*/
-// TODO modify the AlignFile / IdentifyFile pattern so that non-alignment files
-// like this are handled more naturally
public class ScoreMatrixFile extends AlignFile implements
AlignmentFileReaderI
{
private String matrixName;
+ /*
+ * aaindex format has scores for diagonal and below only
+ */
+ boolean isLowerDiagonalOnly;
+
+ /*
+ * ncbi format has symbols as first column on score rows
+ */
+ boolean hasGuideColumn;
+
/**
* Constructor
*
/**
* Parses the score matrix file, and if successful registers the matrix so it
- * will be shown in Jalview menus.
+ * will be shown in Jalview menus. This method is not thread-safe (a separate
+ * instance of this class should be used by each thread).
*/
@Override
public void parse() throws IOException
ScoreMatrix sm = null;
int lineNo = 0;
String name = null;
- String alphabet = null;
+ char[] alphabet = null;
float[][] scores = null;
int size = 0;
int row = 0;
String err = null;
String data;
+ isLowerDiagonalOnly = false;
while ((data = nextLine()) != null)
{
*/
if (name != null)
{
- System.err
- .println("Warning: 'ScoreMatrix' repeated in file at line "
+ throw new FileFormatException(
+ "Error: 'ScoreMatrix' repeated in file at line "
+ lineNo);
}
StringTokenizer nameLine = new StringTokenizer(data, DELIMITERS);
name = data.substring(1).substring(data.substring(1).indexOf(name));
continue;
}
+ else if (data.startsWith("H ") && name == null)
+ {
+ /*
+ * AAindex identifier
+ */
+ return parseAAIndexFormat(lineNo, data);
+ }
else if (name == null)
{
err = "Format error: 'ScoreMatrix <name>' should be the first non-comment line";
}
/*
- * next line after ScoreMatrix should be the alphabet of scored symbols
+ * next non-comment line after ScoreMatrix should be the
+ * column header line with the alphabet of scored symbols
*/
if (alphabet == null)
{
- alphabet = data;
- size = alphabet.length();
+ StringTokenizer columnHeadings = new StringTokenizer(data,
+ DELIMITERS);
+ size = columnHeadings.countTokens();
+ alphabet = new char[size];
+ int col = 0;
+ while (columnHeadings.hasMoreTokens())
+ {
+ alphabet[col++] = columnHeadings.nextToken().charAt(0);
+ }
scores = new float[size][];
continue;
}
throw new FileFormatException(err);
}
- /*
- * permit an uncommented line with delimited residue headings
- */
- if (isHeaderLine(data, alphabet))
- {
- continue;
- }
-
- /*
- * subsequent lines should be the symbol scores
- * optionally with the symbol as the first column for readability
- */
- StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS);
- if (scoreLine.countTokens() == size + 1)
- {
- /*
- * check 'guide' symbol is the row'th letter of the alphabet
- */
- String symbol = scoreLine.nextToken();
- if (symbol.length() > 1 || symbol.charAt(0) != alphabet.charAt(row))
- {
- err = String
- .format("Error parsing score matrix at line %d, expected '%s' but found '%s'",
- lineNo, alphabet.charAt(row), symbol);
- throw new FileFormatException(err);
- }
- }
- if (scoreLine.countTokens() != size)
- {
- err = String.format("Expected %d scores at line %d but found %d",
- size, lineNo, scoreLine.countTokens());
- throw new FileFormatException(err);
- }
- scores[row] = new float[size];
- int col = 0;
- String value = null;
- while (scoreLine.hasMoreTokens())
- {
- try
- {
- value = scoreLine.nextToken();
- scores[row][col] = Float.valueOf(value);
- col++;
- } catch (NumberFormatException e)
- {
- err = String.format(
- "Invalid score value '%s' at line %d column %d", value,
- lineNo, col);
- throw new FileFormatException(err);
- }
- }
+ parseValues(data, lineNo, scores, row, alphabet);
row++;
}
/*
* If we get here, then name, alphabet and scores have been parsed successfully
*/
- sm = new ScoreMatrix(name, alphabet.toCharArray(), scores);
+ sm = new ScoreMatrix(name, alphabet, scores);
+ matrixName = name;
+
+ return sm;
+ }
+
+ /**
+ * Parse input as AAIndex format, starting from the header line with the
+ * accession id. The text after the leading "H " of the header line is used
+ * as the matrix name. Each following line is then handled by its prefix:
+ * lines accepted by skipAAindexLine are ignored, a "D " line supplies the
+ * description, an "M " line supplies the alphabet (and sizes the score
+ * array), and any other line is parsed as a row of scores.
+ *
+ * @param lineNo
+ * the line number of the header line; incremented for each further
+ * line read and passed on for use in error messages
+ * @param data
+ * the header line (starting "H ")
+ * @return the score matrix built from the parsed name, description,
+ * alphabet and scores
+ * @throws IOException
+ * if raised while reading lines
+ * @throws FileFormatException
+ * if score data precedes the "M " line, no "M " line is found,
+ * there are more or fewer score rows than symbols, or a score
+ * row is invalid
+ */
+ protected ScoreMatrix parseAAIndexFormat(int lineNo, String data)
+ throws IOException
+ {
+ String name = data.substring(2).trim();
+ String description = null;
+
+ float[][] scores = null;
+ char[] alphabet = null;
+ int row = 0;
+ int size = 0;
+
+ while ((data = nextLine()) != null)
+ {
+ lineNo++;
+ data = data.trim();
+ if (skipAAindexLine(data))
+ {
+ continue;
+ }
+ if (data.startsWith("D "))
+ {
+ description = data.substring(2).trim();
+ }
+ else if (data.startsWith("M "))
+ {
+ alphabet = parseAAindexRowsColumns(lineNo, data);
+ size = alphabet.length;
+ scores = new float[size][size];
+ }
+ else if (scores == null)
+ {
+ throw new FileFormatException(
+ "No alphabet specified in matrix file");
+ }
+ else if (row >= size)
+ {
+ throw new FileFormatException("Too many data rows in matrix file");
+ }
+ else
+ {
+ parseValues(data, lineNo, scores, row, alphabet);
+ row++;
+ }
+ }
+
+ /*
+ * reject a file that ended with no 'M' line, or with fewer score rows
+ * than symbols, rather than build a matrix from missing or partial data
+ */
+ if (alphabet == null)
+ {
+ throw new FileFormatException(
+ "No alphabet specified in matrix file");
+ }
+ if (row < size)
+ {
+ throw new FileFormatException(String.format(
+ "Expected %d rows of score data in matrix file but only found %d",
+ size, row));
+ }
+
+ ScoreMatrix sm = new ScoreMatrix(name, description, alphabet, scores);
matrixName = name;
return sm;
}
/**
- * Answers true if the data line consists of the alphabet characters,
- * delimited (as to provide a heading row). Otherwise returns false (e.g. if
- * the data is a row of score values).
+ * Parse one row of score values, delimited by whitespace or commas. The line
+ * may optionally include the symbol from which the scores are defined. Values
+ * may be present for all columns, or only up to the diagonal (in which case
+ * upper diagonal values are set symmetrically).
+ * <p>
+ * The first row (row 0) decides, for itself and all following rows, whether
+ * a guide symbol column is present and whether only lower diagonal values
+ * are supplied; both decisions are stored in fields of this object.
*
* @param data
+ * the line to be parsed
+ * @param lineNo
+ * the line number of the data, reported in error messages
+ * @param scores
+ * the score matrix to add data to
+ * @param row
+ * the row number / alphabet index position
* @param alphabet
+ * the scored symbols, in row (and column) order
* @return
+ * @throws FileFormatException
+ * if invalid, or too few, or too many values
*/
- private boolean isHeaderLine(String data, String alphabet)
+ protected void parseValues(String data, int lineNo, float[][] scores,
+ int row, char[] alphabet) throws FileFormatException
{
+ String err;
+ int size = alphabet.length;
StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS);
- int i = 0;
- while (scoreLine.hasMoreElements())
+
+ int tokenCount = scoreLine.countTokens();
+
+ /*
+ * inspect first row to see if it includes the symbol in the first column,
+ * and to see if it is lower diagonal values only (i.e. just one score)
+ */
+ if (row == 0)
+ {
+ /*
+ * assign (rather than only ever set to true) both flags, so that
+ * state left over from parsing an earlier matrix with this object
+ * cannot leak into this one
+ */
+ hasGuideColumn = data.startsWith(String.valueOf(alphabet[0]));
+ isLowerDiagonalOnly = tokenCount == (hasGuideColumn ? 2 : 1);
+ }
+
+ if (hasGuideColumn)
{
/*
- * skip over characters in the alphabet that are
- * also a delimiter (e.g. space)
+ * check 'guide' symbol is the row'th letter of the alphabet
*/
- char symbol = alphabet.charAt(i++);
- if (!DELIMITERS.contains(String.valueOf(symbol)))
+ String symbol = scoreLine.nextToken();
+ if (symbol.length() > 1 || symbol.charAt(0) != alphabet[row])
{
- if (!String.valueOf(symbol).equals(scoreLine.nextToken()))
+ err = String
+ .format("Error parsing score matrix at line %d, expected '%s' but found '%s'",
+ lineNo, alphabet[row], symbol);
+ throw new FileFormatException(err);
+ }
+ tokenCount = scoreLine.countTokens(); // excluding guide symbol
+ }
+
+ /*
+ * check the right number of values (lower diagonal or full format)
+ */
+ if (isLowerDiagonalOnly && tokenCount != row + 1)
+ {
+ err = String.format(
+ "Expected %d scores at line %d: '%s' but found %d", row + 1,
+ lineNo, data, tokenCount);
+ throw new FileFormatException(err);
+ }
+
+ if (!isLowerDiagonalOnly && tokenCount != size)
+ {
+ err = String.format(
+ "Expected %d scores at line %d: '%s' but found %d", size,
+ lineNo, data, tokenCount);
+ throw new FileFormatException(err);
+ }
+
+ /*
+ * parse and set the values, setting the symmetrical value
+ * as well if lower diagonal format data
+ */
+ scores[row] = new float[size];
+ int col = 0;
+ String value = null;
+ while (scoreLine.hasMoreTokens())
+ {
+ try
+ {
+ value = scoreLine.nextToken();
+ scores[row][col] = Float.valueOf(value);
+ if (isLowerDiagonalOnly)
{
- return false;
+ scores[col][row] = scores[row][col];
}
+ col++;
+ } catch (NumberFormatException e)
+ {
+ err = String.format(
+ "Invalid score value '%s' at line %d column %d", value,
+ lineNo, col);
+ throw new FileFormatException(err);
+ }
+ }
+ }
+
+ /**
+ * Parse the line in an aaindex file that looks like
+ *
+ * <pre>
+ * M rows = ARNDCQEGHILKMFPSTWYV, cols = ARNDCQEGHILKMFPSTWYV
+ * </pre>
+ *
+ * rejecting it if rows and cols do not match, or if the line does not have
+ * two comma-separated fields each containing '='. Returns the characters of
+ * the row/cols alphabet as an array.
+ *
+ * @param lineNo
+ * the line number of the data, reported in error messages
+ * @param data
+ * the line to be parsed
+ * @return the symbols of the rows (equivalently cols) alphabet, in order
+ * @throws FileFormatException
+ * if the line is malformed or rows and cols differ
+ */
+ protected char[] parseAAindexRowsColumns(int lineNo, String data)
+ throws FileFormatException
+ {
+ String err = "Unexpected aaIndex score matrix data at line " + lineNo
+ + ": " + data;
+
+ /*
+ * validate the shape of the line explicitly, instead of catching
+ * Throwable (which would also swallow Errors, and would report only an
+ * array index as the reason for a malformed line)
+ */
+ String[] toks = data.split(",");
+ if (toks.length < 2)
+ {
+ throw new FileFormatException(err
+ + " expected 'rows = <symbols>, cols = <symbols>'");
+ }
+ String[] rowsField = toks[0].split("=");
+ String[] colsField = toks[1].split("=");
+ if (rowsField.length < 2 || colsField.length < 2)
+ {
+ throw new FileFormatException(err + " missing '=' or symbol list");
+ }
+ String rowsAlphabet = rowsField[1].trim();
+ String colsAlphabet = colsField[1].trim();
+ if (!rowsAlphabet.equals(colsAlphabet))
+ {
+ throw new FileFormatException(err + " rows != cols");
}
+ return rowsAlphabet.toCharArray();
+ }
+
+ /**
+ * Tests whether a line of an AAindex format file can be ignored by the
+ * parser: an empty line, a line starting with the comment character, or a
+ * line starting with one of the prefixes "*", "R ", "A ", "T ", "J " or
+ * "//".
+ *
+ * @param data
+ * the line to test
+ * @return true if the line should be skipped, else false
+ */
+ protected boolean skipAAindexLine(String data)
+ {
+ boolean commentOrEmpty = data.startsWith(COMMENT_CHAR)
+ || data.length() == 0;
+ if (commentOrEmpty)
+ {
+ return true;
}
- return true;
+
+ /*
+ * record prefixes whose content is not needed to build the matrix
+ */
+ String[] ignoredPrefixes = { "*", "R ", "A ", "T ", "J ", "//" };
+ for (String prefix : ignoredPrefixes)
+ {
+ if (data.startsWith(prefix))
+ {
+ return true;
+ }
+ }
+ return false;
}
public String getMatrixName()