package jalview.analysis.scoremodels;
import jalview.api.analysis.ScoreModelI;
-import jalview.schemes.ResidueProperties;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.Arrays;
+import java.util.StringTokenizer;
public class ScoreMatrix extends PairwiseSeqScoreModel implements
ScoreModelI
{
- String name;
+ public static final short UNMAPPED = (short) -1;
- @Override
- public String getName()
- {
- return name;
- }
+ private static final String DELIMITERS = " ,\t";
- /**
- * reference to integer score matrix
+ private static final String COMMENT_CHAR = "#";
+
+ private static final String BAD_ASCII_ERROR = "Unexpected character %s in getPairwiseScore";
+
+ private static final int MAX_ASCII = 127;
+
+ /*
+ * the name of the model as shown in menus
*/
- int[][] matrix;
+ private String name;
- /**
- * 0 for Protein Score matrix. 1 for dna score matrix
+ /*
+ * the characters that the model provides scores for
+ */
+ private char[] symbols;
+
+ /*
+ * the score matrix; both dimensions must equal the number of symbols
+ * matrix[i][j] is the substitution score for replacing symbols[i] with symbols[j]
+ */
+ private float[][] matrix;
+
+ /*
+ * quick lookup to convert from an ascii character value to the index
+ * of the corresponding symbol in the score matrix; characters that are
+ * not in the alphabet map to UNMAPPED
*/
- int type;
+ private short[] symbolIndex;
+
+ /*
+ * true for Protein Score matrix, false for dna score matrix
+ */
+ private boolean peptide;
/**
+ * Constructor. The alphabet and matrix arrays are stored by reference (they
+ * are not copied), a character-to-position lookup is built from the
+ * alphabet, and the model is classed as peptide if the alphabet holds 20 or
+ * more symbols (otherwise as nucleotide).
*
* @param name
* Unique, human readable name for the matrix
+ * @param alphabet
+ * the symbols to which scores apply
* @param matrix
- * Pairwise scores indexed according to appropriate symbol alphabet
- * @param type
- * 0 for Protein, 1 for NA
+ * Pairwise scores indexed according to the symbol alphabet
*/
- public ScoreMatrix(String name, int[][] matrix, int type)
+ public ScoreMatrix(String name, char[] alphabet, float[][] matrix)
{
this.matrix = matrix;
- this.type = type;
this.name = name;
+ this.symbols = alphabet;
+
+ symbolIndex = buildSymbolIndex(alphabet);
+
+ /*
+ * crude heuristic for now: an alphabet of 20 or more symbols is taken
+ * to be peptide, anything shorter to be nucleotide
+ */
+ peptide = alphabet.length >= 20;
+ }
+
+ /**
+ * Builds a lookup table, indexed by character value, that gives the position
+ * of each character within the supplied alphabet. For the alphabet { 'A',
+ * 'D', 'X' } the entry at index 'D' (68) is 1.
+ * <p>
+ * Entries for characters that do not occur in the alphabet hold UNMAPPED
+ * (-1). Alphabet characters with a value above MAX_ASCII get no entry, but
+ * still occupy a position in the alphabet.
+ * <p>
+ * For every upper case letter A-Z in the alphabet, the matching lower case
+ * letter is given the same position if it has no entry yet (case insensitive
+ * scoring). A lower case letter that occurs in the alphabet itself always
+ * ends up with its own position.
+ *
+ * @param alphabet
+ * the scored symbols, in score matrix order
+ * @return an array of length MAX_ASCII + 1 holding alphabet positions, or
+ * UNMAPPED for characters with no position
+ */
+ static short[] buildSymbolIndex(char[] alphabet)
+ {
+ final int caseOffset = 'a' - 'A';
+ short[] lookup = new short[MAX_ASCII + 1];
+ Arrays.fill(lookup, UNMAPPED);
+
+ for (int i = 0; i < alphabet.length; i++)
+ {
+ char symbol = alphabet[i];
+ short position = (short) i;
+ if (symbol <= MAX_ASCII)
+ {
+ lookup[symbol] = position;
+ }
+
+ /*
+ * an upper case letter also claims its lower case form, provided
+ * nothing has been mapped there so far
+ */
+ boolean upperCase = symbol >= 'A' && symbol <= 'Z';
+ if (upperCase && lookup[symbol + caseOffset] == UNMAPPED)
+ {
+ lookup[symbol + caseOffset] = position;
+ }
+ }
+ return lookup;
+ }
+
+ @Override
+ public String getName()
+ {
+ return name; // the name of the model as shown in menus
}
@Override
public boolean isDNA()
{
- return type == 1;
+ return !peptide; // every model not flagged as peptide is reported as DNA
}
@Override
public boolean isProtein()
{
- return type == 0;
+ return peptide; // set in the constructor when the alphabet has 20 or more symbols
}
@Override
- public int[][] getMatrix()
+ public float[][] getMatrix() // hands back the internal array itself, not a copy
{
return matrix;
}
/**
- *
- * @param A1
- * @param A2
- * @return score for substituting first char in A1 with first char in A2
+ * Returns the pairwise score for substituting c with d, or zero if c or d is
+ * an unscored or unexpected character. A character whose value is above
+ * MAX_ASCII is reported on System.err before zero is returned; a character
+ * that simply has no entry in the symbol lookup yields zero silently.
+ *
+ * @param c
+ * the symbol being replaced
+ * @param d
+ * the symbol replacing it
+ * @return the matrix entry for the positions of c and d, or 0 if either
+ * character has no position
*/
- public int getPairwiseScore(String A1, String A2)
- {
- return getPairwiseScore(A1.charAt(0), A2.charAt(0));
- }
-
@Override
- public int getPairwiseScore(char c, char d)
+ public float getPairwiseScore(char c, char d)
{
- int pog = 0;
-
- try
+ if (c > MAX_ASCII) // symbolIndex only has entries for values 0..MAX_ASCII
{
- int a = (type == 0) ? ResidueProperties.aaIndex[c]
- : ResidueProperties.nucleotideIndex[c];
- int b = (type == 0) ? ResidueProperties.aaIndex[d]
- : ResidueProperties.nucleotideIndex[d];
-
- pog = matrix[a][b];
- } catch (Exception e)
+ System.err.println(String.format(BAD_ASCII_ERROR, c));
+ return 0;
+ }
+ if (d > MAX_ASCII) // same guard for the second character
{
- // System.out.println("Unknown residue in " + A1 + " " + A2);
+ System.err.println(String.format(BAD_ASCII_ERROR, d));
+ return 0;
}
- return pog;
+ int cIndex = symbolIndex[c];
+ int dIndex = symbolIndex[d];
+ if (cIndex != UNMAPPED && dIndex != UNMAPPED) // both characters are scored
+ {
+ return matrix[cIndex][dIndex];
+ }
+ return 0;
}
/**
return outputMatrix(false);
}
+ /**
+ * Print the score matrix, optionally formatted as html, with the alphabet
+ * symbols as column headings and at the start of each row.
+ * <p>
+ * Rows and columns are written in alphabet order, reading matrix[i][j] for
+ * the symbols at positions i and j. The ascii symbol lookup is deliberately
+ * not used here, so an alphabet symbol outside its range cannot cause an
+ * array index error.
+ * <p>
+ * In plain text mode cells are separated by tabs and each row ends with a
+ * newline; in html mode the output is a single table element.
+ *
+ * @param html
+ * true to produce an html table, false for tab-delimited text
+ * @return the formatted matrix
+ */
public String outputMatrix(boolean html)
{
- StringBuffer sb = new StringBuffer();
- int[] symbols = (type == 0) ? ResidueProperties.aaIndex
- : ResidueProperties.nucleotideIndex;
- int symMax = (type == 0) ? ResidueProperties.maxProteinIndex
- : ResidueProperties.maxNucleotideIndex;
- boolean header = true;
+ StringBuilder sb = new StringBuilder(512);
+
+ /*
+ * heading row with alphabet
+ */
if (html)
{
sb.append("<table border=\"1\">");
+ sb.append("<tr><th></th>");
+ }
+ for (char sym : symbols)
+ {
+ if (html)
+ {
+ sb.append("<th> ").append(sym).append(" </th>");
+ }
+ else
+ {
+ sb.append("\t").append(sym);
+ }
+ }
+ sb.append(html ? "</tr>\n" : "\n");
+
+ /*
+ * table of scores, one row per alphabet symbol; indexed by alphabet
+ * position rather than via symbolIndex
+ */
+ for (int i = 0; i < symbols.length; i++)
+ {
+ if (html)
+ {
+ sb.append("<tr><td>");
+ }
+ sb.append(symbols[i]).append(html ? "</td>" : "");
+ for (int j = 0; j < symbols.length; j++)
+ {
+ sb.append(html ? "<td>" : "\t")
+ .append(matrix[i][j])
+ .append(html ? "</td>" : "");
+ }
+ sb.append(html ? "</tr>\n" : "\n");
}
- for (char sym = 'A'; sym <= 'Z'; sym++)
+ if (html)
+ {
+ sb.append("</table>");
+ }
+ return sb.toString();
+ }
+
+ /**
+ * Parses a score matrix from the given input stream and returns a
+ * ScoreMatrix object. The expected content is
+ * <ul>
+ * <li>a line 'ScoreMatrix &lt;name&gt;'</li>
+ * <li>a line holding the alphabet of scored symbols</li>
+ * <li>one line of scores per symbol, each with one value per symbol,
+ * delimited by space, comma or tab</li>
+ * </ul>
+ * Blank lines and lines starting with '#' are ignored. If parsing fails
+ * (bad format, a read error, or input with no header or no alphabet), error
+ * messages are written to syserr and null is returned. It is the caller's
+ * responsibility to close the input stream.
+ *
+ * @param is
+ * the stream to read the score matrix from
+ * @return the parsed score matrix, or null if the input could not be parsed
+ */
+ public static ScoreMatrix parse(InputStream is)
+ {
+ BufferedReader br = new BufferedReader(new InputStreamReader(is));
+ int lineNo = 0;
+ String name = null;
+ String alphabet = null;
+ float[][] scores = null;
+ int size = 0;
+ int row = 0;
+
+ try
{
- if (symbols[sym] >= 0 && symbols[sym] < symMax)
+ String data;
+
+ while ((data = br.readLine()) != null)
{
- if (header)
+ lineNo++;
+ data = data.trim();
+ /*
+ * skip blank lines (including a trailing newline after the last
+ * row of scores) as well as comments
+ */
+ if (data.length() == 0 || data.startsWith(COMMENT_CHAR))
+ {
+ continue;
+ }
+ if (data.toLowerCase().startsWith("scorematrix"))
{
- sb.append(html ? "<tr><td></td>" : "");
- for (char sym2 = 'A'; sym2 <= 'Z'; sym2++)
+ /*
+ * Parse name from ScoreMatrix <name>
+ */
+ if (name != null)
+ {
+ System.err
+ .println("Warning: 'ScoreMatrix' repeated in file at line "
+ + lineNo);
+ }
+ StringTokenizer nameLine = new StringTokenizer(data, DELIMITERS);
+ if (nameLine.countTokens() != 2)
{
- if (symbols[sym2] >= 0 && symbols[sym2] < symMax)
- {
- sb.append((html ? "<td> " : "\t") + sym2
- + (html ? " </td>" : ""));
- }
+ System.err
+ .println("Format error: expected 'ScoreMatrix <name>', found '"
+ + data + "' at line " + lineNo);
+ return null;
}
- header = false;
- sb.append(html ? "</tr>\n" : "\n");
+ nameLine.nextToken();
+ name = nameLine.nextToken();
+ continue;
}
- if (html)
+ else if (name == null)
{
- sb.append("<tr>");
+ System.err
+ .println("Format error: 'ScoreMatrix <name>' should be the first non-comment line");
+ return null;
}
- sb.append((html ? "<td>" : "") + sym + (html ? "</td>" : ""));
- for (char sym2 = 'A'; sym2 <= 'Z'; sym2++)
+
+ /*
+ * next line after ScoreMatrix should be the alphabet of scored symbols
+ */
+ if (alphabet == null)
+ {
+ alphabet = data;
+ size = alphabet.length();
+ scores = new float[size][];
+ continue;
+ }
+
+ /*
+ * too much information?
+ */
+ if (row >= size && data.length() > 0) {
+ System.err
+ .println("Unexpected extra input line in score model file "
+ + data);
+ return null;
+ }
+
+ /*
+ * subsequent lines should be the symbol scores
+ */
+ StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS);
+ if (scoreLine.countTokens() != size)
{
- if (symbols[sym2] >= 0 && symbols[sym2] < symMax)
+ System.err.println(String.format(
+ "Expected %d tokens at line %d but found %d", size,
+ lineNo, scoreLine.countTokens()));
+ return null;
+ }
+ scores[row] = new float[size];
+ int col = 0;
+ String value = null;
+ while (scoreLine.hasMoreTokens()) {
+ try {
+ value = scoreLine.nextToken();
+ scores[row][col] = Float.parseFloat(value);
+ col++;
+ } catch (NumberFormatException e)
{
- sb.append((html ? "<td>" : "\t")
- + matrix[symbols[sym]][symbols[sym2]]
- + (html ? "</td>" : ""));
+ System.err.println(String.format(
+ "Invalid score value %s at line %d column %d", value,
+ lineNo, col));
+ return null;
}
}
- sb.append(html ? "</tr>\n" : "\n");
+ row++;
}
+ } catch (IOException e)
+ {
+ System.err.println("Error reading score matrix file: "
+ + e.getMessage() + " at line " + lineNo);
+ /*
+ * a read failure is a parse failure: do not build a matrix from
+ * whatever happened to be read before the error
+ */
+ return null;
}
- if (html)
+
+ /*
+ * out of data - check we found a header and an alphabet (neither is
+ * present if the input was empty or held only comments)
+ */
+ if (name == null || alphabet == null)
+ {
+ System.err
+ .println("Format error: expected 'ScoreMatrix <name>' followed by the alphabet of scored symbols");
+ return null;
+ }
+
+ /*
+ * ...and that we found enough rows of scores
+ */
+ if (row < size)
{
- sb.append("</table>");
+ System.err
+ .println(String
+ .format("Expected %d rows of score data in score matrix but only found %d",
+ size, row));
+ return null;
}
- return sb.toString();
+
+ /*
+ * If we get here, then name, alphabet and scores have been parsed successfully
+ */
+ return new ScoreMatrix(name, alphabet.toCharArray(), scores);
}
}