2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis.scoremodels;
23 import jalview.api.analysis.ScoreModelI;
25 import java.io.BufferedReader;
26 import java.io.IOException;
27 import java.io.InputStream;
28 import java.io.InputStreamReader;
29 import java.util.Arrays;
30 import java.util.StringTokenizer;
32 public class ScoreMatrix extends PairwiseSeqScoreModel implements
35 public static final short UNMAPPED = (short) -1;
37 private static final String DELIMITERS = " ,\t";
39 private static final String COMMENT_CHAR = "#";
41 private static final String BAD_ASCII_ERROR = "Unexpected character %s in getPairwiseScore";
43 private static final int MAX_ASCII = 127;
46 * the name of the model as shown in menus
51 * the characters that the model provides scores for
53 private char[] symbols;
56 * the score matrix; both dimensions must equal the number of symbols
57 * matrix[i][j] is the substitution score for replacing symbols[i] with symbols[j]
59 private float[][] matrix;
62 * quick lookup to convert from an ascii character value to the index
63 * of the corresponding symbol in the score matrix
65 private short[] symbolIndex;
68 * true for a protein score matrix, false for a DNA score matrix
70 private boolean peptide;
76 * Unique, human readable name for the matrix
78 * the symbols to which scores apply
80 * Pairwise scores indexed according to the symbol alphabet
82 public ScoreMatrix(String name, char[] alphabet, float[][] matrix)
86 this.symbols = alphabet;
88 symbolIndex = buildSymbolIndex(alphabet);
91 * crude heuristic for now...
93 peptide = alphabet.length >= 20;
97 * Returns an array A where A[i] is the position in the alphabet array of the
98 * character whose value is i. For example if the alphabet is { 'A', 'D', 'X'
99 * } then A['D'] = A[68] = 1.
101 * Unmapped characters (not in the alphabet) get an index of -1.
103 * Mappings are added automatically for lower case symbols (for non case
104 * sensitive scoring), unless they are explicitly present in the alphabet (are
105 * scored separately in the score matrix).
110 static short[] buildSymbolIndex(char[] alphabet)
112 short[] index = new short[MAX_ASCII + 1];
113 Arrays.fill(index, UNMAPPED);
115 for (char c : alphabet)
123 * also map lower-case character (unless separately mapped)
125 if (c >= 'A' && c <= 'Z')
127 short lowerCase = (short) (c + ('a' - 'A'));
128 if (index[lowerCase] == UNMAPPED)
130 index[lowerCase] = pos;
139 public String getName()
145 public boolean isDNA()
151 public boolean isProtein()
157 public float[][] getMatrix()
163 * Returns the pairwise score for substituting c with d, or zero if c or d is
164 * an unscored or unexpected character
167 public float getPairwiseScore(char c, char d)
171 System.err.println(String.format(BAD_ASCII_ERROR, c));
176 System.err.println(String.format(BAD_ASCII_ERROR, d));
180 int cIndex = symbolIndex[c];
181 int dIndex = symbolIndex[d];
182 if (cIndex != UNMAPPED && dIndex != UNMAPPED)
184 return matrix[cIndex][dIndex];
190 * pretty print the matrix
193 public String toString()
195 return outputMatrix(false);
199 * Print the score matrix, optionally formatted as html, with the alphabet symbols as column headings and at the start of each row
203 public String outputMatrix(boolean html)
205 StringBuilder sb = new StringBuilder(512);
208 * heading row with alphabet
212 sb.append("<table border=\"1\">");
213 sb.append(html ? "<tr><th></th>" : "");
215 for (char sym : symbols)
219 sb.append("<th> ").append(sym).append(" </th>");
223 sb.append("\t").append(sym);
226 sb.append(html ? "</tr>\n" : "\n");
231 for (char c1 : symbols)
235 sb.append("<tr><td>");
237 sb.append(c1).append(html ? "</td>" : "");
238 for (char c2 : symbols)
240 sb.append(html ? "<td>" : "\t")
241 .append(matrix[symbolIndex[c1]][symbolIndex[c2]])
242 .append(html ? "</td>" : "");
244 sb.append(html ? "</tr>\n" : "\n");
248 sb.append("</table>");
250 return sb.toString();
254 * Parses a score matrix from the given input stream and returns a ScoreMatrix
255 * object. If parsing fails, error messages are written to syserr and null is
256 * returned. It is the caller's responsibility to close the input stream.
260 * ScoreMatrix displayName
261 * # comment lines begin with hash sign
262 * # symbol alphabet should be the next non-comment line
263 * ARNDCQEGHILKMFPSTWYVBZX *
264 * # scores matrix, with space, comma or tab delimited values
265 * # [i, j] = score for substituting symbol[i] with symbol[j]
266 * # first column in each row is optionally the 'substituted' symbol
267 * A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4 -4
274 public static ScoreMatrix parse(InputStream is)
276 ScoreMatrix sm = null;
277 BufferedReader br = new BufferedReader(new InputStreamReader(is));
280 String alphabet = null;
281 float[][] scores = null;
289 while ((data = br.readLine()) != null)
293 if (data.startsWith(COMMENT_CHAR))
297 if (data.toLowerCase().startsWith("scorematrix"))
300 * Parse name from ScoreMatrix <name>
305 .println("Warning: 'ScoreMatrix' repeated in file at line "
308 StringTokenizer nameLine = new StringTokenizer(data, DELIMITERS);
309 if (nameLine.countTokens() != 2)
312 .println("Format error: expected 'ScoreMatrix <name>', found '"
313 + data + "' at line " + lineNo);
316 nameLine.nextToken();
317 name = nameLine.nextToken();
320 else if (name == null)
323 .println("Format error: 'ScoreMatrix <name>' should be the first non-comment line");
328 * next line after ScoreMatrix should be the alphabet of scored symbols
330 if (alphabet == null)
333 size = alphabet.length();
334 scores = new float[size][];
339 * too much information?
341 if (row >= size && data.length() > 0) {
343 .println("Unexpected extra input line in score model file "
349 * subsequent lines should be the symbol scores
350 * optionally with the symbol as the first column for readability
352 StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS);
353 if (scoreLine.countTokens() == size + 1)
356 * check 'guide' symbol is the row'th letter of the alphabet
358 String symbol = scoreLine.nextToken();
359 if (symbol.length() > 1
360 || symbol.charAt(0) != alphabet.charAt(row))
364 .format("Error parsing score matrix at line %d, expected %s but found %s",
365 lineNo, alphabet.charAt(row), symbol));
369 if (scoreLine.countTokens() != size)
371 System.err.println(String.format(
372 "Expected %d scores at line %d but found %d", size,
373 lineNo, scoreLine.countTokens()));
376 scores[row] = new float[size];
379 while (scoreLine.hasMoreTokens())
383 value = scoreLine.nextToken();
384 scores[row][col] = Float.valueOf(value);
386 } catch (NumberFormatException e)
388 System.err.println(String.format(
389 "Invalid score value %s at line %d column %d", value,
396 } catch (IOException e)
398 System.err.println("Error reading score matrix file: "
399 + e.getMessage() + " at line " + lineNo);
403 * out of data - check we found enough
409 .format("Expected %d rows of score data in score matrix but only found %d",
415 * If we get here, then name, alphabet and scores have been parsed successfully
417 sm = new ScoreMatrix(name, alphabet.toCharArray(), scores);