3 import jalview.analysis.scoremodels.ScoreMatrix;
4 import jalview.analysis.scoremodels.ScoreModels;
5 import jalview.datamodel.SequenceI;
7 import java.io.IOException;
8 import java.util.StringTokenizer;
11 * A class that can parse a file containing a substitution matrix and register
12 * it for use in Jalview
14 * Accepts 'NCBI' format (e.g.
15 * https://www.ncbi.nlm.nih.gov/Class/FieldGuide/BLOSUM62.txt), with the
16 * addition of a header line to provide a matrix name, e.g.
19 * ScoreMatrix BLOSUM62
22 * Also accepts 'AAindex' format (as described at
23 * http://www.genome.jp/aaindex/aaindex_help.html) with the minimum data
27 * H accession number (used as score matrix identifier in Jalview)
28 * D description (used for tooltip in Jalview)
30 * and the substitution scores
33 public class ScoreMatrixFile extends AlignFile implements
// Constants and per-instance state used while parsing a score matrix file.
36 // first non-comment line identifier - also checked in IdentifyFile
37 public static final String SCOREMATRIX = "SCOREMATRIX";
// token separators used by every StringTokenizer in this class: space, comma, tab
39 private static final String DELIMITERS = " ,\t";
// lines starting with this prefix are treated as comments by the parsers
41 private static final String COMMENT_CHAR = "#";
// NOTE(review): no assignment to this field is visible in this excerpt -
// presumably set during parsing and returned by getMatrixName(); confirm
43 private String matrixName;
// reset to false at the start of parseMatrix(); updated by parseValues() from
// whether a score row held exactly (row + 1) values
45 boolean lowerDiagonalOnly;
53 public ScoreMatrixFile(FileParse source) throws IOException
59 public String print(SequenceI[] sqs, boolean jvsuffix)
65 * Parses the score matrix file, and if successful registers the matrix so it
66 * will be shown in Jalview menus. This method is not thread-safe (a separate
67 * instance of this class should be used by each thread).
// Parses the input with parseMatrix() and registers the resulting matrix
// with the ScoreModels singleton.
70 public void parse() throws IOException
// any exception thrown by parseMatrix() propagates, so nothing is registered on failure
72 ScoreMatrix sm = parseMatrix();
74 ScoreModels.getInstance().registerScoreModel(sm);
78 * Parses the score matrix file and constructs a ScoreMatrix object. If an
79 * error is found in parsing, it is thrown as FileFormatException. Any
80 * warnings are written to syserr.
// Reads lines with nextLine() until it returns null and builds a ScoreMatrix
// from, in order: a 'ScoreMatrix <name>' header line, a column-heading line
// giving the alphabet, and the score rows (handed to parseValues()).
// If a line starting with "H " is met before any name was read, the rest of
// the input is delegated to parseAAIndexFormat().
// Format problems are reported by throwing FileFormatException.
85 public ScoreMatrix parseMatrix() throws IOException
87 ScoreMatrix sm = null;
90 char[] alphabet = null;
91 float[][] scores = null;
// reset per-parse state so an earlier lower-diagonal detection does not carry over
96 lowerDiagonalOnly = false;
98 while ((data = nextLine()) != null)
// comment lines and empty lines are tested for here
// NOTE(review): the branch body is not visible in this excerpt - presumably skips the line
102 if (data.startsWith(COMMENT_CHAR) || data.length() == 0)
// header keyword is matched case-insensitively
// NOTE(review): toUpperCase() without a Locale uses the default locale; under
// e.g. a Turkish locale 'i' upper-cases to a dotted capital I, so "ScoreMatrix"
// would not match SCOREMATRIX - consider toUpperCase(Locale.ROOT)
106 if (data.toUpperCase().startsWith(SCOREMATRIX))
109 * Parse name from ScoreMatrix <name>
110 * we allow any delimiter after ScoreMatrix then take the rest of the line
115 .println("Warning: 'ScoreMatrix' repeated in file at line "
// header line must have at least two tokens: the keyword and a name
118 StringTokenizer nameLine = new StringTokenizer(data, DELIMITERS);
119 if (nameLine.countTokens() < 2)
121 err = "Format error: expected 'ScoreMatrix <name>', found '"
122 + data + "' at line " + lineNo;
123 throw new FileFormatException(err);
125 nameLine.nextToken(); // 'ScoreMatrix'
126 name = nameLine.nextToken(); // next field
// name becomes the remainder of the line starting at the first occurrence of
// the name token, searching from the second character of the line onwards
// NOTE(review): indexOf() finds the FIRST occurrence after character 0, so a
// name that also occurs inside the keyword text (e.g. "core" or "Matrix")
// would produce a result that starts inside the keyword - verify / consider
// locating the name by skipping the keyword length instead
127 name = data.substring(1).substring(data.substring(1).indexOf(name));
// AAindex input: header line starts with "H " and no ScoreMatrix name was read
130 else if (data.startsWith("H ") && name == null)
135 return parseAAIndexFormat(lineNo, data);
// any other content before a name has been read is a format error
137 else if (name == null)
139 err = "Format error: 'ScoreMatrix <name>' should be the first non-comment line";
140 throw new FileFormatException(err);
144 * next non-comment line after ScoreMatrix should be the
145 * column header line with the alphabet of scored symbols
147 if (alphabet == null)
149 StringTokenizer columnHeadings = new StringTokenizer(data,
// matrix dimension is the number of column-heading tokens
151 size = columnHeadings.countTokens();
152 alphabet = new char[size];
154 while (columnHeadings.hasMoreTokens())
// only the first character of each heading token is kept
156 alphabet[col++] = columnHeadings.nextToken().charAt(0);
// rows are left unallocated here; parseValues() allocates scores[row] itself
158 scores = new float[size][];
163 * too much information
167 err = "Unexpected extra input line in score model file: '" + data
169 throw new FileFormatException(err);
// parse one row of scores into scores[row]
172 parseValues(data, lineNo, scores, row, alphabet);
177 * out of data - check we found enough
182 .format("Expected %d rows of score data in score matrix but only found %d",
184 throw new FileFormatException(err);
188 * If we get here, then name, alphabet and scores have been parsed successfully
190 sm = new ScoreMatrix(name, alphabet, scores);
197 * Parse input as AAIndex format, starting from the header line with the
203 * @throws IOException
// Parses the remainder of an AAindex-style input, given its "H " header line
// in 'data'. Reads further lines with nextLine() until it returns null:
// "D " gives the description, "M " gives the alphabet, and other lines that
// are not skipped by skipAAindexLine() are treated as score rows.
205 protected ScoreMatrix parseAAIndexFormat(int lineNo, String data)
// matrix name is the header line content after its two-character "H " prefix
208 String name = data.substring(2).trim();
209 String description = null;
211 float[][] scores = null;
212 char[] alphabet = null;
216 while ((data = nextLine()) != null)
// NOTE(review): branch body not visible in this excerpt - presumably moves on to the next line
220 if (skipAAindexLine(data))
224 if (data.startsWith("D "))
226 description = data.substring(2).trim();
228 else if (data.startsWith("M "))
// parseAAindexRowsColumns() rejects the line unless rows and cols alphabets match
230 alphabet = parseAAindexRowsColumns(lineNo, data);
231 size = alphabet.length;
// square matrix allocated up front, one row and one column per alphabet symbol
232 scores = new float[size][size];
// a score row met before any "M " line: there is no alphabet to parse it against
234 else if (scores == null)
236 throw new FileFormatException(
237 "No alphabet specified in matrix file");
239 else if (row >= size)
241 throw new FileFormatException("Too many data rows in matrix file");
245 parseValues(data, lineNo, scores, row, alphabet);
255 System.err.println("Warning: unexpected extra data in matrix file: "
// NOTE(review): if no "M " line was ever read, alphabet and scores are still
// null here unless a check not visible in this excerpt intervenes - confirm
259 ScoreMatrix sm = new ScoreMatrix(name, alphabet, scores);
260 sm.setDescription(description);
267 * Parse one row of score values, delimited by whitespace or commas. The line
268 * may optionally include the symbol from which the scores are defined. Values
269 * may be present for all columns, or only up to the diagonal (in which case
270 * upper diagonal values are set symmetrically).
273 * the line to be parsed
276 * the score matrix to add data to
278 * the row number / alphabet index position
282 * if invalid, or too few, or too many values
// Parses one line of delimited score values into scores[row].
// Accepts an optional leading 'guide' symbol, and either a full row of
// alphabet.length values or a lower-diagonal row of (row + 1) values, in
// which case each value is also mirrored into scores[col][row].
// Throws FileFormatException on a wrong guide symbol, an unexpected number
// of values, or a value that is not a valid float.
284 protected void parseValues(String data, int lineNo, float[][] scores,
285 int row, char[] alphabet) throws FileFormatException
288 int size = alphabet.length;
289 StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS);
291 int tokenCount = scoreLine.countTokens();
// one token more than the alphabet size: first token is taken as the guide symbol
292 if (tokenCount == size + 1)
295 * check 'guide' symbol is the row'th letter of the alphabet
297 String symbol = scoreLine.nextToken();
298 if (symbol.length() > 1 || symbol.charAt(0) != alphabet[row])
301 .format("Error parsing score matrix at line %d, expected '%s' but found '%s'",
302 lineNo, alphabet[row], symbol);
303 throw new FileFormatException(err);
// countTokens() reports the tokens still remaining, so this excludes the guide symbol
307 tokenCount = scoreLine.countTokens();
310 * AAIndex format only has the lower diagonal i.e.
311 * 1 score in row 0, 2 in row 1, etc
312 * check this in all but the last row (which is the same either way)
316 boolean lowerDiagonal = tokenCount == row + 1;
// lower-diagonal form must be consistent once it has been detected
317 if (lowerDiagonalOnly && !lowerDiagonal)
320 * had detected lower diagonal form but now it isn't - error
322 err = String.format("Unexpected number of tokens at line %d",
324 throw new FileFormatException(err);
326 lowerDiagonalOnly = lowerDiagonal;
// full-matrix form requires exactly one value per alphabet symbol
329 if (!lowerDiagonalOnly && tokenCount != size)
331 err = String.format("Expected %d scores at line %d but found %d",
332 size, lineNo, scoreLine.countTokens());
333 throw new FileFormatException(err);
// (re)allocates this row, replacing whatever array was previously at scores[row]
335 scores[row] = new float[size];
338 while (scoreLine.hasMoreTokens())
342 value = scoreLine.nextToken();
// NOTE(review): Float.valueOf() boxes and is then unboxed; Float.parseFloat()
// would give the same value and the same NumberFormatException without boxing
343 scores[row][col] = Float.valueOf(value);
344 if (lowerDiagonalOnly)
// mirror the value across the diagonal
346 scores[col][row] = scores[row][col];
349 } catch (NumberFormatException e)
352 "Invalid score value '%s' at line %d column %d", value,
354 throw new FileFormatException(err);
360 * Parse the line in an aaindex file that looks like
363 * M rows = ARNDCQEGHILKMFPSTWYV, cols = ARNDCQEGHILKMFPSTWYV
366 * rejecting it if rows and cols do not match. Returns the string of
367 * characters in the row/cols alphabet.
372 * @throws FileFormatException
// Extracts the alphabet from an AAindex "M rows = ..., cols = ..." line.
// Splits the line on ',' and takes the trimmed text after '=' in each of the
// first two parts; returns the rows alphabet as a char array if it equals the
// cols alphabet, otherwise fails with FileFormatException.
374 protected char[] parseAAindexRowsColumns(int lineNo, String data)
375 throws FileFormatException
377 String err = "Unexpected aaIndex score matrix data at line " + lineNo
382 String[] toks = data.split(",");
// a line without ',' or without '=' makes these index accesses throw; that is
// picked up by the catch clause below
383 String rowsAlphabet = toks[0].split("=")[1].trim();
384 String colsAlphabet = toks[1].split("=")[1].trim();
385 if (!rowsAlphabet.equals(colsAlphabet))
387 throw new FileFormatException("rows != cols");
389 return rowsAlphabet.toCharArray();
// NOTE(review): catching Throwable also traps Errors, and only getMessage() is
// kept so the original cause and stack trace are lost; it appears the
// "rows != cols" exception thrown above is re-wrapped here too (the try line
// is not visible in this excerpt) - consider catching RuntimeException only
390 } catch (Throwable t)
392 throw new FileFormatException(err + " " + t.getMessage());
397 * Answers true if line is one we are not interested in from AAindex format
// Tests whether an AAindex line is one the parser does not use: a comment
// line, an empty line, or a line starting with "*", "R ", "A ", "T ", "J "
// or "//".
// NOTE(review): the return statements are not visible in this excerpt -
// presumably true for these cases and false otherwise; confirm
403 protected boolean skipAAindexLine(String data)
405 if (data.startsWith(COMMENT_CHAR) || data.length() == 0)
409 if (data.startsWith("*") || data.startsWith("R ")
410 || data.startsWith("A ") || data.startsWith("T ")
411 || data.startsWith("J ") || data.startsWith("//"))
419 * Answers true if the data line consists of the alphabet characters,
420 * delimited (so as to provide a heading row). Otherwise returns false (e.g. if
421 * the data is a row of score values).
// Compares the delimited tokens of 'data' against the characters of
// 'alphabet', one alphabet character per loop iteration.
// NOTE(review): no call to this private method is visible in this excerpt -
// check whether it is still used
427 private boolean isHeaderLine(String data, String alphabet)
429 StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS);
431 while (scoreLine.hasMoreElements())
434 * skip over characters in the alphabet that are
435 * also a delimiter (e.g. space)
// NOTE(review): no bounds check on i is visible here; if 'data' has more tokens
// than 'alphabet' has non-delimiter characters, charAt() would throw - confirm
437 char symbol = alphabet.charAt(i++);
// a token is only consumed when the alphabet character is not itself a delimiter
438 if (!DELIMITERS.contains(String.valueOf(symbol)))
440 if (!String.valueOf(symbol).equals(scoreLine.nextToken()))
449 public String getMatrixName()