3 import jalview.analysis.scoremodels.ScoreMatrix;
4 import jalview.analysis.scoremodels.ScoreModels;
5 import jalview.datamodel.SequenceI;
7 import java.io.IOException;
8 import java.util.StringTokenizer;
11 * A class that can parse a file containing a substitution matrix and register
12 * it for use in Jalview
14 * Accepts 'NCBI' format (e.g.
15 * https://www.ncbi.nlm.nih.gov/Class/FieldGuide/BLOSUM62.txt), with the
16 * addition of a header line to provide a matrix name, e.g.
19 * ScoreMatrix BLOSUM62
22 * Also accepts 'AAindex' format (as described at
23 * http://www.genome.jp/aaindex/aaindex_help.html) with the minimum data
27 * H accession number (used as score matrix identifier in Jalview)
28 * D description (used for tooltip in Jalview)
30 * and the substitution scores
33 public class ScoreMatrixFile extends AlignFile implements
// Keyword that opens the name line; parseMatrix upper-cases each input line
// before testing for it, so the match is case-insensitive.
36 // first non-comment line identifier - also checked in IdentifyFile
37 public static final String SCOREMATRIX = "SCOREMATRIX";
// Separator characters passed to StringTokenizer: space, comma and tab.
39 private static final String DELIMITERS = " ,\t";
// Prefix tested with startsWith() when looking for comment lines.
41 private static final String COMMENT_CHAR = "#";
// NOTE(review): presumably what getMatrixName() returns; no assignment to
// this field is visible in this extract - confirm against the full file.
43 private String matrixName;
46 * aaindex format has scores for diagonal and below only
// Switched on in parseValues; cleared again at the start of parseMatrix.
48 boolean isLowerDiagonalOnly;
51 * ncbi format has symbols as first column on score rows
// Switched on in parseValues when a data line starts with the first
// alphabet symbol.
53 boolean hasGuideColumn;
56 * Constructor given a file reader. The file is parsed immediately and if
57 * successful, the score model registered with the ScoreModels singleton.
// NOTE(review): the constructor body is not visible in this extract, so how
// (or whether) it triggers parsing cannot be confirmed from here.
62 public ScoreMatrixFile(FileParse source) throws IOException
68 * Constructor given a file reader. The data is optionally parsed immediately.
70 * @param parseImmediately
// Hands both arguments unchanged to the AlignFile superclass constructor;
// no other work is visible here.
74 public ScoreMatrixFile(boolean parseImmediately, FileParse source)
77 super(parseImmediately, source);
// NOTE(review): the method body is not visible in this extract, so the
// returned value is not documented here - confirm against the full file.
81 public String print(SequenceI[] sqs, boolean jvsuffix)
87 * Parses the score matrix file, and if successful registers the matrix so it
88 * will be shown in Jalview menus. This method is not thread-safe (a separate
89 * instance of this class should be used by each thread).
92 public void parse() throws IOException
94 ScoreMatrix sm = parseMatrix();
96 ScoreModels.getInstance().registerScoreModel(sm);
100 * Parses the score matrix file and constructs a ScoreMatrix object. If an
101 * error is found in parsing, it is thrown as FileFormatException. Any
102 * warnings are written to syserr.
105 * @throws IOException
// Reads lines with nextLine() until input runs out and builds a ScoreMatrix
// from the name, the alphabet and the score rows. An "H " line met before any
// name has been read passes control to parseAAIndexFormat instead.
107 public ScoreMatrix parseMatrix() throws IOException
109 ScoreMatrix sm = null;
112 char[] alphabet = null;
113 float[][] scores = null;
// clear the lower-diagonal flag before any data row is inspected
118 isLowerDiagonalOnly = false;
120 while ((data = nextLine()) != null)
// test for comment-prefixed or empty lines
124 if (data.startsWith(COMMENT_CHAR) || data.length() == 0)
// the keyword is compared against an upper-cased copy of the line
128 if (data.toUpperCase().startsWith(SCOREMATRIX))
131 * Parse name from ScoreMatrix <name>
132 * we allow any delimiter after ScoreMatrix then take the rest of the line
136 throw new FileFormatException(
137 "Error: 'ScoreMatrix' repeated in file at line "
140 StringTokenizer nameLine = new StringTokenizer(data, DELIMITERS);
141 if (nameLine.countTokens() < 2)
143 err = "Format error: expected 'ScoreMatrix <name>', found '"
144 + data + "' at line " + lineNo;
145 throw new FileFormatException(err);
147 nameLine.nextToken(); // 'ScoreMatrix'
148 name = nameLine.nextToken(); // next field
// NOTE(review): indexOf() finds the FIRST occurrence of the name token after
// the line's first character, and that can fall inside the keyword itself
// (a name token of "M" matches within "ScoreMatrix", yielding "Matrix M").
// Consider starting the search after the keyword - verify with a test.
149 name = data.substring(1).substring(data.substring(1).indexOf(name));
// an "H " header is only honoured while no name has been read yet
152 else if (data.startsWith("H ") && name == null)
157 return parseAAIndexFormat(lineNo, data);
159 else if (name == null)
161 err = "Format error: 'ScoreMatrix <name>' should be the first non-comment line";
162 throw new FileFormatException(err);
166 * next non-comment line after ScoreMatrix should be the
167 * column header line with the alphabet of scored symbols
169 if (alphabet == null)
171 StringTokenizer columnHeadings = new StringTokenizer(data,
173 size = columnHeadings.countTokens();
174 alphabet = new char[size];
176 while (columnHeadings.hasMoreTokens())
// only the first character of each heading token is stored as the symbol
178 alphabet[col++] = columnHeadings.nextToken().charAt(0);
// row arrays are not allocated here; parseValues assigns scores[row] itself
180 scores = new float[size][];
185 * too much information
189 err = "Unexpected extra input line in score model file: '" + data
191 throw new FileFormatException(err);
194 parseValues(data, lineNo, scores, row, alphabet);
199 * out of data - check we found enough
204 .format("Expected %d rows of score data in score matrix but only found %d",
206 throw new FileFormatException(err);
210 * If we get here, then name, alphabet and scores have been parsed successfully
212 sm = new ScoreMatrix(name, alphabet, scores);
219 * Parse input as AAIndex format, starting from the header line with the
225 * @throws IOException
// Carries on reading with nextLine() once an "H " header line has been seen,
// picking up the description ("D "), the alphabet ("M ") and the score rows,
// then builds the ScoreMatrix.
227 protected ScoreMatrix parseAAIndexFormat(int lineNo, String data)
// the name is the text after the two-character "H " prefix, trimmed
230 String name = data.substring(2).trim();
231 String description = null;
233 float[][] scores = null;
234 char[] alphabet = null;
238 while ((data = nextLine()) != null)
242 if (skipAAindexLine(data))
246 if (data.startsWith("D "))
248 description = data.substring(2).trim();
250 else if (data.startsWith("M "))
252 alphabet = parseAAindexRowsColumns(lineNo, data);
253 size = alphabet.length;
// a full size x size array is allocated as soon as the alphabet is known
254 scores = new float[size][size];
// any other line that arrives before the "M " line is rejected
256 else if (scores == null)
258 throw new FileFormatException(
259 "No alphabet specified in matrix file");
261 else if (row >= size)
263 throw new FileFormatException("Too many data rows in matrix file");
267 parseValues(data, lineNo, scores, row, alphabet);
// NOTE(review): if input ends before any "M " line, alphabet and scores are
// still null here, and no check for too FEW rows is visible in this extract -
// confirm ScoreMatrix tolerates this or that a guard exists in the full file.
272 ScoreMatrix sm = new ScoreMatrix(name, alphabet, scores);
// description is still null if no "D " line was read
273 sm.setDescription(description);
280 * Parse one row of score values, delimited by whitespace or commas. The line
281 * may optionally include the symbol from which the scores are defined. Values
282 * may be present for all columns, or only up to the diagonal (in which case
283 * upper diagonal values are set symmetrically).
286 * the line to be parsed
289 * the score matrix to add data to
291 * the row number / alphabet index position
295 * if invalid, or too few, or too many values
// Splits one score line on DELIMITERS, checks the number of values against
// the expected layout, and stores the parsed floats in scores[row].
297 protected void parseValues(String data, int lineNo, float[][] scores,
298 int row, char[] alphabet) throws FileFormatException
301 int size = alphabet.length;
302 StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS);
304 int tokenCount = scoreLine.countTokens();
307 * inspect first row to see if it includes the symbol in the first column,
308 * and to see if it is lower diagonal values only (i.e. just one score)
// NOTE(review): this is a plain text-prefix test on the raw line; it assumes
// a line without a guide column never starts with the same character as
// alphabet[0] - confirm for the alphabets in use.
312 if (data.startsWith(String.valueOf(alphabet[0])))
314 hasGuideColumn = true;
// a single score (plus the guide symbol, when present) marks lower-diagonal layout
316 if (tokenCount == (hasGuideColumn ? 2 : 1))
318 isLowerDiagonalOnly = true;
325 * check 'guide' symbol is the row'th letter of the alphabet
// nextToken() consumes the guide symbol, leaving only scores in the tokenizer
327 String symbol = scoreLine.nextToken();
328 if (symbol.length() > 1 || symbol.charAt(0) != alphabet[row])
331 .format("Error parsing score matrix at line %d, expected '%s' but found '%s'",
332 lineNo, alphabet[row], symbol);
333 throw new FileFormatException(err);
// countTokens() now counts only the tokens still unread
335 tokenCount = scoreLine.countTokens(); // excluding guide symbol
339 * check the right number of values (lower diagonal or full format)
341 if (isLowerDiagonalOnly && tokenCount != row + 1)
344 "Expected %d scores at line %d: '%s' but found %d", row + 1,
345 lineNo, data, tokenCount);
346 throw new FileFormatException(err);
// NOTE(review): the message below reports scoreLine.countTokens() where the
// parallel message above reports tokenCount; presumably the same value at
// this point - could be unified for consistency.
349 if (!isLowerDiagonalOnly && tokenCount != size)
352 "Expected %d scores at line %d: '%s' but found %d", size,
353 lineNo, data, scoreLine.countTokens());
354 throw new FileFormatException(err);
358 * parse and set the values, setting the symmetrical value
359 * as well if lower diagonal format data
// the row is given a fresh full-width array, replacing any existing one
361 scores[row] = new float[size];
364 while (scoreLine.hasMoreTokens())
368 value = scoreLine.nextToken();
// Float.valueOf returns a boxed Float that is unboxed on assignment; text
// that is not a number raises the NumberFormatException caught below
369 scores[row][col] = Float.valueOf(value);
370 if (isLowerDiagonalOnly)
// copy the value to the mirrored position across the diagonal
372 scores[col][row] = scores[row][col];
375 } catch (NumberFormatException e)
378 "Invalid score value '%s' at line %d column %d", value,
// only the formatted message is passed on; the caught exception is not attached
380 throw new FileFormatException(err);
386 * Parse the line in an aaindex file that looks like
389 * M rows = ARNDCQEGHILKMFPSTWYV, cols = ARNDCQEGHILKMFPSTWYV
392 * rejecting it if rows and cols do not match. Returns the string of
393 * characters in the row/cols alphabet.
398 * @throws FileFormatException
// Pulls the alphabet out of an "M rows = ..., cols = ..." line and returns it
// as a char array, insisting that the rows and cols strings are equal.
400 protected char[] parseAAindexRowsColumns(int lineNo, String data)
401 throws FileFormatException
403 String err = "Unexpected aaIndex score matrix data at line " + lineNo
// split on the comma, then keep the trimmed text after '=' in each half;
// a line lacking the comma or an '=' makes the [1] index fail
408 String[] toks = data.split(",");
409 String rowsAlphabet = toks[0].split("=")[1].trim();
410 String colsAlphabet = toks[1].split("=")[1].trim();
411 if (!rowsAlphabet.equals(colsAlphabet))
413 throw new FileFormatException("rows != cols");
415 return rowsAlphabet.toCharArray();
// NOTE(review): catching Throwable appears to trap the "rows != cols"
// exception thrown just above as well (re-wrapping it with err), and would
// also intercept Errors; only getMessage() is kept, not the cause. Consider
// narrowing the catch - verify against the full file.
416 } catch (Throwable t)
418 throw new FileFormatException(err + " " + t.getMessage());
423 * Answers true if line is one we are not interested in from AAindex format
429 protected boolean skipAAindexLine(String data)
431 if (data.startsWith(COMMENT_CHAR) || data.length() == 0)
435 if (data.startsWith("*") || data.startsWith("R ")
436 || data.startsWith("A ") || data.startsWith("T ")
437 || data.startsWith("J ") || data.startsWith("//"))
444 public String getMatrixName()