2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 import java.io.IOException;
24 import java.util.StringTokenizer;
26 import jalview.analysis.scoremodels.ScoreMatrix;
27 import jalview.analysis.scoremodels.ScoreModels;
28 import jalview.datamodel.SequenceI;
/**
 * A class that can parse a file containing a substitution matrix and register
 * it for use in Jalview.
 * <p>
 * Accepts 'NCBI' format (e.g.
 * https://www.ncbi.nlm.nih.gov/Class/FieldGuide/BLOSUM62.txt), with the
 * addition of a header line to provide a matrix name, e.g.
 *
 * <pre>
 * ScoreMatrix BLOSUM62
 * </pre>
 *
 * Also accepts 'AAindex' format (as described at
 * http://www.genome.jp/aaindex/aaindex_help.html) with the minimum data
 * required being
 *
 * <pre>
 * H accession number (used as score matrix identifier in Jalview)
 * D description (used for tooltip in Jalview)
 * M rows = symbols, cols = symbols (the alphabet, identical for both)
 * and the substitution scores
 * </pre>
 */
53 public class ScoreMatrixFile extends AlignFile
54 implements AlignmentFileReaderI
// Keyword that opens the name header line of NCBI-style input; the parser
// matches it case-insensitively at the start of a line.
56 // first non-comment line identifier - also checked in IdentifyFile
57 public static final String SCOREMATRIX = "SCOREMATRIX";
// Separator characters (space, comma, tab) handed to StringTokenizer when
// splitting header, alphabet and score lines.
59 private static final String DELIMITERS = " ,\t";
// Prefix tested with startsWith() on each input line to recognise comments.
61 private static final String COMMENT_CHAR = "#";
// NOTE(review): no assignment to this field is visible in this extract;
// presumably it holds the name of the parsed matrix - confirm.
63 private String matrixName;
66 * aaindex format has scores for diagonal and below only
// Cleared at the start of parseMatrix() and switched on by parseValues()
// when the first score row holds a single score.
68 boolean isLowerDiagonalOnly;
71 * ncbi format has symbols as first column on score rows
// Switched on by parseValues() when the first score row starts with the
// first alphabet symbol; no reset is visible in this extract.
73 boolean hasGuideColumn;
// Creates a score matrix parser for the supplied FileParse source.
// NOTE(review): the constructor body is not visible in this extract;
// presumably it delegates to an AlignFile superclass constructor - confirm
// which one, and whether it triggers parsing immediately.
81 public ScoreMatrixFile(FileParse source) throws IOException
// Output hook taking the sequences to write and a 'jvsuffix' flag.
// NOTE(review): the body is not visible in this extract; nothing else in
// this class produces output, so this presumably returns no useful text -
// confirm against the full file.
87 public String print(SequenceI[] sqs, boolean jvsuffix)
93 * Parses the score matrix file, and if successful registers the matrix so it
94 * will be shown in Jalview menus. This method is not thread-safe (a separate
95 * instance of this class should be used by each thread).
98 public void parse() throws IOException
100 ScoreMatrix sm = parseMatrix();
102 ScoreModels.getInstance().registerScoreModel(sm);
106 * Parses the score matrix file and constructs a ScoreMatrix object. If an
107 * error is found in parsing, it is thrown as FileFormatException. Any
108 * warnings are written to syserr.
111 * @throws IOException
113 public ScoreMatrix parseMatrix() throws IOException
115 ScoreMatrix sm = null;
118 char[] alphabet = null;
119 float[][] scores = null;
124 isLowerDiagonalOnly = false;
126 while ((data = nextLine()) != null)
130 if (data.startsWith(COMMENT_CHAR) || data.length() == 0)
134 // equivalent to data.startsWithIgnoreCase(SCOREMATRIX)
135 if (data.regionMatches(true, 0, SCOREMATRIX, 0, SCOREMATRIX.length()))
138 * Parse name from ScoreMatrix <name>
139 * we allow any delimiter after ScoreMatrix then take the rest of the line
143 throw new FileFormatException(
144 "Error: 'ScoreMatrix' repeated in file at line "
147 StringTokenizer nameLine = new StringTokenizer(data, DELIMITERS);
148 if (nameLine.countTokens() < 2)
150 err = "Format error: expected 'ScoreMatrix <name>', found '"
151 + data + "' at line " + lineNo;
152 throw new FileFormatException(err);
154 nameLine.nextToken(); // 'ScoreMatrix'
155 name = nameLine.nextToken(); // next field
156 name = data.substring(1).substring(data.substring(1).indexOf(name));
159 else if (data.startsWith("H ") && name == null)
164 return parseAAIndexFormat(lineNo, data);
166 else if (name == null)
168 err = "Format error: 'ScoreMatrix <name>' should be the first non-comment line";
169 throw new FileFormatException(err);
173 * next non-comment line after ScoreMatrix should be the
174 * column header line with the alphabet of scored symbols
176 if (alphabet == null)
178 StringTokenizer columnHeadings = new StringTokenizer(data,
180 size = columnHeadings.countTokens();
181 alphabet = new char[size];
183 while (columnHeadings.hasMoreTokens())
185 alphabet[col++] = columnHeadings.nextToken().charAt(0);
187 scores = new float[size][];
192 * too much information
196 err = "Unexpected extra input line in score model file: '" + data
198 throw new FileFormatException(err);
201 parseValues(data, lineNo, scores, row, alphabet);
206 * out of data - check we found enough
211 "Expected %d rows of score data in score matrix but only found %d",
213 throw new FileFormatException(err);
217 * If we get here, then name, alphabet and scores have been parsed successfully
219 sm = new ScoreMatrix(name, alphabet, scores);
226 * Parse input as AAIndex format, starting from the header line with the
232 * @throws IOException
234 protected ScoreMatrix parseAAIndexFormat(int lineNo, String data)
237 String name = data.substring(2).trim();
238 String description = null;
240 float[][] scores = null;
241 char[] alphabet = null;
245 while ((data = nextLine()) != null)
249 if (skipAAindexLine(data))
253 if (data.startsWith("D "))
255 description = data.substring(2).trim();
257 else if (data.startsWith("M "))
259 alphabet = parseAAindexRowsColumns(lineNo, data);
260 size = alphabet.length;
261 scores = new float[size][size];
263 else if (scores == null)
265 throw new FileFormatException(
266 "No alphabet specified in matrix file");
268 else if (row >= size)
270 throw new FileFormatException("Too many data rows in matrix file");
274 parseValues(data, lineNo, scores, row, alphabet);
279 ScoreMatrix sm = new ScoreMatrix(name, description, alphabet, scores);
286 * Parse one row of score values, delimited by whitespace or commas. The line
287 * may optionally include the symbol from which the scores are defined. Values
288 * may be present for all columns, or only up to the diagonal (in which case
289 * upper diagonal values are set symmetrically).
292 * the line to be parsed
295 * the score matrix to add data to
297 * the row number / alphabet index position
301 * if invalid, or too few, or too many values
303 protected void parseValues(String data, int lineNo, float[][] scores,
304 int row, char[] alphabet) throws FileFormatException
307 int size = alphabet.length;
308 StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS);
310 int tokenCount = scoreLine.countTokens();
313 * inspect first row to see if it includes the symbol in the first column,
314 * and to see if it is lower diagonal values only (i.e. just one score)
318 if (data.startsWith(String.valueOf(alphabet[0])))
320 hasGuideColumn = true;
322 if (tokenCount == (hasGuideColumn ? 2 : 1))
324 isLowerDiagonalOnly = true;
331 * check 'guide' symbol is the row'th letter of the alphabet
333 String symbol = scoreLine.nextToken();
334 if (symbol.length() > 1 || symbol.charAt(0) != alphabet[row])
337 "Error parsing score matrix at line %d, expected '%s' but found '%s'",
338 lineNo, alphabet[row], symbol);
339 throw new FileFormatException(err);
341 tokenCount = scoreLine.countTokens(); // excluding guide symbol
345 * check the right number of values (lower diagonal or full format)
347 if (isLowerDiagonalOnly && tokenCount != row + 1)
350 "Expected %d scores at line %d: '%s' but found %d", row + 1,
351 lineNo, data, tokenCount);
352 throw new FileFormatException(err);
355 if (!isLowerDiagonalOnly && tokenCount != size)
358 "Expected %d scores at line %d: '%s' but found %d", size,
359 lineNo, data, scoreLine.countTokens());
360 throw new FileFormatException(err);
364 * parse and set the values, setting the symmetrical value
365 * as well if lower diagonal format data
367 scores[row] = new float[size];
370 while (scoreLine.hasMoreTokens())
374 value = scoreLine.nextToken();
375 scores[row][col] = Float.valueOf(value);
376 if (isLowerDiagonalOnly)
378 scores[col][row] = scores[row][col];
381 } catch (NumberFormatException e)
383 err = String.format("Invalid score value '%s' at line %d column %d",
385 throw new FileFormatException(err);
391 * Parse the line in an aaindex file that looks like
394 * M rows = ARNDCQEGHILKMFPSTWYV, cols = ARNDCQEGHILKMFPSTWYV
397 * rejecting it if rows and cols do not match. Returns the string of
398 * characters in the row/cols alphabet.
403 * @throws FileFormatException
405 protected char[] parseAAindexRowsColumns(int lineNo, String data)
406 throws FileFormatException
408 String err = "Unexpected aaIndex score matrix data at line " + lineNo
413 String[] toks = data.split(",");
414 String rowsAlphabet = toks[0].split("=")[1].trim();
415 String colsAlphabet = toks[1].split("=")[1].trim();
416 if (!rowsAlphabet.equals(colsAlphabet))
418 throw new FileFormatException("rows != cols");
420 return rowsAlphabet.toCharArray();
421 } catch (Throwable t)
423 throw new FileFormatException(err + " " + t.getMessage());
428 * Answers true if line is one we are not interested in from AAindex format
434 protected boolean skipAAindexLine(String data)
436 if (data.startsWith(COMMENT_CHAR) || data.length() == 0)
440 if (data.startsWith("*") || data.startsWith("R ")
441 || data.startsWith("A ") || data.startsWith("T ")
442 || data.startsWith("J ") || data.startsWith("//"))
449 public String getMatrixName()