3 import jalview.analysis.scoremodels.ScoreMatrix;
4 import jalview.analysis.scoremodels.ScoreModels;
5 import jalview.datamodel.SequenceI;
7 import java.io.IOException;
8 import java.util.StringTokenizer;
11 * A class that can parse a file containing a substitution matrix and register
12 * it for use in Jalview
14 * Accepts 'NCBI' format (e.g.
15 * https://www.ncbi.nlm.nih.gov/Class/FieldGuide/BLOSUM62.txt), with the
16 * addition of a header line to provide a matrix name, e.g.
19 * ScoreMatrix BLOSUM62
22 * Also accepts 'aaindex' format (as described at
23 * http://www.genome.jp/aaindex/aaindex_help.html) with the minimum data
27 * H accession number (used as score matrix identifier in Jalview)
28 * D description (used for tooltip in Jalview)
30 * and the substitution scores
33 public class ScoreMatrixFile extends AlignFile implements
36 // first non-comment line identifier - also checked in IdentifyFile
37 public static final String SCOREMATRIX = "SCOREMATRIX";
39 private static final String DELIMITERS = " ,\t";
41 private static final String COMMENT_CHAR = "#";
43 private String matrixName;
51 public ScoreMatrixFile(FileParse source) throws IOException
57 public String print(SequenceI[] sqs, boolean jvsuffix)
63 * Parses the score matrix file, and if successful registers the matrix so it
64 * will be shown in Jalview menus.
67 public void parse() throws IOException
69 ScoreMatrix sm = parseMatrix();
71 ScoreModels.getInstance().registerScoreModel(sm);
75 * Parses the score matrix file and constructs a ScoreMatrix object. If an
76 * error is found in parsing, it is thrown as FileFormatException. Any
77 * warnings are written to syserr.
82 public ScoreMatrix parseMatrix() throws IOException
84 ScoreMatrix sm = null;
87 String alphabet = null;
88 float[][] scores = null;
94 while ((data = nextLine()) != null)
98 if (data.startsWith(COMMENT_CHAR) || data.length() == 0)
102 if (data.toUpperCase().startsWith(SCOREMATRIX))
105 * Parse name from ScoreMatrix <name>
106 * we allow any delimiter after ScoreMatrix then take the rest of the line
111 .println("Warning: 'ScoreMatrix' repeated in file at line "
114 StringTokenizer nameLine = new StringTokenizer(data, DELIMITERS);
115 if (nameLine.countTokens() < 2)
117 err = "Format error: expected 'ScoreMatrix <name>', found '"
118 + data + "' at line " + lineNo;
119 throw new FileFormatException(err);
121 nameLine.nextToken(); // 'ScoreMatrix'
122 name = nameLine.nextToken(); // next field
123 name = data.substring(1).substring(data.substring(1).indexOf(name));
126 else if (name == null)
128 err = "Format error: 'ScoreMatrix <name>' should be the first non-comment line";
129 throw new FileFormatException(err);
133 * next line after ScoreMatrix should be the alphabet of scored symbols
135 if (alphabet == null)
138 size = alphabet.length();
139 scores = new float[size][];
144 * too much information
148 err = "Unexpected extra input line in score model file: '" + data
150 throw new FileFormatException(err);
154 * permit an uncommented line with delimited residue headings
156 if (isHeaderLine(data, alphabet))
162 * subsequent lines should be the symbol scores
163 * optionally with the symbol as the first column for readability
165 StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS);
166 int tokenCount = scoreLine.countTokens();
167 if (tokenCount == size + 1)
170 * check 'guide' symbol is the row'th letter of the alphabet
172 String symbol = scoreLine.nextToken();
173 if (symbol.length() > 1 || symbol.charAt(0) != alphabet.charAt(row))
176 .format("Error parsing score matrix at line %d, expected '%s' but found '%s'",
177 lineNo, alphabet.charAt(row), symbol);
178 throw new FileFormatException(err);
181 if (scoreLine.countTokens() != size)
183 err = String.format("Expected %d scores at line %d but found %d",
184 size, lineNo, scoreLine.countTokens());
185 throw new FileFormatException(err);
187 scores[row] = new float[size];
190 while (scoreLine.hasMoreTokens())
194 value = scoreLine.nextToken();
195 scores[row][col] = Float.valueOf(value);
197 } catch (NumberFormatException e)
200 "Invalid score value '%s' at line %d column %d", value,
202 throw new FileFormatException(err);
209 * out of data - check we found enough
214 .format("Expected %d rows of score data in score matrix but only found %d",
216 throw new FileFormatException(err);
220 * If we get here, then name, alphabet and scores have been parsed successfully
222 sm = new ScoreMatrix(name, alphabet.toCharArray(), scores);
229 * Answers true if the data line consists of the alphabet characters,
230 * delimited (as to provide a heading row). Otherwise returns false (e.g. if
231 * the data is a row of score values).
237 private boolean isHeaderLine(String data, String alphabet)
239 StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS);
241 while (scoreLine.hasMoreElements())
244 * skip over characters in the alphabet that are
245 * also a delimiter (e.g. space)
247 char symbol = alphabet.charAt(i++);
248 if (!DELIMITERS.contains(String.valueOf(symbol)))
250 if (!String.valueOf(symbol).equals(scoreLine.nextToken()))
259 public String getMatrixName()