+ * Parse input as AAIndex format, starting from the header line with the
+ * accession id
+ *
+ * @param lineNo
+ * @param data
+ * @return
+ * @throws IOException
+ */
+ protected ScoreMatrix parseAAIndexFormat(int lineNo, String data)
+ throws IOException
+ {
+ String name = data.substring(2).trim();
+ String description = null;
+
+ float[][] scores = null;
+ char[] alphabet = null;
+ int row = 0;
+ int size = 0;
+
+ while ((data = nextLine()) != null)
+ {
+ lineNo++;
+ data = data.trim();
+ if (skipAAindexLine(data))
+ {
+ continue;
+ }
+ if (data.startsWith("D "))
+ {
+ description = data.substring(2).trim();
+ }
+ else if (data.startsWith("M "))
+ {
+ alphabet = parseAAindexRowsColumns(lineNo, data);
+ size = alphabet.length;
+ scores = new float[size][size];
+ }
+ else if (scores == null)
+ {
+ throw new FileFormatException(
+ "No alphabet specified in matrix file");
+ }
+ else if (row >= size)
+ {
+ throw new FileFormatException("Too many data rows in matrix file");
+ }
+ else
+ {
+ parseValues(data, lineNo, scores, row, alphabet);
+ row++;
+ if (row == size)
+ {
+ break;
+ }
+ }
+ }
+ if (data != null)
+ {
+ System.err.println("Warning: unexpected extra data in matrix file: "
+ + data);
+ }
+
+ ScoreMatrix sm = new ScoreMatrix(name, alphabet, scores);
+ sm.setDescription(description);
+ matrixName = name;
+
+ return sm;
+ }
+
+ /**
+ * Parse one row of score values, delimited by whitespace or commas. The line
+ * may optionally include the symbol from which the scores are defined. Values
+ * may be present for all columns, or only up to the diagonal (in which case
+ * upper diagonal values are set symmetrically).
+ *
+ * @param data
+ * the line to be parsed
+ * @param lineNo
+ * @param scores
+ * the score matrix to add data to
+ * @param row
+ * the row number / alphabet index position
+ * @param alphabet
+ * @return
+ * @throws exception
+ * if invalid, or too few, or too many values
+ */
+ protected void parseValues(String data, int lineNo, float[][] scores,
+ int row, char[] alphabet) throws FileFormatException
+ {
+ String err;
+ int size = alphabet.length;
+ StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS);
+
+ int tokenCount = scoreLine.countTokens();
+ if (tokenCount == size + 1)
+ {
+ /*
+ * check 'guide' symbol is the row'th letter of the alphabet
+ */
+ String symbol = scoreLine.nextToken();
+ if (symbol.length() > 1 || symbol.charAt(0) != alphabet[row])
+ {
+ err = String
+ .format("Error parsing score matrix at line %d, expected '%s' but found '%s'",
+ lineNo, alphabet[row], symbol);
+ throw new FileFormatException(err);
+ }
+ }
+
+ tokenCount = scoreLine.countTokens();
+
+ /*
+ * AAIndex format only has the lower diagonal i.e.
+ * 1 score in row 0, 2 in row 1, etc
+ * check this in all but the last row (which is the same either way)
+ */
+ if (row < size - 1)
+ {
+ boolean lowerDiagonal = tokenCount == row + 1;
+ if (lowerDiagonalOnly && !lowerDiagonal)
+ {
+ /*
+ * had detected lower diagonal form but now it isn't - error
+ */
+ err = String.format("Unexpected number of tokens at line %d",
+ lineNo);
+ throw new FileFormatException(err);
+ }
+ lowerDiagonalOnly = lowerDiagonal;
+ }
+
+ if (!lowerDiagonalOnly && tokenCount != size)
+ {
+ err = String.format("Expected %d scores at line %d but found %d",
+ size, lineNo, scoreLine.countTokens());
+ throw new FileFormatException(err);
+ }
+ scores[row] = new float[size];
+ int col = 0;
+ String value = null;
+ while (scoreLine.hasMoreTokens())
+ {
+ try
+ {
+ value = scoreLine.nextToken();
+ scores[row][col] = Float.valueOf(value);
+ if (lowerDiagonalOnly)
+ {
+ scores[col][row] = scores[row][col];
+ }
+ col++;
+ } catch (NumberFormatException e)
+ {
+ err = String.format(
+ "Invalid score value '%s' at line %d column %d", value,
+ lineNo, col);
+ throw new FileFormatException(err);
+ }
+ }
+ }
+
+ /**
+ * Parse the line in an aaindex file that looks like
+ *
+ * <pre>
+ * M rows = ARNDCQEGHILKMFPSTWYV, cols = ARNDCQEGHILKMFPSTWYV
+ * </pre>
+ *
+ * rejecting it if rows and cols do not match. Returns the string of
+ * characters in the row/cols alphabet.
+ *
+ * @param lineNo
+ * @param data
+ * @return
+ * @throws FileFormatException
+ */
+ protected char[] parseAAindexRowsColumns(int lineNo, String data)
+ throws FileFormatException
+ {
+ String err = "Unexpected aaIndex score matrix data at line " + lineNo
+ + ": " + data;
+
+ try
+ {
+ String[] toks = data.split(",");
+ String rowsAlphabet = toks[0].split("=")[1].trim();
+ String colsAlphabet = toks[1].split("=")[1].trim();
+ if (!rowsAlphabet.equals(colsAlphabet))
+ {
+ throw new FileFormatException("rows != cols");
+ }
+ return rowsAlphabet.toCharArray();
+ } catch (Throwable t)
+ {
+ throw new FileFormatException(err + " " + t.getMessage());
+ }
+ }
+
+ /**
+ * Answers true if line is one we are not interested in from AAindex format
+ * file
+ *
+ * @param data
+ * @return
+ */
+ protected boolean skipAAindexLine(String data)
+ {
+ if (data.startsWith(COMMENT_CHAR) || data.length() == 0)
+ {
+ return true;
+ }
+ if (data.startsWith("*") || data.startsWith("R ")
+ || data.startsWith("A ") || data.startsWith("T ")
+ || data.startsWith("J ") || data.startsWith("//"))
+ {
+ return true;
+ }
+ return false;
+ }
+
+ /**