/* * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) * Copyright (C) $$Year-Rel$$ The Jalview Authors * * This file is part of Jalview. * * Jalview is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * Jalview is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty * of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Jalview. If not, see . * The Jalview Authors are detailed in the 'AUTHORS' file. */ package jalview.analysis.scoremodels; import jalview.api.analysis.ScoreModelI; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.Arrays; import java.util.StringTokenizer; public class ScoreMatrix extends PairwiseSeqScoreModel implements ScoreModelI { public static final short UNMAPPED = (short) -1; private static final String DELIMITERS = " ,\t"; private static final String COMMENT_CHAR = "#"; private static final String BAD_ASCII_ERROR = "Unexpected character %s in getPairwiseScore"; private static final int MAX_ASCII = 127; /* * the name of the model as shown in menus */ private String name; /* * the characters that the model provides scores for */ private char[] symbols; /* * the score matrix; both dimensions must equal the number of symbols * matrix[i][j] is the substitution score for replacing symbols[i] with symbols[j] */ private float[][] matrix; /* * quick lookup to convert from an ascii character value to the index * of the corresponding symbol in the score matrix */ private short[] symbolIndex; /* * true for Protein Score matrix, false for dna score matrix */ private boolean peptide; /** * Constructor * * @param name * Unique, human readable name for the matrix * @param alphabet * the symbols to which scores apply * @param matrix * Pairwise scores indexed according to the symbol alphabet */ public ScoreMatrix(String name, char[] alphabet, float[][] matrix) { this.matrix = matrix; this.name = name; this.symbols = alphabet; symbolIndex = buildSymbolIndex(alphabet); /* * crude heuristic for now... */ peptide = alphabet.length >= 20; } /** * Returns an array A where A[i] is the position in the alphabet array of the * character whose value is i. For example if the alphabet is { 'A', 'D', 'X' * } then A['D'] = A[68] = 1. *

* Unmapped characters (not in the alphabet) get an index of -1. *

* Mappings are added automatically for lower case symbols (for non case * sensitive scoring), unless they are explicitly present in the alphabet (are * scored separately in the score matrix). * * @param alphabet * @return */ static short[] buildSymbolIndex(char[] alphabet) { short[] index = new short[MAX_ASCII + 1]; Arrays.fill(index, UNMAPPED); short pos = 0; for (char c : alphabet) { if (c <= MAX_ASCII) { index[c] = pos; } /* * also map lower-case character (unless separately mapped) */ if (c >= 'A' && c <= 'Z') { short lowerCase = (short) (c + ('a' - 'A')); if (index[lowerCase] == UNMAPPED) { index[lowerCase] = pos; } } pos++; } return index; } @Override public String getName() { return name; } @Override public boolean isDNA() { return !peptide; } @Override public boolean isProtein() { return peptide; } @Override public float[][] getMatrix() { return matrix; } /** * Returns the pairwise score for substituting c with d, or zero if c or d is * an unscored or unexpected character */ @Override public float getPairwiseScore(char c, char d) { if (c > MAX_ASCII) { System.err.println(String.format(BAD_ASCII_ERROR, c)); return 0; } if (d > MAX_ASCII) { System.err.println(String.format(BAD_ASCII_ERROR, d)); return 0; } int cIndex = symbolIndex[c]; int dIndex = symbolIndex[d]; if (cIndex != UNMAPPED && dIndex != UNMAPPED) { return matrix[cIndex][dIndex]; } return 0; } /** * pretty print the matrix */ @Override public String toString() { return outputMatrix(false); } /** * Print the score matrix, optionally formatted as html, with the alphabet symbols as column headings and at the start of each row * @param html * @return */ public String outputMatrix(boolean html) { StringBuilder sb = new StringBuilder(512); /* * heading row with alphabet */ if (html) { sb.append(""); sb.append(html ? "" : ""); } for (char sym : symbols) { if (html) { sb.append(""); } else { sb.append("\t").append(sym); } } sb.append(html ? "\n" : "\n"); /* * table of scores */ for (char c1 : symbols) { if (html) { sb.append("" : ""); for (char c2 : symbols) { sb.append(html ? "" : ""); } sb.append(html ? "\n" : "\n"); } if (html) { sb.append("
 ").append(sym).append(" 
"); } sb.append(c1).append(html ? "" : "\t") .append(matrix[symbolIndex[c1]][symbolIndex[c2]]) .append(html ? "
"); } return sb.toString(); } /** * Parse a score matrix from the given input stream and returns a ScoreMatrix * object. If parsing fails, error messages are written to syserr and null is * returned. It is the caller's responsibility to close the input stream. * * @param is * @return */ public static ScoreMatrix parse(InputStream is) { ScoreMatrix sm = null; BufferedReader br = new BufferedReader(new InputStreamReader(is)); int lineNo = 0; String name = null; String alphabet = null; float[][] scores = null; int size = 0; int row = 0; try { String data; while ((data = br.readLine()) != null) { lineNo++; data = data.trim(); if (data.startsWith(COMMENT_CHAR)) { continue; } if (data.toLowerCase().startsWith("scorematrix")) { /* * Parse name from ScoreMatrix */ if (name != null) { System.err .println("Warning: 'ScoreMatrix' repeated in file at line " + lineNo); } StringTokenizer nameLine = new StringTokenizer(data, DELIMITERS); if (nameLine.countTokens() != 2) { System.err .println("Format error: expected 'ScoreMatrix ', found '" + data + "' at line " + lineNo); return null; } nameLine.nextToken(); name = nameLine.nextToken(); continue; } else if (name == null) { System.err .println("Format error: 'ScoreMatrix ' should be the first non-comment line"); return null; } /* * next line after ScoreMatrix should be the alphabet of scored symbols */ if (alphabet == null) { alphabet = data; size = alphabet.length(); scores = new float[size][]; continue; } /* * too much information? */ if (row >= size && data.length() > 0) { System.err .println("Unexpected extra input line in score model file " + data); return null; } /* * subsequent lines should be the symbol scores */ StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS); if (scoreLine.countTokens() != size) { System.err.println(String.format( "Expected %d tokens at line %d but found %d", size, lineNo, scoreLine.countTokens())); return null; } scores[row] = new float[size]; int col = 0; String value = null; while (scoreLine.hasMoreTokens()) { try { value = scoreLine.nextToken(); scores[row][col] = Float.valueOf(value); col++; } catch (NumberFormatException e) { System.err.println(String.format( "Invalid score value %s at line %d column %d", value, lineNo, col)); return null; } } row++; } } catch (IOException e) { System.err.println("Error reading score matrix file: " + e.getMessage() + " at line " + lineNo); } /* * out of data - check we found enough */ if (row < size) { System.err .println(String .format("Expected %d rows of score data in score matrix but only found %d", size, row)); return null; } /* * If we get here, then name, alphabet and scores have been parsed successfully */ sm = new ScoreMatrix(name, alphabet.toCharArray(), scores); return sm; } }