/*
* Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
* Copyright (C) $$Year-Rel$$ The Jalview Authors
*
* This file is part of Jalview.
*
* Jalview is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* Jalview is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Jalview. If not, see <http://www.gnu.org/licenses/>.
* The Jalview Authors are detailed in the 'AUTHORS' file.
*/
package jalview.analysis.scoremodels;
import jalview.api.analysis.ScoreModelI;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.StringTokenizer;
public class ScoreMatrix extends PairwiseSeqScoreModel implements
ScoreModelI
{
public static final short UNMAPPED = (short) -1;
private static final String DELIMITERS = " ,\t";
private static final String COMMENT_CHAR = "#";
private static final String BAD_ASCII_ERROR = "Unexpected character %s in getPairwiseScore";
private static final int MAX_ASCII = 127;
/*
* the name of the model as shown in menus
*/
private String name;
/*
* the characters that the model provides scores for
*/
private char[] symbols;
/*
* the score matrix; both dimensions must equal the number of symbols
* matrix[i][j] is the substitution score for replacing symbols[i] with symbols[j]
*/
private float[][] matrix;
/*
* quick lookup to convert from an ascii character value to the index
* of the corresponding symbol in the score matrix
*/
private short[] symbolIndex;
/*
* true for Protein Score matrix, false for dna score matrix
*/
private boolean peptide;
/**
* Constructor
*
* @param name
* Unique, human readable name for the matrix
* @param alphabet
* the symbols to which scores apply
* @param matrix
* Pairwise scores indexed according to the symbol alphabet
*/
public ScoreMatrix(String name, char[] alphabet, float[][] matrix)
{
this.matrix = matrix;
this.name = name;
this.symbols = alphabet;
symbolIndex = buildSymbolIndex(alphabet);
/*
* crude heuristic for now...
*/
peptide = alphabet.length >= 20;
}
/**
* Returns an array A where A[i] is the position in the alphabet array of the
* character whose value is i. For example if the alphabet is { 'A', 'D', 'X'
* } then A['D'] = A[68] = 1.
*
* Unmapped characters (not in the alphabet) get an index of -1.
*
* Mappings are added automatically for lower case symbols (for non case
* sensitive scoring), unless they are explicitly present in the alphabet (are
* scored separately in the score matrix).
*
* @param alphabet
* @return
*/
static short[] buildSymbolIndex(char[] alphabet)
{
short[] index = new short[MAX_ASCII + 1];
Arrays.fill(index, UNMAPPED);
short pos = 0;
for (char c : alphabet)
{
if (c <= MAX_ASCII)
{
index[c] = pos;
}
/*
* also map lower-case character (unless separately mapped)
*/
if (c >= 'A' && c <= 'Z')
{
short lowerCase = (short) (c + ('a' - 'A'));
if (index[lowerCase] == UNMAPPED)
{
index[lowerCase] = pos;
}
}
pos++;
}
return index;
}
@Override
public String getName()
{
return name;
}
@Override
public boolean isDNA()
{
return !peptide;
}
@Override
public boolean isProtein()
{
return peptide;
}
@Override
public float[][] getMatrix()
{
return matrix;
}
/**
* Returns the pairwise score for substituting c with d, or zero if c or d is
* an unscored or unexpected character
*/
@Override
public float getPairwiseScore(char c, char d)
{
if (c > MAX_ASCII)
{
System.err.println(String.format(BAD_ASCII_ERROR, c));
return 0;
}
if (d > MAX_ASCII)
{
System.err.println(String.format(BAD_ASCII_ERROR, d));
return 0;
}
int cIndex = symbolIndex[c];
int dIndex = symbolIndex[d];
if (cIndex != UNMAPPED && dIndex != UNMAPPED)
{
return matrix[cIndex][dIndex];
}
return 0;
}
/**
* pretty print the matrix
*/
@Override
public String toString()
{
return outputMatrix(false);
}
/**
* Print the score matrix, optionally formatted as html, with the alphabet symbols as column headings and at the start of each row
* @param html
* @return
*/
public String outputMatrix(boolean html)
{
StringBuilder sb = new StringBuilder(512);
/*
* heading row with alphabet
*/
if (html)
{
sb.append("
");
sb.append(html ? " | " : "");
}
for (char sym : symbols)
{
if (html)
{
sb.append(" ").append(sym).append(" | ");
}
else
{
sb.append("\t").append(sym);
}
}
sb.append(html ? "
\n" : "\n");
/*
* table of scores
*/
for (char c1 : symbols)
{
if (html)
{
sb.append("");
}
sb.append(c1).append(html ? " | " : "");
for (char c2 : symbols)
{
sb.append(html ? "" : "\t")
.append(matrix[symbolIndex[c1]][symbolIndex[c2]])
.append(html ? " | " : "");
}
sb.append(html ? "
\n" : "\n");
}
if (html)
{
sb.append("
");
}
return sb.toString();
}
/**
* Parse a score matrix from the given input stream and returns a ScoreMatrix
* object. If parsing fails, error messages are written to syserr and null is
* returned. It is the caller's responsibility to close the input stream.
*
* @param is
* @return
*/
public static ScoreMatrix parse(InputStream is)
{
ScoreMatrix sm = null;
BufferedReader br = new BufferedReader(new InputStreamReader(is));
int lineNo = 0;
String name = null;
String alphabet = null;
float[][] scores = null;
int size = 0;
int row = 0;
try
{
String data;
while ((data = br.readLine()) != null)
{
lineNo++;
data = data.trim();
if (data.startsWith(COMMENT_CHAR))
{
continue;
}
if (data.toLowerCase().startsWith("scorematrix"))
{
/*
* Parse name from ScoreMatrix
*/
if (name != null)
{
System.err
.println("Warning: 'ScoreMatrix' repeated in file at line "
+ lineNo);
}
StringTokenizer nameLine = new StringTokenizer(data, DELIMITERS);
if (nameLine.countTokens() != 2)
{
System.err
.println("Format error: expected 'ScoreMatrix ', found '"
+ data + "' at line " + lineNo);
return null;
}
nameLine.nextToken();
name = nameLine.nextToken();
continue;
}
else if (name == null)
{
System.err
.println("Format error: 'ScoreMatrix ' should be the first non-comment line");
return null;
}
/*
* next line after ScoreMatrix should be the alphabet of scored symbols
*/
if (alphabet == null)
{
alphabet = data;
size = alphabet.length();
scores = new float[size][];
continue;
}
/*
* too much information?
*/
if (row >= size && data.length() > 0) {
System.err
.println("Unexpected extra input line in score model file "
+ data);
return null;
}
/*
* subsequent lines should be the symbol scores
*/
StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS);
if (scoreLine.countTokens() != size)
{
System.err.println(String.format(
"Expected %d tokens at line %d but found %d", size,
lineNo, scoreLine.countTokens()));
return null;
}
scores[row] = new float[size];
int col = 0;
String value = null;
while (scoreLine.hasMoreTokens()) {
try {
value = scoreLine.nextToken();
scores[row][col] = Float.valueOf(value);
col++;
} catch (NumberFormatException e)
{
System.err.println(String.format(
"Invalid score value %s at line %d column %d", value,
lineNo, col));
return null;
}
}
row++;
}
} catch (IOException e)
{
System.err.println("Error reading score matrix file: "
+ e.getMessage() + " at line " + lineNo);
}
/*
* out of data - check we found enough
*/
if (row < size)
{
System.err
.println(String
.format("Expected %d rows of score data in score matrix but only found %d",
size, row));
return null;
}
/*
* If we get here, then name, alphabet and scores have been parsed successfully
*/
sm = new ScoreMatrix(name, alphabet.toCharArray(), scores);
return sm;
}
}