package jalview.io;
import jalview.analysis.scoremodels.ScoreMatrix;
import jalview.analysis.scoremodels.ScoreModels;
import jalview.datamodel.SequenceI;
import java.io.IOException;
import java.util.StringTokenizer;
/**
* A class that can parse a file containing a substitution matrix and register
* it for use in Jalview.
* <p>
* Accepts 'NCBI' format (e.g.
* https://www.ncbi.nlm.nih.gov/Class/FieldGuide/BLOSUM62.txt), with the
* addition of a header line to provide a matrix name, e.g.
*
* <pre>
* ScoreMatrix BLOSUM62
* </pre>
*
* Also accepts 'AAindex' format (as described at
* http://www.genome.jp/aaindex/aaindex_help.html) with the minimum data
* required being
*
* <pre>
* H accession number (used as score matrix identifier in Jalview)
* D description (used for tooltip in Jalview)
* M rows = symbolList, cols = symbolList
* and the substitution scores
* </pre>
*/
public class ScoreMatrixFile extends AlignFile implements
AlignmentFileReaderI
{
// first non-comment line identifier - also checked in IdentifyFile
// (matched against the upper-cased line, so any case is accepted in the file)
public static final String SCOREMATRIX = "SCOREMATRIX";
// characters that separate fields on a line: space, comma or tab
private static final String DELIMITERS = " ,\t";
// lines starting with this prefix are ignored by the parser
private static final String COMMENT_CHAR = "#";
// name of the matrix built by the last successful parse (null until then)
private String matrixName;
/*
* aaindex format has scores for diagonal and below only;
* this is detected from the first row of score values in parseValues
*/
boolean isLowerDiagonalOnly;
/*
* ncbi format has symbols as first column on score rows;
* this is detected from the first row of score values in parseValues
*/
boolean hasGuideColumn;
/**
* Constructor. Hands the data source to the superclass constructor; no
* parsing is performed by this constructor itself.
*
* @param source
*          the source from which the score matrix data will be read
* @throws IOException
*           if thrown by the superclass constructor
*/
public ScoreMatrixFile(FileParse source) throws IOException
{
// NOTE(review): the 'false' flag presumably tells AlignFile not to parse
// immediately - confirm against the AlignFile constructor
super(false, source);
}
/**
* Not implemented for score matrix files: always returns null, whatever
* arguments are supplied.
*
* @param sqs
*          ignored
* @param jvsuffix
*          ignored
* @return null
*/
@Override
public String print(SequenceI[] sqs, boolean jvsuffix)
{
return null;
}
/**
 * Reads the score matrix from the file and, provided parsing succeeds, adds
 * it to the registry of score models so that it appears in Jalview menus.
 * Parsing state is held on this object, so the method is not thread-safe;
 * use a separate instance of this class per thread.
 *
 * @throws IOException
 *           if the file cannot be read or its content is not a valid matrix
 */
@Override
public void parse() throws IOException
{
  ScoreModels.getInstance().registerScoreModel(parseMatrix());
}
/**
 * Parses the score matrix file and constructs a ScoreMatrix object. If an
 * error is found in parsing, it is thrown as FileFormatException.
 * <p>
 * Blank lines and lines starting with '#' are ignored. The first remaining
 * line must either start with 'ScoreMatrix' (in any case) followed by the
 * matrix name, or start with 'H ', in which case the rest of the file is
 * parsed as AAindex format. After a 'ScoreMatrix' line, the next line gives
 * the alphabet (one symbol per column heading, only the first character of
 * each heading is used), followed by exactly one row of scores per symbol.
 *
 * @return the parsed matrix
 * @throws IOException
 *           (as FileFormatException) if the header line is missing, repeated
 *           or has no name, if no alphabet line is found, or if there are too
 *           few, too many or invalid rows of scores
 */
public ScoreMatrix parseMatrix() throws IOException
{
  int lineNo = 0;
  String name = null;
  char[] alphabet = null;
  float[][] scores = null;
  int size = 0;
  int row = 0;
  String err = null;
  String data;

  /*
   * reset both format flags so that nothing detected by an earlier parse
   * on this object can affect this one
   */
  isLowerDiagonalOnly = false;
  hasGuideColumn = false;

  while ((data = nextLine()) != null)
  {
    lineNo++;
    data = data.trim();
    if (data.startsWith(COMMENT_CHAR) || data.length() == 0)
    {
      continue;
    }

    /*
     * case-insensitive prefix test that does not depend on the default
     * locale (toUpperCase() maps 'i' to a dotted capital in some locales)
     */
    if (data.regionMatches(true, 0, SCOREMATRIX, 0, SCOREMATRIX.length()))
    {
      /*
       * Parse name from ScoreMatrix <name>
       * we allow any delimiter after ScoreMatrix then take the rest of the line
       */
      if (name != null)
      {
        throw new FileFormatException(
                "Error: 'ScoreMatrix' repeated in file at line "
                        + lineNo);
      }
      StringTokenizer nameLine = new StringTokenizer(data, DELIMITERS);
      if (nameLine.countTokens() < 2)
      {
        err = "Format error: expected 'ScoreMatrix ', found '"
                + data + "' at line " + lineNo;
        throw new FileFormatException(err);
      }

      /*
       * step over the first field and the delimiters after it; searching
       * for the text of the second field instead could match inside the
       * first field (e.g. a matrix named "core" or "M")
       */
      int pos = 0;
      while (pos < data.length()
              && DELIMITERS.indexOf(data.charAt(pos)) == -1)
      {
        pos++;
      }
      while (pos < data.length()
              && DELIMITERS.indexOf(data.charAt(pos)) != -1)
      {
        pos++;
      }
      name = data.substring(pos);
      continue;
    }
    else if (data.startsWith("H ") && name == null)
    {
      /*
       * AAindex identifier
       */
      return parseAAIndexFormat(lineNo, data);
    }
    else if (name == null)
    {
      err = "Format error: 'ScoreMatrix ' should be the first non-comment line";
      throw new FileFormatException(err);
    }

    /*
     * next non-comment line after ScoreMatrix should be the
     * column header line with the alphabet of scored symbols
     */
    if (alphabet == null)
    {
      StringTokenizer columnHeadings = new StringTokenizer(data,
              DELIMITERS);
      size = columnHeadings.countTokens();
      if (size == 0)
      {
        // line consisted only of delimiters (e.g. commas)
        err = "Format error: no symbols found in column header '" + data
                + "' at line " + lineNo;
        throw new FileFormatException(err);
      }
      alphabet = new char[size];
      int col = 0;
      while (columnHeadings.hasMoreTokens())
      {
        alphabet[col++] = columnHeadings.nextToken().charAt(0);
      }
      scores = new float[size][];
      continue;
    }

    /*
     * too much information
     */
    if (row >= size)
    {
      err = "Unexpected extra input line in score model file: '" + data
              + "'";
      throw new FileFormatException(err);
    }

    parseValues(data, lineNo, scores, row, alphabet);
    row++;
  }

  /*
   * out of data - check we found a header, an alphabet and enough rows
   */
  if (name == null)
  {
    throw new FileFormatException(
            "Format error: no score matrix found in file");
  }
  if (alphabet == null)
  {
    throw new FileFormatException(
            "Format error: no column header (alphabet) line found for score matrix '"
                    + name + "'");
  }
  if (row < size)
  {
    err = String
            .format("Expected %d rows of score data in score matrix but only found %d",
                    size, row);
    throw new FileFormatException(err);
  }

  /*
   * If we get here, then name, alphabet and scores have been parsed successfully
   */
  ScoreMatrix sm = new ScoreMatrix(name, alphabet, scores);
  matrixName = name;
  return sm;
}
/**
 * Parse input as AAIndex format, starting from the header line with the
 * accession id. Reads all remaining lines: 'D ' supplies the description,
 * 'M ' supplies the alphabet, lines accepted by skipAAindexLine are ignored,
 * and every other line is treated as a row of score values.
 *
 * @param lineNo
 *          the number of the header line (used to number later lines in
 *          error messages)
 * @param data
 *          the 'H ' header line; the text after the first two characters is
 *          used as the matrix name
 * @return the parsed matrix
 * @throws IOException
 *           (as FileFormatException) if scores are found before an 'M ' line,
 *           if no 'M ' line is found at all, or if there are too many, too
 *           few or invalid rows of scores
 */
protected ScoreMatrix parseAAIndexFormat(int lineNo, String data)
        throws IOException
{
  String name = data.substring(2).trim();
  String description = null;

  float[][] scores = null;
  char[] alphabet = null;
  int row = 0;
  int size = 0;

  while ((data = nextLine()) != null)
  {
    lineNo++;
    data = data.trim();
    if (skipAAindexLine(data))
    {
      continue;
    }
    if (data.startsWith("D "))
    {
      description = data.substring(2).trim();
    }
    else if (data.startsWith("M "))
    {
      alphabet = parseAAindexRowsColumns(lineNo, data);
      size = alphabet.length;
      scores = new float[size][size];
    }
    else if (scores == null)
    {
      throw new FileFormatException(
              "No alphabet specified in matrix file");
    }
    else if (row >= size)
    {
      throw new FileFormatException("Too many data rows in matrix file");
    }
    else
    {
      parseValues(data, lineNo, scores, row, alphabet);
      row++;
    }
  }

  /*
   * out of data - check we found an alphabet and a row of scores for each
   * of its symbols, rather than building a matrix from missing data
   */
  if (alphabet == null)
  {
    throw new FileFormatException("No alphabet specified in matrix file");
  }
  if (row < size)
  {
    throw new FileFormatException(String.format(
            "Expected %d rows of score data in score matrix but only found %d",
            size, row));
  }

  ScoreMatrix sm = new ScoreMatrix(name, description, alphabet, scores);
  matrixName = name;

  return sm;
}
/**
 * Parse one row of score values, delimited by whitespace or commas. The line
 * may optionally include the symbol from which the scores are defined. Values
 * may be present for all columns, or only up to the diagonal (in which case
 * upper diagonal values are set symmetrically).
 * <p>
 * The layout is decided from the first row (row 0) and recorded in
 * hasGuideColumn and isLowerDiagonalOnly; later rows must follow the same
 * layout.
 *
 * @param data
 *          the line to be parsed
 * @param lineNo
 *          the line number, used in error messages
 * @param scores
 *          the score matrix to add data to
 * @param row
 *          the row number / alphabet index position
 * @param alphabet
 *          the symbols of the matrix, in row (and column) order
 * @throws FileFormatException
 *           if invalid, or too few, or too many values
 */
protected void parseValues(String data, int lineNo, float[][] scores,
        int row, char[] alphabet) throws FileFormatException
{
  String err;
  int size = alphabet.length;

  /*
   * guard against a row with no matching symbol, which would otherwise
   * fail with an array index exception below
   */
  if (row < 0 || row >= size)
  {
    err = String.format(
            "Unexpected row %d of score data at line %d: '%s'", row + 1,
            lineNo, data);
    throw new FileFormatException(err);
  }

  StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS);
  int tokenCount = scoreLine.countTokens();

  /*
   * inspect first row to see if it includes the symbol in the first column,
   * and to see if it is lower diagonal values only (i.e. just one score);
   * both flags are assigned outright so that values left over from a
   * previous matrix cannot carry over
   */
  if (row == 0)
  {
    hasGuideColumn = data.startsWith(String.valueOf(alphabet[0]));
    isLowerDiagonalOnly = tokenCount == (hasGuideColumn ? 2 : 1);
  }

  if (hasGuideColumn)
  {
    /*
     * check 'guide' symbol is the row'th letter of the alphabet; a line
     * holding only delimiters has no tokens and is reported the same way
     */
    String symbol = scoreLine.hasMoreTokens() ? scoreLine.nextToken() : "";
    if (symbol.length() != 1 || symbol.charAt(0) != alphabet[row])
    {
      err = String
              .format("Error parsing score matrix at line %d, expected '%s' but found '%s'",
                      lineNo, alphabet[row], symbol);
      throw new FileFormatException(err);
    }
    tokenCount = scoreLine.countTokens(); // excluding guide symbol
  }

  /*
   * check the right number of values (lower diagonal or full format)
   */
  int expected = isLowerDiagonalOnly ? row + 1 : size;
  if (tokenCount != expected)
  {
    err = String.format(
            "Expected %d scores at line %d: '%s' but found %d", expected,
            lineNo, data, tokenCount);
    throw new FileFormatException(err);
  }

  /*
   * parse and set the values, setting the symmetrical value
   * as well if lower diagonal format data
   */
  scores[row] = new float[size];
  int col = 0;
  while (scoreLine.hasMoreTokens())
  {
    String value = scoreLine.nextToken();
    try
    {
      scores[row][col] = Float.parseFloat(value);
    } catch (NumberFormatException e)
    {
      err = String.format(
              "Invalid score value '%s' at line %d column %d", value,
              lineNo, col);
      throw new FileFormatException(err);
    }
    if (isLowerDiagonalOnly)
    {
      scores[col][row] = scores[row][col];
    }
    col++;
  }
}
/**
 * Parse the line in an aaindex file that looks like
 *
 * <pre>
 * M rows = ARNDCQEGHILKMFPSTWYV, cols = ARNDCQEGHILKMFPSTWYV
 * </pre>
 *
 * rejecting it if rows and cols do not match. Returns the characters of the
 * row/cols alphabet.
 * <p>
 * The line is split at commas; in each of the first two parts, the text
 * between the first and second '=' (or end of the part), with surrounding
 * whitespace removed, is taken as the alphabet.
 *
 * @param lineNo
 *          the line number, used in the error message
 * @param data
 *          the line to parse
 * @return the alphabet as an array of characters
 * @throws FileFormatException
 *           if the line does not have two comma-separated 'name = value'
 *           parts, or if the two values differ
 */
protected char[] parseAAindexRowsColumns(int lineNo, String data)
        throws FileFormatException
{
  String err = "Unexpected aaIndex score matrix data at line " + lineNo
          + ": " + data;

  /*
   * validate the shape of the line explicitly rather than relying on
   * array index exceptions to detect missing fields
   */
  String[] toks = data == null ? new String[0] : data.split(",");
  if (toks.length < 2)
  {
    throw new FileFormatException(
            err + " expected 'rows = ..., cols = ...'");
  }
  String[] rowsField = toks[0].split("=");
  String[] colsField = toks[1].split("=");
  if (rowsField.length < 2 || colsField.length < 2)
  {
    throw new FileFormatException(
            err + " expected 'rows = ..., cols = ...'");
  }

  String rowsAlphabet = rowsField[1].trim();
  String colsAlphabet = colsField[1].trim();
  if (!rowsAlphabet.equals(colsAlphabet))
  {
    throw new FileFormatException(err + " rows != cols");
  }
  return rowsAlphabet.toCharArray();
}
/**
 * Tests whether a line of an AAindex format file carries nothing this parser
 * needs. That is the case for an empty line, a comment line, and any line
 * that begins with one of the prefixes "*", "R ", "A ", "T ", "J " or "//".
 *
 * @param data
 *          the line to test
 * @return true if the line should be ignored, else false
 */
protected boolean skipAAindexLine(String data)
{
  if (data.startsWith(COMMENT_CHAR) || data.length() == 0)
  {
    return true;
  }

  String[] ignoredPrefixes = { "*", "R ", "A ", "T ", "J ", "//" };
  for (String prefix : ignoredPrefixes)
  {
    if (data.startsWith(prefix))
    {
      return true;
    }
  }
  return false;
}
/**
* Answers the name of the score matrix built by the most recent successful
* call to parseMatrix or parseAAIndexFormat on this object, or null if no
* matrix has been successfully parsed yet.
*
* @return the matrix name, or null
*/
public String getMatrixName()
{
return matrixName;
}
}