From: gmungoc Date: Mon, 13 Feb 2017 16:00:59 +0000 (+0000) Subject: JAL-1632 JAL-2416 load score matrices from file, as float[][] X-Git-Tag: Release_2_10_2~3^2~105^2~2^2~120 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=c6e8e8ccd10f21698226ae37196cd9680e6804a0;p=jalview.git JAL-1632 JAL-2416 load score matrices from file, as float[][] --- diff --git a/resources/lang/Messages.properties b/resources/lang/Messages.properties index 7d540f9..fa74c0d 100644 --- a/resources/lang/Messages.properties +++ b/resources/lang/Messages.properties @@ -168,6 +168,7 @@ label.redo_command = Redo {0} label.principal_component_analysis = Principal Component Analysis label.average_distance_identity = Average Distance Using % Identity label.neighbour_joining_identity = Neighbour Joining Using % Identity +label.choose_tree = Choose Tree Calculation label.treecalc_title = {0} Using {1} label.tree_calc_av = Average Distance label.tree_calc_nj = Neighbour Joining diff --git a/resources/lang/Messages_es.properties b/resources/lang/Messages_es.properties index d408fee..d771dd5 100644 --- a/resources/lang/Messages_es.properties +++ b/resources/lang/Messages_es.properties @@ -165,6 +165,7 @@ label.redo_command = Rehacer {0} label.principal_component_analysis = Análisis del Componente Principal label.average_distance_identity = Distancia Media Usando % de Identidad label.neighbour_joining_identity = Unir vecinos utilizando % de Identidad +label.choose_tree = Elegir el cálculo del árbol label.treecalc_title = {0} utilizando {1} label.tree_calc_av = Distancia media label.tree_calc_nj = Unir vecinos diff --git a/resources/scoreModel/blosum62.scm b/resources/scoreModel/blosum62.scm new file mode 100644 index 0000000..3df8833 --- /dev/null +++ b/resources/scoreModel/blosum62.scm @@ -0,0 +1,42 @@ +ScoreMatrix BLOSUM62 +ARNDCQEGHILKMFPSTWYVBZX * +# +# The BLOSUM62 substitution matrix, as at https://www.ncbi.nlm.nih.gov/Class/FieldGuide/BLOSUM62.txt +# The first line declares a ScoreMatrix with the name BLOSUM62 (shown in menus) +# The second line gives the symbols for which scores are held in the matrix +# These may include a space (but not as the first or last character) +# Scores are not case sensitive, unless column(s) are provided for lower case characters +# +# +# Comment line with symbols is provided as a guide +# Values may be integer or floating point, delimited by tab, space, comma or combinations +# +# A R N D C Q E G H I L K M F P S T W Y V B Z X * +# + 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4 -4 + -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4 -4 + -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4 -4 + -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4 -4 + 0 3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4 -4 + -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4 -4 + -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 -4 + 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4 -4 + -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4 -4 + -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4 -4 + -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4 -4 + -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4 -4 + -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4 -4 + -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4 -4 + -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4 -4 + 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4 -4 + 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4 -4 + -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4 -4 + -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4 -4 + 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4 -4 + -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4 -4 + -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 -4 + 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4 -4 + -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 1 + -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 1 +# +# A R N D C Q E G H I L K M F P S T W Y V B Z X * diff --git a/resources/scoreModel/dna.scm b/resources/scoreModel/dna.scm new file mode 100644 index 0000000..b1e0621 --- /dev/null +++ b/resources/scoreModel/dna.scm @@ -0,0 +1,36 @@ +ScoreMatrix DNA +ACGTUIXRYN - +# +# A DNA substitution matrix. +# This is an ad-hoc matrix which, in addition to penalising mutations between the common +# nucleotides (ACGT), includes T/U equivalence in order to allow both DNA and/or RNA. +# In addition, it encodes weak equivalence between R and Y with AG and CTU, respectively, +# and N is allowed to match any other base weakly. +# This matrix also includes I (Inosine) and X (Xanthine), but encodes them to weakly match +# any of (ACGTU), and unfavourably match each other. +# +# The first line declares a ScoreMatrix with the name DNA (shown in menus) +# The second line gives the symbols for which scores are held in the matrix +# These may include a space (but not as the first or last character) +# Scores are not case sensitive, unless column(s) are provided for lower case characters +# +# +# Comment line with symbols is provided as a guide +# Values may be integer or floating point, delimited by tab, space, comma or combinations +# +# A C G T U I X R Y N - +# + 10 -8 -8 -8 -8 1 1 1 -8 1 1 1 + -8 10 -8 -8 -8 1 1 -8 1 1 1 1 + -8 -8 10 -8 -8 1 1 1 -8 1 1 1 + -8 -8 -8 10 10 1 1 -8 1 1 1 1 + -8 -8 -8 10 10 1 1 -8 1 1 1 1 + 1 1 1 1 1 10 0 0 0 1 1 1 + 1 1 1 1 1 0 10 0 0 1 1 1 + 1 -8 1 -8 -8 0 0 10 -8 1 1 1 + -8 1 -8 1 1 0 0 -8 10 1 1 1 + 1 1 1 1 1 1 1 1 1 10 1 1 + 1 1 1 1 1 1 1 1 1 1 1 1 + 1 1 1 1 1 1 1 1 1 1 1 1 +# +# A C G T U I X R Y N - diff --git a/resources/scoreModel/pam250.scm b/resources/scoreModel/pam250.scm new file mode 100644 index 0000000..b57485e --- /dev/null +++ b/resources/scoreModel/pam250.scm @@ -0,0 +1,42 @@ +ScoreMatrix PAM250 +ARNDCQEGHILKMFPSTWYVBZX * +# +# The PAM250 substitution matrix +# The first line declares a ScoreMatrix with the name PAM250 (shown in menus) +# The second line gives the symbols for which scores are held in the matrix +# These may include a space (but not as the first or last character) +# Scores are not case sensitive, unless column(s) are provided for lower case characters +# +# +# Comment line with symbols is provided as a guide +# Values may be integer or floating point, delimited by tab, space, comma or combinations +# +# A R N D C Q E G H I L K M F P S T W Y V B Z X * +# + 2 -2 0 0 -2 0 0 1 -1 -1 -2 -1 -1 -3 1 1 1 -6 -3 0 0 0 0 -8 -8 + -2 6 0 -1 -4 1 -1 -3 2 -2 -3 3 0 -4 0 0 -1 2 -4 -2 -1 0 -1 -8 -8 + 0 0 2 2 -4 1 1 0 2 -2 -3 1 -2 -3 0 1 0 -4 -2 -2 2 1 0 -8 -8 + 0 -1 2 4 -5 2 3 1 1 -2 -4 0 -3 -6 -1 0 0 -7 -4 -2 3 3 -1 -8 -8 + -2 -4 -4 -5 12 -5 -5 -3 -3 -2 -6 -5 -5 -4 -3 0 -2 -8 0 -2 -4 -5 -3 -8 -8 + 0 1 1 2 -5 4 2 -1 3 -2 -2 1 -1 -5 0 -1 -1 -5 -4 -2 1 3 -1 -8 -8 + 0 -1 1 3 -5 2 4 0 1 -2 -3 0 -2 -5 -1 0 0 -7 -4 -2 3 3 -1 -8 -8 + 1 -3 0 1 -3 -1 0 5 -2 -3 -4 -2 -3 -5 0 1 0 -7 -5 -1 0 0 -1 -8 -8 + -1 2 2 1 -3 3 1 -2 6 -2 -2 0 -2 -2 0 -1 -1 -3 0 -2 1 2 -1 -8 -8 + -1 -2 -2 -2 -2 -2 -2 -3 -2 5 2 -2 2 1 -2 -1 0 -5 -1 4 -2 -2 -1 -8 -8 + -2 -3 -3 -4 -6 -2 -3 -4 -2 2 6 -3 4 2 -3 -3 -2 -2 -1 2 -3 -3 -1 -8 -8 + -1 3 1 0 -5 1 0 -2 0 -2 -3 5 0 -5 -1 0 0 -3 -4 -2 1 0 -1 -8 -8 + -1 0 -2 -3 -5 -1 -2 -3 -2 2 4 0 6 0 -2 -2 -1 -4 -2 2 -2 -2 -1 -8 -8 + -3 -4 -3 -6 -4 -5 -5 -5 -2 1 2 -5 0 9 -5 -3 -3 0 7 -1 -4 -5 -2 -8 -8 + 1 0 0 -1 -3 0 -1 0 0 -2 -3 -1 -2 -5 6 1 0 -6 -5 -1 -1 0 -1 -8 -8 + 1 0 1 0 0 -1 0 1 -1 -1 -3 0 -2 -3 1 2 1 -2 -3 -1 0 0 0 -8 -8 + 1 -1 0 0 -2 -1 0 0 -1 0 -2 0 -1 -3 0 1 3 -5 -3 0 0 -1 0 -8 -8 + -6 2 -4 -7 -8 -5 -7 -7 -3 -5 -2 -3 -4 0 -6 -2 -5 17 0 -6 -5 -6 -4 -8 -8 + -3 -4 -2 -4 0 -4 -4 -5 0 -1 -1 -4 -2 7 -5 -3 -3 0 10 -2 -3 -4 -2 -8 -8 + 0 -2 -2 -2 -2 -2 -2 -1 -2 4 2 -2 2 -1 -1 -1 0 -6 -2 4 -2 -2 -1 -8 -8 + 0 -1 2 3 -4 1 3 0 1 -2 -3 1 -2 -4 -1 0 0 -5 -3 -2 3 2 -1 -8 -8 + 0 0 1 3 -5 3 3 0 2 -2 -3 0 -2 -5 0 0 -1 -6 -4 -2 2 3 -1 -8 -8 + 0 -1 0 -1 -3 -1 -1 -1 -1 -1 -1 -1 -1 -2 -1 0 0 -4 -2 -1 -1 -1 -1 -8 -8 + -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 1 1 + -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 -8 1 1 +# +# A R N D C Q E G H I L K M F P S T W Y V B Z X * diff --git a/src/MCview/AppletPDBCanvas.java b/src/MCview/AppletPDBCanvas.java index aac796c..1ab11b8 100644 --- a/src/MCview/AppletPDBCanvas.java +++ b/src/MCview/AppletPDBCanvas.java @@ -177,7 +177,7 @@ public class AppletPDBCanvas extends Panel implements MouseListener, colourBySequence(); - int max = -10; + float max = -10; int maxchain = -1; int pdbstart = 0; int pdbend = 0; diff --git a/src/MCview/PDBCanvas.java b/src/MCview/PDBCanvas.java index 292de91..5509056 100644 --- a/src/MCview/PDBCanvas.java +++ b/src/MCview/PDBCanvas.java @@ -176,7 +176,7 @@ public class PDBCanvas extends JPanel implements MouseListener, colourBySequence(); - int max = -10; + float max = -10; int maxchain = -1; int pdbstart = 0; int pdbend = 0; diff --git a/src/jalview/analysis/AlignSeq.java b/src/jalview/analysis/AlignSeq.java index 061a0a1..fe04156 100755 --- a/src/jalview/analysis/AlignSeq.java +++ b/src/jalview/analysis/AlignSeq.java @@ -59,11 +59,11 @@ public class AlignSeq static String[] pep = { "A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V", "B", "Z", "X", "-" }; - int[][] score; + float[][] score; - int[][] E; + float[][] E; - int[][] F; + float[][] F; int[][] traceback; @@ -106,7 +106,7 @@ public class AlignSeq int count; /** DOCUMENT ME!! */ - public int maxscore; + public float maxscore; float pid; @@ -116,7 +116,7 @@ public class AlignSeq int gapExtend = 20; - int[][] lookup = ResidueProperties.getBLOSUM62(); + float[][] lookup = ResidueProperties.getBLOSUM62(); String[] intToStr = pep; @@ -165,7 +165,7 @@ public class AlignSeq * * @return DOCUMENT ME! */ - public int getMaxScore() + public float getMaxScore() { return maxscore; } @@ -381,13 +381,13 @@ public class AlignSeq seq2 = new int[s2str.length()]; // System.out.println("seq2 " + rt.freeMemory() + " " + rt.totalMemory()); - score = new int[s1str.length()][s2str.length()]; + score = new float[s1str.length()][s2str.length()]; // System.out.println("score " + rt.freeMemory() + " " + rt.totalMemory()); - E = new int[s1str.length()][s2str.length()]; + E = new float[s1str.length()][s2str.length()]; // System.out.println("E " + rt.freeMemory() + " " + rt.totalMemory()); - F = new int[s1str.length()][s2str.length()]; + F = new float[s1str.length()][s2str.length()]; traceback = new int[s1str.length()][s2str.length()]; // System.out.println("F " + rt.freeMemory() + " " + rt.totalMemory()); @@ -460,7 +460,7 @@ public class AlignSeq public void traceAlignment() { // Find the maximum score along the rhs or bottom row - int max = -9999; + float max = -9999; for (int i = 0; i < seq1.length; i++) { @@ -728,7 +728,7 @@ public class AlignSeq public int findTrace(int i, int j) { int t = 0; - int max = score[i - 1][j - 1] + (lookup[seq1[i]][seq2[j]] * 10); + float max = score[i - 1][j - 1] + (lookup[seq1[i]][seq2[j]] * 10); if (F[i][j] > max) { @@ -843,27 +843,27 @@ public class AlignSeq /** * DOCUMENT ME! * - * @param i1 + * @param f1 * DOCUMENT ME! - * @param i2 + * @param f2 * DOCUMENT ME! - * @param i3 + * @param f3 * DOCUMENT ME! * * @return DOCUMENT ME! */ - public int max(int i1, int i2, int i3) + public float max(float f1, float f2, float f3) { - int max = i1; + float max = f1; - if (i2 > i1) + if (f2 > f1) { - max = i2; + max = f2; } - if (i3 > max) + if (f3 > max) { - max = i3; + max = f3; } return max; @@ -872,20 +872,20 @@ public class AlignSeq /** * DOCUMENT ME! * - * @param i1 + * @param f1 * DOCUMENT ME! - * @param i2 + * @param f2 * DOCUMENT ME! * * @return DOCUMENT ME! */ - public int max(int i1, int i2) + public float max(float f1, float f2) { - int max = i1; + float max = f1; - if (i2 > i1) + if (f2 > f1) { - max = i2; + max = f2; } return max; @@ -1113,7 +1113,7 @@ public class AlignSeq { SequenceI bestm = null; AlignSeq bestaseq = null; - int bestscore = 0; + float bestscore = 0; for (SequenceI msq : al.getSequences()) { AlignSeq aseq = doGlobalNWAlignment(msq, sq, dnaOrProtein); diff --git a/src/jalview/analysis/Conservation.java b/src/jalview/analysis/Conservation.java index 565924b..8f7e57b 100755 --- a/src/jalview/analysis/Conservation.java +++ b/src/jalview/analysis/Conservation.java @@ -602,7 +602,7 @@ public class Conservation quality = new Vector(); double max = -10000; - int[][] BLOSUM62 = ResidueProperties.getBLOSUM62(); + float[][] BLOSUM62 = ResidueProperties.getBLOSUM62(); // Loop over columns // JBPNote Profiling info // long ts = System.currentTimeMillis(); diff --git a/src/jalview/analysis/NJTree.java b/src/jalview/analysis/NJTree.java index e0e50fb..fcf208c 100644 --- a/src/jalview/analysis/NJTree.java +++ b/src/jalview/analysis/NJTree.java @@ -20,6 +20,7 @@ */ package jalview.analysis; +import jalview.analysis.scoremodels.ScoreModels; import jalview.api.analysis.ScoreModelI; import jalview.datamodel.AlignmentView; import jalview.datamodel.BinaryNode; @@ -30,7 +31,6 @@ import jalview.datamodel.Sequence; import jalview.datamodel.SequenceI; import jalview.datamodel.SequenceNode; import jalview.io.NewickFile; -import jalview.schemes.ResidueProperties; import java.util.Enumeration; import java.util.List; @@ -244,7 +244,7 @@ public class NJTree if (sm == null && !(pwtype.equals("PID"))) { - if (ResidueProperties.getScoreMatrix(pwtype) == null) + if (ScoreModels.getInstance().forName(pwtype) == null) { pwtype = "BLOSUM62"; } @@ -734,10 +734,10 @@ public class NJTree if (_pwmatrix == null) { // Resolve substitution model - _pwmatrix = ResidueProperties.getScoreModel(pwtype); + _pwmatrix = ScoreModels.getInstance().forName(pwtype); if (_pwmatrix == null) { - _pwmatrix = ResidueProperties.getScoreMatrix("BLOSUM62"); + _pwmatrix = ScoreModels.getInstance().forName("BLOSUM62"); } } dist = _pwmatrix.findDistances(seqData); diff --git a/src/jalview/analysis/PCA.java b/src/jalview/analysis/PCA.java index 41dbcc0..87ec922 100755 --- a/src/jalview/analysis/PCA.java +++ b/src/jalview/analysis/PCA.java @@ -21,10 +21,10 @@ package jalview.analysis; import jalview.analysis.scoremodels.ScoreMatrix; +import jalview.analysis.scoremodels.ScoreModels; import jalview.datamodel.BinarySequence; import jalview.datamodel.BinarySequence.InvalidSequenceTypeException; import jalview.math.Matrix; -import jalview.schemes.ResidueProperties; import java.io.PrintStream; @@ -94,14 +94,14 @@ public class PCA implements Runnable String sm = s_m; if (sm != null) { - smtrx = ResidueProperties.getScoreMatrix(sm); + smtrx = (ScoreMatrix) ScoreModels.getInstance().forName(sm); } if (smtrx == null) { // either we were given a non-existent score matrix or a scoremodel that // isn't based on a pairwise symbol score matrix - smtrx = ResidueProperties.getScoreMatrix(sm = (nucleotides ? "DNA" - : "BLOSUM62")); + smtrx = (ScoreMatrix) ScoreModels.getInstance().forName( + sm = (nucleotides ? "DNA" : "BLOSUM62")); } details.append("PCA calculation using " + sm + " sequence similarity matrix\n========\n\n"); diff --git a/src/jalview/analysis/scoremodels/PairwiseSeqScoreModel.java b/src/jalview/analysis/scoremodels/PairwiseSeqScoreModel.java index 2ff2518..f980d8e 100644 --- a/src/jalview/analysis/scoremodels/PairwiseSeqScoreModel.java +++ b/src/jalview/analysis/scoremodels/PairwiseSeqScoreModel.java @@ -26,8 +26,9 @@ import jalview.util.Comparison; public abstract class PairwiseSeqScoreModel implements ScoreModelI { - abstract public int getPairwiseScore(char c, char d); + abstract public float getPairwiseScore(char c, char d); + @Override public float[][] findDistances(AlignmentView seqData) { String[] sequenceString = seqData @@ -35,13 +36,13 @@ public abstract class PairwiseSeqScoreModel implements ScoreModelI int noseqs = sequenceString.length; float[][] distance = new float[noseqs][noseqs]; - int maxscore = 0; + float maxscore = 0; int end = sequenceString[0].length(); for (int i = 0; i < (noseqs - 1); i++) { for (int j = i; j < noseqs; j++) { - int score = 0; + float score = 0; for (int k = 0; k < end; k++) { @@ -56,7 +57,7 @@ public abstract class PairwiseSeqScoreModel implements ScoreModelI } } - distance[i][j] = (float) score; + distance[i][j] = score; if (score > maxscore) { @@ -69,12 +70,12 @@ public abstract class PairwiseSeqScoreModel implements ScoreModelI { for (int j = i; j < noseqs; j++) { - distance[i][j] = (float) maxscore - distance[i][j]; + distance[i][j] = maxscore - distance[i][j]; distance[j][i] = distance[i][j]; } } return distance; } - abstract public int[][] getMatrix(); + abstract public float[][] getMatrix(); } diff --git a/src/jalview/analysis/scoremodels/ScoreMatrix.java b/src/jalview/analysis/scoremodels/ScoreMatrix.java index 41aef82..f0115bf 100644 --- a/src/jalview/analysis/scoremodels/ScoreMatrix.java +++ b/src/jalview/analysis/scoremodels/ScoreMatrix.java @@ -21,93 +21,169 @@ package jalview.analysis.scoremodels; import jalview.api.analysis.ScoreModelI; -import jalview.schemes.ResidueProperties; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.Arrays; +import java.util.StringTokenizer; public class ScoreMatrix extends PairwiseSeqScoreModel implements ScoreModelI { - String name; + public static final short UNMAPPED = (short) -1; - @Override - public String getName() - { - return name; - } + private static final String DELIMITERS = " ,\t"; - /** - * reference to integer score matrix + private static final String COMMENT_CHAR = "#"; + + private static final String BAD_ASCII_ERROR = "Unexpected character %s in getPairwiseScore"; + + private static final int MAX_ASCII = 127; + + /* + * the name of the model as shown in menus */ - int[][] matrix; + private String name; - /** - * 0 for Protein Score matrix. 1 for dna score matrix + /* + * the characters that the model provides scores for + */ + private char[] symbols; + + /* + * the score matrix; both dimensions must equal the number of symbols + * matrix[i][j] is the substitution score for replacing symbols[i] with symbols[j] + */ + private float[][] matrix; + + /* + * quick lookup to convert from an ascii character value to the index + * of the corresponding symbol in the score matrix */ - int type; + private short[] symbolIndex; + + /* + * true for Protein Score matrix, false for dna score matrix + */ + private boolean peptide; /** + * Constructor * * @param name * Unique, human readable name for the matrix + * @param alphabet + * the symbols to which scores apply * @param matrix - * Pairwise scores indexed according to appropriate symbol alphabet - * @param type - * 0 for Protein, 1 for NA + * Pairwise scores indexed according to the symbol alphabet */ - public ScoreMatrix(String name, int[][] matrix, int type) + public ScoreMatrix(String name, char[] alphabet, float[][] matrix) { this.matrix = matrix; - this.type = type; this.name = name; + this.symbols = alphabet; + + symbolIndex = buildSymbolIndex(alphabet); + + /* + * crude heuristic for now... + */ + peptide = alphabet.length >= 20; + } + + /** + * Returns an array A where A[i] is the position in the alphabet array of the + * character whose value is i. For example if the alphabet is { 'A', 'D', 'X' + * } then A['D'] = A[68] = 1. + *

+ * Unmapped characters (not in the alphabet) get an index of -1. + *

+ * Mappings are added automatically for lower case symbols (for non case + * sensitive scoring), unless they are explicitly present in the alphabet (are + * scored separately in the score matrix). + * + * @param alphabet + * @return + */ + static short[] buildSymbolIndex(char[] alphabet) + { + short[] index = new short[MAX_ASCII + 1]; + Arrays.fill(index, UNMAPPED); + short pos = 0; + for (char c : alphabet) + { + if (c <= MAX_ASCII) + { + index[c] = pos; + } + + /* + * also map lower-case character (unless separately mapped) + */ + if (c >= 'A' && c <= 'Z') + { + short lowerCase = (short) (c + ('a' - 'A')); + if (index[lowerCase] == UNMAPPED) + { + index[lowerCase] = pos; + } + } + pos++; + } + return index; + } + + @Override + public String getName() + { + return name; } @Override public boolean isDNA() { - return type == 1; + return !peptide; } @Override public boolean isProtein() { - return type == 0; + return peptide; } @Override - public int[][] getMatrix() + public float[][] getMatrix() { return matrix; } /** - * - * @param A1 - * @param A2 - * @return score for substituting first char in A1 with first char in A2 + * Returns the pairwise score for substituting c with d, or zero if c or d is + * an unscored or unexpected character */ - public int getPairwiseScore(String A1, String A2) - { - return getPairwiseScore(A1.charAt(0), A2.charAt(0)); - } - @Override - public int getPairwiseScore(char c, char d) + public float getPairwiseScore(char c, char d) { - int pog = 0; - - try + if (c > MAX_ASCII) { - int a = (type == 0) ? ResidueProperties.aaIndex[c] - : ResidueProperties.nucleotideIndex[c]; - int b = (type == 0) ? ResidueProperties.aaIndex[d] - : ResidueProperties.nucleotideIndex[d]; - - pog = matrix[a][b]; - } catch (Exception e) + System.err.println(String.format(BAD_ASCII_ERROR, c)); + return 0; + } + if (d > MAX_ASCII) { - // System.out.println("Unknown residue in " + A1 + " " + A2); + System.err.println(String.format(BAD_ASCII_ERROR, d)); + return 0; } - return pog; + int cIndex = symbolIndex[c]; + int dIndex = symbolIndex[d]; + if (cIndex != UNMAPPED && dIndex != UNMAPPED) + { + return matrix[cIndex][dIndex]; + } + return 0; } /** @@ -119,57 +195,194 @@ public class ScoreMatrix extends PairwiseSeqScoreModel implements return outputMatrix(false); } + /** + * Print the score matrix, optionally formatted as html, with the alphabet symbols as column headings and at the start of each row + * @param html + * @return + */ public String outputMatrix(boolean html) { - StringBuffer sb = new StringBuffer(); - int[] symbols = (type == 0) ? ResidueProperties.aaIndex - : ResidueProperties.nucleotideIndex; - int symMax = (type == 0) ? ResidueProperties.maxProteinIndex - : ResidueProperties.maxNucleotideIndex; - boolean header = true; + StringBuilder sb = new StringBuilder(512); + + /* + * heading row with alphabet + */ if (html) { sb.append(""); + sb.append(html ? "" : ""); + } + for (char sym : symbols) + { + if (html) + { + sb.append(""); + } + else + { + sb.append("\t").append(sym); + } + } + sb.append(html ? "\n" : "\n"); + + /* + * table of scores + */ + for (char c1 : symbols) + { + if (html) + { + sb.append("" : ""); + for (char c2 : symbols) + { + sb.append(html ? "" : ""); + } + sb.append(html ? "\n" : "\n"); } - for (char sym = 'A'; sym <= 'Z'; sym++) + if (html) + { + sb.append("
 ").append(sym).append(" 
"); + } + sb.append(c1).append(html ? "" : "\t") + .append(matrix[symbolIndex[c1]][symbolIndex[c2]]) + .append(html ? "
"); + } + return sb.toString(); + } + + /** + * Parse a score matrix from the given input stream and returns a ScoreMatrix + * object. If parsing fails, error messages are written to syserr and null is + * returned. It is the caller's responsibility to close the input stream. + * + * @param is + * @return + */ + public static ScoreMatrix parse(InputStream is) + { + ScoreMatrix sm = null; + BufferedReader br = new BufferedReader(new InputStreamReader(is)); + int lineNo = 0; + String name = null; + String alphabet = null; + float[][] scores = null; + int size = 0; + int row = 0; + + try { - if (symbols[sym] >= 0 && symbols[sym] < symMax) + String data; + + while ((data = br.readLine()) != null) { - if (header) + lineNo++; + data = data.trim(); + if (data.startsWith(COMMENT_CHAR)) + { + continue; + } + if (data.toLowerCase().startsWith("scorematrix")) { - sb.append(html ? "" : ""); - for (char sym2 = 'A'; sym2 <= 'Z'; sym2++) + /* + * Parse name from ScoreMatrix + */ + if (name != null) + { + System.err + .println("Warning: 'ScoreMatrix' repeated in file at line " + + lineNo); + } + StringTokenizer nameLine = new StringTokenizer(data, DELIMITERS); + if (nameLine.countTokens() != 2) { - if (symbols[sym2] >= 0 && symbols[sym2] < symMax) - { - sb.append((html ? " " : "\t") + sym2 - + (html ? " " : "")); - } + System.err + .println("Format error: expected 'ScoreMatrix ', found '" + + data + "' at line " + lineNo); + return null; } - header = false; - sb.append(html ? "\n" : "\n"); + nameLine.nextToken(); + name = nameLine.nextToken(); + continue; } - if (html) + else if (name == null) { - sb.append(""); + System.err + .println("Format error: 'ScoreMatrix ' should be the first non-comment line"); + return null; } - sb.append((html ? "" : "") + sym + (html ? "" : "")); - for (char sym2 = 'A'; sym2 <= 'Z'; sym2++) + + /* + * next line after ScoreMatrix should be the alphabet of scored symbols + */ + if (alphabet == null) + { + alphabet = data; + size = alphabet.length(); + scores = new float[size][]; + continue; + } + + /* + * too much information? + */ + if (row >= size && data.length() > 0) { + System.err + .println("Unexpected extra input line in score model file " + + data); + return null; + } + + /* + * subsequent lines should be the symbol scores + */ + StringTokenizer scoreLine = new StringTokenizer(data, DELIMITERS); + if (scoreLine.countTokens() != size) { - if (symbols[sym2] >= 0 && symbols[sym2] < symMax) + System.err.println(String.format( + "Expected %d tokens at line %d but found %d", size, + lineNo, scoreLine.countTokens())); + return null; + } + scores[row] = new float[size]; + int col = 0; + String value = null; + while (scoreLine.hasMoreTokens()) { + try { + value = scoreLine.nextToken(); + scores[row][col] = Float.valueOf(value); + col++; + } catch (NumberFormatException e) { - sb.append((html ? "" : "\t") - + matrix[symbols[sym]][symbols[sym2]] - + (html ? "" : "")); + System.err.println(String.format( + "Invalid score value %s at line %d column %d", value, + lineNo, col)); + return null; } } - sb.append(html ? "\n" : "\n"); + row++; } + } catch (IOException e) + { + System.err.println("Error reading score matrix file: " + + e.getMessage() + " at line " + lineNo); } - if (html) + + /* + * out of data - check we found enough + */ + if (row < size) { - sb.append(""); + System.err + .println(String + .format("Expected %d rows of score data in score matrix but only found %d", + size, row)); + return null; } - return sb.toString(); + + /* + * If we get here, then name, alphabet and scores have been parsed successfully + */ + sm = new ScoreMatrix(name, alphabet.toCharArray(), scores); + return sm; } } diff --git a/src/jalview/analysis/scoremodels/ScoreModels.java b/src/jalview/analysis/scoremodels/ScoreModels.java index 4fa6396..f1990c0 100644 --- a/src/jalview/analysis/scoremodels/ScoreModels.java +++ b/src/jalview/analysis/scoremodels/ScoreModels.java @@ -1,8 +1,10 @@ package jalview.analysis.scoremodels; import jalview.api.analysis.ScoreModelI; -import jalview.schemes.ResidueProperties; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; import java.util.Map; import java.util.TreeMap; @@ -37,18 +39,65 @@ public class ScoreModels * using TreeMap keeps models ordered alphabetically by name */ models = new TreeMap(String.CASE_INSENSITIVE_ORDER); - registerScoreModel(new ScoreMatrix("BLOSUM62", - ResidueProperties.BLOSUM62, 0)); - registerScoreModel(new ScoreMatrix("PAM250", ResidueProperties.PAM250, - 0)); - registerScoreModel(new ScoreMatrix("DNA", ResidueProperties.DNA, 1)); + loadScoreMatrix("/scoreModel/blosum62.scm"); + loadScoreMatrix("/scoreModel/pam250.scm"); + loadScoreMatrix("/scoreModel/dna.scm"); registerScoreModel(new FeatureScoreModel()); registerScoreModel(new PIDScoreModel()); } - public Iterable getModelNames() + /** + * Try to load a score matrix from the given resource file, and if successful, + * register it. Answers true if successful, else false. Any errors are + * reported on syserr but not thrown. + * + * @param string + */ + boolean loadScoreMatrix(String resourcePath) + { + URL url = this.getClass().getResource(resourcePath); + if (url == null) + { + System.err.println("Failed to locate " + resourcePath); + return false; + } + boolean success = false; + InputStream is = null; + try + { + is = url.openStream(); + ScoreMatrix sm = ScoreMatrix.parse(is); + if (sm != null) + { + registerScoreModel(sm); + success = true; + } + } catch (IOException e) + { + } finally + { + if (is != null) + { + try + { + is.close(); + } catch (IOException e) + { + } + } + } + return success; + } + + /** + * Answers an iterable set of the registered score models. Currently these are + * ordered by name (not case sensitive). + * + * @return + */ + public Iterable getModels() { - return models.keySet(); + return models.values(); } public ScoreModelI forName(String s) diff --git a/src/jalview/datamodel/BinarySequence.java b/src/jalview/datamodel/BinarySequence.java index a959cc7..90b8be7 100755 --- a/src/jalview/datamodel/BinarySequence.java +++ b/src/jalview/datamodel/BinarySequence.java @@ -132,7 +132,7 @@ public class BinarySequence extends Sequence : ResidueProperties.aaIndex, matrix.getMatrix()); } - private void matrixEncode(final int[] aaIndex, final int[][] matrix) + private void matrixEncode(final int[] aaIndex, final float[][] matrix) { // Set all matrix to 0 // dbinary = new double[getSequence().length * 21]; diff --git a/src/jalview/gui/PCAPanel.java b/src/jalview/gui/PCAPanel.java index 58ed008..f96d464 100644 --- a/src/jalview/gui/PCAPanel.java +++ b/src/jalview/gui/PCAPanel.java @@ -20,6 +20,9 @@ */ package jalview.gui; +import jalview.analysis.scoremodels.ScoreMatrix; +import jalview.analysis.scoremodels.ScoreModels; +import jalview.api.analysis.ScoreModelI; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.AlignmentView; @@ -27,7 +30,6 @@ import jalview.datamodel.ColumnSelection; import jalview.datamodel.SeqCigar; import jalview.datamodel.SequenceI; import jalview.jbgui.GPCAPanel; -import jalview.schemes.ResidueProperties; import jalview.util.MessageManager; import jalview.viewmodel.AlignmentViewport; import jalview.viewmodel.PCAModel; @@ -155,30 +157,28 @@ public class PCAPanel extends GPCAPanel implements Runnable, protected void scoreMatrix_menuSelected() { scoreMatrixMenu.removeAll(); - for (final String sm : ResidueProperties.scoreMatrices.keySet()) + for (ScoreModelI sm : ScoreModels.getInstance().getModels()) { - if (ResidueProperties.getScoreMatrix(sm) != null) + if (sm instanceof ScoreMatrix) { + final String name = sm.getName(); // create an entry for this score matrix for use in PCA JCheckBoxMenuItem jm = new JCheckBoxMenuItem(); jm.setText(MessageManager.getStringOrReturn("label.score_model_", - sm)); - jm.setSelected(pcaModel.getScore_matrix().equals(sm)); - if ((ResidueProperties.scoreMatrices.get(sm).isDNA() && ResidueProperties.scoreMatrices - .get(sm).isProtein()) - || pcaModel.isNucleotide() == ResidueProperties.scoreMatrices - .get(sm).isDNA()) + name)); + jm.setSelected(pcaModel.getScore_matrix().equals(name)); + if ((!pcaModel.isNucleotide() && !sm.isDNA()) + || (pcaModel.isNucleotide() && sm.isDNA())) { - final PCAPanel us = this; jm.addActionListener(new ActionListener() { @Override public void actionPerformed(ActionEvent e) { - if (!pcaModel.getScore_matrix().equals(sm)) + if (!pcaModel.getScore_matrix().equals(name)) { - pcaModel.setScore_matrix(sm); - Thread worker = new Thread(us); + pcaModel.setScore_matrix(name); + Thread worker = new Thread(PCAPanel.this); worker.start(); } } diff --git a/src/jalview/gui/TreeChooser.java b/src/jalview/gui/TreeChooser.java index 338fbb8..40c683d 100644 --- a/src/jalview/gui/TreeChooser.java +++ b/src/jalview/gui/TreeChooser.java @@ -85,12 +85,10 @@ public class TreeChooser extends JPanel matrixNames = new JComboBox(); ScoreModels scoreModels = ScoreModels.getInstance(); - for (String scoreType : scoreModels.getModelNames()) + for (ScoreModelI sm : scoreModels.getModels()) { - ScoreModelI sm = scoreModels.forName(scoreType); - if (sm.isDNA() == af.getViewport().getAlignment().isNucleotide() - || sm.isProtein() == !af.getViewport().getAlignment() - .isNucleotide()) + boolean nucleotide = af.getViewport().getAlignment().isNucleotide(); + if (sm.isDNA() && nucleotide || sm.isProtein() && !nucleotide) { matrixNames.addItem(sm.getName()); } diff --git a/src/jalview/gui/TreePanel.java b/src/jalview/gui/TreePanel.java index 25f4c1b..7a205b6 100755 --- a/src/jalview/gui/TreePanel.java +++ b/src/jalview/gui/TreePanel.java @@ -22,6 +22,7 @@ package jalview.gui; import jalview.analysis.AlignmentSorter; import jalview.analysis.NJTree; +import jalview.analysis.scoremodels.ScoreModels; import jalview.api.analysis.ScoreModelI; import jalview.api.analysis.ViewBasedAnalysisI; import jalview.bin.Cache; @@ -41,7 +42,6 @@ import jalview.io.JalviewFileChooser; import jalview.io.JalviewFileView; import jalview.io.NewickFile; import jalview.jbgui.GTreePanel; -import jalview.schemes.ResidueProperties; import jalview.util.ImageMaker; import jalview.util.MessageManager; import jalview.viewmodel.AlignmentViewport; @@ -321,7 +321,7 @@ public class TreePanel extends GTreePanel seqs = av.getSelectionGroup().getSequencesInOrder( av.getAlignment()); } - ScoreModelI sm = ResidueProperties.getScoreModel(pwtype); + ScoreModelI sm = ScoreModels.getInstance().forName(pwtype); if (sm instanceof ViewBasedAnalysisI) { try diff --git a/src/jalview/schemes/Blosum62ColourScheme.java b/src/jalview/schemes/Blosum62ColourScheme.java index f35b886..c03c40b 100755 --- a/src/jalview/schemes/Blosum62ColourScheme.java +++ b/src/jalview/schemes/Blosum62ColourScheme.java @@ -75,14 +75,14 @@ public class Blosum62ColourScheme extends ResidueColourScheme } else { - int c = 0; + float score = 0; for (char consensus : consensusResidue.toCharArray()) { - c += ResidueProperties.getBLOSUM62(consensus, res); + score += ResidueProperties.getBLOSUM62(consensus, res); } - if (c > 0) + if (score > 0) { colour = LIGHT_BLUE; } diff --git a/src/jalview/schemes/ResidueProperties.java b/src/jalview/schemes/ResidueProperties.java index c774ebf..b4b5452 100755 --- a/src/jalview/schemes/ResidueProperties.java +++ b/src/jalview/schemes/ResidueProperties.java @@ -478,7 +478,7 @@ public class ResidueProperties // public static final double hydmax = 1.38; // public static final double hydmin = -2.53; - public static final int[][] BLOSUM62 = { + public static final float[][] BLOSUM62 = { { 4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0, -2, -1, 0, -4 }, { -1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, @@ -528,7 +528,7 @@ public class ResidueProperties { -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1 }, }; - public static final int[][] PAM250 = { + public static final float[][] PAM250 = { { 2, -2, 0, 0, -2, 0, 0, 1, -1, -1, -2, -1, -1, -3, 1, 1, 1, -6, -3, 0, 0, 0, 0, -8 }, { -2, 6, 0, -1, -4, 1, -1, -3, 2, -2, -3, 3, 0, -4, 0, 0, -1, 2, -4, @@ -604,7 +604,7 @@ public class ResidueProperties // treats T and U identically. R and Y weak equivalence with AG and CTU. // N matches any other base weakly // - public static final int[][] DNA = { + public static final float[][] DNA = { { 10, -8, -8, -8, -8, 1, 1, 1, -8, 1, 1 }, // A { -8, 10, -8, -8, -8, 1, 1, -8, 1, 1, 1 }, // C { -8, -8, 10, -8, -8, 1, 1, 1, -8, 1, 1 }, // G @@ -622,9 +622,9 @@ public class ResidueProperties */ static { - scoreMatrices.put("BLOSUM62", new ScoreMatrix("BLOSUM62", BLOSUM62, 0)); - scoreMatrices.put("PAM250", new ScoreMatrix("PAM250", PAM250, 0)); - scoreMatrices.put("DNA", new ScoreMatrix("DNA", DNA, 1)); + // scoreMatrices.put("BLOSUM62", new ScoreMatrix("BLOSUM62", BLOSUM62)); + // scoreMatrices.put("PAM250", new ScoreMatrix("PAM250", PAM250)); + // scoreMatrices.put("DNA", new ScoreMatrix("DNA", DNA)); } public static List STOP = Arrays.asList("TGA", "TAA", "TAG"); @@ -1282,24 +1282,24 @@ public class ResidueProperties return aa3Hash; } - public static int[][] getDNA() + public static float[][] getDNA() { return ResidueProperties.DNA; } - public static int[][] getBLOSUM62() + public static float[][] getBLOSUM62() { return ResidueProperties.BLOSUM62; } - public static int getPAM250(String A1, String A2) + public static float getPAM250(String A1, String A2) { return getPAM250(A1.charAt(0), A2.charAt(0)); } - public static int getBLOSUM62(char c1, char c2) + public static float getBLOSUM62(char c1, char c2) { - int pog = 0; + float pog = 0; try { @@ -1325,12 +1325,12 @@ public class ResidueProperties return cdn; } - public static int[][] getDefaultPeptideMatrix() + public static float[][] getDefaultPeptideMatrix() { return ResidueProperties.getBLOSUM62(); } - public static int[][] getDefaultDnaMatrix() + public static float[][] getDefaultDnaMatrix() { return ResidueProperties.getDNA(); } @@ -1362,12 +1362,12 @@ public class ResidueProperties return scoreMatrices.get(pwtype); } - public static int getPAM250(char c, char d) + public static float getPAM250(char c, char d) { int a = aaIndex[c]; int b = aaIndex[d]; - int pog = ResidueProperties.PAM250[a][b]; + float pog = ResidueProperties.PAM250[a][b]; return pog; } diff --git a/src/jalview/structure/StructureSelectionManager.java b/src/jalview/structure/StructureSelectionManager.java index 65fd5e7..1c46ca8 100644 --- a/src/jalview/structure/StructureSelectionManager.java +++ b/src/jalview/structure/StructureSelectionManager.java @@ -454,7 +454,7 @@ public class StructureSelectionManager * Attempt pairwise alignment of the sequence with each chain in the PDB, * and remember the highest scoring chain */ - int max = -10; + float max = -10; AlignSeq maxAlignseq = null; String maxChainId = " "; PDBChain maxChain = null; diff --git a/src/jalview/viewmodel/PCAModel.java b/src/jalview/viewmodel/PCAModel.java index b0af302..9383166 100644 --- a/src/jalview/viewmodel/PCAModel.java +++ b/src/jalview/viewmodel/PCAModel.java @@ -37,7 +37,7 @@ public class PCAModel seqstrings = seqstrings2; seqs = seqs2; nucleotide = nucleotide2; - score_matrix = nucleotide2 ? "PID" : "BLOSUM62"; + score_matrix = nucleotide2 ? "DNA" : "BLOSUM62"; } private volatile PCA pca; @@ -148,7 +148,7 @@ public class PCAModel for (int i = 0; i < pca.getM().rows; i++) { - ((SequencePoint) points.elementAt(i)).coord = scores[i]; + points.elementAt(i).coord = scores[i]; } } diff --git a/test/jalview/analysis/TestAlignSeq.java b/test/jalview/analysis/TestAlignSeq.java index 9fc88ea..70e59c5 100644 --- a/test/jalview/analysis/TestAlignSeq.java +++ b/test/jalview/analysis/TestAlignSeq.java @@ -125,7 +125,7 @@ public class TestAlignSeq }; as.printAlignment(ps); - String expected = "Score = 320\nLength of alignment = 10\nSequence Seq1 : 3 - 18 (Sequence length = 14)\nSequence Seq1 : 1 - 10 (Sequence length = 10)\n\n" + String expected = "Score = 320.0\nLength of alignment = 10\nSequence Seq1 : 3 - 18 (Sequence length = 14)\nSequence Seq1 : 1 - 10 (Sequence length = 10)\n\n" + "Seq1 SDFAQQQRRR\n" + " ||||||| \n" + "Seq1 SDFAQQQSSS\n\n" + "Percentage ID = 70.00\n"; diff --git a/test/jalview/analysis/scoremodels/ScoreMatrixTest.java b/test/jalview/analysis/scoremodels/ScoreMatrixTest.java new file mode 100644 index 0000000..462edf2 --- /dev/null +++ b/test/jalview/analysis/scoremodels/ScoreMatrixTest.java @@ -0,0 +1,172 @@ +package jalview.analysis.scoremodels; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertNull; +import static org.testng.Assert.assertTrue; + +import java.io.ByteArrayInputStream; + +import org.testng.annotations.Test; + +public class ScoreMatrixTest +{ + @Test(groups = "Functional") + public void testBuildSymbolIndex() + { + short[] index = ScoreMatrix.buildSymbolIndex("AX-. yxYp".toCharArray()); + + assertEquals(index.length, 128); // ASCII character set size + + assertEquals(index['A'], 0); + assertEquals(index['a'], 0); // lower-case mapping added + assertEquals(index['X'], 1); + assertEquals(index['-'], 2); + assertEquals(index['.'], 3); + assertEquals(index[' '], 4); + assertEquals(index['y'], 5); // lower-case override + assertEquals(index['x'], 6); // lower-case override + assertEquals(index['Y'], 7); + assertEquals(index['p'], 8); + assertEquals(index['P'], -1); // lower-case doesn't map upper-case + + /* + * check all unmapped symbols have index for unmapped + */ + for (int c = 0; c < index.length; c++) + { + if (!"AaXx-. Yyp".contains(String.valueOf((char) c))) + { + assertEquals(index[c], -1); + } + } + } + + /** + * check that characters not in the basic ASCII set are simply ignored + */ + @Test(groups = "Functional") + public void testBuildSymbolIndex_nonAscii() + { + char[] weird = new char[] { 128, 245, 'P' }; + short[] index = ScoreMatrix.buildSymbolIndex(weird); + assertEquals(index.length, 128); + assertEquals(index['P'], 2); + assertEquals(index['p'], 2); + for (int c = 0; c < index.length; c++) + { + if (c != 'P' && c != 'p') + { + assertEquals(index[c], -1); + } + } + } + + /** + * Test a successful parse of a (small) score matrix file + */ + @Test(groups = "Functional") + public void testParse() + { + /* + * some messy but valid input data, with comma, space + * or tab (or combinations) as score value delimiters + */ + String data = "ScoreMatrix MyTest\n" + "ATU tx-\n" + + "1.1,1.2,1.3,1.4, 1.5, 1.6, 1.7\n" + + "2.1 2.2 2.3 2.4 2.5 2.6 2.7\n" + + "3.1\t3.2\t3.3\t3.4\t3.5\t3.6\t3.7\n" + + " 4.1 ,4.2,\t,4.3 ,\t4.4\t, \4.5,4.6 4.7\n" + + ", 5.1,5.3,5.3,5.4,5.5, 5.6, 5.7\n" + + "\t6.1, 6.2 6.3 6.4 6.5 6.6 6.7\n" + + ", \t7.1\t7.2 7.3, 7.4, 7.5\t,7.6,7.7\n"; + ScoreMatrix sm = ScoreMatrix.parse(new ByteArrayInputStream(data + .getBytes())); + assertNotNull(sm); + assertEquals(sm.getName(), "MyTest"); + assertTrue(sm.isDNA()); + assertFalse(sm.isProtein()); + assertEquals(sm.getPairwiseScore('A', 'A'), 1.1f); + assertEquals(sm.getPairwiseScore('A', 'T'), 1.2f); + assertEquals(sm.getPairwiseScore('a', 'T'), 1.2f); // A/a equivalent + assertEquals(sm.getPairwiseScore('A', 't'), 1.5f); // T/t not equivalent + assertEquals(sm.getPairwiseScore('a', 't'), 1.5f); + assertEquals(sm.getPairwiseScore('T', ' '), 2.4f); + assertEquals(sm.getPairwiseScore('U', 'x'), 3.6f); + assertEquals(sm.getPairwiseScore('u', 'x'), 3.6f); + assertEquals(sm.getPairwiseScore('U', 'X'), 0f); // X (upper) unmapped + assertEquals(sm.getPairwiseScore('A', '.'), 0f); // . unmapped + assertEquals(sm.getPairwiseScore('-', '-'), 7.7f); + assertEquals(sm.getPairwiseScore('A', (char) 128), 0f); // out of range + } + + @Test(groups = "Functional") + public void testParse_invalidInput() + { + /* + * valid first + */ + String data = "ScoreMatrix MyTest\nXY\n1 2\n3 4\n"; + ScoreMatrix sm = ScoreMatrix.parse(new ByteArrayInputStream(data + .getBytes())); + assertNotNull(sm); + + /* + * Name missing + */ + data = "ScoreMatrix\nXY\n1 2\n3 4\n"; + sm = ScoreMatrix.parse(new ByteArrayInputStream(data.getBytes())); + assertNull(sm); + + /* + * ScoreMatrix header missing + */ + data = "XY\n1 2\n3 4\n"; + sm = ScoreMatrix.parse(new ByteArrayInputStream(data.getBytes())); + assertNull(sm); + + /* + * Not enough rows + */ + data = "ScoreMatrix MyTest\nXY\n1 2\n"; + sm = ScoreMatrix.parse(new ByteArrayInputStream(data.getBytes())); + assertNull(sm); + + /* + * Not enough columns + */ + data = "ScoreMatrix MyTest\nXY\n1 2\n3\n"; + sm = ScoreMatrix.parse(new ByteArrayInputStream(data.getBytes())); + assertNull(sm); + + /* + * Too many columns + */ + data = "ScoreMatrix MyTest\nXY\n1 2\n3 4 5\n"; + sm = ScoreMatrix.parse(new ByteArrayInputStream(data.getBytes())); + assertNull(sm); + + /* + * Too many rows + */ + data = "ScoreMatrix MyTest\nXY\n1 2\n3 4\n6 7"; + sm = ScoreMatrix.parse(new ByteArrayInputStream(data.getBytes())); + assertNull(sm); + + /* + * unsupported delimiter | + */ + data = "ScoreMatrix MyTest\nXY\n1|2\n3|4\n"; + sm = ScoreMatrix.parse(new ByteArrayInputStream(data.getBytes())); + assertNull(sm); + + /* + * Bad float value + */ + data = "ScoreMatrix MyTest\nXY\n1 2\n3 four\n"; + sm = ScoreMatrix.parse(new ByteArrayInputStream(data.getBytes())); + assertNull(sm); + + } +}