From 07394c1c2d9d4ae05c85cd6d9644e4d17f2818a2 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Fri, 3 Feb 2017 16:13:24 +0000 Subject: [PATCH] JAL-2379 'direct' pairwise score calculation for PCA (no encoding) --- src/jalview/analysis/PCA.java | 207 +++++++++++++++----------- src/jalview/schemes/ScoreMatrix.java | 33 +++- src/jalview/viewmodel/PCAModel.java | 13 +- test/jalview/analysis/PCATest.java | 61 ++++++++ test/jalview/schemes/ScoreMatrixPrinter.java | 3 - test/jalview/schemes/ScoreMatrixTest.java | 46 ++++++ 6 files changed, 269 insertions(+), 94 deletions(-) create mode 100644 test/jalview/analysis/PCATest.java create mode 100644 test/jalview/schemes/ScoreMatrixTest.java diff --git a/src/jalview/analysis/PCA.java b/src/jalview/analysis/PCA.java index 7d6b822..1a4060c 100755 --- a/src/jalview/analysis/PCA.java +++ b/src/jalview/analysis/PCA.java @@ -20,11 +20,8 @@ */ package jalview.analysis; -import jalview.datamodel.BinarySequence; -import jalview.datamodel.BinarySequence.InvalidSequenceTypeException; import jalview.math.Matrix; import jalview.math.MatrixI; -import jalview.math.SparseMatrix; import jalview.schemes.ResidueProperties; import jalview.schemes.ScoreMatrix; @@ -37,18 +34,18 @@ public class PCA implements Runnable { boolean jvCalcMode = true; - MatrixI m; - MatrixI symm; - MatrixI m2; - double[] eigenvalue; MatrixI eigenvector; StringBuilder details = new StringBuilder(1024); + private String[] seqs; + + private ScoreMatrix scoreMatrix; + /** * Creates a new PCA object. By default, uses blosum62 matrix to generate * sequence similarity matrices @@ -78,88 +75,78 @@ public class PCA implements Runnable public PCA(String[] s, boolean nucleotides, String s_m) { - - BinarySequence[] bs = new BinarySequence[s.length]; - int ii = 0; - - while ((ii < s.length) && (s[ii] != null)) - { - bs[ii] = new BinarySequence(s[ii], nucleotides); - bs[ii].encode(); - ii++; - } - - BinarySequence[] bs2 = new BinarySequence[s.length]; - ScoreMatrix smtrx = null; + this.seqs = s; + + // BinarySequence[] bs = new BinarySequence[s.length]; + // int ii = 0; + // + // while ((ii < s.length) && (s[ii] != null)) + // { + // bs[ii] = new BinarySequence(s[ii], nucleotides); + // bs[ii].encode(); + // ii++; + // } + // + // BinarySequence[] bs2 = new BinarySequence[s.length]; + scoreMatrix = null; String sm = s_m; if (sm != null) { - smtrx = ResidueProperties.getScoreMatrix(sm); + scoreMatrix = ResidueProperties.getScoreMatrix(sm); } - if (smtrx == null) + if (scoreMatrix == null) { // either we were given a non-existent score matrix or a scoremodel that // isn't based on a pairwise symbol score matrix - smtrx = ResidueProperties.getScoreMatrix(sm = (nucleotides ? "DNA" - : "BLOSUM62")); + scoreMatrix = ResidueProperties + .getScoreMatrix(sm = (nucleotides ? "DNA" : "BLOSUM62")); } details.append("PCA calculation using " + sm + " sequence similarity matrix\n========\n\n"); - ii = 0; - while ((ii < s.length) && (s[ii] != null)) - { - bs2[ii] = new BinarySequence(s[ii], nucleotides); - if (smtrx != null) - { - try - { - bs2[ii].matrixEncode(smtrx); - } catch (InvalidSequenceTypeException x) - { - details.append("Unexpected mismatch of sequence type and score matrix. Calculation will not be valid!\n\n"); - } - } - ii++; - } - - int count = 0; - while ((count < bs.length) && (bs[count] != null)) - { - count++; - } - - double[][] seqmat = new double[count][]; - double[][] seqmat2 = new double[count][]; - - int i = 0; - while (i < count) - { - seqmat[i] = bs[i].getDBinary(); - seqmat2[i] = bs2[i].getDBinary(); - i++; - } - - /* - * using a SparseMatrix to hold the encoded sequences matrix - * greatly speeds up matrix multiplication as these are mostly zero - */ - m = new SparseMatrix(seqmat); - m2 = new Matrix(seqmat2); + // ii = 0; + // while ((ii < s.length) && (s[ii] != null)) + // { + // bs2[ii] = new BinarySequence(s[ii], nucleotides); + // if (scoreMatrix != null) + // { + // try + // { + // bs2[ii].matrixEncode(scoreMatrix); + // } catch (InvalidSequenceTypeException x) + // { + // details.append("Unexpected mismatch of sequence type and score matrix. Calculation will not be valid!\n\n"); + // } + // } + // ii++; + // } + // + // int count = 0; + // while ((count < bs.length) && (bs[count] != null)) + // { + // count++; + // } + // + // double[][] seqmat = new double[count][]; + // double[][] seqmat2 = new double[count][]; + // + // int i = 0; + // while (i < count) + // { + // seqmat[i] = bs[i].getDBinary(); + // seqmat2[i] = bs2[i].getDBinary(); + // i++; + // } + // + // /* + // * using a SparseMatrix to hold the encoded sequences matrix + // * greatly speeds up matrix multiplication as these are mostly zero + // */ + // m = new SparseMatrix(seqmat); + // m2 = new Matrix(seqmat2); } /** - * Returns the matrix used in PCA calculation - * - * @return - */ - - public MatrixI getM() - { - return m; - } - - /** * Returns Eigenvalue * * @param i @@ -188,9 +175,9 @@ public class PCA implements Runnable */ public float[][] getComponents(int l, int n, int mm, float factor) { - float[][] out = new float[m.height()][3]; + float[][] out = new float[getHeight()][3]; - for (int i = 0; i < m.height(); i++) + for (int i = 0; i < getHeight(); i++) { out[i][0] = (float) component(i, l) * factor; out[i][1] = (float) component(i, n) * factor; @@ -211,9 +198,9 @@ public class PCA implements Runnable public double[] component(int n) { // n = index of eigenvector - double[] out = new double[m.height()]; + double[] out = new double[getHeight()]; - for (int i = 0; i < m.height(); i++) + for (int i = 0; i < out.length; i++) { out[i] = component(i, n); } @@ -275,12 +262,12 @@ public class PCA implements Runnable details.append("PCA Calculation Mode is " + (jvCalcMode ? "Jalview variant" : "Original SeqSpace") + "\n"); - MatrixI mt = m.transpose(); - - details.append(" --- OrigT * Orig ---- \n"); - eigenvector = mt.preMultiply(jvCalcMode ? m2 : m); + // MatrixI mt = m.transpose(); + // eigenvector = mt.preMultiply(jvCalcMode ? m2 : m); + eigenvector = computePairwiseScores(); + details.append(" --- OrigT * Orig ---- \n"); eigenvector.print(ps, "%8.2f"); symm = eigenvector.copy(); @@ -320,8 +307,62 @@ public class PCA implements Runnable // + (System.currentTimeMillis() - now) + "ms")); } + /** + * Computes an NxN matrix where N is the number of sequences, and entry [i, j] + * is sequence[i] pairwise multiplied with sequence[j], as a sum of scores + * computed using the current score matrix. For example + * + */ + MatrixI computePairwiseScores() + { + double[][] values = new double[seqs.length][]; + for (int row = 0; row < seqs.length; row++) + { + values[row] = new double[seqs.length]; + for (int col = 0; col < seqs.length; col++) + { + int total = 0; + int width = Math.min(seqs[row].length(), seqs[col].length()); + for (int i = 0; i < width; i++) + { + char c1 = seqs[row].charAt(i); + char c2 = seqs[col].charAt(i); + int score = scoreMatrix.getPairwiseScore(c1, c2); + total += score; + } + values[row][col] = total; + } + } + return new Matrix(values); + } + public void setJvCalcMode(boolean calcMode) { this.jvCalcMode = calcMode; } + + /** + * Answers the N dimensions of the NxN PCA matrix. This is the number of + * sequences involved in the pairwise score calculation. + * + * @return + */ + public int getHeight() + { + // TODO can any of seqs[] be null? + return seqs.length; + } } diff --git a/src/jalview/schemes/ScoreMatrix.java b/src/jalview/schemes/ScoreMatrix.java index 5e5fa8d..c47654a 100644 --- a/src/jalview/schemes/ScoreMatrix.java +++ b/src/jalview/schemes/ScoreMatrix.java @@ -21,10 +21,8 @@ package jalview.schemes; import jalview.analysis.scoremodels.PairwiseSeqScoreModel; -import jalview.api.analysis.ScoreModelI; -public class ScoreMatrix extends PairwiseSeqScoreModel implements - ScoreModelI +public class ScoreMatrix extends PairwiseSeqScoreModel { String name; @@ -89,6 +87,7 @@ public class ScoreMatrix extends PairwiseSeqScoreModel implements return getPairwiseScore(A1.charAt(0), A2.charAt(0)); } + @Override public int getPairwiseScore(char c, char d) { int pog = 0; @@ -100,6 +99,33 @@ public class ScoreMatrix extends PairwiseSeqScoreModel implements int b = (type == 0) ? ResidueProperties.aaIndex[d] : ResidueProperties.nucleotideIndex[d]; + /* + * hack to convert unassigned / unknown (including gap) + * to index of unknown (X for amino acids, N for nucleotide) + * TODO: statically assign gap characters to this index? + */ + if (type == 0) + { + if (a == ResidueProperties.maxProteinIndex) + { + a = ResidueProperties.aaIndex['X']; + } + if (b == ResidueProperties.maxProteinIndex) + { + b = ResidueProperties.aaIndex['X']; + } + } + if (type != 0) + { + if (a == ResidueProperties.maxNucleotideIndex) + { + a = ResidueProperties.nucleotideIndex['N']; + } + if (b == ResidueProperties.maxNucleotideIndex) + { + b = ResidueProperties.nucleotideIndex['N']; + } + } pog = matrix[a][b]; } catch (Exception e) { @@ -112,6 +138,7 @@ public class ScoreMatrix extends PairwiseSeqScoreModel implements /** * pretty print the matrix */ + @Override public String toString() { return outputMatrix(false); diff --git a/src/jalview/viewmodel/PCAModel.java b/src/jalview/viewmodel/PCAModel.java index 3b6829e..55e7300 100644 --- a/src/jalview/viewmodel/PCAModel.java +++ b/src/jalview/viewmodel/PCAModel.java @@ -69,7 +69,8 @@ public class PCAModel public void run() { - pca = new PCA(seqstrings.getSequenceStrings(' '), nucleotide, + String[] sequenceStrings = seqstrings.getSequenceStrings(' '); + pca = new PCA(sequenceStrings, nucleotide, score_matrix); pca.setJvCalcMode(jvCalcMode); pca.run(); @@ -82,12 +83,14 @@ public class PCAModel ii++; } - top = pca.getM().height() - 1; + int height = pca.getHeight(); + // top = pca.getM().height() - 1; + top = height - 1; points = new Vector(); float[][] scores = pca.getComponents(top - 1, top - 2, top - 3, 100); - for (int i = 0; i < pca.getM().height(); i++) + for (int i = 0; i < height; i++) { SequencePoint sp = new SequencePoint(seqs[i], scores[i]); points.addElement(sp); @@ -96,7 +99,7 @@ public class PCAModel public void updateRc(RotatableCanvasI rc) { - rc.setPoints(points, pca.getM().height()); + rc.setPoints(points, pca.getHeight()); } public boolean isNucleotide() @@ -134,7 +137,7 @@ public class PCAModel // note: actual indices for components are dim1-1, etc (patch for JAL-1123) float[][] scores = pca.getComponents(dim1 - 1, dim2 - 1, dim3 - 1, 100); - for (int i = 0; i < pca.getM().height(); i++) + for (int i = 0; i < pca.getHeight(); i++) { points.elementAt(i).coord = scores[i]; } diff --git a/test/jalview/analysis/PCATest.java b/test/jalview/analysis/PCATest.java new file mode 100644 index 0000000..815b71f --- /dev/null +++ b/test/jalview/analysis/PCATest.java @@ -0,0 +1,61 @@ +package jalview.analysis; + +import static org.testng.Assert.assertEquals; + +import jalview.math.MatrixI; + +import org.testng.annotations.Test; + +public class PCATest +{ + @Test(groups = "Functional") + public void testComputePairwiseScores() + { + String[] seqs = new String[] { "FKL", "R-D", "QIA", "GWC" }; + PCA pca = new PCA(seqs, false, "BLOSUM62"); + + MatrixI pairwise = pca.computePairwiseScores(); + + /* + * should be NxN where N = number of sequences + */ + assertEquals(pairwise.height(), 4); + assertEquals(pairwise.width(), 4); + + /* + * should be symmetrical (because BLOSUM62 is) + */ + for (int i = 0; i < pairwise.height(); i++) + { + for (int j = 0; j < pairwise.width(); j++) + { + assertEquals(pairwise.getValue(i, j), pairwise.getValue(j, i), + "Not symmetric"); + } + } + /* + * verify expected BLOSUM dot product scores + * Note: gap is treated like 'X' [22] in the matrix + */ + // F.F + K.K + L.L = 6 + 5 + 4 = 15 + assertEquals(pairwise.getValue(0, 0), 15d); + // R.R + X.X + D.D = 5 + -1 + 6 = 10 + assertEquals(pairwise.getValue(1, 1), 10d); + // Q.Q + I.I + A.A = 5 + 4 + 4 = 13 + assertEquals(pairwise.getValue(2, 2), 13d); + // G.G + W.W + C.C = 6 + 11 + 9 = 26 + assertEquals(pairwise.getValue(3, 3), 26d); + // F.R + K.X + L.D = -3 + -1 + -3 = -8 + assertEquals(pairwise.getValue(0, 1), -8d); + // F.Q + K.I + L.A = -3 + -3 + -1 = -7 + assertEquals(pairwise.getValue(0, 2), -7d); + // F.G + K.W + L.C = -3 + -3 + -1 = -7 + assertEquals(pairwise.getValue(0, 3), -7d); + // R.Q + X.I + D.A = 1 + -1 + -2 = -2 + assertEquals(pairwise.getValue(1, 2), -2d); + // R.G + X.W + D.C = -2 + -2 + -3 = -7 + assertEquals(pairwise.getValue(1, 3), -7d); + // Q.G + I.W + A.C = -2 + -3 + 0 = -5 + assertEquals(pairwise.getValue(2, 3), -5d); + } +} diff --git a/test/jalview/schemes/ScoreMatrixPrinter.java b/test/jalview/schemes/ScoreMatrixPrinter.java index a743163..80241fb 100644 --- a/test/jalview/schemes/ScoreMatrixPrinter.java +++ b/test/jalview/schemes/ScoreMatrixPrinter.java @@ -26,7 +26,6 @@ import jalview.gui.JvOptionPane; import java.util.Map; import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; public class ScoreMatrixPrinter { @@ -38,7 +37,6 @@ public class ScoreMatrixPrinter JvOptionPane.setMockResponse(JvOptionPane.CANCEL_OPTION); } - @Test(groups = { "Functional" }) public void printAllMatrices() { for (Map.Entry sm : ResidueProperties.scoreMatrices @@ -49,7 +47,6 @@ public class ScoreMatrixPrinter } } - @Test(groups = { "Functional" }) public void printHTMLMatrices() { for (Map.Entry _sm : ResidueProperties.scoreMatrices diff --git a/test/jalview/schemes/ScoreMatrixTest.java b/test/jalview/schemes/ScoreMatrixTest.java new file mode 100644 index 0000000..0ae7375 --- /dev/null +++ b/test/jalview/schemes/ScoreMatrixTest.java @@ -0,0 +1,46 @@ +package jalview.schemes; + +import static org.testng.Assert.assertEquals; + +import org.testng.annotations.Test; + +public class ScoreMatrixTest +{ + @Test(groups = "Functional") + public void testSymmetric() + { + verifySymmetric(ResidueProperties.getScoreMatrix("BLOSUM62")); + verifySymmetric(ResidueProperties.getScoreMatrix("PAM250")); + verifySymmetric(ResidueProperties.getScoreMatrix("DNA")); + } + + private void verifySymmetric(ScoreMatrix sm) + { + int[][] m = sm.getMatrix(); + int rows = m.length; + for (int row = 0; row < rows; row++) + { + assertEquals(m[row].length, rows); + for (int col = 0; col < rows; col++) + { + // if (m[row][col] != m[col][row]) + // { + // System.out.println(String.format("%s [%d, %d] [%s, %s]", + // sm.getName(), m[row][col], m[col][row], + // ResidueProperties.aa[row], + // ResidueProperties.aa[col])); + // } + assertEquals(m[row][col], m[col][row], String.format("%s [%s, %s]", + sm.getName(), ResidueProperties.aa[row], + ResidueProperties.aa[col])); + } + } + + /* + * also check the score matrix is sized for + * the number of symbols scored, plus gap + */ + assertEquals(rows, (sm.isDNA() ? ResidueProperties.maxNucleotideIndex + : ResidueProperties.maxProteinIndex) + 1); + } +} -- 1.7.10.2