2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import jalview.math.Matrix;
24 import jalview.math.MatrixI;
25 import jalview.schemes.ResidueProperties;
26 import jalview.schemes.ScoreMatrix;
28 import java.io.PrintStream;
31 * Performs Principal Component Analysis on given sequences
33 public class PCA implements Runnable
35 boolean jvCalcMode = true;
43 StringBuilder details = new StringBuilder(1024);
45 private String[] seqs;
47 private ScoreMatrix scoreMatrix;
50 * Creates a new PCA object. By default, uses blosum62 matrix to generate
51 * sequence similarity matrices
54 * Set of amino acid sequences to perform PCA on
56 public PCA(String[] s)
62 * Creates a new PCA object. By default, uses blosum62 matrix to generate
63 * sequence similarity matrices
66 * Set of sequences to perform PCA on
68 * if true, uses standard DNA/RNA matrix for sequence similarity
71 public PCA(String[] s, boolean nucleotides)
73 this(s, nucleotides, null);
76 public PCA(String[] s, boolean nucleotides, String s_m)
80 // BinarySequence[] bs = new BinarySequence[s.length];
83 // while ((ii < s.length) && (s[ii] != null))
85 // bs[ii] = new BinarySequence(s[ii], nucleotides);
90 // BinarySequence[] bs2 = new BinarySequence[s.length];
95 scoreMatrix = ResidueProperties.getScoreMatrix(sm);
97 if (scoreMatrix == null)
99 // either we were given a non-existent score matrix or a scoremodel that
100 // isn't based on a pairwise symbol score matrix
101 scoreMatrix = ResidueProperties
102 .getScoreMatrix(sm = (nucleotides ? "DNA" : "BLOSUM62"));
104 details.append("PCA calculation using " + sm
105 + " sequence similarity matrix\n========\n\n");
107 // while ((ii < s.length) && (s[ii] != null))
109 // bs2[ii] = new BinarySequence(s[ii], nucleotides);
110 // if (scoreMatrix != null)
114 // bs2[ii].matrixEncode(scoreMatrix);
115 // } catch (InvalidSequenceTypeException x)
117 // details.append("Unexpected mismatch of sequence type and score matrix. Calculation will not be valid!\n\n");
124 // while ((count < bs.length) && (bs[count] != null))
129 // double[][] seqmat = new double[count][];
130 // double[][] seqmat2 = new double[count][];
135 // seqmat[i] = bs[i].getDBinary();
136 // seqmat2[i] = bs2[i].getDBinary();
141 // * using a SparseMatrix to hold the encoded sequences matrix
142 // * greatly speeds up matrix multiplication as these are mostly zero
144 // m = new SparseMatrix(seqmat);
145 // m2 = new Matrix(seqmat2);
153 * Index of diagonal within matrix
155 * @return Returns value of diagonal from matrix
157 public double getEigenvalue(int i)
159 return eigenvector.getD()[i];
174 * @return DOCUMENT ME!
176 public float[][] getComponents(int l, int n, int mm, float factor)
178 float[][] out = new float[getHeight()][3];
180 for (int i = 0; i < getHeight(); i++)
182 out[i][0] = (float) component(i, l) * factor;
183 out[i][1] = (float) component(i, n) * factor;
184 out[i][2] = (float) component(i, mm) * factor;
196 * @return DOCUMENT ME!
198 public double[] component(int n)
200 // n = index of eigenvector
201 double[] out = new double[getHeight()];
203 for (int i = 0; i < out.length; i++)
205 out[i] = component(i, n);
219 * @return DOCUMENT ME!
221 double component(int row, int n)
225 for (int i = 0; i < symm.width(); i++)
227 out += (symm.getValue(row, i) * eigenvector.getValue(i, n));
230 return out / eigenvector.getD()[n];
233 public String getDetails()
235 return details.toString();
244 PrintStream ps = new PrintStream(System.out)
247 public void print(String x)
253 public void println()
255 details.append("\n");
259 // long now = System.currentTimeMillis();
262 details.append("PCA Calculation Mode is "
263 + (jvCalcMode ? "Jalview variant" : "Original SeqSpace")
266 // MatrixI mt = m.transpose();
267 // eigenvector = mt.preMultiply(jvCalcMode ? m2 : m);
268 eigenvector = computePairwiseScores();
270 details.append(" --- OrigT * Orig ---- \n");
271 eigenvector.print(ps, "%8.2f");
273 symm = eigenvector.copy();
277 details.append(" ---Tridiag transform matrix ---\n");
278 details.append(" --- D vector ---\n");
279 eigenvector.printD(ps, "%15.4e");
281 details.append("--- E vector ---\n");
282 eigenvector.printE(ps, "%15.4e");
285 // Now produce the diagonalization matrix
287 } catch (Exception q)
290 details.append("\n*** Unexpected exception when performing PCA ***\n"
291 + q.getLocalizedMessage());
292 details.append("*** Matrices below may not be fully diagonalised. ***\n");
295 details.append(" --- New diagonalization matrix ---\n");
296 eigenvector.print(ps, "%8.2f");
297 details.append(" --- Eigenvalues ---\n");
298 eigenvector.printD(ps, "%15.4e");
301 * for (int seq=0;seq<symm.rows;seq++) { ps.print("\"Seq"+seq+"\""); for
302 * (int ev=0;ev<symm.rows; ev++) {
304 * ps.print(","+component(seq, ev)); } ps.println(); }
306 // System.out.println(("PCA.run() took "
307 // + (System.currentTimeMillis() - now) + "ms"));
311 * Computes an NxN matrix where N is the number of sequences, and entry [i, j]
312 * is sequence[i] pairwise multiplied with sequence[j], as a sum of scores
313 * computed using the current score matrix. For example
315 * <li>Sequences:</li>
320 * <li>Score matrix is BLOSUM62</li>
321 * <li>product [0, 0] = F.F + K.K + L.L = 6 + 5 + 4 = 15</li>
322 * <li>product [2, 1] = R.R + S.S + D.D = 5 + 4 + 6 = 15</li>
323 * <li>product [2, 2] = Q.Q + I.I + A.A = 5 + 4 + 4 = 13</li>
324 * <li>product [3, 3] = G.G + W.W + C.C = 6 + 11 + 9 = 26</li>
325 * <li>product[0, 1] = F.R + K.S + L.D = -3 + 0 + -3 = -7
329 MatrixI computePairwiseScores()
331 double[][] values = new double[seqs.length][];
332 for (int row = 0; row < seqs.length; row++)
334 values[row] = new double[seqs.length];
335 for (int col = 0; col < seqs.length; col++)
338 int width = Math.min(seqs[row].length(), seqs[col].length());
339 for (int i = 0; i < width; i++)
341 char c1 = seqs[row].charAt(i);
342 char c2 = seqs[col].charAt(i);
343 int score = scoreMatrix.getPairwiseScore(c1, c2);
346 values[row][col] = total;
349 return new Matrix(values);
352 public void setJvCalcMode(boolean calcMode)
354 this.jvCalcMode = calcMode;
358 * Answers the N dimensions of the NxN PCA matrix. This is the number of
359 * sequences involved in the pairwise score calculation.
363 public int getHeight()
365 // TODO can any of seqs[] be null?