********************************************************************/
/*
- * RCS $Id: hhmatrices-C.h 199 2011-02-21 18:24:49Z fabian $
+ * RCS $Id: hhmatrices-C.h 316 2016-12-16 16:14:39Z fabian $
*/
// Substitution matrices and their background frequencies
0.0011,0.0007,0.0006,0.0005,0.0003,0.0005,0.0006,0.0006,0.0016,0.0013,0.0020,0.0008,0.0005,0.0046,0.0003,0.0009,0.0008,0.0010,0.0148,
0.0046,0.0013,0.0009,0.0010,0.0013,0.0010,0.0015,0.0014,0.0005,0.0123,0.0089,0.0015,0.0022,0.0022,0.0010,0.0021,0.0033,0.0004,0.0012,0.0246};
+const float mism = 2000; // standard mismatch score
+const float matc = 7000;
+// RNA matrix, Gonnet type format
+const float ribosum[]={
+// A C G T U R Y M K S W B D H V N ? ? ? ?
+ 9500, 600, 700, 900, 900, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // A
+ 600, 4000, 300, 1200, 1200, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // C
+ 700, 300, 3500, 700, 700, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // G
+ 900, 1200, 700, 8200, 8200, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // T
+ 900, 1200, 700, 8200, 8200, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // U
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // R
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // Y
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // M
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // K
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // S
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // W
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // B
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // D
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // H
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // V
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // N
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // ?
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // ?
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // ?
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism};// ?
+
+const float dna_basic[]={
+// A C G T U R Y M K S W B D H V N ? ? ? ?
+ matc, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // A
+ mism, matc, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // C
+ mism, mism, matc, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // G
+ mism, mism, mism, matc, matc, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // T
+ mism, mism, mism, matc, matc, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // U
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // R
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // Y
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // M
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // K
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // S
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // W
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // B
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // D
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // H
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // V
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // N
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // ?
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // ?
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // ?
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism};// ?
+
+const float dave_nucleo[]={
+// A C G T U R Y M K S W B D H V N ? ? ? ?
+ 8000, 1500, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // A
+ 1500, 6500, 1000, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // C
+ mism, 1000, 6500, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // G
+ mism, mism, mism, 7000, 7000, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // T
+ mism, mism, mism, 7000, 7000, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // U
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // R
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // Y
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // M
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // K
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // S
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // W
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // B
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // D
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // H
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // V
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // N
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // ?
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // ?
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, // ?
+ mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism, mism};// ?
+
// prediction accuracy of Psipred:
// Ppred[cf][B][A] = P(A,B,cf)/P(A)/P(B,cf) = P(A|B,cf)/P(A)
// A = observed ss state B = predicted ss state cf = confidence value of prediction
//Precompute matrix R for amino acid pseudocounts:
for (a=0; a<20; ++a)
for (b=0; b<20; ++b)
- S[a][b] = log2(R[a][b]/pb[a]); // S[a][b] = log2(P(a,b)/P(a)/P(b))
+ S[a][b] = Log2(R[a][b]/pb[a]); // S[a][b] = log2(P(a,b)/P(a)/P(b))
// Evaluate sequence identity underlying substitution matrix
if (v>=3)
float entropy_pb=0.0f;
float mut_info=0.0f;
for (a=0; a<20; ++a) id+=P[a][a];
- for (a=0; a<20; ++a) entropy_pb-=pb[a]*log2(pb[a]);
+ for (a=0; a<20; ++a) entropy_pb-=pb[a]*Log2(pb[a]);
for (a=0; a<20; ++a)
for (b=0; b<20; ++b)
{
- entropy-=P[a][b]*log2(R[a][b]);
+ entropy-=P[a][b]*Log2(R[a][b]);
mut_info += P[a][b]*S[a][b];
}
}
}
+/////////////////////////////////////////////////////////////////////////////////////
+/**
+ * @brief Set (global variable) substitution matrix with derived matrices and background frequencies
+ */
+void
+SetRnaSubstitutionMatrix()
+{
+ int a,b;
+ for (a=0; a<20; ++a)
+ for (pb[a]=0.0f, b=0; b<20; ++b)
+ P[a][b] = 0.000001f*ribosum[a*20+b];
+ for (a=0; a<20; ++a) P[a][20]=P[20][a]=1.0f;
+
+ // Check transition probability matrix, renormalize P and calculate pb[a]
+ float sumab=0.0f;
+ for (a=0; a<20; a++)
+ for (b=0; b<20; ++b) sumab+=P[a][b];
+ for (a=0; a<20; a++)
+ for (b=0; b<20; ++b) P[a][b]/=sumab;
+ for (a=0; a<20; a++)
+ for (pb[a]=0.0f, b=0; b<20; ++b) pb[a]+=P[a][b];
+
+ //Compute similarity matrix for amino acid pairs (for calculating consensus sequence)
+ for (a=0; a<20; ++a)
+ for (b=0; b<20; ++b)
+ Sim[a][b] = P[a][b]*P[a][b]/P[a][a]/P[b][b];
+
+ //Precompute matrix R for amino acid pseudocounts:
+ for (a=0; a<20; ++a)
+ for (b=0; b<20; ++b)
+ R[a][b] = P[a][b]/pb[b]; //R[a][b]=P(a|b)
+
+ //Precompute matrix R for amino acid pseudocounts:
+ for (a=0; a<20; ++a)
+ for (b=0; b<20; ++b)
+ S[a][b] = Log2(R[a][b]/pb[a]); // S[a][b] = log2(P(a,b)/P(a)/P(b))
+
+ // Evaluate sequence identity underlying substitution matrix
+ if (v>=3)
+ {
+ float id=0.0f;
+ float entropy=0.0f;
+ float entropy_pb=0.0f;
+ float mut_info=0.0f;
+ for (a=0; a<20; ++a) id+=P[a][a];
+ for (a=0; a<20; ++a) entropy_pb-=pb[a]*Log2(pb[a]);
+ for (a=0; a<20; ++a)
+ for (b=0; b<20; ++b)
+ {
+ entropy-=P[a][b]*Log2(R[a][b]);
+ mut_info += P[a][b]*S[a][b];
+ }
+
+ printf(": sequence identity = %2.0f%%; entropy per column = %4.2f bits (out of %4.2f); mutual information = %4.2f bits\n",100*id,entropy,entropy_pb,mut_info);
+ }
+
+ if (v>=4) //Debugging: probability matrix and dissimilarity matrix
+ {
+ cout<<"Check matrix: before renormalization sum P(a,b)= "<<sumab<<"...\n";//PRINT
+ cout<<" A C G T U ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?\n";
+ cout<<"p[] ";
+ for (a=0; a<20; a++) printf("%4.1f ",100*pb[a]);
+ cout<<endl<<"\nSubstitution matrix log2( P(a,b)/p(a)/p(b) ) (in bits):\n";
+ cout<<" A C G T U ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?\n";
+ for (b=0; b<20; b++)
+ {
+ cout<<i2aa(b)<<" ";
+ for (a=0; a<20; a++) printf("%4.1f ",S[a][b]);
+ cout<<endl;
+ }
+ cout<<endl<<"\nOdds matrix P(a,b)/p(a)/p(b):\n";
+ cout<<" A C G T U ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?\n";
+ for (b=0; b<20; b++)
+ {
+ cout<<i2aa(b)<<" ";
+ for (a=0; a<20; a++) printf("%4.1f ",P[b][a]/pb[a]/pb[b]);
+ cout<<endl;
+ }
+ cout<<endl<<"\nMatrix of conditional probabilities P(a|b) = P(a,b)/p(b) (in %):\n";
+ cout<<" A C G T U ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?\n";
+ for (b=0; b<20; b++)
+ {
+ cout<<i2aa(b)<<" ";
+ for (a=0; a<20; a++) printf("%4.1f ",100*R[b][a]);
+ cout<<endl;
+ }
+ cout<<endl<<"\nProbability matrix P(a,b) (in %):\n";
+ cout<<" A C G T U ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?\n";
+ for (b=0; b<20; b++)
+ {
+ cout<<i2aa(b)<<" ";
+ for (a=0; a<20; a++) printf("%5.0f ",1000000*P[b][a]);
+ cout<<endl;
+ }
+ cout<<endl<<"Similarity matrix P(a,b)^2/P(a,a,)/P(b,b) (in %):\n";
+ cout<<" A C G T U ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?\n";
+ for (b=0; b<20; b++)
+ {
+ cout<<i2aa(b)<<" ";
+ for (a=0; a<20; a++) printf("%4.0f ",100*Sim[b][a]);
+ cout<<endl;
+ }
+ cout<<endl;
+
+
+ }
+}
+
+/////////////////////////////////////////////////////////////////////////////////////
+/**
+ * @brief Set (global variable) substitution matrix with derived matrices and background frequencies
+ */
+void
+SetDnaSubstitutionMatrix()
+{
+ int a,b;
+ for (a=0; a<20; ++a)
+ for (pb[a]=0.0f, b=0; b<20; ++b)
+ P[a][b] = 0.000001f*dna_basic[a*20+b];
+ for (a=0; a<20; ++a) P[a][20]=P[20][a]=1.0f;
+
+ // Check transition probability matrix, renormalize P and calculate pb[a]
+ float sumab=0.0f;
+ for (a=0; a<20; a++)
+ for (b=0; b<20; ++b) sumab+=P[a][b];
+ for (a=0; a<20; a++)
+ for (b=0; b<20; ++b) P[a][b]/=sumab;
+ for (a=0; a<20; a++)
+ for (pb[a]=0.0f, b=0; b<20; ++b) pb[a]+=P[a][b];
+
+ //Compute similarity matrix for amino acid pairs (for calculating consensus sequence)
+ for (a=0; a<20; ++a)
+ for (b=0; b<20; ++b)
+ Sim[a][b] = P[a][b]*P[a][b]/P[a][a]/P[b][b];
+
+ //Precompute matrix R for amino acid pseudocounts:
+ for (a=0; a<20; ++a)
+ for (b=0; b<20; ++b)
+ R[a][b] = P[a][b]/pb[b]; //R[a][b]=P(a|b)
+
+ //Precompute matrix R for amino acid pseudocounts:
+ for (a=0; a<20; ++a)
+ for (b=0; b<20; ++b)
+ S[a][b] = Log2(R[a][b]/pb[a]); // S[a][b] = log2(P(a,b)/P(a)/P(b))
+
+ // Evaluate sequence identity underlying substitution matrix
+ if (v>=3)
+ {
+ float id=0.0f;
+ float entropy=0.0f;
+ float entropy_pb=0.0f;
+ float mut_info=0.0f;
+ for (a=0; a<20; ++a) id+=P[a][a];
+ for (a=0; a<20; ++a) entropy_pb-=pb[a]*Log2(pb[a]);
+ for (a=0; a<20; ++a)
+ for (b=0; b<20; ++b)
+ {
+ entropy-=P[a][b]*Log2(R[a][b]);
+ mut_info += P[a][b]*S[a][b];
+ }
+
+ printf(": sequence identity = %2.0f%%; entropy per column = %4.2f bits (out of %4.2f); mutual information = %4.2f bits\n",100*id,entropy,entropy_pb,mut_info);
+ }
+
+ if (v>=4) //Debugging: probability matrix and dissimilarity matrix
+ {
+ cout<<"Check matrix: before renormalization sum P(a,b)= "<<sumab<<"...\n";//PRINT
+ cout<<" A C G T U ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?\n";
+ cout<<"p[] ";
+ for (a=0; a<20; a++) printf("%4.1f ",100*pb[a]);
+ cout<<endl<<"\nSubstitution matrix log2( P(a,b)/p(a)/p(b) ) (in bits):\n";
+ cout<<" A C G T U ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?\n";
+ for (b=0; b<20; b++)
+ {
+ cout<<i2aa(b)<<" ";
+ for (a=0; a<20; a++) printf("%4.1f ",S[a][b]);
+ cout<<endl;
+ }
+ cout<<endl<<"\nOdds matrix P(a,b)/p(a)/p(b):\n";
+ cout<<" A C G T U ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?\n";
+ for (b=0; b<20; b++)
+ {
+ cout<<i2aa(b)<<" ";
+ for (a=0; a<20; a++) printf("%4.1f ",P[b][a]/pb[a]/pb[b]);
+ cout<<endl;
+ }
+ cout<<endl<<"\nMatrix of conditional probabilities P(a|b) = P(a,b)/p(b) (in %):\n";
+ cout<<" A C G T U ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?\n";
+ for (b=0; b<20; b++)
+ {
+ cout<<i2aa(b)<<" ";
+ for (a=0; a<20; a++) printf("%4.1f ",100*R[b][a]);
+ cout<<endl;
+ }
+ cout<<endl<<"\nProbability matrix P(a,b) (in %):\n";
+ cout<<" A C G T U ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?\n";
+ for (b=0; b<20; b++)
+ {
+ cout<<i2aa(b)<<" ";
+ for (a=0; a<20; a++) printf("%5.0f ",1000000*P[b][a]);
+ cout<<endl;
+ }
+ cout<<endl<<"Similarity matrix P(a,b)^2/P(a,a,)/P(b,b) (in %):\n";
+ cout<<" A C G T U ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?\n";
+ for (b=0; b<20; b++)
+ {
+ cout<<i2aa(b)<<" ";
+ for (a=0; a<20; a++) printf("%4.0f ",100*Sim[b][a]);
+ cout<<endl;
+ }
+ cout<<endl;
+
+
+ }
+}
/////////////////////////////////////////////////////////////////////////////////////
/**
for (B=0; B<NSSPRED; B++)
{
P73[A][B][cf] = 1.-par.ssa + par.ssa*Ppred[cf*NSSPRED*NDSSP + B*NDSSP + A];
- S73[A][B][cf] = log2(P73[A][B][cf]);
+ S73[A][B][cf] = Log2(P73[A][B][cf]);
}
for (B=0; B<NSSPRED; B++)
sum=0;
for (A=1; A<NDSSP; A++)
sum += P73[A][B][cf] * P73[A][BB][ccf] * Pobs[A];
- S33[B][cf][BB][ccf] = log2(sum);
+ S33[B][cf][BB][ccf] = Log2(sum);
}
} /* this is the end of SetSecStrucSubstitutionMatrix() */