Next version of JABA
[jabaws.git] / binaries / src / muscle / scoredist.cpp
1 #include <limits.h>\r
2 #include <math.h>\r
3 #include "muscle.h"\r
4 #include "msa.h"\r
5 #include "distfunc.h"\r
6 #include "msa.h"\r
7 #include "seqvect.h"\r
8 #include "pwpath.h"\r
9 \r
10 // ScoreDist\r
11 // E. Sonnhammer & V. Hollich, Scoredist: A simple and robust protein sequence\r
12 // distance estimator, BMC Bioinformatics 2005, 6:108.\r
13 \r
14 extern int BLOSUM62[20][20];\r
15 extern double BLOSUM62_Expected;\r
16 \r
17 static const double Dayhoff_CalibrationFactor = 1.3370;\r
18 static const double JTT_CalibrationFactor = 1.2873;\r
19 static const double MV_CalibrationFactor = 1.1775;\r
20 static const double LARGE_D = 3.0;\r
21 \r
22 static double CalibrationFactor = JTT_CalibrationFactor;\r
23 \r
24 \r
25 // Similarity score\r
26 static double Sigma(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2,\r
27   unsigned *ptrLength)\r
28         {\r
29         unsigned Length = 0;\r
30         double Score = 0;\r
31         const unsigned ColCount = msa.GetColCount();\r
32         for (unsigned ColIndex = 0; ColIndex < ColCount; ++ColIndex)\r
33                 {\r
34                 unsigned Letter1 = msa.GetLetterEx(SeqIndex1, ColIndex);\r
35                 unsigned Letter2 = msa.GetLetterEx(SeqIndex2, ColIndex);\r
36                 if (Letter1 >= 20 || Letter2 >= 20)\r
37                         continue;\r
38                 ++Length;\r
39                 Score += BLOSUM62[Letter1][Letter2];\r
40                 }\r
41 \r
42         *ptrLength = Length;\r
43         return Score;\r
44         }\r
45 \r
46 // Normalized score\r
47 static double Sigma_N(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2)\r
48         {\r
49         unsigned Length = UINT_MAX;\r
50         double Score = Sigma(msa, SeqIndex1, SeqIndex2, &Length);\r
51         double RandomScore = Length*BLOSUM62_Expected;\r
52         return Score - RandomScore;\r
53         }\r
54 \r
55 // Upper limit\r
56 static double Sigma_U(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2,\r
57   unsigned *ptrLength)\r
58         {\r
59         double Score11 = Sigma(msa, SeqIndex1, SeqIndex1, ptrLength);\r
60         double Score22 = Sigma(msa, SeqIndex2, SeqIndex2, ptrLength);\r
61         return (Score11 + Score22)/2;\r
62         }\r
63 \r
64 // Normalized upper limit\r
65 static double Sigma_UN(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2)\r
66         {\r
67         unsigned Length = UINT_MAX;\r
68         double Score = Sigma_U(msa, SeqIndex1, SeqIndex2, &Length);\r
69         double RandomScore = Length*BLOSUM62_Expected;\r
70         return Score - RandomScore;\r
71         }\r
72 \r
73 double GetScoreDist(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2)\r
74         {\r
75         if (g_Alpha != ALPHA_Amino)\r
76                 Quit("Scoredist is only for amino acid sequences");\r
77 \r
78         double s_N = Sigma_N(msa, SeqIndex1, SeqIndex2);\r
79         double s_UN = Sigma_UN(msa, SeqIndex1, SeqIndex2);\r
80         double d = 0.0;\r
81         if (s_UN != 0)\r
82                 {\r
83                 double Ratio = s_N/s_UN;\r
84                 if (Ratio < 0.001)\r
85                         d = LARGE_D;\r
86                 else\r
87                         d =     -log(Ratio);\r
88                 }\r
89         return d*CalibrationFactor;\r
90         }\r
91 \r
92 void DistPWScoreDist(const SeqVect &v, DistFunc &DF)\r
93         {\r
94         SEQWEIGHT SeqWeightSave = GetSeqWeightMethod();\r
95         SetSeqWeightMethod(SEQWEIGHT_Henikoff);\r
96 \r
97         const unsigned uSeqCount = v.Length();\r
98         DF.SetCount(uSeqCount);\r
99 \r
100         const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2;\r
101         unsigned uCount = 0;\r
102         SetProgressDesc("PW ScoreDist");\r
103         for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)\r
104                 {\r
105                 const Seq &s1 = v.GetSeq(uSeqIndex1);\r
106                 MSA msa1;\r
107                 msa1.FromSeq(s1);\r
108                 for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2)\r
109                         {\r
110                         if (0 == uCount%20)\r
111                                 Progress(uCount, uPairCount);\r
112                         ++uCount;\r
113                         const Seq &s2 = v.GetSeq(uSeqIndex2);\r
114                         MSA msa2;\r
115                         msa2.FromSeq(s2);\r
116                 \r
117                         PWPath Path;\r
118                         MSA msaOut;\r
119                         AlignTwoMSAs(msa1, msa2, msaOut, Path, false, false);\r
120 \r
121                         float d = (float) GetScoreDist(msaOut, 0, 1);\r
122                         DF.SetDist(uSeqIndex1, uSeqIndex2, d);\r
123                         }\r
124                 }\r
125         ProgressStepsDone();\r
126 \r
127         SetSeqWeightMethod(SeqWeightSave);\r
128         }\r