Next version of JABA
[jabaws.git] / binaries / src / muscle / profilefrommsa.cpp
1 #include "muscle.h"\r
2 #include "msa.h"\r
3 #include "profile.h"\r
4 \r
5 #define TRACE   0\r
6 \r
7 static void LogF(FCOUNT f)\r
8         {\r
9         if (f > -0.00001 && f < 0.00001)\r
10                 Log("       ");\r
11         else\r
12                 Log("  %5.3f", f);\r
13         }\r
14 \r
15 static const char *LocalScoreToStr(SCORE s)\r
16         {\r
17         static char str[16];\r
18         if (s < -1e10 || s > 1e10)\r
19                 return "    *";\r
20         sprintf(str, "%5.1f", s);\r
21         return str;\r
22         }\r
23 \r
24 #if     DOUBLE_AFFINE\r
25 void ListProfile(const ProfPos *Prof, unsigned uLength, const MSA *ptrMSA)\r
26         {\r
27         Log("  Pos  Occ     LL     LG     GL     GG     Open  Close  Open2  Clos2\n");\r
28         Log("  ---  ---     --     --     --     --     ----  -----  -----  -----\n");\r
29         for (unsigned n = 0; n < uLength; ++n)\r
30                 {\r
31                 const ProfPos &PP = Prof[n];\r
32                 Log("%5u", n);\r
33                 LogF(PP.m_fOcc);\r
34                 LogF(PP.m_LL);\r
35                 LogF(PP.m_LG);\r
36                 LogF(PP.m_GL);\r
37                 LogF(PP.m_GG);\r
38                 Log("  %s", LocalScoreToStr(-PP.m_scoreGapOpen));\r
39                 Log("  %s", LocalScoreToStr(-PP.m_scoreGapClose));\r
40                 Log("  %s", LocalScoreToStr(-PP.m_scoreGapOpen2));\r
41                 Log("  %s", LocalScoreToStr(-PP.m_scoreGapClose2));\r
42                 if (0 != ptrMSA)\r
43                         {\r
44                         const unsigned uSeqCount = ptrMSA->GetSeqCount();\r
45                         Log("  ");\r
46                         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
47                                 Log("%c", ptrMSA->GetChar(uSeqIndex, n));\r
48                         }\r
49                 Log("\n");\r
50                 }\r
51 \r
52         Log("\n");\r
53         Log("  Pos G");\r
54         for (unsigned n = 0; n < g_AlphaSize; ++n)\r
55                 Log("     %c", LetterExToChar(n));\r
56         Log("\n");\r
57         Log("  --- -");\r
58         for (unsigned n = 0; n < g_AlphaSize; ++n)\r
59                 Log(" -----");\r
60         Log("\n");\r
61 \r
62         for (unsigned n = 0; n < uLength; ++n)\r
63                 {\r
64                 const ProfPos &PP = Prof[n];\r
65                 Log("%5u", n);\r
66                 if (-1 == PP.m_uResidueGroup)\r
67                         Log(" -", PP.m_uResidueGroup);\r
68                 else\r
69                         Log(" %d", PP.m_uResidueGroup);\r
70 \r
71                 for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)\r
72                         {\r
73                         FCOUNT f = PP.m_fcCounts[uLetter];\r
74                         if (f == 0.0)\r
75                                 Log("      ");\r
76                         else\r
77                                 Log(" %5.3f", f);\r
78                         }\r
79                 if (0 != ptrMSA)\r
80                         {\r
81                         const unsigned uSeqCount = ptrMSA->GetSeqCount();\r
82                         Log("  ");\r
83                         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
84                                 Log("%c", ptrMSA->GetChar(uSeqIndex, n));\r
85                         }\r
86                 Log("\n");\r
87                 }\r
88         }\r
89 #endif  // DOUBLE_AFFINE\r
90 \r
91 #if SINGLE_AFFINE\r
92 void ListProfile(const ProfPos *Prof, unsigned uLength, const MSA *ptrMSA)\r
93         {\r
94         Log("  Pos  Occ     LL     LG     GL     GG     Open  Close\n");\r
95         Log("  ---  ---     --     --     --     --     ----  -----\n");\r
96         for (unsigned n = 0; n < uLength; ++n)\r
97                 {\r
98                 const ProfPos &PP = Prof[n];\r
99                 Log("%5u", n);\r
100                 LogF(PP.m_fOcc);\r
101                 LogF(PP.m_LL);\r
102                 LogF(PP.m_LG);\r
103                 LogF(PP.m_GL);\r
104                 LogF(PP.m_GG);\r
105                 Log("  %5.1f", -PP.m_scoreGapOpen);\r
106                 Log("  %5.1f", -PP.m_scoreGapClose);\r
107                 if (0 != ptrMSA)\r
108                         {\r
109                         const unsigned uSeqCount = ptrMSA->GetSeqCount();\r
110                         Log("  ");\r
111                         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
112                                 Log("%c", ptrMSA->GetChar(uSeqIndex, n));\r
113                         }\r
114                 Log("\n");\r
115                 }\r
116 \r
117         Log("\n");\r
118         Log("  Pos G");\r
119         for (unsigned n = 0; n < g_AlphaSize; ++n)\r
120                 Log("     %c", LetterExToChar(n));\r
121         Log("\n");\r
122         Log("  --- -");\r
123         for (unsigned n = 0; n < g_AlphaSize; ++n)\r
124                 Log(" -----");\r
125         Log("\n");\r
126 \r
127         for (unsigned n = 0; n < uLength; ++n)\r
128                 {\r
129                 const ProfPos &PP = Prof[n];\r
130                 Log("%5u", n);\r
131                 if (-1 == PP.m_uResidueGroup)\r
132                         Log(" -", PP.m_uResidueGroup);\r
133                 else\r
134                         Log(" %d", PP.m_uResidueGroup);\r
135 \r
136                 for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)\r
137                         {\r
138                         FCOUNT f = PP.m_fcCounts[uLetter];\r
139                         if (f == 0.0)\r
140                                 Log("      ");\r
141                         else\r
142                                 Log(" %5.3f", f);\r
143                         }\r
144                 if (0 != ptrMSA)\r
145                         {\r
146                         const unsigned uSeqCount = ptrMSA->GetSeqCount();\r
147                         Log("  ");\r
148                         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
149                                 Log("%c", ptrMSA->GetChar(uSeqIndex, n));\r
150                         }\r
151                 Log("\n");\r
152                 }\r
153         }\r
154 #endif\r
155 \r
156 void SortCounts(const FCOUNT fcCounts[], unsigned SortOrder[])\r
157         {\r
158         static unsigned InitialSortOrder[MAX_ALPHA] =\r
159                 {\r
160                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19\r
161                 };\r
162         memcpy(SortOrder, InitialSortOrder, g_AlphaSize*sizeof(unsigned));\r
163 \r
164         bool bAny = true;\r
165         while (bAny)\r
166                 {\r
167                 bAny = false;\r
168                 for (unsigned n = 0; n < g_AlphaSize - 1; ++n)\r
169                         {\r
170                         unsigned i1 = SortOrder[n];\r
171                         unsigned i2 = SortOrder[n+1];\r
172                         if (fcCounts[i1] < fcCounts[i2])\r
173                                 {\r
174                                 SortOrder[n+1] = i1;\r
175                                 SortOrder[n] = i2;\r
176                                 bAny = true;\r
177                                 }\r
178                         }\r
179                 }\r
180         }\r
181 \r
182 static unsigned AminoGroupFromFCounts(const FCOUNT fcCounts[])\r
183         {\r
184         bool bAny = false;\r
185         unsigned uConsensusResidueGroup = RESIDUE_GROUP_MULTIPLE;\r
186         for (unsigned uLetter = 0; uLetter < 20; ++uLetter)\r
187                 {\r
188                 if (0 == fcCounts[uLetter])\r
189                         continue;\r
190                 const unsigned uResidueGroup = ResidueGroup[uLetter];\r
191                 if (bAny)\r
192                         {\r
193                         if (uResidueGroup != uConsensusResidueGroup)\r
194                                 return RESIDUE_GROUP_MULTIPLE;\r
195                         }\r
196                 else\r
197                         {\r
198                         bAny = true;\r
199                         uConsensusResidueGroup = uResidueGroup;\r
200                         }\r
201                 }\r
202         return uConsensusResidueGroup;\r
203         }\r
204 \r
205 static unsigned NucleoGroupFromFCounts(const FCOUNT fcCounts[])\r
206         {\r
207         bool bAny = false;\r
208         unsigned uConsensusResidueGroup = RESIDUE_GROUP_MULTIPLE;\r
209         for (unsigned uLetter = 0; uLetter < 4; ++uLetter)\r
210                 {\r
211                 if (0 == fcCounts[uLetter])\r
212                         continue;\r
213                 const unsigned uResidueGroup = uLetter;\r
214                 if (bAny)\r
215                         {\r
216                         if (uResidueGroup != uConsensusResidueGroup)\r
217                                 return RESIDUE_GROUP_MULTIPLE;\r
218                         }\r
219                 else\r
220                         {\r
221                         bAny = true;\r
222                         uConsensusResidueGroup = uResidueGroup;\r
223                         }\r
224                 }\r
225         return uConsensusResidueGroup;\r
226         }\r
227 \r
228 unsigned ResidueGroupFromFCounts(const FCOUNT fcCounts[])\r
229         {\r
230         switch (g_Alpha)\r
231                 {\r
232         case ALPHA_Amino:\r
233                 return AminoGroupFromFCounts(fcCounts);\r
234 \r
235         case ALPHA_DNA:\r
236         case ALPHA_RNA:\r
237                 return NucleoGroupFromFCounts(fcCounts);\r
238                 }\r
239         Quit("ResidueGroupFromFCounts: bad alpha");\r
240         return 0;\r
241         }\r
242 \r
243 ProfPos *ProfileFromMSA(const MSA &a)\r
244         {\r
245         const unsigned uSeqCount = a.GetSeqCount();\r
246         const unsigned uColCount = a.GetColCount();\r
247 \r
248 // Yuck -- cast away const (inconsistent design here).\r
249         SetMSAWeightsMuscle((MSA &) a);\r
250 \r
251         ProfPos *Pos = new ProfPos[uColCount];\r
252 \r
253         unsigned uHydrophobicRunLength = 0;\r
254         for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
255                 {\r
256                 ProfPos &PP = Pos[uColIndex];\r
257 \r
258                 PP.m_bAllGaps = a.IsGapColumn(uColIndex);\r
259 \r
260                 FCOUNT fcGapStart;\r
261                 FCOUNT fcGapEnd;\r
262                 FCOUNT fcGapExtend;\r
263                 FCOUNT fOcc;\r
264                 a.GetFractionalWeightedCounts(uColIndex, g_bNormalizeCounts, PP.m_fcCounts,\r
265                   &fcGapStart, &fcGapEnd, &fcGapExtend, &fOcc,\r
266                   &PP.m_LL, &PP.m_LG, &PP.m_GL, &PP.m_GG);\r
267                 PP.m_fOcc = fOcc;\r
268 \r
269                 SortCounts(PP.m_fcCounts, PP.m_uSortOrder);\r
270 \r
271                 PP.m_uResidueGroup = ResidueGroupFromFCounts(PP.m_fcCounts);\r
272 \r
273                 for (unsigned i = 0; i < g_AlphaSize; ++i)\r
274                         {\r
275                         SCORE scoreSum = 0;\r
276                         for (unsigned j = 0; j < g_AlphaSize; ++j)\r
277                                 scoreSum += PP.m_fcCounts[j]*(*g_ptrScoreMatrix)[i][j];\r
278                         PP.m_AAScores[i] = scoreSum;\r
279                         }\r
280 \r
281                 SCORE sStartOcc = (SCORE) (1.0 - fcGapStart);\r
282                 SCORE sEndOcc = (SCORE) (1.0 - fcGapEnd);\r
283 \r
284                 PP.m_fcStartOcc = sStartOcc;\r
285                 PP.m_fcEndOcc = sEndOcc;\r
286 \r
287                 PP.m_scoreGapOpen = sStartOcc*g_scoreGapOpen/2;\r
288                 PP.m_scoreGapClose = sEndOcc*g_scoreGapOpen/2;\r
289 #if     DOUBLE_AFFINE\r
290                 PP.m_scoreGapOpen2 = sStartOcc*g_scoreGapOpen2/2;\r
291                 PP.m_scoreGapClose2 = sEndOcc*g_scoreGapOpen2/2;\r
292 #endif\r
293 //              PP.m_scoreGapExtend = (SCORE) ((1.0 - fcGapExtend)*scoreGapExtend);\r
294 \r
295 #if     PAF\r
296                 if (ALHPA_Amino == g_Alpha && sStartOcc > 0.5)\r
297                         {\r
298                         extern SCORE PAFactor(const FCOUNT fcCounts[]);\r
299                         SCORE paf = PAFactor(PP.m_fcCounts);\r
300                         PP.m_scoreGapOpen *= paf;\r
301                         PP.m_scoreGapClose *= paf;\r
302                         }\r
303 #endif\r
304                 }\r
305 \r
306 #if     HYDRO\r
307         if (ALPHA_Amino == g_Alpha)\r
308                 Hydro(Pos, uColCount);\r
309 #endif\r
310 \r
311 #if     TRACE\r
312         {\r
313         Log("ProfileFromMSA\n");\r
314         ListProfile(Pos, uColCount, &a);\r
315         }\r
316 #endif\r
317         return Pos;\r
318         }\r