Dundee specific settings from JWS2 branch
[jabaws.git] / binaries / src / muscle / msa2.cpp
1 #include "muscle.h"\r
2 #include "msa.h"\r
3 #include "seqvect.h"\r
4 #include "profile.h"\r
5 #include "tree.h"\r
6 \r
7 // These global variables are a hack to allow the tree\r
8 // dependent iteration code to communicate the edge\r
9 // used to divide the tree. The three-way weighting\r
10 // scheme needs to know this edge in order to compute\r
11 // sequence weights.\r
12 static const Tree *g_ptrMuscleTree = 0;\r
13 unsigned g_uTreeSplitNode1 = NULL_NEIGHBOR;\r
14 unsigned g_uTreeSplitNode2 = NULL_NEIGHBOR;\r
15 \r
16 void MSA::GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize,\r
17   FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd,\r
18   FCOUNT *ptrfcGapExtend, FCOUNT *ptrfOcc,\r
19   FCOUNT *ptrfcLL, FCOUNT *ptrfcLG, FCOUNT *ptrfcGL, FCOUNT *ptrfcGG) const\r
20         {\r
21         const unsigned uSeqCount = GetSeqCount();\r
22         const unsigned uColCount = GetColCount();\r
23 \r
24         memset(fcCounts, 0, g_AlphaSize*sizeof(FCOUNT));\r
25         WEIGHT wTotal = 0;\r
26         FCOUNT fGap = 0;\r
27         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
28                 {\r
29                 const WEIGHT w = GetSeqWeight(uSeqIndex);\r
30                 if (IsGap(uSeqIndex, uColIndex))\r
31                         {\r
32                         fGap += w;\r
33                         continue;\r
34                         }\r
35                 else if (IsWildcard(uSeqIndex, uColIndex))\r
36                         {\r
37                         const unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex);\r
38                         switch (g_Alpha)\r
39                                 {\r
40                         case ALPHA_Amino:\r
41                                 switch (uLetter)\r
42                                         {\r
43                                 case AX_B:              // D or N\r
44                                         fcCounts[AX_D] += w/2;\r
45                                         fcCounts[AX_N] += w/2;\r
46                                         break;\r
47                                 case AX_Z:              // E or Q\r
48                                         fcCounts[AX_E] += w/2;\r
49                                         fcCounts[AX_Q] += w/2;\r
50                                         break;\r
51                                 default:                // any\r
52                                         {\r
53                                         const FCOUNT f = w/20;\r
54                                         for (unsigned uLetter = 0; uLetter < 20; ++uLetter)\r
55                                                 fcCounts[uLetter] += f;\r
56                                         break;\r
57                                         }\r
58                                         }\r
59                                 break;\r
60 \r
61                         case ALPHA_DNA:\r
62                         case ALPHA_RNA:\r
63                                 switch (uLetter)\r
64                                         {\r
65                                 case AX_R:      // G or A\r
66                                         fcCounts[NX_G] += w/2;\r
67                                         fcCounts[NX_A] += w/2;\r
68                                         break;\r
69                                 case AX_Y:      // C or T/U\r
70                                         fcCounts[NX_C] += w/2;\r
71                                         fcCounts[NX_T] += w/2;\r
72                                         break;\r
73                                 default:        // any\r
74                                         const FCOUNT f = w/20;\r
75                                         for (unsigned uLetter = 0; uLetter < 4; ++uLetter)\r
76                                                 fcCounts[uLetter] += f;\r
77                                         break;\r
78                                         }\r
79                                 break;\r
80 \r
81                         default:\r
82                                 Quit("Alphabet %d not supported", g_Alpha);\r
83                                 }\r
84                         continue;\r
85                         }\r
86                 unsigned uLetter = GetLetter(uSeqIndex, uColIndex);\r
87                 fcCounts[uLetter] += w;\r
88                 wTotal += w;\r
89                 }\r
90         *ptrfOcc = (float) (1.0 - fGap);\r
91 \r
92         if (bNormalize && wTotal > 0)\r
93                 {\r
94                 if (wTotal > 1.001)\r
95                         Quit("wTotal=%g\n", wTotal);\r
96                 for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)\r
97                         fcCounts[uLetter] /= wTotal;\r
98 //              AssertNormalized(fcCounts);\r
99                 }\r
100 \r
101         FCOUNT fcStartCount = 0;\r
102         if (uColIndex == 0)\r
103                 {\r
104                 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
105                         if (IsGap(uSeqIndex, uColIndex))\r
106                                 fcStartCount += GetSeqWeight(uSeqIndex);\r
107                 }\r
108         else\r
109                 {\r
110                 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
111                         if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex - 1))\r
112                                 fcStartCount += GetSeqWeight(uSeqIndex);\r
113                 }\r
114 \r
115         FCOUNT fcEndCount = 0;\r
116         if (uColCount - 1 == uColIndex)\r
117                 {\r
118                 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
119                         if (IsGap(uSeqIndex, uColIndex))\r
120                                 fcEndCount += GetSeqWeight(uSeqIndex);\r
121                 }\r
122         else\r
123                 {\r
124                 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
125                         if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex + 1))\r
126                                 fcEndCount += GetSeqWeight(uSeqIndex);\r
127                 }\r
128 \r
129         FCOUNT LL = 0;\r
130         FCOUNT LG = 0;\r
131         FCOUNT GL = 0;\r
132         FCOUNT GG = 0;\r
133         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
134                 {\r
135                 WEIGHT w = GetSeqWeight(uSeqIndex);\r
136                 bool bLetterHere = !IsGap(uSeqIndex, uColIndex);\r
137                 bool bLetterPrev = (uColIndex == 0 || !IsGap(uSeqIndex, uColIndex - 1));\r
138                 if (bLetterHere)\r
139                         {\r
140                         if (bLetterPrev)\r
141                                 LL += w;\r
142                         else\r
143                                 GL += w;\r
144                         }\r
145                 else\r
146                         {\r
147                         if (bLetterPrev)\r
148                                 LG += w;\r
149                         else\r
150                                 GG += w;\r
151                         }\r
152                 }\r
153 \r
154         FCOUNT fcExtendCount = 0;\r
155         if (uColIndex > 0 && uColIndex < GetColCount() - 1)\r
156                 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
157                         if (IsGap(uSeqIndex, uColIndex) && IsGap(uSeqIndex, uColIndex - 1) &&\r
158                           IsGap(uSeqIndex, uColIndex + 1))\r
159                                 fcExtendCount += GetSeqWeight(uSeqIndex);\r
160 \r
161         *ptrfcLL = LL;\r
162         *ptrfcLG = LG;\r
163         *ptrfcGL = GL;\r
164         *ptrfcGG = GG;\r
165         *ptrfcGapStart = fcStartCount;\r
166         *ptrfcGapEnd = fcEndCount;\r
167         *ptrfcGapExtend = fcExtendCount;\r
168         }\r
169 \r
170 // Return true if the given column has no gaps and all\r
171 // its residues are in the same biochemical group.\r
172 bool MSAColIsConservative(const MSA &msa, unsigned uColIndex)\r
173         {\r
174         extern unsigned ResidueGroup[];\r
175 \r
176         const unsigned uSeqCount = msa.GetColCount();\r
177         if (0 == uSeqCount)\r
178                 Quit("MSAColIsConservative: empty alignment");\r
179 \r
180         if (msa.IsGap(0, uColIndex))\r
181                 return false;\r
182 \r
183         unsigned uLetter = msa.GetLetterEx(0, uColIndex);\r
184         const unsigned uGroup = ResidueGroup[uLetter];\r
185 \r
186         for (unsigned uSeqIndex = 1; uSeqIndex < uSeqCount; ++uSeqIndex)\r
187                 {\r
188                 if (msa.IsGap(uSeqIndex, uColIndex))\r
189                         return false;\r
190                 uLetter = msa.GetLetter(uSeqIndex, uColIndex);\r
191                 if (ResidueGroup[uLetter] != uGroup)\r
192                         return false;\r
193                 }\r
194         return true;\r
195         }\r
196 \r
197 void MSAFromSeqRange(const MSA &msaIn, unsigned uFromSeqIndex, unsigned uSeqCount,\r
198   MSA &msaOut)\r
199         {\r
200         const unsigned uColCount = msaIn.GetColCount();\r
201         msaOut.SetSize(uSeqCount, uColCount);\r
202 \r
203         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
204                 {\r
205                 const char *ptrName = msaIn.GetSeqName(uFromSeqIndex + uSeqIndex);\r
206                 msaOut.SetSeqName(uSeqIndex, ptrName);\r
207 \r
208                 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
209                         {\r
210                         const char c = msaIn.GetChar(uFromSeqIndex + uSeqIndex, uColIndex);\r
211                         msaOut.SetChar(uSeqIndex, uColIndex, c);\r
212                         }\r
213                 }\r
214         }\r
215 \r
216 void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount,\r
217   MSA &msaOut)\r
218         {\r
219         const unsigned uSeqCount = msaIn.GetSeqCount();\r
220         const unsigned uInColCount = msaIn.GetColCount();\r
221 \r
222         if (uFromColIndex + uColCount - 1 > uInColCount)\r
223                 Quit("MSAFromColRange, out of bounds");\r
224 \r
225         msaOut.SetSize(uSeqCount, uColCount);\r
226 \r
227         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
228                 {\r
229                 const char *ptrName = msaIn.GetSeqName(uSeqIndex);\r
230                 unsigned uId = msaIn.GetSeqId(uSeqIndex);\r
231                 msaOut.SetSeqName(uSeqIndex, ptrName);\r
232                 msaOut.SetSeqId(uSeqIndex, uId);\r
233 \r
234                 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
235                         {\r
236                         const char c = msaIn.GetChar(uSeqIndex, uFromColIndex + uColIndex);\r
237                         msaOut.SetChar(uSeqIndex, uColIndex, c);\r
238                         }\r
239                 }\r
240         }\r
241 \r
242 void SeqVectFromMSA(const MSA &msa, SeqVect &v)\r
243         {\r
244         v.Clear();\r
245         const unsigned uSeqCount = msa.GetSeqCount();\r
246         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
247                 {\r
248                 Seq s;\r
249                 msa.GetSeq(uSeqIndex, s);\r
250 \r
251                 s.StripGaps();\r
252                 //if (0 == s.Length())\r
253                 //      continue;\r
254 \r
255                 const char *ptrName = msa.GetSeqName(uSeqIndex);\r
256                 s.SetName(ptrName);\r
257 \r
258                 v.AppendSeq(s);\r
259                 }\r
260         }\r
261 \r
262 void DeleteGappedCols(MSA &msa)\r
263         {\r
264         unsigned uColIndex = 0;\r
265         for (;;)\r
266                 {\r
267                 if (uColIndex >= msa.GetColCount())\r
268                         break;\r
269                 if (msa.IsGapColumn(uColIndex))\r
270                         msa.DeleteCol(uColIndex);\r
271                 else\r
272                         ++uColIndex;\r
273                 }\r
274         }\r
275 \r
276 void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned uSeqCount,\r
277   MSA &msaOut)\r
278         {\r
279         const unsigned uColCount = msaIn.GetColCount();\r
280         msaOut.SetSize(uSeqCount, uColCount);\r
281         for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uSeqCount; ++uSeqIndexOut)\r
282                 {\r
283                 unsigned uSeqIndexIn = uSeqIndexes[uSeqIndexOut];\r
284                 const char *ptrName = msaIn.GetSeqName(uSeqIndexIn);\r
285                 unsigned uId = msaIn.GetSeqId(uSeqIndexIn);\r
286                 msaOut.SetSeqName(uSeqIndexOut, ptrName);\r
287                 msaOut.SetSeqId(uSeqIndexOut, uId);\r
288                 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
289                         {\r
290                         const char c = msaIn.GetChar(uSeqIndexIn, uColIndex);\r
291                         msaOut.SetChar(uSeqIndexOut, uColIndex, c);\r
292                         }\r
293                 }\r
294         }\r
295 \r
296 void AssertMSAEqIgnoreCaseAndGaps(const MSA &msa1, const MSA &msa2)\r
297         {\r
298         const unsigned uSeqCount1 = msa1.GetSeqCount();\r
299         const unsigned uSeqCount2 = msa2.GetSeqCount();\r
300         if (uSeqCount1 != uSeqCount2)\r
301                 Quit("Seq count differs");\r
302 \r
303         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex)\r
304                 {\r
305                 Seq seq1;\r
306                 msa1.GetSeq(uSeqIndex, seq1);\r
307 \r
308                 unsigned uId = msa1.GetSeqId(uSeqIndex);\r
309                 unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);\r
310 \r
311                 Seq seq2;\r
312                 msa2.GetSeq(uSeqIndex2, seq2);\r
313 \r
314                 if (!seq1.EqIgnoreCaseAndGaps(seq2))\r
315                         {\r
316                         Log("Input:\n");\r
317                         seq1.LogMe();\r
318                         Log("Output:\n");\r
319                         seq2.LogMe();\r
320                         Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex));\r
321                         }\r
322                 }\r
323         }\r
324 \r
325 void AssertMSAEq(const MSA &msa1, const MSA &msa2)\r
326         {\r
327         const unsigned uSeqCount1 = msa1.GetSeqCount();\r
328         const unsigned uSeqCount2 = msa2.GetSeqCount();\r
329         if (uSeqCount1 != uSeqCount2)\r
330                 Quit("Seq count differs");\r
331 \r
332         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex)\r
333                 {\r
334                 Seq seq1;\r
335                 msa1.GetSeq(uSeqIndex, seq1);\r
336 \r
337                 unsigned uId = msa1.GetSeqId(uSeqIndex);\r
338                 unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);\r
339 \r
340                 Seq seq2;\r
341                 msa2.GetSeq(uSeqIndex2, seq2);\r
342 \r
343                 if (!seq1.Eq(seq2))\r
344                         {\r
345                         Log("Input:\n");\r
346                         seq1.LogMe();\r
347                         Log("Output:\n");\r
348                         seq2.LogMe();\r
349                         Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex));\r
350                         }\r
351                 }\r
352         }\r
353 \r
354 void SetMSAWeightsMuscle(MSA &msa)\r
355         {\r
356         SEQWEIGHT Method = GetSeqWeightMethod();\r
357         switch (Method)\r
358                 {\r
359         case SEQWEIGHT_None:\r
360                 msa.SetUniformWeights();\r
361                 return;\r
362 \r
363         case SEQWEIGHT_Henikoff:\r
364                 msa.SetHenikoffWeights();\r
365                 return;\r
366 \r
367         case SEQWEIGHT_HenikoffPB:\r
368                 msa.SetHenikoffWeightsPB();\r
369                 return;\r
370 \r
371         case SEQWEIGHT_GSC:\r
372                 msa.SetGSCWeights();\r
373                 return;\r
374 \r
375         case SEQWEIGHT_ClustalW:\r
376                 SetClustalWWeightsMuscle(msa);\r
377                 return;\r
378         \r
379         case SEQWEIGHT_ThreeWay:\r
380                 SetThreeWayWeightsMuscle(msa);\r
381                 return;\r
382                 }\r
383         Quit("SetMSAWeightsMuscle, Invalid method=%d", Method);\r
384         }\r
385 \r
386 static WEIGHT *g_MuscleWeights;\r
387 static unsigned g_uMuscleIdCount;\r
388 \r
389 WEIGHT GetMuscleSeqWeightById(unsigned uId)\r
390         {\r
391         if (0 == g_MuscleWeights)\r
392                 Quit("g_MuscleWeights = 0");\r
393         if (uId >= g_uMuscleIdCount)\r
394                 Quit("GetMuscleSeqWeightById(%u): count=%u",\r
395                   uId, g_uMuscleIdCount);\r
396 \r
397         return g_MuscleWeights[uId];\r
398         }\r
399 \r
400 void SetMuscleTree(const Tree &tree)\r
401         {\r
402         g_ptrMuscleTree = &tree;\r
403 \r
404         if (SEQWEIGHT_ClustalW != GetSeqWeightMethod())\r
405                 return;\r
406 \r
407         delete[] g_MuscleWeights;\r
408 \r
409         const unsigned uLeafCount = tree.GetLeafCount();\r
410         g_uMuscleIdCount = uLeafCount;\r
411         g_MuscleWeights = new WEIGHT[uLeafCount];\r
412         CalcClustalWWeights(tree, g_MuscleWeights);\r
413         }\r
414 \r
415 void SetClustalWWeightsMuscle(MSA &msa)\r
416         {\r
417         if (0 == g_MuscleWeights)\r
418                 Quit("g_MuscleWeights = 0");\r
419         const unsigned uSeqCount = msa.GetSeqCount();\r
420         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
421                 {\r
422                 const unsigned uId = msa.GetSeqId(uSeqIndex);\r
423                 if (uId >= g_uMuscleIdCount)\r
424                         Quit("SetClustalWWeightsMuscle: id out of range");\r
425                 msa.SetSeqWeight(uSeqIndex, g_MuscleWeights[uId]);\r
426                 }\r
427         msa.NormalizeWeights((WEIGHT) 1.0);\r
428         }\r
429 \r
430 #define LOCAL_VERBOSE   0\r
431 \r
432 void SetThreeWayWeightsMuscle(MSA &msa)\r
433         {\r
434         if (NULL_NEIGHBOR == g_uTreeSplitNode1 || NULL_NEIGHBOR == g_uTreeSplitNode2)\r
435                 {\r
436                 msa.SetHenikoffWeightsPB();\r
437                 return;\r
438                 }\r
439 \r
440         const unsigned uMuscleSeqCount = g_ptrMuscleTree->GetLeafCount();\r
441         WEIGHT *Weights = new WEIGHT[uMuscleSeqCount];\r
442 \r
443         CalcThreeWayWeights(*g_ptrMuscleTree, g_uTreeSplitNode1, g_uTreeSplitNode2,\r
444           Weights);\r
445 \r
446         const unsigned uMSASeqCount = msa.GetSeqCount();\r
447         for (unsigned uSeqIndex = 0; uSeqIndex < uMSASeqCount; ++uSeqIndex)\r
448                 {\r
449                 const unsigned uId = msa.GetSeqId(uSeqIndex);\r
450                 if (uId >= uMuscleSeqCount)\r
451                         Quit("SetThreeWayWeightsMuscle: id out of range");\r
452                 msa.SetSeqWeight(uSeqIndex, Weights[uId]);\r
453                 }\r
454 #if     LOCAL_VERBOSE\r
455         {\r
456         Log("SetThreeWayWeightsMuscle\n");\r
457         for (unsigned n = 0; n < uMSASeqCount; ++n)\r
458                 {\r
459                 const unsigned uId = msa.GetSeqId(n);\r
460                 Log("%20.20s %6.3f\n", msa.GetSeqName(n), Weights[uId]);\r
461                 }\r
462         }\r
463 #endif\r
464         msa.NormalizeWeights((WEIGHT) 1.0);\r
465 \r
466         delete[] Weights;\r
467         }\r
468 \r
469 // Append msa2 at the end of msa1\r
470 void MSAAppend(MSA &msa1, const MSA &msa2)\r
471         {\r
472         const unsigned uSeqCount = msa1.GetSeqCount();\r
473 \r
474         const unsigned uColCount1 = msa1.GetColCount();\r
475         const unsigned uColCount2 = msa2.GetColCount();\r
476         const unsigned uColCountCat = uColCount1 + uColCount2;\r
477 \r
478         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
479                 {\r
480                 unsigned uId = msa1.GetSeqId(uSeqIndex);\r
481                 unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);\r
482                 for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)\r
483                         {\r
484                         const char c = msa2.GetChar(uSeqIndex2, uColIndex);\r
485                         msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, c);\r
486                         }\r
487                 }\r
488         }\r
489 \r
490 // "Catenate" two MSAs (by bad analogy with UNIX cat command).\r
491 // msa1 and msa2 must have same sequence names, but possibly\r
492 // in a different order.\r
493 // msaCat is the combined alignment produce by appending\r
494 // sequences in msa2 to sequences in msa1.\r
495 void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat)\r
496         {\r
497         const unsigned uSeqCount = msa1.GetSeqCount();\r
498 \r
499         const unsigned uColCount1 = msa1.GetColCount();\r
500         const unsigned uColCount2 = msa2.GetColCount();\r
501         const unsigned uColCountCat = uColCount1 + uColCount2;\r
502 \r
503         msaCat.SetSize(uSeqCount, uColCountCat);\r
504 \r
505         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
506                 {\r
507                 for (unsigned uColIndex = 0; uColIndex < uColCount1; ++uColIndex)\r
508                         {\r
509                         const char c = msa1.GetChar(uSeqIndex, uColIndex);\r
510                         msaCat.SetChar(uSeqIndex, uColIndex, c);\r
511                         }\r
512 \r
513                 const char *ptrSeqName = msa1.GetSeqName(uSeqIndex);\r
514                 unsigned uSeqIndex2;\r
515                 msaCat.SetSeqName(uSeqIndex, ptrSeqName);\r
516                 bool bFound = msa2.GetSeqIndex(ptrSeqName, &uSeqIndex2);\r
517                 if (bFound)\r
518                         {\r
519                         for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)\r
520                                 {\r
521                                 const char c = msa2.GetChar(uSeqIndex2, uColIndex);\r
522                                 msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, c);\r
523                                 }\r
524                         }\r
525                 else\r
526                         {\r
527                         for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)\r
528                                 msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, '-');\r
529                         }\r
530                 }\r
531         }\r