7 // These global variables are a hack to allow the tree
\r
8 // dependent iteration code to communicate the edge
\r
9 // used to divide the tree. The three-way weighting
\r
10 // scheme needs to know this edge in order to compute
\r
11 // sequence weights.
\r
12 static const Tree *g_ptrMuscleTree = 0;
\r
13 unsigned g_uTreeSplitNode1 = NULL_NEIGHBOR;
\r
14 unsigned g_uTreeSplitNode2 = NULL_NEIGHBOR;
\r
16 void MSA::GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize,
\r
17 FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd,
\r
18 FCOUNT *ptrfcGapExtend, FCOUNT *ptrfOcc,
\r
19 FCOUNT *ptrfcLL, FCOUNT *ptrfcLG, FCOUNT *ptrfcGL, FCOUNT *ptrfcGG) const
\r
21 const unsigned uSeqCount = GetSeqCount();
\r
22 const unsigned uColCount = GetColCount();
\r
24 memset(fcCounts, 0, g_AlphaSize*sizeof(FCOUNT));
\r
27 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
29 const WEIGHT w = GetSeqWeight(uSeqIndex);
\r
30 if (IsGap(uSeqIndex, uColIndex))
\r
35 else if (IsWildcard(uSeqIndex, uColIndex))
\r
37 const unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex);
\r
43 case AX_B: // D or N
\r
44 fcCounts[AX_D] += w/2;
\r
45 fcCounts[AX_N] += w/2;
\r
47 case AX_Z: // E or Q
\r
48 fcCounts[AX_E] += w/2;
\r
49 fcCounts[AX_Q] += w/2;
\r
53 const FCOUNT f = w/20;
\r
54 for (unsigned uLetter = 0; uLetter < 20; ++uLetter)
\r
55 fcCounts[uLetter] += f;
\r
65 case AX_R: // G or A
\r
66 fcCounts[NX_G] += w/2;
\r
67 fcCounts[NX_A] += w/2;
\r
69 case AX_Y: // C or T/U
\r
70 fcCounts[NX_C] += w/2;
\r
71 fcCounts[NX_T] += w/2;
\r
74 const FCOUNT f = w/20;
\r
75 for (unsigned uLetter = 0; uLetter < 4; ++uLetter)
\r
76 fcCounts[uLetter] += f;
\r
82 Quit("Alphabet %d not supported", g_Alpha);
\r
86 unsigned uLetter = GetLetter(uSeqIndex, uColIndex);
\r
87 fcCounts[uLetter] += w;
\r
90 *ptrfOcc = (float) (1.0 - fGap);
\r
92 if (bNormalize && wTotal > 0)
\r
95 Quit("wTotal=%g\n", wTotal);
\r
96 for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)
\r
97 fcCounts[uLetter] /= wTotal;
\r
98 // AssertNormalized(fcCounts);
\r
101 FCOUNT fcStartCount = 0;
\r
102 if (uColIndex == 0)
\r
104 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
105 if (IsGap(uSeqIndex, uColIndex))
\r
106 fcStartCount += GetSeqWeight(uSeqIndex);
\r
110 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
111 if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex - 1))
\r
112 fcStartCount += GetSeqWeight(uSeqIndex);
\r
115 FCOUNT fcEndCount = 0;
\r
116 if (uColCount - 1 == uColIndex)
\r
118 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
119 if (IsGap(uSeqIndex, uColIndex))
\r
120 fcEndCount += GetSeqWeight(uSeqIndex);
\r
124 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
125 if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex + 1))
\r
126 fcEndCount += GetSeqWeight(uSeqIndex);
\r
133 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
135 WEIGHT w = GetSeqWeight(uSeqIndex);
\r
136 bool bLetterHere = !IsGap(uSeqIndex, uColIndex);
\r
137 bool bLetterPrev = (uColIndex == 0 || !IsGap(uSeqIndex, uColIndex - 1));
\r
154 FCOUNT fcExtendCount = 0;
\r
155 if (uColIndex > 0 && uColIndex < GetColCount() - 1)
\r
156 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
157 if (IsGap(uSeqIndex, uColIndex) && IsGap(uSeqIndex, uColIndex - 1) &&
\r
158 IsGap(uSeqIndex, uColIndex + 1))
\r
159 fcExtendCount += GetSeqWeight(uSeqIndex);
\r
165 *ptrfcGapStart = fcStartCount;
\r
166 *ptrfcGapEnd = fcEndCount;
\r
167 *ptrfcGapExtend = fcExtendCount;
\r
170 // Return true if the given column has no gaps and all
\r
171 // its residues are in the same biochemical group.
\r
172 bool MSAColIsConservative(const MSA &msa, unsigned uColIndex)
\r
174 extern unsigned ResidueGroup[];
\r
176 const unsigned uSeqCount = msa.GetColCount();
\r
177 if (0 == uSeqCount)
\r
178 Quit("MSAColIsConservative: empty alignment");
\r
180 if (msa.IsGap(0, uColIndex))
\r
183 unsigned uLetter = msa.GetLetterEx(0, uColIndex);
\r
184 const unsigned uGroup = ResidueGroup[uLetter];
\r
186 for (unsigned uSeqIndex = 1; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
188 if (msa.IsGap(uSeqIndex, uColIndex))
\r
190 uLetter = msa.GetLetter(uSeqIndex, uColIndex);
\r
191 if (ResidueGroup[uLetter] != uGroup)
\r
197 void MSAFromSeqRange(const MSA &msaIn, unsigned uFromSeqIndex, unsigned uSeqCount,
\r
200 const unsigned uColCount = msaIn.GetColCount();
\r
201 msaOut.SetSize(uSeqCount, uColCount);
\r
203 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
205 const char *ptrName = msaIn.GetSeqName(uFromSeqIndex + uSeqIndex);
\r
206 msaOut.SetSeqName(uSeqIndex, ptrName);
\r
208 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
\r
210 const char c = msaIn.GetChar(uFromSeqIndex + uSeqIndex, uColIndex);
\r
211 msaOut.SetChar(uSeqIndex, uColIndex, c);
\r
216 void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount,
\r
219 const unsigned uSeqCount = msaIn.GetSeqCount();
\r
220 const unsigned uInColCount = msaIn.GetColCount();
\r
222 if (uFromColIndex + uColCount - 1 > uInColCount)
\r
223 Quit("MSAFromColRange, out of bounds");
\r
225 msaOut.SetSize(uSeqCount, uColCount);
\r
227 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
229 const char *ptrName = msaIn.GetSeqName(uSeqIndex);
\r
230 unsigned uId = msaIn.GetSeqId(uSeqIndex);
\r
231 msaOut.SetSeqName(uSeqIndex, ptrName);
\r
232 msaOut.SetSeqId(uSeqIndex, uId);
\r
234 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
\r
236 const char c = msaIn.GetChar(uSeqIndex, uFromColIndex + uColIndex);
\r
237 msaOut.SetChar(uSeqIndex, uColIndex, c);
\r
242 void SeqVectFromMSA(const MSA &msa, SeqVect &v)
\r
245 const unsigned uSeqCount = msa.GetSeqCount();
\r
246 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
249 msa.GetSeq(uSeqIndex, s);
\r
252 //if (0 == s.Length())
\r
255 const char *ptrName = msa.GetSeqName(uSeqIndex);
\r
256 s.SetName(ptrName);
\r
262 void DeleteGappedCols(MSA &msa)
\r
264 unsigned uColIndex = 0;
\r
267 if (uColIndex >= msa.GetColCount())
\r
269 if (msa.IsGapColumn(uColIndex))
\r
270 msa.DeleteCol(uColIndex);
\r
276 void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned uSeqCount,
\r
279 const unsigned uColCount = msaIn.GetColCount();
\r
280 msaOut.SetSize(uSeqCount, uColCount);
\r
281 for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uSeqCount; ++uSeqIndexOut)
\r
283 unsigned uSeqIndexIn = uSeqIndexes[uSeqIndexOut];
\r
284 const char *ptrName = msaIn.GetSeqName(uSeqIndexIn);
\r
285 unsigned uId = msaIn.GetSeqId(uSeqIndexIn);
\r
286 msaOut.SetSeqName(uSeqIndexOut, ptrName);
\r
287 msaOut.SetSeqId(uSeqIndexOut, uId);
\r
288 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
\r
290 const char c = msaIn.GetChar(uSeqIndexIn, uColIndex);
\r
291 msaOut.SetChar(uSeqIndexOut, uColIndex, c);
\r
296 void AssertMSAEqIgnoreCaseAndGaps(const MSA &msa1, const MSA &msa2)
\r
298 const unsigned uSeqCount1 = msa1.GetSeqCount();
\r
299 const unsigned uSeqCount2 = msa2.GetSeqCount();
\r
300 if (uSeqCount1 != uSeqCount2)
\r
301 Quit("Seq count differs");
\r
303 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex)
\r
306 msa1.GetSeq(uSeqIndex, seq1);
\r
308 unsigned uId = msa1.GetSeqId(uSeqIndex);
\r
309 unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);
\r
312 msa2.GetSeq(uSeqIndex2, seq2);
\r
314 if (!seq1.EqIgnoreCaseAndGaps(seq2))
\r
320 Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex));
\r
325 void AssertMSAEq(const MSA &msa1, const MSA &msa2)
\r
327 const unsigned uSeqCount1 = msa1.GetSeqCount();
\r
328 const unsigned uSeqCount2 = msa2.GetSeqCount();
\r
329 if (uSeqCount1 != uSeqCount2)
\r
330 Quit("Seq count differs");
\r
332 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex)
\r
335 msa1.GetSeq(uSeqIndex, seq1);
\r
337 unsigned uId = msa1.GetSeqId(uSeqIndex);
\r
338 unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);
\r
341 msa2.GetSeq(uSeqIndex2, seq2);
\r
343 if (!seq1.Eq(seq2))
\r
349 Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex));
\r
354 void SetMSAWeightsMuscle(MSA &msa)
\r
356 SEQWEIGHT Method = GetSeqWeightMethod();
\r
359 case SEQWEIGHT_None:
\r
360 msa.SetUniformWeights();
\r
363 case SEQWEIGHT_Henikoff:
\r
364 msa.SetHenikoffWeights();
\r
367 case SEQWEIGHT_HenikoffPB:
\r
368 msa.SetHenikoffWeightsPB();
\r
371 case SEQWEIGHT_GSC:
\r
372 msa.SetGSCWeights();
\r
375 case SEQWEIGHT_ClustalW:
\r
376 SetClustalWWeightsMuscle(msa);
\r
379 case SEQWEIGHT_ThreeWay:
\r
380 SetThreeWayWeightsMuscle(msa);
\r
383 Quit("SetMSAWeightsMuscle, Invalid method=%d", Method);
\r
386 static WEIGHT *g_MuscleWeights;
\r
387 static unsigned g_uMuscleIdCount;
\r
389 WEIGHT GetMuscleSeqWeightById(unsigned uId)
\r
391 if (0 == g_MuscleWeights)
\r
392 Quit("g_MuscleWeights = 0");
\r
393 if (uId >= g_uMuscleIdCount)
\r
394 Quit("GetMuscleSeqWeightById(%u): count=%u",
\r
395 uId, g_uMuscleIdCount);
\r
397 return g_MuscleWeights[uId];
\r
400 void SetMuscleTree(const Tree &tree)
\r
402 g_ptrMuscleTree = &tree;
\r
404 if (SEQWEIGHT_ClustalW != GetSeqWeightMethod())
\r
407 delete[] g_MuscleWeights;
\r
409 const unsigned uLeafCount = tree.GetLeafCount();
\r
410 g_uMuscleIdCount = uLeafCount;
\r
411 g_MuscleWeights = new WEIGHT[uLeafCount];
\r
412 CalcClustalWWeights(tree, g_MuscleWeights);
\r
415 void SetClustalWWeightsMuscle(MSA &msa)
\r
417 if (0 == g_MuscleWeights)
\r
418 Quit("g_MuscleWeights = 0");
\r
419 const unsigned uSeqCount = msa.GetSeqCount();
\r
420 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
422 const unsigned uId = msa.GetSeqId(uSeqIndex);
\r
423 if (uId >= g_uMuscleIdCount)
\r
424 Quit("SetClustalWWeightsMuscle: id out of range");
\r
425 msa.SetSeqWeight(uSeqIndex, g_MuscleWeights[uId]);
\r
427 msa.NormalizeWeights((WEIGHT) 1.0);
\r
430 #define LOCAL_VERBOSE 0
\r
432 void SetThreeWayWeightsMuscle(MSA &msa)
\r
434 if (NULL_NEIGHBOR == g_uTreeSplitNode1 || NULL_NEIGHBOR == g_uTreeSplitNode2)
\r
436 msa.SetHenikoffWeightsPB();
\r
440 const unsigned uMuscleSeqCount = g_ptrMuscleTree->GetLeafCount();
\r
441 WEIGHT *Weights = new WEIGHT[uMuscleSeqCount];
\r
443 CalcThreeWayWeights(*g_ptrMuscleTree, g_uTreeSplitNode1, g_uTreeSplitNode2,
\r
446 const unsigned uMSASeqCount = msa.GetSeqCount();
\r
447 for (unsigned uSeqIndex = 0; uSeqIndex < uMSASeqCount; ++uSeqIndex)
\r
449 const unsigned uId = msa.GetSeqId(uSeqIndex);
\r
450 if (uId >= uMuscleSeqCount)
\r
451 Quit("SetThreeWayWeightsMuscle: id out of range");
\r
452 msa.SetSeqWeight(uSeqIndex, Weights[uId]);
\r
456 Log("SetThreeWayWeightsMuscle\n");
\r
457 for (unsigned n = 0; n < uMSASeqCount; ++n)
\r
459 const unsigned uId = msa.GetSeqId(n);
\r
460 Log("%20.20s %6.3f\n", msa.GetSeqName(n), Weights[uId]);
\r
464 msa.NormalizeWeights((WEIGHT) 1.0);
\r
469 // Append msa2 at the end of msa1
\r
470 void MSAAppend(MSA &msa1, const MSA &msa2)
\r
472 const unsigned uSeqCount = msa1.GetSeqCount();
\r
474 const unsigned uColCount1 = msa1.GetColCount();
\r
475 const unsigned uColCount2 = msa2.GetColCount();
\r
476 const unsigned uColCountCat = uColCount1 + uColCount2;
\r
478 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
480 unsigned uId = msa1.GetSeqId(uSeqIndex);
\r
481 unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);
\r
482 for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)
\r
484 const char c = msa2.GetChar(uSeqIndex2, uColIndex);
\r
485 msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, c);
\r
490 // "Catenate" two MSAs (by bad analogy with UNIX cat command).
\r
491 // msa1 and msa2 must have same sequence names, but possibly
\r
492 // in a different order.
\r
493 // msaCat is the combined alignment produce by appending
\r
494 // sequences in msa2 to sequences in msa1.
\r
495 void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat)
\r
497 const unsigned uSeqCount = msa1.GetSeqCount();
\r
499 const unsigned uColCount1 = msa1.GetColCount();
\r
500 const unsigned uColCount2 = msa2.GetColCount();
\r
501 const unsigned uColCountCat = uColCount1 + uColCount2;
\r
503 msaCat.SetSize(uSeqCount, uColCountCat);
\r
505 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
507 for (unsigned uColIndex = 0; uColIndex < uColCount1; ++uColIndex)
\r
509 const char c = msa1.GetChar(uSeqIndex, uColIndex);
\r
510 msaCat.SetChar(uSeqIndex, uColIndex, c);
\r
513 const char *ptrSeqName = msa1.GetSeqName(uSeqIndex);
\r
514 unsigned uSeqIndex2;
\r
515 msaCat.SetSeqName(uSeqIndex, ptrSeqName);
\r
516 bool bFound = msa2.GetSeqIndex(ptrSeqName, &uSeqIndex2);
\r
519 for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)
\r
521 const char c = msa2.GetChar(uSeqIndex2, uColIndex);
\r
522 msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, c);
\r
527 for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)
\r
528 msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, '-');
\r