5 Compute Henikoff weights.
\r
6 Steven Henikoff and Jorja G. Henikoff (1994), Position-based sequence weights.
\r
7 J. Mol. Biol., 243(4):574-578.
\r
9 Award each different residue an equal share of the weight, and then to divide up
\r
10 that weight equally among the sequences sharing the same residue. So if in a
\r
11 position of a multiple alignment, r different residues are represented, a residue
\r
12 represented in only one sequence contributes a score of 1/r to that sequence, whereas a
\r
13 residue represented in s sequences contributes a score of 1/rs to each of the s
\r
14 sequences. For each sequence, the contributions from each position are summed to give
\r
17 Here we use the variant from PSI-BLAST, which (a) treats gaps as a 21st letter,
\r
18 and (b) ignores columns that are perfectly conserved.
\r
20 >>> WARNING -- I SUSPECT THIS DOESN'T WORK CORRECTLY <<<
\r
23 void MSA::CalcHenikoffWeightsColPB(unsigned uColIndex) const
\r
25 const unsigned uSeqCount = GetSeqCount();
\r
27 // Compute letter counts in this column
\r
28 unsigned uLetterCount[MAX_ALPHA+1];
\r
29 memset(uLetterCount, 0, (MAX_ALPHA+1)*sizeof(unsigned));
\r
31 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
33 if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex))
\r
34 uLetter = MAX_ALPHA;
\r
36 uLetter = GetLetter(uSeqIndex, uColIndex);
\r
37 ++(uLetterCount[uLetter]);
\r
40 // Check for special case of perfect conservation
\r
41 for (unsigned uLetter = 0; uLetter < MAX_ALPHA+1; ++uLetter)
\r
43 unsigned uCount = uLetterCount[uLetter];
\r
46 // Perfectly conserved?
\r
47 if (uCount == uSeqCount)
\r
50 // If count > 0 but less than nr. sequences, can't be conserved
\r
55 // Compute weight contributions
\r
56 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
59 if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex))
\r
60 uLetter = MAX_ALPHA;
\r
62 uLetter = GetLetter(uSeqIndex, uColIndex);
\r
63 const unsigned uCount = uLetterCount[uLetter];
\r
64 m_Weights[uSeqIndex] += (WEIGHT) (1.0/uCount);
\r
68 bool MSA::IsGapSeq(unsigned uSeqIndex) const
\r
70 const unsigned uColCount = GetColCount();
\r
71 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
\r
72 if (!IsGap(uSeqIndex, uColIndex))
\r
77 void MSA::SetUniformWeights() const
\r
79 const unsigned uSeqCount = GetSeqCount();
\r
83 const WEIGHT w = (WEIGHT) (1.0 / uSeqCount);
\r
84 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
85 m_Weights[uSeqIndex] = w;
\r
88 void MSA::SetHenikoffWeightsPB() const
\r
90 const unsigned uColCount = GetColCount();
\r
91 const unsigned uSeqCount = GetSeqCount();
\r
95 else if (1 == uSeqCount)
\r
100 else if (2 == uSeqCount)
\r
102 m_Weights[0] = (WEIGHT) 0.5;
\r
103 m_Weights[1] = (WEIGHT) 0.5;
\r
107 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
108 m_Weights[uSeqIndex] = 0.0;
\r
110 for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
\r
111 CalcHenikoffWeightsColPB(uColIndex);
\r
113 // Set all-gap seqs weight to 0
\r
114 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
115 if (IsGapSeq(uSeqIndex))
\r
116 m_Weights[uSeqIndex] = 0.0;
\r
118 // Check for special case of identical sequences, which will cause all
\r
119 // columns to be skipped becasue they're perfectly conserved.
\r
120 if (VectorIsZero(m_Weights, uSeqCount))
\r
121 VectorSet(m_Weights, uSeqCount, 1.0);
\r
123 Normalize(m_Weights, uSeqCount);
\r