Next version of JABA
[jabaws.git] / binaries / src / muscle / henikoffweight.cpp
1 #include "muscle.h"\r
2 #include "msa.h"\r
3 \r
4 /***\r
5 Compute Henikoff weights.\r
6 Steven Henikoff and Jorja G. Henikoff (1994), Position-based sequence weights.\r
7 J. Mol. Biol., 243(4):574-578.\r
8 \r
9 Award each different residue an equal share of the weight, and then to divide up\r
10 that weight equally among the sequences sharing the same residue. So if in a\r
11 position of a multiple alignment, r different residues are represented, a residue\r
12 represented in only one sequence contributes a score of 1/r to that sequence, whereas a\r
13 residue represented in s sequences contributes a score of 1/rs to each of the s\r
14 sequences. For each sequence, the contributions from each position are summed to give\r
15 a sequence weight.\r
16 \r
17 See also HenikoffWeightPB.\r
18 ***/\r
19 \r
20 void MSA::CalcHenikoffWeightsCol(unsigned uColIndex) const\r
21         {\r
22         const unsigned uSeqCount = GetSeqCount();\r
23 \r
24 // Compute letter counts in this column\r
25         unsigned uLetterCount[MAX_ALPHA];\r
26         memset(uLetterCount, 0, sizeof(uLetterCount));\r
27         unsigned uDifferentLetterCount = 0;\r
28         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
29                 {\r
30                 unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex);\r
31                 if (uLetter >= 20)\r
32                         continue;\r
33                 unsigned uNewCount = uLetterCount[uLetter] + 1;\r
34                 uLetterCount[uLetter] = uNewCount;\r
35                 if (1 == uNewCount)\r
36                         ++uDifferentLetterCount;\r
37                 }\r
38 \r
39 // Compute weight contributions\r
40         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
41                 {\r
42                 unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex);\r
43                 if (uLetter >= 20)\r
44                         continue;\r
45                 const unsigned uCount = uLetterCount[uLetter];\r
46                 unsigned uDenom = uCount*uDifferentLetterCount;\r
47                 if (uDenom == 0)\r
48                         continue;\r
49                 m_Weights[uSeqIndex] += (WEIGHT) (1.0/uDenom);\r
50                 }\r
51         }\r
52 \r
53 void MSA::SetHenikoffWeights() const\r
54         {\r
55         const unsigned uColCount = GetColCount();\r
56         const unsigned uSeqCount = GetSeqCount();\r
57 \r
58         if (0 == uSeqCount)\r
59                 return;\r
60         else if (1 == uSeqCount)\r
61                 {\r
62                 m_Weights[0] = (WEIGHT) 1.0;\r
63                 return;\r
64                 }\r
65         else if (2 == uSeqCount)\r
66                 {\r
67                 m_Weights[0] = (WEIGHT) 0.5;\r
68                 m_Weights[1] = (WEIGHT) 0.5;\r
69                 return;\r
70                 }\r
71 \r
72         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
73                 m_Weights[uSeqIndex] = 0.0;\r
74 \r
75         for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
76                 CalcHenikoffWeightsCol(uColIndex);\r
77 \r
78 // Set all-gap seqs weight to 0\r
79         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
80                 if (IsGapSeq(uSeqIndex))\r
81                         m_Weights[uSeqIndex] = 0.0;\r
82 \r
83         Normalize(m_Weights, uSeqCount);\r
84         }\r