Next version of JABA
[jabaws.git] / binaries / src / muscle / henikoffweightpb.cpp
1 #include "muscle.h"\r
2 #include "msa.h"\r
3 \r
4 /***\r
5 Compute Henikoff weights.\r
6 Steven Henikoff and Jorja G. Henikoff (1994), Position-based sequence weights.\r
7 J. Mol. Biol., 243(4):574-578.\r
8 \r
9 Award each different residue an equal share of the weight, and then to divide up\r
10 that weight equally among the sequences sharing the same residue. So if in a\r
11 position of a multiple alignment, r different residues are represented, a residue\r
12 represented in only one sequence contributes a score of 1/r to that sequence, whereas a\r
13 residue represented in s sequences contributes a score of 1/rs to each of the s\r
14 sequences. For each sequence, the contributions from each position are summed to give\r
15 a sequence weight.\r
16 \r
17 Here we use the variant from PSI-BLAST, which (a) treats gaps as a 21st letter,\r
18 and (b) ignores columns that are perfectly conserved.\r
19 \r
20 >>> WARNING -- I SUSPECT THIS DOESN'T WORK CORRECTLY <<<\r
21 ***/\r
22 \r
23 void MSA::CalcHenikoffWeightsColPB(unsigned uColIndex) const\r
24         {\r
25         const unsigned uSeqCount = GetSeqCount();\r
26 \r
27 // Compute letter counts in this column\r
28         unsigned uLetterCount[MAX_ALPHA+1];\r
29         memset(uLetterCount, 0, (MAX_ALPHA+1)*sizeof(unsigned));\r
30         unsigned uLetter;\r
31         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
32                 {\r
33                 if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex))\r
34                         uLetter = MAX_ALPHA;\r
35                 else\r
36                         uLetter = GetLetter(uSeqIndex, uColIndex);\r
37                 ++(uLetterCount[uLetter]);\r
38                 }\r
39 \r
40 // Check for special case of perfect conservation\r
41         for (unsigned uLetter = 0; uLetter < MAX_ALPHA+1; ++uLetter)\r
42                 {\r
43                 unsigned uCount = uLetterCount[uLetter];\r
44                 if (uCount > 0)\r
45                         {\r
46                 // Perfectly conserved?\r
47                         if (uCount == uSeqCount)\r
48                                 return;\r
49                         else\r
50                         // If count > 0 but less than nr. sequences, can't be conserved\r
51                                 break;\r
52                         }\r
53                 }\r
54 \r
55 // Compute weight contributions\r
56         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
57                 {\r
58                 unsigned uLetter;\r
59                 if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex))\r
60                         uLetter = MAX_ALPHA;\r
61                 else\r
62                         uLetter = GetLetter(uSeqIndex, uColIndex);\r
63                 const unsigned uCount = uLetterCount[uLetter];\r
64                 m_Weights[uSeqIndex] += (WEIGHT) (1.0/uCount);\r
65                 }\r
66         }\r
67 \r
68 bool MSA::IsGapSeq(unsigned uSeqIndex) const\r
69         {\r
70         const unsigned uColCount = GetColCount();\r
71         for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
72                 if (!IsGap(uSeqIndex, uColIndex))\r
73                         return false;\r
74         return true;\r
75         }\r
76 \r
77 void MSA::SetUniformWeights() const\r
78         {\r
79         const unsigned uSeqCount = GetSeqCount();\r
80         if (0 == uSeqCount)\r
81                 return;\r
82 \r
83         const WEIGHT w = (WEIGHT) (1.0 / uSeqCount);\r
84         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
85                 m_Weights[uSeqIndex] = w;\r
86         }\r
87 \r
88 void MSA::SetHenikoffWeightsPB() const\r
89         {\r
90         const unsigned uColCount = GetColCount();\r
91         const unsigned uSeqCount = GetSeqCount();\r
92 \r
93         if (0 == uSeqCount)\r
94                 return;\r
95         else if (1 == uSeqCount)\r
96                 {\r
97                 m_Weights[0] = 1.0;\r
98                 return;\r
99                 }\r
100         else if (2 == uSeqCount)\r
101                 {\r
102                 m_Weights[0] = (WEIGHT) 0.5;\r
103                 m_Weights[1] = (WEIGHT) 0.5;\r
104                 return;\r
105                 }\r
106 \r
107         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
108                 m_Weights[uSeqIndex] = 0.0;\r
109 \r
110         for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
111                 CalcHenikoffWeightsColPB(uColIndex);\r
112 \r
113 // Set all-gap seqs weight to 0\r
114         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
115                 if (IsGapSeq(uSeqIndex))\r
116                         m_Weights[uSeqIndex] = 0.0;\r
117 \r
118 // Check for special case of identical sequences, which will cause all\r
119 // columns to be skipped becasue they're perfectly conserved.\r
120         if (VectorIsZero(m_Weights, uSeqCount))\r
121                 VectorSet(m_Weights, uSeqCount, 1.0);\r
122 \r
123         Normalize(m_Weights, uSeqCount);\r
124         }\r