Next version of JABA
[jabaws.git] / binaries / src / muscle / cons.cpp
1 /***\r
2 Conservation value for a column in an MSA is defined as the number\r
3 of times the most common letter appears divided by the number of\r
4 sequences.\r
5 ***/\r
6 \r
7 #include "muscle.h"\r
8 #include "msa.h"\r
9 #include <math.h>\r
10 \r
11 double MSA::GetAvgCons() const\r
12         {\r
13         assert(GetSeqCount() > 0);\r
14         double dSum = 0;\r
15         unsigned uNonGapColCount = 0;\r
16         for (unsigned uColIndex = 0; uColIndex < GetColCount(); ++uColIndex)\r
17                 {\r
18                 if (!IsGapColumn(uColIndex))\r
19                         {\r
20                         dSum += GetCons(uColIndex);\r
21                         ++uNonGapColCount;\r
22                         }\r
23                 }\r
24         assert(uNonGapColCount > 0);\r
25         double dAvg = dSum / uNonGapColCount;\r
26         assert(dAvg > 0 && dAvg <= 1);\r
27         return dAvg;\r
28         }\r
29 \r
30 double MSA::GetCons(unsigned uColIndex) const\r
31         {\r
32         unsigned Counts[MAX_ALPHA];\r
33         for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)\r
34                 Counts[uLetter] = 0;\r
35 \r
36         unsigned uMaxCount = 0;\r
37         for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)\r
38                 {\r
39                 if (IsGap(uSeqIndex, uColIndex))\r
40                         continue;\r
41                 char c = GetChar(uSeqIndex, uColIndex);\r
42                 c = toupper(c);\r
43                 if ('X' == c || 'B' == c || 'Z' == c)\r
44                         continue;\r
45                 unsigned uLetter = GetLetter(uSeqIndex, uColIndex);\r
46                 unsigned uCount = Counts[uLetter] + 1;\r
47                 if (uCount > uMaxCount)\r
48                         uMaxCount = uCount;\r
49                 Counts[uLetter] = uCount;\r
50                 }\r
51 \r
52 // Cons is undefined for all-gap column\r
53         if (0 == uMaxCount)\r
54                 {\r
55 //              assert(false);\r
56                 return 1;\r
57                 }\r
58 \r
59         double dCons = (double) uMaxCount / (double) GetSeqCount();\r
60         assert(dCons > 0 && dCons <= 1);\r
61         return dCons;\r
62         }\r
63 \r
64 // Perecent identity of a pair of sequences.\r
65 // Positions with one or both gapped are ignored.\r
66 double MSA::GetPctIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const\r
67         {\r
68         const unsigned uColCount = GetColCount();\r
69         unsigned uPosCount = 0;\r
70         unsigned uSameCount = 0;\r
71         for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
72                 {\r
73                 const char c1 = GetChar(uSeqIndex1, uColIndex);\r
74                 const char c2 = GetChar(uSeqIndex2, uColIndex);\r
75                 if (IsGapChar(c1) || IsGapChar(c2))\r
76                         continue;\r
77                 if (c1 == c2)\r
78                         ++uSameCount;\r
79                 ++uPosCount;\r
80                 }\r
81         if (0 == uPosCount)\r
82                 return 0;\r
83         return (double) uSameCount / (double) uPosCount;\r
84         }\r
85 \r
86 // Perecent group identity of a pair of sequences.\r
87 // Positions with one or both gapped are ignored.\r
88 double MSA::GetPctGroupIdentityPair(unsigned uSeqIndex1,\r
89   unsigned uSeqIndex2) const\r
90         {\r
91         extern unsigned ResidueGroup[];\r
92 \r
93         const unsigned uColCount = GetColCount();\r
94         unsigned uPosCount = 0;\r
95         unsigned uSameCount = 0;\r
96         for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
97                 {\r
98                 if (IsGap(uSeqIndex1, uColIndex))\r
99                         continue;\r
100                 if (IsGap(uSeqIndex2, uColIndex))\r
101                         continue;\r
102                 if (IsWildcard(uSeqIndex1, uColIndex))\r
103                         continue;\r
104                 if (IsWildcard(uSeqIndex2, uColIndex))\r
105                         continue;\r
106 \r
107                 const unsigned uLetter1 = GetLetter(uSeqIndex1, uColIndex);\r
108                 const unsigned uLetter2 = GetLetter(uSeqIndex2, uColIndex);\r
109                 const unsigned uGroup1 = ResidueGroup[uLetter1];\r
110                 const unsigned uGroup2 = ResidueGroup[uLetter2];\r
111                 if (uGroup1 == uGroup2)\r
112                         ++uSameCount;\r
113                 ++uPosCount;\r
114                 }\r
115         if (0 == uPosCount)\r
116                 return 0;\r
117         return (double) uSameCount / (double) uPosCount;\r
118         }\r