--- /dev/null
+/***\r
+Conservation value for a column in an MSA is defined as the number\r
+of times the most common letter appears divided by the number of\r
+sequences.\r
+***/\r
+\r
+#include "muscle.h"\r
+#include "msa.h"\r
+#include <math.h>\r
+\r
+double MSA::GetAvgCons() const\r
+ {\r
+ assert(GetSeqCount() > 0);\r
+ double dSum = 0;\r
+ unsigned uNonGapColCount = 0;\r
+ for (unsigned uColIndex = 0; uColIndex < GetColCount(); ++uColIndex)\r
+ {\r
+ if (!IsGapColumn(uColIndex))\r
+ {\r
+ dSum += GetCons(uColIndex);\r
+ ++uNonGapColCount;\r
+ }\r
+ }\r
+ assert(uNonGapColCount > 0);\r
+ double dAvg = dSum / uNonGapColCount;\r
+ assert(dAvg > 0 && dAvg <= 1);\r
+ return dAvg;\r
+ }\r
+\r
+double MSA::GetCons(unsigned uColIndex) const\r
+ {\r
+ unsigned Counts[MAX_ALPHA];\r
+ for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)\r
+ Counts[uLetter] = 0;\r
+\r
+ unsigned uMaxCount = 0;\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)\r
+ {\r
+ if (IsGap(uSeqIndex, uColIndex))\r
+ continue;\r
+ char c = GetChar(uSeqIndex, uColIndex);\r
+ c = toupper(c);\r
+ if ('X' == c || 'B' == c || 'Z' == c)\r
+ continue;\r
+ unsigned uLetter = GetLetter(uSeqIndex, uColIndex);\r
+ unsigned uCount = Counts[uLetter] + 1;\r
+ if (uCount > uMaxCount)\r
+ uMaxCount = uCount;\r
+ Counts[uLetter] = uCount;\r
+ }\r
+\r
+// Cons is undefined for all-gap column\r
+ if (0 == uMaxCount)\r
+ {\r
+// assert(false);\r
+ return 1;\r
+ }\r
+\r
+ double dCons = (double) uMaxCount / (double) GetSeqCount();\r
+ assert(dCons > 0 && dCons <= 1);\r
+ return dCons;\r
+ }\r
+\r
+// Perecent identity of a pair of sequences.\r
+// Positions with one or both gapped are ignored.\r
+double MSA::GetPctIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const\r
+ {\r
+ const unsigned uColCount = GetColCount();\r
+ unsigned uPosCount = 0;\r
+ unsigned uSameCount = 0;\r
+ for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
+ {\r
+ const char c1 = GetChar(uSeqIndex1, uColIndex);\r
+ const char c2 = GetChar(uSeqIndex2, uColIndex);\r
+ if (IsGapChar(c1) || IsGapChar(c2))\r
+ continue;\r
+ if (c1 == c2)\r
+ ++uSameCount;\r
+ ++uPosCount;\r
+ }\r
+ if (0 == uPosCount)\r
+ return 0;\r
+ return (double) uSameCount / (double) uPosCount;\r
+ }\r
+\r
+// Perecent group identity of a pair of sequences.\r
+// Positions with one or both gapped are ignored.\r
+double MSA::GetPctGroupIdentityPair(unsigned uSeqIndex1,\r
+ unsigned uSeqIndex2) const\r
+ {\r
+ extern unsigned ResidueGroup[];\r
+\r
+ const unsigned uColCount = GetColCount();\r
+ unsigned uPosCount = 0;\r
+ unsigned uSameCount = 0;\r
+ for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
+ {\r
+ if (IsGap(uSeqIndex1, uColIndex))\r
+ continue;\r
+ if (IsGap(uSeqIndex2, uColIndex))\r
+ continue;\r
+ if (IsWildcard(uSeqIndex1, uColIndex))\r
+ continue;\r
+ if (IsWildcard(uSeqIndex2, uColIndex))\r
+ continue;\r
+\r
+ const unsigned uLetter1 = GetLetter(uSeqIndex1, uColIndex);\r
+ const unsigned uLetter2 = GetLetter(uSeqIndex2, uColIndex);\r
+ const unsigned uGroup1 = ResidueGroup[uLetter1];\r
+ const unsigned uGroup2 = ResidueGroup[uLetter2];\r
+ if (uGroup1 == uGroup2)\r
+ ++uSameCount;\r
+ ++uPosCount;\r
+ }\r
+ if (0 == uPosCount)\r
+ return 0;\r
+ return (double) uSameCount / (double) uPosCount;\r
+ }\r