+++ /dev/null
-/***\r
-Conservation value for a column in an MSA is defined as the number\r
-of times the most common letter appears divided by the number of\r
-sequences.\r
-***/\r
-\r
-#include "muscle.h"\r
-#include "msa.h"\r
-#include <math.h>\r
-\r
-double MSA::GetAvgCons() const\r
- {\r
- assert(GetSeqCount() > 0);\r
- double dSum = 0;\r
- unsigned uNonGapColCount = 0;\r
- for (unsigned uColIndex = 0; uColIndex < GetColCount(); ++uColIndex)\r
- {\r
- if (!IsGapColumn(uColIndex))\r
- {\r
- dSum += GetCons(uColIndex);\r
- ++uNonGapColCount;\r
- }\r
- }\r
- assert(uNonGapColCount > 0);\r
- double dAvg = dSum / uNonGapColCount;\r
- assert(dAvg > 0 && dAvg <= 1);\r
- return dAvg;\r
- }\r
-\r
-double MSA::GetCons(unsigned uColIndex) const\r
- {\r
- unsigned Counts[MAX_ALPHA];\r
- for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)\r
- Counts[uLetter] = 0;\r
-\r
- unsigned uMaxCount = 0;\r
- for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)\r
- {\r
- if (IsGap(uSeqIndex, uColIndex))\r
- continue;\r
- char c = GetChar(uSeqIndex, uColIndex);\r
- c = toupper(c);\r
- if ('X' == c || 'B' == c || 'Z' == c)\r
- continue;\r
- unsigned uLetter = GetLetter(uSeqIndex, uColIndex);\r
- unsigned uCount = Counts[uLetter] + 1;\r
- if (uCount > uMaxCount)\r
- uMaxCount = uCount;\r
- Counts[uLetter] = uCount;\r
- }\r
-\r
-// Cons is undefined for all-gap column\r
- if (0 == uMaxCount)\r
- {\r
-// assert(false);\r
- return 1;\r
- }\r
-\r
- double dCons = (double) uMaxCount / (double) GetSeqCount();\r
- assert(dCons > 0 && dCons <= 1);\r
- return dCons;\r
- }\r
-\r
-// Perecent identity of a pair of sequences.\r
-// Positions with one or both gapped are ignored.\r
-double MSA::GetPctIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const\r
- {\r
- const unsigned uColCount = GetColCount();\r
- unsigned uPosCount = 0;\r
- unsigned uSameCount = 0;\r
- for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
- {\r
- const char c1 = GetChar(uSeqIndex1, uColIndex);\r
- const char c2 = GetChar(uSeqIndex2, uColIndex);\r
- if (IsGapChar(c1) || IsGapChar(c2))\r
- continue;\r
- if (c1 == c2)\r
- ++uSameCount;\r
- ++uPosCount;\r
- }\r
- if (0 == uPosCount)\r
- return 0;\r
- return (double) uSameCount / (double) uPosCount;\r
- }\r
-\r
-// Perecent group identity of a pair of sequences.\r
-// Positions with one or both gapped are ignored.\r
-double MSA::GetPctGroupIdentityPair(unsigned uSeqIndex1,\r
- unsigned uSeqIndex2) const\r
- {\r
- extern unsigned ResidueGroup[];\r
-\r
- const unsigned uColCount = GetColCount();\r
- unsigned uPosCount = 0;\r
- unsigned uSameCount = 0;\r
- for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
- {\r
- if (IsGap(uSeqIndex1, uColIndex))\r
- continue;\r
- if (IsGap(uSeqIndex2, uColIndex))\r
- continue;\r
- if (IsWildcard(uSeqIndex1, uColIndex))\r
- continue;\r
- if (IsWildcard(uSeqIndex2, uColIndex))\r
- continue;\r
-\r
- const unsigned uLetter1 = GetLetter(uSeqIndex1, uColIndex);\r
- const unsigned uLetter2 = GetLetter(uSeqIndex2, uColIndex);\r
- const unsigned uGroup1 = ResidueGroup[uLetter1];\r
- const unsigned uGroup2 = ResidueGroup[uLetter2];\r
- if (uGroup1 == uGroup2)\r
- ++uSameCount;\r
- ++uPosCount;\r
- }\r
- if (0 == uPosCount)\r
- return 0;\r
- return (double) uSameCount / (double) uPosCount;\r
- }\r