--- /dev/null
+#include "muscle.h"\r
+#include "msa.h"\r
+#include "textfile.h"\r
+#include "seq.h"\r
+#include <math.h>\r
+\r
+const unsigned DEFAULT_SEQ_LENGTH = 500;\r
+\r
+unsigned MSA::m_uIdCount = 0;\r
+\r
+MSA::MSA()\r
+ {\r
+ m_uSeqCount = 0;\r
+ m_uColCount = 0;\r
+\r
+ m_szSeqs = 0;\r
+ m_szNames = 0;\r
+ m_Weights = 0;\r
+\r
+ m_IdToSeqIndex = 0;\r
+ m_SeqIndexToId = 0;\r
+\r
+ m_uCacheSeqCount = 0;\r
+ m_uCacheSeqLength = 0;\r
+ }\r
+\r
+MSA::~MSA()\r
+ {\r
+ Free();\r
+ }\r
+\r
+void MSA::Free()\r
+ {\r
+ for (unsigned n = 0; n < m_uSeqCount; ++n)\r
+ {\r
+ delete[] m_szSeqs[n];\r
+ delete[] m_szNames[n];\r
+ }\r
+\r
+ delete[] m_szSeqs;\r
+ delete[] m_szNames;\r
+ delete[] m_Weights;\r
+ delete[] m_IdToSeqIndex;\r
+ delete[] m_SeqIndexToId;\r
+\r
+ m_uSeqCount = 0;\r
+ m_uColCount = 0;\r
+\r
+ m_szSeqs = 0;\r
+ m_szNames = 0;\r
+ m_Weights = 0;\r
+\r
+ m_IdToSeqIndex = 0;\r
+ m_SeqIndexToId = 0;\r
+ }\r
+\r
+void MSA::SetSize(unsigned uSeqCount, unsigned uColCount)\r
+ {\r
+ Free();\r
+\r
+ m_uSeqCount = uSeqCount;\r
+ m_uCacheSeqLength = uColCount;\r
+ m_uColCount = 0;\r
+\r
+ if (0 == uSeqCount && 0 == uColCount)\r
+ return;\r
+\r
+ m_szSeqs = new char *[uSeqCount];\r
+ m_szNames = new char *[uSeqCount];\r
+ m_Weights = new WEIGHT[uSeqCount];\r
+\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ {\r
+ m_szSeqs[uSeqIndex] = new char[uColCount+1];\r
+ m_szNames[uSeqIndex] = 0;\r
+#if DEBUG\r
+ m_Weights[uSeqIndex] = BTInsane;\r
+ memset(m_szSeqs[uSeqIndex], '?', uColCount);\r
+#endif\r
+ m_szSeqs[uSeqIndex][uColCount] = 0;\r
+ }\r
+\r
+ if (m_uIdCount > 0)\r
+ {\r
+ m_IdToSeqIndex = new unsigned[m_uIdCount];\r
+ m_SeqIndexToId = new unsigned[m_uSeqCount];\r
+#if DEBUG\r
+ memset(m_IdToSeqIndex, 0xff, m_uIdCount*sizeof(unsigned));\r
+ memset(m_SeqIndexToId, 0xff, m_uSeqCount*sizeof(unsigned));\r
+#endif\r
+ }\r
+ }\r
+\r
+void MSA::LogMe() const\r
+ {\r
+ if (0 == GetColCount())\r
+ {\r
+ Log("MSA empty\n");\r
+ return;\r
+ }\r
+\r
+ const unsigned uColsPerLine = 50;\r
+ unsigned uLinesPerSeq = (GetColCount() - 1)/uColsPerLine + 1;\r
+ for (unsigned n = 0; n < uLinesPerSeq; ++n)\r
+ {\r
+ unsigned i;\r
+ unsigned iStart = n*uColsPerLine;\r
+ unsigned iEnd = GetColCount();\r
+ if (iEnd - iStart + 1 > uColsPerLine)\r
+ iEnd = iStart + uColsPerLine;\r
+ Log(" ");\r
+ for (i = iStart; i < iEnd; ++i)\r
+ Log("%u", i%10);\r
+ Log("\n");\r
+ Log(" ");\r
+ for (i = iStart; i + 9 < iEnd; i += 10)\r
+ Log("%-10u", i);\r
+ if (n == uLinesPerSeq - 1)\r
+ Log(" %-10u", GetColCount());\r
+ Log("\n");\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex)\r
+ {\r
+ Log("%12.12s", m_szNames[uSeqIndex]);\r
+ if (m_Weights[uSeqIndex] != BTInsane)\r
+ Log(" (%5.3f)", m_Weights[uSeqIndex]);\r
+ else\r
+ Log(" ");\r
+ Log(" ");\r
+ for (i = iStart; i < iEnd; ++i)\r
+ Log("%c", GetChar(uSeqIndex, i));\r
+ if (0 != m_SeqIndexToId)\r
+ Log(" [%5u]", m_SeqIndexToId[uSeqIndex]);\r
+ Log("\n");\r
+ }\r
+ Log("\n\n");\r
+ }\r
+ }\r
+\r
+char MSA::GetChar(unsigned uSeqIndex, unsigned uIndex) const\r
+ {\r
+// TODO: Performance cost?\r
+ if (uSeqIndex >= m_uSeqCount || uIndex >= m_uColCount)\r
+ Quit("MSA::GetChar(%u/%u,%u/%u)",\r
+ uSeqIndex, m_uSeqCount, uIndex, m_uColCount);\r
+\r
+ char c = m_szSeqs[uSeqIndex][uIndex];\r
+// assert(IsLegalChar(c));\r
+ return c;\r
+ }\r
+\r
+unsigned MSA::GetLetter(unsigned uSeqIndex, unsigned uIndex) const\r
+ {\r
+// TODO: Performance cost?\r
+ char c = GetChar(uSeqIndex, uIndex);\r
+ unsigned uLetter = CharToLetter(c);\r
+ if (uLetter >= 20)\r
+ {\r
+ char c = ' ';\r
+ if (uSeqIndex < m_uSeqCount && uIndex < m_uColCount)\r
+ c = m_szSeqs[uSeqIndex][uIndex];\r
+ Quit("MSA::GetLetter(%u/%u, %u/%u)='%c'/%u",\r
+ uSeqIndex, m_uSeqCount, uIndex, m_uColCount, c, uLetter);\r
+ }\r
+ return uLetter;\r
+ }\r
+\r
+unsigned MSA::GetLetterEx(unsigned uSeqIndex, unsigned uIndex) const\r
+ {\r
+// TODO: Performance cost?\r
+ char c = GetChar(uSeqIndex, uIndex);\r
+ unsigned uLetter = CharToLetterEx(c);\r
+ return uLetter;\r
+ }\r
+\r
+void MSA::SetSeqName(unsigned uSeqIndex, const char szName[])\r
+ {\r
+ if (uSeqIndex >= m_uSeqCount)\r
+ Quit("MSA::SetSeqName(%u, %s), count=%u", uSeqIndex, m_uSeqCount);\r
+ delete[] m_szNames[uSeqIndex];\r
+ int n = (int) strlen(szName) + 1;\r
+ m_szNames[uSeqIndex] = new char[n];\r
+ memcpy(m_szNames[uSeqIndex], szName, n);\r
+ }\r
+\r
+const char *MSA::GetSeqName(unsigned uSeqIndex) const\r
+ {\r
+ if (uSeqIndex >= m_uSeqCount)\r
+ Quit("MSA::GetSeqName(%u), count=%u", uSeqIndex, m_uSeqCount);\r
+ return m_szNames[uSeqIndex];\r
+ }\r
+\r
+bool MSA::IsGap(unsigned uSeqIndex, unsigned uIndex) const\r
+ {\r
+ char c = GetChar(uSeqIndex, uIndex);\r
+ return IsGapChar(c);\r
+ }\r
+\r
+bool MSA::IsWildcard(unsigned uSeqIndex, unsigned uIndex) const\r
+ {\r
+ char c = GetChar(uSeqIndex, uIndex);\r
+ return IsWildcardChar(c);\r
+ }\r
+\r
+void MSA::SetChar(unsigned uSeqIndex, unsigned uIndex, char c)\r
+ {\r
+ if (uSeqIndex >= m_uSeqCount || uIndex > m_uCacheSeqLength)\r
+ Quit("MSA::SetChar(%u,%u)", uSeqIndex, uIndex);\r
+\r
+ if (uIndex == m_uCacheSeqLength)\r
+ {\r
+ const unsigned uNewCacheSeqLength = m_uCacheSeqLength + DEFAULT_SEQ_LENGTH;\r
+ for (unsigned n = 0; n < m_uSeqCount; ++n)\r
+ {\r
+ char *ptrNewSeq = new char[uNewCacheSeqLength+1];\r
+ memcpy(ptrNewSeq, m_szSeqs[n], m_uCacheSeqLength);\r
+ memset(ptrNewSeq + m_uCacheSeqLength, '?', DEFAULT_SEQ_LENGTH);\r
+ ptrNewSeq[uNewCacheSeqLength] = 0;\r
+ delete[] m_szSeqs[n];\r
+ m_szSeqs[n] = ptrNewSeq;\r
+ }\r
+\r
+ m_uColCount = uIndex;\r
+ m_uCacheSeqLength = uNewCacheSeqLength;\r
+ }\r
+\r
+ if (uIndex >= m_uColCount)\r
+ m_uColCount = uIndex + 1;\r
+ m_szSeqs[uSeqIndex][uIndex] = c;\r
+ }\r
+\r
+void MSA::GetSeq(unsigned uSeqIndex, Seq &seq) const\r
+ {\r
+ assert(uSeqIndex < m_uSeqCount);\r
+\r
+ seq.Clear();\r
+\r
+ for (unsigned n = 0; n < m_uColCount; ++n)\r
+ if (!IsGap(uSeqIndex, n))\r
+ {\r
+ char c = GetChar(uSeqIndex, n);\r
+ if (!isalpha(c))\r
+ Quit("Invalid character '%c' in sequence", c);\r
+ c = toupper(c);\r
+ seq.push_back(c);\r
+ }\r
+ const char *ptrName = GetSeqName(uSeqIndex);\r
+ seq.SetName(ptrName);\r
+ }\r
+\r
+bool MSA::HasGap() const\r
+ {\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)\r
+ for (unsigned n = 0; n < GetColCount(); ++n)\r
+ if (IsGap(uSeqIndex, n))\r
+ return true;\r
+ return false;\r
+ }\r
+\r
+bool MSA::IsLegalLetter(unsigned uLetter) const\r
+ {\r
+ return uLetter < 20;\r
+ }\r
+\r
+void MSA::SetSeqCount(unsigned uSeqCount)\r
+ {\r
+ Free();\r
+ SetSize(uSeqCount, DEFAULT_SEQ_LENGTH);\r
+ }\r
+\r
+void MSA::CopyCol(unsigned uFromCol, unsigned uToCol)\r
+ {\r
+ assert(uFromCol < GetColCount());\r
+ assert(uToCol < GetColCount());\r
+ if (uFromCol == uToCol)\r
+ return;\r
+\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)\r
+ {\r
+ const char c = GetChar(uSeqIndex, uFromCol);\r
+ SetChar(uSeqIndex, uToCol, c);\r
+ }\r
+ }\r
+\r
+void MSA::Copy(const MSA &msa)\r
+ {\r
+ Free();\r
+ const unsigned uSeqCount = msa.GetSeqCount();\r
+ const unsigned uColCount = msa.GetColCount();\r
+ SetSize(uSeqCount, uColCount);\r
+\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ {\r
+ SetSeqName(uSeqIndex, msa.GetSeqName(uSeqIndex));\r
+ const unsigned uId = msa.GetSeqId(uSeqIndex);\r
+ SetSeqId(uSeqIndex, uId);\r
+ for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
+ {\r
+ const char c = msa.GetChar(uSeqIndex, uColIndex);\r
+ SetChar(uSeqIndex, uColIndex, c);\r
+ }\r
+ }\r
+ }\r
+\r
+bool MSA::IsGapColumn(unsigned uColIndex) const\r
+ {\r
+ assert(GetSeqCount() > 0);\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)\r
+ if (!IsGap(uSeqIndex, uColIndex))\r
+ return false;\r
+ return true;\r
+ }\r
+\r
+bool MSA::GetSeqIndex(const char *ptrSeqName, unsigned *ptruSeqIndex) const\r
+ {\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)\r
+ if (0 == stricmp(ptrSeqName, GetSeqName(uSeqIndex)))\r
+ {\r
+ *ptruSeqIndex = uSeqIndex;\r
+ return true;\r
+ }\r
+ return false;\r
+ }\r
+\r
+void MSA::DeleteCol(unsigned uColIndex)\r
+ {\r
+ assert(uColIndex < m_uColCount);\r
+ size_t n = m_uColCount - uColIndex;\r
+ if (n > 0)\r
+ {\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)\r
+ {\r
+ char *ptrSeq = m_szSeqs[uSeqIndex];\r
+ memmove(ptrSeq + uColIndex, ptrSeq + uColIndex + 1, n);\r
+ }\r
+ }\r
+ --m_uColCount;\r
+ }\r
+\r
+void MSA::DeleteColumns(unsigned uColIndex, unsigned uColCount)\r
+ {\r
+ for (unsigned n = 0; n < uColCount; ++n)\r
+ DeleteCol(uColIndex);\r
+ }\r
+\r
+void MSA::FromFile(TextFile &File)\r
+ {\r
+ FromFASTAFile(File);\r
+ }\r
+\r
+// Weights sum to 1, WCounts sum to NIC\r
+WEIGHT MSA::GetSeqWeight(unsigned uSeqIndex) const\r
+ {\r
+ assert(uSeqIndex < m_uSeqCount);\r
+ WEIGHT w = m_Weights[uSeqIndex];\r
+ if (w == wInsane)\r
+ Quit("Seq weight not set");\r
+ return w;\r
+ }\r
+\r
+void MSA::SetSeqWeight(unsigned uSeqIndex, WEIGHT w) const\r
+ {\r
+ assert(uSeqIndex < m_uSeqCount);\r
+ m_Weights[uSeqIndex] = w;\r
+ }\r
+\r
+void MSA::NormalizeWeights(WEIGHT wDesiredTotal) const\r
+ {\r
+ WEIGHT wTotal = 0;\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex)\r
+ wTotal += m_Weights[uSeqIndex];\r
+\r
+ if (0 == wTotal)\r
+ return;\r
+\r
+ const WEIGHT f = wDesiredTotal/wTotal;\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex)\r
+ m_Weights[uSeqIndex] *= f;\r
+ }\r
+\r
+void MSA::CalcWeights() const\r
+ {\r
+ Quit("Calc weights not implemented");\r
+ }\r
+\r
+static void FmtChar(char c, unsigned uWidth)\r
+ {\r
+ Log("%c", c);\r
+ for (unsigned n = 0; n < uWidth - 1; ++n)\r
+ Log(" ");\r
+ }\r
+\r
+static void FmtInt(unsigned u, unsigned uWidth)\r
+ {\r
+ static char szStr[1024];\r
+ assert(uWidth < sizeof(szStr));\r
+ if (u > 0)\r
+ sprintf(szStr, "%u", u);\r
+ else\r
+ strcpy(szStr, ".");\r
+ Log(szStr);\r
+ unsigned n = (unsigned) strlen(szStr);\r
+ if (n < uWidth)\r
+ for (unsigned i = 0; i < uWidth - n; ++i)\r
+ Log(" ");\r
+ }\r
+\r
+static void FmtInt0(unsigned u, unsigned uWidth)\r
+ {\r
+ static char szStr[1024];\r
+ assert(uWidth < sizeof(szStr));\r
+ sprintf(szStr, "%u", u);\r
+ Log(szStr);\r
+ unsigned n = (unsigned) strlen(szStr);\r
+ if (n < uWidth)\r
+ for (unsigned i = 0; i < uWidth - n; ++i)\r
+ Log(" ");\r
+ }\r
+\r
+static void FmtPad(unsigned n)\r
+ {\r
+ for (unsigned i = 0; i < n; ++i)\r
+ Log(" ");\r
+ }\r
+\r
+void MSA::FromSeq(const Seq &s)\r
+ {\r
+ unsigned uSeqLength = s.Length();\r
+ SetSize(1, uSeqLength);\r
+ SetSeqName(0, s.GetName());\r
+ if (0 != m_SeqIndexToId)\r
+ SetSeqId(0, s.GetId());\r
+ for (unsigned n = 0; n < uSeqLength; ++n)\r
+ SetChar(0, n, s[n]);\r
+ }\r
+\r
+unsigned MSA::GetCharCount(unsigned uSeqIndex, unsigned uColIndex) const\r
+ {\r
+ assert(uSeqIndex < GetSeqCount());\r
+ assert(uColIndex < GetColCount());\r
+\r
+ unsigned uCol = 0;\r
+ for (unsigned n = 0; n <= uColIndex; ++n)\r
+ if (!IsGap(uSeqIndex, n))\r
+ ++uCol;\r
+ return uCol;\r
+ }\r
+\r
+void MSA::CopySeq(unsigned uToSeqIndex, const MSA &msaFrom, unsigned uFromSeqIndex)\r
+ {\r
+ assert(uToSeqIndex < m_uSeqCount);\r
+ const unsigned uColCount = msaFrom.GetColCount();\r
+ assert(m_uColCount == uColCount ||\r
+ (0 == m_uColCount && uColCount <= m_uCacheSeqLength));\r
+\r
+ memcpy(m_szSeqs[uToSeqIndex], msaFrom.GetSeqBuffer(uFromSeqIndex), uColCount);\r
+ SetSeqName(uToSeqIndex, msaFrom.GetSeqName(uFromSeqIndex));\r
+ if (0 == m_uColCount)\r
+ m_uColCount = uColCount;\r
+ }\r
+\r
+const char *MSA::GetSeqBuffer(unsigned uSeqIndex) const\r
+ {\r
+ assert(uSeqIndex < m_uSeqCount);\r
+ return m_szSeqs[uSeqIndex];\r
+ }\r
+\r
+void MSA::DeleteSeq(unsigned uSeqIndex)\r
+ {\r
+ assert(uSeqIndex < m_uSeqCount);\r
+\r
+ delete m_szSeqs[uSeqIndex];\r
+ delete m_szNames[uSeqIndex];\r
+\r
+ const unsigned uBytesToMove = (m_uSeqCount - uSeqIndex)*sizeof(char *);\r
+ if (uBytesToMove > 0)\r
+ {\r
+ memmove(m_szSeqs + uSeqIndex, m_szSeqs + uSeqIndex + 1, uBytesToMove);\r
+ memmove(m_szNames + uSeqIndex, m_szNames + uSeqIndex + 1, uBytesToMove);\r
+ }\r
+\r
+ --m_uSeqCount;\r
+\r
+ delete[] m_Weights;\r
+ m_Weights = 0;\r
+ }\r
+\r
+bool MSA::IsEmptyCol(unsigned uColIndex) const\r
+ {\r
+ const unsigned uSeqCount = GetSeqCount();\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ if (!IsGap(uSeqIndex, uColIndex))\r
+ return false;\r
+ return true;\r
+ }\r
+\r
+//void MSA::DeleteEmptyCols(bool bProgress)\r
+// {\r
+// unsigned uColCount = GetColCount();\r
+// for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
+// {\r
+// if (IsEmptyCol(uColIndex))\r
+// {\r
+// if (bProgress)\r
+// {\r
+// Log("Deleting col %u of %u\n", uColIndex, uColCount);\r
+// printf("Deleting col %u of %u\n", uColIndex, uColCount);\r
+// }\r
+// DeleteCol(uColIndex);\r
+// --uColCount;\r
+// }\r
+// }\r
+// }\r
+\r
+unsigned MSA::AlignedColIndexToColIndex(unsigned uAlignedColIndex) const\r
+ {\r
+ Quit("MSA::AlignedColIndexToColIndex not implemented");\r
+ return 0;\r
+ }\r
+\r
+WEIGHT MSA::GetTotalSeqWeight() const\r
+ {\r
+ WEIGHT wTotal = 0;\r
+ const unsigned uSeqCount = GetSeqCount();\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ wTotal += m_Weights[uSeqIndex];\r
+ return wTotal;\r
+ }\r
+\r
+bool MSA::SeqsEq(const MSA &a1, unsigned uSeqIndex1, const MSA &a2,\r
+ unsigned uSeqIndex2)\r
+ {\r
+ Seq s1;\r
+ Seq s2;\r
+\r
+ a1.GetSeq(uSeqIndex1, s1);\r
+ a2.GetSeq(uSeqIndex2, s2);\r
+\r
+ s1.StripGaps();\r
+ s2.StripGaps();\r
+\r
+ return s1.EqIgnoreCase(s2);\r
+ }\r
+\r
+unsigned MSA::GetSeqLength(unsigned uSeqIndex) const\r
+ {\r
+ assert(uSeqIndex < GetSeqCount());\r
+\r
+ const unsigned uColCount = GetColCount();\r
+ unsigned uLength = 0;\r
+ for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
+ if (!IsGap(uSeqIndex, uColIndex))\r
+ ++uLength;\r
+ return uLength;\r
+ }\r
+\r
+void MSA::GetPWID(unsigned uSeqIndex1, unsigned uSeqIndex2, double *ptrPWID,\r
+ unsigned *ptruPosCount) const\r
+ {\r
+ assert(uSeqIndex1 < GetSeqCount());\r
+ assert(uSeqIndex2 < GetSeqCount());\r
+\r
+ unsigned uSameCount = 0;\r
+ unsigned uPosCount = 0;\r
+ const unsigned uColCount = GetColCount();\r
+ for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
+ {\r
+ char c1 = GetChar(uSeqIndex1, uColIndex);\r
+ if (IsGapChar(c1))\r
+ continue;\r
+ char c2 = GetChar(uSeqIndex2, uColIndex);\r
+ if (IsGapChar(c2))\r
+ continue;\r
+ ++uPosCount;\r
+ if (c1 == c2)\r
+ ++uSameCount;\r
+ }\r
+ *ptruPosCount = uPosCount;\r
+ if (uPosCount > 0)\r
+ *ptrPWID = 100.0 * (double) uSameCount / (double) uPosCount;\r
+ else\r
+ *ptrPWID = 0;\r
+ }\r
+\r
+void MSA::UnWeight()\r
+ {\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)\r
+ m_Weights[uSeqIndex] = BTInsane;\r
+ }\r
+\r
+unsigned MSA::UniqueResidueTypes(unsigned uColIndex) const\r
+ {\r
+ assert(uColIndex < GetColCount());\r
+\r
+ unsigned Counts[MAX_ALPHA];\r
+ memset(Counts, 0, sizeof(Counts));\r
+ const unsigned uSeqCount = GetSeqCount();\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ {\r
+ if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex))\r
+ continue;\r
+ const unsigned uLetter = GetLetter(uSeqIndex, uColIndex);\r
+ ++(Counts[uLetter]);\r
+ }\r
+ unsigned uUniqueCount = 0;\r
+ for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)\r
+ if (Counts[uLetter] > 0)\r
+ ++uUniqueCount;\r
+ return uUniqueCount;\r
+ }\r
+\r
+double MSA::GetOcc(unsigned uColIndex) const\r
+ {\r
+ unsigned uGapCount = 0;\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)\r
+ if (IsGap(uSeqIndex, uColIndex))\r
+ ++uGapCount;\r
+ unsigned uSeqCount = GetSeqCount();\r
+ return (double) (uSeqCount - uGapCount) / (double) uSeqCount;\r
+ }\r
+\r
+void MSA::ToFile(TextFile &File) const\r
+ {\r
+ if (g_bMSF)\r
+ ToMSFFile(File);\r
+ else if (g_bAln)\r
+ ToAlnFile(File);\r
+ else if (g_bHTML)\r
+ ToHTMLFile(File);\r
+ else if (g_bPHYS)\r
+ ToPhySequentialFile(File);\r
+ else if (g_bPHYI)\r
+ ToPhyInterleavedFile(File);\r
+ else\r
+ ToFASTAFile(File);\r
+ if (0 != g_pstrScoreFileName)\r
+ WriteScoreFile(*this);\r
+ }\r
+\r
+bool MSA::ColumnHasGap(unsigned uColIndex) const\r
+ {\r
+ const unsigned uSeqCount = GetSeqCount();\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ if (IsGap(uSeqIndex, uColIndex))\r
+ return true;\r
+ return false;\r
+ }\r
+\r
+void MSA::SetIdCount(unsigned uIdCount)\r
+ {\r
+ //if (m_uIdCount != 0)\r
+ // Quit("MSA::SetIdCount: may only be called once");\r
+\r
+ if (m_uIdCount > 0)\r
+ {\r
+ if (uIdCount > m_uIdCount)\r
+ Quit("MSA::SetIdCount: cannot increase count");\r
+ return;\r
+ }\r
+ m_uIdCount = uIdCount;\r
+ }\r
+\r
+void MSA::SetSeqId(unsigned uSeqIndex, unsigned uId)\r
+ {\r
+ assert(uSeqIndex < m_uSeqCount);\r
+ assert(uId < m_uIdCount);\r
+ if (0 == m_SeqIndexToId)\r
+ {\r
+ if (0 == m_uIdCount)\r
+ Quit("MSA::SetSeqId, SetIdCount has not been called");\r
+ m_IdToSeqIndex = new unsigned[m_uIdCount];\r
+ m_SeqIndexToId = new unsigned[m_uSeqCount];\r
+\r
+ memset(m_IdToSeqIndex, 0xff, m_uIdCount*sizeof(unsigned));\r
+ memset(m_SeqIndexToId, 0xff, m_uSeqCount*sizeof(unsigned));\r
+ }\r
+ m_SeqIndexToId[uSeqIndex] = uId;\r
+ m_IdToSeqIndex[uId] = uSeqIndex;\r
+ }\r
+\r
+unsigned MSA::GetSeqIndex(unsigned uId) const\r
+ {\r
+ assert(uId < m_uIdCount);\r
+ assert(0 != m_IdToSeqIndex);\r
+ unsigned uSeqIndex = m_IdToSeqIndex[uId];\r
+ assert(uSeqIndex < m_uSeqCount);\r
+ return uSeqIndex;\r
+ }\r
+\r
+bool MSA::GetSeqIndex(unsigned uId, unsigned *ptruIndex) const\r
+ {\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex)\r
+ {\r
+ if (uId == m_SeqIndexToId[uSeqIndex])\r
+ {\r
+ *ptruIndex = uSeqIndex;\r
+ return true;\r
+ }\r
+ }\r
+ return false;\r
+ }\r
+\r
+unsigned MSA::GetSeqId(unsigned uSeqIndex) const\r
+ {\r
+ assert(uSeqIndex < m_uSeqCount);\r
+ unsigned uId = m_SeqIndexToId[uSeqIndex];\r
+ assert(uId < m_uIdCount);\r
+ return uId;\r
+ }\r
+\r
+bool MSA::WeightsSet() const\r
+ {\r
+ return BTInsane != m_Weights[0];\r
+ }\r
+\r
+void MSASubsetByIds(const MSA &msaIn, const unsigned Ids[], unsigned uIdCount,\r
+ MSA &msaOut)\r
+ {\r
+ const unsigned uColCount = msaIn.GetColCount();\r
+ msaOut.SetSize(uIdCount, uColCount);\r
+ for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uIdCount; ++uSeqIndexOut)\r
+ {\r
+ const unsigned uId = Ids[uSeqIndexOut];\r
+\r
+ const unsigned uSeqIndexIn = msaIn.GetSeqIndex(uId);\r
+ const char *ptrName = msaIn.GetSeqName(uSeqIndexIn);\r
+\r
+ msaOut.SetSeqId(uSeqIndexOut, uId);\r
+ msaOut.SetSeqName(uSeqIndexOut, ptrName);\r
+\r
+ for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
+ {\r
+ const char c = msaIn.GetChar(uSeqIndexIn, uColIndex);\r
+ msaOut.SetChar(uSeqIndexOut, uColIndex, c);\r
+ }\r
+ }\r
+ }\r
+\r
+// Caller must allocate ptrSeq and ptrLabel as new char[n].\r
+void MSA::AppendSeq(char *ptrSeq, unsigned uSeqLength, char *ptrLabel)\r
+ {\r
+ if (m_uSeqCount > m_uCacheSeqCount)\r
+ Quit("Internal error MSA::AppendSeq");\r
+ if (m_uSeqCount == m_uCacheSeqCount)\r
+ ExpandCache(m_uSeqCount + 4, uSeqLength);\r
+ m_szSeqs[m_uSeqCount] = ptrSeq;\r
+ m_szNames[m_uSeqCount] = ptrLabel;\r
+ ++m_uSeqCount;\r
+ }\r
+\r
+void MSA::ExpandCache(unsigned uSeqCount, unsigned uColCount)\r
+ {\r
+ if (m_IdToSeqIndex != 0 || m_SeqIndexToId != 0 || uSeqCount < m_uSeqCount)\r
+ Quit("Internal error MSA::ExpandCache");\r
+\r
+ if (m_uSeqCount > 0 && uColCount != m_uColCount)\r
+ Quit("Internal error MSA::ExpandCache, ColCount changed");\r
+\r
+ char **NewSeqs = new char *[uSeqCount];\r
+ char **NewNames = new char *[uSeqCount];\r
+ WEIGHT *NewWeights = new WEIGHT[uSeqCount];\r
+\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex)\r
+ {\r
+ NewSeqs[uSeqIndex] = m_szSeqs[uSeqIndex];\r
+ NewNames[uSeqIndex] = m_szNames[uSeqIndex];\r
+ NewWeights[uSeqIndex] = m_Weights[uSeqIndex];\r
+ }\r
+\r
+ for (unsigned uSeqIndex = m_uSeqCount; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ {\r
+ char *Seq = new char[uColCount];\r
+ NewSeqs[uSeqIndex] = Seq;\r
+#if DEBUG\r
+ memset(Seq, '?', uColCount);\r
+#endif\r
+ }\r
+\r
+ delete[] m_szSeqs;\r
+ delete[] m_szNames;\r
+ delete[] m_Weights;\r
+\r
+ m_szSeqs = NewSeqs;\r
+ m_szNames = NewNames;\r
+ m_Weights = NewWeights;\r
+\r
+ m_uCacheSeqCount = uSeqCount;\r
+ m_uCacheSeqLength = uColCount;\r
+ m_uColCount = uColCount;\r
+ }\r
+\r
+void MSA::FixAlpha()\r
+ {\r
+ ClearInvalidLetterWarning();\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex)\r
+ {\r
+ for (unsigned uColIndex = 0; uColIndex < m_uColCount; ++uColIndex)\r
+ {\r
+ char c = GetChar(uSeqIndex, uColIndex);\r
+ if (!IsResidueChar(c) && !IsGapChar(c))\r
+ {\r
+ char w = GetWildcardChar();\r
+ // Warning("Invalid letter '%c', replaced by '%c'", c, w);\r
+ InvalidLetterWarning(c, w);\r
+ SetChar(uSeqIndex, uColIndex, w);\r
+ }\r
+ }\r
+ }\r
+ ReportInvalidLetters();\r
+ }\r
+\r
+ALPHA MSA::GuessAlpha() const\r
+ {\r
+// If at least MIN_NUCLEO_PCT of the first CHAR_COUNT non-gap\r
+// letters belong to the nucleotide alphabet, guess nucleo.\r
+// Otherwise amino.\r
+ const unsigned CHAR_COUNT = 100;\r
+ const unsigned MIN_NUCLEO_PCT = 95;\r
+\r
+ const unsigned uSeqCount = GetSeqCount();\r
+ const unsigned uColCount = GetColCount();\r
+ if (0 == uSeqCount)\r
+ return ALPHA_Amino;\r
+\r
+ unsigned uDNACount = 0;\r
+ unsigned uRNACount = 0;\r
+ unsigned uTotal = 0;\r
+ unsigned i = 0;\r
+ for (;;)\r
+ {\r
+ unsigned uSeqIndex = i/uColCount;\r
+ if (uSeqIndex >= uSeqCount)\r
+ break;\r
+ unsigned uColIndex = i%uColCount;\r
+ ++i;\r
+ char c = GetChar(uSeqIndex, uColIndex);\r
+ if (IsGapChar(c))\r
+ continue;\r
+ if (IsDNA(c))\r
+ ++uDNACount;\r
+ if (IsRNA(c))\r
+ ++uRNACount;\r
+ ++uTotal;\r
+ if (uTotal >= CHAR_COUNT)\r
+ break;\r
+ }\r
+ if (uTotal != 0 && ((uRNACount*100)/uTotal) >= MIN_NUCLEO_PCT)\r
+ return ALPHA_RNA;\r
+ if (uTotal != 0 && ((uDNACount*100)/uTotal) >= MIN_NUCLEO_PCT)\r
+ return ALPHA_DNA;\r
+ return ALPHA_Amino;\r
+ }\r