+++ /dev/null
-#include "muscle.h"\r
-#include "msa.h"\r
-#include "textfile.h"\r
-#include "seq.h"\r
-#include <math.h>\r
-\r
-const unsigned DEFAULT_SEQ_LENGTH = 500;\r
-\r
-unsigned MSA::m_uIdCount = 0;\r
-\r
-MSA::MSA()\r
- {\r
- m_uSeqCount = 0;\r
- m_uColCount = 0;\r
-\r
- m_szSeqs = 0;\r
- m_szNames = 0;\r
- m_Weights = 0;\r
-\r
- m_IdToSeqIndex = 0;\r
- m_SeqIndexToId = 0;\r
-\r
- m_uCacheSeqCount = 0;\r
- m_uCacheSeqLength = 0;\r
- }\r
-\r
-MSA::~MSA()\r
- {\r
- Free();\r
- }\r
-\r
-void MSA::Free()\r
- {\r
- for (unsigned n = 0; n < m_uSeqCount; ++n)\r
- {\r
- delete[] m_szSeqs[n];\r
- delete[] m_szNames[n];\r
- }\r
-\r
- delete[] m_szSeqs;\r
- delete[] m_szNames;\r
- delete[] m_Weights;\r
- delete[] m_IdToSeqIndex;\r
- delete[] m_SeqIndexToId;\r
-\r
- m_uSeqCount = 0;\r
- m_uColCount = 0;\r
-\r
- m_szSeqs = 0;\r
- m_szNames = 0;\r
- m_Weights = 0;\r
-\r
- m_IdToSeqIndex = 0;\r
- m_SeqIndexToId = 0;\r
- }\r
-\r
-void MSA::SetSize(unsigned uSeqCount, unsigned uColCount)\r
- {\r
- Free();\r
-\r
- m_uSeqCount = uSeqCount;\r
- m_uCacheSeqLength = uColCount;\r
- m_uColCount = 0;\r
-\r
- if (0 == uSeqCount && 0 == uColCount)\r
- return;\r
-\r
- m_szSeqs = new char *[uSeqCount];\r
- m_szNames = new char *[uSeqCount];\r
- m_Weights = new WEIGHT[uSeqCount];\r
-\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- {\r
- m_szSeqs[uSeqIndex] = new char[uColCount+1];\r
- m_szNames[uSeqIndex] = 0;\r
-#if DEBUG\r
- m_Weights[uSeqIndex] = BTInsane;\r
- memset(m_szSeqs[uSeqIndex], '?', uColCount);\r
-#endif\r
- m_szSeqs[uSeqIndex][uColCount] = 0;\r
- }\r
-\r
- if (m_uIdCount > 0)\r
- {\r
- m_IdToSeqIndex = new unsigned[m_uIdCount];\r
- m_SeqIndexToId = new unsigned[m_uSeqCount];\r
-#if DEBUG\r
- memset(m_IdToSeqIndex, 0xff, m_uIdCount*sizeof(unsigned));\r
- memset(m_SeqIndexToId, 0xff, m_uSeqCount*sizeof(unsigned));\r
-#endif\r
- }\r
- }\r
-\r
-void MSA::LogMe() const\r
- {\r
- if (0 == GetColCount())\r
- {\r
- Log("MSA empty\n");\r
- return;\r
- }\r
-\r
- const unsigned uColsPerLine = 50;\r
- unsigned uLinesPerSeq = (GetColCount() - 1)/uColsPerLine + 1;\r
- for (unsigned n = 0; n < uLinesPerSeq; ++n)\r
- {\r
- unsigned i;\r
- unsigned iStart = n*uColsPerLine;\r
- unsigned iEnd = GetColCount();\r
- if (iEnd - iStart + 1 > uColsPerLine)\r
- iEnd = iStart + uColsPerLine;\r
- Log(" ");\r
- for (i = iStart; i < iEnd; ++i)\r
- Log("%u", i%10);\r
- Log("\n");\r
- Log(" ");\r
- for (i = iStart; i + 9 < iEnd; i += 10)\r
- Log("%-10u", i);\r
- if (n == uLinesPerSeq - 1)\r
- Log(" %-10u", GetColCount());\r
- Log("\n");\r
- for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex)\r
- {\r
- Log("%12.12s", m_szNames[uSeqIndex]);\r
- if (m_Weights[uSeqIndex] != BTInsane)\r
- Log(" (%5.3f)", m_Weights[uSeqIndex]);\r
- else\r
- Log(" ");\r
- Log(" ");\r
- for (i = iStart; i < iEnd; ++i)\r
- Log("%c", GetChar(uSeqIndex, i));\r
- if (0 != m_SeqIndexToId)\r
- Log(" [%5u]", m_SeqIndexToId[uSeqIndex]);\r
- Log("\n");\r
- }\r
- Log("\n\n");\r
- }\r
- }\r
-\r
-char MSA::GetChar(unsigned uSeqIndex, unsigned uIndex) const\r
- {\r
-// TODO: Performance cost?\r
- if (uSeqIndex >= m_uSeqCount || uIndex >= m_uColCount)\r
- Quit("MSA::GetChar(%u/%u,%u/%u)",\r
- uSeqIndex, m_uSeqCount, uIndex, m_uColCount);\r
-\r
- char c = m_szSeqs[uSeqIndex][uIndex];\r
-// assert(IsLegalChar(c));\r
- return c;\r
- }\r
-\r
-unsigned MSA::GetLetter(unsigned uSeqIndex, unsigned uIndex) const\r
- {\r
-// TODO: Performance cost?\r
- char c = GetChar(uSeqIndex, uIndex);\r
- unsigned uLetter = CharToLetter(c);\r
- if (uLetter >= 20)\r
- {\r
- char c = ' ';\r
- if (uSeqIndex < m_uSeqCount && uIndex < m_uColCount)\r
- c = m_szSeqs[uSeqIndex][uIndex];\r
- Quit("MSA::GetLetter(%u/%u, %u/%u)='%c'/%u",\r
- uSeqIndex, m_uSeqCount, uIndex, m_uColCount, c, uLetter);\r
- }\r
- return uLetter;\r
- }\r
-\r
-unsigned MSA::GetLetterEx(unsigned uSeqIndex, unsigned uIndex) const\r
- {\r
-// TODO: Performance cost?\r
- char c = GetChar(uSeqIndex, uIndex);\r
- unsigned uLetter = CharToLetterEx(c);\r
- return uLetter;\r
- }\r
-\r
-void MSA::SetSeqName(unsigned uSeqIndex, const char szName[])\r
- {\r
- if (uSeqIndex >= m_uSeqCount)\r
- Quit("MSA::SetSeqName(%u, %s), count=%u", uSeqIndex, m_uSeqCount);\r
- delete[] m_szNames[uSeqIndex];\r
- int n = (int) strlen(szName) + 1;\r
- m_szNames[uSeqIndex] = new char[n];\r
- memcpy(m_szNames[uSeqIndex], szName, n);\r
- }\r
-\r
-const char *MSA::GetSeqName(unsigned uSeqIndex) const\r
- {\r
- if (uSeqIndex >= m_uSeqCount)\r
- Quit("MSA::GetSeqName(%u), count=%u", uSeqIndex, m_uSeqCount);\r
- return m_szNames[uSeqIndex];\r
- }\r
-\r
-bool MSA::IsGap(unsigned uSeqIndex, unsigned uIndex) const\r
- {\r
- char c = GetChar(uSeqIndex, uIndex);\r
- return IsGapChar(c);\r
- }\r
-\r
-bool MSA::IsWildcard(unsigned uSeqIndex, unsigned uIndex) const\r
- {\r
- char c = GetChar(uSeqIndex, uIndex);\r
- return IsWildcardChar(c);\r
- }\r
-\r
-void MSA::SetChar(unsigned uSeqIndex, unsigned uIndex, char c)\r
- {\r
- if (uSeqIndex >= m_uSeqCount || uIndex > m_uCacheSeqLength)\r
- Quit("MSA::SetChar(%u,%u)", uSeqIndex, uIndex);\r
-\r
- if (uIndex == m_uCacheSeqLength)\r
- {\r
- const unsigned uNewCacheSeqLength = m_uCacheSeqLength + DEFAULT_SEQ_LENGTH;\r
- for (unsigned n = 0; n < m_uSeqCount; ++n)\r
- {\r
- char *ptrNewSeq = new char[uNewCacheSeqLength+1];\r
- memcpy(ptrNewSeq, m_szSeqs[n], m_uCacheSeqLength);\r
- memset(ptrNewSeq + m_uCacheSeqLength, '?', DEFAULT_SEQ_LENGTH);\r
- ptrNewSeq[uNewCacheSeqLength] = 0;\r
- delete[] m_szSeqs[n];\r
- m_szSeqs[n] = ptrNewSeq;\r
- }\r
-\r
- m_uColCount = uIndex;\r
- m_uCacheSeqLength = uNewCacheSeqLength;\r
- }\r
-\r
- if (uIndex >= m_uColCount)\r
- m_uColCount = uIndex + 1;\r
- m_szSeqs[uSeqIndex][uIndex] = c;\r
- }\r
-\r
-void MSA::GetSeq(unsigned uSeqIndex, Seq &seq) const\r
- {\r
- assert(uSeqIndex < m_uSeqCount);\r
-\r
- seq.Clear();\r
-\r
- for (unsigned n = 0; n < m_uColCount; ++n)\r
- if (!IsGap(uSeqIndex, n))\r
- {\r
- char c = GetChar(uSeqIndex, n);\r
- if (!isalpha(c))\r
- Quit("Invalid character '%c' in sequence", c);\r
- c = toupper(c);\r
- seq.push_back(c);\r
- }\r
- const char *ptrName = GetSeqName(uSeqIndex);\r
- seq.SetName(ptrName);\r
- }\r
-\r
-bool MSA::HasGap() const\r
- {\r
- for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)\r
- for (unsigned n = 0; n < GetColCount(); ++n)\r
- if (IsGap(uSeqIndex, n))\r
- return true;\r
- return false;\r
- }\r
-\r
-bool MSA::IsLegalLetter(unsigned uLetter) const\r
- {\r
- return uLetter < 20;\r
- }\r
-\r
-void MSA::SetSeqCount(unsigned uSeqCount)\r
- {\r
- Free();\r
- SetSize(uSeqCount, DEFAULT_SEQ_LENGTH);\r
- }\r
-\r
-void MSA::CopyCol(unsigned uFromCol, unsigned uToCol)\r
- {\r
- assert(uFromCol < GetColCount());\r
- assert(uToCol < GetColCount());\r
- if (uFromCol == uToCol)\r
- return;\r
-\r
- for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)\r
- {\r
- const char c = GetChar(uSeqIndex, uFromCol);\r
- SetChar(uSeqIndex, uToCol, c);\r
- }\r
- }\r
-\r
-void MSA::Copy(const MSA &msa)\r
- {\r
- Free();\r
- const unsigned uSeqCount = msa.GetSeqCount();\r
- const unsigned uColCount = msa.GetColCount();\r
- SetSize(uSeqCount, uColCount);\r
-\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- {\r
- SetSeqName(uSeqIndex, msa.GetSeqName(uSeqIndex));\r
- const unsigned uId = msa.GetSeqId(uSeqIndex);\r
- SetSeqId(uSeqIndex, uId);\r
- for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
- {\r
- const char c = msa.GetChar(uSeqIndex, uColIndex);\r
- SetChar(uSeqIndex, uColIndex, c);\r
- }\r
- }\r
- }\r
-\r
-bool MSA::IsGapColumn(unsigned uColIndex) const\r
- {\r
- assert(GetSeqCount() > 0);\r
- for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)\r
- if (!IsGap(uSeqIndex, uColIndex))\r
- return false;\r
- return true;\r
- }\r
-\r
-bool MSA::GetSeqIndex(const char *ptrSeqName, unsigned *ptruSeqIndex) const\r
- {\r
- for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)\r
- if (0 == stricmp(ptrSeqName, GetSeqName(uSeqIndex)))\r
- {\r
- *ptruSeqIndex = uSeqIndex;\r
- return true;\r
- }\r
- return false;\r
- }\r
-\r
-void MSA::DeleteCol(unsigned uColIndex)\r
- {\r
- assert(uColIndex < m_uColCount);\r
- size_t n = m_uColCount - uColIndex;\r
- if (n > 0)\r
- {\r
- for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)\r
- {\r
- char *ptrSeq = m_szSeqs[uSeqIndex];\r
- memmove(ptrSeq + uColIndex, ptrSeq + uColIndex + 1, n);\r
- }\r
- }\r
- --m_uColCount;\r
- }\r
-\r
-void MSA::DeleteColumns(unsigned uColIndex, unsigned uColCount)\r
- {\r
- for (unsigned n = 0; n < uColCount; ++n)\r
- DeleteCol(uColIndex);\r
- }\r
-\r
-void MSA::FromFile(TextFile &File)\r
- {\r
- FromFASTAFile(File);\r
- }\r
-\r
-// Weights sum to 1, WCounts sum to NIC\r
-WEIGHT MSA::GetSeqWeight(unsigned uSeqIndex) const\r
- {\r
- assert(uSeqIndex < m_uSeqCount);\r
- WEIGHT w = m_Weights[uSeqIndex];\r
- if (w == wInsane)\r
- Quit("Seq weight not set");\r
- return w;\r
- }\r
-\r
-void MSA::SetSeqWeight(unsigned uSeqIndex, WEIGHT w) const\r
- {\r
- assert(uSeqIndex < m_uSeqCount);\r
- m_Weights[uSeqIndex] = w;\r
- }\r
-\r
-void MSA::NormalizeWeights(WEIGHT wDesiredTotal) const\r
- {\r
- WEIGHT wTotal = 0;\r
- for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex)\r
- wTotal += m_Weights[uSeqIndex];\r
-\r
- if (0 == wTotal)\r
- return;\r
-\r
- const WEIGHT f = wDesiredTotal/wTotal;\r
- for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex)\r
- m_Weights[uSeqIndex] *= f;\r
- }\r
-\r
-void MSA::CalcWeights() const\r
- {\r
- Quit("Calc weights not implemented");\r
- }\r
-\r
-static void FmtChar(char c, unsigned uWidth)\r
- {\r
- Log("%c", c);\r
- for (unsigned n = 0; n < uWidth - 1; ++n)\r
- Log(" ");\r
- }\r
-\r
-static void FmtInt(unsigned u, unsigned uWidth)\r
- {\r
- static char szStr[1024];\r
- assert(uWidth < sizeof(szStr));\r
- if (u > 0)\r
- sprintf(szStr, "%u", u);\r
- else\r
- strcpy(szStr, ".");\r
- Log(szStr);\r
- unsigned n = (unsigned) strlen(szStr);\r
- if (n < uWidth)\r
- for (unsigned i = 0; i < uWidth - n; ++i)\r
- Log(" ");\r
- }\r
-\r
-static void FmtInt0(unsigned u, unsigned uWidth)\r
- {\r
- static char szStr[1024];\r
- assert(uWidth < sizeof(szStr));\r
- sprintf(szStr, "%u", u);\r
- Log(szStr);\r
- unsigned n = (unsigned) strlen(szStr);\r
- if (n < uWidth)\r
- for (unsigned i = 0; i < uWidth - n; ++i)\r
- Log(" ");\r
- }\r
-\r
-static void FmtPad(unsigned n)\r
- {\r
- for (unsigned i = 0; i < n; ++i)\r
- Log(" ");\r
- }\r
-\r
-void MSA::FromSeq(const Seq &s)\r
- {\r
- unsigned uSeqLength = s.Length();\r
- SetSize(1, uSeqLength);\r
- SetSeqName(0, s.GetName());\r
- if (0 != m_SeqIndexToId)\r
- SetSeqId(0, s.GetId());\r
- for (unsigned n = 0; n < uSeqLength; ++n)\r
- SetChar(0, n, s[n]);\r
- }\r
-\r
-unsigned MSA::GetCharCount(unsigned uSeqIndex, unsigned uColIndex) const\r
- {\r
- assert(uSeqIndex < GetSeqCount());\r
- assert(uColIndex < GetColCount());\r
-\r
- unsigned uCol = 0;\r
- for (unsigned n = 0; n <= uColIndex; ++n)\r
- if (!IsGap(uSeqIndex, n))\r
- ++uCol;\r
- return uCol;\r
- }\r
-\r
-void MSA::CopySeq(unsigned uToSeqIndex, const MSA &msaFrom, unsigned uFromSeqIndex)\r
- {\r
- assert(uToSeqIndex < m_uSeqCount);\r
- const unsigned uColCount = msaFrom.GetColCount();\r
- assert(m_uColCount == uColCount ||\r
- (0 == m_uColCount && uColCount <= m_uCacheSeqLength));\r
-\r
- memcpy(m_szSeqs[uToSeqIndex], msaFrom.GetSeqBuffer(uFromSeqIndex), uColCount);\r
- SetSeqName(uToSeqIndex, msaFrom.GetSeqName(uFromSeqIndex));\r
- if (0 == m_uColCount)\r
- m_uColCount = uColCount;\r
- }\r
-\r
-const char *MSA::GetSeqBuffer(unsigned uSeqIndex) const\r
- {\r
- assert(uSeqIndex < m_uSeqCount);\r
- return m_szSeqs[uSeqIndex];\r
- }\r
-\r
-void MSA::DeleteSeq(unsigned uSeqIndex)\r
- {\r
- assert(uSeqIndex < m_uSeqCount);\r
-\r
- delete m_szSeqs[uSeqIndex];\r
- delete m_szNames[uSeqIndex];\r
-\r
- const unsigned uBytesToMove = (m_uSeqCount - uSeqIndex)*sizeof(char *);\r
- if (uBytesToMove > 0)\r
- {\r
- memmove(m_szSeqs + uSeqIndex, m_szSeqs + uSeqIndex + 1, uBytesToMove);\r
- memmove(m_szNames + uSeqIndex, m_szNames + uSeqIndex + 1, uBytesToMove);\r
- }\r
-\r
- --m_uSeqCount;\r
-\r
- delete[] m_Weights;\r
- m_Weights = 0;\r
- }\r
-\r
-bool MSA::IsEmptyCol(unsigned uColIndex) const\r
- {\r
- const unsigned uSeqCount = GetSeqCount();\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- if (!IsGap(uSeqIndex, uColIndex))\r
- return false;\r
- return true;\r
- }\r
-\r
-//void MSA::DeleteEmptyCols(bool bProgress)\r
-// {\r
-// unsigned uColCount = GetColCount();\r
-// for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
-// {\r
-// if (IsEmptyCol(uColIndex))\r
-// {\r
-// if (bProgress)\r
-// {\r
-// Log("Deleting col %u of %u\n", uColIndex, uColCount);\r
-// printf("Deleting col %u of %u\n", uColIndex, uColCount);\r
-// }\r
-// DeleteCol(uColIndex);\r
-// --uColCount;\r
-// }\r
-// }\r
-// }\r
-\r
-unsigned MSA::AlignedColIndexToColIndex(unsigned uAlignedColIndex) const\r
- {\r
- Quit("MSA::AlignedColIndexToColIndex not implemented");\r
- return 0;\r
- }\r
-\r
-WEIGHT MSA::GetTotalSeqWeight() const\r
- {\r
- WEIGHT wTotal = 0;\r
- const unsigned uSeqCount = GetSeqCount();\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- wTotal += m_Weights[uSeqIndex];\r
- return wTotal;\r
- }\r
-\r
-bool MSA::SeqsEq(const MSA &a1, unsigned uSeqIndex1, const MSA &a2,\r
- unsigned uSeqIndex2)\r
- {\r
- Seq s1;\r
- Seq s2;\r
-\r
- a1.GetSeq(uSeqIndex1, s1);\r
- a2.GetSeq(uSeqIndex2, s2);\r
-\r
- s1.StripGaps();\r
- s2.StripGaps();\r
-\r
- return s1.EqIgnoreCase(s2);\r
- }\r
-\r
-unsigned MSA::GetSeqLength(unsigned uSeqIndex) const\r
- {\r
- assert(uSeqIndex < GetSeqCount());\r
-\r
- const unsigned uColCount = GetColCount();\r
- unsigned uLength = 0;\r
- for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
- if (!IsGap(uSeqIndex, uColIndex))\r
- ++uLength;\r
- return uLength;\r
- }\r
-\r
-void MSA::GetPWID(unsigned uSeqIndex1, unsigned uSeqIndex2, double *ptrPWID,\r
- unsigned *ptruPosCount) const\r
- {\r
- assert(uSeqIndex1 < GetSeqCount());\r
- assert(uSeqIndex2 < GetSeqCount());\r
-\r
- unsigned uSameCount = 0;\r
- unsigned uPosCount = 0;\r
- const unsigned uColCount = GetColCount();\r
- for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
- {\r
- char c1 = GetChar(uSeqIndex1, uColIndex);\r
- if (IsGapChar(c1))\r
- continue;\r
- char c2 = GetChar(uSeqIndex2, uColIndex);\r
- if (IsGapChar(c2))\r
- continue;\r
- ++uPosCount;\r
- if (c1 == c2)\r
- ++uSameCount;\r
- }\r
- *ptruPosCount = uPosCount;\r
- if (uPosCount > 0)\r
- *ptrPWID = 100.0 * (double) uSameCount / (double) uPosCount;\r
- else\r
- *ptrPWID = 0;\r
- }\r
-\r
-void MSA::UnWeight()\r
- {\r
- for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)\r
- m_Weights[uSeqIndex] = BTInsane;\r
- }\r
-\r
-unsigned MSA::UniqueResidueTypes(unsigned uColIndex) const\r
- {\r
- assert(uColIndex < GetColCount());\r
-\r
- unsigned Counts[MAX_ALPHA];\r
- memset(Counts, 0, sizeof(Counts));\r
- const unsigned uSeqCount = GetSeqCount();\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- {\r
- if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex))\r
- continue;\r
- const unsigned uLetter = GetLetter(uSeqIndex, uColIndex);\r
- ++(Counts[uLetter]);\r
- }\r
- unsigned uUniqueCount = 0;\r
- for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)\r
- if (Counts[uLetter] > 0)\r
- ++uUniqueCount;\r
- return uUniqueCount;\r
- }\r
-\r
-double MSA::GetOcc(unsigned uColIndex) const\r
- {\r
- unsigned uGapCount = 0;\r
- for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)\r
- if (IsGap(uSeqIndex, uColIndex))\r
- ++uGapCount;\r
- unsigned uSeqCount = GetSeqCount();\r
- return (double) (uSeqCount - uGapCount) / (double) uSeqCount;\r
- }\r
-\r
-void MSA::ToFile(TextFile &File) const\r
- {\r
- if (g_bMSF)\r
- ToMSFFile(File);\r
- else if (g_bAln)\r
- ToAlnFile(File);\r
- else if (g_bHTML)\r
- ToHTMLFile(File);\r
- else if (g_bPHYS)\r
- ToPhySequentialFile(File);\r
- else if (g_bPHYI)\r
- ToPhyInterleavedFile(File);\r
- else\r
- ToFASTAFile(File);\r
- if (0 != g_pstrScoreFileName)\r
- WriteScoreFile(*this);\r
- }\r
-\r
-bool MSA::ColumnHasGap(unsigned uColIndex) const\r
- {\r
- const unsigned uSeqCount = GetSeqCount();\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- if (IsGap(uSeqIndex, uColIndex))\r
- return true;\r
- return false;\r
- }\r
-\r
-void MSA::SetIdCount(unsigned uIdCount)\r
- {\r
- //if (m_uIdCount != 0)\r
- // Quit("MSA::SetIdCount: may only be called once");\r
-\r
- if (m_uIdCount > 0)\r
- {\r
- if (uIdCount > m_uIdCount)\r
- Quit("MSA::SetIdCount: cannot increase count");\r
- return;\r
- }\r
- m_uIdCount = uIdCount;\r
- }\r
-\r
-void MSA::SetSeqId(unsigned uSeqIndex, unsigned uId)\r
- {\r
- assert(uSeqIndex < m_uSeqCount);\r
- assert(uId < m_uIdCount);\r
- if (0 == m_SeqIndexToId)\r
- {\r
- if (0 == m_uIdCount)\r
- Quit("MSA::SetSeqId, SetIdCount has not been called");\r
- m_IdToSeqIndex = new unsigned[m_uIdCount];\r
- m_SeqIndexToId = new unsigned[m_uSeqCount];\r
-\r
- memset(m_IdToSeqIndex, 0xff, m_uIdCount*sizeof(unsigned));\r
- memset(m_SeqIndexToId, 0xff, m_uSeqCount*sizeof(unsigned));\r
- }\r
- m_SeqIndexToId[uSeqIndex] = uId;\r
- m_IdToSeqIndex[uId] = uSeqIndex;\r
- }\r
-\r
-unsigned MSA::GetSeqIndex(unsigned uId) const\r
- {\r
- assert(uId < m_uIdCount);\r
- assert(0 != m_IdToSeqIndex);\r
- unsigned uSeqIndex = m_IdToSeqIndex[uId];\r
- assert(uSeqIndex < m_uSeqCount);\r
- return uSeqIndex;\r
- }\r
-\r
-bool MSA::GetSeqIndex(unsigned uId, unsigned *ptruIndex) const\r
- {\r
- for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex)\r
- {\r
- if (uId == m_SeqIndexToId[uSeqIndex])\r
- {\r
- *ptruIndex = uSeqIndex;\r
- return true;\r
- }\r
- }\r
- return false;\r
- }\r
-\r
-unsigned MSA::GetSeqId(unsigned uSeqIndex) const\r
- {\r
- assert(uSeqIndex < m_uSeqCount);\r
- unsigned uId = m_SeqIndexToId[uSeqIndex];\r
- assert(uId < m_uIdCount);\r
- return uId;\r
- }\r
-\r
-bool MSA::WeightsSet() const\r
- {\r
- return BTInsane != m_Weights[0];\r
- }\r
-\r
-void MSASubsetByIds(const MSA &msaIn, const unsigned Ids[], unsigned uIdCount,\r
- MSA &msaOut)\r
- {\r
- const unsigned uColCount = msaIn.GetColCount();\r
- msaOut.SetSize(uIdCount, uColCount);\r
- for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uIdCount; ++uSeqIndexOut)\r
- {\r
- const unsigned uId = Ids[uSeqIndexOut];\r
-\r
- const unsigned uSeqIndexIn = msaIn.GetSeqIndex(uId);\r
- const char *ptrName = msaIn.GetSeqName(uSeqIndexIn);\r
-\r
- msaOut.SetSeqId(uSeqIndexOut, uId);\r
- msaOut.SetSeqName(uSeqIndexOut, ptrName);\r
-\r
- for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
- {\r
- const char c = msaIn.GetChar(uSeqIndexIn, uColIndex);\r
- msaOut.SetChar(uSeqIndexOut, uColIndex, c);\r
- }\r
- }\r
- }\r
-\r
-// Caller must allocate ptrSeq and ptrLabel as new char[n].\r
-void MSA::AppendSeq(char *ptrSeq, unsigned uSeqLength, char *ptrLabel)\r
- {\r
- if (m_uSeqCount > m_uCacheSeqCount)\r
- Quit("Internal error MSA::AppendSeq");\r
- if (m_uSeqCount == m_uCacheSeqCount)\r
- ExpandCache(m_uSeqCount + 4, uSeqLength);\r
- m_szSeqs[m_uSeqCount] = ptrSeq;\r
- m_szNames[m_uSeqCount] = ptrLabel;\r
- ++m_uSeqCount;\r
- }\r
-\r
-void MSA::ExpandCache(unsigned uSeqCount, unsigned uColCount)\r
- {\r
- if (m_IdToSeqIndex != 0 || m_SeqIndexToId != 0 || uSeqCount < m_uSeqCount)\r
- Quit("Internal error MSA::ExpandCache");\r
-\r
- if (m_uSeqCount > 0 && uColCount != m_uColCount)\r
- Quit("Internal error MSA::ExpandCache, ColCount changed");\r
-\r
- char **NewSeqs = new char *[uSeqCount];\r
- char **NewNames = new char *[uSeqCount];\r
- WEIGHT *NewWeights = new WEIGHT[uSeqCount];\r
-\r
- for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex)\r
- {\r
- NewSeqs[uSeqIndex] = m_szSeqs[uSeqIndex];\r
- NewNames[uSeqIndex] = m_szNames[uSeqIndex];\r
- NewWeights[uSeqIndex] = m_Weights[uSeqIndex];\r
- }\r
-\r
- for (unsigned uSeqIndex = m_uSeqCount; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- {\r
- char *Seq = new char[uColCount];\r
- NewSeqs[uSeqIndex] = Seq;\r
-#if DEBUG\r
- memset(Seq, '?', uColCount);\r
-#endif\r
- }\r
-\r
- delete[] m_szSeqs;\r
- delete[] m_szNames;\r
- delete[] m_Weights;\r
-\r
- m_szSeqs = NewSeqs;\r
- m_szNames = NewNames;\r
- m_Weights = NewWeights;\r
-\r
- m_uCacheSeqCount = uSeqCount;\r
- m_uCacheSeqLength = uColCount;\r
- m_uColCount = uColCount;\r
- }\r
-\r
-void MSA::FixAlpha()\r
- {\r
- ClearInvalidLetterWarning();\r
- for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex)\r
- {\r
- for (unsigned uColIndex = 0; uColIndex < m_uColCount; ++uColIndex)\r
- {\r
- char c = GetChar(uSeqIndex, uColIndex);\r
- if (!IsResidueChar(c) && !IsGapChar(c))\r
- {\r
- char w = GetWildcardChar();\r
- // Warning("Invalid letter '%c', replaced by '%c'", c, w);\r
- InvalidLetterWarning(c, w);\r
- SetChar(uSeqIndex, uColIndex, w);\r
- }\r
- }\r
- }\r
- ReportInvalidLetters();\r
- }\r
-\r
-ALPHA MSA::GuessAlpha() const\r
- {\r
-// If at least MIN_NUCLEO_PCT of the first CHAR_COUNT non-gap\r
-// letters belong to the nucleotide alphabet, guess nucleo.\r
-// Otherwise amino.\r
- const unsigned CHAR_COUNT = 100;\r
- const unsigned MIN_NUCLEO_PCT = 95;\r
-\r
- const unsigned uSeqCount = GetSeqCount();\r
- const unsigned uColCount = GetColCount();\r
- if (0 == uSeqCount)\r
- return ALPHA_Amino;\r
-\r
- unsigned uDNACount = 0;\r
- unsigned uRNACount = 0;\r
- unsigned uTotal = 0;\r
- unsigned i = 0;\r
- for (;;)\r
- {\r
- unsigned uSeqIndex = i/uColCount;\r
- if (uSeqIndex >= uSeqCount)\r
- break;\r
- unsigned uColIndex = i%uColCount;\r
- ++i;\r
- char c = GetChar(uSeqIndex, uColIndex);\r
- if (IsGapChar(c))\r
- continue;\r
- if (IsDNA(c))\r
- ++uDNACount;\r
- if (IsRNA(c))\r
- ++uRNACount;\r
- ++uTotal;\r
- if (uTotal >= CHAR_COUNT)\r
- break;\r
- }\r
- if (uTotal != 0 && ((uRNACount*100)/uTotal) >= MIN_NUCLEO_PCT)\r
- return ALPHA_RNA;\r
- if (uTotal != 0 && ((uDNACount*100)/uTotal) >= MIN_NUCLEO_PCT)\r
- return ALPHA_DNA;\r
- return ALPHA_Amino;\r
- }\r