--- /dev/null
+#include "muscle.h"\r
+#include "tree.h"\r
+#include "textfile.h" // for test only\r
+#include "msa.h"\r
+#include "seqvect.h"\r
+#include "profile.h"\r
+#ifndef _MSC_VER\r
+#include <unistd.h> // for unlink\r
+#endif\r
+\r
+#define TRACE 0\r
+\r
+/***\r
+Find subfamilies from tree by following criteria:\r
+\r
+(a) number of leaves <= max,\r
+(b) is monophyletic, i.e. most recent common ancestor is parent\r
+of no more than one subfamily.\r
+***/\r
+\r
+static unsigned SubFamRecurse(const Tree &tree, unsigned uNodeIndex, unsigned uMaxLeafCount,\r
+ unsigned SubFams[], unsigned &uSubFamCount)\r
+ {\r
+ if (tree.IsLeaf(uNodeIndex))\r
+ return 1;\r
+\r
+ unsigned uLeft = tree.GetLeft(uNodeIndex);\r
+ unsigned uRight = tree.GetRight(uNodeIndex);\r
+ unsigned uLeftCount = SubFamRecurse(tree, uLeft, uMaxLeafCount, SubFams, uSubFamCount);\r
+ unsigned uRightCount = SubFamRecurse(tree, uRight, uMaxLeafCount, SubFams, uSubFamCount);\r
+\r
+ unsigned uLeafCount = uLeftCount + uRightCount;\r
+ if (uLeftCount + uRightCount > uMaxLeafCount)\r
+ {\r
+ if (uLeftCount <= uMaxLeafCount)\r
+ SubFams[uSubFamCount++] = uLeft;\r
+ if (uRightCount <= uMaxLeafCount)\r
+ SubFams[uSubFamCount++] = uRight;\r
+ }\r
+ else if (tree.IsRoot(uNodeIndex))\r
+ {\r
+ if (uSubFamCount != 0)\r
+ Quit("Error in SubFamRecurse");\r
+ SubFams[uSubFamCount++] = uNodeIndex;\r
+ }\r
+\r
+ return uLeafCount;\r
+ }\r
+\r
+void SubFam(const Tree &tree, unsigned uMaxLeafCount, unsigned SubFams[], unsigned *ptruSubFamCount)\r
+ {\r
+ *ptruSubFamCount = 0;\r
+ SubFamRecurse(tree, tree.GetRootNodeIndex(), uMaxLeafCount, SubFams, *ptruSubFamCount);\r
+\r
+#if TRACE\r
+ {\r
+ Log("\n");\r
+ Log("Tree:\n");\r
+ tree.LogMe();\r
+ //void DrawTree(const Tree &tree);\r
+ //DrawTree(tree);\r
+ Log("\n");\r
+ Log("%d subfams:\n", *ptruSubFamCount);\r
+ for (unsigned i = 0; i < *ptruSubFamCount; ++i)\r
+ Log(" %d=%d", i, SubFams[i]);\r
+ Log("\n");\r
+ }\r
+#endif\r
+ }\r
+\r
+//unsigned SubFams[9999];\r
+//unsigned uSubFamCount;\r
+//\r
+//static unsigned DistFromRoot(const Tree &tree, unsigned uNodeIndex)\r
+// {\r
+// const unsigned uRoot = tree.GetRootNodeIndex();\r
+// unsigned uDist = 0;\r
+// while (uNodeIndex != uRoot)\r
+// {\r
+// ++uDist;\r
+// uNodeIndex = tree.GetParent(uNodeIndex);\r
+// }\r
+// return uDist;\r
+// }\r
+//\r
+//static void DrawNode(const Tree &tree, unsigned uNodeIndex)\r
+// {\r
+// if (!tree.IsLeaf(uNodeIndex))\r
+// DrawNode(tree, tree.GetLeft(uNodeIndex));\r
+//\r
+// unsigned uDist = DistFromRoot(tree, uNodeIndex);\r
+// for (unsigned i = 0; i < 5*uDist; ++i)\r
+// Log(" ");\r
+// Log("%d", uNodeIndex);\r
+// for (unsigned i = 0; i < uSubFamCount; ++i)\r
+// if (uNodeIndex == SubFams[i])\r
+// {\r
+// Log("*");\r
+// break;\r
+// }\r
+// Log("\n");\r
+//\r
+// if (!tree.IsLeaf(uNodeIndex))\r
+// DrawNode(tree, tree.GetRight(uNodeIndex));\r
+// }\r
+//\r
+//static void DrawTree(const Tree &tree)\r
+// {\r
+// unsigned uRoot = tree.GetRootNodeIndex();\r
+// DrawNode(tree, uRoot);\r
+// }\r
+//\r
+//void TestSubFams(const char *FileName)\r
+// {\r
+// Tree tree;\r
+// TextFile f(FileName);\r
+// tree.FromFile(f);\r
+// SubFam(tree, 5, SubFams, &uSubFamCount);\r
+// DrawTree(tree);\r
+// }\r
+\r
+static void SetInFam(const Tree &tree, unsigned uNodeIndex, bool NodeInSubFam[])\r
+ {\r
+ if (tree.IsLeaf(uNodeIndex))\r
+ return;\r
+ unsigned uLeft = tree.GetLeft(uNodeIndex);\r
+ unsigned uRight = tree.GetRight(uNodeIndex);\r
+ NodeInSubFam[uLeft] = true;\r
+ NodeInSubFam[uRight] = true;\r
+\r
+ SetInFam(tree, uLeft, NodeInSubFam);\r
+ SetInFam(tree, uRight, NodeInSubFam);\r
+ }\r
+\r
+void AlignSubFam(SeqVect &vAll, const Tree &GuideTree, unsigned uNodeIndex,\r
+ MSA &msaOut)\r
+ {\r
+ const unsigned uSeqCount = vAll.GetSeqCount();\r
+\r
+ const char *InTmp = "asf_in.tmp";\r
+ const char *OutTmp = "asf_out.tmp";\r
+\r
+ unsigned *Leaves = new unsigned[uSeqCount];\r
+ unsigned uLeafCount;\r
+ GetLeaves(GuideTree, uNodeIndex, Leaves, &uLeafCount);\r
+\r
+ SeqVect v;\r
+ for (unsigned i = 0; i < uLeafCount; ++i)\r
+ {\r
+ unsigned uLeafNodeIndex = Leaves[i];\r
+ unsigned uId = GuideTree.GetLeafId(uLeafNodeIndex);\r
+ Seq &s = vAll.GetSeqById(uId);\r
+ v.AppendSeq(s);\r
+ }\r
+\r
+#if TRACE\r
+ {\r
+ Log("Align subfam[node=%d, size=%d] ", uNodeIndex, uLeafCount);\r
+ for (unsigned i = 0; i < uLeafCount; ++i)\r
+ Log(" %s", v.GetSeqName(i));\r
+ Log("\n");\r
+ }\r
+#endif\r
+\r
+ TextFile fIn(InTmp, true);\r
+\r
+ v.ToFASTAFile(fIn);\r
+ fIn.Close();\r
+\r
+ char CmdLine[4096];\r
+ sprintf(CmdLine, "probcons %s > %s 2> /dev/null", InTmp, OutTmp);\r
+// sprintf(CmdLine, "muscle -in %s -out %s -maxiters 1", InTmp, OutTmp);\r
+ int NotUsed = system(CmdLine);\r
+\r
+ TextFile fOut(OutTmp);\r
+ msaOut.FromFile(fOut);\r
+\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uLeafCount; ++uSeqIndex)\r
+ {\r
+ const char *Name = msaOut.GetSeqName(uSeqIndex);\r
+ unsigned uId = vAll.GetSeqIdFromName(Name);\r
+ msaOut.SetSeqId(uSeqIndex, uId);\r
+ }\r
+\r
+ unlink(InTmp);\r
+ unlink(OutTmp);\r
+\r
+ delete[] Leaves;\r
+ }\r
+\r
+void ProgAlignSubFams()\r
+ {\r
+ MSA msaOut;\r
+\r
+ SetOutputFileName(g_pstrOutFileName);\r
+ SetInputFileName(g_pstrInFileName);\r
+\r
+ SetMaxIters(g_uMaxIters);\r
+ SetSeqWeightMethod(g_SeqWeight1);\r
+\r
+ TextFile fileIn(g_pstrInFileName);\r
+ SeqVect v;\r
+ v.FromFASTAFile(fileIn);\r
+ const unsigned uSeqCount = v.Length();\r
+\r
+ if (0 == uSeqCount)\r
+ Quit("No sequences in input file");\r
+\r
+ ALPHA Alpha = ALPHA_Undefined;\r
+ switch (g_SeqType)\r
+ {\r
+ case SEQTYPE_Auto:\r
+ Alpha = v.GuessAlpha();\r
+ break;\r
+\r
+ case SEQTYPE_Protein:\r
+ Alpha = ALPHA_Amino;\r
+ break;\r
+\r
+ case SEQTYPE_DNA:\r
+ Alpha = ALPHA_DNA;\r
+ break;\r
+\r
+ case SEQTYPE_RNA:\r
+ Alpha = ALPHA_RNA;\r
+ break;\r
+\r
+ default:\r
+ Quit("Invalid seq type");\r
+ }\r
+ SetAlpha(Alpha);\r
+ v.FixAlpha();\r
+\r
+ PTR_SCOREMATRIX UserMatrix = 0;\r
+ if (0 != g_pstrMatrixFileName)\r
+ {\r
+ const char *FileName = g_pstrMatrixFileName;\r
+ const char *Path = getenv("MUSCLE_MXPATH");\r
+ if (Path != 0)\r
+ {\r
+ size_t n = strlen(Path) + 1 + strlen(FileName) + 1;\r
+ char *NewFileName = new char[n];\r
+ sprintf(NewFileName, "%s/%s", Path, FileName);\r
+ FileName = NewFileName;\r
+ }\r
+ TextFile File(FileName);\r
+ UserMatrix = ReadMx(File);\r
+ g_Alpha = ALPHA_Amino;\r
+ g_PPScore = PPSCORE_SP;\r
+ }\r
+\r
+ SetPPScore();\r
+\r
+ if (0 != UserMatrix)\r
+ g_ptrScoreMatrix = UserMatrix;\r
+\r
+ if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha)\r
+ {\r
+ SetPPScore(PPSCORE_SPN);\r
+ g_Distance1 = DISTANCE_Kmer4_6;\r
+ }\r
+\r
+ unsigned uMaxL = 0;\r
+ unsigned uTotL = 0;\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ {\r
+ unsigned L = v.GetSeq(uSeqIndex).Length();\r
+ uTotL += L;\r
+ if (L > uMaxL)\r
+ uMaxL = L;\r
+ }\r
+\r
+ SetIter(1);\r
+ g_bDiags = g_bDiags1;\r
+ SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount);\r
+\r
+ SetMuscleSeqVect(v);\r
+\r
+ MSA::SetIdCount(uSeqCount);\r
+\r
+// Initialize sequence ids.\r
+// From this point on, ids must somehow propogate from here.\r
+ for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+ v.SetSeqId(uSeqIndex, uSeqIndex);\r
+\r
+ if (uSeqCount > 1)\r
+ MHackStart(v);\r
+\r
+ if (0 == uSeqCount)\r
+ {\r
+ msaOut.Clear();\r
+ return;\r
+ }\r
+\r
+ if (1 == uSeqCount && ALPHA_Amino == Alpha)\r
+ {\r
+ const Seq &s = v.GetSeq(0);\r
+ msaOut.FromSeq(s);\r
+ return;\r
+ }\r
+\r
+ Tree GuideTree;\r
+ TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1);\r
+ SetMuscleTree(GuideTree);\r
+\r
+ MSA msa;\r
+ if (g_bLow)\r
+ {\r
+ ProgNode *ProgNodes = 0;\r
+ ProgNodes = ProgressiveAlignE(v, GuideTree, msa);\r
+ delete[] ProgNodes;\r
+ }\r
+ else\r
+ ProgressiveAlign(v, GuideTree, msa);\r
+ SetCurrentAlignment(msa);\r
+ TreeFromMSA(msa, GuideTree, g_Cluster2, g_Distance2, g_Root2);\r
+ SetMuscleTree(GuideTree);\r
+\r
+ unsigned *SubFams = new unsigned[uSeqCount];\r
+ unsigned uSubFamCount;\r
+ SubFam(GuideTree, g_uMaxSubFamCount, SubFams, &uSubFamCount);\r
+\r
+ SetProgressDesc("Align node");\r
+ const unsigned uNodeCount = 2*uSeqCount - 1;\r
+\r
+ ProgNode *ProgNodes = new ProgNode[uNodeCount];\r
+ bool *NodeIsSubFam = new bool[uNodeCount];\r
+ bool *NodeInSubFam = new bool[uNodeCount];\r
+\r
+ for (unsigned i = 0; i < uNodeCount; ++i)\r
+ {\r
+ NodeIsSubFam[i] = false;\r
+ NodeInSubFam[i] = false;\r
+ }\r
+\r
+ for (unsigned i = 0; i < uSubFamCount; ++i)\r
+ {\r
+ unsigned uNodeIndex = SubFams[i];\r
+ assert(uNodeIndex < uNodeCount);\r
+ NodeIsSubFam[uNodeIndex] = true;\r
+ SetInFam(GuideTree, uNodeIndex, NodeInSubFam);\r
+ }\r
+\r
+ unsigned uJoin = 0;\r
+ unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode();\r
+ do\r
+ {\r
+ if (NodeIsSubFam[uTreeNodeIndex])\r
+ {\r
+#if TRACE\r
+ Log("Node %d: align subfam\n", uTreeNodeIndex);\r
+#endif\r
+ ProgNode &Node = ProgNodes[uTreeNodeIndex];\r
+ AlignSubFam(v, GuideTree, uTreeNodeIndex, Node.m_MSA);\r
+ Node.m_uLength = Node.m_MSA.GetColCount();\r
+ }\r
+ else if (!NodeInSubFam[uTreeNodeIndex])\r
+ {\r
+#if TRACE\r
+ Log("Node %d: align two subfams\n", uTreeNodeIndex);\r
+#endif\r
+ Progress(uJoin, uSubFamCount - 1);\r
+ ++uJoin;\r
+\r
+ const unsigned uMergeNodeIndex = uTreeNodeIndex;\r
+ ProgNode &Parent = ProgNodes[uMergeNodeIndex];\r
+\r
+ const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex);\r
+ const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex);\r
+\r
+ ProgNode &Node1 = ProgNodes[uLeft];\r
+ ProgNode &Node2 = ProgNodes[uRight];\r
+\r
+ PWPath Path;\r
+ AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path);\r
+ Parent.m_uLength = Parent.m_MSA.GetColCount();\r
+\r
+ Node1.m_MSA.Clear();\r
+ Node2.m_MSA.Clear();\r
+ }\r
+ else\r
+ {\r
+#if TRACE\r
+ Log("Node %d: in subfam\n", uTreeNodeIndex);\r
+#endif\r
+ ;\r
+ }\r
+ uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex);\r
+ }\r
+ while (NULL_NEIGHBOR != uTreeNodeIndex);\r
+ ProgressStepsDone();\r
+\r
+ unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex();\r
+ ProgNode &RootProgNode = ProgNodes[uRootNodeIndex];\r
+\r
+ TextFile fOut(g_pstrOutFileName, true);\r
+ MHackEnd(RootProgNode.m_MSA);\r
+ RootProgNode.m_MSA.ToFile(fOut);\r
+\r
+ delete[] NodeInSubFam;\r
+ delete[] NodeIsSubFam;\r
+ delete[] ProgNodes;\r
+ delete[] SubFams;\r
+\r
+ ProgNodes = 0;\r
+ NodeInSubFam = 0;\r
+ NodeIsSubFam = 0;\r
+ SubFams = 0;\r
+ }\r