+++ /dev/null
-#include "muscle.h"\r
-#include "tree.h"\r
-#include "textfile.h" // for test only\r
-#include "msa.h"\r
-#include "seqvect.h"\r
-#include "profile.h"\r
-#ifndef _MSC_VER\r
-#include <unistd.h> // for unlink\r
-#endif\r
-\r
-#define TRACE 0\r
-\r
-/***\r
-Find subfamilies from tree by following criteria:\r
-\r
-(a) number of leaves <= max,\r
-(b) is monophyletic, i.e. most recent common ancestor is parent\r
-of no more than one subfamily.\r
-***/\r
-\r
-static unsigned SubFamRecurse(const Tree &tree, unsigned uNodeIndex, unsigned uMaxLeafCount,\r
- unsigned SubFams[], unsigned &uSubFamCount)\r
- {\r
- if (tree.IsLeaf(uNodeIndex))\r
- return 1;\r
-\r
- unsigned uLeft = tree.GetLeft(uNodeIndex);\r
- unsigned uRight = tree.GetRight(uNodeIndex);\r
- unsigned uLeftCount = SubFamRecurse(tree, uLeft, uMaxLeafCount, SubFams, uSubFamCount);\r
- unsigned uRightCount = SubFamRecurse(tree, uRight, uMaxLeafCount, SubFams, uSubFamCount);\r
-\r
- unsigned uLeafCount = uLeftCount + uRightCount;\r
- if (uLeftCount + uRightCount > uMaxLeafCount)\r
- {\r
- if (uLeftCount <= uMaxLeafCount)\r
- SubFams[uSubFamCount++] = uLeft;\r
- if (uRightCount <= uMaxLeafCount)\r
- SubFams[uSubFamCount++] = uRight;\r
- }\r
- else if (tree.IsRoot(uNodeIndex))\r
- {\r
- if (uSubFamCount != 0)\r
- Quit("Error in SubFamRecurse");\r
- SubFams[uSubFamCount++] = uNodeIndex;\r
- }\r
-\r
- return uLeafCount;\r
- }\r
-\r
-void SubFam(const Tree &tree, unsigned uMaxLeafCount, unsigned SubFams[], unsigned *ptruSubFamCount)\r
- {\r
- *ptruSubFamCount = 0;\r
- SubFamRecurse(tree, tree.GetRootNodeIndex(), uMaxLeafCount, SubFams, *ptruSubFamCount);\r
-\r
-#if TRACE\r
- {\r
- Log("\n");\r
- Log("Tree:\n");\r
- tree.LogMe();\r
- //void DrawTree(const Tree &tree);\r
- //DrawTree(tree);\r
- Log("\n");\r
- Log("%d subfams:\n", *ptruSubFamCount);\r
- for (unsigned i = 0; i < *ptruSubFamCount; ++i)\r
- Log(" %d=%d", i, SubFams[i]);\r
- Log("\n");\r
- }\r
-#endif\r
- }\r
-\r
-//unsigned SubFams[9999];\r
-//unsigned uSubFamCount;\r
-//\r
-//static unsigned DistFromRoot(const Tree &tree, unsigned uNodeIndex)\r
-// {\r
-// const unsigned uRoot = tree.GetRootNodeIndex();\r
-// unsigned uDist = 0;\r
-// while (uNodeIndex != uRoot)\r
-// {\r
-// ++uDist;\r
-// uNodeIndex = tree.GetParent(uNodeIndex);\r
-// }\r
-// return uDist;\r
-// }\r
-//\r
-//static void DrawNode(const Tree &tree, unsigned uNodeIndex)\r
-// {\r
-// if (!tree.IsLeaf(uNodeIndex))\r
-// DrawNode(tree, tree.GetLeft(uNodeIndex));\r
-//\r
-// unsigned uDist = DistFromRoot(tree, uNodeIndex);\r
-// for (unsigned i = 0; i < 5*uDist; ++i)\r
-// Log(" ");\r
-// Log("%d", uNodeIndex);\r
-// for (unsigned i = 0; i < uSubFamCount; ++i)\r
-// if (uNodeIndex == SubFams[i])\r
-// {\r
-// Log("*");\r
-// break;\r
-// }\r
-// Log("\n");\r
-//\r
-// if (!tree.IsLeaf(uNodeIndex))\r
-// DrawNode(tree, tree.GetRight(uNodeIndex));\r
-// }\r
-//\r
-//static void DrawTree(const Tree &tree)\r
-// {\r
-// unsigned uRoot = tree.GetRootNodeIndex();\r
-// DrawNode(tree, uRoot);\r
-// }\r
-//\r
-//void TestSubFams(const char *FileName)\r
-// {\r
-// Tree tree;\r
-// TextFile f(FileName);\r
-// tree.FromFile(f);\r
-// SubFam(tree, 5, SubFams, &uSubFamCount);\r
-// DrawTree(tree);\r
-// }\r
-\r
-static void SetInFam(const Tree &tree, unsigned uNodeIndex, bool NodeInSubFam[])\r
- {\r
- if (tree.IsLeaf(uNodeIndex))\r
- return;\r
- unsigned uLeft = tree.GetLeft(uNodeIndex);\r
- unsigned uRight = tree.GetRight(uNodeIndex);\r
- NodeInSubFam[uLeft] = true;\r
- NodeInSubFam[uRight] = true;\r
-\r
- SetInFam(tree, uLeft, NodeInSubFam);\r
- SetInFam(tree, uRight, NodeInSubFam);\r
- }\r
-\r
-void AlignSubFam(SeqVect &vAll, const Tree &GuideTree, unsigned uNodeIndex,\r
- MSA &msaOut)\r
- {\r
- const unsigned uSeqCount = vAll.GetSeqCount();\r
-\r
- const char *InTmp = "asf_in.tmp";\r
- const char *OutTmp = "asf_out.tmp";\r
-\r
- unsigned *Leaves = new unsigned[uSeqCount];\r
- unsigned uLeafCount;\r
- GetLeaves(GuideTree, uNodeIndex, Leaves, &uLeafCount);\r
-\r
- SeqVect v;\r
- for (unsigned i = 0; i < uLeafCount; ++i)\r
- {\r
- unsigned uLeafNodeIndex = Leaves[i];\r
- unsigned uId = GuideTree.GetLeafId(uLeafNodeIndex);\r
- Seq &s = vAll.GetSeqById(uId);\r
- v.AppendSeq(s);\r
- }\r
-\r
-#if TRACE\r
- {\r
- Log("Align subfam[node=%d, size=%d] ", uNodeIndex, uLeafCount);\r
- for (unsigned i = 0; i < uLeafCount; ++i)\r
- Log(" %s", v.GetSeqName(i));\r
- Log("\n");\r
- }\r
-#endif\r
-\r
- TextFile fIn(InTmp, true);\r
-\r
- v.ToFASTAFile(fIn);\r
- fIn.Close();\r
-\r
- char CmdLine[4096];\r
- sprintf(CmdLine, "probcons %s > %s 2> /dev/null", InTmp, OutTmp);\r
-// sprintf(CmdLine, "muscle -in %s -out %s -maxiters 1", InTmp, OutTmp);\r
- int NotUsed = system(CmdLine);\r
-\r
- TextFile fOut(OutTmp);\r
- msaOut.FromFile(fOut);\r
-\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uLeafCount; ++uSeqIndex)\r
- {\r
- const char *Name = msaOut.GetSeqName(uSeqIndex);\r
- unsigned uId = vAll.GetSeqIdFromName(Name);\r
- msaOut.SetSeqId(uSeqIndex, uId);\r
- }\r
-\r
- unlink(InTmp);\r
- unlink(OutTmp);\r
-\r
- delete[] Leaves;\r
- }\r
-\r
-void ProgAlignSubFams()\r
- {\r
- MSA msaOut;\r
-\r
- SetOutputFileName(g_pstrOutFileName);\r
- SetInputFileName(g_pstrInFileName);\r
-\r
- SetMaxIters(g_uMaxIters);\r
- SetSeqWeightMethod(g_SeqWeight1);\r
-\r
- TextFile fileIn(g_pstrInFileName);\r
- SeqVect v;\r
- v.FromFASTAFile(fileIn);\r
- const unsigned uSeqCount = v.Length();\r
-\r
- if (0 == uSeqCount)\r
- Quit("No sequences in input file");\r
-\r
- ALPHA Alpha = ALPHA_Undefined;\r
- switch (g_SeqType)\r
- {\r
- case SEQTYPE_Auto:\r
- Alpha = v.GuessAlpha();\r
- break;\r
-\r
- case SEQTYPE_Protein:\r
- Alpha = ALPHA_Amino;\r
- break;\r
-\r
- case SEQTYPE_DNA:\r
- Alpha = ALPHA_DNA;\r
- break;\r
-\r
- case SEQTYPE_RNA:\r
- Alpha = ALPHA_RNA;\r
- break;\r
-\r
- default:\r
- Quit("Invalid seq type");\r
- }\r
- SetAlpha(Alpha);\r
- v.FixAlpha();\r
-\r
- PTR_SCOREMATRIX UserMatrix = 0;\r
- if (0 != g_pstrMatrixFileName)\r
- {\r
- const char *FileName = g_pstrMatrixFileName;\r
- const char *Path = getenv("MUSCLE_MXPATH");\r
- if (Path != 0)\r
- {\r
- size_t n = strlen(Path) + 1 + strlen(FileName) + 1;\r
- char *NewFileName = new char[n];\r
- sprintf(NewFileName, "%s/%s", Path, FileName);\r
- FileName = NewFileName;\r
- }\r
- TextFile File(FileName);\r
- UserMatrix = ReadMx(File);\r
- g_Alpha = ALPHA_Amino;\r
- g_PPScore = PPSCORE_SP;\r
- }\r
-\r
- SetPPScore();\r
-\r
- if (0 != UserMatrix)\r
- g_ptrScoreMatrix = UserMatrix;\r
-\r
- if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha)\r
- {\r
- SetPPScore(PPSCORE_SPN);\r
- g_Distance1 = DISTANCE_Kmer4_6;\r
- }\r
-\r
- unsigned uMaxL = 0;\r
- unsigned uTotL = 0;\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- {\r
- unsigned L = v.GetSeq(uSeqIndex).Length();\r
- uTotL += L;\r
- if (L > uMaxL)\r
- uMaxL = L;\r
- }\r
-\r
- SetIter(1);\r
- g_bDiags = g_bDiags1;\r
- SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount);\r
-\r
- SetMuscleSeqVect(v);\r
-\r
- MSA::SetIdCount(uSeqCount);\r
-\r
-// Initialize sequence ids.\r
-// From this point on, ids must somehow propogate from here.\r
- for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
- v.SetSeqId(uSeqIndex, uSeqIndex);\r
-\r
- if (uSeqCount > 1)\r
- MHackStart(v);\r
-\r
- if (0 == uSeqCount)\r
- {\r
- msaOut.Clear();\r
- return;\r
- }\r
-\r
- if (1 == uSeqCount && ALPHA_Amino == Alpha)\r
- {\r
- const Seq &s = v.GetSeq(0);\r
- msaOut.FromSeq(s);\r
- return;\r
- }\r
-\r
- Tree GuideTree;\r
- TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1);\r
- SetMuscleTree(GuideTree);\r
-\r
- MSA msa;\r
- if (g_bLow)\r
- {\r
- ProgNode *ProgNodes = 0;\r
- ProgNodes = ProgressiveAlignE(v, GuideTree, msa);\r
- delete[] ProgNodes;\r
- }\r
- else\r
- ProgressiveAlign(v, GuideTree, msa);\r
- SetCurrentAlignment(msa);\r
- TreeFromMSA(msa, GuideTree, g_Cluster2, g_Distance2, g_Root2);\r
- SetMuscleTree(GuideTree);\r
-\r
- unsigned *SubFams = new unsigned[uSeqCount];\r
- unsigned uSubFamCount;\r
- SubFam(GuideTree, g_uMaxSubFamCount, SubFams, &uSubFamCount);\r
-\r
- SetProgressDesc("Align node");\r
- const unsigned uNodeCount = 2*uSeqCount - 1;\r
-\r
- ProgNode *ProgNodes = new ProgNode[uNodeCount];\r
- bool *NodeIsSubFam = new bool[uNodeCount];\r
- bool *NodeInSubFam = new bool[uNodeCount];\r
-\r
- for (unsigned i = 0; i < uNodeCount; ++i)\r
- {\r
- NodeIsSubFam[i] = false;\r
- NodeInSubFam[i] = false;\r
- }\r
-\r
- for (unsigned i = 0; i < uSubFamCount; ++i)\r
- {\r
- unsigned uNodeIndex = SubFams[i];\r
- assert(uNodeIndex < uNodeCount);\r
- NodeIsSubFam[uNodeIndex] = true;\r
- SetInFam(GuideTree, uNodeIndex, NodeInSubFam);\r
- }\r
-\r
- unsigned uJoin = 0;\r
- unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode();\r
- do\r
- {\r
- if (NodeIsSubFam[uTreeNodeIndex])\r
- {\r
-#if TRACE\r
- Log("Node %d: align subfam\n", uTreeNodeIndex);\r
-#endif\r
- ProgNode &Node = ProgNodes[uTreeNodeIndex];\r
- AlignSubFam(v, GuideTree, uTreeNodeIndex, Node.m_MSA);\r
- Node.m_uLength = Node.m_MSA.GetColCount();\r
- }\r
- else if (!NodeInSubFam[uTreeNodeIndex])\r
- {\r
-#if TRACE\r
- Log("Node %d: align two subfams\n", uTreeNodeIndex);\r
-#endif\r
- Progress(uJoin, uSubFamCount - 1);\r
- ++uJoin;\r
-\r
- const unsigned uMergeNodeIndex = uTreeNodeIndex;\r
- ProgNode &Parent = ProgNodes[uMergeNodeIndex];\r
-\r
- const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex);\r
- const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex);\r
-\r
- ProgNode &Node1 = ProgNodes[uLeft];\r
- ProgNode &Node2 = ProgNodes[uRight];\r
-\r
- PWPath Path;\r
- AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path);\r
- Parent.m_uLength = Parent.m_MSA.GetColCount();\r
-\r
- Node1.m_MSA.Clear();\r
- Node2.m_MSA.Clear();\r
- }\r
- else\r
- {\r
-#if TRACE\r
- Log("Node %d: in subfam\n", uTreeNodeIndex);\r
-#endif\r
- ;\r
- }\r
- uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex);\r
- }\r
- while (NULL_NEIGHBOR != uTreeNodeIndex);\r
- ProgressStepsDone();\r
-\r
- unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex();\r
- ProgNode &RootProgNode = ProgNodes[uRootNodeIndex];\r
-\r
- TextFile fOut(g_pstrOutFileName, true);\r
- MHackEnd(RootProgNode.m_MSA);\r
- RootProgNode.m_MSA.ToFile(fOut);\r
-\r
- delete[] NodeInSubFam;\r
- delete[] NodeIsSubFam;\r
- delete[] ProgNodes;\r
- delete[] SubFams;\r
-\r
- ProgNodes = 0;\r
- NodeInSubFam = 0;\r
- NodeIsSubFam = 0;\r
- SubFams = 0;\r
- }\r