Mac binaries
[jabaws.git] / website / archive / binaries / mac / src / muscle / subfam.cpp
diff --git a/website/archive/binaries/mac/src/muscle/subfam.cpp b/website/archive/binaries/mac/src/muscle/subfam.cpp
new file mode 100644 (file)
index 0000000..e720cfa
--- /dev/null
@@ -0,0 +1,409 @@
+#include "muscle.h"\r
+#include "tree.h"\r
+#include "textfile.h"  // for test only\r
+#include "msa.h"\r
+#include "seqvect.h"\r
+#include "profile.h"\r
+#ifndef _MSC_VER\r
+#include <unistd.h>    //      for unlink\r
+#endif\r
+\r
+#define TRACE  0\r
+\r
+/***\r
+Find subfamilies from tree by following criteria:\r
+\r
+(a) number of leaves <= max,\r
+(b) is monophyletic, i.e. most recent common ancestor is parent\r
+of no more than one subfamily.\r
+***/\r
+\r
+static unsigned SubFamRecurse(const Tree &tree, unsigned uNodeIndex, unsigned uMaxLeafCount,\r
+  unsigned SubFams[], unsigned &uSubFamCount)\r
+       {\r
+       if (tree.IsLeaf(uNodeIndex))\r
+               return 1;\r
+\r
+       unsigned uLeft = tree.GetLeft(uNodeIndex);\r
+       unsigned uRight = tree.GetRight(uNodeIndex);\r
+       unsigned uLeftCount = SubFamRecurse(tree, uLeft, uMaxLeafCount, SubFams, uSubFamCount);\r
+       unsigned uRightCount = SubFamRecurse(tree, uRight, uMaxLeafCount, SubFams, uSubFamCount);\r
+\r
+       unsigned uLeafCount = uLeftCount + uRightCount;\r
+       if (uLeftCount + uRightCount > uMaxLeafCount)\r
+               {\r
+               if (uLeftCount <= uMaxLeafCount)\r
+                       SubFams[uSubFamCount++] = uLeft;\r
+               if (uRightCount <= uMaxLeafCount)\r
+                       SubFams[uSubFamCount++] = uRight;\r
+               }\r
+       else if (tree.IsRoot(uNodeIndex))\r
+               {\r
+               if (uSubFamCount != 0)\r
+                       Quit("Error in SubFamRecurse");\r
+               SubFams[uSubFamCount++] = uNodeIndex;\r
+               }\r
+\r
+       return uLeafCount;\r
+       }\r
+\r
+void SubFam(const Tree &tree, unsigned uMaxLeafCount, unsigned SubFams[], unsigned *ptruSubFamCount)\r
+       {\r
+       *ptruSubFamCount = 0;\r
+       SubFamRecurse(tree, tree.GetRootNodeIndex(), uMaxLeafCount, SubFams, *ptruSubFamCount);\r
+\r
+#if    TRACE\r
+       {\r
+       Log("\n");\r
+       Log("Tree:\n");\r
+       tree.LogMe();\r
+       //void DrawTree(const Tree &tree);\r
+       //DrawTree(tree);\r
+       Log("\n");\r
+       Log("%d subfams:\n", *ptruSubFamCount);\r
+       for (unsigned i = 0; i < *ptruSubFamCount; ++i)\r
+               Log("  %d=%d", i, SubFams[i]);\r
+       Log("\n");\r
+       }\r
+#endif\r
+       }\r
+\r
+//unsigned SubFams[9999];\r
+//unsigned uSubFamCount;\r
+//\r
+//static unsigned DistFromRoot(const Tree &tree, unsigned uNodeIndex)\r
+//     {\r
+//     const unsigned uRoot = tree.GetRootNodeIndex();\r
+//     unsigned uDist = 0;\r
+//     while (uNodeIndex != uRoot)\r
+//             {\r
+//             ++uDist;\r
+//             uNodeIndex = tree.GetParent(uNodeIndex);\r
+//             }\r
+//     return uDist;\r
+//     }\r
+//\r
+//static void DrawNode(const Tree &tree, unsigned uNodeIndex)\r
+//     {\r
+//     if (!tree.IsLeaf(uNodeIndex))\r
+//             DrawNode(tree, tree.GetLeft(uNodeIndex));\r
+//\r
+//     unsigned uDist = DistFromRoot(tree, uNodeIndex);\r
+//     for (unsigned i = 0; i < 5*uDist; ++i)\r
+//             Log(" ");\r
+//     Log("%d", uNodeIndex);\r
+//     for (unsigned i = 0; i < uSubFamCount; ++i)\r
+//             if (uNodeIndex == SubFams[i])\r
+//                     {\r
+//                     Log("*");\r
+//                     break;\r
+//                     }\r
+//     Log("\n");\r
+//\r
+//     if (!tree.IsLeaf(uNodeIndex))\r
+//             DrawNode(tree, tree.GetRight(uNodeIndex));\r
+//     }\r
+//\r
+//static void DrawTree(const Tree &tree)\r
+//     {\r
+//     unsigned uRoot = tree.GetRootNodeIndex();\r
+//     DrawNode(tree, uRoot);\r
+//     }\r
+//\r
+//void TestSubFams(const char *FileName)\r
+//     {\r
+//     Tree tree;\r
+//     TextFile f(FileName);\r
+//     tree.FromFile(f);\r
+//     SubFam(tree, 5, SubFams, &uSubFamCount);\r
+//     DrawTree(tree);\r
+//     }\r
+\r
+static void SetInFam(const Tree &tree, unsigned uNodeIndex, bool NodeInSubFam[])\r
+       {\r
+       if (tree.IsLeaf(uNodeIndex))\r
+               return;\r
+       unsigned uLeft = tree.GetLeft(uNodeIndex);\r
+       unsigned uRight = tree.GetRight(uNodeIndex);\r
+       NodeInSubFam[uLeft] = true;\r
+       NodeInSubFam[uRight] = true;\r
+\r
+       SetInFam(tree, uLeft, NodeInSubFam);\r
+       SetInFam(tree, uRight, NodeInSubFam);\r
+       }\r
+\r
+void AlignSubFam(SeqVect &vAll, const Tree &GuideTree, unsigned uNodeIndex,\r
+  MSA &msaOut)\r
+       {\r
+       const unsigned uSeqCount = vAll.GetSeqCount();\r
+\r
+       const char *InTmp = "asf_in.tmp";\r
+       const char *OutTmp = "asf_out.tmp";\r
+\r
+       unsigned *Leaves = new unsigned[uSeqCount];\r
+       unsigned uLeafCount;\r
+       GetLeaves(GuideTree, uNodeIndex, Leaves, &uLeafCount);\r
+\r
+       SeqVect v;\r
+       for (unsigned i = 0; i < uLeafCount; ++i)\r
+               {\r
+               unsigned uLeafNodeIndex = Leaves[i];\r
+               unsigned uId = GuideTree.GetLeafId(uLeafNodeIndex);\r
+               Seq &s = vAll.GetSeqById(uId);\r
+               v.AppendSeq(s);\r
+               }\r
+\r
+#if    TRACE\r
+       {\r
+       Log("Align subfam[node=%d, size=%d] ", uNodeIndex, uLeafCount);\r
+       for (unsigned i = 0; i < uLeafCount; ++i)\r
+               Log(" %s", v.GetSeqName(i));\r
+       Log("\n");\r
+       }\r
+#endif\r
+\r
+       TextFile fIn(InTmp, true);\r
+\r
+       v.ToFASTAFile(fIn);\r
+       fIn.Close();\r
+\r
+       char CmdLine[4096];\r
+       sprintf(CmdLine, "probcons %s > %s 2> /dev/null", InTmp, OutTmp);\r
+//     sprintf(CmdLine, "muscle -in %s -out %s -maxiters 1", InTmp, OutTmp);\r
+       int NotUsed = system(CmdLine);\r
+\r
+       TextFile fOut(OutTmp);\r
+       msaOut.FromFile(fOut);\r
+\r
+       for (unsigned uSeqIndex = 0; uSeqIndex < uLeafCount; ++uSeqIndex)\r
+               {\r
+               const char *Name = msaOut.GetSeqName(uSeqIndex);\r
+               unsigned uId = vAll.GetSeqIdFromName(Name);\r
+               msaOut.SetSeqId(uSeqIndex, uId);\r
+               }\r
+\r
+       unlink(InTmp);\r
+       unlink(OutTmp);\r
+\r
+       delete[] Leaves;\r
+       }\r
+\r
+void ProgAlignSubFams()\r
+       {\r
+       MSA msaOut;\r
+\r
+       SetOutputFileName(g_pstrOutFileName);\r
+       SetInputFileName(g_pstrInFileName);\r
+\r
+       SetMaxIters(g_uMaxIters);\r
+       SetSeqWeightMethod(g_SeqWeight1);\r
+\r
+       TextFile fileIn(g_pstrInFileName);\r
+       SeqVect v;\r
+       v.FromFASTAFile(fileIn);\r
+       const unsigned uSeqCount = v.Length();\r
+\r
+       if (0 == uSeqCount)\r
+               Quit("No sequences in input file");\r
+\r
+       ALPHA Alpha = ALPHA_Undefined;\r
+       switch (g_SeqType)\r
+               {\r
+       case SEQTYPE_Auto:\r
+               Alpha = v.GuessAlpha();\r
+               break;\r
+\r
+       case SEQTYPE_Protein:\r
+               Alpha = ALPHA_Amino;\r
+               break;\r
+\r
+       case SEQTYPE_DNA:\r
+               Alpha = ALPHA_DNA;\r
+               break;\r
+\r
+       case SEQTYPE_RNA:\r
+               Alpha = ALPHA_RNA;\r
+               break;\r
+\r
+       default:\r
+               Quit("Invalid seq type");\r
+               }\r
+       SetAlpha(Alpha);\r
+       v.FixAlpha();\r
+\r
+       PTR_SCOREMATRIX UserMatrix = 0;\r
+       if (0 != g_pstrMatrixFileName)\r
+               {\r
+               const char *FileName = g_pstrMatrixFileName;\r
+               const char *Path = getenv("MUSCLE_MXPATH");\r
+               if (Path != 0)\r
+                       {\r
+                       size_t n = strlen(Path) + 1 + strlen(FileName) + 1;\r
+                       char *NewFileName = new char[n];\r
+                       sprintf(NewFileName, "%s/%s", Path, FileName);\r
+                       FileName = NewFileName;\r
+                       }\r
+               TextFile File(FileName);\r
+               UserMatrix = ReadMx(File);\r
+               g_Alpha = ALPHA_Amino;\r
+               g_PPScore = PPSCORE_SP;\r
+               }\r
+\r
+       SetPPScore();\r
+\r
+       if (0 != UserMatrix)\r
+               g_ptrScoreMatrix = UserMatrix;\r
+\r
+       if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha)\r
+               {\r
+               SetPPScore(PPSCORE_SPN);\r
+               g_Distance1 = DISTANCE_Kmer4_6;\r
+               }\r
+\r
+       unsigned uMaxL = 0;\r
+       unsigned uTotL = 0;\r
+       for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+               {\r
+               unsigned L = v.GetSeq(uSeqIndex).Length();\r
+               uTotL += L;\r
+               if (L > uMaxL)\r
+                       uMaxL = L;\r
+               }\r
+\r
+       SetIter(1);\r
+       g_bDiags = g_bDiags1;\r
+       SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount);\r
+\r
+       SetMuscleSeqVect(v);\r
+\r
+       MSA::SetIdCount(uSeqCount);\r
+\r
+// Initialize sequence ids.\r
+// From this point on, ids must somehow propogate from here.\r
+       for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
+               v.SetSeqId(uSeqIndex, uSeqIndex);\r
+\r
+       if (uSeqCount > 1)\r
+               MHackStart(v);\r
+\r
+       if (0 == uSeqCount)\r
+               {\r
+               msaOut.Clear();\r
+               return;\r
+               }\r
+\r
+       if (1 == uSeqCount && ALPHA_Amino == Alpha)\r
+               {\r
+               const Seq &s = v.GetSeq(0);\r
+               msaOut.FromSeq(s);\r
+               return;\r
+               }\r
+\r
+       Tree GuideTree;\r
+       TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1);\r
+       SetMuscleTree(GuideTree);\r
+\r
+       MSA msa;\r
+       if (g_bLow)\r
+               {\r
+               ProgNode *ProgNodes = 0;\r
+               ProgNodes = ProgressiveAlignE(v, GuideTree, msa);\r
+               delete[] ProgNodes;\r
+               }\r
+       else\r
+               ProgressiveAlign(v, GuideTree, msa);\r
+       SetCurrentAlignment(msa);\r
+       TreeFromMSA(msa, GuideTree, g_Cluster2, g_Distance2, g_Root2);\r
+       SetMuscleTree(GuideTree);\r
+\r
+       unsigned *SubFams = new unsigned[uSeqCount];\r
+       unsigned uSubFamCount;\r
+       SubFam(GuideTree, g_uMaxSubFamCount, SubFams, &uSubFamCount);\r
+\r
+       SetProgressDesc("Align node");\r
+       const unsigned uNodeCount = 2*uSeqCount - 1;\r
+\r
+       ProgNode *ProgNodes = new ProgNode[uNodeCount];\r
+       bool *NodeIsSubFam = new bool[uNodeCount];\r
+       bool *NodeInSubFam = new bool[uNodeCount];\r
+\r
+       for (unsigned i = 0; i < uNodeCount; ++i)\r
+               {\r
+               NodeIsSubFam[i] = false;\r
+               NodeInSubFam[i] = false;\r
+               }\r
+\r
+       for (unsigned i = 0; i < uSubFamCount; ++i)\r
+               {\r
+               unsigned uNodeIndex = SubFams[i];\r
+               assert(uNodeIndex < uNodeCount);\r
+               NodeIsSubFam[uNodeIndex] = true;\r
+               SetInFam(GuideTree, uNodeIndex, NodeInSubFam);\r
+               }\r
+\r
+       unsigned uJoin = 0;\r
+       unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode();\r
+       do\r
+               {\r
+               if (NodeIsSubFam[uTreeNodeIndex])\r
+                       {\r
+#if    TRACE\r
+                       Log("Node %d: align subfam\n", uTreeNodeIndex);\r
+#endif\r
+                       ProgNode &Node = ProgNodes[uTreeNodeIndex];\r
+                       AlignSubFam(v, GuideTree, uTreeNodeIndex, Node.m_MSA);\r
+                       Node.m_uLength = Node.m_MSA.GetColCount();\r
+                       }\r
+               else if (!NodeInSubFam[uTreeNodeIndex])\r
+                       {\r
+#if    TRACE\r
+                       Log("Node %d: align two subfams\n", uTreeNodeIndex);\r
+#endif\r
+                       Progress(uJoin, uSubFamCount - 1);\r
+                       ++uJoin;\r
+\r
+                       const unsigned uMergeNodeIndex = uTreeNodeIndex;\r
+                       ProgNode &Parent = ProgNodes[uMergeNodeIndex];\r
+\r
+                       const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex);\r
+                       const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex);\r
+\r
+                       ProgNode &Node1 = ProgNodes[uLeft];\r
+                       ProgNode &Node2 = ProgNodes[uRight];\r
+\r
+                       PWPath Path;\r
+                       AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path);\r
+                       Parent.m_uLength = Parent.m_MSA.GetColCount();\r
+\r
+                       Node1.m_MSA.Clear();\r
+                       Node2.m_MSA.Clear();\r
+                       }\r
+               else\r
+                       {\r
+#if    TRACE\r
+                       Log("Node %d: in subfam\n", uTreeNodeIndex);\r
+#endif\r
+                       ;\r
+                       }\r
+               uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex);\r
+               }\r
+       while (NULL_NEIGHBOR != uTreeNodeIndex);\r
+       ProgressStepsDone();\r
+\r
+       unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex();\r
+       ProgNode &RootProgNode = ProgNodes[uRootNodeIndex];\r
+\r
+       TextFile fOut(g_pstrOutFileName, true);\r
+       MHackEnd(RootProgNode.m_MSA);\r
+       RootProgNode.m_MSA.ToFile(fOut);\r
+\r
+       delete[] NodeInSubFam;\r
+       delete[] NodeIsSubFam;\r
+       delete[] ProgNodes;\r
+       delete[] SubFams;\r
+\r
+       ProgNodes = 0;\r
+       NodeInSubFam = 0;\r
+       NodeIsSubFam = 0;\r
+       SubFams = 0;\r
+       }\r