23f9473dbbbe3480acd4ba173f9580d15317e6dc
[jabaws.git] / binaries / src / muscle / subfam.cpp
1 #include "muscle.h"\r
2 #include "tree.h"\r
3 #include "textfile.h"   // for test only\r
4 #include "msa.h"\r
5 #include "seqvect.h"\r
6 #include "profile.h"\r
7 #ifndef _MSC_VER\r
8 #include <unistd.h>     //      for unlink\r
9 #endif\r
10 \r
11 #define TRACE   0\r
12 \r
13 /***\r
14 Find subfamilies from tree by following criteria:\r
15 \r
16 (a) number of leaves <= max,\r
17 (b) is monophyletic, i.e. most recent common ancestor is parent\r
18 of no more than one subfamily.\r
19 ***/\r
20 \r
21 static unsigned SubFamRecurse(const Tree &tree, unsigned uNodeIndex, unsigned uMaxLeafCount,\r
22   unsigned SubFams[], unsigned &uSubFamCount)\r
23         {\r
24         if (tree.IsLeaf(uNodeIndex))\r
25                 return 1;\r
26 \r
27         unsigned uLeft = tree.GetLeft(uNodeIndex);\r
28         unsigned uRight = tree.GetRight(uNodeIndex);\r
29         unsigned uLeftCount = SubFamRecurse(tree, uLeft, uMaxLeafCount, SubFams, uSubFamCount);\r
30         unsigned uRightCount = SubFamRecurse(tree, uRight, uMaxLeafCount, SubFams, uSubFamCount);\r
31 \r
32         unsigned uLeafCount = uLeftCount + uRightCount;\r
33         if (uLeftCount + uRightCount > uMaxLeafCount)\r
34                 {\r
35                 if (uLeftCount <= uMaxLeafCount)\r
36                         SubFams[uSubFamCount++] = uLeft;\r
37                 if (uRightCount <= uMaxLeafCount)\r
38                         SubFams[uSubFamCount++] = uRight;\r
39                 }\r
40         else if (tree.IsRoot(uNodeIndex))\r
41                 {\r
42                 if (uSubFamCount != 0)\r
43                         Quit("Error in SubFamRecurse");\r
44                 SubFams[uSubFamCount++] = uNodeIndex;\r
45                 }\r
46 \r
47         return uLeafCount;\r
48         }\r
49 \r
50 void SubFam(const Tree &tree, unsigned uMaxLeafCount, unsigned SubFams[], unsigned *ptruSubFamCount)\r
51         {\r
52         *ptruSubFamCount = 0;\r
53         SubFamRecurse(tree, tree.GetRootNodeIndex(), uMaxLeafCount, SubFams, *ptruSubFamCount);\r
54 \r
55 #if     TRACE\r
56         {\r
57         Log("\n");\r
58         Log("Tree:\n");\r
59         tree.LogMe();\r
60         //void DrawTree(const Tree &tree);\r
61         //DrawTree(tree);\r
62         Log("\n");\r
63         Log("%d subfams:\n", *ptruSubFamCount);\r
64         for (unsigned i = 0; i < *ptruSubFamCount; ++i)\r
65                 Log("  %d=%d", i, SubFams[i]);\r
66         Log("\n");\r
67         }\r
68 #endif\r
69         }\r
70 \r
71 //unsigned SubFams[9999];\r
72 //unsigned uSubFamCount;\r
73 //\r
74 //static unsigned DistFromRoot(const Tree &tree, unsigned uNodeIndex)\r
75 //      {\r
76 //      const unsigned uRoot = tree.GetRootNodeIndex();\r
77 //      unsigned uDist = 0;\r
78 //      while (uNodeIndex != uRoot)\r
79 //              {\r
80 //              ++uDist;\r
81 //              uNodeIndex = tree.GetParent(uNodeIndex);\r
82 //              }\r
83 //      return uDist;\r
84 //      }\r
85 //\r
86 //static void DrawNode(const Tree &tree, unsigned uNodeIndex)\r
87 //      {\r
88 //      if (!tree.IsLeaf(uNodeIndex))\r
89 //              DrawNode(tree, tree.GetLeft(uNodeIndex));\r
90 //\r
91 //      unsigned uDist = DistFromRoot(tree, uNodeIndex);\r
92 //      for (unsigned i = 0; i < 5*uDist; ++i)\r
93 //              Log(" ");\r
94 //      Log("%d", uNodeIndex);\r
95 //      for (unsigned i = 0; i < uSubFamCount; ++i)\r
96 //              if (uNodeIndex == SubFams[i])\r
97 //                      {\r
98 //                      Log("*");\r
99 //                      break;\r
100 //                      }\r
101 //      Log("\n");\r
102 //\r
103 //      if (!tree.IsLeaf(uNodeIndex))\r
104 //              DrawNode(tree, tree.GetRight(uNodeIndex));\r
105 //      }\r
106 //\r
107 //static void DrawTree(const Tree &tree)\r
108 //      {\r
109 //      unsigned uRoot = tree.GetRootNodeIndex();\r
110 //      DrawNode(tree, uRoot);\r
111 //      }\r
112 //\r
113 //void TestSubFams(const char *FileName)\r
114 //      {\r
115 //      Tree tree;\r
116 //      TextFile f(FileName);\r
117 //      tree.FromFile(f);\r
118 //      SubFam(tree, 5, SubFams, &uSubFamCount);\r
119 //      DrawTree(tree);\r
120 //      }\r
121 \r
122 static void SetInFam(const Tree &tree, unsigned uNodeIndex, bool NodeInSubFam[])\r
123         {\r
124         if (tree.IsLeaf(uNodeIndex))\r
125                 return;\r
126         unsigned uLeft = tree.GetLeft(uNodeIndex);\r
127         unsigned uRight = tree.GetRight(uNodeIndex);\r
128         NodeInSubFam[uLeft] = true;\r
129         NodeInSubFam[uRight] = true;\r
130 \r
131         SetInFam(tree, uLeft, NodeInSubFam);\r
132         SetInFam(tree, uRight, NodeInSubFam);\r
133         }\r
134 \r
135 void AlignSubFam(SeqVect &vAll, const Tree &GuideTree, unsigned uNodeIndex,\r
136   MSA &msaOut)\r
137         {\r
138         const unsigned uSeqCount = vAll.GetSeqCount();\r
139 \r
140         const char *InTmp = "asf_in.tmp";\r
141         const char *OutTmp = "asf_out.tmp";\r
142 \r
143         unsigned *Leaves = new unsigned[uSeqCount];\r
144         unsigned uLeafCount;\r
145         GetLeaves(GuideTree, uNodeIndex, Leaves, &uLeafCount);\r
146 \r
147         SeqVect v;\r
148         for (unsigned i = 0; i < uLeafCount; ++i)\r
149                 {\r
150                 unsigned uLeafNodeIndex = Leaves[i];\r
151                 unsigned uId = GuideTree.GetLeafId(uLeafNodeIndex);\r
152                 Seq &s = vAll.GetSeqById(uId);\r
153                 v.AppendSeq(s);\r
154                 }\r
155 \r
156 #if     TRACE\r
157         {\r
158         Log("Align subfam[node=%d, size=%d] ", uNodeIndex, uLeafCount);\r
159         for (unsigned i = 0; i < uLeafCount; ++i)\r
160                 Log(" %s", v.GetSeqName(i));\r
161         Log("\n");\r
162         }\r
163 #endif\r
164 \r
165         TextFile fIn(InTmp, true);\r
166 \r
167         v.ToFASTAFile(fIn);\r
168         fIn.Close();\r
169 \r
170         char CmdLine[4096];\r
171         sprintf(CmdLine, "probcons %s > %s 2> /dev/null", InTmp, OutTmp);\r
172 //      sprintf(CmdLine, "muscle -in %s -out %s -maxiters 1", InTmp, OutTmp);\r
173         system(CmdLine);\r
174 \r
175         TextFile fOut(OutTmp);\r
176         msaOut.FromFile(fOut);\r
177 \r
178         for (unsigned uSeqIndex = 0; uSeqIndex < uLeafCount; ++uSeqIndex)\r
179                 {\r
180                 const char *Name = msaOut.GetSeqName(uSeqIndex);\r
181                 unsigned uId = vAll.GetSeqIdFromName(Name);\r
182                 msaOut.SetSeqId(uSeqIndex, uId);\r
183                 }\r
184 \r
185         unlink(InTmp);\r
186         unlink(OutTmp);\r
187 \r
188         delete[] Leaves;\r
189         }\r
190 \r
191 void ProgAlignSubFams()\r
192         {\r
193         MSA msaOut;\r
194 \r
195         SetOutputFileName(g_pstrOutFileName);\r
196         SetInputFileName(g_pstrInFileName);\r
197 \r
198         SetMaxIters(g_uMaxIters);\r
199         SetSeqWeightMethod(g_SeqWeight1);\r
200 \r
201         TextFile fileIn(g_pstrInFileName);\r
202         SeqVect v;\r
203         v.FromFASTAFile(fileIn);\r
204         const unsigned uSeqCount = v.Length();\r
205 \r
206         if (0 == uSeqCount)\r
207                 Quit("No sequences in input file");\r
208 \r
209         ALPHA Alpha = ALPHA_Undefined;\r
210         switch (g_SeqType)\r
211                 {\r
212         case SEQTYPE_Auto:\r
213                 Alpha = v.GuessAlpha();\r
214                 break;\r
215 \r
216         case SEQTYPE_Protein:\r
217                 Alpha = ALPHA_Amino;\r
218                 break;\r
219 \r
220         case SEQTYPE_DNA:\r
221                 Alpha = ALPHA_DNA;\r
222                 break;\r
223 \r
224         case SEQTYPE_RNA:\r
225                 Alpha = ALPHA_RNA;\r
226                 break;\r
227 \r
228         default:\r
229                 Quit("Invalid seq type");\r
230                 }\r
231         SetAlpha(Alpha);\r
232         v.FixAlpha();\r
233 \r
234         PTR_SCOREMATRIX UserMatrix = 0;\r
235         if (0 != g_pstrMatrixFileName)\r
236                 {\r
237                 const char *FileName = g_pstrMatrixFileName;\r
238                 const char *Path = getenv("MUSCLE_MXPATH");\r
239                 if (Path != 0)\r
240                         {\r
241                         size_t n = strlen(Path) + 1 + strlen(FileName) + 1;\r
242                         char *NewFileName = new char[n];\r
243                         sprintf(NewFileName, "%s/%s", Path, FileName);\r
244                         FileName = NewFileName;\r
245                         }\r
246                 TextFile File(FileName);\r
247                 UserMatrix = ReadMx(File);\r
248                 g_Alpha = ALPHA_Amino;\r
249                 g_PPScore = PPSCORE_SP;\r
250                 }\r
251 \r
252         SetPPScore();\r
253 \r
254         if (0 != UserMatrix)\r
255                 g_ptrScoreMatrix = UserMatrix;\r
256 \r
257         if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha)\r
258                 {\r
259                 SetPPScore(PPSCORE_SPN);\r
260                 g_Distance1 = DISTANCE_Kmer4_6;\r
261                 }\r
262 \r
263         unsigned uMaxL = 0;\r
264         unsigned uTotL = 0;\r
265         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
266                 {\r
267                 unsigned L = v.GetSeq(uSeqIndex).Length();\r
268                 uTotL += L;\r
269                 if (L > uMaxL)\r
270                         uMaxL = L;\r
271                 }\r
272 \r
273         SetIter(1);\r
274         g_bDiags = g_bDiags1;\r
275         SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount);\r
276 \r
277         SetMuscleSeqVect(v);\r
278 \r
279         MSA::SetIdCount(uSeqCount);\r
280 \r
281 // Initialize sequence ids.\r
282 // From this point on, ids must somehow propogate from here.\r
283         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
284                 v.SetSeqId(uSeqIndex, uSeqIndex);\r
285 \r
286         if (uSeqCount > 1)\r
287                 MHackStart(v);\r
288 \r
289         if (0 == uSeqCount)\r
290                 {\r
291                 msaOut.Clear();\r
292                 return;\r
293                 }\r
294 \r
295         if (1 == uSeqCount && ALPHA_Amino == Alpha)\r
296                 {\r
297                 const Seq &s = v.GetSeq(0);\r
298                 msaOut.FromSeq(s);\r
299                 return;\r
300                 }\r
301 \r
302         Tree GuideTree;\r
303         TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1);\r
304         SetMuscleTree(GuideTree);\r
305 \r
306         MSA msa;\r
307         if (g_bLow)\r
308                 {\r
309                 ProgNode *ProgNodes = 0;\r
310                 ProgNodes = ProgressiveAlignE(v, GuideTree, msa);\r
311                 delete[] ProgNodes;\r
312                 }\r
313         else\r
314                 ProgressiveAlign(v, GuideTree, msa);\r
315         SetCurrentAlignment(msa);\r
316         TreeFromMSA(msa, GuideTree, g_Cluster2, g_Distance2, g_Root2);\r
317         SetMuscleTree(GuideTree);\r
318 \r
319         unsigned *SubFams = new unsigned[uSeqCount];\r
320         unsigned uSubFamCount;\r
321         SubFam(GuideTree, g_uMaxSubFamCount, SubFams, &uSubFamCount);\r
322 \r
323         SetProgressDesc("Align node");\r
324         const unsigned uNodeCount = 2*uSeqCount - 1;\r
325 \r
326         ProgNode *ProgNodes = new ProgNode[uNodeCount];\r
327         bool *NodeIsSubFam = new bool[uNodeCount];\r
328         bool *NodeInSubFam = new bool[uNodeCount];\r
329 \r
330         for (unsigned i = 0; i < uNodeCount; ++i)\r
331                 {\r
332                 NodeIsSubFam[i] = false;\r
333                 NodeInSubFam[i] = false;\r
334                 }\r
335 \r
336         for (unsigned i = 0; i < uSubFamCount; ++i)\r
337                 {\r
338                 unsigned uNodeIndex = SubFams[i];\r
339                 assert(uNodeIndex < uNodeCount);\r
340                 NodeIsSubFam[uNodeIndex] = true;\r
341                 SetInFam(GuideTree, uNodeIndex, NodeInSubFam);\r
342                 }\r
343 \r
344         unsigned uJoin = 0;\r
345         unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode();\r
346         do\r
347                 {\r
348                 if (NodeIsSubFam[uTreeNodeIndex])\r
349                         {\r
350 #if     TRACE\r
351                         Log("Node %d: align subfam\n", uTreeNodeIndex);\r
352 #endif\r
353                         ProgNode &Node = ProgNodes[uTreeNodeIndex];\r
354                         AlignSubFam(v, GuideTree, uTreeNodeIndex, Node.m_MSA);\r
355                         Node.m_uLength = Node.m_MSA.GetColCount();\r
356                         }\r
357                 else if (!NodeInSubFam[uTreeNodeIndex])\r
358                         {\r
359 #if     TRACE\r
360                         Log("Node %d: align two subfams\n", uTreeNodeIndex);\r
361 #endif\r
362                         Progress(uJoin, uSubFamCount - 1);\r
363                         ++uJoin;\r
364 \r
365                         const unsigned uMergeNodeIndex = uTreeNodeIndex;\r
366                         ProgNode &Parent = ProgNodes[uMergeNodeIndex];\r
367 \r
368                         const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex);\r
369                         const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex);\r
370 \r
371                         ProgNode &Node1 = ProgNodes[uLeft];\r
372                         ProgNode &Node2 = ProgNodes[uRight];\r
373 \r
374                         PWPath Path;\r
375                         AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path);\r
376                         Parent.m_uLength = Parent.m_MSA.GetColCount();\r
377 \r
378                         Node1.m_MSA.Clear();\r
379                         Node2.m_MSA.Clear();\r
380                         }\r
381                 else\r
382                         {\r
383 #if     TRACE\r
384                         Log("Node %d: in subfam\n", uTreeNodeIndex);\r
385 #endif\r
386                         ;\r
387                         }\r
388                 uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex);\r
389                 }\r
390         while (NULL_NEIGHBOR != uTreeNodeIndex);\r
391         ProgressStepsDone();\r
392 \r
393         unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex();\r
394         ProgNode &RootProgNode = ProgNodes[uRootNodeIndex];\r
395 \r
396         TextFile fOut(g_pstrOutFileName, true);\r
397         MHackEnd(RootProgNode.m_MSA);\r
398         RootProgNode.m_MSA.ToFile(fOut);\r
399 \r
400         delete[] NodeInSubFam;\r
401         delete[] NodeIsSubFam;\r
402         delete[] ProgNodes;\r
403         delete[] SubFams;\r
404 \r
405         ProgNodes = 0;\r
406         NodeInSubFam = 0;\r
407         NodeIsSubFam = 0;\r
408         SubFams = 0;\r
409         }\r