3 #include "textfile.h" // for test only
\r
8 #include <unistd.h> // for unlink
\r
14 Find subfamilies from tree by following criteria:
\r
16 (a) number of leaves <= max,
\r
17 (b) is monophyletic, i.e. most recent common ancestor is parent
\r
18 of no more than one subfamily.
\r
21 static unsigned SubFamRecurse(const Tree &tree, unsigned uNodeIndex, unsigned uMaxLeafCount,
\r
22 unsigned SubFams[], unsigned &uSubFamCount)
\r
24 if (tree.IsLeaf(uNodeIndex))
\r
27 unsigned uLeft = tree.GetLeft(uNodeIndex);
\r
28 unsigned uRight = tree.GetRight(uNodeIndex);
\r
29 unsigned uLeftCount = SubFamRecurse(tree, uLeft, uMaxLeafCount, SubFams, uSubFamCount);
\r
30 unsigned uRightCount = SubFamRecurse(tree, uRight, uMaxLeafCount, SubFams, uSubFamCount);
\r
32 unsigned uLeafCount = uLeftCount + uRightCount;
\r
33 if (uLeftCount + uRightCount > uMaxLeafCount)
\r
35 if (uLeftCount <= uMaxLeafCount)
\r
36 SubFams[uSubFamCount++] = uLeft;
\r
37 if (uRightCount <= uMaxLeafCount)
\r
38 SubFams[uSubFamCount++] = uRight;
\r
40 else if (tree.IsRoot(uNodeIndex))
\r
42 if (uSubFamCount != 0)
\r
43 Quit("Error in SubFamRecurse");
\r
44 SubFams[uSubFamCount++] = uNodeIndex;
\r
50 void SubFam(const Tree &tree, unsigned uMaxLeafCount, unsigned SubFams[], unsigned *ptruSubFamCount)
\r
52 *ptruSubFamCount = 0;
\r
53 SubFamRecurse(tree, tree.GetRootNodeIndex(), uMaxLeafCount, SubFams, *ptruSubFamCount);
\r
60 //void DrawTree(const Tree &tree);
\r
63 Log("%d subfams:\n", *ptruSubFamCount);
\r
64 for (unsigned i = 0; i < *ptruSubFamCount; ++i)
\r
65 Log(" %d=%d", i, SubFams[i]);
\r
71 //unsigned SubFams[9999];
\r
72 //unsigned uSubFamCount;
\r
74 //static unsigned DistFromRoot(const Tree &tree, unsigned uNodeIndex)
\r
76 // const unsigned uRoot = tree.GetRootNodeIndex();
\r
77 // unsigned uDist = 0;
\r
78 // while (uNodeIndex != uRoot)
\r
81 // uNodeIndex = tree.GetParent(uNodeIndex);
\r
86 //static void DrawNode(const Tree &tree, unsigned uNodeIndex)
\r
88 // if (!tree.IsLeaf(uNodeIndex))
\r
89 // DrawNode(tree, tree.GetLeft(uNodeIndex));
\r
91 // unsigned uDist = DistFromRoot(tree, uNodeIndex);
\r
92 // for (unsigned i = 0; i < 5*uDist; ++i)
\r
94 // Log("%d", uNodeIndex);
\r
95 // for (unsigned i = 0; i < uSubFamCount; ++i)
\r
96 // if (uNodeIndex == SubFams[i])
\r
103 // if (!tree.IsLeaf(uNodeIndex))
\r
104 // DrawNode(tree, tree.GetRight(uNodeIndex));
\r
107 //static void DrawTree(const Tree &tree)
\r
109 // unsigned uRoot = tree.GetRootNodeIndex();
\r
110 // DrawNode(tree, uRoot);
\r
113 //void TestSubFams(const char *FileName)
\r
116 // TextFile f(FileName);
\r
117 // tree.FromFile(f);
\r
118 // SubFam(tree, 5, SubFams, &uSubFamCount);
\r
122 static void SetInFam(const Tree &tree, unsigned uNodeIndex, bool NodeInSubFam[])
\r
124 if (tree.IsLeaf(uNodeIndex))
\r
126 unsigned uLeft = tree.GetLeft(uNodeIndex);
\r
127 unsigned uRight = tree.GetRight(uNodeIndex);
\r
128 NodeInSubFam[uLeft] = true;
\r
129 NodeInSubFam[uRight] = true;
\r
131 SetInFam(tree, uLeft, NodeInSubFam);
\r
132 SetInFam(tree, uRight, NodeInSubFam);
\r
135 void AlignSubFam(SeqVect &vAll, const Tree &GuideTree, unsigned uNodeIndex,
\r
138 const unsigned uSeqCount = vAll.GetSeqCount();
\r
140 const char *InTmp = "asf_in.tmp";
\r
141 const char *OutTmp = "asf_out.tmp";
\r
143 unsigned *Leaves = new unsigned[uSeqCount];
\r
144 unsigned uLeafCount;
\r
145 GetLeaves(GuideTree, uNodeIndex, Leaves, &uLeafCount);
\r
148 for (unsigned i = 0; i < uLeafCount; ++i)
\r
150 unsigned uLeafNodeIndex = Leaves[i];
\r
151 unsigned uId = GuideTree.GetLeafId(uLeafNodeIndex);
\r
152 Seq &s = vAll.GetSeqById(uId);
\r
158 Log("Align subfam[node=%d, size=%d] ", uNodeIndex, uLeafCount);
\r
159 for (unsigned i = 0; i < uLeafCount; ++i)
\r
160 Log(" %s", v.GetSeqName(i));
\r
165 TextFile fIn(InTmp, true);
\r
167 v.ToFASTAFile(fIn);
\r
170 char CmdLine[4096];
\r
171 sprintf(CmdLine, "probcons %s > %s 2> /dev/null", InTmp, OutTmp);
\r
172 // sprintf(CmdLine, "muscle -in %s -out %s -maxiters 1", InTmp, OutTmp);
\r
175 TextFile fOut(OutTmp);
\r
176 msaOut.FromFile(fOut);
\r
178 for (unsigned uSeqIndex = 0; uSeqIndex < uLeafCount; ++uSeqIndex)
\r
180 const char *Name = msaOut.GetSeqName(uSeqIndex);
\r
181 unsigned uId = vAll.GetSeqIdFromName(Name);
\r
182 msaOut.SetSeqId(uSeqIndex, uId);
\r
191 void ProgAlignSubFams()
\r
195 SetOutputFileName(g_pstrOutFileName);
\r
196 SetInputFileName(g_pstrInFileName);
\r
198 SetMaxIters(g_uMaxIters);
\r
199 SetSeqWeightMethod(g_SeqWeight1);
\r
201 TextFile fileIn(g_pstrInFileName);
\r
203 v.FromFASTAFile(fileIn);
\r
204 const unsigned uSeqCount = v.Length();
\r
206 if (0 == uSeqCount)
\r
207 Quit("No sequences in input file");
\r
209 ALPHA Alpha = ALPHA_Undefined;
\r
213 Alpha = v.GuessAlpha();
\r
216 case SEQTYPE_Protein:
\r
217 Alpha = ALPHA_Amino;
\r
229 Quit("Invalid seq type");
\r
234 PTR_SCOREMATRIX UserMatrix = 0;
\r
235 if (0 != g_pstrMatrixFileName)
\r
237 const char *FileName = g_pstrMatrixFileName;
\r
238 const char *Path = getenv("MUSCLE_MXPATH");
\r
241 size_t n = strlen(Path) + 1 + strlen(FileName) + 1;
\r
242 char *NewFileName = new char[n];
\r
243 sprintf(NewFileName, "%s/%s", Path, FileName);
\r
244 FileName = NewFileName;
\r
246 TextFile File(FileName);
\r
247 UserMatrix = ReadMx(File);
\r
248 g_Alpha = ALPHA_Amino;
\r
249 g_PPScore = PPSCORE_SP;
\r
254 if (0 != UserMatrix)
\r
255 g_ptrScoreMatrix = UserMatrix;
\r
257 if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha)
\r
259 SetPPScore(PPSCORE_SPN);
\r
260 g_Distance1 = DISTANCE_Kmer4_6;
\r
263 unsigned uMaxL = 0;
\r
264 unsigned uTotL = 0;
\r
265 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
267 unsigned L = v.GetSeq(uSeqIndex).Length();
\r
274 g_bDiags = g_bDiags1;
\r
275 SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount);
\r
277 SetMuscleSeqVect(v);
\r
279 MSA::SetIdCount(uSeqCount);
\r
281 // Initialize sequence ids.
\r
282 // From this point on, ids must somehow propogate from here.
\r
283 for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
\r
284 v.SetSeqId(uSeqIndex, uSeqIndex);
\r
289 if (0 == uSeqCount)
\r
295 if (1 == uSeqCount && ALPHA_Amino == Alpha)
\r
297 const Seq &s = v.GetSeq(0);
\r
303 TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1);
\r
304 SetMuscleTree(GuideTree);
\r
309 ProgNode *ProgNodes = 0;
\r
310 ProgNodes = ProgressiveAlignE(v, GuideTree, msa);
\r
311 delete[] ProgNodes;
\r
314 ProgressiveAlign(v, GuideTree, msa);
\r
315 SetCurrentAlignment(msa);
\r
316 TreeFromMSA(msa, GuideTree, g_Cluster2, g_Distance2, g_Root2);
\r
317 SetMuscleTree(GuideTree);
\r
319 unsigned *SubFams = new unsigned[uSeqCount];
\r
320 unsigned uSubFamCount;
\r
321 SubFam(GuideTree, g_uMaxSubFamCount, SubFams, &uSubFamCount);
\r
323 SetProgressDesc("Align node");
\r
324 const unsigned uNodeCount = 2*uSeqCount - 1;
\r
326 ProgNode *ProgNodes = new ProgNode[uNodeCount];
\r
327 bool *NodeIsSubFam = new bool[uNodeCount];
\r
328 bool *NodeInSubFam = new bool[uNodeCount];
\r
330 for (unsigned i = 0; i < uNodeCount; ++i)
\r
332 NodeIsSubFam[i] = false;
\r
333 NodeInSubFam[i] = false;
\r
336 for (unsigned i = 0; i < uSubFamCount; ++i)
\r
338 unsigned uNodeIndex = SubFams[i];
\r
339 assert(uNodeIndex < uNodeCount);
\r
340 NodeIsSubFam[uNodeIndex] = true;
\r
341 SetInFam(GuideTree, uNodeIndex, NodeInSubFam);
\r
344 unsigned uJoin = 0;
\r
345 unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode();
\r
348 if (NodeIsSubFam[uTreeNodeIndex])
\r
351 Log("Node %d: align subfam\n", uTreeNodeIndex);
\r
353 ProgNode &Node = ProgNodes[uTreeNodeIndex];
\r
354 AlignSubFam(v, GuideTree, uTreeNodeIndex, Node.m_MSA);
\r
355 Node.m_uLength = Node.m_MSA.GetColCount();
\r
357 else if (!NodeInSubFam[uTreeNodeIndex])
\r
360 Log("Node %d: align two subfams\n", uTreeNodeIndex);
\r
362 Progress(uJoin, uSubFamCount - 1);
\r
365 const unsigned uMergeNodeIndex = uTreeNodeIndex;
\r
366 ProgNode &Parent = ProgNodes[uMergeNodeIndex];
\r
368 const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex);
\r
369 const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex);
\r
371 ProgNode &Node1 = ProgNodes[uLeft];
\r
372 ProgNode &Node2 = ProgNodes[uRight];
\r
375 AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path);
\r
376 Parent.m_uLength = Parent.m_MSA.GetColCount();
\r
378 Node1.m_MSA.Clear();
\r
379 Node2.m_MSA.Clear();
\r
384 Log("Node %d: in subfam\n", uTreeNodeIndex);
\r
388 uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex);
\r
390 while (NULL_NEIGHBOR != uTreeNodeIndex);
\r
391 ProgressStepsDone();
\r
393 unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex();
\r
394 ProgNode &RootProgNode = ProgNodes[uRootNodeIndex];
\r
396 TextFile fOut(g_pstrOutFileName, true);
\r
397 MHackEnd(RootProgNode.m_MSA);
\r
398 RootProgNode.m_MSA.ToFile(fOut);
\r
400 delete[] NodeInSubFam;
\r
401 delete[] NodeIsSubFam;
\r
402 delete[] ProgNodes;
\r