Next version of JABA
[jabaws.git] / binaries / src / muscle / refinesubfams.cpp
1 #include "muscle.h"\r
2 #include "msa.h"\r
3 #include "tree.h"\r
4 #include "clust.h"\r
5 #include "profile.h"\r
6 #include "pwpath.h"\r
7 \r
8 #define TRACE 0\r
9 \r
10 static void ProgressiveAlignSubfams(const Tree &tree, const unsigned Subfams[],\r
11   unsigned uSubfamCount, const MSA SubfamMSAs[], MSA &msa);\r
12 \r
13 // Identify subfamilies in a tree.\r
14 // Returns array of internal node indexes, one for each subfamily.\r
15 // First try is to select groups by height (which should approximate\r
16 // minimum percent identity), if this gives too many subfamilies then\r
17 // we cut at a point that gives the maximum allowed number of subfams.\r
18 static void GetSubfams(const Tree &tree, double dMaxHeight,\r
19   unsigned uMaxSubfamCount, unsigned **ptrptrSubfams, unsigned *ptruSubfamCount)\r
20         {\r
21         const unsigned uNodeCount = tree.GetNodeCount();\r
22 \r
23         unsigned *Subfams = new unsigned[uNodeCount];\r
24 \r
25         unsigned uSubfamCount;\r
26         ClusterByHeight(tree, dMaxHeight, Subfams, &uSubfamCount);\r
27 \r
28         if (uSubfamCount > uMaxSubfamCount)\r
29                 ClusterBySubfamCount(tree, uMaxSubfamCount, Subfams, &uSubfamCount);\r
30 \r
31         *ptrptrSubfams = Subfams;\r
32         *ptruSubfamCount = uSubfamCount;\r
33         }\r
34 \r
35 static void LogSubfams(const Tree &tree, const unsigned Subfams[],\r
36   unsigned uSubfamCount)\r
37         {\r
38         const unsigned uNodeCount = tree.GetNodeCount();\r
39         Log("%u subfamilies found\n", uSubfamCount);\r
40         Log("Subfam  Sequence\n");\r
41         Log("------  --------\n");\r
42         unsigned *Leaves = new unsigned[uNodeCount];\r
43         for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex)\r
44                 {\r
45                 unsigned uSubfamNodeIndex = Subfams[uSubfamIndex];\r
46                 unsigned uLeafCount;\r
47                 GetLeaves(tree, uSubfamNodeIndex, Leaves, &uLeafCount);\r
48                 for (unsigned uLeafIndex = 0; uLeafIndex < uLeafCount; ++uLeafIndex)\r
49                         Log("%6u  %s\n", uSubfamIndex + 1, tree.GetLeafName(Leaves[uLeafIndex]));\r
50                 Log("\n");\r
51                 }\r
52         delete[] Leaves;\r
53         }\r
54 \r
55 bool RefineSubfams(MSA &msa, const Tree &tree, unsigned uIters)\r
56         {\r
57         const unsigned uSeqCount = msa.GetSeqCount();\r
58         if (uSeqCount < 3)\r
59                 return false;\r
60 \r
61         const double dMaxHeight = 0.6;\r
62         const unsigned uMaxSubfamCount = 16;\r
63         const unsigned uNodeCount = tree.GetNodeCount();\r
64 \r
65         unsigned *Subfams;\r
66         unsigned uSubfamCount;\r
67         GetSubfams(tree, dMaxHeight, uMaxSubfamCount, &Subfams, &uSubfamCount);\r
68         assert(uSubfamCount <= uSeqCount);\r
69 \r
70         if (g_bVerbose)\r
71                 LogSubfams(tree, Subfams, uSubfamCount);\r
72 \r
73         MSA *SubfamMSAs = new MSA[uSubfamCount];\r
74         unsigned *Leaves = new unsigned[uSeqCount];\r
75         unsigned *Ids = new unsigned[uSeqCount];\r
76 \r
77         bool bAnyChanges = false;\r
78         for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex)\r
79                 {\r
80                 unsigned uSubfam = Subfams[uSubfamIndex];\r
81                 unsigned uLeafCount;\r
82                 GetLeaves(tree, uSubfam, Leaves, &uLeafCount);\r
83                 assert(uLeafCount <= uSeqCount);\r
84 \r
85                 LeafIndexesToIds(tree, Leaves, uLeafCount, Ids);\r
86 \r
87                 MSA &msaSubfam = SubfamMSAs[uSubfamIndex];\r
88                 MSASubsetByIds(msa, Ids, uLeafCount, msaSubfam);\r
89                 DeleteGappedCols(msaSubfam);\r
90 \r
91 #if     TRACE\r
92                 Log("Subfam %u MSA=\n", uSubfamIndex);\r
93                 msaSubfam.LogMe();\r
94 #endif\r
95 \r
96                 if (msaSubfam.GetSeqCount() <= 2)\r
97                         continue;\r
98 \r
99         // TODO /////////////////////////////////////////\r
100         // Try using existing tree, may actually hurt to\r
101         // re-estimate, may also be a waste of CPU & mem.\r
102         /////////////////////////////////////////////////\r
103                 Tree SubfamTree;\r
104                 TreeFromMSA(msaSubfam, SubfamTree, g_Cluster2, g_Distance2, g_Root2);\r
105 \r
106                 bool bAnyChangesThisSubfam;\r
107                 if (g_bAnchors)\r
108                         bAnyChangesThisSubfam = RefineVert(msaSubfam, SubfamTree, uIters);\r
109                 else\r
110                         bAnyChangesThisSubfam = RefineHoriz(msaSubfam, SubfamTree, uIters, false, false);\r
111 #if     TRACE\r
112                 Log("Subfam %u Changed %d\n", uSubfamIndex, bAnyChangesThisSubfam);\r
113 #endif\r
114                 if (bAnyChangesThisSubfam)\r
115                         bAnyChanges = true;\r
116                 }\r
117 \r
118         if (bAnyChanges)\r
119                 ProgressiveAlignSubfams(tree, Subfams, uSubfamCount, SubfamMSAs, msa);\r
120 \r
121         delete[] Leaves;\r
122         delete[] Subfams;\r
123         delete[] SubfamMSAs;\r
124 \r
125         return bAnyChanges;\r
126         }\r
127 \r
128 static void ProgressiveAlignSubfams(const Tree &tree, const unsigned Subfams[],\r
129   unsigned uSubfamCount, const MSA SubfamMSAs[], MSA &msa)\r
130         {\r
131         const unsigned uNodeCount = tree.GetNodeCount();\r
132 \r
133         bool *Ready = new bool[uNodeCount];\r
134         MSA **MSAs = new MSA *[uNodeCount];\r
135         for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)\r
136                 {\r
137                 Ready[uNodeIndex] = false;\r
138                 MSAs[uNodeIndex] = 0;\r
139                 }\r
140 \r
141         for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex)\r
142                 {\r
143                 unsigned uNodeIndex = Subfams[uSubfamIndex];\r
144                 Ready[uNodeIndex] = true;\r
145                 MSA *ptrMSA = new MSA;\r
146         // TODO: Wasteful copy, needs re-design\r
147                 ptrMSA->Copy(SubfamMSAs[uSubfamIndex]);\r
148                 MSAs[uNodeIndex] = ptrMSA;\r
149                 }\r
150 \r
151         for (unsigned uNodeIndex = tree.FirstDepthFirstNode();\r
152           NULL_NEIGHBOR != uNodeIndex;\r
153           uNodeIndex = tree.NextDepthFirstNode(uNodeIndex))\r
154                 {\r
155                 if (tree.IsLeaf(uNodeIndex))\r
156                         continue;\r
157 \r
158                 unsigned uRight = tree.GetRight(uNodeIndex);\r
159                 unsigned uLeft = tree.GetLeft(uNodeIndex);\r
160                 if (!Ready[uRight] || !Ready[uLeft])\r
161                         continue;\r
162 \r
163                 MSA *ptrLeft = MSAs[uLeft];\r
164                 MSA *ptrRight = MSAs[uRight];\r
165                 assert(ptrLeft != 0 && ptrRight != 0);\r
166 \r
167                 MSA *ptrParent = new MSA;\r
168 \r
169                 PWPath Path;\r
170                 AlignTwoMSAs(*ptrLeft, *ptrRight, *ptrParent, Path);\r
171 \r
172                 MSAs[uNodeIndex] = ptrParent;\r
173                 Ready[uNodeIndex] = true;\r
174                 Ready[uLeft] = false;\r
175                 Ready[uRight] = false;\r
176 \r
177                 delete MSAs[uLeft];\r
178                 delete MSAs[uRight];\r
179                 MSAs[uLeft] = 0;\r
180                 MSAs[uRight] = 0;\r
181                 }\r
182 \r
183 #if     DEBUG\r
184         {\r
185         unsigned uReadyCount = 0;\r
186         for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)\r
187                 {\r
188                 if (Ready[uNodeIndex])\r
189                         {\r
190                         assert(tree.IsRoot(uNodeIndex));\r
191                         ++uReadyCount;\r
192                         assert(0 != MSAs[uNodeIndex]);\r
193                         }\r
194                 else\r
195                         assert(0 == MSAs[uNodeIndex]);\r
196                 }\r
197         assert(1 == uReadyCount);\r
198         }\r
199 #endif\r
200 \r
201         const unsigned uRoot = tree.GetRootNodeIndex();\r
202         MSA *ptrRootAlignment = MSAs[uRoot];\r
203 \r
204         msa.Copy(*ptrRootAlignment);\r
205 \r
206         delete ptrRootAlignment;\r
207 \r
208 #if     TRACE\r
209         Log("After refine subfamilies, root alignment=\n");\r
210         msa.LogMe();\r
211 #endif\r
212         }\r