Next version of JABA
[jabaws.git] / binaries / src / muscle / domuscle.cpp
1 #include "muscle.h"\r
2 #include "textfile.h"\r
3 #include "seqvect.h"\r
4 #include "distfunc.h"\r
5 #include "msa.h"\r
6 #include "tree.h"\r
7 #include "profile.h"\r
8 #include "timing.h"\r
9 \r
10 static char g_strUseTreeWarning[] =\r
11 "\n******** WARNING ****************\n"\r
12 "\nYou specified the -usetree option.\n"\r
13 "Note that a good evolutionary tree may NOT be a good\n"\r
14 "guide tree for multiple alignment. For more details,\n"\r
15 "please refer to the user guide. To disable this\n"\r
16 "warning, use -usetree_nowarn <treefilename>.\n\n";\r
17 \r
18 void DoMuscle()\r
19         {\r
20         SetOutputFileName(g_pstrOutFileName);\r
21         SetInputFileName(g_pstrInFileName);\r
22 \r
23         SetMaxIters(g_uMaxIters);\r
24         SetSeqWeightMethod(g_SeqWeight1);\r
25 \r
26         TextFile fileIn(g_pstrInFileName);\r
27         SeqVect v;\r
28         v.FromFASTAFile(fileIn);\r
29         const unsigned uSeqCount = v.Length();\r
30 \r
31         if (0 == uSeqCount)\r
32                 Quit("No sequences in input file");\r
33 \r
34         ALPHA Alpha = ALPHA_Undefined;\r
35         switch (g_SeqType)\r
36                 {\r
37         case SEQTYPE_Auto:\r
38                 Alpha = v.GuessAlpha();\r
39                 break;\r
40 \r
41         case SEQTYPE_Protein:\r
42                 Alpha = ALPHA_Amino;\r
43                 break;\r
44 \r
45         case SEQTYPE_DNA:\r
46                 Alpha = ALPHA_DNA;\r
47                 break;\r
48 \r
49         case SEQTYPE_RNA:\r
50                 Alpha = ALPHA_RNA;\r
51                 break;\r
52 \r
53         default:\r
54                 Quit("Invalid seq type");\r
55                 }\r
56         SetAlpha(Alpha);\r
57         v.FixAlpha();\r
58 \r
59         PTR_SCOREMATRIX UserMatrix = 0;\r
60         if (0 != g_pstrMatrixFileName)\r
61                 {\r
62                 const char *FileName = g_pstrMatrixFileName;\r
63                 const char *Path = getenv("MUSCLE_MXPATH");\r
64                 if (Path != 0)\r
65                         {\r
66                         size_t n = strlen(Path) + 1 + strlen(FileName) + 1;\r
67                         char *NewFileName = new char[n];\r
68                         sprintf(NewFileName, "%s/%s", Path, FileName);\r
69                         FileName = NewFileName;\r
70                         }\r
71                 TextFile File(FileName);\r
72                 UserMatrix = ReadMx(File);\r
73                 g_Alpha = ALPHA_Amino;\r
74                 g_PPScore = PPSCORE_SP;\r
75                 }\r
76 \r
77         SetPPScore();\r
78 \r
79         if (0 != UserMatrix)\r
80                 g_ptrScoreMatrix = UserMatrix;\r
81 \r
82         unsigned uMaxL = 0;\r
83         unsigned uTotL = 0;\r
84         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
85                 {\r
86                 unsigned L = v.GetSeq(uSeqIndex).Length();\r
87                 uTotL += L;\r
88                 if (L > uMaxL)\r
89                         uMaxL = L;\r
90                 }\r
91 \r
92         SetIter(1);\r
93         g_bDiags = g_bDiags1;\r
94         SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount);\r
95 \r
96         SetMuscleSeqVect(v);\r
97 \r
98         MSA::SetIdCount(uSeqCount);\r
99 \r
100 // Initialize sequence ids.\r
101 // From this point on, ids must somehow propogate from here.\r
102         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
103                 v.SetSeqId(uSeqIndex, uSeqIndex);\r
104 \r
105         if (0 == uSeqCount)\r
106                 Quit("Input file '%s' has no sequences", g_pstrInFileName);\r
107         if (1 == uSeqCount)\r
108                 {\r
109                 TextFile fileOut(g_pstrOutFileName, true);\r
110                 v.ToFile(fileOut);\r
111                 return;\r
112                 }\r
113 \r
114         if (uSeqCount > 1)\r
115                 MHackStart(v);\r
116 \r
117 // First iteration\r
118         Tree GuideTree;\r
119         if (0 != g_pstrUseTreeFileName)\r
120                 {\r
121         // Discourage users...\r
122                 if (!g_bUseTreeNoWarn)\r
123                         fprintf(stderr, g_strUseTreeWarning);\r
124 \r
125         // Read tree from file\r
126                 TextFile TreeFile(g_pstrUseTreeFileName);\r
127                 GuideTree.FromFile(TreeFile);\r
128 \r
129         // Make sure tree is rooted\r
130                 if (!GuideTree.IsRooted())\r
131                         Quit("User tree must be rooted");\r
132 \r
133                 if (GuideTree.GetLeafCount() != uSeqCount)\r
134                         Quit("User tree does not match input sequences");\r
135 \r
136                 const unsigned uNodeCount = GuideTree.GetNodeCount();\r
137                 for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)\r
138                         {\r
139                         if (!GuideTree.IsLeaf(uNodeIndex))\r
140                                 continue;\r
141                         const char *LeafName = GuideTree.GetLeafName(uNodeIndex);\r
142                         unsigned uSeqIndex;\r
143                         bool SeqFound = v.FindName(LeafName, &uSeqIndex);\r
144                         if (!SeqFound)\r
145                                 Quit("Label %s in tree does not match sequences", LeafName);\r
146                         unsigned uId = v.GetSeqIdFromName(LeafName);\r
147                         GuideTree.SetLeafId(uNodeIndex, uId);\r
148                         }\r
149                 }\r
150         else\r
151                 TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1,\r
152                   g_pstrDistMxFileName1);\r
153 \r
154         const char *Tree1 = ValueOpt("Tree1");\r
155         if (0 != Tree1)\r
156                 {\r
157                 TextFile f(Tree1, true);\r
158                 GuideTree.ToFile(f);\r
159                 if (g_bClusterOnly)\r
160                         return;\r
161                 }\r
162 \r
163         SetMuscleTree(GuideTree);\r
164         ValidateMuscleIds(GuideTree);\r
165 \r
166         MSA msa;\r
167         ProgNode *ProgNodes = 0;\r
168         if (g_bLow)\r
169                 ProgNodes = ProgressiveAlignE(v, GuideTree, msa);\r
170         else\r
171                 ProgressiveAlign(v, GuideTree, msa);\r
172         SetCurrentAlignment(msa);\r
173 \r
174         if (0 != g_pstrComputeWeightsFileName)\r
175                 {\r
176                 extern void OutWeights(const char *FileName, const MSA &msa);\r
177                 SetMSAWeightsMuscle(msa);\r
178                 OutWeights(g_pstrComputeWeightsFileName, msa);\r
179                 return;\r
180                 }\r
181 \r
182         ValidateMuscleIds(msa);\r
183 \r
184         if (1 == g_uMaxIters || 2 == uSeqCount)\r
185                 {\r
186                 //TextFile fileOut(g_pstrOutFileName, true);\r
187                 //MHackEnd(msa);\r
188                 //msa.ToFile(fileOut);\r
189                 MuscleOutput(msa);\r
190                 return;\r
191                 }\r
192 \r
193         if (0 == g_pstrUseTreeFileName)\r
194                 {\r
195                 g_bDiags = g_bDiags2;\r
196                 SetIter(2);\r
197 \r
198                 if (g_bLow)\r
199                         {\r
200                         if (0 != g_uMaxTreeRefineIters)\r
201                                 RefineTreeE(msa, v, GuideTree, ProgNodes);\r
202                         }\r
203                 else\r
204                         RefineTree(msa, GuideTree);\r
205 \r
206                 const char *Tree2 = ValueOpt("Tree2");\r
207                 if (0 != Tree2)\r
208                         {\r
209                         TextFile f(Tree2, true);\r
210                         GuideTree.ToFile(f);\r
211                         }\r
212                 }\r
213 \r
214         SetSeqWeightMethod(g_SeqWeight2);\r
215         SetMuscleTree(GuideTree);\r
216 \r
217         if (g_bAnchors)\r
218                 RefineVert(msa, GuideTree, g_uMaxIters - 2);\r
219         else\r
220                 RefineHoriz(msa, GuideTree, g_uMaxIters - 2, false, false);\r
221 \r
222 #if     0\r
223 // Refining by subfamilies is disabled as it didn't give better\r
224 // results. I tried doing this before and after RefineHoriz.\r
225 // Should get back to this as it seems like this should work.\r
226         RefineSubfams(msa, GuideTree, g_uMaxIters - 2);\r
227 #endif\r
228 \r
229         ValidateMuscleIds(msa);\r
230         ValidateMuscleIds(GuideTree);\r
231 \r
232         //TextFile fileOut(g_pstrOutFileName, true);\r
233         //MHackEnd(msa);\r
234         //msa.ToFile(fileOut);\r
235         MuscleOutput(msa);\r
236         }\r
237 \r
238 void Run()\r
239         {\r
240         SetStartTime();\r
241         Log("Started %s\n", GetTimeAsStr());\r
242         for (int i = 0; i < g_argc; ++i)\r
243                 Log("%s ", g_argv[i]);\r
244         Log("\n");\r
245 \r
246 #if     TIMING\r
247         TICKS t1 = GetClockTicks();\r
248 #endif\r
249         if (g_bRefine)\r
250                 Refine();\r
251         else if (g_bRefineW)\r
252                 {\r
253                 extern void DoRefineW();\r
254                 DoRefineW();\r
255                 }\r
256         else if (g_bProfDB)\r
257                 ProfDB();\r
258         else if (g_bSW)\r
259                 Local();\r
260         else if (0 != g_pstrSPFileName)\r
261                 DoSP();\r
262         else if (g_bProfile)\r
263                 Profile();\r
264         else if (g_bPPScore)\r
265                 PPScore();\r
266         else if (g_bPAS)\r
267                 ProgAlignSubFams();\r
268         else if (g_bMakeTree)\r
269                 {\r
270                 extern void DoMakeTree();\r
271                 DoMakeTree();\r
272                 }\r
273         else\r
274                 DoMuscle();\r
275 \r
276 #if     TIMING\r
277         extern TICKS g_ticksDP;\r
278         extern TICKS g_ticksObjScore;\r
279         TICKS t2 = GetClockTicks();\r
280         TICKS TotalTicks = t2 - t1;\r
281         TICKS ticksOther = TotalTicks - g_ticksDP - g_ticksObjScore;\r
282         double dSecs = TicksToSecs(TotalTicks);\r
283         double PctDP = (double) g_ticksDP*100.0/(double) TotalTicks;\r
284         double PctOS = (double) g_ticksObjScore*100.0/(double) TotalTicks;\r
285         double PctOther = (double) ticksOther*100.0/(double) TotalTicks;\r
286         Log("                 Ticks     Secs    Pct\n");\r
287         Log("          ============  =======  =====\n");\r
288         Log("DP        %12ld  %7.2f  %5.1f%%\n",\r
289           (long) g_ticksDP, TicksToSecs(g_ticksDP), PctDP);\r
290         Log("OS        %12ld  %7.2f  %5.1f%%\n",\r
291           (long) g_ticksObjScore, TicksToSecs(g_ticksObjScore), PctOS);\r
292         Log("Other     %12ld  %7.2f  %5.1f%%\n",\r
293           (long) ticksOther, TicksToSecs(ticksOther), PctOther);\r
294         Log("Total     %12ld  %7.2f  100.0%%\n", (long) TotalTicks, dSecs);\r
295 #endif\r
296 \r
297         ListDiagSavings();\r
298         Log("Finished %s\n", GetTimeAsStr());\r
299         }\r