Mac binaries
[jabaws.git] / website / archive / binaries / mac / src / muscle / phyfromfile.cpp
1 #include "muscle.h"\r
2 #include "tree.h"\r
3 #include "textfile.h"\r
4 \r
// Set to a non-zero value to compile in the diagnostic code guarded by
// "#if TRACE" in this file (Log() calls in the parser and the redirection
// of GetToken to GetTokenVerbose).
#define TRACE 0
6 \r
7 // Tokens in Newick files are:\r
8 //              ( ) : , ;\r
9 //              string\r
10 //              'string'\r
11 //              "string"\r
12 //              [ comment ]\r
13 //\r
14 // We can't safely distinguish between identifiers and floating point\r
15 // numbers at the lexical level (because identifiers may be numeric,\r
16 // or start with digits), so both edge lengths and identifiers are\r
17 // returned as strings.\r
18 \r
19 const char *Tree::NTTStr(NEWICK_TOKEN_TYPE NTT) const\r
20         {\r
21         switch (NTT)\r
22                 {\r
23 #define c(x)    case NTT_##x: return #x;\r
24         c(Unknown)\r
25         c(Lparen)\r
26         c(Rparen)\r
27         c(Colon)\r
28         c(Comma)\r
29         c(Semicolon)\r
30         c(String)\r
31         c(SingleQuotedString)\r
32         c(DoubleQuotedString)\r
33         c(Comment)\r
34 #undef c\r
35                 }\r
36         return "??";\r
37         }\r
38 \r
39 NEWICK_TOKEN_TYPE Tree::GetToken(TextFile &File, char szToken[], unsigned uBytes) const\r
40         {\r
41 // Skip leading white space\r
42         File.SkipWhite();\r
43 \r
44         char c;\r
45         File.GetCharX(c);\r
46 \r
47 // In case a single-character token\r
48         szToken[0] = c;\r
49         szToken[1] = 0;\r
50 \r
51         unsigned uBytesCopied = 0;\r
52         NEWICK_TOKEN_TYPE TT;\r
53         switch (c)\r
54                 {\r
55         case '(':\r
56                 return NTT_Lparen;\r
57 \r
58         case ')':\r
59                 return NTT_Rparen;\r
60 \r
61         case ':':\r
62                 return NTT_Colon;\r
63 \r
64         case ';':\r
65                 return NTT_Semicolon;\r
66 \r
67         case ',':\r
68                 return NTT_Comma;\r
69 \r
70         case '\'':\r
71                 TT = NTT_SingleQuotedString;\r
72                 File.GetCharX(c);\r
73                 break;\r
74 \r
75         case '"':\r
76                 TT = NTT_DoubleQuotedString;\r
77                 File.GetCharX(c);\r
78                 break;\r
79 \r
80         case '[':\r
81                 TT = NTT_Comment;\r
82                 break;\r
83 \r
84         default:\r
85                 TT = NTT_String;\r
86                 break;\r
87                 }\r
88 \r
89         for (;;)\r
90                 {\r
91                 if (TT != NTT_Comment)\r
92                         {\r
93                         if (uBytesCopied < uBytes - 2)\r
94                                 {\r
95                                 szToken[uBytesCopied++] = c;\r
96                                 szToken[uBytesCopied] = 0;\r
97                                 }\r
98                         else\r
99                                 Quit("Tree::GetToken: input buffer too small, token so far='%s'", szToken);\r
100                         }\r
101                 bool bEof = File.GetChar(c);\r
102                 if (bEof)\r
103                         return TT;\r
104 \r
105                 switch (TT)\r
106                         {\r
107                 case NTT_String:\r
108                         if (0 != strchr("():;,", c))\r
109                                 {\r
110                                 File.PushBack(c);\r
111                                 return NTT_String;\r
112                                 }\r
113                         if (isspace(c))\r
114                                 return NTT_String;\r
115                         break;\r
116 \r
117                 case NTT_SingleQuotedString:\r
118                         if ('\'' == c)\r
119                                 return NTT_String;\r
120                         break;\r
121 \r
122                 case NTT_DoubleQuotedString:\r
123                         if ('"' == c)\r
124                                 return NTT_String;\r
125                         break;\r
126 \r
127                 case NTT_Comment:\r
128                         if (']' == c)\r
129                                 return GetToken(File, szToken, uBytes);\r
130                         break;\r
131 \r
132                 default:\r
133                         Quit("Tree::GetToken, invalid TT=%u", TT);\r
134                         }\r
135                 }\r
136         }\r
137 \r
// NOTE: this hack must come after definition of Tree::GetToken.
// When TRACE is non-zero, the preprocessor rewrites every later use of the
// name GetToken in this file to GetTokenVerbose, so the parsing functions
// below call that function instead. Placing the #define earlier would also
// rename the definition of Tree::GetToken itself.
// GetTokenVerbose is not defined in the code shown here; presumably it is a
// logging wrapper around GetToken declared elsewhere -- TODO confirm.
#if     TRACE
#define GetToken        GetTokenVerbose
#endif
142 \r
143 void Tree::FromFile(TextFile &File)\r
144         {\r
145 // Assume rooted.\r
146 // If we discover that it is unrooted, will convert on the fly.\r
147         CreateRooted();\r
148 \r
149         double dEdgeLength;\r
150         bool bEdgeLength = GetGroupFromFile(File, 0, &dEdgeLength);\r
151 \r
152 // Next token should be either ';' for rooted tree or ',' for unrooted.\r
153         char szToken[16];\r
154         NEWICK_TOKEN_TYPE NTT = GetToken(File, szToken, sizeof(szToken));\r
155 \r
156 // If rooted, all done.\r
157         if (NTT_Semicolon == NTT)\r
158                 {\r
159                 if (bEdgeLength)\r
160                         Log(" *** Warning *** edge length on root group in Newick file %s\n",\r
161                           File.GetFileName());\r
162                 Validate();\r
163                 return;\r
164                 }\r
165 \r
166         if (NTT_Comma != NTT)\r
167                 Quit("Tree::FromFile, expected ';' or ',', got '%s'", szToken);\r
168 \r
169         const unsigned uThirdNode = UnrootFromFile();\r
170         bEdgeLength = GetGroupFromFile(File, uThirdNode, &dEdgeLength);\r
171         if (bEdgeLength)\r
172                 SetEdgeLength(0, uThirdNode, dEdgeLength);\r
173         Validate();\r
174         }\r
175 \r
176 // Return true if edge length for this group.\r
177 bool Tree::GetGroupFromFile(TextFile &File, unsigned uNodeIndex,\r
178   double *ptrdEdgeLength)\r
179         {\r
180         char szToken[1024];\r
181         NEWICK_TOKEN_TYPE NTT = GetToken(File, szToken, sizeof(szToken));\r
182 \r
183 // Group is either leaf name or (left, right).\r
184         if (NTT_String == NTT)\r
185                 {\r
186                 SetLeafName(uNodeIndex, szToken);\r
187 #if     TRACE\r
188                 Log("Group is leaf '%s'\n", szToken);\r
189 #endif\r
190                 }\r
191         else if (NTT_Lparen == NTT)\r
192                 {\r
193                 const unsigned uLeft = AppendBranch(uNodeIndex);\r
194                 const unsigned uRight = uLeft + 1;\r
195 \r
196         // Left sub-group...\r
197 #if     TRACE\r
198                 Log("Got '(', group is compound, expect left sub-group\n");\r
199 #endif\r
200                 double dEdgeLength;\r
201                 bool bLeftLength = GetGroupFromFile(File, uLeft, &dEdgeLength);\r
202 #if     TRACE\r
203                 if (bLeftLength)\r
204                         Log("Edge length for left sub-group: %.3g\n", dEdgeLength);\r
205                 else\r
206                         Log("No edge length for left sub-group\n");\r
207 #endif\r
208                 if (bLeftLength)\r
209                         SetEdgeLength(uNodeIndex, uLeft, dEdgeLength);\r
210 \r
211         // ... then comma ...\r
212 #if     TRACE\r
213                 Log("Expect comma\n");\r
214 #endif\r
215                 NTT = GetToken(File, szToken, sizeof(szToken));\r
216                 if (NTT_Comma != NTT)\r
217                         Quit("Tree::GetGroupFromFile, expected ',', got '%s'", szToken);\r
218 \r
219         // ...then right sub-group...\r
220 #if     TRACE\r
221                 Log("Expect right sub-group\n");\r
222 #endif\r
223                 bool bRightLength = GetGroupFromFile(File, uRight, &dEdgeLength);\r
224                 if (bRightLength)\r
225                         SetEdgeLength(uNodeIndex, uRight, dEdgeLength);\r
226 \r
227 #if     TRACE\r
228                 if (bRightLength)\r
229                         Log("Edge length for right sub-group: %.3g\n", dEdgeLength);\r
230                 else\r
231                         Log("No edge length for right sub-group\n");\r
232 #endif\r
233 \r
234         // ... then closing parenthesis.\r
235 #if     TRACE\r
236                 Log("Expect closing parenthesis (or comma if > 2-ary)\n");\r
237 #endif\r
238                 NTT = GetToken(File, szToken, sizeof(szToken));\r
239                 if (NTT_Rparen == NTT)\r
240                         ;\r
241                 else if (NTT_Comma == NTT)\r
242                         {\r
243                         File.PushBack(',');\r
244                         return false;\r
245                         }\r
246                 else\r
247                         Quit("Tree::GetGroupFromFile, expected ')' or ',', got '%s'", szToken);\r
248                 }\r
249         else\r
250                 Quit("Tree::GetGroupFromFile, expected '(' or leaf name, got '%s'",\r
251                   szToken);\r
252 \r
253 // Group may optionally be followed by edge length.\r
254         bool bEof = File.SkipWhiteX();\r
255         if (bEof)\r
256                 return false;\r
257         char c;\r
258         File.GetCharX(c);\r
259 #if     TRACE\r
260         Log("Character following group, could be colon, is '%c'\n", c);\r
261 #endif\r
262         if (':' == c)\r
263                 {\r
264                 NTT = GetToken(File, szToken, sizeof(szToken));\r
265                 if (NTT_String != NTT)\r
266                         Quit("Tree::GetGroupFromFile, expected edge length, got '%s'", szToken);\r
267                 *ptrdEdgeLength = atof(szToken);\r
268                 return true;\r
269                 }\r
270         File.PushBack(c);\r
271         return false;\r
272         }\r