Next version of JABA
[jabaws.git] / binaries / src / muscle / seq.cpp
1 #include "muscle.h"\r
2 #include "seq.h"\r
3 #include "textfile.h"\r
4 #include "msa.h"\r
5 //#include <ctype.h>\r
6 \r
7 const size_t MAX_FASTA_LINE = 16000;\r
8 \r
9 void Seq::SetName(const char *ptrName)\r
10         {\r
11         delete[] m_ptrName;\r
12         size_t n = strlen(ptrName) + 1;\r
13         m_ptrName = new char[n];\r
14         strcpy(m_ptrName, ptrName);\r
15         }\r
16 \r
17 void Seq::ToFASTAFile(TextFile &File) const\r
18         {\r
19         File.PutFormat(">%s\n", m_ptrName);\r
20         unsigned uColCount = Length();\r
21         for (unsigned n = 0; n < uColCount; ++n)\r
22                 {\r
23                 if (n > 0 && n%60 == 0)\r
24                         File.PutString("\n");\r
25                 File.PutChar(at(n));\r
26                 }\r
27         File.PutString("\n");\r
28         }\r
29 \r
30 // Return true on end-of-file\r
31 bool Seq::FromFASTAFile(TextFile &File)\r
32         {\r
33         Clear();\r
34 \r
35         char szLine[MAX_FASTA_LINE];\r
36         bool bEof = File.GetLine(szLine, sizeof(szLine));\r
37         if (bEof)\r
38                 return true;\r
39         if ('>' != szLine[0])\r
40                 Quit("Expecting '>' in FASTA file %s line %u",\r
41                   File.GetFileName(), File.GetLineNr());\r
42 \r
43         size_t n = strlen(szLine);\r
44         if (1 == n)\r
45                 Quit("Missing annotation following '>' in FASTA file %s line %u",\r
46                   File.GetFileName(), File.GetLineNr());\r
47 \r
48         m_ptrName = new char[n];\r
49         strcpy(m_ptrName, szLine + 1);\r
50 \r
51         TEXTFILEPOS Pos = File.GetPos();\r
52         for (;;)\r
53                 {\r
54                 bEof = File.GetLine(szLine, sizeof(szLine));\r
55                 if (bEof)\r
56                         {\r
57                         if (0 == size())\r
58                                 {\r
59                                 Quit("Empty sequence in FASTA file %s line %u",\r
60                                   File.GetFileName(), File.GetLineNr());\r
61                                 return true;\r
62                                 }\r
63                         return false;\r
64                         }\r
65                 if ('>' == szLine[0])\r
66                         {\r
67                         if (0 == size())\r
68                                 Quit("Empty sequence in FASTA file %s line %u",\r
69                                   File.GetFileName(), File.GetLineNr());\r
70                 // Rewind to beginning of this line, it's the start of the\r
71                 // next sequence.\r
72                         File.SetPos(Pos);\r
73                         return false;\r
74                         }\r
75                 const char *ptrChar = szLine;\r
76                 while (char c = *ptrChar++)\r
77                         {\r
78                         if (isspace(c))\r
79                                 continue;\r
80                         if (IsGapChar(c))\r
81                                 continue;\r
82                         if (!IsResidueChar(c))\r
83                                 {\r
84                                 if (isprint(c))\r
85                                         {\r
86                                         char w = GetWildcardChar();\r
87                                         Warning("Invalid residue '%c' in FASTA file %s line %d, replaced by '%c'",\r
88                                           c, File.GetFileName(), File.GetLineNr(), w);\r
89                                         c = w;\r
90                                         }\r
91                                 else\r
92                                         Quit("Invalid byte hex %02x in FASTA file %s line %d",\r
93                                           (unsigned char) c, File.GetFileName(), File.GetLineNr());\r
94                                 }\r
95                         c = toupper(c);\r
96                         push_back(c);\r
97                         }\r
98                 Pos = File.GetPos();\r
99                 }\r
100         }\r
101 \r
102 void Seq::ExtractUngapped(MSA &msa) const\r
103         {\r
104         msa.Clear();\r
105         unsigned uColCount = Length();\r
106         msa.SetSize(1, 1);\r
107         unsigned uUngappedPos = 0;\r
108         for (unsigned n = 0; n < uColCount; ++n)\r
109                 {\r
110                 char c = at(n);\r
111                 if (!IsGapChar(c))\r
112                         msa.SetChar(0, uUngappedPos++, c);\r
113                 }\r
114         msa.SetSeqName(0, m_ptrName);\r
115         }\r
116 \r
117 void Seq::Copy(const Seq &rhs)\r
118         {\r
119         clear();\r
120         const unsigned uLength = rhs.Length();\r
121         for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex)\r
122                 push_back(rhs.at(uColIndex));\r
123         const char *ptrName = rhs.GetName();\r
124         size_t n = strlen(ptrName) + 1;\r
125         m_ptrName = new char[n];\r
126         strcpy(m_ptrName, ptrName);\r
127         SetId(rhs.GetId());\r
128         }\r
129 \r
130 void Seq::CopyReversed(const Seq &rhs)\r
131         {\r
132         clear();\r
133         const unsigned uLength = rhs.Length();\r
134         const unsigned uBase = rhs.Length() - 1;\r
135         for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex)\r
136                 push_back(rhs.at(uBase - uColIndex));\r
137         const char *ptrName = rhs.GetName();\r
138         size_t n = strlen(ptrName) + 1;\r
139         m_ptrName = new char[n];\r
140         strcpy(m_ptrName, ptrName);\r
141         }\r
142 \r
143 void Seq::StripGaps()\r
144         {\r
145         for (CharVect::iterator p = begin(); p != end(); )\r
146                 {\r
147                 char c = *p;\r
148                 if (IsGapChar(c))\r
149                         erase(p);\r
150                 else\r
151                         ++p;\r
152                 }\r
153         }\r
154 \r
155 void Seq::StripGapsAndWhitespace()\r
156         {\r
157         for (CharVect::iterator p = begin(); p != end(); )\r
158                 {\r
159                 char c = *p;\r
160                 if (isspace(c) || IsGapChar(c))\r
161                         erase(p);\r
162                 else\r
163                         ++p;\r
164                 }\r
165         }\r
166 \r
167 void Seq::ToUpper()\r
168         {\r
169         for (CharVect::iterator p = begin(); p != end(); ++p)\r
170                 {\r
171                 char c = *p;\r
172                 if (islower(c))\r
173                         *p = toupper(c);\r
174                 }\r
175         }\r
176 \r
177 unsigned Seq::GetLetter(unsigned uIndex) const\r
178         {\r
179         assert(uIndex < Length());\r
180         char c = operator[](uIndex);\r
181         return CharToLetter(c);\r
182         }\r
183 \r
184 bool Seq::EqIgnoreCase(const Seq &s) const\r
185         {\r
186         const unsigned n = Length();\r
187         if (n != s.Length())\r
188                 return false;\r
189         for (unsigned i = 0; i < n; ++i)\r
190                 {\r
191                 const char c1 = at(i);\r
192                 const char c2 = s.at(i);\r
193                 if (IsGapChar(c1))\r
194                         {\r
195                         if (!IsGapChar(c2))\r
196                                 return false;\r
197                         }\r
198                 else\r
199                         {\r
200                         if (toupper(c1) != toupper(c2))\r
201                                 return false;\r
202                         }\r
203                 }\r
204         return true;\r
205         }\r
206 \r
207 bool Seq::Eq(const Seq &s) const\r
208         {\r
209         const unsigned n = Length();\r
210         if (n != s.Length())\r
211                 return false;\r
212         for (unsigned i = 0; i < n; ++i)\r
213                 {\r
214                 const char c1 = at(i);\r
215                 const char c2 = s.at(i);\r
216                 if (c1 != c2)\r
217                         return false;\r
218                 }\r
219         return true;\r
220         }\r
221 \r
222 bool Seq::EqIgnoreCaseAndGaps(const Seq &s) const\r
223         {\r
224         const unsigned uThisLength = Length();\r
225         const unsigned uOtherLength = s.Length();\r
226         \r
227         unsigned uThisPos = 0;\r
228         unsigned uOtherPos = 0;\r
229 \r
230         int cThis;\r
231         int cOther;\r
232         for (;;)\r
233                 {\r
234                 if (uThisPos == uThisLength && uOtherPos == uOtherLength)\r
235                         break;\r
236 \r
237         // Set cThis to next non-gap character in this string\r
238         // or -1 if end-of-string.\r
239                 for (;;)\r
240                         {\r
241                         if (uThisPos == uThisLength)\r
242                                 {\r
243                                 cThis = -1;\r
244                                 break;\r
245                                 }\r
246                         else\r
247                                 {\r
248                                 cThis = at(uThisPos);\r
249                                 ++uThisPos;\r
250                                 if (!IsGapChar(cThis))\r
251                                         {\r
252                                         cThis = toupper(cThis);\r
253                                         break;\r
254                                         }\r
255                                 }\r
256                         }\r
257 \r
258         // Set cOther to next non-gap character in s\r
259         // or -1 if end-of-string.\r
260                 for (;;)\r
261                         {\r
262                         if (uOtherPos == uOtherLength)\r
263                                 {\r
264                                 cOther = -1;\r
265                                 break;\r
266                                 }\r
267                         else\r
268                                 {\r
269                                 cOther = s.at(uOtherPos);\r
270                                 ++uOtherPos;\r
271                                 if (!IsGapChar(cOther))\r
272                                         {\r
273                                         cOther = toupper(cOther);\r
274                                         break;\r
275                                         }\r
276                                 }\r
277                         }\r
278 \r
279         // Compare characters are corresponding ungapped position\r
280                 if (cThis != cOther)\r
281                         return false;\r
282                 }\r
283         return true;\r
284         }\r
285 \r
286 unsigned Seq::GetUngappedLength() const\r
287         {\r
288         unsigned uUngappedLength = 0;\r
289         for (CharVect::const_iterator p = begin(); p != end(); ++p)\r
290                 {\r
291                 char c = *p;\r
292                 if (!IsGapChar(c))\r
293                         ++uUngappedLength;\r
294                 }\r
295         return uUngappedLength;\r
296         }\r
297 \r
298 void Seq::LogMe() const\r
299         {\r
300         Log(">%s\n", m_ptrName);\r
301         const unsigned n = Length();\r
302         for (unsigned i = 0; i < n; ++i)\r
303                 Log("%c", at(i));\r
304         Log("\n");\r
305         }\r
306 \r
307 void Seq::FromString(const char *pstrSeq, const char *pstrName)\r
308         {\r
309         clear();\r
310         const unsigned uLength = (unsigned) strlen(pstrSeq);\r
311         for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex)\r
312                 push_back(pstrSeq[uColIndex]);\r
313         size_t n = strlen(pstrName) + 1;\r
314         m_ptrName = new char[n];\r
315         strcpy(m_ptrName, pstrName);\r
316         }\r
317 \r
318 bool Seq::HasGap() const\r
319         {\r
320         for (CharVect::const_iterator p = begin(); p != end(); ++p)\r
321                 {\r
322                 char c = *p;\r
323                 if (IsGapChar(c))\r
324                         return true;\r
325                 }\r
326         return false;\r
327         }\r
328 \r
329 void Seq::FixAlpha()\r
330         {\r
331         for (CharVect::iterator p = begin(); p != end(); ++p)\r
332                 {\r
333                 char c = *p;\r
334                 if (!IsResidueChar(c))\r
335                         {\r
336                         char w = GetWildcardChar();\r
337                         // Warning("Invalid residue '%c', replaced by '%c'", c, w);\r
338                         InvalidLetterWarning(c, w);\r
339                         *p = w;\r
340                         }\r
341                 }\r
342         }\r