Mac binaries
[jabaws.git] / website / archive / binaries / mac / src / muscle / seqvect.cpp
1 #include "muscle.h"\r
2 #include "seqvect.h"\r
3 #include "textfile.h"\r
4 #include "msa.h"\r
5 \r
6 const size_t MAX_FASTA_LINE = 16000;\r
7 \r
8 SeqVect::~SeqVect()\r
9         {\r
10         Clear();\r
11         }\r
12 \r
13 void SeqVect::Clear()\r
14         {\r
15         for (size_t n = 0; n < size(); ++n)\r
16                 delete (*this)[n];\r
17         }\r
18 \r
19 void SeqVect::ToFASTAFile(TextFile &File) const\r
20         {\r
21         unsigned uSeqCount = Length();\r
22         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
23                 {\r
24                 Seq *ptrSeq = at(uSeqIndex);\r
25                 ptrSeq->ToFASTAFile(File);\r
26                 }\r
27         }\r
28 \r
29 void SeqVect::FromFASTAFile(TextFile &File)\r
30         {\r
31         Clear();\r
32 \r
33         FILE *f = File.GetStdioFile();\r
34         for (;;)\r
35                 {\r
36                 char *Label;\r
37                 unsigned uLength;\r
38                 char *SeqData = GetFastaSeq(f, &uLength, &Label);\r
39                 if (0 == SeqData)\r
40                         return;\r
41                 Seq *ptrSeq = new Seq;\r
42 \r
43                 for (unsigned i = 0; i < uLength; ++i)\r
44                         {\r
45                         char c = SeqData[i];\r
46                         ptrSeq->push_back(c);\r
47                         }\r
48 \r
49                 ptrSeq->SetName(Label);\r
50                 push_back(ptrSeq);\r
51 \r
52                 delete[] SeqData;\r
53                 delete[] Label;\r
54                 }\r
55         }\r
56 \r
57 void SeqVect::PadToMSA(MSA &msa)\r
58         {\r
59         unsigned uSeqCount = Length();\r
60         if (0 == uSeqCount)\r
61                 {\r
62                 msa.Clear();\r
63                 return;\r
64                 }\r
65 \r
66         unsigned uLongestSeqLength = 0;\r
67         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
68                 {\r
69                 Seq *ptrSeq = at(uSeqIndex);\r
70                 unsigned uColCount = ptrSeq->Length();\r
71                 if (uColCount > uLongestSeqLength)\r
72                         uLongestSeqLength = uColCount;\r
73                 }\r
74         msa.SetSize(uSeqCount, uLongestSeqLength);\r
75         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
76                 {\r
77                 Seq *ptrSeq = at(uSeqIndex);\r
78                 msa.SetSeqName(uSeqIndex, ptrSeq->GetName());\r
79                 unsigned uColCount = ptrSeq->Length();\r
80                 unsigned uColIndex;\r
81                 for (uColIndex = 0; uColIndex < uColCount; ++uColIndex)\r
82                         {\r
83                         char c = ptrSeq->at(uColIndex);\r
84                         msa.SetChar(uSeqIndex, uColIndex, c);\r
85                         }\r
86                 while (uColIndex < uLongestSeqLength)\r
87                         msa.SetChar(uSeqIndex, uColIndex++, '.');\r
88                 }\r
89         }\r
90 \r
91 void SeqVect::Copy(const SeqVect &rhs)\r
92         {\r
93         clear();\r
94         unsigned uSeqCount = rhs.Length();\r
95         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
96                 {\r
97                 Seq *ptrSeq = rhs.at(uSeqIndex);\r
98                 Seq *ptrSeqCopy = new Seq;\r
99                 ptrSeqCopy->Copy(*ptrSeq);\r
100                 push_back(ptrSeqCopy);\r
101                 }\r
102         }\r
103 \r
104 void SeqVect::StripGaps()\r
105         {\r
106         unsigned uSeqCount = Length();\r
107         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
108                 {\r
109                 Seq *ptrSeq = at(uSeqIndex);\r
110                 ptrSeq->StripGaps();\r
111                 }\r
112         }\r
113 \r
114 void SeqVect::StripGapsAndWhitespace()\r
115         {\r
116         unsigned uSeqCount = Length();\r
117         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
118                 {\r
119                 Seq *ptrSeq = at(uSeqIndex);\r
120                 ptrSeq->StripGapsAndWhitespace();\r
121                 }\r
122         }\r
123 \r
124 void SeqVect::ToUpper()\r
125         {\r
126         unsigned uSeqCount = Length();\r
127         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
128                 {\r
129                 Seq *ptrSeq = at(uSeqIndex);\r
130                 ptrSeq->ToUpper();\r
131                 }\r
132         }\r
133 \r
134 bool SeqVect::FindName(const char *ptrName, unsigned *ptruIndex) const\r
135         {\r
136         unsigned uSeqCount = Length();\r
137         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
138                 {\r
139                 const Seq *ptrSeq = at(uSeqIndex);\r
140                 if (0 == stricmp(ptrSeq->GetName(), ptrName))\r
141                         {\r
142                         *ptruIndex = uSeqIndex;\r
143                         return true;\r
144                         }\r
145                 }\r
146         return false;\r
147         }\r
148 \r
149 void SeqVect::AppendSeq(const Seq &s)\r
150         {\r
151         Seq *ptrSeqCopy = new Seq;\r
152         ptrSeqCopy->Copy(s);\r
153         push_back(ptrSeqCopy);\r
154         }\r
155 \r
156 void SeqVect::LogMe() const\r
157         {\r
158         unsigned uSeqCount = Length();\r
159         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
160                 {\r
161                 const Seq *ptrSeq = at(uSeqIndex);\r
162                 ptrSeq->LogMe();\r
163                 }\r
164         }\r
165 \r
166 const char *SeqVect::GetSeqName(unsigned uSeqIndex) const\r
167         {\r
168         assert(uSeqIndex < size());\r
169         const Seq *ptrSeq = at(uSeqIndex);\r
170         return ptrSeq->GetName();\r
171         }\r
172 \r
173 unsigned SeqVect::GetSeqId(unsigned uSeqIndex) const\r
174         {\r
175         assert(uSeqIndex < size());\r
176         const Seq *ptrSeq = at(uSeqIndex);\r
177         return ptrSeq->GetId();\r
178         }\r
179 \r
180 unsigned SeqVect::GetSeqIdFromName(const char *Name) const\r
181         {\r
182         const unsigned uSeqCount = GetSeqCount();\r
183         for (unsigned i = 0; i < uSeqCount; ++i)\r
184                 {\r
185                 if (!strcmp(Name, GetSeqName(i)))\r
186                         return GetSeqId(i);\r
187                 }\r
188         Quit("SeqVect::GetSeqIdFromName(%s): not found", Name);\r
189         return 0;\r
190         }\r
191 \r
192 Seq &SeqVect::GetSeqById(unsigned uId)\r
193         {\r
194         const unsigned uSeqCount = GetSeqCount();\r
195         for (unsigned i = 0; i < uSeqCount; ++i)\r
196                 {\r
197                 if (GetSeqId(i) == uId)\r
198                         return GetSeq(i);\r
199                 }\r
200         Quit("SeqVect::GetSeqIdByUd(%d): not found", uId);\r
201         return (Seq &) *((Seq *) 0);\r
202         }\r
203 \r
204 unsigned SeqVect::GetSeqLength(unsigned uSeqIndex) const\r
205         {\r
206         assert(uSeqIndex < size());\r
207         const Seq *ptrSeq = at(uSeqIndex);\r
208         return ptrSeq->Length();\r
209         }\r
210 \r
211 Seq &SeqVect::GetSeq(unsigned uSeqIndex)\r
212         {\r
213         assert(uSeqIndex < size());\r
214         return *at(uSeqIndex);\r
215         }\r
216 \r
217 const Seq &SeqVect::GetSeq(unsigned uSeqIndex) const\r
218         {\r
219         assert(uSeqIndex < size());\r
220         return *at(uSeqIndex);\r
221         }\r
222 \r
223 void SeqVect::SetSeqId(unsigned uSeqIndex, unsigned uId)\r
224         {\r
225         assert(uSeqIndex < size());\r
226         Seq *ptrSeq = at(uSeqIndex);\r
227         return ptrSeq->SetId(uId);\r
228         }\r
229 \r
230 ALPHA SeqVect::GuessAlpha() const\r
231         {\r
232 // If at least MIN_NUCLEO_PCT of the first CHAR_COUNT non-gap\r
233 // letters belong to the nucleotide alphabet, guess nucleo.\r
234 // Otherwise amino.\r
235         const unsigned CHAR_COUNT = 100;\r
236         const unsigned MIN_NUCLEO_PCT = 95;\r
237 \r
238         const unsigned uSeqCount = GetSeqCount();\r
239         if (0 == uSeqCount)\r
240                 return ALPHA_Amino;\r
241 \r
242         unsigned uSeqIndex = 0;\r
243         unsigned uPos = 0;\r
244         unsigned uSeqLength = GetSeqLength(0);\r
245         unsigned uDNACount = 0;\r
246         unsigned uRNACount = 0;\r
247         unsigned uTotal = 0;\r
248         const Seq *ptrSeq = &GetSeq(0);\r
249         for (;;)\r
250                 {\r
251                 while (uPos >= uSeqLength)\r
252                         {\r
253                         ++uSeqIndex;\r
254                         if (uSeqIndex >= uSeqCount)\r
255                                 break;\r
256                         ptrSeq = &GetSeq(uSeqIndex);\r
257                         uSeqLength = ptrSeq->Length();\r
258                         uPos = 0;\r
259                         }\r
260                 if (uSeqIndex >= uSeqCount)\r
261                         break;\r
262                 char c = ptrSeq->at(uPos++);\r
263                 if (IsGapChar(c))\r
264                         continue;\r
265                 if (IsDNA(c))\r
266                         ++uDNACount;\r
267                 if (IsRNA(c))\r
268                         ++uRNACount;\r
269                 ++uTotal;\r
270                 if (uTotal >= CHAR_COUNT)\r
271                         break;\r
272                 }\r
273         if (uTotal != 0 && ((uDNACount*100)/uTotal) >= MIN_NUCLEO_PCT)\r
274                 return ALPHA_DNA;\r
275         if (uTotal != 0 && ((uRNACount*100)/uTotal) >= MIN_NUCLEO_PCT)\r
276                 return ALPHA_RNA;\r
277         return ALPHA_Amino;\r
278         }\r
279 \r
280 void SeqVect::FixAlpha()\r
281         {\r
282         ClearInvalidLetterWarning();\r
283         unsigned uSeqCount = Length();\r
284         for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)\r
285                 {\r
286                 Seq *ptrSeq = at(uSeqIndex);\r
287                 ptrSeq->FixAlpha();\r
288                 }\r
289         ReportInvalidLetters();\r
290         }\r