Add GLprobs and MSAprobs to binaries
[jabaws.git] / binaries / src / MSAProbs-0.9.7 / MSAProbs / Sequence.h
1 /////////////////////////////////////////////////////////////////
2 // Sequence.h
3 //
4 // Class for reading/manipulating single sequence character data.
5 /////////////////////////////////////////////////////////////////
6
7 #ifndef SEQUENCE_H
8 #define SEQUENCE_H
9
10 #include <string>
11 #include <fstream>
12 #include <iostream>
13 #include <cctype>
14 #include <cstdlib>
15 #include "SafeVector.h"
16 #include "FileBuffer.h"
17
18 /////////////////////////////////////////////////////////////////
19 // Sequence
20 //
21 // Class for storing sequence information.
22 /////////////////////////////////////////////////////////////////
23
24 class Sequence {
25
26         bool isValid; // a boolean indicating whether the sequence data is valid or not
27         string header;       // string containing the comment line of the FASTA file
28         SafeVector<char> *data;      // pointer to character data
29         int length;                  // length of the sequence
30         int sequenceLabel; // integer sequence label, typically to indicate the ordering of sequences
31                                            //   in a Multi-FASTA file
32         int inputLabel;              // position of sequence in original input
33
34         /////////////////////////////////////////////////////////////////
35         // Sequence::Sequence()
36         //
37         // Default constructor.  Does nothing.
38         /////////////////////////////////////////////////////////////////
39
40         Sequence() :
41                         isValid(false), header(""), data(NULL), length(0), sequenceLabel(0), inputLabel(
42                                         0) {
43         }
44
45 public:
46
47         /////////////////////////////////////////////////////////////////
48         // Sequence::Sequence()
49         //
50         // Constructor.  Reads the sequence from a FileBuffer.
51         /////////////////////////////////////////////////////////////////
52
53         Sequence(FileBuffer &infile, bool stripGaps = false) :
54                         isValid(false), header("~"), data(NULL), length(0), sequenceLabel(
55                                         0), inputLabel(0) {
56
57                 // read until the first non-blank line
58                 while (!infile.eof()) {
59                         infile.GetLine(header);
60                         if (header.length() != 0)
61                                 break;
62                 }
63
64                 // check to make sure that it is a correct header line
65                 if (header[0] == '>') {
66
67                         // if so, remove the leading ">"
68                         header = header.substr(1);
69
70                         // remove any leading or trailing white space in the header comment
71                         while (header.length() > 0 && isspace(header[0]))
72                                 header = header.substr(1);
73                         while (header.length() > 0 && isspace(header[header.length() - 1]))
74                                 header = header.substr(0, header.length() - 1);
75
76                         // get ready to read the data[] array; note that data[0] is always '@'
77                         char ch;
78                         data = new SafeVector<char>;
79                         assert(data);
80                         data->push_back('@');
81
82                         // get a character from the file
83                         while (infile.Get(ch)) {
84
85                                 // if we've reached a new comment line, put the character back and stop
86                                 if (ch == '>') {
87                                         infile.UnGet();
88                                         break;
89                                 }
90
91                                 // skip whitespace
92                                 if (isspace(ch))
93                                         continue;
94
95                                 // substitute gap character
96                                 if (ch == '.')
97                                         ch = '-';
98                                 if (stripGaps && ch == '-')
99                                         continue;
100
101                                 // check for known characters
102                                 if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))) {
103                                         cerr << "ERROR: Unknown character encountered: " << ch
104                                                         << endl;
105                                         exit(1);
106                                 }
107
108                                 // everything's ok so far, so just store this character.
109                                 if (ch >= 'a' && ch <= 'z') {
110                                         ch = ch - 'a' + 'A';
111                                 }       //change to upper case. fixed by Liu Yongchao, May 21, 2010
112
113                                 data->push_back(ch);
114                                 ++length;
115                         }
116
117                         // sequence must contain data in order to be valid
118                         isValid = length > 0;
119                         if (!isValid) {
120                                 delete data;
121                                 data = NULL;
122                         }
123                 }
124         }
125
126         /////////////////////////////////////////////////////////////////
127         // Sequence::Sequence()
128         //
129         // Constructor.  Builds a sequence from existing data.  Note
130         // that the data must use one-based indexing where data[0] should
131         // be set to '@'.
132         /////////////////////////////////////////////////////////////////
133
134         Sequence(SafeVector<char> *data, string header, int length,
135                         int sequenceLabel, int inputLabel) :
136                         isValid(data != NULL), header(header), data(data), length(length), sequenceLabel(
137                                         sequenceLabel), inputLabel(inputLabel) {
138                 assert(data);
139                 assert((*data)[0] == '@');
140         }
141
142         /////////////////////////////////////////////////////////////////
143         // Sequence::Sequence()
144         //
145         // Destructor.  Release allocated memory.
146         /////////////////////////////////////////////////////////////////
147
148         ~Sequence() {
149                 if (data) {
150                         assert(isValid);
151                         delete data;
152                         data = NULL;
153                         isValid = false;
154                 }
155         }
156
157         /////////////////////////////////////////////////////////////////
158         // Sequence::GetHeader()
159         //
160         // Return the string comment associated with this sequence.
161         /////////////////////////////////////////////////////////////////
162
163         string GetHeader() const {
164                 return header;
165         }
166
167         /////////////////////////////////////////////////////////////////
168         // Sequence::GetName()
169         //
170         // Return the first word of the string comment associated with this sequence.
171         /////////////////////////////////////////////////////////////////
172
173         string GetName() const {
174                 char name[1024];
175                 sscanf(header.c_str(), "%s", name);
176                 return string(name);
177         }
178
179         /////////////////////////////////////////////////////////////////
180         // Sequence::GetDataPtr()
181         //
182         // Return the iterator to data associated with this sequence.
183         /////////////////////////////////////////////////////////////////
184
185         SafeVector<char>::iterator GetDataPtr() {
186                 assert(isValid);
187                 assert(data);
188                 return data->begin();
189         }
190
191         /////////////////////////////////////////////////////////////////
192         // Sequence::GetPosition()
193         //
194         // Return the character at position i.  Recall that the character
195         // data is stored with one-based indexing.
196         /////////////////////////////////////////////////////////////////
197
198         char GetPosition(int i) const {
199                 assert(isValid);
200                 assert(data);
201                 assert(i >= 1 && i <= length);
202                 return (*data)[i];
203         }
204
205         /////////////////////////////////////////////////////////////////
206         // Sequence::SetLabel()
207         //
208         // Sets the sequence label to i.
209         /////////////////////////////////////////////////////////////////
210
211         void SetLabel(int i) {
212                 assert(isValid);
213                 sequenceLabel = i;
214                 inputLabel = i;
215         }
216
217         /////////////////////////////////////////////////////////////////
218         // Sequence::SetSortLabel()
219         //
220         // Sets the sequence sorting label to i.
221         /////////////////////////////////////////////////////////////////
222
223         void SetSortLabel(int i) {
224                 assert(isValid);
225                 sequenceLabel = i;
226         }
227
228         /////////////////////////////////////////////////////////////////
229         // Sequence::GetLabel()
230         //
231         // Retrieves the input label.
232         /////////////////////////////////////////////////////////////////
233
234         int GetLabel() const {
235                 assert(isValid);
236                 return inputLabel;
237         }
238
239         /////////////////////////////////////////////////////////////////
240         // Sequence::GetSortLabel()
241         //
242         // Retrieves the sorting label.
243         /////////////////////////////////////////////////////////////////
244
245         int GetSortLabel() const {
246                 assert(isValid);
247                 return sequenceLabel;
248         }
249
250         /////////////////////////////////////////////////////////////////
251         // Sequence::Fail()
252         //
253         // Checks to see if the sequence successfully loaded.
254         /////////////////////////////////////////////////////////////////
255
256         bool Fail() const {
257                 return !isValid;
258         }
259
260         /////////////////////////////////////////////////////////////////
261         // Sequence::Length()
262         //
263         // Returns the length of the sequence.
264         /////////////////////////////////////////////////////////////////
265
266         int GetLength() const {
267                 assert(isValid);
268                 assert(data);
269                 return length;
270         }
271
272         /////////////////////////////////////////////////////////////////
273         // Sequence::WriteMFA()
274         //
275         // Writes the sequence to outfile in MFA format.  Uses numColumns
276         // columns per line.  If useIndex is set to false, then the
277         // header is printed as normal, but if useIndex is true, then
278         // ">S###" is printed where ### represents the sequence label.
279         /////////////////////////////////////////////////////////////////
280
281         void WriteMFA(ostream &outfile, int numColumns,
282                         bool useIndex = false) const {
283                 assert(isValid);
284                 assert(data);
285                 assert(!outfile.fail());
286
287                 // print out heading
288                 if (useIndex)
289                         outfile << ">S" << GetLabel() << endl;
290                 else
291                         outfile << ">" << header << endl;
292
293                 // print out character data
294                 int ct = 1;
295                 for (; ct <= length; ct++) {
296                         outfile << (*data)[ct];
297                         if (ct % numColumns == 0)
298                                 outfile << endl;
299                 }
300                 if ((ct - 1) % numColumns != 0)
301                         outfile << endl;
302         }
303
304         /////////////////////////////////////////////////////////////////
305         // Sequence::Clone()
306         //
307         // Returns a new deep copy of the seqeuence.
308         /////////////////////////////////////////////////////////////////
309
310         Sequence *Clone() const {
311                 Sequence *ret = new Sequence();
312                 assert(ret);
313
314                 ret->isValid = isValid;
315                 ret->header = header;
316                 ret->data = new SafeVector<char>;
317                 assert(ret->data);
318                 *(ret->data) = *data;
319                 ret->length = length;
320                 ret->sequenceLabel = sequenceLabel;
321                 ret->inputLabel = inputLabel;
322
323                 return ret;
324         }
325
326         /////////////////////////////////////////////////////////////////
327         // Sequence::GetRange()
328         //
329         // Returns a new sequence object consisting of a range of
330         // characters from the current seuquence.
331         /////////////////////////////////////////////////////////////////
332
333         Sequence *GetRange(int start, int end) const {
334                 Sequence *ret = new Sequence();
335                 assert(ret);
336
337                 assert(start >= 1 && start <= length);
338                 assert(end >= 1 && end <= length);
339                 assert(start <= end);
340
341                 ret->isValid = isValid;
342                 ret->header = header;
343                 ret->data = new SafeVector<char>;
344                 assert(ret->data);
345                 ret->data->push_back('@');
346                 for (int i = start; i <= end; i++)
347                         ret->data->push_back((*data)[i]);
348                 ret->length = end - start + 1;
349                 ret->sequenceLabel = sequenceLabel;
350                 ret->inputLabel = inputLabel;
351
352                 return ret;
353         }
354
355         /////////////////////////////////////////////////////////////////
356         // Sequence::AddGaps()
357         //
358         // Given an SafeVector<char> containing the skeleton for an
359         // alignment and the identity of the current character, this
360         // routine will create a new sequence with all necesssary gaps added.
361         // For instance,
362         //    alignment = "XXXBBYYYBBYYXX"
363         //    id = 'X'
364         // will perform the transformation
365         //    "ATGCAGTCA" --> "ATGCC---GT--CA"
366         //                    (XXXBBYYYBBYYXX)
367         /////////////////////////////////////////////////////////////////
368
369         Sequence *AddGaps(SafeVector<char> *alignment, char id) {
370                 Sequence *ret = new Sequence();
371                 assert(ret);
372
373                 ret->isValid = isValid;
374                 ret->header = header;
375                 ret->data = new SafeVector<char>;
376                 assert(ret->data);
377                 ret->length = (int) alignment->size();
378                 ret->sequenceLabel = sequenceLabel;
379                 ret->inputLabel = inputLabel;
380                 ret->data->push_back('@');
381
382                 SafeVector<char>::iterator dataIter = data->begin() + 1;
383                 for (SafeVector<char>::iterator iter = alignment->begin();
384                                 iter != alignment->end(); ++iter) {
385                         if (*iter == 'B' || *iter == id) {
386                                 ret->data->push_back(*dataIter);
387                                 ++dataIter;
388                         } else
389                                 ret->data->push_back('-');
390                 }
391
392                 return ret;
393         }
394
395         /////////////////////////////////////////////////////////////////
396         // Sequence::GetString()
397         //
398         // Returns the sequence as a string with gaps removed.
399         /////////////////////////////////////////////////////////////////
400
401         string GetString() {
402                 string s = "";
403                 for (int i = 1; i <= length; i++) {
404                         if ((*data)[i] != '-')
405                                 s += (*data)[i];
406                 }
407                 return s;
408         }
409
410         /////////////////////////////////////////////////////////////////
411         // Sequence::GetMapping()
412         //
413         // Returns a SafeVector<int> containing the indices of every
414         // character in the sequence.  For instance, if the data is
415         // "ATGCC---GT--CA", the method returns {1,2,3,4,5,9,10,13,14}.
416         /////////////////////////////////////////////////////////////////
417
418         SafeVector<int> *GetMapping() const {
419                 SafeVector<int> *ret = new SafeVector<int>(1, 0);
420                 for (int i = 1; i <= length; i++) {
421                         if ((*data)[i] != '-')
422                                 ret->push_back(i);
423                 }
424                 return ret;
425         }
426
427         /////////////////////////////////////////////////////////////////
428         // Sequence::Highlight()
429         //
430         // Changes all positions with score >= cutoff to upper case and
431         // all positions with score < cutoff to lower case.
432         /////////////////////////////////////////////////////////////////
433
434         void Highlight(const SafeVector<float> &scores, const float cutoff) {
435                 for (int i = 1; i <= length; i++) {
436                         if (scores[i - 1] >= cutoff)
437                                 (*data)[i] = toupper((*data)[i]);
438                         else
439                                 (*data)[i] = tolower((*data)[i]);
440                 }
441         }
442 };
443
444 #endif