1 /////////////////////////////////////////////////////////////////
4 // Class for reading/manipulating single sequence character data.
5 /////////////////////////////////////////////////////////////////
15 #include "SafeVector.h"
16 #include "FileBuffer.h"
18 /////////////////////////////////////////////////////////////////
21 // Class for storing sequence information.
22 /////////////////////////////////////////////////////////////////
26 bool isValid; // a boolean indicating whether the sequence data is valid or not
27 string header; // string containing the comment line of the FASTA file
28 SafeVector<char> *data; // pointer to character data
29 int length; // length of the sequence
30 int sequenceLabel; // integer sequence label, typically to indicate the ordering of sequences
31 // in a Multi-FASTA file
32 int inputLabel; // position of sequence in original input
34 /////////////////////////////////////////////////////////////////
35 // Sequence::Sequence()
37 // Default constructor. Does nothing.
38 /////////////////////////////////////////////////////////////////
41 isValid(false), header(""), data(NULL), length(0), sequenceLabel(0), inputLabel(
47 /////////////////////////////////////////////////////////////////
48 // Sequence::Sequence()
50 // Constructor. Reads the sequence from a FileBuffer.
51 /////////////////////////////////////////////////////////////////
53 Sequence(FileBuffer &infile, bool stripGaps = false) :
54 isValid(false), header("~"), data(NULL), length(0), sequenceLabel(
57 // read until the first non-blank line
58 while (!infile.eof()) {
59 infile.GetLine(header);
60 if (header.length() != 0)
64 // check to make sure that it is a correct header line
65 if (header[0] == '>') {
67 // if so, remove the leading ">"
68 header = header.substr(1);
70 // remove any leading or trailing white space in the header comment
71 while (header.length() > 0 && isspace(header[0]))
72 header = header.substr(1);
73 while (header.length() > 0 && isspace(header[header.length() - 1]))
74 header = header.substr(0, header.length() - 1);
76 // get ready to read the data[] array; note that data[0] is always '@'
78 data = new SafeVector<char>;
82 // get a character from the file
83 while (infile.Get(ch)) {
85 // if we've reached a new comment line, put the character back and stop
95 // substitute gap character
98 if (stripGaps && ch == '-')
101 // check for known characters
102 if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))) {
103 cerr << "ERROR: Unknown character encountered: " << ch
108 // everything's ok so far, so just store this character.
109 if (ch >= 'a' && ch <= 'z') {
111 } //change to upper case. fixed by Liu Yongchao, May 21, 2010
117 // sequence must contain data in order to be valid
118 isValid = length > 0;
126 /////////////////////////////////////////////////////////////////
127 // Sequence::Sequence()
129 // Constructor. Builds a sequence from existing data. Note
130 // that the data must use one-based indexing where data[0] should
132 /////////////////////////////////////////////////////////////////
134 Sequence(SafeVector<char> *data, string header, int length,
135 int sequenceLabel, int inputLabel) :
136 isValid(data != NULL), header(header), data(data), length(length), sequenceLabel(
137 sequenceLabel), inputLabel(inputLabel) {
139 assert((*data)[0] == '@');
142 /////////////////////////////////////////////////////////////////
143 // Sequence::Sequence()
145 // Destructor. Release allocated memory.
146 /////////////////////////////////////////////////////////////////
157 /////////////////////////////////////////////////////////////////
158 // Sequence::GetHeader()
160 // Return the string comment associated with this sequence.
161 /////////////////////////////////////////////////////////////////
163 string GetHeader() const {
167 /////////////////////////////////////////////////////////////////
168 // Sequence::GetName()
170 // Return the first word of the string comment associated with this sequence.
171 /////////////////////////////////////////////////////////////////
173 string GetName() const {
175 sscanf(header.c_str(), "%s", name);
179 /////////////////////////////////////////////////////////////////
180 // Sequence::GetDataPtr()
182 // Return the iterator to data associated with this sequence.
183 /////////////////////////////////////////////////////////////////
185 SafeVector<char>::iterator GetDataPtr() {
188 return data->begin();
191 /////////////////////////////////////////////////////////////////
192 // Sequence::GetPosition()
194 // Return the character at position i. Recall that the character
195 // data is stored with one-based indexing.
196 /////////////////////////////////////////////////////////////////
198 char GetPosition(int i) const {
201 assert(i >= 1 && i <= length);
205 /////////////////////////////////////////////////////////////////
206 // Sequence::SetLabel()
208 // Sets the sequence label to i.
209 /////////////////////////////////////////////////////////////////
211 void SetLabel(int i) {
217 /////////////////////////////////////////////////////////////////
218 // Sequence::SetSortLabel()
220 // Sets the sequence sorting label to i.
221 /////////////////////////////////////////////////////////////////
223 void SetSortLabel(int i) {
228 /////////////////////////////////////////////////////////////////
229 // Sequence::GetLabel()
231 // Retrieves the input label.
232 /////////////////////////////////////////////////////////////////
234 int GetLabel() const {
239 /////////////////////////////////////////////////////////////////
240 // Sequence::GetSortLabel()
242 // Retrieves the sorting label.
243 /////////////////////////////////////////////////////////////////
245 int GetSortLabel() const {
247 return sequenceLabel;
250 /////////////////////////////////////////////////////////////////
253 // Checks to see if the sequence successfully loaded.
254 /////////////////////////////////////////////////////////////////
260 /////////////////////////////////////////////////////////////////
261 // Sequence::Length()
263 // Returns the length of the sequence.
264 /////////////////////////////////////////////////////////////////
266 int GetLength() const {
272 /////////////////////////////////////////////////////////////////
273 // Sequence::WriteMFA()
275 // Writes the sequence to outfile in MFA format. Uses numColumns
276 // columns per line. If useIndex is set to false, then the
277 // header is printed as normal, but if useIndex is true, then
278 // ">S###" is printed where ### represents the sequence label.
279 /////////////////////////////////////////////////////////////////
281 void WriteMFA(ostream &outfile, int numColumns,
282 bool useIndex = false) const {
285 assert(!outfile.fail());
289 outfile << ">S" << GetLabel() << endl;
291 outfile << ">" << header << endl;
293 // print out character data
295 for (; ct <= length; ct++) {
296 outfile << (*data)[ct];
297 if (ct % numColumns == 0)
300 if ((ct - 1) % numColumns != 0)
304 /////////////////////////////////////////////////////////////////
307 // Returns a new deep copy of the seqeuence.
308 /////////////////////////////////////////////////////////////////
310 Sequence *Clone() const {
311 Sequence *ret = new Sequence();
314 ret->isValid = isValid;
315 ret->header = header;
316 ret->data = new SafeVector<char>;
318 *(ret->data) = *data;
319 ret->length = length;
320 ret->sequenceLabel = sequenceLabel;
321 ret->inputLabel = inputLabel;
326 /////////////////////////////////////////////////////////////////
327 // Sequence::GetRange()
329 // Returns a new sequence object consisting of a range of
330 // characters from the current seuquence.
331 /////////////////////////////////////////////////////////////////
333 Sequence *GetRange(int start, int end) const {
334 Sequence *ret = new Sequence();
337 assert(start >= 1 && start <= length);
338 assert(end >= 1 && end <= length);
339 assert(start <= end);
341 ret->isValid = isValid;
342 ret->header = header;
343 ret->data = new SafeVector<char>;
345 ret->data->push_back('@');
346 for (int i = start; i <= end; i++)
347 ret->data->push_back((*data)[i]);
348 ret->length = end - start + 1;
349 ret->sequenceLabel = sequenceLabel;
350 ret->inputLabel = inputLabel;
355 /////////////////////////////////////////////////////////////////
356 // Sequence::AddGaps()
358 // Given an SafeVector<char> containing the skeleton for an
359 // alignment and the identity of the current character, this
360 // routine will create a new sequence with all necesssary gaps added.
362 // alignment = "XXXBBYYYBBYYXX"
364 // will perform the transformation
365 // "ATGCAGTCA" --> "ATGCC---GT--CA"
367 /////////////////////////////////////////////////////////////////
369 Sequence *AddGaps(SafeVector<char> *alignment, char id) {
370 Sequence *ret = new Sequence();
373 ret->isValid = isValid;
374 ret->header = header;
375 ret->data = new SafeVector<char>;
377 ret->length = (int) alignment->size();
378 ret->sequenceLabel = sequenceLabel;
379 ret->inputLabel = inputLabel;
380 ret->data->push_back('@');
382 SafeVector<char>::iterator dataIter = data->begin() + 1;
383 for (SafeVector<char>::iterator iter = alignment->begin();
384 iter != alignment->end(); ++iter) {
385 if (*iter == 'B' || *iter == id) {
386 ret->data->push_back(*dataIter);
389 ret->data->push_back('-');
395 /////////////////////////////////////////////////////////////////
396 // Sequence::GetString()
398 // Returns the sequence as a string with gaps removed.
399 /////////////////////////////////////////////////////////////////
403 for (int i = 1; i <= length; i++) {
404 if ((*data)[i] != '-')
410 /////////////////////////////////////////////////////////////////
411 // Sequence::GetMapping()
413 // Returns a SafeVector<int> containing the indices of every
414 // character in the sequence. For instance, if the data is
415 // "ATGCC---GT--CA", the method returns {1,2,3,4,5,9,10,13,14}.
416 /////////////////////////////////////////////////////////////////
418 SafeVector<int> *GetMapping() const {
419 SafeVector<int> *ret = new SafeVector<int>(1, 0);
420 for (int i = 1; i <= length; i++) {
421 if ((*data)[i] != '-')
427 /////////////////////////////////////////////////////////////////
428 // Sequence::Highlight()
430 // Changes all positions with score >= cutoff to upper case and
431 // all positions with score < cutoff to lower case.
432 /////////////////////////////////////////////////////////////////
434 void Highlight(const SafeVector<float> &scores, const float cutoff) {
435 for (int i = 1; i <= length; i++) {
436 if (scores[i - 1] >= cutoff)
437 (*data)[i] = toupper((*data)[i]);
439 (*data)[i] = tolower((*data)[i]);