1 /////////////////////////////////////////////////////////////////
4 // Class for reading/manipulating single sequence character data.
5 /////////////////////////////////////////////////////////////////
15 #include "SafeVector.h"
16 #include "FileBuffer.h"
18 /////////////////////////////////////////////////////////////////
21 // Class for storing sequence information.
22 /////////////////////////////////////////////////////////////////
26 bool isValid; // a boolean indicating whether the sequence data is valid or not
27 string header; // string containing the comment line of the FASTA file
28 SafeVector<char> *data; // pointer to character data
29 int length; // length of the sequence
30 int sequenceLabel; // integer sequence label, typically to indicate the ordering of sequences
31 // in a Multi-FASTA file
32 int inputLabel; // position of sequence in original input
35 /////////////////////////////////////////////////////////////////
36 // Sequence::Sequence()
38 // Default constructor. Does nothing.
39 /////////////////////////////////////////////////////////////////
41 Sequence () : isValid (false), header (""), data (NULL), length (0), sequenceLabel (0), inputLabel (0) {}
45 /////////////////////////////////////////////////////////////////
46 // Sequence::Sequence()
48 // Constructor. Reads the sequence from a FileBuffer.
49 /////////////////////////////////////////////////////////////////
51 Sequence (FileBuffer &infile, bool stripGaps = false) : isValid (false), header ("~"), data (NULL), length(0), sequenceLabel (0), inputLabel (0) {
53 // read until the first non-blank line
54 while (!infile.eof()){
55 infile.GetLine (header);
56 if (header.length() != 0) break;
59 // check to make sure that it is a correct header line
60 if (header[0] == '>'){
62 // if so, remove the leading ">"
63 header = header.substr (1);
65 // remove any leading or trailing white space in the header comment
66 while (header.length() > 0 && isspace (header[0])) header = header.substr (1);
67 while (header.length() > 0 && isspace (header[header.length() - 1])) header = header.substr(0, header.length() - 1);
69 // get ready to read the data[] array; note that data[0] is always '@'
71 data = new SafeVector<char>; assert (data);
72 data->push_back ('@');
74 // get a character from the file
75 while (infile.Get(ch)){
77 // if we've reached a new comment line, put the character back and stop
78 if (ch == '>'){ infile.UnGet(); break; }
81 if (isspace (ch)) continue;
83 // substitute gap character
84 if (ch == '.') ch = '-';
85 if (stripGaps && ch == '-') continue;
87 // check for known characters
88 if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '*' || ch == '-')){
89 cerr << "ERROR: Unknown character encountered: " << ch << endl;
93 // everything's ok so far, so just store this character.
98 // sequence must contain data in order to be valid
108 /////////////////////////////////////////////////////////////////
109 // Sequence::Sequence()
111 // Constructor. Builds a sequence from existing data. Note
112 // that the data must use one-based indexing where data[0] should
114 /////////////////////////////////////////////////////////////////
116 Sequence (SafeVector<char> *data, string header, int length, int sequenceLabel, int inputLabel) :
117 isValid (data != NULL), header(header), data(data), length (length), sequenceLabel (sequenceLabel), inputLabel (inputLabel) {
119 assert ((*data)[0] == '@');
122 /////////////////////////////////////////////////////////////////
123 // Sequence::Sequence()
125 // Destructor. Release allocated memory.
126 /////////////////////////////////////////////////////////////////
137 void SetWeight(float myWeight) {
140 float GetWeight() const {
144 /////////////////////////////////////////////////////////////////
145 // Sequence::GetHeader()
147 // Return the string comment associated with this sequence.
148 /////////////////////////////////////////////////////////////////
150 string GetHeader () const {
154 /////////////////////////////////////////////////////////////////
155 // Sequence::GetName()
157 // Return the first word of the string comment associated with this sequence.
158 /////////////////////////////////////////////////////////////////
160 string GetName () const {
162 sscanf (header.c_str(), "%s", name);
166 /////////////////////////////////////////////////////////////////
167 // Sequence::GetDataPtr()
169 // Return the iterator to data associated with this sequence.
170 /////////////////////////////////////////////////////////////////
172 SafeVector<char>::iterator GetDataPtr(){
175 return data->begin();
178 /////////////////////////////////////////////////////////////////
179 // Sequence::GetPosition()
181 // Return the character at position i. Recall that the character
182 // data is stored with one-based indexing.
183 /////////////////////////////////////////////////////////////////
185 char GetPosition (int i) const {
188 assert (i >= 0 && i <= length);
192 /////////////////////////////////////////////////////////////////
193 // Sequence::SetLabel()
195 // Sets the sequence label to i.
196 /////////////////////////////////////////////////////////////////
198 void SetLabel (int i){
204 /////////////////////////////////////////////////////////////////
205 // Sequence::SetSortLabel()
207 // Sets the sequence sorting label to i.
208 /////////////////////////////////////////////////////////////////
210 void SetSortLabel (int i){
215 /////////////////////////////////////////////////////////////////
216 // Sequence::GetLabel()
218 // Retrieves the input label.
219 /////////////////////////////////////////////////////////////////
221 int GetLabel () const {
226 /////////////////////////////////////////////////////////////////
227 // Sequence::GetSortLabel()
229 // Retrieves the sorting label.
230 /////////////////////////////////////////////////////////////////
232 int GetSortLabel () const {
234 return sequenceLabel;
237 /////////////////////////////////////////////////////////////////
240 // Checks to see if the sequence successfully loaded.
241 /////////////////////////////////////////////////////////////////
247 /////////////////////////////////////////////////////////////////
248 // Sequence::Length()
250 // Returns the length of the sequence.
251 /////////////////////////////////////////////////////////////////
253 int GetLength () const {
259 /////////////////////////////////////////////////////////////////
260 // Sequence::WriteMFA()
262 // Writes the sequence to outfile in MFA format. Uses numColumns
263 // columns per line. If useIndex is set to false, then the
264 // header is printed as normal, but if useIndex is true, then
265 // ">S###" is printed where ### represents the sequence label.
266 /////////////////////////////////////////////////////////////////
268 void WriteMFA (ostream &outfile, int numColumns, bool useIndex = false) const {
271 assert (!outfile.fail());
275 outfile << ">S" << GetLabel() << endl;
277 outfile << ">" << header << endl;
279 // print out character data
281 for (; ct <= length; ct++){
282 outfile << (*data)[ct];
283 if (ct % numColumns == 0) outfile << endl;
285 if ((ct-1) % numColumns != 0) outfile << endl;
288 /////////////////////////////////////////////////////////////////
289 // Sequence::WriteWEB()
291 // output for web interfase based on Sequence::WriteMFA()
292 /////////////////////////////////////////////////////////////////
294 void WriteWEB (ostream &outfile, int numColumns, bool useIndex = false) const {
297 assert (!outfile.fail());
299 outfile << "<php ref=\"" << GetLabel() << "\">" << endl;
300 outfile << "<name>" << endl;
303 outfile << "S" << GetLabel() << endl;
305 outfile << "" << header << endl;
307 outfile << "</name>" << endl;
309 // print out character data
310 outfile << "<sequence>" << endl;
312 for (; ct <= length; ct++){
313 outfile << (*data)[ct];
314 if (ct % numColumns == 0) outfile << endl;
316 if ((ct-1) % numColumns != 0) outfile << endl;
318 outfile << "</sequence>" << endl;
319 outfile << "</php>" << endl;
322 /////////////////////////////////////////////////////////////////
325 // Returns a new deep copy of the seqeuence.
326 /////////////////////////////////////////////////////////////////
328 Sequence *Clone () const {
329 Sequence *ret = new Sequence();
332 ret->isValid = isValid;
333 ret->header = header;
334 ret->data = new SafeVector<char>; assert (ret->data);
335 *(ret->data) = *data;
336 ret->length = length;
337 ret->sequenceLabel = sequenceLabel;
338 ret->inputLabel = inputLabel;
339 ret->weight = weight;
344 /////////////////////////////////////////////////////////////////
345 // Sequence::GetRange()
347 // Returns a new sequence object consisting of a range of
348 // characters from the current seuquence.
349 /////////////////////////////////////////////////////////////////
351 Sequence *GetRange (int start, int end) const {
352 Sequence *ret = new Sequence();
355 assert (start >= 1 && start <= length);
356 assert (end >= 1 && end <= length);
357 assert (start <= end);
359 ret->isValid = isValid;
360 ret->header = header;
361 ret->data = new SafeVector<char>; assert (ret->data);
362 ret->data->push_back ('@');
363 for (int i = start; i <= end; i++)
364 ret->data->push_back ((*data)[i]);
365 ret->length = end - start + 1;
366 ret->sequenceLabel = sequenceLabel;
367 ret->inputLabel = inputLabel;
372 /////////////////////////////////////////////////////////////////
373 // Sequence::AddGaps()
375 // Given an SafeVector<char> containing the skeleton for an
376 // alignment and the identity of the current character, this
377 // routine will create a new sequence with all necesssary gaps added.
379 // alignment = "XXXBBYYYBBYYXX"
381 // will perform the transformation
382 // "ATGCAGTCA" --> "ATGCC---GT--CA"
384 /////////////////////////////////////////////////////////////////
386 Sequence *AddGaps (SafeVector<char> *alignment, char id){
387 Sequence *ret = new Sequence();
390 ret->isValid = isValid;
391 ret->header = header;
392 ret->data = new SafeVector<char>; assert (ret->data);
393 ret->length = (int) alignment->size();
394 ret->sequenceLabel = sequenceLabel;
395 ret->inputLabel = inputLabel;
396 ret->data->push_back ('@');
398 SafeVector<char>::iterator dataIter = data->begin() + 1;
399 for (SafeVector<char>::iterator iter = alignment->begin(); iter != alignment->end(); ++iter){
400 if (*iter == 'B' || *iter == id){
401 ret->data->push_back (*dataIter);
405 ret->data->push_back ('-');
411 /////////////////////////////////////////////////////////////////
412 // Sequence::AddGaps()
414 // Given an SafeVector<char> containing the skeleton for an
415 // alignment and the identity of the current character, this
416 // routine will create a new sequence with all necesssary gaps added.
418 // alignment = "XXXBBYYYBBYYXX"
420 // will perform the transformation
421 // "ATGCAGTCA" --> "ATGCC---GT--CA"
423 /////////////////////////////////////////////////////////////////
424 Sequence *AddGapsReverse (SafeVector<char> *alignment, char id){
425 Sequence *ret = new Sequence();
428 ret->isValid = isValid;
429 ret->header = header;
430 ret->data = new SafeVector<char>; assert (ret->data);
431 ret->length = (int) alignment->size();
432 ret->sequenceLabel = sequenceLabel;
433 ret->inputLabel = inputLabel;
434 ret->data->push_back ('@');
436 SafeVector<char>::iterator dataIter = data->begin() + 1;
437 for (SafeVector<char>::reverse_iterator iter = alignment->rbegin(); iter != alignment->rend(); ++iter){
438 if (*iter == 'B' || *iter == id){
439 ret->data->push_back (*dataIter);
443 ret->data->push_back ('-');
450 /////////////////////////////////////////////////////////////////
451 // Sequence::GetString()
453 // Returns the sequence as a string with gaps removed.
454 /////////////////////////////////////////////////////////////////
458 for (int i = 1; i <= length; i++){
459 if ((*data)[i] != '-') s += (*data)[i];
465 /////////////////////////////////////////////////////////////////
466 // Sequence::GetMapping()
468 // Returns a SafeVector<int> containing the indices of every
469 // character in the sequence. For instance, if the data is
470 // "ATGCC---GT--CA", the method returns {1,2,3,4,5,9,10,13,14}.
471 /////////////////////////////////////////////////////////////////
473 SafeVector<int> *GetMapping () const {
474 SafeVector<int> *ret = new SafeVector<int>(1, 0);
475 for (int i = 1; i <= length; i++){
476 if ((*data)[i] != '-') ret->push_back (i);
481 /////////////////////////////////////////////////////////////////
482 // Sequence::GetMappingNumber()
484 // Returns a SafeVector<int> containing the indices of every
485 // character in the sequence. For instance, if the data is
486 // "ATGCC---GT--CA", the method returns {1,2,3,4,5,0,0,0,6,7,0,0,8,9}.
487 /////////////////////////////////////////////////////////////////
488 SafeVector<int> *GetMappingNumber () const {
489 SafeVector<int> *ret = new SafeVector<int>(1, 0);
491 for(int i = 1; i <= length; i++) {
492 if((*data)[i] != '-') ret->push_back(++count);
493 else ret->push_back(0);
498 /////////////////////////////////////////////////////////////////
499 // Sequence::Highlight()
501 // Changes all positions with score >= cutoff to upper case and
502 // all positions with score < cutoff to lower case.
503 /////////////////////////////////////////////////////////////////
505 void Highlight (const SafeVector<float> &scores, const float cutoff){
506 for (int i = 1; i <= length; i++){
507 if (scores[i-1] >= cutoff)
508 (*data)[i] = toupper ((*data)[i]);
510 (*data)[i] = tolower ((*data)[i]);
515 #endif // __SQUENCE_HPP__