///////////////////////////////////////////////////////////////// // Sequence.h // // Class for reading/manipulating single sequence character data. ///////////////////////////////////////////////////////////////// #ifndef SEQUENCE_H #define SEQUENCE_H #include #include #include #include #include #include "SafeVector.h" #include "FileBuffer.h" ///////////////////////////////////////////////////////////////// // Sequence // // Class for storing sequence information. ///////////////////////////////////////////////////////////////// class Sequence { bool isValid; // a boolean indicating whether the sequence data is valid or not string header; // string containing the comment line of the FASTA file SafeVector *data; // pointer to character data int length; // length of the sequence int sequenceLabel; // integer sequence label, typically to indicate the ordering of sequences // in a Multi-FASTA file int inputLabel; // position of sequence in original input ///////////////////////////////////////////////////////////////// // Sequence::Sequence() // // Default constructor. Does nothing. ///////////////////////////////////////////////////////////////// Sequence() : isValid(false), header(""), data(NULL), length(0), sequenceLabel(0), inputLabel( 0) { } public: ///////////////////////////////////////////////////////////////// // Sequence::Sequence() // // Constructor. Reads the sequence from a FileBuffer. ///////////////////////////////////////////////////////////////// Sequence(FileBuffer &infile, bool stripGaps = false) : isValid(false), header("~"), data(NULL), length(0), sequenceLabel( 0), inputLabel(0) { // read until the first non-blank line while (!infile.eof()) { infile.GetLine(header); if (header.length() != 0) break; } // check to make sure that it is a correct header line if (header[0] == '>') { // if so, remove the leading ">" header = header.substr(1); // remove any leading or trailing white space in the header comment while (header.length() > 0 && isspace(header[0])) header = header.substr(1); while (header.length() > 0 && isspace(header[header.length() - 1])) header = header.substr(0, header.length() - 1); // get ready to read the data[] array; note that data[0] is always '@' char ch; data = new SafeVector; assert(data); data->push_back('@'); // get a character from the file while (infile.Get(ch)) { // if we've reached a new comment line, put the character back and stop if (ch == '>') { infile.UnGet(); break; } // skip whitespace if (isspace(ch)) continue; // substitute gap character if (ch == '.') ch = '-'; if (stripGaps && ch == '-') continue; // check for known characters if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))) { cerr << "ERROR: Unknown character encountered: " << ch << endl; exit(1); } // everything's ok so far, so just store this character. if (ch >= 'a' && ch <= 'z') { ch = ch - 'a' + 'A'; } //change to upper case. fixed by Liu Yongchao, May 21, 2010 data->push_back(ch); ++length; } // sequence must contain data in order to be valid isValid = length > 0; if (!isValid) { delete data; data = NULL; } } } ///////////////////////////////////////////////////////////////// // Sequence::Sequence() // // Constructor. Builds a sequence from existing data. Note // that the data must use one-based indexing where data[0] should // be set to '@'. ///////////////////////////////////////////////////////////////// Sequence(SafeVector *data, string header, int length, int sequenceLabel, int inputLabel) : isValid(data != NULL), header(header), data(data), length(length), sequenceLabel( sequenceLabel), inputLabel(inputLabel) { assert(data); assert((*data)[0] == '@'); } ///////////////////////////////////////////////////////////////// // Sequence::Sequence() // // Destructor. Release allocated memory. ///////////////////////////////////////////////////////////////// ~Sequence() { if (data) { assert(isValid); delete data; data = NULL; isValid = false; } } ///////////////////////////////////////////////////////////////// // Sequence::GetHeader() // // Return the string comment associated with this sequence. ///////////////////////////////////////////////////////////////// string GetHeader() const { return header; } ///////////////////////////////////////////////////////////////// // Sequence::GetName() // // Return the first word of the string comment associated with this sequence. ///////////////////////////////////////////////////////////////// string GetName() const { char name[1024]; sscanf(header.c_str(), "%s", name); return string(name); } ///////////////////////////////////////////////////////////////// // Sequence::GetDataPtr() // // Return the iterator to data associated with this sequence. ///////////////////////////////////////////////////////////////// SafeVector::iterator GetDataPtr() { assert(isValid); assert(data); return data->begin(); } ///////////////////////////////////////////////////////////////// // Sequence::GetPosition() // // Return the character at position i. Recall that the character // data is stored with one-based indexing. ///////////////////////////////////////////////////////////////// char GetPosition(int i) const { assert(isValid); assert(data); assert(i >= 1 && i <= length); return (*data)[i]; } ///////////////////////////////////////////////////////////////// // Sequence::SetLabel() // // Sets the sequence label to i. ///////////////////////////////////////////////////////////////// void SetLabel(int i) { assert(isValid); sequenceLabel = i; inputLabel = i; } ///////////////////////////////////////////////////////////////// // Sequence::SetSortLabel() // // Sets the sequence sorting label to i. ///////////////////////////////////////////////////////////////// void SetSortLabel(int i) { assert(isValid); sequenceLabel = i; } ///////////////////////////////////////////////////////////////// // Sequence::GetLabel() // // Retrieves the input label. ///////////////////////////////////////////////////////////////// int GetLabel() const { assert(isValid); return inputLabel; } ///////////////////////////////////////////////////////////////// // Sequence::GetSortLabel() // // Retrieves the sorting label. ///////////////////////////////////////////////////////////////// int GetSortLabel() const { assert(isValid); return sequenceLabel; } ///////////////////////////////////////////////////////////////// // Sequence::Fail() // // Checks to see if the sequence successfully loaded. ///////////////////////////////////////////////////////////////// bool Fail() const { return !isValid; } ///////////////////////////////////////////////////////////////// // Sequence::Length() // // Returns the length of the sequence. ///////////////////////////////////////////////////////////////// int GetLength() const { assert(isValid); assert(data); return length; } ///////////////////////////////////////////////////////////////// // Sequence::WriteMFA() // // Writes the sequence to outfile in MFA format. Uses numColumns // columns per line. If useIndex is set to false, then the // header is printed as normal, but if useIndex is true, then // ">S###" is printed where ### represents the sequence label. ///////////////////////////////////////////////////////////////// void WriteMFA(ostream &outfile, int numColumns, bool useIndex = false) const { assert(isValid); assert(data); assert(!outfile.fail()); // print out heading if (useIndex) outfile << ">S" << GetLabel() << endl; else outfile << ">" << header << endl; // print out character data int ct = 1; for (; ct <= length; ct++) { outfile << (*data)[ct]; if (ct % numColumns == 0) outfile << endl; } if ((ct - 1) % numColumns != 0) outfile << endl; } ///////////////////////////////////////////////////////////////// // Sequence::Clone() // // Returns a new deep copy of the seqeuence. ///////////////////////////////////////////////////////////////// Sequence *Clone() const { Sequence *ret = new Sequence(); assert(ret); ret->isValid = isValid; ret->header = header; ret->data = new SafeVector; assert(ret->data); *(ret->data) = *data; ret->length = length; ret->sequenceLabel = sequenceLabel; ret->inputLabel = inputLabel; return ret; } ///////////////////////////////////////////////////////////////// // Sequence::GetRange() // // Returns a new sequence object consisting of a range of // characters from the current seuquence. ///////////////////////////////////////////////////////////////// Sequence *GetRange(int start, int end) const { Sequence *ret = new Sequence(); assert(ret); assert(start >= 1 && start <= length); assert(end >= 1 && end <= length); assert(start <= end); ret->isValid = isValid; ret->header = header; ret->data = new SafeVector; assert(ret->data); ret->data->push_back('@'); for (int i = start; i <= end; i++) ret->data->push_back((*data)[i]); ret->length = end - start + 1; ret->sequenceLabel = sequenceLabel; ret->inputLabel = inputLabel; return ret; } ///////////////////////////////////////////////////////////////// // Sequence::AddGaps() // // Given an SafeVector containing the skeleton for an // alignment and the identity of the current character, this // routine will create a new sequence with all necesssary gaps added. // For instance, // alignment = "XXXBBYYYBBYYXX" // id = 'X' // will perform the transformation // "ATGCAGTCA" --> "ATGCC---GT--CA" // (XXXBBYYYBBYYXX) ///////////////////////////////////////////////////////////////// Sequence *AddGaps(SafeVector *alignment, char id) { Sequence *ret = new Sequence(); assert(ret); ret->isValid = isValid; ret->header = header; ret->data = new SafeVector; assert(ret->data); ret->length = (int) alignment->size(); ret->sequenceLabel = sequenceLabel; ret->inputLabel = inputLabel; ret->data->push_back('@'); SafeVector::iterator dataIter = data->begin() + 1; for (SafeVector::iterator iter = alignment->begin(); iter != alignment->end(); ++iter) { if (*iter == 'B' || *iter == id) { ret->data->push_back(*dataIter); ++dataIter; } else ret->data->push_back('-'); } return ret; } ///////////////////////////////////////////////////////////////// // Sequence::GetString() // // Returns the sequence as a string with gaps removed. ///////////////////////////////////////////////////////////////// string GetString() { string s = ""; for (int i = 1; i <= length; i++) { if ((*data)[i] != '-') s += (*data)[i]; } return s; } ///////////////////////////////////////////////////////////////// // Sequence::GetMapping() // // Returns a SafeVector containing the indices of every // character in the sequence. For instance, if the data is // "ATGCC---GT--CA", the method returns {1,2,3,4,5,9,10,13,14}. ///////////////////////////////////////////////////////////////// SafeVector *GetMapping() const { SafeVector *ret = new SafeVector(1, 0); for (int i = 1; i <= length; i++) { if ((*data)[i] != '-') ret->push_back(i); } return ret; } ///////////////////////////////////////////////////////////////// // Sequence::Highlight() // // Changes all positions with score >= cutoff to upper case and // all positions with score < cutoff to lower case. ///////////////////////////////////////////////////////////////// void Highlight(const SafeVector &scores, const float cutoff) { for (int i = 1; i <= length; i++) { if (scores[i - 1] >= cutoff) (*data)[i] = toupper((*data)[i]); else (*data)[i] = tolower((*data)[i]); } } }; #endif