--- /dev/null
+/////////////////////////////////////////////////////////////////
+// Sequence.h
+//
+// Class for reading/manipulating single sequence character data.
+/////////////////////////////////////////////////////////////////
+
+#ifndef SEQUENCE_H
+#define SEQUENCE_H
+
+#include <string>
+#include <fstream>
+#include <iostream>
+#include <cctype>
+#include <cstdlib>
+#include "SafeVector.h"
+#include "FileBuffer.h"
+
+/////////////////////////////////////////////////////////////////
+// Sequence
+//
+// Class for storing sequence information.
+/////////////////////////////////////////////////////////////////
+
+class Sequence {
+
+ bool isValid; // a boolean indicating whether the sequence data is valid or not
+ string header; // string containing the comment line of the FASTA file
+ SafeVector<char> *data; // pointer to character data
+ int length; // length of the sequence
+ int sequenceLabel; // integer sequence label, typically to indicate the ordering of sequences
+ // in a Multi-FASTA file
+ int inputLabel; // position of sequence in original input
+
+ /////////////////////////////////////////////////////////////////
+ // Sequence::Sequence()
+ //
+ // Default constructor. Does nothing.
+ /////////////////////////////////////////////////////////////////
+
+ Sequence() :
+ isValid(false), header(""), data(NULL), length(0), sequenceLabel(0), inputLabel(
+ 0) {
+ }
+
+public:
+
+ /////////////////////////////////////////////////////////////////
+ // Sequence::Sequence()
+ //
+ // Constructor. Reads the sequence from a FileBuffer.
+ /////////////////////////////////////////////////////////////////
+
+ Sequence(FileBuffer &infile, bool stripGaps = false) :
+ isValid(false), header("~"), data(NULL), length(0), sequenceLabel(
+ 0), inputLabel(0) {
+
+ // read until the first non-blank line
+ while (!infile.eof()) {
+ infile.GetLine(header);
+ if (header.length() != 0)
+ break;
+ }
+
+ // check to make sure that it is a correct header line
+ if (header[0] == '>') {
+
+ // if so, remove the leading ">"
+ header = header.substr(1);
+
+ // remove any leading or trailing white space in the header comment
+ while (header.length() > 0 && isspace(header[0]))
+ header = header.substr(1);
+ while (header.length() > 0 && isspace(header[header.length() - 1]))
+ header = header.substr(0, header.length() - 1);
+
+ // get ready to read the data[] array; note that data[0] is always '@'
+ char ch;
+ data = new SafeVector<char>;
+ assert(data);
+ data->push_back('@');
+
+ // get a character from the file
+ while (infile.Get(ch)) {
+
+ // if we've reached a new comment line, put the character back and stop
+ if (ch == '>') {
+ infile.UnGet();
+ break;
+ }
+
+ // skip whitespace
+ if (isspace(ch))
+ continue;
+
+ // substitute gap character
+ if (ch == '.')
+ ch = '-';
+ if (stripGaps && ch == '-')
+ continue;
+
+ // check for known characters
+ if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))) {
+ cerr << "ERROR: Unknown character encountered: " << ch
+ << endl;
+ exit(1);
+ }
+
+ // everything's ok so far, so just store this character.
+ if (ch >= 'a' && ch <= 'z') {
+ ch = ch - 'a' + 'A';
+ } //change to upper case. fixed by Liu Yongchao, May 21, 2010
+
+ data->push_back(ch);
+ ++length;
+ }
+
+ // sequence must contain data in order to be valid
+ isValid = length > 0;
+ if (!isValid) {
+ delete data;
+ data = NULL;
+ }
+ }
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // Sequence::Sequence()
+ //
+ // Constructor. Builds a sequence from existing data. Note
+ // that the data must use one-based indexing where data[0] should
+ // be set to '@'.
+ /////////////////////////////////////////////////////////////////
+
+ Sequence(SafeVector<char> *data, string header, int length,
+ int sequenceLabel, int inputLabel) :
+ isValid(data != NULL), header(header), data(data), length(length), sequenceLabel(
+ sequenceLabel), inputLabel(inputLabel) {
+ assert(data);
+ assert((*data)[0] == '@');
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // Sequence::Sequence()
+ //
+ // Destructor. Release allocated memory.
+ /////////////////////////////////////////////////////////////////
+
+ ~Sequence() {
+ if (data) {
+ assert(isValid);
+ delete data;
+ data = NULL;
+ isValid = false;
+ }
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // Sequence::GetHeader()
+ //
+ // Return the string comment associated with this sequence.
+ /////////////////////////////////////////////////////////////////
+
+ string GetHeader() const {
+ return header;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // Sequence::GetName()
+ //
+ // Return the first word of the string comment associated with this sequence.
+ /////////////////////////////////////////////////////////////////
+
+ string GetName() const {
+ char name[1024];
+ sscanf(header.c_str(), "%s", name);
+ return string(name);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // Sequence::GetDataPtr()
+ //
+ // Return the iterator to data associated with this sequence.
+ /////////////////////////////////////////////////////////////////
+
+ SafeVector<char>::iterator GetDataPtr() {
+ assert(isValid);
+ assert(data);
+ return data->begin();
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // Sequence::GetPosition()
+ //
+ // Return the character at position i. Recall that the character
+ // data is stored with one-based indexing.
+ /////////////////////////////////////////////////////////////////
+
+ char GetPosition(int i) const {
+ assert(isValid);
+ assert(data);
+ assert(i >= 1 && i <= length);
+ return (*data)[i];
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // Sequence::SetLabel()
+ //
+ // Sets the sequence label to i.
+ /////////////////////////////////////////////////////////////////
+
+ void SetLabel(int i) {
+ assert(isValid);
+ sequenceLabel = i;
+ inputLabel = i;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // Sequence::SetSortLabel()
+ //
+ // Sets the sequence sorting label to i.
+ /////////////////////////////////////////////////////////////////
+
+ void SetSortLabel(int i) {
+ assert(isValid);
+ sequenceLabel = i;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // Sequence::GetLabel()
+ //
+ // Retrieves the input label.
+ /////////////////////////////////////////////////////////////////
+
+ int GetLabel() const {
+ assert(isValid);
+ return inputLabel;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // Sequence::GetSortLabel()
+ //
+ // Retrieves the sorting label.
+ /////////////////////////////////////////////////////////////////
+
+ int GetSortLabel() const {
+ assert(isValid);
+ return sequenceLabel;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // Sequence::Fail()
+ //
+ // Checks to see if the sequence successfully loaded.
+ /////////////////////////////////////////////////////////////////
+
+ bool Fail() const {
+ return !isValid;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // Sequence::Length()
+ //
+ // Returns the length of the sequence.
+ /////////////////////////////////////////////////////////////////
+
+ int GetLength() const {
+ assert(isValid);
+ assert(data);
+ return length;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // Sequence::WriteMFA()
+ //
+ // Writes the sequence to outfile in MFA format. Uses numColumns
+ // columns per line. If useIndex is set to false, then the
+ // header is printed as normal, but if useIndex is true, then
+ // ">S###" is printed where ### represents the sequence label.
+ /////////////////////////////////////////////////////////////////
+
+ void WriteMFA(ostream &outfile, int numColumns,
+ bool useIndex = false) const {
+ assert(isValid);
+ assert(data);
+ assert(!outfile.fail());
+
+ // print out heading
+ if (useIndex)
+ outfile << ">S" << GetLabel() << endl;
+ else
+ outfile << ">" << header << endl;
+
+ // print out character data
+ int ct = 1;
+ for (; ct <= length; ct++) {
+ outfile << (*data)[ct];
+ if (ct % numColumns == 0)
+ outfile << endl;
+ }
+ if ((ct - 1) % numColumns != 0)
+ outfile << endl;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // Sequence::Clone()
+ //
+ // Returns a new deep copy of the seqeuence.
+ /////////////////////////////////////////////////////////////////
+
+ Sequence *Clone() const {
+ Sequence *ret = new Sequence();
+ assert(ret);
+
+ ret->isValid = isValid;
+ ret->header = header;
+ ret->data = new SafeVector<char>;
+ assert(ret->data);
+ *(ret->data) = *data;
+ ret->length = length;
+ ret->sequenceLabel = sequenceLabel;
+ ret->inputLabel = inputLabel;
+
+ return ret;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // Sequence::GetRange()
+ //
+ // Returns a new sequence object consisting of a range of
+ // characters from the current seuquence.
+ /////////////////////////////////////////////////////////////////
+
+ Sequence *GetRange(int start, int end) const {
+ Sequence *ret = new Sequence();
+ assert(ret);
+
+ assert(start >= 1 && start <= length);
+ assert(end >= 1 && end <= length);
+ assert(start <= end);
+
+ ret->isValid = isValid;
+ ret->header = header;
+ ret->data = new SafeVector<char>;
+ assert(ret->data);
+ ret->data->push_back('@');
+ for (int i = start; i <= end; i++)
+ ret->data->push_back((*data)[i]);
+ ret->length = end - start + 1;
+ ret->sequenceLabel = sequenceLabel;
+ ret->inputLabel = inputLabel;
+
+ return ret;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // Sequence::AddGaps()
+ //
+ // Given an SafeVector<char> containing the skeleton for an
+ // alignment and the identity of the current character, this
+ // routine will create a new sequence with all necesssary gaps added.
+ // For instance,
+ // alignment = "XXXBBYYYBBYYXX"
+ // id = 'X'
+ // will perform the transformation
+ // "ATGCAGTCA" --> "ATGCC---GT--CA"
+ // (XXXBBYYYBBYYXX)
+ /////////////////////////////////////////////////////////////////
+
+ Sequence *AddGaps(SafeVector<char> *alignment, char id) {
+ Sequence *ret = new Sequence();
+ assert(ret);
+
+ ret->isValid = isValid;
+ ret->header = header;
+ ret->data = new SafeVector<char>;
+ assert(ret->data);
+ ret->length = (int) alignment->size();
+ ret->sequenceLabel = sequenceLabel;
+ ret->inputLabel = inputLabel;
+ ret->data->push_back('@');
+
+ SafeVector<char>::iterator dataIter = data->begin() + 1;
+ for (SafeVector<char>::iterator iter = alignment->begin();
+ iter != alignment->end(); ++iter) {
+ if (*iter == 'B' || *iter == id) {
+ ret->data->push_back(*dataIter);
+ ++dataIter;
+ } else
+ ret->data->push_back('-');
+ }
+
+ return ret;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // Sequence::GetString()
+ //
+ // Returns the sequence as a string with gaps removed.
+ /////////////////////////////////////////////////////////////////
+
+ string GetString() {
+ string s = "";
+ for (int i = 1; i <= length; i++) {
+ if ((*data)[i] != '-')
+ s += (*data)[i];
+ }
+ return s;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // Sequence::GetMapping()
+ //
+ // Returns a SafeVector<int> containing the indices of every
+ // character in the sequence. For instance, if the data is
+ // "ATGCC---GT--CA", the method returns {1,2,3,4,5,9,10,13,14}.
+ /////////////////////////////////////////////////////////////////
+
+ SafeVector<int> *GetMapping() const {
+ SafeVector<int> *ret = new SafeVector<int>(1, 0);
+ for (int i = 1; i <= length; i++) {
+ if ((*data)[i] != '-')
+ ret->push_back(i);
+ }
+ return ret;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // Sequence::Highlight()
+ //
+ // Changes all positions with score >= cutoff to upper case and
+ // all positions with score < cutoff to lower case.
+ /////////////////////////////////////////////////////////////////
+
+ void Highlight(const SafeVector<float> &scores, const float cutoff) {
+ for (int i = 1; i <= length; i++) {
+ if (scores[i - 1] >= cutoff)
+ (*data)[i] = toupper((*data)[i]);
+ else
+ (*data)[i] = tolower((*data)[i]);
+ }
+ }
+};
+
+#endif