1 /////////////////////////////////////////////////////////////////
4 // Class for reading/manipulating single sequence character data.
5 /////////////////////////////////////////////////////////////////
15 #include "SafeVector.h"
16 #include "FileBuffer.h"
18 /////////////////////////////////////////////////////////////////
21 // Class for storing sequence information.
22 /////////////////////////////////////////////////////////////////
26 bool isValid; // a boolean indicating whether the sequence data is valid or not
27 string header; // string containing the comment line of the FASTA file
28 SafeVector<char> *data; // pointer to character data
29 int length; // length of the sequence
30 int sequenceLabel; // integer sequence label, typically to indicate the ordering of sequences
31 // in a Multi-FASTA file
32 int inputLabel; // position of sequence in original input
34 /////////////////////////////////////////////////////////////////
35 // Sequence::Sequence()
37 // Default constructor. Does nothing.
38 /////////////////////////////////////////////////////////////////
40 Sequence () : isValid (false), header (""), data (NULL), length (0), sequenceLabel (0), inputLabel (0) {}
44 /////////////////////////////////////////////////////////////////
45 // Sequence::Sequence()
47 // Constructor. Reads the sequence from a FileBuffer.
48 /////////////////////////////////////////////////////////////////
50 Sequence (FileBuffer &infile, bool stripGaps = false) : isValid (false), header ("~"), data (NULL), length(0), sequenceLabel (0), inputLabel (0) {
52 // read until the first non-blank line
53 while (!infile.eof()){
54 infile.GetLine (header);
55 if (header.length() != 0) break;
58 // check to make sure that it is a correct header line
59 if (header[0] == '>'){
61 // if so, remove the leading ">"
62 header = header.substr (1);
64 // remove any leading or trailing white space in the header comment
65 while (header.length() > 0 && isspace (header[0])) header = header.substr (1);
66 while (header.length() > 0 && isspace (header[header.length() - 1])) header = header.substr(0, header.length() - 1);
68 // get ready to read the data[] array; note that data[0] is always '@'
70 data = new SafeVector<char>; assert (data);
71 data->push_back ('@');
73 // get a character from the file
74 while (infile.Get(ch)){
76 // if we've reached a new comment line, put the character back and stop
77 if (ch == '>'){ infile.UnGet(); break; }
80 if (isspace (ch)) continue;
82 // substitute gap character
83 if (ch == '.') ch = '-';
84 if (stripGaps && ch == '-') continue;
86 // check for known characters
87 if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '*' || ch == '-')){
88 cerr << "ERROR: Unknown character encountered: " << ch << endl;
92 // everything's ok so far, so just store this character.
97 // sequence must contain data in order to be valid
107 /////////////////////////////////////////////////////////////////
108 // Sequence::Sequence()
110 // Constructor. Builds a sequence from existing data. Note
111 // that the data must use one-based indexing where data[0] should
113 /////////////////////////////////////////////////////////////////
115 Sequence (SafeVector<char> *data, string header, int length, int sequenceLabel, int inputLabel) :
116 isValid (data != NULL), header(header), data(data), length (length), sequenceLabel (sequenceLabel), inputLabel (inputLabel) {
118 assert ((*data)[0] == '@');
121 /////////////////////////////////////////////////////////////////
122 // Sequence::Sequence()
124 // Destructor. Release allocated memory.
125 /////////////////////////////////////////////////////////////////
136 /////////////////////////////////////////////////////////////////
137 // Sequence::GetHeader()
139 // Return the string comment associated with this sequence.
140 /////////////////////////////////////////////////////////////////
142 string GetHeader () const {
146 /////////////////////////////////////////////////////////////////
147 // Sequence::GetName()
149 // Return the first word of the string comment associated with this sequence.
150 /////////////////////////////////////////////////////////////////
152 string GetName () const {
154 sscanf (header.c_str(), "%s", name);
158 /////////////////////////////////////////////////////////////////
159 // Sequence::GetDataPtr()
161 // Return the iterator to data associated with this sequence.
162 /////////////////////////////////////////////////////////////////
164 SafeVector<char>::iterator GetDataPtr(){
167 return data->begin();
170 /////////////////////////////////////////////////////////////////
171 // Sequence::GetPosition()
173 // Return the character at position i. Recall that the character
174 // data is stored with one-based indexing.
175 /////////////////////////////////////////////////////////////////
177 char GetPosition (int i) const {
180 assert (i >= 1 && i <= length);
184 /////////////////////////////////////////////////////////////////
185 // Sequence::SetLabel()
187 // Sets the sequence label to i.
188 /////////////////////////////////////////////////////////////////
190 void SetLabel (int i){
196 /////////////////////////////////////////////////////////////////
197 // Sequence::SetSortLabel()
199 // Sets the sequence sorting label to i.
200 /////////////////////////////////////////////////////////////////
202 void SetSortLabel (int i){
207 /////////////////////////////////////////////////////////////////
208 // Sequence::GetLabel()
210 // Retrieves the input label.
211 /////////////////////////////////////////////////////////////////
213 int GetLabel () const {
218 /////////////////////////////////////////////////////////////////
219 // Sequence::GetSortLabel()
221 // Retrieves the sorting label.
222 /////////////////////////////////////////////////////////////////
224 int GetSortLabel () const {
226 return sequenceLabel;
229 /////////////////////////////////////////////////////////////////
232 // Checks to see if the sequence successfully loaded.
233 /////////////////////////////////////////////////////////////////
239 /////////////////////////////////////////////////////////////////
240 // Sequence::Length()
242 // Returns the length of the sequence.
243 /////////////////////////////////////////////////////////////////
245 int GetLength () const {
251 /////////////////////////////////////////////////////////////////
252 // Sequence::WriteMFA()
254 // Writes the sequence to outfile in MFA format. Uses numColumns
255 // columns per line. If useIndex is set to false, then the
256 // header is printed as normal, but if useIndex is true, then
257 // ">S###" is printed where ### represents the sequence label.
258 /////////////////////////////////////////////////////////////////
260 void WriteMFA (ostream &outfile, int numColumns, bool useIndex = false) const {
263 assert (!outfile.fail());
267 outfile << ">S" << GetLabel() << endl;
269 outfile << ">" << header << endl;
271 // print out character data
273 for (; ct <= length; ct++){
274 outfile << (*data)[ct];
275 if (ct % numColumns == 0) outfile << endl;
277 if ((ct-1) % numColumns != 0) outfile << endl;
280 /////////////////////////////////////////////////////////////////
283 // Returns a new deep copy of the seqeuence.
284 /////////////////////////////////////////////////////////////////
286 Sequence *Clone () const {
287 Sequence *ret = new Sequence();
290 ret->isValid = isValid;
291 ret->header = header;
292 ret->data = new SafeVector<char>; assert (ret->data);
293 *(ret->data) = *data;
294 ret->length = length;
295 ret->sequenceLabel = sequenceLabel;
296 ret->inputLabel = inputLabel;
301 /////////////////////////////////////////////////////////////////
302 // Sequence::GetRange()
304 // Returns a new sequence object consisting of a range of
305 // characters from the current seuquence.
306 /////////////////////////////////////////////////////////////////
308 Sequence *GetRange (int start, int end) const {
309 Sequence *ret = new Sequence();
312 assert (start >= 1 && start <= length);
313 assert (end >= 1 && end <= length);
314 assert (start <= end);
316 ret->isValid = isValid;
317 ret->header = header;
318 ret->data = new SafeVector<char>; assert (ret->data);
319 ret->data->push_back ('@');
320 for (int i = start; i <= end; i++)
321 ret->data->push_back ((*data)[i]);
322 ret->length = end - start + 1;
323 ret->sequenceLabel = sequenceLabel;
324 ret->inputLabel = inputLabel;
329 /////////////////////////////////////////////////////////////////
330 // Sequence::AddGaps()
332 // Given an SafeVector<char> containing the skeleton for an
333 // alignment and the identity of the current character, this
334 // routine will create a new sequence with all necesssary gaps added.
336 // alignment = "XXXBBYYYBBYYXX"
338 // will perform the transformation
339 // "ATGCAGTCA" --> "ATGCC---GT--CA"
341 /////////////////////////////////////////////////////////////////
343 Sequence *AddGaps (SafeVector<char> *alignment, char id){
344 Sequence *ret = new Sequence();
347 ret->isValid = isValid;
348 ret->header = header;
349 ret->data = new SafeVector<char>; assert (ret->data);
350 ret->length = (int) alignment->size();
351 ret->sequenceLabel = sequenceLabel;
352 ret->inputLabel = inputLabel;
353 ret->data->push_back ('@');
355 SafeVector<char>::iterator dataIter = data->begin() + 1;
356 for (SafeVector<char>::iterator iter = alignment->begin(); iter != alignment->end(); ++iter){
357 if (*iter == 'B' || *iter == id){
358 ret->data->push_back (*dataIter);
362 ret->data->push_back ('-');
368 /////////////////////////////////////////////////////////////////
369 // Sequence::GetString()
371 // Returns the sequence as a string with gaps removed.
372 /////////////////////////////////////////////////////////////////
376 for (int i = 1; i <= length; i++){
377 if ((*data)[i] != '-') s += (*data)[i];
383 /////////////////////////////////////////////////////////////////
384 // Sequence::GetMapping()
386 // Returns a SafeVector<int> containing the indices of every
387 // character in the sequence. For instance, if the data is
388 // "ATGCC---GT--CA", the method returns {1,2,3,4,5,9,10,13,14}.
389 /////////////////////////////////////////////////////////////////
391 SafeVector<int> *GetMapping () const {
392 SafeVector<int> *ret = new SafeVector<int>(1, 0);
393 for (int i = 1; i <= length; i++){
394 if ((*data)[i] != '-') ret->push_back (i);
399 /////////////////////////////////////////////////////////////////
400 // Sequence::Highlight()
402 // Changes all positions with score >= cutoff to upper case and
403 // all positions with score < cutoff to lower case.
404 /////////////////////////////////////////////////////////////////
406 void Highlight (const SafeVector<float> &scores, const float cutoff){
407 for (int i = 1; i <= length; i++){
408 if (scores[i-1] >= cutoff)
409 (*data)[i] = toupper ((*data)[i]);
411 (*data)[i] = tolower ((*data)[i]);