1 ////////////////////////////////////////////////////////////////
4 // Utilities for reading/writing multiple sequence data.
5 /////////////////////////////////////////////////////////////////
7 #ifndef MULTISEQUENCE_H
8 #define MULTISEQUENCE_H
17 #include "SafeVector.h"
19 #include "FileBuffer.h"
21 /////////////////////////////////////////////////////////////////
24 // Class for multiple sequence alignment input/output.
25 /////////////////////////////////////////////////////////////////
29 SafeVector<Sequence *> *sequences;
33 /////////////////////////////////////////////////////////////////
34 // MultiSequence::MultiSequence()
36 // Default constructor.
37 /////////////////////////////////////////////////////////////////
39 MultiSequence () : sequences (NULL) {}
41 /////////////////////////////////////////////////////////////////
42 // MultiSequence::MultiSequence()
44 // Constructor. Load MFA from a FileBuffer object.
45 /////////////////////////////////////////////////////////////////
47 MultiSequence (FileBuffer &infile) : sequences (NULL) {
51 /////////////////////////////////////////////////////////////////
52 // MultiSequence::MultiSequence()
54 // Constructor. Load MFA from a filename.
55 /////////////////////////////////////////////////////////////////
57 MultiSequence (const string &filename) : sequences (NULL){
61 /////////////////////////////////////////////////////////////////
62 // MultiSequence::~MultiSequence()
64 // Destructor. Gets rid of sequence objects contained in the
65 // multiple alignment.
66 /////////////////////////////////////////////////////////////////
70 // if sequences allocated
74 for (SafeVector<Sequence *>::iterator iter = sequences->begin(); iter != sequences->end(); ++iter){
80 // free sequence vector
86 /////////////////////////////////////////////////////////////////
87 // MultiSequence::LoadMFA()
89 // Load MFA from a filename.
90 /////////////////////////////////////////////////////////////////
92 void LoadMFA (const string &filename, bool stripGaps = false){
95 FileBuffer infile (filename.c_str());
98 cerr << "ERROR: Could not open file '" << filename << "' for reading." << endl;
102 // if successful, then load using other LoadMFA() routine
103 LoadMFA (infile, stripGaps);
108 /////////////////////////////////////////////////////////////////
109 // MultiSequence::LoadMFA()
111 // Load MSF from a FileBuffer object.
112 /////////////////////////////////////////////////////////////////
114 void ParseMSF (FileBuffer &infile, string header, bool stripGaps = false){
116 SafeVector<SafeVector<char> *> seqData;
117 SafeVector<string> seqNames;
118 SafeVector<int> seqLengths;
122 bool missingHeader = false;
123 bool clustalW = false;
125 // read until data starts
126 while (!infile.eof() && header.find ("..", 0) == string::npos){
127 if (header.find ("CLUSTAL", 0) == 0 || header.find ("PROBCONS", 0) == 0){
131 infile.GetLine (header);
132 if (header.find ("//", 0) != string::npos){
133 missingHeader = true;
138 // read until end-of-file
140 infile.GetLine (header);
141 if (infile.eof()) break;
147 // check if there's anything on this line
150 // clustalw name parsing
152 if (!isspace(header[0]) && find (seqNames.begin(), seqNames.end(), word) == seqNames.end()){
153 seqNames.push_back (word);
154 seqData.push_back (new SafeVector<char>());
155 seqLengths.push_back (0);
156 seqData[(int) seqData.size() - 1]->push_back ('@');
160 // look for new sequence label
161 if (word == string ("Name:")){
163 seqNames.push_back (word);
164 seqData.push_back (new SafeVector<char>());
165 seqLengths.push_back (0);
166 seqData[(int) seqData.size() - 1]->push_back ('@');
172 // check if this is sequence data
173 else if (find (seqNames.begin(), seqNames.end(), word) != seqNames.end()){
174 int index = find (seqNames.begin(), seqNames.end(), word) - seqNames.begin();
176 // read all remaining characters on the line
179 if (isspace (ch)) continue;
180 if (ch >= 'a' && ch <= 'z') ch = ch - 'a' + 'A';
181 if (ch == '.') ch = '-';
182 if (stripGaps && ch == '-') continue;
183 if (!((ch >= 'A' && ch <= 'Z') || ch == '*' || ch == '-')){
184 cerr << "ERROR: Unknown character encountered: " << ch << endl;
188 // everything's ok so far, so just store this character.
189 seqData[index]->push_back (ch);
193 else if (missingHeader){
194 seqNames.push_back (word);
195 seqData.push_back (new SafeVector<char>());
196 seqLengths.push_back (0);
197 seqData[(int) seqData.size() - 1]->push_back ('@');
199 int index = (int) seqNames.size() - 1;
201 // read all remaining characters on the line
204 if (isspace (ch)) continue;
205 if (ch >= 'a' && ch <= 'z') ch = ch - 'a' + 'A';
206 if (ch == '.') ch = '-';
207 if (stripGaps && ch == '-') continue;
208 if (!((ch >= 'A' && ch <= 'Z') || ch == '*' || ch == '-')){
209 cerr << "ERROR: Unknown character encountered: " << ch << endl;
213 // everything's ok so far, so just store this character.
214 seqData[index]->push_back (ch);
222 if (seqNames.size() == 0){
223 cerr << "ERROR: No sequences read!" << endl;
228 sequences = new SafeVector<Sequence *>;
229 for (int i = 0; i < (int) seqNames.size(); i++){
230 if (seqLengths[i] == 0){
231 cerr << "ERROR: Sequence of zero length!" << endl;
234 Sequence *seq = new Sequence (seqData[i], seqNames[i], seqLengths[i], i, i);
235 sequences->push_back (seq);
239 /////////////////////////////////////////////////////////////////
240 // MultiSequence::LoadMFA()
242 // Load MFA from a FileBuffer object.
243 /////////////////////////////////////////////////////////////////
245 void LoadMFA (FileBuffer &infile, bool stripGaps = false){
247 // check to make sure that file reading is ok
249 cerr << "ERROR: Error reading file." << endl;
253 // read all sequences
256 // get the sequence label as being the current # of sequences
257 // NOTE: sequence labels here are zero-based
258 int index = (!sequences) ? 0 : sequences->size();
261 Sequence *seq = new Sequence (infile, stripGaps);
264 // check if alternative file format (i.e. not MFA)
266 string header = seq->GetHeader();
267 if (header.length() > 0 && header[0] != '>'){
270 ParseMSF (infile, header);
278 seq->SetLabel (index);
280 // add the sequence to the list of current sequences
281 if (!sequences) sequences = new SafeVector<Sequence *>;
282 sequences->push_back (seq);
285 // make sure at least one sequence was read
287 cerr << "ERROR: No sequences read." << endl;
292 /////////////////////////////////////////////////////////////////
293 // MultiSequence::AddSequence()
295 // Add another sequence to an existing sequence list
296 /////////////////////////////////////////////////////////////////
298 void AddSequence (Sequence *sequence){
300 assert (!sequence->Fail());
303 if (!sequences) sequences = new SafeVector<Sequence *>;
304 sequences->push_back (sequence);
307 /////////////////////////////////////////////////////////////////
308 // MultiSequence::RemoveSequence()
310 // Remove a sequence from the MultiSequence
311 /////////////////////////////////////////////////////////////////
313 void RemoveSequence (int index){
316 assert (index >= 0 && index < (int) sequences->size());
317 delete (*sequences)[index];
319 sequences->erase (sequences->begin() + index);
322 /////////////////////////////////////////////////////////////////
323 // MultiSequence::WriteMFA()
325 // Write MFA to the outfile. Allows the user to specify the
326 // number of columns for the output. Also, useIndices determines
327 // whether or not the actual sequence comments will be printed
328 // out or whether the artificially assigned sequence labels will
330 /////////////////////////////////////////////////////////////////
332 void WriteMFA (ostream &outfile, int numColumns = 60, bool useIndices = false){
333 if (!sequences) return;
335 // loop through all sequences and write them out
336 for (SafeVector<Sequence *>::iterator iter = sequences->begin(); iter != sequences->end(); ++iter){
337 (*iter)->WriteMFA (outfile, numColumns, useIndices);
341 /////////////////////////////////////////////////////////////////
342 // MultiSequence::GetAnnotationChar()
344 // Return CLUSTALW annotation for column.
345 /////////////////////////////////////////////////////////////////
347 char GetAnnotationChar (SafeVector<char> &column){
348 SafeVector<int> counts (256, 0);
349 int allChars = (int) column.size();
351 for (int i = 0; i < allChars; i++){
352 counts[(unsigned char) toupper(column[i])]++;
355 allChars -= counts[(unsigned char) '-'];
356 if (allChars == 1) return ' ';
358 for (int i = 0; i < 256; i++) if ((char) i != '-' && counts[i] == allChars) return '*';
360 if (counts[(unsigned char) 'S'] +
361 counts[(unsigned char) 'T'] +
362 counts[(unsigned char) 'A'] == allChars)
365 if (counts[(unsigned char) 'N'] +
366 counts[(unsigned char) 'E'] +
367 counts[(unsigned char) 'Q'] +
368 counts[(unsigned char) 'K'] == allChars)
371 if (counts[(unsigned char) 'N'] +
372 counts[(unsigned char) 'H'] +
373 counts[(unsigned char) 'Q'] +
374 counts[(unsigned char) 'K'] == allChars)
377 if (counts[(unsigned char) 'N'] +
378 counts[(unsigned char) 'D'] +
379 counts[(unsigned char) 'E'] +
380 counts[(unsigned char) 'Q'] == allChars)
383 if (counts[(unsigned char) 'Q'] +
384 counts[(unsigned char) 'H'] +
385 counts[(unsigned char) 'R'] +
386 counts[(unsigned char) 'K'] == allChars)
389 if (counts[(unsigned char) 'M'] +
390 counts[(unsigned char) 'I'] +
391 counts[(unsigned char) 'L'] +
392 counts[(unsigned char) 'V'] == allChars)
395 if (counts[(unsigned char) 'M'] +
396 counts[(unsigned char) 'I'] +
397 counts[(unsigned char) 'L'] +
398 counts[(unsigned char) 'F'] == allChars)
401 if (counts[(unsigned char) 'H'] +
402 counts[(unsigned char) 'Y'] == allChars)
405 if (counts[(unsigned char) 'F'] +
406 counts[(unsigned char) 'Y'] +
407 counts[(unsigned char) 'W'] == allChars)
410 if (counts[(unsigned char) 'C'] +
411 counts[(unsigned char) 'S'] +
412 counts[(unsigned char) 'A'] == allChars)
415 if (counts[(unsigned char) 'A'] +
416 counts[(unsigned char) 'T'] +
417 counts[(unsigned char) 'V'] == allChars)
420 if (counts[(unsigned char) 'S'] +
421 counts[(unsigned char) 'A'] +
422 counts[(unsigned char) 'G'] == allChars)
425 if (counts[(unsigned char) 'S'] +
426 counts[(unsigned char) 'T'] +
427 counts[(unsigned char) 'N'] +
428 counts[(unsigned char) 'K'] == allChars)
431 if (counts[(unsigned char) 'S'] +
432 counts[(unsigned char) 'T'] +
433 counts[(unsigned char) 'P'] +
434 counts[(unsigned char) 'A'] == allChars)
437 if (counts[(unsigned char) 'S'] +
438 counts[(unsigned char) 'G'] +
439 counts[(unsigned char) 'N'] +
440 counts[(unsigned char) 'D'] == allChars)
443 if (counts[(unsigned char) 'S'] +
444 counts[(unsigned char) 'N'] +
445 counts[(unsigned char) 'D'] +
446 counts[(unsigned char) 'E'] +
447 counts[(unsigned char) 'Q'] +
448 counts[(unsigned char) 'K'] == allChars)
451 if (counts[(unsigned char) 'N'] +
452 counts[(unsigned char) 'D'] +
453 counts[(unsigned char) 'E'] +
454 counts[(unsigned char) 'Q'] +
455 counts[(unsigned char) 'H'] +
456 counts[(unsigned char) 'K'] == allChars)
459 if (counts[(unsigned char) 'N'] +
460 counts[(unsigned char) 'E'] +
461 counts[(unsigned char) 'H'] +
462 counts[(unsigned char) 'Q'] +
463 counts[(unsigned char) 'R'] +
464 counts[(unsigned char) 'K'] == allChars)
467 if (counts[(unsigned char) 'F'] +
468 counts[(unsigned char) 'V'] +
469 counts[(unsigned char) 'L'] +
470 counts[(unsigned char) 'I'] +
471 counts[(unsigned char) 'M'] == allChars)
474 if (counts[(unsigned char) 'H'] +
475 counts[(unsigned char) 'F'] +
476 counts[(unsigned char) 'Y'] == allChars)
482 /////////////////////////////////////////////////////////////////
483 // MultiSequence::WriteALN()
485 // Write ALN to the outfile. Allows the user to specify the
486 // number of columns for the output.
487 /////////////////////////////////////////////////////////////////
489 void WriteALN (ostream &outfile, int numColumns = 60){
490 if (!sequences) return;
492 outfile << "PROBCONS version " << VERSION << " multiple sequence alignment" << endl;
494 int longestComment = 0;
495 SafeVector<SafeVector<char>::iterator> ptrs (GetNumSequences());
496 SafeVector<int> lengths (GetNumSequences());
497 for (int i = 0; i < GetNumSequences(); i++){
498 ptrs[i] = GetSequence (i)->GetDataPtr();
499 lengths[i] = GetSequence (i)->GetLength();
500 longestComment = max (longestComment, (int) GetSequence(i)->GetName().length());
504 int writtenChars = 0;
505 bool allDone = false;
511 // loop through all sequences and write them out
512 for (int i = 0; i < GetNumSequences(); i++){
514 if (writtenChars < lengths[i]){
515 outfile << GetSequence(i)->GetName();
516 for (int j = 0; j < longestComment - (int) GetSequence(i)->GetName().length(); j++)
519 for (int j = 0; j < numColumns; j++){
520 if (writtenChars + j < lengths[i])
521 outfile << ptrs[i][writtenChars + j + 1];
528 if (writtenChars + numColumns < lengths[i]) allDone = false;
532 // write annotation line
533 for (int j = 0; j < longestComment; j++)
536 for (int j = 0; j < numColumns; j++){
537 SafeVector<char> column;
539 for (int i = 0; i < GetNumSequences(); i++)
540 if (writtenChars + j < lengths[i])
541 column.push_back (ptrs[i][writtenChars + j + 1]);
543 if (column.size() > 0)
544 outfile << GetAnnotationChar (column);
548 writtenChars += numColumns;
552 /////////////////////////////////////////////////////////////////
553 // MultiSequence::GetSequence()
555 // Retrieve a sequence from the MultiSequence object.
556 /////////////////////////////////////////////////////////////////
558 Sequence* GetSequence (int i){
560 assert (0 <= i && i < (int) sequences->size());
562 return (*sequences)[i];
565 /////////////////////////////////////////////////////////////////
566 // MultiSequence::GetSequence()
568 // Retrieve a sequence from the MultiSequence object
570 /////////////////////////////////////////////////////////////////
572 const Sequence* GetSequence (int i) const {
574 assert (0 <= i && i < (int) sequences->size());
576 return (*sequences)[i];
579 /////////////////////////////////////////////////////////////////
580 // MultiSequence::GetNumSequences()
582 // Returns the number of sequences in the MultiSequence.
583 /////////////////////////////////////////////////////////////////
585 int GetNumSequences () const {
586 if (!sequences) return 0;
587 return (int) sequences->size();
590 /////////////////////////////////////////////////////////////////
591 // MultiSequence::SortByHeader()
593 // Organizes the sequences according to their sequence headers
594 // in ascending order.
595 /////////////////////////////////////////////////////////////////
597 void SortByHeader () {
600 // a quick and easy O(n^2) sort
601 for (int i = 0; i < (int) sequences->size()-1; i++){
602 for (int j = i+1; j < (int) sequences->size(); j++){
603 if ((*sequences)[i]->GetHeader() > (*sequences)[j]->GetHeader())
604 swap ((*sequences)[i], (*sequences)[j]);
609 /////////////////////////////////////////////////////////////////
610 // MultiSequence::SortByLabel()
612 // Organizes the sequences according to their sequence labels
613 // in ascending order.
614 /////////////////////////////////////////////////////////////////
616 void SortByLabel () {
619 // a quick and easy O(n^2) sort
620 for (int i = 0; i < (int) sequences->size()-1; i++){
621 for (int j = i+1; j < (int) sequences->size(); j++){
622 if ((*sequences)[i]->GetSortLabel() > (*sequences)[j]->GetSortLabel())
623 swap ((*sequences)[i], (*sequences)[j]);
628 /////////////////////////////////////////////////////////////////
629 // MultiSequence::SaveOrdering()
631 // Relabels sequences so as to preserve the current ordering.
632 /////////////////////////////////////////////////////////////////
634 void SaveOrdering () {
637 for (int i = 0; i < (int) sequences->size(); i++)
638 (*sequences)[i]->SetSortLabel (i);
641 /////////////////////////////////////////////////////////////////
642 // MultiSequence::Project()
644 // Given a set of indices, extract all sequences from the current
645 // MultiSequence object whose index is included in the set.
646 // Then, project the multiple alignments down to the desired
647 // subset, and return the projection as a new MultiSequence
649 /////////////////////////////////////////////////////////////////
651 MultiSequence *Project (const set<int> &indices){
652 SafeVector<SafeVector<char>::iterator> oldPtrs (indices.size());
653 SafeVector<SafeVector<char> *> newPtrs (indices.size());
655 assert (indices.size() != 0);
659 for (set<int>::const_iterator iter = indices.begin(); iter != indices.end(); ++iter){
660 oldPtrs[i++] = GetSequence (*iter)->GetDataPtr();
663 // compute new length
664 int oldLength = GetSequence (*indices.begin())->GetLength();
666 for (i = 1; i <= oldLength; i++){
668 // check to see if there is a gap in every sequence of the set
670 for (int j = 0; !found && j < (int) indices.size(); j++)
671 found = (oldPtrs[j][i] != '-');
673 // if not, then this column counts towards the sequence length
674 if (found) newLength++;
677 // build new alignments
678 for (i = 0; i < (int) indices.size(); i++){
679 newPtrs[i] = new SafeVector<char>(); assert (newPtrs[i]);
680 newPtrs[i]->push_back ('@');
683 // add all needed columns
684 for (i = 1; i <= oldLength; i++){
686 // make sure column is not gapped in all sequences in the set
688 for (int j = 0; !found && j < (int) indices.size(); j++)
689 found = (oldPtrs[j][i] != '-');
691 // if not, then add it
693 for (int j = 0; j < (int) indices.size(); j++)
694 newPtrs[j]->push_back (oldPtrs[j][i]);
698 // wrap sequences in MultiSequence object
699 MultiSequence *ret = new MultiSequence();
701 for (set<int>::const_iterator iter = indices.begin(); iter != indices.end(); ++iter){
702 ret->AddSequence (new Sequence (newPtrs[i++], GetSequence (*iter)->GetHeader(), newLength,
703 GetSequence (*iter)->GetSortLabel(), GetSequence (*iter)->GetLabel()));