1 ////////////////////////////////////////////////////////////////
4 // Utilities for reading/writing multiple sequence data.
5 /////////////////////////////////////////////////////////////////
7 #ifndef MULTISEQUENCE_H
8 #define MULTISEQUENCE_H
17 #include "SafeVector.h"
19 #include "FileBuffer.h"
21 /////////////////////////////////////////////////////////////////
24 // Class for multiple sequence alignment input/output.
25 /////////////////////////////////////////////////////////////////
30 SafeVector<Sequence *> *sequences;
34 /////////////////////////////////////////////////////////////////
35 // MultiSequence::MultiSequence()
37 // Default constructor.
38 /////////////////////////////////////////////////////////////////
40 MultiSequence () : sequences (NULL) {}
42 /////////////////////////////////////////////////////////////////
43 // MultiSequence::MultiSequence()
45 // Constructor. Load MFA from a FileBuffer object.
46 /////////////////////////////////////////////////////////////////
48 MultiSequence (FileBuffer &infile) : sequences (NULL) {
52 /////////////////////////////////////////////////////////////////
53 // MultiSequence::MultiSequence()
55 // Constructor. Load MFA from a filename.
56 /////////////////////////////////////////////////////////////////
58 MultiSequence (const string &filename) : sequences (NULL){
62 /////////////////////////////////////////////////////////////////
63 // MultiSequence::~MultiSequence()
65 // Destructor. Gets rid of sequence objects contained in the
66 // multiple alignment.
67 /////////////////////////////////////////////////////////////////
71 // if sequences allocated
75 for (SafeVector<Sequence *>::iterator iter = sequences->begin(); iter != sequences->end(); ++iter){
81 // free sequence vector
87 /////////////////////////////////////////////////////////////////
88 // MultiSequence::LoadMFA()
90 // Load MFA from a filename.
91 /////////////////////////////////////////////////////////////////
93 void LoadMFA (const string &filename, bool stripGaps = false){
96 FileBuffer infile (filename.c_str());
99 cerr << "ERROR: Could not open file '" << filename << "' for reading." << endl;
103 // if successful, then load using other LoadMFA() routine
104 LoadMFA (infile, stripGaps);
109 /////////////////////////////////////////////////////////////////
110 // MultiSequence::LoadMFA()
112 // Load MSF from a FileBuffer object.
113 /////////////////////////////////////////////////////////////////
115 void ParseMSF (FileBuffer &infile, string header, bool stripGaps = false){
117 SafeVector<SafeVector<char> *> seqData;
118 SafeVector<string> seqNames;
119 SafeVector<int> seqLengths;
123 bool missingHeader = false;
124 bool clustalW = false;
126 // read until data starts
127 while (!infile.eof() && header.find ("..", 0) == string::npos){
128 if (header.find ("CLUSTAL", 0) == 0 || header.find ("PROBCONS", 0) == 0){
132 infile.GetLine (header);
133 if (header.find ("//", 0) != string::npos){
134 missingHeader = true;
139 // read until end-of-file
141 infile.GetLine (header);
142 if (infile.eof()) break;
148 // check if there's anything on this line
151 // clustalw name parsing
153 if (!isspace(header[0]) && find (seqNames.begin(), seqNames.end(), word) == seqNames.end()){
154 seqNames.push_back (word);
155 seqData.push_back (new SafeVector<char>());
156 seqLengths.push_back (0);
157 seqData[(int) seqData.size() - 1]->push_back ('@');
161 // look for new sequence label
162 if (word == string ("Name:")){
164 seqNames.push_back (word);
165 seqData.push_back (new SafeVector<char>());
166 seqLengths.push_back (0);
167 seqData[(int) seqData.size() - 1]->push_back ('@');
173 // check if this is sequence data
174 else if (find (seqNames.begin(), seqNames.end(), word) != seqNames.end()){
175 int index = find (seqNames.begin(), seqNames.end(), word) - seqNames.begin();
177 // read all remaining characters on the line
180 if (isspace (ch)) continue;
181 // if (ch >= 'a' && ch <= 'z') ch = ch - 'a' + 'A';
182 if (ch == '.') ch = '-';
183 if (stripGaps && ch == '-') continue;
185 if (!((ch >= 'A' && ch <= 'Z') || ch == '*' || ch == '-')){
186 cerr << "ERROR: Unknown character encountered: " << ch << endl;
190 if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '*' || ch == '-')){
191 cerr << "ERROR: Unknown character encountered: " << ch << endl;
195 // everything's ok so far, so just store this character.
196 seqData[index]->push_back (ch);
200 else if (missingHeader){
201 seqNames.push_back (word);
202 seqData.push_back (new SafeVector<char>());
203 seqLengths.push_back (0);
204 seqData[(int) seqData.size() - 1]->push_back ('@');
206 int index = (int) seqNames.size() - 1;
208 // read all remaining characters on the line
211 if (isspace (ch)) continue;
212 // if (ch >= 'a' && ch <= 'z') ch = ch - 'a' + 'A';
213 if (ch == '.') ch = '-';
214 if (stripGaps && ch == '-') continue;
216 if (!((ch >= 'A' && ch <= 'Z') || ch == '*' || ch == '-')){
217 cerr << "ERROR: Unknown character encountered: " << ch << endl;
221 if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '*' || ch == '-')){
222 cerr << "ERROR: Unknown character encountered: " << ch << endl;
226 // everything's ok so far, so just store this character.
227 seqData[index]->push_back (ch);
235 if (seqNames.size() == 0){
236 cerr << "ERROR: No sequences read!" << endl;
241 sequences = new SafeVector<Sequence *>;
242 for (int i = 0; i < (int) seqNames.size(); i++){
243 if (seqLengths[i] == 0){
244 cerr << "ERROR: Sequence of zero length!" << endl;
247 Sequence *seq = new Sequence (seqData[i], seqNames[i], seqLengths[i], i, i);
248 sequences->push_back (seq);
252 /////////////////////////////////////////////////////////////////
253 // MultiSequence::LoadMFA()
255 // Load MFA from a FileBuffer object.
256 /////////////////////////////////////////////////////////////////
258 void LoadMFA (FileBuffer &infile, bool stripGaps = false){
260 // check to make sure that file reading is ok
262 cerr << "ERROR: Error reading file." << endl;
266 // read all sequences
269 // get the sequence label as being the current # of sequences
270 // NOTE: sequence labels here are zero-based
271 int index = (!sequences) ? 0 : sequences->size();
274 Sequence *seq = new Sequence (infile, stripGaps);
277 // check if alternative file format (i.e. not MFA)
279 string header = seq->GetHeader();
280 if (header.length() > 0 && header[0] != '>'){
282 ParseMSF (infile, header);
290 seq->SetLabel (index);
292 // add the sequence to the list of current sequences
293 if (!sequences) sequences = new SafeVector<Sequence *>;
294 sequences->push_back (seq);
297 // make sure at least one sequence was read
299 cerr << "ERROR: No sequences read." << endl;
304 /////////////////////////////////////////////////////////////////
305 // MultiSequence::AddSequence()
307 // Add another sequence to an existing sequence list
308 /////////////////////////////////////////////////////////////////
310 void AddSequence (Sequence *sequence){
312 assert (!sequence->Fail());
315 if (!sequences) sequences = new SafeVector<Sequence *>;
316 sequences->push_back (sequence);
319 /////////////////////////////////////////////////////////////////
320 // MultiSequence::RemoveSequence()
322 // Remove a sequence from the MultiSequence
323 /////////////////////////////////////////////////////////////////
325 void RemoveSequence (int index){
328 assert (index >= 0 && index < (int) sequences->size());
329 delete (*sequences)[index];
331 sequences->erase (sequences->begin() + index);
334 /////////////////////////////////////////////////////////////////
335 // MultiSequence::WriteMFA()
337 // Write MFA to the outfile. Allows the user to specify the
338 // number of columns for the output. Also, useIndices determines
339 // whether or not the actual sequence comments will be printed
340 // out or whether the artificially assigned sequence labels will
342 /////////////////////////////////////////////////////////////////
344 void WriteMFA (ostream &outfile, string *ssCons = NULL, int numColumns = 60, bool useIndices = false){
345 if (!sequences) return;
347 // loop through all sequences and write them out
348 for (SafeVector<Sequence *>::iterator iter = sequences->begin(); iter != sequences->end(); ++iter){
349 (*iter)->WriteMFA (outfile, numColumns, useIndices);
353 if (ssCons != NULL) {
354 outfile << ">#=GC SS_cons" << endl;
355 int length = ssCons->length();
356 for (int i = 1; i < length; i++ ) {
357 outfile << ssCons->at(i);
360 if (numColumns <= count) {
370 void WriteMFAseq (ostream &outfile, int numColumns = 60, bool useIndices = false){
371 if (!sequences) return;
373 // loop through all sequences and write them out
374 for (SafeVector<Sequence *>::iterator iter = sequences->begin(); iter != sequences->end(); ++iter){
375 (*iter)->WriteMFA (outfile, numColumns, useIndices);
379 /////////////////////////////////////////////////////////////////
380 // MultiSequence::GetAnnotationChar()
382 // Return CLUSTALW annotation for column.
383 /////////////////////////////////////////////////////////////////
385 char GetAnnotationChar (SafeVector<char> &column){
386 SafeVector<int> counts (256, 0);
387 int allChars = (int) column.size();
389 for (int i = 0; i < allChars; i++){
390 counts[(unsigned char) toupper(column[i])]++;
393 allChars -= counts[(unsigned char) '-'];
394 if (allChars == 1) return ' ';
396 for (int i = 0; i < 256; i++) if ((char) i != '-' && counts[i] == allChars) return '*';
398 if (counts[(unsigned char) 'S'] +
399 counts[(unsigned char) 'T'] +
400 counts[(unsigned char) 'A'] == allChars)
403 if (counts[(unsigned char) 'N'] +
404 counts[(unsigned char) 'E'] +
405 counts[(unsigned char) 'Q'] +
406 counts[(unsigned char) 'K'] == allChars)
409 if (counts[(unsigned char) 'N'] +
410 counts[(unsigned char) 'H'] +
411 counts[(unsigned char) 'Q'] +
412 counts[(unsigned char) 'K'] == allChars)
415 if (counts[(unsigned char) 'N'] +
416 counts[(unsigned char) 'D'] +
417 counts[(unsigned char) 'E'] +
418 counts[(unsigned char) 'Q'] == allChars)
421 if (counts[(unsigned char) 'Q'] +
422 counts[(unsigned char) 'H'] +
423 counts[(unsigned char) 'R'] +
424 counts[(unsigned char) 'K'] == allChars)
427 if (counts[(unsigned char) 'M'] +
428 counts[(unsigned char) 'I'] +
429 counts[(unsigned char) 'L'] +
430 counts[(unsigned char) 'V'] == allChars)
433 if (counts[(unsigned char) 'M'] +
434 counts[(unsigned char) 'I'] +
435 counts[(unsigned char) 'L'] +
436 counts[(unsigned char) 'F'] == allChars)
439 if (counts[(unsigned char) 'H'] +
440 counts[(unsigned char) 'Y'] == allChars)
443 if (counts[(unsigned char) 'F'] +
444 counts[(unsigned char) 'Y'] +
445 counts[(unsigned char) 'W'] == allChars)
448 if (counts[(unsigned char) 'C'] +
449 counts[(unsigned char) 'S'] +
450 counts[(unsigned char) 'A'] == allChars)
453 if (counts[(unsigned char) 'A'] +
454 counts[(unsigned char) 'T'] +
455 counts[(unsigned char) 'V'] == allChars)
458 if (counts[(unsigned char) 'S'] +
459 counts[(unsigned char) 'A'] +
460 counts[(unsigned char) 'G'] == allChars)
463 if (counts[(unsigned char) 'S'] +
464 counts[(unsigned char) 'T'] +
465 counts[(unsigned char) 'N'] +
466 counts[(unsigned char) 'K'] == allChars)
469 if (counts[(unsigned char) 'S'] +
470 counts[(unsigned char) 'T'] +
471 counts[(unsigned char) 'P'] +
472 counts[(unsigned char) 'A'] == allChars)
475 if (counts[(unsigned char) 'S'] +
476 counts[(unsigned char) 'G'] +
477 counts[(unsigned char) 'N'] +
478 counts[(unsigned char) 'D'] == allChars)
481 if (counts[(unsigned char) 'S'] +
482 counts[(unsigned char) 'N'] +
483 counts[(unsigned char) 'D'] +
484 counts[(unsigned char) 'E'] +
485 counts[(unsigned char) 'Q'] +
486 counts[(unsigned char) 'K'] == allChars)
489 if (counts[(unsigned char) 'N'] +
490 counts[(unsigned char) 'D'] +
491 counts[(unsigned char) 'E'] +
492 counts[(unsigned char) 'Q'] +
493 counts[(unsigned char) 'H'] +
494 counts[(unsigned char) 'K'] == allChars)
497 if (counts[(unsigned char) 'N'] +
498 counts[(unsigned char) 'E'] +
499 counts[(unsigned char) 'H'] +
500 counts[(unsigned char) 'Q'] +
501 counts[(unsigned char) 'R'] +
502 counts[(unsigned char) 'K'] == allChars)
505 if (counts[(unsigned char) 'F'] +
506 counts[(unsigned char) 'V'] +
507 counts[(unsigned char) 'L'] +
508 counts[(unsigned char) 'I'] +
509 counts[(unsigned char) 'M'] == allChars)
512 if (counts[(unsigned char) 'H'] +
513 counts[(unsigned char) 'F'] +
514 counts[(unsigned char) 'Y'] == allChars)
520 /////////////////////////////////////////////////////////////////
521 // MultiSequence::WriteALN()
523 // Write ALN to the outfile. Allows the user to specify the
524 // number of columns for the output.
525 /////////////////////////////////////////////////////////////////
527 void WriteALN (ostream &outfile, int numColumns = 60){
528 if (!sequences) return;
530 // outfile << "Multplex SCARNA version " << VERSION << " multiple sequence alignment" << endl;
531 // outfile << "PROBCONS version " << VERSION << " multiple sequence alignment" << endl;
532 outfile << "CLUSTAL W(1.83) multiple sequence alignment" << endl;
533 // outfile << "//" << endl;
535 int longestComment = 0;
536 SafeVector<SafeVector<char>::iterator> ptrs (GetNumSequences());
537 SafeVector<int> lengths (GetNumSequences());
538 for (int i = 0; i < GetNumSequences(); i++){
539 ptrs[i] = GetSequence (i)->GetDataPtr();
540 lengths[i] = GetSequence (i)->GetLength();
541 longestComment = max (longestComment, (int) GetSequence(i)->GetName().length());
545 int writtenChars = 0;
546 bool allDone = false;
552 // loop through all sequences and write them out
553 for (int i = 0; i < GetNumSequences(); i++){
555 if (writtenChars < lengths[i]){
556 outfile << GetSequence(i)->GetName();
557 for (int j = 0; j < longestComment - (int) GetSequence(i)->GetName().length(); j++)
560 for (int j = 0; j < numColumns; j++){
561 if (writtenChars + j < lengths[i])
562 outfile << ptrs[i][writtenChars + j + 1];
569 if (writtenChars + numColumns < lengths[i]) allDone = false;
573 // write annotation line
574 for (int j = 0; j < longestComment; j++)
577 for (int j = 0; j < numColumns; j++){
578 SafeVector<char> column;
580 for (int i = 0; i < GetNumSequences(); i++)
581 if (writtenChars + j < lengths[i])
582 column.push_back (ptrs[i][writtenChars + j + 1]);
584 if (column.size() > 0)
585 outfile << GetAnnotationChar (column);
589 writtenChars += numColumns;
594 ////////////////////////////////////////////////////////////////
595 // MultiSequence::WriteWEB();
597 // Write ALN to the outfile. Allows the user to specify the
598 // number of columns for the output.
599 ///////////////////////////////////////////////////////////////
600 void WriteWEB (ostream &outfile, string *ssCons = NULL, int numColumns = 60, bool useIndices = false){
601 if (!sequences) return;
603 // loop through all sequences and write them out
604 for (SafeVector<Sequence *>::iterator iter = sequences->begin(); iter != sequences->end(); ++iter){
605 (*iter)->WriteWEB (outfile, numColumns, useIndices);
608 // write conservation
609 outfile << "<conservation>" << endl;
610 int longestComment = 0;
611 SafeVector<SafeVector<char>::iterator> ptrs (GetNumSequences());
612 SafeVector<int> lengths (GetNumSequences());
613 for (int i = 0; i < GetNumSequences(); i++){
614 ptrs[i] = GetSequence (i)->GetDataPtr();
615 lengths[i] = GetSequence (i)->GetLength();
616 longestComment = max (longestComment, (int) GetSequence(i)->GetName().length());
620 int writtenChars = 0;
621 bool allDone = false;
627 // loop through all sequences and write them out
628 for (int i = 0; i < GetNumSequences(); i++){
630 if (writtenChars < lengths[i]){
631 // outfile << GetSequence(i)->GetName();
632 for (int j = 0; j < longestComment - (int) GetSequence(i)->GetName().length(); j++)
635 for (int j = 0; j < numColumns; j++){
636 if (writtenChars + j < lengths[i]);
637 // outfile << ptrs[i][writtenChars + j + 1];
644 if (writtenChars + numColumns < lengths[i]) allDone = false;
648 // write annotation line
649 // for (int j = 0; j < longestComment; j++)
652 for (int j = 0; j < numColumns; j++){
653 SafeVector<char> column;
655 for (int i = 0; i < GetNumSequences(); i++)
656 if (writtenChars + j < lengths[i])
657 column.push_back (ptrs[i][writtenChars + j + 1]);
659 if (column.size() > 0)
660 outfile << GetAnnotationChar (column);
664 writtenChars += numColumns;
667 outfile << "</conservation>" << endl;
669 // write structure information
670 if (ssCons != NULL) {
671 outfile << "<structure>" << endl;
672 int length = ssCons->length();
673 for (int i = 1; i < length; i++ ) {
674 outfile << ssCons->at(i);
677 outfile << "</structure>" << endl;
679 // add coordinate information 06/09/14
680 outfile << "<coordinate>" << endl;
683 for (int i = 1; i < length; i++) {
686 if ( ssCons->at(i) == '(' ) {
691 char ch = ssCons->at(++j);
698 outfile << "<segment position=\"" << segmentPos++ << "\" starts=\""
699 << i << "\"" << " ends=\"" << j << "\"/>" << endl;
704 outfile << "</coordinate>" << endl;
706 outfile << "<mxscarna>" << endl;
707 WriteMXSCARNA (outfile, ssCons);
708 outfile << "</mxscarna>" << endl;
710 outfile << "<aln>" << endl;
712 outfile << "</aln>" << endl;
714 outfile << "<mfa>" << endl;
715 WriteMFA (outfile, ssCons);
716 outfile << "</mfa>" << endl;
718 outfile << "<stockholm>" << endl;
719 WriteWebSTOCKHOLM (outfile, ssCons);
720 outfile << "</stockholm>" << endl;
723 ////////////////////////////////////////////////////////////////
724 // MultiSequence::WriteSTOCKHOLM();
726 // Write STOCKHOLM to the outfile. Allows the user to specify the
727 // number of columns for the output.
728 ///////////////////////////////////////////////////////////////
729 void WriteSTOCKHOLM (ostream &outfile, string *ssCons = NULL, int numColumns = 60) {
730 if (!sequences) return;
732 outfile << "# STOCKHOLM 1.0" << endl;
734 int longestComment = 0;
735 SafeVector<SafeVector<char>::iterator> ptrs (GetNumSequences());
736 SafeVector<int> lengths (GetNumSequences());
737 for (int i = 0; i < GetNumSequences(); i++){
738 ptrs[i] = GetSequence (i)->GetDataPtr();
739 lengths[i] = GetSequence (i)->GetLength();
740 longestComment = max (longestComment, (int) GetSequence(i)->GetName().length());
744 int writtenChars = 0;
745 bool allDone = false;
751 // loop through all sequences and write them out
752 for (int i = 0; i < GetNumSequences(); i++){
754 if (writtenChars < lengths[i]){
755 outfile << GetSequence(i)->GetName();
756 for (int j = 0; j < longestComment - (int) GetSequence(i)->GetName().length(); j++)
759 for (int j = 0; j < numColumns; j++){
760 if (writtenChars + j < lengths[i])
761 if (ptrs[i][writtenChars + j + 1] != '-')
762 outfile << ptrs[i][writtenChars + j + 1];
771 if (writtenChars + numColumns < lengths[i]) allDone = false;
777 if (ssCons != NULL) {
778 outfile << "#=GC SS_cons";
779 int lengthSScons = 12;
780 for (int j = 0; j < longestComment - lengthSScons; j++)
783 for (int j = 0; j < numColumns; j++) {
784 if (ssCons->at(writtenChars + j + 1) == '(')
786 else if (ssCons->at(writtenChars + j + 1) == ')')
790 if ((unsigned int)writtenChars + j + 1 >= ssCons->length() - 1)
796 writtenChars += numColumns;
802 ////////////////////////////////////////////////////////////////
803 // MultiSequence::WriteSTOCKHOLM();
805 // Write STOCKHOLM to the outfile. Allows the user to specify the
806 // number of columns for the output.
807 ///////////////////////////////////////////////////////////////
808 void WriteWebSTOCKHOLM (ostream &outfile, string *ssCons = NULL, int numColumns = 60) {
809 if (!sequences) return;
811 outfile << "# STOCKHOLM 1.0" << endl;
813 int longestComment = 0;
814 SafeVector<SafeVector<char>::iterator> ptrs (GetNumSequences());
815 SafeVector<int> lengths (GetNumSequences());
816 for (int i = 0; i < GetNumSequences(); i++){
817 ptrs[i] = GetSequence (i)->GetDataPtr();
818 lengths[i] = GetSequence (i)->GetLength();
819 longestComment = max (longestComment, (int) GetSequence(i)->GetName().length());
823 int writtenChars = 0;
824 bool allDone = false;
830 // loop through all sequences and write them out
831 for (int i = 0; i < GetNumSequences(); i++){
833 if (writtenChars < lengths[i]){
834 outfile << GetSequence(i)->GetName();
835 for (int j = 0; j < longestComment - (int) GetSequence(i)->GetName().length(); j++)
838 for (int j = 0; j < numColumns; j++){
839 if (writtenChars + j < lengths[i])
840 if (ptrs[i][writtenChars + j + 1] != '-')
841 outfile << ptrs[i][writtenChars + j + 1];
850 if (writtenChars + numColumns < lengths[i]) allDone = false;
856 if (ssCons != NULL) {
857 outfile << "#=GC SS_cons";
858 int lengthSScons = 12;
859 for (int j = 0; j < longestComment - lengthSScons; j++)
862 for (int j = 0; j < numColumns; j++) {
863 outfile << ssCons->at(writtenChars + j + 1);
865 if ((unsigned int)writtenChars + j + 1 >= ssCons->length() - 1)
871 writtenChars += numColumns;
877 ////////////////////////////////////////////////////////////////
878 // MultiSequence::WriteMXSCARNA();
880 // Write MXSCARNA to the outfile. Allows the user to specify the
881 // number of columns for the output.
882 ///////////////////////////////////////////////////////////////
883 void WriteMXSCARNA (ostream &outfile, string *ssCons = NULL, int numColumns = 60){
884 if (!sequences) return;
886 outfile << "Multplex SCARNA version " << VERSION << " multiple sequence alignment" << endl;
888 int longestComment = 0;
889 SafeVector<SafeVector<char>::iterator> ptrs (GetNumSequences());
890 SafeVector<int> lengths (GetNumSequences());
891 for (int i = 0; i < GetNumSequences(); i++){
892 ptrs[i] = GetSequence (i)->GetDataPtr();
893 lengths[i] = GetSequence (i)->GetLength();
894 longestComment = max (longestComment, (int) GetSequence(i)->GetName().length());
898 int writtenChars = 0;
899 bool allDone = false;
905 // loop through all sequences and write them out
906 for (int i = 0; i < GetNumSequences(); i++){
908 if (writtenChars < lengths[i]){
909 outfile << GetSequence(i)->GetName();
910 for (int j = 0; j < longestComment - (int) GetSequence(i)->GetName().length(); j++)
913 for (int j = 0; j < numColumns; j++){
914 if (writtenChars + j < lengths[i])
915 outfile << ptrs[i][writtenChars + j + 1];
922 if (writtenChars + numColumns < lengths[i]) allDone = false;
927 if (ssCons != NULL) {
928 outfile << "ss_cons";
929 int lengthSScons = 7;
930 for (int j = 0; j < longestComment - lengthSScons; j++)
933 for (int j = 0; j < numColumns; j++) {
934 outfile << ssCons->at(writtenChars + j + 1);
935 if ((unsigned int)writtenChars + j + 1 >= ssCons->length() - 1)
941 // write annotation line
942 for (int j = 0; j < longestComment; j++)
945 for (int j = 0; j < numColumns; j++){
946 SafeVector<char> column;
948 for (int i = 0; i < GetNumSequences(); i++)
949 if (writtenChars + j < lengths[i])
950 column.push_back (ptrs[i][writtenChars + j + 1]);
952 if (column.size() > 0)
953 outfile << GetAnnotationChar (column);
957 writtenChars += numColumns;
962 /////////////////////////////////////////////////////////////////
963 // MultiSequence::GetSequence()
965 // Retrieve a sequence from the MultiSequence object.
966 /////////////////////////////////////////////////////////////////
968 Sequence* GetSequence (int i){
970 assert (0 <= i && i < (int) sequences->size());
972 return (*sequences)[i];
975 /////////////////////////////////////////////////////////////////
976 // MultiSequence::GetSequence()
978 // Retrieve a sequence from the MultiSequence object
980 /////////////////////////////////////////////////////////////////
982 const Sequence* GetSequence (int i) const {
984 assert (0 <= i && i < (int) sequences->size());
986 return (*sequences)[i];
989 /////////////////////////////////////////////////////////////////
990 // MultiSequence::GetNumSequences()
992 // Returns the number of sequences in the MultiSequence.
993 /////////////////////////////////////////////////////////////////
995 int GetNumSequences () const {
996 if (!sequences) return 0;
997 return (int) sequences->size();
1000 /////////////////////////////////////////////////////////////////
1001 // MultiSequence::SortByHeader()
1003 // Organizes the sequences according to their sequence headers
1004 // in ascending order.
1005 /////////////////////////////////////////////////////////////////
1007 void SortByHeader () {
1010 // a quick and easy O(n^2) sort
1011 for (int i = 0; i < (int) sequences->size()-1; i++){
1012 for (int j = i+1; j < (int) sequences->size(); j++){
1013 if ((*sequences)[i]->GetHeader() > (*sequences)[j]->GetHeader())
1014 swap ((*sequences)[i], (*sequences)[j]);
1019 /////////////////////////////////////////////////////////////////
1020 // MultiSequence::SortByLabel()
1022 // Organizes the sequences according to their sequence labels
1023 // in ascending order.
1024 /////////////////////////////////////////////////////////////////
1026 void SortByLabel () {
1029 // a quick and easy O(n^2) sort
1030 for (int i = 0; i < (int) sequences->size()-1; i++){
1031 for (int j = i+1; j < (int) sequences->size(); j++){
1032 if ((*sequences)[i]->GetSortLabel() > (*sequences)[j]->GetSortLabel())
1033 swap ((*sequences)[i], (*sequences)[j]);
1038 /////////////////////////////////////////////////////////////////
1039 // MultiSequence::SaveOrdering()
1041 // Relabels sequences so as to preserve the current ordering.
1042 /////////////////////////////////////////////////////////////////
1044 void SaveOrdering () {
1047 for (int i = 0; i < (int) sequences->size(); i++)
1048 (*sequences)[i]->SetSortLabel (i);
1051 /////////////////////////////////////////////////////////////////
1052 // MultiSequence::Project()
1054 // Given a set of indices, extract all sequences from the current
1055 // MultiSequence object whose index is included in the set.
1056 // Then, project the multiple alignments down to the desired
1057 // subset, and return the projection as a new MultiSequence
1059 /////////////////////////////////////////////////////////////////
1061 MultiSequence *Project (const set<int> &indices){
1062 SafeVector<SafeVector<char>::iterator> oldPtrs (indices.size());
1063 SafeVector<SafeVector<char> *> newPtrs (indices.size());
1065 assert (indices.size() != 0);
1069 for (set<int>::const_iterator iter = indices.begin(); iter != indices.end(); ++iter){
1070 oldPtrs[i++] = GetSequence (*iter)->GetDataPtr();
1073 // compute new length
1074 int oldLength = GetSequence (*indices.begin())->GetLength();
1076 for (i = 1; i <= oldLength; i++){
1078 // check to see if there is a gap in every sequence of the set
1080 for (int j = 0; !found && j < (int) indices.size(); j++)
1081 found = (oldPtrs[j][i] != '-');
1083 // if not, then this column counts towards the sequence length
1084 if (found) newLength++;
1087 // build new alignments
1088 for (i = 0; i < (int) indices.size(); i++){
1089 newPtrs[i] = new SafeVector<char>(); assert (newPtrs[i]);
1090 newPtrs[i]->push_back ('@');
1093 // add all needed columns
1094 for (i = 1; i <= oldLength; i++){
1096 // make sure column is not gapped in all sequences in the set
1098 for (int j = 0; !found && j < (int) indices.size(); j++)
1099 found = (oldPtrs[j][i] != '-');
1101 // if not, then add it
1103 for (int j = 0; j < (int) indices.size(); j++)
1104 newPtrs[j]->push_back (oldPtrs[j][i]);
1108 // wrap sequences in MultiSequence object
1109 MultiSequence *ret = new MultiSequence();
1111 for (set<int>::const_iterator iter = indices.begin(); iter != indices.end(); ++iter){
1112 ret->AddSequence (new Sequence (newPtrs[i++], GetSequence (*iter)->GetHeader(), newLength,
1113 GetSequence (*iter)->GetSortLabel(), GetSequence (*iter)->GetLabel()));