//////////////////////////////////////////////////////////////// // MultiSequence.h // // Utilities for reading/writing multiple sequence data. ///////////////////////////////////////////////////////////////// #ifndef MULTISEQUENCE_H #define MULTISEQUENCE_H #include #include #include #include #include #include #include #include "SafeVector.h" #include "Sequence.h" #include "FileBuffer.h" ///////////////////////////////////////////////////////////////// // MultiSequence // // Class for multiple sequence alignment input/output. ///////////////////////////////////////////////////////////////// namespace MXSCARNA { class MultiSequence { SafeVector *sequences; public: ///////////////////////////////////////////////////////////////// // MultiSequence::MultiSequence() // // Default constructor. ///////////////////////////////////////////////////////////////// MultiSequence () : sequences (NULL) {} ///////////////////////////////////////////////////////////////// // MultiSequence::MultiSequence() // // Constructor. Load MFA from a FileBuffer object. ///////////////////////////////////////////////////////////////// MultiSequence (FileBuffer &infile) : sequences (NULL) { LoadMFA (infile); } ///////////////////////////////////////////////////////////////// // MultiSequence::MultiSequence() // // Constructor. Load MFA from a filename. ///////////////////////////////////////////////////////////////// MultiSequence (const string &filename) : sequences (NULL){ LoadMFA (filename); } ///////////////////////////////////////////////////////////////// // MultiSequence::~MultiSequence() // // Destructor. Gets rid of sequence objects contained in the // multiple alignment. ///////////////////////////////////////////////////////////////// ~MultiSequence(){ // if sequences allocated if (sequences){ // free all sequences for (SafeVector::iterator iter = sequences->begin(); iter != sequences->end(); ++iter){ assert (*iter); delete *iter; *iter = NULL; } // free sequence vector delete sequences; sequences = NULL; } } ///////////////////////////////////////////////////////////////// // MultiSequence::LoadMFA() // // Load MFA from a filename. ///////////////////////////////////////////////////////////////// void LoadMFA (const string &filename, bool stripGaps = false){ // try opening file FileBuffer infile (filename.c_str()); if (infile.fail()){ cerr << "ERROR: Could not open file '" << filename << "' for reading." << endl; exit (1); } // if successful, then load using other LoadMFA() routine LoadMFA (infile, stripGaps); infile.close(); } ///////////////////////////////////////////////////////////////// // MultiSequence::LoadMFA() // // Load MSF from a FileBuffer object. ///////////////////////////////////////////////////////////////// void ParseMSF (FileBuffer &infile, string header, bool stripGaps = false){ SafeVector *> seqData; SafeVector seqNames; SafeVector seqLengths; istringstream in; bool valid = true; bool missingHeader = false; bool clustalW = false; // read until data starts while (!infile.eof() && header.find ("..", 0) == string::npos){ if (header.find ("CLUSTAL", 0) == 0 || header.find ("PROBCONS", 0) == 0){ clustalW = true; break; } infile.GetLine (header); if (header.find ("//", 0) != string::npos){ missingHeader = true; break; } } // read until end-of-file while (valid){ infile.GetLine (header); if (infile.eof()) break; string word; in.clear(); in.str(header); // check if there's anything on this line if (in >> word){ // clustalw name parsing if (clustalW){ if (!isspace(header[0]) && find (seqNames.begin(), seqNames.end(), word) == seqNames.end()){ seqNames.push_back (word); seqData.push_back (new SafeVector()); seqLengths.push_back (0); seqData[(int) seqData.size() - 1]->push_back ('@'); } } // look for new sequence label if (word == string ("Name:")){ if (in >> word){ seqNames.push_back (word); seqData.push_back (new SafeVector()); seqLengths.push_back (0); seqData[(int) seqData.size() - 1]->push_back ('@'); } else valid = false; } // check if this is sequence data else if (find (seqNames.begin(), seqNames.end(), word) != seqNames.end()){ int index = find (seqNames.begin(), seqNames.end(), word) - seqNames.begin(); // read all remaining characters on the line char ch; while (in >> ch){ if (isspace (ch)) continue; // if (ch >= 'a' && ch <= 'z') ch = ch - 'a' + 'A'; if (ch == '.') ch = '-'; if (stripGaps && ch == '-') continue; /* if (!((ch >= 'A' && ch <= 'Z') || ch == '*' || ch == '-')){ cerr << "ERROR: Unknown character encountered: " << ch << endl; exit (1); } */ if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '*' || ch == '-')){ cerr << "ERROR: Unknown character encountered: " << ch << endl; exit (1); } // everything's ok so far, so just store this character. seqData[index]->push_back (ch); seqLengths[index]++; } } else if (missingHeader){ seqNames.push_back (word); seqData.push_back (new SafeVector()); seqLengths.push_back (0); seqData[(int) seqData.size() - 1]->push_back ('@'); int index = (int) seqNames.size() - 1; // read all remaining characters on the line char ch; while (in >> ch){ if (isspace (ch)) continue; // if (ch >= 'a' && ch <= 'z') ch = ch - 'a' + 'A'; if (ch == '.') ch = '-'; if (stripGaps && ch == '-') continue; if (!((ch >= 'A' && ch <= 'Z') || ch == '*' || ch == '-')){ cerr << "ERROR: Unknown character encountered: " << ch << endl; exit (1); } /* if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '*' || ch == '-')){ cerr << "ERROR: Unknown character encountered: " << ch << endl; exit (1); } */ // everything's ok so far, so just store this character. seqData[index]->push_back (ch); seqLengths[index]++; } } } } // check for errorsq if (seqNames.size() == 0){ cerr << "ERROR: No sequences read!" << endl; exit (1); } assert (!sequences); sequences = new SafeVector; for (int i = 0; i < (int) seqNames.size(); i++){ if (seqLengths[i] == 0){ cerr << "ERROR: Sequence of zero length!" << endl; exit (1); } Sequence *seq = new Sequence (seqData[i], seqNames[i], seqLengths[i], i, i); sequences->push_back (seq); } } ///////////////////////////////////////////////////////////////// // MultiSequence::LoadMFA() // // Load MFA from a FileBuffer object. ///////////////////////////////////////////////////////////////// void LoadMFA (FileBuffer &infile, bool stripGaps = false){ // check to make sure that file reading is ok if (infile.fail()){ cerr << "ERROR: Error reading file." << endl; exit (1); } // read all sequences while (true){ // get the sequence label as being the current # of sequences // NOTE: sequence labels here are zero-based int index = (!sequences) ? 0 : sequences->size(); // read the sequence Sequence *seq = new Sequence (infile, stripGaps); if (seq->Fail()){ // check if alternative file format (i.e. not MFA) if (index == 0){ string header = seq->GetHeader(); if (header.length() > 0 && header[0] != '>'){ // try MSF format ParseMSF (infile, header); break; } } delete seq; break; } seq->SetLabel (index); // add the sequence to the list of current sequences if (!sequences) sequences = new SafeVector; sequences->push_back (seq); } // make sure at least one sequence was read if (!sequences){ cerr << "ERROR: No sequences read." << endl; exit (1); } } ///////////////////////////////////////////////////////////////// // MultiSequence::AddSequence() // // Add another sequence to an existing sequence list ///////////////////////////////////////////////////////////////// void AddSequence (Sequence *sequence){ assert (sequence); assert (!sequence->Fail()); // add sequence if (!sequences) sequences = new SafeVector; sequences->push_back (sequence); } ///////////////////////////////////////////////////////////////// // MultiSequence::RemoveSequence() // // Remove a sequence from the MultiSequence ///////////////////////////////////////////////////////////////// void RemoveSequence (int index){ assert (sequences); assert (index >= 0 && index < (int) sequences->size()); delete (*sequences)[index]; sequences->erase (sequences->begin() + index); } ///////////////////////////////////////////////////////////////// // MultiSequence::WriteMFA() // // Write MFA to the outfile. Allows the user to specify the // number of columns for the output. Also, useIndices determines // whether or not the actual sequence comments will be printed // out or whether the artificially assigned sequence labels will // be used instead. ///////////////////////////////////////////////////////////////// void WriteMFA (ostream &outfile, string *ssCons = NULL, int numColumns = 60, bool useIndices = false){ if (!sequences) return; // loop through all sequences and write them out for (SafeVector::iterator iter = sequences->begin(); iter != sequences->end(); ++iter){ (*iter)->WriteMFA (outfile, numColumns, useIndices); } int count = 0; if (ssCons != NULL) { outfile << ">#=GC SS_cons" << endl; int length = ssCons->length(); for (int i = 1; i < length; i++ ) { outfile << ssCons->at(i); ++count; if (numColumns <= count) { outfile << endl; count = 0; } } } outfile << endl; } void WriteMFAseq (ostream &outfile, int numColumns = 60, bool useIndices = false){ if (!sequences) return; // loop through all sequences and write them out for (SafeVector::iterator iter = sequences->begin(); iter != sequences->end(); ++iter){ (*iter)->WriteMFA (outfile, numColumns, useIndices); } } ///////////////////////////////////////////////////////////////// // MultiSequence::GetAnnotationChar() // // Return CLUSTALW annotation for column. ///////////////////////////////////////////////////////////////// char GetAnnotationChar (SafeVector &column){ SafeVector counts (256, 0); int allChars = (int) column.size(); for (int i = 0; i < allChars; i++){ counts[(unsigned char) toupper(column[i])]++; } allChars -= counts[(unsigned char) '-']; if (allChars == 1) return ' '; for (int i = 0; i < 256; i++) if ((char) i != '-' && counts[i] == allChars) return '*'; if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'T'] + counts[(unsigned char) 'A'] == allChars) return ':'; if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'E'] + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'K'] == allChars) return ':'; if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'H'] + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'K'] == allChars) return ':'; if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'D'] + counts[(unsigned char) 'E'] + counts[(unsigned char) 'Q'] == allChars) return ':'; if (counts[(unsigned char) 'Q'] + counts[(unsigned char) 'H'] + counts[(unsigned char) 'R'] + counts[(unsigned char) 'K'] == allChars) return ':'; if (counts[(unsigned char) 'M'] + counts[(unsigned char) 'I'] + counts[(unsigned char) 'L'] + counts[(unsigned char) 'V'] == allChars) return ':'; if (counts[(unsigned char) 'M'] + counts[(unsigned char) 'I'] + counts[(unsigned char) 'L'] + counts[(unsigned char) 'F'] == allChars) return ':'; if (counts[(unsigned char) 'H'] + counts[(unsigned char) 'Y'] == allChars) return ':'; if (counts[(unsigned char) 'F'] + counts[(unsigned char) 'Y'] + counts[(unsigned char) 'W'] == allChars) return ':'; if (counts[(unsigned char) 'C'] + counts[(unsigned char) 'S'] + counts[(unsigned char) 'A'] == allChars) return '.'; if (counts[(unsigned char) 'A'] + counts[(unsigned char) 'T'] + counts[(unsigned char) 'V'] == allChars) return '.'; if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'A'] + counts[(unsigned char) 'G'] == allChars) return '.'; if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'T'] + counts[(unsigned char) 'N'] + counts[(unsigned char) 'K'] == allChars) return '.'; if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'T'] + counts[(unsigned char) 'P'] + counts[(unsigned char) 'A'] == allChars) return '.'; if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'G'] + counts[(unsigned char) 'N'] + counts[(unsigned char) 'D'] == allChars) return '.'; if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'N'] + counts[(unsigned char) 'D'] + counts[(unsigned char) 'E'] + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'K'] == allChars) return '.'; if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'D'] + counts[(unsigned char) 'E'] + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'H'] + counts[(unsigned char) 'K'] == allChars) return '.'; if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'E'] + counts[(unsigned char) 'H'] + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'R'] + counts[(unsigned char) 'K'] == allChars) return '.'; if (counts[(unsigned char) 'F'] + counts[(unsigned char) 'V'] + counts[(unsigned char) 'L'] + counts[(unsigned char) 'I'] + counts[(unsigned char) 'M'] == allChars) return '.'; if (counts[(unsigned char) 'H'] + counts[(unsigned char) 'F'] + counts[(unsigned char) 'Y'] == allChars) return '.'; return ' '; } ///////////////////////////////////////////////////////////////// // MultiSequence::WriteALN() // // Write ALN to the outfile. Allows the user to specify the // number of columns for the output. ///////////////////////////////////////////////////////////////// void WriteALN (ostream &outfile, int numColumns = 60){ if (!sequences) return; // outfile << "Multplex SCARNA version " << VERSION << " multiple sequence alignment" << endl; // outfile << "PROBCONS version " << VERSION << " multiple sequence alignment" << endl; outfile << "CLUSTAL W(1.83) multiple sequence alignment" << endl; // outfile << "//" << endl; int longestComment = 0; SafeVector::iterator> ptrs (GetNumSequences()); SafeVector lengths (GetNumSequences()); for (int i = 0; i < GetNumSequences(); i++){ ptrs[i] = GetSequence (i)->GetDataPtr(); lengths[i] = GetSequence (i)->GetLength(); longestComment = max (longestComment, (int) GetSequence(i)->GetName().length()); } longestComment += 4; int writtenChars = 0; bool allDone = false; while (!allDone){ outfile << endl; allDone = true; // loop through all sequences and write them out for (int i = 0; i < GetNumSequences(); i++){ if (writtenChars < lengths[i]){ outfile << GetSequence(i)->GetName(); for (int j = 0; j < longestComment - (int) GetSequence(i)->GetName().length(); j++) outfile << ' '; for (int j = 0; j < numColumns; j++){ if (writtenChars + j < lengths[i]) outfile << ptrs[i][writtenChars + j + 1]; else break; } outfile << endl; if (writtenChars + numColumns < lengths[i]) allDone = false; } } // write annotation line for (int j = 0; j < longestComment; j++) outfile << ' '; for (int j = 0; j < numColumns; j++){ SafeVector column; for (int i = 0; i < GetNumSequences(); i++) if (writtenChars + j < lengths[i]) column.push_back (ptrs[i][writtenChars + j + 1]); if (column.size() > 0) outfile << GetAnnotationChar (column); } outfile << endl; writtenChars += numColumns; } outfile << endl; } //////////////////////////////////////////////////////////////// // MultiSequence::WriteWEB(); // // Write ALN to the outfile. Allows the user to specify the // number of columns for the output. /////////////////////////////////////////////////////////////// void WriteWEB (ostream &outfile, string *ssCons = NULL, int numColumns = 60, bool useIndices = false){ if (!sequences) return; // loop through all sequences and write them out for (SafeVector::iterator iter = sequences->begin(); iter != sequences->end(); ++iter){ (*iter)->WriteWEB (outfile, numColumns, useIndices); } // write conservation outfile << "" << endl; int longestComment = 0; SafeVector::iterator> ptrs (GetNumSequences()); SafeVector lengths (GetNumSequences()); for (int i = 0; i < GetNumSequences(); i++){ ptrs[i] = GetSequence (i)->GetDataPtr(); lengths[i] = GetSequence (i)->GetLength(); longestComment = max (longestComment, (int) GetSequence(i)->GetName().length()); } longestComment += 4; int writtenChars = 0; bool allDone = false; while (!allDone){ // outfile << endl; allDone = true; // loop through all sequences and write them out for (int i = 0; i < GetNumSequences(); i++){ if (writtenChars < lengths[i]){ // outfile << GetSequence(i)->GetName(); for (int j = 0; j < longestComment - (int) GetSequence(i)->GetName().length(); j++) // outfile << ' '; for (int j = 0; j < numColumns; j++){ if (writtenChars + j < lengths[i]); // outfile << ptrs[i][writtenChars + j + 1]; else break; } // outfile << endl; if (writtenChars + numColumns < lengths[i]) allDone = false; } } // write annotation line // for (int j = 0; j < longestComment; j++) // outfile << ' '; for (int j = 0; j < numColumns; j++){ SafeVector column; for (int i = 0; i < GetNumSequences(); i++) if (writtenChars + j < lengths[i]) column.push_back (ptrs[i][writtenChars + j + 1]); if (column.size() > 0) outfile << GetAnnotationChar (column); } // outfile << endl; writtenChars += numColumns; } outfile << endl; outfile << "" << endl; // write structure information if (ssCons != NULL) { outfile << "" << endl; int length = ssCons->length(); for (int i = 1; i < length; i++ ) { outfile << ssCons->at(i); } outfile << endl; outfile << "" << endl; // add coordinate information 06/09/14 outfile << "" << endl; int segmentPos = 1; for (int i = 1; i < length; i++) { int count = 0; if ( ssCons->at(i) == '(' ) { ++count; int j = i; while (count != 0) { char ch = ssCons->at(++j); if (ch == '(') ++count; else if (ch == ')') --count; } outfile << "" << endl; } } } outfile << "" << endl; outfile << "" << endl; WriteMXSCARNA (outfile, ssCons); outfile << "" << endl; outfile << "" << endl; WriteALN (outfile); outfile << "" << endl; outfile << "" << endl; WriteMFA (outfile, ssCons); outfile << "" << endl; outfile << "" << endl; WriteWebSTOCKHOLM (outfile, ssCons); outfile << "" << endl; } //////////////////////////////////////////////////////////////// // MultiSequence::WriteSTOCKHOLM(); // // Write STOCKHOLM to the outfile. Allows the user to specify the // number of columns for the output. /////////////////////////////////////////////////////////////// void WriteSTOCKHOLM (ostream &outfile, string *ssCons = NULL, int numColumns = 60) { if (!sequences) return; outfile << "# STOCKHOLM 1.0" << endl; int longestComment = 0; SafeVector::iterator> ptrs (GetNumSequences()); SafeVector lengths (GetNumSequences()); for (int i = 0; i < GetNumSequences(); i++){ ptrs[i] = GetSequence (i)->GetDataPtr(); lengths[i] = GetSequence (i)->GetLength(); longestComment = max (longestComment, (int) GetSequence(i)->GetName().length()); } longestComment += 4; int writtenChars = 0; bool allDone = false; while (!allDone){ outfile << endl; allDone = true; // loop through all sequences and write them out for (int i = 0; i < GetNumSequences(); i++){ if (writtenChars < lengths[i]){ outfile << GetSequence(i)->GetName(); for (int j = 0; j < longestComment - (int) GetSequence(i)->GetName().length(); j++) outfile << ' '; for (int j = 0; j < numColumns; j++){ if (writtenChars + j < lengths[i]) if (ptrs[i][writtenChars + j + 1] != '-') outfile << ptrs[i][writtenChars + j + 1]; else outfile << "."; else break; } outfile << endl; if (writtenChars + numColumns < lengths[i]) allDone = false; } } // write ssCons if (ssCons != NULL) { outfile << "#=GC SS_cons"; int lengthSScons = 12; for (int j = 0; j < longestComment - lengthSScons; j++) outfile << ' '; for (int j = 0; j < numColumns; j++) { if (ssCons->at(writtenChars + j + 1) == '(') outfile << "<"; else if (ssCons->at(writtenChars + j + 1) == ')') outfile << ">"; else outfile << "."; if ((unsigned int)writtenChars + j + 1 >= ssCons->length() - 1) break; } outfile << endl; } writtenChars += numColumns; } outfile << "//"; outfile << endl; } //////////////////////////////////////////////////////////////// // MultiSequence::WriteSTOCKHOLM(); // // Write STOCKHOLM to the outfile. Allows the user to specify the // number of columns for the output. /////////////////////////////////////////////////////////////// void WriteWebSTOCKHOLM (ostream &outfile, string *ssCons = NULL, int numColumns = 60) { if (!sequences) return; outfile << "# STOCKHOLM 1.0" << endl; int longestComment = 0; SafeVector::iterator> ptrs (GetNumSequences()); SafeVector lengths (GetNumSequences()); for (int i = 0; i < GetNumSequences(); i++){ ptrs[i] = GetSequence (i)->GetDataPtr(); lengths[i] = GetSequence (i)->GetLength(); longestComment = max (longestComment, (int) GetSequence(i)->GetName().length()); } longestComment += 4; int writtenChars = 0; bool allDone = false; while (!allDone){ outfile << endl; allDone = true; // loop through all sequences and write them out for (int i = 0; i < GetNumSequences(); i++){ if (writtenChars < lengths[i]){ outfile << GetSequence(i)->GetName(); for (int j = 0; j < longestComment - (int) GetSequence(i)->GetName().length(); j++) outfile << ' '; for (int j = 0; j < numColumns; j++){ if (writtenChars + j < lengths[i]) if (ptrs[i][writtenChars + j + 1] != '-') outfile << ptrs[i][writtenChars + j + 1]; else outfile << "."; else break; } outfile << endl; if (writtenChars + numColumns < lengths[i]) allDone = false; } } // write ssCons if (ssCons != NULL) { outfile << "#=GC SS_cons"; int lengthSScons = 12; for (int j = 0; j < longestComment - lengthSScons; j++) outfile << ' '; for (int j = 0; j < numColumns; j++) { outfile << ssCons->at(writtenChars + j + 1); if ((unsigned int)writtenChars + j + 1 >= ssCons->length() - 1) break; } outfile << endl; } writtenChars += numColumns; } outfile << "//"; outfile << endl; } //////////////////////////////////////////////////////////////// // MultiSequence::WriteMXSCARNA(); // // Write MXSCARNA to the outfile. Allows the user to specify the // number of columns for the output. /////////////////////////////////////////////////////////////// void WriteMXSCARNA (ostream &outfile, string *ssCons = NULL, int numColumns = 60){ if (!sequences) return; outfile << "Multplex SCARNA version " << VERSION << " multiple sequence alignment" << endl; int longestComment = 0; SafeVector::iterator> ptrs (GetNumSequences()); SafeVector lengths (GetNumSequences()); for (int i = 0; i < GetNumSequences(); i++){ ptrs[i] = GetSequence (i)->GetDataPtr(); lengths[i] = GetSequence (i)->GetLength(); longestComment = max (longestComment, (int) GetSequence(i)->GetName().length()); } longestComment += 4; int writtenChars = 0; bool allDone = false; while (!allDone){ outfile << endl; allDone = true; // loop through all sequences and write them out for (int i = 0; i < GetNumSequences(); i++){ if (writtenChars < lengths[i]){ outfile << GetSequence(i)->GetName(); for (int j = 0; j < longestComment - (int) GetSequence(i)->GetName().length(); j++) outfile << ' '; for (int j = 0; j < numColumns; j++){ if (writtenChars + j < lengths[i]) outfile << ptrs[i][writtenChars + j + 1]; else break; } outfile << endl; if (writtenChars + numColumns < lengths[i]) allDone = false; } } // write ssCons if (ssCons != NULL) { outfile << "ss_cons"; int lengthSScons = 7; for (int j = 0; j < longestComment - lengthSScons; j++) outfile << ' '; for (int j = 0; j < numColumns; j++) { outfile << ssCons->at(writtenChars + j + 1); if ((unsigned int)writtenChars + j + 1 >= ssCons->length() - 1) break; } outfile << endl; } // write annotation line for (int j = 0; j < longestComment; j++) outfile << ' '; for (int j = 0; j < numColumns; j++){ SafeVector column; for (int i = 0; i < GetNumSequences(); i++) if (writtenChars + j < lengths[i]) column.push_back (ptrs[i][writtenChars + j + 1]); if (column.size() > 0) outfile << GetAnnotationChar (column); } outfile << endl; writtenChars += numColumns; } outfile << endl; } ///////////////////////////////////////////////////////////////// // MultiSequence::GetSequence() // // Retrieve a sequence from the MultiSequence object. ///////////////////////////////////////////////////////////////// Sequence* GetSequence (int i){ assert (sequences); assert (0 <= i && i < (int) sequences->size()); return (*sequences)[i]; } ///////////////////////////////////////////////////////////////// // MultiSequence::GetSequence() // // Retrieve a sequence from the MultiSequence object // (const version). ///////////////////////////////////////////////////////////////// const Sequence* GetSequence (int i) const { assert (sequences); assert (0 <= i && i < (int) sequences->size()); return (*sequences)[i]; } ///////////////////////////////////////////////////////////////// // MultiSequence::GetNumSequences() // // Returns the number of sequences in the MultiSequence. ///////////////////////////////////////////////////////////////// int GetNumSequences () const { if (!sequences) return 0; return (int) sequences->size(); } ///////////////////////////////////////////////////////////////// // MultiSequence::SortByHeader() // // Organizes the sequences according to their sequence headers // in ascending order. ///////////////////////////////////////////////////////////////// void SortByHeader () { assert (sequences); // a quick and easy O(n^2) sort for (int i = 0; i < (int) sequences->size()-1; i++){ for (int j = i+1; j < (int) sequences->size(); j++){ if ((*sequences)[i]->GetHeader() > (*sequences)[j]->GetHeader()) swap ((*sequences)[i], (*sequences)[j]); } } } ///////////////////////////////////////////////////////////////// // MultiSequence::SortByLabel() // // Organizes the sequences according to their sequence labels // in ascending order. ///////////////////////////////////////////////////////////////// void SortByLabel () { assert (sequences); // a quick and easy O(n^2) sort for (int i = 0; i < (int) sequences->size()-1; i++){ for (int j = i+1; j < (int) sequences->size(); j++){ if ((*sequences)[i]->GetSortLabel() > (*sequences)[j]->GetSortLabel()) swap ((*sequences)[i], (*sequences)[j]); } } } ///////////////////////////////////////////////////////////////// // MultiSequence::SaveOrdering() // // Relabels sequences so as to preserve the current ordering. ///////////////////////////////////////////////////////////////// void SaveOrdering () { assert (sequences); for (int i = 0; i < (int) sequences->size(); i++) (*sequences)[i]->SetSortLabel (i); } ///////////////////////////////////////////////////////////////// // MultiSequence::Project() // // Given a set of indices, extract all sequences from the current // MultiSequence object whose index is included in the set. // Then, project the multiple alignments down to the desired // subset, and return the projection as a new MultiSequence // object. ///////////////////////////////////////////////////////////////// MultiSequence *Project (const set &indices){ SafeVector::iterator> oldPtrs (indices.size()); SafeVector *> newPtrs (indices.size()); assert (indices.size() != 0); // grab old data int i = 0; for (set::const_iterator iter = indices.begin(); iter != indices.end(); ++iter){ oldPtrs[i++] = GetSequence (*iter)->GetDataPtr(); } // compute new length int oldLength = GetSequence (*indices.begin())->GetLength(); int newLength = 0; for (i = 1; i <= oldLength; i++){ // check to see if there is a gap in every sequence of the set bool found = false; for (int j = 0; !found && j < (int) indices.size(); j++) found = (oldPtrs[j][i] != '-'); // if not, then this column counts towards the sequence length if (found) newLength++; } // build new alignments for (i = 0; i < (int) indices.size(); i++){ newPtrs[i] = new SafeVector(); assert (newPtrs[i]); newPtrs[i]->push_back ('@'); } // add all needed columns for (i = 1; i <= oldLength; i++){ // make sure column is not gapped in all sequences in the set bool found = false; for (int j = 0; !found && j < (int) indices.size(); j++) found = (oldPtrs[j][i] != '-'); // if not, then add it if (found){ for (int j = 0; j < (int) indices.size(); j++) newPtrs[j]->push_back (oldPtrs[j][i]); } } // wrap sequences in MultiSequence object MultiSequence *ret = new MultiSequence(); i = 0; for (set::const_iterator iter = indices.begin(); iter != indices.end(); ++iter){ ret->AddSequence (new Sequence (newPtrs[i++], GetSequence (*iter)->GetHeader(), newLength, GetSequence (*iter)->GetSortLabel(), GetSequence (*iter)->GetLabel())); } return ret; } }; } #endif