Add GLprobs and MSAprobs to binaries
[jabaws.git] / binaries / src / GLProbs-1.0 / MultiSequence.h
diff --git a/binaries/src/GLProbs-1.0/MultiSequence.h b/binaries/src/GLProbs-1.0/MultiSequence.h
new file mode 100644 (file)
index 0000000..96a61f5
--- /dev/null
@@ -0,0 +1,735 @@
+////////////////////////////////////////////////////////////////
+// MultiSequence.h
+//
+// Utilities for reading/writing multiple sequence data.
+/////////////////////////////////////////////////////////////////
+
+#ifndef MULTISEQUENCE_H
+#define MULTISEQUENCE_H
+
+#include <cctype>
+#include <string>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <algorithm>
+#include <set>
+#include "SafeVector.h"
+#include "Sequence.h"
+#include "FileBuffer.h"
+
+#define VERSION "0.9.7"
+/////////////////////////////////////////////////////////////////
+// MultiSequence
+//
+// Class for multiple sequence alignment input/output.
+/////////////////////////////////////////////////////////////////
+
+class MultiSequence {
+
+       SafeVector<Sequence *> *sequences;
+
+public:
+
+       /////////////////////////////////////////////////////////////////
+       // MultiSequence::MultiSequence()
+       //
+       // Default constructor.
+       /////////////////////////////////////////////////////////////////
+
+       MultiSequence() :
+                       sequences(NULL) {
+       }
+
+       /////////////////////////////////////////////////////////////////
+       // MultiSequence::MultiSequence()
+       //
+       // Constructor.  Load MFA from a FileBuffer object.
+       /////////////////////////////////////////////////////////////////
+
+       MultiSequence(FileBuffer &infile) :
+                       sequences(NULL) {
+               LoadMFA(infile);
+       }
+
+       /////////////////////////////////////////////////////////////////
+       // MultiSequence::MultiSequence()
+       //
+       // Constructor.  Load MFA from a filename.
+       /////////////////////////////////////////////////////////////////
+
+       MultiSequence(const string &filename) :
+                       sequences(NULL) {
+               LoadMFA(filename);
+       }
+
+       /////////////////////////////////////////////////////////////////
+       // MultiSequence::~MultiSequence()
+       //
+       // Destructor.  Gets rid of sequence objects contained in the
+       // multiple alignment.
+       /////////////////////////////////////////////////////////////////
+
+       ~MultiSequence() {
+
+               // if sequences allocated
+               if (sequences) {
+
+                       // free all sequences
+                       for (SafeVector<Sequence *>::iterator iter = sequences->begin();
+                                       iter != sequences->end(); ++iter) {
+                               assert(*iter);
+                               delete *iter;
+                               *iter = NULL;
+                       }
+
+                       // free sequence vector
+                       delete sequences;
+                       sequences = NULL;
+               }
+       }
+
+       /////////////////////////////////////////////////////////////////
+       // MultiSequence::LoadMFA()
+       //
+       // Load MFA from a filename.
+       /////////////////////////////////////////////////////////////////
+
+       void LoadMFA(const string &filename, bool stripGaps = false) {
+
+               // try opening file
+               FileBuffer infile(filename.c_str());
+
+               if (infile.fail()) {
+                       cerr << "ERROR: Could not open file '" << filename
+                                       << "' for reading." << endl;
+                       exit(1);
+               }
+
+               // if successful, then load using other LoadMFA() routine
+               LoadMFA(infile, stripGaps);
+
+               infile.close();
+       }
+
+       /////////////////////////////////////////////////////////////////
+       // MultiSequence::LoadMFA()
+       //
+       // Load MSF from a FileBuffer object.
+       /////////////////////////////////////////////////////////////////
+
+       void ParseMSF(FileBuffer &infile, string header, bool stripGaps = false) {
+
+               SafeVector<SafeVector<char> *> seqData;
+               SafeVector<string> seqNames;
+               SafeVector<int> seqLengths;
+
+               istringstream in;
+               bool valid = true;
+               bool missingHeader = false;
+               bool clustalW = false;
+
+               // read until data starts
+               while (!infile.eof() && header.find("..", 0) == string::npos) {
+                       if (header.find("CLUSTAL", 0) == 0
+                                       || header.find("MSAPROBS", 0) == 0) {
+                               clustalW = true;
+                               break;
+                       }
+                       infile.GetLine(header);
+                       if (header.find("//", 0) != string::npos) {
+                               missingHeader = true;
+                               break;
+                       }
+               }
+
+               // read until end-of-file
+               while (valid) {
+                       infile.GetLine(header);
+                       if (infile.eof())
+                               break;
+
+                       string word;
+                       in.clear();
+                       in.str(header);
+
+                       // check if there's anything on this line
+                       if (in >> word) {
+
+                               // clustalw name parsing
+                               if (clustalW) {
+                                       if (!isspace(header[0])
+                                                       && find(seqNames.begin(), seqNames.end(), word)
+                                                                       == seqNames.end()) {
+                                               seqNames.push_back(word);
+                                               seqData.push_back(new SafeVector<char>());
+                                               seqLengths.push_back(0);
+                                               seqData[(int) seqData.size() - 1]->push_back('@');
+                                       }
+                               }
+
+                               // look for new sequence label
+                               if (word == string("Name:")) {
+                                       if (in >> word) {
+                                               seqNames.push_back(word);
+                                               seqData.push_back(new SafeVector<char>());
+                                               seqLengths.push_back(0);
+                                               seqData[(int) seqData.size() - 1]->push_back('@');
+                                       } else
+                                               valid = false;
+                               }
+
+                               // check if this is sequence data
+                               else if (find(seqNames.begin(), seqNames.end(), word)
+                                               != seqNames.end()) {
+                                       int index = find(seqNames.begin(), seqNames.end(), word)
+                                                       - seqNames.begin();
+
+                                       // read all remaining characters on the line
+                                       char ch;
+                                       while (in >> ch) {
+                                               if (isspace(ch))
+                                                       continue;
+                                               if (ch >= 'a' && ch <= 'z')
+                                                       ch = ch - 'a' + 'A';
+                                               if (ch == '.')
+                                                       ch = '-';
+                                               if (stripGaps && ch == '-')
+                                                       continue;
+                                               if (!((ch >= 'A' && ch <= 'Z') || ch == '*' || ch == '-')) {
+                                                       cerr << "ERROR: Unknown character encountered: "
+                                                                       << ch << endl;
+                                                       exit(1);
+                                               }
+
+                                               // everything's ok so far, so just store this character.
+                                               seqData[index]->push_back(ch);
+                                               seqLengths[index]++;
+                                       }
+                               } else if (missingHeader) {
+                                       seqNames.push_back(word);
+                                       seqData.push_back(new SafeVector<char>());
+                                       seqLengths.push_back(0);
+                                       seqData[(int) seqData.size() - 1]->push_back('@');
+
+                                       int index = (int) seqNames.size() - 1;
+
+                                       // read all remaining characters on the line
+                                       char ch;
+                                       while (in >> ch) {
+                                               if (isspace(ch))
+                                                       continue;
+                                               if (ch >= 'a' && ch <= 'z')
+                                                       ch = ch - 'a' + 'A';
+                                               if (ch == '.')
+                                                       ch = '-';
+                                               if (stripGaps && ch == '-')
+                                                       continue;
+                                               if (!((ch >= 'A' && ch <= 'Z') || ch == '*' || ch == '-')) {
+                                                       cerr << "ERROR: Unknown character encountered: "
+                                                                       << ch << endl;
+                                                       exit(1);
+                                               }
+
+                                               // everything's ok so far, so just store this character.
+                                               seqData[index]->push_back(ch);
+                                               seqLengths[index]++;
+                                       }
+                               }
+                       }
+               }
+
+               // check for errors
+               if (seqNames.size() == 0) {
+                       cerr << "ERROR: No sequences read!" << endl;
+                       exit(1);
+               }
+
+               assert(!sequences);
+               sequences = new SafeVector<Sequence *>;
+               for (int i = 0; i < (int) seqNames.size(); i++) {
+                       if (seqLengths[i] == 0) {
+                               cerr << "ERROR: Sequence of zero length!" << endl;
+                               exit(1);
+                       }
+                       Sequence *seq = new Sequence(seqData[i], seqNames[i], seqLengths[i],
+                                       i, i);
+                       sequences->push_back(seq);
+               }
+       }
+
+       /////////////////////////////////////////////////////////////////
+       // MultiSequence::LoadMFA()
+       //
+       // Load MFA from a FileBuffer object.
+       /////////////////////////////////////////////////////////////////
+
+       void LoadMFA(FileBuffer &infile, bool stripGaps = false) {
+
+               // check to make sure that file reading is ok
+               if (infile.fail()) {
+                       cerr << "ERROR: Error reading file." << endl;
+                       exit(1);
+               }
+
+               // read all sequences
+               while (true) {
+
+                       // get the sequence label as being the current # of sequences
+                       // NOTE: sequence labels here are zero-based
+                       int index = (!sequences) ? 0 : sequences->size();
+
+                       // read the sequence
+                       Sequence *seq = new Sequence(infile, stripGaps);
+                       if (seq->Fail()) {
+
+                               // check if alternative file format (i.e. not MFA)
+                               if (index == 0) {
+                                       string header = seq->GetHeader();
+                                       if (header.length() > 0 && header[0] != '>') {
+
+                                               // try MSF format
+                                               ParseMSF(infile, header);
+                                               break;
+                                       }
+                               }
+
+                               delete seq;
+                               break;
+                       }
+                       seq->SetLabel(index);
+
+                       // add the sequence to the list of current sequences
+                       if (!sequences)
+                               sequences = new SafeVector<Sequence *>;
+                       sequences->push_back(seq);
+               }
+
+               // make sure at least one sequence was read
+               if (!sequences) {
+                       cerr << "ERROR: No sequences read." << endl;
+                       exit(1);
+               }
+       }
+
+       /////////////////////////////////////////////////////////////////
+       // MultiSequence::AddSequence()
+       //
+       // Add another sequence to an existing sequence list
+       /////////////////////////////////////////////////////////////////
+
+       void AddSequence(Sequence *sequence) {
+               assert(sequence);
+               assert(!sequence->Fail());
+
+               // add sequence
+               if (!sequences)
+                       sequences = new SafeVector<Sequence *>;
+               sequences->push_back(sequence);
+       }
+
+       /////////////////////////////////////////////////////////////////
+       // MultiSequence::RemoveSequence()
+       //
+       // Remove a sequence from the MultiSequence
+       /////////////////////////////////////////////////////////////////
+
+       void RemoveSequence(int index) {
+               assert(sequences);
+
+               assert(index >= 0 && index < (int) sequences->size());
+               delete (*sequences)[index];
+
+               sequences->erase(sequences->begin() + index);
+       }
+
+       /////////////////////////////////////////////////////////////////
+       // MultiSequence::WriteMFA()
+       //
+       // Write MFA to the outfile.  Allows the user to specify the
+       // number of columns for the output.  Also, useIndices determines
+       // whether or not the actual sequence comments will be printed
+       // out or whether the artificially assigned sequence labels will
+       // be used instead.
+       /////////////////////////////////////////////////////////////////
+
+       void WriteMFA(ostream &outfile, int numColumns = 60,
+                       bool useIndices = false) {
+               if (!sequences)
+                       return;
+
+               // loop through all sequences and write them out
+               for (SafeVector<Sequence *>::iterator iter = sequences->begin();
+                               iter != sequences->end(); ++iter) {
+                       (*iter)->WriteMFA(outfile, numColumns, useIndices);
+               }
+       }
+
+       /////////////////////////////////////////////////////////////////
+       // MultiSequence::GetAnnotationChar()
+       //
+       // Return CLUSTALW annotation for column.
+       /////////////////////////////////////////////////////////////////
+
+       char GetAnnotationChar(SafeVector<char> &column) {
+               SafeVector<int> counts(256, 0);
+               int allChars = (int) column.size();
+
+               for (int i = 0; i < allChars; i++) {
+                       counts[(unsigned char) toupper(column[i])]++;
+               }
+
+               allChars -= counts[(unsigned char) '-'];
+               if (allChars == 1)
+                       return ' ';
+
+               for (int i = 0; i < 256; i++)
+                       if ((char) i != '-' && counts[i] == allChars)
+                               return '*';
+
+               if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'T']
+                               + counts[(unsigned char) 'A'] == allChars)
+                       return ':';
+
+               if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'E']
+                               + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'K']
+                               == allChars)
+                       return ':';
+
+               if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'H']
+                               + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'K']
+                               == allChars)
+                       return ':';
+
+               if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'D']
+                               + counts[(unsigned char) 'E'] + counts[(unsigned char) 'Q']
+                               == allChars)
+                       return ':';
+
+               if (counts[(unsigned char) 'Q'] + counts[(unsigned char) 'H']
+                               + counts[(unsigned char) 'R'] + counts[(unsigned char) 'K']
+                               == allChars)
+                       return ':';
+
+               if (counts[(unsigned char) 'M'] + counts[(unsigned char) 'I']
+                               + counts[(unsigned char) 'L'] + counts[(unsigned char) 'V']
+                               == allChars)
+                       return ':';
+
+               if (counts[(unsigned char) 'M'] + counts[(unsigned char) 'I']
+                               + counts[(unsigned char) 'L'] + counts[(unsigned char) 'F']
+                               == allChars)
+                       return ':';
+
+               if (counts[(unsigned char) 'H'] + counts[(unsigned char) 'Y']
+                               == allChars)
+                       return ':';
+
+               if (counts[(unsigned char) 'F'] + counts[(unsigned char) 'Y']
+                               + counts[(unsigned char) 'W'] == allChars)
+                       return ':';
+
+               if (counts[(unsigned char) 'C'] + counts[(unsigned char) 'S']
+                               + counts[(unsigned char) 'A'] == allChars)
+                       return '.';
+
+               if (counts[(unsigned char) 'A'] + counts[(unsigned char) 'T']
+                               + counts[(unsigned char) 'V'] == allChars)
+                       return '.';
+
+               if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'A']
+                               + counts[(unsigned char) 'G'] == allChars)
+                       return '.';
+
+               if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'T']
+                               + counts[(unsigned char) 'N'] + counts[(unsigned char) 'K']
+                               == allChars)
+                       return '.';
+
+               if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'T']
+                               + counts[(unsigned char) 'P'] + counts[(unsigned char) 'A']
+                               == allChars)
+                       return '.';
+
+               if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'G']
+                               + counts[(unsigned char) 'N'] + counts[(unsigned char) 'D']
+                               == allChars)
+                       return '.';
+
+               if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'N']
+                               + counts[(unsigned char) 'D'] + counts[(unsigned char) 'E']
+                               + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'K']
+                               == allChars)
+                       return '.';
+
+               if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'D']
+                               + counts[(unsigned char) 'E'] + counts[(unsigned char) 'Q']
+                               + counts[(unsigned char) 'H'] + counts[(unsigned char) 'K']
+                               == allChars)
+                       return '.';
+
+               if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'E']
+                               + counts[(unsigned char) 'H'] + counts[(unsigned char) 'Q']
+                               + counts[(unsigned char) 'R'] + counts[(unsigned char) 'K']
+                               == allChars)
+                       return '.';
+
+               if (counts[(unsigned char) 'F'] + counts[(unsigned char) 'V']
+                               + counts[(unsigned char) 'L'] + counts[(unsigned char) 'I']
+                               + counts[(unsigned char) 'M'] == allChars)
+                       return '.';
+
+               if (counts[(unsigned char) 'H'] + counts[(unsigned char) 'F']
+                               + counts[(unsigned char) 'Y'] == allChars)
+                       return '.';
+
+               return ' ';
+       }
+
+       /////////////////////////////////////////////////////////////////
+       // MultiSequence::WriteALN()
+       //
+       // Write ALN to the outfile.  Allows the user to specify the
+       // number of columns for the output.  
+       /////////////////////////////////////////////////////////////////
+
+       void WriteALN(ostream &outfile, int numColumns = 60) {
+               if (!sequences)
+                       return;
+
+               outfile << "CLUSTAL for GLPROBS version " << VERSION << " multiple sequence alignment" << endl;
+//
+//             outfile << "//"<<endl<<endl;
+
+               int longestComment = 0;
+               SafeVector<SafeVector<char>::iterator> ptrs(GetNumSequences());
+               SafeVector<int> lengths(GetNumSequences());
+               for (int i = 0; i < GetNumSequences(); i++) {
+                       ptrs[i] = GetSequence(i)->GetDataPtr();
+                       lengths[i] = GetSequence(i)->GetLength();
+                       longestComment = max(longestComment,
+                                       (int) GetSequence(i)->GetName().length());
+               }
+               longestComment += 4;
+
+               int writtenChars = 0;
+               bool allDone = false;
+
+               while (!allDone) {
+                       outfile << endl;
+                       allDone = true;
+
+                       // loop through all sequences and write them out
+                       for (int i = 0; i < GetNumSequences(); i++) {
+
+                               if (writtenChars < lengths[i]) {
+                                       outfile << GetSequence(i)->GetName();
+                                       for (int j = 0;
+                                                       j
+                                                                       < longestComment
+                                                                                       - (int) GetSequence(i)->GetName().length();
+                                                       j++)
+                                               outfile << ' ';
+
+                                       for (int j = 0; j < numColumns; j++) {
+                                               if (writtenChars + j < lengths[i])
+                                                       outfile << ptrs[i][writtenChars + j + 1];
+                                               else
+                                                       break;
+                                       }
+
+                                       outfile << endl;
+
+                                       if (writtenChars + numColumns < lengths[i])
+                                               allDone = false;
+                               }
+                       }
+
+                       // write annotation line
+/*
+                       for (int j = 0; j < longestComment; j++)
+                               outfile << ' ';
+
+                       for (int j = 0; j < numColumns; j++) {
+                               SafeVector<char> column;
+
+                               for (int i = 0; i < GetNumSequences(); i++)
+                                       if (writtenChars + j < lengths[i])
+                                               column.push_back(ptrs[i][writtenChars + j + 1]);
+
+                               if (column.size() > 0)
+                                       outfile << GetAnnotationChar(column);
+                       }
+*/
+                       outfile << endl;
+                       writtenChars += numColumns;
+               }
+       }
+
+       /////////////////////////////////////////////////////////////////
+       // MultiSequence::GetSequence()
+       //
+       // Retrieve a sequence from the MultiSequence object.
+       /////////////////////////////////////////////////////////////////
+
+       Sequence* GetSequence(int i) {
+               assert(sequences);
+               assert(0 <= i && i < (int) sequences->size());
+
+               return (*sequences)[i];
+       }
+
+       /////////////////////////////////////////////////////////////////
+       // MultiSequence::GetSequence()
+       //
+       // Retrieve a sequence from the MultiSequence object
+       // (const version).
+       /////////////////////////////////////////////////////////////////
+
+       const Sequence* GetSequence(int i) const {
+               assert(sequences);
+               assert(0 <= i && i < (int) sequences->size());
+
+               return (*sequences)[i];
+       }
+
+       /////////////////////////////////////////////////////////////////
+       // MultiSequence::GetNumSequences()
+       //
+       // Returns the number of sequences in the MultiSequence.
+       /////////////////////////////////////////////////////////////////
+
+       int GetNumSequences() const {
+               if (!sequences)
+                       return 0;
+               return (int) sequences->size();
+       }
+
+       /////////////////////////////////////////////////////////////////
+       // MultiSequence::SortByHeader()
+       //
+       // Organizes the sequences according to their sequence headers
+       // in ascending order.
+       /////////////////////////////////////////////////////////////////
+
+       void SortByHeader() {
+               assert(sequences);
+
+               // a quick and easy O(n^2) sort
+               for (int i = 0; i < (int) sequences->size() - 1; i++) {
+                       for (int j = i + 1; j < (int) sequences->size(); j++) {
+                               if ((*sequences)[i]->GetHeader() > (*sequences)[j]->GetHeader())
+                                       swap((*sequences)[i], (*sequences)[j]);
+                       }
+               }
+       }
+
+       /////////////////////////////////////////////////////////////////
+       // MultiSequence::SortByLabel()
+       //
+       // Organizes the sequences according to their sequence labels
+       // in ascending order.
+       /////////////////////////////////////////////////////////////////
+
+       void SortByLabel() {
+               assert(sequences);
+
+               // a quick and easy O(n^2) sort
+               for (int i = 0; i < (int) sequences->size() - 1; i++) {
+                       for (int j = i + 1; j < (int) sequences->size(); j++) {
+                               if ((*sequences)[i]->GetSortLabel()
+                                               > (*sequences)[j]->GetSortLabel())
+                                       swap((*sequences)[i], (*sequences)[j]);
+                       }
+               }
+       }
+
+       /////////////////////////////////////////////////////////////////
+       // MultiSequence::SaveOrdering()
+       //
+       // Relabels sequences so as to preserve the current ordering.
+       /////////////////////////////////////////////////////////////////
+
+       void SaveOrdering() {
+               assert(sequences);
+
+               for (int i = 0; i < (int) sequences->size(); i++)
+                       (*sequences)[i]->SetSortLabel(i);
+       }
+
+       /////////////////////////////////////////////////////////////////
+       // MultiSequence::Project()
+       //
+       // Given a set of indices, extract all sequences from the current
+       // MultiSequence object whose index is included in the set.
+       // Then, project the multiple alignments down to the desired
+       // subset, and return the projection as a new MultiSequence
+       // object.
+       /////////////////////////////////////////////////////////////////
+
+       MultiSequence *Project(const set<int> &indices) {
+               SafeVector<SafeVector<char>::iterator> oldPtrs(indices.size());
+               SafeVector<SafeVector<char> *> newPtrs(indices.size());
+
+               assert(indices.size() != 0);
+
+               // grab old data
+               int i = 0;
+               for (set<int>::const_iterator iter = indices.begin();
+                               iter != indices.end(); ++iter) {
+                       oldPtrs[i++] = GetSequence(*iter)->GetDataPtr();
+               }
+
+               // compute new length
+               int oldLength = GetSequence(*indices.begin())->GetLength();
+               int newLength = 0;
+               for (i = 1; i <= oldLength; i++) {
+
+                       // check to see if there is a gap in every sequence of the set
+                       bool found = false;
+                       for (int j = 0; !found && j < (int) indices.size(); j++)
+                               found = (oldPtrs[j][i] != '-');
+
+                       // if not, then this column counts towards the sequence length
+                       if (found)
+                               newLength++;
+               }
+
+               // build new alignments
+               for (i = 0; i < (int) indices.size(); i++) {
+                       newPtrs[i] = new SafeVector<char>();
+                       assert(newPtrs[i]);
+                       newPtrs[i]->push_back('@');
+               }
+
+               // add all needed columns
+               for (i = 1; i <= oldLength; i++) {
+
+                       // make sure column is not gapped in all sequences in the set
+                       bool found = false;
+                       for (int j = 0; !found && j < (int) indices.size(); j++)
+                               found = (oldPtrs[j][i] != '-');
+
+                       // if not, then add it
+                       if (found) {
+                               for (int j = 0; j < (int) indices.size(); j++)
+                                       newPtrs[j]->push_back(oldPtrs[j][i]);
+                       }
+               }
+
+               // wrap sequences in MultiSequence object
+               MultiSequence *ret = new MultiSequence();
+               i = 0;
+               for (set<int>::const_iterator iter = indices.begin();
+                               iter != indices.end(); ++iter) {
+                       ret->AddSequence(
+                                       new Sequence(newPtrs[i++], GetSequence(*iter)->GetHeader(),
+                                                       newLength, GetSequence(*iter)->GetSortLabel(),
+                                                       GetSequence(*iter)->GetLabel()));
+               }
+
+               return ret;
+       }
+};
+
+#endif