--- /dev/null
+/////////////////////////////////////////////////////////////////
+// Defaults.h
+//
+// Default constants for use in MSAPROBS. The emission
+// probabilities were computed using the program used to build
+// the BLOSUM62 matrix from the BLOCKS 5.0 dataset. Transition
+// parameters were obtained via unsupervised EM training on the
+// BALIBASE 2.0 benchmark alignment database.
+/////////////////////////////////////////////////////////////////
+
+#ifndef DEFAULTS_H
+#define DEFAULTS_H
+
+#include <string>
+
+using namespace std;
+
+/*
+ float initDistrib1Default[] = { 0.3202854395, 0.3398572505, 0.3398572505 };
+ float gapOpen1Default[] = { 0.1375414133, 0.1375414133 };
+ float gapExtend1Default[] = { 0.7832147479, 0.7832147479 };
+ */
+/*
+float initDistrib1Default[] = { 0.6080327034f, 0.1959836632f, 0.1959836632f };
+float gapOpen1Default[] = { 0.01993141696f, 0.01993141696f };
+float gapExtend1Default[] = { 0.7943345308f, 0.7943345308f };
+*/
+
+// Default transition parameters for the model with one pair of insert
+// states. MSA::ReadParameters() copies these into initDistrib, gapOpen and
+// gapExtend when NumInsertStates == 1 and no parameter file was given.
+// NOTE(review): the first two initDistrib1Default entries already sum to
+// 1.0 and the third adds ~0.196 -- confirm whether the values are meant to
+// be normalized here or are renormalized by the consumer.
+float initDistrib1Default[] = { 0.06188, 0.93812, 0.1959836632f };
+float gapOpen1Default[] = { 0.01993141696f, 0.01993141696f };
+float gapExtend1Default[] = { 0.7943345308f, 0.7943345308f };
+
+/*
+float initDistrib1Default[] = { 0.2031769156f, 0.7968229055f, 0.05529401079f };
+float gapOpen1Default[] = { 0.006541831419f, 0.006541831419f };
+float gapExtend1Default[] = { 0.3042867482f, 0.3042867482f };
+*/
+/*
+float initDistrib1Default[] = { 0.109684445f, 0.8903156519f, 0.01231110841f };
+float gapOpen1Default[] = { 0.01968936995f, 0.01968936995f };
+float gapExtend1Default[] = { 0.5699355602f, 0.5699355602f };
+*/
+// Default transition parameters for the model with two pairs of insert
+// states. MSA::ReadParameters() copies these into initDistrib, gapOpen and
+// gapExtend when NumInsertStates == 2 and no parameter file was given.
+// initDistrib2Default has five entries; the gap arrays have four each.
+float initDistrib2Default[] = { 0.6814756989f, 8.615339902e-05f,
+ 0.700645f, 0.1591759622f, 0.1591759622 };
+float gapOpen2Default[] = { 0.0119511066f, 0.01993141696f, 0.008008334786f,
+ 0.008008334786 };
+float gapExtend2Default[] = { 0.3965826333f, 0.7943345308f, 0.8988758326f,
+ 0.8988758326 };
+
+// Default residue alphabet (20 one-letter codes). The position of a letter
+// in this string is its index into emitSingleDefault and emitPairsDefault.
+string alphabetDefault = "ARNDCQEGHILKMFPSTWYV";
+// Default single-residue emission probabilities, one per alphabetDefault
+// letter, in the same order.
+float emitSingleDefault[20] = { 0.07831005f, 0.05246024f, 0.04433257f,
+ 0.05130349f, 0.02189704f, 0.03585766f, 0.05615771f, 0.07783433f,
+ 0.02601093f, 0.06511648f, 0.09716489f, 0.05877077f, 0.02438117f,
+ 0.04463228f, 0.03940142f, 0.05849916f, 0.05115306f, 0.01203523f,
+ 0.03124726f, 0.07343426f };
+
+// Default pairwise emission probabilities, indexed [i][j] by position in
+// alphabetDefault. Only the lower triangle (j <= i) carries values; the
+// upper triangle is zero-filled. MSA::ReadParameters() reads entries with
+// j <= i only and mirrors each one to [j][i] when filling emitPairs.
+float emitPairsDefault[20][20] = { { 0.02373072f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f }, { 0.00244502f, 0.01775118f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f }, { 0.00210228f, 0.00207782f, 0.01281864f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f }, { 0.00223549f, 0.00161657f, 0.00353540f, 0.01911178f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f }, { 0.00145515f, 0.00044701f, 0.00042479f,
+ 0.00036798f, 0.01013470f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }, { 0.00219102f,
+ 0.00253532f, 0.00158223f, 0.00176784f, 0.00032102f, 0.00756604f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f }, { 0.00332218f, 0.00268865f, 0.00224738f, 0.00496800f,
+ 0.00037956f, 0.00345128f, 0.01676565f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }, { 0.00597898f,
+ 0.00194865f, 0.00288882f, 0.00235249f, 0.00071206f, 0.00142432f,
+ 0.00214860f, 0.04062876f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }, { 0.00114353f, 0.00132105f, 0.00141205f,
+ 0.00097077f, 0.00026421f, 0.00113901f, 0.00131767f, 0.00103704f,
+ 0.00867996f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f },
+ { 0.00318853f, 0.00138145f, 0.00104273f, 0.00105355f, 0.00094040f,
+ 0.00100883f, 0.00124207f, 0.00142520f, 0.00059716f, 0.01778263f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }, {
+ 0.00449576f, 0.00246811f, 0.00160275f, 0.00161966f, 0.00138494f,
+ 0.00180553f, 0.00222063f, 0.00212853f, 0.00111754f, 0.01071834f,
+ 0.03583921f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f }, { 0.00331693f, 0.00595650f, 0.00257310f, 0.00252518f,
+ 0.00046951f, 0.00312308f, 0.00428420f, 0.00259311f, 0.00121376f,
+ 0.00157852f, 0.00259626f, 0.01612228f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f }, { 0.00148878f, 0.00076734f,
+ 0.00063401f, 0.00047808f, 0.00037421f, 0.00075546f, 0.00076105f,
+ 0.00066504f, 0.00042237f, 0.00224097f, 0.00461939f, 0.00096120f,
+ 0.00409522f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }, {
+ 0.00165004f, 0.00090768f, 0.00084658f, 0.00069041f, 0.00052274f,
+ 0.00059248f, 0.00078814f, 0.00115204f, 0.00072545f, 0.00279948f,
+ 0.00533369f, 0.00087222f, 0.00116111f, 0.01661038f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f }, { 0.00230618f, 0.00106268f,
+ 0.00100282f, 0.00125381f, 0.00034766f, 0.00090111f, 0.00151550f,
+ 0.00155601f, 0.00049078f, 0.00103767f, 0.00157310f, 0.00154836f,
+ 0.00046718f, 0.00060701f, 0.01846071f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f }, { 0.00631752f, 0.00224540f, 0.00301397f, 0.00285226f,
+ 0.00094867f, 0.00191155f, 0.00293898f, 0.00381962f, 0.00116422f,
+ 0.00173565f, 0.00250962f, 0.00312633f, 0.00087787f, 0.00119036f,
+ 0.00180037f, 0.01346609f, 0.0f, 0.0f, 0.0f, 0.0f }, {
+ 0.00389995f, 0.00186053f, 0.00220144f, 0.00180488f, 0.00073798f,
+ 0.00154526f, 0.00216760f, 0.00214841f, 0.00077747f, 0.00248968f,
+ 0.00302273f, 0.00250862f, 0.00093371f, 0.00107595f, 0.00147982f,
+ 0.00487295f, 0.01299436f, 0.0f, 0.0f, 0.0f }, { 0.00039119f,
+ 0.00029139f, 0.00021006f, 0.00016015f, 0.00010666f, 0.00020592f,
+ 0.00023815f, 0.00038786f, 0.00019097f, 0.00039549f, 0.00076736f,
+ 0.00028448f, 0.00016253f, 0.00085751f, 0.00015674f, 0.00026525f,
+ 0.00024961f, 0.00563625f, 0.0f, 0.0f }, { 0.00131840f,
+ 0.00099430f, 0.00074960f, 0.00066005f, 0.00036626f, 0.00070192f,
+ 0.00092548f, 0.00089301f, 0.00131038f, 0.00127857f, 0.00219713f,
+ 0.00100817f, 0.00054105f, 0.00368739f, 0.00047608f, 0.00102648f,
+ 0.00094759f, 0.00069226f, 0.00999315f, 0.0f }, { 0.00533241f,
+ 0.00169359f, 0.00136609f, 0.00127915f, 0.00119152f, 0.00132844f,
+ 0.00178697f, 0.00194579f, 0.00071553f, 0.01117956f, 0.00914460f,
+ 0.00210897f, 0.00197461f, 0.00256159f, 0.00135781f, 0.00241601f,
+ 0.00343452f, 0.00038538f, 0.00148001f, 0.02075171f } };
+
+#endif
--- /dev/null
+/////////////////////////////////////////////////////////////////
+// FileBuffer.h
+//
+// Buffered file reading.
+/////////////////////////////////////////////////////////////////
+
+#ifndef FILEBUFFER_H
+#define FILEBUFFER_H
+
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <string>
+
+using namespace std;
+
+// Capacity, in bytes, of the read-ahead buffer held by each FileBuffer.
+const int BufferSize = 1000;
+
+/////////////////////////////////////////////////////////////////
+// FileBuffer
+//
+// Class for buffering file reading. The file is pulled in chunks
+// of up to BufferSize bytes and handed out one character at a
+// time, with a single level of "unget".
+/////////////////////////////////////////////////////////////////
+
+class FileBuffer {
+    std::ifstream file;       // underlying input stream
+    char buffer[BufferSize];  // most recently read chunk
+    int currPos;              // index of the next unread character in buffer
+    int size;                 // number of valid characters in buffer
+    bool isEOF;               // set once a refill returns no characters
+    bool isValid;             // false if the open failed or after close()
+    bool canUnget;            // true only directly after a successful Get()
+
+public:
+
+    // Some common routines
+
+    // Opens filename for reading; fail() reports whether the open succeeded.
+    FileBuffer(const char *filename) :
+            file(filename), currPos(0), size(0), isEOF(false), isValid(
+                    !file.fail()), canUnget(false) {
+    }
+    ~FileBuffer() {
+        close();
+    }
+    // True if the file could not be opened or has been closed.
+    bool fail() const {
+        return !isValid;
+    }
+    // True if no further characters can be read.
+    bool eof() const {
+        return (!isValid || isEOF);
+    }
+    // Closes the stream; every later Get() returns false.
+    void close() {
+        file.close();
+        isValid = false;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // FileBuffer::Get()
+    //
+    // Retrieve a character from the file buffer. Returns true if
+    // and only if a character is read; ch is left untouched
+    // otherwise.
+    /////////////////////////////////////////////////////////////////
+
+    bool Get(char &ch) {
+
+        // check to make sure that there's more stuff in the file
+        if (!isValid || isEOF)
+            return false;
+
+        // if the buffer is exhausted, it's time to reload it
+        if (currPos == size) {
+            file.read(buffer, BufferSize);
+            // gcount() is at most BufferSize here, so it always fits an int
+            size = static_cast<int>(file.gcount());
+            isEOF = (size == 0);
+            currPos = 0;
+            if (isEOF)
+                return false;
+        }
+
+        // store the read character
+        ch = buffer[currPos++];
+        canUnget = true;
+        return true;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // FileBuffer::UnGet()
+    //
+    // Unretrieve the most recently read character from the file
+    // buffer. Note that this allows only a one-level undo: it must
+    // directly follow a successful Get() (checked by assertion).
+    /////////////////////////////////////////////////////////////////
+
+    void UnGet() {
+        assert(canUnget);
+        assert(isValid);
+        assert(currPos > 0);
+        currPos--;
+        assert(currPos < size);
+        isEOF = false;
+        canUnget = false;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // FileBuffer::GetLine()
+    //
+    // Retrieve characters of text until a newline character is
+    // encountered. The newline is consumed but not stored in s.
+    // Terminates properly on end-of-file condition.
+    /////////////////////////////////////////////////////////////////
+
+    void GetLine(std::string &s) {
+        char ch;
+        s = "";
+        while (Get(ch) && ch != '\n')
+            s += ch;
+    }
+
+};
+
+#endif
--- /dev/null
+/***********************************************
+ * # Copyright 2009-2010. Liu Yongchao
+ * # Contact: Liu Yongchao, School of Computer Engineering,
+ * # Nanyang Technological University.
+ * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
+ * #
+ * # GPL version 3.0 applies.
+ * #
+ * ************************************************/
+
+#include <string>
+#include <sstream>
+#include <iomanip>
+#include <iostream>
+#include <list>
+#include <set>
+#include <algorithm>
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <cerrno>
+#include <iomanip>
+#include "MSA.h"
+#include "MSAClusterTree.h"
+#include "Defaults.h"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+// Path of the parameter file given with -p/--paramfile; empty means
+// MSA::ReadParameters() falls back to the built-in defaults.
+string parametersInputFilename = "";
+// NOTE(review): not referenced in the code visible here -- presumably a
+// leftover from a training mode; confirm before relying on it.
+string parametersOutputFilename = "no training";
+// Output path for the annotation, set by -annot.
+string annotationFilename = "";
+
+// Command-line switches and repetition counts (set in MSA::ParseParams()).
+bool enableVerbose = false;
+bool enableAnnotation = false;
+bool enableClustalWOutput = false;
+bool enableAlignOrder = false;
+int numConsistencyReps = 2;
+int numPreTrainingReps = 0;
+int numIterativeRefinementReps = 100;
+
+// Value of -co/--cutoff; ParseParams() accepts values between 0 and 1.
+float cutoff = 0;
+
+// Model parameters, filled in by MSA::ReadParameters(). The emission
+// tables are indexed by unsigned character code (256 entries).
+VF initDistrib(NumMatrixTypes);
+VF gapOpen(2 * NumInsertStates);
+VF gapExtend(2 * NumInsertStates);
+VVF emitPairs(256, VF(256, 1e-10));
+VF emitSingle(256, 1e-5);
+
+// Residue alphabet currently in use.
+string alphabet = alphabetDefault;
+
+// Inclusive bounds enforced on the corresponding command-line options.
+const int MIN_PRETRAINING_REPS = 0;
+const int MAX_PRETRAINING_REPS = 20;
+const int MIN_CONSISTENCY_REPS = 0;
+const int MAX_CONSISTENCY_REPS = 5;
+const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
+const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
+
+string posteriorProbsFilename = "";
+bool allscores = true;
+string infilename;
+
+int flag_gui = 0; // 0: no GUI-related output
+// 1: GUI-related output is generated
+int flag_ppscore = 0; // 0: no posterior-probability score sequence added to the FASTA alignment output
+// 1: posterior-probability score sequence is added to the FASTA alignment output
+
+///////////////////////////////
+// global scoring matrix variables
+//////////////////////////////
+float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2;
+char *aminos, *bases, matrixtype[20] = "gonnet_160";
+int subst_index[26];
+
+double sub_matrix[26][26];
+double normalized_matrix[26][26];// added by YE Yongtao
+int firstread = 0; // ensures that the matrices are read only once
+
+float TEMPERATURE = 5;
+int MATRIXTYPE = 160;
+int prot_nuc = 0; // 0 = protein, 1 = nucleotide
+
+float GAPOPEN = 0;
+float GAPEXT = 0;
+// Requested thread count; values <= 0 make the MSA constructor use
+// omp_get_num_procs() when built with OpenMP.
+int numThreads = 0;
+
+//argument support
+// Argument record for the partition-function code. The single global
+// instance below is defined here; init_arguments() (declared extern in
+// this file, defined elsewhere) is called by the MSA constructor.
+// NOTE(review): field meanings beyond the inline comment are not visible
+// in this file -- confirm against the code that fills them in.
+typedef struct {
+ char input[30];
+ int matrix;
+ int N;
+ float T;
+ float beta;
+ char opt; //can be 'P' or 'M'
+ float gapopen;
+ float gapext;
+} argument_decl;
+
+argument_decl argument;
+
+// Routines defined in another translation unit. Of these, only
+// init_arguments() is called in the part of this file shown here.
+// The spelling "sustitution" is part of the external symbol name and must
+// match its definition, so it is left as-is.
+extern inline void read_sustitution_matrix(char *fileName);
+extern void setmatrixtype(int le);
+extern inline int matrixtype_to_int();
+extern inline void read_dna_matrix();
+extern inline void read_vtml_la_matrix();
+extern void init_arguments();
+
+// Runs the whole program: parses the command line, loads the model
+// parameters and the input sequences, aligns them and writes the result
+// to *alignOutFile in CLUSTALW or multi-FASTA format.
+MSA::MSA(int argc, char* argv[]) {
+    // The command line fills the option globals and yields the input files.
+    SafeVector<string> inputFiles = ParseParams(argc, argv);
+
+    // Set up the arguments used by the partition-function code.
+    init_arguments();
+
+    ReadParameters();
+
+    // Gather every input file into a single sequence collection.
+    MultiSequence *inputSeqs = new MultiSequence();
+    assert(inputSeqs);
+    const int fileCount = (int) inputFiles.size();
+    for (int f = 0; f < fileCount; ++f) {
+        cerr << "Loading sequence file: " << inputFiles[f] << endl;
+        inputSeqs->LoadMFA(inputFiles[f], true);
+    }
+
+    // One integer weight slot per loaded sequence.
+    this->seqsWeights = new int[inputSeqs->GetNumSequences()];
+
+#ifdef _OPENMP
+    // A non-positive request means "use one thread per processor".
+    if (numThreads <= 0)
+        numThreads = omp_get_num_procs();
+    omp_set_num_threads(numThreads);
+#endif
+
+    // Pick the model level first, then align with a freshly built model.
+    const int levelid = AdjustmentTest(inputSeqs,
+            ProbabilisticModel(initDistrib, gapOpen, gapExtend, emitPairs,
+                    emitSingle));
+    MultiSequence *alignment = doAlign(inputSeqs,
+            ProbabilisticModel(initDistrib, gapOpen, gapExtend, emitPairs,
+                    emitSingle), levelid);
+
+    // Emit the alignment in the requested format.
+    if (!enableClustalWOutput)
+        alignment->WriteMFA(*alignOutFile);
+    else
+        alignment->WriteALN(*alignOutFile);
+
+    // Release everything allocated above.
+    delete[] this->seqsWeights;
+    delete alignment;
+    delete inputSeqs;
+}
+// Closes the alignment output file, but only when an output file name
+// was recorded.
+MSA::~MSA() {
+    if (alignOutFileName.length() == 0)
+        return;
+    std::ofstream *outFile = (std::ofstream*) alignOutFile;
+    outFile->close();
+}
+/////////////////////////////////////////////////////////////////
+// PrintParameters()
+//
+// Prints MSAPROBS parameters to STDERR. If a filename is
+// specified, then the parameters are also written to the file.
+/////////////////////////////////////////////////////////////////
+
+// Echoes the transition parameters to STDERR under the given message.
+// When filename is non-null, also writes the transition parameters, the
+// alphabet, the lower triangle of emitPairs and emitSingle to that file;
+// exits with status 1 if the file cannot be opened for writing.
+void MSA::PrintParameters(const char *message, const VF &initDistrib,
+        const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs,
+        const VF &emitSingle, const char *filename) {
+
+    const int numTransitions = 2 * NumInsertStates;
+
+    // screen report
+    cerr << message << endl << " initDistrib[] = { ";
+    for (int k = 0; k < NumMatrixTypes; ++k)
+        cerr << setprecision(10) << initDistrib[k] << " ";
+    cerr << "}" << endl << " gapOpen[] = { ";
+    for (int k = 0; k < numTransitions; ++k)
+        cerr << setprecision(10) << gapOpen[k] << " ";
+    cerr << "}" << endl << " gapExtend[] = { ";
+    for (int k = 0; k < numTransitions; ++k)
+        cerr << setprecision(10) << gapExtend[k] << " ";
+    cerr << "}" << endl << endl;
+
+    // nothing more to do unless a parameter file was requested
+    if (!filename)
+        return;
+
+    FILE *out = fopen(filename, "w");
+    if (!out) {
+        cerr << "ERROR: Unable to write parameter file: " << filename
+                << endl;
+        exit(1);
+    }
+
+    // transition parameters, one line each
+    for (int k = 0; k < NumMatrixTypes; ++k)
+        fprintf(out, "%.10f ", initDistrib[k]);
+    fprintf(out, "\n");
+    for (int k = 0; k < numTransitions; ++k)
+        fprintf(out, "%.10f ", gapOpen[k]);
+    fprintf(out, "\n");
+    for (int k = 0; k < numTransitions; ++k)
+        fprintf(out, "%.10f ", gapExtend[k]);
+    fprintf(out, "\n");
+
+    // alphabet, then one lower-triangle row of pair emissions per letter
+    fprintf(out, "%s\n", alphabet.c_str());
+    const int numSymbols = (int) alphabet.size();
+    for (int row = 0; row < numSymbols; ++row) {
+        const unsigned char rowCh = (unsigned char) alphabet[row];
+        for (int col = 0; col <= row; ++col) {
+            const unsigned char colCh = (unsigned char) alphabet[col];
+            fprintf(out, "%.10f ", emitPairs[rowCh][colCh]);
+        }
+        fprintf(out, "\n");
+    }
+
+    // single-residue emissions on the final line
+    for (int s = 0; s < numSymbols; ++s)
+        fprintf(out, "%.10f ", emitSingle[(unsigned char) alphabet[s]]);
+    fprintf(out, "\n");
+    fclose(out);
+}
+
+/////////////////////////////////////////////////////////////////
+// doAlign()
+//
+// First computes all pairwise posterior probability matrices.
+// Then, computes new parameters if training, or a final
+// alignment, otherwise.
+/////////////////////////////////////////////////////////////////
+// Posterior computation provided by another translation unit; it takes the
+// two sequence indices and the two sequence strings.
+extern VF *ComputePostProbs(int a, int b, string seq1, string seq2);
+// Aligns all sequences and returns a newly allocated MultiSequence (the
+// MSA constructor deletes it).
+//
+// Steps, in order:
+//   1. for every pair a < b compute a posterior matrix, chosen by levelid,
+//      and from it a distance and a sparse matrix;
+//   2. build the guide tree from the distances;
+//   3. apply DoRelaxation() numConsistencyReps times;
+//   4. call ComputeFinalAlignment(), optionally WriteAnnotation();
+//   5. free the tree and the sparse matrices.
+//
+// sequences: input sequences, must be non-null (asserted).
+// model:     model used for the forward/backward/posterior computations.
+// levelid:   1 selects the branch calling the model with a trailing
+//            `false` argument, >= 2 selects ::ComputePostProbs(), any other
+//            value combines three posteriors.
+MultiSequence* MSA::doAlign(MultiSequence *sequences,
+ const ProbabilisticModel &model, int levelid) {
+ assert(sequences);
+
+ //get the number of sequences
+ const int numSeqs = sequences->GetNumSequences();
+ //create distance matrix
+ VVF distances(numSeqs, VF(numSeqs, 0));
+ //create sparseMatrices (numSeqs x numSeqs grid of owning pointers, all NULL)
+ SafeVector<SafeVector<SparseMatrix *> > sparseMatrices(numSeqs,
+ SafeVector<SparseMatrix *>(numSeqs, NULL));
+
+#ifdef _OPENMP
+ //calculate sequence pairs for openmp model: flatten all (a, b) with
+ //a < b into seqsPairs so a single loop can be parallelized
+ int pairIdx = 0;
+ numPairs = (numSeqs - 1) * numSeqs / 2;
+ seqsPairs = new SeqsPair[numPairs];
+ for(int a = 0; a < numSeqs; a++) {
+ for(int b = a + 1; b < numSeqs; b++) {
+ seqsPairs[pairIdx].seq1 = a;
+ seqsPairs[pairIdx].seq2 = b;
+ pairIdx++;
+ }
+ }
+#endif
+ // do all pairwise alignments for posterior probability matrices
+ // NOTE: the OpenMP build opens ONE loop brace here, the serial build opens
+ // TWO; the extra serial brace is closed under #ifndef _OPENMP below.
+#ifdef _OPENMP
+#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
+ for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
+ int a= seqsPairs[pairIdx].seq1;
+ int b = seqsPairs[pairIdx].seq2;
+ if(enableVerbose) {
+#pragma omp critical
+ cerr <<"tid "<<omp_get_thread_num()<<" a "<<a<<" b "<<b<<endl;
+ }
+#else
+ for (int a = 0; a < numSeqs - 1; a++) {
+ for (int b = a + 1; b < numSeqs; b++) {
+#endif
+ Sequence *seq1 = sequences->GetSequence(a);
+ Sequence *seq2 = sequences->GetSequence(b);
+
+ //posterior probability matrix (owned here, deleted at the end of the iteration)
+ VF* posterior;
+
+//low similarity: use local model
+ if(levelid == 1){
+
+ VF *forward = model.ComputeForwardMatrix(seq1, seq2,false);
+ assert(forward);
+ VF *backward = model.ComputeBackwardMatrix(seq1, seq2,false);
+ assert(backward);
+ posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false);
+ delete forward;
+ delete backward;
+
+ }
+//high similarity: use global model
+ else if(levelid >= 2) posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString());
+
+//extreme low or extreme high similarity: use combined model
+ else{
+
+//probcons
+ // compute forward and backward probabilities
+ VF *forward = model.ComputeForwardMatrix(seq1, seq2);
+ assert(forward);
+ VF *backward = model.ComputeBackwardMatrix(seq1, seq2);
+ assert(backward);
+ // compute posterior probability matrix from HMM
+ VF *probcons_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward);
+ assert(probcons_posterior);
+ delete forward;
+ delete backward;
+
+//probalign
+ VF *probalign_posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString());
+ assert(probalign_posterior);
+//local
+ forward = model.ComputeForwardMatrix(seq1, seq2,false);
+ assert(forward);
+ backward = model.ComputeBackwardMatrix(seq1, seq2,false);
+ assert(backward);
+ posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false);
+ assert(posterior);
+ delete forward;
+ delete backward;
+//combined model
+ //merge probalign + local + probcons: each cell becomes the root mean
+ //square of the three posteriors, written in place into `posterior`.
+ //The walk visits (len1 + 1) * (len2 + 1) cells of all three vectors.
+ VF::iterator ptr1 = probcons_posterior->begin();
+ VF::iterator ptr2 = probalign_posterior->begin();
+ VF::iterator ptr = posterior->begin();
+ for (int i = 0; i <= seq1->GetLength(); i++) {
+ for (int j = 0; j <= seq2->GetLength(); j++) {
+ float v1 = *ptr1;
+ float v2 = *ptr2;
+ float v3 = *ptr;
+ *ptr = sqrt((v1*v1 + v2*v2 + v3*v3)/3);
+ ptr1++;
+ ptr2++;
+ ptr++;
+ }
+ }
+ delete probcons_posterior;
+ delete probalign_posterior;
+ }
+
+ assert(posterior);
+ // perform the pairwise sequence alignment
+ pair<SafeVector<char> *, float> alignment = model.ComputeAlignment(
+ seq1->GetLength(), seq2->GetLength(), *posterior);
+
+ //compute expected accuracy: distance = 1 - score / shorter length
+ //NOTE(review): no guard for a zero-length sequence (division by zero) -- confirm inputs are never empty
+ distances[a][b] = distances[b][a] = 1.0f - alignment.second
+ / min(seq1->GetLength(), seq2->GetLength());
+
+ // compute sparse representations (only the a < b entry is populated)
+ sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),
+ seq2->GetLength(), *posterior);
+ sparseMatrices[b][a] = NULL;
+
+ delete posterior;
+ delete alignment.first;
+#ifndef _OPENMP
+ }
+#endif
+ }
+
+ //create the guide tree
+ this->tree = new MSAClusterTree(this, distances, numSeqs);
+ this->tree->create();
+
+ // perform the consistency transformation the desired number of times
+ // float weights are seqsWeights scaled by 10 / INT_MULTIPLY
+ // (presumably seqsWeights is filled in during tree creation -- TODO confirm)
+ float* fweights = new float[numSeqs];
+ for (int r = 0; r < numSeqs; r++) {
+ fweights[r] = ((float) seqsWeights[r]) / INT_MULTIPLY;
+ fweights[r] *= 10;
+ }
+ for (int r = 0; r < numConsistencyReps; r++) {
+ SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices =
+ DoRelaxation(fweights, sequences, sparseMatrices);
+
+ // now replace the old posterior matrices
+ for (int i = 0; i < numSeqs; i++) {
+ for (int j = 0; j < numSeqs; j++) {
+ delete sparseMatrices[i][j];
+ sparseMatrices[i][j] = newSparseMatrices[i][j];
+ }
+ }
+ }
+ delete[] fweights;
+#ifdef _OPENMP
+ delete [] seqsPairs;
+#endif
+
+ //compute the final multiple sequence alignment
+ MultiSequence *finalAlignment = ComputeFinalAlignment(this->tree, sequences,
+ sparseMatrices, model,levelid);
+
+ // build annotation
+ if (enableAnnotation) {
+ WriteAnnotation(finalAlignment, sparseMatrices);
+ }
+ //destroy the guide tree
+ delete this->tree;
+ this->tree = 0;
+
+ // delete sparse matrices (off-diagonal entries in both orientations)
+ for (int a = 0; a < numSeqs - 1; a++) {
+ for (int b = a + 1; b < numSeqs; b++) {
+ delete sparseMatrices[a][b];
+ delete sparseMatrices[b][a];
+ }
+ }
+
+ return finalAlignment;
+}
+
+/////////////////////////////////////////////////////////////////
+// GetInteger()
+//
+// Attempts to parse an integer from the character string given.
+// Returns true only if no parsing error occurs.
+/////////////////////////////////////////////////////////////////
+
+// Parses the whole of `data` as an integer. The base is auto-detected by
+// strtol: a "0x" prefix means hexadecimal, a leading "0" octal, anything
+// else decimal.
+//
+// data: NUL-terminated text to parse (must be non-null).
+// val:  receives the parsed value on success (must be non-null).
+//
+// Returns false, leaving *val untouched, when the text is empty or not a
+// number, when characters remain after the number (e.g. "12abc"), or when
+// the value does not fit in an int.
+bool GetInteger(char *data, int *val) {
+    assert(data);
+    assert(val);
+
+    char *endPtr = NULL;
+    errno = 0;
+    const long int parsed = strtol(data, &endPtr, 0);
+
+    // nothing was consumed, or trailing characters follow the number
+    if (endPtr == data || *endPtr != '\0')
+        return false;
+    // strtol reports out-of-range input through errno
+    if (errno != 0)
+        return false;
+    // long may be wider than int
+    if (parsed < (long) INT_MIN || parsed > (long) INT_MAX)
+        return false;
+
+    *val = (int) parsed;
+    return true;
+}
+
+/////////////////////////////////////////////////////////////////
+// GetFloat()
+//
+// Attempts to parse a float from the character string given.
+// Returns true only if no parsing error occurs.
+/////////////////////////////////////////////////////////////////
+
+// Parses the whole of `data` as a finite floating-point value.
+//
+// data: NUL-terminated text to parse (must be non-null).
+// val:  receives the parsed value on success (must be non-null).
+//
+// Returns false, leaving *val untouched, when the text is empty or not a
+// number, when characters remain after the number (e.g. "0.5x"), when
+// strtod reports a range error, or when the result is NaN, infinite or too
+// large for a float.
+bool GetFloat(char *data, float *val) {
+    assert(data);
+    assert(val);
+
+    // Just below FLT_MAX for IEEE-754 single precision; spelled out so the
+    // function needs no extra header.
+    const double maxFloat = 3.402823466e+38;
+
+    char *endPtr = NULL;
+    errno = 0;
+    const double parsed = strtod(data, &endPtr);
+
+    // nothing was consumed, or trailing characters follow the number
+    if (endPtr == data || *endPtr != '\0')
+        return false;
+    // strtod reports overflow/underflow through errno
+    if (errno != 0)
+        return false;
+    // NaN is the only value that compares unequal to itself; callers'
+    // range checks would otherwise let it through
+    if (parsed != parsed)
+        return false;
+    // rejects infinities and doubles that cannot be narrowed to float
+    if (parsed > maxFloat || parsed < -maxFloat)
+        return false;
+
+    *val = (float) parsed;
+    return true;
+}
+
+/////////////////////////////////////////////////////////////////
+// ReadParameters()
+//
+// Read initial distribution, transition, and emission
+// parameters from a file.
+/////////////////////////////////////////////////////////////////
+
+// Stores val at pairs[x][y] and pairs[y][x] for every combination of the
+// lower- and upper-case forms of the residue characters x and y.
+static void SetEmitPairAllCases(VVF &pairs, char x, char y, float val) {
+    // cast to unsigned char first: tolower/toupper are undefined for
+    // negative arguments other than EOF
+    const unsigned char xForms[2] = {
+            (unsigned char) tolower((unsigned char) x),
+            (unsigned char) toupper((unsigned char) x) };
+    const unsigned char yForms[2] = {
+            (unsigned char) tolower((unsigned char) y),
+            (unsigned char) toupper((unsigned char) y) };
+    for (int p = 0; p < 2; p++) {
+        for (int q = 0; q < 2; q++) {
+            pairs[xForms[p]][yForms[q]] = val;
+            pairs[yForms[q]][xForms[p]] = val;
+        }
+    }
+}
+
+// Stores val in single[] under both the lower- and upper-case form of x.
+static void SetEmitSingleBothCases(VF &single, char x, float val) {
+    single[(unsigned char) tolower((unsigned char) x)] = val;
+    single[(unsigned char) toupper((unsigned char) x)] = val;
+}
+
+// Reads `count` whitespace-separated floats from `line` into
+// dest[0..count-1]. Returns false if the line holds fewer valid numbers.
+static bool ParseFloatLine(const string &line, VF &dest, int count) {
+    istringstream in(line);
+    for (int i = 0; i < count; i++) {
+        if (!(in >> dest[i]))
+            return false;
+    }
+    return true;
+}
+
+// Fills initDistrib, gapOpen, gapExtend, alphabet, emitPairs and
+// emitSingle.
+//
+// With no parameter file (parametersInputFilename is empty) the built-in
+// defaults from Defaults.h are used; they exist only for one or two pairs
+// of insert states. Otherwise the file is read in the layout written by
+// PrintParameters(): three lines of transition parameters, one alphabet
+// line, the lower triangle of the pair emissions, then the single
+// emissions.
+//
+// Every emission is stored under both letter cases, and pair emissions are
+// stored symmetrically. Any missing or malformed value terminates the
+// program with exit(1) instead of silently continuing with zeros.
+void MSA::ReadParameters() {
+
+    // start from small background values for all 256 character codes
+    emitPairs = VVF(256, VF(256, 1e-10));
+    emitSingle = VF(256, 1e-5);
+
+    const int numTransitions = 2 * NumInsertStates;
+
+    // ---- built-in defaults ----
+    if (parametersInputFilename == string("")) {
+        const float *initDefault = NULL;
+        const float *openDefault = NULL;
+        const float *extendDefault = NULL;
+        if (NumInsertStates == 1) {
+            initDefault = initDistrib1Default;
+            openDefault = gapOpen1Default;
+            extendDefault = gapExtend1Default;
+        } else if (NumInsertStates == 2) {
+            initDefault = initDistrib2Default;
+            openDefault = gapOpen2Default;
+            extendDefault = gapExtend2Default;
+        } else {
+            cerr
+                    << "ERROR: No default initial distribution/parameter settings exist"
+                    << endl << " for " << NumInsertStates
+                    << " pairs of insert states. Use --paramfile." << endl;
+            exit(1);
+        }
+
+        for (int i = 0; i < NumMatrixTypes; i++)
+            initDistrib[i] = initDefault[i];
+        for (int i = 0; i < numTransitions; i++) {
+            gapOpen[i] = openDefault[i];
+            gapExtend[i] = extendDefault[i];
+        }
+
+        alphabet = alphabetDefault;
+        for (int i = 0; i < (int) alphabet.length(); i++) {
+            SetEmitSingleBothCases(emitSingle, alphabet[i],
+                    emitSingleDefault[i]);
+            for (int j = 0; j <= i; j++)
+                SetEmitPairAllCases(emitPairs, alphabet[i], alphabet[j],
+                        emitPairsDefault[i][j]);
+        }
+        return;
+    }
+
+    // ---- parameter file ----
+    ifstream data(parametersInputFilename.c_str());
+    if (data.fail()) {
+        cerr << "ERROR: Unable to read parameter file: "
+                << parametersInputFilename << endl;
+        exit(1);
+    }
+
+    // three lines: initial distribution, gap open, gap extend
+    string line[3];
+    bool transitionsOk = true;
+    for (int i = 0; i < 3 && transitionsOk; i++)
+        transitionsOk = !!getline(data, line[i]);
+    transitionsOk = transitionsOk
+            && ParseFloatLine(line[0], initDistrib, NumMatrixTypes)
+            && ParseFloatLine(line[1], gapOpen, numTransitions)
+            && ParseFloatLine(line[2], gapExtend, numTransitions);
+    if (!transitionsOk) {
+        cerr
+                << "ERROR: Unable to read transition parameters from parameter file: "
+                << parametersInputFilename << endl;
+        exit(1);
+    }
+
+    string alphabetLine;
+    if (!getline(data, alphabetLine)) {
+        cerr << "ERROR: Unable to read alphabet from scoring matrix file: "
+                << parametersInputFilename << endl;
+        exit(1);
+    }
+
+    // read alphabet as concatenation of all characters on alphabet line
+    alphabet = "";
+    istringstream alphabetStream(alphabetLine);
+    string token;
+    while (alphabetStream >> token)
+        alphabet += token;
+
+    // lower triangle of the pair emissions, then the single emissions
+    bool emissionsOk = true;
+    for (int i = 0; i < (int) alphabet.size() && emissionsOk; i++) {
+        for (int j = 0; j <= i; j++) {
+            float val;
+            if (!(data >> val)) {
+                emissionsOk = false;
+                break;
+            }
+            SetEmitPairAllCases(emitPairs, alphabet[i], alphabet[j], val);
+        }
+    }
+    for (int i = 0; i < (int) alphabet.size() && emissionsOk; i++) {
+        float val;
+        if (!(data >> val)) {
+            emissionsOk = false;
+            break;
+        }
+        SetEmitSingleBothCases(emitSingle, alphabet[i], val);
+    }
+    if (!emissionsOk) {
+        cerr
+                << "ERROR: Unable to read emission parameters from parameter file: "
+                << parametersInputFilename << endl;
+        exit(1);
+    }
+    data.close();
+}
+
+/////////////////////////////////////////////////////////////////
+// printUsage() / ParseParams()
+//
+// printUsage() writes the command-line help text to STDERR;
+// ParseParams() parses all command-line options.
+/////////////////////////////////////////////////////////////////
+void MSA::printUsage() {
+ // The whole help text is one chained write to STDERR. The repetition
+ // limits and defaults shown are read from the MIN_*/MAX_* constants and
+ // the current values of the option globals at the time of the call.
+ // NOTE(review): -p/--paramfile and -co/--cutoff are accepted by
+ // ParseParams() but are not listed in this text.
+ cerr
+ << "************************************************************************"
+ << endl
+ << "\tMSAPROBS is a open-source protein multiple sequence alignment algorithm"
+ << endl
+ << "\tbased on pair hidden markov model and partition function postirior"
+ << endl
+ << "\tprobabilities. If any comments or problems, please contact"
+ << endl
+ << "\tLiu Yongchao(liuy0039@ntu.edu.sg or nkcslyc@hotmail.com)"
+ << endl
+ << "*************************************************************************"
+ << endl << "Usage:" << endl
+ << " msaprobs [OPTION]... [infile]..." << endl << endl
+ << "Description:" << endl
+ << " Align sequences in multi-FASTA format" << endl << endl
+ << " -o, --outfile <string>" << endl
+ << " specify the output file name (STDOUT by default)"
+ << endl << " -num_threads <integer>" << endl
+ << " specify the number of threads used, and otherwise detect automatically"
+ << endl << " -clustalw" << endl
+ << " use CLUSTALW output format instead of FASTA format"
+ << endl << endl << " -c, --consistency REPS" << endl
+ << " use " << MIN_CONSISTENCY_REPS << " <= REPS <= "
+ << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps
+ << ") passes of consistency transformation" << endl << endl
+ << " -ir, --iterative-refinement REPS" << endl
+ << " use " << MIN_ITERATIVE_REFINEMENT_REPS
+ << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS << " (default: "
+ << numIterativeRefinementReps << ") passes of iterative-refinement"
+ << endl << endl << " -v, --verbose" << endl
+ << " report progress while aligning (default: "
+ << (enableVerbose ? "on" : "off") << ")" << endl << endl
+ << " -annot FILENAME" << endl
+ << " write annotation for multiple alignment to FILENAME"
+ << endl << endl << " -a, --alignment-order" << endl
+ << " print sequences in alignment order rather than input order (default: "
+ << (enableAlignOrder ? "on" : "off") << ")" << endl
+ << " -version " << endl
+ << " print out version of MSAPROBS " << endl << endl;
+}
+SafeVector<string> MSA::ParseParams(int argc, char **argv) {
+ if (argc < 2) {
+ printUsage();
+ exit(1);
+ }
+ SafeVector<string> sequenceNames;
+ int tempInt;
+ float tempFloat;
+
+ for (int i = 1; i < argc; i++) {
+ if (argv[i][0] == '-') {
+ //help
+ if (!strcmp(argv[i], "-help") || !strcmp(argv[i], "-?")) {
+ printUsage();
+ exit(1);
+ //output file name
+ } else if (!strcmp(argv[i], "-o")
+ || !strcmp(argv[i], "--outfile")) {
+ if (i < argc - 1) {
+ alignOutFileName = argv[++i]; //get the file name
+ } else {
+ cerr << "ERROR: String expected for option " << argv[i]
+ << endl;
+ exit(1);
+ }
+ // parameter file
+ } else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){
+ if (i < argc - 1)
+ parametersInputFilename = string (argv[++i]);
+ else {
+ cerr << "ERROR: Filename expected for option " << argv[i] << endl;
+ exit (1);
+ }
+ //number of threads used
+ } else if (!strcmp(argv[i], "-p")
+ || !strcmp(argv[i], "-num_threads")) {
+ if (i < argc - 1) {
+ if (!GetInteger(argv[++i], &tempInt)) {
+ cerr << " ERROR: invalid integer following option "
+ << argv[i - 1] << ": " << argv[i] << endl;
+ exit(1);
+ } else {
+ if (tempInt < 0) {
+ tempInt = 0;
+ }
+ numThreads = tempInt;
+ }
+ } else {
+ cerr << "ERROR: Integer expected for option " << argv[i]
+ << endl;
+ exit(1);
+ }
+ // number of consistency transformations
+ } else if (!strcmp(argv[i], "-c")
+ || !strcmp(argv[i], "--consistency")) {
+ if (i < argc - 1) {
+ if (!GetInteger(argv[++i], &tempInt)) {
+ cerr << "ERROR: Invalid integer following option "
+ << argv[i - 1] << ": " << argv[i] << endl;
+ exit(1);
+ } else {
+ if (tempInt < MIN_CONSISTENCY_REPS
+ || tempInt > MAX_CONSISTENCY_REPS) {
+ cerr << "ERROR: For option " << argv[i - 1]
+ << ", integer must be between "
+ << MIN_CONSISTENCY_REPS << " and "
+ << MAX_CONSISTENCY_REPS << "." << endl;
+ exit(1);
+ } else {
+ numConsistencyReps = tempInt;
+ }
+ }
+ } else {
+ cerr << "ERROR: Integer expected for option " << argv[i]
+ << endl;
+ exit(1);
+ }
+ }
+
+ // number of randomized partitioning iterative refinement passes
+ else if (!strcmp(argv[i], "-ir")
+ || !strcmp(argv[i], "--iterative-refinement")) {
+ if (i < argc - 1) {
+ if (!GetInteger(argv[++i], &tempInt)) {
+ cerr << "ERROR: Invalid integer following option "
+ << argv[i - 1] << ": " << argv[i] << endl;
+ exit(1);
+ } else {
+ if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS
+ || tempInt > MAX_ITERATIVE_REFINEMENT_REPS) {
+ cerr << "ERROR: For option " << argv[i - 1]
+ << ", integer must be between "
+ << MIN_ITERATIVE_REFINEMENT_REPS << " and "
+ << MAX_ITERATIVE_REFINEMENT_REPS << "."
+ << endl;
+ exit(1);
+ } else
+ numIterativeRefinementReps = tempInt;
+ }
+ } else {
+ cerr << "ERROR: Integer expected for option " << argv[i]
+ << endl;
+ exit(1);
+ }
+ }
+
+ // annotation files
+ else if (!strcmp(argv[i], "-annot")) {
+ enableAnnotation = true;
+ if (i < argc - 1) {
+ annotationFilename = argv[++i];
+ } else {
+ cerr << "ERROR: FILENAME expected for option " << argv[i]
+ << endl;
+ exit(1);
+ }
+ }
+
+ // clustalw output format
+ else if (!strcmp(argv[i], "-clustalw")) {
+ enableClustalWOutput = true;
+ }
+
+ // cutoff
+ else if (!strcmp(argv[i], "-co") || !strcmp(argv[i], "--cutoff")) {
+ if (i < argc - 1) {
+ if (!GetFloat(argv[++i], &tempFloat)) {
+ cerr
+ << "ERROR: Invalid floating-point value following option "
+ << argv[i - 1] << ": " << argv[i] << endl;
+ exit(1);
+ } else {
+ if (tempFloat < 0 || tempFloat > 1) {
+ cerr << "ERROR: For option " << argv[i - 1]
+ << ", floating-point value must be between 0 and 1."
+ << endl;
+ exit(1);
+ } else
+ cutoff = tempFloat;
+ }
+ } else {
+ cerr << "ERROR: Floating-point value expected for option "
+ << argv[i] << endl;
+ exit(1);
+ }
+ }
+
+ // verbose reporting
+ else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--verbose")) {
+ enableVerbose = true;
+ }
+
+ // alignment order
+ else if (!strcmp(argv[i], "-a")
+ || !strcmp(argv[i], "--alignment-order")) {
+ enableAlignOrder = true;
+ }
+
+ //print out version
+ else if (!strcmp(argv[i], "-version")) {
+ cerr << "MSAPROBS version " << VERSION << endl;
+ exit(1);
+ }
+ // bad arguments
+ else {
+ cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
+ exit(1);
+ }
+ } else {
+ sequenceNames.push_back(string(argv[i]));
+ }
+ }
+
+ /*check the output file name*/
+ cerr << "-------------------------------------" << endl;
+ if (alignOutFileName.length() == 0) {
+ cerr << "The final alignments will be printed out to STDOUT" << endl;
+ alignOutFile = &std::cout;
+ } else {
+ cerr << "Open the output file " << alignOutFileName << endl;
+ alignOutFile = new ofstream(alignOutFileName.c_str(),
+ ios::binary | ios::out | ios::trunc);
+ }
+ cerr << "-------------------------------------" << endl;
+ return sequenceNames;
+}
+
+/////////////////////////////////////////////////////////////////
+// ProcessTree()
+//
+// Recursively walks the guide tree rooted at "tree" and returns a
+// newly allocated MultiSequence for that subtree. A leaf yields a
+// container holding the result of Clone() on the input sequence
+// at index tree->idx; an internal node merges the results of its
+// two children with AlignAlignments() and then frees the two
+// intermediate results. The caller owns the returned object.
+/////////////////////////////////////////////////////////////////
+MultiSequence* MSA::ProcessTree(TreeNode *tree, MultiSequence *sequences,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model) {
+
+    // leaf of the alignment tree: there is nothing to align yet
+    //(previously tested as tree->GetSequenceLabel() != -1)
+    if (tree->leaf != NODE) {
+        MultiSequence *single = new MultiSequence();
+        assert(single);
+        single->AddSequence(sequences->GetSequence(tree->idx)->Clone());
+        return single;
+    }
+
+    // internal node: resolve the left subtree first, then the right one
+    MultiSequence *leftAlignment = ProcessTree(tree->left, sequences,
+            sparseMatrices, model);
+    MultiSequence *rightAlignment = ProcessTree(tree->right, sequences,
+            sparseMatrices, model);
+    assert(leftAlignment);
+    assert(rightAlignment);
+
+    MultiSequence *merged = AlignAlignments(leftAlignment, rightAlignment,
+            sparseMatrices, model);
+    assert(merged);
+
+    // the partial alignments are not needed once they have been merged
+    delete leftAlignment;
+    delete rightAlignment;
+
+    return merged;
+}
+
+/////////////////////////////////////////////////////////////////
+// ComputeFinalAlignment()
+//
+// Builds the progressive alignment for the whole guide tree with
+// ProcessTree() and then runs numIterativeRefinementReps rounds of
+// DoIterativeRefinement() on it. The caller owns the returned
+// alignment.
+//
+// When enableAlignOrder is set, the sort labels of the progressive
+// alignment are remembered, SaveOrdering() is called, the global
+// flag is cleared, and the remembered labels are written back once
+// refinement has finished.
+//
+// "levelid" is only referenced by the disabled experiments kept
+// below; the active code does not read it.
+/////////////////////////////////////////////////////////////////
+
+MultiSequence* MSA::ComputeFinalAlignment(MSAGuideTree*tree,
+        MultiSequence *sequences,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model, int levelid) {
+    MultiSequence *alignment = ProcessTree(tree->getRoot(), sequences,
+            sparseMatrices, model);
+    const int numSeqs = alignment->GetNumSequences();
+
+    // remember the current sort labels if alignment order was requested
+    SafeVector<int> savedLabels;
+    if (enableAlignOrder) {
+        for (int s = 0; s < numSeqs; ++s) {
+            Sequence *seq = alignment->GetSequence(s);
+            savedLabels.push_back(seq->GetSortLabel());
+        }
+        alignment->SaveOrdering();
+        enableAlignOrder = false;
+    }
+
+    // tree-based refinement (disabled)
+    // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree);
+    /*
+     int numSeqs = alignment->GetNumSequences();
+     //if(numSeqs < numIterativeRefinementReps){
+     for(int iter = 0; iter < 5; iter ++){
+     for(int i = 0; i < numSeqs - 1; i++){
+     DoIterativeRefinementTreeNode(sparseMatrices, model, alignment, i);
+     }
+     }
+     //}*/
+/*
+    //DoIterativeRefinement() return 1,2: this refinement unsuccessful
+    if(levelid == 3) numIterativeRefinementReps=10;
+    int ineffectiveness = 0;
+    for (int i = 0; i < numIterativeRefinementReps; i++){
+        int flag = DoIterativeRefinement(sparseMatrices, model, alignment);
+        if(numSeqs > 35 && levelid < 3){
+            if(flag > 0){
+                if(numIterativeRefinementReps < 10*numSeqs)
+                    numIterativeRefinementReps ++;
+                if(flag == 1) ineffectiveness ++;
+            }
+            //else ineffectiveness = 0;
+            if(ineffectiveness > numSeqs && i >100 ) break;
+        }
+    }
+*/
+
+    // randomized iterative refinement; the return code of each round is ignored
+    //if(levelid == 3) numIterativeRefinementReps=10;
+    int pass = 0;
+    while (pass < numIterativeRefinementReps) {
+        DoIterativeRefinement(sparseMatrices, model, alignment);
+        ++pass;
+    }
+
+    cerr << endl;
+
+    // write the remembered labels back (no-op when nothing was saved)
+    const int numSaved = (int) savedLabels.size();
+    for (int s = 0; s < numSaved; ++s)
+        alignment->GetSequence(s)->SetSortLabel(savedLabels[s]);
+
+    // return final alignment
+    return alignment;
+}
+
+/////////////////////////////////////////////////////////////////
+// AlignAlignments()
+//
+// Aligns two existing alignments against each other and returns
+// the combined alignment as a newly allocated MultiSequence owned
+// by the caller. The inputs are not modified or freed.
+//
+// A posterior matrix is built for the two groups, an alignment
+// path is computed from it, and every sequence of both groups is
+// re-gapped along that path ('X' for align1, 'Y' for align2).
+// Unless enableAlignOrder is set, the result is sorted by label.
+/////////////////////////////////////////////////////////////////
+
+MultiSequence* MSA::AlignAlignments(MultiSequence *align1,
+        MultiSequence *align2,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model) {
+
+    const int count1 = align1->GetNumSequences();
+    const int count2 = align2->GetNumSequences();
+
+    // verbose mode: list the labels of both groups, e.g. "[0,2] vs. [1]: "
+    if (enableVerbose) {
+        for (int s = 0; s < count1; ++s) {
+            if (s == 0)
+                cerr << "[";
+            else
+                cerr << ",";
+            cerr << align1->GetSequence(s)->GetLabel();
+        }
+        cerr << "] vs. ";
+        for (int s = 0; s < count2; ++s) {
+            if (s == 0)
+                cerr << "[";
+            else
+                cerr << ",";
+            cerr << align2->GetSequence(s)->GetLabel();
+        }
+        cerr << "]: ";
+    }
+
+    // posterior matrix for the two groups (the weighted variant is active)
+#if 0
+    VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
+#else
+    VF *posterior = model.BuildPosterior(getSeqsWeights(), align1, align2,
+            sparseMatrices, cutoff);
+#endif
+
+    // alignment path (first) and its score (second)
+    const int length1 = align1->GetSequence(0)->GetLength();
+    const int length2 = align2->GetSequence(0)->GetLength();
+    pair<SafeVector<char> *, float> alignment = model.ComputeAlignment(
+            length1, length2, *posterior);
+
+    delete posterior;
+
+    // verbose mode: report the score normalised by the summed pairwise
+    // minimum lengths as an "accuracy" measure
+    if (enableVerbose) {
+        int totLength = 0;
+        for (int a = 0; a < count1; ++a) {
+            const int lengthA = align1->GetSequence(a)->GetLength();
+            for (int b = 0; b < count2; ++b)
+                totLength += min(lengthA,
+                        align2->GetSequence(b)->GetLength());
+        }
+        cerr << alignment.second / totLength << endl;
+    }
+
+    // re-gap every sequence along the computed path
+    MultiSequence *combined = new MultiSequence();
+    for (int s = 0; s < count1; ++s) {
+        Sequence *member = align1->GetSequence(s);
+        combined->AddSequence(member->AddGaps(alignment.first, 'X'));
+    }
+    for (int s = 0; s < count2; ++s) {
+        Sequence *member = align2->GetSequence(s);
+        combined->AddSequence(member->AddGaps(alignment.first, 'Y'));
+    }
+    if (!enableAlignOrder)
+        combined->SortByLabel();
+
+    // the path itself is temporary
+    delete alignment.first;
+
+    return combined;
+}
+
+/////////////////////////////////////////////////////////////////
+// DoRelaxation()
+//
+// Performs one round of the probabilistic consistency
+// transformation and returns the new set of sparse posterior
+// matrices; only entries [i][j] with i < j are filled, all other
+// entries are NULL. The input matrices are read but not freed.
+//
+// For every pair (i, j) the original posterior is doubled (the
+// z = x and z = y terms), the contribution of every third sequence
+// is added through Relax()/Relax1(), the sum is divided by the
+// number of sequences, and finally every cell that was not present
+// in the original sparse matrix is reset to zero.
+//
+// The sequence weights are still multiplied together and handed to
+// Relax()/Relax1(), but the weighted normalisation that used to
+// consume them is switched off.
+/////////////////////////////////////////////////////////////////
+
+SafeVector<SafeVector<SparseMatrix *> > MSA::DoRelaxation(float* seqsWeights,
+        MultiSequence *sequences,
+        SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
+    const int numSeqs = sequences->GetNumSequences();
+
+    SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices(numSeqs,
+            SafeVector<SparseMatrix *>(numSeqs, NULL));
+
+    // for every pair of sequences: the OpenMP build walks the flat pair
+    // list, the serial build uses two nested loops
+#ifdef _OPENMP
+    int pairIdx;
+#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
+    for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
+        int i = seqsPairs[pairIdx].seq1;
+        int j = seqsPairs[pairIdx].seq2;
+        float wi = seqsWeights[i];
+        float wj = seqsWeights[j];
+#else
+    for (int i = 0; i < numSeqs; i++) {
+        float wi = seqsWeights[i];
+        for (int j = i + 1; j < numSeqs; j++) {
+            float wj = seqsWeights[j];
+#endif
+            Sequence *seqX = sequences->GetSequence(i);
+            Sequence *seqY = sequences->GetSequence(j);
+
+            if (enableVerbose) {
+#ifdef _OPENMP
+#pragma omp critical
+#endif
+                cerr << "Relaxing (" << i + 1 << ") " << seqX->GetHeader()
+                        << " vs. " << "(" << j + 1 << ") " << seqY->GetHeader()
+                        << ": ";
+            }
+
+            // dense copy of the original posterior matrix for this pair
+            SparseMatrix *matXY = sparseMatrices[i][j];
+            VF *posteriorPtr = matXY->GetPosterior();
+            assert(posteriorPtr);
+            VF &posterior = *posteriorPtr;
+
+            const int seq1Length = seqX->GetLength();
+            const int seq2Length = seqY->GetLength();
+            const int numCells = (seq1Length + 1) * (seq2Length + 1);
+
+            // contribution from the summation where z = x and z = y
+            for (int cell = 0; cell < numCells; ++cell)
+                posterior[cell] += posterior[cell];
+
+            if (enableVerbose)
+                cerr << matXY->GetNumCells() << " --> ";
+
+            // contribution from all other sequences; which matrices are
+            // combined depends on where k sits relative to i < j
+            for (int k = 0; k < numSeqs; ++k) {
+                if (k == i || k == j)
+                    continue;
+
+                const float w = wi * wj * seqsWeights[k];
+                if (k < i) {
+                    Relax1(w, sparseMatrices[k][i], sparseMatrices[k][j],
+                            posterior);
+                } else if (k < j) {
+                    Relax(w, sparseMatrices[i][k], sparseMatrices[k][j],
+                            posterior);
+                } else {
+                    // k > j: only [j][k] is stored, so transpose it first
+                    SparseMatrix *matKJ =
+                            sparseMatrices[j][k]->ComputeTranspose();
+                    Relax(w, sparseMatrices[i][k], matKJ, posterior);
+                    delete matKJ;
+                }
+            }
+
+            // normalise by the number of sequences
+            for (int cell = 0; cell < numCells; ++cell)
+                posterior[cell] /= numSeqs;
+
+            // mask out positions not originally in the posterior matrix:
+            // row 0 entirely, then every gap between stored columns
+            for (int y = 0; y <= seq2Length; ++y)
+                posterior[y] = 0;
+            for (int x = 1; x <= seq1Length; ++x) {
+                SafeVector<PIF>::iterator stored = matXY->GetRowPtr(x);
+                SafeVector<PIF>::iterator storedEnd = stored
+                        + matXY->GetRowSize(x);
+                VF::iterator row = posterior.begin() + x * (seq2Length + 1);
+                int col = 0;
+                for (; stored != storedEnd; ++stored) {
+                    // clear everything before the next stored column ...
+                    for (; col < stored->first; ++col)
+                        row[col] = 0;
+                    // ... and step over the stored column itself
+                    ++col;
+                }
+                // clear the cells after the last stored column
+                for (; col <= seq2Length; ++col)
+                    row[col] = 0;
+            }
+
+            // save the new posterior matrix
+            newSparseMatrices[i][j] = new SparseMatrix(seq1Length,
+                    seq2Length, posterior);
+            newSparseMatrices[j][i] = NULL;
+
+            if (enableVerbose)
+                cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
+
+            delete posteriorPtr;
+
+            if (enableVerbose)
+                cerr << "done." << endl;
+#ifndef _OPENMP
+        }
+#endif
+    }
+
+    return newSparseMatrices;
+}
+
+/////////////////////////////////////////////////////////////////
+// Relax()
+//
+// Adds the consistency contribution of one intermediate sequence z
+// to "posterior": for every stored cell (i, k) of matXZ and every
+// stored cell (k, j) of matZY, posterior(i, j) grows by the
+// product of the two values. "posterior" is addressed as a dense
+// row-major array with lengthY + 1 columns.
+//
+// "weight" is accepted but not applied at present (see the
+// commented-out line in the innermost loop).
+/////////////////////////////////////////////////////////////////
+
+void MSA::Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY,
+        VF &posterior) {
+
+    assert(matXZ);
+    assert(matZY);
+
+    const int lengthX = matXZ->GetSeq1Length();
+    const int lengthY = matZY->GetSeq2Length();
+    assert(matXZ->GetSeq2Length() == matZY->GetSeq1Length());
+
+    // for every x[i]
+    for (int i = 1; i <= lengthX; ++i) {
+        SafeVector<PIF>::iterator xz = matXZ->GetRowPtr(i);
+        const SafeVector<PIF>::iterator xzEnd = xz + matXZ->GetRowSize(i);
+
+        VF::iterator row = posterior.begin() + i * (lengthY + 1);
+
+        // for every stored x[i]-z[k]
+        for (; xz != xzEnd; ++xz) {
+            SafeVector<PIF>::iterator zy = matZY->GetRowPtr(xz->first);
+            const SafeVector<PIF>::iterator zyEnd = zy
+                    + matZY->GetRowSize(xz->first);
+            const float probXZ = xz->second;
+
+            // for every stored z[k]-y[j]
+            for (; zy != zyEnd; ++zy) {
+                //row[zy->first] += weight * probXZ * zy->second;
+                row[zy->first] += probXZ * zy->second;
+            }
+        }
+    }
+}
+
+/////////////////////////////////////////////////////////////////
+// Relax1()
+//
+// Same accumulation as Relax(), for the case where both input
+// matrices have the intermediate sequence z as their first
+// sequence: for every stored cell (k, i) of matZX and every stored
+// cell (k, j) of matZY, posterior(i, j) grows by the product of
+// the two values. "posterior" is addressed as a dense row-major
+// array with lengthY + 1 columns.
+//
+// "weight" is accepted but not applied at present (see the
+// commented-out line in the innermost loop).
+/////////////////////////////////////////////////////////////////
+
+void MSA::Relax1(float weight, SparseMatrix *matZX, SparseMatrix *matZY,
+        VF &posterior) {
+
+    assert(matZX);
+    assert(matZY);
+
+    const int lengthZ = matZX->GetSeq1Length();
+    const int rowStride = matZY->GetSeq2Length() + 1;
+
+    // for every z[k]
+    for (int k = 1; k <= lengthZ; ++k) {
+        SafeVector<PIF>::iterator zx = matZX->GetRowPtr(k);
+        const SafeVector<PIF>::iterator zxEnd = zx + matZX->GetRowSize(k);
+
+        // for every stored z[k]-x[i]
+        for (; zx != zxEnd; ++zx) {
+            SafeVector<PIF>::iterator zy = matZY->GetRowPtr(k);
+            const SafeVector<PIF>::iterator zyEnd = zy
+                    + matZY->GetRowSize(k);
+            const float probZX = zx->second;
+            VF::iterator row = posterior.begin() + zx->first * rowStride;
+
+            // for every stored z[k]-y[j]
+            for (; zy != zyEnd; ++zy) {
+                //row[zy->first] += weight * probZX * zy->second;
+                row[zy->first] += probZX * zy->second;
+            }
+        }
+    }
+}
+/////////////////////////////////////////////////////////////////
+// DoIterativeRefinement()
+//
+// Performs a single round of randomized partitioning iterative
+// refinement: the sequences of "alignment" are split at random
+// into two groups, the groups are realigned against each other,
+// and "alignment" is replaced by the result (the old object is
+// deleted).
+//
+// Returns
+//   0: the score of the new alignment differs from the score the
+//      old alignment had under the same posterior matrix
+//   1: both scores compare equal (ineffective refinement)
+//   2: the random split left one group empty; nothing was changed
+/////////////////////////////////////////////////////////////////
+int MSA::DoIterativeRefinement(
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model, MultiSequence* &alignment) {
+    const int numSeqs = alignment->GetNumSequences();
+
+    // create two separate groups: one rand() draw per sequence, odd -> group one
+    set<int> groupOne, groupTwo;
+    for (int s = 0; s < numSeqs; ++s) {
+        set<int> &target = (rand() % 2) ? groupOne : groupTwo;
+        target.insert(s);
+    }
+    if (groupOne.empty() || groupTwo.empty())
+        return 2;
+
+    // project into the two groups
+    MultiSequence *groupOneSeqs = alignment->Project(groupOne);
+    assert(groupOneSeqs);
+    MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
+    assert(groupTwoSeqs);
+
+//start add by Yongtao
+#if 1
+    VF *posterior = model.BuildPosterior (groupOneSeqs, groupTwoSeqs, sparseMatrices, cutoff);
+#else
+    VF *posterior = model.BuildPosterior(getSeqsWeights(), groupOneSeqs, groupTwoSeqs,
+            sparseMatrices, cutoff);
+#endif
+
+    // compute an "accuracy" measure for the alignment before refinement:
+    // collect the data pointers of the old rows of each group ...
+    const int sizeOne = (int) groupOne.size();
+    const int sizeTwo = (int) groupTwo.size();
+    SafeVector<SafeVector<char>::iterator> oldOnePtrs(groupOne.size());
+    SafeVector<SafeVector<char>::iterator> oldTwoPtrs(groupTwo.size());
+    int slot = 0;
+    for (set<int>::const_iterator it = groupOne.begin();
+            it != groupOne.end(); ++it)
+        oldOnePtrs[slot++] = alignment->GetSequence(*it)->GetDataPtr();
+    slot = 0;
+    for (set<int>::const_iterator it = groupTwo.begin();
+            it != groupTwo.end(); ++it)
+        oldTwoPtrs[slot++] = alignment->GetSequence(*it)->GetDataPtr();
+
+    // ... then walk the old columns, tracking the position of each column
+    // inside the two projected groups, and sum the posterior of every
+    // column in which both groups have at least one non-gap character
+    VF &posteriorArr = *posterior;
+    const int oldLength = alignment->GetSequence(0)->GetLength();
+    const int rowStride = groupTwoSeqs->GetSequence(0)->GetLength() + 1;
+    int posInOne = 0;
+    int posInTwo = 0;
+    float accuracy_before = 0;
+    for (int col = 1; col <= oldLength; ++col) {
+        bool oneHasResidue = false;
+        for (int s = 0; s < sizeOne; ++s) {
+            if (oldOnePtrs[s][col] != '-') {
+                oneHasResidue = true;
+                break;
+            }
+        }
+        // a column that is not all gaps counts towards the group length
+        if (oneHasResidue)
+            ++posInOne;
+
+        bool twoHasResidue = false;
+        for (int s = 0; s < sizeTwo; ++s) {
+            if (oldTwoPtrs[s][col] != '-') {
+                twoHasResidue = true;
+                break;
+            }
+        }
+        if (twoHasResidue)
+            ++posInTwo;
+
+        if (oneHasResidue && twoHasResidue)
+            accuracy_before += posteriorArr[posInOne * rowStride + posInTwo];
+    }
+
+    //perform alignment
+    pair<SafeVector<char> *, float> refinealignment = model.ComputeAlignment(
+            groupOneSeqs->GetSequence(0)->GetLength(),
+            groupTwoSeqs->GetSequence(0)->GetLength(), *posterior);
+    delete posterior;
+
+    // now build final alignment
+    MultiSequence *refined = new MultiSequence();
+    const int countOne = groupOneSeqs->GetNumSequences();
+    for (int s = 0; s < countOne; ++s)
+        refined->AddSequence(
+                groupOneSeqs->GetSequence(s)->AddGaps(refinealignment.first, 'X'));
+    const int countTwo = groupTwoSeqs->GetNumSequences();
+    for (int s = 0; s < countTwo; ++s)
+        refined->AddSequence(
+                groupTwoSeqs->GetSequence(s)->AddGaps(refinealignment.first, 'Y'));
+
+    // free temporary alignment and swap in the refined one
+    delete refinealignment.first;
+    delete alignment;
+    alignment = refined;
+    delete groupOneSeqs;
+    delete groupTwoSeqs;
+
+    return (accuracy_before == refinealignment.second) ? 1 : 0;
+}
+
+
+/////////////////////////////////////////////////////////////////
+// DoIterativeRefinementTreeNode()
+//
+// Tree-guided variant of the refinement step. The alignment order
+// entry "nodeIndex" of the guide tree lists left and right leaves;
+// all of them form group one and every remaining sequence forms
+// group two. If both groups are non-empty, "alignment" is deleted
+// and replaced by the realignment of the two groups computed with
+// AlignAlignments(); otherwise nothing is changed.
+/////////////////////////////////////////////////////////////////
+void MSA::DoIterativeRefinementTreeNode(
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model, MultiSequence* &alignment,
+        int nodeIndex) {
+    const int numSeqs = alignment->GetNumSequences();
+
+    // mark every leaf listed for this node, left and right alike
+    vector<bool> inGroup1(numSeqs, false);
+    AlignmentOrder &order = this->tree->getAlignOrders()[nodeIndex];
+    for (int n = 0; n < order.leftNum; ++n)
+        inGroup1[order.leftLeafs[n]] = true;
+    for (int n = 0; n < order.rightNum; ++n)
+        inGroup1[order.rightLeafs[n]] = true;
+
+    // create two separate groups
+    set<int> groupOne, groupTwo;
+    for (int s = 0; s < numSeqs; ++s) {
+        set<int> &target = inGroup1[s] ? groupOne : groupTwo;
+        target.insert(s);
+    }
+    if (groupOne.empty() || groupTwo.empty())
+        return;
+
+    // project into the two groups; the old alignment is dropped afterwards
+    MultiSequence *groupOneSeqs = alignment->Project(groupOne);
+    assert(groupOneSeqs);
+    MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
+    assert(groupTwoSeqs);
+    delete alignment;
+
+    // realign
+    alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices,
+            model);
+
+    delete groupOneSeqs;
+    delete groupTwoSeqs;
+}
+
+/////////////////////////////////////////////////////////////////
+// WriteAnnotation()
+//
+// Writes one score per alignment column to the file named by
+// annotationFilename. For every column the (sort label, residue
+// position) pairs of all sequences that do not have a gap there
+// are collected, sorted, and scored with ComputeScore().
+// Terminates the program if the file cannot be opened.
+/////////////////////////////////////////////////////////////////
+
+void MSA::WriteAnnotation(MultiSequence *alignment,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
+    ofstream outfile(annotationFilename.c_str());
+    if (!outfile) {
+        cerr << "ERROR: Unable to write annotation file." << endl;
+        exit(1);
+    }
+
+    const int alignLength = alignment->GetSequence(0)->GetLength();
+    const int numSeqs = alignment->GetNumSequences();
+
+    // per-sequence state: data pointer, sort label, residues consumed so far
+    SafeVector<SafeVector<char>::iterator> rows(numSeqs);
+    SafeVector<int> labels;
+    SafeVector<int> residuesSeen(numSeqs, 0);
+    for (int s = 0; s < numSeqs; ++s) {
+        Sequence *seq = alignment->GetSequence(s);
+        rows[s] = seq->GetDataPtr();
+        labels.push_back(seq->GetSortLabel());
+    }
+
+    SafeVector<pair<int, int> > active;
+    active.reserve(numSeqs);
+
+    // for every column
+    for (int col = 1; col <= alignLength; ++col) {
+
+        // find all aligned residues in this particular column
+        active.clear();
+        for (int s = 0; s < numSeqs; ++s) {
+            if (rows[s][col] == '-')
+                continue;
+            ++residuesSeen[s];
+            active.push_back(make_pair(labels[s], residuesSeen[s]));
+        }
+
+        sort(active.begin(), active.end());
+        outfile << setw(4) << ComputeScore(active, sparseMatrices) << endl;
+    }
+
+    outfile.close();
+}
+
+/////////////////////////////////////////////////////////////////
+// ComputeScore()
+//
+// Computes the annotation score for a particular column. "active"
+// holds one (matrix index, residue position) pair per sequence
+// present in the column. The score is 200 * (sum of the pairwise
+// matrix values) / (n * (n - 1)), i.e. the average pairwise value
+// scaled by 100 and truncated to int. Columns with fewer than two
+// entries score 0.
+/////////////////////////////////////////////////////////////////
+
+int MSA::ComputeScore(const SafeVector<pair<int, int> > &active,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
+
+    const int n = (int) active.size();
+    if (n <= 1)
+        return 0;
+
+    // ALTERNATIVE #1: Compute the average alignment score.
+    float total = 0;
+    for (int a = 0; a < n; ++a) {
+        const pair<int, int> &first = active[a];
+        for (int b = a + 1; b < n; ++b) {
+            const pair<int, int> &second = active[b];
+            SparseMatrix *matrix = sparseMatrices[first.first][second.first];
+            total += matrix->GetValue(first.second, second.second);
+        }
+    }
+
+    return (int) (200 * total / (n * (n - 1)));
+
+}
+
+/////////////////////////////////////////////////////////////////
+// AdjustmentTest()
+//
+// Estimates the average pairwise identity of a family from the
+// Viterbi alignment of every sequence pair, adapts initDistrib[2]
+// to that identity (only for identities up to 0.5), and returns a
+// similarity level:
+//   extreme low similarity(<=25%) return 0
+//   low similarity(<=40%) return 1
+//   high similarity(<=70%) return 2
+//   extreme high similarity(>70%) return 3
+//
+// With fewer than two sequences there is no pair to measure; 3 is
+// returned and initDistrib is left untouched (this is what the
+// former 0/0 division effectively produced).
+//
+// In OpenMP builds this function also (re)builds the member pair
+// list seqsPairs / numPairs.
+/////////////////////////////////////////////////////////////////
+
+// Identity of one sequence pair: the number of matched ('B') columns of the
+// Viterbi alignment that hold the same character in both sequences, divided
+// by the total number of alignment columns.
+static float ComputePairIdentity(Sequence *seq1, Sequence *seq2,
+        const ProbabilisticModel &model) {
+    pair<SafeVector<char> *, float> alignment = model.ComputeViterbiAlignment(seq1, seq2);
+    SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
+    SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
+
+    float numIdentical = 0;
+    // sequence positions start at 1, see the length check after the loop
+    int i = 1;
+    int j = 1;
+    for (SafeVector<char>::iterator iter = alignment.first->begin();
+            iter != alignment.first->end(); ++iter) {
+        if (*iter == 'B') {
+            unsigned char c1 = (unsigned char) iter1[i++];
+            unsigned char c2 = (unsigned char) iter2[j++];
+            if (c1 == c2)
+                numIdentical += 1;
+        } else if (*iter == 'X') {
+            i++;
+        } else if (*iter == 'Y') {
+            j++;
+        }
+    }
+    // the path must have consumed both sequences completely
+    if (i != seq1->GetLength() + 1 || j != seq2->GetLength() + 1)
+        cerr << "similarity error" << endl;
+
+    const float pairIdentity = numIdentical / alignment.first->size();
+    delete alignment.first;
+    return pairIdentity;
+}
+
+int MSA::AdjustmentTest(MultiSequence *sequences,const ProbabilisticModel &model){
+    assert(sequences);
+
+    //get the number of sequences
+    const int numSeqs = sequences->GetNumSequences();
+    // number of unordered pairs; kept in a local because the member numPairs
+    // only exists in OpenMP builds
+    const int totalPairs = (numSeqs - 1) * numSeqs / 2;
+    //sum of the pairwise identities, averaged below
+    float identity = 0;
+
+#ifdef _OPENMP
+    //calculate sequence pairs for openmp model
+    int pairIdx = 0;
+    numPairs = totalPairs;
+    seqsPairs = new SeqsPair[numPairs];
+    for(int a = 0; a < numSeqs; a++) {
+        for(int b = a + 1; b < numSeqs; b++) {
+            seqsPairs[pairIdx].seq1 = a;
+            seqsPairs[pairIdx].seq2 = b;
+            pairIdx++;
+        }
+    }
+
+    // do all pairwise alignments for family similarity; "identity" is a
+    // reduction variable so that the threads do not race on it
+#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic) reduction(+:identity)
+    for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
+        int a = seqsPairs[pairIdx].seq1;
+        int b = seqsPairs[pairIdx].seq2;
+        if(enableVerbose) {
+#pragma omp critical
+            cerr <<"tid "<<omp_get_thread_num()<<" a "<<a<<" b "<<b<<endl;
+        }
+        identity += ComputePairIdentity(sequences->GetSequence(a),
+                sequences->GetSequence(b), model);
+    }
+#else
+    // do all pairwise alignments for family similarity
+    for (int a = 0; a < numSeqs - 1; a++) {
+        for (int b = a + 1; b < numSeqs; b++) {
+            identity += ComputePairIdentity(sequences->GetSequence(a),
+                    sequences->GetSequence(b), model);
+        }
+    }
+#endif
+
+    // nothing to average without at least one pair
+    if (totalPairs == 0)
+        return 3;
+    identity /= totalPairs;
+/*
+FILE *fi = fopen ("accuracy", "a");
+fprintf (fi, " %.10f ", similarity); fprintf (fi, "\n");
+fclose (fi);
+*/
+
+    //adaptive choice of initDistrib[2]; unchanged for identities above 0.5
+    if( identity <= 0.15 ) initDistrib[2] = 0.143854;
+    else if( identity <= 0.2 ) initDistrib[2] = 0.191948;
+    else if( identity <= 0.25 ) initDistrib[2] = 0.170705;
+    else if( identity <= 0.3 ) initDistrib[2] = 0.100675;
+    else if( identity <= 0.35 ) initDistrib[2] = 0.090755;
+    else if( identity <= 0.4 ) initDistrib[2] = 0.146188;
+    else if( identity <= 0.45 ) initDistrib[2] = 0.167858;
+    else if( identity <= 0.5) initDistrib[2] = 0.250769;
+
+    if( identity <= 0.25 ) return 0;
+    else if( identity <= 0.4) return 1;
+    else if( identity <= 0.7) return 2;
+    else return 3;
+
+}
--- /dev/null
+#ifndef _MSA_H
+#define _MSA_H
+#include "MSADef.h"
+#include "MSAGuideTree.h"
+
+#include "SafeVector.h"
+#include "MultiSequence.h"
+#include "ScoreType.h"
+#include "ProbabilisticModel.h"
+#include "SparseMatrix.h"
+// std::ostream and pair are used directly below, so include their headers
+// here instead of relying on the project headers to pull them in
+#include <ostream>
+#include <string>
+#include <utility>
+using namespace std;
+
+class MSAGuideTree;
+struct TreeNode;
+
+// Driver class of the aligner: parses the command line, holds the guide
+// tree, the sequence weights and the output stream, and declares the
+// individual alignment stages.
+class MSA {
+public:
+    // argc/argv are the program's command-line arguments
+    MSA(int argc, char* argv[]);
+    ~MSA();
+
+    // NOTE(review): presumably stores the current time in *dtime; the
+    // definition is not part of this header - confirm
+    static void getSysTime(double * dtime);
+    // guide tree pointer as currently stored (not owned by the caller)
+    MSAGuideTree* getGuideTree() {
+        return tree;
+    }
+    // sequence-weight array as currently stored (not owned by the caller)
+    int * getSeqsWeights() {
+        return seqsWeights;
+    }
+private:
+    //print usage
+    void printUsage();
+    //do multiple sequence alignment
+    void doAlign();
+
+    //for sequence weights
+    void createSeqsWeights(int seqsNum);
+    void releaseSeqsWeights();
+
+    //weights of sequences
+    int * seqsWeights;
+    //guide tree
+    MSAGuideTree* tree;
+    //output file name; the destructor treats an empty name as "no file to close"
+    string alignOutFileName;
+    //stream the final alignment is written to
+    std::ostream* alignOutFile;
+private:
+    // parses the command line and returns the non-option arguments
+    // (sequence file names)
+    SafeVector<string> ParseParams(int argc, char *argv[]);
+    // prints the model parameters to STDERR and, if filename is not NULL,
+    // also writes them to that file
+    void PrintParameters(const char *message, const VF &initDistrib,
+            const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs,
+            const VF &emitSingle, const char *filename);
+
+    SafeVector<string> PostProbsParseParams(int argc, char **argv);
+    MultiSequence *doAlign(MultiSequence *sequence,
+            const ProbabilisticModel &model, int levelid);
+    void ReadParameters();
+    // recursively aligns the guide (sub)tree; the caller owns the result
+    MultiSequence* ProcessTree(TreeNode *tree, MultiSequence *sequences,
+            const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+            const ProbabilisticModel &model);
+    // progressive alignment followed by iterative refinement
+    MultiSequence *ComputeFinalAlignment(MSAGuideTree *tree,
+            MultiSequence *sequences,
+            const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+            const ProbabilisticModel &model,int levelid);
+    // aligns two alignments against each other; the caller owns the result
+    MultiSequence *AlignAlignments(MultiSequence *align1, MultiSequence *align2,
+            const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+            const ProbabilisticModel &model);
+    // one round of the consistency transformation; returns the new matrices
+    SafeVector<SafeVector<SparseMatrix *> > DoRelaxation(float* seqsWeights,
+            MultiSequence *sequences,
+            SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
+    // add the contribution of one intermediate sequence to "posterior"
+    void Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY,
+            VF &posterior);
+    void Relax1(float weight, SparseMatrix *matXZ, SparseMatrix *matZY,
+            VF &posterior);
+    // one randomized refinement round; returns 0 (successful),
+    // 1 (ineffective) or 2 (random split unusable)
+    int DoIterativeRefinement(
+            const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+            const ProbabilisticModel &model, MultiSequence* &alignment);
+    // refinement round that splits the sequences at a guide-tree node
+    void DoIterativeRefinementTreeNode(
+            const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+            const ProbabilisticModel &model, MultiSequence* &alignment,
+            int nodeIndex);
+    // writes one annotation score per alignment column to a file
+    void WriteAnnotation(MultiSequence *alignment,
+            const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
+    // annotation score of a single column
+    int ComputeScore(const SafeVector<pair<int, int> > &active,
+            const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
+    // estimates the family identity; returns a similarity level 0..3
+    int AdjustmentTest(MultiSequence *sequences,const ProbabilisticModel &model);
+#ifdef _OPENMP
+    //private struct: indices of one sequence pair
+    struct SeqsPair {
+        int seq1;
+        int seq2;
+    };
+    //flat list of all sequence pairs, iterated by the OpenMP loops
+    int numPairs;
+    SeqsPair* seqsPairs;
+#endif
+};
+
+#endif
--- /dev/null
+/***********************************************
+ * # Copyright 2009-2010. Liu Yongchao
+ * # Contact: Liu Yongchao, School of Computer Engineering,
+ * # Nanyang Technological University.
+ * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
+ * #
+ * # GPL version 3.0 applies.
+ * #
+ * ************************************************/
+
+#include <string>
+#include <sstream>
+#include <iomanip>
+#include <iostream>
+#include <list>
+#include <set>
+#include <algorithm>
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <cerrno>
+#include <iomanip>
+#include "MSA.h"
+#include "MSAClusterTree.h"
+#include "Defaults.h"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+string parametersInputFilename = ""; // file-scope option and model state follows; this one is empty by default
+string parametersOutputFilename = "no training"; // default is the literal "no training"
+string annotationFilename = ""; // empty by default
+
+bool enableVerbose = false; // all four option flags default to off
+bool enableAnnotation = false;
+bool enableClustalWOutput = false;
+bool enableAlignOrder = false;
+int numConsistencyReps = 2; // default repetition counts; NOTE(review): the MIN_/MAX_ constants below look like their accepted ranges - confirm in the option parser
+int numPreTrainingReps = 0;
+int numIterativeRefinementReps = 100;
+
+float cutoff = 0;
+
+VF initDistrib(NumMatrixTypes); // the five tables below are handed to ProbabilisticModel in MSA::MSA()
+VF gapOpen(2 * NumInsertStates);
+VF gapExtend(2 * NumInsertStates);
+VVF emitPairs(256, VF(256, 1e-10)); // 256 x 256 entries, each initialised to 1e-10
+VF emitSingle(256, 1e-5); // 256 entries, each initialised to 1e-5
+
+string alphabet = alphabetDefault; // symbols PrintParameters() uses to index emitPairs/emitSingle
+
+const int MIN_PRETRAINING_REPS = 0;
+const int MAX_PRETRAINING_REPS = 20;
+const int MIN_CONSISTENCY_REPS = 0;
+const int MAX_CONSISTENCY_REPS = 5;
+const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
+const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
+
+string posteriorProbsFilename = "";
+bool allscores = true;
+string infilename;
+
+int flag_gui = 0; //0: no GUI-related output
+//1: GUI-related output is generated
+int flag_ppscore = 0; //0: no posterior-probability (pp) score sequence is added to the FASTA alignment output
+//1: a pp score sequence is added to the FASTA alignment output
+
+///////////////////////////////
+// global scoring matrix variables (declared "extern" users live in other translation units - TODO confirm)
+//////////////////////////////
+float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2;
+char *aminos, *bases, matrixtype[20] = "gonnet_160"; // matrixtype holds a matrix name, "gonnet_160" by default
+int subst_index[26];
+
+double sub_matrix[26][26];
+double normalized_matrix[26][26];// added by YE Yongtao
+int firstread = 0; //guard that makes sure the matrices are read only once
+
+float TEMPERATURE = 5;
+int MATRIXTYPE = 160;
+int prot_nuc = 0; //0=protein, 1=nucleotide
+
+float GAPOPEN = 0;
+float GAPEXT = 0;
+int numThreads = 0; // a value <= 0 makes MSA::MSA() use omp_get_num_procs() in OpenMP builds
+
+//argument support: plain record of settings; NOTE(review): field meanings are not visible in this file - confirm against init_arguments()
+typedef struct {
+ char input[30];
+ int matrix;
+ int N;
+ float T;
+ float beta;
+ char opt; //can be 'P' or 'M'
+ float gapopen;
+ float gapext;
+} argument_decl;
+
+argument_decl argument; // the single global instance of the record above
+
+extern inline void read_sustitution_matrix(char *fileName); // the functions below are only declared here; their definitions are not in this file
+extern void setmatrixtype(int le);
+extern inline int matrixtype_to_int();
+extern inline void read_dna_matrix();
+extern inline void read_vtml_la_matrix();
+extern void init_arguments(); // called from MSA::MSA() before ReadParameters()
+
+/////////////////////////////////////////////////////////////////
+// MSA()
+//
+// Parses the command line, loads the model parameters and every
+// input sequence file, configures OpenMP (when enabled), appends
+// the first command-line argument to the file "accuracy" and runs
+// ComputeSimilarity() on the loaded sequences. The alignment stage
+// itself is currently disabled (see the commented-out block).
+/////////////////////////////////////////////////////////////////
+MSA::MSA(int argc, char* argv[]) {
+    //parse program parameters
+    SafeVector<string> sequenceNames = ParseParams(argc, argv);
+
+    //initialize arguments for partition function
+    init_arguments();
+
+    ReadParameters();
+    //PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
+
+    //read the input sequences
+    MultiSequence *sequences = new MultiSequence();
+    assert(sequences);
+    for (int i = 0; i < (int) sequenceNames.size(); i++) {
+        cerr << "Loading sequence file: " << sequenceNames[i] << endl;
+        sequences->LoadMFA(sequenceNames[i], true);
+    }
+    //allocate space for sequence weights
+    this->seqsWeights = new int[sequences->GetNumSequences()];
+    //initialize parameters for OPENMP
+#ifdef _OPENMP
+    if(numThreads <= 0) {
+        numThreads = omp_get_num_procs();
+        cerr << "Automatically detected " << numThreads << " CPU cores" << endl;
+    }
+    cerr <<"Enabling OpenMP (with "<<numThreads<<" threads)"<<endl;
+
+    //make OpenMP use the requested (or detected) number of threads
+    omp_set_num_threads(numThreads);
+#endif
+
+    // record which input this run belongs to; the log is best effort, so a
+    // file that cannot be opened only produces a warning, and a missing
+    // argv[1] is logged as an empty name instead of being dereferenced
+    FILE *fi = fopen ("accuracy", "a");
+    if (fi) {
+        fprintf (fi, "%s ", (argc > 1) ? argv[1] : "");
+        fclose (fi);
+    } else {
+        cerr << "WARNING: Unable to open file \"accuracy\" for appending" << endl;
+    }
+
+    // the returned level is only needed by the disabled alignment stage below
+    int levelid = ComputeSimilarity (sequences,ProbabilisticModel(initDistrib, gapOpen, gapExtend, emitPairs,emitSingle));
+    (void) levelid;
+/*
+    // now, we can perform the alignments and write them out
+    MultiSequence *alignment = doAlign(sequences,
+            ProbabilisticModel(initDistrib, gapOpen, gapExtend, emitPairs,
+                    emitSingle), levelid);
+
+    //write the alignment results to standard output
+    if (enableClustalWOutput) {
+        alignment->WriteALN(*alignOutFile);
+    } else {
+        alignment->WriteMFA(*alignOutFile);
+    }
+    //release resources
+    delete alignment;
+*/
+    //release resources; the weights are not needed once the constructor is
+    //done, and nothing else frees them
+    delete[] this->seqsWeights;
+    this->seqsWeights = NULL;
+    delete sequences;
+}
+// Closes the output file if an output file name was given; with an empty
+// name nothing is done.
+// NOTE(review): only close() is called - the stream object itself is not
+// deleted here; confirm who owns it.
+MSA::~MSA() {
+    if (alignOutFileName.empty())
+        return;
+
+    /*close the output file*/
+    static_cast<std::ofstream*>(alignOutFile)->close();
+}
+/////////////////////////////////////////////////////////////////
+// PrintParameters()
+//
+// Prints "message" followed by the initDistrib, gapOpen and
+// gapExtend tables to STDERR with 10 significant digits. If
+// "filename" is not NULL, the same three tables, the alphabet,
+// the lower triangle of emitPairs (restricted to the alphabet)
+// and the emitSingle values are also written to that file, one
+// table row per line. Terminates the program if the file cannot
+// be opened for writing.
+/////////////////////////////////////////////////////////////////
+
+void MSA::PrintParameters(const char *message, const VF &initDistrib,
+        const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs,
+        const VF &emitSingle, const char *filename) {
+
+    const int numInsertParams = 2 * NumInsertStates;
+
+    // print parameters to the screen
+    cerr << message << endl;
+    cerr << " initDistrib[] = { ";
+    for (int m = 0; m < NumMatrixTypes; ++m)
+        cerr << setprecision(10) << initDistrib[m] << " ";
+    cerr << "}" << endl;
+    cerr << " gapOpen[] = { ";
+    for (int g = 0; g < numInsertParams; ++g)
+        cerr << setprecision(10) << gapOpen[g] << " ";
+    cerr << "}" << endl;
+    cerr << " gapExtend[] = { ";
+    for (int g = 0; g < numInsertParams; ++g)
+        cerr << setprecision(10) << gapExtend[g] << " ";
+    cerr << "}" << endl << endl;
+
+    // the file copy is optional
+    if (!filename)
+        return;
+
+    // attempt to open the file for writing
+    FILE *file = fopen(filename, "w");
+    if (!file) {
+        cerr << "ERROR: Unable to write parameter file: " << filename
+                << endl;
+        exit(1);
+    }
+
+    // transition parameters, one table per line
+    for (int m = 0; m < NumMatrixTypes; ++m)
+        fprintf(file, "%.10f ", initDistrib[m]);
+    fprintf(file, "\n");
+    for (int g = 0; g < numInsertParams; ++g)
+        fprintf(file, "%.10f ", gapOpen[g]);
+    fprintf(file, "\n");
+    for (int g = 0; g < numInsertParams; ++g)
+        fprintf(file, "%.10f ", gapExtend[g]);
+    fprintf(file, "\n");
+
+    // alphabet, then the lower triangle of the pair emission table
+    fprintf(file, "%s\n", alphabet.c_str());
+    const int alphabetSize = (int) alphabet.size();
+    for (int r = 0; r < alphabetSize; ++r) {
+        const unsigned char rowSymbol = static_cast<unsigned char>(alphabet[r]);
+        for (int c = 0; c <= r; ++c) {
+            const unsigned char colSymbol =
+                    static_cast<unsigned char>(alphabet[c]);
+            fprintf(file, "%.10f ", emitPairs[rowSymbol][colSymbol]);
+        }
+        fprintf(file, "\n");
+    }
+
+    // single emission probabilities
+    for (int r = 0; r < alphabetSize; ++r) {
+        const unsigned char symbol = static_cast<unsigned char>(alphabet[r]);
+        fprintf(file, "%.10f ", emitSingle[symbol]);
+    }
+    fprintf(file, "\n");
+    fclose(file);
+}
+
+/////////////////////////////////////////////////////////////////
+// doAlign()
+//
+// Computes a posterior matrix for every sequence pair (a < b), derives pairwise distances and sparse
+// matrices from them, builds the guide tree, runs numConsistencyReps rounds of DoRelaxation() and
+// returns the result of ComputeFinalAlignment(). "levelid" is unused while the model switch is disabled.
+/////////////////////////////////////////////////////////////////
+extern VF *ComputePostProbs(int a, int b, string seq1, string seq2); // declaration only; the definition is not in this section
+MultiSequence* MSA::doAlign(MultiSequence *sequences,
+ const ProbabilisticModel &model, int levelid) {
+ assert(sequences);
+
+ //get the number of sequences
+ const int numSeqs = sequences->GetNumSequences();
+ //create distance matrix (numSeqs x numSeqs, initialized to 0)
+ VVF distances(numSeqs, VF(numSeqs, 0));
+ //create sparseMatrices: numSeqs x numSeqs table of pointers, all NULL to begin with
+ SafeVector<SafeVector<SparseMatrix *> > sparseMatrices(numSeqs,
+ SafeVector<SparseMatrix *>(numSeqs, NULL));
+
+#ifdef _OPENMP
+ //enumerate every pair (a < b) into the seqsPairs member so that one flat loop over numPairs can be parallelized
+ int pairIdx = 0;
+ numPairs = (numSeqs - 1) * numSeqs / 2;
+ seqsPairs = new SeqsPair[numPairs];
+ for(int a = 0; a < numSeqs; a++) {
+ for(int b = a + 1; b < numSeqs; b++) {
+ seqsPairs[pairIdx].seq1 = a;
+ seqsPairs[pairIdx].seq2 = b;
+ pairIdx++;
+ }
+ }
+#endif
+ // do all pairwise alignments for posterior probability matrices (flat pair loop with OpenMP, nested a/b loops otherwise)
+#ifdef _OPENMP
+#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
+ for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
+ int a= seqsPairs[pairIdx].seq1;
+ int b = seqsPairs[pairIdx].seq2;
+ if(enableVerbose) {
+#pragma omp critical
+ cerr <<"tid "<<omp_get_thread_num()<<" a "<<a<<" b "<<b<<endl;
+ }
+#else
+ for (int a = 0; a < numSeqs - 1; a++) {
+ for (int b = a + 1; b < numSeqs; b++) {
+#endif
+ Sequence *seq1 = sequences->GetSequence(a);
+ Sequence *seq2 = sequences->GetSequence(b);
+
+ //posterior probability matrix
+ VF* posterior;
+
+
+//high similarity use global model
+ //if(levelid == 2)
+ if(1) posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString()); // always taken: the levelid-based model selection below is commented out
+
+/*
+//low similarity use local model
+ else if(levelid == 1){
+ VF *forward = model.ComputeForwardMatrix(seq1, seq2,false);
+ assert(forward);
+ VF *backward = model.ComputeBackwardMatrix(seq1, seq2,false);
+ assert(backward);
+ posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false);
+ delete forward;
+ delete backward;
+ }
+
+//extreme low or extreme high similarity use combined model
+ else{
+
+//probcons
+ // compute forward and backward probabilities
+ VF *forward = model.ComputeForwardMatrix(seq1, seq2);
+ assert(forward);
+ VF *backward = model.ComputeBackwardMatrix(seq1, seq2);
+ assert(backward);
+ // compute posterior probability matrix from HMM
+ VF *probcons_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward);
+ assert(probcons_posterior);
+ delete forward;
+ delete backward;
+
+//probalign
+ VF *probalign_posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString());
+ assert(probalign_posterior);
+//local
+ forward = model.ComputeForwardMatrix(seq1, seq2,false);
+ assert(forward);
+ backward = model.ComputeBackwardMatrix(seq1, seq2,false);
+ assert(backward);
+ posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false);
+ assert(posterior);
+ delete forward;
+ delete backward;
+//combined model
+ //merge probalign + local + probcons
+ VF::iterator ptr1 = probcons_posterior->begin();
+ VF::iterator ptr2 = probalign_posterior->begin();
+ VF::iterator ptr = posterior->begin();
+ for (int i = 0; i <= seq1->GetLength(); i++) {
+ for (int j = 0; j <= seq2->GetLength(); j++) {
+ float v1 = *ptr1;
+ float v2 = *ptr2;
+ float v3 = *ptr;
+ *ptr = sqrt((v1*v1 + v2*v2 + v3*v3)/3);
+ ptr1++;
+ ptr2++;
+ ptr++;
+ }
+ }
+ delete probcons_posterior;
+ delete probalign_posterior;
+ }
+*/
+ assert(posterior);
+ // perform the pairwise sequence alignment; the result pairs an alignment string with a float score
+ pair<SafeVector<char> *, float> alignment = model.ComputeAlignment(
+ seq1->GetLength(), seq2->GetLength(), *posterior);
+
+ //compute expected accuracy: distance = 1 - score / length of the shorter sequence, stored symmetrically
+ distances[a][b] = distances[b][a] = 1.0f - alignment.second
+ / min(seq1->GetLength(), seq2->GetLength());
+
+ // compute sparse representations; only the [a][b] entry (a < b) is populated
+ sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),
+ seq2->GetLength(), *posterior);
+ sparseMatrices[b][a] = NULL;
+
+ delete posterior;
+ delete alignment.first;
+#ifndef _OPENMP
+ }
+#endif
+ }
+
+ //create the guide tree from the pairwise distances
+ this->tree = new MSAClusterTree(this, distances, numSeqs);
+ this->tree->create();
+
+ // perform the consistency transformation the desired number of times
+ float* fweights = new float[numSeqs]; // float copy of the integer sequence weights, rescaled below
+ for (int r = 0; r < numSeqs; r++) {
+ fweights[r] = ((float) seqsWeights[r]) / INT_MULTIPLY;
+ fweights[r] *= 10;
+ }
+ for (int r = 0; r < numConsistencyReps; r++) {
+ SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices =
+ DoRelaxation(fweights, sequences, sparseMatrices);
+
+ // now replace the old posterior matrices (the old ones are deleted, the new pointers are taken over)
+ for (int i = 0; i < numSeqs; i++) {
+ for (int j = 0; j < numSeqs; j++) {
+ delete sparseMatrices[i][j];
+ sparseMatrices[i][j] = newSparseMatrices[i][j];
+ }
+ }
+ }
+ delete[] fweights;
+#ifdef _OPENMP
+ delete [] seqsPairs;
+#endif
+
+ //compute the final multiple sequence alignment
+ MultiSequence *finalAlignment = ComputeFinalAlignment(this->tree, sequences,
+ sparseMatrices, model);
+
+ // build annotation
+ if (enableAnnotation) {
+ WriteAnnotation(finalAlignment, sparseMatrices);
+ }
+ //destroy the guide tree
+ delete this->tree;
+ this->tree = 0;
+
+ // delete sparse matrices
+ for (int a = 0; a < numSeqs - 1; a++) {
+ for (int b = a + 1; b < numSeqs; b++) {
+ delete sparseMatrices[a][b];
+ delete sparseMatrices[b][a];
+ }
+ }
+
+ return finalAlignment;
+}
+
+/////////////////////////////////////////////////////////////////
+// GetInteger()
+//
+// Attempts to parse an integer from the character string given.
+// The base is detected by strtol() (base argument 0).
+//
+// data: NUL-terminated text to parse.
+// val:  receives the parsed value; must not be NULL.
+//
+// Returns true only if no parsing error occurs: the string must
+// contain a number that fits in an int, followed by nothing but
+// optional whitespace. On failure *val is left unchanged.
+/////////////////////////////////////////////////////////////////
+
+bool GetInteger(char *data, int *val) {
+    char *endPtr;
+    long int retVal;
+
+    assert(val);
+
+    errno = 0;
+    retVal = strtol(data, &endPtr, 0);
+    // nothing converted, or a zero result accompanied by an error
+    if (retVal == 0 && (errno != 0 || data == endPtr))
+        return false;
+    // overflow/underflow of long
+    if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN))
+        return false;
+    // representable as long but not as int
+    if (retVal < (long) INT_MIN || retVal > (long) INT_MAX)
+        return false;
+    // reject trailing garbage such as "12abc"; trailing whitespace is tolerated
+    while (*endPtr == ' ' || *endPtr == '\t' || *endPtr == '\n'
+            || *endPtr == '\r')
+        endPtr++;
+    if (*endPtr != '\0')
+        return false;
+    *val = (int) retVal;
+    return true;
+}
+
+/////////////////////////////////////////////////////////////////
+// GetFloat()
+//
+// Attempts to parse a float from the character string given.
+//
+// data: NUL-terminated text to parse.
+// val:  receives the parsed value; must not be NULL.
+//
+// Returns true only if no parsing error occurs: the string must
+// contain a number followed by nothing but optional whitespace.
+// On failure *val is left unchanged.
+/////////////////////////////////////////////////////////////////
+
+bool GetFloat(char *data, float *val) {
+    char *endPtr;
+    double retVal;
+
+    assert(val);
+
+    errno = 0;
+    retVal = strtod(data, &endPtr);
+    // nothing converted, or a zero result accompanied by an error
+    if (retVal == 0 && (errno != 0 || data == endPtr))
+        return false;
+    // an error reported together with a large-magnitude result (overflow)
+    if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0))
+        return false;
+    // reject trailing garbage such as "0.5x"; trailing whitespace is tolerated
+    while (*endPtr == ' ' || *endPtr == '\t' || *endPtr == '\n'
+            || *endPtr == '\r')
+        endPtr++;
+    if (*endPtr != '\0')
+        return false;
+    *val = (float) retVal;
+    return true;
+}
+
+/////////////////////////////////////////////////////////////////
+// StoreEmitPair()
+//
+// Stores "val" as the pair emission value of symbols a and b in
+// both orders and for every upper/lower-case combination
+// (eight table cells in total).
+/////////////////////////////////////////////////////////////////
+static void StoreEmitPair(VVF &emitPairs, char a, char b, float val) {
+    // go through unsigned char: tolower()/toupper() are only defined
+    // for values representable as unsigned char (or EOF)
+    const unsigned char formsA[2] = {
+            (unsigned char) tolower((unsigned char) a),
+            (unsigned char) toupper((unsigned char) a) };
+    const unsigned char formsB[2] = {
+            (unsigned char) tolower((unsigned char) b),
+            (unsigned char) toupper((unsigned char) b) };
+    for (int s = 0; s < 2; s++) {
+        for (int t = 0; t < 2; t++) {
+            emitPairs[formsA[s]][formsB[t]] = val;
+            emitPairs[formsB[t]][formsA[s]] = val;
+        }
+    }
+}
+
+/////////////////////////////////////////////////////////////////
+// StoreEmitSingle()
+//
+// Stores "val" as the single emission value of symbol a for both
+// its lower-case and upper-case form.
+/////////////////////////////////////////////////////////////////
+static void StoreEmitSingle(VF &emitSingle, char a, float val) {
+    emitSingle[(unsigned char) tolower((unsigned char) a)] = val;
+    emitSingle[(unsigned char) toupper((unsigned char) a)] = val;
+}
+
+/////////////////////////////////////////////////////////////////
+// ReadParameters()
+//
+// Fills initDistrib, gapOpen, gapExtend, alphabet, emitPairs and
+// emitSingle. With no parameter file name the built-in defaults
+// for NumInsertStates == 1 or 2 are used; otherwise the values
+// are read from the file in this order: initial distribution,
+// gap open, gap extend (one line each), alphabet line, lower
+// triangle of the pair emissions, single emissions.
+//
+// Any unreadable, missing or malformed input is reported on
+// STDERR and terminates the program with exit status 1.
+/////////////////////////////////////////////////////////////////
+
+void MSA::ReadParameters() {
+
+    // symbols that are never assigned below keep these small defaults
+    emitPairs = VVF(256, VF(256, 1e-10));
+    emitSingle = VF(256, 1e-5);
+
+    // no parameter file: use the compiled-in defaults
+    if (parametersInputFilename == string("")) {
+        if (NumInsertStates == 1) {
+            for (int i = 0; i < NumMatrixTypes; i++)
+                initDistrib[i] = initDistrib1Default[i];
+            for (int i = 0; i < 2 * NumInsertStates; i++)
+                gapOpen[i] = gapOpen1Default[i];
+            for (int i = 0; i < 2 * NumInsertStates; i++)
+                gapExtend[i] = gapExtend1Default[i];
+        } else if (NumInsertStates == 2) {
+            for (int i = 0; i < NumMatrixTypes; i++)
+                initDistrib[i] = initDistrib2Default[i];
+            for (int i = 0; i < 2 * NumInsertStates; i++)
+                gapOpen[i] = gapOpen2Default[i];
+            for (int i = 0; i < 2 * NumInsertStates; i++)
+                gapExtend[i] = gapExtend2Default[i];
+        } else {
+            cerr
+                    << "ERROR: No default initial distribution/parameter settings exist"
+                    << endl << " for " << NumInsertStates
+                    << " pairs of insert states. Use --paramfile." << endl;
+            exit(1);
+        }
+
+        alphabet = alphabetDefault;
+
+        for (int i = 0; i < (int) alphabet.length(); i++) {
+            StoreEmitSingle(emitSingle, alphabet[i], emitSingleDefault[i]);
+            for (int j = 0; j <= i; j++)
+                StoreEmitPair(emitPairs, alphabet[i], alphabet[j],
+                        emitPairsDefault[i][j]);
+        }
+        return;
+    }
+
+    // otherwise read everything from the parameter file
+    ifstream data;
+    data.open(parametersInputFilename.c_str());
+    if (data.fail()) {
+        cerr << "ERROR: Unable to read parameter file: "
+                << parametersInputFilename << endl;
+        exit(1);
+    }
+
+    // the three transition lines
+    string line[3];
+    for (int i = 0; i < 3; i++) {
+        if (!getline(data, line[i])) {
+            cerr
+                    << "ERROR: Unable to read transition parameters from parameter file: "
+                    << parametersInputFilename << endl;
+            exit(1);
+        }
+    }
+    bool transitionsOk = true;
+    istringstream data2;
+    data2.clear();
+    data2.str(line[0]);
+    for (int i = 0; i < NumMatrixTypes; i++)
+        if (!(data2 >> initDistrib[i]))
+            transitionsOk = false;
+    data2.clear();
+    data2.str(line[1]);
+    for (int i = 0; i < 2 * NumInsertStates; i++)
+        if (!(data2 >> gapOpen[i]))
+            transitionsOk = false;
+    data2.clear();
+    data2.str(line[2]);
+    for (int i = 0; i < 2 * NumInsertStates; i++)
+        if (!(data2 >> gapExtend[i]))
+            transitionsOk = false;
+    // a short or non-numeric line used to be accepted silently
+    if (!transitionsOk) {
+        cerr
+                << "ERROR: Unable to read transition parameters from parameter file: "
+                << parametersInputFilename << endl;
+        exit(1);
+    }
+
+    if (!getline(data, line[0])) {
+        cerr << "ERROR: Unable to read alphabet from scoring matrix file: "
+                << parametersInputFilename << endl;
+        exit(1);
+    }
+
+    // read alphabet as concatenation of all characters on alphabet line
+    alphabet = "";
+    string token;
+    data2.clear();
+    data2.str(line[0]);
+    while (data2 >> token)
+        alphabet += token;
+
+    // lower triangle of the pair emissions, row by row
+    for (int i = 0; i < (int) alphabet.size(); i++) {
+        for (int j = 0; j <= i; j++) {
+            float val;
+            if (!(data >> val)) {
+                cerr
+                        << "ERROR: Unable to read emission probabilities from parameter file: "
+                        << parametersInputFilename << endl;
+                exit(1);
+            }
+            StoreEmitPair(emitPairs, alphabet[i], alphabet[j], val);
+        }
+    }
+
+    // single emissions
+    for (int i = 0; i < (int) alphabet.size(); i++) {
+        float val;
+        if (!(data >> val)) {
+            cerr
+                    << "ERROR: Unable to read emission probabilities from parameter file: "
+                    << parametersInputFilename << endl;
+            exit(1);
+        }
+        StoreEmitSingle(emitSingle, alphabet[i], val);
+    }
+    data.close();
+}
+
+/////////////////////////////////////////////////////////////////
+// printUsage()
+//
+// Prints the program banner and the command-line help to STDERR,
+// including the current defaults of the consistency, refinement,
+// verbose and alignment-order settings.
+/////////////////////////////////////////////////////////////////
+void MSA::printUsage() {
+    const char *verboseState = enableVerbose ? "on" : "off";
+    const char *alignOrderState = enableAlignOrder ? "on" : "off";
+
+    // banner
+    cerr << "************************************************************************" << endl;
+    cerr << "\tMSAPROBS is a open-source protein multiple sequence alignment algorithm" << endl;
+    cerr << "\tbased on pair hidden markov model and partition function postirior" << endl;
+    cerr << "\tprobabilities. If any comments or problems, please contact" << endl;
+    cerr << "\tLiu Yongchao(liuy0039@ntu.edu.sg or nkcslyc@hotmail.com)" << endl;
+    cerr << "*************************************************************************" << endl;
+
+    // synopsis
+    cerr << "Usage:" << endl;
+    cerr << " msaprobs [OPTION]... [infile]..." << endl << endl;
+    cerr << "Description:" << endl;
+    cerr << " Align sequences in multi-FASTA format" << endl << endl;
+
+    // output and threading options
+    cerr << " -o, --outfile <string>" << endl;
+    cerr << " specify the output file name (STDOUT by default)" << endl;
+    cerr << " -num_threads <integer>" << endl;
+    cerr << " specify the number of threads used, and otherwise detect automatically" << endl;
+    cerr << " -clustalw" << endl;
+    cerr << " use CLUSTALW output format instead of FASTA format" << endl << endl;
+
+    // algorithm options
+    cerr << " -c, --consistency REPS" << endl;
+    cerr << " use " << MIN_CONSISTENCY_REPS << " <= REPS <= "
+            << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps
+            << ") passes of consistency transformation" << endl << endl;
+    cerr << " -ir, --iterative-refinement REPS" << endl;
+    cerr << " use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= "
+            << MAX_ITERATIVE_REFINEMENT_REPS << " (default: "
+            << numIterativeRefinementReps << ") passes of iterative-refinement"
+            << endl << endl;
+
+    // reporting options
+    cerr << " -v, --verbose" << endl;
+    cerr << " report progress while aligning (default: " << verboseState
+            << ")" << endl << endl;
+    cerr << " -annot FILENAME" << endl;
+    cerr << " write annotation for multiple alignment to FILENAME" << endl
+            << endl;
+    cerr << " -a, --alignment-order" << endl;
+    cerr << " print sequences in alignment order rather than input order (default: "
+            << alignOrderState << ")" << endl;
+    cerr << " -version " << endl;
+    cerr << " print out version of MSAPROBS " << endl << endl;
+}
+/////////////////////////////////////////////////////////////////
+// NextOptionValue()
+//
+// Returns the argument that follows option argv[i] and advances
+// i onto it. When the option is the last argument, prints
+// "ERROR: <what> expected for option <option>" and exits.
+/////////////////////////////////////////////////////////////////
+static char* NextOptionValue(int argc, char **argv, int &i, const char *what) {
+    if (i >= argc - 1) {
+        cerr << "ERROR: " << what << " expected for option " << argv[i]
+                << endl;
+        exit(1);
+    }
+    return argv[++i];
+}
+
+/////////////////////////////////////////////////////////////////
+// ParseParams()
+//
+// Parse all command-line options, store them in the matching
+// members and open the alignment output stream (std::cout when
+// no output file was requested).
+//
+// Returns the non-option arguments, in order; the constructor
+// loads them as sequence files. Prints the usage text and exits
+// when no argument is given; any invalid option or value, or an
+// output file that cannot be opened, is reported on STDERR and
+// terminates the program with exit status 1.
+/////////////////////////////////////////////////////////////////
+SafeVector<string> MSA::ParseParams(int argc, char **argv) {
+    if (argc < 2) {
+        printUsage();
+        exit(1);
+    }
+    SafeVector<string> sequenceNames;
+    int tempInt;
+    float tempFloat;
+
+    for (int i = 1; i < argc; i++) {
+        // anything that does not start with '-' is an input file name
+        if (argv[i][0] != '-') {
+            sequenceNames.push_back(string(argv[i]));
+            continue;
+        }
+
+        const char *option = argv[i];
+
+        //help
+        if (!strcmp(option, "-help") || !strcmp(option, "-?")) {
+            printUsage();
+            exit(1);
+        }
+        //output file name
+        else if (!strcmp(option, "-o") || !strcmp(option, "--outfile")) {
+            alignOutFileName = NextOptionValue(argc, argv, i, "String");
+        }
+        // parameter file
+        else if (!strcmp(option, "-p") || !strcmp(option, "--paramfile")) {
+            parametersInputFilename = string(
+                    NextOptionValue(argc, argv, i, "Filename"));
+        }
+        //number of threads used ("-p" used to be listed here as well, but
+        //it could never match because the parameter-file branch claims it)
+        else if (!strcmp(option, "-num_threads")) {
+            char *value = NextOptionValue(argc, argv, i, "Integer");
+            if (!GetInteger(value, &tempInt)) {
+                cerr << " ERROR: invalid integer following option " << option
+                        << ": " << value << endl;
+                exit(1);
+            }
+            // negative requests fall back to 0
+            if (tempInt < 0) {
+                tempInt = 0;
+            }
+            numThreads = tempInt;
+        }
+        // number of consistency transformations
+        else if (!strcmp(option, "-c") || !strcmp(option, "--consistency")) {
+            char *value = NextOptionValue(argc, argv, i, "Integer");
+            if (!GetInteger(value, &tempInt)) {
+                cerr << "ERROR: Invalid integer following option " << option
+                        << ": " << value << endl;
+                exit(1);
+            }
+            if (tempInt < MIN_CONSISTENCY_REPS
+                    || tempInt > MAX_CONSISTENCY_REPS) {
+                cerr << "ERROR: For option " << option
+                        << ", integer must be between "
+                        << MIN_CONSISTENCY_REPS << " and "
+                        << MAX_CONSISTENCY_REPS << "." << endl;
+                exit(1);
+            }
+            numConsistencyReps = tempInt;
+        }
+        // number of randomized partitioning iterative refinement passes
+        else if (!strcmp(option, "-ir")
+                || !strcmp(option, "--iterative-refinement")) {
+            char *value = NextOptionValue(argc, argv, i, "Integer");
+            if (!GetInteger(value, &tempInt)) {
+                cerr << "ERROR: Invalid integer following option " << option
+                        << ": " << value << endl;
+                exit(1);
+            }
+            if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS
+                    || tempInt > MAX_ITERATIVE_REFINEMENT_REPS) {
+                cerr << "ERROR: For option " << option
+                        << ", integer must be between "
+                        << MIN_ITERATIVE_REFINEMENT_REPS << " and "
+                        << MAX_ITERATIVE_REFINEMENT_REPS << "."
+                        << endl;
+                exit(1);
+            }
+            numIterativeRefinementReps = tempInt;
+        }
+        // annotation files
+        else if (!strcmp(option, "-annot")) {
+            enableAnnotation = true;
+            annotationFilename = NextOptionValue(argc, argv, i, "FILENAME");
+        }
+        // clustalw output format
+        else if (!strcmp(option, "-clustalw")) {
+            enableClustalWOutput = true;
+        }
+        // cutoff
+        else if (!strcmp(option, "-co") || !strcmp(option, "--cutoff")) {
+            char *value = NextOptionValue(argc, argv, i,
+                    "Floating-point value");
+            if (!GetFloat(value, &tempFloat)) {
+                cerr
+                        << "ERROR: Invalid floating-point value following option "
+                        << option << ": " << value << endl;
+                exit(1);
+            }
+            if (tempFloat < 0 || tempFloat > 1) {
+                cerr << "ERROR: For option " << option
+                        << ", floating-point value must be between 0 and 1."
+                        << endl;
+                exit(1);
+            }
+            cutoff = tempFloat;
+        }
+        // verbose reporting
+        else if (!strcmp(option, "-v") || !strcmp(option, "--verbose")) {
+            enableVerbose = true;
+        }
+        // alignment order
+        else if (!strcmp(option, "-a")
+                || !strcmp(option, "--alignment-order")) {
+            enableAlignOrder = true;
+        }
+        //print out version
+        else if (!strcmp(option, "-version")) {
+            cerr << "MSAPROBS version " << VERSION << endl;
+            exit(1);
+        }
+        // bad arguments
+        else {
+            cerr << "ERROR: Unrecognized option: " << option << endl;
+            exit(1);
+        }
+    }
+
+    /*check the output file name*/
+    cerr << "-------------------------------------" << endl;
+    if (alignOutFileName.length() == 0) {
+        cerr << "The final alignments will be printed out to STDOUT" << endl;
+        alignOutFile = &std::cout;
+    } else {
+        cerr << "Open the output file " << alignOutFileName << endl;
+        ofstream *outFile = new ofstream(alignOutFileName.c_str(),
+                ios::binary | ios::out | ios::trunc);
+        // without this check a failed open silently discarded all output
+        if (!outFile->is_open()) {
+            cerr << "ERROR: Unable to open the output file "
+                    << alignOutFileName << endl;
+            delete outFile;
+            exit(1);
+        }
+        alignOutFile = outFile;
+    }
+    cerr << "-------------------------------------" << endl;
+    return sequenceNames;
+}
+
+/////////////////////////////////////////////////////////////////
+// ProcessTree()
+//
+// Walks the guide tree recursively. A leaf yields a new
+// MultiSequence holding a clone of the input sequence with index
+// tree->idx; an inner node yields the alignment of the results of
+// its left and right subtrees. The returned object is allocated
+// here (or by AlignAlignments()) and handed to the caller.
+/////////////////////////////////////////////////////////////////
+MultiSequence* MSA::ProcessTree(TreeNode *tree, MultiSequence *sequences,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model) {
+
+    // leaf of the alignment tree: wrap a copy of the single sequence
+    if (!(tree->leaf == NODE)) {
+        MultiSequence *single = new MultiSequence();
+        assert(single);
+        single->AddSequence(sequences->GetSequence(tree->idx)->Clone());
+        return single;
+    }
+
+    // inner node: resolve the left subtree first, then the right one
+    MultiSequence *leftAlignment = ProcessTree(tree->left, sequences,
+            sparseMatrices, model);
+    MultiSequence *rightAlignment = ProcessTree(tree->right, sequences,
+            sparseMatrices, model);
+    assert(leftAlignment);
+    assert(rightAlignment);
+
+    MultiSequence *merged = AlignAlignments(leftAlignment, rightAlignment,
+            sparseMatrices, model);
+    assert(merged);
+
+    // the partial alignments are no longer needed once merged
+    delete leftAlignment;
+    delete rightAlignment;
+
+    return merged;
+}
+
+/////////////////////////////////////////////////////////////////
+// ComputeFinalAlignment()
+//
+// Builds the progressive alignment with ProcessTree() and then
+// runs numIterativeRefinementReps rounds of
+// DoIterativeRefinement() on it.
+//
+// When enableAlignOrder is set, the sort labels found right after
+// the progressive step are remembered, SaveOrdering() is called,
+// the flag is cleared for the rest of the run, and the remembered
+// labels are written back after refinement.
+//
+// Returns the refined alignment.
+/////////////////////////////////////////////////////////////////
+
+MultiSequence* MSA::ComputeFinalAlignment(MSAGuideTree*tree,
+        MultiSequence *sequences,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model) {
+    // progressive alignment along the guide tree
+    MultiSequence *alignment = ProcessTree(tree->getRoot(), sequences,
+            sparseMatrices, model);
+
+    const int seqCount = alignment->GetNumSequences();
+    SafeVector<int> savedLabels;
+    if (enableAlignOrder) {
+        for (int s = 0; s < seqCount; ++s) {
+            savedLabels.push_back(alignment->GetSequence(s)->GetSortLabel());
+        }
+        alignment->SaveOrdering();
+        enableAlignOrder = false;
+    }
+
+    // fixed number of randomized refinement rounds; the status returned by
+    // DoIterativeRefinement() is not inspected
+    int round = 0;
+    while (round < numIterativeRefinementReps) {
+        DoIterativeRefinement(sparseMatrices, model, alignment);
+        ++round;
+    }
+
+    cerr << endl;
+
+    // write the remembered labels back (no-op when none were saved)
+    const int savedCount = (int) savedLabels.size();
+    for (int s = 0; s < savedCount; ++s) {
+        alignment->GetSequence(s)->SetSortLabel(savedLabels[s]);
+    }
+
+    // return final alignment
+    return alignment;
+}
+
+/////////////////////////////////////////////////////////////////
+// AlignAlignments()
+//
+// Aligns two MultiSequence objects against each other: builds a
+// weighted posterior matrix for the two groups, computes the
+// alignment path from it and re-gaps every sequence of both
+// groups along that path ('X' for align1, 'Y' for align2).
+// The result is sorted by label unless enableAlignOrder is set.
+//
+// Returns a newly allocated MultiSequence; the inputs are not
+// freed here.
+/////////////////////////////////////////////////////////////////
+
+MultiSequence* MSA::AlignAlignments(MultiSequence *align1,
+        MultiSequence *align2,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model) {
+
+    // in verbose mode, list the labels of both groups
+    if (enableVerbose) {
+        for (int s = 0; s < align1->GetNumSequences(); ++s) {
+            cerr << ((s == 0) ? "[" : ",")
+                    << align1->GetSequence(s)->GetLabel();
+        }
+        cerr << "] vs. ";
+        for (int s = 0; s < align2->GetNumSequences(); ++s) {
+            cerr << ((s == 0) ? "[" : ",")
+                    << align2->GetSequence(s)->GetLabel();
+        }
+        cerr << "]: ";
+    }
+
+    // posterior matrix between the two groups, using the sequence weights
+    VF *groupPosterior = model.BuildPosterior(getSeqsWeights(), align1,
+            align2, sparseMatrices, cutoff);
+
+    // alignment path and its score
+    pair<SafeVector<char> *, float> path = model.ComputeAlignment(
+            align1->GetSequence(0)->GetLength(),
+            align2->GetSequence(0)->GetLength(), *groupPosterior);
+
+    delete groupPosterior;
+
+    if (enableVerbose) {
+        // sum, over all cross-group pairs, of the shorter sequence length
+        int pairLengthSum = 0;
+        for (int s = 0; s < align1->GetNumSequences(); ++s) {
+            for (int t = 0; t < align2->GetNumSequences(); ++t) {
+                pairLengthSum += min(align1->GetSequence(s)->GetLength(),
+                        align2->GetSequence(t)->GetLength());
+            }
+        }
+
+        // give an "accuracy" measure for the alignment
+        cerr << path.second / pairLengthSum << endl;
+    }
+
+    // assemble the merged alignment: first group, then second group
+    MultiSequence *merged = new MultiSequence();
+    for (int s = 0; s < align1->GetNumSequences(); ++s) {
+        merged->AddSequence(align1->GetSequence(s)->AddGaps(path.first, 'X'));
+    }
+    for (int t = 0; t < align2->GetNumSequences(); ++t) {
+        merged->AddSequence(align2->GetSequence(t)->AddGaps(path.first, 'Y'));
+    }
+    if (!enableAlignOrder) {
+        merged->SortByLabel();
+    }
+
+    // the path is no longer needed
+    delete path.first;
+
+    return merged;
+}
+
+/////////////////////////////////////////////////////////////////
+// DoRelaxation()
+//
+// Performs one round of the probabilistic consistency transformation and returns the new table of sparse matrices.
+// The weighted forms are commented out, so weights do not affect the result; the seqsWeights parameter hides the member of that name.
+/////////////////////////////////////////////////////////////////
+
+SafeVector<SafeVector<SparseMatrix *> > MSA::DoRelaxation(float* seqsWeights,
+ MultiSequence *sequences,
+ SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
+ const int numSeqs = sequences->GetNumSequences();
+
+ SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices(numSeqs,
+ SafeVector<SparseMatrix *>(numSeqs, NULL));
+
+ // for every pair of sequences (i < j): flat loop over the seqsPairs member with OpenMP, nested loops otherwise
+#ifdef _OPENMP
+ int pairIdx;
+#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
+ for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
+ int i = seqsPairs[pairIdx].seq1;
+ int j = seqsPairs[pairIdx].seq2;
+ float wi = seqsWeights[i];
+ float wj = seqsWeights[j];
+#else
+ for (int i = 0; i < numSeqs; i++) {
+ float wi = seqsWeights[i];
+ for (int j = i + 1; j < numSeqs; j++) {
+ float wj = seqsWeights[j];
+#endif
+ Sequence *seq1 = sequences->GetSequence(i);
+ Sequence *seq2 = sequences->GetSequence(j);
+
+ if (enableVerbose) {
+#ifdef _OPENMP
+#pragma omp critical
+#endif
+ cerr << "Relaxing (" << i + 1 << ") " << seq1->GetHeader()
+ << " vs. " << "(" << j + 1 << ") " << seq2->GetHeader()
+ << ": ";
+ }
+ // get the original posterior matrix; it is modified in place below and deleted at the end of this iteration
+ VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior();
+ assert(posteriorPtr);
+ VF &posterior = *posteriorPtr;
+
+ const int seq1Length = seq1->GetLength();
+ const int seq2Length = seq2->GetLength();
+
+ // contribution from the summation where z = x and z = y (w and sumW only feed the commented-out weighted forms)
+ float w = wi * wi * wj + wi * wj * wj;
+ float sumW = w;
+ for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
+ //posterior[k] = w*posterior[k];
+ posterior[k] += posterior[k]; // unweighted form: doubles every entry
+ }
+
+ if (enableVerbose)
+ cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
+
+ // contribution from all other sequences k; the branch depends on where k falls relative to i < j because only [a][b] with a < b is stored
+ for (int k = 0; k < numSeqs; k++) {
+ if (k != i && k != j) {
+ float wk = seqsWeights[k];
+ float w = wi * wj * wk;
+ sumW += w;
+ if (k < i)
+ Relax1(w, sparseMatrices[k][i], sparseMatrices[k][j],
+ posterior);
+ else if (k > i && k < j)
+ Relax(w, sparseMatrices[i][k], sparseMatrices[k][j],
+ posterior);
+ else {
+ SparseMatrix *temp =
+ sparseMatrices[j][k]->ComputeTranspose();
+ Relax(w, sparseMatrices[i][k], temp, posterior);
+ delete temp;
+ }
+ }
+ }
+ //cerr<<"sumW "<<sumW<<endl;
+ for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
+ //posterior[k] /= sumW;
+ posterior[k] /= numSeqs; // normalizes by the number of sequences rather than by sumW
+ }
+ // mask out positions not originally in the posterior matrix (row 0 is cleared entirely)
+ SparseMatrix *matXY = sparseMatrices[i][j];
+ for (int y = 0; y <= seq2Length; y++)
+ posterior[y] = 0;
+ for (int x = 1; x <= seq1Length; x++) {
+ SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
+ SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
+ VF::iterator base = posterior.begin() + x * (seq2Length + 1);
+ int curr = 0;
+ while (XYptr != XYend) {
+
+ // zero out all cells until the first filled column
+ while (curr < XYptr->first) {
+ base[curr] = 0;
+ curr++;
+ }
+
+ // now, skip over this column
+ curr++;
+ ++XYptr;
+ }
+
+ // zero out cells after last column
+ while (curr <= seq2Length) {
+ base[curr] = 0;
+ curr++;
+ }
+ }
+
+ // save the new posterior matrix; only the [i][j] entry (i < j) is populated
+ newSparseMatrices[i][j] = new SparseMatrix(seq1->GetLength(),
+ seq2->GetLength(), posterior);
+ newSparseMatrices[j][i] = NULL;
+
+ if (enableVerbose)
+ cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
+
+ delete posteriorPtr; // release the matrix returned by GetPosterior()
+
+ if (enableVerbose)
+ cerr << "done." << endl;
+#ifndef _OPENMP
+ }
+#endif
+ }
+
+ return newSparseMatrices;
+}
+
+/////////////////////////////////////////////////////////////////
+// Relax()
+//
+// Adds the consistency contribution of one intermediate sequence
+// z to "posterior": for every stored cell (i,k) of matXZ and
+// every stored cell (k,j) of matZY, posterior(i,j) grows by the
+// product of the two cell values. "posterior" is indexed row-major
+// with rows of length (lengthY + 1).
+//
+// "weight" is accepted but not applied (the weighted form is
+// disabled).
+/////////////////////////////////////////////////////////////////
+
+void MSA::Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY,
+        VF &posterior) {
+
+    assert(matXZ);
+    assert(matZY);
+
+    const int rowsX = matXZ->GetSeq1Length();
+    const int rowStride = matZY->GetSeq2Length() + 1;
+    assert(matXZ->GetSeq2Length() == matZY->GetSeq1Length());
+
+    for (int x = 1; x <= rowsX; ++x) {
+        SafeVector<PIF>::iterator xzIt = matXZ->GetRowPtr(x);
+        const SafeVector<PIF>::iterator xzStop = xzIt + matXZ->GetRowSize(x);
+
+        // start of row x in the dense posterior
+        VF::iterator rowBase = posterior.begin() + x * rowStride;
+
+        // every stored x[i]-z[k] cell
+        for (; xzIt != xzStop; ++xzIt) {
+            SafeVector<PIF>::iterator zyIt = matZY->GetRowPtr(xzIt->first);
+            const SafeVector<PIF>::iterator zyStop = zyIt
+                    + matZY->GetRowSize(xzIt->first);
+            const float probXZ = xzIt->second;
+
+            // every stored z[k]-y[j] cell
+            for (; zyIt != zyStop; ++zyIt) {
+                rowBase[zyIt->first] += probXZ * zyIt->second;
+            }
+        }
+    }
+}
+
+/////////////////////////////////////////////////////////////////
+// Relax1()
+//
+// Variant of Relax() for the case where both matrices have the
+// intermediate sequence z as their row sequence: for every row k,
+// every stored cell (k,i) of matZX and every stored cell (k,j) of
+// matZY, posterior(i,j) grows by the product of the two cell
+// values. "posterior" is indexed row-major with rows of length
+// (lengthY + 1).
+//
+// "weight" is accepted but not applied (the weighted form is
+// disabled).
+/////////////////////////////////////////////////////////////////
+
+void MSA::Relax1(float weight, SparseMatrix *matZX, SparseMatrix *matZY,
+        VF &posterior) {
+
+    assert(matZX);
+    assert(matZY);
+
+    const int rowsZ = matZX->GetSeq1Length();
+    const int rowStride = matZY->GetSeq2Length() + 1;
+
+    for (int z = 1; z <= rowsZ; ++z) {
+        SafeVector<PIF>::iterator zxIt = matZX->GetRowPtr(z);
+        const SafeVector<PIF>::iterator zxStop = zxIt + matZX->GetRowSize(z);
+
+        // every stored z[k]-x[i] cell
+        for (; zxIt != zxStop; ++zxIt) {
+            SafeVector<PIF>::iterator zyIt = matZY->GetRowPtr(z);
+            const SafeVector<PIF>::iterator zyStop = zyIt
+                    + matZY->GetRowSize(z);
+            const float probZX = zxIt->second;
+
+            // start of row i in the dense posterior
+            VF::iterator rowBase = posterior.begin()
+                    + zxIt->first * rowStride;
+
+            // every stored z[k]-y[j] cell
+            for (; zyIt != zyStop; ++zyIt) {
+                rowBase[zyIt->first] += probZX * zyIt->second;
+            }
+        }
+    }
+}
+/////////////////////////////////////////////////////////////////
+// DoIterativeRefinement()
+//
+// Performs a single round of randomized partitioning iterative
+// refinement: the sequences are split at random into two groups,
+// the two projected groups are realigned against each other and
+// "alignment" is replaced by the realigned result.
+// return 0: successful refinement, 1: ineffective refinement, 2: random problem
+/////////////////////////////////////////////////////////////////
+int MSA::DoIterativeRefinement(
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model, MultiSequence* &alignment) {
+    // assign every sequence to one of the two groups by coin flip
+    set<int> groupOne, groupTwo;
+    const int numSeqs = alignment->GetNumSequences();
+    for (int s = 0; s < numSeqs; ++s) {
+        if (rand() % 2)
+            groupOne.insert(s);
+        else
+            groupTwo.insert(s);
+    }
+    // a one-sided split leaves nothing to realign
+    if (groupOne.empty() || groupTwo.empty())
+        return 2;
+
+    // project into the two groups
+    MultiSequence *groupOneSeqs = alignment->Project(groupOne);
+    assert(groupOneSeqs);
+    MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
+    assert(groupTwoSeqs);
+
+//start add by Yongtao
+#if 1
+    VF *posterior = model.BuildPosterior (groupOneSeqs, groupTwoSeqs, sparseMatrices, cutoff);
+#else
+    VF *posterior = model.BuildPosterior(getSeqsWeights(), groupOneSeqs, groupTwoSeqs,
+            sparseMatrices, cutoff);
+#endif
+
+    // data pointers of the sequences of each group inside the OLD alignment
+    SafeVector<SafeVector<char>::iterator> oldOnePtrs(groupOne.size());
+    SafeVector<SafeVector<char>::iterator> oldTwoPtrs(groupTwo.size());
+    int slot = 0;
+    for (set<int>::const_iterator it = groupOne.begin(); it != groupOne.end();
+            ++it, ++slot) {
+        oldOnePtrs[slot] = alignment->GetSequence(*it)->GetDataPtr();
+    }
+    slot = 0;
+    for (set<int>::const_iterator it = groupTwo.begin(); it != groupTwo.end();
+            ++it, ++slot) {
+        oldTwoPtrs[slot] = alignment->GetSequence(*it)->GetDataPtr();
+    }
+
+    // "accuracy" of the alignment before refinement: sum of the posterior
+    // entries of all old columns in which both groups have a residue
+    VF &posteriorArr = *posterior;
+    const int oldLength = alignment->GetSequence(0)->GetLength();
+    const int rowStride = groupTwoSeqs->GetSequence(0)->GetLength() + 1;
+    int groupOneindex = 0;
+    int groupTwoindex = 0;
+    float accuracy_before = 0;
+    for (int col = 1; col <= oldLength; ++col) {
+        // a column counts for a group as soon as one member is not a gap
+        bool foundOne = false;
+        for (int s = 0; s < (int) groupOne.size(); ++s) {
+            if (oldOnePtrs[s][col] != '-') {
+                foundOne = true;
+                break;
+            }
+        }
+        if (foundOne)
+            ++groupOneindex;
+
+        bool foundTwo = false;
+        for (int s = 0; s < (int) groupTwo.size(); ++s) {
+            if (oldTwoPtrs[s][col] != '-') {
+                foundTwo = true;
+                break;
+            }
+        }
+        if (foundTwo)
+            ++groupTwoindex;
+
+        if (foundOne && foundTwo)
+            accuracy_before += posteriorArr[groupOneindex * rowStride + groupTwoindex];
+    }
+
+    // perform alignment of the two groups
+    pair<SafeVector<char> *, float> refinealignment = model.ComputeAlignment(
+            groupOneSeqs->GetSequence(0)->GetLength(),
+            groupTwoSeqs->GetSequence(0)->GetLength(), *posterior);
+    delete posterior;
+
+    // now build final alignment: group one follows 'X', group two follows 'Y'
+    MultiSequence *result = new MultiSequence();
+    for (int s = 0; s < groupOneSeqs->GetNumSequences(); ++s) {
+        result->AddSequence(
+                groupOneSeqs->GetSequence(s)->AddGaps(refinealignment.first, 'X'));
+    }
+    for (int s = 0; s < groupTwoSeqs->GetNumSequences(); ++s) {
+        result->AddSequence(
+                groupTwoSeqs->GetSequence(s)->AddGaps(refinealignment.first, 'Y'));
+    }
+
+    // free temporary alignment and hand the new one to the caller
+    delete refinealignment.first;
+    delete alignment;
+    alignment = result;
+    delete groupOneSeqs;
+    delete groupTwoSeqs;
+
+    // identical score before and after means the round changed nothing useful
+    return (accuracy_before == refinealignment.second) ? 1 : 0;
+}
+
+
+// Realigns "alignment" along one guide-tree node: all leaves listed for
+// alignment order "nodeIndex" (left and right subtree together) form group
+// one, every other sequence forms group two, and the two projected groups
+// are aligned against each other with AlignAlignments(). Nothing is changed
+// when one of the groups is empty.
+void MSA::DoIterativeRefinementTreeNode(
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model, MultiSequence* &alignment,
+        int nodeIndex) {
+    const int numSeqs = alignment->GetNumSequences();
+
+    // mark every leaf that hangs below the selected node
+    vector<bool> belowNode(numSeqs, false);
+    AlignmentOrder* order = &this->tree->getAlignOrders()[nodeIndex];
+    for (int k = 0; k < order->leftNum; ++k)
+        belowNode[order->leftLeafs[k]] = true;
+    for (int k = 0; k < order->rightNum; ++k)
+        belowNode[order->rightLeafs[k]] = true;
+
+    // create two separate groups
+    set<int> groupOne, groupTwo;
+    for (int s = 0; s < numSeqs; ++s) {
+        if (belowNode[s])
+            groupOne.insert(s);
+        else
+            groupTwo.insert(s);
+    }
+    if (groupOne.empty() || groupTwo.empty())
+        return;
+
+    // project into the two groups; the old alignment is released afterwards
+    MultiSequence *groupOneSeqs = alignment->Project(groupOne);
+    assert(groupOneSeqs);
+    MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
+    assert(groupTwoSeqs);
+    delete alignment;
+
+    // realign
+    alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices,
+            model);
+
+    delete groupOneSeqs;
+    delete groupTwoSeqs;
+}
+
+/////////////////////////////////////////////////////////////////
+// WriteAnnotation()
+//
+// Writes one annotation score per alignment column to the file
+// named by annotationFilename. The score of a column is
+// ComputeScore() over the residues present in that column.
+// Terminates the program if the file cannot be opened.
+/////////////////////////////////////////////////////////////////
+
+void MSA::WriteAnnotation(MultiSequence *alignment,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
+    ofstream outfile(annotationFilename.c_str());
+    if (outfile.fail()) {
+        cerr << "ERROR: Unable to write annotation file." << endl;
+        exit(1);
+    }
+
+    const int numSeqs = alignment->GetNumSequences();
+    const int alignLength = alignment->GetSequence(0)->GetLength();
+
+    // per sequence: data pointer and sort label
+    SafeVector<SafeVector<char>::iterator> seqs(numSeqs);
+    SafeVector<int> lab;
+    for (int s = 0; s < numSeqs; ++s) {
+        seqs[s] = alignment->GetSequence(s)->GetDataPtr();
+        lab.push_back(alignment->GetSequence(s)->GetSortLabel());
+    }
+
+    // per sequence: number of residues consumed so far
+    SafeVector<int> position(numSeqs, 0);
+    SafeVector<pair<int, int> > active;
+    active.reserve(numSeqs);
+
+    // for every column
+    for (int col = 1; col <= alignLength; ++col) {
+        // collect (label, residue position) for every non-gap entry
+        active.clear();
+        for (int s = 0; s < numSeqs; ++s) {
+            if (seqs[s][col] == '-')
+                continue;
+            position[s] += 1;
+            active.push_back(make_pair(lab[s], position[s]));
+        }
+
+        sort(active.begin(), active.end());
+        outfile << setw(4) << ComputeScore(active, sparseMatrices) << endl;
+    }
+
+    outfile.close();
+}
+
+/////////////////////////////////////////////////////////////////
+// ComputeScore()
+//
+// Computes the annotation score for a particular column.
+// "active" holds one (sequence label, residue position) pair per
+// residue in the column. Columns with fewer than two residues
+// score 0; otherwise the result is 200 * (sum of the pairwise
+// sparse-matrix values) / (n * (n - 1)), truncated to int.
+/////////////////////////////////////////////////////////////////
+
+int MSA::ComputeScore(const SafeVector<pair<int, int> > &active,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
+
+    const int residues = (int) active.size();
+    if (residues <= 1)
+        return 0;
+
+    // ALTERNATIVE #1: Compute the average alignment score.
+    float total = 0;
+    for (int first = 0; first < residues; ++first) {
+        const pair<int, int> &a = active[first];
+        for (int second = first + 1; second < residues; ++second) {
+            const pair<int, int> &b = active[second];
+            total += sparseMatrices[a.first][b.first]->GetValue(a.second,
+                    b.second);
+        }
+    }
+
+    return (int) (200 * total / (residues * (residues - 1)));
+}
+
+/////////////////////////////////////////////////////////////////
+// ComputeSimilarity ()
+//
+// Computes the average similarity for a particular family.
+// Every pair of sequences is aligned with
+// model.ComputeViterbiAlignment(); the float returned with the
+// alignment divided by the alignment length is averaged over all
+// pairs. The average is also appended to the file "accuracy".
+//
+// Return value (thresholds exactly as implemented below):
+//   0: average <= 0.25 or average > 0.8
+//   1: 0.25 < average <= 0.4
+//   2: everything else (0.4 < average <= 0.8)
+// NOTE(review): earlier documentation of this function quoted 20%/50%/80%
+// boundaries; the code uses 0.25/0.4/0.8 - confirm which is intended.
+/////////////////////////////////////////////////////////////////
+extern pair<SafeVector<char> *, float> partViterbi(string seq1, string seq2);
+extern float computeS(string seq1, string seq2, SafeVector<char> * alignment);
+
+int MSA::ComputeSimilarity (MultiSequence *sequences,const ProbabilisticModel &model){
+    assert(sequences);
+
+    //get the number of sequences
+    const int numSeqs = sequences->GetNumSequences();
+    //number of unordered sequence pairs; kept in a local so that the average
+    //below does not depend on numPairs, which this function only assigns in
+    //OpenMP builds
+    const int totalPairs = (numSeqs - 1) * numSeqs / 2;
+    //sum (after the loop: average) of the per-pair similarity
+    float identity = 0;
+
+#ifdef _OPENMP
+    //calculate sequence pairs for openmp model
+    int pairIdx = 0;
+    numPairs = totalPairs;
+    seqsPairs = new SeqsPair[numPairs];
+    for(int a = 0; a < numSeqs; a++) {
+        for(int b = a + 1; b < numSeqs; b++) {
+            seqsPairs[pairIdx].seq1 = a;
+            seqsPairs[pairIdx].seq2 = b;
+            pairIdx++;
+        }
+    }
+#endif
+
+    // do all pairwise alignments for family similarity
+#ifdef _OPENMP
+    //"identity" is accumulated by every thread: the reduction gives each thread
+    //a private partial sum and adds them up at the end (no data race)
+#pragma omp parallel for private(pairIdx) default(shared) reduction(+:identity) schedule(dynamic)
+    for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
+        int a= seqsPairs[pairIdx].seq1;
+        int b = seqsPairs[pairIdx].seq2;
+        if(enableVerbose) {
+#pragma omp critical
+            cerr <<"tid "<<omp_get_thread_num()<<" a "<<a<<" b "<<b<<endl;
+        }
+#else
+    for (int a = 0; a < numSeqs - 1; a++) {
+        for (int b = a + 1; b < numSeqs; b++) {
+#endif
+            Sequence *seq1 = sequences->GetSequence(a);
+            Sequence *seq2 = sequences->GetSequence(b);
+
+            //pair<SafeVector<char> *, float> alignment = ::partViterbi(seq1->GetString(),seq2->GetString());
+            //cerr << alignment.second / alignment.first->size();
+            //cerr << computeS(seq1->GetString(),seq2->GetString(),alignment.first)<< endl;
+            pair<SafeVector<char> *, float> alignment = model.ComputeViterbiAlignment(seq1,seq2);
+/*
+            VF* posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString());
+            pair<SafeVector<char> *, float> alignment = model.ComputeAlignment(
+                    seq1->GetLength(), seq2->GetLength(), *posterior);
+*/
+/*
+            SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
+            SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
+
+            float N_correct_match = 0;
+            //float N_match;
+            //float N_column = 0;
+            //float N_alignment = 0;
+            int i = 1;int j = 1;
+            //bool start = false; bool end = false;
+            for (SafeVector<char>::iterator iter = alignment.first->begin();
+                    iter != alignment.first->end(); ++iter){
+                if (*iter == 'B'){
+                    //N_match += 1;
+                    //start = true;
+                    //if(i==seq1->GetLength() || j==seq2->GetLength()) end = true;
+                    unsigned char c1 = (unsigned char) iter1[i++];
+                    unsigned char c2 = (unsigned char) iter2[j++];
+                    if(c1==c2) N_correct_match += 1;
+                }
+                else if(*iter == 'X') i++;
+                else if(*iter == 'Y') j++;
+                //if(start && !end) N_column += 1;
+                N_alignment += 1;
+            }
+            if(i!= seq1->GetLength()+1 || j!= seq2->GetLength() + 1 ) cerr << "similarity error"<< endl;
+            identity += N_correct_match / N_alignment;
+            //
+*/
+            identity += alignment.second / alignment.first->size();
+            delete alignment.first;
+#ifndef _OPENMP
+        }
+#endif
+    }
+    //average over all pairs (with fewer than two sequences this is 0/0, as before)
+    identity /= totalPairs;
+
+    //append the average to the log file; skip the write instead of passing a
+    //null FILE* to fprintf when the file cannot be opened
+    FILE *fi = fopen ("accuracy", "a");
+    if (fi != NULL) {
+        fprintf (fi, " %.10f ", identity); fprintf (fi, "\n");
+        fclose (fi);
+    } else {
+        cerr << "WARNING: Unable to append to file \"accuracy\"." << endl;
+    }
+
+/*
+    //adapative
+    if(identity <= 0.15) initDistrib[2] = 0.143854;
+    else if(identity <= 0.2) initDistrib[2] = 0.191948;
+    else if(identity <= 0.25) initDistrib[2] = 0.170705;
+    else if(identity <= 0.3) initDistrib[2] = 0.100675;
+    else if(identity <= 0.35) initDistrib[2] = 0.090755;
+    else if(identity <= 0.4) initDistrib[2] = 0.146188;
+    else if(identity <= 0.45) initDistrib[2] = 0.167858;
+    else if(identity <= 0.5) initDistrib[2] = 0.250769;
+    //else if(identity <= 0.6) initDistrib[2] = 0.500829;
+    //else if(identity <= 0.7) initDistrib[2] = 0.259622;
+*/
+    if( identity<= 0.25 || identity > 0.8 ) return 0;
+    else if(identity > 0.2 && identity<= 0.4) return 1;
+    else return 2;
+
+}
--- /dev/null
+/***********************************************
+ * # Copyright 2009-2010. Liu Yongchao
+ * # Contact: Liu Yongchao, School of Computer Engineering,
+ * # Nanyang Technological University.
+ * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
+ * #
+ * # GPL version 3.0 applies.
+ * #
+ * ************************************************/
+
+#include "MSAClusterTree.h"
+// Forwards all arguments to the MSAGuideTree base class; no further state
+// is set up here.
+MSAClusterTree::MSAClusterTree(MSA* msa, VVF& distMatrix, int numSeqs)
+        : MSAGuideTree(msa, distMatrix, numSeqs) {}
+
+// Nothing to release in this class.
+MSAClusterTree::~MSAClusterTree() {}
+// Builds the complete guide tree. The tree has to be generated first,
+// because the weights are computed by walking parent links and the
+// alignment orders are derived from the root chosen by the generator.
+void MSAClusterTree::create() {
+    generateClusterTree();    // cluster tree (nodes, links, root)
+    getSeqsWeights();         // sequence weights
+    createAlignmentOrders();  // alignment orders
+}
+// Builds the cluster tree by repeatedly joining the two closest clusters.
+// Each join creates one internal node whose two children get a branch
+// length of half the joining distance; the distance from the merged cluster
+// to every other cluster is the average of the two old distances weighted by
+// the number of leaves in each joined cluster (average-linkage style).
+// The distance matrix referenced by distMatrix is overwritten in the process.
+// Terminates the program if no joinable pair can be found.
+void MSAClusterTree::generateClusterTree() {
+ int i;
+ ValidNode* validNodes, *headValidNodes;
+ ValidNode* miniPtr, *minjPtr, *ivalid, *jvalid;
+ int mini, minj;
+ float* joins;
+ unsigned int* clusterLeafs;
+
+ //allocate the linked list of still-unmerged clusters (entry 0 is a sentinel head),
+ //the scratch row of new distances, and the per-node leaf counts
+ validNodes = new ValidNode[leafsNum + 1];
+ joins = new float[leafsNum + 1];
+ clusterLeafs = new unsigned int[nodesNum + 1];
+ //NOTE(review): this only prints a message and then continues
+ if (!validNodes || !joins || !clusterLeafs) {
+ cerr << "Out of memory of the reconstruction of cluster tree" << endl;
+ }
+ //initialize cluster size: every leaf is a cluster of one
+ for (i = 0; i < this->leafsNum; i++) {
+ clusterLeafs[i] = 1;
+ }
+
+ //sentinel head of the list; never a merge candidate
+ headValidNodes = &validNodes[0];
+ headValidNodes->next = &validNodes[1];
+ headValidNodes->n = -1;
+ headValidNodes->node = -1;
+ headValidNodes->prev = NULL;
+
+ //build an initial link list: entry i+1 stands for leaf i
+ //(n = row/column in the distance matrix, node = index into nodes[])
+ ValidNode* curr = &validNodes[1];
+ ValidNode* prev = headValidNodes;
+ ValidNode* next = &validNodes[2];
+ for (i = 0; i < leafsNum; i++) {
+ curr->n = i;
+ curr->node = i;
+ curr->prev = prev;
+ curr->next = next;
+ prev = curr;
+ curr = next;
+ next++;
+ }
+ //terminate the list at the last leaf entry
+ prev->next = NULL;
+
+ //to generate the cluster tree
+ int nodeIdx; //the index of an internal node
+ int firstNode = leafsNum; //the index of the first internal node
+ int lastNode = firstNode + leafsNum - 1;//one past the index of the last internal node
+
+ //one iteration per internal node, i.e. leafsNum - 1 joins
+ for (nodeIdx = firstNode; nodeIdx < lastNode; nodeIdx++) {
+ //find closest pair of clusters
+ //only pairs with a distance below 1.1 can be selected
+ //NOTE(review): presumably distances are normalised to at most 1 - confirm
+ float minDist = 1.1f;
+ miniPtr = headValidNodes;
+ minjPtr = headValidNodes;
+
+ for (ivalid = headValidNodes->next; ivalid != NULL;
+ ivalid = ivalid->next) {
+ mini = ivalid->n;
+
+ //only entries with a smaller matrix index are paired with ivalid
+ for (jvalid = headValidNodes->next;
+ jvalid != NULL && jvalid->n < mini; jvalid = jvalid->next) {
+ minj = jvalid->n;
+ float dist = (*distMatrix)[mini][minj];
+ if (dist < 0) {
+ cerr
+ << "ERROR: It is impossible to have distance value less than zero"
+ << endl;
+ dist = 0;
+ }
+ if (dist < minDist) {
+ minDist = dist;
+ miniPtr = ivalid;
+ minjPtr = jvalid;
+ }
+ //printf("dist %g mini %d minj %d\n", dist, ivalid->node, jvalid->node);
+ }
+ }
+ //printf("**** mini %d minj %d minDist %g *****\n", miniPtr->node, minjPtr->node, minDist);
+ //check the validity of miniPtr and minjPtr;
+ //both still point at the sentinel when no pair was selected
+ if (miniPtr == headValidNodes || minjPtr == headValidNodes) {
+ cerr << "OOPS: Error occurred while constructing the cluster tree\n"
+ << endl;
+ exit(-1);
+ }
+ //computing branch length and join the two nodes
+ float branchLength = minDist * 0.5f;
+ this->connectNodes(&nodes[nodeIdx], nodeIdx, &nodes[miniPtr->node],
+ branchLength, &nodes[minjPtr->node], branchLength);
+ clusterLeafs[nodeIdx] = clusterLeafs[miniPtr->node]
+ + clusterLeafs[minjPtr->node];
+
+ //remove the valid node minjPtr from the list
+ minjPtr->prev->next = minjPtr->next;
+ if (minjPtr->next != NULL) {
+ minjPtr->next->prev = minjPtr->prev;
+ }
+ minjPtr->prev = minjPtr->next = NULL;
+
+ //compute the distance of each remaining valid node to the new node
+ //(miniPtr->node still refers to the OLD cluster here, so isize/jsize are
+ //the leaf counts of the two clusters that were just joined)
+ for (ivalid = headValidNodes->next; ivalid != NULL;
+ ivalid = ivalid->next) {
+ int idx = ivalid->n;
+
+ float idist = (*distMatrix)[miniPtr->n][idx];
+ float jdist = (*distMatrix)[minjPtr->n][idx];
+
+ unsigned int isize = clusterLeafs[miniPtr->node];
+ unsigned int jsize = clusterLeafs[minjPtr->node];
+ joins[idx] = (idist * isize + jdist * jsize) / (isize + jsize);
+ //joins[idx] = (idist + jdist )/ 2;
+ }
+ //update the distance to the new node: the merged cluster takes over the
+ //list entry and the matrix row/column of miniPtr
+ miniPtr->node = nodeIdx;
+ mini = miniPtr->n;
+ for (jvalid = headValidNodes->next; jvalid != NULL;
+ jvalid = jvalid->next) {
+ minj = jvalid->n;
+
+ float dist = joins[minj];
+ (*distMatrix)[mini][minj] = dist;
+ (*distMatrix)[minj][mini] = dist;
+ }
+ }
+ //the internal node created last becomes the root of the tree
+ this->root = &nodes[lastNode - 1];
+
+ delete[] validNodes;
+ delete[] joins;
+ delete[] clusterLeafs;
+}
--- /dev/null
+/***********************************************
+ * # Copyright 2009-2010. Liu Yongchao
+ * # Contact: Liu Yongchao, School of Computer Engineering,
+ * # Nanyang Technological University.
+ * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
+ * #
+ * # GPL version 3.0 applies.
+ * #
+ * ************************************************/
+
+#ifndef _MSA_CLUSTER_TREE_H
+#define _MSA_CLUSTER_TREE_H
+
+#include "MSAGuideTree.h"
+
+//Guide tree built by generateClusterTree(); everything else (nodes, weights,
+//alignment orders) is inherited from MSAGuideTree.
+class MSAClusterTree: public MSAGuideTree {
+public:
+ //forwards its arguments unchanged to the MSAGuideTree constructor
+ MSAClusterTree(MSA* msa, VVF& distMatrix, int numSeqs);
+ ~MSAClusterTree();
+
+ //construct the cluster tree, then the sequence weights and the alignment orders
+ void create();
+private:
+ //generate the cluster tree (nodes, links and root only)
+ void generateClusterTree();
+};
+#endif
--- /dev/null
+#ifndef _MSA_DEF_H
+#define _MSA_DEF_H
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <float.h>
+#include <math.h>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+//maximum number: largest 32-bit signed integer and largest finite float
+#define MAX_INT_NUM 0x7FFFFFFF
+#define MAX_FLOAT_NUM FLT_MAX
+//scale applied when sequence weights are normalised to integers
+//(see MSAGuideTree::getSeqsWeights)
+#define INT_MULTIPLY 1000
+
+//NOTE(review): presumably the integer scale of substitution-matrix scores; not used in the code visible here
+#define SUBMATRIX_INT_SCALE 100
+
+//kind of a tree node stored in TreeNode::leaf: NODE is an internal node, LEAF is a leaf
+//(NONE has the value 0)
+enum {
+ NONE, NODE, LEAF
+};
+
+#endif
+
--- /dev/null
+/***********************************************
+ * # Copyright 2009-2010. Liu Yongchao
+ * # Contact: Liu Yongchao, School of Computer Engineering,
+ * # Nanyang Technological University.
+ * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
+ * #
+ * # GPL version 3.0 applies.
+ * #
+ * ************************************************/
+#include "MSAGuideTree.h"
+#include "MSA.h"
+// Sets up an empty guide tree for "numSeqs" sequences.
+//   msa       - owner object; its getSeqsWeights() buffer is stored for later use
+//   distances - pairwise distance matrix; only its address is kept
+//   numSeqs   - number of sequences, i.e. number of leaves
+// The node array is allocated here (2 * numSeqs + 1 entries); the first
+// numSeqs entries are the leaves. Links, root and alignment orders are
+// filled in later.
+MSAGuideTree::MSAGuideTree(MSA* msa, VVF& distances, int numSeqs) {
+    int i;
+    TreeNode* node;
+    //system configuration
+    this->msa = msa;
+    this->distMatrix = &distances;
+    this->numSeqs = numSeqs;
+    this->seqsWeights = msa->getSeqsWeights();
+
+    //members that are only filled in later; give them a defined state because
+    //the destructor inspects alignOrders even when no orders were ever created
+    this->root = 0;
+    this->alignOrders = 0;
+    this->alignOrdersNum = 0;
+    this->alignOrdersSize = 0;
+
+    //tree structure
+    this->nodesSize = this->numSeqs * 2 + 1;
+    this->nodes = new TreeNode[this->nodesSize];
+    if (!this->nodes) {
+        cerr << "TreeNodes memory allocation failed" << endl;
+        exit(-1);
+    }
+    //initialize all the tree nodes
+    this->leafs = this->nodes;
+    this->leafsNum = this->numSeqs;
+    this->nodesNum = 2 * this->leafsNum - 1;
+    for (i = 0; i < this->nodesSize; i++) {
+        node = &nodes[i];
+        node->left = 0;
+        node->right = 0;
+        node->parent = 0;
+        node->leftIdx = -1;
+        node->rightIdx = -1;
+        node->parentIdx = -1;
+        node->idx = -1;
+        node->dist = 0;
+        node->leaf = NODE; //set to NODE by default
+        node->order = 0;
+        node->depth = 0;
+    }
+    //initialize the leaf nodes
+    for (i = 0; i < this->leafsNum; i++) {
+        node = &this->leafs[i];
+        node->idx = i;
+        node->leaf = LEAF;
+    }
+}
+// Frees the node array and then the alignment orders.
+MSAGuideTree::~MSAGuideTree() {
+    delete[] nodes;            // tree nodes (leaves live in the same array)
+    releaseAlignmentOrders();  // alignment orders and their leaf lists
+}
+// Array of all tree nodes.
+TreeNode* MSAGuideTree::getNodes() {
+    return this->nodes;
+}
+// Array of the leaf nodes.
+TreeNode* MSAGuideTree::getLeafs() {
+    return this->leafs;
+}
+// Number of tree nodes.
+int MSAGuideTree::getNodesNum() {
+    return this->nodesNum;
+}
+// Number of leaf nodes.
+int MSAGuideTree::getLeafsNum() {
+    return this->leafsNum;
+}
+// Array of alignment orders.
+AlignmentOrder* MSAGuideTree::getAlignOrders() {
+    return this->alignOrders;
+}
+// Number of alignment orders.
+int MSAGuideTree::getAlignOrdersNum() {
+    return this->alignOrdersNum;
+}
+/****************************************************
+ Links two children to their parent: each child records the parent
+ (pointer and index) and its branch length, and the parent records
+ its own index plus pointer and index of both children.
+ ****************************************************/
+void MSAGuideTree::connectNodes(TreeNode* parent, int parentIdx,
+        TreeNode* leftChild, float leftDist, TreeNode* rightChild,
+        float rightDist) {
+    //left child: parent link and distance to the parent
+    leftChild->parent = parent;
+    leftChild->parentIdx = parentIdx;
+    leftChild->dist = leftDist;
+
+    //right child: parent link and distance to the parent
+    rightChild->parent = parent;
+    rightChild->parentIdx = parentIdx;
+    rightChild->dist = rightDist;
+
+    //parent: own index and both child links
+    parent->idx = parentIdx;
+    parent->left = leftChild;
+    parent->leftIdx = leftChild->idx;
+    parent->right = rightChild;
+    parent->rightIdx = rightChild->idx;
+}
+/*****************************************
+ Computes the alignment orders of the guide tree: one record per
+ internal node, stored from index 1 upwards (index 0 stays empty).
+ Prints a diagnostic if the traversal result is inconsistent.
+ *****************************************/
+void MSAGuideTree::createAlignmentOrders() {
+    //numSeqs slots: one per internal node plus the unused slot 0
+    this->alignOrdersNum = 0;
+    this->alignOrdersSize = numSeqs;
+    this->alignOrders = new AlignmentOrder[this->alignOrdersSize];
+    if (!this->alignOrders) {
+        cerr << "OOPS: Alignment orders memory allocation failed" << endl;
+        exit(-1);
+    }
+
+    //start every record without children and without leaf lists
+    for (int slot = 0; slot < this->alignOrdersSize; slot++) {
+        AlignmentOrder& record = this->alignOrders[slot];
+        record.leftOrder = 0;
+        record.rightOrder = 0;
+        record.leftLeafs = 0;
+        record.leftNum = 0;
+        record.rightLeafs = 0;
+        record.rightNum = 0;
+    }
+
+    //traverse from the root; the root has no parent buffer, hence the 0
+    int subLeafsNum;
+    const int rootDepth = 1;
+    const int subOrder = recursiveCreateAlignmentOrders(this->root, 0,
+            subLeafsNum, rootDepth);
+
+    //sanity check: all leaves seen and the root received the last order number
+    const bool consistent = (subLeafsNum == numSeqs)
+            && (this->alignOrdersNum == subOrder);
+    if (!consistent) {
+        fprintf(stderr,
+                "The alignment orders constructed were wrong (subLeafsNum %d, alignOrdersNum %d, subOrder %d)\n",
+                subLeafsNum, alignOrdersNum, subOrder);
+    }
+}
+// Post-order traversal that creates one AlignmentOrder record per internal
+// node of the subtree rooted at "subRoot".
+//   subRoot     - root of the subtree to process
+//   subLeafs    - buffer that receives the leaf indices of this subtree
+//                 (left subtree first); may be 0, in which case nothing is copied
+//   subLeafsNum - receives the number of leaves in this subtree
+//   nodeDepth   - depth of subRoot (the caller starts at 1)
+// Returns the index of the record created for subRoot (records start at 1),
+// or 0 if subRoot is a leaf. Terminates the program if the record array
+// would overflow.
+int MSAGuideTree::recursiveCreateAlignmentOrders(TreeNode* subRoot,
+        int* subLeafs, int& subLeafsNum, int nodeDepth) {
+    int leftNum, rightNum;
+    int leftOrder, rightOrder;
+    int* leftLeafs, *rightLeafs;
+
+    if (subRoot->leaf == LEAF) {
+        //the buffer is 0 when the root itself is a leaf (single sequence)
+        if (subLeafs) {
+            subLeafs[0] = subRoot->idx;
+        }
+        subLeafsNum = 1;
+
+        return 0; //if it is a leaf, return the index 0
+    }
+    leftOrder = rightOrder = 0;
+    leftNum = rightNum = 0;
+    //temporary buffers, large enough for any subtree
+    leftLeafs = new int[numSeqs];
+    rightLeafs = new int[numSeqs];
+
+    //check the left subtree
+    if (subRoot->left) {
+        //recursively traverse the left subtree
+        leftOrder = recursiveCreateAlignmentOrders(subRoot->left, leftLeafs,
+                leftNum, nodeDepth + 1);
+    }
+    //check the right subtree
+    if (subRoot->right) {
+        rightOrder = recursiveCreateAlignmentOrders(subRoot->right, rightLeafs,
+                rightNum, nodeDepth + 1);
+    }
+    //the record is written to slot alignOrdersNum + 1, so that slot (not the
+    //current count) has to lie inside the array
+    if (this->alignOrdersNum + 1 >= this->alignOrdersSize) {
+        fprintf(stderr, "the alignment order function works bad\n");
+        exit(-1);
+    }
+
+    //save the leafs in the left and right subtrees of the current subtree
+    AlignmentOrder* order = &this->alignOrders[++this->alignOrdersNum];
+    order->nodeDepth = nodeDepth;
+    order->leftOrder = leftOrder;
+    order->rightOrder = rightOrder;
+    order->leftNum = leftNum;
+    order->rightNum = rightNum;
+    order->leftLeafs = new int[order->leftNum];
+    order->rightLeafs = new int[order->rightNum];
+    if (!order->leftLeafs || !order->rightLeafs) {
+        fprintf(stderr,
+                "memory allocation failed while recursively constructing alignment orders\n");
+        exit(-1);
+    }
+    memcpy(order->leftLeafs, leftLeafs, order->leftNum * sizeof(int));
+    memcpy(order->rightLeafs, rightLeafs, order->rightNum * sizeof(int));
+
+    delete[] leftLeafs;
+    delete[] rightLeafs;
+
+    //for the root of the tree, subLeafs buffer is set to 0
+    if (subLeafs) {
+        //copy the results to the parent tree node
+        memcpy(subLeafs, order->leftLeafs, order->leftNum * sizeof(int));
+        memcpy(subLeafs + order->leftNum, order->rightLeafs,
+                order->rightNum * sizeof(int));
+    }
+    //compute the total number of leafs in this subtree
+    subLeafsNum = order->leftNum + order->rightNum;
+
+    return this->alignOrdersNum;//return the index of itself, starting from 1, instead of 0
+}
+// Frees the leaf lists of every alignment order and the order array itself.
+// Records are stored in slots 1..alignOrdersNum (slot 0 is never filled), so
+// the loop has to include slot alignOrdersNum; stopping one short leaked the
+// lists of the last record. Safe to call more than once.
+void MSAGuideTree::releaseAlignmentOrders() {
+    if (!this->alignOrders) {
+        return;
+    }
+    for (int i = 1; i <= this->alignOrdersNum && i < this->alignOrdersSize;
+            i++) {
+        AlignmentOrder* order = &this->alignOrders[i];
+        //delete[] on a null pointer is a no-op, so no test is needed
+        delete[] order->leftLeafs;
+        delete[] order->rightLeafs;
+        order->leftLeafs = 0;
+        order->rightLeafs = 0;
+    }
+    delete[] alignOrders;
+
+    //leave no dangling pointer behind
+    this->alignOrders = 0;
+    this->alignOrdersNum = 0;
+    this->alignOrdersSize = 0;
+}
+/********************************
+ Debug helper: prints every alignment order (records 1..alignOrdersNum)
+ with its depth, child order numbers and leaf lists to stderr.
+ ********************************/
+void MSAGuideTree::displayAlignmentOrders() {
+    fprintf(stderr, "************DISPLAY ALIGNMENT ORDER***************\n");
+    for (int group = 1; group <= this->alignOrdersNum; group++) {
+        const AlignmentOrder& record = this->alignOrders[group];
+
+        //left side
+        fprintf(stderr, "GROUP (%d depth %d):\n---LEFT ORDER: %d\n", group,
+                record.nodeDepth, record.leftOrder);
+        fprintf(stderr, "---LEFT: ");
+        for (int k = 0; k < record.leftNum; k++) {
+            fprintf(stderr, "%d ", record.leftLeafs[k]);
+        }
+
+        //right side
+        fprintf(stderr, "\n---RIGHT ORDER: %d\n", record.rightOrder);
+        fprintf(stderr, "\n---RIGHT: ");
+        for (int k = 0; k < record.rightNum; k++) {
+            fprintf(stderr, "%d ", record.rightLeafs[k]);
+        }
+        fprintf(stderr, "\n");
+    }
+    fprintf(stderr, "*******************************************\n");
+}
+/*********************************
+ Debug helper: prints every tree node with its links to stderr.
+ An index is printed as -2 when the stored index does not match the
+ stored pointer (a missing link is not treated as a mismatch).
+ *********************************/
+void MSAGuideTree::displayTree() {
+    fprintf(stderr, "**************DISPLAY TREE*********************\n");
+    for (int i = 0; i < nodesNum; i++) {
+        TreeNode* node = &nodes[i];
+
+        const int selfIdx = (node == &nodes[node->idx]) ? node->idx : -2;
+        const int leftIdx =
+                (!node->left || node->left == &nodes[node->leftIdx]) ?
+                        node->leftIdx : -2;
+        const int rightIdx =
+                (!node->right || node->right == &nodes[node->rightIdx]) ?
+                        node->rightIdx : -2;
+        const int parentIdx =
+                (!node->parent || node->parent == &nodes[node->parentIdx]) ?
+                        node->parentIdx : -2;
+
+        fprintf(stderr,
+                "%d(%p): left(%p) %d, right(%p) %d, parent(%p) %d, dist %f\n",
+                selfIdx, node, node->left, leftIdx, node->right, rightIdx,
+                node->parent, parentIdx, node->dist);
+    }
+    fprintf(stderr, "*******************************************\n");
+}
+/*********************************
+ Computes the integer sequence weights and stores them in seqsWeights.
+ Every node first gets "order" incremented once per leaf below it. The raw
+ weight of a leaf is the sum of dist / order along its path to the root,
+ times 100 and truncated. The weights are then rescaled so that they add up
+ to roughly INT_MULTIPLY, with a minimum of 1 per sequence.
+ *********************************/
+void MSAGuideTree::getSeqsWeights() {
+    //count, for every node, the leaves in the subtree below it
+    for (int leaf = 0; leaf < leafsNum; leaf++) {
+        for (TreeNode* walk = &this->leafs[leaf]; walk != 0;
+                walk = walk->parent) {
+            walk->order++;
+        }
+    }
+
+    //raw weight per sequence: each branch is shared by the leaves below it
+    int wsum = 0;
+    for (int seq = 0; seq < numSeqs; seq++) {
+        float pathWeight = 0;
+        for (TreeNode* walk = &this->leafs[seq]; walk->parent != 0;
+                walk = walk->parent) {
+            pathWeight += walk->dist / walk->order;
+        }
+        seqsWeights[seq] = (int) (100 * pathWeight);
+        wsum += seqsWeights[seq];
+    }
+
+    //all raw weights zero: treat every sequence as equally important
+    if (wsum == 0) {
+        for (int seq = 0; seq < numSeqs; seq++) {
+            seqsWeights[seq] = 1;
+        }
+        wsum = numSeqs;
+    }
+
+    //normalize the weights; never let a weight drop below 1
+    for (int seq = 0; seq < numSeqs; seq++) {
+        const int scaled = (seqsWeights[seq] * INT_MULTIPLY) / wsum;
+        seqsWeights[seq] = (scaled < 1) ? 1 : scaled;
+    }
+}
+// Default implementation: intentionally empty. MSAClusterTree::create()
+// provides the actual tree construction.
+void MSAGuideTree::create() {}
+
--- /dev/null
+/***********************************************
+ * # Copyright 2009-2010. Liu Yongchao
+ * # Contact: Liu Yongchao, School of Computer Engineering,
+ * # Nanyang Technological University.
+ * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
+ * #
+ * # GPL version 3.0 applies.
+ * #
+ * ************************************************/
+
+#ifndef _MSA_GUIDE_TREE_H
+#define _MSA_GUIDE_TREE_H
+#include "MSADef.h"
+#include "MSA.h"
+
+#include "SafeVector.h"
+#include "MultiSequence.h"
+#include "ScoreType.h"
+#include "ProbabilisticModel.h"
+#include "SparseMatrix.h"
+
+class MSA;
+//Entry of the doubly linked list of clusters that are still unmerged while
+//the cluster tree is being generated.
+struct ValidNode {
+ ValidNode* prev; //previous list entry (NULL for the list head)
+ ValidNode* next; //next list entry (NULL at the end of the list)
+ int n; //the index in the distance matrix
+ int node; //the index in the tree node entries
+};
+
+//One node of the guide tree. Links are stored twice: as pointers and as
+//indices into the node array (-1 when unset).
+struct TreeNode {
+ struct TreeNode *left; //the pointer to its left child
+ struct TreeNode *right; //the pointer to its right child
+ struct TreeNode *parent; //the pointer to its parent
+ int leftIdx; //the index of the left child
+ int rightIdx; //the index of the right child
+ int parentIdx; //the index of its parent
+ int idx; //the index of itself
+ float dist; //the distance to its parent
+ int leaf; //whether it is a leaf node or not (NODE or LEAF)
+ int order; //incremented once per leaf below this node by MSAGuideTree::getSeqsWeights()
+ int depth; //the depth of the node (initialised to 0 by the MSAGuideTree constructor)
+};
+//Record created for one internal node of the guide tree by
+//MSAGuideTree::recursiveCreateAlignmentOrders().
+struct AlignmentOrder {
+ int nodeDepth; //the depth of the internal node
+ int leftOrder; //the order number of the left child (0 if the left child is a leaf)
+ int rightOrder; //the order number of the right child (0 if the right child is a leaf)
+ int* leftLeafs; //the indices of leafs in the left subtree
+ int leftNum; //the number of leafs in the left subtree
+ int* rightLeafs; //the indices of leafs in the right subtree
+ int rightNum; //the number of leafs in the right subtree
+};
+
+//Base class of the guide trees: owns the node array and the alignment
+//orders and provides the sequence-weight computation. The tree itself is
+//built by a subclass in create().
+class MSAGuideTree {
+public:
+ //allocates and initialises the node array for numSeqs leaves
+ MSAGuideTree(MSA* msa, VVF& distMatrix, int numSeqs);
+ virtual ~MSAGuideTree() = 0; //abstract class (the destructor still has a definition)
+
+ //get the tree nodes
+ TreeNode* getNodes();
+ //get the leaf nodes
+ TreeNode* getLeafs();
+ //get the number of nodes;
+ int getNodesNum();
+ //get the number of leaf nodes
+ int getLeafsNum();
+ //get the root of the tree
+ TreeNode* getRoot() {
+ return this->root;
+ }
+ //get the alignment orders (records are stored from index 1)
+ AlignmentOrder* getAlignOrders();
+ int getAlignOrdersNum();
+ //construct the alignment orders
+ void createAlignmentOrders();
+
+ //construct the guide tree (does nothing in this base class)
+ virtual void create();
+ //calculate the sequence weights and store them in seqsWeights
+ virtual void getSeqsWeights();
+
+ /**********DEBUGGING****************/
+ //display the tree
+ void displayTree();
+ //display the alignment orders
+ void displayAlignmentOrders();
+
+protected:
+ //join two nodes
+ void connectNodes(TreeNode* parent, int parentIdx, TreeNode* leftChild,
+ float leftDist, TreeNode* rightChild, float rightDist);
+ //release the alignment orders vector
+ void releaseAlignmentOrders();
+ //recursive implementation of constructing the alignment orders
+ int recursiveCreateAlignmentOrders(TreeNode* subRoot, int* subLeafs,
+ int& subLeafsNum, int nodeDepth);
+
+ //system configurations
+ MSA* msa;
+ VVF* distMatrix; //address of the matrix passed to the constructor
+ int numSeqs;
+ int* seqsWeights; //buffer obtained from msa->getSeqsWeights() in the constructor
+
+ //all the tree nodes
+ TreeNode* nodes;
+ int nodesNum; //2 * leafsNum - 1
+ int nodesSize; //allocated entries: 2 * numSeqs + 1
+ //the root tree node
+ TreeNode* root;
+ //leaf node (same array as nodes; the leaves are its first leafsNum entries)
+ TreeNode* leafs;
+ int leafsNum;
+
+ //alignment order
+ AlignmentOrder* alignOrders;
+ int alignOrdersNum; //index of the last record in use
+ int alignOrdersSize; //allocated entries
+};
+#endif
+
--- /dev/null
+/***********************************************
+ * # Copyright 2009-2010. Liu Yongchao
+ * # Contact: Liu Yongchao, School of Computer Engineering,
+ * # Nanyang Technological University.
+ * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
+ * #
+ * # GPL version 3.0 applies.
+ * #
+ * ************************************************/
+#include "SafeVector.h"
+#include <iostream>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include <ctype.h>
+#include <assert.h>
+#include "MultiSequence.h"
+#include "ScoreType.h"
+
+#define TRACE 0 // 0: no trace output, 1: trace output (dump file and printf diagnostics)
+//proba like settings
+#define endgaps 1 // 1: end-gap penalties enabled 0: disabled
+#define PART_FULL_MEMORY 0 //0: LOW MEM OPTION
+#define REVPART_FULL_MEMORY 0 //0: low-memory option (two rows per matrix), otherwise full matrices
+using namespace std;
+
+//value used as OS_HUGE_VALL: HUGE_VAL on Windows, HUGE_VALL elsewhere
+#ifdef _WIN32
+#define OS_HUGE_VALL HUGE_VAL
+#else
+#define OS_HUGE_VALL HUGE_VALL
+#endif
+
+//Bundle of run-time arguments (see the extern "argument" declared below).
+//NOTE(review): the field meanings are not visible in this part of the file - confirm against the code that fills the struct.
+typedef struct {
+ char input[30];
+ int matrix;
+ int N;
+ float T;
+ float beta;
+ char opt; //can be 'P' or 'M'
+ float gapopen;
+ float gapext;
+} argument_decl;
+
+//One input sequence.
+typedef struct sequence {
+ char *title;
+ char *text; //NUL-terminated residue string (revers_partf measures it with strlen)
+ int length; //used by revers_partf to size its matrices; presumably strlen(text) - confirm
+} fasta;
+
+//Same layout as "fasta".
+//NOTE(review): presumably holds one aligned sequence; not used in the code visible here.
+typedef struct alignment {
+ char *title;
+ char *text;
+ int length;
+} align;
+
+////////////////////////////////////////////////////////
+//externs related to scoring matrix and input arguments
+//(declared here, defined in another translation unit)
+///////////////////////////////////////////////////////////
+extern float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2;
+extern char aminos[26], matrixtype[20], bases[26];
+
+extern double sub_matrix[26][26];
+extern double normalized_matrix[26][26]; // added by YE Yongtao
+extern int subst_index[26];
+
+extern float TEMPERATURE;
+extern int MATRIXTYPE;
+
+extern float GAPOPEN;
+extern float GAPEXT;
+extern argument_decl argument;
+
+//////////////////////////////////////////////////////////////////////////////
+//calculates reverse partition function values based on the z matrices
+//and simultaneously calculates the probability of each base pair
+//or amino-acid residue pair i,j
+//////////////////////////////////////////////////////////////////////////////
+
+/**
+ * Reverse (backward) partition-function pass. In low-memory mode
+ * (REVPART_FULL_MEMORY == 0) it also turns the forward and backward values
+ * into the posterior probability of every residue pair.
+ *
+ * Rows of the DP (index i) follow sequences[1], columns (index j) follow
+ * sequences[0]. Zm is the match state, Ze a gap consuming sequences[0]
+ * (horizontal move) and Zf a gap consuming sequences[1] (vertical move).
+ *
+ * @param sequences      the two sequences; .text and .length are read.
+ * @param termgapopen    weight for opening an end gap (already exponentiated
+ *                       by the caller).
+ * @param termgapextend  weight for extending an end gap.
+ * @param Zfm            forward match matrix with len1+1 rows whose [0][0]
+ *                       entry holds the total forward partition function (as
+ *                       produced by partf). This function takes ownership:
+ *                       all rows and the row table are freed before return.
+ * @param d              weight for opening an internal gap.
+ * @param e              weight for extending an internal gap.
+ * @return heap-allocated VF with (len0+1)*(len1+1) entries. The posterior
+ *         for residue j of sequences[0] and residue i of sequences[1] (both
+ *         1-based) is stored at index j*(len1+1)+i; entry 0 is forced to 0.
+ *         In full-memory mode the vector is returned zero-filled. The caller
+ *         owns the result.
+ */
+VF *revers_partf(fasta sequences[2], const double termgapopen,
+        const double termgapextend, long double **Zfm, const double d,
+        const double e) {
+    int i, j;
+    long double **Zm = NULL;
+    long double **Ze = NULL;
+    long double **Zf = NULL;
+    int len0, len1;
+    float probability;
+    long double tempvar;
+    int Si, Tj;
+    double endgapopen, endgapextend;
+    FILE *fo = NULL; // trace file, only opened when TRACE is enabled
+
+    //Init lengths of sequences
+    len0 = strlen(sequences[0].text);
+    len1 = strlen(sequences[1].text);
+
+    //flattened posterior matrix, owned by the caller after return
+    VF *posteriorPtr = new VF((len0 + 1) * (len1 + 1));
+    VF & posterior = *posteriorPtr;
+    VF::iterator ptr = posterior.begin();
+
+    if (TRACE) //open the trace file
+        fo = fopen("revpartdump", "a");
+
+    //default:
+    endgapopen = termgapopen;
+    endgapextend = termgapextend;
+
+    //instantiate the z matrices
+    if (REVPART_FULL_MEMORY) {
+
+        Ze = new long double *[sequences[1].length + 1];
+        Zf = new long double *[sequences[1].length + 1];
+        Zm = new long double *[sequences[1].length + 1];
+
+        if (TRACE)
+            printf("\n\n %e %e\n", d, e);
+
+        //full (len1+1) x (len0+1) matrices
+        for (i = 0; i <= sequences[1].length; i++) {
+            Ze[i] = new long double[sequences[0].length + 1];
+            Zf[i] = new long double[sequences[0].length + 1];
+            Zm[i] = new long double[sequences[0].length + 1];
+        }
+    } else {
+        //low memory: two rows per matrix
+        Zm = new long double *[2];
+        Ze = new long double *[2];
+        Zf = new long double *[2];
+        for (i = 0; i <= 1; i++) {
+            Zm[i] = new long double[sequences[0].length + 1];
+            Ze[i] = new long double[sequences[0].length + 1];
+            Zf[i] = new long double[sequences[0].length + 1];
+        }
+
+    }
+
+    if (TRACE) {
+        printf("in rev partf---");
+        printf("\n\n");
+    }
+
+    //zero the z matrices
+    if (REVPART_FULL_MEMORY) {
+        for (i = 0; i <= len1; i++)
+            for (j = 0; j <= len0; j++) {
+                Zm[i][j] = 0.0;
+                Zf[i][j] = 0.0;
+                Ze[i][j] = 0.0;
+            }
+    } else {
+
+        for (j = 0; j <= len0; j++) {
+            Zm[0][j] = 0;
+            Zf[0][j] = 0;
+            Ze[0][j] = 0;
+            Zf[1][j] = 0;
+            Ze[1][j] = 0;
+            Zm[1][j] = 0;
+        }
+    }
+
+    //fill the probability matrix with 0s
+    for (i = 0; i <= len1; i++)
+        for (j = 0; j <= len0; j++)
+            ptr[j * (len1 + 1) + i] = 0;
+
+    if (endgaps == 0) {
+        // NOTE(review): this branch indexes rows len1 and len1-1, which only
+        // exist in full-memory mode; it is not taken while endgaps is 1.
+        // Left unchanged - confirm before enabling endgaps == 0 together
+        // with the low-memory option.
+        Zm[len1][len0] = 1;
+        Ze[len1][len0] = Zf[len1][len0] = 0;
+        Zf[len1 - 1][len0] = Zm[len1][len0] * d;
+        Ze[len1][len0 - 1] = Zm[len1][len0] * d;
+
+        //>=2ND ROW INIT
+        if (REVPART_FULL_MEMORY) {
+            for (i = len1 - 2; i >= 0; i--) {
+                Zf[i][len0] = Zf[i + 1][len0] * e;
+            }
+        }
+
+        //>=2ND COL INIT
+        if (REVPART_FULL_MEMORY) {
+            for (j = len0 - 2; j >= 0; j--) {
+                Ze[len1][j] = Ze[len1][j + 1] * e;
+            }
+        } else {
+            for (j = len0 - 2; j >= 0; j--) {
+                Ze[0][j] = Ze[0][j + 1] * e;
+            }
+        }
+    } else {
+
+        if (REVPART_FULL_MEMORY) {
+
+            Zm[len1][len0] = 1;
+            Ze[len1][len0] = Zf[len1][len0] = 0;
+            //the neighbouring boundary cells exist only for non-empty sequences
+            if (len1 > 0)
+                Zf[len1 - 1][len0] = Zm[len1][len0] * endgapopen;
+            if (len0 > 0)
+                Ze[len1][len0 - 1] = Zm[len1][len0] * endgapopen;
+
+            //>=2ND ROW INIT
+            for (i = len1 - 2; i >= 0; i--) {
+                Zf[i][len0] = Zf[i + 1][len0] * endgapextend;
+            }
+
+            //>=2ND COL INIT
+            for (j = len0 - 2; j >= 0; j--) {
+                Ze[len1][j] = Ze[len1][j + 1] * endgapextend;
+            }
+
+        } else {
+            //Low-memory layout while a row is being computed:
+            // Zm[0] is the row being filled, Zm[1] the row below it;
+            // Ze[1]/Zf[1] are being filled, Ze[0]/Zf[0] hold the row below.
+
+            Zm[1][len0] = 1;
+            Ze[0][len0] = Zf[0][len0] = 0;
+            Zf[1][len0] = Zm[1][len0] * endgapopen;
+            //column len0-1 does not exist when sequences[0] is empty
+            if (len0 > 0)
+                Ze[0][len0 - 1] = Zm[1][len0] * endgapopen;
+
+            //>=2ND COL INIT
+            for (j = len0 - 2; j >= 0; j--) {
+                Ze[0][j] = Ze[0][j + 1] * endgapextend;
+            }
+
+        } //END ELSE
+
+    } //END FULL MEMORY and GAP enablement IF STATEMENT
+
+    double scorez, zz = 0;
+
+    for (i = len1 - 1; i >= 0; i--) {
+
+        for (j = len0 - 1; j >= 0; j--) {
+            Si = subst_index[sequences[1].text[i] - 'A'];
+            Tj = subst_index[sequences[0].text[j] - 'A'];
+            scorez = sub_matrix[Si][Tj];
+
+            //internal gap weights by default
+            double open0, extend0, open1, extend1;
+
+            open0 = open1 = d;
+            extend0 = extend1 = e;
+
+            if (endgaps == 1) {
+
+                //switch to end-gap weights on the first row / first column
+
+                if (i == 0) {
+                    open0 = endgapopen;
+                    extend0 = endgapextend;
+
+                }
+
+                if (j == 0) {
+                    open1 = endgapopen;
+                    extend1 = endgapextend;
+                }
+
+            }
+
+            if (REVPART_FULL_MEMORY) {
+                //z computation
+
+                Ze[i][j] = Zm[i][j + 1] * open0 + Ze[i][j + 1] * extend0;
+                Zf[i][j] = Zm[i + 1][j] * open1 + Zf[i + 1][j] * extend1;
+                Zm[i][j] = (Zm[i + 1][j + 1] + Zf[i + 1][j + 1]
+                        + Ze[i + 1][j + 1]) * scorez;
+                zz = Zm[i][j] + Zf[i][j] + Ze[i][j];
+
+            } else {
+
+                //two-row recurrence, see the layout comment above
+                Zf[1][j] = Zm[1][j] * open1 + Zf[0][j] * extend1;
+                Ze[1][j] = Zm[0][j + 1] * open0 + Ze[1][j + 1] * extend0;
+                Zm[0][j] = (Zm[1][j + 1] + Zf[0][j + 1] + Ze[0][j + 1])
+                        * scorez;
+
+                //P(i,j) = forward * backward / (score * total); the pair
+                //score is contained in both factors, hence the division
+                tempvar = Zfm[i + 1][j + 1] * Zm[0][j];
+                tempvar /= (scorez * Zfm[0][0]);
+                probability = (float) tempvar;
+
+                //every value is stored, no threshold is applied
+                ptr[(j + 1) * (len1 + 1) + (i + 1)] = probability;
+
+            }
+
+        } //end of for
+
+        if (REVPART_FULL_MEMORY == 0) {
+            //rotate the two rows for the next (upper) row
+            for (int t = 0; t <= sequences[0].length; t++) {
+                Ze[0][t] = Ze[1][t];
+                Ze[1][t] = 0;
+
+                Zf[0][t] = Zf[1][t];
+                Zf[1][t] = 0;
+
+                Zm[1][t] = Zm[0][t];
+                Zm[0][t] = 0;
+
+            }
+            // NOTE(review): the last-column boundary of Zf is hard-coded to
+            // 1, which equals the end-gap weights only when both are
+            // exp(0) - confirm if other end-gap weights are ever passed.
+            Zf[0][len0] = 1;
+
+        }
+
+    } //end of for
+
+    if (TRACE) {
+        printf("\n\nrM:....\n\n");
+        if (REVPART_FULL_MEMORY) {
+            for (i = 0; i <= len1; i++) {
+                for (j = 0; j <= len0; j++)
+                    printf("%.2Le ", Zm[i][j]);
+                printf("\n");
+            }
+
+            printf("\n\nrE:....\n\n");
+            for (i = 0; i <= len1; i++) {
+                for (j = 0; j <= len0; j++)
+                    printf("%.2Le ", Ze[i][j]);
+                printf("\n");
+
+            }
+
+            printf("\n\nrF:....\n\n");
+            for (i = 0; i <= len1; i++) {
+                for (j = 0; j <= len0; j++)
+                    printf("%.2Le ", Zf[i][j]);
+                printf("\n");
+
+            }
+
+        }
+
+    }
+
+    if (TRACE && fo != NULL) {
+        fprintf(fo, "\n");
+        fclose(fo);
+    }
+
+    //release the work matrices; everything was allocated with new[]
+
+    if (REVPART_FULL_MEMORY) {
+        for (i = 0; i <= len1; i++) {
+            delete[] Zm[i];
+            delete[] Zf[i];
+            delete[] Ze[i];
+        }
+    } else {
+        delete[] Zf[0];
+        delete[] Ze[0];
+        delete[] Zm[0];
+
+        delete[] Zm[1];
+        delete[] Zf[1];
+        delete[] Ze[1];
+    }
+
+    //the forward matrix is consumed by this function
+    for (i = 0; i <= len1; i++) {
+        delete[] Zfm[i];
+    }
+
+    delete[] Zf;
+    delete[] Ze;
+    delete[] Zm;
+    delete[] Zfm;
+
+    posterior[0] = 0;
+    return (posteriorPtr);
+
+}
+
+//////////////////////////////////////////////////////////////
+//forward partition function
+/////////////////////////////////////////////////////////////
+
+/**
+ * Forward partition-function pass.
+ *
+ * Rows of the DP (index i) follow sequences[1], columns (index j) follow
+ * sequences[0]. Zm is the match state, Ze a gap consuming sequences[0]
+ * (horizontal move) and Zf a gap consuming sequences[1] (vertical move).
+ * In low-memory mode (PART_FULL_MEMORY == 0) Zm is kept in full while Ze and
+ * Zf use two rows: row 0 is the previous DP row, row 1 the one being filled.
+ *
+ * The process is terminated with exit(1) when a cell reaches OS_HUGE_VALL.
+ *
+ * @param sequences      the two sequences; .text and .length are read.
+ * @param termgapopen    weight for opening an end gap (already exponentiated
+ *                       by the caller).
+ * @param termgapextend  weight for extending an end gap.
+ * @param d              weight for opening an internal gap.
+ * @param e              weight for extending an internal gap.
+ * @return the Zm matrix, sequences[1].length+1 rows of sequences[0].length+1
+ *         entries, allocated with new[]. Zm[0][0] is overwritten with the sum
+ *         Zm+Ze+Zf of the last computed cell (0 when either sequence is
+ *         empty). The caller owns the matrix; revers_partf frees it.
+ */
+long double **partf(fasta sequences[2], const double termgapopen,
+        const double termgapextend, const double d, const double e) {
+    int i, j, len1, len0;
+    long double **Zm = NULL, **Zf = NULL, **Ze = NULL, zz = 0;
+    double endgapopen, endgapextend;
+
+    //default:
+    endgapopen = termgapopen;
+    endgapextend = termgapextend;
+
+    //the flag endgaps is set at the #define section
+    if (PART_FULL_MEMORY) {
+
+        Zf = new long double *[sequences[1].length + 1];
+        Ze = new long double *[sequences[1].length + 1];
+        Zm = new long double *[sequences[1].length + 1];
+
+        if (TRACE)
+            printf("\nPARTF:====\n");
+
+        //full (len1+1) x (len0+1) matrices
+        for (i = 0; i <= sequences[1].length; i++) {
+            Zf[i] = new long double[sequences[0].length + 1];
+            Ze[i] = new long double[sequences[0].length + 1];
+            Zm[i] = new long double[sequences[0].length + 1];
+        }
+    } else {
+        Zm = new long double *[sequences[1].length + 1];
+        Ze = new long double *[2];
+        Zf = new long double *[2];
+        for (i = 0; i <= sequences[1].length; i++) {
+            Zm[i] = new long double[sequences[0].length + 1];
+        }
+        Ze[0] = new long double[sequences[0].length + 1];
+        Zf[0] = new long double[sequences[0].length + 1];
+        Ze[1] = new long double[sequences[0].length + 1];
+        Zf[1] = new long double[sequences[0].length + 1];
+    }
+
+    len0 = strlen(sequences[0].text);
+    len1 = strlen(sequences[1].text);
+
+    //zero the matrices
+    if (PART_FULL_MEMORY) {
+        for (i = 0; i <= sequences[1].length; i++)
+            for (j = 0; j <= sequences[0].length; j++) {
+                Zm[i][j] = 0.00;
+                Zf[i][j] = 0.00;
+                Ze[i][j] = 0.00;
+            }
+    } else {
+        for (i = 0; i <= len1; i++) {
+            for (j = 0; j <= len0; j++) {
+                Zm[i][j] = 0;
+            }
+        }
+        for (j = 0; j <= len0; j++) {
+            Zf[0][j] = 0;
+            Ze[0][j] = 0;
+            Zf[1][j] = 0;
+            Ze[1][j] = 0;
+        }
+    }
+
+    //Row 1 of Zf always exists in low-memory mode; column 1 of Ze exists only
+    //when sequences[0] is non-empty. Guard the boundary writes accordingly.
+    const bool hasRow1 = !PART_FULL_MEMORY || sequences[1].length >= 1;
+    const bool hasCol1 = sequences[0].length >= 1;
+
+    //INITIALIZE THE DP
+
+    if (endgaps == 0) {
+        Zm[0][0] = 1.00;
+
+        Zf[0][0] = Ze[0][0] = 0;
+        if (hasRow1)
+            Zf[1][0] = Zm[0][0] * d;
+        if (hasCol1)
+            Ze[0][1] = Zm[0][0] * d;
+
+        //>=2ND ROW INIT
+        if (PART_FULL_MEMORY) {
+            for (i = 2; i <= sequences[1].length; i++) {
+                Zf[i][0] = Zf[i - 1][0] * e;
+            }
+        }
+
+        //>=2ND COL INIT
+        for (j = 2; j <= sequences[0].length; j++) {
+            Ze[0][j] = Ze[0][j - 1] * e;
+        }
+    } else {
+        //init z
+        Zm[0][0] = 1.00;
+        Zf[0][0] = Ze[0][0] = 0;
+        if (hasRow1)
+            Zf[1][0] = Zm[0][0] * endgapopen;
+        if (hasCol1)
+            Ze[0][1] = Zm[0][0] * endgapopen;
+
+        //>=2ND ROW INIT
+        if (PART_FULL_MEMORY) {
+            for (i = 2; i <= sequences[1].length; i++) {
+                Zf[i][0] = Zf[i - 1][0] * endgapextend;
+            }
+        }
+
+        //>=2ND COL INIT
+        for (j = 2; j <= sequences[0].length; j++) {
+            Ze[0][j] = Ze[0][j - 1] * endgapextend;
+        }
+    }
+
+    int Si, Tj;
+    double score;
+
+    for (i = 1; i <= sequences[1].length; i++) {
+
+        for (j = 1; j <= sequences[0].length; j++) {
+
+            Si = subst_index[sequences[1].text[i - 1] - 'A'];
+            Tj = subst_index[sequences[0].text[j - 1] - 'A'];
+
+            score = sub_matrix[Si][Tj];
+
+            //internal gap weights by default
+            double open0, extend0, open1, extend1;
+
+            open0 = open1 = d;
+            extend0 = extend1 = e;
+
+            if (endgaps == 1) {
+                //switch to end-gap weights on the last row / last column
+
+                if (i == sequences[1].length) {
+                    open0 = endgapopen;
+                    extend0 = endgapextend;
+
+                }
+
+                if (j == sequences[0].length) {
+                    open1 = endgapopen;
+                    extend1 = endgapextend;
+                }
+            }
+
+            //open0/extend0 weight the horizontal gap state (Ze),
+            //open1/extend1 the vertical gap state (Zf)
+
+            if (PART_FULL_MEMORY) {
+                Ze[i][j] = Zm[i][j - 1] * open0 + Ze[i][j - 1] * extend0;
+
+                if (Ze[i][j] >= OS_HUGE_VALL) {
+                    printf("ERROR: huge val error for Ze\n");
+                    exit(1);
+                }
+
+                Zf[i][j] = Zm[i - 1][j] * open1 + Zf[i - 1][j] * extend1;
+
+                if (Zf[i][j] >= OS_HUGE_VALL) {
+                    printf("ERROR: huge val error for Zf\n");
+                    exit(1);
+                }
+
+                Zm[i][j] = (Zm[i - 1][j - 1] + Ze[i - 1][j - 1]
+                        + Zf[i - 1][j - 1]) * score;
+
+                if (Zm[i][j] >= OS_HUGE_VALL) {
+                    printf("ERROR: huge val error for Zm\n");
+                    exit(1);
+                }
+
+                zz = Zm[i][j] + Ze[i][j] + Zf[i][j];
+            } else {
+                Ze[1][j] = Zm[i][j - 1] * open0 + Ze[1][j - 1] * extend0;
+
+                if (Ze[1][j] >= OS_HUGE_VALL) {
+                    printf("ERROR: huge val error for zE\n");
+                    exit(1);
+                }
+
+                Zf[1][j] = Zm[i - 1][j] * open1 + Zf[0][j] * extend1;
+
+                if (Zf[1][j] >= OS_HUGE_VALL) {
+                    printf("ERROR: huge val error for zF\n");
+                    exit(1);
+                }
+
+                Zm[i][j] = (Zm[i - 1][j - 1] + Ze[0][j - 1] + Zf[0][j - 1])
+                        * score;
+
+                if (Zm[i][j] >= OS_HUGE_VALL) {
+                    printf("ERROR: huge val error for zM\n");
+                    exit(1);
+                }
+
+                zz = Zm[i][j] + Ze[1][j] + Zf[1][j];
+            }
+
+        } //end for
+
+        if (!PART_FULL_MEMORY) {
+            //the row just filled becomes the "previous" row
+            for (int t = 0; t <= sequences[0].length; t++) {
+                Ze[0][t] = Ze[1][t];
+                Ze[1][t] = 0;
+
+                Zf[0][t] = Zf[1][t];
+                Zf[1][t] = 0;
+            }
+
+            // NOTE(review): the first-column boundary of Zf is hard-coded to
+            // 1, which equals the end-gap weights only when both are
+            // exp(0) - confirm if other end-gap weights are ever passed.
+            Zf[1][0] = 1;
+
+        }
+
+    } //end for
+
+    //store the sum of zm zf ze (m,n)s in zm's 0,0 th position
+    Zm[0][0] = zz;
+
+    if (TRACE) {
+        //print the 3 Z matrices namely Zm Zf and Ze
+
+        //Zf and Ze have only two rows in low-memory mode
+        const int gapRows = PART_FULL_MEMORY ? sequences[1].length : 1;
+
+        printf("\n\nFINAL Zm:\n");
+        for (i = 0; i <= sequences[1].length; i++) {
+            for (j = 0; j <= sequences[0].length; j++)
+                printf("%.2Le ", Zm[i][j]);
+            printf("\n");
+        }
+
+        printf("FINAL Zf \n");
+        for (i = 0; i <= gapRows; i++) {
+            for (j = 0; j <= sequences[0].length; j++)
+                printf("%.2Le ", Zf[i][j]);
+            printf("\n");
+        }
+
+        printf("FINAL Ze \n");
+        for (i = 0; i <= gapRows; i++) {
+            for (j = 0; j <= sequences[0].length; j++)
+                printf("%.2Le ", Ze[i][j]);
+            printf("\n");
+        }
+
+    }
+
+    //release the gap matrices; everything was allocated with new[]
+    if (PART_FULL_MEMORY) {
+        for (i = 0; i <= sequences[1].length; i++) {
+            delete[] Zf[i];
+            delete[] Ze[i];
+        }
+    } else {
+        delete[] Zf[0];
+        delete[] Ze[0];
+        delete[] Zf[1];
+        delete[] Ze[1];
+    }
+
+    delete[] Zf;
+    delete[] Ze;
+
+    return Zm;
+
+} //end of forward partition function
+
+/////////////////////////////////////////////////////////////////////////////////////////
+//entry point (was the main function) , returns the posterior probability safe vector
+////////////////////////////////////////////////////////////////////////////////////////
+/**
+ * Computes the posterior match probabilities for a pair of sequences by
+ * running the forward pass (partf) followed by the backward pass
+ * (revers_partf). Gap penalties and beta are taken from the global
+ * `argument`; end gaps get the weight exp(beta * 0).
+ *
+ * @param a     identifier of the first sequence, only printed when TRACE is on.
+ * @param b     identifier of the second sequence, only printed when TRACE is on.
+ * @param seq1  residues of the first sequence (becomes sequences[0]).
+ * @param seq2  residues of the second sequence (becomes sequences[1]).
+ * @return the heap-allocated vector produced by revers_partf; the caller
+ *         owns it.
+ */
+VF *ComputePostProbs(int a, int b, string seq1, string seq2) {
+    //titles live on the stack so nothing has to be freed on return
+    char title0[] = "seq0";
+    char title1[] = "seq1";
+
+    //initialize the sequence structure; the text buffers belong to
+    //seq1/seq2 and are only read by the DP routines
+    fasta sequences[2];
+
+    sequences[0].length = strlen((char *) seq1.c_str());
+    sequences[0].text = (char *) seq1.c_str();
+    sequences[0].title = title0;
+    sequences[1].length = strlen((char *) seq2.c_str());
+    sequences[1].text = (char *) seq2.c_str();
+    sequences[1].title = title1;
+
+    if (TRACE) {
+        printf("%d %d %s\n%d %d %s\n--\n", a, sequences[0].length,
+                sequences[0].text, b, sequences[1].length, sequences[1].text);
+        printf("after init\n");
+
+        FILE *dump1 = fopen("dump1", "a");
+        if (dump1 != NULL) {
+            fprintf(dump1, "%d %d %s\n%d %d %s\n--\n", a, sequences[0].length,
+                    sequences[0].text, b, sequences[1].length,
+                    sequences[1].text);
+            fclose(dump1);
+        }
+    }
+
+    double gap_open = argument.gapopen;
+    double gap_ext = argument.gapext;
+    const double beta = argument.beta;
+    const int le = argument.matrix; //only reported in the trace output
+
+    //compute the values of exp(beta * ?)
+    const double termgapopen = exp(beta * 0.0);
+    const double termgapextend = exp(beta * 0.0);
+    gap_open = exp(beta * gap_open);
+    gap_ext = exp(beta * gap_ext);
+
+    if (TRACE)
+        printf("%f %f %f %d\n", gap_open, gap_ext, beta, le);
+
+    // 1. forward partition function
+    // 2. reverse partition function, which also derives the posterior
+    //    probabilities and releases the forward matrix
+    long double **MAT1 = partf(sequences, termgapopen, termgapextend,
+            gap_open, gap_ext);
+
+    return revers_partf(sequences, termgapopen, termgapextend, MAT1, gap_open,
+            gap_ext);
+
+}
+
+//////////////////////////////////////////////////////////////
+//Compute Viterbi Alignment
+// Added by YE Yongtao
+/////////////////////////////////////////////////////////////
+
+/**
+ * Viterbi (best single path) alignment of two sequences using the same
+ * three-state model as partf: every sum of the forward recurrence is replaced
+ * by a maximum and the winning predecessor state is recorded for traceback.
+ * Gap penalties and beta are taken from the global `argument`.
+ *
+ * State codes in the trace matrices: 0 = match (Zm), 1 = Ze (gap consuming
+ * seq1), 2 = Zf (gap consuming seq2), -1 = no predecessor recorded.
+ *
+ * The process is terminated with exit(1) when a cell reaches OS_HUGE_VALL.
+ *
+ * @param seq1  first sequence (columns of the DP).
+ * @param seq2  second sequence (rows of the DP).
+ * @return a pair of
+ *         - a heap-allocated alignment string, one character per column:
+ *           'B' both residues, 'X' residue of seq1 only, 'Y' residue of seq2
+ *           only (the caller owns it), and
+ *         - the weight of the best path in the final cell, narrowed to float.
+ */
+pair<SafeVector<char> *, float> partViterbi(string seq1, string seq2) {
+
+    //titles live on the stack so nothing has to be freed on return
+    char title0[] = "seq0";
+    char title1[] = "seq1";
+
+    //initialize the sequence structure
+    fasta sequences[2];
+    sequences[0].length = strlen((char *) seq1.c_str());
+    sequences[0].text = (char *) seq1.c_str();
+    sequences[0].title = title0;
+    sequences[1].length = strlen((char *) seq2.c_str());
+    sequences[1].text = (char *) seq2.c_str();
+    sequences[1].title = title1;
+
+    const double gap_open = argument.gapopen;
+    const double gap_ext = argument.gapext;
+    const double beta = argument.beta;
+
+    //compute the values of exp(beta * ?)
+    const double endgapopen = exp(beta * 0.0);
+    const double endgapextend = exp(beta * 0.0);
+    const double d = exp(beta * gap_open);
+    const double e = exp(beta * gap_ext);
+
+    const int n0 = sequences[0].length; //columns
+    const int n1 = sequences[1].length; //rows
+
+    int i, j;
+    long double **Zm = NULL, **Zf = NULL, **Ze = NULL;
+    int **traceZm = NULL, **traceZf = NULL, **traceZe = NULL;
+
+    Zf = new long double *[n1 + 1];
+    Ze = new long double *[n1 + 1];
+    Zm = new long double *[n1 + 1];
+
+    traceZf = new int *[n1 + 1];
+    traceZe = new int *[n1 + 1];
+    traceZm = new int *[n1 + 1];
+
+    //allocate and clear the (n1+1) x (n0+1) score and trace matrices
+    for (i = 0; i <= n1; i++) {
+        Zf[i] = new long double[n0 + 1];
+        Ze[i] = new long double[n0 + 1];
+        Zm[i] = new long double[n0 + 1];
+
+        traceZf[i] = new int[n0 + 1];
+        traceZe[i] = new int[n0 + 1];
+        traceZm[i] = new int[n0 + 1];
+
+        for (j = 0; j <= n0; j++) {
+            Zm[i][j] = 0.00;
+            Zf[i][j] = 0.00;
+            Ze[i][j] = 0.00;
+
+            traceZm[i][j] = -1;
+            traceZf[i][j] = -1;
+            traceZe[i][j] = -1;
+        }
+    }
+
+    //INITIALIZE THE DP
+    //first row/column use the end-gap weights when endgaps is enabled
+    const double boundaryOpen = (endgaps == 0) ? d : endgapopen;
+    const double boundaryExtend = (endgaps == 0) ? e : endgapextend;
+
+    Zm[0][0] = 1.00;
+    Zf[0][0] = Ze[0][0] = 0;
+    //row 1 / column 1 exist only for non-empty sequences
+    if (n1 >= 1)
+        Zf[1][0] = Zm[0][0] * boundaryOpen;
+    if (n0 >= 1)
+        Ze[0][1] = Zm[0][0] * boundaryOpen;
+
+    //>=2ND ROW INIT
+    for (i = 2; i <= n1; i++) {
+        Zf[i][0] = Zf[i - 1][0] * boundaryExtend;
+        traceZf[i][0] = 2;
+    }
+
+    //>=2ND COL INIT
+    for (j = 2; j <= n0; j++) {
+        Ze[0][j] = Ze[0][j - 1] * boundaryExtend;
+        traceZe[0][j] = 1;
+    }
+
+    int Si, Tj;
+    double score;
+
+    for (i = 1; i <= n1; i++) {
+
+        for (j = 1; j <= n0; j++) {
+
+            Si = subst_index[sequences[1].text[i - 1] - 'A'];
+            Tj = subst_index[sequences[0].text[j - 1] - 'A'];
+
+            score = sub_matrix[Si][Tj];
+
+            //internal gap weights by default
+            double open0, extend0, open1, extend1;
+
+            open0 = open1 = d;
+            extend0 = extend1 = e;
+
+            if (endgaps == 1) {
+                //switch to end-gap weights on the last row / last column
+
+                if (i == n1) {
+                    open0 = endgapopen;
+                    extend0 = endgapextend;
+
+                }
+
+                if (j == n0) {
+                    open1 = endgapopen;
+                    extend1 = endgapextend;
+                }
+            }
+
+            //Zf: best of extending a vertical gap or opening one from a match
+            Zf[i][j] = Zf[i - 1][j] * extend1;
+            traceZf[i][j] = 2;
+
+            if (Zm[i - 1][j] * open1 > Zf[i][j]) {
+                Zf[i][j] = Zm[i - 1][j] * open1;
+                traceZf[i][j] = 0;
+            }
+            if (Zf[i][j] >= OS_HUGE_VALL) {
+                printf("ERROR: huge val error for Zf\n");
+                exit(1);
+            }
+
+            //Ze: best of extending a horizontal gap or opening one from a match
+            Ze[i][j] = Ze[i][j - 1] * extend0;
+            traceZe[i][j] = 1;
+            if (Zm[i][j - 1] * open0 > Ze[i][j]) {
+                Ze[i][j] = Zm[i][j - 1] * open0;
+                traceZe[i][j] = 0;
+            }
+
+            if (Ze[i][j] >= OS_HUGE_VALL) {
+                printf("ERROR: huge val error for Ze\n");
+                exit(1);
+            }
+
+            //Zm: best diagonal predecessor; ties keep the earlier candidate
+            //(match first, then Zf, then Ze)
+            Zm[i][j] = Zm[i - 1][j - 1] * score;
+            traceZm[i][j] = 0;
+            if (Zf[i - 1][j - 1] * score > Zm[i][j]) {
+                Zm[i][j] = Zf[i - 1][j - 1] * score;
+                traceZm[i][j] = 2;
+            }
+            if (Ze[i - 1][j - 1] * score > Zm[i][j]) {
+                Zm[i][j] = Ze[i - 1][j - 1] * score;
+                traceZm[i][j] = 1;
+            }
+            if (Zm[i][j] >= OS_HUGE_VALL) {
+                printf("ERROR: huge val error for Zm\n");
+                exit(1);
+            }
+
+        }//end for
+    }//end for
+
+    // figure out best terminating state in the final cell
+    float bestProb = Zm[n1][n0];
+    int state = 0;
+    if (bestProb < Zf[n1][n0]) {
+        bestProb = Zf[n1][n0];
+        state = 2;
+    }
+    if (bestProb < Ze[n1][n0]) {
+        bestProb = Ze[n1][n0];
+        state = 1;
+    }
+    assert (state != -1);
+
+    // compute traceback from the final cell back to (0,0)
+    SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
+    int c = n1, r = n0; //c walks rows (seq2), r walks columns (seq1)
+    while (r != 0 || c != 0) {
+        int newState;
+        if (state == 0) {
+            newState = traceZm[c][r];
+            c--; r--; alignment->push_back ('B');
+        }
+        else if (state == 1) {
+            newState = traceZe[c][r];
+            r--; alignment->push_back ('X');
+        }
+        else {
+            newState = traceZf[c][r];
+            c--; alignment->push_back ('Y');
+        }
+        state = newState;
+    }
+
+    //the path was collected end-to-start
+    reverse (alignment->begin(), alignment->end());
+
+    //release the matrices; everything was allocated with new[]
+    for (i = 0; i <= n1; i++) {
+        delete[] Zf[i];
+        delete[] Ze[i];
+        delete[] Zm[i];
+        delete[] traceZf[i];
+        delete[] traceZe[i];
+        delete[] traceZm[i];
+    }
+
+    delete[] Zf;
+    delete[] Ze;
+    delete[] Zm;
+    delete[] traceZf;
+    delete[] traceZe;
+    delete[] traceZm;
+
+    return make_pair(alignment, bestProb);
+}
+
+//////////////////////////////////////////////////////////////
+// Compute two sequences' similarity defined as the normalized alignment score without gap penalties
+// Added by YE Yongtao
+/////////////////////////////////////////////////////////////
+
+/**
+ * Similarity of two sequences under a given alignment: the sum of
+ * normalized_matrix entries over all aligned residue pairs ('B' columns),
+ * divided by the number of alignment columns. Gaps contribute nothing.
+ *
+ * @param seq1       first sequence; consumed by 'B' and 'X' columns.
+ * @param seq2       second sequence; consumed by 'B' and 'Y' columns.
+ * @param alignment  column string made of 'B', 'X' and 'Y' (the format
+ *                   produced by partViterbi); other characters are ignored.
+ * @return the normalized score, or 0 for an empty alignment. "similarity
+ *         error" is written to cerr when the alignment does not consume both
+ *         sequences exactly.
+ */
+float computeSimilarity(string seq1, string seq2, SafeVector<char> * alignment) {
+
+    const char *text0 = seq1.c_str();
+    const char *text1 = seq2.c_str();
+    const int length0 = strlen(text0);
+    const int length1 = strlen(text1);
+
+    //an empty alignment has no columns to average over (avoids 0/0)
+    if (alignment->size() == 0) {
+        if (length0 != 0 || length1 != 0) cerr << "similarity error"<< endl;
+        return 0;
+    }
+
+    float bestProb = 0;
+    int Si, Tj;
+    int i = 1; //1-based position in seq1
+    int j = 1; //1-based position in seq2
+    bool inRange = true;
+    for (SafeVector<char>::iterator iter = alignment->begin();
+            iter != alignment->end(); ++iter) {
+        if (*iter == 'B') {
+            //stop instead of reading past the end of either sequence
+            if (i > length0 || j > length1) {
+                inRange = false;
+                break;
+            }
+            Si = subst_index[text1[j - 1] - 'A'];
+            Tj = subst_index[text0[i - 1] - 'A'];
+            bestProb += normalized_matrix[Si][Tj];
+            i++; j++;
+        }
+        else if (*iter == 'X') i++;
+        else if (*iter == 'Y') j++;
+    }
+    if (!inRange || i != length0 + 1 || j != length1 + 1) cerr << "similarity error"<< endl;
+    bestProb /= alignment->size();
+    return bestProb;
+}
+//end of posterior probability module
--- /dev/null
+<?xml version="1.0" encoding="gb2312"?>\r
+<VisualStudioProject\r
+ ProjectType="Visual C++"\r
+ Version="8.00"\r
+ Name="MSAProbs"\r
+ ProjectGUID="{671563E4-93A2-419E-8B41-48DDF71DD144}"\r
+ RootNamespace="MSAProbs"\r
+ Keyword="Win32Proj"\r
+ >\r
+ <Platforms>\r
+ <Platform\r
+ Name="Win32"\r
+ />\r
+ </Platforms>\r
+ <ToolFiles>\r
+ </ToolFiles>\r
+ <Configurations>\r
+ <Configuration\r
+ Name="Debug|Win32"\r
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)"\r
+ IntermediateDirectory="$(ConfigurationName)"\r
+ ConfigurationType="1"\r
+ CharacterSet="1"\r
+ >\r
+ <Tool\r
+ Name="VCPreBuildEventTool"\r
+ />\r
+ <Tool\r
+ Name="VCCustomBuildTool"\r
+ />\r
+ <Tool\r
+ Name="VCXMLDataGeneratorTool"\r
+ />\r
+ <Tool\r
+ Name="VCWebServiceProxyGeneratorTool"\r
+ />\r
+ <Tool\r
+ Name="VCMIDLTool"\r
+ />\r
+ <Tool\r
+ Name="VCCLCompilerTool"\r
+ AdditionalOptions="/openmp"\r
+ Optimization="0"\r
+ PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"\r
+ MinimalRebuild="true"\r
+ BasicRuntimeChecks="3"\r
+ RuntimeLibrary="3"\r
+ OpenMP="true"\r
+ UsePrecompiledHeader="0"\r
+ WarningLevel="3"\r
+ Detect64BitPortabilityProblems="true"\r
+ DebugInformationFormat="4"\r
+ />\r
+ <Tool\r
+ Name="VCManagedResourceCompilerTool"\r
+ />\r
+ <Tool\r
+ Name="VCResourceCompilerTool"\r
+ />\r
+ <Tool\r
+ Name="VCPreLinkEventTool"\r
+ />\r
+ <Tool\r
+ Name="VCLinkerTool"\r
+ LinkIncremental="2"\r
+ GenerateDebugInformation="true"\r
+ SubSystem="1"\r
+ TargetMachine="1"\r
+ />\r
+ <Tool\r
+ Name="VCALinkTool"\r
+ />\r
+ <Tool\r
+ Name="VCManifestTool"\r
+ />\r
+ <Tool\r
+ Name="VCXDCMakeTool"\r
+ />\r
+ <Tool\r
+ Name="VCBscMakeTool"\r
+ />\r
+ <Tool\r
+ Name="VCFxCopTool"\r
+ />\r
+ <Tool\r
+ Name="VCAppVerifierTool"\r
+ />\r
+ <Tool\r
+ Name="VCWebDeploymentTool"\r
+ />\r
+ <Tool\r
+ Name="VCPostBuildEventTool"\r
+ />\r
+ </Configuration>\r
+ <Configuration\r
+ Name="Release|Win32"\r
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)"\r
+ IntermediateDirectory="$(ConfigurationName)"\r
+ ConfigurationType="1"\r
+ CharacterSet="1"\r
+ WholeProgramOptimization="1"\r
+ >\r
+ <Tool\r
+ Name="VCPreBuildEventTool"\r
+ />\r
+ <Tool\r
+ Name="VCCustomBuildTool"\r
+ />\r
+ <Tool\r
+ Name="VCXMLDataGeneratorTool"\r
+ />\r
+ <Tool\r
+ Name="VCWebServiceProxyGeneratorTool"\r
+ />\r
+ <Tool\r
+ Name="VCMIDLTool"\r
+ />\r
+ <Tool\r
+ Name="VCCLCompilerTool"\r
+ PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"\r
+ RuntimeLibrary="2"\r
+ OpenMP="true"\r
+ UsePrecompiledHeader="0"\r
+ WarningLevel="3"\r
+ Detect64BitPortabilityProblems="true"\r
+ DebugInformationFormat="3"\r
+ />\r
+ <Tool\r
+ Name="VCManagedResourceCompilerTool"\r
+ />\r
+ <Tool\r
+ Name="VCResourceCompilerTool"\r
+ />\r
+ <Tool\r
+ Name="VCPreLinkEventTool"\r
+ />\r
+ <Tool\r
+ Name="VCLinkerTool"\r
+ LinkIncremental="1"\r
+ GenerateDebugInformation="true"\r
+ SubSystem="1"\r
+ OptimizeReferences="2"\r
+ EnableCOMDATFolding="2"\r
+ TargetMachine="1"\r
+ />\r
+ <Tool\r
+ Name="VCALinkTool"\r
+ />\r
+ <Tool\r
+ Name="VCManifestTool"\r
+ />\r
+ <Tool\r
+ Name="VCXDCMakeTool"\r
+ />\r
+ <Tool\r
+ Name="VCBscMakeTool"\r
+ />\r
+ <Tool\r
+ Name="VCFxCopTool"\r
+ />\r
+ <Tool\r
+ Name="VCAppVerifierTool"\r
+ />\r
+ <Tool\r
+ Name="VCWebDeploymentTool"\r
+ />\r
+ <Tool\r
+ Name="VCPostBuildEventTool"\r
+ />\r
+ </Configuration>\r
+ </Configurations>\r
+ <References>\r
+ </References>\r
+ <Files>\r
+ <Filter\r
+ Name="Source Files"\r
+ Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"\r
+ UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"\r
+ >\r
+ <File\r
+ RelativePath=".\main.cpp"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\MSA.cpp"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\MSAClusterTree.cpp"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\MSAGuideTree.cpp"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\MSAPartProbs.cpp"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\MSAReadMatrix.cpp"\r
+ >\r
+ </File>\r
+ </Filter>\r
+ <Filter\r
+ Name="Header Files"\r
+ Filter="h;hpp;hxx;hm;inl;inc;xsd"\r
+ UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"\r
+ >\r
+ <File\r
+ RelativePath=".\Defaults.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\FileBuffer.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\MSA.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\MSAClusterTree.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\MSADef.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\MSAGuideTree.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\MSAReadMatrix.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\MultiSequence.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\ProbabilisticModel.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\SafeVector.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\ScoreType.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\Sequence.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\SparseMatrix.h"\r
+ >\r
+ </File>\r
+ </Filter>\r
+ <Filter\r
+ Name="Resource Files"\r
+ Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"\r
+ UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"\r
+ >\r
+ </Filter>\r
+ </Files>\r
+ <Globals>\r
+ </Globals>\r
+</VisualStudioProject>\r
--- /dev/null
+<?xml version="1.0" encoding="gb2312"?>\r
+<VisualStudioUserFile\r
+ ProjectType="Visual C++"\r
+ Version="8.00"\r
+ ShowAllFiles="false"\r
+ >\r
+ <Configurations>\r
+ <Configuration\r
+ Name="Debug|Win32"\r
+ >\r
+ <DebugSettings\r
+ Command="$(TargetPath)"\r
+ WorkingDirectory=""\r
+ CommandArguments=""\r
+ Attach="false"\r
+ DebuggerType="3"\r
+ Remote="1"\r
+ RemoteMachine="GS-14"\r
+ RemoteCommand=""\r
+ HttpUrl=""\r
+ PDBPath=""\r
+ SQLDebugging=""\r
+ Environment=""\r
+ EnvironmentMerge="true"\r
+ DebuggerFlavor=""\r
+ MPIRunCommand=""\r
+ MPIRunArguments=""\r
+ MPIRunWorkingDirectory=""\r
+ ApplicationCommand=""\r
+ ApplicationArguments=""\r
+ ShimCommand=""\r
+ MPIAcceptMode=""\r
+ MPIAcceptFilter=""\r
+ />\r
+ </Configuration>\r
+ <Configuration\r
+ Name="Release|Win32"\r
+ >\r
+ <DebugSettings\r
+ Command="$(TargetPath)"\r
+ WorkingDirectory=""\r
+ CommandArguments=""\r
+ Attach="false"\r
+ DebuggerType="3"\r
+ Remote="1"\r
+ RemoteMachine="GS-14"\r
+ RemoteCommand=""\r
+ HttpUrl=""\r
+ PDBPath=""\r
+ SQLDebugging=""\r
+ Environment=""\r
+ EnvironmentMerge="true"\r
+ DebuggerFlavor=""\r
+ MPIRunCommand=""\r
+ MPIRunArguments=""\r
+ MPIRunWorkingDirectory=""\r
+ ApplicationCommand=""\r
+ ApplicationArguments=""\r
+ ShimCommand=""\r
+ MPIAcceptMode=""\r
+ MPIAcceptFilter=""\r
+ />\r
+ </Configuration>\r
+ </Configurations>\r
+</VisualStudioUserFile>\r
--- /dev/null
+/***********************************************
+ * # Copyright 2009-2010. Liu Yongchao
+ * # Contact: Liu Yongchao, School of Computer Engineering,
+ * # Nanyang Technological University.
+ * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
+ * #
+ * # GPL version 3.0 applies.
+ * #
+ * ************************************************/
+
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "MSAReadMatrix.h"
+
+#define TRACE 0
+
+////////////////////////////////////////////////////////////
+// extern variables for scoring matrix data
+////////////////////////////////////////////////////////////
+// Globals defined in another translation unit. The functions below write
+// matrixtype, bases, subst_index, sub_matrix, normalized_matrix and argument.
+extern float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2;
+// NOTE(review): the partition-function source file declares aminos/bases as
+// `char[26]` rather than `char *`; read_matrix() assigns to `bases`, which
+// requires the pointer form - confirm against the defining file.
+extern char *aminos, *bases, matrixtype[20];
+// Maps (letter - 'A') to a row/column of the matrices below.
+extern int subst_index[26];
+
+extern double sub_matrix[26][26]; // filled by read_matrix() with exp(beta * score)
+extern double normalized_matrix[26][26]; // filled by read_normalized_matrix() with raw values
+
+extern float TEMPERATURE;
+extern int MATRIXTYPE;
+
+extern float GAPOPEN;
+extern float GAPEXT;
+
+// Run-time parameters shared through the global `argument`.
+// NOTE(review): the partition-function source file declares a second,
+// textually identical copy of this struct; both copies describe the same
+// global, so they must be kept in sync.
+typedef struct {
+ char input[30]; // input name; init_arguments() stores "tempin" here
+ int matrix; // matrix type code as returned by matrixtype_to_int()
+ int N;
+ float T; // temperature; init_arguments() copies TEMPERATURE
+ float beta; // 1 / T; read_matrix() multiplies scores by it before exp()
+ char opt; //can be 'P' or 'M'
+ float gapopen; // gap-open penalty; init_arguments() copies GAPOPEN
+ float gapext; // gap-extension penalty; init_arguments() copies GAPEXT
+} argument_decl;
+
+//argument support
+extern argument_decl argument;
+
+/////////////////////////////////////////////////////////
+//sets substitution matrix type
+////////////////////////////////////////////////////////
+// Stores the name that corresponds to matrix code `le` in the global
+// `matrixtype`: 160 -> "gonnet_160", 4 -> "nuc_simple", anything else ->
+// "CUSTOM".
+void setmatrixtype(int le) {
+    const char *name = "CUSTOM";
+
+    if (le == 160)
+        name = "gonnet_160";
+    else if (le == 4)
+        name = "nuc_simple";
+
+    strcpy(matrixtype, name);
+}
+
+///////////////////////////////////////////////////////////////////
+//sets matrix flag
+///////////////////////////////////////////////////////////////////
+// Inverse of setmatrixtype(): translates the name held in the global
+// `matrixtype` back to its code. "nuc_simple" -> 4, "gonnet_160" -> 160,
+// every other name -> 1000.
+inline int matrixtype_to_int() {
+    if (strcmp(matrixtype, "nuc_simple") == 0) {
+        return 4;
+    }
+    if (strcmp(matrixtype, "gonnet_160") == 0) {
+        return 160;
+    }
+    return 1000;
+}
+
+/////////////////////////////////////////////////////////////////
+//
+// Can read any scoring matrix as long as it is defined in Matrix.h,
+// it is stored as a lower-triangular matrix,
+// and the order of its amino acids/bases is given
+/////////////////////////////////////////////////////////////////
+
+// Loads a scoring matrix into the globals used by the DP routines.
+//
+// matrx.monomers is the alphabet, in matrix order; matrx.matrix holds the
+// lower triangle (diagonal included) row by row. Effects:
+//  - `bases` is pointed at matrx.monomers;
+//  - subst_index[letter - 'A'] is set to the letter's position in the
+//    alphabet, and to -1 for every letter that is not part of it;
+//  - sub_matrix[i][j] = sub_matrix[j][i] = exp(argument.beta * score).
+inline void read_matrix(score_matrix matrx) {
+    const int letterCount = sizeof(subst_index) / sizeof(subst_index[0]);
+    int i, j, basecount, position = 0;
+
+    bases = (char *) matrx.monomers;
+
+    basecount = strlen(bases);
+
+    //reset the whole table, not just the first `basecount` entries, so that
+    //no mapping from a previously loaded alphabet survives
+    for (i = 0; i < letterCount; i++)
+        subst_index[i] = -1;
+
+    for (i = 0; i < basecount; i++) {
+        const int letter = bases[i] - 'A';
+        //ignore symbols outside 'A'..'Z'; they have no slot in the table
+        if (letter >= 0 && letter < letterCount)
+            subst_index[letter] = i;
+    }
+
+    if (TRACE == 1)
+        printf("\nbases read: %d\n", basecount);
+
+    //expand the lower triangle into a symmetric matrix of exp(beta * score)
+    for (i = 0; i < basecount; i++)
+        for (j = 0; j <= i; j++) {
+
+            double value = exp(argument.beta * matrx.matrix[position++]);
+            sub_matrix[i][j] = value;
+            sub_matrix[j][i] = value;
+        }
+
+    if (TRACE)
+        for (i = 0; i < basecount; i++) {
+            for (j = 0; j < basecount; j++)
+                printf(" %g ", sub_matrix[i][j]);
+            printf("\n");
+        }
+
+}
+
+/////////////////////////////////////////////////////////////////
+// read_normalized_matrix()
+//
+// read normalized residue exchange matrix
+// (used to compute sequence similarity)
+// added by YE Yongtao
+//
+// Same input layout as read_matrix(): a row-major lower triangle in
+// the order given by `monomers`. The values are copied unchanged.
+//
+// Effects:
+//   bases             - points at a persistent copy of matrx.monomers
+//   subst_index       - (letter - 'A') -> row index, -1 for every
+//                       letter that does not occur in the matrix
+//   normalized_matrix - symmetric copy of the table
+/////////////////////////////////////////////////////////////////
+
+inline void read_normalized_matrix(score_matrix matrx) {
+    // `matrx` is a by-value copy that is destroyed on return, so the
+    // global `bases` pointer must not point into it. Keep the monomer
+    // order in static storage instead.
+    static char monomer_order[sizeof(matrx.monomers) + 1];
+    memcpy(monomer_order, matrx.monomers, sizeof(matrx.monomers));
+    monomer_order[sizeof(matrx.monomers)] = '\0';
+    bases = monomer_order;
+
+    const int basecount = (int) strlen(bases);
+    const int num_letters = (int) (sizeof(subst_index) / sizeof(subst_index[0]));
+
+    // Reset every letter slot, not only the first `basecount` ones, so
+    // that a letter missing from this matrix never keeps a stale index.
+    for (int i = 0; i < num_letters; i++)
+        subst_index[i] = -1;
+
+    for (int i = 0; i < basecount; i++) {
+        const int letter = bases[i] - 'A';
+        if (letter >= 0 && letter < num_letters) // ignore anything outside 'A'..'Z'
+            subst_index[letter] = i;
+    }
+
+    if (TRACE == 1)
+        printf("\nbases read: %d\n", basecount);
+
+    // mirror the lower triangle into the full symmetric table
+    int position = 0;
+    for (int i = 0; i < basecount; i++)
+        for (int j = 0; j <= i; j++) {
+            const double value = matrx.matrix[position++];
+            normalized_matrix[i][j] = value;
+            normalized_matrix[j][i] = value;
+        }
+
+    if (TRACE)
+        for (int i = 0; i < basecount; i++) {
+            for (int j = 0; j < basecount; j++)
+                printf(" %g ", normalized_matrix[i][j]);
+            printf("\n");
+        }
+
+}
+//////////////////////////////////////////////////////////////////////////////////
+// init_arguments()
+//
+// Fills the global `argument` record with default values and loads the
+// scoring matrix selected by the global `matrixtype` name.
+// Gap penalties end up as the per-matrix defaults unless GAPOPEN or
+// GAPEXT is non-zero, in which case both negated overrides are used.
+// An unknown matrix name prints an error and terminates the program.
+//////////////////////////////////////////////////////////////////////////////////
+void init_arguments() {
+    const int matrix_code = matrixtype_to_int();
+
+    strcpy(argument.input, "tempin");
+    argument.N = 1;
+    argument.matrix = matrix_code;
+    argument.gapopen = GAPOPEN;
+    argument.gapext = GAPEXT;
+    argument.T = TEMPERATURE;
+    argument.beta = 1.0 / TEMPERATURE; // must be set before read_matrix(), which scales by it
+    argument.opt = 'P';
+
+    float open_penalty = 0;
+    float ext_penalty = 0;
+
+    switch (matrix_code) {
+    case 4: //NUC OPTION :default is nuc_simple
+        read_matrix(nuc_simple);
+        open_penalty = -4;
+        ext_penalty = -0.25;
+        break;
+
+    case 160: //PROT option: default is gonnet_160
+        if (TRACE)
+            printf("read matrix\n");
+        read_matrix(gonnet_160);
+        open_penalty = -22;
+        ext_penalty = -1;
+        read_normalized_matrix(normalized_blosum_30); // add by YE Yongtao
+        break;
+
+    case 1000: //Error handling
+        printf("Error: enter a valid matrix type\n");
+        exit(1);
+        //additional matrices can only be lower triangular
+
+    default:
+        break;
+    }
+
+    //now override the gapopen and gapext
+    const bool no_override = (argument.gapopen == 0.0 && argument.gapext == 0.00);
+    if (!no_override) {
+        open_penalty = -argument.gapopen;
+        ext_penalty = -argument.gapext;
+    }
+
+    if (TRACE)
+        printf("%f %f %f %d\n", argument.T, open_penalty, ext_penalty, matrix_code);
+
+    argument.gapopen = open_penalty;
+    argument.gapext = ext_penalty;
+    argument.opt = 'P';
+
+}
--- /dev/null
+/////////////////////////////////////////////////////////////////
+// Matrix.h
+//
+// Specifies scoring matrices and their structure
+//
+//
+//
+/////////////////////////////////////////////////////////////////
+
+#ifndef _MSA_READ_MATRIX_H
+#define _MSA_READ_MATRIX_H
+
+typedef struct { // one scoring table: the residue order plus its scores
+ char monomers[26]; /* amino or nucleic acid order; the matrix readers treat it as a NUL-terminated string */
+ float matrix[676]; /* entries of the score matrix, 26*26=676 slots; the readers consume a row-major lower triangle from the front */
+} score_matrix;
+
+//default protein scoring matrix (gonnet_160); it is also the default scoring matrix of PROBALIGN
+//and is the one used when the -prot option is given
+
+score_matrix gonnet_160 = { "ABCDEFGHIKLMNPQRSTVWXYZ",
+
+{ 4.6, 0.0, 0.0, 0.3, 0.0, 13.5, -1.1, 0.0, -5.3, 7.0, -0.4, 0.0, -5.2, 3.4,
+ 5.9, -3.8, 0.0, -1.8, -7.0, -6.2, 9.1, 0.2, 0.0, -3.4, -0.7, -2.1, -7.6,
+ 8.2, -1.8, 0.0, -2.3, -0.1, -0.1, -0.7, -2.7, 9.3, -1.8, 0.0, -2.5,
+ -6.2, -4.3, 0.3, -7.0, -3.7, 5.9, -1.2, 0.0, -4.8, -0.1, 1.3, -5.3,
+ -2.4, 0.2, -3.5, 5.5, -2.2, 0.0, -2.9, -6.5, -4.5, 1.9, -6.7, -3.2, 3.0,
+ -3.4, 5.7, -1.2, 0.0, -1.9, -5.0, -3.1, 1.4, -5.2, -2.1, 2.9, -2.1, 3.4,
+ 7.6, -1.2, 0.0, -3.1, 2.6, 0.5, -4.7, -0.2, 1.5, -4.4, 0.8, -4.8, -3.6,
+ 6.5, -0.1, 0.0, -5.2, -1.9, -1.4, -5.8, -3.0, -2.2, -4.3, -1.6, -3.5,
+ -4.2, -2.2, 9.6, -0.7, 0.0, -4.2, 0.6, 2.3, -4.1, -2.1, 1.7, -3.2, 2.0,
+ -2.4, -1.2, 0.5, -0.8, 5.6, -1.6, 0.0, -3.5, -1.6, -0.3, -5.3, -2.1,
+ 0.3, -4.1, 3.5, -3.5, -2.9, -0.4, -2.1, 1.7, 7.1, 1.6, 0.0, -0.2, 0.0,
+ -0.3, -4.5, -0.1, -0.8, -3.3, -0.4, -3.6, -2.3, 1.1, 0.0, -0.2, -0.9,
+ 4.4, 0.5, 0.0, -1.4, -0.6, -0.8, -3.6, -2.4, -0.8, -1.2, -0.2, -2.4,
+ -1.1, 0.3, -0.4, -0.4, -0.9, 2.3, 5.0, 0.1, 0.0, -0.6, -4.9, -3.0, -0.8,
+ -5.2, -3.5, 4.0, -3.0, 1.7, 1.4, -3.8, -3.2, -2.7, -3.4, -2.0, 0.0, 5.3,
+ -5.5, 0.0, -2.1, -7.8, -6.4, 3.2, -5.5, -1.9, -3.4, -5.4, -2.0, -2.2,
+ -5.5, -7.4, -4.0, -2.4, -4.7, -5.4, -4.5, 15.8, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 0.0, 0.0, -3.7, 0.0, -1.3, -4.2, -4.4, 5.6, -6.0, 2.7, -2.0, -3.5, -1.1,
+ -1.3, -2.2, -4.8, -2.9, -2.9, -2.8, -3.2, -2.4, 3.8, 0.0, 10.0, 0.0,
+ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }
+
+};
+
+//normalized blosum_62 scoring matrix for computing protein sequence similarity
+score_matrix normalized_blosum_62 = {
+
+"ABCDEFGHIKLMNPQRSTVWXYZ",
+{
+0.533333333,
+0.133333333,0.533333333,
+0.266666667,0.066666667,0.866666667,
+0.133333333,0.533333333,0.066666667,0.666666667,
+0.2,0.333333333,0,0.4,0.6,
+0.133333333,0.066666667 ,0.133333333,0.066666667,0.066666667,0.666666667,
+0.266666667,0.2 ,0.066666667,0.2,0.133333333,0.066666667,0.666666667,
+0.133333333,0.266666667 ,0.066666667,0.2,0.266666667,0.2,0.133333333,0.8,
+0.2,0.066666667 ,0.2,0.066666667,0.066666667,0.266666667,0,0.066666667,0.533333333,
+0.2,0.266666667 ,0.066666667,0.2,0.333333333,0.066666667,0.133333333,0.2,0.066666667,0.6,
+0.2,0,0.2,0,0.066666667,0.266666667 ,0,0.066666667 ,0.4,0.133333333,0.533333333,
+0.2,0.066666667 ,0.2,0.066666667,0.133333333,0.266666667,0.066666667 ,0.133333333,0.333333333,0.2,0.4,0.6,
+0.133333333,0.466666667 ,0.066666667,0.333333333,0.266666667,0.066666667,0.266666667,0.333333333,0.066666667, 0.266666667,0.066666667 ,0.133333333,0.666666667,
+0.2,0.133333333 ,0.066666667,0.2,0.2, 0,0.133333333, 0.133333333,0.066666667 ,0.2,0.066666667,0.133333333, 0.133333333,0.733333333 ,
+0.2,0.266666667 ,0.066666667,0.266666667,0.4,0.066666667,0.133333333, 0.266666667,0.066666667 ,0.333333333, 0.133333333,0.266666667 ,0.266666667,0.2,0.6,
+0.2,0.2 ,0.066666667,0.133333333,0.266666667,0.066666667,0.133333333,0.266666667,0.066666667,0.4, 0.133333333,0.2,0.266666667,0.133333333,0.333333333,0.6 ,
+0.333333333,0.266666667 ,0.2,0.266666667,0.266666667,0.133333333,0.266666667,0.2,0.133333333, 0.266666667,0.133333333 ,0.2,0.333333333,0.2,0.266666667,0.2,0.533333333,
+0.266666667, 0.2,0.2,0.2, 0.2, 0.133333333,0.133333333,0.133333333,0.2 ,0.2,0.2,0.2,0.266666667,0.2, 0.2,0.2 ,0.333333333,0.6,
+0.266666667,0.066666667 ,0.2,0.066666667,0.133333333,0.2,0.066666667,0.066666667,0.466666667,0.133333333, 0.333333333,0.333333333, 0.066666667,0.133333333,0.133333333 ,0.066666667, 0.133333333, 0.266666667, 0.533333333,
+0.066666667,0,0.133333333,0,0.066666667 ,0.333333333 , 0.133333333, 0.133333333,0.066666667 , 0.066666667, 0.133333333 ,0.2 , 0, 0,0.133333333 ,0.066666667 , 0.066666667 , 0.133333333 , 0.066666667 , 1,
+0.266666667,0.2 ,0.133333333 , 0.2 , 0.2 , 0.2 , 0.2,0.2 , 0.2,0.2 ,0.2 , 0.2 , 0.2 , 0.133333333, 0.2 ,0.2 , 0.266666667 , 0.266666667,0.2 ,0.133333333, 0.2 ,
+0.133333333,0.066666667 , 0.133333333 , 0.066666667 , 0.133333333 , 0.466666667, 0.066666667,0.4 ,0.2, 0.133333333 , 0.2 , 0.2, 0.133333333 , 0.066666667, 0.2, 0.133333333,0.133333333 ,0.133333333 , 0.2 , 0.4,0.2 , 0.733333333,
+0.2,0.333333333 ,0.066666667 ,0.333333333 , 0.533333333 , 0.066666667, 0.133333333, 0.266666667,0.066666667 ,0.333333333 , 0.066666667, 0.2, 0.266666667, 0.2 , 0.466666667,0.266666667 ,0.266666667, 0.2 , 0.133333333, 0.066666667 , 0.2,0.133333333 ,0.533333333
+
+}
+};
+
+//normalized blosum_30 scoring matrix for computing protein sequence similarity
+score_matrix normalized_blosum_30 = {
+
+"ABCDEFGHIKLMNPQRSTVWXYZ",
+{
+0.407407407 ,
+0.259259259 , 0.444444444 ,
+0.148148148 , 0.185185185 , 0.888888889 ,
+0.259259259 , 0.444444444 , 0.148148148 , 0.592592593 ,
+0.259259259 , 0.259259259 , 0.296296296 , 0.296296296 , 0.481481481 ,
+0.185185185 , 0.148148148 , 0.148148148 , 0.074074074 , 0.111111111 , 0.62962963 ,
+0.259259259 , 0.259259259 , 0.111111111 , 0.222222222 , 0.185185185 , 0.148148148 , 0.555555556 ,
+0.185185185 , 0.185185185 , 0.074074074 , 0.185185185 , 0.259259259 , 0.148148148 , 0.148148148 , 0.777777778 ,
+0.259259259 , 0.185185185 , 0.185185185 , 0.111111111 , 0.148148148 , 0.259259259 , 0.222222222 , 0.185185185 , 0.481481481 ,
+0.259259259 , 0.259259259 , 0.148148148 , 0.259259259 , 0.333333333 , 0.222222222 , 0.222222222 , 0.185185185 , 0.185185185 , 0.407407407 ,
+0.222222222 , 0.222222222 , 0.259259259 , 0.222222222 , 0.222222222 , 0.333333333 , 0.185185185 , 0.222222222 , 0.333333333 , 0.185185185 , 0.407407407 ,
+0.296296296 , 0.185185185 , 0.185185185 , 0.148148148 , 0.222222222 , 0.185185185 , 0.185185185 , 0.333333333 , 0.296296296 , 0.333333333 , 0.333333333 , 0.481481481 ,
+0.259259259 , 0.407407407 , 0.222222222 , 0.296296296 , 0.222222222 , 0.222222222 , 0.259259259 , 0.222222222 , 0.259259259 , 0.259259259 , 0.185185185 , 0.259259259 , 0.555555556 ,
+0.222222222 , 0.185185185 , 0.148148148 , 0.222222222 , 0.296296296 , 0.111111111 , 0.222222222 , 0.296296296 , 0.148148148 , 0.296296296 , 0.148148148 , 0.111111111 , 0.148148148 , 0.666666667 ,
+0.296296296 , 0.222222222 , 0.185185185 , 0.222222222 , 0.333333333 , 0.148148148 , 0.185185185 , 0.259259259 , 0.185185185 , 0.259259259 , 0.185185185 , 0.222222222 , 0.222222222 , 0.259259259 , 0.555555556 ,
+0.222222222 , 0.185185185 , 0.185185185 , 0.222222222 , 0.222222222 , 0.222222222 , 0.185185185 , 0.222222222 , 0.148148148 , 0.296296296 , 0.185185185 , 0.259259259 , 0.185185185 , 0.222222222 , 0.37037037 , 0.555555556 ,
+0.296296296 , 0.259259259 , 0.185185185 , 0.259259259 , 0.259259259 , 0.222222222 , 0.259259259 , 0.222222222 , 0.222222222 , 0.259259259 , 0.185185185 , 0.185185185 , 0.259259259 , 0.222222222 , 0.222222222 , 0.222222222 , 0.407407407 ,
+0.296296296 , 0.259259259 , 0.185185185 , 0.222222222 , 0.185185185 , 0.185185185 , 0.185185185 , 0.185185185 , 0.259259259 , 0.222222222 , 0.259259259 , 0.259259259 , 0.296296296 , 0.259259259 , 0.259259259 , 0.148148148 , 0.333333333 , 0.444444444 ,
+0.296296296 , 0.185185185 , 0.185185185 , 0.185185185 , 0.148148148 , 0.296296296 , 0.148148148 , 0.148148148 , 0.407407407 , 0.185185185 , 0.296296296 , 0.259259259 , 0.185185185 , 0.111111111 , 0.148148148 , 0.222222222 , 0.222222222 , 0.296296296 , 0.444444444 ,
+0.074074074 , 0.074074074 , 0.185185185 , 0.111111111 , 0.222222222 , 0.296296296 , 0.296296296 , 0.074074074 , 0.148148148 , 0.185185185 , 0.185185185 , 0.148148148 , 0 , 0.148148148 , 0.222222222 , 0.259259259 , 0.148148148 , 0.074074074 , 0.148148148 , 1 ,
+0.259259259 , 0.222222222 , 0.185185185 , 0.222222222 , 0.222222222 , 0.222222222 , 0.222222222 , 0.222222222 , 0.259259259 , 0.259259259 , 0.259259259 , 0.259259259 , 0.259259259 , 0.222222222 , 0.259259259 , 0.222222222 , 0.259259259 , 0.259259259 , 0.259259259 , 0.185185185 , 0.222222222 ,
+0.111111111 , 0.148148148 , 0.037037037 , 0.222222222 , 0.185185185 , 0.37037037 , 0.148148148 , 0.259259259 , 0.222222222 , 0.222222222 , 0.37037037 , 0.222222222 , 0.111111111 , 0.185185185 , 0.222222222 , 0.259259259 , 0.185185185 , 0.222222222 , 0.296296296 , 0.444444444 , 0.222222222 , 0.592592593 ,
+0.259259259 , 0.259259259 , 0.259259259 , 0.259259259 , 0.444444444 , 0.111111111 , 0.185185185 , 0.259259259 , 0.148148148 , 0.296296296 , 0.222222222 , 0.222222222 , 0.222222222 , 0.259259259 , 0.407407407 , 0.259259259 , 0.222222222 , 0.222222222 , 0.148148148 , 0.222222222 , 0.259259259 , 0.185185185 , 0.407407407
+
+}
+};
+
+//default nucleotide sequence scoring matrix
+//used when -nuc option is used
+score_matrix nuc_simple = {
+
+"ABCDGHKMNRSTUVWXY",
+
+{ 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0 }
+
+//Ribosum85-60
+ /*
+ {
+ 2.22,
+ 0, 0,
+ -1.86, 0, 1.16,
+ 0, 0, 0, 0,
+ -1.46, 0, -2.48, 0, 1.03,
+ 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ -1.39, 0, -1.05, 0, -1.74, 0, 0, 0, 0, 0, 0, 1.65,
+ -1.39, 0, -1.05, 0, -1.74, 0, 0, 0, 0, 0, 0, 0, 1.65,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ }
+ */
+
+ };
+
+#endif
--- /dev/null
+/***********************************************
+ * # Copyright 2009-2010. Liu Yongchao
+ * # Contact: Liu Yongchao, School of Computer Engineering,
+ * # Nanyang Technological University.
+ * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
+ * #
+ * # GPL version 3.0 applies.
+ * #
+ * ************************************************/
+
+#include <string>
+#include <sstream>
+#include <iomanip>
+#include <iostream>
+#include <list>
+#include <set>
+#include <algorithm>
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <cerrno>
+#include <iomanip>
+#include "MSA.h"
+#include "MSAClusterTree.h"
+#include "Defaults.h"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+string parametersInputFilename = ""; // empty: ReadParameters() uses the built-in defaults instead of a file
+string parametersOutputFilename = "no training";
+string annotationFilename = "";
+
+bool enableVerbose = false; // extra progress output on STDERR in doAlign()
+bool enableAnnotation = false; // doAlign() calls WriteAnnotation() on the final alignment
+bool enableClustalWOutput = false; // write the result with WriteALN() instead of WriteMFA()
+bool enableAlignOrder = false;
+int numConsistencyReps = 2; // number of DoRelaxation() passes in doAlign()
+int numPreTrainingReps = 0;
+int numIterativeRefinementReps = 100;
+
+float cutoff = 0;
+
+VF initDistrib(NumMatrixTypes); // filled by ReadParameters()
+VF gapOpen(2 * NumInsertStates); // filled by ReadParameters()
+VF gapExtend(2 * NumInsertStates); // filled by ReadParameters()
+VVF emitPairs(256, VF(256, 1e-10)); // indexed by (unsigned char) symbol pairs; ReadParameters() resets and fills it
+VF emitSingle(256, 1e-5); // indexed by (unsigned char) symbol; ReadParameters() resets and fills it
+
+string alphabet = alphabetDefault; // symbol order of the emission tables; replaced when a parameter file is read
+
+const int MIN_PRETRAINING_REPS = 0;
+const int MAX_PRETRAINING_REPS = 20;
+const int MIN_CONSISTENCY_REPS = 0;
+const int MAX_CONSISTENCY_REPS = 5;
+const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
+const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
+
+string posteriorProbsFilename = "";
+bool allscores = true;
+string infilename;
+
+int flag_gui = 0; //0: no gui related o/p
+//1: gui related o/p generated
+int flag_ppscore = 0; //0: no pp score sequence added to o/p fasta alignment
+//1: pp score seq added to o/p fasta alignment
+
+///////////////////////////////
+// global scoring matrix variables (definitions; shared with the matrix readers through extern declarations)
+//////////////////////////////
+float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2;
+char *aminos, *bases, matrixtype[20] = "gonnet_160"; // matrixtype: name read by matrixtype_to_int(); the default selects the protein matrix
+int subst_index[26]; // (letter - 'A') -> matrix row, written by the matrix readers
+
+double sub_matrix[26][26]; // filled by read_matrix()
+int firstread = 0; //this makes sure that matrices are read only once
+
+float TEMPERATURE = 5; // init_arguments() derives argument.beta = 1 / TEMPERATURE
+int MATRIXTYPE = 160;
+int prot_nuc = 0; //0=prot, 1=nucleotide
+
+float GAPOPEN = 0; // 0 for both GAPOPEN and GAPEXT keeps the per-matrix gap defaults in init_arguments()
+float GAPEXT = 0;
+int numThreads = 0; // <= 0: the MSA constructor uses omp_get_num_procs() when built with OpenMP
+
+//argument support; NOTE(review): this struct is declared a second time next to init_arguments() and the two layouts must stay identical
+typedef struct {
+ char input[30];
+ int matrix; // numeric matrix code
+ int N;
+ float T; // temperature
+ float beta; // init_arguments() stores 1 / TEMPERATURE here
+ char opt; //can be 'P' or 'M'
+ float gapopen;
+ float gapext;
+} argument_decl;
+
+argument_decl argument; // the single global settings record, filled by init_arguments()
+
+extern inline void read_sustitution_matrix(char *fileName);
+extern void setmatrixtype(int le);
+extern inline int matrixtype_to_int();
+extern inline void read_dna_matrix();
+extern inline void read_vtml_la_matrix();
+extern void init_arguments(); // called from the MSA constructor right after ParseParams()
+
+// Runs the whole program: parses the command line, initialises the
+// scoring arguments and model parameters, loads every input file,
+// aligns the sequences and writes the alignment to *alignOutFile.
+MSA::MSA(int argc, char* argv[]) {
+    // command line -> list of input files
+    SafeVector<string> inputFiles = ParseParams(argc, argv);
+
+    // arguments for the partition function, then the HMM parameters
+    init_arguments();
+    ReadParameters();
+    //PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
+
+    // load all input sequences into one container
+    MultiSequence *sequences = new MultiSequence();
+    assert(sequences);
+    const int numFiles = (int) inputFiles.size();
+    int fileIdx = 0;
+    while (fileIdx < numFiles) {
+        cerr << "Loading sequence file: " << inputFiles[fileIdx] << endl;
+        sequences->LoadMFA(inputFiles[fileIdx], true);
+        ++fileIdx;
+    }
+
+    // one weight slot per sequence
+    this->seqsWeights = new int[sequences->GetNumSequences()];
+
+#ifdef _OPENMP
+    // choose the thread count: explicit setting, else one per detected core
+    if(numThreads <= 0) {
+        numThreads = omp_get_num_procs();
+        cerr << "Automatically detected " << numThreads << " CPU cores" << endl;
+    }
+    cerr <<"Enabling OpenMP (with "<<numThreads<<" threads)"<<endl;
+
+    omp_set_num_threads(numThreads);
+#endif
+
+    // align, using a model built from the current global parameters
+    MultiSequence *alignment = doAlign(sequences,
+            ProbabilisticModel(initDistrib, gapOpen, gapExtend, emitPairs,
+                    emitSingle), initDistrib, gapOpen, gapExtend, emitPairs,
+            emitSingle);
+
+    // emit the result in the requested format
+    if (!enableClustalWOutput)
+        alignment->WriteMFA(*alignOutFile);
+    else
+        alignment->WriteALN(*alignOutFile);
+
+    // release resources
+    delete[] this->seqsWeights;
+    delete alignment;
+    delete sequences;
+}
+// Closes the output stream, but only when an output file name was set.
+MSA::~MSA() {
+    if (alignOutFileName.length() == 0)
+        return; // no output file name recorded, nothing to close
+
+    ((std::ofstream*) alignOutFile)->close();
+}
+/////////////////////////////////////////////////////////////////
+// PrintParameters()
+//
+// Writes the initial distribution and the gap open/extend
+// parameters to STDERR under the heading `message`. When
+// `filename` is non-NULL the full parameter set (transitions,
+// alphabet, lower-triangular pair emissions, single emissions) is
+// also written to that file; failure to open it ends the program.
+/////////////////////////////////////////////////////////////////
+
+void MSA::PrintParameters(const char *message, const VF &initDistrib,
+        const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs,
+        const VF &emitSingle, const char *filename) {
+
+    const int numTransitions = NumInsertStates * 2;
+    int k;
+
+    // ---- screen report ----
+    cerr << message << endl;
+    cerr << " initDistrib[] = { ";
+    for (k = 0; k < NumMatrixTypes; ++k)
+        cerr << setprecision(10) << initDistrib[k] << " ";
+    cerr << "}" << endl;
+
+    cerr << " gapOpen[] = { ";
+    for (k = 0; k < numTransitions; ++k)
+        cerr << setprecision(10) << gapOpen[k] << " ";
+    cerr << "}" << endl;
+
+    cerr << " gapExtend[] = { ";
+    for (k = 0; k < numTransitions; ++k)
+        cerr << setprecision(10) << gapExtend[k] << " ";
+    cerr << "}" << endl << endl;
+
+    // ---- optional parameter file ----
+    if (filename == NULL)
+        return;
+
+    FILE *out = fopen(filename, "w");
+    if (out == NULL) {
+        cerr << "ERROR: Unable to write parameter file: " << filename
+                << endl;
+        exit(1);
+    }
+
+    // three transition lines
+    for (k = 0; k < NumMatrixTypes; ++k)
+        fprintf(out, "%.10f ", initDistrib[k]);
+    fprintf(out, "\n");
+    for (k = 0; k < numTransitions; ++k)
+        fprintf(out, "%.10f ", gapOpen[k]);
+    fprintf(out, "\n");
+    for (k = 0; k < numTransitions; ++k)
+        fprintf(out, "%.10f ", gapExtend[k]);
+    fprintf(out, "\n");
+
+    // alphabet line, then one row of pair emissions per symbol
+    fprintf(out, "%s\n", alphabet.c_str());
+    const int numSymbols = (int) alphabet.size();
+    for (int row = 0; row < numSymbols; ++row) {
+        const unsigned char rowSym = (unsigned char) alphabet[row];
+        for (int col = 0; col <= row; ++col) {
+            const unsigned char colSym = (unsigned char) alphabet[col];
+            fprintf(out, "%.10f ", emitPairs[rowSym][colSym]);
+        }
+        fprintf(out, "\n");
+    }
+
+    // single-symbol emissions on the last line
+    for (int sym = 0; sym < numSymbols; ++sym)
+        fprintf(out, "%.10f ", emitSingle[(unsigned char) alphabet[sym]]);
+    fprintf(out, "\n");
+
+    fclose(out);
+}
+
+/////////////////////////////////////////////////////////////////
+// doAlign()
+//
+// Computes a posterior probability matrix for every sequence pair
+// by merging three sources (pair-HMM, partition function via
+// ComputePostProbs(), and the second pair-HMM pass run with
+// `false`) as their root mean square. The merged matrices give the
+// pairwise distances for the guide tree and, after
+// numConsistencyReps relaxation passes, drive the final alignment.
+//
+// sequences : input sequences (not owned, must be non-NULL)
+// model     : pair-HMM used for all matrix/alignment computations
+// initDistrib, gapOpen, gapExtend, emitPairs, emitSingle :
+//             kept for interface compatibility; not referenced in
+//             the body
+// returns   : newly allocated final alignment, owned by the caller
+/////////////////////////////////////////////////////////////////
+extern VF *ComputePostProbs(int a, int b, string seq1, string seq2);
+MultiSequence* MSA::doAlign(MultiSequence *sequences,
+        const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen,
+        VF &gapExtend, VVF &emitPairs, VF &emitSingle) {
+    assert(sequences);
+
+    //get the number of sequences
+    const int numSeqs = sequences->GetNumSequences();
+
+    //create distance matrix
+    VVF probalign_distances(numSeqs, VF(numSeqs, 0));
+    VVF distances(numSeqs, VF(numSeqs, 0));//msa
+
+    // sum of per-pair identity fractions; only the disabled self-adaptive
+    // block below reads it
+    float gl_accuracy = 0;
+    //create sparseMatrices
+    SafeVector<SafeVector<SparseMatrix *> > probalign_sparseMatrices(numSeqs,
+            SafeVector<SparseMatrix *>(numSeqs, NULL));
+    SafeVector<SafeVector<SparseMatrix *> > sparseMatrices(numSeqs,
+            SafeVector<SparseMatrix *>(numSeqs, NULL)); // msa
+
+#ifdef _OPENMP
+    //calculate sequence pairs for openmp model
+    int pairIdx = 0;
+    numPairs = (numSeqs - 1) * numSeqs / 2;
+    seqsPairs = new SeqsPair[numPairs];
+    for(int a = 0; a < numSeqs; a++) {
+        for(int b = a + 1; b < numSeqs; b++) {
+            seqsPairs[pairIdx].seq1 = a;
+            seqsPairs[pairIdx].seq2 = b;
+            pairIdx++;
+        }
+    }
+#endif
+    // do all pairwise alignments for posterior probability matrices
+#ifdef _OPENMP
+#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
+    for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
+        int a= seqsPairs[pairIdx].seq1;
+        int b = seqsPairs[pairIdx].seq2;
+        if(enableVerbose) {
+#pragma omp critical
+            cerr <<"tid "<<omp_get_thread_num()<<" a "<<a<<" b "<<b<<endl;
+        }
+#else
+    for (int a = 0; a < numSeqs - 1; a++) {
+        for (int b = a + 1; b < numSeqs; b++) {
+#endif
+            Sequence *seq1 = sequences->GetSequence(a);
+            Sequence *seq2 = sequences->GetSequence(b);
+
+            // verbose output
+            if (enableVerbose) {
+                cerr << "Computing posterior matrix: (" << a + 1 << ") "
+                        << seq1->GetHeader() << " vs. " << "(" << b + 1 << ") "
+                        << seq2->GetHeader() << " -- ";
+            }
+
+//probcons
+            // compute forward and backward probabilities
+            VF *forward = model.ComputeForwardMatrix(seq1, seq2);
+            assert(forward);
+            VF *backward = model.ComputeBackwardMatrix(seq1, seq2);
+            assert(backward);
+            // compute posterior probability matrix from HMM
+            VF *probcons_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward);
+            assert(probcons_posterior);
+            delete forward;
+            delete backward;
+
+//probalign
+            VF *probalign_posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString());
+            assert(probalign_posterior);
+            probalign_sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),seq2->GetLength(), *probalign_posterior);
+            probalign_sparseMatrices[b][a] = NULL;
+            pair<SafeVector<char> *, float> probalign_alignment = model.ComputeAlignment(
+                    seq1->GetLength(), seq2->GetLength(), *probalign_posterior);
+            probalign_distances[a][b] =1.0f - probalign_alignment.second / min(seq1->GetLength(), seq2->GetLength());
+            delete probalign_alignment.first;
+
+//local
+            forward = model.ComputeForwardMatrix(seq1, seq2,false);
+            assert(forward);
+            backward = model.ComputeBackwardMatrix(seq1, seq2,false);
+            assert(backward);
+            VF* local_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false);
+            delete forward;
+            delete backward;
+
+//GL
+            //merge probalign + local + probcons (root mean square, cell by cell)
+            VF::iterator ptr1 = probcons_posterior->begin();
+            VF::iterator ptr2 = probalign_posterior->begin();
+            VF::iterator ptr3 = local_posterior->begin();
+            VF* posterior = new VF((seq1->GetLength()+1) * (seq2->GetLength()+1)); assert (posterior); //msa
+            VF::iterator ptr = posterior->begin();
+            for (int i = 0; i <= seq1->GetLength(); i++) {
+                for (int j = 0; j <= seq2->GetLength(); j++) {
+                    float v1 = *ptr1;
+                    float v2 = *ptr2;
+                    float v3 = *ptr3;
+                    *ptr = sqrt((v1*v1 + v2*v2 + v3*v3)/3);
+                    ptr1++;
+                    ptr2++;
+                    ptr3++;
+                    ptr++;
+                }
+            }
+            // perform the pairwise sequence alignment
+            pair<SafeVector<char> *, float> gl_alignment = model.ComputeAlignment(
+                    seq1->GetLength(), seq2->GetLength(), *posterior);
+
+            //compute expected accuracy
+            distances[a][b] = distances[b][a] = 1.0f - gl_alignment.second
+                    / min(seq1->GetLength(), seq2->GetLength());
+
+            // compute sparse representations
+            sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),
+                    seq2->GetLength(), *posterior);
+            sparseMatrices[b][a] = NULL;
+
+            // count identical residues in the aligned ('B') columns
+            SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
+            SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
+            float N_correct_match = 0;
+            int i = 1;int j = 1;
+            for (SafeVector<char>::iterator iter = gl_alignment.first->begin();
+                    iter != gl_alignment.first->end(); ++iter){
+                if (*iter == 'B'){
+                    unsigned char c1 = (unsigned char) iter1[i++];
+                    unsigned char c2 = (unsigned char) iter2[j++];
+                    if(c1==c2) N_correct_match += 1;
+                }
+                else if(*iter == 'X') i++;
+                else if(*iter == 'Y') j++;
+            }
+            if(i!= seq1->GetLength()+1 || j!= seq2->GetLength() + 1 ) cerr << "GL"<< endl;
+
+            // gl_accuracy is shared between the OpenMP threads, so the
+            // accumulation has to be atomic
+            const float pair_accuracy = N_correct_match / min(seq1->GetLength(), seq2->GetLength());
+#ifdef _OPENMP
+#pragma omp atomic
+#endif
+            gl_accuracy += pair_accuracy;
+
+            // release everything that was allocated for this pair; the
+            // alignment path was previously leaked
+            delete gl_alignment.first;
+            delete probcons_posterior;
+            delete probalign_posterior;
+            delete local_posterior;
+            delete posterior;
+
+#ifndef _OPENMP
+        }
+#endif
+    }
+
+/*
+//self-adaptive
+    gl_accuracy /= numPairs;
+    if(gl_accuracy > 0.4){
+        for (int a = 0; a < numSeqs - 1; a++)
+            for (int b = a + 1; b < numSeqs; b++) {
+                distances[a][b] = distances[b][a] = probalign_distances[a][b];
+                sparseMatrices[a][b] = probalign_sparseMatrices[a][b];
+                sparseMatrices[b][a] = NULL;
+            }
+    }
+*/
+    // The probalign-only matrices are referenced solely by the disabled
+    // self-adaptive block above, so free them here instead of leaking them.
+    // NOTE(review): if that block is re-enabled it must take ownership
+    // (and free the matrices it replaces) before this loop runs.
+    for (int a = 0; a < numSeqs - 1; a++) {
+        for (int b = a + 1; b < numSeqs; b++) {
+            delete probalign_sparseMatrices[a][b];
+            probalign_sparseMatrices[a][b] = NULL;
+        }
+    }
+
+    //create the guide tree
+    this->tree = new MSAClusterTree(this, distances, numSeqs);
+    this->tree->create();
+
+    // perform the consistency transformation the desired number of times
+    float* fweights = new float[numSeqs];
+    for (int r = 0; r < numSeqs; r++) {
+        fweights[r] = ((float) seqsWeights[r]) / INT_MULTIPLY;
+        fweights[r] *= 10;
+    }
+    for (int r = 0; r < numConsistencyReps; r++) {
+        SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices =
+                DoRelaxation(fweights, sequences, sparseMatrices);
+
+        // now replace the old posterior matrices
+        for (int i = 0; i < numSeqs; i++) {
+            for (int j = 0; j < numSeqs; j++) {
+                delete sparseMatrices[i][j];
+                sparseMatrices[i][j] = newSparseMatrices[i][j];
+            }
+        }
+    }
+    delete[] fweights;
+#ifdef _OPENMP
+    delete [] seqsPairs;
+#endif
+
+    //compute the final multiple sequence alignment
+    MultiSequence *finalAlignment = ComputeFinalAlignment(this->tree, sequences,
+            sparseMatrices, model);
+
+    // build annotation
+    if (enableAnnotation) {
+        WriteAnnotation(finalAlignment, sparseMatrices);
+    }
+    //destroy the guide tree
+    delete this->tree;
+    this->tree = 0;
+
+    // delete sparse matrices
+    for (int a = 0; a < numSeqs - 1; a++) {
+        for (int b = a + 1; b < numSeqs; b++) {
+            delete sparseMatrices[a][b];
+            delete sparseMatrices[b][a];
+        }
+    }
+
+    return finalAlignment;
+}
+
+/////////////////////////////////////////////////////////////////
+// GetInteger()
+//
+// Parses the leading integer of `data` with strtol() (base 0, so
+// "0x" and leading-"0" prefixes are honoured) and stores it in
+// *val. Returns false, leaving *val untouched, when nothing could
+// be converted, when strtol() reports an error, or when the value
+// does not fit in an int. Characters after the number are ignored.
+/////////////////////////////////////////////////////////////////
+
+bool GetInteger(char *data, int *val) {
+    assert(val);
+
+    char *stop = 0;
+    errno = 0;
+    const long parsed = strtol(data, &stop, 0);
+    const bool flagged = (errno != 0);
+
+    // zero result that is really "nothing converted" or an error
+    const bool nothingUsable = (parsed == 0) && (flagged || stop == data);
+    // strtol() clamps to LONG_MAX / LONG_MIN on overflow
+    const bool clamped = flagged && (parsed == LONG_MAX || parsed == LONG_MIN);
+    // representable as long but not as int
+    const bool tooWide = (parsed < (long) INT_MIN) || (parsed > (long) INT_MAX);
+
+    if (nothingUsable || clamped || tooWide)
+        return false;
+
+    *val = (int) parsed;
+    return true;
+}
+
+/////////////////////////////////////////////////////////////////
+// GetFloat()
+//
+// Parses the leading floating-point number of `data` with
+// strtod() and stores it, narrowed to float, in *val. Returns
+// false, leaving *val untouched, when nothing could be converted
+// or when strtod() reports an error for a zero or large-magnitude
+// (|x| >= 1e6) result. Characters after the number are ignored.
+/////////////////////////////////////////////////////////////////
+
+bool GetFloat(char *data, float *val) {
+    assert(val);
+
+    char *stop = 0;
+    errno = 0;
+    const double parsed = strtod(data, &stop);
+    const bool flagged = (errno != 0);
+
+    // zero result that is really "nothing converted" or an error
+    const bool nothingUsable = (parsed == 0) && (flagged || stop == data);
+    // error reported together with a huge magnitude
+    const bool outOfRange = flagged
+            && (parsed >= 1000000.0 || parsed <= -1000000.0);
+
+    if (nothingUsable || outOfRange)
+        return false;
+
+    *val = (float) parsed;
+    return true;
+}
+
+/////////////////////////////////////////////////////////////////
+// ReadParameters()
+//
+// Read initial distribution, transition, and emission
+// parameters, either from the built-in defaults (when
+// parametersInputFilename is empty) or from that file.
+//
+// File layout (the same one PrintParameters() writes):
+//   line 1-3 : initDistrib, gapOpen, gapExtend values
+//   line 4   : alphabet (whitespace between symbols is ignored)
+//   then     : lower-triangular pair emissions, row by row,
+//              followed by one single emission per symbol
+//
+// Every emission value is stored for both the upper- and
+// lower-case form of its symbol(s). Any read or parse failure
+// prints an error to STDERR and terminates the program.
+/////////////////////////////////////////////////////////////////
+
+// Stores `value` in `pairs` for the symbol pair (x, y) in both index
+// orders and for every upper/lower-case combination of the two symbols.
+static void SetEmitPairAllCases(VVF &pairs, char x, char y, float value) {
+    // go through unsigned char: tolower/toupper are undefined for negative input
+    const unsigned char xs[2] = { (unsigned char) tolower((unsigned char) x),
+            (unsigned char) toupper((unsigned char) x) };
+    const unsigned char ys[2] = { (unsigned char) tolower((unsigned char) y),
+            (unsigned char) toupper((unsigned char) y) };
+    for (int p = 0; p < 2; p++) {
+        for (int q = 0; q < 2; q++) {
+            pairs[xs[p]][ys[q]] = value;
+            pairs[ys[q]][xs[p]] = value;
+        }
+    }
+}
+
+// Stores `value` in `singles` for both the lower- and upper-case form of x.
+static void SetEmitSingleBothCases(VF &singles, char x, float value) {
+    singles[(unsigned char) tolower((unsigned char) x)] = value;
+    singles[(unsigned char) toupper((unsigned char) x)] = value;
+}
+
+// Parses `count` whitespace-separated numbers from `text` into
+// dest[0..count); returns false as soon as one of them cannot be read.
+static bool ParseTransitionLine(const string &text, VF &dest, int count) {
+    istringstream fields(text);
+    for (int i = 0; i < count; i++) {
+        if (!(fields >> dest[i]))
+            return false;
+    }
+    return true;
+}
+
+// Reads the next emission value from `in`; reports the problem and
+// terminates the program when the value is missing or malformed.
+static float ReadEmissionValueOrDie(istream &in, const string &filename) {
+    float value = 0;
+    if (!(in >> value)) {
+        cerr << "ERROR: Unable to read emission parameters from parameter file: "
+                << filename << endl;
+        exit(1);
+    }
+    return value;
+}
+
+void MSA::ReadParameters() {
+
+    // start from the background values for every possible byte
+    emitPairs = VVF(256, VF(256, 1e-10));
+    emitSingle = VF(256, 1e-5);
+
+    // read initial state distribution and transition parameters
+    if (parametersInputFilename == string("")) {
+        if (NumInsertStates == 1) {
+            for (int i = 0; i < NumMatrixTypes; i++)
+                initDistrib[i] = initDistrib1Default[i];
+            for (int i = 0; i < 2 * NumInsertStates; i++)
+                gapOpen[i] = gapOpen1Default[i];
+            for (int i = 0; i < 2 * NumInsertStates; i++)
+                gapExtend[i] = gapExtend1Default[i];
+        } else if (NumInsertStates == 2) {
+            for (int i = 0; i < NumMatrixTypes; i++)
+                initDistrib[i] = initDistrib2Default[i];
+            for (int i = 0; i < 2 * NumInsertStates; i++)
+                gapOpen[i] = gapOpen2Default[i];
+            for (int i = 0; i < 2 * NumInsertStates; i++)
+                gapExtend[i] = gapExtend2Default[i];
+        } else {
+            cerr
+                    << "ERROR: No default initial distribution/parameter settings exist"
+                    << endl << "       for " << NumInsertStates
+                    << " pairs of insert states.  Use --paramfile." << endl;
+            exit(1);
+        }
+
+        alphabet = alphabetDefault;
+
+        for (int i = 0; i < (int) alphabet.length(); i++) {
+            SetEmitSingleBothCases(emitSingle, alphabet[i], emitSingleDefault[i]);
+            for (int j = 0; j <= i; j++)
+                SetEmitPairAllCases(emitPairs, alphabet[i], alphabet[j],
+                        emitPairsDefault[i][j]);
+        }
+        return;
+    }
+
+    ifstream data;
+    data.open(parametersInputFilename.c_str());
+    if (data.fail()) {
+        cerr << "ERROR: Unable to read parameter file: "
+                << parametersInputFilename << endl;
+        exit(1);
+    }
+
+    // the three transition lines
+    string line[3];
+    for (int i = 0; i < 3; i++) {
+        if (!getline(data, line[i])) {
+            cerr
+                    << "ERROR: Unable to read transition parameters from parameter file: "
+                    << parametersInputFilename << endl;
+            exit(1);
+        }
+    }
+    const bool transitionsOk =
+            ParseTransitionLine(line[0], initDistrib, NumMatrixTypes)
+            && ParseTransitionLine(line[1], gapOpen, 2 * NumInsertStates)
+            && ParseTransitionLine(line[2], gapExtend, 2 * NumInsertStates);
+    if (!transitionsOk) {
+        cerr
+                << "ERROR: Unable to read transition parameters from parameter file: "
+                << parametersInputFilename << endl;
+        exit(1);
+    }
+
+    if (!getline(data, line[0])) {
+        cerr << "ERROR: Unable to read alphabet from scoring matrix file: "
+                << parametersInputFilename << endl;
+        exit(1);
+    }
+
+    // read alphabet as concatenation of all characters on alphabet line
+    alphabet = "";
+    string token;
+    istringstream alphabetFields(line[0]);
+    while (alphabetFields >> token)
+        alphabet += token;
+
+    // lower-triangular pair emissions, one row per symbol
+    for (int i = 0; i < (int) alphabet.size(); i++) {
+        for (int j = 0; j <= i; j++) {
+            const float val = ReadEmissionValueOrDie(data,
+                    parametersInputFilename);
+            SetEmitPairAllCases(emitPairs, alphabet[i], alphabet[j], val);
+        }
+    }
+
+    // single emissions, one per symbol
+    for (int i = 0; i < (int) alphabet.size(); i++) {
+        const float val = ReadEmissionValueOrDie(data,
+                parametersInputFilename);
+        SetEmitSingleBothCases(emitSingle, alphabet[i], val);
+    }
+    data.close();
+}
+
+/////////////////////////////////////////////////////////////////
+// printUsage()
+//
+// Prints the program banner and a summary of the command-line
+// options to STDERR. Default values shown are the current values
+// of the corresponding global settings.
+/////////////////////////////////////////////////////////////////
+void MSA::printUsage() {
+    // program banner
+    cerr
+            << "************************************************************************"
+            << endl
+            << "\tMSAPROBS is an open-source protein multiple sequence alignment algorithm"
+            << endl
+            << "\tbased on pair hidden markov model and partition function posterior"
+            << endl
+            << "\tprobabilities. If any comments or problems, please contact"
+            << endl
+            << "\tLiu Yongchao(liuy0039@ntu.edu.sg or nkcslyc@hotmail.com)"
+            << endl
+            << "*************************************************************************"
+            << endl;
+
+    // synopsis
+    cerr << "Usage:" << endl
+            << " msaprobs [OPTION]... [infile]..." << endl << endl
+            << "Description:" << endl
+            << " Align sequences in multi-FASTA format" << endl << endl;
+
+    // input / output / execution options
+    cerr << " -o, --outfile <string>" << endl
+            << " specify the output file name (STDOUT by default)"
+            << endl << " -p, --paramfile FILENAME" << endl
+            << " read model parameters from FILENAME instead of using the built-in defaults"
+            << endl << " -num_threads <integer>" << endl
+            << " specify the number of threads used, and otherwise detect automatically"
+            << endl << " -clustalw" << endl
+            << " use CLUSTALW output format instead of FASTA format"
+            << endl << endl;
+
+    // alignment options
+    cerr << " -c, --consistency REPS" << endl
+            << " use " << MIN_CONSISTENCY_REPS << " <= REPS <= "
+            << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps
+            << ") passes of consistency transformation" << endl << endl
+            << " -ir, --iterative-refinement REPS" << endl
+            << " use " << MIN_ITERATIVE_REFINEMENT_REPS
+            << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS << " (default: "
+            << numIterativeRefinementReps << ") passes of iterative-refinement"
+            << endl << endl
+            << " -co, --cutoff CUTOFF" << endl
+            << " use 0 <= CUTOFF <= 1 (default: " << cutoff
+            << ") as the posterior cutoff when aligning" << endl << endl;
+
+    // reporting options
+    cerr << " -v, --verbose" << endl
+            << " report progress while aligning (default: "
+            << (enableVerbose ? "on" : "off") << ")" << endl << endl
+            << " -annot FILENAME" << endl
+            << " write annotation for multiple alignment to FILENAME"
+            << endl << endl << " -a, --alignment-order" << endl
+            << " print sequences in alignment order rather than input order (default: "
+            << (enableAlignOrder ? "on" : "off") << ")" << endl
+            << " -version " << endl
+            << " print out version of MSAPROBS " << endl
+            << " -help, -?" << endl
+            << " print this usage message" << endl << endl;
+}
+/////////////////////////////////////////////////////////////////
+// RequireOptionValue()
+//
+// Returns the argument that follows option argv[i] and advances
+// i onto it. If the option is the last argument, reports which
+// kind of value was expected and exits with status 1.
+/////////////////////////////////////////////////////////////////
+static char* RequireOptionValue(int argc, char **argv, int &i,
+        const char *expected) {
+    if (i >= argc - 1) {
+        cerr << "ERROR: " << expected << " expected for option " << argv[i]
+                << endl;
+        exit(1);
+    }
+    return argv[++i];
+}
+
+/////////////////////////////////////////////////////////////////
+// ParseParams()
+//
+// Parses all command-line options into the global settings and
+// opens the output stream. Arguments that do not start with '-'
+// are collected, in order, and returned as input file names.
+// Any invalid option or value is reported on STDERR and ends the
+// program with exit status 1.
+/////////////////////////////////////////////////////////////////
+SafeVector<string> MSA::ParseParams(int argc, char **argv) {
+    if (argc < 2) {
+        printUsage();
+        exit(1);
+    }
+    SafeVector<string> sequenceNames;
+    int tempInt;
+    float tempFloat;
+
+    for (int i = 1; i < argc; i++) {
+        // anything that is not an option is an input sequence file
+        if (argv[i][0] != '-') {
+            sequenceNames.push_back(string(argv[i]));
+            continue;
+        }
+
+        // keep the option text: i moves on once its value is consumed
+        char *option = argv[i];
+
+        // help
+        if (!strcmp(option, "-help") || !strcmp(option, "-?")) {
+            printUsage();
+            exit(1);
+        }
+        // output file name
+        else if (!strcmp(option, "-o") || !strcmp(option, "--outfile")) {
+            alignOutFileName = RequireOptionValue(argc, argv, i, "String");
+        }
+        // parameter file
+        else if (!strcmp(option, "-p") || !strcmp(option, "--paramfile")) {
+            parametersInputFilename = string(
+                    RequireOptionValue(argc, argv, i, "Filename"));
+        }
+        // number of threads used; "-p" is not accepted as an alias here
+        // because it already selects the parameter file above
+        else if (!strcmp(option, "-num_threads")) {
+            char *value = RequireOptionValue(argc, argv, i, "Integer");
+            if (!GetInteger(value, &tempInt)) {
+                cerr << " ERROR: invalid integer following option "
+                        << option << ": " << value << endl;
+                exit(1);
+            }
+            // negative counts fall back to 0
+            if (tempInt < 0) {
+                tempInt = 0;
+            }
+            numThreads = tempInt;
+        }
+        // number of consistency transformations
+        else if (!strcmp(option, "-c") || !strcmp(option, "--consistency")) {
+            char *value = RequireOptionValue(argc, argv, i, "Integer");
+            if (!GetInteger(value, &tempInt)) {
+                cerr << "ERROR: Invalid integer following option "
+                        << option << ": " << value << endl;
+                exit(1);
+            }
+            if (tempInt < MIN_CONSISTENCY_REPS
+                    || tempInt > MAX_CONSISTENCY_REPS) {
+                cerr << "ERROR: For option " << option
+                        << ", integer must be between "
+                        << MIN_CONSISTENCY_REPS << " and "
+                        << MAX_CONSISTENCY_REPS << "." << endl;
+                exit(1);
+            }
+            numConsistencyReps = tempInt;
+        }
+        // number of randomized partitioning iterative refinement passes
+        else if (!strcmp(option, "-ir")
+                || !strcmp(option, "--iterative-refinement")) {
+            char *value = RequireOptionValue(argc, argv, i, "Integer");
+            if (!GetInteger(value, &tempInt)) {
+                cerr << "ERROR: Invalid integer following option "
+                        << option << ": " << value << endl;
+                exit(1);
+            }
+            if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS
+                    || tempInt > MAX_ITERATIVE_REFINEMENT_REPS) {
+                cerr << "ERROR: For option " << option
+                        << ", integer must be between "
+                        << MIN_ITERATIVE_REFINEMENT_REPS << " and "
+                        << MAX_ITERATIVE_REFINEMENT_REPS << "."
+                        << endl;
+                exit(1);
+            }
+            numIterativeRefinementReps = tempInt;
+        }
+        // annotation files
+        else if (!strcmp(option, "-annot")) {
+            annotationFilename = RequireOptionValue(argc, argv, i, "FILENAME");
+            enableAnnotation = true;
+        }
+        // clustalw output format
+        else if (!strcmp(option, "-clustalw")) {
+            enableClustalWOutput = true;
+        }
+        // cutoff
+        else if (!strcmp(option, "-co") || !strcmp(option, "--cutoff")) {
+            char *value = RequireOptionValue(argc, argv, i,
+                    "Floating-point value");
+            if (!GetFloat(value, &tempFloat)) {
+                cerr
+                        << "ERROR: Invalid floating-point value following option "
+                        << option << ": " << value << endl;
+                exit(1);
+            }
+            if (tempFloat < 0 || tempFloat > 1) {
+                cerr << "ERROR: For option " << option
+                        << ", floating-point value must be between 0 and 1."
+                        << endl;
+                exit(1);
+            }
+            cutoff = tempFloat;
+        }
+        // verbose reporting
+        else if (!strcmp(option, "-v") || !strcmp(option, "--verbose")) {
+            enableVerbose = true;
+        }
+        // alignment order
+        else if (!strcmp(option, "-a")
+                || !strcmp(option, "--alignment-order")) {
+            enableAlignOrder = true;
+        }
+        // print out version
+        else if (!strcmp(option, "-version")) {
+            cerr << "MSAPROBS version " << VERSION << endl;
+            exit(1);
+        }
+        // bad arguments
+        else {
+            cerr << "ERROR: Unrecognized option: " << option << endl;
+            exit(1);
+        }
+    }
+
+    /*select the output stream*/
+    cerr << "-------------------------------------" << endl;
+    if (alignOutFileName.length() == 0) {
+        cerr << "The final alignments will be printed out to STDOUT" << endl;
+        alignOutFile = &std::cout;
+    } else {
+        cerr << "Open the output file " << alignOutFileName << endl;
+        ofstream *outFile = new ofstream(alignOutFileName.c_str(),
+                ios::binary | ios::out | ios::trunc);
+        // fail early instead of silently discarding the alignment later
+        if (!outFile->is_open()) {
+            cerr << "ERROR: Unable to open output file: "
+                    << alignOutFileName << endl;
+            exit(1);
+        }
+        alignOutFile = outFile;
+    }
+    cerr << "-------------------------------------" << endl;
+    return sequenceNames;
+}
+
+/////////////////////////////////////////////////////////////////
+// ProcessTree()
+//
+// Builds the alignment for the subtree rooted at "tree" by
+// recursion. A leaf yields a new MultiSequence holding a clone of
+// its input sequence; an inner node aligns the results of its two
+// children and frees them. The result is allocated with new.
+/////////////////////////////////////////////////////////////////
+MultiSequence* MSA::ProcessTree(TreeNode *tree, MultiSequence *sequences,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model) {
+
+    // leaf of the alignment tree: wrap a copy of the input sequence
+    if (tree->leaf != NODE) {
+        MultiSequence *single = new MultiSequence();
+        assert(single);
+        single->AddSequence(sequences->GetSequence(tree->idx)->Clone());
+        return single;
+    }
+
+    // inner node: solve both subtrees first
+    MultiSequence *leftAlign = ProcessTree(tree->left, sequences,
+            sparseMatrices, model);
+    MultiSequence *rightAlign = ProcessTree(tree->right, sequences,
+            sparseMatrices, model);
+    assert(leftAlign);
+    assert(rightAlign);
+
+    // merge them; the partial alignments are not needed afterwards
+    MultiSequence *merged = AlignAlignments(leftAlign, rightAlign,
+            sparseMatrices, model);
+    assert(merged);
+
+    delete leftAlign;
+    delete rightAlign;
+
+    return merged;
+}
+
+/////////////////////////////////////////////////////////////////
+// ComputeFinalAlignment()
+//
+// Builds the progressive alignment with ProcessTree() and then
+// runs numIterativeRefinementReps passes of iterative refinement
+// on it. Returns the refined alignment.
+/////////////////////////////////////////////////////////////////
+
+MultiSequence* MSA::ComputeFinalAlignment(MSAGuideTree*tree,
+        MultiSequence *sequences,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model) {
+    // progressive alignment along the guide tree
+    MultiSequence *alignment = ProcessTree(tree->getRoot(), sequences,
+            sparseMatrices, model);
+
+    // If alignment order was requested, remember the current sort labels
+    // and store the ordering. The flag is then cleared, which makes
+    // AlignAlignments() sort by label during the refinement passes.
+    SafeVector<int> savedLabels;
+    if (enableAlignOrder) {
+        for (int s = 0; s < alignment->GetNumSequences(); s++) {
+            savedLabels.push_back(alignment->GetSequence(s)->GetSortLabel());
+        }
+        alignment->SaveOrdering();
+        enableAlignOrder = false;
+    }
+
+    // (A tree-based refinement pass built on DoIterativeRefinementTreeNode()
+    // is currently disabled; only the randomized refinement runs.)
+    int pass = 0;
+    while (pass < numIterativeRefinementReps) {
+        DoIterativeRefinement(sparseMatrices, model, alignment);
+        ++pass;
+    }
+    cerr << endl;
+
+    // put the remembered labels back; nothing happens if none were saved
+    const int numSaved = (int) savedLabels.size();
+    for (int s = 0; s < numSaved; s++) {
+        alignment->GetSequence(s)->SetSortLabel(savedLabels[s]);
+    }
+
+    return alignment;
+}
+
+/////////////////////////////////////////////////////////////////
+// AlignAlignments()
+//
+// Aligns two MultiSequence objects against each other and returns
+// the combined alignment as a newly allocated MultiSequence.
+/////////////////////////////////////////////////////////////////
+
+MultiSequence* MSA::AlignAlignments(MultiSequence *align1,
+        MultiSequence *align2,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model) {
+
+    // verbose mode: announce which two groups are being merged
+    if (enableVerbose) {
+        for (int s = 0; s < align1->GetNumSequences(); s++) {
+            cerr << (s ? "," : "[") << align1->GetSequence(s)->GetLabel();
+        }
+        cerr << "] vs. ";
+        for (int s = 0; s < align2->GetNumSequences(); s++) {
+            cerr << (s ? "," : "[") << align2->GetSequence(s)->GetLabel();
+        }
+        cerr << "]: ";
+    }
+
+    // posterior matrix between the two groups (unweighted variant active)
+#if 1
+    VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
+#else
+    VF *posterior = model.BuildPosterior(getSeqsWeights(), align1, align2,
+            sparseMatrices, cutoff);
+#endif
+
+    // alignment path through the posterior matrix, paired with its score
+    pair<SafeVector<char> *, float> alignment = model.ComputeAlignment(
+            align1->GetSequence(0)->GetLength(),
+            align2->GetSequence(0)->GetLength(), *posterior);
+
+    delete posterior;
+
+    // verbose mode: report the score divided by the summed pairwise
+    // minimum lengths as an "accuracy" measure
+    if (enableVerbose) {
+        int totLength = 0;
+        for (int a = 0; a < align1->GetNumSequences(); a++) {
+            for (int b = 0; b < align2->GetNumSequences(); b++) {
+                totLength += min(align1->GetSequence(a)->GetLength(),
+                        align2->GetSequence(b)->GetLength());
+            }
+        }
+        cerr << alignment.second / totLength << endl;
+    }
+
+    // expand every sequence with the gaps dictated by the path:
+    // 'X' for the first group, 'Y' for the second
+    MultiSequence *result = new MultiSequence();
+    for (int a = 0; a < align1->GetNumSequences(); a++) {
+        result->AddSequence(
+                align1->GetSequence(a)->AddGaps(alignment.first, 'X'));
+    }
+    for (int b = 0; b < align2->GetNumSequences(); b++) {
+        result->AddSequence(
+                align2->GetSequence(b)->AddGaps(alignment.first, 'Y'));
+    }
+    if (!enableAlignOrder) {
+        result->SortByLabel();
+    }
+
+    // the path itself is temporary
+    delete alignment.first;
+
+    return result;
+}
+
+/////////////////////////////////////////////////////////////////
+// DoRelaxation()
+//
+// Performs one round of the probabilistic consistency transformation on
+// every sequence pair; the relaxed matrix of pair (i, j) is stored in entry [i][j] of the returned table and entry [j][i] is set to NULL.
+/////////////////////////////////////////////////////////////////
+// NOTE: the sequence weights are read, but every weighted update is commented out (here and in Relax()/Relax1()), so they do not change the returned values.
+SafeVector<SafeVector<SparseMatrix *> > MSA::DoRelaxation(float* seqsWeights,
+ MultiSequence *sequences,
+ SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
+ const int numSeqs = sequences->GetNumSequences();
+
+ SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices(numSeqs,
+ SafeVector<SparseMatrix *>(numSeqs, NULL)); // numSeqs x numSeqs result table, all NULL to start with
+
+ // for every pair of sequences (OpenMP build: one flat loop over seqsPairs; otherwise: nested loops with j > i)
+#ifdef _OPENMP
+ int pairIdx;
+#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
+ for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
+ int i = seqsPairs[pairIdx].seq1;
+ int j = seqsPairs[pairIdx].seq2;
+ float wi = seqsWeights[i];
+ float wj = seqsWeights[j];
+#else
+ for (int i = 0; i < numSeqs; i++) {
+ float wi = seqsWeights[i];
+ for (int j = i + 1; j < numSeqs; j++) {
+ float wj = seqsWeights[j];
+#endif
+ Sequence *seq1 = sequences->GetSequence(i);
+ Sequence *seq2 = sequences->GetSequence(j);
+ // progress output; in the OpenMP build only the statement directly after the pragma is inside the critical section
+ if (enableVerbose) {
+#ifdef _OPENMP
+#pragma omp critical
+#endif
+ cerr << "Relaxing (" << i + 1 << ") " << seq1->GetHeader()
+ << " vs. " << "(" << j + 1 << ") " << seq2->GetHeader()
+ << ": ";
+ }
+ // get the original posterior matrix as a dense array; it is updated in place and deleted further down
+ VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior();
+ assert(posteriorPtr);
+ VF &posterior = *posteriorPtr;
+
+ const int seq1Length = seq1->GetLength();
+ const int seq2Length = seq2->GetLength();
+
+ // contribution from the summation where z = x and z = y
+ float w = wi * wi * wj + wi * wj * wj; // only used to seed sumW
+ float sumW = w; // accumulated below but only read by commented-out code
+ for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
+ //posterior[k] = w*posterior[k];
+ posterior[k] += posterior[k]; // unweighted form: each entry is doubled
+ }
+
+ if (enableVerbose)
+ cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
+
+ // contribution from all other sequences; which stored matrices are combined depends on where k lies relative to i and j
+ for (int k = 0; k < numSeqs; k++) {
+ if (k != i && k != j) {
+ float wk = seqsWeights[k];
+ float w = wi * wj * wk; // shadows the outer w
+ sumW += w;
+ if (k < i)
+ Relax1(w, sparseMatrices[k][i], sparseMatrices[k][j],
+ posterior);
+ else if (k > i && k < j)
+ Relax(w, sparseMatrices[i][k], sparseMatrices[k][j],
+ posterior);
+ else { // remaining case: k > j, so the stored [j][k] matrix is transposed first
+ SparseMatrix *temp =
+ sparseMatrices[j][k]->ComputeTranspose();
+ Relax(w, sparseMatrices[i][k], temp, posterior);
+ delete temp;
+ }
+ }
+ }
+ //cerr<<"sumW "<<sumW<<endl;
+ for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
+ //posterior[k] /= sumW;
+ posterior[k] /= numSeqs; // normalise by the number of sequences
+ }
+ // mask out positions not originally in the posterior matrix
+ SparseMatrix *matXY = sparseMatrices[i][j];
+ for (int y = 0; y <= seq2Length; y++)
+ posterior[y] = 0; // row 0 is cleared completely
+ for (int x = 1; x <= seq1Length; x++) {
+ SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
+ SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
+ VF::iterator base = posterior.begin() + x * (seq2Length + 1); // start of dense row x
+ int curr = 0;
+ while (XYptr != XYend) {
+
+ // zero out all cells until the first filled column
+ while (curr < XYptr->first) {
+ base[curr] = 0;
+ curr++;
+ }
+
+ // now, skip over this column
+ curr++;
+ ++XYptr;
+ }
+
+ // zero out cells after last column
+ while (curr <= seq2Length) {
+ base[curr] = 0;
+ curr++;
+ }
+ }
+
+ // save the new posterior matrix
+ newSparseMatrices[i][j] = new SparseMatrix(seq1->GetLength(),
+ seq2->GetLength(), posterior);
+ newSparseMatrices[j][i] = NULL;
+
+ if (enableVerbose)
+ cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
+
+ delete posteriorPtr; // the dense working copy is no longer needed
+
+ if (enableVerbose)
+ cerr << "done." << endl;
+#ifndef _OPENMP
+ } // closes the inner j loop, which only exists in the non-OpenMP build
+#endif
+ } // closes the pair loop (OpenMP) or the outer i loop (non-OpenMP)
+
+ return newSparseMatrices;
+}
+
+/////////////////////////////////////////////////////////////////
+// Relax()
+//
+// Adds the consistency contribution of one intermediate sequence
+// z to "posterior": for every stored pair of cells (x[i], z[k])
+// and (z[k], y[j]) the product of their values is added to cell
+// (i, j). "weight" is accepted but not used; the weighted update
+// is disabled.
+/////////////////////////////////////////////////////////////////
+
+void MSA::Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY,
+        VF &posterior) {
+
+    assert(matXZ);
+    assert(matZY);
+
+    const int rowsX = matXZ->GetSeq1Length();
+    const int colsY = matZY->GetSeq2Length();
+    assert(matXZ->GetSeq2Length() == matZY->GetSeq1Length());
+
+    // walk the rows of X-Z
+    for (int x = 1; x <= rowsX; ++x) {
+        SafeVector<PIF>::iterator xz = matXZ->GetRowPtr(x);
+        SafeVector<PIF>::iterator xzStop = xz + matXZ->GetRowSize(x);
+
+        // start of row x in the dense posterior
+        VF::iterator row = posterior.begin() + x * (colsY + 1);
+
+        // every stored x[i]-z[k] cell ...
+        for (; xz != xzStop; ++xz) {
+            SafeVector<PIF>::iterator zy = matZY->GetRowPtr(xz->first);
+            SafeVector<PIF>::iterator zyStop = zy
+                    + matZY->GetRowSize(xz->first);
+            const float probXZ = xz->second;
+
+            // ... combined with every stored z[k]-y[j] cell
+            for (; zy != zyStop; ++zy) {
+                row[zy->first] += probXZ * zy->second;
+            }
+        }
+    }
+}
+
+/////////////////////////////////////////////////////////////////
+// Relax1()
+//
+// Same accumulation as Relax(), but for the case where both
+// matrices have the intermediate sequence z as their first
+// sequence (Z-X and Z-Y). For every row k the product of each
+// stored (z[k], x[i]) and (z[k], y[j]) value is added to cell
+// (i, j) of "posterior". "weight" is accepted but not used.
+/////////////////////////////////////////////////////////////////
+
+void MSA::Relax1(float weight, SparseMatrix *matZX, SparseMatrix *matZY,
+        VF &posterior) {
+
+    assert(matZX);
+    assert(matZY);
+
+    const int rowsZ = matZX->GetSeq1Length();
+    const int colsY = matZY->GetSeq2Length();
+
+    // walk the rows shared by both matrices
+    for (int z = 1; z <= rowsZ; ++z) {
+        SafeVector<PIF>::iterator zx = matZX->GetRowPtr(z);
+        SafeVector<PIF>::iterator zxStop = zx + matZX->GetRowSize(z);
+
+        // every stored z[k]-x[i] cell ...
+        for (; zx != zxStop; ++zx) {
+            SafeVector<PIF>::iterator zy = matZY->GetRowPtr(z);
+            SafeVector<PIF>::iterator zyStop = zy + matZY->GetRowSize(z);
+            const float probZX = zx->second;
+
+            // dense posterior row that belongs to x[i]
+            VF::iterator row = posterior.begin()
+                    + zx->first * (colsY + 1);
+
+            // ... combined with every stored z[k]-y[j] cell
+            for (; zy != zyStop; ++zy) {
+                row[zy->first] += probZX * zy->second;
+            }
+        }
+    }
+}
+/////////////////////////////////////////////////////////////////
+// DoIterativeRefinement()
+//
+// Performs a single round of randomized partitioning iterative
+// refinement: the sequences are split at random into two groups,
+// each group is projected out of the current alignment, and the
+// two projections are realigned. "alignment" is replaced by the
+// result; if one group ends up empty nothing is changed.
+/////////////////////////////////////////////////////////////////
+
+void MSA::DoIterativeRefinement(
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model, MultiSequence* &alignment) {
+    const int numSeqs = alignment->GetNumSequences();
+    set<int> groupOne, groupTwo;
+
+    // one rand() draw per sequence: odd goes to group one, even to group two
+    for (int s = 0; s < numSeqs; s++) {
+        set<int> &target = (rand() % 2) ? groupOne : groupTwo;
+        target.insert(s);
+    }
+    if (groupOne.empty() || groupTwo.empty())
+        return;
+
+    // project the current alignment onto each group
+    MultiSequence *firstPart = alignment->Project(groupOne);
+    assert(firstPart);
+    MultiSequence *secondPart = alignment->Project(groupTwo);
+    assert(secondPart);
+
+    // (A disabled experimental variant that built the posterior with
+    // sequence weights and scored the alignment before refinement used to
+    // be kept here as commented-out code.)
+
+    // the old alignment is replaced by the realignment of the two parts
+    delete alignment;
+    alignment = AlignAlignments(firstPart, secondPart, sparseMatrices, model);
+
+    delete firstPart;
+    delete secondPart;
+}
+
+/////////////////////////////////////////////////////////////////
+// DoIterativeRefinementTreeNode()
+//
+// Refinement step driven by the guide tree: the sequences listed
+// as left or right leaves of alignment-order entry "nodeIndex"
+// form one group, all remaining sequences the other. Both groups
+// are projected out of the current alignment and realigned, and
+// "alignment" is replaced by the result. Nothing is changed if
+// either group is empty.
+/////////////////////////////////////////////////////////////////
+void MSA::DoIterativeRefinementTreeNode(
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model, MultiSequence* &alignment,
+        int nodeIndex) {
+    const int numSeqs = alignment->GetNumSequences();
+
+    // mark every sequence listed under the selected entry
+    vector<bool> underNode(numSeqs, false);
+    AlignmentOrder *allOrders = this->tree->getAlignOrders();
+    AlignmentOrder *entry = &allOrders[nodeIndex];
+    for (int n = 0; n < entry->leftNum; n++) {
+        underNode[entry->leftLeafs[n]] = true;
+    }
+    for (int n = 0; n < entry->rightNum; n++) {
+        underNode[entry->rightLeafs[n]] = true;
+    }
+
+    // marked sequences form group one, the rest group two
+    set<int> groupOne, groupTwo;
+    for (int s = 0; s < numSeqs; s++) {
+        set<int> &target = underNode[s] ? groupOne : groupTwo;
+        target.insert(s);
+    }
+    if (groupOne.empty() || groupTwo.empty()) {
+        return;
+    }
+
+    // project the current alignment onto each group
+    MultiSequence *firstPart = alignment->Project(groupOne);
+    assert(firstPart);
+    MultiSequence *secondPart = alignment->Project(groupTwo);
+    assert(secondPart);
+    delete alignment;
+
+    // realign the two projections
+    alignment = AlignAlignments(firstPart, secondPart, sparseMatrices,
+            model);
+
+    delete firstPart;
+    delete secondPart;
+}
+
+/////////////////////////////////////////////////////////////////
+// WriteAnnotation()
+//
+// Writes one annotation score per alignment column to the file
+// named by annotationFilename, one value per line. Exits with
+// status 1 if the file cannot be opened for writing.
+/////////////////////////////////////////////////////////////////
+
+void MSA::WriteAnnotation(MultiSequence *alignment,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
+    ofstream outfile(annotationFilename.c_str());
+    if (outfile.fail()) {
+        cerr << "ERROR: Unable to write annotation file." << endl;
+        exit(1);
+    }
+
+    const int numSeqs = alignment->GetNumSequences();
+    const int alignLength = alignment->GetSequence(0)->GetLength();
+
+    // per sequence: data pointer, sort label and count of residues seen
+    SafeVector<SafeVector<char>::iterator> columns(numSeqs);
+    SafeVector<int> labels(numSeqs, 0);
+    SafeVector<int> residueCount(numSeqs, 0);
+    for (int s = 0; s < numSeqs; s++) {
+        columns[s] = alignment->GetSequence(s)->GetDataPtr();
+        labels[s] = alignment->GetSequence(s)->GetSortLabel();
+    }
+
+    SafeVector<pair<int, int> > active;
+    active.reserve(numSeqs);
+
+    // columns are addressed 1..alignLength through the data pointers
+    for (int col = 1; col <= alignLength; col++) {
+
+        // collect (label, residue position) for every non-gap cell
+        active.clear();
+        for (int s = 0; s < numSeqs; s++) {
+            if (columns[s][col] == '-') {
+                continue;
+            }
+            ++residueCount[s];
+            active.push_back(make_pair(labels[s], residueCount[s]));
+        }
+
+        // ComputeScore() is given the residues ordered by label
+        sort(active.begin(), active.end());
+        outfile << setw(4) << ComputeScore(active, sparseMatrices) << endl;
+    }
+
+    outfile.close();
+}
+
+/////////////////////////////////////////////////////////////////
+// ComputeScore()
+//
+// Computes the annotation score for one column. "active" holds a
+// (sequence label, residue position) entry per aligned residue.
+// The score is 200 * (sum of the pairwise matrix values) divided
+// by n * (n - 1), truncated to int, where n is the number of
+// entries; columns with fewer than two entries score 0.
+/////////////////////////////////////////////////////////////////
+
+int MSA::ComputeScore(const SafeVector<pair<int, int> > &active,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
+
+    const int count = (int) active.size();
+    if (count <= 1) {
+        return 0;
+    }
+
+    // sum the stored value of every unordered pair of residues
+    float total = 0;
+    for (int a = 0; a < count; a++) {
+        for (int b = a + 1; b < count; b++) {
+            SparseMatrix *pairMatrix =
+                    sparseMatrices[active[a].first][active[b].first];
+            total += pairMatrix->GetValue(active[a].second, active[b].second);
+        }
+    }
+
+    return (int) (200 * total / (count * (count - 1)));
+}
--- /dev/null
+/***********************************************
+ * # Copyright 2009-2010. Liu Yongchao
+ * # Contact: Liu Yongchao, School of Computer Engineering,
+ * # Nanyang Technological University.
+ * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
+ * #
+ * # GPL version 3.0 applies.
+ * #
+ * ************************************************/
+
+#include <string>
+#include <sstream>
+#include <iomanip>
+#include <iostream>
+#include <list>
+#include <set>
+#include <algorithm>
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <cerrno>
+#include <iomanip>
+#include "MSA.h"
+#include "MSAClusterTree.h"
+#include "Defaults.h"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+string parametersInputFilename = ""; // parameter file name; empty by default
+string parametersOutputFilename = "no training";
+string annotationFilename = ""; // annotation output file name; empty by default
+// --- run-time switches and iteration counts; these are the defaults (presumably overridden by command-line parsing - confirm in this file) ---
+bool enableVerbose = false;
+bool enableAnnotation = false;
+bool enableClustalWOutput = false; // the MSA constructor writes with WriteALN() when true, WriteMFA() otherwise
+bool enableAlignOrder = false;
+int numConsistencyReps = 2;
+int numPreTrainingReps = 0;
+int numIterativeRefinementReps = 100;
+
+float cutoff = 0;
+// --- model parameters handed to ProbabilisticModel and doAlign() by the MSA constructor ---
+VF initDistrib(NumMatrixTypes);
+VF gapOpen(2 * NumInsertStates);
+VF gapExtend(2 * NumInsertStates);
+VVF emitPairs(256, VF(256, 1e-10)); // 256 x 256 table, every cell starts at 1e-10; PrintParameters() indexes it by (unsigned char) alphabet letters
+VF emitSingle(256, 1e-5); // 256 entries, every one starts at 1e-5; indexed the same way
+
+string alphabet = alphabetDefault; // starts as the default alphabet from Defaults.h
+// --- bounds for the repetition counts above ---
+const int MIN_PRETRAINING_REPS = 0;
+const int MAX_PRETRAINING_REPS = 20;
+const int MIN_CONSISTENCY_REPS = 0;
+const int MAX_CONSISTENCY_REPS = 5;
+const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
+const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
+
+string posteriorProbsFilename = "";
+bool allscores = true;
+string infilename;
+
+int flag_gui = 0; //0: no gui related o/p
+//1: gui related o/p generated
+int flag_ppscore = 0; //0: no pp score sequence added to o/p fasta alignment
+//1: pp score seq added to o/p fasta alignment
+
+///////////////////////////////
+// global scoring matrix variables
+//////////////////////////////
+float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2;
+char *aminos, *bases, matrixtype[20] = "gonnet_160";
+int subst_index[26];
+
+double sub_matrix[26][26];
+int firstread = 0; //this makes sure that matrices are read only once
+
+float TEMPERATURE = 5;
+int MATRIXTYPE = 160;
+int prot_nuc = 0; //0=prot, 1=nucleotide
+
+float GAPOPEN = 0;
+float GAPEXT = 0;
+int numThreads = 0; // values <= 0 make the MSA constructor use omp_get_num_procs() in OpenMP builds
+
+//argument support: settings record for the partition-function code (presumably filled in by init_arguments() - confirm)
+typedef struct {
+ char input[30];
+ int matrix;
+ int N;
+ float T;
+ float beta;
+ char opt; //can be 'P' or 'M'
+ float gapopen;
+ float gapext;
+} argument_decl;
+
+argument_decl argument; // the single global instance
+// functions defined outside this chunk; NOTE(review): several are declared "extern inline" - confirm a definition is available wherever they are called
+extern inline void read_sustitution_matrix(char *fileName);
+extern void setmatrixtype(int le);
+extern inline int matrixtype_to_int();
+extern inline void read_dna_matrix();
+extern inline void read_vtml_la_matrix();
+extern void init_arguments(); // called by the MSA constructor before ReadParameters()
+
+/////////////////////////////////////////////////////////////////
+// MSA()
+//
+// Runs the whole program: parses the command line, reads the
+// parameters, loads every input file, aligns the sequences and
+// writes the alignment to the selected output stream.
+/////////////////////////////////////////////////////////////////
+MSA::MSA(int argc, char* argv[]) {
+    // the command line yields the list of input files
+    SafeVector<string> inputFiles = ParseParams(argc, argv);
+
+    // initialize arguments for partition function, then the parameters
+    init_arguments();
+    ReadParameters();
+    //PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
+
+    // load every input file into one sequence set
+    MultiSequence *sequences = new MultiSequence();
+    assert(sequences);
+    const int numFiles = (int) inputFiles.size();
+    for (int f = 0; f < numFiles; f++) {
+        cerr << "Loading sequence file: " << inputFiles[f] << endl;
+        sequences->LoadMFA(inputFiles[f], true);
+    }
+
+    // one weight slot per loaded sequence
+    this->seqsWeights = new int[sequences->GetNumSequences()];
+
+#ifdef _OPENMP
+    // no usable thread count given: fall back to the number of processors
+    if (numThreads <= 0) {
+        numThreads = omp_get_num_procs();
+        cerr << "Automatically detected " << numThreads << " CPU cores" << endl;
+    }
+    cerr <<"Enabling OpenMP (with "<<numThreads<<" threads)"<<endl;
+
+    omp_set_num_threads(numThreads);
+#endif
+
+    // perform the alignment
+    MultiSequence *alignment = doAlign(sequences,
+            ProbabilisticModel(initDistrib, gapOpen, gapExtend, emitPairs,
+                    emitSingle), initDistrib, gapOpen, gapExtend, emitPairs,
+            emitSingle);
+
+    // write it to the output stream in the requested format
+    if (enableClustalWOutput) {
+        alignment->WriteALN(*alignOutFile);
+    } else {
+        alignment->WriteMFA(*alignOutFile);
+    }
+
+    // release resources
+    delete[] this->seqsWeights;
+    delete alignment;
+    delete sequences;
+}
+/////////////////////////////////////////////////////////////////
+// ~MSA()
+//
+// Closes the alignment output file if an output file name is set;
+// does nothing otherwise.
+// NOTE(review): the stream object is only closed here, never
+// deleted - confirm who owns it.
+/////////////////////////////////////////////////////////////////
+MSA::~MSA() {
+    if (alignOutFileName.length() == 0) {
+        return;
+    }
+    /*close the output file*/
+    std::ofstream *outFile = (std::ofstream*) alignOutFile;
+    outFile->close();
+}
+/////////////////////////////////////////////////////////////////
+// PrintParameters()
+//
+// Prints "message" and the transition parameters to STDERR. If
+// "filename" is not NULL, the full parameter set (transitions,
+// alphabet, lower-triangular pair emissions, single emissions) is
+// also written to that file; failure to open it ends the program
+// with exit status 1.
+/////////////////////////////////////////////////////////////////
+
+void MSA::PrintParameters(const char *message, const VF &initDistrib,
+        const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs,
+        const VF &emitSingle, const char *filename) {
+
+    // transition parameters to the screen
+    cerr << message << endl << " initDistrib[] = { ";
+    for (int t = 0; t < NumMatrixTypes; t++) {
+        cerr << setprecision(10) << initDistrib[t] << " ";
+    }
+    cerr << "}" << endl << " gapOpen[] = { ";
+    for (int g = 0; g < NumInsertStates * 2; g++) {
+        cerr << setprecision(10) << gapOpen[g] << " ";
+    }
+    cerr << "}" << endl << " gapExtend[] = { ";
+    for (int g = 0; g < NumInsertStates * 2; g++) {
+        cerr << setprecision(10) << gapExtend[g] << " ";
+    }
+    cerr << "}" << endl << endl;
+
+    // nothing more to do without a file name
+    if (!filename) {
+        return;
+    }
+
+    FILE *out = fopen(filename, "w");
+    if (!out) {
+        cerr << "ERROR: Unable to write parameter file: " << filename
+                << endl;
+        exit(1);
+    }
+
+    // three lines of transition parameters
+    for (int t = 0; t < NumMatrixTypes; t++) {
+        fprintf(out, "%.10f ", initDistrib[t]);
+    }
+    fprintf(out, "\n");
+    for (int g = 0; g < 2 * NumInsertStates; g++) {
+        fprintf(out, "%.10f ", gapOpen[g]);
+    }
+    fprintf(out, "\n");
+    for (int g = 0; g < 2 * NumInsertStates; g++) {
+        fprintf(out, "%.10f ", gapExtend[g]);
+    }
+    fprintf(out, "\n");
+
+    // the alphabet, then one row of pair emissions per letter
+    // (columns up to and including the diagonal)
+    fprintf(out, "%s\n", alphabet.c_str());
+    for (int r = 0; r < (int) alphabet.size(); r++) {
+        const unsigned char rowLetter = (unsigned char) alphabet[r];
+        for (int c = 0; c <= r; c++) {
+            const unsigned char colLetter = (unsigned char) alphabet[c];
+            fprintf(out, "%.10f ", emitPairs[rowLetter][colLetter]);
+        }
+        fprintf(out, "\n");
+    }
+
+    // single emissions on the last line
+    for (int r = 0; r < (int) alphabet.size(); r++) {
+        fprintf(out, "%.10f ", emitSingle[(unsigned char) alphabet[r]]);
+    }
+    fprintf(out, "\n");
+    fclose(out);
+}
+
+/////////////////////////////////////////////////////////////////
+// doAlign()
+//
+// For every pair of input sequences this computes three
+// posterior matrices (labelled probcons, probalign and local in
+// the body), merges them cell by cell into a combined "GL"
+// posterior and derives a pairwise distance from it. If the
+// average identity of the GL pairwise alignments exceeds 0.4,
+// the probalign distances and posteriors are used instead.
+// The selected data then drive guide-tree construction, the
+// consistency transformation and the final alignment, which is
+// returned as a newly allocated MultiSequence.
+//
+// The initDistrib, gapOpen, gapExtend, emitPairs and emitSingle
+// arguments are not read by this function; only "model" is used.
+/////////////////////////////////////////////////////////////////
+extern VF *ComputePostProbs(int a, int b, string seq1, string seq2);
+MultiSequence* MSA::doAlign(MultiSequence *sequences,
+        const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen,
+        VF &gapExtend, VVF &emitPairs, VF &emitSingle) {
+    assert(sequences);
+
+    //get the number of sequences
+    const int numSeqs = sequences->GetNumSequences();
+
+    // Number of unordered sequence pairs. Kept in a local so that averaging
+    // gl_accuracy below does not rely on numPairs, which is only assigned
+    // when the code is built with OpenMP.
+    const int totalPairs = (numSeqs - 1) * numSeqs / 2;
+
+    //create distance matrices (probalign-only and merged GL)
+    VVF probalign_distances(numSeqs, VF(numSeqs, 0));
+    VVF distances(numSeqs, VF(numSeqs, 0));
+
+    // sum over all pairs of the identity of the GL pairwise alignment
+    float gl_accuracy = 0;
+
+    //create sparse matrices (probalign-only and merged GL)
+    SafeVector<SafeVector<SparseMatrix *> > probalign_sparseMatrices(numSeqs,
+            SafeVector<SparseMatrix *>(numSeqs, NULL));
+    SafeVector<SafeVector<SparseMatrix *> > sparseMatrices(numSeqs,
+            SafeVector<SparseMatrix *>(numSeqs, NULL));
+
+#ifdef _OPENMP
+    //calculate sequence pairs for openmp model
+    int pairIdx = 0;
+    numPairs = totalPairs;
+    seqsPairs = new SeqsPair[numPairs];
+    for (int a = 0; a < numSeqs; a++) {
+        for (int b = a + 1; b < numSeqs; b++) {
+            seqsPairs[pairIdx].seq1 = a;
+            seqsPairs[pairIdx].seq2 = b;
+            pairIdx++;
+        }
+    }
+#endif
+    // do all pairwise alignments for posterior probability matrices
+#ifdef _OPENMP
+    // every iteration adds to gl_accuracy, so it has to be a reduction
+    // variable; as a plain shared variable the updates would race
+#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic) reduction(+:gl_accuracy)
+    for (pairIdx = 0; pairIdx < numPairs; pairIdx++) {
+        int a = seqsPairs[pairIdx].seq1;
+        int b = seqsPairs[pairIdx].seq2;
+        if (enableVerbose) {
+#pragma omp critical
+            cerr <<"tid "<<omp_get_thread_num()<<" a "<<a<<" b "<<b<<endl;
+        }
+#else
+    for (int a = 0; a < numSeqs - 1; a++) {
+        for (int b = a + 1; b < numSeqs; b++) {
+#endif
+            Sequence *seq1 = sequences->GetSequence(a);
+            Sequence *seq2 = sequences->GetSequence(b);
+
+            // verbose output
+            if (enableVerbose) {
+                cerr << "Computing posterior matrix: (" << a + 1 << ") "
+                        << seq1->GetHeader() << " vs. " << "(" << b + 1 << ") "
+                        << seq2->GetHeader() << " -- ";
+            }
+
+            //probcons
+            // compute forward and backward probabilities
+            VF *forward = model.ComputeForwardMatrix(seq1, seq2);
+            assert(forward);
+            VF *backward = model.ComputeBackwardMatrix(seq1, seq2);
+            assert(backward);
+            // compute posterior probability matrix from HMM
+            VF *probcons_posterior = model.ComputePosteriorMatrix(seq1, seq2,
+                    *forward, *backward);
+            assert(probcons_posterior);
+            delete forward;
+            delete backward;
+
+            //probalign
+            VF *probalign_posterior = ::ComputePostProbs(a, b,
+                    seq1->GetString(), seq2->GetString());
+            assert(probalign_posterior);
+            probalign_sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),
+                    seq2->GetLength(), *probalign_posterior);
+            probalign_sparseMatrices[b][a] = NULL;
+            pair<SafeVector<char> *, float> probalign_alignment =
+                    model.ComputeAlignment(seq1->GetLength(),
+                            seq2->GetLength(), *probalign_posterior);
+            probalign_distances[a][b] = 1.0f - probalign_alignment.second
+                    / min(seq1->GetLength(), seq2->GetLength());
+            delete probalign_alignment.first;
+
+            //local
+            forward = model.ComputeForwardMatrix(seq1, seq2, false);
+            assert(forward);
+            backward = model.ComputeBackwardMatrix(seq1, seq2, false);
+            assert(backward);
+            VF *local_posterior = model.ComputePosteriorMatrix(seq1, seq2,
+                    *forward, *backward, false);
+            assert(local_posterior);
+            delete forward;
+            delete backward;
+
+            //GL
+            // merge probalign + local + probcons: each cell of the combined
+            // posterior is the Euclidean norm of the three source values
+            VF::iterator ptr1 = probcons_posterior->begin();
+            VF::iterator ptr2 = probalign_posterior->begin();
+            VF::iterator ptr3 = local_posterior->begin();
+            VF *posterior = new VF(
+                    (seq1->GetLength() + 1) * (seq2->GetLength() + 1));
+            assert(posterior);
+            VF::iterator ptr = posterior->begin();
+            for (int i = 0; i <= seq1->GetLength(); i++) {
+                for (int j = 0; j <= seq2->GetLength(); j++) {
+                    float v1 = *ptr1;
+                    float v2 = *ptr2;
+                    float v3 = *ptr3;
+                    *ptr = sqrt(v1 * v1 + v2 * v2 + v3 * v3);
+                    ptr1++;
+                    ptr2++;
+                    ptr3++;
+                    ptr++;
+                }
+            }
+            // perform the pairwise sequence alignment
+            pair<SafeVector<char> *, float> gl_alignment =
+                    model.ComputeAlignment(seq1->GetLength(),
+                            seq2->GetLength(), *posterior);
+
+            //compute expected accuracy
+            distances[a][b] = distances[b][a] = 1.0f - gl_alignment.second
+                    / (3 * min(seq1->GetLength(), seq2->GetLength()));
+
+            // compute sparse representations
+            sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),
+                    seq2->GetLength(), *posterior);
+            sparseMatrices[b][a] = NULL;
+
+            // count the aligned columns ('B') whose two residues are equal;
+            // 'X' and 'Y' columns advance only one of the two sequences
+            SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
+            SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
+            float N_correct_match = 0;
+            int i = 1;
+            int j = 1;
+            for (SafeVector<char>::iterator iter = gl_alignment.first->begin();
+                    iter != gl_alignment.first->end(); ++iter) {
+                if (*iter == 'B') {
+                    unsigned char c1 = (unsigned char) iter1[i++];
+                    unsigned char c2 = (unsigned char) iter2[j++];
+                    if (c1 == c2)
+                        N_correct_match += 1;
+                } else if (*iter == 'X')
+                    i++;
+                else if (*iter == 'Y')
+                    j++;
+            }
+            // sanity check: the alignment path must consume both sequences
+            if (i != seq1->GetLength() + 1 || j != seq2->GetLength() + 1)
+                cerr << "GL" << endl;
+            gl_accuracy += N_correct_match
+                    / min(seq1->GetLength(), seq2->GetLength());
+
+            // the alignment path is no longer needed (it used to be leaked)
+            delete gl_alignment.first;
+
+            delete probcons_posterior;
+            delete probalign_posterior;
+            delete local_posterior;
+            delete posterior;
+
+#ifndef _OPENMP
+        }
+#endif
+    }
+
+    //self-adaptive: average the identity and pick the posterior set to use
+    if (totalPairs > 0)
+        gl_accuracy /= totalPairs;
+    const bool useProbalign = gl_accuracy > 0.4;
+    for (int a = 0; a < numSeqs - 1; a++) {
+        for (int b = a + 1; b < numSeqs; b++) {
+            if (useProbalign) {
+                distances[a][b] = distances[b][a] = probalign_distances[a][b];
+                // free the GL matrix that is being replaced
+                delete sparseMatrices[a][b];
+                sparseMatrices[a][b] = probalign_sparseMatrices[a][b];
+                sparseMatrices[b][a] = NULL;
+            } else {
+                // the probalign matrix is not used any further
+                delete probalign_sparseMatrices[a][b];
+            }
+            probalign_sparseMatrices[a][b] = NULL;
+        }
+    }
+
+    //create the guide tree
+    this->tree = new MSAClusterTree(this, distances, numSeqs);
+    this->tree->create();
+
+    // perform the consistency transformation the desired number of times
+    float* fweights = new float[numSeqs];
+    for (int r = 0; r < numSeqs; r++) {
+        fweights[r] = ((float) seqsWeights[r]) / INT_MULTIPLY;
+        fweights[r] *= 10;
+    }
+    for (int r = 0; r < numConsistencyReps; r++) {
+        SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices =
+                DoRelaxation(fweights, sequences, sparseMatrices);
+
+        // now replace the old posterior matrices
+        for (int i = 0; i < numSeqs; i++) {
+            for (int j = 0; j < numSeqs; j++) {
+                delete sparseMatrices[i][j];
+                sparseMatrices[i][j] = newSparseMatrices[i][j];
+            }
+        }
+    }
+    delete[] fweights;
+#ifdef _OPENMP
+    delete [] seqsPairs;
+#endif
+
+    //compute the final multiple sequence alignment
+    MultiSequence *finalAlignment = ComputeFinalAlignment(this->tree, sequences,
+            sparseMatrices, model);
+
+    // build annotation
+    if (enableAnnotation) {
+        WriteAnnotation(finalAlignment, sparseMatrices);
+    }
+    //destroy the guide tree
+    delete this->tree;
+    this->tree = 0;
+
+    // delete sparse matrices
+    for (int a = 0; a < numSeqs - 1; a++) {
+        for (int b = a + 1; b < numSeqs; b++) {
+            delete sparseMatrices[a][b];
+            delete sparseMatrices[b][a];
+        }
+    }
+
+    return finalAlignment;
+}
+
+/////////////////////////////////////////////////////////////////
+// GetInteger()
+//
+// Converts the leading part of "data" to an int with strtol
+// (base 0, so decimal, octal and hexadecimal prefixes are
+// honoured). Returns false when nothing could be converted, when
+// strtol reports an error, or when the value does not fit into
+// an int; otherwise stores the value in *val and returns true.
+/////////////////////////////////////////////////////////////////
+
+bool GetInteger(char *data, int *val) {
+    assert(val);
+
+    char *stop = 0;
+    errno = 0;
+    const long int parsed = strtol(data, &stop, 0);
+    const bool failed = (errno != 0);
+
+    // zero result that came from an error or from consuming no characters
+    const bool nothingParsed = (parsed == 0) && (failed || stop == data);
+    // value clamped by strtol because it was out of range for long
+    const bool clamped = failed && (parsed == LONG_MAX || parsed == LONG_MIN);
+    // value representable as long but not as int
+    const bool fitsInt = !(parsed < (long) INT_MIN || parsed > (long) INT_MAX);
+
+    if (nothingParsed || clamped || !fitsInt)
+        return false;
+
+    *val = (int) parsed;
+    return true;
+}
+
+/////////////////////////////////////////////////////////////////
+// GetFloat()
+//
+// Converts the leading part of "data" to a float via strtod.
+// Returns false when nothing could be converted, or when strtod
+// reports an error together with a zero result or a magnitude of
+// at least 1000000; otherwise stores the value (narrowed to
+// float) in *val and returns true.
+/////////////////////////////////////////////////////////////////
+
+bool GetFloat(char *data, float *val) {
+    assert(val);
+
+    char *stop = 0;
+    errno = 0;
+    const double parsed = strtod(data, &stop);
+    const bool failed = (errno != 0);
+
+    // zero result that came from an error or from consuming no characters
+    if (parsed == 0 && (failed || stop == data))
+        return false;
+
+    // error reported together with a very large magnitude
+    const bool hugeMagnitude = (parsed >= 1000000.0 || parsed <= -1000000.0);
+    if (failed && hugeMagnitude)
+        return false;
+
+    *val = (float) parsed;
+    return true;
+}
+
+// Stores "value" as the pair-emission entry of residues "a" and "b" for
+// every upper/lower-case spelling of the two letters and in both index
+// orders. The letters are converted to unsigned char before being handed
+// to tolower/toupper, which are undefined for negative char values.
+static void SetEmitPairAllCases(VVF &table, char a, char b, float value) {
+    const unsigned char ua = (unsigned char) a;
+    const unsigned char ub = (unsigned char) b;
+    const unsigned char formsA[2] = { (unsigned char) tolower(ua),
+            (unsigned char) toupper(ua) };
+    const unsigned char formsB[2] = { (unsigned char) tolower(ub),
+            (unsigned char) toupper(ub) };
+    for (int x = 0; x < 2; x++) {
+        for (int y = 0; y < 2; y++) {
+            table[formsA[x]][formsB[y]] = value;
+            table[formsB[y]][formsA[x]] = value;
+        }
+    }
+}
+
+// Stores "value" as the single-emission entry of "residue" for both its
+// lower-case and its upper-case spelling.
+static void SetEmitSingleBothCases(VF &table, char residue, float value) {
+    const unsigned char u = (unsigned char) residue;
+    table[(unsigned char) tolower(u)] = value;
+    table[(unsigned char) toupper(u)] = value;
+}
+
+// Parses the first "count" whitespace-separated numbers of "text" into
+// dest[0..count-1]. Returns false as soon as one of them cannot be read.
+static bool ParseParameterLine(const string &text, VF &dest, int count) {
+    istringstream in(text);
+    for (int i = 0; i < count; i++) {
+        if (!(in >> dest[i]))
+            return false;
+    }
+    return true;
+}
+
+/////////////////////////////////////////////////////////////////
+// ReadParameters()
+//
+// Fills initDistrib, gapOpen, gapExtend, alphabet, emitPairs and
+// emitSingle. The emission tables are first reset to small
+// background values (1e-10 and 1e-5). When no parameter file
+// name is set, the built-in defaults for one or two pairs of
+// insert states are used; otherwise the values are read from the
+// file (three transition lines, the alphabet line, the lower
+// triangle of the pair table and the single-emission row).
+// Emission entries are stored for both letter cases. Any missing
+// or unreadable value is reported and ends the program with exit
+// status 1.
+/////////////////////////////////////////////////////////////////
+
+void MSA::ReadParameters() {
+
+    emitPairs = VVF(256, VF(256, 1e-10));
+    emitSingle = VF(256, 1e-5);
+
+    const int numTransitions = 2 * NumInsertStates;
+
+    // read initial state distribution and transition parameters
+    if (parametersInputFilename == string("")) {
+        if (NumInsertStates == 1) {
+            for (int i = 0; i < NumMatrixTypes; i++)
+                initDistrib[i] = initDistrib1Default[i];
+            for (int i = 0; i < numTransitions; i++)
+                gapOpen[i] = gapOpen1Default[i];
+            for (int i = 0; i < numTransitions; i++)
+                gapExtend[i] = gapExtend1Default[i];
+        } else if (NumInsertStates == 2) {
+            for (int i = 0; i < NumMatrixTypes; i++)
+                initDistrib[i] = initDistrib2Default[i];
+            for (int i = 0; i < numTransitions; i++)
+                gapOpen[i] = gapOpen2Default[i];
+            for (int i = 0; i < numTransitions; i++)
+                gapExtend[i] = gapExtend2Default[i];
+        } else {
+            cerr
+                    << "ERROR: No default initial distribution/parameter settings exist"
+                    << endl << " for " << NumInsertStates
+                    << " pairs of insert states. Use --paramfile." << endl;
+            exit(1);
+        }
+
+        alphabet = alphabetDefault;
+
+        // default emission tables: only the lower triangle is stored in
+        // emitPairsDefault, the helper mirrors it
+        for (int i = 0; i < (int) alphabet.length(); i++) {
+            SetEmitSingleBothCases(emitSingle, alphabet[i],
+                    emitSingleDefault[i]);
+            for (int j = 0; j <= i; j++)
+                SetEmitPairAllCases(emitPairs, alphabet[i], alphabet[j],
+                        emitPairsDefault[i][j]);
+        }
+        return;
+    }
+
+    ifstream data;
+    data.open(parametersInputFilename.c_str());
+    if (data.fail()) {
+        cerr << "ERROR: Unable to read parameter file: "
+                << parametersInputFilename << endl;
+        exit(1);
+    }
+
+    // the first three lines hold initDistrib, gapOpen and gapExtend
+    string line[3];
+    for (int i = 0; i < 3; i++) {
+        if (!getline(data, line[i])) {
+            cerr
+                    << "ERROR: Unable to read transition parameters from parameter file: "
+                    << parametersInputFilename << endl;
+            exit(1);
+        }
+    }
+    // a line with too few (or non-numeric) values used to be accepted
+    // silently, leaving some parameters unset
+    if (!ParseParameterLine(line[0], initDistrib, NumMatrixTypes)
+            || !ParseParameterLine(line[1], gapOpen, numTransitions)
+            || !ParseParameterLine(line[2], gapExtend, numTransitions)) {
+        cerr
+                << "ERROR: Unable to read transition parameters from parameter file: "
+                << parametersInputFilename << endl;
+        exit(1);
+    }
+
+    if (!getline(data, line[0])) {
+        cerr << "ERROR: Unable to read alphabet from scoring matrix file: "
+                << parametersInputFilename << endl;
+        exit(1);
+    }
+
+    // read alphabet as concatenation of all characters on alphabet line
+    alphabet = "";
+    string token;
+    istringstream alphabetLine(line[0]);
+    while (alphabetLine >> token)
+        alphabet += token;
+
+    // lower triangle of the pair-emission table
+    for (int i = 0; i < (int) alphabet.size(); i++) {
+        for (int j = 0; j <= i; j++) {
+            float val;
+            if (!(data >> val)) {
+                cerr
+                        << "ERROR: Unable to read emission parameters from parameter file: "
+                        << parametersInputFilename << endl;
+                exit(1);
+            }
+            SetEmitPairAllCases(emitPairs, alphabet[i], alphabet[j], val);
+        }
+    }
+
+    // single-emission table
+    for (int i = 0; i < (int) alphabet.size(); i++) {
+        float val;
+        if (!(data >> val)) {
+            cerr
+                    << "ERROR: Unable to read emission parameters from parameter file: "
+                    << parametersInputFilename << endl;
+            exit(1);
+        }
+        SetEmitSingleBothCases(emitSingle, alphabet[i], val);
+    }
+    data.close();
+}
+
+/////////////////////////////////////////////////////////////////
+// printUsage()
+//
+// Writes the program banner and the command-line help text to
+// STDERR. The defaults shown are taken from the current values
+// of the corresponding settings.
+/////////////////////////////////////////////////////////////////
+void MSA::printUsage() {
+    // banner
+    cerr
+            << "************************************************************************"
+            << endl;
+    cerr
+            << "\tMSAPROBS is a open-source protein multiple sequence alignment algorithm"
+            << endl;
+    cerr
+            << "\tbased on pair hidden markov model and partition function postirior"
+            << endl;
+    cerr << "\tprobabilities. If any comments or problems, please contact"
+            << endl;
+    cerr << "\tLiu Yongchao(liuy0039@ntu.edu.sg or nkcslyc@hotmail.com)"
+            << endl;
+    cerr
+            << "*************************************************************************"
+            << endl;
+
+    // synopsis
+    cerr << "Usage:" << endl;
+    cerr << " msaprobs [OPTION]... [infile]..." << endl << endl;
+    cerr << "Description:" << endl;
+    cerr << " Align sequences in multi-FASTA format" << endl << endl;
+
+    // output and threading options
+    cerr << " -o, --outfile <string>" << endl;
+    cerr << " specify the output file name (STDOUT by default)" << endl;
+    cerr << " -num_threads <integer>" << endl;
+    cerr
+            << " specify the number of threads used, and otherwise detect automatically"
+            << endl;
+    cerr << " -clustalw" << endl;
+    cerr << " use CLUSTALW output format instead of FASTA format" << endl
+            << endl;
+
+    // algorithm options, with their allowed ranges and current defaults
+    cerr << " -c, --consistency REPS" << endl;
+    cerr << " use " << MIN_CONSISTENCY_REPS << " <= REPS <= "
+            << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps
+            << ") passes of consistency transformation" << endl << endl;
+    cerr << " -ir, --iterative-refinement REPS" << endl;
+    cerr << " use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= "
+            << MAX_ITERATIVE_REFINEMENT_REPS << " (default: "
+            << numIterativeRefinementReps << ") passes of iterative-refinement"
+            << endl << endl;
+
+    // reporting options
+    const char *verboseState = enableVerbose ? "on" : "off";
+    cerr << " -v, --verbose" << endl;
+    cerr << " report progress while aligning (default: " << verboseState
+            << ")" << endl << endl;
+    cerr << " -annot FILENAME" << endl;
+    cerr << " write annotation for multiple alignment to FILENAME" << endl
+            << endl;
+
+    const char *orderState = enableAlignOrder ? "on" : "off";
+    cerr << " -a, --alignment-order" << endl;
+    cerr
+            << " print sequences in alignment order rather than input order (default: "
+            << orderState << ")" << endl;
+    cerr << " -version " << endl;
+    cerr << " print out version of MSAPROBS " << endl << endl;
+}
+// Returns the command-line word following option argv[i] and advances i to
+// it. When the option is the last word, prints
+// "ERROR: <expected> expected for option <option>" and exits with status 1.
+static char *TakeOptionValue(int argc, char **argv, int &i,
+        const char *expected) {
+    if (i >= argc - 1) {
+        cerr << "ERROR: " << expected << " expected for option " << argv[i]
+                << endl;
+        exit(1);
+    }
+    return argv[++i];
+}
+
+// Reads the integer value of option argv[i] (advancing i). A value that
+// GetInteger() rejects is reported with "invalidPrefix" followed by the
+// option name and the offending text, and ends the program with status 1.
+static int TakeIntegerValue(int argc, char **argv, int &i,
+        const char *invalidPrefix) {
+    char *text = TakeOptionValue(argc, argv, i, "Integer");
+    int value = 0;
+    if (!GetInteger(text, &value)) {
+        cerr << invalidPrefix << argv[i - 1] << ": " << argv[i] << endl;
+        exit(1);
+    }
+    return value;
+}
+
+// Like TakeIntegerValue(), but additionally requires
+// lowest <= value <= highest; a value outside that range is fatal.
+static int TakeBoundedInteger(int argc, char **argv, int &i, int lowest,
+        int highest) {
+    const int value = TakeIntegerValue(argc, argv, i,
+            "ERROR: Invalid integer following option ");
+    if (value < lowest || value > highest) {
+        cerr << "ERROR: For option " << argv[i - 1]
+                << ", integer must be between " << lowest << " and "
+                << highest << "." << endl;
+        exit(1);
+    }
+    return value;
+}
+
+/////////////////////////////////////////////////////////////////
+// ParseParams()
+//
+// Parses all command-line options into the corresponding
+// settings, opens the alignment output stream (STDOUT or the
+// file given with -o/--outfile) and returns the non-option
+// arguments in command-line order. With no arguments, or with
+// -help / -?, the usage text is printed. Usage, -version and
+// every malformed option end the program with exit status 1.
+/////////////////////////////////////////////////////////////////
+SafeVector<string> MSA::ParseParams(int argc, char **argv) {
+    if (argc < 2) {
+        printUsage();
+        exit(1);
+    }
+    SafeVector<string> sequenceNames;
+
+    for (int i = 1; i < argc; i++) {
+        const char *opt = argv[i];
+
+        // anything not starting with '-' is an input file name
+        if (opt[0] != '-') {
+            sequenceNames.push_back(string(opt));
+            continue;
+        }
+
+        //help
+        if (!strcmp(opt, "-help") || !strcmp(opt, "-?")) {
+            printUsage();
+            exit(1);
+        }
+        //output file name
+        else if (!strcmp(opt, "-o") || !strcmp(opt, "--outfile")) {
+            alignOutFileName = TakeOptionValue(argc, argv, i, "String");
+        }
+        // parameter file
+        else if (!strcmp(opt, "-p") || !strcmp(opt, "--paramfile")) {
+            parametersInputFilename = string(
+                    TakeOptionValue(argc, argv, i, "Filename"));
+        }
+        //number of threads used ("-p" was also listed here, but it could
+        //never match because the parameter-file branch above consumes it)
+        else if (!strcmp(opt, "-num_threads")) {
+            const int threads = TakeIntegerValue(argc, argv, i,
+                    " ERROR: invalid integer following option ");
+            // negative requests are clamped to 0
+            numThreads = (threads < 0) ? 0 : threads;
+        }
+        // number of consistency transformations
+        else if (!strcmp(opt, "-c") || !strcmp(opt, "--consistency")) {
+            numConsistencyReps = TakeBoundedInteger(argc, argv, i,
+                    MIN_CONSISTENCY_REPS, MAX_CONSISTENCY_REPS);
+        }
+        // number of randomized partitioning iterative refinement passes
+        else if (!strcmp(opt, "-ir")
+                || !strcmp(opt, "--iterative-refinement")) {
+            numIterativeRefinementReps = TakeBoundedInteger(argc, argv, i,
+                    MIN_ITERATIVE_REFINEMENT_REPS,
+                    MAX_ITERATIVE_REFINEMENT_REPS);
+        }
+        // annotation files
+        else if (!strcmp(opt, "-annot")) {
+            enableAnnotation = true;
+            annotationFilename = TakeOptionValue(argc, argv, i, "FILENAME");
+        }
+        // clustalw output format
+        else if (!strcmp(opt, "-clustalw")) {
+            enableClustalWOutput = true;
+        }
+        // cutoff
+        else if (!strcmp(opt, "-co") || !strcmp(opt, "--cutoff")) {
+            char *text = TakeOptionValue(argc, argv, i,
+                    "Floating-point value");
+            float tempFloat = 0;
+            if (!GetFloat(text, &tempFloat)) {
+                cerr
+                        << "ERROR: Invalid floating-point value following option "
+                        << argv[i - 1] << ": " << argv[i] << endl;
+                exit(1);
+            }
+            if (tempFloat < 0 || tempFloat > 1) {
+                cerr << "ERROR: For option " << argv[i - 1]
+                        << ", floating-point value must be between 0 and 1."
+                        << endl;
+                exit(1);
+            }
+            cutoff = tempFloat;
+        }
+        // verbose reporting
+        else if (!strcmp(opt, "-v") || !strcmp(opt, "--verbose")) {
+            enableVerbose = true;
+        }
+        // alignment order
+        else if (!strcmp(opt, "-a") || !strcmp(opt, "--alignment-order")) {
+            enableAlignOrder = true;
+        }
+        //print out version
+        else if (!strcmp(opt, "-version")) {
+            cerr << "MSAPROBS version " << VERSION << endl;
+            exit(1);
+        }
+        // bad arguments
+        else {
+            cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
+            exit(1);
+        }
+    }
+
+    /*check the output file name*/
+    cerr << "-------------------------------------" << endl;
+    if (alignOutFileName.length() == 0) {
+        cerr << "The final alignments will be printed out to STDOUT" << endl;
+        alignOutFile = &std::cout;
+    } else {
+        cerr << "Open the output file " << alignOutFileName << endl;
+        ofstream *outFile = new ofstream(alignOutFileName.c_str(),
+                ios::binary | ios::out | ios::trunc);
+        // an unwritable path used to go unnoticed until the results were lost
+        if (!outFile->is_open()) {
+            cerr << "ERROR: Unable to open output file: " << alignOutFileName
+                    << endl;
+            delete outFile;
+            exit(1);
+        }
+        alignOutFile = outFile;
+    }
+    cerr << "-------------------------------------" << endl;
+    return sequenceNames;
+}
+
+/////////////////////////////////////////////////////////////////
+// ProcessTree()
+//
+// Recursively builds the alignment for the subtree rooted at
+// "tree". A leaf yields a new MultiSequence holding a clone of
+// the sequence with index tree->idx; an inner node aligns the
+// results of its left and right children with AlignAlignments()
+// and frees the two intermediate alignments. The returned object
+// is newly allocated.
+/////////////////////////////////////////////////////////////////
+MultiSequence* MSA::ProcessTree(TreeNode *tree, MultiSequence *sequences,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model) {
+
+    // leaf of the alignment tree: wrap a copy of the single sequence
+    if (tree->leaf != NODE) {
+        MultiSequence *single = new MultiSequence();
+        assert(single);
+        single->AddSequence(sequences->GetSequence(tree->idx)->Clone());
+        return single;
+    }
+
+    // inner node: align the two child alignments (left is built first)
+    MultiSequence *leftPart = ProcessTree(tree->left, sequences,
+            sparseMatrices, model);
+    MultiSequence *rightPart = ProcessTree(tree->right, sequences,
+            sparseMatrices, model);
+
+    assert(leftPart);
+    assert(rightPart);
+
+    MultiSequence *merged = AlignAlignments(leftPart, rightPart,
+            sparseMatrices, model);
+    assert(merged);
+
+    delete leftPart;
+    delete rightPart;
+
+    return merged;
+}
+
+/////////////////////////////////////////////////////////////////
+// ComputeFinalAlignment()
+//
+// Builds the progressive alignment by running ProcessTree() on
+// the root of the guide tree, then applies
+// numIterativeRefinementReps rounds of DoIterativeRefinement().
+// If alignment order was requested, the sort labels present
+// after the progressive step are remembered, the flag is
+// cleared, and the labels are written back after refinement.
+/////////////////////////////////////////////////////////////////
+
+MultiSequence* MSA::ComputeFinalAlignment(MSAGuideTree*tree,
+        MultiSequence *sequences,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model) {
+    MultiSequence *alignment = ProcessTree(tree->getRoot(), sequences,
+            sparseMatrices, model);
+
+    // remember the labels in their current order, then switch the flag off
+    SafeVector<int> savedLabels;
+    if (enableAlignOrder) {
+        for (int s = 0; s < alignment->GetNumSequences(); ++s) {
+            Sequence *current = alignment->GetSequence(s);
+            savedLabels.push_back(current->GetSortLabel());
+        }
+        alignment->SaveOrdering();
+        enableAlignOrder = false;
+    }
+
+    // iterative refinement passes
+    for (int pass = 0; pass < numIterativeRefinementReps; ++pass)
+        DoIterativeRefinement(sparseMatrices, model, alignment);
+    cerr << endl;
+
+    // write the remembered labels back (no-op when none were saved)
+    const int savedCount = (int) savedLabels.size();
+    for (int s = 0; s < savedCount; ++s)
+        alignment->GetSequence(s)->SetSortLabel(savedLabels[s]);
+
+    // return final alignment
+    return alignment;
+}
+
+/////////////////////////////////////////////////////////////////
+// AlignAlignments()
+//
+// Aligns two MultiSequence objects against each other. A
+// posterior matrix for the two groups is obtained from
+// model.BuildPosterior() (using the sequence weights and the
+// cutoff), an alignment path is computed from it, and a new
+// MultiSequence is returned in which every sequence of align1
+// and align2 has had gaps inserted according to that path.
+// Unless alignment order is enabled the result is sorted by
+// label. In verbose mode the groups and a score are printed.
+/////////////////////////////////////////////////////////////////
+
+MultiSequence* MSA::AlignAlignments(MultiSequence *align1,
+        MultiSequence *align2,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model) {
+
+    // print some info about the alignment: "[l1,l2,...] vs. [l1,l2,...]: "
+    if (enableVerbose) {
+        for (int s = 0; s < align1->GetNumSequences(); ++s) {
+            const char *separator = (s == 0) ? "[" : ",";
+            cerr << separator << align1->GetSequence(s)->GetLabel();
+        }
+        cerr << "] vs. ";
+        for (int s = 0; s < align2->GetNumSequences(); ++s) {
+            const char *separator = (s == 0) ? "[" : ",";
+            cerr << separator << align2->GetSequence(s)->GetLabel();
+        }
+        cerr << "]: ";
+    }
+
+    // weighted posterior between the two groups
+    VF *groupPosterior = model.BuildPosterior(getSeqsWeights(), align1, align2,
+            sparseMatrices, cutoff);
+
+    // alignment path over the columns of the two groups
+    const int columns1 = align1->GetSequence(0)->GetLength();
+    const int columns2 = align2->GetSequence(0)->GetLength();
+    pair<SafeVector<char> *, float> path = model.ComputeAlignment(columns1,
+            columns2, *groupPosterior);
+
+    delete groupPosterior;
+
+    if (enableVerbose) {
+        // sum, over all cross-group pairs, of the shorter sequence length
+        int lengthSum = 0;
+        for (int s = 0; s < align1->GetNumSequences(); ++s) {
+            for (int t = 0; t < align2->GetNumSequences(); ++t) {
+                lengthSum += min(align1->GetSequence(s)->GetLength(),
+                        align2->GetSequence(t)->GetLength());
+            }
+        }
+
+        // give an "accuracy" measure for the alignment
+        cerr << path.second / lengthSum << endl;
+    }
+
+    // now build final alignment: first group gapped with 'X', second with 'Y'
+    MultiSequence *combined = new MultiSequence();
+    for (int s = 0; s < align1->GetNumSequences(); ++s) {
+        combined->AddSequence(align1->GetSequence(s)->AddGaps(path.first, 'X'));
+    }
+    for (int s = 0; s < align2->GetNumSequences(); ++s) {
+        combined->AddSequence(align2->GetSequence(s)->AddGaps(path.first, 'Y'));
+    }
+    if (!enableAlignOrder)
+        combined->SortByLabel();
+
+    // free temporary alignment
+    delete path.first;
+
+    return combined;
+}
+
+/////////////////////////////////////////////////////////////////
+// DoRelaxation()
+//
+// Performs one round of the consistency transformation for every sequence pair (i, j) with i < j.
+// Returns newly allocated matrices in [i][j] ([j][i] is NULL); the weight-based scaling is commented out in the body.
+/////////////////////////////////////////////////////////////////
+
+SafeVector<SafeVector<SparseMatrix *> > MSA::DoRelaxation(float* seqsWeights,
+ MultiSequence *sequences,
+ SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
+ const int numSeqs = sequences->GetNumSequences();
+
+ SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices(numSeqs,
+ SafeVector<SparseMatrix *>(numSeqs, NULL));
+
+ // for every pair of sequences: one flat loop over seqsPairs with OpenMP, two nested loops otherwise
+#ifdef _OPENMP
+ int pairIdx;
+#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
+ for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
+ int i = seqsPairs[pairIdx].seq1;
+ int j = seqsPairs[pairIdx].seq2;
+ float wi = seqsWeights[i];
+ float wj = seqsWeights[j];
+#else
+ for (int i = 0; i < numSeqs; i++) {
+ float wi = seqsWeights[i];
+ for (int j = i + 1; j < numSeqs; j++) {
+ float wj = seqsWeights[j];
+#endif
+ Sequence *seq1 = sequences->GetSequence(i);
+ Sequence *seq2 = sequences->GetSequence(j);
+
+ if (enableVerbose) {
+#ifdef _OPENMP
+#pragma omp critical
+#endif
+ cerr << "Relaxing (" << i + 1 << ") " << seq1->GetHeader()
+ << " vs. " << "(" << j + 1 << ") " << seq2->GetHeader()
+ << ": ";
+ }
+ // get the original posterior matrix as a dense vector (deleted at the end of this iteration)
+ VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior();
+ assert(posteriorPtr);
+ VF &posterior = *posteriorPtr;
+
+ const int seq1Length = seq1->GetLength();
+ const int seq2Length = seq2->GetLength();
+
+ // contribution from the summation where z = x and z = y (the original matrix counted twice)
+ float w = wi * wi * wj + wi * wj * wj; // only feeds sumW; not applied to the matrix
+ float sumW = w; // accumulated but never read: the division by sumW below is commented out
+ for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
+ // weighted form kept for reference (disabled): posterior[k] = w*posterior[k];
+ posterior[k] += posterior[k]; // doubles every cell
+ }
+
+ if (enableVerbose)
+ cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
+
+ // contribution from all other sequences k; the call used depends on where k lies relative to i and j
+ for (int k = 0; k < numSeqs; k++) {
+ if (k != i && k != j) {
+ float wk = seqsWeights[k];
+ float w = wi * wj * wk; // passed on, but Relax()/Relax1() do not use their weight argument
+ sumW += w;
+ if (k < i) // k < i < j: both stored matrices have k as their first sequence
+ Relax1(w, sparseMatrices[k][i], sparseMatrices[k][j],
+ posterior);
+ else if (k > i && k < j) // i < k < j: matrices [i][k] and [k][j] are used as stored
+ Relax(w, sparseMatrices[i][k], sparseMatrices[k][j],
+ posterior);
+ else { // i < j < k: [j][k] is transposed first so that it can be used as the second factor
+ SparseMatrix *temp =
+ sparseMatrices[j][k]->ComputeTranspose();
+ Relax(w, sparseMatrices[i][k], temp, posterior);
+ delete temp;
+ }
+ }
+ }
+ // debugging aid (disabled): cerr<<"sumW "<<sumW<<endl;
+ for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
+ // weighted normalisation kept for reference (disabled): posterior[k] /= sumW;
+ posterior[k] /= numSeqs; // normalise by the number of sequences
+ }
+ // mask out positions not originally in the posterior matrix; row 0 is cleared entirely
+ SparseMatrix *matXY = sparseMatrices[i][j];
+ for (int y = 0; y <= seq2Length; y++)
+ posterior[y] = 0;
+ for (int x = 1; x <= seq1Length; x++) {
+ SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
+ SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
+ VF::iterator base = posterior.begin() + x * (seq2Length + 1); // start of dense row x
+ int curr = 0; // next dense column to examine
+ while (XYptr != XYend) {
+
+ // zero out all cells before the next column stored in the sparse row
+ while (curr < XYptr->first) {
+ base[curr] = 0;
+ curr++;
+ }
+
+ // now, skip over this column (it was present originally, so its value is kept)
+ curr++;
+ ++XYptr;
+ }
+
+ // zero out cells after the last stored column
+ while (curr <= seq2Length) {
+ base[curr] = 0;
+ curr++;
+ }
+ }
+
+ // save the new posterior matrix as a newly allocated sparse matrix
+ newSparseMatrices[i][j] = new SparseMatrix(seq1->GetLength(),
+ seq2->GetLength(), posterior);
+ newSparseMatrices[j][i] = NULL;
+
+ if (enableVerbose)
+ cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
+
+ delete posteriorPtr;
+
+ if (enableVerbose)
+ cerr << "done." << endl;
+#ifndef _OPENMP
+ }
+#endif
+ }
+
+ return newSparseMatrices;
+}
+
+/////////////////////////////////////////////////////////////////
+// Relax()
+//
+// Adds the contribution of one intermediate sequence z to the
+// dense matrix "posterior": for every stored cell (i, k) of
+// matXZ and every stored cell (k, j) of matZY, the product of
+// the two values is added to posterior cell (i, j). Rows of
+// "posterior" are laid out with a stride of lengthY + 1.
+// The "weight" argument is accepted but not used.
+/////////////////////////////////////////////////////////////////
+
+void MSA::Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY,
+        VF &posterior) {
+
+    assert(matXZ);
+    assert(matZY);
+
+    const int rowsX = matXZ->GetSeq1Length();
+    const int colsY = matZY->GetSeq2Length();
+    assert(matXZ->GetSeq2Length() == matZY->GetSeq1Length());
+
+    // for every x[i]
+    for (int x = 1; x <= rowsX; ++x) {
+        SafeVector<PIF>::iterator xzCell = matXZ->GetRowPtr(x);
+        SafeVector<PIF>::iterator xzStop = xzCell + matXZ->GetRowSize(x);
+
+        // dense row of the output matrix that belongs to x[i]
+        VF::iterator outRow = posterior.begin() + x * (colsY + 1);
+
+        // every stored x[i]-z[k] cell
+        for (; xzCell != xzStop; ++xzCell) {
+            SafeVector<PIF>::iterator zyCell = matZY->GetRowPtr(xzCell->first);
+            SafeVector<PIF>::iterator zyStop = zyCell
+                    + matZY->GetRowSize(xzCell->first);
+            const float xzValue = xzCell->second;
+
+            // every stored z[k]-y[j] cell
+            for (; zyCell != zyStop; ++zyCell)
+                outRow[zyCell->first] += xzValue * zyCell->second;
+        }
+    }
+}
+
+/////////////////////////////////////////////////////////////////
+// Relax1()
+//
+// Variant of Relax() for the case where both input matrices have
+// the intermediate sequence z as their first sequence: for every
+// row k, each stored cell (k, i) of matZX is combined with each
+// stored cell (k, j) of matZY and the product of the two values
+// is added to posterior cell (i, j). Rows of "posterior" are
+// laid out with a stride of lengthY + 1. The "weight" argument
+// is accepted but not used.
+/////////////////////////////////////////////////////////////////
+
+void MSA::Relax1(float weight, SparseMatrix *matZX, SparseMatrix *matZY,
+        VF &posterior) {
+
+    assert(matZX);
+    assert(matZY);
+
+    const int rowsZ = matZX->GetSeq1Length();
+    const int colsY = matZY->GetSeq2Length();
+
+    // for every z[k]
+    for (int z = 1; z <= rowsZ; ++z) {
+        SafeVector<PIF>::iterator zxCell = matZX->GetRowPtr(z);
+        SafeVector<PIF>::iterator zxStop = zxCell + matZX->GetRowSize(z);
+
+        // every stored z[k]-x[i] cell
+        for (; zxCell != zxStop; ++zxCell) {
+            SafeVector<PIF>::iterator zyCell = matZY->GetRowPtr(z);
+            SafeVector<PIF>::iterator zyStop = zyCell + matZY->GetRowSize(z);
+            const float zxValue = zxCell->second;
+
+            // dense output row selected by the x index of this cell
+            VF::iterator outRow = posterior.begin()
+                    + zxCell->first * (colsY + 1);
+
+            // every stored z[k]-y[j] cell
+            for (; zyCell != zyStop; ++zyCell)
+                outRow[zyCell->first] += zxValue * zyCell->second;
+        }
+    }
+}
+/////////////////////////////////////////////////////////////////
+// DoIterativeRefinement()
+//
+// Performs a single round of randomized partitioning iterative
+// refinement: every sequence of "alignment" is assigned to one of
+// two groups by rand(), the alignment is projected onto each
+// group, and the two projections are realigned against each other
+// with AlignAlignments().
+//
+// "alignment" is passed by reference. When both groups are
+// non-empty the old alignment object is deleted and the pointer
+// is replaced by the realigned result; when one group is empty
+// the function returns without touching it.
+/////////////////////////////////////////////////////////////////
+
+void MSA::DoIterativeRefinement(
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model, MultiSequence* &alignment) {
+    set<int> groupOne, groupTwo;
+    int numSeqs = alignment->GetNumSequences();
+
+    // create two separate groups
+    for (int i = 0; i < numSeqs; i++) {
+        int index = rand();
+        if (index % 2) {
+            groupOne.insert(i);
+        } else {
+            groupTwo.insert(i);
+        }
+    }
+    // a one-sided split leaves nothing to realign
+    if (groupOne.empty() || groupTwo.empty()) return;
+
+    // project into the two groups
+    MultiSequence *groupOneSeqs = alignment->Project(groupOne);
+    assert(groupOneSeqs);
+    MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
+    assert(groupTwoSeqs);
+
+    // Disabled experimental variant (kept for reference): rebuild the
+    // posterior for the two groups and compare accuracy before/after.
+/*
+//start add by Yongtao
+#if 1
+ VF *posterior = model.BuildPosterior (groupOneSeqs, groupTwoSeqs, sparseMatrices, cutoff);
+#else
+ VF *posterior = model.BuildPosterior(getSeqsWeights(), groupOneSeqs, groupTwoSeqs,
+ sparseMatrices, cutoff);
+#endif
+
+ // compute an "accuracy" measure for the alignment before refinement
+ SafeVector<SafeVector<char>::iterator> oldOnePtrs(groupOne.size());
+ SafeVector<SafeVector<char>::iterator> oldTwoPtrs(groupTwo.size());
+ int i=0;
+ for (set<int>::const_iterator iter = groupOne.begin();
+ iter != groupOne.end(); ++iter) {
+ oldOnePtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr();
+ }
+ i=0;
+ for (set<int>::const_iterator iter = groupTwo.begin();
+ iter != groupTwo.end(); ++iter) {
+ oldTwoPtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr();
+ }
+
+ VF &posteriorArr = *posterior;
+ int oldLength = alignment->GetSequence(0)->GetLength();
+ int groupOneindex=0; int groupTwoindex=0;
+ float accuracy_before = 0;
+ for (int i = 1; i <= oldLength; i++) {
+ // check to see if there is a gap in every sequence of the set
+ bool foundOne = false;
+ for (int j = 0; !foundOne && j < (int) groupOne.size(); j++)
+ foundOne = (oldOnePtrs[j][i] != '-');
+ // if not, then this column counts towards the sequence length
+ if (foundOne) groupOneindex ++;
+ bool foundTwo = false;
+ for (int j = 0; !foundTwo && j < (int) groupTwo.size(); j++)
+ foundTwo = (oldTwoPtrs[j][i] != '-');
+ // if not, then this column counts towards the sequence length
+ if (foundTwo) groupTwoindex ++;
+ if(foundOne && foundTwo) accuracy_before +=
+ posteriorArr[groupOneindex * (groupTwoSeqs->GetSequence(0)->GetLength() + 1) + groupTwoindex];
+ }
+
+ pair<SafeVector<char> *, float> refinealignment;
+ //perform alignment
+ refinealignment = model.ComputeAlignment(groupOneSeqs->GetSequence(0)->GetLength(),
+ groupTwoSeqs->GetSequence(0)->GetLength(), *posterior);
+ delete posterior;
+ // now build final alignment
+ MultiSequence *result = new MultiSequence();
+ //compare accuracy measure before and after refinement
+ //if (refinealignment.second > accuracy_before) {
+ //cerr<<"Before:" << accuracy_before<<" after: "<< refinealignment.second<< endl;
+ for (int i = 0; i < groupOneSeqs->GetNumSequences(); i++)
+ result->AddSequence(
+ groupOneSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'X'));
+ for (int i = 0; i < groupTwoSeqs->GetNumSequences(); i++)
+ result->AddSequence(
+ groupTwoSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'Y'));
+ // free temporary alignment
+ delete refinealignment.first;
+ delete alignment;
+ alignment = result;
+
+ }
+ else{
+ if(numIterativeRefinementReps < 8*numSeqs) numIterativeRefinementReps++;
+ delete groupOneSeqs;
+ delete groupTwoSeqs;
+ return false;
+ }
+ */
+//end add by yongtao
+
+    // Release the alignment that is about to be replaced; otherwise the
+    // object behind the caller's pointer is lost on every refinement
+    // round. DoIterativeRefinementTreeNode() frees it at this same point.
+    // NOTE(review): assumes no caller keeps a second pointer to the old
+    // alignment across this call - confirm against the call sites.
+    delete alignment;
+
+    // realign
+    alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices, model); //original
+    delete groupOneSeqs;
+    delete groupTwoSeqs;
+
+}
+
+/////////////////////////////////////////////////////////////////
+// DoIterativeRefinementTreeNode()
+//
+// Tree-guided refinement step. The sequences listed as left or
+// right leaves of alignment-order entry "nodeIndex" form the first
+// group, all remaining sequences form the second group. If both
+// groups are non-empty, the current alignment is projected onto
+// each group, deleted, and replaced by the realignment of the two
+// projections.
+/////////////////////////////////////////////////////////////////
+void MSA::DoIterativeRefinementTreeNode(
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model, MultiSequence* &alignment,
+        int nodeIndex) {
+    set<int> groupOne, groupTwo;
+    int numSeqs = alignment->GetNumSequences();
+
+    // membership flags, all cleared initially
+    vector<bool> underNode;
+    underNode.resize(numSeqs);
+    for (int s = 0; s < numSeqs; s++) {
+        underNode[s] = false;
+    }
+
+    // flag every leaf recorded for the selected node, left side first
+    AlignmentOrder* allOrders = this->tree->getAlignOrders();
+    AlignmentOrder* nodeOrder = &allOrders[nodeIndex];
+    for (int l = 0; l < nodeOrder->leftNum; l++) {
+        underNode[nodeOrder->leftLeafs[l]] = true;
+    }
+    for (int r = 0; r < nodeOrder->rightNum; r++) {
+        underNode[nodeOrder->rightLeafs[r]] = true;
+    }
+
+    // split the sequence indices according to the flags
+    for (int s = 0; s < numSeqs; s++) {
+        (underNode[s] ? groupOne : groupTwo).insert(s);
+    }
+    if (groupOne.empty() || groupTwo.empty())
+        return;
+
+    // project the current alignment onto each group, then drop it
+    MultiSequence *groupOneSeqs = alignment->Project(groupOne);
+    assert(groupOneSeqs);
+    MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
+    assert(groupTwoSeqs);
+    delete alignment;
+
+    // realign the two projections against each other
+    alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices,
+            model);
+
+    delete groupOneSeqs;
+    delete groupTwoSeqs;
+}
+
+/////////////////////////////////////////////////////////////////
+// WriteAnnotation()
+//
+// Writes one annotation score per alignment column to the file
+// named by "annotationFilename". For each column, the residues
+// that are not gaps are collected as (sort label, residue index)
+// pairs, sorted, and scored with ComputeScore(). The program
+// exits with status 1 if the file cannot be opened.
+/////////////////////////////////////////////////////////////////
+
+void MSA::WriteAnnotation(MultiSequence *alignment,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
+    ofstream outfile(annotationFilename.c_str());
+
+    if (outfile.fail()) {
+        cerr << "ERROR: Unable to write annotation file." << endl;
+        exit(1);
+    }
+
+    const int numColumns = alignment->GetSequence(0)->GetLength();
+    const int numSeqs = alignment->GetNumSequences();
+
+    // number of residues consumed so far in each sequence
+    SafeVector<int> residueCount(numSeqs, 0);
+
+    // direct access to the aligned characters of each sequence
+    SafeVector<SafeVector<char>::iterator> rows(numSeqs);
+    for (int s = 0; s < numSeqs; s++)
+        rows[s] = alignment->GetSequence(s)->GetDataPtr();
+
+    SafeVector<pair<int, int> > columnResidues;
+    columnResidues.reserve(numSeqs);
+
+    SafeVector<int> sortLabels;
+    for (int s = 0; s < numSeqs; s++)
+        sortLabels.push_back(alignment->GetSequence(s)->GetSortLabel());
+
+    // columns are addressed starting at index 1
+    for (int col = 1; col <= numColumns; col++) {
+
+        // gather the non-gap residues of this column
+        columnResidues.clear();
+        for (int s = 0; s < numSeqs; s++) {
+            if (rows[s][col] == '-')
+                continue;
+            columnResidues.push_back(
+                    make_pair(sortLabels[s], ++residueCount[s]));
+        }
+
+        sort(columnResidues.begin(), columnResidues.end());
+        outfile << setw(4) << ComputeScore(columnResidues, sparseMatrices)
+                << endl;
+    }
+
+    outfile.close();
+}
+
+/////////////////////////////////////////////////////////////////
+// ComputeScore()
+//
+// Annotation score of one alignment column. "active" holds the
+// (sequence label, residue index) pairs present in the column.
+// The stored values of sparseMatrices[a][b] for every pair of
+// entries (taken in the order they appear in "active") are summed
+// and the result is 200 * sum / (n * (n - 1)), truncated to int,
+// where n is the number of entries. Columns with fewer than two
+// entries score 0.
+/////////////////////////////////////////////////////////////////
+
+int MSA::ComputeScore(const SafeVector<pair<int, int> > &active,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
+
+    if (active.size() <= 1)
+        return 0;
+
+    const int count = (int) active.size();
+
+    // ALTERNATIVE #1: Compute the average alignment score.
+    float total = 0;
+    for (int first = 0; first < count; first++) {
+        for (int second = first + 1; second < count; second++) {
+            const pair<int, int> &lhs = active[first];
+            const pair<int, int> &rhs = active[second];
+            total += sparseMatrices[lhs.first][rhs.first]->GetValue(
+                    lhs.second, rhs.second);
+        }
+    }
+
+    return (int) (200 * total / (count * (count - 1)));
+
+}
--- /dev/null
+/***********************************************
+ * # Copyright 2009-2010. Liu Yongchao
+ * # Contact: Liu Yongchao, School of Computer Engineering,
+ * # Nanyang Technological University.
+ * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
+ * #
+ * # GPL version 3.0 applies.
+ * #
+ * ************************************************/
+
+#include <string>
+#include <sstream>
+#include <iomanip>
+#include <iostream>
+#include <list>
+#include <set>
+#include <algorithm>
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <cerrno>
+#include <iomanip>
+#include "MSA.h"
+#include "MSAClusterTree.h"
+#include "Defaults.h"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+// Parameter file given with -p/--paramfile; empty means ReadParameters()
+// falls back to the built-in defaults.
+string parametersInputFilename = "";
+// NOTE(review): not referenced in the visible part of this file.
+string parametersOutputFilename = "no training";
+// Annotation output file, set together with enableAnnotation by -annot.
+string annotationFilename = "";
+
+// Command-line switches (see ParseParams()).
+bool enableVerbose = false;
+bool enableAnnotation = false;
+bool enableClustalWOutput = false;
+bool enableAlignOrder = false;
+// Number of consistency transformation passes run by doAlign().
+int numConsistencyReps = 2;
+int numPreTrainingReps = 0;
+// Number of iterative refinement passes (option -ir).
+int numIterativeRefinementReps = 100;
+
+// Value of the -co/--cutoff option; ParseParams() restricts it to [0, 1].
+float cutoff = 0;
+
+// Model parameters filled in by ReadParameters(). The emission tables are
+// indexed by character code (256 entries per dimension).
+VF initDistrib(NumMatrixTypes);
+VF gapOpen(2 * NumInsertStates);
+VF gapExtend(2 * NumInsertStates);
+VVF emitPairs(256, VF(256, 1e-10));
+VF emitSingle(256, 1e-5);
+
+// Symbols for which emission probabilities are defined.
+string alphabet = alphabetDefault;
+
+// Accepted ranges for the repetition-count options.
+const int MIN_PRETRAINING_REPS = 0;
+const int MAX_PRETRAINING_REPS = 20;
+const int MIN_CONSISTENCY_REPS = 0;
+const int MAX_CONSISTENCY_REPS = 5;
+const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
+const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
+
+// NOTE(review): the next three are not referenced in the visible part of
+// this file; presumably used by other translation units.
+string posteriorProbsFilename = "";
+bool allscores = true;
+string infilename;
+
+int flag_gui = 0; //0: no gui related o/p
+//1: gui related o/p generated
+int flag_ppscore = 0; //0: no pp score sequence added to o/p fasta alignment
+//1: pp score seq added to o/p fasta alignment
+
+///////////////////////////////
+// global scoring matrix variables
+//////////////////////////////
+float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2;
+char *aminos, *bases, matrixtype[20] = "gonnet_160";
+int subst_index[26];
+
+double sub_matrix[26][26];
+int firstread = 0; //this makes sure that matrices are read only once
+
+float TEMPERATURE = 5;
+int MATRIXTYPE = 160;
+int prot_nuc = 0; //0=prot, 1=nucleotide
+
+float GAPOPEN = 0;
+float GAPEXT = 0;
+// Thread count requested with -num_threads; a value <= 0 makes the MSA
+// constructor use omp_get_num_procs() when built with OpenMP.
+int numThreads = 0;
+
+//argument support
+// Argument record for the partition-function code; prepared by
+// init_arguments(), which is defined outside this file.
+typedef struct {
+ char input[30];
+ int matrix;
+ int N;
+ float T;
+ float beta;
+ char opt; //can be 'P' or 'M'
+ float gapopen;
+ float gapext;
+} argument_decl;
+
+argument_decl argument;
+
+// Routines defined in another translation unit.
+extern inline void read_sustitution_matrix(char *fileName);
+extern void setmatrixtype(int le);
+extern inline int matrixtype_to_int();
+extern inline void read_dna_matrix();
+extern inline void read_vtml_la_matrix();
+extern void init_arguments();
+
+/////////////////////////////////////////////////////////////////
+// MSA()
+//
+// Runs the whole program: parses the command line, loads the
+// model parameters and the input sequences, optionally configures
+// OpenMP, aligns the sequences and writes the alignment to the
+// selected output stream in CLUSTALW or multi-FASTA format.
+/////////////////////////////////////////////////////////////////
+MSA::MSA(int argc, char* argv[]) {
+    // command line first: it decides files, options and output stream
+    SafeVector<string> inputFiles = ParseParams(argc, argv);
+
+    // set up the arguments used by the partition function code
+    init_arguments();
+
+    ReadParameters();
+    //PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
+
+    // load every input file into one sequence collection
+    MultiSequence *sequences = new MultiSequence();
+    assert(sequences);
+    const int numFiles = (int) inputFiles.size();
+    for (int f = 0; f < numFiles; f++) {
+        cerr << "Loading sequence file: " << inputFiles[f] << endl;
+        sequences->LoadMFA(inputFiles[f], true);
+    }
+
+    // one weight slot per loaded sequence
+    this->seqsWeights = new int[sequences->GetNumSequences()];
+
+    // initialize parameters for OpenMP
+#ifdef _OPENMP
+    if(numThreads <= 0) {
+        numThreads = omp_get_num_procs();
+        cerr << "Automatically detected " << numThreads << " CPU cores" << endl;
+    }
+    cerr <<"Enabling OpenMP (with "<<numThreads<<" threads)"<<endl;
+
+    // use the requested (or detected) number of threads
+    omp_set_num_threads(numThreads);
+#endif
+
+    // the similarity level selects the posterior model used by doAlign()
+    int levelid = ComputeSimilarity(sequences,
+            ProbabilisticModel(initDistrib, gapOpen, gapExtend, emitPairs,
+                    emitSingle));
+
+    // perform the alignment
+    MultiSequence *alignment = doAlign(sequences,
+            ProbabilisticModel(initDistrib, gapOpen, gapExtend, emitPairs,
+                    emitSingle), levelid);
+
+    // write the result in the requested format
+    if (!enableClustalWOutput) {
+        alignment->WriteMFA(*alignOutFile);
+    } else {
+        alignment->WriteALN(*alignOutFile);
+    }
+
+    // release resources
+    delete[] this->seqsWeights;
+    delete alignment;
+    delete sequences;
+}
+/////////////////////////////////////////////////////////////////
+// ~MSA()
+//
+// Closes the alignment output file when one was named on the
+// command line. Nothing is done when the output went to STDOUT.
+/////////////////////////////////////////////////////////////////
+MSA::~MSA() {
+    // an empty name means ParseParams() selected std::cout
+    if (alignOutFileName.empty()) {
+        return;
+    }
+    /*close the output file*/
+    static_cast<std::ofstream*>(alignOutFile)->close();
+}
+/////////////////////////////////////////////////////////////////
+// PrintParameters()
+//
+// Prints the initial distribution and the gap open/extend
+// parameters to STDERR, preceded by "message". If "filename" is
+// not NULL, all parameters (including the alphabet, the lower
+// triangle of the pair emission table and the single emission
+// table) are also written to that file; the program exits with
+// status 1 if the file cannot be opened for writing.
+/////////////////////////////////////////////////////////////////
+
+void MSA::PrintParameters(const char *message, const VF &initDistrib,
+        const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs,
+        const VF &emitSingle, const char *filename) {
+
+    const int numGapParams = NumInsertStates * 2;
+
+    // screen output
+    cerr << message << endl;
+    cerr << " initDistrib[] = { ";
+    for (int m = 0; m < NumMatrixTypes; m++)
+        cerr << setprecision(10) << initDistrib[m] << " ";
+    cerr << "}" << endl;
+    cerr << " gapOpen[] = { ";
+    for (int g = 0; g < numGapParams; g++)
+        cerr << setprecision(10) << gapOpen[g] << " ";
+    cerr << "}" << endl;
+    cerr << " gapExtend[] = { ";
+    for (int g = 0; g < numGapParams; g++)
+        cerr << setprecision(10) << gapExtend[g] << " ";
+    cerr << "}" << endl << endl;
+
+    // file output is optional
+    if (!filename)
+        return;
+
+    FILE *out = fopen(filename, "w");
+    if (!out) {
+        cerr << "ERROR: Unable to write parameter file: " << filename
+                << endl;
+        exit(1);
+    }
+
+    // transition parameters, one line each
+    for (int m = 0; m < NumMatrixTypes; m++)
+        fprintf(out, "%.10f ", initDistrib[m]);
+    fprintf(out, "\n");
+    for (int g = 0; g < numGapParams; g++)
+        fprintf(out, "%.10f ", gapOpen[g]);
+    fprintf(out, "\n");
+    for (int g = 0; g < numGapParams; g++)
+        fprintf(out, "%.10f ", gapExtend[g]);
+    fprintf(out, "\n");
+
+    // alphabet, then the lower triangle of the pair emissions
+    fprintf(out, "%s\n", alphabet.c_str());
+    const int numSymbols = (int) alphabet.size();
+    for (int row = 0; row < numSymbols; row++) {
+        const unsigned char rowSym = (unsigned char) alphabet[row];
+        for (int col = 0; col <= row; col++) {
+            const unsigned char colSym = (unsigned char) alphabet[col];
+            fprintf(out, "%.10f ", emitPairs[rowSym][colSym]);
+        }
+        fprintf(out, "\n");
+    }
+
+    // single emissions on the last line
+    for (int row = 0; row < numSymbols; row++)
+        fprintf(out, "%.10f ", emitSingle[(unsigned char) alphabet[row]]);
+    fprintf(out, "\n");
+    fclose(out);
+}
+
+/////////////////////////////////////////////////////////////////
+// doAlign()
+//
+// First computes all pairwise posterior probability matrices
+// (the model used for each pair is selected by "levelid"), stores
+// them as sparse matrices and derives a distance matrix from the
+// pairwise alignment scores. Then builds the guide tree, applies
+// numConsistencyReps rounds of the consistency transformation,
+// computes the final alignment and, if enabled, writes the
+// annotation file. Returns the final alignment.
+/////////////////////////////////////////////////////////////////
+extern VF *ComputePostProbs(int a, int b, string seq1, string seq2);
+MultiSequence* MSA::doAlign(MultiSequence *sequences,
+ const ProbabilisticModel &model, int levelid) {
+ assert(sequences);
+
+ //get the number of sequences
+ const int numSeqs = sequences->GetNumSequences();
+ //create distance matrix
+ VVF distances(numSeqs, VF(numSeqs, 0));
+ //create sparseMatrices (numSeqs x numSeqs, all entries NULL initially)
+ SafeVector<SafeVector<SparseMatrix *> > sparseMatrices(numSeqs,
+ SafeVector<SparseMatrix *>(numSeqs, NULL));
+
+#ifdef _OPENMP
+ //calculate sequence pairs for openmp model
+ // (flattens the pairs a < b into one array so a single loop can be shared)
+ int pairIdx = 0;
+ numPairs = (numSeqs - 1) * numSeqs / 2;
+ seqsPairs = new SeqsPair[numPairs];
+ for(int a = 0; a < numSeqs; a++) {
+ for(int b = a + 1; b < numSeqs; b++) {
+ seqsPairs[pairIdx].seq1 = a;
+ seqsPairs[pairIdx].seq2 = b;
+ pairIdx++;
+ }
+ }
+#endif
+ // do all pairwise alignments for posterior probability matrices
+ // The loop header differs between the two builds: one flat loop over
+ // seqsPairs with OpenMP, two nested loops otherwise. The loop body below
+ // is shared; the extra closing brace of the nested form is emitted under
+ // "#ifndef _OPENMP" after the body.
+#ifdef _OPENMP
+#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
+ for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
+ int a= seqsPairs[pairIdx].seq1;
+ int b = seqsPairs[pairIdx].seq2;
+ if(enableVerbose) {
+#pragma omp critical
+ cerr <<"tid "<<omp_get_thread_num()<<" a "<<a<<" b "<<b<<endl;
+ }
+#else
+ for (int a = 0; a < numSeqs - 1; a++) {
+ for (int b = a + 1; b < numSeqs; b++) {
+#endif
+ Sequence *seq1 = sequences->GetSequence(a);
+ Sequence *seq2 = sequences->GetSequence(b);
+
+ //posterior probability matrix
+ VF* posterior;
+
+//high similarity use global model
+ if(levelid == 2) posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString());
+
+//low similarity use local model
+ else if(levelid == 1){
+ VF *forward = model.ComputeForwardMatrix(seq1, seq2,false);
+ assert(forward);
+ VF *backward = model.ComputeBackwardMatrix(seq1, seq2,false);
+ assert(backward);
+ posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false);
+ delete forward;
+ delete backward;
+ }
+
+//extreme low or extreme high similarity use combined model
+ else{
+//probcons
+ // compute forward and backward probabilities
+ VF *forward = model.ComputeForwardMatrix(seq1, seq2);
+ assert(forward);
+ VF *backward = model.ComputeBackwardMatrix(seq1, seq2);
+ assert(backward);
+ // compute posterior probability matrix from HMM
+ VF *probcons_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward);
+ assert(probcons_posterior);
+ delete forward;
+ delete backward;
+
+//probalign
+ VF *probalign_posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString());
+ assert(probalign_posterior);
+//local
+ forward = model.ComputeForwardMatrix(seq1, seq2,false);
+ assert(forward);
+ backward = model.ComputeBackwardMatrix(seq1, seq2,false);
+ assert(backward);
+ posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false);
+ assert(posterior);
+ delete forward;
+ delete backward;
+//combined model
+ //merge probalign + local + probcons
+ // each cell becomes the root mean square of the three posteriors;
+ // the result is written in place into "posterior"
+ VF::iterator ptr1 = probcons_posterior->begin();
+ VF::iterator ptr2 = probalign_posterior->begin();
+ VF::iterator ptr = posterior->begin();
+ for (int i = 0; i <= seq1->GetLength(); i++) {
+ for (int j = 0; j <= seq2->GetLength(); j++) {
+ float v1 = *ptr1;
+ float v2 = *ptr2;
+ float v3 = *ptr;
+ *ptr = sqrt((v1*v1 + v2*v2 + v3*v3)/3);
+ ptr1++;
+ ptr2++;
+ ptr++;
+ }
+ }
+ delete probcons_posterior;
+ delete probalign_posterior;
+ }
+ assert(posterior);
+ // perform the pairwise sequence alignment
+ pair<SafeVector<char> *, float> alignment = model.ComputeAlignment(
+ seq1->GetLength(), seq2->GetLength(), *posterior);
+
+ //compute expected accuracy
+ // distance = 1 - score / length of the shorter sequence
+ // NOTE(review): divides by zero if either sequence has length 0 - confirm
+ // that empty sequences are rejected earlier.
+ distances[a][b] = distances[b][a] = 1.0f - alignment.second
+ / min(seq1->GetLength(), seq2->GetLength());
+
+ // compute sparse representations
+ // only the [a][b] entry with a < b is filled here
+ sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),
+ seq2->GetLength(), *posterior);
+ sparseMatrices[b][a] = NULL;
+
+ delete posterior;
+ delete alignment.first;
+#ifndef _OPENMP
+ }
+#endif
+ }
+
+ //create the guide tree
+ this->tree = new MSAClusterTree(this, distances, numSeqs);
+ this->tree->create();
+
+ // perform the consistency transformation the desired number of times
+ // convert the integer sequence weights to floats scaled by 10 / INT_MULTIPLY
+ float* fweights = new float[numSeqs];
+ for (int r = 0; r < numSeqs; r++) {
+ fweights[r] = ((float) seqsWeights[r]) / INT_MULTIPLY;
+ fweights[r] *= 10;
+ }
+ for (int r = 0; r < numConsistencyReps; r++) {
+ SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices =
+ DoRelaxation(fweights, sequences, sparseMatrices);
+
+ // now replace the old posterior matrices
+ for (int i = 0; i < numSeqs; i++) {
+ for (int j = 0; j < numSeqs; j++) {
+ delete sparseMatrices[i][j];
+ sparseMatrices[i][j] = newSparseMatrices[i][j];
+ }
+ }
+ }
+ delete[] fweights;
+#ifdef _OPENMP
+ delete [] seqsPairs;
+#endif
+
+ //compute the final multiple sequence alignment
+ MultiSequence *finalAlignment = ComputeFinalAlignment(this->tree, sequences,
+ sparseMatrices, model);
+
+ // build annotation
+ if (enableAnnotation) {
+ WriteAnnotation(finalAlignment, sparseMatrices);
+ }
+ //destroy the guide tree
+ delete this->tree;
+ this->tree = 0;
+
+ // delete sparse matrices
+ // both triangles are released; entries that are still NULL are harmless
+ for (int a = 0; a < numSeqs - 1; a++) {
+ for (int b = a + 1; b < numSeqs; b++) {
+ delete sparseMatrices[a][b];
+ delete sparseMatrices[b][a];
+ }
+ }
+
+ return finalAlignment;
+}
+
+/////////////////////////////////////////////////////////////////
+// GetInteger()
+//
+// Attempts to parse an integer from the character string given.
+// The base is auto-detected by strtol (decimal, 0x hex, leading-0
+// octal). Returns true only if no parsing error occurs, i.e. the
+// string contains a number, nothing follows the number, and the
+// value fits in an int. "*val" is written only on success.
+/////////////////////////////////////////////////////////////////
+
+bool GetInteger(char *data, int *val) {
+    char *endPtr = NULL;
+    long int retVal;
+
+    assert(val);
+
+    errno = 0;
+    retVal = strtol(data, &endPtr, 0);
+
+    // nothing was consumed: the string does not start with a number
+    if (endPtr == data)
+        return false;
+    // characters left after the number (e.g. "12abc") are a parse error
+    if (*endPtr != '\0')
+        return false;
+    // strtol reports out-of-range input through errno
+    if (errno != 0)
+        return false;
+    // long may be wider than int
+    if (retVal < (long) INT_MIN || retVal > (long) INT_MAX)
+        return false;
+
+    *val = (int) retVal;
+    return true;
+}
+
+/////////////////////////////////////////////////////////////////
+// GetFloat()
+//
+// Attempts to parse a float from the character string given.
+// Returns true only if no parsing error occurs, i.e. the string
+// contains a number, nothing follows the number, strtod did not
+// report an out-of-range result, and the value is not NaN.
+// "*val" is written only on success.
+/////////////////////////////////////////////////////////////////
+
+bool GetFloat(char *data, float *val) {
+    char *endPtr = NULL;
+    double retVal;
+
+    assert(val);
+
+    errno = 0;
+    retVal = strtod(data, &endPtr);
+
+    // nothing was consumed: the string does not start with a number
+    if (endPtr == data)
+        return false;
+    // characters left after the number (e.g. "0.5x") are a parse error
+    if (*endPtr != '\0')
+        return false;
+    // a zero result together with an error is treated as a failure
+    if (retVal == 0 && errno != 0)
+        return false;
+    // a large magnitude together with an error means overflow
+    if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0))
+        return false;
+    // NaN compares unequal to itself; it would pass any later range check
+    if (retVal != retVal)
+        return false;
+
+    *val = (float) retVal;
+    return true;
+}
+
+/////////////////////////////////////////////////////////////////
+// SetEmitPairAllCases() / SetEmitSingleBothCases()
+//
+// File-local helpers for ReadParameters(). They store one
+// emission probability in the global tables for every
+// upper/lower-case spelling of the symbol(s); the pair table is
+// filled symmetrically.
+/////////////////////////////////////////////////////////////////
+
+static void SetEmitPairAllCases(char first, char second, float prob) {
+    const unsigned char firstLower = (unsigned char) tolower(first);
+    const unsigned char firstUpper = (unsigned char) toupper(first);
+    const unsigned char secondLower = (unsigned char) tolower(second);
+    const unsigned char secondUpper = (unsigned char) toupper(second);
+
+    emitPairs[firstLower][secondLower] = prob;
+    emitPairs[firstLower][secondUpper] = prob;
+    emitPairs[firstUpper][secondLower] = prob;
+    emitPairs[firstUpper][secondUpper] = prob;
+    emitPairs[secondLower][firstLower] = prob;
+    emitPairs[secondLower][firstUpper] = prob;
+    emitPairs[secondUpper][firstLower] = prob;
+    emitPairs[secondUpper][firstUpper] = prob;
+}
+
+static void SetEmitSingleBothCases(char symbol, float prob) {
+    emitSingle[(unsigned char) tolower(symbol)] = prob;
+    emitSingle[(unsigned char) toupper(symbol)] = prob;
+}
+
+/////////////////////////////////////////////////////////////////
+// ReadParameters()
+//
+// Fills the global initial distribution, transition and emission
+// parameters. Without a parameter file the built-in defaults for
+// one or two pairs of insert states are used. Otherwise the file
+// is read: three lines of transition parameters, one alphabet
+// line, the lower triangle of the pair emissions and finally the
+// single emissions. Exits with status 1 if no defaults exist for
+// NumInsertStates, or if the file or its first four lines cannot
+// be read.
+/////////////////////////////////////////////////////////////////
+
+void MSA::ReadParameters() {
+
+    // background values for every character code
+    emitPairs = VVF(256, VF(256, 1e-10));
+    emitSingle = VF(256, 1e-5);
+
+    const int numGapParams = 2 * NumInsertStates;
+
+    // ---- built-in defaults -------------------------------------
+    if (parametersInputFilename.empty()) {
+        const float *initDefault = NULL;
+        const float *openDefault = NULL;
+        const float *extendDefault = NULL;
+
+        if (NumInsertStates == 1) {
+            initDefault = initDistrib1Default;
+            openDefault = gapOpen1Default;
+            extendDefault = gapExtend1Default;
+        } else if (NumInsertStates == 2) {
+            initDefault = initDistrib2Default;
+            openDefault = gapOpen2Default;
+            extendDefault = gapExtend2Default;
+        } else {
+            cerr
+                    << "ERROR: No default initial distribution/parameter settings exist"
+                    << endl << " for " << NumInsertStates
+                    << " pairs of insert states. Use --paramfile." << endl;
+            exit(1);
+        }
+
+        for (int m = 0; m < NumMatrixTypes; m++)
+            initDistrib[m] = initDefault[m];
+        for (int g = 0; g < numGapParams; g++)
+            gapOpen[g] = openDefault[g];
+        for (int g = 0; g < numGapParams; g++)
+            gapExtend[g] = extendDefault[g];
+
+        alphabet = alphabetDefault;
+
+        const int numSymbols = (int) alphabet.length();
+        for (int row = 0; row < numSymbols; row++) {
+            SetEmitSingleBothCases(alphabet[row], emitSingleDefault[row]);
+            for (int col = 0; col <= row; col++)
+                SetEmitPairAllCases(alphabet[row], alphabet[col],
+                        emitPairsDefault[row][col]);
+        }
+        return;
+    }
+
+    // ---- parameter file ----------------------------------------
+    ifstream data;
+    data.open(parametersInputFilename.c_str());
+    if (data.fail()) {
+        cerr << "ERROR: Unable to read parameter file: "
+                << parametersInputFilename << endl;
+        exit(1);
+    }
+
+    // the first three lines hold the transition parameters
+    string line[3];
+    for (int n = 0; n < 3; n++) {
+        if (!getline(data, line[n])) {
+            cerr
+                    << "ERROR: Unable to read transition parameters from parameter file: "
+                    << parametersInputFilename << endl;
+            exit(1);
+        }
+    }
+
+    istringstream initFields(line[0]);
+    for (int m = 0; m < NumMatrixTypes; m++)
+        initFields >> initDistrib[m];
+
+    istringstream openFields(line[1]);
+    for (int g = 0; g < numGapParams; g++)
+        openFields >> gapOpen[g];
+
+    istringstream extendFields(line[2]);
+    for (int g = 0; g < numGapParams; g++)
+        extendFields >> gapExtend[g];
+
+    // fourth line: the alphabet
+    if (!getline(data, line[0])) {
+        cerr << "ERROR: Unable to read alphabet from scoring matrix file: "
+                << parametersInputFilename << endl;
+        exit(1);
+    }
+
+    // read alphabet as concatenation of all characters on alphabet line
+    alphabet = "";
+    istringstream alphabetFields(line[0]);
+    string token;
+    while (alphabetFields >> token)
+        alphabet += token;
+
+    // lower triangle of the pair emissions, row by row
+    const int numSymbols = (int) alphabet.size();
+    for (int row = 0; row < numSymbols; row++) {
+        for (int col = 0; col <= row; col++) {
+            float val;
+            data >> val;
+            SetEmitPairAllCases(alphabet[row], alphabet[col], val);
+        }
+    }
+
+    // single emissions
+    for (int row = 0; row < numSymbols; row++) {
+        float val;
+        data >> val;
+        SetEmitSingleBothCases(alphabet[row], val);
+    }
+    data.close();
+}
+
+/////////////////////////////////////////////////////////////////
+// printUsage()
+//
+// Prints the program banner and the description of the
+// command-line options to STDERR. The defaults shown are the
+// current values of the corresponding global settings.
+/////////////////////////////////////////////////////////////////
+void MSA::printUsage() {
+    // banner
+    cerr
+            << "************************************************************************"
+            << endl;
+    cerr
+            << "\tMSAPROBS is a open-source protein multiple sequence alignment algorithm"
+            << endl;
+    cerr
+            << "\tbased on pair hidden markov model and partition function postirior"
+            << endl;
+    cerr << "\tprobabilities. If any comments or problems, please contact"
+            << endl;
+    cerr << "\tLiu Yongchao(liuy0039@ntu.edu.sg or nkcslyc@hotmail.com)"
+            << endl;
+    cerr
+            << "*************************************************************************"
+            << endl;
+
+    // synopsis
+    cerr << "Usage:" << endl;
+    cerr << " msaprobs [OPTION]... [infile]..." << endl << endl;
+    cerr << "Description:" << endl;
+    cerr << " Align sequences in multi-FASTA format" << endl << endl;
+
+    // output and threading options
+    cerr << " -o, --outfile <string>" << endl;
+    cerr << " specify the output file name (STDOUT by default)" << endl;
+    cerr << " -num_threads <integer>" << endl;
+    cerr
+            << " specify the number of threads used, and otherwise detect automatically"
+            << endl;
+    cerr << " -clustalw" << endl;
+    cerr << " use CLUSTALW output format instead of FASTA format" << endl
+            << endl;
+
+    // repetition counts
+    cerr << " -c, --consistency REPS" << endl;
+    cerr << " use " << MIN_CONSISTENCY_REPS << " <= REPS <= "
+            << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps
+            << ") passes of consistency transformation" << endl << endl;
+    cerr << " -ir, --iterative-refinement REPS" << endl;
+    cerr << " use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= "
+            << MAX_ITERATIVE_REFINEMENT_REPS << " (default: "
+            << numIterativeRefinementReps
+            << ") passes of iterative-refinement" << endl << endl;
+
+    // reporting options
+    cerr << " -v, --verbose" << endl;
+    cerr << " report progress while aligning (default: "
+            << (enableVerbose ? "on" : "off") << ")" << endl << endl;
+    cerr << " -annot FILENAME" << endl;
+    cerr << " write annotation for multiple alignment to FILENAME" << endl
+            << endl;
+    cerr << " -a, --alignment-order" << endl;
+    cerr
+            << " print sequences in alignment order rather than input order (default: "
+            << (enableAlignOrder ? "on" : "off") << ")" << endl;
+    cerr << " -version " << endl;
+    cerr << " print out version of MSAPROBS " << endl << endl;
+}
+/////////////////////////////////////////////////////////////////
+// ParseParams()
+//
+// Parses all command-line options into the corresponding global
+// settings and returns the remaining arguments (everything that
+// does not start with '-') as the list of input file names.
+// Also selects the alignment output stream: std::cout when no
+// output file was named, otherwise a newly opened file.
+//
+// Prints the usage text and exits with status 1 when called
+// without arguments. Any malformed, out-of-range or unknown
+// option, and an output file that cannot be opened, is reported
+// on STDERR and ends the program with status 1. -help, -? and
+// -version also end the program with status 1.
+/////////////////////////////////////////////////////////////////
+SafeVector<string> MSA::ParseParams(int argc, char **argv) {
+    if (argc < 2) {
+        printUsage();
+        exit(1);
+    }
+    SafeVector<string> sequenceNames;
+    int tempInt;
+    float tempFloat;
+
+    for (int i = 1; i < argc; i++) {
+        // anything that is not an option is an input file name
+        if (argv[i][0] != '-') {
+            sequenceNames.push_back(string(argv[i]));
+            continue;
+        }
+
+        // true when another argument follows that can serve as the value
+        const bool hasValue = (i < argc - 1);
+
+        //help
+        if (!strcmp(argv[i], "-help") || !strcmp(argv[i], "-?")) {
+            printUsage();
+            exit(1);
+        }
+
+        //output file name
+        else if (!strcmp(argv[i], "-o") || !strcmp(argv[i], "--outfile")) {
+            if (!hasValue) {
+                cerr << "ERROR: String expected for option " << argv[i]
+                        << endl;
+                exit(1);
+            }
+            alignOutFileName = argv[++i]; //get the file name
+        }
+
+        // parameter file
+        else if (!strcmp(argv[i], "-p") || !strcmp(argv[i], "--paramfile")) {
+            if (!hasValue) {
+                cerr << "ERROR: Filename expected for option " << argv[i]
+                        << endl;
+                exit(1);
+            }
+            parametersInputFilename = string(argv[++i]);
+        }
+
+        //number of threads used
+        // ("-p" is not accepted here: it is already taken by --paramfile
+        // above, so listing it again would be unreachable)
+        else if (!strcmp(argv[i], "-num_threads")) {
+            if (!hasValue) {
+                cerr << "ERROR: Integer expected for option " << argv[i]
+                        << endl;
+                exit(1);
+            }
+            if (!GetInteger(argv[++i], &tempInt)) {
+                cerr << " ERROR: invalid integer following option "
+                        << argv[i - 1] << ": " << argv[i] << endl;
+                exit(1);
+            }
+            // negative counts mean "detect automatically"
+            if (tempInt < 0) {
+                tempInt = 0;
+            }
+            numThreads = tempInt;
+        }
+
+        // number of consistency transformations
+        else if (!strcmp(argv[i], "-c")
+                || !strcmp(argv[i], "--consistency")) {
+            if (!hasValue) {
+                cerr << "ERROR: Integer expected for option " << argv[i]
+                        << endl;
+                exit(1);
+            }
+            if (!GetInteger(argv[++i], &tempInt)) {
+                cerr << "ERROR: Invalid integer following option "
+                        << argv[i - 1] << ": " << argv[i] << endl;
+                exit(1);
+            }
+            if (tempInt < MIN_CONSISTENCY_REPS
+                    || tempInt > MAX_CONSISTENCY_REPS) {
+                cerr << "ERROR: For option " << argv[i - 1]
+                        << ", integer must be between "
+                        << MIN_CONSISTENCY_REPS << " and "
+                        << MAX_CONSISTENCY_REPS << "." << endl;
+                exit(1);
+            }
+            numConsistencyReps = tempInt;
+        }
+
+        // number of randomized partitioning iterative refinement passes
+        else if (!strcmp(argv[i], "-ir")
+                || !strcmp(argv[i], "--iterative-refinement")) {
+            if (!hasValue) {
+                cerr << "ERROR: Integer expected for option " << argv[i]
+                        << endl;
+                exit(1);
+            }
+            if (!GetInteger(argv[++i], &tempInt)) {
+                cerr << "ERROR: Invalid integer following option "
+                        << argv[i - 1] << ": " << argv[i] << endl;
+                exit(1);
+            }
+            if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS
+                    || tempInt > MAX_ITERATIVE_REFINEMENT_REPS) {
+                cerr << "ERROR: For option " << argv[i - 1]
+                        << ", integer must be between "
+                        << MIN_ITERATIVE_REFINEMENT_REPS << " and "
+                        << MAX_ITERATIVE_REFINEMENT_REPS << "."
+                        << endl;
+                exit(1);
+            }
+            numIterativeRefinementReps = tempInt;
+        }
+
+        // annotation files
+        else if (!strcmp(argv[i], "-annot")) {
+            enableAnnotation = true;
+            if (!hasValue) {
+                cerr << "ERROR: FILENAME expected for option " << argv[i]
+                        << endl;
+                exit(1);
+            }
+            annotationFilename = argv[++i];
+        }
+
+        // clustalw output format
+        else if (!strcmp(argv[i], "-clustalw")) {
+            enableClustalWOutput = true;
+        }
+
+        // cutoff
+        else if (!strcmp(argv[i], "-co") || !strcmp(argv[i], "--cutoff")) {
+            if (!hasValue) {
+                cerr << "ERROR: Floating-point value expected for option "
+                        << argv[i] << endl;
+                exit(1);
+            }
+            if (!GetFloat(argv[++i], &tempFloat)) {
+                cerr
+                        << "ERROR: Invalid floating-point value following option "
+                        << argv[i - 1] << ": " << argv[i] << endl;
+                exit(1);
+            }
+            if (tempFloat < 0 || tempFloat > 1) {
+                cerr << "ERROR: For option " << argv[i - 1]
+                        << ", floating-point value must be between 0 and 1."
+                        << endl;
+                exit(1);
+            }
+            cutoff = tempFloat;
+        }
+
+        // verbose reporting
+        else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--verbose")) {
+            enableVerbose = true;
+        }
+
+        // alignment order
+        else if (!strcmp(argv[i], "-a")
+                || !strcmp(argv[i], "--alignment-order")) {
+            enableAlignOrder = true;
+        }
+
+        //print out version
+        else if (!strcmp(argv[i], "-version")) {
+            cerr << "MSAPROBS version " << VERSION << endl;
+            exit(1);
+        }
+
+        // bad arguments
+        else {
+            cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
+            exit(1);
+        }
+    }
+
+    /*check the output file name*/
+    cerr << "-------------------------------------" << endl;
+    if (alignOutFileName.length() == 0) {
+        cerr << "The final alignments will be printed out to STDOUT" << endl;
+        alignOutFile = &std::cout;
+    } else {
+        cerr << "Open the output file " << alignOutFileName << endl;
+        ofstream *outFile = new ofstream(alignOutFileName.c_str(),
+                ios::binary | ios::out | ios::trunc);
+        // fail now rather than compute an alignment that cannot be saved
+        if (outFile->fail()) {
+            cerr << "ERROR: Unable to open the output file "
+                    << alignOutFileName << endl;
+            delete outFile;
+            exit(1);
+        }
+        alignOutFile = outFile;
+    }
+    cerr << "-------------------------------------" << endl;
+    return sequenceNames;
+}
+
+/////////////////////////////////////////////////////////////////
+// ProcessTree()
+//
+// Process the tree recursively. Returns the aligned sequences
+// corresponding to a node or leaf of the tree.
+/////////////////////////////////////////////////////////////////
+MultiSequence* MSA::ProcessTree(TreeNode *tree, MultiSequence *sequences,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model) {
+
+    // Leaf of the alignment tree: the result is a fresh MultiSequence
+    // holding a clone of the sequence this leaf refers to (tree->idx).
+    if (tree->leaf != NODE) {
+        MultiSequence *single = new MultiSequence();
+        assert(single);
+        single->AddSequence(sequences->GetSequence(tree->idx)->Clone());
+        return single;
+    }
+
+    // Internal node: build the alignment of each subtree (left first,
+    // then right), then align the two partial alignments together.
+    MultiSequence *leftAlignment = ProcessTree(tree->left, sequences,
+            sparseMatrices, model);
+    MultiSequence *rightAlignment = ProcessTree(tree->right, sequences,
+            sparseMatrices, model);
+    assert(leftAlignment);
+    assert(rightAlignment);
+
+    MultiSequence *merged = AlignAlignments(leftAlignment, rightAlignment,
+            sparseMatrices, model);
+    assert(merged);
+
+    // the partial alignments are owned here and no longer needed
+    delete leftAlignment;
+    delete rightAlignment;
+
+    return merged;
+}
+
+/////////////////////////////////////////////////////////////////
+// ComputeFinalAlignment()
+//
+// Compute the final alignment by calling ProcessTree(), then
+// performing iterative refinement as needed.
+/////////////////////////////////////////////////////////////////
+
+MultiSequence* MSA::ComputeFinalAlignment(MSAGuideTree*tree,
+        MultiSequence *sequences,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model) {
+    // progressive alignment along the guide tree
+    MultiSequence *alignment = ProcessTree(tree->getRoot(), sequences,
+            sparseMatrices, model);
+
+    // When alignment order was requested: remember the current sort
+    // labels, let the alignment save its ordering, and clear the flag
+    // (AlignAlignments() sorts by label whenever the flag is off).
+    SafeVector<int> savedSortLabels;
+    if (enableAlignOrder) {
+        for (int s = 0; s < alignment->GetNumSequences(); ++s)
+            savedSortLabels.push_back(
+                    alignment->GetSequence(s)->GetSortLabel());
+        alignment->SaveOrdering();
+        enableAlignOrder = false;
+    }
+
+    // Tree-based refinement (DoIterativeRefinementTreeNode) is currently
+    // disabled; only the randomized refinement below is run. Its return
+    // code is not inspected.
+    int round = 0;
+    while (round < numIterativeRefinementReps) {
+        DoIterativeRefinement(sparseMatrices, model, alignment);
+        ++round;
+    }
+    cerr << endl;
+
+    // put the remembered sort labels back, position by position
+    const int savedCount = (int) savedSortLabels.size();
+    for (int s = 0; s < savedCount; ++s)
+        alignment->GetSequence(s)->SetSortLabel(savedSortLabels[s]);
+
+    return alignment;
+}
+
+/////////////////////////////////////////////////////////////////
+// AlignAlignments()
+//
+// Returns the alignment of two MultiSequence objects.
+/////////////////////////////////////////////////////////////////
+
+MultiSequence* MSA::AlignAlignments(MultiSequence *align1,
+        MultiSequence *align2,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model) {
+
+    // verbose: print "[l1,l2,...] vs. [m1,m2,...]: " using sequence labels
+    if (enableVerbose) {
+        for (int s = 0; s < align1->GetNumSequences(); ++s)
+            cerr << (s != 0 ? "," : "[")
+                    << align1->GetSequence(s)->GetLabel();
+        cerr << "] vs. ";
+        for (int s = 0; s < align2->GetNumSequences(); ++s)
+            cerr << (s != 0 ? "," : "[")
+                    << align2->GetSequence(s)->GetLabel();
+        cerr << "]: ";
+    }
+
+    // posterior matrix between the two groups, built with the sequence
+    // weights (the unweighted BuildPosterior overload is not used here)
+    VF *groupPosterior = model.BuildPosterior(getSeqsWeights(), align1,
+            align2, sparseMatrices, cutoff);
+
+    // alignment path (first) and its score (second)
+    pair<SafeVector<char> *, float> path = model.ComputeAlignment(
+            align1->GetSequence(0)->GetLength(),
+            align2->GetSequence(0)->GetLength(), *groupPosterior);
+
+    delete groupPosterior;
+
+    if (enableVerbose) {
+        // normalise the score by the summed pairwise minimum lengths
+        int lengthSum = 0;
+        for (int a = 0; a < align1->GetNumSequences(); ++a) {
+            for (int b = 0; b < align2->GetNumSequences(); ++b) {
+                lengthSum += min(align1->GetSequence(a)->GetLength(),
+                        align2->GetSequence(b)->GetLength());
+            }
+        }
+        cerr << path.second / lengthSum << endl;
+    }
+
+    // Insert gaps along the path: group one is tagged 'X', group two 'Y'.
+    MultiSequence *merged = new MultiSequence();
+    for (int s = 0; s < align1->GetNumSequences(); ++s) {
+        merged->AddSequence(align1->GetSequence(s)->AddGaps(path.first, 'X'));
+    }
+    for (int s = 0; s < align2->GetNumSequences(); ++s) {
+        merged->AddSequence(align2->GetSequence(s)->AddGaps(path.first, 'Y'));
+    }
+    if (!enableAlignOrder) {
+        merged->SortByLabel();
+    }
+
+    // the path vector is owned by this function
+    delete path.first;
+
+    return merged;
+}
+
+/////////////////////////////////////////////////////////////////
+// DoRelaxation()
+//
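+// Performs one round of the probabilistic consistency transformation.
+// NOTE: sequence weights are passed in, but the weighted update is
+// currently commented out, so every sequence contributes equally.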
+/////////////////////////////////////////////////////////////////
+
+SafeVector<SafeVector<SparseMatrix *> > MSA::DoRelaxation(float* seqsWeights,
+ MultiSequence *sequences,
+ SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
+ const int numSeqs = sequences->GetNumSequences();
+
+ SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices(numSeqs,
+ SafeVector<SparseMatrix *>(numSeqs, NULL)); // returned by value; only entries [i][j] with i < j are filled
+
+ /* For every pair of sequences (i < j): fetch the pair's posterior matrix,
+  * add the contributions routed through every other sequence k, divide by
+  * numSeqs, zero every cell that has no entry in the original sparse
+  * matrix, and store the result as newSparseMatrices[i][j].
+  * With OpenMP the pairs come from seqsPairs/numPairs (members set
+  * elsewhere); otherwise two nested loops enumerate them. */
+#ifdef _OPENMP
+ int pairIdx;
+#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
+ for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
+ int i = seqsPairs[pairIdx].seq1;
+ int j = seqsPairs[pairIdx].seq2;
+ float wi = seqsWeights[i];
+ float wj = seqsWeights[j];
+#else
+ for (int i = 0; i < numSeqs; i++) {
+ float wi = seqsWeights[i];
+ for (int j = i + 1; j < numSeqs; j++) {
+ float wj = seqsWeights[j];
+#endif
+ Sequence *seq1 = sequences->GetSequence(i);
+ Sequence *seq2 = sequences->GetSequence(j);
+
+ if (enableVerbose) { // only this first message is wrapped in an omp critical; the later verbose outputs are not
+#ifdef _OPENMP
+#pragma omp critical
+#endif
+ cerr << "Relaxing (" << i + 1 << ") " << seq1->GetHeader()
+ << " vs. " << "(" << j + 1 << ") " << seq2->GetHeader()
+ << ": ";
+ }
+ // get the original posterior matrix as a dense vector (deleted at the end of this iteration)
+ VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior();
+ assert(posteriorPtr);
+ VF &posterior = *posteriorPtr;
+
+ const int seq1Length = seq1->GetLength();
+ const int seq2Length = seq2->GetLength();
+
+ // contribution from the summation where z = x and z = y (implemented below by doubling every cell)
+ float w = wi * wi * wj + wi * wj * wj; // w and sumW only feed the commented-out weighted variant
+ float sumW = w;
+ for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
+ //posterior[k] = w*posterior[k];
+ posterior[k] += posterior[k];
+ }
+
+ if (enableVerbose)
+ cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
+
+ // contribution from all other sequences; only matrices [a][b] with a < b are indexed, so the call depends on where k falls
+ for (int k = 0; k < numSeqs; k++) {
+ if (k != i && k != j) {
+ float wk = seqsWeights[k];
+ float w = wi * wj * wk; // shadows the outer w; passed on but not applied by Relax()/Relax1()
+ sumW += w;
+ if (k < i) // k < i < j: both matrices have k as their first sequence
+ Relax1(w, sparseMatrices[k][i], sparseMatrices[k][j],
+ posterior);
+ else if (k > i && k < j) // i < k < j: matrices [i][k] and [k][j] chain directly
+ Relax(w, sparseMatrices[i][k], sparseMatrices[k][j],
+ posterior);
+ else { // k > j: [j][k] is transposed into a temporary so it can be chained after [i][k]
+ SparseMatrix *temp =
+ sparseMatrices[j][k]->ComputeTranspose();
+ Relax(w, sparseMatrices[i][k], temp, posterior);
+ delete temp;
+ }
+ }
+ }
+ //cerr<<"sumW "<<sumW<<endl;
+ for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
+ //posterior[k] /= sumW;
+ posterior[k] /= numSeqs; // unweighted: the doubled own term plus (numSeqs - 2) third sequences
+ }
+ // mask out positions not originally in the posterior matrix
+ SparseMatrix *matXY = sparseMatrices[i][j];
+ for (int y = 0; y <= seq2Length; y++)
+ posterior[y] = 0; // row 0 is cleared entirely
+ for (int x = 1; x <= seq1Length; x++) {
+ SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
+ SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
+ VF::iterator base = posterior.begin() + x * (seq2Length + 1);
+ int curr = 0;
+ while (XYptr != XYend) {
+
+ // zero out all cells until the first filled column
+ while (curr < XYptr->first) {
+ base[curr] = 0;
+ curr++;
+ }
+
+ // now, skip over this column
+ curr++;
+ ++XYptr;
+ }
+
+ // zero out cells after last column
+ while (curr <= seq2Length) {
+ base[curr] = 0;
+ curr++;
+ }
+ }
+
+ // save the new posterior matrix
+ newSparseMatrices[i][j] = new SparseMatrix(seq1->GetLength(),
+ seq2->GetLength(), posterior);
+ newSparseMatrices[j][i] = NULL;
+
+ if (enableVerbose)
+ cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
+
+ delete posteriorPtr;
+
+ if (enableVerbose)
+ cerr << "done." << endl;
+#ifndef _OPENMP
+ }
+#endif
+ }
+
+ return newSparseMatrices;
+}
+
+/////////////////////////////////////////////////////////////////
+// Relax()
+//
+// Computes the consistency transformation for a single sequence
+// z, and adds the transformed matrix to "posterior".
+/////////////////////////////////////////////////////////////////
+
+void MSA::Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY,
+        VF &posterior) {
+
+    assert(matXZ);
+    assert(matZY);
+
+    const int rowsX = matXZ->GetSeq1Length();
+    const int colsY = matZY->GetSeq2Length();
+    assert(matXZ->GetSeq2Length() == matZY->GetSeq1Length());
+
+    // NOTE: "weight" is accepted but not applied; the weighted form of
+    // the update is disabled, so each product is added unscaled.
+    for (int x = 1; x <= rowsX; ++x) {
+
+        // row x of the dense output, laid out with (colsY + 1) columns
+        VF::iterator outRow = posterior.begin() + x * (colsY + 1);
+
+        // walk the stored entries x[x]-z[k] of row x
+        SafeVector<PIF>::iterator xz = matXZ->GetRowPtr(x);
+        const SafeVector<PIF>::iterator xzStop = xz + matXZ->GetRowSize(x);
+        for (; xz != xzStop; xz++) {
+            SafeVector<PIF>::iterator zy = matZY->GetRowPtr(xz->first);
+            const SafeVector<PIF>::iterator zyStop = zy
+                    + matZY->GetRowSize(xz->first);
+            const float probXZ = xz->second;
+
+            // for each stored z[k]-y[j], accumulate into cell (x, j)
+            for (; zy != zyStop; zy++)
+                outRow[zy->first] += probXZ * zy->second;
+        }
+    }
+}
+
+/////////////////////////////////////////////////////////////////
+// Relax1()
+//
+// Computes the consistency transformation for a single sequence
+// z, and adds the transformed matrix to "posterior".
+/////////////////////////////////////////////////////////////////
+
+void MSA::Relax1(float weight, SparseMatrix *matZX, SparseMatrix *matZY,
+        VF &posterior) {
+
+    assert(matZX);
+    assert(matZY);
+
+    const int rowsZ = matZX->GetSeq1Length();
+    const int colsY = matZY->GetSeq2Length();
+
+    // NOTE: "weight" is accepted but not applied; the weighted form of
+    // the update is disabled, so each product is added unscaled.
+    // Both inputs are indexed by z on their rows, so row k of each is
+    // combined: every z[k]-x[i] entry is paired with every z[k]-y[j].
+    for (int z = 1; z <= rowsZ; ++z) {
+        SafeVector<PIF>::iterator zx = matZX->GetRowPtr(z);
+        const SafeVector<PIF>::iterator zxStop = zx + matZX->GetRowSize(z);
+
+        for (; zx != zxStop; zx++) {
+            SafeVector<PIF>::iterator zy = matZY->GetRowPtr(z);
+            const SafeVector<PIF>::iterator zyStop = zy
+                    + matZY->GetRowSize(z);
+            const float probZX = zx->second;
+
+            // output row for x[i], laid out with (colsY + 1) columns
+            VF::iterator outRow = posterior.begin()
+                    + zx->first * (colsY + 1);
+
+            for (; zy != zyStop; zy++)
+                outRow[zy->first] += probZX * zy->second;
+        }
+    }
+}
+/////////////////////////////////////////////////////////////////
+// DoIterativeRefinement()
+//
+// Performs a single round of randomized-partitioning iterative
+// refinement.
+// return 0: alignment score changed, 1: score unchanged (ineffective
+// refinement), 2: the random split left one group empty (nothing done)
+/////////////////////////////////////////////////////////////////
+int MSA::DoIterativeRefinement(
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model, MultiSequence* &alignment) {
+    set<int> groupOne, groupTwo;
+    const int numSeqs = alignment->GetNumSequences();
+
+    // split the sequences at random: odd rand() -> group one, even -> two
+    for (int s = 0; s < numSeqs; ++s) {
+        if (rand() % 2)
+            groupOne.insert(s);
+        else
+            groupTwo.insert(s);
+    }
+    // a one-sided split cannot be realigned
+    if (groupOne.empty() || groupTwo.empty())
+        return 2;
+
+    // project the current alignment onto each group
+    MultiSequence *groupOneSeqs = alignment->Project(groupOne);
+    assert(groupOneSeqs);
+    MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
+    assert(groupTwoSeqs);
+
+    // posterior between the two groups (the overload without sequence
+    // weights is the one compiled in here)
+    VF *posterior = model.BuildPosterior (groupOneSeqs, groupTwoSeqs, sparseMatrices, cutoff);
+
+    // data iterators of the rows of the *current* alignment, per group,
+    // in increasing sequence index order
+    SafeVector<SafeVector<char>::iterator> oldOnePtrs(groupOne.size());
+    SafeVector<SafeVector<char>::iterator> oldTwoPtrs(groupTwo.size());
+    int slot = 0;
+    for (set<int>::const_iterator member = groupOne.begin();
+            member != groupOne.end(); ++member)
+        oldOnePtrs[slot++] = alignment->GetSequence(*member)->GetDataPtr();
+    slot = 0;
+    for (set<int>::const_iterator member = groupTwo.begin();
+            member != groupTwo.end(); ++member)
+        oldTwoPtrs[slot++] = alignment->GetSequence(*member)->GetDataPtr();
+
+    // Score the current alignment under the new posterior: walk its
+    // columns, track the column position inside each projected group
+    // (a group advances when at least one of its rows has a residue),
+    // and sum the posterior cells where both groups have a residue.
+    const int oldLength = alignment->GetSequence(0)->GetLength();
+    const int sizeOne = (int) groupOne.size();
+    const int sizeTwo = (int) groupTwo.size();
+    int posOne = 0;
+    int posTwo = 0;
+    float accuracy_before = 0;
+    for (int col = 1; col <= oldLength; ++col) {
+        bool foundOne = false;
+        for (int r = 0; !foundOne && r < sizeOne; ++r)
+            foundOne = (oldOnePtrs[r][col] != '-');
+        if (foundOne)
+            ++posOne;
+
+        bool foundTwo = false;
+        for (int r = 0; !foundTwo && r < sizeTwo; ++r)
+            foundTwo = (oldTwoPtrs[r][col] != '-');
+        if (foundTwo)
+            ++posTwo;
+
+        if (foundOne && foundTwo) {
+            accuracy_before += (*posterior)[posOne
+                    * (groupTwoSeqs->GetSequence(0)->GetLength() + 1)
+                    + posTwo];
+        }
+    }
+
+    // realign the two groups; first = path, second = score
+    pair<SafeVector<char> *, float> refined = model.ComputeAlignment(
+            groupOneSeqs->GetSequence(0)->GetLength(),
+            groupTwoSeqs->GetSequence(0)->GetLength(), *posterior);
+    delete posterior;
+
+    // rebuild the alignment: group one rows first ('X'), then group two ('Y')
+    MultiSequence *rebuilt = new MultiSequence();
+    for (int s = 0; s < groupOneSeqs->GetNumSequences(); ++s) {
+        rebuilt->AddSequence(
+                groupOneSeqs->GetSequence(s)->AddGaps(refined.first, 'X'));
+    }
+    for (int s = 0; s < groupTwoSeqs->GetNumSequences(); ++s) {
+        rebuilt->AddSequence(
+                groupTwoSeqs->GetSequence(s)->AddGaps(refined.first, 'Y'));
+    }
+
+    // release temporaries and hand the new alignment back to the caller
+    delete refined.first;
+    delete alignment;
+    alignment = rebuilt;
+    delete groupOneSeqs;
+    delete groupTwoSeqs;
+
+    // exact float comparison, as before: equal score means "ineffective"
+    return (accuracy_before == refined.second) ? 1 : 0;
+}
+
+
+// Realigns the sequences recorded for one guide-tree node (its left and
+// right leaves together) against all remaining sequences. Does nothing
+// if either side of that split is empty.
+void MSA::DoIterativeRefinementTreeNode(
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model, MultiSequence* &alignment,
+        int nodeIndex) {
+    const int numSeqs = alignment->GetNumSequences();
+
+    // flag every leaf listed for this node, on either side
+    vector<bool> underNode(numSeqs, false);
+    AlignmentOrder *allOrders = this->tree->getAlignOrders();
+    AlignmentOrder *nodeOrder = &allOrders[nodeIndex];
+    for (int n = 0; n < nodeOrder->leftNum; ++n) {
+        const int leaf = nodeOrder->leftLeafs[n];
+        underNode[leaf] = true;
+    }
+    for (int n = 0; n < nodeOrder->rightNum; ++n) {
+        const int leaf = nodeOrder->rightLeafs[n];
+        underNode[leaf] = true;
+    }
+
+    // flagged sequences form group one, everything else group two
+    set<int> groupOne, groupTwo;
+    for (int s = 0; s < numSeqs; ++s) {
+        if (underNode[s])
+            groupOne.insert(s);
+        else
+            groupTwo.insert(s);
+    }
+    if (groupOne.empty() || groupTwo.empty()) {
+        return;
+    }
+
+    // project into the two groups; the old alignment is replaced below
+    MultiSequence *groupOneSeqs = alignment->Project(groupOne);
+    assert(groupOneSeqs);
+    MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
+    assert(groupTwoSeqs);
+    delete alignment;
+
+    alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices,
+            model);
+
+    delete groupOneSeqs;
+    delete groupTwoSeqs;
+}
+
+/////////////////////////////////////////////////////////////////
+// WriteAnnotation()
+//
+// Computes annotation for multiple alignment and write values
+// to a file.
+/////////////////////////////////////////////////////////////////
+
+void MSA::WriteAnnotation(MultiSequence *alignment,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
+    ofstream outfile(annotationFilename.c_str());
+    if (outfile.fail()) {
+        cerr << "ERROR: Unable to write annotation file." << endl;
+        exit(1);
+    }
+
+    const int numSeqs = alignment->GetNumSequences();
+    const int alignLength = alignment->GetSequence(0)->GetLength();
+
+    // per sequence: row data iterator, sort label, residues seen so far
+    SafeVector<SafeVector<char>::iterator> rows(numSeqs);
+    SafeVector<int> sortLabels;
+    SafeVector<int> residuesSeen(numSeqs, 0);
+    for (int s = 0; s < numSeqs; ++s) {
+        rows[s] = alignment->GetSequence(s)->GetDataPtr();
+        sortLabels.push_back(alignment->GetSequence(s)->GetSortLabel());
+    }
+
+    // (sort label, residue position) of every non-gap cell in a column
+    SafeVector<pair<int, int> > active;
+    active.reserve(numSeqs);
+
+    // one score per alignment column, one per output line (width 4)
+    for (int col = 1; col <= alignLength; ++col) {
+        active.clear();
+        for (int s = 0; s < numSeqs; ++s) {
+            if (rows[s][col] == '-')
+                continue;
+            ++residuesSeen[s];
+            active.push_back(make_pair(sortLabels[s], residuesSeen[s]));
+        }
+
+        // ComputeScore() indexes the matrices by (smaller, larger) label
+        sort(active.begin(), active.end());
+        outfile << setw(4) << ComputeScore(active, sparseMatrices) << endl;
+    }
+
+    outfile.close();
+}
+
+/////////////////////////////////////////////////////////////////
+// ComputeScore()
+//
+// Computes the annotation score for a particular column.
+/////////////////////////////////////////////////////////////////
+
+int MSA::ComputeScore(const SafeVector<pair<int, int> > &active,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
+
+    // a column with fewer than two residues has no pairs to score
+    const int count = (int) active.size();
+    if (count <= 1) {
+        return 0;
+    }
+
+    // Sum the stored value of every residue pair in the column. Each
+    // entry of "active" is (sequence index, residue position); pairs are
+    // taken in the given order, first entry indexing the matrix row.
+    float total = 0;
+    for (int a = 0; a < count; ++a) {
+        const pair<int, int> &left = active[a];
+        for (int b = a + 1; b < count; ++b) {
+            const pair<int, int> &right = active[b];
+            total += sparseMatrices[left.first][right.first]->GetValue(
+                    left.second, right.second);
+        }
+    }
+
+    // 200 / (n * (n - 1)) == 100 / (number of pairs): mean scaled by 100
+    return (int) (200 * total / (count * (count - 1)));
+}
+
+/////////////////////////////////////////////////////////////////
+// ComputeSimilarity ()
+//
+// Computes the average similarity for a particular family.
+// extreme low or extreme high similarity(<=20% or >80%) return 0
+// low similarity(20%-50%) return 1
+// high similarity(50%-80%) return 2
+/////////////////////////////////////////////////////////////////
+int MSA::ComputeSimilarity (MultiSequence *sequences,const ProbabilisticModel &model){
+    assert(sequences);
+
+    // number of sequences and of unordered sequence pairs
+    const int numSeqs = sequences->GetNumSequences();
+    // Kept in a local so the final average does not depend on the
+    // numPairs member, which is only assigned in OpenMP builds.
+    // NOTE(review): with fewer than two sequences this is 0 and the
+    // average below becomes NaN - confirm callers never pass that.
+    const int totalPairs = (numSeqs - 1) * numSeqs / 2;
+
+    // sum (later: average) of the per-pair identity
+    float identity = 0;
+
+#ifdef _OPENMP
+    // enumerate the sequence pairs once so they can be distributed
+    // over threads; seqsPairs/numPairs are also read by DoRelaxation()
+    int pairIdx = 0;
+    numPairs = totalPairs;
+    seqsPairs = new SeqsPair[numPairs];
+    for(int a = 0; a < numSeqs; a++) {
+        for(int b = a + 1; b < numSeqs; b++) {
+            seqsPairs[pairIdx].seq1 = a;
+            seqsPairs[pairIdx].seq2 = b;
+            pairIdx++;
+        }
+    }
+#endif
+
+    // do all pairwise alignments for family similarity
+#ifdef _OPENMP
+    // "identity" is accumulated by every thread, so it must be a
+    // reduction variable; as a plain shared variable the += is a data race.
+#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic) reduction(+:identity)
+    for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
+        int a= seqsPairs[pairIdx].seq1;
+        int b = seqsPairs[pairIdx].seq2;
+        if(enableVerbose) {
+#pragma omp critical
+            cerr <<"tid "<<omp_get_thread_num()<<" a "<<a<<" b "<<b<<endl;
+        }
+#else
+    for (int a = 0; a < numSeqs - 1; a++) {
+        for (int b = a + 1; b < numSeqs; b++) {
+#endif
+            Sequence *seq1 = sequences->GetSequence(a);
+            Sequence *seq2 = sequences->GetSequence(b);
+            pair<SafeVector<char> *, float> alignment = model.ComputeViterbiAlignment(seq1,seq2);
+
+            // Walk the alignment path: 'B' consumes one residue of each
+            // sequence (counted as correct when they are equal), 'X'
+            // consumes one of seq1, 'Y' one of seq2. Residue data is
+            // read starting at index 1.
+            SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
+            SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
+            float N_correct_match = 0;
+            float N_alignment = 0;
+            int i = 1;int j = 1;
+            for (SafeVector<char>::iterator iter = alignment.first->begin();
+                    iter != alignment.first->end(); ++iter){
+                if (*iter == 'B'){
+                    unsigned char c1 = (unsigned char) iter1[i++];
+                    unsigned char c2 = (unsigned char) iter2[j++];
+                    if(c1==c2) N_correct_match += 1;
+                }
+                else if(*iter == 'X') i++;
+                else if(*iter == 'Y') j++;
+                N_alignment += 1;
+            }
+            // sanity check: the path must consume both sequences exactly
+            if(i!= seq1->GetLength()+1 || j!= seq2->GetLength() + 1 ) {
+#ifdef _OPENMP
+#pragma omp critical
+#endif
+                cerr << "similarity error"<< endl;
+            }
+            // identity of this pair = matches / alignment length
+            identity += N_correct_match / N_alignment;
+
+            delete alignment.first;
+#ifndef _OPENMP
+        }
+#endif
+    }
+    identity /= totalPairs;
+
+    // adaptive: pick initDistrib[2] from the identity band; above 0.7
+    // the current value is left untouched
+    if(identity <= 0.15) initDistrib[2] = 0.143854;
+    else if(identity <= 0.2) initDistrib[2] = 0.191948;
+    else if(identity <= 0.25) initDistrib[2] = 0.170705;
+    else if(identity <= 0.3) initDistrib[2] = 0.100675;
+    else if(identity <= 0.35) initDistrib[2] = 0.090755;
+    else if(identity <= 0.4) initDistrib[2] = 0.146188;
+    else if(identity <= 0.45) initDistrib[2] = 0.167858;
+    else if(identity <= 0.5) initDistrib[2] = 0.250769;
+    else if(identity <= 0.6) initDistrib[2] = 0.500829;
+    else if(identity <= 0.7) initDistrib[2] = 0.259622;
+
+    // 0: extreme (<= 0.2 or > 0.8), 1: low (0.2, 0.5], 2: otherwise
+    if( identity<= 0.2 || identity > 0.8) return 0;
+    else if(identity > 0.2 && identity<= 0.5) return 1;
+    else return 2;
+}
--- /dev/null
+
+# Object files linked into the executable.
+CXXOBJS = MSA.o MSAGuideTree.o MSAClusterTree.o MSAPartProbs.o MSAReadMatrix.o main.o
+
+# Compiler and flags. OPENMP is passed at both compile and link time
+# (via CXXFLAGS); set it to empty to build without OpenMP.
+OPENMP = -fopenmp
+CXX = g++
+COMMON_FLAGS = -O3 $(OPENMP) -Wall -funroll-loops -I . -I /usr/include
+CXXFLAGS = $(COMMON_FLAGS)
+
+# Name of the resulting binary.
+EXEC = glprobs
+
+# "all" and "clean" are actions, not files: declare them phony so a file
+# with one of these names cannot stop the rule from running.
+.PHONY: all clean
+
+# Link the objects and strip the binary. NVCCOBJS/NVCCLIBS are not
+# defined in this Makefile and expand to nothing unless set externally.
+all: $(CXXOBJS)
+	$(CXX) $(CXXFLAGS) -o $(EXEC) $(CXXOBJS) $(NVCCOBJS) $(NVCCLIBS)
+	strip $(EXEC)
+clean:
+	rm -rf *.o $(EXEC)
+
--- /dev/null
+////////////////////////////////////////////////////////////////
+// MultiSequence.h
+//
+// Utilities for reading/writing multiple sequence data.
+/////////////////////////////////////////////////////////////////
+
+#ifndef MULTISEQUENCE_H
+#define MULTISEQUENCE_H
+
+#include <cctype>
+#include <string>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <algorithm>
+#include <set>
+#include "SafeVector.h"
+#include "Sequence.h"
+#include "FileBuffer.h"
+
+#define VERSION "0.9.7"
+/////////////////////////////////////////////////////////////////
+// MultiSequence
+//
+// Class for multiple sequence alignment input/output.
+/////////////////////////////////////////////////////////////////
+
+class MultiSequence {
+
+ SafeVector<Sequence *> *sequences;
+
+public:
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::MultiSequence()
+ //
+ // Default constructor.
+ /////////////////////////////////////////////////////////////////
+
+ MultiSequence() : sequences(NULL) {
+     // no sequence list yet; it is created on first AddSequence()/load
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::MultiSequence()
+ //
+ // Constructor. Load MFA from a FileBuffer object.
+ /////////////////////////////////////////////////////////////////
+
+ MultiSequence(FileBuffer &infile) : sequences(NULL) {
+     // start empty, then read every sequence from the open buffer
+     LoadMFA(infile);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::MultiSequence()
+ //
+ // Constructor. Load MFA from a filename.
+ /////////////////////////////////////////////////////////////////
+
+ MultiSequence(const string &filename) : sequences(NULL) {
+     // start empty, then open the named file and read every sequence
+     LoadMFA(filename);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::~MultiSequence()
+ //
+ // Destructor. Gets rid of sequence objects contained in the
+ // multiple alignment.
+ /////////////////////////////////////////////////////////////////
+
+ ~MultiSequence() {
+
+     // nothing was ever allocated
+     if (!sequences)
+         return;
+
+     // release every owned Sequence, clearing each slot as we go
+     const int count = (int) sequences->size();
+     for (int s = 0; s < count; ++s) {
+         assert((*sequences)[s]);
+         delete (*sequences)[s];
+         (*sequences)[s] = NULL;
+     }
+
+     // then release the vector itself
+     delete sequences;
+     sequences = NULL;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::LoadMFA()
+ //
+ // Load MFA from a filename.
+ /////////////////////////////////////////////////////////////////
+
+ void LoadMFA(const string &filename, bool stripGaps = false) {
+
+     FileBuffer infile(filename.c_str());
+
+     // on success, delegate to the FileBuffer overload and close the file
+     if (!infile.fail()) {
+         LoadMFA(infile, stripGaps);
+         infile.close();
+         return;
+     }
+
+     // the file could not be opened: report and terminate
+     cerr << "ERROR: Could not open file '" << filename
+             << "' for reading." << endl;
+     exit(1);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::ParseMSF()
+ //
+ // Load MSF (or CLUSTALW-style) data from a FileBuffer object.
+ /////////////////////////////////////////////////////////////////
+
+ void ParseMSF(FileBuffer &infile, string header, bool stripGaps = false) {
+     // Parses MSF-like or CLUSTALW-like alignment text into "sequences".
+     //   infile    - buffer positioned just after the line given in "header"
+     //   header    - first line already read by the caller
+     //   stripGaps - when true, gap characters are not stored
+     // Terminates the program (exit(1)) on an unknown character, when no
+     // sequence is found, or when a sequence ends up with zero length.
+
+     SafeVector<SafeVector<char> *> seqData;
+     SafeVector<string> seqNames;
+     SafeVector<int> seqLengths;
+
+     istringstream in;
+     bool valid = true;
+     bool missingHeader = false;
+     bool clustalW = false;
+
+     // Skip the preamble: stop at a line containing "..", at a line
+     // starting with CLUSTAL/MSAPROBS (CLUSTALW mode), or at a "//"
+     // line that appears before any ".." line (no name header present).
+     while (!infile.eof() && header.find("..", 0) == string::npos) {
+         if (header.find("CLUSTAL", 0) == 0
+                 || header.find("MSAPROBS", 0) == 0) {
+             clustalW = true;
+             break;
+         }
+         infile.GetLine(header);
+         if (header.find("//", 0) != string::npos) {
+             missingHeader = true;
+             break;
+         }
+     }
+
+     // read until end-of-file
+     while (valid) {
+         infile.GetLine(header);
+         if (infile.eof())
+             break;
+
+         string word;
+         in.clear();
+         in.str(header);
+
+         // nothing on this line
+         if (!(in >> word))
+             continue;
+
+         // CLUSTALW: a non-indented line whose first word has not been
+         // seen yet introduces a new sequence. The cast keeps isspace()
+         // defined for bytes >= 0x80 read from the file.
+         if (clustalW && !isspace((unsigned char) header[0])
+                 && find(seqNames.begin(), seqNames.end(), word)
+                         == seqNames.end()) {
+             seqNames.push_back(word);
+             seqData.push_back(new SafeVector<char>());
+             seqLengths.push_back(0);
+             seqData[(int) seqData.size() - 1]->push_back('@');
+         }
+
+         // MSF: "Name: <label>" declares a new sequence
+         if (word == string("Name:")) {
+             if (in >> word) {
+                 seqNames.push_back(word);
+                 seqData.push_back(new SafeVector<char>());
+                 seqLengths.push_back(0);
+                 seqData[(int) seqData.size() - 1]->push_back('@');
+             } else
+                 valid = false;
+             continue;
+         }
+
+         // Otherwise the line is sequence data if its first word is a
+         // known name, or (header-less files) it starts a new sequence.
+         int index;
+         SafeVector<string>::iterator namePos = find(seqNames.begin(),
+                 seqNames.end(), word);
+         if (namePos != seqNames.end()) {
+             index = (int) (namePos - seqNames.begin());
+         } else if (missingHeader) {
+             seqNames.push_back(word);
+             seqData.push_back(new SafeVector<char>());
+             seqLengths.push_back(0);
+             seqData[(int) seqData.size() - 1]->push_back('@');
+             index = (int) seqNames.size() - 1;
+         } else {
+             continue;
+         }
+
+         // read all remaining characters on the line
+         char ch;
+         while (in >> ch) {
+             if (isspace((unsigned char) ch))
+                 continue;
+             if (ch >= 'a' && ch <= 'z')
+                 ch = ch - 'a' + 'A';
+             if (ch == '.')
+                 ch = '-';
+             if (stripGaps && ch == '-')
+                 continue;
+             if (!((ch >= 'A' && ch <= 'Z') || ch == '*' || ch == '-')) {
+                 cerr << "ERROR: Unknown character encountered: "
+                         << ch << endl;
+                 exit(1);
+             }
+
+             // everything's ok so far, so just store this character.
+             seqData[index]->push_back(ch);
+             seqLengths[index]++;
+         }
+     }
+
+     // check for errors
+     if (seqNames.size() == 0) {
+         cerr << "ERROR: No sequences read!" << endl;
+         exit(1);
+     }
+
+     // build the Sequence objects; label and sort label are the index
+     assert(!sequences);
+     sequences = new SafeVector<Sequence *>;
+     for (int i = 0; i < (int) seqNames.size(); i++) {
+         if (seqLengths[i] == 0) {
+             cerr << "ERROR: Sequence of zero length!" << endl;
+             exit(1);
+         }
+         Sequence *seq = new Sequence(seqData[i], seqNames[i], seqLengths[i],
+                 i, i);
+         sequences->push_back(seq);
+     }
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::LoadMFA()
+ //
+ // Load MFA from a FileBuffer object.
+ /////////////////////////////////////////////////////////////////
+
+ void LoadMFA(FileBuffer &infile, bool stripGaps = false) {
+     // Reads sequences from "infile" until one fails to parse. If the
+     // very first record fails and its header line does not start with
+     // '>', the input is handed to ParseMSF() instead. Terminates the
+     // program (exit(1)) on a failed buffer or when nothing was read.
+
+     // check to make sure that file reading is ok
+     if (infile.fail()) {
+         cerr << "ERROR: Error reading file." << endl;
+         exit(1);
+     }
+
+     // read all sequences
+     while (true) {
+
+         // get the sequence label as being the current # of sequences
+         // NOTE: sequence labels here are zero-based
+         int index = (!sequences) ? 0 : (int) sequences->size();
+
+         // read the sequence
+         Sequence *seq = new Sequence(infile, stripGaps);
+         if (seq->Fail()) {
+
+             // Only the first record can signal an alternative format;
+             // copy its header before the failed object is released.
+             string header;
+             if (index == 0)
+                 header = seq->GetHeader();
+
+             // the failed Sequence is never stored, so always free it
+             // (previously it leaked on the MSF fallback path)
+             delete seq;
+
+             // try MSF format, honouring the caller's stripGaps request
+             if (header.length() > 0 && header[0] != '>')
+                 ParseMSF(infile, header, stripGaps);
+             break;
+         }
+         seq->SetLabel(index);
+
+         // add the sequence to the list of current sequences
+         if (!sequences)
+             sequences = new SafeVector<Sequence *>;
+         sequences->push_back(seq);
+     }
+
+     // make sure at least one sequence was read
+     if (!sequences) {
+         cerr << "ERROR: No sequences read." << endl;
+         exit(1);
+     }
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::AddSequence()
+ //
+ // Add another sequence to an existing sequence list
+ /////////////////////////////////////////////////////////////////
+
+ void AddSequence(Sequence *sequence) {
+     // only valid, successfully constructed sequences may be added
+     assert(sequence);
+     assert(!sequence->Fail());
+
+     // create the list lazily on first use, then append
+     if (sequences == NULL) {
+         sequences = new SafeVector<Sequence *>;
+     }
+     sequences->push_back(sequence);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::RemoveSequence()
+ //
+ // Remove a sequence from the MultiSequence
+ /////////////////////////////////////////////////////////////////
+
+ void RemoveSequence(int index) {
+     assert(sequences);
+     assert(index >= 0 && index < (int) sequences->size());
+
+     // free the Sequence object first, then drop its slot from the list
+     SafeVector<Sequence *>::iterator slot = sequences->begin() + index;
+     delete (*sequences)[index];
+     sequences->erase(slot);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::WriteMFA()
+ //
+ // Write MFA to the outfile. Allows the user to specify the
+ // number of columns for the output. Also, useIndices determines
+ // whether or not the actual sequence comments will be printed
+ // out or whether the artificially assigned sequence labels will
+ // be used instead.
+ /////////////////////////////////////////////////////////////////
+
+ void WriteMFA(ostream &outfile, int numColumns = 60,
+         bool useIndices = false) {
+     // an empty MultiSequence writes nothing
+     if (sequences) {
+         // each sequence writes itself, in stored order
+         const int count = (int) sequences->size();
+         for (int s = 0; s < count; ++s) {
+             (*sequences)[s]->WriteMFA(outfile, numColumns, useIndices);
+         }
+     }
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::GetAnnotationChar()
+ //
+ // Return CLUSTALW annotation for column.
+ /////////////////////////////////////////////////////////////////
+
+ char GetAnnotationChar(SafeVector<char> &column) {
+     // Residue groups checked for the ':' annotation, in evaluation order.
+     static const char *const strongGroups[] = { "STA", "NEQK", "NHQK",
+             "NDEQ", "QHRK", "MILV", "MILF", "HY", "FYW" };
+     // Residue groups checked for the '.' annotation, in evaluation order.
+     static const char *const weakGroups[] = { "CSA", "ATV", "SAG", "STNK",
+             "STPA", "SGND", "SNDEQK", "NDEQHK", "NEHQRK", "FVLIM", "HFY" };
+     const int numStrong = (int) (sizeof(strongGroups) / sizeof(strongGroups[0]));
+     const int numWeak = (int) (sizeof(weakGroups) / sizeof(weakGroups[0]));
+
+     // Tally every character of the column, case-insensitively.
+     SafeVector<int> counts(256, 0);
+     int allChars = (int) column.size();
+     for (int i = 0; i < allChars; i++) {
+         // toupper() is only defined for values representable as
+         // unsigned char, so convert before the call (plain char may be
+         // negative).
+         counts[(unsigned char) toupper((unsigned char) column[i])]++;
+     }
+
+     // Gaps do not take part in the conservation test.
+     allChars -= counts[(unsigned char) '-'];
+
+     // A column with zero or one residue carries no conservation signal.
+     // (Without the zero case, an all-gap column would match the '*' test
+     // below through any character whose count is 0.)
+     if (allChars <= 1)
+         return ' ';
+
+     // '*': every non-gap character is the same residue.
+     for (int i = 0; i < 256; i++)
+         if (i != '-' && counts[i] == allChars)
+             return '*';
+
+     // ':': all residues fall inside one of the first set of groups.
+     for (int g = 0; g < numStrong; g++) {
+         int covered = 0;
+         for (const char *p = strongGroups[g]; *p; ++p)
+             covered += counts[(unsigned char) *p];
+         if (covered == allChars)
+             return ':';
+     }
+
+     // '.': all residues fall inside one of the second set of groups.
+     for (int g = 0; g < numWeak; g++) {
+         int covered = 0;
+         for (const char *p = weakGroups[g]; *p; ++p)
+             covered += counts[(unsigned char) *p];
+         if (covered == allChars)
+             return '.';
+     }
+
+     // No conservation class applies.
+     return ' ';
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::WriteALN()
+ //
+ // Write ALN to the outfile. Allows the user to specify the
+ // number of columns for the output.
+ /////////////////////////////////////////////////////////////////
+
+ void WriteALN(ostream &outfile, int numColumns = 60) {
+     if (!sequences)
+         return;
+
+     outfile << "CLUSTAL for GLPROBS version " << VERSION << " multiple sequence alignment" << endl;
+
+     // Cache per-sequence data pointers and lengths, and find the widest
+     // sequence name so that all rows can be padded to a common column.
+     const int numSeqs = GetNumSequences();
+     int labelWidth = 0;
+     SafeVector<SafeVector<char>::iterator> ptrs(numSeqs);
+     SafeVector<int> lengths(numSeqs);
+     for (int s = 0; s < numSeqs; s++) {
+         ptrs[s] = GetSequence(s)->GetDataPtr();
+         lengths[s] = GetSequence(s)->GetLength();
+         const int nameLen = (int) GetSequence(s)->GetName().length();
+         if (nameLen > labelWidth)
+             labelWidth = nameLen;
+     }
+     labelWidth += 4; // gutter between the name and the residues
+
+     // Emit the alignment in blocks of numColumns residues per row.
+     int offset = 0;
+     bool morePending = true;
+     while (morePending) {
+         outfile << endl;
+         morePending = false;
+
+         for (int s = 0; s < numSeqs; s++) {
+             // Sequences already written in full are skipped entirely.
+             if (offset >= lengths[s])
+                 continue;
+
+             // Name, padded with spaces up to labelWidth.
+             outfile << GetSequence(s)->GetName();
+             for (int pad = labelWidth
+                     - (int) GetSequence(s)->GetName().length(); pad > 0;
+                     pad--)
+                 outfile << ' ';
+
+             // Residues of this block; sequence data is 1-based.
+             const int stop =
+                     (offset + numColumns < lengths[s]) ?
+                             offset + numColumns : lengths[s];
+             for (int pos = offset; pos < stop; pos++)
+                 outfile << ptrs[s][pos + 1];
+
+             outfile << endl;
+
+             // Another block is needed if this sequence extends further.
+             if (offset + numColumns < lengths[s])
+                 morePending = true;
+         }
+
+         // The annotation line is currently disabled:
+/*
+         for (int j = 0; j < labelWidth; j++)
+             outfile << ' ';
+
+         for (int j = 0; j < numColumns; j++) {
+             SafeVector<char> column;
+
+             for (int s = 0; s < numSeqs; s++)
+                 if (offset + j < lengths[s])
+                     column.push_back(ptrs[s][offset + j + 1]);
+
+             if (column.size() > 0)
+                 outfile << GetAnnotationChar(column);
+         }
+*/
+         outfile << endl;
+         offset += numColumns;
+     }
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::GetSequence()
+ //
+ // Retrieve a sequence from the MultiSequence object.
+ /////////////////////////////////////////////////////////////////
+
+ Sequence* GetSequence(int i) {
+     // The list must exist and i must be a valid position in it.
+     assert(sequences);
+     assert(i >= 0 && i < (int) sequences->size());
+
+     SafeVector<Sequence *> &list = *sequences;
+     return list[i];
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::GetSequence()
+ //
+ // Retrieve a sequence from the MultiSequence object
+ // (const version).
+ /////////////////////////////////////////////////////////////////
+
+ const Sequence* GetSequence(int i) const {
+     // The list must exist and i must be a valid position in it.
+     assert(sequences);
+     assert(i >= 0 && i < (int) sequences->size());
+
+     // The element is handed back as pointer-to-const.
+     SafeVector<Sequence *> &list = *sequences;
+     return list[i];
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::GetNumSequences()
+ //
+ // Returns the number of sequences in the MultiSequence.
+ /////////////////////////////////////////////////////////////////
+
+ int GetNumSequences() const {
+     // An unallocated list is reported as empty.
+     return sequences ? (int) sequences->size() : 0;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::SortByHeader()
+ //
+ // Organizes the sequences according to their sequence headers
+ // in ascending order.
+ /////////////////////////////////////////////////////////////////
+
+ void SortByHeader() {
+     assert(sequences);
+
+     SafeVector<Sequence *> &list = *sequences;
+     const int count = (int) list.size();
+
+     // Simple O(n^2) exchange sort: after the pass for position `lo`,
+     // list[lo] holds the smallest header among positions lo..count-1.
+     for (int lo = 0; lo < count - 1; lo++) {
+         for (int hi = lo + 1; hi < count; hi++) {
+             if (list[lo]->GetHeader() > list[hi]->GetHeader())
+                 swap(list[lo], list[hi]);
+         }
+     }
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::SortByLabel()
+ //
+ // Organizes the sequences according to their sequence labels
+ // in ascending order.
+ /////////////////////////////////////////////////////////////////
+
+ void SortByLabel() {
+     assert(sequences);
+
+     SafeVector<Sequence *> &list = *sequences;
+     const int count = (int) list.size();
+
+     // Simple O(n^2) exchange sort: after the pass for position `lo`,
+     // list[lo] holds the smallest sort label among positions lo..count-1.
+     for (int lo = 0; lo < count - 1; lo++) {
+         for (int hi = lo + 1; hi < count; hi++) {
+             if (list[lo]->GetSortLabel() > list[hi]->GetSortLabel())
+                 swap(list[lo], list[hi]);
+         }
+     }
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::SaveOrdering()
+ //
+ // Relabels sequences so as to preserve the current ordering.
+ /////////////////////////////////////////////////////////////////
+
+ void SaveOrdering() {
+     assert(sequences);
+
+     // Stamp every sequence with its current position so that a later
+     // SortByLabel() reproduces today's ordering.
+     int position = 0;
+     for (SafeVector<Sequence *>::iterator it = sequences->begin();
+             it != sequences->end(); ++it, ++position)
+         (*it)->SetSortLabel(position);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::Project()
+ //
+ // Given a set of indices, extract all sequences from the current
+ // MultiSequence object whose index is included in the set.
+ // Then, project the multiple alignments down to the desired
+ // subset, and return the projection as a new MultiSequence
+ // object.
+ /////////////////////////////////////////////////////////////////
+
+ MultiSequence *Project(const set<int> &indices) {
+     assert(indices.size() != 0);
+     const int numSelected = (int) indices.size();
+
+     // oldPtrs: 1-based data of the selected rows in this alignment.
+     // newPtrs: freshly allocated rows of the projected alignment.
+     SafeVector<SafeVector<char>::iterator> oldPtrs(numSelected);
+     SafeVector<SafeVector<char> *> newPtrs(numSelected);
+
+     int slot = 0;
+     for (set<int>::const_iterator pick = indices.begin();
+             pick != indices.end(); ++pick, ++slot) {
+         oldPtrs[slot] = GetSequence(*pick)->GetDataPtr();
+
+         // Each new row starts with the '@' placeholder in position 0.
+         newPtrs[slot] = new SafeVector<char>();
+         assert(newPtrs[slot]);
+         newPtrs[slot]->push_back('@');
+     }
+
+     // Walk the columns once: a column is kept when at least one selected
+     // row has a non-gap character in it. The alignment length is taken
+     // from the first selected sequence.
+     const int oldLength = GetSequence(*indices.begin())->GetLength();
+     int newLength = 0;
+     for (int col = 1; col <= oldLength; col++) {
+         bool keep = false;
+         for (int row = 0; row < numSelected && !keep; row++)
+             keep = (oldPtrs[row][col] != '-');
+
+         if (!keep)
+             continue; // gap in every selected row: drop the column
+
+         newLength++;
+         for (int row = 0; row < numSelected; row++)
+             newPtrs[row]->push_back(oldPtrs[row][col]);
+     }
+
+     // Wrap the new rows in Sequence objects that carry over the header,
+     // sort label and label of their source sequences.
+     MultiSequence *projection = new MultiSequence();
+     slot = 0;
+     for (set<int>::const_iterator pick = indices.begin();
+             pick != indices.end(); ++pick, ++slot) {
+         Sequence *source = GetSequence(*pick);
+         projection->AddSequence(
+                 new Sequence(newPtrs[slot], source->GetHeader(), newLength,
+                         source->GetSortLabel(), source->GetLabel()));
+     }
+
+     return projection;
+ }
+};
+
+#endif
--- /dev/null
+/////////////////////////////////////////////////////////////////\r
+// ProbabilisticModel.h\r
+//\r
+// Routines for (1) posterior probability computations\r
+// (2) chained anchoring\r
+// (3) maximum weight trace alignment\r
+/////////////////////////////////////////////////////////////////\r
+\r
+#ifndef PROBABILISTICMODEL_H\r
+#define PROBABILISTICMODEL_H\r
+\r
+#include <list>\r
+#include <cmath>\r
+#include <cstdio>\r
+#include "SafeVector.h"\r
+#include "ScoreType.h"\r
+#include "SparseMatrix.h"\r
+#include "MultiSequence.h"\r
+\r
+using namespace std;\r
+\r
+// Number of match states. Fixed at 1 in this version; the original note\r
+// says this will change in future versions.\r
+const int NumMatchStates = 1;\r
+// Number of insert-state types. Each one contributes two matrices, which\r
+// the forward/backward code below indexes as 2*k+1 and 2*k+2.\r
+const int NumInsertStates = 2;\r
+// Total number of matrices stored per DP cell: the match state(s) plus two\r
+// per insert-state type.\r
+const int NumMatrixTypes = NumMatchStates + NumInsertStates * 2;\r
+\r
+/////////////////////////////////////////////////////////////////\r
+// ProbabilisticModel\r
+//\r
+// Class for storing the parameters of a probabilistic model and\r
+// performing different computations based on those parameters.\r
+// In particular, this class handles the computation of\r
+// posterior probabilities that may be used in alignment.\r
+/////////////////////////////////////////////////////////////////\r
+\r
+class ProbabilisticModel {\r
+\r
+ float initialDistribution[NumMatrixTypes]; // holds the initial probabilities for each state\r
+ float transProb[NumMatrixTypes][NumMatrixTypes]; // holds all state-to-state transition probabilities\r
+ float matchProb[256][256]; // emission probabilities for match states\r
+ float insProb[256][NumMatrixTypes]; // emission probabilities for insert states\r
+ float local_transProb[3][3];\r
+ float random_transProb[2];\r
+\r
+ public:\r
+\r
+ /////////////////////////////////////////////////////////////////\r
+ // ProbabilisticModel::ProbabilisticModel()\r
+ //\r
+ // Constructor. Builds a new probabilistic model using the\r
+ // given parameters.\r
+ /////////////////////////////////////////////////////////////////\r
+\r
+ ProbabilisticModel (const VF &initDistribMat, const VF &gapOpen, const VF &gapExtend,\r
+                     const VVF &emitPairs, const VF &emitSingle){\r
+\r
+   // ---------------- ProbCons model ----------------\r
+   // State 0 is the match state. Insert-state type k owns the two states\r
+   // 2k+1 and 2k+2; both are parameterised by gapOpen[2k] / gapExtend[2k].\r
+   VVF transMat (NumMatrixTypes, VF (NumMatrixTypes, 0.0f));\r
+   transMat[0][0] = 1;\r
+   for (int k = 0; k < NumInsertStates; k++){\r
+     const int insX  = 2*k+1;\r
+     const int insY  = 2*k+2;\r
+     const int param = 2*k;\r
+\r
+     // match -> insert, with the remainder staying in match\r
+     transMat[0][insX] = gapOpen[param];\r
+     transMat[0][insY] = gapOpen[param];\r
+     transMat[0][0] -= (gapOpen[param] + gapOpen[param]);\r
+     assert (transMat[0][0] > 0);\r
+\r
+     // insert -> same insert (extension); no direct X <-> Y transition\r
+     transMat[insX][insX] = gapExtend[param];\r
+     transMat[insY][insY] = gapExtend[param];\r
+     transMat[insX][insY] = 0;\r
+     transMat[insY][insX] = 0;\r
+\r
+     // insert -> match\r
+     transMat[insX][0] = 1 - gapExtend[param];\r
+     transMat[insY][0] = 1 - gapExtend[param];\r
+   }\r
+\r
+   // store initial and transition probabilities in log space\r
+   for (int from = 0; from < NumMatrixTypes; from++){\r
+     initialDistribution[from] = LOG (initDistribMat[from]);\r
+     for (int to = 0; to < NumMatrixTypes; to++)\r
+       transProb[from][to] = LOG (transMat[from][to]);\r
+   }\r
+   // initDistribMat[2] is reused further down as the local model's sigma,\r
+   // so because of the local model parameters' initialization,\r
+   // initialDistribution[2] has to be corrected to use entry 1 instead.\r
+   initialDistribution[2] = LOG (initDistribMat[1]);\r
+\r
+   // store insertion and match emission probabilities in log space\r
+   for (int a = 0; a < 256; a++){\r
+     for (int s = 0; s < NumMatrixTypes; s++)\r
+       insProb[a][s] = LOG (emitSingle[a]);\r
+     for (int b = 0; b < 256; b++)\r
+       matchProb[a][b] = LOG (emitPairs[a][b]);\r
+   }\r
+\r
+   // ---------------- Local model ----------------\r
+   // Three states (0 = match, 1 and 2 = inserts), all driven by\r
+   // gapOpen[1] / gapExtend[1].\r
+   VVF ltransMat (3, VF (3, 0.0f));\r
+   ltransMat[0][0] = 1;\r
+\r
+   ltransMat[0][1] = gapOpen[1];\r
+   ltransMat[0][2] = gapOpen[1];\r
+   ltransMat[0][0] -= (gapOpen[1] + gapOpen[1]);\r
+   assert (ltransMat[0][0] > 0);\r
+   ltransMat[1][1] = gapExtend[1];\r
+   ltransMat[2][2] = gapExtend[1];\r
+   ltransMat[1][2] = 0;\r
+   ltransMat[2][1] = 0;\r
+   ltransMat[1][0] = 1 - gapExtend[1];\r
+   ltransMat[2][0] = 1 - gapExtend[1];\r
+\r
+   // store the local transition probabilities in log space\r
+   for (int from = 0; from < 3; from++){\r
+     for (int to = 0; to < 3; to++)\r
+       local_transProb[from][to] = LOG (ltransMat[from][to]);\r
+   }\r
+\r
+   // random-model transition terms, taken from initDistribMat[2]\r
+   random_transProb[0] = LOG (initDistribMat[2]);//sigma\r
+   random_transProb[1] = LOG (1-initDistribMat[2]);//1-sigma\r
+\r
+ }\r
+\r
+ /////////////////////////////////////////////////////////////////\r
+ // ProbabilisticModel::ComputeForwardMatrix()\r
+ //\r
+ // Computes a set of forward probability matrices for aligning\r
+ // seq1 and seq2.\r
+ //\r
+ // For efficiency reasons, a single-dimensional floating-point\r
+ // array is used here, with the following indexing scheme:\r
+ //\r
+ // forward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]\r
+ // refers to the probability of aligning through j characters\r
+ // of the first sequence, k characters of the second sequence,\r
+ // and ending in state i.\r
+ // flag: 1 probcons, 0 local\r
+ /////////////////////////////////////////////////////////////////\r
+\r
+ // Parameters:\r
+ //   seq1, seq2 - sequences to align; must be non-null (asserted).\r
+ //   flag       - true: ProbCons model, NumMatrixTypes states per cell;\r
+ //                false: local model, 3 states per cell.\r
+ // Returns a VF allocated with new that holds (seq1Length+1)*(seq2Length+1)\r
+ // cells of log-space scores; presumably the caller is expected to delete it.\r
+ // NOTE(review): the ProbCons initialization reads iter1[1]/iter2[1] and\r
+ // writes cell (1,1), so it appears to assume both sequences are non-empty.\r
+ VF *ComputeForwardMatrix (Sequence *seq1, Sequence *seq2, bool flag=true) const {\r
+\r
+ assert (seq1);\r
+ assert (seq2);\r
+\r
+ const int seq1Length = seq1->GetLength();\r
+ const int seq2Length = seq2->GetLength();\r
+\r
+ // retrieve the points to the beginning of each sequence\r
+ // (sequence data is addressed 1-based below: iter[1] is the first character)\r
+ SafeVector<char>::iterator iter1 = seq1->GetDataPtr();\r
+ SafeVector<char>::iterator iter2 = seq2->GetDataPtr();\r
+\r
+ // create matrix\r
+ // (every entry starts at LOG_ZERO; the number of states per cell depends on flag)\r
+ VF *forwardPtr;\r
+ if(flag) forwardPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO);\r
+ else forwardPtr = new VF (3 * (seq1Length+1) * (seq2Length+1), LOG_ZERO);\r
+ assert (forwardPtr);\r
+ VF &forward = *forwardPtr;\r
+\r
+ // initialization condition\r
+ // (ProbCons only: seed the match state at cell (1,1), the seq1-insert\r
+ // states at cell (1,0) and the seq2-insert states at cell (0,1))\r
+ if(flag){\r
+ forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] = \r
+ initialDistribution[0] + matchProb[(unsigned char) iter1[1]][(unsigned char) iter2[1]];\r
+ \r
+ for (int k = 0; k < NumInsertStates; k++){\r
+ forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] = \r
+ initialDistribution[2*k+1] + insProb[(unsigned char) iter1[1]][k];\r
+ forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] = \r
+ initialDistribution[2*k+2] + insProb[(unsigned char) iter2[1]][k]; \r
+ }\r
+ }\r
+ \r
+ // remember offset for each index combination\r
+ // ij   -> cell (i,   j)\r
+ // i1j  -> cell (i-1, j)\r
+ // ij1  -> cell (i,   j-1)\r
+ // i1j1 -> cell (i-1, j-1)\r
+ // The neighbour offsets start out negative; they are only dereferenced\r
+ // under the i > 0 / j > 0 guards below.\r
+ int ij = 0;\r
+ int i1j = -seq2Length - 1;\r
+ int ij1 = -1;\r
+ int i1j1 = -seq2Length - 2;\r
+ \r
+ // scale the cell offsets by the number of states stored per cell\r
+ if(flag){\r
+ ij *= NumMatrixTypes;\r
+ i1j *= NumMatrixTypes;\r
+ ij1 *= NumMatrixTypes;\r
+ i1j1 *= NumMatrixTypes;\r
+ }\r
+ else{\r
+ ij *= 3;\r
+ i1j *= 3;\r
+ ij1 *= 3;\r
+ i1j1 *= 3;\r
+ }\r
+\r
+ // compute forward scores\r
+ for (int i = 0; i <= seq1Length; i++){\r
+ // '~' is a placeholder for row/column 0; it is only used in emission\r
+ // lookups that sit behind the i > 0 / j > 0 guards\r
+ unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];\r
+ for (int j = 0; j <= seq2Length; j++){\r
+ unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];\r
+ //local\r
+ // (local model seed at cell (1,1): match emission minus both insert\r
+ // emissions and minus twice random_transProb[1])\r
+ if(i == 1 && j == 1 && !flag) forward[0 + ij] = \r
+ matchProb[c1][c2] - insProb[c1][0] - insProb[c2][0] - 2*random_transProb[1];\r
+\r
+ // cells (0,0), (0,1), (1,0) and (1,1) are not recomputed here; they keep\r
+ // their initial values\r
+ if (i > 1 || j > 1){\r
+ // match state: arrive diagonally from cell (i-1, j-1)\r
+ if (i > 0 && j > 0){\r
+ if(flag){\r
+ forward[0 + ij] = forward[0 + i1j1] + transProb[0][0];\r
+ for (int k = 1; k < NumMatrixTypes; k++)\r
+ LOG_PLUS_EQUALS (forward[0 + ij], forward[k + i1j1] + transProb[k][0]);\r
+ forward[0 + ij] += matchProb[c1][c2];\r
+ }\r
+ //local\r
+ // (the first term lets an alignment start at this cell; the loop then\r
+ // adds the continuations from each of the 3 states of cell (i-1, j-1))\r
+ else{\r
+ forward[0 + ij] = matchProb[c1][c2] - insProb[c1][0] - insProb[c2][0] - 2*random_transProb[1];\r
+ for (int k = 0; k < 3; k++)\r
+ LOG_PLUS_EQUALS (forward[0 + ij], matchProb[c1][c2] - insProb[c1][0] - insProb[c2][0] +\r
+ forward[k + i1j1] + local_transProb[k][0] - 2*random_transProb[1]);\r
+ }\r
+ }\r
+ // insert states consuming a character of seq1: arrive from cell (i-1, j),\r
+ // either opening from match or extending the same insert state\r
+ if (i > 0){\r
+ if(flag){\r
+ for (int k = 0; k < NumInsertStates; k++)\r
+ forward[2*k+1 + ij] = insProb[c1][k] +\r
+ LOG_ADD (forward[0 + i1j] + transProb[0][2*k+1],\r
+ forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1]);\r
+ }\r
+\r
+\r
+\r
+\r
+\r
+\r
+ //local\r
+ else{\r
+ forward[1 + ij] = LOG_ADD (forward[0 + i1j] + local_transProb[0][1] - random_transProb[1],\r
+ forward[1 + i1j] + local_transProb[1][1] - random_transProb[1]);\r
+ }\r
+\r
+ }\r
+ // insert states consuming a character of seq2: arrive from cell (i, j-1)\r
+ if (j > 0){\r
+ if(flag){\r
+ for (int k = 0; k < NumInsertStates; k++)\r
+ forward[2*k+2 + ij] = insProb[c2][k] +\r
+ LOG_ADD (forward[0 + ij1] + transProb[0][2*k+2],\r
+ forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2]);\r
+ }\r
+ //local\r
+ else{\r
+ forward[2 + ij] = LOG_ADD (forward[0 + ij1] + local_transProb[0][2] - random_transProb[1],\r
+ forward[2 + ij1] + local_transProb[2][2] - random_transProb[1]);\r
+ }\r
+ }\r
+ }\r
+ // advance all four offsets to the next cell in row-major order\r
+ if(flag){\r
+ ij += NumMatrixTypes;\r
+ i1j += NumMatrixTypes;\r
+ ij1 += NumMatrixTypes;\r
+ i1j1 += NumMatrixTypes;\r
+ }\r
+ else{\r
+ ij += 3;\r
+ i1j += 3;\r
+ ij1 += 3;\r
+ i1j1 += 3;\r
+ }\r
+ }\r
+ }\r
+\r
+ return forwardPtr;\r
+ }\r
+\r
+ /////////////////////////////////////////////////////////////////\r
+ // ProbabilisticModel::ComputeBackwardMatrix()\r
+ //\r
+ // Computes a set of backward probability matrices for aligning\r
+ // seq1 and seq2.\r
+ //\r
+ // For efficiency reasons, a single-dimensional floating-point\r
+ // array is used here, with the following indexing scheme:\r
+ //\r
+ // backward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]\r
+ // refers to the probability of starting in state i and\r
+ // aligning from character j+1 to the end of the first\r
+ // sequence and from character k+1 to the end of the second\r
+ // sequence.\r
+ /////////////////////////////////////////////////////////////////\r
+\r
+ // Parameters:\r
+ //   seq1, seq2 - sequences to align; must be non-null (asserted).\r
+ //   flag       - true: ProbCons model, NumMatrixTypes states per cell;\r
+ //                false: local model, 3 states per cell.\r
+ // Returns a VF allocated with new that holds (seq1Length+1)*(seq2Length+1)\r
+ // cells of log-space scores; presumably the caller is expected to delete it.\r
+ VF *ComputeBackwardMatrix (Sequence *seq1, Sequence *seq2, bool flag=true) const {\r
+\r
+ assert (seq1);\r
+ assert (seq2);\r
+\r
+ const int seq1Length = seq1->GetLength();\r
+ const int seq2Length = seq2->GetLength();\r
+ SafeVector<char>::iterator iter1 = seq1->GetDataPtr();\r
+ SafeVector<char>::iterator iter2 = seq2->GetDataPtr();\r
+\r
+ // create matrix\r
+ // (every entry starts at LOG_ZERO; the number of states per cell depends on flag)\r
+ VF *backwardPtr;\r
+ if(flag) backwardPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO);\r
+ else backwardPtr = new VF (3 * (seq1Length+1) * (seq2Length+1), LOG_ZERO);\r
+ assert (backwardPtr);\r
+ VF &backward = *backwardPtr;\r
+\r
+ // initialization condition\r
+ // (ProbCons only: the last cell is seeded with initialDistribution[k]\r
+ // for every state k)\r
+ if(flag){\r
+ for (int k = 0; k < NumMatrixTypes; k++)\r
+ backward[NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1) + k] = initialDistribution[k];\r
+ }\r
+ // remember offset for each index combination\r
+ // ij   -> cell (i,   j), starting at the last cell\r
+ // i1j  -> cell (i+1, j)\r
+ // ij1  -> cell (i,   j+1)\r
+ // i1j1 -> cell (i+1, j+1)\r
+ // The neighbour offsets start out past the end of the matrix; they are\r
+ // only dereferenced under the i < seq1Length / j < seq2Length guards below.\r
+ int ij = (seq1Length+1) * (seq2Length+1) - 1;\r
+ int i1j = ij + seq2Length + 1;\r
+ int ij1 = ij + 1;\r
+ int i1j1 = ij + seq2Length + 2;\r
+ \r
+ // scale the cell offsets by the number of states stored per cell\r
+ if(flag){\r
+ ij *= NumMatrixTypes;\r
+ i1j *= NumMatrixTypes;\r
+ ij1 *= NumMatrixTypes;\r
+ i1j1 *= NumMatrixTypes;\r
+ }\r
+ else{\r
+ ij *= 3;\r
+ i1j *= 3;\r
+ ij1 *= 3;\r
+ i1j1 *= 3; \r
+ }\r
+\r
+ // compute backward scores\r
+ for (int i = seq1Length; i >= 0; i--){\r
+ // c1/c2 are the NEXT characters (positions i+1 / j+1); '~' is a\r
+ // placeholder past the end of a sequence and is only used in emission\r
+ // lookups that sit behind the guards below\r
+ unsigned char c1 = (i == seq1Length) ? '~' : (unsigned char) iter1[i+1];\r
+ for (int j = seq2Length; j >= 0; j--){\r
+ unsigned char c2 = (j == seq2Length) ? '~' : (unsigned char) iter2[j+1];\r
+ \r
+ // local model: the match state of every cell starts from LOG_ONE\r
+ // before the continuation terms below are added\r
+ if(!flag) backward[0 + ij] = LOG_ONE;//local\r
+ // continue with a match that consumes one character of each sequence\r
+ if (i < seq1Length && j < seq2Length){\r
+ if(flag){\r
+ const float ProbXY = backward[0 + i1j1] + matchProb[c1][c2];\r
+ for (int k = 0; k < NumMatrixTypes; k++)\r
+ LOG_PLUS_EQUALS (backward[k + ij], ProbXY + transProb[k][0]);\r
+ }\r
+ //local\r
+ else{\r
+ const float ProbXY = backward[0 + i1j1] + matchProb[c1][c2] - insProb[c1][0] - insProb[c2][0];\r
+ for (int k = 0; k < 3; k++)\r
+ LOG_PLUS_EQUALS (backward[k + ij], ProbXY + local_transProb[k][0] - 2*random_transProb[1] );\r
+ }\r
+ }\r
+ // continue with an insert that consumes the next character of seq1,\r
+ // either opened from match or extended from the same insert state\r
+ if (i < seq1Length){\r
+ if(flag){\r
+ for (int k = 0; k < NumInsertStates; k++){\r
+ LOG_PLUS_EQUALS (backward[0 + ij], backward[2*k+1 + i1j] + insProb[c1][k] + transProb[0][2*k+1]);\r
+ LOG_PLUS_EQUALS (backward[2*k+1 + ij], backward[2*k+1 + i1j] + insProb[c1][k] + transProb[2*k+1][2*k+1]);\r
+ }\r
+ }\r
+ //local\r
+ else{\r
+ LOG_PLUS_EQUALS (backward[0 + ij], backward[1 + i1j] + local_transProb[0][1] - random_transProb[1]);\r
+ LOG_PLUS_EQUALS (backward[1 + ij], backward[1 + i1j] + local_transProb[1][1] - random_transProb[1]);\r
+ }\r
+ }\r
+ // continue with an insert that consumes the next character of seq2\r
+ if (j < seq2Length){\r
+ if(flag){\r
+ for (int k = 0; k < NumInsertStates; k++){\r
+ LOG_PLUS_EQUALS (backward[0 + ij], backward[2*k+2 + ij1] + insProb[c2][k] + transProb[0][2*k+2]);\r
+ LOG_PLUS_EQUALS (backward[2*k+2 + ij], backward[2*k+2 + ij1] + insProb[c2][k] + transProb[2*k+2][2*k+2]);\r
+ }\r
+ }\r
+ //local \r
+ else{\r
+ LOG_PLUS_EQUALS (backward[0 + ij], backward[2 + ij1] + local_transProb[0][2] - random_transProb[1]);\r
+ LOG_PLUS_EQUALS (backward[2 + ij], backward[2 + ij1] + local_transProb[2][2] - random_transProb[1]);\r
+ }\r
+ }\r
+ // step all four offsets back to the previous cell in row-major order\r
+ if(flag){\r
+ ij -= NumMatrixTypes;\r
+ i1j -= NumMatrixTypes;\r
+ ij1 -= NumMatrixTypes;\r
+ i1j1 -= NumMatrixTypes;\r
+ }\r
+ else{\r
+ ij -= 3;\r
+ i1j -= 3;\r
+ ij1 -= 3;\r
+ i1j1 -= 3;\r
+ }\r
+ }\r
+ }\r
+\r
+ return backwardPtr;\r
+ }\r
+\r
+ /////////////////////////////////////////////////////////////////\r
+ // ProbabilisticModel::ComputeTotalProbability()\r
+ //\r
+ // Computes the total probability of an alignment given\r
+ // the forward and backward matrices.\r
+ // flag: 1 probcons, 0 local\r
+ /////////////////////////////////////////////////////////////////\r
+\r
+ float ComputeTotalProbability (Sequence *seq1, Sequence *seq2,\r
+                                const VF &forward, const VF &backward, bool flag=true) const {\r
+\r
+   // Two log-space estimates of the same total are accumulated and then\r
+   // averaged on return.\r
+   float totalForwardProb = LOG_ZERO;\r
+   float totalBackwardProb = LOG_ZERO;\r
+   const int seq1Length = seq1->GetLength();\r
+   const int seq2Length = seq2->GetLength();\r
+   const int rowStride = seq2Length + 1;\r
+\r
+   if (flag){\r
+     // ProbCons model: offsets of the cells that take part in the sums.\r
+     const int lastCell = NumMatrixTypes * ((seq1Length+1) * rowStride - 1);\r
+     const int cell11   = NumMatrixTypes * (1 * rowStride + 1);\r
+     const int cell10   = NumMatrixTypes * (1 * rowStride + 0);\r
+     const int cell01   = NumMatrixTypes * (0 * rowStride + 1);\r
+\r
+     // estimate 1: every state at the final cell\r
+     for (int k = 0; k < NumMatrixTypes; k++){\r
+       LOG_PLUS_EQUALS (totalForwardProb, forward[k + lastCell] + backward[k + lastCell]);\r
+     }\r
+\r
+     // estimate 2: the three possible starting cells\r
+     totalBackwardProb = forward[0 + cell11] + backward[0 + cell11];\r
+     for (int k = 0; k < NumInsertStates; k++){\r
+       LOG_PLUS_EQUALS (totalBackwardProb,\r
+                        forward[2*k+1 + cell10] + backward[2*k+1 + cell10]);\r
+       LOG_PLUS_EQUALS (totalBackwardProb,\r
+                        forward[2*k+2 + cell01] + backward[2*k+2 + cell01]);\r
+     }\r
+   }\r
+   else{\r
+     // Local model: sum over the match state of every interior cell,\r
+     // visited in row-major order.\r
+     SafeVector<char>::iterator iter1 = seq1->GetDataPtr();\r
+     SafeVector<char>::iterator iter2 = seq2->GetDataPtr();\r
+     for (int i = 1; i <= seq1Length; i++){\r
+       const unsigned char c1 = (unsigned char) iter1[i];\r
+       for (int j = 1; j <= seq2Length; j++){\r
+         const unsigned char c2 = (unsigned char) iter2[j];\r
+         const int cell = 3 * (i * rowStride + j);\r
+\r
+         LOG_PLUS_EQUALS (totalForwardProb, forward[cell]);\r
+         LOG_PLUS_EQUALS (totalBackwardProb, backward[cell] + matchProb[c1][c2]\r
+                          - insProb[c1][0] - insProb[c2][0] - 2*random_transProb[1]);\r
+       }\r
+     }\r
+   }\r
+\r
+   return (totalForwardProb + totalBackwardProb) / 2;\r
+ }\r
+\r
+ /////////////////////////////////////////////////////////////////\r
+ // ProbabilisticModel::ComputePosteriorMatrix()\r
+ //\r
+ // Computes the posterior probability matrix based on\r
+ // the forward and backward matrices.\r
+ // flag: 1 probcons, 0 local \r
+ /////////////////////////////////////////////////////////////////\r
+\r
+ VF *ComputePosteriorMatrix (Sequence *seq1, Sequence *seq2,\r
+                             const VF &forward, const VF &backward, bool flag=true) const {\r
+\r
+   assert (seq1);\r
+   assert (seq2);\r
+\r
+   // One posterior value per DP cell; the forward/backward matrices keep\r
+   // several states per cell, of which only state 0 (match) is read here.\r
+   const int numCells = (seq1->GetLength()+1) * (seq2->GetLength()+1);\r
+   const int statesPerCell = flag ? NumMatrixTypes : 3;\r
+\r
+   const float totalProb = ComputeTotalProbability (seq1, seq2, forward, backward, flag);\r
+\r
+   VF *posteriorPtr = new VF(numCells); assert (posteriorPtr);\r
+   VF &posterior = *posteriorPtr;\r
+\r
+   // posterior = exp(forward + backward - total), with the log value capped\r
+   // at LOG_ONE before exponentiation\r
+   for (int cell = 0, src = 0; cell < numCells; cell++, src += statesPerCell)\r
+     posterior[cell] = EXP (min (LOG_ONE, forward[src] + backward[src] - totalProb));\r
+\r
+   // cell (0,0) is explicitly zeroed\r
+   posterior[0] = 0;\r
+\r
+   return posteriorPtr;\r
+ }\r
+\r
+ /*\r
+ /////////////////////////////////////////////////////////////////\r
+ // ProbabilisticModel::ComputeExpectedCounts()\r
+ //\r
+ // Computes the expected counts for the various transitions.\r
+ /////////////////////////////////////////////////////////////////\r
+\r
+ VVF *ComputeExpectedCounts () const {\r
+\r
+ assert (seq1);\r
+ assert (seq2);\r
+\r
+ const int seq1Length = seq1->GetLength();\r
+ const int seq2Length = seq2->GetLength();\r
+ SafeVector<char>::iterator iter1 = seq1->GetDataPtr();\r
+ SafeVector<char>::iterator iter2 = seq2->GetDataPtr();\r
+\r
+ // compute total probability\r
+ float totalProb = ComputeTotalProbability (seq1Length, seq2Length,\r
+ forward, backward);\r
+\r
+ // initialize expected counts\r
+ VVF *countsPtr = new VVF(NumMatrixTypes + 1, VF(NumMatrixTypes, LOG_ZERO)); assert (countsPtr);\r
+ VVF &counts = *countsPtr;\r
+\r
+ // remember offset for each index combination\r
+ int ij = 0;\r
+ int i1j = -seq2Length - 1;\r
+ int ij1 = -1;\r
+ int i1j1 = -seq2Length - 2;\r
+\r
+ ij *= NumMatrixTypes;\r
+ i1j *= NumMatrixTypes;\r
+ ij1 *= NumMatrixTypes;\r
+ i1j1 *= NumMatrixTypes;\r
+\r
+ // compute expected counts\r
+ for (int i = 0; i <= seq1Length; i++){\r
+ unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];\r
+ for (int j = 0; j <= seq2Length; j++){\r
+ unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];\r
+\r
+ if (i > 0 && j > 0){\r
+ for (int k = 0; k < NumMatrixTypes; k++)\r
+ LOG_PLUS_EQUALS (counts[k][0],\r
+ forward[k + i1j1] + transProb[k][0] +\r
+ matchProb[c1][c2] + backward[0 + ij]);\r
+ }\r
+ if (i > 0){\r
+ for (int k = 0; k < NumInsertStates; k++){\r
+ LOG_PLUS_EQUALS (counts[0][2*k+1],\r
+ forward[0 + i1j] + transProb[0][2*k+1] +\r
+ insProb[c1][k] + backward[2*k+1 + ij]);\r
+ LOG_PLUS_EQUALS (counts[2*k+1][2*k+1],\r
+ forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +\r
+ insProb[c1][k] + backward[2*k+1 + ij]);\r
+ }\r
+ }\r
+ if (j > 0){\r
+ for (int k = 0; k < NumInsertStates; k++){\r
+ LOG_PLUS_EQUALS (counts[0][2*k+2],\r
+ forward[0 + ij1] + transProb[0][2*k+2] +\r
+ insProb[c2][k] + backward[2*k+2 + ij]);\r
+ LOG_PLUS_EQUALS (counts[2*k+2][2*k+2],\r
+ forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +\r
+ insProb[c2][k] + backward[2*k+2 + ij]);\r
+ }\r
+ }\r
+\r
+ ij += NumMatrixTypes;\r
+ i1j += NumMatrixTypes;\r
+ ij1 += NumMatrixTypes;\r
+ i1j1 += NumMatrixTypes;\r
+ }\r
+ }\r
+\r
+ // scale all expected counts appropriately\r
+ for (int i = 0; i < NumMatrixTypes; i++)\r
+ for (int j = 0; j < NumMatrixTypes; j++)\r
+ counts[i][j] -= totalProb;\r
+\r
+ }\r
+ */\r
+\r
+ /////////////////////////////////////////////////////////////////\r
+ // ProbabilisticModel::ComputeNewParameters()\r
+ //\r
+ // Computes a new parameter set based on the expected counts\r
+ // given.\r
+ /////////////////////////////////////////////////////////////////\r
+\r
+ void ComputeNewParameters (Sequence *seq1, Sequence *seq2,\r
+ const VF &forward, const VF &backward,\r
+ VF &initDistribMat, VF &gapOpen,\r
+ VF &gapExtend, VVF &emitPairs, VF &emitSingle, bool enableTrainEmissions) const {\r
+ \r
+ assert (seq1);\r
+ assert (seq2);\r
+\r
+ const int seq1Length = seq1->GetLength();\r
+ const int seq2Length = seq2->GetLength();\r
+ SafeVector<char>::iterator iter1 = seq1->GetDataPtr();\r
+ SafeVector<char>::iterator iter2 = seq2->GetDataPtr();\r
+\r
+ // compute total probability\r
+ float totalProb = ComputeTotalProbability (seq1, seq2,\r
+ forward, backward);\r
+ \r
+ // initialize expected counts\r
+ VVF transCounts (NumMatrixTypes, VF (NumMatrixTypes, LOG_ZERO));\r
+ VF initCounts (NumMatrixTypes, LOG_ZERO);\r
+ VVF pairCounts (256, VF (256, LOG_ZERO));\r
+ VF singleCounts (256, LOG_ZERO);\r
+ \r
+ // remember offset for each index combination\r
+ int ij = 0;\r
+ int i1j = -seq2Length - 1;\r
+ int ij1 = -1;\r
+ int i1j1 = -seq2Length - 2;\r
+\r
+ ij *= NumMatrixTypes;\r
+ i1j *= NumMatrixTypes;\r
+ ij1 *= NumMatrixTypes;\r
+ i1j1 *= NumMatrixTypes;\r
+\r
+ // compute initial distribution posteriors\r
+ initCounts[0] = LOG_ADD (forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] +\r
+ backward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)],\r
+ forward[0 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + \r
+ backward[0 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);\r
+ for (int k = 0; k < NumInsertStates; k++){\r
+ initCounts[2*k+1] = LOG_ADD (forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] +\r
+ backward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)],\r
+ forward[2*k+1 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + \r
+ backward[2*k+1 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);\r
+ initCounts[2*k+2] = LOG_ADD (forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] +\r
+ backward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)],\r
+ forward[2*k+2 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + \r
+ backward[2*k+2 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);\r
+ }\r
+\r
+ // compute expected counts\r
+ for (int i = 0; i <= seq1Length; i++){\r
+ unsigned char c1 = (i == 0) ? '~' : (unsigned char) toupper(iter1[i]);\r
+ for (int j = 0; j <= seq2Length; j++){\r
+ unsigned char c2 = (j == 0) ? '~' : (unsigned char) toupper(iter2[j]);\r
+\r
+ if (i > 0 && j > 0){\r
+ if (enableTrainEmissions && i == 1 && j == 1){\r
+ LOG_PLUS_EQUALS (pairCounts[c1][c2],\r
+ initialDistribution[0] + matchProb[c1][c2] + backward[0 + ij]);\r
+ LOG_PLUS_EQUALS (pairCounts[c2][c1],\r
+ initialDistribution[0] + matchProb[c2][c1] + backward[0 + ij]);\r
+ }\r
+\r
+ for (int k = 0; k < NumMatrixTypes; k++){\r
+ LOG_PLUS_EQUALS (transCounts[k][0],\r
+ forward[k + i1j1] + transProb[k][0] +\r
+ matchProb[c1][c2] + backward[0 + ij]);\r
+ if (enableTrainEmissions && i != 1 || j != 1){\r
+ LOG_PLUS_EQUALS (pairCounts[c1][c2],\r
+ forward[k + i1j1] + transProb[k][0] +\r
+ matchProb[c1][c2] + backward[0 + ij]);\r
+ LOG_PLUS_EQUALS (pairCounts[c2][c1],\r
+ forward[k + i1j1] + transProb[k][0] +\r
+ matchProb[c2][c1] + backward[0 + ij]);\r
+ }\r
+ }\r
+ }\r
+ if (i > 0){\r
+ for (int k = 0; k < NumInsertStates; k++){\r
+ LOG_PLUS_EQUALS (transCounts[0][2*k+1],\r
+ forward[0 + i1j] + transProb[0][2*k+1] +\r
+ insProb[c1][k] + backward[2*k+1 + ij]);\r
+ LOG_PLUS_EQUALS (transCounts[2*k+1][2*k+1],\r
+ forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +\r
+ insProb[c1][k] + backward[2*k+1 + ij]);\r
+ if (enableTrainEmissions){\r
+ if (i == 1 && j == 0){\r
+ LOG_PLUS_EQUALS (singleCounts[c1],\r
+ initialDistribution[2*k+1] + insProb[c1][k] + backward[2*k+1 + ij]);\r
+ }\r
+ else {\r
+ LOG_PLUS_EQUALS (singleCounts[c1],\r
+ forward[0 + i1j] + transProb[0][2*k+1] +\r
+ insProb[c1][k] + backward[2*k+1 + ij]);\r
+ LOG_PLUS_EQUALS (singleCounts[c1],\r
+ forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +\r
+ insProb[c1][k] + backward[2*k+1 + ij]);\r
+ }\r
+ }\r
+ }\r
+ }\r
+ if (j > 0){\r
+ for (int k = 0; k < NumInsertStates; k++){\r
+ LOG_PLUS_EQUALS (transCounts[0][2*k+2],\r
+ forward[0 + ij1] + transProb[0][2*k+2] +\r
+ insProb[c2][k] + backward[2*k+2 + ij]);\r
+ LOG_PLUS_EQUALS (transCounts[2*k+2][2*k+2],\r
+ forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +\r
+ insProb[c2][k] + backward[2*k+2 + ij]);\r
+ if (enableTrainEmissions){\r
+ if (i == 0 && j == 1){\r
+ LOG_PLUS_EQUALS (singleCounts[c2],\r
+ initialDistribution[2*k+2] + insProb[c2][k] + backward[2*k+2 + ij]);\r
+ }\r
+ else {\r
+ LOG_PLUS_EQUALS (singleCounts[c2],\r
+ forward[0 + ij1] + transProb[0][2*k+2] +\r
+ insProb[c2][k] + backward[2*k+2 + ij]);\r
+ LOG_PLUS_EQUALS (singleCounts[c2],\r
+ forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +\r
+ insProb[c2][k] + backward[2*k+2 + ij]);\r
+ }\r
+ }\r
+ }\r
+ }\r
+ \r
+ ij += NumMatrixTypes;\r
+ i1j += NumMatrixTypes;\r
+ ij1 += NumMatrixTypes;\r
+ i1j1 += NumMatrixTypes;\r
+ }\r
+ }\r
+\r
+ // scale all expected counts appropriately\r
+ for (int i = 0; i < NumMatrixTypes; i++){\r
+ initCounts[i] -= totalProb;\r
+ for (int j = 0; j < NumMatrixTypes; j++)\r
+ transCounts[i][j] -= totalProb;\r
+ }\r
+ if (enableTrainEmissions){\r
+ for (int i = 0; i < 256; i++){\r
+ for (int j = 0; j < 256; j++)\r
+ pairCounts[i][j] -= totalProb;\r
+ singleCounts[i] -= totalProb;\r
+ }\r
+ }\r
+\r
+ // compute new initial distribution\r
+ float totalInitDistribCounts = 0;\r
+ for (int i = 0; i < NumMatrixTypes; i++)\r
+ totalInitDistribCounts += exp (initCounts[i]); // should be 2\r
+ initDistribMat[0] = min (1.0f, max (0.0f, (float) exp (initCounts[0]) / totalInitDistribCounts));\r
+ for (int k = 0; k < NumInsertStates; k++){\r
+ float val = (exp (initCounts[2*k+1]) + exp (initCounts[2*k+2])) / 2;\r
+ initDistribMat[2*k+1] = initDistribMat[2*k+2] = min (1.0f, max (0.0f, val / totalInitDistribCounts));\r
+ }\r
+\r
+ // compute total counts for match state\r
+ float inMatchStateCounts = 0;\r
+ for (int i = 0; i < NumMatrixTypes; i++)\r
+ inMatchStateCounts += exp (transCounts[0][i]);\r
+ for (int i = 0; i < NumInsertStates; i++){\r
+\r
+ // compute total counts for gap state\r
+ float inGapStateCounts =\r
+ exp (transCounts[2*i+1][0]) +\r
+ exp (transCounts[2*i+1][2*i+1]) +\r
+ exp (transCounts[2*i+2][0]) +\r
+ exp (transCounts[2*i+2][2*i+2]);\r
+\r
+ gapOpen[2*i] = gapOpen[2*i+1] =\r
+ (exp (transCounts[0][2*i+1]) +\r
+ exp (transCounts[0][2*i+2])) /\r
+ (2 * inMatchStateCounts);\r
+\r
+ gapExtend[2*i] = gapExtend[2*i+1] =\r
+ (exp (transCounts[2*i+1][2*i+1]) +\r
+ exp (transCounts[2*i+2][2*i+2])) /\r
+ inGapStateCounts;\r
+ }\r
+\r
+ if (enableTrainEmissions){\r
+ float totalPairCounts = 0;\r
+ float totalSingleCounts = 0;\r
+ for (int i = 0; i < 256; i++){\r
+ for (int j = 0; j <= i; j++)\r
+ totalPairCounts += exp (pairCounts[j][i]);\r
+ totalSingleCounts += exp (singleCounts[i]);\r
+ }\r
+ \r
+ for (int i = 0; i < 256; i++) if (!islower ((char) i)){\r
+ int li = (int)((unsigned char) tolower ((char) i));\r
+ for (int j = 0; j <= i; j++) if (!islower ((char) j)){\r
+ int lj = (int)((unsigned char) tolower ((char) j));\r
+ emitPairs[i][j] = emitPairs[i][lj] = emitPairs[li][j] = emitPairs[li][lj] = \r
+ emitPairs[j][i] = emitPairs[j][li] = emitPairs[lj][i] = emitPairs[lj][li] = exp(pairCounts[j][i]) / totalPairCounts;\r
+ }\r
+ emitSingle[i] = emitSingle[li] = exp(singleCounts[i]) / totalSingleCounts;\r
+ }\r
+ }\r
+ }\r
+ \r
+ /////////////////////////////////////////////////////////////////\r
+ // ProbabilisticModel::ComputeAlignment()\r
+ //\r
+ // Computes an alignment from the given posterior matrix by\r
+ // finding the maximum summing path (maximum weight trace)\r
+ // through it. The posterior is read as a row-major\r
+ // (seq1Length+1) x (seq2Length+1) matrix; its row 0 and\r
+ // column 0 are border cells and are never read.\r
+ //\r
+ // Returns a pair consisting of:\r
+ // (1) a heap-allocated string (e.g., XXXBBXXXBBBBBBYYYYBBB)\r
+ //     where X's and Y's denote insertions in the first and\r
+ //     second sequence respectively and B's denote that both\r
+ //     sequences are present (i.e. matches); it is allocated\r
+ //     with new and not freed here.\r
+ // (2) a float indicating the sum achieved.\r
+ /////////////////////////////////////////////////////////////////\r
+\r
+ pair<SafeVector<char> *, float> ComputeAlignment (int seq1Length, int seq2Length,\r
+                                                   const VF &posterior) const {\r
+\r
+   const int numCols = seq2Length + 1;\r
+\r
+   // two rolling score rows carved out of a single allocation\r
+   float *scoreRows = new float[numCols*2]; assert (scoreRows);\r
+   float *prevRow = scoreRows;\r
+   float *currRow = scoreRows + numCols;\r
+\r
+   // one direction character per cell of the full matrix\r
+   char *directions = new char[(seq1Length+1)*numCols]; assert (directions);\r
+\r
+   // row 0: score 0 everywhere, reachable only by moving left\r
+   for (int col = 0; col < numCols; col++){\r
+     prevRow[col] = 0;\r
+     directions[col] = 'L';\r
+   }\r
+\r
+   // remaining rows\r
+   for (int row = 1; row <= seq1Length; row++){\r
+     const int rowStart = row * numCols;\r
+     VF::const_iterator rowPosterior = posterior.begin() + rowStart;\r
+\r
+     // column 0: score 0, reachable only by moving up\r
+     currRow[0] = 0;\r
+     directions[rowStart] = 'U';\r
+\r
+     // diagonal move collects the posterior; left/up moves collect nothing\r
+     for (int col = 1; col < numCols; col++){\r
+       ChooseBestOfThree (rowPosterior[col] + prevRow[col-1], currRow[col-1], prevRow[col],\r
+                          'D', 'L', 'U', &currRow[col], &directions[rowStart + col]);\r
+     }\r
+\r
+     // the row just filled becomes the previous row\r
+     float *swapTmp = prevRow;\r
+     prevRow = currRow;\r
+     currRow = swapTmp;\r
+   }\r
+\r
+   // best score sits in the last column of the last filled row\r
+   float total = prevRow[seq2Length];\r
+   delete [] scoreRows;\r
+\r
+   // walk back from the bottom-right corner to the origin\r
+   SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);\r
+   int r = seq1Length;\r
+   int c = seq2Length;\r
+   while (r != 0 || c != 0){\r
+     const char step = directions[r*numCols + c];\r
+     if (step == 'L'){ c--; alignment->push_back ('Y'); }\r
+     else if (step == 'U'){ r--; alignment->push_back ('X'); }\r
+     else if (step == 'D'){ c--; r--; alignment->push_back ('B'); }\r
+     else assert (false);\r
+   }\r
+\r
+   delete [] directions;\r
+\r
+   // the walk produced the alignment back to front\r
+   reverse (alignment->begin(), alignment->end());\r
+\r
+   return make_pair(alignment, total);\r
+ }\r
+\r
+ /////////////////////////////////////////////////////////////////\r
+ // ProbabilisticModel::ComputeAlignmentWithGapPenalties()\r
+ //\r
+ // Similar to ComputeAlignment() except with gap penalties: a\r
+ // three-state (match / insert X / insert Y) dynamic program is\r
+ // run over the posterior matrix of two alignments, adding\r
+ // column-dependent gap-open and gap-continue penalties.\r
+ //\r
+ // align1, align2      the two alignments being aligned; the\r
+ //                     length of sequence 0 is used as the number\r
+ //                     of columns of each\r
+ // posterior           row-major (seq1Length+1) x (seq2Length+1)\r
+ //                     matrix; row 0 and column 0 are not read\r
+ // numSeqs1, numSeqs2  sizes of the precomputed penalty tables\r
+ //                     NOTE(review): presumably the number of\r
+ //                     sequences in align1 / align2 -- confirm\r
+ // gapOpenPenalty, gapContinuePenalty\r
+ //                     penalties combined into the tables below\r
+ //\r
+ // Returns the alignment string (X / Y / B, allocated with new and\r
+ // not freed here) paired with the constant 1.0f; the best score is\r
+ // computed only to pick the state the traceback starts from.\r
+ /////////////////////////////////////////////////////////////////\r
+\r
+ pair<SafeVector<char> *, float> ComputeAlignmentWithGapPenalties (MultiSequence *align1,\r
+                                                                   MultiSequence *align2,\r
+                                                                   const VF &posterior, int numSeqs1,\r
+                                                                   int numSeqs2,\r
+                                                                   float gapOpenPenalty,\r
+                                                                   float gapContinuePenalty) const {\r
+   int seq1Length = align1->GetSequence(0)->GetLength();\r
+   int seq2Length = align2->GetSequence(0)->GetLength();\r
+\r
+   // Per-column statistics for each alignment (index 0 is a border column\r
+   // and stays 0): numActiveN[j] counts the sequences with a non-gap\r
+   // character in column j; numGapOpensN[j] counts the sequences with a\r
+   // non-gap character in column j whose character in column j-1 (for\r
+   // j > 1) is also not a gap.\r
+   // NOTE(review): the original comment described numGapOpens as counting\r
+   // gap characters preceded by a non-gap, which is not what the code\r
+   // computes -- confirm which one is intended.\r
+   SafeVector<int> numActive1 (seq1Length+1), numGapOpens1 (seq1Length+1);\r
+   SafeVector<int> numActive2 (seq2Length+1), numGapOpens2 (seq2Length+1);\r
+\r
+   // compute number of active sequences and gap opens for each group\r
+   for (int i = 0; i < align1->GetNumSequences(); i++){\r
+     SafeVector<char>::iterator dataPtr = align1->GetSequence(i)->GetDataPtr();\r
+     numActive1[0] = numGapOpens1[0] = 0;\r
+     for (int j = 1; j <= seq1Length; j++){\r
+       if (dataPtr[j] != '-'){\r
+         numActive1[j]++;\r
+         numGapOpens1[j] += (j != 1 && dataPtr[j-1] != '-');\r
+       }\r
+     }\r
+   }\r
+   for (int i = 0; i < align2->GetNumSequences(); i++){\r
+     SafeVector<char>::iterator dataPtr = align2->GetSequence(i)->GetDataPtr();\r
+     numActive2[0] = numGapOpens2[0] = 0;\r
+     for (int j = 1; j <= seq2Length; j++){\r
+       if (dataPtr[j] != '-'){\r
+         numActive2[j]++;\r
+         numGapOpens2[j] += (j != 1 && dataPtr[j-1] != '-');\r
+       }\r
+     }\r
+   }\r
+\r
+   // Penalty tables. openingPenaltyN[a][g] is looked up with a = number of\r
+   // active sequences in a column of alignment N and g = number of gap opens\r
+   // in a column of the other alignment.\r
+   VVF openingPenalty1 (numSeqs1+1, VF (numSeqs2+1));\r
+   VF continuingPenalty1 (numSeqs1+1);\r
+   // indexed [0..numSeqs2][0..numSeqs1], so the outer dimension must be\r
+   // numSeqs2+1 and the inner one numSeqs1+1 (the reverse overruns the\r
+   // table whenever the two counts differ)\r
+   VVF openingPenalty2 (numSeqs2+1, VF (numSeqs1+1));\r
+   VF continuingPenalty2 (numSeqs2+1);\r
+\r
+   // precompute penalties\r
+   for (int i = 0; i <= numSeqs1; i++)\r
+     for (int j = 0; j <= numSeqs2; j++)\r
+       openingPenalty1[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs2 - j));\r
+   for (int i = 0; i <= numSeqs1; i++)\r
+     continuingPenalty1[i] = i * gapContinuePenalty * numSeqs2;\r
+   for (int i = 0; i <= numSeqs2; i++)\r
+     for (int j = 0; j <= numSeqs1; j++)\r
+       openingPenalty2[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs1 - j));\r
+   for (int i = 0; i <= numSeqs2; i++)\r
+     continuingPenalty2[i] = i * gapContinuePenalty * numSeqs1;\r
+\r
+   // old/new rolling rows for each of the three states, in one allocation\r
+   float *twoRows = new float[6*(seq2Length+1)]; assert (twoRows);\r
+   float *oldRowMatch = twoRows;\r
+   float *newRowMatch = twoRows + (seq2Length+1);\r
+   float *oldRowInsertX = twoRows + 2*(seq2Length+1);\r
+   float *newRowInsertX = twoRows + 3*(seq2Length+1);\r
+   float *oldRowInsertY = twoRows + 4*(seq2Length+1);\r
+   float *newRowInsertY = twoRows + 5*(seq2Length+1);\r
+\r
+   // three traceback characters per cell, stored in the order\r
+   // match, insert X, insert Y; each names the predecessor state\r
+   char *tracebackMatrix = new char[3*(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix);\r
+   char *tracebackPtr = tracebackMatrix;\r
+\r
+   // start reading the posterior at row 1\r
+   VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;\r
+\r
+   // initialization: row 0 can only be reached through insert Y\r
+   for (int i = 0; i <= seq2Length; i++){\r
+     oldRowMatch[i] = oldRowInsertX[i] = (i == 0) ? 0 : LOG_ZERO;\r
+     oldRowInsertY[i] = (i == 0) ? 0 : oldRowInsertY[i-1] + continuingPenalty2[numActive2[i]];\r
+     *(tracebackPtr) = *(tracebackPtr+1) = *(tracebackPtr+2) = 'Y';\r
+     tracebackPtr += 3;\r
+   }\r
+\r
+   // fill in matrix\r
+   for (int i = 1; i <= seq1Length; i++){\r
+\r
+     // initialize left column: it can only be reached through insert X\r
+     newRowMatch[0] = newRowInsertY[0] = LOG_ZERO;\r
+     newRowInsertX[0] = oldRowInsertX[0] + continuingPenalty1[numActive1[i]];\r
+     posteriorPtr++;   // skip the column-0 border cell of the posterior\r
+     *(tracebackPtr) = *(tracebackPtr+1) = *(tracebackPtr+2) = 'X';\r
+     tracebackPtr += 3;\r
+\r
+     // fill in rest of row\r
+     for (int j = 1; j <= seq2Length; j++){\r
+\r
+       // going to MATCH state\r
+       ChooseBestOfThree (oldRowMatch[j-1],\r
+                          oldRowInsertX[j-1],\r
+                          oldRowInsertY[j-1],\r
+                          'M', 'X', 'Y', &newRowMatch[j], tracebackPtr++);\r
+       newRowMatch[j] += *(posteriorPtr++);\r
+\r
+       // going to INSERT X state\r
+       ChooseBestOfThree (oldRowMatch[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]],\r
+                          oldRowInsertX[j] + continuingPenalty1[numActive1[i]],\r
+                          oldRowInsertY[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]],\r
+                          'M', 'X', 'Y', &newRowInsertX[j], tracebackPtr++);\r
+\r
+       // going to INSERT Y state\r
+       ChooseBestOfThree (newRowMatch[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]],\r
+                          newRowInsertX[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]],\r
+                          newRowInsertY[j-1] + continuingPenalty2[numActive2[j]],\r
+                          'M', 'X', 'Y', &newRowInsertY[j], tracebackPtr++);\r
+     }\r
+\r
+     // swap rows\r
+     float *temp;\r
+     temp = oldRowMatch; oldRowMatch = newRowMatch; newRowMatch = temp;\r
+     temp = oldRowInsertX; oldRowInsertX = newRowInsertX; newRowInsertX = temp;\r
+     temp = oldRowInsertY; oldRowInsertY = newRowInsertY; newRowInsertY = temp;\r
+   }\r
+\r
+   // pick the best final state; only `matrix` is used afterwards\r
+   float total;\r
+   char matrix;\r
+   ChooseBestOfThree (oldRowMatch[seq2Length], oldRowInsertX[seq2Length], oldRowInsertY[seq2Length],\r
+                      'M', 'X', 'Y', &total, &matrix);\r
+\r
+   delete [] twoRows;\r
+\r
+   // compute traceback: emit the move for the current state, then jump to\r
+   // the predecessor state recorded for that state in the current cell\r
+   SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);\r
+   int r = seq1Length, c = seq2Length;\r
+   while (r != 0 || c != 0){\r
+\r
+     int offset = (matrix == 'M') ? 0 : (matrix == 'X') ? 1 : 2;\r
+     char ch = tracebackMatrix[(r*(seq2Length+1) + c) * 3 + offset];\r
+     switch (matrix){\r
+     case 'Y': c--; alignment->push_back ('Y'); break;\r
+     case 'X': r--; alignment->push_back ('X'); break;\r
+     case 'M': c--; r--; alignment->push_back ('B'); break;\r
+     default: assert (false);\r
+     }\r
+     matrix = ch;\r
+   }\r
+\r
+   delete [] tracebackMatrix;\r
+\r
+   // the walk produced the alignment back to front\r
+   reverse (alignment->begin(), alignment->end());\r
+\r
+   return make_pair(alignment, 1.0f);\r
+ }\r
+\r
+ /////////////////////////////////////////////////////////////////\r
+ // ProbabilisticModel::ComputeViterbiAlignment()\r
+ //\r
+ // Computes the highest probability pairwise alignment using the\r
+ // probabilistic model. The final alignment is returned as a\r
+ // pair consisting of:\r
+ // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and\r
+ // denote insertions in one of the two sequences and\r
+ // B's denote that both sequences are present (i.e.\r
+ // matches).\r
+ // (2) a float containing the log probability of the best\r
+ // alignment (not used)\r
+ /////////////////////////////////////////////////////////////////\r
+\r
+\r
+ pair<SafeVector<char> *, float> ComputeViterbiAlignment (Sequence *seq1, Sequence *seq2) const {\r
+ \r
+ assert (seq1);\r
+ assert (seq2);\r
+ \r
+ const int seq1Length = seq1->GetLength();\r
+ const int seq2Length = seq2->GetLength();\r
+ \r
+ // retrieve the points to the beginning of each sequence\r
+ SafeVector<char>::iterator iter1 = seq1->GetDataPtr();\r
+ SafeVector<char>::iterator iter2 = seq2->GetDataPtr();\r
+ \r
+ // create viterbi matrix\r
+ VF *viterbiPtr = new VF (3 * (seq1Length+1) * (seq2Length+1), LOG_ZERO);\r
+ assert (viterbiPtr);\r
+ VF &viterbi = *viterbiPtr;\r
+\r
+ // create traceback matrix\r
+ VI *tracebackPtr = new VI (3 * (seq1Length+1) * (seq2Length+1), -1);\r
+ assert (tracebackPtr);\r
+ VI &traceback = *tracebackPtr;\r
+\r
+ // initialization condition\r
+/*\r
+ for (int k = 0; k < NumMatrixTypes; k++)\r
+ viterbi[k] = initialDistribution[k];\r
+*/\r
+ viterbi[0] = LOG(0.6080327034);\r
+ viterbi[1] = LOG(0.1959836632);\r
+ viterbi[2] = LOG(0.1959836632);\r
+\r
+ // remember offset for each index combination\r
+ int ij = 0;\r
+ int i1j = -seq2Length - 1;\r
+ int ij1 = -1;\r
+ int i1j1 = -seq2Length - 2;\r
+\r
+ ij *= 3;\r
+ i1j *= 3;\r
+ ij1 *= 3;\r
+ i1j1 *= 3;\r
+\r
+ // compute viterbi scores\r
+ for (int i = 0; i <= seq1Length; i++){\r
+ unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];\r
+ for (int j = 0; j <= seq2Length; j++){\r
+ unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];\r
+\r
+ if (i > 0 && j > 0){\r
+ for (int k = 0; k < 3; k++){\r
+ float newVal = viterbi[k + i1j1] + local_transProb[k][0] + matchProb[c1][c2];\r
+ if (viterbi[0 + ij] < newVal){\r
+ viterbi[0 + ij] = newVal;\r
+ traceback[0 + ij] = k;\r
+ }\r
+ }\r
+ }\r
+ if (i > 0){\r
+ for (int k = 0; k < 1; k++){\r
+ float valFromMatch = insProb[c1][k] + viterbi[0 + i1j] + local_transProb[0][2*k+1];\r
+ float valFromIns = insProb[c1][k] + viterbi[2*k+1 + i1j] + local_transProb[2*k+1][2*k+1];\r
+ if (valFromMatch >= valFromIns){\r
+ viterbi[2*k+1 + ij] = valFromMatch;\r
+ traceback[2*k+1 + ij] = 0;\r
+ }\r
+ else {\r
+ viterbi[2*k+1 + ij] = valFromIns;\r
+ traceback[2*k+1 + ij] = 2*k+1;\r
+ }\r
+ }\r
+ }\r
+ if (j > 0){\r
+ for (int k = 0; k < 1; k++){\r
+ float valFromMatch = insProb[c2][k] + viterbi[0 + ij1] + local_transProb[0][2*k+2];\r
+ float valFromIns = insProb[c2][k] + viterbi[2*k+2 + ij1] + local_transProb[2*k+2][2*k+2];\r
+ if (valFromMatch >= valFromIns){\r
+ viterbi[2*k+2 + ij] = valFromMatch;\r
+ traceback[2*k+2 + ij] = 0;\r
+ }\r
+ else {\r
+ viterbi[2*k+2 + ij] = valFromIns;\r
+ traceback[2*k+2 + ij] = 2*k+2;\r
+ }\r
+ }\r
+ }\r
+\r
+ ij += 3;\r
+ i1j += 3;\r
+ ij1 += 3;\r
+ i1j1 += 3;\r
+ }\r
+ }\r
+\r
+ // figure out best terminating cell\r
+ float bestProb = LOG_ZERO;\r
+ int state = -1;\r
+ viterbi[0] = LOG(0.6080327034);\r
+ viterbi[1] = LOG(0.1959836632);\r
+ viterbi[2] = LOG(0.1959836632);\r
+\r
+ for (int k = 0; k < 3; k++){\r
+ float thisProb = viterbi[k + 3 * ((seq1Length+1)*(seq2Length+1) - 1)] + viterbi[k];\r
+ if (bestProb < thisProb){\r
+ bestProb = thisProb;\r
+ state = k;\r
+ }\r
+ }\r
+ assert (state != -1);\r
+\r
+ delete viterbiPtr;\r
+\r
+ // compute traceback\r
+ SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);\r
+ int r = seq1Length, c = seq2Length;\r
+ while (r != 0 || c != 0){\r
+ int newState = traceback[state + 3 * (r * (seq2Length+1) + c)]; \r
+ if (state == 0){ c--; r--; alignment->push_back ('B');}\r
+ else if (state % 2 == 1){ r--; alignment->push_back ('X'); }\r
+ else { c--; alignment->push_back ('Y'); } \r
+ state = newState;\r
+ }\r
+\r
+ delete tracebackPtr;\r
+\r
+ reverse (alignment->begin(), alignment->end());\r
+ \r
+ return make_pair(alignment, bestProb);\r
+ }\r
+\r
+ /////////////////////////////////////////////////////////////////\r
+ // ProbabilisticModel::BuildPosterior()\r
+ //\r
+ // Builds a posterior probability matrix needed to align a pair\r
+ // of alignments. Mathematically, the returned matrix M is\r
+ // defined as follows:\r
+ //    M[i,j] =     sum          sum      f(s,t,i,j)\r
+ //             s in align1  t in align2\r
+ // where\r
+ //                  [  P(s[i'] <--> t[j'])\r
+ //                  [       if s[i'] is a letter in the ith column of align1 and\r
+ //                  [       t[j'] is a letter in the jth column of align2\r
+ //    f(s,t,i,j) =  [\r
+ //                  [  0    otherwise\r
+ //\r
+ // In addition, `cutoff` is subtracted once for every pair of\r
+ // letters (s[i'], t[j']), whether or not the sparse matrix stores\r
+ // a value for that pair.\r
+ //\r
+ // sparseMatrices[a][b] is used with a < b, where a and b are the\r
+ // sequence labels. The result is a row-major\r
+ // (seq1Length+1) x (seq2Length+1) vector allocated with new and\r
+ // not freed here.\r
+ /////////////////////////////////////////////////////////////////\r
+\r
+ VF *BuildPosterior (MultiSequence *align1, MultiSequence *align2,\r
+                     const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,\r
+                     float cutoff = 0.0f) const {\r
+   const int seq1Length = align1->GetSequence(0)->GetLength();\r
+   const int seq2Length = align2->GetSequence(0)->GetLength();\r
+\r
+   VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1), 0); assert (posteriorPtr);\r
+   VF &posterior = *posteriorPtr;\r
+\r
+   // for each s in align1\r
+   for (int i = 0; i < align1->GetNumSequences(); i++){\r
+     int first = align1->GetSequence(i)->GetLabel();\r
+     // mapping1[k] is used as the alignment column of the k-th letter of s (k >= 1)\r
+     SafeVector<int> *mapping1 = align1->GetSequence(i)->GetMapping();\r
+\r
+     // for each t in align2\r
+     for (int j = 0; j < align2->GetNumSequences(); j++){\r
+       int second = align2->GetSequence(j)->GetLabel();\r
+       SafeVector<int> *mapping2 = align2->GetSequence(j)->GetMapping();\r
+\r
+       if (first < second){\r
+\r
+         // get the associated sparse matrix; its rows follow s\r
+         SparseMatrix *matrix = sparseMatrices[first][second];\r
+\r
+         for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++){\r
+           SafeVector<PIF>::iterator row = matrix->GetRowPtr(ii);\r
+           int base = (*mapping1)[ii] * (seq2Length+1);\r
+           int rowSize = matrix->GetRowSize(ii);\r
+\r
+           // add in all relevant values\r
+           for (int jj = 0; jj < rowSize; jj++)\r
+             posterior[base + (*mapping2)[row[jj].first]] += row[jj].second;\r
+\r
+           // subtract cutoff for every letter of t; letters are numbered\r
+           // 1..length like everywhere else in this function\r
+           for (int jj = 1; jj <= matrix->GetSeq2Length(); jj++)\r
+             posterior[base + (*mapping2)[jj]] -= cutoff;\r
+         }\r
+\r
+       } else {\r
+\r
+         // get the associated sparse matrix; its rows follow t, so the\r
+         // roles of the two mappings are exchanged\r
+         SparseMatrix *matrix = sparseMatrices[second][first];\r
+\r
+         for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++){\r
+           SafeVector<PIF>::iterator row = matrix->GetRowPtr(jj);\r
+           int base = (*mapping2)[jj];\r
+           int rowSize = matrix->GetRowSize(jj);\r
+\r
+           // add in all relevant values\r
+           for (int ii = 0; ii < rowSize; ii++)\r
+             posterior[base + (*mapping1)[row[ii].first] * (seq2Length + 1)] += row[ii].second;\r
+\r
+           // subtract cutoff for every letter of s (numbered 1..length)\r
+           for (int ii = 1; ii <= matrix->GetSeq2Length(); ii++)\r
+             posterior[base + (*mapping1)[ii] * (seq2Length + 1)] -= cutoff;\r
+         }\r
+\r
+       }\r
+\r
+       // the mapping is deleted here, as in the original code\r
+       delete mapping2;\r
+     }\r
+\r
+     delete mapping1;\r
+   }\r
+\r
+   return posteriorPtr;\r
+ }\r
+\r
+ //added by Liu Yongchao.Feb 23, 2010\r
+ //\r
+ // Weighted variant of BuildPosterior(): the contribution of each\r
+ // sequence pair (s in align1, t in align2) is scaled by\r
+ //    w = seqsWeights[label(s)] * seqsWeights[label(t)] / totalWeights\r
+ // where totalWeights is the sum of those weight products over all\r
+ // pairs. Both the stored posterior values and the cutoff are\r
+ // scaled by w; the cutoff is subtracted once for every pair of\r
+ // letters, whether or not the sparse matrix stores a value for it.\r
+ //\r
+ // seqsWeights is indexed by sequence label; sparseMatrices[a][b]\r
+ // is used with a < b. The result is a row-major\r
+ // (seq1Length+1) x (seq2Length+1) vector allocated with new and\r
+ // not freed here.\r
+ VF *BuildPosterior(int* seqsWeights, MultiSequence *align1,\r
+                    MultiSequence *align2,\r
+                    const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,\r
+                    float cutoff = 0.0f) const {\r
+   const int seq1Length = align1->GetSequence(0)->GetLength();\r
+   const int seq2Length = align2->GetSequence(0)->GetLength();\r
+\r
+   VF *posteriorPtr = new VF((seq1Length + 1) * (seq2Length + 1), 0);\r
+   assert(posteriorPtr);\r
+   VF &posterior = *posteriorPtr;\r
+\r
+   //compute the total sum of all weights\r
+   float totalWeights = 0;\r
+   for (int i = 0; i < align1->GetNumSequences(); i++) {\r
+     int first = align1->GetSequence(i)->GetLabel();\r
+     int w1 = seqsWeights[first];\r
+     for (int j = 0; j < align2->GetNumSequences(); j++) {\r
+       int second = align2->GetSequence(j)->GetLabel();\r
+       int w2 = seqsWeights[second];\r
+\r
+       totalWeights += w1 * w2;\r
+     }\r
+   }\r
+   // for each s in align1\r
+   for (int i = 0; i < align1->GetNumSequences(); i++) {\r
+     int first = align1->GetSequence(i)->GetLabel();\r
+     int w1 = seqsWeights[first];\r
+     // mapping1[k] is used as the alignment column of the k-th letter of s (k >= 1)\r
+     SafeVector<int> *mapping1 = align1->GetSequence(i)->GetMapping();\r
+     // for each t in align2\r
+     for (int j = 0; j < align2->GetNumSequences(); j++) {\r
+       int second = align2->GetSequence(j)->GetLabel();\r
+       int w2 = seqsWeights[second];\r
+       SafeVector<int> *mapping2 =\r
+           align2->GetSequence(j)->GetMapping();\r
+\r
+       // normalized weight of this sequence pair\r
+       float w = (float) (w1 * w2) / totalWeights;\r
+       if (first < second) {\r
+\r
+         // get the associated sparse matrix; its rows follow s\r
+         SparseMatrix *matrix = sparseMatrices[first][second];\r
+\r
+         for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++) {\r
+           SafeVector<PIF>::iterator row = matrix->GetRowPtr(ii);\r
+           int base = (*mapping1)[ii] * (seq2Length + 1);\r
+           int rowSize = matrix->GetRowSize(ii);\r
+\r
+           // add in all relevant values\r
+           for (int jj = 0; jj < rowSize; jj++)\r
+             posterior[base + (*mapping2)[row[jj].first]] += w\r
+                 * row[jj].second;\r
+\r
+           // subtract cutoff for every letter of t; letters are numbered\r
+           // 1..length like everywhere else in this function\r
+           for (int jj = 1; jj <= matrix->GetSeq2Length(); jj++)\r
+             posterior[base + (*mapping2)[jj]] -= w * cutoff;\r
+         }\r
+\r
+       } else {\r
+\r
+         // get the associated sparse matrix; its rows follow t, so the\r
+         // roles of the two mappings are exchanged\r
+         SparseMatrix *matrix = sparseMatrices[second][first];\r
+\r
+         for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++) {\r
+           SafeVector<PIF>::iterator row = matrix->GetRowPtr(jj);\r
+           int base = (*mapping2)[jj];\r
+           int rowSize = matrix->GetRowSize(jj);\r
+\r
+           // add in all relevant values\r
+           for (int ii = 0; ii < rowSize; ii++)\r
+             posterior[base\r
+                 + (*mapping1)[row[ii].first]\r
+                     * (seq2Length + 1)] += w\r
+                 * row[ii].second;\r
+\r
+           // subtract cutoff for every letter of s (numbered 1..length)\r
+           for (int ii = 1; ii <= matrix->GetSeq2Length(); ii++)\r
+             posterior[base + (*mapping1)[ii] * (seq2Length + 1)] -=\r
+                 w * cutoff;\r
+         }\r
+\r
+       }\r
+\r
+       // the mapping is deleted here, as in the original code\r
+       delete mapping2;\r
+     }\r
+\r
+     delete mapping1;\r
+   }\r
+\r
+   return posteriorPtr;\r
+ }\r
+};\r
+\r
+#endif\r
--- /dev/null
+/////////////////////////////////////////////////////////////////
+// SafeVector.h
+//
+// STL vector with array bounds checking. To enable bounds
+// checking, #define ENABLE_CHECKS.
+/////////////////////////////////////////////////////////////////
+
+#ifndef SAFEVECTOR_H
+#define SAFEVECTOR_H
+
+#include <cassert>
+#include <vector>
+using namespace std;
+
+/////////////////////////////////////////////////////////////////
+// SafeVector
+//
+// Class derived from the STL std::vector for bounds checking.
+// When ENABLE_CHECKS is defined, operator[] takes an int index
+// and asserts that it lies in [0, size()); otherwise the
+// std::vector operator[] is inherited unchanged.
+/////////////////////////////////////////////////////////////////
+
+template<class TYPE>
+class SafeVector: public std::vector<TYPE> {
+public:
+
+    // miscellaneous constructors, each forwarding to the matching
+    // std::vector constructor
+    SafeVector() :
+            std::vector<TYPE>() {
+    }
+    SafeVector(size_t size) :
+            std::vector<TYPE>(size) {
+    }
+    SafeVector(size_t size, const TYPE &value) :
+            std::vector<TYPE>(size, value) {
+    }
+    SafeVector(const SafeVector &source) :
+            std::vector<TYPE>(source) {
+    }
+
+#ifdef ENABLE_CHECKS
+
+    // [] array bounds checking
+    TYPE &operator[](int index) {
+        // size() is a member of a dependent base class, so it has to be
+        // reached through this-> to be found by template name lookup
+        assert (index >= 0 && index < (int) this->size());
+        return std::vector<TYPE>::operator[] ((size_t) index);
+    }
+
+    // [] const array bounds checking
+    const TYPE &operator[] (int index) const {
+        assert (index >= 0 && index < (int) this->size());
+        return std::vector<TYPE>::operator[] ((size_t) index);
+    }
+
+#endif
+
+};
+
+// some commonly used vector types:
+// V = one-dimensional, VV = vector of vectors, VVV = three levels of nesting;
+// the trailing I / F gives the element type (int / float)
+typedef SafeVector<int> VI;
+typedef SafeVector<VI> VVI;
+typedef SafeVector<VVI> VVVI;
+typedef SafeVector<float> VF;
+typedef SafeVector<VF> VVF;
+typedef SafeVector<VVF> VVVF;
+
+#endif
--- /dev/null
+/////////////////////////////////////////////////////////////////
+// ScoreType.h
+//
+// Routines for doing math operations in MSAPROBS
+/////////////////////////////////////////////////////////////////
+
+#ifndef SCORETYPE_H
+#define SCORETYPE_H
+
+#include <cmath>
+#include <algorithm>
+#include <cfloat>
+#include <assert.h>
+
+// scores are stored as single-precision floats
+typedef float ScoreType;
+
+// large-magnitude negative value standing in for log(0); note that it is a
+// finite float, not -infinity
+const float LOG_ZERO = -2e20;
+// log(1)
+const float LOG_ONE = 0.0;
+
+/////////////////////////////////////////////////////////////////
+// LOG()
+//
+// Returns the logarithm of x as computed by log(), converted
+// to ScoreType.
+/////////////////////////////////////////////////////////////////
+
+inline ScoreType LOG(ScoreType x) {
+    const ScoreType logOfX = log(x);
+    return logOfX;
+}
+
+/////////////////////////////////////////////////////////////////
+// EXP()
+//
+// Computes exp(x). Positive arguments go to the library exp();
+// arguments in (-16, 0] are evaluated with one of six quartic
+// polynomials, chosen by interval; anything at or below -16
+// (and NaN) yields 0.
+/////////////////////////////////////////////////////////////////
+
+inline ScoreType EXP(ScoreType x) {
+    //return exp(x);
+
+    // x > 0: no polynomial piece, use the library
+    if (x > 0)
+        return exp(x);
+
+    // -0.5 < x <= 0
+    if (x > -0.5)
+        return (((0.03254409303190190000 * x + 0.16280432765779600000) * x
+                + 0.49929760485974900000) * x + 0.99995149601363700000) * x
+                + 0.99999925508501600000;
+
+    // -1 < x <= -0.5
+    if (x > -1)
+        return (((0.01973899026052090000 * x + 0.13822379685007000000) * x
+                + 0.48056651562365000000) * x + 0.99326940370383500000) * x
+                + 0.99906756856399500000;
+
+    // -2 < x <= -1
+    if (x > -2)
+        return (((0.00940528203591384000 * x + 0.09414963667859410000) * x
+                + 0.40825793595877300000) * x + 0.93933625499130400000) * x
+                + 0.98369508190545300000;
+
+    // -4 < x <= -2
+    if (x > -4)
+        return (((0.00217245711583303000 * x + 0.03484829428350620000) * x
+                + 0.22118199801337800000) * x + 0.67049462206469500000) * x
+                + 0.83556950223398500000;
+
+    // -8 < x <= -4
+    if (x > -8)
+        return (((0.00012398771025456900 * x + 0.00349155785951272000) * x
+                + 0.03727721426017900000) * x + 0.17974997741536900000) * x
+                + 0.33249299994217400000;
+
+    // -16 < x <= -8
+    if (x > -16)
+        return (((0.00000051741713416603 * x + 0.00002721456879608080) * x
+                + 0.00053418601865636800) * x + 0.00464101989351936000) * x
+                + 0.01507447981459420000;
+
+    // x <= -16: treated as underflow
+    return 0;
+}
+
+/*
+ /////////////////////////////////////////////////////////////////
+ // LOOKUP()
+ //
+ // Computes log (exp (x) + 1), for 0 <= x <= 7.5.
+ /////////////////////////////////////////////////////////////////
+
+ inline ScoreType LOOKUP (ScoreType x){
+ //return log (exp(x) + 1);
+ if (x < 2){
+ if (x < 0.5){
+ if (x < 0)
+ return log (exp(x) + 1);
+ return (((-0.00486373205785640000*x - 0.00020245408813934800)*x + 0.12504222666029800000)*x + 0.49999685320563000000)*x + 0.69314723138948900000;
+ }
+ if (x < 1)
+ return (((-0.00278634205460548000*x - 0.00458097251248546000)*x + 0.12865849880472500000)*x + 0.49862228499205200000)*x + 0.69334810088688000000;
+ return (((0.00059633755154209200*x - 0.01918996666063320000)*x + 0.15288232492093800000)*x + 0.48039958825756900000)*x + 0.69857578503189200000;
+ }
+ if (x < 8){
+ if (x < 4)
+ return (((0.00135958539181047000*x - 0.02329807659316430000)*x + 0.15885799609532100000)*x + 0.48167498563270800000)*x + 0.69276185058669200000;
+ return (((0.00011992394456683500*x - 0.00338464503306568000)*x + 0.03622746366545470000)*x + 0.82481250248383700000)*x + 0.32507892994863100000;
+ }
+ if (x < 16)
+ return (((0.00000051726300753785*x - 0.00002720671238876090)*x + 0.00053403733818413500)*x + 0.99536021775747900000)*x + 0.01507065715532010000;
+ return x;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // LOOKUP_SLOW()
+ //
+ // Computes log (exp (x) + 1).
+ /////////////////////////////////////////////////////////////////
+
+ inline ScoreType LOOKUP_SLOW (ScoreType x){
+ return log (exp (x) + 1);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MAX()
+ //
+ // Compute max of three numbers
+ /////////////////////////////////////////////////////////////////
+
+ inline ScoreType MAX (ScoreType x, ScoreType y, ScoreType z){
+ if (x >= y){
+ if (x >= z)
+ return x;
+ return z;
+ }
+ if (y >= z)
+ return y;
+ return z;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // LOG_PLUS_EQUALS()
+ //
+ // Add two log probabilities and store in the first argument
+ /////////////////////////////////////////////////////////////////
+
+ inline void LOG_PLUS_EQUALS (ScoreType &x, ScoreType y){
+ if (x < y)
+ x = (x <= LOG_ZERO) ? y : LOOKUP(y-x) + x;
+ else
+ x = (y <= LOG_ZERO) ? x : LOOKUP(x-y) + y;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // LOG_PLUS_EQUALS_SLOW()
+ //
+ // Add two log probabilities and store in the first argument
+ /////////////////////////////////////////////////////////////////
+
+ inline void LOG_PLUS_EQUALS_SLOW (ScoreType &x, ScoreType y){
+ if (x < y)
+ x = (x <= LOG_ZERO) ? y : LOOKUP_SLOW(y-x) + x;
+ else
+ x = (y <= LOG_ZERO) ? x : LOOKUP_SLOW(x-y) + y;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // LOG_ADD()
+ //
+ // Add two log probabilities
+ /////////////////////////////////////////////////////////////////
+
+ inline ScoreType LOG_ADD (ScoreType x, ScoreType y){
+ if (x < y) return (x <= LOG_ZERO) ? y : LOOKUP(y-x) + x;
+ return (y <= LOG_ZERO) ? x : LOOKUP(x-y) + y;
+ }
+ */
+
+/*
+ /////////////////////////////////////////////////////////////////
+ // LOG()
+ //
+ // Compute the logarithm of x.
+ /////////////////////////////////////////////////////////////////
+
+ inline float LOG (float x){
+ return log (x);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // EXP()
+ //
+ // Computes exp(x), for -4.6 <= x <= 0.
+ /////////////////////////////////////////////////////////////////
+
+ inline float EXP (float x){
+ assert (x <= 0.00f);
+ if (x < EXP_UNDERFLOW_THRESHOLD) return 0.0f;
+ return (((0.006349841068584 * x + 0.080775412572352) * x + 0.397982026296272) * x + 0.95279335963787f) * x + 0.995176455837312f;
+ //return (((0.00681169825657f * x + 0.08386267698832f) * x + 0.40413983195844f) * x + 0.95656674979767f) * x + 0.99556744049130f;
+ }
+ */
+
+const float EXP_UNDERFLOW_THRESHOLD = -4.6;
+const float LOG_UNDERFLOW_THRESHOLD = 7.5;
+
+/////////////////////////////////////////////////////////////////
+// LOOKUP()
+//
+// Computes log (exp (x) + 1), for 0 <= x <= 7.5, using one of
+// four cubic polynomials chosen by interval. The argument
+// range is checked with assert() only.
+/////////////////////////////////////////////////////////////////
+
+inline float LOOKUP(float x) {
+    assert(x >= 0.00f);
+    assert(x <= LOG_UNDERFLOW_THRESHOLD);
+    //return ((-0.00653779113685f * x + 0.09537236626558f) * x + 0.55317574459331f) * x + 0.68672959851568f;
+
+    // cubic coefficients, highest power first, for the piece containing x
+    float c3, c2, c1, c0;
+    if (x <= 1.00f) {
+        c3 = -0.009350833524763f;
+        c2 = 0.130659527668286f;
+        c1 = 0.498799810682272f;
+        c0 = 0.693203116424741f;
+    } else if (x <= 2.50f) {
+        c3 = -0.014532321752540f;
+        c2 = 0.139942324101744f;
+        c1 = 0.495635523139337f;
+        c0 = 0.692140569840976f;
+    } else if (x <= 4.50f) {
+        c3 = -0.004605031767994f;
+        c2 = 0.063427417320019f;
+        c1 = 0.695956496475118f;
+        c0 = 0.514272634594009f;
+    } else {
+        c3 = -0.000458661602210f;
+        c2 = 0.009695946122598f;
+        c1 = 0.930734667215156f;
+        c0 = 0.168037164329057f;
+    }
+
+    // Horner evaluation of the selected cubic
+    return ((c3 * x + c2) * x + c1) * x + c0;
+
+    //return (((0.00089738532761f * x - 0.01859488697982f) * x + 0.14415772028626f) * x + 0.49515490689159f) * x + 0.69311928966454f;
+}
+
+/////////////////////////////////////////////////////////////////
+// LOOKUP_SLOW()
+//
+// Computes log (exp (x) + 1).
+/////////////////////////////////////////////////////////////////
+
+inline float LOOKUP_SLOW(float x) {
+    // Direct evaluation of log(1 + e^x) with the math library; no range
+    // restriction and no polynomial approximation.
+    return log(1 + exp(x));
+}
+
+/////////////////////////////////////////////////////////////////
+// MAX()
+//
+// Compute max of three numbers
+/////////////////////////////////////////////////////////////////
+
+inline float MAX(float x, float y, float z) {
+    // Reduce pairwise: first the larger of x and y, then compare against z.
+    // On ties the earlier argument is kept.
+    const float xy = (x >= y) ? x : y;
+    return (xy >= z) ? xy : z;
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_PLUS_EQUALS()
+//
+// Add two log probabilities and store in the first argument
+/////////////////////////////////////////////////////////////////
+
+inline void LOG_PLUS_EQUALS(float &x, float y) {
+    if (x < y) {
+        // y dominates: x is either "log zero" or too small to change y
+        if (x == LOG_ZERO || y - x >= LOG_UNDERFLOW_THRESHOLD)
+            x = y;
+        else
+            x = LOOKUP(y - x) + x;
+        return;
+    }
+
+    // x dominates (or the operands are equal): nothing to add when y is
+    // "log zero" or negligible relative to x
+    if (y == LOG_ZERO || x - y >= LOG_UNDERFLOW_THRESHOLD)
+        return;
+    x = LOOKUP(x - y) + y;
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_PLUS_EQUALS_SLOW()
+//
+// Add two log probabilities and store in the first argument
+/////////////////////////////////////////////////////////////////
+
+inline void LOG_PLUS_EQUALS_SLOW(float &x, float y) {
+    if (x < y) {
+        // a "log zero" accumulator is simply replaced by y
+        if (x == LOG_ZERO)
+            x = y;
+        else
+            x = LOOKUP_SLOW(y - x) + x;
+    } else if (y != LOG_ZERO) {
+        // a "log zero" y leaves the accumulator untouched
+        x = LOOKUP_SLOW(x - y) + y;
+    }
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_ADD()
+//
+// Add two log probabilities
+/////////////////////////////////////////////////////////////////
+
+inline float LOG_ADD(float x, float y) {
+    // Order the operands; when x is not strictly below y it is treated as
+    // the larger one.
+    float lo = y;
+    float hi = x;
+    if (x < y) {
+        lo = x;
+        hi = y;
+    }
+
+    // The smaller term contributes nothing when it is "log zero" or when the
+    // gap to the larger term reaches the LOOKUP() range limit.
+    if (lo == LOG_ZERO || hi - lo >= LOG_UNDERFLOW_THRESHOLD)
+        return hi;
+    return LOOKUP(hi - lo) + lo;
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_ADD()
+//
+// Add three log probabilities
+/////////////////////////////////////////////////////////////////
+
+inline float LOG_ADD(float x1, float x2, float x3) {
+    // fold from the right: x1 (+) (x2 (+) x3)
+    const float tail = LOG_ADD(x2, x3);
+    return LOG_ADD(x1, tail);
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_ADD()
+//
+// Add four log probabilities
+/////////////////////////////////////////////////////////////////
+
+inline float LOG_ADD(float x1, float x2, float x3, float x4) {
+    // fold from the right, innermost pair first
+    float acc = LOG_ADD(x3, x4);
+    acc = LOG_ADD(x2, acc);
+    return LOG_ADD(x1, acc);
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_ADD()
+//
+// Add five log probabilities
+/////////////////////////////////////////////////////////////////
+
+inline float LOG_ADD(float x1, float x2, float x3, float x4, float x5) {
+    // fold from the right, innermost pair first
+    float acc = LOG_ADD(x4, x5);
+    acc = LOG_ADD(x3, acc);
+    acc = LOG_ADD(x2, acc);
+    return LOG_ADD(x1, acc);
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_ADD()
+//
+// Add six log probabilities
+/////////////////////////////////////////////////////////////////
+
+inline float LOG_ADD(float x1, float x2, float x3, float x4, float x5,
+        float x6) {
+    // fold from the right, innermost pair first
+    float acc = LOG_ADD(x5, x6);
+    acc = LOG_ADD(x4, acc);
+    acc = LOG_ADD(x3, acc);
+    acc = LOG_ADD(x2, acc);
+    return LOG_ADD(x1, acc);
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_ADD()
+//
+// Add seven log probabilities
+/////////////////////////////////////////////////////////////////
+
+inline float LOG_ADD(float x1, float x2, float x3, float x4, float x5, float x6,
+        float x7) {
+    // fold from the right, innermost pair first
+    float acc = LOG_ADD(x6, x7);
+    acc = LOG_ADD(x5, acc);
+    acc = LOG_ADD(x4, acc);
+    acc = LOG_ADD(x3, acc);
+    acc = LOG_ADD(x2, acc);
+    return LOG_ADD(x1, acc);
+}
+
+/////////////////////////////////////////////////////////////////
+// ChooseBestOfThree()
+//
+// Store the largest of three values x1, x2, and x3 in *x. Also
+// if xi is the largest value, then store bi in *b.
+/////////////////////////////////////////////////////////////////
+
+inline void ChooseBestOfThree(float x1, float x2, float x3, char b1, char b2,
+        char b3, float *x, char *b) {
+    // Start with the first candidate and let later candidates replace it
+    // only when the current best is not >= the challenger, so earlier
+    // candidates win ties.
+    float best = x1;
+    char tag = b1;
+    if (!(x1 >= x2)) {
+        best = x2;
+        tag = b2;
+    }
+    if (!(best >= x3)) {
+        best = x3;
+        tag = b3;
+    }
+
+    *x = best;
+    *b = tag;
+}
+
+#endif
--- /dev/null
+/////////////////////////////////////////////////////////////////
+// Sequence.h
+//
+// Class for reading/manipulating single sequence character data.
+/////////////////////////////////////////////////////////////////
+
+#ifndef SEQUENCE_H
+#define SEQUENCE_H
+
+#include <string>
+#include <fstream>
+#include <iostream>
+#include <cctype>
+#include <cstdlib>
+#include "SafeVector.h"
+#include "FileBuffer.h"
+
+/////////////////////////////////////////////////////////////////
+// Sequence
+//
+// Class for storing sequence information.
+/////////////////////////////////////////////////////////////////
+
+class Sequence {
+
+    bool isValid; // a boolean indicating whether the sequence data is valid or not
+    string header; // string containing the comment line of the FASTA file
+    SafeVector<char> *data; // owned character data; (*data)[0] is always '@'
+    int length; // number of characters stored after the '@' sentinel
+    int sequenceLabel; // integer sequence label, typically to indicate the ordering of
+                       // sequences in a Multi-FASTA file
+    int inputLabel; // position of sequence in original input
+
+    // NOTE(review): the destructor deletes `data`, but no copy constructor or
+    // assignment operator is defined, so a compiler-generated copy would share
+    // (and double-delete) the buffer. Use Clone() to duplicate a sequence.
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::Sequence()
+    //
+    // Default constructor. Creates an empty, invalid sequence; used
+    // internally by Clone(), GetRange() and AddGaps().
+    /////////////////////////////////////////////////////////////////
+
+    Sequence() :
+            isValid(false), header(""), data(NULL), length(0),
+            sequenceLabel(0), inputLabel(0) {
+    }
+
+public:
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::Sequence()
+    //
+    // Constructor. Reads one FASTA record from a FileBuffer. The
+    // object stays invalid (Fail() returns true) when no header line
+    // starting with '>' is found or when the record has no residues.
+    // '.' is treated as '-'; gaps are dropped when stripGaps is true.
+    // Any other character that is not a letter (including a gap when
+    // stripGaps is false) prints an error and terminates the program.
+    // Letters are stored in upper case.
+    /////////////////////////////////////////////////////////////////
+
+    Sequence(FileBuffer &infile, bool stripGaps = false) :
+            isValid(false), header("~"), data(NULL), length(0),
+            sequenceLabel(0), inputLabel(0) {
+
+        // read until the first non-blank line
+        while (!infile.eof()) {
+            infile.GetLine(header);
+            if (header.length() != 0)
+                break;
+        }
+
+        // anything but a header line leaves the sequence invalid; the
+        // emptiness test avoids indexing an empty string
+        if (header.empty() || header[0] != '>')
+            return;
+
+        // remove the leading ">" and any leading or trailing white space
+        // of the header comment in a single pass (no repeated substr())
+        string::size_type first = 1;
+        while (first < header.length()
+                && isspace(static_cast<unsigned char>(header[first])))
+            ++first;
+        string::size_type last = header.length();
+        while (last > first
+                && isspace(static_cast<unsigned char>(header[last - 1])))
+            --last;
+        header = header.substr(first, last - first);
+
+        // get ready to read the data[] array; note that data[0] is always '@'
+        char ch;
+        data = new SafeVector<char>;
+        assert(data);
+        data->push_back('@');
+
+        // get a character from the file
+        while (infile.Get(ch)) {
+
+            // if we've reached a new comment line, put the character back and stop
+            if (ch == '>') {
+                infile.UnGet();
+                break;
+            }
+
+            // skip whitespace (cast: <cctype> functions require a value
+            // representable as unsigned char)
+            if (isspace(static_cast<unsigned char>(ch)))
+                continue;
+
+            // substitute gap character
+            if (ch == '.')
+                ch = '-';
+            if (stripGaps && ch == '-')
+                continue;
+
+            // check for known characters
+            if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))) {
+                cerr << "ERROR: Unknown character encountered: " << ch
+                        << endl;
+                exit(1);
+            }
+
+            // everything's ok so far, so just store this character.
+            if (ch >= 'a' && ch <= 'z') {
+                ch = ch - 'a' + 'A';
+            } //change to upper case. fixed by Liu Yongchao, May 21, 2010
+
+            data->push_back(ch);
+            ++length;
+        }
+
+        // sequence must contain data in order to be valid
+        isValid = length > 0;
+        if (!isValid) {
+            delete data;
+            data = NULL;
+        }
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::Sequence()
+    //
+    // Constructor. Builds a sequence from existing data. Note
+    // that the data must use one-based indexing where data[0] should
+    // be set to '@'. The object takes ownership of the data pointer
+    // (it is deleted by the destructor).
+    /////////////////////////////////////////////////////////////////
+
+    Sequence(SafeVector<char> *data, string header, int length,
+            int sequenceLabel, int inputLabel) :
+            isValid(data != NULL), header(header), data(data), length(length),
+            sequenceLabel(sequenceLabel), inputLabel(inputLabel) {
+        assert(data);
+        assert((*data)[0] == '@');
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::~Sequence()
+    //
+    // Destructor. Release allocated memory.
+    /////////////////////////////////////////////////////////////////
+
+    ~Sequence() {
+        if (data) {
+            assert(isValid);
+            delete data;
+            data = NULL;
+            isValid = false;
+        }
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::GetHeader()
+    //
+    // Return the string comment associated with this sequence.
+    /////////////////////////////////////////////////////////////////
+
+    string GetHeader() const {
+        return header;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::GetName()
+    //
+    // Return the first word of the string comment associated with
+    // this sequence, i.e. the first run of non-whitespace characters.
+    // Returns an empty string when the header holds no such word.
+    // (Replaces an sscanf("%s") into a fixed 1024-byte buffer, which
+    // overflowed on long names and read uninitialised memory on an
+    // empty header.)
+    /////////////////////////////////////////////////////////////////
+
+    string GetName() const {
+        // same whitespace set that "%s" skips and stops on in the C locale
+        static const char * const blanks = " \t\n\v\f\r";
+
+        const string::size_type first = header.find_first_not_of(blanks);
+        if (first == string::npos)
+            return string();
+
+        const string::size_type stop = header.find_first_of(blanks, first);
+        if (stop == string::npos)
+            return header.substr(first);
+        return header.substr(first, stop - first);
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::GetDataPtr()
+    //
+    // Return the iterator to data associated with this sequence.
+    // The iterator refers to the '@' sentinel at index 0.
+    /////////////////////////////////////////////////////////////////
+
+    SafeVector<char>::iterator GetDataPtr() {
+        assert(isValid);
+        assert(data);
+        return data->begin();
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::GetPosition()
+    //
+    // Return the character at position i. Recall that the character
+    // data is stored with one-based indexing.
+    /////////////////////////////////////////////////////////////////
+
+    char GetPosition(int i) const {
+        assert(isValid);
+        assert(data);
+        assert(i >= 1 && i <= length);
+        return (*data)[i];
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::SetLabel()
+    //
+    // Sets both the sequence (sorting) label and the input label to i.
+    /////////////////////////////////////////////////////////////////
+
+    void SetLabel(int i) {
+        assert(isValid);
+        sequenceLabel = i;
+        inputLabel = i;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::SetSortLabel()
+    //
+    // Sets the sequence sorting label to i.
+    /////////////////////////////////////////////////////////////////
+
+    void SetSortLabel(int i) {
+        assert(isValid);
+        sequenceLabel = i;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::GetLabel()
+    //
+    // Retrieves the input label.
+    /////////////////////////////////////////////////////////////////
+
+    int GetLabel() const {
+        assert(isValid);
+        return inputLabel;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::GetSortLabel()
+    //
+    // Retrieves the sorting label.
+    /////////////////////////////////////////////////////////////////
+
+    int GetSortLabel() const {
+        assert(isValid);
+        return sequenceLabel;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::Fail()
+    //
+    // Checks to see if the sequence failed to load; returns true
+    // when the sequence is NOT valid.
+    /////////////////////////////////////////////////////////////////
+
+    bool Fail() const {
+        return !isValid;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::GetLength()
+    //
+    // Returns the length of the sequence.
+    /////////////////////////////////////////////////////////////////
+
+    int GetLength() const {
+        assert(isValid);
+        assert(data);
+        return length;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::WriteMFA()
+    //
+    // Writes the sequence to outfile in MFA format. Uses numColumns
+    // columns per line. If useIndex is set to false, then the
+    // header is printed as normal, but if useIndex is true, then
+    // ">S###" is printed where ### represents the sequence label.
+    /////////////////////////////////////////////////////////////////
+
+    void WriteMFA(ostream &outfile, int numColumns,
+            bool useIndex = false) const {
+        assert(isValid);
+        assert(data);
+        assert(!outfile.fail());
+
+        // print out heading
+        if (useIndex)
+            outfile << ">S" << GetLabel() << endl;
+        else
+            outfile << ">" << header << endl;
+
+        // print out character data
+        int ct = 1;
+        for (; ct <= length; ct++) {
+            outfile << (*data)[ct];
+            if (ct % numColumns == 0)
+                outfile << endl;
+        }
+        // terminate the last line unless the loop already did
+        if ((ct - 1) % numColumns != 0)
+            outfile << endl;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::Clone()
+    //
+    // Returns a new deep copy of the sequence. The caller owns the
+    // returned object.
+    /////////////////////////////////////////////////////////////////
+
+    Sequence *Clone() const {
+        assert(data);
+
+        Sequence *ret = new Sequence();
+        assert(ret);
+
+        ret->isValid = isValid;
+        ret->header = header;
+        ret->data = new SafeVector<char>;
+        assert(ret->data);
+        *(ret->data) = *data;
+        ret->length = length;
+        ret->sequenceLabel = sequenceLabel;
+        ret->inputLabel = inputLabel;
+
+        return ret;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::GetRange()
+    //
+    // Returns a new sequence object consisting of a range of
+    // characters from the current sequence (positions start..end,
+    // one-based and inclusive). The caller owns the returned object.
+    /////////////////////////////////////////////////////////////////
+
+    Sequence *GetRange(int start, int end) const {
+        assert(data);
+
+        Sequence *ret = new Sequence();
+        assert(ret);
+
+        assert(start >= 1 && start <= length);
+        assert(end >= 1 && end <= length);
+        assert(start <= end);
+
+        ret->isValid = isValid;
+        ret->header = header;
+        ret->data = new SafeVector<char>;
+        assert(ret->data);
+        ret->data->push_back('@');
+        for (int i = start; i <= end; i++)
+            ret->data->push_back((*data)[i]);
+        ret->length = end - start + 1;
+        ret->sequenceLabel = sequenceLabel;
+        ret->inputLabel = inputLabel;
+
+        return ret;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::AddGaps()
+    //
+    // Given an SafeVector<char> containing the skeleton for an
+    // alignment and the identity of the current character, this
+    // routine will create a new sequence with all necessary gaps added.
+    // For instance,
+    //    alignment = "XXXBBYYYBBYYXX"
+    //    id = 'X'
+    // will perform the transformation
+    //    "ATGCCGTCA" --> "ATGCC---GT--CA"
+    //                    (XXXBBYYYBBYYXX)
+    // Columns marked 'B' or id consume the next character of this
+    // sequence; every other column becomes '-'. The caller owns the
+    // returned object.
+    /////////////////////////////////////////////////////////////////
+
+    Sequence *AddGaps(SafeVector<char> *alignment, char id) {
+        assert(data);
+
+        Sequence *ret = new Sequence();
+        assert(ret);
+
+        ret->isValid = isValid;
+        ret->header = header;
+        ret->data = new SafeVector<char>;
+        assert(ret->data);
+        ret->length = (int) alignment->size();
+        ret->sequenceLabel = sequenceLabel;
+        ret->inputLabel = inputLabel;
+        ret->data->push_back('@');
+
+        // dataIter walks this sequence's characters, skipping the '@' sentinel
+        SafeVector<char>::iterator dataIter = data->begin() + 1;
+        for (SafeVector<char>::iterator iter = alignment->begin();
+                iter != alignment->end(); ++iter) {
+            if (*iter == 'B' || *iter == id) {
+                ret->data->push_back(*dataIter);
+                ++dataIter;
+            } else
+                ret->data->push_back('-');
+        }
+
+        return ret;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::GetString()
+    //
+    // Returns the sequence as a string with gaps removed.
+    /////////////////////////////////////////////////////////////////
+
+    string GetString() const {
+        string s = "";
+        for (int i = 1; i <= length; i++) {
+            if ((*data)[i] != '-')
+                s += (*data)[i];
+        }
+        return s;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::GetMapping()
+    //
+    // Returns a SafeVector<int> containing the indices of every
+    // character in the sequence.  For instance, if the data is
+    // "ATGCC---GT--CA", the method returns {1,2,3,4,5,9,10,13,14},
+    // preceded by a single 0 stored at index 0 of the result. The
+    // caller owns the returned vector.
+    /////////////////////////////////////////////////////////////////
+
+    SafeVector<int> *GetMapping() const {
+        SafeVector<int> *ret = new SafeVector<int>(1, 0);
+        for (int i = 1; i <= length; i++) {
+            if ((*data)[i] != '-')
+                ret->push_back(i);
+        }
+        return ret;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::Highlight()
+    //
+    // Changes all positions with score >= cutoff to upper case and
+    // all positions with score < cutoff to lower case. scores is
+    // zero-based: scores[i - 1] belongs to sequence position i.
+    /////////////////////////////////////////////////////////////////
+
+    void Highlight(const SafeVector<float> &scores, const float cutoff) {
+        for (int i = 1; i <= length; i++) {
+            // cast: <cctype> functions require a value representable as
+            // unsigned char
+            const unsigned char current =
+                    static_cast<unsigned char>((*data)[i]);
+            if (scores[i - 1] >= cutoff)
+                (*data)[i] = static_cast<char>(toupper(current));
+            else
+                (*data)[i] = static_cast<char>(tolower(current));
+        }
+    }
+};
+
+#endif
--- /dev/null
+/////////////////////////////////////////////////////////////////
+// SparseMatrix.h
+//
+// Sparse matrix computations
+/////////////////////////////////////////////////////////////////
+
+#ifndef SPARSEMATRIX_H
+#define SPARSEMATRIX_H
+
+#include <iostream>
+
+using namespace std;
+
+// Minimum posterior probability value that is maintained in the sparse
+// matrix representation; the SparseMatrix constructor only stores cells
+// whose value is >= this cutoff.
+const float POSTERIOR_CUTOFF = 0.01;
+
+// Sparse matrix entry type:
+//   first  --> column
+//   second --> value
+typedef pair<int, float> PIF;
+
+/////////////////////////////////////////////////////////////////
+// SparseMatrix
+//
+// Class for sparse matrix computations
+/////////////////////////////////////////////////////////////////
+
+class SparseMatrix {
+
+    int seq1Length, seq2Length; // dimensions of matrix
+    VI rowSize; // rowSize[i] = # of cells in row i (rowSize[0] is -1)
+    SafeVector<PIF> data; // stored cells, row after row
+    SafeVector<SafeVector<PIF>::iterator> rowPtrs; // rowPtrs[i] = first stored cell of row i
+
+    /////////////////////////////////////////////////////////////////
+    // SparseMatrix::SparseMatrix()
+    //
+    // Private constructor. Leaves every member default-initialized;
+    // ComputeTranspose() fills the object in afterwards.
+    /////////////////////////////////////////////////////////////////
+
+    SparseMatrix() {
+    }
+
+public:
+
+    /////////////////////////////////////////////////////////////////
+    // SparseMatrix::SparseMatrix()
+    //
+    // Constructor.  Builds a sparse matrix from a posterior matrix.
+    // Note that the expected format for the posterior matrix is as
+    // a (seq1Length+1) x (seq2Length+1) matrix where the 0th row
+    // and 0th column are ignored (they should contain all zeroes).
+    // Only cells with a value >= POSTERIOR_CUTOFF are kept.
+    /////////////////////////////////////////////////////////////////
+
+    SparseMatrix(int seq1Length, int seq2Length, const VF &posterior) :
+            seq1Length(seq1Length), seq2Length(seq2Length) {
+
+        assert(seq1Length > 0);
+        assert(seq2Length > 0);
+
+        // first pass: count the cells of the posterior matrix that reach
+        // the threshold, so storage can be sized exactly
+        int numCells = 0;
+        VF::const_iterator scan = posterior.begin();
+        for (int r = 0; r <= seq1Length; r++) {
+            for (int c = 0; c <= seq2Length; c++, ++scan) {
+                if (*scan >= POSTERIOR_CUTOFF) {
+                    assert(r != 0 && c != 0);
+                    numCells++;
+                }
+            }
+        }
+
+        // allocate memory; slot 0 of the per-row tables is a placeholder
+        data.resize(numCells);
+        rowSize.resize(seq1Length + 1);
+        rowSize[0] = -1;
+        rowPtrs.resize(seq1Length + 1);
+        rowPtrs[0] = data.end();
+
+        // second pass: copy the surviving cells, skipping row 0 and column 0
+        SafeVector<PIF>::iterator out = data.begin();
+        for (int r = 1; r <= seq1Length; r++) {
+            const VF::const_iterator rowStart = posterior.begin()
+                    + r * (seq2Length + 1);
+            rowPtrs[r] = out;
+            for (int c = 1; c <= seq2Length; c++) {
+                if (rowStart[c] >= POSTERIOR_CUTOFF) {
+                    out->first = c;
+                    out->second = rowStart[c];
+                    ++out;
+                }
+            }
+            rowSize[r] = out - rowPtrs[r];
+        }
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // SparseMatrix::GetRowPtr()
+    //
+    // Returns the pointer to a particular row in the sparse matrix.
+    /////////////////////////////////////////////////////////////////
+
+    SafeVector<PIF>::iterator GetRowPtr(int row) const {
+        assert(row >= 1 && row <= seq1Length);
+        return rowPtrs[row];
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // SparseMatrix::GetValue()
+    //
+    // Returns value at a particular row, column; 0 when the cell is
+    // not stored. Performs a linear scan of the row.
+    /////////////////////////////////////////////////////////////////
+
+    float GetValue(int row, int col) {
+        assert(row >= 1 && row <= seq1Length);
+        assert(col >= 1 && col <= seq2Length);
+
+        const int count = rowSize[row];
+        SafeVector<PIF>::iterator cells = rowPtrs[row];
+        for (int k = 0; k < count; k++) {
+            if (cells[k].first == col)
+                return cells[k].second;
+        }
+        return 0;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // SparseMatrix::GetRowSize()
+    //
+    // Returns the number of entries in a particular row.
+    /////////////////////////////////////////////////////////////////
+
+    int GetRowSize(int row) const {
+        assert(row >= 1 && row <= seq1Length);
+        return rowSize[row];
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // SparseMatrix::GetSeq1Length()
+    //
+    // Returns the first dimension of the matrix.
+    /////////////////////////////////////////////////////////////////
+
+    int GetSeq1Length() const {
+        return seq1Length;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // SparseMatrix::GetSeq2Length()
+    //
+    // Returns the second dimension of the matrix.
+    /////////////////////////////////////////////////////////////////
+
+    int GetSeq2Length() const {
+        return seq2Length;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // SparseMatrix::GetNumCells()
+    //
+    // Returns the total number of cells stored in the sparse matrix.
+    /////////////////////////////////////////////////////////////////
+
+    int GetNumCells() const {
+        return data.size();
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // SparseMatrix::Print()
+    //
+    // Prints out a sparse matrix, one line per row, each stored cell
+    // as "(column,value)".
+    /////////////////////////////////////////////////////////////////
+
+    void Print(ostream &outfile) const {
+        outfile << "Sparse Matrix:" << endl;
+        for (int r = 1; r <= seq1Length; r++) {
+            outfile << "  " << r << ":";
+            const int count = rowSize[r];
+            SafeVector<PIF>::iterator cells = rowPtrs[r];
+            for (int k = 0; k < count; k++) {
+                outfile << " (" << cells[k].first << ","
+                        << cells[k].second << ")";
+            }
+            outfile << endl;
+        }
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // SparseMatrix::ComputeTranspose()
+    //
+    // Returns a new sparse matrix containing the transpose of the
+    // current matrix. The caller owns the returned object.
+    /////////////////////////////////////////////////////////////////
+
+    SparseMatrix *ComputeTranspose() const {
+
+        // create a new sparse matrix with swapped dimensions
+        SparseMatrix *ret = new SparseMatrix();
+        int numCells = data.size();
+
+        ret->seq1Length = seq2Length;
+        ret->seq2Length = seq1Length;
+
+        // allocate memory; slot 0 of the per-row tables is a placeholder
+        ret->data.resize(numCells);
+        ret->rowSize.resize(seq2Length + 1);
+        ret->rowSize[0] = -1;
+        ret->rowPtrs.resize(seq2Length + 1);
+        ret->rowPtrs[0] = ret->data.end();
+
+        // row sizes of the transpose = number of cells per source column
+        for (int c = 1; c <= seq2Length; c++)
+            ret->rowSize[c] = 0;
+        for (int k = 0; k < numCells; k++)
+            ret->rowSize[data[k].first]++;
+
+        // row starts of the transpose = running sum of the row sizes
+        SafeVector<PIF>::iterator next = ret->data.begin();
+        for (int c = 1; c <= seq2Length; c++) {
+            ret->rowPtrs[c] = next;
+            next += ret->rowSize[c];
+        }
+
+        // scatter the cells; cursor[c] is the next free slot of row c
+        SafeVector<SafeVector<PIF>::iterator> cursor = ret->rowPtrs;
+
+        for (int r = 1; r <= seq1Length; r++) {
+            SafeVector<PIF>::iterator src = rowPtrs[r];
+            const int count = rowSize[r];
+            for (int k = 0; k < count; k++, ++src) {
+                SafeVector<PIF>::iterator &dst = cursor[src->first];
+                dst->first = r;
+                dst->second = src->second;
+                ++dst;
+            }
+        }
+
+        return ret;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // SparseMatrix::GetPosterior()
+    //
+    // Return the posterior representation of the sparse matrix: a
+    // newly allocated (seq1Length+1) x (seq2Length+1) matrix in which
+    // every cell that is not stored is 0. The caller owns the result.
+    /////////////////////////////////////////////////////////////////
+
+    VF *GetPosterior() const {
+
+        // create a new posterior matrix
+        VF *posteriorPtr = new VF((seq1Length + 1) * (seq2Length + 1));
+        assert(posteriorPtr);
+        VF &posterior = *posteriorPtr;
+
+        // clear every cell, then write the stored ones back
+        const int total = (seq1Length + 1) * (seq2Length + 1);
+        for (int k = 0; k < total; k++)
+            posterior[k] = 0;
+
+        for (int r = 1; r <= seq1Length; r++) {
+            VF::iterator denseRow = posterior.begin() + r * (seq2Length + 1);
+            SafeVector<PIF>::iterator cells = rowPtrs[r];
+            const int count = rowSize[r];
+            for (int k = 0; k < count; k++)
+                denseRow[cells[k].first] = cells[k].second;
+        }
+
+        return posteriorPtr;
+    }
+
+};
+
+#endif
--- /dev/null
+/***********************************************
+ * # Copyright 2009-2010. Liu Yongchao
+ * # Contact: Liu Yongchao, School of Computer Engineering,
+ * # Nanyang Technological University.
+ * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
+ * #
+ * # GPL version 3.0 applies.
+ * #
+ * ************************************************/
+#include "MSA.h"
+
+int main(int argc, char **argv) {
+    // The only work done here is constructing an MSA object from the
+    // command line; no further call is made before returning.
+    MSA msa(argc, argv);
+    return 0;
+}
--- /dev/null
+(1) 23 Aug, 2010
+ Add an option "-num_threads" to allow users to specify the number of
+ threads used
+(2) 12 April, 2012
+ GCC 4.6 can successfully compile it.
+
+(3) 3 July, 2012
+ Add a new option "-o" (or "--outfile") to allow users to specify the output file name.
+ By default, it will output to STDOUT
--- /dev/null
+\feff\r
+Microsoft Visual Studio Solution File, Format Version 9.00\r
+# Visual Studio 2005\r
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MSAProbs", "MSAProbs\MSAProbs.vcproj", "{671563E4-93A2-419E-8B41-48DDF71DD144}"\r
+EndProject\r
+Global\r
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution\r
+ Debug|Win32 = Debug|Win32\r
+ Release|Win32 = Release|Win32\r
+ EndGlobalSection\r
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution\r
+ {671563E4-93A2-419E-8B41-48DDF71DD144}.Debug|Win32.ActiveCfg = Debug|Win32\r
+ {671563E4-93A2-419E-8B41-48DDF71DD144}.Debug|Win32.Build.0 = Debug|Win32\r
+ {671563E4-93A2-419E-8B41-48DDF71DD144}.Release|Win32.ActiveCfg = Release|Win32\r
+ {671563E4-93A2-419E-8B41-48DDF71DD144}.Release|Win32.Build.0 = Release|Win32\r
+ EndGlobalSection\r
+ GlobalSection(SolutionProperties) = preSolution\r
+ HideSolutionNode = FALSE\r
+ EndGlobalSection\r
+EndGlobal\r
--- /dev/null
+/////////////////////////////////////////////////////////////////
+// Defaults.h
+//
+// Default constants for use in MSAPROBS. The emission
+// probabilities were computed using the program used to build
+// the BLOSUM62 matrix from the BLOCKS 5.0 dataset. Transition
+// parameters were obtained via unsupervised EM training on the
+// BALIBASE 2.0 benchmark alignment database.
+/////////////////////////////////////////////////////////////////
+
+#ifndef DEFAULTS_H
+#define DEFAULTS_H
+
+#include <string>
+
+using namespace std;
+
+/*
+ float initDistrib1Default[] = { 0.3202854395, 0.3398572505, 0.3398572505 };
+ float gapOpen1Default[] = { 0.1375414133, 0.1375414133 };
+ float gapExtend1Default[] = { 0.7832147479, 0.7832147479 };
+ */
+
+// Default transition parameters (see the file header: obtained via
+// unsupervised EM training on BALIBASE 2.0).
+//
+// "1" set: 3 initial-distribution values (they sum to ~1) plus 2 gap-open
+// and 2 gap-extend values; the two entries of each gap array are equal.
+// NOTE(review): presumably one match state plus one insert state per
+// sequence -- confirm against the code that consumes these arrays.
+float initDistrib1Default[] = { 0.6080327034f, 0.1959836632f, 0.1959836632f };
+float gapOpen1Default[] = { 0.01993141696f, 0.01993141696f };
+float gapExtend1Default[] = { 0.7943345308f, 0.7943345308f };
+
+// "2" set: 5 initial-distribution values (they sum to ~1) plus 4 gap-open
+// and 4 gap-extend values, given as two pairs of equal entries.
+// NOTE(review): presumably a model with two kinds of insert state per
+// sequence -- confirm against the code that consumes these arrays.
+float initDistrib2Default[] = { 0.6814756989f, 8.615339902e-05f,
+ 8.615339902e-05f, 0.1591759622f, 0.1591759622 };
+float gapOpen2Default[] = { 0.0119511066f, 0.0119511066f, 0.008008334786f,
+ 0.008008334786 };
+float gapExtend2Default[] = { 0.3965826333f, 0.3965826333f, 0.8988758326f,
+ 0.8988758326 };
+
+// The 20 one-letter residue codes of the default alphabet.
+string alphabetDefault = "ARNDCQEGHILKMFPSTWYV";
+// Default single-residue emission probabilities, 20 values summing to ~1.
+// NOTE(review): presumably entry i belongs to alphabetDefault[i] -- confirm
+// against the code that consumes this array.
+float emitSingleDefault[20] = { 0.07831005f, 0.05246024f, 0.04433257f,
+ 0.05130349f, 0.02189704f, 0.03585766f, 0.05615771f, 0.07783433f,
+ 0.02601093f, 0.06511648f, 0.09716489f, 0.05877077f, 0.02438117f,
+ 0.04463228f, 0.03940142f, 0.05849916f, 0.05115306f, 0.01203523f,
+ 0.03124726f, 0.07343426f };
+
+// Default emission probabilities for residue pairs, stored as a 20 x 20
+// table of which only the lower triangle (diagonal included) is populated;
+// every entry above the diagonal is 0.0f.
+// NOTE(review): presumably rows and columns follow the residue order of
+// alphabetDefault and the consumer mirrors the lower triangle -- confirm.
+float emitPairsDefault[20][20] = { { 0.02373072f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f }, { 0.00244502f, 0.01775118f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f }, { 0.00210228f, 0.00207782f, 0.01281864f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f }, { 0.00223549f, 0.00161657f, 0.00353540f, 0.01911178f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f }, { 0.00145515f, 0.00044701f, 0.00042479f,
+ 0.00036798f, 0.01013470f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }, { 0.00219102f,
+ 0.00253532f, 0.00158223f, 0.00176784f, 0.00032102f, 0.00756604f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f }, { 0.00332218f, 0.00268865f, 0.00224738f, 0.00496800f,
+ 0.00037956f, 0.00345128f, 0.01676565f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }, { 0.00597898f,
+ 0.00194865f, 0.00288882f, 0.00235249f, 0.00071206f, 0.00142432f,
+ 0.00214860f, 0.04062876f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }, { 0.00114353f, 0.00132105f, 0.00141205f,
+ 0.00097077f, 0.00026421f, 0.00113901f, 0.00131767f, 0.00103704f,
+ 0.00867996f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f },
+ { 0.00318853f, 0.00138145f, 0.00104273f, 0.00105355f, 0.00094040f,
+ 0.00100883f, 0.00124207f, 0.00142520f, 0.00059716f, 0.01778263f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }, {
+ 0.00449576f, 0.00246811f, 0.00160275f, 0.00161966f, 0.00138494f,
+ 0.00180553f, 0.00222063f, 0.00212853f, 0.00111754f, 0.01071834f,
+ 0.03583921f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f }, { 0.00331693f, 0.00595650f, 0.00257310f, 0.00252518f,
+ 0.00046951f, 0.00312308f, 0.00428420f, 0.00259311f, 0.00121376f,
+ 0.00157852f, 0.00259626f, 0.01612228f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f }, { 0.00148878f, 0.00076734f,
+ 0.00063401f, 0.00047808f, 0.00037421f, 0.00075546f, 0.00076105f,
+ 0.00066504f, 0.00042237f, 0.00224097f, 0.00461939f, 0.00096120f,
+ 0.00409522f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }, {
+ 0.00165004f, 0.00090768f, 0.00084658f, 0.00069041f, 0.00052274f,
+ 0.00059248f, 0.00078814f, 0.00115204f, 0.00072545f, 0.00279948f,
+ 0.00533369f, 0.00087222f, 0.00116111f, 0.01661038f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f }, { 0.00230618f, 0.00106268f,
+ 0.00100282f, 0.00125381f, 0.00034766f, 0.00090111f, 0.00151550f,
+ 0.00155601f, 0.00049078f, 0.00103767f, 0.00157310f, 0.00154836f,
+ 0.00046718f, 0.00060701f, 0.01846071f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f }, { 0.00631752f, 0.00224540f, 0.00301397f, 0.00285226f,
+ 0.00094867f, 0.00191155f, 0.00293898f, 0.00381962f, 0.00116422f,
+ 0.00173565f, 0.00250962f, 0.00312633f, 0.00087787f, 0.00119036f,
+ 0.00180037f, 0.01346609f, 0.0f, 0.0f, 0.0f, 0.0f }, {
+ 0.00389995f, 0.00186053f, 0.00220144f, 0.00180488f, 0.00073798f,
+ 0.00154526f, 0.00216760f, 0.00214841f, 0.00077747f, 0.00248968f,
+ 0.00302273f, 0.00250862f, 0.00093371f, 0.00107595f, 0.00147982f,
+ 0.00487295f, 0.01299436f, 0.0f, 0.0f, 0.0f }, { 0.00039119f,
+ 0.00029139f, 0.00021006f, 0.00016015f, 0.00010666f, 0.00020592f,
+ 0.00023815f, 0.00038786f, 0.00019097f, 0.00039549f, 0.00076736f,
+ 0.00028448f, 0.00016253f, 0.00085751f, 0.00015674f, 0.00026525f,
+ 0.00024961f, 0.00563625f, 0.0f, 0.0f }, { 0.00131840f,
+ 0.00099430f, 0.00074960f, 0.00066005f, 0.00036626f, 0.00070192f,
+ 0.00092548f, 0.00089301f, 0.00131038f, 0.00127857f, 0.00219713f,
+ 0.00100817f, 0.00054105f, 0.00368739f, 0.00047608f, 0.00102648f,
+ 0.00094759f, 0.00069226f, 0.00999315f, 0.0f }, { 0.00533241f,
+ 0.00169359f, 0.00136609f, 0.00127915f, 0.00119152f, 0.00132844f,
+ 0.00178697f, 0.00194579f, 0.00071553f, 0.01117956f, 0.00914460f,
+ 0.00210897f, 0.00197461f, 0.00256159f, 0.00135781f, 0.00241601f,
+ 0.00343452f, 0.00038538f, 0.00148001f, 0.02075171f } };
+
+#endif
--- /dev/null
+/////////////////////////////////////////////////////////////////
+// FileBuffer.h
+//
+// Buffered file reading.
+/////////////////////////////////////////////////////////////////
+
+#ifndef FILEBUFFER_H
+#define FILEBUFFER_H
+
+#include <string>
+#include <fstream>
+#include <iostream>
+
+using namespace std;
+
+const int BufferSize = 1000; // capacity (in chars) of FileBuffer's internal array and the amount requested per stream read
+
+/////////////////////////////////////////////////////////////////
+// FileBuffer
+//
+// Sequential character reader over an input file. Data is
+// fetched from the stream in chunks of up to BufferSize chars
+// and handed out one character at a time, with a single level
+// of "unget" support.
+/////////////////////////////////////////////////////////////////
+
+class FileBuffer {
+    ifstream file;            // stream the chunks are read from
+    char buffer[BufferSize];  // most recently loaded chunk
+    int currPos;              // index in buffer of the next char to hand out
+    int size;                 // number of chars placed in buffer by the last read
+    bool isEOF;               // set once a reload attempt produced no data
+    bool isValid;             // false if the open failed, and after close()
+    bool canUnget;            // true only directly after a successful Get()
+
+    // Loads the next chunk from the stream and rewinds the read position.
+    // Returns false (and flags end-of-file) when the read delivered nothing.
+    bool Reload() {
+        file.read(buffer, BufferSize);
+        size = file.gcount();
+        currPos = 0;
+        isEOF = (size == 0);
+        return !isEOF;
+    }
+
+public:
+
+    // Opens filename for reading; fail() reports whether that worked.
+    FileBuffer(const char *filename) :
+            file(filename), currPos(0), size(0), isEOF(false),
+            isValid(!file.fail()), canUnget(false) {
+    }
+
+    ~FileBuffer() {
+        close();
+    }
+
+    // True when the file could not be opened or has been closed.
+    bool fail() const {
+        return !isValid;
+    }
+
+    // True when no further characters can be delivered.
+    bool eof() const {
+        return (isEOF || !isValid);
+    }
+
+    // Closes the stream; the object is unusable for reading afterwards.
+    void close() {
+        file.close();
+        isValid = false;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // FileBuffer::Get()
+    //
+    // Stores the next character of the file in ch. Returns true
+    // if and only if a character was delivered.
+    /////////////////////////////////////////////////////////////////
+
+    bool Get(char &ch) {
+        // nothing to deliver from an invalid or exhausted buffer
+        if (eof())
+            return false;
+
+        // current chunk fully consumed: fetch the next one
+        if (currPos == size && !Reload())
+            return false;
+
+        ch = buffer[currPos];
+        ++currPos;
+        canUnget = true;
+        return true;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // FileBuffer::UnGet()
+    //
+    // Pushes the character returned by the latest Get() back so
+    // that the next Get() delivers it again. Only one level of
+    // undo is available.
+    /////////////////////////////////////////////////////////////////
+
+    void UnGet() {
+        assert(canUnget);
+        assert(isValid);
+        assert(currPos > 0);
+        --currPos;
+        assert(currPos < size);
+        canUnget = false;
+        isEOF = false;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // FileBuffer::GetLine()
+    //
+    // Replaces s with the characters up to (not including) the
+    // next newline; the newline itself is consumed. Stops at
+    // end-of-file as well.
+    /////////////////////////////////////////////////////////////////
+
+    void GetLine(string &s) {
+        s.clear();
+        for (char next; Get(next) && next != '\n';)
+            s += next;
+    }
+
+};
+
+#endif
--- /dev/null
+/***********************************************
+ * # Copyright 2009-2010. Liu Yongchao
+ * # Contact: Liu Yongchao, School of Computer Engineering,
+ * # Nanyang Technological University.
+ * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
+ * #
+ * # GPL version 3.0 applies.
+ * #
+ * ************************************************/
+
+#include <string>
+#include <sstream>
+#include <iomanip>
+#include <iostream>
+#include <list>
+#include <set>
+#include <algorithm>
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <cerrno>
+#include <iomanip>
+#include "MSA.h"
+#include "MSAClusterTree.h"
+#include "Defaults.h"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+string parametersInputFilename = ""; // parameter file for MSA::ReadParameters(); empty selects the built-in defaults
+string parametersOutputFilename = "no training"; // NOTE(review): not referenced in the visible part of this file
+string annotationFilename = ""; // set from the argument of -annot
+
+bool enableVerbose = false; // -v / --verbose: progress messages on stderr
+bool enableAnnotation = false; // -annot: doAlign() calls WriteAnnotation() on the final alignment
+bool enableClustalWOutput = false; // -clustalw: result is written with WriteALN() instead of WriteMFA()
+bool enableAlignOrder = false; // -a / --alignment-order; cleared again by ComputeFinalAlignment()
+int numConsistencyReps = 2; // -c: number of DoRelaxation() rounds run by doAlign()
+int numPreTrainingReps = 0; // NOTE(review): not referenced in the visible part of this file
+int numIterativeRefinementReps = 10; // -ir: number of DoIterativeRefinement() calls in ComputeFinalAlignment()
+
+float cutoff = 0; // -co / --cutoff, accepted range [0, 1]; handed to BuildPosterior() in AlignAlignments()
+
+VF initDistrib(NumMatrixTypes); // filled by ReadParameters()
+VF gapOpen(2 * NumInsertStates); // filled by ReadParameters()
+VF gapExtend(2 * NumInsertStates); // filled by ReadParameters()
+VVF emitPairs(256, VF(256, 1e-10)); // indexed by the unsigned char codes of two residues
+VF emitSingle(256, 1e-5); // indexed by the unsigned char code of one residue
+
+string alphabet = alphabetDefault; // residues that have emission entries; replaced when a parameter file is read
+
+const int MIN_PRETRAINING_REPS = 0; // NOTE(review): not referenced in the visible part of this file
+const int MAX_PRETRAINING_REPS = 20; // NOTE(review): not referenced in the visible part of this file
+const int MIN_CONSISTENCY_REPS = 0; // lower bound accepted for -c
+const int MAX_CONSISTENCY_REPS = 5; // upper bound accepted for -c
+const int MIN_ITERATIVE_REFINEMENT_REPS = 0; // lower bound accepted for -ir
+const int MAX_ITERATIVE_REFINEMENT_REPS = 1000; // upper bound accepted for -ir
+
+string posteriorProbsFilename = ""; // NOTE(review): not referenced in the visible part of this file
+bool allscores = true; // NOTE(review): not referenced in the visible part of this file
+string infilename; // NOTE(review): not referenced in the visible part of this file
+
+int flag_gui = 0; // 0: no GUI-related output is produced
+// 1: GUI-related output is generated
+int flag_ppscore = 0; // 0: no posterior-probability score sequence is added to the FASTA alignment output
+// 1: a posterior-probability score sequence is added to the FASTA alignment output
+
+///////////////////////////////
+// global scoring matrix variables (none of these, except numThreads, is read in the visible part of this file; presumably used by the partition-function code behind init_arguments()/ComputePostProbs() -- TODO confirm)
+//////////////////////////////
+float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2;
+char *aminos, *bases, matrixtype[20] = "gonnet_160";
+int subst_index[26];
+
+double sub_matrix[26][26];
+int firstread = 0; // guard intended to make sure the matrices are read only once
+
+float TEMPERATURE = 5;
+int MATRIXTYPE = 160;
+int prot_nuc = 0; // 0 = protein, 1 = nucleotide
+
+float GAPOPEN = 0;
+float GAPEXT = 0;
+int numThreads = 0; // -p / -num_threads; a value <= 0 makes the OpenMP build use omp_get_num_procs()
+
+// Argument record plus its single global instance.
+// NOTE(review): presumably populated by init_arguments(), which is not
+// defined in the visible part of this file; field meanings beyond the
+// note on `opt` cannot be confirmed from here.
+struct argument_decl {
+    char input[30];
+    int matrix;
+    int N;
+    float T;
+    float beta;
+    char opt;       // either 'P' or 'M'
+    float gapopen;
+    float gapext;
+};
+
+argument_decl argument;
+
+extern inline void read_sustitution_matrix(char *fileName); // declaration only; no definition in the visible part of this file
+extern void setmatrixtype(int le); // declaration only; no definition in the visible part of this file
+extern inline int matrixtype_to_int(); // declaration only; no definition in the visible part of this file
+extern inline void read_dna_matrix(); // declaration only; no definition in the visible part of this file
+extern inline void read_vtml_la_matrix(); // declaration only; no definition in the visible part of this file
+extern void init_arguments(); // called once by the MSA constructor before ReadParameters()
+
+// Runs the complete program: parses the command line, loads parameters
+// and input sequences, computes the multiple alignment and writes it to
+// the selected output stream.
+MSA::MSA(int argc, char* argv[]) {
+    // command-line handling; yields the names of the input files
+    SafeVector<string> inputFiles = ParseParams(argc, argv);
+
+    // set up the arguments for the partition function
+    init_arguments();
+
+    // initial distribution, transition and emission parameters
+    ReadParameters();
+
+    // collect the sequences of every input file in one container
+    MultiSequence *sequences = new MultiSequence();
+    assert(sequences);
+    const int numFiles = (int) inputFiles.size();
+    for (int f = 0; f < numFiles; f++) {
+        cerr << "Loading sequence file: " << inputFiles[f] << endl;
+        sequences->LoadMFA(inputFiles[f], true);
+    }
+
+    // one weight slot per loaded sequence
+    this->seqsWeights = new int[sequences->GetNumSequences()];
+
+#ifdef _OPENMP
+    // no usable thread count requested: use the number of processors
+    if (numThreads <= 0) {
+        numThreads = omp_get_num_procs();
+        cerr << "Automatically detected " << numThreads << " CPU cores" << endl;
+    }
+    cerr << "Enabling OpenMP (with " << numThreads << " threads)" << endl;
+    omp_set_num_threads(numThreads);
+#endif
+
+    // compute the alignment
+    MultiSequence *alignment = doAlign(sequences,
+            ProbabilisticModel(initDistrib, gapOpen, gapExtend, emitPairs,
+                    emitSingle), initDistrib, gapOpen, gapExtend, emitPairs,
+            emitSingle);
+
+    // emit it in the requested format
+    if (!enableClustalWOutput) {
+        alignment->WriteMFA(*alignOutFile);
+    } else {
+        alignment->WriteALN(*alignOutFile);
+    }
+
+    // release everything allocated above
+    delete[] this->seqsWeights;
+    delete alignment;
+    delete sequences;
+}
+// Releases the output stream. When an output file name was given,
+// ParseParams() allocated the stream with new; destroying it flushes and
+// closes the file. With no file name the stream is std::cout, which is
+// not owned and is left untouched.
+MSA::~MSA() {
+    if (alignOutFileName.length() > 0) {
+        delete alignOutFile;
+        alignOutFile = NULL;
+    }
+}
+/////////////////////////////////////////////////////////////////
+// PrintParameters()
+//
+// Writes the initial distribution and gap parameters to STDERR,
+// preceded by message. When filename is non-NULL, all
+// parameters (including the alphabet and the emission tables)
+// are additionally written to that file; the program exits if
+// the file cannot be opened for writing.
+/////////////////////////////////////////////////////////////////
+
+// Prints the first count entries of values to STDERR, each followed by a blank.
+static void PrintValuesToStderr(const VF &values, int count) {
+    for (int k = 0; k < count; k++)
+        cerr << setprecision(10) << values[k] << " ";
+}
+
+// Writes the first count entries of values to out as one line of "%.10f " fields.
+static void WriteValuesLine(FILE *out, const VF &values, int count) {
+    for (int k = 0; k < count; k++)
+        fprintf(out, "%.10f ", values[k]);
+    fprintf(out, "\n");
+}
+
+void MSA::PrintParameters(const char *message, const VF &initDistrib,
+        const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs,
+        const VF &emitSingle, const char *filename) {
+
+    const int numGapParams = 2 * NumInsertStates;
+
+    // screen output
+    cerr << message << endl << " initDistrib[] = { ";
+    PrintValuesToStderr(initDistrib, NumMatrixTypes);
+    cerr << "}" << endl << " gapOpen[] = { ";
+    PrintValuesToStderr(gapOpen, numGapParams);
+    cerr << "}" << endl << " gapExtend[] = { ";
+    PrintValuesToStderr(gapExtend, numGapParams);
+    cerr << "}" << endl << endl;
+
+    // without a file name there is nothing more to do
+    if (!filename)
+        return;
+
+    FILE *out = fopen(filename, "w");
+    if (!out) {
+        cerr << "ERROR: Unable to write parameter file: " << filename
+                << endl;
+        exit(1);
+    }
+
+    // transition parameters, one vector per line
+    WriteValuesLine(out, initDistrib, NumMatrixTypes);
+    WriteValuesLine(out, gapOpen, numGapParams);
+    WriteValuesLine(out, gapExtend, numGapParams);
+
+    // alphabet, then the lower triangle of the pair emission table
+    fprintf(out, "%s\n", alphabet.c_str());
+    const int numSymbols = (int) alphabet.size();
+    for (int row = 0; row < numSymbols; row++) {
+        const unsigned char rowSymbol = (unsigned char) alphabet[row];
+        for (int col = 0; col <= row; col++) {
+            const unsigned char colSymbol = (unsigned char) alphabet[col];
+            fprintf(out, "%.10f ", emitPairs[rowSymbol][colSymbol]);
+        }
+        fprintf(out, "\n");
+    }
+
+    // single-residue emissions in alphabet order
+    for (int k = 0; k < numSymbols; k++)
+        fprintf(out, "%.10f ", emitSingle[(unsigned char) alphabet[k]]);
+    fprintf(out, "\n");
+
+    fclose(out);
+}
+
+/////////////////////////////////////////////////////////////////
+// doAlign()
+//
+// Computes a posterior probability matrix for every pair of sequences (pair-HMM and partition-function posteriors merged cell by cell),
+// derives pairwise distances and a guide tree from them, applies numConsistencyReps rounds of DoRelaxation(), and returns the
+// alignment built by ComputeFinalAlignment(). The returned object is heap-allocated; the MSA constructor deletes it.
+/////////////////////////////////////////////////////////////////
+extern VF *ComputePostProbs(int a, int b, string seq1, string seq2); // partition-function posteriors; no definition in the visible part of this file
+MultiSequence* MSA::doAlign(MultiSequence *sequences,
+ const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen,
+ VF &gapExtend, VVF &emitPairs, VF &emitSingle) {
+ assert(sequences);
+
+ //get the number of sequences
+ const int numSeqs = sequences->GetNumSequences();
+
+ //create the distance matrix (all zero) and the table of sparse posterior matrices (all NULL)
+ VVF distances(numSeqs, VF(numSeqs, 0));
+ SafeVector<SafeVector<SparseMatrix *> > sparseMatrices(numSeqs,
+ SafeVector<SparseMatrix *>(numSeqs, NULL));
+
+#ifdef _OPENMP
+ //enumerate all pairs (a, b) with a < b so that a single flat loop can be distributed over the threads
+ int pairIdx = 0;
+ numPairs = (numSeqs - 1) * numSeqs / 2;
+ seqsPairs = new SeqsPair[numPairs]; // released near the end of this function
+ for(int a = 0; a < numSeqs; a++) {
+ for(int b = a + 1; b < numSeqs; b++) {
+ seqsPairs[pairIdx].seq1 = a;
+ seqsPairs[pairIdx].seq2 = b;
+ pairIdx++;
+ }
+ }
+#endif
+ // do all pairwise alignments for posterior probability matrices (one flat loop over seqsPairs with OpenMP, two nested loops otherwise)
+#ifdef _OPENMP
+#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
+ for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
+ int a= seqsPairs[pairIdx].seq1;
+ int b = seqsPairs[pairIdx].seq2;
+ if(enableVerbose) {
+#pragma omp critical
+ cerr <<"tid "<<omp_get_thread_num()<<" a "<<a<<" b "<<b<<endl;
+ }
+#else
+ for (int a = 0; a < numSeqs - 1; a++) {
+ for (int b = a + 1; b < numSeqs; b++) {
+#endif
+ Sequence *seq1 = sequences->GetSequence(a);
+ Sequence *seq2 = sequences->GetSequence(b);
+
+ // verbose output (the accuracy is appended to this line further down)
+ if (enableVerbose) {
+ cerr << "Computing posterior matrix: (" << a + 1 << ") "
+ << seq1->GetHeader() << " vs. " << "(" << b + 1 << ") "
+ << seq2->GetHeader() << " -- ";
+ }
+
+ // compute forward and backward probabilities
+ VF *forward = model.ComputeForwardMatrix(seq1, seq2);
+ assert(forward);
+ VF *backward = model.ComputeBackwardMatrix(seq1, seq2);
+ assert(backward);
+
+ // compute posterior probability matrix from HMM
+ VF *posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,
+ *backward);
+ assert(posterior);
+ delete forward; // only needed to derive the posterior
+ delete backward;
+
+ //compute posterior probability matrix from partition function
+ VF* part_posterior = ::ComputePostProbs(a, b, seq1->GetString(),
+ seq2->GetString());
+ assert(part_posterior);
+
+ //merge the two posterior matrices in place: each of the (len1 + 1) * (len2 + 1) cells becomes the root mean square of both values
+ VF::iterator ptr1 = posterior->begin();
+ VF::iterator ptr2 = part_posterior->begin();
+ for (int i = 0; i <= seq1->GetLength(); i++) {
+ for (int j = 0; j <= seq2->GetLength(); j++) {
+ float v1 = *ptr1;
+ float v2 = *ptr2;
+
+ *ptr1 = sqrt((v1 * v1 + v2 * v2) * 0.5f);
+ ptr1++;
+ ptr2++;
+ }
+ }
+ delete part_posterior;
+
+ // compute sparse representations; only the [a][b] entry (a < b) is populated
+ sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),
+ seq2->GetLength(), *posterior);
+ sparseMatrices[b][a] = NULL;
+
+ // perform the pairwise sequence alignment
+ pair<SafeVector<char> *, float> alignment = model.ComputeAlignment(
+ seq1->GetLength(), seq2->GetLength(), *posterior);
+
+ //compute the pairwise distance: 1 - (alignment score / length of the shorter sequence)
+ float accuracy = alignment.second
+ / min(seq1->GetLength(), seq2->GetLength());
+ distances[a][b] = distances[b][a] = 1.0f - accuracy;
+
+ if (enableVerbose) {
+ cerr << setprecision(10) << accuracy << endl;
+ }
+ delete alignment.first; // only the score was needed here
+ delete posterior;
+#ifndef _OPENMP
+ } // closes the inner loop over b, which exists only in the non-OpenMP build
+#endif
+ }
+ //create the guide tree from the pairwise distances
+ this->tree = new MSAClusterTree(this, distances, numSeqs);
+ this->tree->create();
+
+ // convert the integer sequence weights to floats (divided by INT_MULTIPLY, then scaled by 10) for the consistency transformation
+ float* fweights = new float[numSeqs];
+ for (int r = 0; r < numSeqs; r++) {
+ fweights[r] = ((float) seqsWeights[r]) / INT_MULTIPLY; // NOTE(review): seqsWeights is not written in this function; presumably filled during tree creation -- confirm
+ fweights[r] *= 10;
+ }
+ for (int r = 0; r < numConsistencyReps; r++) {
+ SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices =
+ DoRelaxation(fweights, sequences, sparseMatrices);
+
+ // now replace the old posterior matrices
+ for (int i = 0; i < numSeqs; i++) {
+ for (int j = 0; j < numSeqs; j++) {
+ delete sparseMatrices[i][j];
+ sparseMatrices[i][j] = newSparseMatrices[i][j];
+ }
+ }
+ }
+ delete[] fweights;
+#ifdef _OPENMP
+ delete [] seqsPairs;
+#endif
+
+ //compute the final multiple sequence alignment
+ MultiSequence *finalAlignment = ComputeFinalAlignment(this->tree, sequences,
+ sparseMatrices, model);
+
+ // build annotation
+ if (enableAnnotation) {
+ WriteAnnotation(finalAlignment, sparseMatrices);
+ }
+ //destroy the guide tree
+ delete this->tree;
+ this->tree = 0;
+
+ // delete sparse matrices (both orientations of every pair)
+ for (int a = 0; a < numSeqs - 1; a++) {
+ for (int b = a + 1; b < numSeqs; b++) {
+ delete sparseMatrices[a][b];
+ delete sparseMatrices[b][a];
+ }
+ }
+
+ return finalAlignment;
+}
+
+/////////////////////////////////////////////////////////////////
+// GetInteger()
+//
+// Parses data as an integer (decimal, or octal/hexadecimal when
+// prefixed with 0 / 0x, as accepted by strtol with base 0) and
+// stores it in *val. Returns true only if the whole string is a
+// number that fits into an int; trailing blanks are tolerated,
+// any other trailing character is a parsing error. *val is left
+// untouched on failure.
+/////////////////////////////////////////////////////////////////
+
+bool GetInteger(char *data, int *val) {
+    assert(val);
+
+    char *endPtr = data;
+    errno = 0;
+    const long int parsed = strtol(data, &endPtr, 0);
+
+    // no digits were consumed at all
+    if (endPtr == data)
+        return false;
+
+    // strtol reported a range error (result clamped to LONG_MIN/LONG_MAX)
+    if (errno != 0)
+        return false;
+
+    // representable as long but not as int
+    if (parsed < (long) INT_MIN || parsed > (long) INT_MAX)
+        return false;
+
+    // reject input such as "12abc": only white space may follow the number
+    while (*endPtr == ' ' || *endPtr == '\t' || *endPtr == '\n'
+            || *endPtr == '\r')
+        ++endPtr;
+    if (*endPtr != '\0')
+        return false;
+
+    *val = (int) parsed;
+    return true;
+}
+
+/////////////////////////////////////////////////////////////////
+// GetFloat()
+//
+// Parses data as a floating-point number and stores it in *val.
+// Returns true only if the whole string is a number; trailing
+// blanks are tolerated, any other trailing character is a
+// parsing error. *val is left untouched on failure.
+/////////////////////////////////////////////////////////////////
+
+bool GetFloat(char *data, float *val) {
+    assert(val);
+
+    char *endPtr = data;
+    errno = 0;
+    const double parsed = strtod(data, &endPtr);
+
+    // nothing that looks like a number was consumed
+    if (endPtr == data)
+        return false;
+
+    // range error that collapsed the result to zero
+    if (parsed == 0 && errno != 0)
+        return false;
+
+    // range error with a huge magnitude (overflow)
+    if (errno != 0 && (parsed >= 1000000.0 || parsed <= -1000000.0))
+        return false;
+
+    // reject input such as "0.5x": only white space may follow the number
+    while (*endPtr == ' ' || *endPtr == '\t' || *endPtr == '\n'
+            || *endPtr == '\r')
+        ++endPtr;
+    if (*endPtr != '\0')
+        return false;
+
+    *val = (float) parsed;
+    return true;
+}
+
+/////////////////////////////////////////////////////////////////
+// ReadParameters()
+//
+// Fills the global initial distribution, gap and emission
+// parameters. With an empty parametersInputFilename the built-in
+// defaults from Defaults.h are used; otherwise the values are
+// read from that file (three lines of transition parameters,
+// one alphabet line, the lower triangle of the pair emission
+// table, then the single emissions). Emission entries are stored
+// for the upper- and lower-case form of every residue. Any
+// unreadable or incomplete parameter file terminates the
+// program with an error message.
+/////////////////////////////////////////////////////////////////
+
+// Stores val as single emission of residue in both letter cases.
+static void StoreEmitSingle(VF &singles, char residue, float val) {
+    // tolower/toupper require a value representable as unsigned char
+    singles[(unsigned char) tolower((unsigned char) residue)] = val;
+    singles[(unsigned char) toupper((unsigned char) residue)] = val;
+}
+
+// Stores val as pair emission of (first, second) in both orders and for
+// every combination of letter cases.
+static void StoreEmitPair(VVF &pairs, char first, char second, float val) {
+    const unsigned char a[2] = {
+            (unsigned char) tolower((unsigned char) first),
+            (unsigned char) toupper((unsigned char) first) };
+    const unsigned char b[2] = {
+            (unsigned char) tolower((unsigned char) second),
+            (unsigned char) toupper((unsigned char) second) };
+    for (int x = 0; x < 2; x++) {
+        for (int y = 0; y < 2; y++) {
+            pairs[a[x]][b[y]] = val;
+            pairs[b[y]][a[x]] = val;
+        }
+    }
+}
+
+// Reads exactly count numbers from text into dest[0..count-1].
+// Returns false if text holds fewer parsable numbers.
+static bool ParseParameterLine(const string &text, VF &dest, int count) {
+    istringstream in(text);
+    for (int i = 0; i < count; i++) {
+        if (!(in >> dest[i]))
+            return false;
+    }
+    return true;
+}
+
+void MSA::ReadParameters() {
+
+    // background values for every symbol that gets no explicit entry below
+    emitPairs = VVF(256, VF(256, 1e-10));
+    emitSingle = VF(256, 1e-5);
+
+    const int numGapParams = 2 * NumInsertStates;
+
+    // no parameter file: use the compiled-in defaults
+    if (parametersInputFilename.empty()) {
+        const float *initDefault = NULL;
+        const float *openDefault = NULL;
+        const float *extendDefault = NULL;
+        if (NumInsertStates == 1) {
+            initDefault = initDistrib1Default;
+            openDefault = gapOpen1Default;
+            extendDefault = gapExtend1Default;
+        } else if (NumInsertStates == 2) {
+            initDefault = initDistrib2Default;
+            openDefault = gapOpen2Default;
+            extendDefault = gapExtend2Default;
+        } else {
+            cerr
+                    << "ERROR: No default initial distribution/parameter settings exist"
+                    << endl << " for " << NumInsertStates
+                    << " pairs of insert states. Use --paramfile." << endl;
+            exit(1);
+        }
+
+        for (int i = 0; i < NumMatrixTypes; i++)
+            initDistrib[i] = initDefault[i];
+        for (int i = 0; i < numGapParams; i++) {
+            gapOpen[i] = openDefault[i];
+            gapExtend[i] = extendDefault[i];
+        }
+
+        alphabet = alphabetDefault;
+
+        for (int i = 0; i < (int) alphabet.length(); i++) {
+            StoreEmitSingle(emitSingle, alphabet[i], emitSingleDefault[i]);
+            for (int j = 0; j <= i; j++)
+                StoreEmitPair(emitPairs, alphabet[i], alphabet[j],
+                        emitPairsDefault[i][j]);
+        }
+        return;
+    }
+
+    // otherwise everything comes from the parameter file
+    ifstream data;
+    data.open(parametersInputFilename.c_str());
+    if (data.fail()) {
+        cerr << "ERROR: Unable to read parameter file: "
+                << parametersInputFilename << endl;
+        exit(1);
+    }
+
+    // three lines: initial distribution, gap open, gap extend
+    string line[3];
+    bool transitionsOk = true;
+    for (int i = 0; i < 3 && transitionsOk; i++) {
+        if (!getline(data, line[i]))
+            transitionsOk = false;
+    }
+    // a line with too few numbers is an error too, not silently ignored
+    transitionsOk = transitionsOk
+            && ParseParameterLine(line[0], initDistrib, NumMatrixTypes)
+            && ParseParameterLine(line[1], gapOpen, numGapParams)
+            && ParseParameterLine(line[2], gapExtend, numGapParams);
+    if (!transitionsOk) {
+        cerr
+                << "ERROR: Unable to read transition parameters from parameter file: "
+                << parametersInputFilename << endl;
+        exit(1);
+    }
+
+    // alphabet line: all white-space separated tokens concatenated
+    string alphabetLine;
+    if (!getline(data, alphabetLine)) {
+        cerr << "ERROR: Unable to read alphabet from scoring matrix file: "
+                << parametersInputFilename << endl;
+        exit(1);
+    }
+    alphabet = "";
+    istringstream alphabetTokens(alphabetLine);
+    string token;
+    while (alphabetTokens >> token)
+        alphabet += token;
+
+    // lower triangle of the pair emission table, then the single emissions
+    bool emissionsOk = true;
+    const int numSymbols = (int) alphabet.size();
+    for (int i = 0; i < numSymbols && emissionsOk; i++) {
+        for (int j = 0; j <= i && emissionsOk; j++) {
+            float val;
+            if (data >> val)
+                StoreEmitPair(emitPairs, alphabet[i], alphabet[j], val);
+            else
+                emissionsOk = false;
+        }
+    }
+    for (int i = 0; i < numSymbols && emissionsOk; i++) {
+        float val;
+        if (data >> val)
+            StoreEmitSingle(emitSingle, alphabet[i], val);
+        else
+            emissionsOk = false;
+    }
+    if (!emissionsOk) {
+        cerr
+                << "ERROR: Unable to read emission parameters from parameter file: "
+                << parametersInputFilename << endl;
+        exit(1);
+    }
+    data.close();
+}
+
+/////////////////////////////////////////////////////////////////
+// printUsage()
+//
+// Prints the program description and the list of command-line
+// options, with their current default values, to STDERR.
+/////////////////////////////////////////////////////////////////
+void MSA::printUsage() {
+    // program banner
+    cerr
+            << "************************************************************************"
+            << endl;
+    cerr
+            << "\tMSAPROBS is a open-source protein multiple sequence alignment algorithm"
+            << endl;
+    cerr
+            << "\tbased on pair hidden markov model and partition function postirior"
+            << endl;
+    cerr << "\tprobabilities. If any comments or problems, please contact"
+            << endl;
+    cerr << "\tLiu Yongchao(liuy0039@ntu.edu.sg or nkcslyc@hotmail.com)"
+            << endl;
+    cerr
+            << "*************************************************************************"
+            << endl;
+
+    // synopsis
+    cerr << "Usage:" << endl;
+    cerr << " msaprobs [OPTION]... [infile]..." << endl << endl;
+    cerr << "Description:" << endl;
+    cerr << " Align sequences in multi-FASTA format" << endl << endl;
+
+    // output and threading options
+    cerr << " -o, --outfile <string>" << endl;
+    cerr << " specify the output file name (STDOUT by default)" << endl;
+    cerr << " -num_threads <integer>" << endl;
+    cerr
+            << " specify the number of threads used, and otherwise detect automatically"
+            << endl;
+    cerr << " -clustalw" << endl;
+    cerr << " use CLUSTALW output format instead of FASTA format" << endl
+            << endl;
+
+    // algorithm options, shown with their bounds and current defaults
+    cerr << " -c, --consistency REPS" << endl;
+    cerr << " use " << MIN_CONSISTENCY_REPS << " <= REPS <= "
+            << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps
+            << ") passes of consistency transformation" << endl << endl;
+    cerr << " -ir, --iterative-refinement REPS" << endl;
+    cerr << " use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= "
+            << MAX_ITERATIVE_REFINEMENT_REPS << " (default: "
+            << numIterativeRefinementReps << ") passes of iterative-refinement"
+            << endl << endl;
+
+    // reporting options
+    const char *verboseState = enableVerbose ? "on" : "off";
+    cerr << " -v, --verbose" << endl;
+    cerr << " report progress while aligning (default: " << verboseState
+            << ")" << endl << endl;
+    cerr << " -annot FILENAME" << endl;
+    cerr << " write annotation for multiple alignment to FILENAME" << endl
+            << endl;
+
+    const char *alignOrderState = enableAlignOrder ? "on" : "off";
+    cerr << " -a, --alignment-order" << endl;
+    cerr
+            << " print sequences in alignment order rather than input order (default: "
+            << alignOrderState << ")" << endl;
+    cerr << " -version " << endl;
+    cerr << " print out version of MSAPROBS " << endl << endl;
+}
+/////////////////////////////////////////////////////////////////
+// ParseParams()
+//
+// Parses all command-line options into the global settings and
+// opens the output stream. Arguments that do not start with '-'
+// are returned as input file names. Invalid options, missing or
+// malformed option values, and an output file that cannot be
+// opened terminate the program with an error message.
+/////////////////////////////////////////////////////////////////
+
+// True when arg equals either of the two spellings of an option.
+static bool IsOption(const char *arg, const char *first, const char *second) {
+    return !strcmp(arg, first) || !strcmp(arg, second);
+}
+
+// Returns the argument following option argv[i] and advances i onto it.
+// If the option is the last argument, reports "<kind> expected" and exits.
+static char *RequireOptionValue(int argc, char **argv, int &i,
+        const char *kind) {
+    if (i >= argc - 1) {
+        cerr << "ERROR: " << kind << " expected for option " << argv[i]
+                << endl;
+        exit(1);
+    }
+    return argv[++i];
+}
+
+// Like RequireOptionValue(), but additionally parses the value as an
+// integer; prints invalidMessage followed by option and value, and exits,
+// when it is not one.
+static int RequireIntegerOption(int argc, char **argv, int &i,
+        const char *invalidMessage) {
+    char *text = RequireOptionValue(argc, argv, i, "Integer");
+    int value = 0;
+    if (!GetInteger(text, &value)) {
+        cerr << invalidMessage << argv[i - 1] << ": " << argv[i] << endl;
+        exit(1);
+    }
+    return value;
+}
+
+// Like RequireIntegerOption(), but exits unless lowest <= value <= highest.
+static int RequireBoundedIntegerOption(int argc, char **argv, int &i,
+        int lowest, int highest) {
+    const int value = RequireIntegerOption(argc, argv, i,
+            "ERROR: Invalid integer following option ");
+    if (value < lowest || value > highest) {
+        cerr << "ERROR: For option " << argv[i - 1]
+                << ", integer must be between " << lowest << " and "
+                << highest << "." << endl;
+        exit(1);
+    }
+    return value;
+}
+
+SafeVector<string> MSA::ParseParams(int argc, char **argv) {
+    if (argc < 2) {
+        printUsage();
+        exit(1);
+    }
+
+    SafeVector<string> sequenceNames;
+
+    for (int i = 1; i < argc; i++) {
+        const char *arg = argv[i];
+
+        // anything that is not an option is an input file name
+        if (arg[0] != '-') {
+            sequenceNames.push_back(string(arg));
+            continue;
+        }
+
+        if (IsOption(arg, "-help", "-?")) {
+            // help
+            printUsage();
+            exit(1);
+        } else if (IsOption(arg, "-o", "--outfile")) {
+            // output file name
+            alignOutFileName = RequireOptionValue(argc, argv, i, "String");
+        } else if (IsOption(arg, "-p", "-num_threads")) {
+            // number of threads; negative requests mean "detect automatically"
+            const int requested = RequireIntegerOption(argc, argv, i,
+                    " ERROR: invalid integer following option ");
+            numThreads = (requested < 0) ? 0 : requested;
+        } else if (IsOption(arg, "-c", "--consistency")) {
+            // number of consistency transformations
+            numConsistencyReps = RequireBoundedIntegerOption(argc, argv, i,
+                    MIN_CONSISTENCY_REPS, MAX_CONSISTENCY_REPS);
+        } else if (IsOption(arg, "-ir", "--iterative-refinement")) {
+            // number of iterative refinement passes
+            numIterativeRefinementReps = RequireBoundedIntegerOption(argc,
+                    argv, i, MIN_ITERATIVE_REFINEMENT_REPS,
+                    MAX_ITERATIVE_REFINEMENT_REPS);
+        } else if (!strcmp(arg, "-annot")) {
+            // annotation file
+            enableAnnotation = true;
+            annotationFilename = RequireOptionValue(argc, argv, i,
+                    "FILENAME");
+        } else if (!strcmp(arg, "-clustalw")) {
+            // clustalw output format
+            enableClustalWOutput = true;
+        } else if (IsOption(arg, "-co", "--cutoff")) {
+            // cutoff, restricted to [0, 1]
+            char *text = RequireOptionValue(argc, argv, i,
+                    "Floating-point value");
+            float value = 0;
+            if (!GetFloat(text, &value)) {
+                cerr
+                        << "ERROR: Invalid floating-point value following option "
+                        << argv[i - 1] << ": " << argv[i] << endl;
+                exit(1);
+            }
+            if (value < 0 || value > 1) {
+                cerr << "ERROR: For option " << argv[i - 1]
+                        << ", floating-point value must be between 0 and 1."
+                        << endl;
+                exit(1);
+            }
+            cutoff = value;
+        } else if (IsOption(arg, "-v", "--verbose")) {
+            // verbose reporting
+            enableVerbose = true;
+        } else if (IsOption(arg, "-a", "--alignment-order")) {
+            // alignment order
+            enableAlignOrder = true;
+        } else if (!strcmp(arg, "-version")) {
+            // print out version
+            cerr << "MSAPROBS version " << VERSION << endl;
+            exit(1);
+        } else {
+            // bad arguments
+            cerr << "ERROR: Unrecognized option: " << arg << endl;
+            exit(1);
+        }
+    }
+
+    // select the output stream
+    cerr << "-------------------------------------" << endl;
+    if (alignOutFileName.length() == 0) {
+        cerr << "The final alignments will be printed out to STDOUT" << endl;
+        alignOutFile = &std::cout;
+    } else {
+        cerr << "Open the output file " << alignOutFileName << endl;
+        ofstream *outFile = new ofstream(alignOutFileName.c_str(),
+                ios::binary | ios::out | ios::trunc);
+        // fail now rather than silently discarding the alignment later
+        if (outFile->fail()) {
+            cerr << "ERROR: Unable to open output file: "
+                    << alignOutFileName << endl;
+            delete outFile;
+            exit(1);
+        }
+        alignOutFile = outFile;
+    }
+    cerr << "-------------------------------------" << endl;
+    return sequenceNames;
+}
+
+/////////////////////////////////////////////////////////////////
+// ProcessTree()
+//
+// Recursively turns a (sub)tree of the guide tree into an
+// alignment: a leaf yields a copy of its sequence, an inner
+// node the alignment of the results of its two children. The
+// returned object is newly allocated.
+/////////////////////////////////////////////////////////////////
+MultiSequence* MSA::ProcessTree(TreeNode *tree, MultiSequence *sequences,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model) {
+
+    // leaf: wrap a clone of the single sequence it refers to
+    if (tree->leaf != NODE) {
+        MultiSequence *single = new MultiSequence();
+        assert(single);
+        single->AddSequence(sequences->GetSequence(tree->idx)->Clone());
+        return single;
+    }
+
+    // inner node: solve both subtrees first ...
+    MultiSequence *leftPart = ProcessTree(tree->left, sequences,
+            sparseMatrices, model);
+    MultiSequence *rightPart = ProcessTree(tree->right, sequences,
+            sparseMatrices, model);
+    assert(leftPart);
+    assert(rightPart);
+
+    // ... then align the two partial alignments with each other
+    MultiSequence *merged = AlignAlignments(leftPart, rightPart,
+            sparseMatrices, model);
+    assert(merged);
+
+    // the partial results are no longer needed
+    delete leftPart;
+    delete rightPart;
+
+    return merged;
+}
+
+/////////////////////////////////////////////////////////////////
+// ComputeFinalAlignment()
+//
+// Builds the progressive alignment with ProcessTree() and then
+// applies numIterativeRefinementReps rounds of
+// DoIterativeRefinement() to it. When alignment order output is
+// requested, the sort labels present before refinement are put
+// back afterwards (and the global enableAlignOrder flag is
+// cleared in the process).
+/////////////////////////////////////////////////////////////////
+
+MultiSequence* MSA::ComputeFinalAlignment(MSAGuideTree*tree,
+        MultiSequence *sequences,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model) {
+
+    // progressive alignment along the guide tree
+    MultiSequence *alignment = ProcessTree(tree->getRoot(), sequences,
+            sparseMatrices, model);
+
+    // remember the current sort labels if the alignment order is wanted
+    SafeVector<int> savedLabels;
+    if (enableAlignOrder) {
+        const int numAligned = alignment->GetNumSequences();
+        for (int k = 0; k < numAligned; k++)
+            savedLabels.push_back(alignment->GetSequence(k)->GetSortLabel());
+        alignment->SaveOrdering();
+        enableAlignOrder = false;
+    }
+
+    // iterative refinement
+    int pass = 0;
+    while (pass < numIterativeRefinementReps) {
+        DoIterativeRefinement(sparseMatrices, model, alignment, pass);
+        ++pass;
+    }
+    cerr << endl;
+
+    // put the remembered sort labels back (no-op when none were saved)
+    const int numSaved = (int) savedLabels.size();
+    for (int k = 0; k < numSaved; k++)
+        alignment->GetSequence(k)->SetSortLabel(savedLabels[k]);
+
+    return alignment;
+}
+
+/////////////////////////////////////////////////////////////////
+// AlignAlignments()
+//
+// Aligns two alignments with each other and returns the
+// combined, newly allocated alignment. The inputs are not
+// modified or freed.
+/////////////////////////////////////////////////////////////////
+
+// Prints the labels of all sequences of group to STDERR as "[l1,l2,..."
+// (without the closing bracket).
+static void PrintGroupLabels(MultiSequence *group) {
+    const int count = group->GetNumSequences();
+    for (int k = 0; k < count; k++) {
+        const char *separator = (k == 0) ? "[" : ",";
+        cerr << separator << group->GetSequence(k)->GetLabel();
+    }
+}
+
+MultiSequence* MSA::AlignAlignments(MultiSequence *align1,
+        MultiSequence *align2,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model) {
+
+    // report which two groups are being aligned
+    if (enableVerbose) {
+        PrintGroupLabels(align1);
+        cerr << "] vs. ";
+        PrintGroupLabels(align2);
+        cerr << "]: ";
+    }
+
+    // posterior matrix for the two groups, using the sequence weights
+    VF *posterior = model.BuildPosterior(getSeqsWeights(), align1, align2,
+            sparseMatrices, cutoff);
+
+    // align the two groups column-wise
+    const int length1 = align1->GetSequence(0)->GetLength();
+    const int length2 = align2->GetSequence(0)->GetLength();
+    pair<SafeVector<char> *, float> alignment = model.ComputeAlignment(
+            length1, length2, *posterior);
+
+    delete posterior;
+
+    if (enableVerbose) {
+        // sum, over all cross pairs, of the shorter sequence's length
+        int totLength = 0;
+        for (int i = 0; i < align1->GetNumSequences(); i++) {
+            for (int j = 0; j < align2->GetNumSequences(); j++) {
+                totLength += min(align1->GetSequence(i)->GetLength(),
+                        align2->GetSequence(j)->GetLength());
+            }
+        }
+
+        // alignment score relative to that total, as an "accuracy" figure
+        cerr << alignment.second / totLength << endl;
+    }
+
+    // insert gaps into every sequence of both groups according to the
+    // computed alignment ('X' marks the first group, 'Y' the second)
+    MultiSequence *combined = new MultiSequence();
+    for (int i = 0; i < align1->GetNumSequences(); i++) {
+        Sequence *gapped = align1->GetSequence(i)->AddGaps(alignment.first,
+                'X');
+        combined->AddSequence(gapped);
+    }
+    for (int i = 0; i < align2->GetNumSequences(); i++) {
+        Sequence *gapped = align2->GetSequence(i)->AddGaps(alignment.first,
+                'Y');
+        combined->AddSequence(gapped);
+    }
+    if (!enableAlignOrder)
+        combined->SortByLabel();
+
+    // the alignment path itself is no longer needed
+    delete alignment.first;
+
+    return combined;
+}
+
+/////////////////////////////////////////////////////////////////
+// DoRelaxation()
+//
+// Performs one round of the weighted probabilistic consistency transformation.
+// 1
+/////////////////////////////////////////////////////////////////
+
+// Re-estimates every pairwise posterior matrix using all other sequences as
+// intermediates, and returns the results in a new numSeqs x numSeqs table.
+//
+// seqsWeights    - per-sequence weights, indexed by sequence number
+// sequences      - the input sequences
+// sparseMatrices - current pairwise posteriors; entry [a][b] is dereferenced
+//                  for a < b
+//
+// For each processed pair (i, j) the new matrix is stored at [i][j] and
+// [j][i] is set to NULL; all other entries keep their initial NULL value.
+// With OpenMP the pairs come from the seqsPairs/numPairs members and are
+// processed in parallel; otherwise a nested i < j loop is used.
+SafeVector<SafeVector<SparseMatrix *> > MSA::DoRelaxation(float* seqsWeights,
+ MultiSequence *sequences,
+ SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
+ const int numSeqs = sequences->GetNumSequences();
+
+ SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices(numSeqs,
+ SafeVector<SparseMatrix *>(numSeqs, NULL));
+
+ // for every pair of sequences
+#ifdef _OPENMP
+ int pairIdx;
+#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
+ for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
+ int i = seqsPairs[pairIdx].seq1;
+ int j = seqsPairs[pairIdx].seq2;
+ float wi = seqsWeights[i];
+ float wj = seqsWeights[j];
+#else
+ for (int i = 0; i < numSeqs; i++) {
+ float wi = seqsWeights[i];
+ for (int j = i + 1; j < numSeqs; j++) {
+ float wj = seqsWeights[j];
+#endif
+ Sequence *seq1 = sequences->GetSequence(i);
+ Sequence *seq2 = sequences->GetSequence(j);
+
+ // NOTE(review): only this first message is inside the critical section;
+ // the later verbose prints in this loop body are not, so their output may
+ // interleave between threads when OpenMP is enabled.
+ if (enableVerbose) {
+#ifdef _OPENMP
+#pragma omp critical
+#endif
+ cerr << "Relaxing (" << i + 1 << ") " << seq1->GetHeader()
+ << " vs. " << "(" << j + 1 << ") " << seq2->GetHeader()
+ << ": ";
+ }
+ // get the original posterior matrix
+ // (a dense copy owned by this function; it is deleted further down)
+ VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior();
+ assert(posteriorPtr);
+ VF &posterior = *posteriorPtr;
+
+ const int seq1Length = seq1->GetLength();
+ const int seq2Length = seq2->GetLength();
+
+ // contribution from the summation where z = x and z = y
+ // sumW accumulates every weight applied so the result can be normalized
+ float w = wi * wi * wj + wi * wj * wj;
+ float sumW = w;
+ for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
+ posterior[k] = w * posterior[k];
+ }
+
+ if (enableVerbose)
+ cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
+
+ // contribution from all other sequences
+ // Only matrices [a][b] with a < b are used, so the call depends on where
+ // k falls relative to i and j:
+ //   k < i     : both matrices have k as their first sequence -> Relax1
+ //   i < k < j : [i][k] and [k][j] chain directly -> Relax
+ //   otherwise : [j][k] is transposed into a temporary first -> Relax
+ for (int k = 0; k < numSeqs; k++) {
+ if (k != i && k != j) {
+ float wk = seqsWeights[k];
+ float w = wi * wj * wk;
+ sumW += w;
+ if (k < i)
+ Relax1(w, sparseMatrices[k][i], sparseMatrices[k][j],
+ posterior);
+ else if (k > i && k < j)
+ Relax(w, sparseMatrices[i][k], sparseMatrices[k][j],
+ posterior);
+ else {
+ SparseMatrix *temp =
+ sparseMatrices[j][k]->ComputeTranspose();
+ Relax(w, sparseMatrices[i][k], temp, posterior);
+ delete temp;
+ }
+ }
+ }
+ //cerr<<"sumW "<<sumW<<endl;
+ // normalize by the total weight applied
+ for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
+ posterior[k] /= sumW;
+ }
+ // mask out positions not originally in the posterior matrix
+ SparseMatrix *matXY = sparseMatrices[i][j];
+ // row 0 is cleared entirely
+ for (int y = 0; y <= seq2Length; y++)
+ posterior[y] = 0;
+ for (int x = 1; x <= seq1Length; x++) {
+ SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
+ SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
+ VF::iterator base = posterior.begin() + x * (seq2Length + 1);
+ int curr = 0;
+ // NOTE(review): this walk relies on the row entries being in increasing
+ // column order - confirm against SparseMatrix.
+ while (XYptr != XYend) {
+
+ // zero out all cells until the first filled column
+ while (curr < XYptr->first) {
+ base[curr] = 0;
+ curr++;
+ }
+
+ // now, skip over this column
+ curr++;
+ ++XYptr;
+ }
+
+ // zero out cells after last column
+ while (curr <= seq2Length) {
+ base[curr] = 0;
+ curr++;
+ }
+ }
+
+ // save the new posterior matrix
+ newSparseMatrices[i][j] = new SparseMatrix(seq1->GetLength(),
+ seq2->GetLength(), posterior);
+ newSparseMatrices[j][i] = NULL;
+
+ if (enableVerbose)
+ cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
+
+ // release the dense working copy obtained from GetPosterior()
+ delete posteriorPtr;
+
+ if (enableVerbose)
+ cerr << "done." << endl;
+#ifndef _OPENMP
+ }
+#endif
+ }
+
+ return newSparseMatrices;
+}
+
+/////////////////////////////////////////////////////////////////
+// Relax()
+//
+// Computes the consistency transformation for a single sequence
+// z, and adds the transformed matrix to "posterior".
+/////////////////////////////////////////////////////////////////
+
+// Adds the contribution of one intermediate sequence z to "posterior".
+//
+// For every stored cell (x, z) of matXZ and every stored cell (z, y) of
+// matZY, weight * P(x,z) * P(z,y) is added to the dense cell (x, y).
+// "posterior" is indexed row-major with rows of matZY->GetSeq2Length() + 1
+// entries.
+void MSA::Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY,
+        VF &posterior) {
+
+    assert(matXZ);
+    assert(matZY);
+
+    const int numRowsX = matXZ->GetSeq1Length();
+    const int numColsY = matZY->GetSeq2Length();
+    assert(matXZ->GetSeq2Length() == matZY->GetSeq1Length());
+
+    for (int x = 1; x <= numRowsX; x++) {
+        // stored cells of row x in the X-Z matrix
+        SafeVector<PIF>::iterator xzCell = matXZ->GetRowPtr(x);
+        SafeVector<PIF>::iterator xzStop = xzCell + matXZ->GetRowSize(x);
+
+        // start of row x in the dense X-Y matrix
+        VF::iterator rowXY = posterior.begin() + x * (numColsY + 1);
+
+        for (; xzCell != xzStop; xzCell++) {
+            // stored cells of the matching row z in the Z-Y matrix
+            SafeVector<PIF>::iterator zyCell = matZY->GetRowPtr(xzCell->first);
+            SafeVector<PIF>::iterator zyStop = zyCell
+                    + matZY->GetRowSize(xzCell->first);
+            const float probXZ = xzCell->second;
+
+            for (; zyCell != zyStop; zyCell++) {
+                rowXY[zyCell->first] += weight * probXZ * zyCell->second;
+            }
+        }
+    }
+}
+
+/////////////////////////////////////////////////////////////////
+// Relax1()
+//
+// Computes the consistency transformation for a single sequence
+// z, and adds the transformed matrix to "posterior".
+/////////////////////////////////////////////////////////////////
+
+// Variant of Relax() for the case where both input matrices have the
+// intermediate sequence z as their FIRST sequence (Z-X and Z-Y).
+//
+// For every row z, each stored cell (z, x) of matZX is combined with each
+// stored cell (z, y) of matZY, and weight * P(z,x) * P(z,y) is added to the
+// dense cell (x, y) of "posterior" (rows of matZY->GetSeq2Length() + 1
+// entries).
+void MSA::Relax1(float weight, SparseMatrix *matZX, SparseMatrix *matZY,
+        VF &posterior) {
+
+    assert(matZX);
+    assert(matZY);
+
+    const int numRowsZ = matZX->GetSeq1Length();
+    const int numColsY = matZY->GetSeq2Length();
+
+    for (int z = 1; z <= numRowsZ; z++) {
+        SafeVector<PIF>::iterator zxCell = matZX->GetRowPtr(z);
+        SafeVector<PIF>::iterator zxStop = zxCell + matZX->GetRowSize(z);
+
+        for (; zxCell != zxStop; zxCell++) {
+            // row z of the Z-Y matrix is re-read for every Z-X cell
+            SafeVector<PIF>::iterator zyCell = matZY->GetRowPtr(z);
+            SafeVector<PIF>::iterator zyStop = zyCell + matZY->GetRowSize(z);
+            const float probZX = zxCell->second;
+
+            // the X index selects the destination row of the dense matrix
+            VF::iterator rowXY = posterior.begin()
+                    + zxCell->first * (numColsY + 1);
+
+            for (; zyCell != zyStop; zyCell++) {
+                rowXY[zyCell->first] += weight * probZX * zyCell->second;
+            }
+        }
+    }
+}
+/////////////////////////////////////////////////////////////////
+// DoIterativeRefinement()
+//
+// Performs a single round of randomized partitioning iterative
+// refinement.
+/////////////////////////////////////////////////////////////////
+
+// Small deterministic pseudo-random generator used to split sequences into
+// two groups during iterative refinement.
+//
+// m    - range parameter; values are reduced modulo 19 * m
+// seed - only used when init is true, to reset the generator state
+// init - when true, the internal state is set to seed before stepping
+//
+// Each call advances the linear recurrence x <- (5 * x + 3) mod (19 * m)
+// seven times and returns the last value. The state lives in a function-local
+// static, so it is shared by all callers and is not synchronized.
+//
+// Returns 0 without stepping when 19 * m is zero; the original divided by
+// zero in that case.
+int MSA::GenRandom(int m, int seed, bool init) {
+    static const int a = 5, b = 3, n = 7;
+    static int rand0;
+    if (init) {
+        rand0 = seed;
+    }
+    m *= 19;
+    if (m == 0) {
+        // "% 0" is undefined behaviour; there is no valid range to draw from
+        return 0;
+    }
+    int rand1 = rand0;
+    for (int i = 0; i < n; i++) {
+        // widen the intermediate so a large seed cannot overflow int;
+        // the remainder always fits back into an int because |result| < |m|
+        rand1 = (int) ((((long long) a) * rand0 + b) % m);
+        rand0 = rand1;
+    }
+    return rand1;
+}
+
+// One round of iterative refinement with a pseudo-random bipartition.
+//
+// The sequences of "alignment" are split into two groups using GenRandom
+// (re-seeded with si), each group is projected out of the current alignment,
+// and the two projections are re-aligned. "alignment" is replaced by the
+// result (the old object is deleted). If either group ends up empty the
+// alignment is left untouched.
+void MSA::DoIterativeRefinement(
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model, MultiSequence* &alignment, int si) {
+    set<int> groupOne, groupTwo;
+    const int numSeqs = alignment->GetNumSequences();
+
+    // reset the generator; the value drawn here is not used
+    GenRandom(numSeqs, si, true);
+
+    // odd draws go to group one, even draws to group two
+    for (int seq = 0; seq < numSeqs; ++seq) {
+        const int draw = GenRandom(numSeqs, si);
+        if ((draw % 2) != 0)
+            groupOne.insert(seq);
+        else
+            groupTwo.insert(seq);
+    }
+
+    // a one-sided split leaves nothing to realign
+    if (!(!groupOne.empty() && !groupTwo.empty()))
+        return;
+
+    // extract both groups before discarding the current alignment
+    MultiSequence *groupOneSeqs = alignment->Project(groupOne);
+    assert(groupOneSeqs);
+    MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
+    assert(groupTwoSeqs);
+    delete alignment;
+
+    // the realigned profile pair becomes the new alignment
+    alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices,
+            model);
+
+    delete groupOneSeqs;
+    delete groupTwoSeqs;
+}
+// One round of iterative refinement driven by a guide-tree node.
+//
+// Group one consists of every leaf listed (left or right) in the alignment
+// order entry "nodeIndex" of the guide tree; group two is every other
+// sequence. Both groups are projected out of "alignment" and re-aligned;
+// "alignment" is replaced by the result (the old object is deleted). If
+// either group is empty the alignment is left untouched.
+void MSA::DoIterativeRefinementTreeNode(
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+        const ProbabilisticModel &model, MultiSequence* &alignment,
+        int nodeIndex) {
+    set<int> groupOne, groupTwo;
+    const int numSeqs = alignment->GetNumSequences();
+
+    // membership flags, all initially false
+    vector<bool> belowNode(numSeqs, false);
+
+    // mark every leaf recorded for this node, from both of its subtrees
+    AlignmentOrder* const allOrders = this->tree->getAlignOrders();
+    AlignmentOrder* const nodeOrder = allOrders + nodeIndex;
+    for (int l = 0; l < nodeOrder->leftNum; ++l) {
+        belowNode[nodeOrder->leftLeafs[l]] = true;
+    }
+    for (int r = 0; r < nodeOrder->rightNum; ++r) {
+        belowNode[nodeOrder->rightLeafs[r]] = true;
+    }
+
+    // turn the flags into the two index sets
+    for (int seq = 0; seq < numSeqs; ++seq) {
+        set<int> &target = belowNode[seq] ? groupOne : groupTwo;
+        target.insert(seq);
+    }
+
+    // a one-sided split leaves nothing to realign
+    if (!(!groupOne.empty() && !groupTwo.empty()))
+        return;
+
+    // extract both groups before discarding the current alignment
+    MultiSequence *groupOneSeqs = alignment->Project(groupOne);
+    assert(groupOneSeqs);
+    MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
+    assert(groupTwoSeqs);
+    delete alignment;
+
+    // the realigned profile pair becomes the new alignment
+    alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices,
+            model);
+
+    delete groupOneSeqs;
+    delete groupTwoSeqs;
+}
+
+/////////////////////////////////////////////////////////////////
+// WriteAnnotation()
+//
+// Computes annotation for multiple alignment and write values
+// to a file.
+/////////////////////////////////////////////////////////////////
+
+// Writes one annotation score per alignment column to annotationFilename.
+//
+// For each column the non-gap residues are collected as (sort label,
+// residue position) pairs, sorted, and scored with ComputeScore(); the score
+// is written on its own line with a field width of 4. Terminates the program
+// with exit(1) if the output file cannot be opened.
+void MSA::WriteAnnotation(MultiSequence *alignment,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
+    ofstream outfile(annotationFilename.c_str());
+
+    if (!outfile) {
+        cerr << "ERROR: Unable to write annotation file." << endl;
+        exit(1);
+    }
+
+    const int alignLength = alignment->GetSequence(0)->GetLength();
+    const int numSeqs = alignment->GetNumSequences();
+
+    // number of residues consumed so far in each sequence
+    SafeVector<int> position(numSeqs, 0);
+
+    // per-sequence character data, indexed by column starting at 1 below
+    SafeVector<SafeVector<char>::iterator> seqs(numSeqs);
+    for (int s = 0; s < numSeqs; ++s) {
+        seqs[s] = alignment->GetSequence(s)->GetDataPtr();
+    }
+
+    // (label, position) pairs of the residues present in the current column
+    SafeVector<pair<int, int> > active;
+    active.reserve(numSeqs);
+
+    // sort label of each sequence
+    SafeVector<int> lab;
+    for (int s = 0; s < numSeqs; ++s) {
+        lab.push_back(alignment->GetSequence(s)->GetSortLabel());
+    }
+
+    for (int col = 1; col <= alignLength; ++col) {
+
+        // gather the residues aligned in this column, skipping gaps
+        active.clear();
+        for (int s = 0; s < numSeqs; ++s) {
+            if (seqs[s][col] == '-')
+                continue;
+            position[s] += 1;
+            active.push_back(make_pair(lab[s], position[s]));
+        }
+
+        sort(active.begin(), active.end());
+        const int score = ComputeScore(active, sparseMatrices);
+        outfile << setw(4) << score << endl;
+    }
+
+    outfile.close();
+}
+
+/////////////////////////////////////////////////////////////////
+// ComputeScore()
+//
+// Computes the annotation score for a particular column.
+/////////////////////////////////////////////////////////////////
+
+// Scores one alignment column.
+//
+// active         - (sequence label, residue position) pairs of the residues
+//                  present in the column
+// sparseMatrices - pairwise posterior matrices, indexed by the labels
+//
+// Sums the posterior value of every pair (i < j) of active residues and
+// returns 200 * sum / (n * (n - 1)) truncated to int, where n is the number
+// of active residues. Columns with at most one residue score 0.
+int MSA::ComputeScore(const SafeVector<pair<int, int> > &active,
+        const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
+
+    const int count = (int) active.size();
+    if (count <= 1) {
+        return 0;
+    }
+
+    // ALTERNATIVE #1: Compute the average alignment score.
+    float total = 0;
+    for (int first = 0; first < count; ++first) {
+        for (int second = first + 1; second < count; ++second) {
+            SparseMatrix *pairMatrix =
+                    sparseMatrices[active[first].first][active[second].first];
+            total += pairMatrix->GetValue(active[first].second,
+                    active[second].second);
+        }
+    }
+
+    return (int) (200 * total / (count * (count - 1)));
+}
--- /dev/null
+#ifndef _MSA_H
+#define _MSA_H
+#include "MSADef.h"
+#include "MSAGuideTree.h"
+
+#include "SafeVector.h"
+#include "MultiSequence.h"
+#include "ScoreType.h"
+#include "ProbabilisticModel.h"
+#include "SparseMatrix.h"
+#include <string>
+using namespace std;
+
+class MSAGuideTree;
+struct TreeNode;
+// Top-level driver class of the aligner. It is constructed from the command
+// line arguments and declares the alignment pipeline steps: parameter
+// handling, consistency relaxation, progressive alignment along a guide
+// tree, iterative refinement and annotation output.
+class MSA {
+public:
+ MSA(int argc, char* argv[]);
+ ~MSA();
+
+ // writes a time value through dtime (implementation not shown here)
+ static void getSysTime(double * dtime);
+ // returns the guide tree pointer held by this object (not a copy)
+ MSAGuideTree* getGuideTree() {
+ return tree;
+ }
+ // returns the internal sequence-weight array (not a copy)
+ // NOTE(review): this is int*, while DoRelaxation() below takes float*
+ // weights - confirm which representation callers are expected to pass.
+ int * getSeqsWeights() {
+ return seqsWeights;
+ }
+private:
+ //print usage
+ void printUsage();
+ //do multiple sequence alignment
+ void doAlign();
+
+ //for sequence weights
+ void createSeqsWeights(int seqsNum);
+ void releaseSeqsWeights();
+
+ //weights of sequences
+ int * seqsWeights;
+ //guide tree
+ MSAGuideTree* tree;
+ //output file
+ string alignOutFileName;
+ std::ostream* alignOutFile;
+private:
+ // command line / parameter file handling
+ SafeVector<string> ParseParams(int argc, char *argv[]);
+ void PrintParameters(const char *message, const VF &initDistrib,
+ const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs,
+ const VF &emitSingle, const char *filename);
+
+ SafeVector<string> PostProbsParseParams(int argc, char **argv);
+ MultiSequence *doAlign(MultiSequence *sequence,
+ const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen,
+ VF &gapExtend, VVF &emitPairs, VF &emitSingle);
+ void ReadParameters();
+ // progressive alignment along the guide tree
+ MultiSequence* ProcessTree(TreeNode *tree, MultiSequence *sequences,
+ const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+ const ProbabilisticModel &model);
+ MultiSequence *ComputeFinalAlignment(MSAGuideTree *tree,
+ MultiSequence *sequences,
+ const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+ const ProbabilisticModel &model);
+ // aligns two alignments against each other; the caller owns the result
+ MultiSequence *AlignAlignments(MultiSequence *align1, MultiSequence *align2,
+ const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+ const ProbabilisticModel &model);
+ // one round of the weighted consistency transformation over all pairs
+ SafeVector<SafeVector<SparseMatrix *> > DoRelaxation(float* seqsWeights,
+ MultiSequence *sequences,
+ SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
+ // add the contribution of one intermediate sequence to a dense posterior;
+ // Relax takes X-Z and Z-Y matrices
+ void Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY,
+ VF &posterior);
+ // NOTE(review): the definition of Relax1 names its first matrix matZX
+ // (Z is the first sequence of both inputs); the parameter name here differs.
+ void Relax1(float weight, SparseMatrix *matXZ, SparseMatrix *matZY,
+ VF &posterior);
+
+ // deterministic pseudo-random generator; init = true reseeds it
+ int GenRandom(int m, int seed, bool init = false);
+ // iterative refinement: random bipartition seeded with si
+ void DoIterativeRefinement(
+ const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+ const ProbabilisticModel &model, MultiSequence* &alignment, int si);
+ void DoIterativeRefinement(
+ const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+ const ProbabilisticModel &model, MultiSequence* &alignment);
+ // iterative refinement: bipartition taken from guide-tree node nodeIndex
+ void DoIterativeRefinementTreeNode(
+ const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+ const ProbabilisticModel &model, MultiSequence* &alignment,
+ int nodeIndex);
+ // per-column annotation scores
+ void WriteAnnotation(MultiSequence *alignment,
+ const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
+ int ComputeScore(const SafeVector<pair<int, int> > &active,
+ const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
+#ifdef _OPENMP
+ //private struct
+ // one (seq1, seq2) index pair; DoRelaxation iterates over seqsPairs in
+ // its OpenMP build
+ struct SeqsPair {
+ int seq1;
+ int seq2;
+ };
+ int numPairs;
+ SeqsPair* seqsPairs;
+#endif
+};
+
+#endif
--- /dev/null
+/***********************************************
+ * # Copyright 2009-2010. Liu Yongchao
+ * # Contact: Liu Yongchao, School of Computer Engineering,
+ * # Nanyang Technological University.
+ * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
+ * #
+ * # GPL version 3.0 applies.
+ * #
+ * ************************************************/
+
+#include "MSAClusterTree.h"
+// Forwards all arguments to the MSAGuideTree base constructor; this class
+// adds no state of its own.
+MSAClusterTree::MSAClusterTree(MSA* msa, VVF& distMatrix, int numSeqs) :
+ MSAGuideTree(msa, distMatrix, numSeqs) {
+}
+// Nothing to release here; the base class destructor frees the tree data.
+MSAClusterTree::~MSAClusterTree() {
+}
+// Builds the complete guide tree in three steps, which must run in this
+// order because each one relies on the tree links set up by the first.
+void MSAClusterTree::create() {
+    // 1. join the sequences into a rooted cluster tree
+    generateClusterTree();
+
+    // 2. derive the per-sequence weights from the tree
+    getSeqsWeights();
+
+    // 3. record, for each internal node, which leaves get aligned
+    createAlignmentOrders();
+}
+// Builds the guide tree by repeatedly joining the two closest clusters.
+//
+// The still-unjoined clusters are kept in a doubly linked list of ValidNode
+// entries (validNodes[0] is a dummy head). In every round the pair with the
+// smallest distance is joined under a new internal node, the second cluster
+// is unlinked, and the first cluster's row/column of distMatrix is
+// overwritten with the size-weighted average distance of the merged cluster.
+// The distance matrix is therefore modified in place. The last internal
+// node created becomes the root.
+void MSAClusterTree::generateClusterTree() {
+ int i;
+ ValidNode* validNodes, *headValidNodes;
+ ValidNode* miniPtr, *minjPtr, *ivalid, *jvalid;
+ int mini, minj;
+ float* joins;
+ unsigned int* clusterLeafs;
+
+ //initialize the valid nodes link list
+ validNodes = new ValidNode[leafsNum + 1];
+ joins = new float[leafsNum + 1];
+ clusterLeafs = new unsigned int[nodesNum + 1];
+ // NOTE(review): a plain new-expression reports failure by throwing, so this
+ // branch is not expected to run; it also only prints and then continues.
+ if (!validNodes || !joins || !clusterLeafs) {
+ cerr << "Out of memory of the reconstruction of cluster tree" << endl;
+ }
+ //initialize cluster size
+ // (every leaf starts as a cluster of one)
+ for (i = 0; i < this->leafsNum; i++) {
+ clusterLeafs[i] = 1;
+ }
+
+ // entry 0 is a dummy head that never represents a cluster
+ headValidNodes = &validNodes[0];
+ headValidNodes->next = &validNodes[1];
+ headValidNodes->n = -1;
+ headValidNodes->node = -1;
+ headValidNodes->prev = NULL;
+
+ //build an initial link list
+ // (entries 1..leafsNum, one per leaf, in index order)
+ ValidNode* curr = &validNodes[1];
+ ValidNode* prev = headValidNodes;
+ ValidNode* next = &validNodes[2];
+ for (i = 0; i < leafsNum; i++) {
+ curr->n = i;
+ curr->node = i;
+ curr->prev = prev;
+ curr->next = next;
+ prev = curr;
+ curr = next;
+ next++;
+ }
+ // terminate the list at the last leaf entry
+ prev->next = NULL;
+
+ //to generate the cluster tree
+ int nodeIdx; //the index of an internal node
+ int firstNode = leafsNum; //the index of the first internal node
+ int lastNode = firstNode + leafsNum - 1;//the index of the last internal node
+
+ // one join per iteration: leafsNum - 1 internal nodes in total
+ for (nodeIdx = firstNode; nodeIdx < lastNode; nodeIdx++) {
+ //find closest pair of clusters
+ // 2.0f is the initial "no pair found" bound; only distances below it can
+ // be selected (assumes distances lie below 2 - TODO confirm with caller)
+ float minDist = 2.0f;
+ miniPtr = headValidNodes;
+ minjPtr = headValidNodes;
+
+ // scan every unordered pair once: the inner loop stops at jvalid->n >= mini
+ for (ivalid = headValidNodes->next; ivalid != NULL;
+ ivalid = ivalid->next) {
+ mini = ivalid->n;
+ for (jvalid = headValidNodes->next;
+ jvalid != NULL && jvalid->n < mini; jvalid = jvalid->next) {
+ minj = jvalid->n;
+ float dist = (*distMatrix)[mini][minj];
+ // negative distances are reported and clamped to zero
+ if (dist < 0) {
+ cerr
+ << "ERROR: It is impossible to have distance value less than zero"
+ << endl;
+ dist = 0;
+ }
+ if (dist < minDist) {
+ minDist = dist;
+ miniPtr = ivalid;
+ minjPtr = jvalid;
+ }
+ //printf("dist %g mini %d minj %d\n", dist, ivalid->node, jvalid->node);
+ }
+ }
+ //printf("**** mini %d minj %d minDist %g *****\n", miniPtr->node, minjPtr->node, minDist);
+ //check the validity of miniPtr and minjPtr;
+ // (still pointing at the dummy head means no pair was below the bound)
+ if (miniPtr == headValidNodes || minjPtr == headValidNodes) {
+ cerr << "OOPS: Error occurred while constructing the cluster tree\n"
+ << endl;
+ exit(-1);
+ }
+ //computing branch length and join the two nodes
+ // (both children get half of the pair distance)
+ float branchLength = minDist * 0.5f;
+ this->connectNodes(&nodes[nodeIdx], nodeIdx, &nodes[miniPtr->node],
+ branchLength, &nodes[minjPtr->node], branchLength);
+ clusterLeafs[nodeIdx] = clusterLeafs[miniPtr->node]
+ + clusterLeafs[minjPtr->node];
+
+ //remove the valid node minjPtr from the list
+ // (its n/node fields stay readable; they are still used just below)
+ minjPtr->prev->next = minjPtr->next;
+ if (minjPtr->next != NULL) {
+ minjPtr->next->prev = minjPtr->prev;
+ }
+ minjPtr->prev = minjPtr->next = NULL;
+
+ //compute the distance of each remaining valid node to the new node
+ // (average of the two old distances, weighted by cluster sizes; results go
+ // to joins[] first so the matrix is not modified while it is being read)
+ for (ivalid = headValidNodes->next; ivalid != NULL;
+ ivalid = ivalid->next) {
+ int idx = ivalid->n;
+
+ float idist = (*distMatrix)[miniPtr->n][idx];
+ float jdist = (*distMatrix)[minjPtr->n][idx];
+
+ unsigned int isize = clusterLeafs[miniPtr->node];
+ unsigned int jsize = clusterLeafs[minjPtr->node];
+ joins[idx] = (idist * isize + jdist * jsize) / (isize + jsize);
+ }
+ //update the distance to the new node
+ // (the merged cluster reuses miniPtr's list entry and matrix row/column)
+ miniPtr->node = nodeIdx;
+ mini = miniPtr->n;
+ for (jvalid = headValidNodes->next; jvalid != NULL;
+ jvalid = jvalid->next) {
+ minj = jvalid->n;
+
+ float dist = joins[minj];
+ (*distMatrix)[mini][minj] = dist;
+ (*distMatrix)[minj][mini] = dist;
+ }
+ }
+ // the last internal node created (index lastNode - 1) is the root
+ this->root = &nodes[lastNode - 1];
+
+ delete[] validNodes;
+ delete[] joins;
+ delete[] clusterLeafs;
+}
--- /dev/null
+/***********************************************
+ * # Copyright 2009-2010. Liu Yongchao
+ * # Contact: Liu Yongchao, School of Computer Engineering,
+ * # Nanyang Technological University.
+ * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
+ * #
+ * # GPL version 3.0 applies.
+ * #
+ * ************************************************/
+
+#ifndef _MSA_CLUSTER_TREE_H
+#define _MSA_CLUSTER_TREE_H
+
+#include "MSAGuideTree.h"
+
+class MSAClusterTree: public MSAGuideTree {
+public:
+ MSAClusterTree(MSA* msa, VVF& distMatrix, int numSeqs);
+ ~MSAClusterTree();
+
+ //construct the cluster tree
+ void create();
+private:
+ //generate the cluster tree
+ void generateClusterTree();
+};
+#endif
--- /dev/null
+#ifndef _MSA_DEF_H
+#define _MSA_DEF_H
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <float.h>
+#include <math.h>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+//maximum number
+// largest value of a 32-bit signed integer
+#define MAX_INT_NUM 0x7FFFFFFF
+// largest finite float (FLT_MAX from <float.h>)
+#define MAX_FLOAT_NUM FLT_MAX
+// scale applied when MSAGuideTree::getSeqsWeights normalizes the integer
+// sequence weights
+#define INT_MULTIPLY 1000
+
+// NOTE(review): not referenced in the code visible here; presumably an
+// integer scale for substitution-matrix scores - confirm at the use site.
+#define SUBMATRIX_INT_SCALE 100
+
+//a tree node is a leaf or a node
+// values stored in TreeNode::leaf; the MSAGuideTree constructor assigns
+// NODE to every node and then LEAF to the leaf entries
+enum {
+ NONE, NODE, LEAF
+};
+
+#endif
+
--- /dev/null
+/***********************************************
+ * # Copyright 2009-2010. Liu Yongchao
+ * # Contact: Liu Yongchao, School of Computer Engineering,
+ * # Nanyang Technological University.
+ * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
+ * #
+ * # GPL version 3.0 applies.
+ * #
+ * ************************************************/
+#include "MSAGuideTree.h"
+#include "MSA.h"
+// Sets up an empty guide tree for numSeqs sequences.
+//
+// msa       - owning aligner; its sequence-weight array is borrowed
+// distances - pairwise distance matrix; only its address is stored, so it
+//             must outlive this object
+// numSeqs   - number of sequences (= number of leaves)
+//
+// Allocates 2 * numSeqs + 1 tree nodes, resets all of them, and marks the
+// first numSeqs entries as leaves with idx equal to their position. The root
+// and the alignment-order table are explicitly cleared so that the
+// destructor and releaseAlignmentOrders() never read indeterminate members
+// when the tree is destroyed before create() has run.
+MSAGuideTree::MSAGuideTree(MSA* msa, VVF& distances, int numSeqs) {
+    int i;
+    TreeNode* node;
+    //system configuration
+    this->msa = msa;
+    this->distMatrix = &distances;
+    this->numSeqs = numSeqs;
+    this->seqsWeights = msa->getSeqsWeights();
+
+    //nothing has been built yet: no root and no alignment orders
+    this->root = 0;
+    this->alignOrders = 0;
+    this->alignOrdersNum = 0;
+    this->alignOrdersSize = 0;
+
+    //tree structure
+    this->nodesSize = this->numSeqs * 2 + 1;
+    this->nodes = new TreeNode[this->nodesSize];
+    if (!this->nodes) {
+        cerr << "TreeNodes memory allocation failed" << endl;
+        exit(-1);
+    }
+    //initialize all the tree nodes
+    this->leafs = this->nodes;
+    this->leafsNum = this->numSeqs;
+    this->nodesNum = 2 * this->leafsNum - 1;
+    for (i = 0; i < this->nodesSize; i++) {
+        node = &nodes[i];
+        node->left = 0;
+        node->right = 0;
+        node->parent = 0;
+        node->leftIdx = -1;
+        node->rightIdx = -1;
+        node->parentIdx = -1;
+        node->idx = -1;
+        node->dist = 0;
+        node->leaf = NODE; //set to NODE by default
+        node->order = 0;
+        node->depth = 0;
+    }
+    //initialize the leaf nodes (they occupy the front of the node array)
+    for (i = 0; i < this->leafsNum; i++) {
+        node = &this->leafs[i];
+        node->idx = i;
+        node->leaf = LEAF;
+    }
+}
+// Frees the node array and then the alignment-order table.
+MSAGuideTree::~MSAGuideTree() {
+    // tree nodes
+    delete[] nodes;
+
+    // alignment orders and their leaf lists
+    this->releaseAlignmentOrders();
+}
+//get the tree nodes
+// Returns the internal node array (leaves first); not a copy.
+TreeNode* MSAGuideTree::getNodes() {
+    return this->nodes;
+}
+//get the leaf nodes
+// Returns the start of the leaf entries inside the node array; not a copy.
+TreeNode* MSAGuideTree::getLeafs() {
+    return this->leafs;
+}
+//get the number of nodes;
+// Returns the stored node count.
+int MSAGuideTree::getNodesNum() {
+    return this->nodesNum;
+}
+//get the number of leaf nodes
+// Returns the stored leaf count.
+int MSAGuideTree::getLeafsNum() {
+    return this->leafsNum;
+}
+//get the alignment orders
+// Returns the internal alignment-order table; not a copy.
+AlignmentOrder* MSAGuideTree::getAlignOrders() {
+    return this->alignOrders;
+}
+// Returns the number of alignment orders recorded so far.
+int MSAGuideTree::getAlignOrdersNum() {
+    return this->alignOrdersNum;
+}
+/****************************************************
+ create the evolutionary relationship
+ ****************************************************/
+// Links two children under a parent node.
+//
+// parent, parentIdx       - the joining node and the index it receives
+// leftChild, leftDist     - left child and its branch length to the parent
+// rightChild, rightDist   - right child and its branch length to the parent
+//
+// Each child records the parent (pointer and index) and its own branch
+// length; the parent records its own index plus pointer and idx of both
+// children.
+void MSAGuideTree::connectNodes(TreeNode* parent, int parentIdx,
+        TreeNode* leftChild, float leftDist, TreeNode* rightChild,
+        float rightDist) {
+    // left child: upward link and branch length
+    leftChild->parent = parent;
+    leftChild->parentIdx = parentIdx;
+    leftChild->dist = leftDist;
+
+    // right child: upward link and branch length
+    rightChild->parent = parent;
+    rightChild->parentIdx = parentIdx;
+    rightChild->dist = rightDist;
+
+    // parent: own index first, then the downward links
+    parent->idx = parentIdx;
+    parent->left = leftChild;
+    parent->leftIdx = leftChild->idx;
+    parent->right = rightChild;
+    parent->rightIdx = rightChild->idx;
+}
+/*****************************************
+ compute the alignment order of the phylogentic tree
+ *****************************************/
+// Allocates the alignment-order table and fills it by walking the tree from
+// the root.
+//
+// The table has numSeqs slots. Slot 0 is never filled: orders are numbered
+// from 1, and 0 is the value returned for a leaf. After the walk, the leaf
+// count and the order number returned for the root are checked and a
+// diagnostic is printed to stderr if they do not match.
+void MSAGuideTree::createAlignmentOrders() {
+    // counter starts at 0; the first order is written to slot 1
+    alignOrdersNum = 0;
+    // one slot per internal node, plus the unused slot 0
+    alignOrdersSize = numSeqs;
+    alignOrders = new AlignmentOrder[alignOrdersSize];
+    if (alignOrders == 0) {
+        cerr << "OOPS: Alignment orders memory allocation failed" << endl;
+        exit(-1);
+    }
+
+    // clear every slot so unused ones hold null leaf lists
+    for (int slot = 0; slot < alignOrdersSize; ++slot) {
+        AlignmentOrder& entry = alignOrders[slot];
+        entry.leftOrder = 0;
+        entry.rightOrder = 0;
+        entry.leftLeafs = 0;
+        entry.leftNum = 0;
+        entry.rightLeafs = 0;
+        entry.rightNum = 0;
+    }
+
+    // walk the tree; the root has no parent buffer, hence the null argument
+    int subLeafsNum;
+    const int rootDepth = 1;
+    const int subOrder = recursiveCreateAlignmentOrders(root, 0, subLeafsNum,
+            rootDepth);
+
+    // sanity check: all leaves seen, and the root was numbered last
+    const bool consistent = (subLeafsNum == numSeqs)
+            && (alignOrdersNum == subOrder);
+    if (!consistent) {
+        fprintf(stderr,
+                "The alignment orders constructed were wrong (subLeafsNum %d, alignOrdersNum %d, subOrder %d)\n",
+                subLeafsNum, alignOrdersNum, subOrder);
+    }
+}
+// Post-order walk that records one AlignmentOrder per internal node.
+//
+// subRoot     - root of the subtree to process
+// subLeafs    - caller's buffer receiving the leaf indices of this subtree
+//               (left leaves followed by right leaves); may be null, as it
+//               is for the tree root
+// subLeafsNum - out: number of leaves in this subtree
+// nodeDepth   - depth recorded for this node
+//
+// Returns 0 for a leaf, otherwise the (1-based) slot in alignOrders that was
+// written for subRoot. Children are numbered before their parent.
+//
+// Changes from the previous version:
+//  - a leaf root (single-sequence input) no longer writes through the null
+//    subLeafs buffer;
+//  - the capacity check now rejects the write that would land past the end
+//    of alignOrders (the old test allowed indices Size and Size + 1);
+//  - stray characters after the error fprintf were removed.
+int MSAGuideTree::recursiveCreateAlignmentOrders(TreeNode* subRoot,
+        int* subLeafs, int& subLeafsNum, int nodeDepth) {
+    int leftNum, rightNum;
+    int leftOrder, rightOrder;
+    int* leftLeafs, *rightLeafs;
+
+    if (subRoot->leaf == LEAF) {
+        if (subLeafs) {
+            subLeafs[0] = subRoot->idx;
+        }
+        subLeafsNum = 1;
+
+        return 0; //if it is a leaf, return the index 0
+    }
+    leftOrder = rightOrder = 0;
+    leftNum = rightNum = 0;
+    //scratch buffers large enough for any subtree
+    leftLeafs = new int[numSeqs];
+    rightLeafs = new int[numSeqs];
+
+    //recursively traverse the left subtree
+    if (subRoot->left) {
+        leftOrder = recursiveCreateAlignmentOrders(subRoot->left, leftLeafs,
+                leftNum, nodeDepth + 1);
+    }
+    //recursively traverse the right subtree
+    if (subRoot->right) {
+        rightOrder = recursiveCreateAlignmentOrders(subRoot->right, rightLeafs,
+                rightNum, nodeDepth + 1);
+    }
+    //the next slot to be written is alignOrdersNum + 1; it must be a valid
+    //index of the alignOrdersSize-element table
+    if (this->alignOrdersNum + 1 >= this->alignOrdersSize) {
+        fprintf(stderr, "the alignment order function works bad\n");
+        exit(-1);
+    }
+
+    //save the leafs in the left and right subtrees of the current subtree
+    AlignmentOrder* order = &this->alignOrders[++this->alignOrdersNum];
+    order->nodeDepth = nodeDepth;
+    order->leftOrder = leftOrder;
+    order->rightOrder = rightOrder;
+    order->leftNum = leftNum;
+    order->rightNum = rightNum;
+    order->leftLeafs = new int[order->leftNum];
+    order->rightLeafs = new int[order->rightNum];
+    if (!order->leftLeafs || !order->rightLeafs) {
+        fprintf(stderr,
+                "memory allocation failed while recursively constructing alignment orders\n");
+        exit(-1);
+    }
+    memcpy(order->leftLeafs, leftLeafs, order->leftNum * sizeof(int));
+    memcpy(order->rightLeafs, rightLeafs, order->rightNum * sizeof(int));
+
+    delete[] leftLeafs;
+    delete[] rightLeafs;
+
+    //for the root of the tree, subLeafs buffer is set to 0
+    if (subLeafs) {
+        //copy the results to the parent tree node
+        memcpy(subLeafs, order->leftLeafs, order->leftNum * sizeof(int));
+        memcpy(subLeafs + order->leftNum, order->rightLeafs,
+                order->rightNum * sizeof(int));
+    }
+    //compute the total number of leafs in this subtree
+    subLeafsNum = order->leftNum + order->rightNum;
+
+    return this->alignOrdersNum;//return the index of itself, starting from 1, instead of 0
+}
+// Frees the alignment-order table together with every per-node leaf list.
+//
+// Orders are stored in slots 1..alignOrdersNum, so the previous loop over
+// [0, alignOrdersNum) skipped the last slot and leaked its two leaf arrays.
+// All alignOrdersSize slots are visited instead: createAlignmentOrders()
+// null-initializes every slot, and delete[] on a null pointer is a no-op.
+// The members are reset afterwards so a second call is harmless.
+void MSAGuideTree::releaseAlignmentOrders() {
+    if (!this->alignOrders) {
+        return;
+    }
+    for (int i = 0; i < this->alignOrdersSize; i++) {
+        AlignmentOrder* order = &this->alignOrders[i];
+        delete[] order->leftLeafs;
+        delete[] order->rightLeafs;
+    }
+    delete[] this->alignOrders;
+
+    // leave the object in a "nothing allocated" state
+    this->alignOrders = 0;
+    this->alignOrdersNum = 0;
+    this->alignOrdersSize = 0;
+}
+/********************************
+ display the alignment orders
+ ********************************/
+// Debug helper: prints every recorded alignment order (slots 1 through
+// alignOrdersNum) to stderr, with its depth, child order numbers and the
+// leaf indices of both subtrees.
+void MSAGuideTree::displayAlignmentOrders() {
+    fprintf(stderr, "************DISPLAY ALIGNMENT ORDER***************\n");
+    for (int slot = 1; slot <= alignOrdersNum; ++slot) {
+        const AlignmentOrder& entry = alignOrders[slot];
+
+        // header line and left subtree
+        fprintf(stderr, "GROUP (%d depth %d):\n---LEFT ORDER: %d\n", slot,
+                entry.nodeDepth, entry.leftOrder);
+        fprintf(stderr, "---LEFT: ");
+        for (int l = 0; l < entry.leftNum; ++l) {
+            fprintf(stderr, "%d ", entry.leftLeafs[l]);
+        }
+
+        // right subtree
+        fprintf(stderr, "\n---RIGHT ORDER: %d\n", entry.rightOrder);
+        fprintf(stderr, "\n---RIGHT: ");
+        for (int r = 0; r < entry.rightNum; ++r) {
+            fprintf(stderr, "%d ", entry.rightLeafs[r]);
+        }
+        fprintf(stderr, "\n");
+    }
+    fprintf(stderr, "*******************************************\n");
+}
+/*********************************
+ display the tree
+ *********************************/
+// Debug helper: prints one line per tree node to stderr with its own index,
+// the pointers and indices of its children and parent, and its branch
+// length. An index is printed as -2 when the stored index does not refer to
+// the same node as the stored pointer (a missing link counts as consistent).
+void MSAGuideTree::displayTree() {
+    fprintf(stderr, "**************DISPLAY TREE*********************\n");
+    for (int n = 0; n < nodesNum; ++n) {
+        TreeNode* cur = &nodes[n];
+
+        // cross-check every stored index against the matching pointer
+        const int selfIdx = (cur == &nodes[cur->idx]) ? cur->idx : -2;
+        const int leftShown =
+                (!cur->left || cur->left == &nodes[cur->leftIdx]) ?
+                        cur->leftIdx : -2;
+        const int rightShown =
+                (!cur->right || cur->right == &nodes[cur->rightIdx]) ?
+                        cur->rightIdx : -2;
+        const int parentShown =
+                (!cur->parent || cur->parent == &nodes[cur->parentIdx]) ?
+                        cur->parentIdx : -2;
+
+        fprintf(stderr,
+                "%d(%p): left(%p) %d, right(%p) %d, parent(%p) %d, dist %f\n",
+                selfIdx, cur, cur->left, leftShown, cur->right, rightShown,
+                cur->parent, parentShown, cur->dist);
+    }
+    fprintf(stderr, "*******************************************\n");
+}
+/*********************************
+ compute the sequence weights
+ *********************************/
+// Computes integer sequence weights from the tree and stores them in
+// seqsWeights.
+//
+// 1. Every node's "order" is incremented once for each leaf below it (the
+//    counters are not reset first).
+// 2. A leaf's raw weight is the sum, along its path to the root, of
+//    branch length / order, scaled by 100 and truncated to int.
+// 3. The weights are normalized to sum to about INT_MULTIPLY using integer
+//    division, with a floor of 1. If all raw weights are zero, every
+//    sequence is treated as equally weighted.
+void MSAGuideTree::getSeqsWeights() {
+    // step 1: count the leaves below every node
+    for (int leaf = 0; leaf < leafsNum; ++leaf) {
+        for (TreeNode* up = &this->leafs[leaf]; up != 0; up = up->parent) {
+            up->order++;
+        }
+    }
+
+    // step 2: raw weight per sequence, and the running total
+    int wsum = 0;
+    for (int seq = 0; seq < numSeqs; ++seq) {
+        float pathWeight = 0;
+        // the root itself has no branch above it, so the walk stops there
+        for (TreeNode* up = &this->leafs[seq]; up->parent != 0;
+                up = up->parent) {
+            pathWeight += up->dist / up->order;
+        }
+        seqsWeights[seq] = (int) (100 * pathWeight);
+        wsum += seqsWeights[seq];
+    }
+
+    // degenerate tree: fall back to identical weights
+    if (wsum == 0) {
+        for (int seq = 0; seq < numSeqs; ++seq) {
+            seqsWeights[seq] = 1;
+        }
+        wsum = numSeqs;
+    }
+
+    // step 3: normalize, never letting a weight drop below 1
+    for (int seq = 0; seq < numSeqs; ++seq) {
+        const int scaled = (seqsWeights[seq] * INT_MULTIPLY) / wsum;
+        seqsWeights[seq] = (scaled < 1) ? 1 : scaled;
+    }
+}
+// Default implementation of the virtual tree-construction hook: builds
+// nothing. Derived classes such as MSAClusterTree override it.
+void MSAGuideTree::create() {
+ //do nothing
+}
+
--- /dev/null
+/***********************************************
+ * # Copyright 2009-2010. Liu Yongchao
+ * # Contact: Liu Yongchao, School of Computer Engineering,
+ * # Nanyang Technological University.
+ * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
+ * #
+ * # GPL version 3.0 applies.
+ * #
+ * ************************************************/
+
+#ifndef _MSA_GUIDE_TREE_H
+#define _MSA_GUIDE_TREE_H
+#include "MSADef.h"
+#include "MSA.h"
+
+#include "SafeVector.h"
+#include "MultiSequence.h"
+#include "ScoreType.h"
+#include "ProbabilisticModel.h"
+#include "SparseMatrix.h"
+
+class MSA;
+// Entry of the doubly linked list of clusters that are still available for
+// joining while the cluster tree is being built.
+struct ValidNode {
+ ValidNode* prev; //previous list entry (NULL for the list head)
+ ValidNode* next; //next list entry (NULL at the end of the list)
+ int n; //the index in the distance matrix
+ int node; //the index in the tree node entries
+};
+
+// One node of the guide tree. Links are stored both as pointers and as
+// indices into the node array; unset links are 0 / -1.
+struct TreeNode {
+ struct TreeNode *left; //the pointer to its left child
+ struct TreeNode *right; //the pointer to its right child
+ struct TreeNode *parent; //the pointer to its parent
+ int leftIdx; //the index of the left child
+ int rightIdx; //the index of the right child
+ int parentIdx; //the index of its parent
+ int idx; //the index of itself
+ float dist; //the distance to its parent
+ int leaf; //node kind: LEAF for a leaf, NODE otherwise
+ int order; //number of leaves in the subtree rooted here (filled in by MSAGuideTree::getSeqsWeights)
+ int depth; //the depth of the node (initialized to 0 by the constructor)
+};
+// Describes one internal node of the guide tree for progressive alignment:
+// which leaves sit in its left and right subtrees, and which alignment
+// orders its children correspond to. Order numbers start at 1; 0 means the
+// child is a leaf.
+struct AlignmentOrder {
+ int nodeDepth; //the depth of the internal node
+ int leftOrder; //the order number of the left child (0 if it is a leaf)
+ int rightOrder; //the order number of the right child (0 if it is a leaf)
+ int* leftLeafs; //the indices of leafs in the left subtree
+ int leftNum; //the number of leafs in the left subtree
+ int* rightLeafs; //the indices of leafs in the right subtree
+ int rightNum; //the number of leafs in the right subtree
+};
+
+// Base class for guide trees. It owns the tree node array and the
+// alignment-order table, and provides the shared steps (linking nodes,
+// computing sequence weights, deriving alignment orders). Concrete trees
+// override create() to build the topology.
+class MSAGuideTree {
+public:
+ // distMatrix is stored by address and must outlive the tree
+ MSAGuideTree(MSA* msa, VVF& distMatrix, int numSeqs);
+ // pure virtual to make the class abstract; a definition is still provided
+ // so derived destructors can chain to it
+ virtual ~MSAGuideTree() = 0; //abstract class
+
+ //get the tree nodes
+ TreeNode* getNodes();
+ //get the leaf nodes
+ TreeNode* getLeafs();
+ //get the number of nodes;
+ int getNodesNum();
+ //get the number of leaf nodes
+ int getLeafsNum();
+ //get the root of the tree
+ TreeNode* getRoot() {
+ return this->root;
+ }
+ //get the alignment orders
+ // (entries are numbered from 1; slot 0 is unused)
+ AlignmentOrder* getAlignOrders();
+ int getAlignOrdersNum();
+ //construct the alignment orders
+ void createAlignmentOrders();
+
+ //construct the guide tree
+ // (the base implementation does nothing)
+ virtual void create();
+ //calculate the sequence weights
+ // (despite the "get" name this returns nothing; it writes into seqsWeights)
+ virtual void getSeqsWeights();
+
+ /**********DEBUGING****************/
+ //display the tree
+ void displayTree();
+ //display the alignment orders
+ void displayAlignmentOrders();
+
+protected:
+ //join two nodes
+ void connectNodes(TreeNode* parent, int parentIdx, TreeNode* leftChild,
+ float leftDist, TreeNode* rightChild, float rightDist);
+ //release the alignment orders vector
+ void releaseAlignmentOrders();
+ //recursive implemenation of constructing the alignment orders
+ int recursiveCreateAlignmentOrders(TreeNode* subRoot, int* subLeafs,
+ int& subLeafsNum, int nodeDepth);
+
+ //system configurations
+ MSA* msa;
+ VVF* distMatrix;
+ int numSeqs;
+ // weight array obtained from msa->getSeqsWeights() in the constructor
+ int* seqsWeights;
+
+ //all the tree nodes
+ // nodesSize is the allocated length, nodesNum the number of nodes in use
+ TreeNode* nodes;
+ int nodesNum;
+ int nodesSize;
+ //the root tree node
+ TreeNode* root;
+ //leaf node
+ // (points at the front of the nodes array)
+ TreeNode* leafs;
+ int leafsNum;
+
+ //alignment order
+ // alignOrdersSize is the allocated length, alignOrdersNum the last used slot
+ AlignmentOrder* alignOrders;
+ int alignOrdersNum;
+ int alignOrdersSize;
+};
+#endif
+
--- /dev/null
+/***********************************************
+ * # Copyright 2009-2010. Liu Yongchao
+ * # Contact: Liu Yongchao, School of Computer Engineering,
+ * # Nanyang Technological University.
+ * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
+ * #
+ * # GPL version 3.0 applies.
+ * #
+ * ************************************************/
+#include "SafeVector.h"
+#include <iostream>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include <ctype.h>
+#include <assert.h>
+#define TRACE 0 // 0: NOTRACE 1: TRACE (debug dumps to stdout and to the files "revpartdump" / "dump1")
+//proba like settings
+#define endgaps 1 // 1: separate terminal (end) gap weights are used 0: end gaps are weighted like internal gaps
+#define PART_FULL_MEMORY 0 //0: LOW MEM OPTION (partf keeps only two rows of Ze/Zf); nonzero: full matrices
+#define REVPART_FULL_MEMORY 0 //0: LOW MEM OPTION (revers_partf keeps only two rows of Zm/Ze/Zf); nonzero: full matrices
+using namespace std;
+
+#ifdef _WIN32
+#define OS_HUGE_VALL HUGE_VAL
+#else
+#define OS_HUGE_VALL HUGE_VALL
+#endif
+
+typedef struct { //run-time settings; NOTE(review): the same struct is defined again in MSAReadMatrix.cpp - the two copies must stay identical
+ char input[30];
+ int matrix; //scoring matrix type code (ComputePostProbs only prints it when tracing)
+ int N;
+ float T;
+ float beta; //factor applied to the gap scores inside exp() in ComputePostProbs
+ char opt; //can be 'P' or 'M'
+ float gapopen; //gap open score before exponentiation
+ float gapext; //gap extension score before exponentiation
+} argument_decl;
+
+typedef struct sequence { //one input sequence as handed to partf() / revers_partf()
+ char *title; //name of the sequence
+ char *text; //NUL-terminated residue string (its length is taken with strlen())
+ int length; //number of residues; ComputePostProbs sets it to strlen(text)
+} fasta;
+
+typedef struct alignment { //same layout as fasta; NOTE(review): not referenced by the functions in this file - confirm whether it is still needed
+ char *title;
+ char *text;
+ int length;
+} align;
+
+////////////////////////////////////////////////////////
+//externs related to scoring matrix and input arguments
+///////////////////////////////////////////////////////////
+extern float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2;
+extern char aminos[26], matrixtype[20], bases[26]; //NOTE(review): MSAReadMatrix.cpp declares aminos and bases as char* instead of char[26]; both declarations cannot match the one definition - confirm and unify
+
+extern double sub_matrix[26][26]; //read as sub_matrix[Si][Tj]: the weight of pairing two residues
+extern int subst_index[26]; //indexed by (residue - 'A'); gives the row/column of that residue in sub_matrix
+
+extern float TEMPERATURE;
+extern int MATRIXTYPE;
+
+extern float GAPOPEN;
+extern float GAPEXT;
+extern argument_decl argument; //the settings read by ComputePostProbs (gapopen, gapext, beta, N, matrix)
+
+//////////////////////////////////////////////////////////////////////////////
+//calculates reverse partition function values based on z matrices
+//and also simultaneously calculates the probability of each basepair
+//or aminoacid residue pair i,j
+//
+//parameters:
+// sequences      the two sequences; only their NUL-terminated text is read
+// termgapopen    weight of opening a terminal gap (used when endgaps != 0)
+// termgapextend  weight of extending a terminal gap (used when endgaps != 0)
+// Zfm            forward match matrix produced by partf(): len1+1 rows of
+//                len0+1 entries, with the partition function total stored
+//                in Zfm[0][0]; it is released by this function
+// d, e           weights of opening / extending an internal gap
+//
+//returns a heap-allocated VF with (len0+1)*(len1+1) entries that the caller
+//owns. The probability of pairing sequences[1].text[i] with
+//sequences[0].text[j] is stored at index (j+1)*(len1+1)+(i+1) when it lies
+//in [0.001, 1]; every other entry is 0. The probabilities are only computed
+//in the low memory mode (REVPART_FULL_MEMORY == 0).
+//
+//the residues are expected to be letters for which subst_index holds a
+//valid (non-negative) index into sub_matrix; this is not checked here
+//////////////////////////////////////////////////////////////////////////////
+
+VF *revers_partf(fasta sequences[2], const double termgapopen,
+        const double termgapextend, long double **Zfm, const double d,
+        const double e) {
+    int i, j;
+    int Si, Tj;
+    float probability;
+    long double tempvar;
+    double scorez;
+    FILE *fo = NULL;
+
+    //lengths of the sequences; every buffer below is sized from them
+    const int len0 = (int) strlen(sequences[0].text);
+    const int len1 = (int) strlen(sequences[1].text);
+
+    //number of rows kept per z matrix: all of them or a rolling pair
+    const int numRows = REVPART_FULL_MEMORY ? len1 + 1 : 2;
+
+    //weights used at the sequence ends
+    const double endgapopen = termgapopen;
+    const double endgapextend = termgapextend;
+
+    //weights used to initialise the last row and the last column
+    const double initOpen = (endgaps == 0) ? d : endgapopen;
+    const double initExtend = (endgaps == 0) ? e : endgapextend;
+
+    //Safe vector declared
+    VF *posteriorPtr = new VF((len0 + 1) * (len1 + 1));
+    VF & posterior = *posteriorPtr;
+    VF::iterator ptr = posterior.begin();
+
+    if (TRACE) //open the trace file (may fail, fo is checked before use)
+        fo = fopen("revpartdump", "a");
+
+    if (TRACE && REVPART_FULL_MEMORY)
+        printf("\n\n %e %e\n", d, e);
+
+    //instantiate the z matrices and fill them with 0s
+    long double **Zm = new long double *[numRows];
+    long double **Ze = new long double *[numRows];
+    long double **Zf = new long double *[numRows];
+    for (i = 0; i < numRows; i++) {
+        Zm[i] = new long double[len0 + 1];
+        Ze[i] = new long double[len0 + 1];
+        Zf[i] = new long double[len0 + 1];
+        for (j = 0; j <= len0; j++) {
+            Zm[i][j] = 0;
+            Zf[i][j] = 0;
+            Ze[i][j] = 0;
+        }
+    }
+
+    if (TRACE) {
+        printf("in rev partf---");
+        printf("\n\n");
+    }
+
+    //fill the probability matrix with 0s
+    for (i = 0; i <= len1; i++)
+        for (j = 0; j <= len0; j++)
+            ptr[j * (len1 + 1) + i] = 0;
+
+    //initialise the last row and the last column
+    if (REVPART_FULL_MEMORY) {
+        Zm[len1][len0] = 1;
+        Ze[len1][len0] = Zf[len1][len0] = 0;
+        if (len1 > 0)
+            Zf[len1 - 1][len0] = Zm[len1][len0] * initOpen;
+        if (len0 > 0)
+            Ze[len1][len0 - 1] = Zm[len1][len0] * initOpen;
+
+        //>=2ND ROW INIT
+        for (i = len1 - 2; i >= 0; i--) {
+            Zf[i][len0] = Zf[i + 1][len0] * initExtend;
+        }
+
+        //>=2ND COL INIT
+        for (j = len0 - 2; j >= 0; j--) {
+            Ze[len1][j] = Ze[len1][j + 1] * initExtend;
+        }
+    } else {
+        //in Zm
+        //let:
+        //  Zm(0) be the current row being filled/computed
+        //  Zm(1) be the previous row
+        //the same two-row layout is used whatever the value of endgaps,
+        //so no row beyond the two allocated ones is ever addressed
+        Zm[1][len0] = 1;
+        Ze[0][len0] = Zf[0][len0] = 0;
+        Zf[1][len0] = Zm[1][len0] * initOpen;
+        if (len0 > 0) //an empty sequence 0 has no column len0 - 1
+            Ze[0][len0 - 1] = Zm[1][len0] * initOpen;
+
+        //>=2ND COL INIT
+        for (j = len0 - 2; j >= 0; j--) {
+            Ze[0][j] = Ze[0][j + 1] * initExtend;
+        }
+    }
+
+    for (i = len1 - 1; i >= 0; i--) {
+
+        for (j = len0 - 1; j >= 0; j--) {
+            Si = subst_index[sequences[1].text[i] - 'A'];
+            Tj = subst_index[sequences[0].text[j] - 'A'];
+            scorez = sub_matrix[Si][Tj];
+
+            //endgaps modification aug 10
+            double open0, extend0, open1, extend1;
+
+            open0 = open1 = d;
+            extend0 = extend1 = e;
+
+            if (endgaps == 1) {
+                //check to see if one of the 2 sequences or both reach the end
+                if (i == 0) {
+                    open0 = endgapopen;
+                    extend0 = endgapextend;
+                }
+                if (j == 0) {
+                    open1 = endgapopen;
+                    extend1 = endgapextend;
+                }
+            }
+
+            if (REVPART_FULL_MEMORY) {
+                //z computation (no probabilities are derived in this mode)
+                Ze[i][j] = Zm[i][j + 1] * open0 + Ze[i][j + 1] * extend0;
+                Zf[i][j] = Zm[i + 1][j] * open1 + Zf[i + 1][j] * extend1;
+                Zm[i][j] = (Zm[i + 1][j + 1] + Zf[i + 1][j + 1]
+                        + Ze[i + 1][j + 1]) * scorez;
+            } else {
+                //lowmem code for merging probability calculating module
+                //Here we make use of Zm as a 2 row matrix
+                Zf[1][j] = Zm[1][j] * open1 + Zf[0][j] * extend1;
+                Ze[1][j] = Zm[0][j + 1] * open0 + Ze[1][j + 1] * extend0;
+                Zm[0][j] = (Zm[1][j + 1] + Zf[0][j + 1] + Ze[0][j + 1])
+                        * scorez;
+
+                tempvar = Zfm[i + 1][j + 1] * Zm[0][j];
+                //divide P(i,j) i.e. pairwise probability by denominator
+                tempvar /= (scorez * Zfm[0][0]);
+                probability = (float) tempvar;
+
+                //store only noticeable probabilities
+                if (probability <= 1 && probability >= 0.001) {
+                    ptr[(j + 1) * (len1 + 1) + (i + 1)] = probability;
+                }
+                //lowmem code ends here
+            }
+
+        } //end of for
+
+        if (REVPART_FULL_MEMORY == 0) {
+            //roll the rows: what was just computed becomes the previous row
+            for (int t = 0; t <= len0; t++) {
+                Ze[0][t] = Ze[1][t];
+                Ze[1][t] = 0;
+
+                Zf[0][t] = Zf[1][t];
+                Zf[1][t] = 0;
+
+                Zm[1][t] = Zm[0][t];
+                Zm[0][t] = 0;
+            }
+            Zf[0][len0] = 1;
+        }
+
+    } //end of for
+
+    if (TRACE) {
+        printf("\n\nrM:....\n\n");
+        if (REVPART_FULL_MEMORY) {
+            for (i = 0; i <= len1; i++) {
+                for (j = 0; j <= len0; j++)
+                    printf("%.2Le ", Zm[i][j]);
+                printf("\n");
+            }
+
+            printf("\n\nrE:....\n\n");
+            for (i = 0; i <= len1; i++) {
+                for (j = 0; j <= len0; j++)
+                    printf("%.2Le ", Ze[i][j]);
+                printf("\n");
+            }
+
+            printf("\n\nrF:....\n\n");
+            for (i = 0; i <= len1; i++) {
+                for (j = 0; j <= len0; j++)
+                    printf("%.2Le ", Zf[i][j]);
+                printf("\n");
+            }
+        }
+    }
+
+    if (TRACE && fo != NULL) {
+        fprintf(fo, "\n");
+        fclose(fo);
+    }
+
+    //release the z matrices; they were allocated with new[] and therefore
+    //have to be released with delete[]
+    for (i = 0; i < numRows; i++) {
+        delete[] Zm[i];
+        delete[] Zf[i];
+        delete[] Ze[i];
+    }
+    delete[] Zf;
+    delete[] Ze;
+    delete[] Zm;
+
+    //release the forward matrix handed over by the caller
+    if (Zfm != NULL) {
+        for (i = 0; i <= len1; i++) {
+            delete[] Zfm[i];
+        }
+        delete[] Zfm;
+    }
+
+    posterior[0] = 0;
+    return (posteriorPtr);
+
+}
+
+//////////////////////////////////////////////////////////////
+//forward partition function
+//
+//parameters:
+// sequences      the two sequences; only their NUL-terminated text is read
+// termgapopen    weight of opening a terminal gap (used when endgaps != 0)
+// termgapextend  weight of extending a terminal gap (used when endgaps != 0)
+// d, e           weights of opening / extending an internal gap
+//
+//returns the match matrix Zm: len1+1 rows of len0+1 entries allocated with
+//new[]. Entry [0][0] is overwritten with the sum Zm+Ze+Zf of the last cell
+//computed (it stays 0 when one of the sequences is empty). The caller owns
+//the matrix; revers_partf() releases it.
+//
+//the process is terminated with exit(1) when an entry reaches OS_HUGE_VALL
+/////////////////////////////////////////////////////////////
+
+//prints message and terminates the process when value has overflowed
+static void exitOnHugeValue(long double value, const char *message) {
+    if (value >= OS_HUGE_VALL) {
+        printf("%s", message);
+        exit(1);
+    }
+}
+
+long double **partf(fasta sequences[2], const double termgapopen,
+        const double termgapextend, const double d, const double e) {
+    int i, j;
+    int Si, Tj;
+    double score;
+    long double zz = 0;
+
+    //lengths of the sequences; every buffer below is sized from them
+    const int len0 = (int) strlen(sequences[0].text);
+    const int len1 = (int) strlen(sequences[1].text);
+
+    //number of rows kept for Ze and Zf: all of them or a rolling pair
+    const int gapRows = PART_FULL_MEMORY ? len1 + 1 : 2;
+
+    //weights used at the sequence ends
+    const double endgapopen = termgapopen;
+    const double endgapextend = termgapextend;
+
+    //weights used to initialise the first row and the first column
+    //the flag endgaps is set at the #define section
+    const double initOpen = (endgaps == 0) ? d : endgapopen;
+    const double initExtend = (endgaps == 0) ? e : endgapextend;
+
+    //messages of the overflow checks (they differ between the two modes)
+    const char *hugeZe = PART_FULL_MEMORY ? "ERROR: huge val error for Ze\n"
+            : "ERROR: huge val error for zE\n";
+    const char *hugeZf = PART_FULL_MEMORY ? "ERROR: huge val error for Zf\n"
+            : "ERROR: huge val error for zF\n";
+    const char *hugeZm = PART_FULL_MEMORY ? "ERROR: huge val error for Zm\n"
+            : "ERROR: huge val error for zM\n";
+
+    if (TRACE && PART_FULL_MEMORY)
+        printf("\nPARTF:====\n");
+
+    //Zm is always kept completely because it is returned to the caller
+    long double **Zm = new long double *[len1 + 1];
+    for (i = 0; i <= len1; i++) {
+        Zm[i] = new long double[len0 + 1];
+        for (j = 0; j <= len0; j++) {
+            Zm[i][j] = 0;
+        }
+    }
+
+    long double **Ze = new long double *[gapRows];
+    long double **Zf = new long double *[gapRows];
+    for (i = 0; i < gapRows; i++) {
+        Ze[i] = new long double[len0 + 1];
+        Zf[i] = new long double[len0 + 1];
+        for (j = 0; j <= len0; j++) {
+            Zf[i][j] = 0;
+            Ze[i][j] = 0;
+        }
+    }
+
+    //INITIALIZE THE DP
+    Zm[0][0] = 1.00;
+    Zf[0][0] = Ze[0][0] = 0;
+    if (gapRows > 1) //full memory mode with an empty sequence 1 has no row 1
+        Zf[1][0] = Zm[0][0] * initOpen;
+    if (len0 > 0) //an empty sequence 0 has no column 1
+        Ze[0][1] = Zm[0][0] * initOpen;
+
+    //>=2ND ROW INIT
+    if (PART_FULL_MEMORY) {
+        for (i = 2; i <= len1; i++) {
+            Zf[i][0] = Zf[i - 1][0] * initExtend;
+        }
+    }
+
+    //>=2ND COL INIT
+    for (j = 2; j <= len0; j++) {
+        Ze[0][j] = Ze[0][j - 1] * initExtend;
+    }
+
+    for (i = 1; i <= len1; i++) {
+
+        //rows of Ze/Zf holding the current and the previous line
+        const int cur = PART_FULL_MEMORY ? i : 1;
+        const int prev = PART_FULL_MEMORY ? i - 1 : 0;
+
+        for (j = 1; j <= len0; j++) {
+
+            Si = subst_index[sequences[1].text[i - 1] - 'A'];
+            Tj = subst_index[sequences[0].text[j - 1] - 'A'];
+
+            score = sub_matrix[Si][Tj];
+
+            double open0, extend0, open1, extend1;
+
+            open0 = open1 = d;
+            extend0 = extend1 = e;
+
+            if (endgaps == 1) {
+                //check to see if one of the 2 sequences or both reach the end
+                if (i == len1) {
+                    open0 = endgapopen;
+                    extend0 = endgapextend;
+                }
+                if (j == len0) {
+                    open1 = endgapopen;
+                    extend1 = endgapextend;
+                }
+            }
+
+            //z computation using open and extend temp vars
+            //open0 is gap open in seq0 and open1 is gap open in seq1
+            //extend0 is gap extend in seq0 and extend1 is gap extend in seq1
+            Ze[cur][j] = Zm[i][j - 1] * open0 + Ze[cur][j - 1] * extend0;
+            exitOnHugeValue(Ze[cur][j], hugeZe);
+
+            Zf[cur][j] = Zm[i - 1][j] * open1 + Zf[prev][j] * extend1;
+            exitOnHugeValue(Zf[cur][j], hugeZf);
+
+            Zm[i][j] = (Zm[i - 1][j - 1] + Ze[prev][j - 1] + Zf[prev][j - 1])
+                    * score;
+            exitOnHugeValue(Zm[i][j], hugeZm);
+
+            zz = Zm[i][j] + Ze[cur][j] + Zf[cur][j];
+
+        } //end for
+
+        if (!PART_FULL_MEMORY) {
+            //roll the rows: what was just computed becomes the previous row
+            for (int t = 0; t <= len0; t++) {
+                Ze[0][t] = Ze[1][t];
+                Ze[1][t] = 0;
+
+                Zf[0][t] = Zf[1][t];
+                Zf[1][t] = 0;
+            }
+
+            Zf[1][0] = 1;
+        }
+
+    } //end for
+
+    //store the sum of zm zf ze (m,n)s in zm's 0,0 th position
+    Zm[0][0] = zz;
+
+    if (TRACE) {
+        //debug code aug 3
+        //print the 3 Z matrices namely Zm Zf and Ze
+
+        printf("\n\nFINAL Zm:\n");
+        for (i = 0; i <= len1; i++) {
+            for (j = 0; j <= len0; j++)
+                printf("%.2Le ", Zm[i][j]);
+            printf("\n");
+        }
+
+        //only the rows that really exist are printed for Zf and Ze
+        printf("FINAL Zf \n");
+        for (i = 0; i < gapRows; i++) {
+            for (j = 0; j <= len0; j++)
+                printf("%.2Le ", Zf[i][j]);
+            printf("\n");
+        }
+
+        printf("FINAL Ze \n");
+        for (i = 0; i < gapRows; i++) {
+            for (j = 0; j <= len0; j++)
+                printf("%.2Le ", Ze[i][j]);
+            printf("\n");
+        }
+
+        //end debug dump code
+    }
+
+    //release the gap matrices; they were allocated with new[] and therefore
+    //have to be released with delete[]
+    for (i = 0; i < gapRows; i++) {
+        delete[] Zf[i];
+        delete[] Ze[i];
+    }
+    delete[] Zf;
+    delete[] Ze;
+
+    return Zm;
+
+} //end of forward partition function
+
+/////////////////////////////////////////////////////////////////////////////////////////
+//entry point (was the main function) , returns the posterior probability safe vector
+//
+//parameters:
+// a, b        identifiers of the two sequences; only printed when tracing
+// seq1, seq2  the residues of the two sequences
+//
+//the gap scores and beta are taken from the global "argument"; they are
+//turned into weights with exp(beta * score) before the forward (partf) and
+//the reverse (revers_partf) partition functions are run.
+//
+//returns the vector allocated by revers_partf(); the caller owns it
+////////////////////////////////////////////////////////////////////////////////////////
+VF *ComputePostProbs(int a, int b, string seq1, string seq2) {
+    //the titles live on the stack: nothing has to be released afterwards
+    char title0[] = "seq0";
+    char title1[] = "seq1";
+
+    //initialize the sequence structure
+    //(the text pointers stay valid because seq1/seq2 outlive both calls)
+    fasta sequences[2];
+
+    sequences[0].text = const_cast<char *>(seq1.c_str());
+    sequences[0].length = (int) strlen(sequences[0].text);
+    sequences[0].title = title0;
+
+    sequences[1].text = const_cast<char *>(seq2.c_str());
+    sequences[1].length = (int) strlen(sequences[1].text);
+    sequences[1].title = title1;
+
+    if (TRACE) {
+        printf("%d %d %s\n%d %d %s\n--\n", a, sequences[0].length,
+                sequences[0].text, b, sequences[1].length, sequences[1].text);
+        printf("after init\n");
+
+        FILE *dump1 = fopen("dump1", "a");
+        if (dump1 != NULL) {
+            fprintf(dump1, "%d %d %s\n%d %d %s\n--\n", a, sequences[0].length,
+                    sequences[0].text, b, sequences[1].length,
+                    sequences[1].text);
+            fclose(dump1);
+        }
+    }
+
+    //compute the values of exp(beta * ?)
+    const double beta = argument.beta;
+    const double termgapopen = exp(beta * 0.0);
+    const double termgapextend = exp(beta * 0.0);
+    const double gap_open = exp(beta * argument.gapopen);
+    const double gap_ext = exp(beta * argument.gapext);
+
+    if (TRACE)
+        printf("%f %f %f %d\n", gap_open, gap_ext, beta, argument.matrix);
+
+    //call for calculating the posterior probabilities
+    // 1. call partition function partf
+    // 2. calculate revpartition using revers_partf
+    //    (it derives the probabilities and releases the forward matrix)
+    long double **MAT1 = partf(sequences, termgapopen, termgapextend,
+            gap_open, gap_ext);
+
+    return revers_partf(sequences, termgapopen, termgapextend, MAT1, gap_open,
+            gap_ext);
+
+}
+
+//end of posterior probability module
--- /dev/null
+<?xml version="1.0" encoding="gb2312"?>\r
+<VisualStudioProject\r
+ ProjectType="Visual C++"\r
+ Version="8.00"\r
+ Name="MSAProbs"\r
+ ProjectGUID="{671563E4-93A2-419E-8B41-48DDF71DD144}"\r
+ RootNamespace="MSAProbs"\r
+ Keyword="Win32Proj"\r
+ >\r
+ <Platforms>\r
+ <Platform\r
+ Name="Win32"\r
+ />\r
+ </Platforms>\r
+ <ToolFiles>\r
+ </ToolFiles>\r
+ <Configurations>\r
+ <Configuration\r
+ Name="Debug|Win32"\r
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)"\r
+ IntermediateDirectory="$(ConfigurationName)"\r
+ ConfigurationType="1"\r
+ CharacterSet="1"\r
+ >\r
+ <Tool\r
+ Name="VCPreBuildEventTool"\r
+ />\r
+ <Tool\r
+ Name="VCCustomBuildTool"\r
+ />\r
+ <Tool\r
+ Name="VCXMLDataGeneratorTool"\r
+ />\r
+ <Tool\r
+ Name="VCWebServiceProxyGeneratorTool"\r
+ />\r
+ <Tool\r
+ Name="VCMIDLTool"\r
+ />\r
+ <Tool\r
+ Name="VCCLCompilerTool"\r
+ AdditionalOptions="/openmp"\r
+ Optimization="0"\r
+ PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"\r
+ MinimalRebuild="true"\r
+ BasicRuntimeChecks="3"\r
+ RuntimeLibrary="3"\r
+ OpenMP="true"\r
+ UsePrecompiledHeader="0"\r
+ WarningLevel="3"\r
+ Detect64BitPortabilityProblems="true"\r
+ DebugInformationFormat="4"\r
+ />\r
+ <Tool\r
+ Name="VCManagedResourceCompilerTool"\r
+ />\r
+ <Tool\r
+ Name="VCResourceCompilerTool"\r
+ />\r
+ <Tool\r
+ Name="VCPreLinkEventTool"\r
+ />\r
+ <Tool\r
+ Name="VCLinkerTool"\r
+ LinkIncremental="2"\r
+ GenerateDebugInformation="true"\r
+ SubSystem="1"\r
+ TargetMachine="1"\r
+ />\r
+ <Tool\r
+ Name="VCALinkTool"\r
+ />\r
+ <Tool\r
+ Name="VCManifestTool"\r
+ />\r
+ <Tool\r
+ Name="VCXDCMakeTool"\r
+ />\r
+ <Tool\r
+ Name="VCBscMakeTool"\r
+ />\r
+ <Tool\r
+ Name="VCFxCopTool"\r
+ />\r
+ <Tool\r
+ Name="VCAppVerifierTool"\r
+ />\r
+ <Tool\r
+ Name="VCWebDeploymentTool"\r
+ />\r
+ <Tool\r
+ Name="VCPostBuildEventTool"\r
+ />\r
+ </Configuration>\r
+ <Configuration\r
+ Name="Release|Win32"\r
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)"\r
+ IntermediateDirectory="$(ConfigurationName)"\r
+ ConfigurationType="1"\r
+ CharacterSet="1"\r
+ WholeProgramOptimization="1"\r
+ >\r
+ <Tool\r
+ Name="VCPreBuildEventTool"\r
+ />\r
+ <Tool\r
+ Name="VCCustomBuildTool"\r
+ />\r
+ <Tool\r
+ Name="VCXMLDataGeneratorTool"\r
+ />\r
+ <Tool\r
+ Name="VCWebServiceProxyGeneratorTool"\r
+ />\r
+ <Tool\r
+ Name="VCMIDLTool"\r
+ />\r
+ <Tool\r
+ Name="VCCLCompilerTool"\r
+ PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"\r
+ RuntimeLibrary="2"\r
+ OpenMP="true"\r
+ UsePrecompiledHeader="0"\r
+ WarningLevel="3"\r
+ Detect64BitPortabilityProblems="true"\r
+ DebugInformationFormat="3"\r
+ />\r
+ <Tool\r
+ Name="VCManagedResourceCompilerTool"\r
+ />\r
+ <Tool\r
+ Name="VCResourceCompilerTool"\r
+ />\r
+ <Tool\r
+ Name="VCPreLinkEventTool"\r
+ />\r
+ <Tool\r
+ Name="VCLinkerTool"\r
+ LinkIncremental="1"\r
+ GenerateDebugInformation="true"\r
+ SubSystem="1"\r
+ OptimizeReferences="2"\r
+ EnableCOMDATFolding="2"\r
+ TargetMachine="1"\r
+ />\r
+ <Tool\r
+ Name="VCALinkTool"\r
+ />\r
+ <Tool\r
+ Name="VCManifestTool"\r
+ />\r
+ <Tool\r
+ Name="VCXDCMakeTool"\r
+ />\r
+ <Tool\r
+ Name="VCBscMakeTool"\r
+ />\r
+ <Tool\r
+ Name="VCFxCopTool"\r
+ />\r
+ <Tool\r
+ Name="VCAppVerifierTool"\r
+ />\r
+ <Tool\r
+ Name="VCWebDeploymentTool"\r
+ />\r
+ <Tool\r
+ Name="VCPostBuildEventTool"\r
+ />\r
+ </Configuration>\r
+ </Configurations>\r
+ <References>\r
+ </References>\r
+ <Files>\r
+ <Filter\r
+ Name="Source Files"\r
+ Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"\r
+ UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"\r
+ >\r
+ <File\r
+ RelativePath=".\main.cpp"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\MSA.cpp"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\MSAClusterTree.cpp"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\MSAGuideTree.cpp"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\MSAPartProbs.cpp"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\MSAReadMatrix.cpp"\r
+ >\r
+ </File>\r
+ </Filter>\r
+ <Filter\r
+ Name="Header Files"\r
+ Filter="h;hpp;hxx;hm;inl;inc;xsd"\r
+ UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"\r
+ >\r
+ <File\r
+ RelativePath=".\Defaults.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\FileBuffer.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\MSA.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\MSAClusterTree.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\MSADef.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\MSAGuideTree.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\MSAReadMatrix.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\MultiSequence.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\ProbabilisticModel.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\SafeVector.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\ScoreType.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\Sequence.h"\r
+ >\r
+ </File>\r
+ <File\r
+ RelativePath=".\SparseMatrix.h"\r
+ >\r
+ </File>\r
+ </Filter>\r
+ <Filter\r
+ Name="Resource Files"\r
+ Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"\r
+ UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"\r
+ >\r
+ </Filter>\r
+ </Files>\r
+ <Globals>\r
+ </Globals>\r
+</VisualStudioProject>\r
--- /dev/null
+<?xml version="1.0" encoding="gb2312"?>\r
+<VisualStudioUserFile\r
+ ProjectType="Visual C++"\r
+ Version="8.00"\r
+ ShowAllFiles="false"\r
+ >\r
+ <Configurations>\r
+ <Configuration\r
+ Name="Debug|Win32"\r
+ >\r
+ <DebugSettings\r
+ Command="$(TargetPath)"\r
+ WorkingDirectory=""\r
+ CommandArguments=""\r
+ Attach="false"\r
+ DebuggerType="3"\r
+ Remote="1"\r
+ RemoteMachine="GS-14"\r
+ RemoteCommand=""\r
+ HttpUrl=""\r
+ PDBPath=""\r
+ SQLDebugging=""\r
+ Environment=""\r
+ EnvironmentMerge="true"\r
+ DebuggerFlavor=""\r
+ MPIRunCommand=""\r
+ MPIRunArguments=""\r
+ MPIRunWorkingDirectory=""\r
+ ApplicationCommand=""\r
+ ApplicationArguments=""\r
+ ShimCommand=""\r
+ MPIAcceptMode=""\r
+ MPIAcceptFilter=""\r
+ />\r
+ </Configuration>\r
+ <Configuration\r
+ Name="Release|Win32"\r
+ >\r
+ <DebugSettings\r
+ Command="$(TargetPath)"\r
+ WorkingDirectory=""\r
+ CommandArguments=""\r
+ Attach="false"\r
+ DebuggerType="3"\r
+ Remote="1"\r
+ RemoteMachine="GS-14"\r
+ RemoteCommand=""\r
+ HttpUrl=""\r
+ PDBPath=""\r
+ SQLDebugging=""\r
+ Environment=""\r
+ EnvironmentMerge="true"\r
+ DebuggerFlavor=""\r
+ MPIRunCommand=""\r
+ MPIRunArguments=""\r
+ MPIRunWorkingDirectory=""\r
+ ApplicationCommand=""\r
+ ApplicationArguments=""\r
+ ShimCommand=""\r
+ MPIAcceptMode=""\r
+ MPIAcceptFilter=""\r
+ />\r
+ </Configuration>\r
+ </Configurations>\r
+</VisualStudioUserFile>\r
--- /dev/null
+/***********************************************
+ * # Copyright 2009-2010. Liu Yongchao
+ * # Contact: Liu Yongchao, School of Computer Engineering,
+ * # Nanyang Technological University.
+ * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
+ * #
+ * # GPL version 3.0 applies.
+ * #
+ * ************************************************/
+
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "MSAReadMatrix.h"
+
+#define TRACE 0
+
+////////////////////////////////////////////////////////////
+// extern variables for scoring matrix data
+////////////////////////////////////////////////////////////
+extern float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2;
+extern char *aminos, *bases, matrixtype[20]; //NOTE(review): MSAPartProbs.cpp declares aminos and bases as char[26] instead of char*; both declarations cannot match the one definition - confirm and unify
+extern int subst_index[26]; //indexed by (letter - 'A'); filled by read_matrix with the position of the letter in the matrix
+
+extern double sub_matrix[26][26]; //filled symmetrically by read_matrix with exp(beta * score)
+
+extern float TEMPERATURE;
+extern int MATRIXTYPE;
+
+extern float GAPOPEN;
+extern float GAPEXT;
+
+typedef struct { //run-time settings; NOTE(review): the same struct is defined again in MSAPartProbs.cpp - the two copies must stay identical
+ char input[30]; //init_arguments stores "tempin" here
+ int matrix; //matrix type code as returned by matrixtype_to_int(): 4, 160 or 1000
+ int N; //init_arguments sets it to 1
+ float T; //copy of TEMPERATURE
+ float beta; //1.0 / TEMPERATURE
+ char opt; //can be 'P' or 'M'
+ float gapopen; //gap open score (negative after init_arguments)
+ float gapext; //gap extension score (negative after init_arguments)
+} argument_decl;
+
+//argument support
+extern argument_decl argument;
+
+/////////////////////////////////////////////////////////
+//stores the name of the substitution matrix that belongs
+//to the numeric code le in the global matrixtype:
+// 160 -> "gonnet_160", 4 -> "nuc_simple", others -> "CUSTOM"
+////////////////////////////////////////////////////////
+void setmatrixtype(int le) {
+    const char *label = "CUSTOM";
+
+    if (le == 160) {
+        label = "gonnet_160";
+    } else if (le == 4) {
+        label = "nuc_simple";
+    }
+
+    strcpy(matrixtype, label);
+}
+
+///////////////////////////////////////////////////////////////////
+//translates the name held in the global matrixtype into its code:
+// "gonnet_160" -> 160, "nuc_simple" -> 4, anything else -> 1000
+///////////////////////////////////////////////////////////////////
+inline int matrixtype_to_int() {
+    if (strcmp(matrixtype, "gonnet_160") == 0) {
+        return 160;
+    }
+    if (strcmp(matrixtype, "nuc_simple") == 0) {
+        return 4;
+    }
+    return 1000;
+}
+
+/////////////////////////////////////////////////////////////////
+//
+// Can read any scoring matrix as long as it is defined in MSAReadMatrix.h
+// AND it is a lower triangular
+// AND the order of amino acids/bases is mentioned
+//
+// Side effects on the globals:
+//  bases        points at a copy of the monomer string of matrx
+//  subst_index  maps (letter - 'A') to the position of the letter in the
+//               monomer string; letters that are not listed map to -1
+//  sub_matrix   receives exp(argument.beta * score) for every pair of
+//               listed letters, mirrored across the diagonal
+//
+// argument.beta has to be set before this function is called.
+/////////////////////////////////////////////////////////////////
+
+inline void read_matrix(score_matrix matrx) {
+    //matrx is a by-value copy that disappears on return, so the monomer
+    //string is kept in static storage for the global pointer "bases"
+    static char monomers[sizeof(matrx.monomers)];
+    const int tableSize = (int) (sizeof(subst_index) / sizeof(subst_index[0]));
+    int i, j, basecount, position = 0;
+
+    memcpy(monomers, matrx.monomers, sizeof(monomers));
+    monomers[sizeof(monomers) - 1] = '\0'; //guard against a missing terminator
+    bases = monomers;
+
+    basecount = (int) strlen(bases);
+
+    //forget every mapping of a previously read matrix, not only the
+    //first basecount entries
+    for (i = 0; i < tableSize; i++)
+        subst_index[i] = -1;
+
+    for (i = 0; i < basecount; i++) {
+        const int letter = bases[i] - 'A';
+        if (letter >= 0 && letter < tableSize) //ignore anything but A..Z
+            subst_index[letter] = i;
+    }
+
+    if (TRACE == 1)
+        printf("\nbases read: %d\n", basecount);
+
+    //the scores are stored row by row, row i holding the columns 0..i
+    for (i = 0; i < basecount; i++)
+        for (j = 0; j <= i; j++) {
+
+            double value = exp(argument.beta * matrx.matrix[position++]);
+            sub_matrix[i][j] = value;
+            sub_matrix[j][i] = value;
+        }
+
+    if (TRACE)
+        for (i = 0; i < basecount; i++) {
+            for (j = 0; j < basecount; j++)
+                printf(" %g ", sub_matrix[i][j]);
+            printf("\n");
+        }
+
+}
+
+//////////////////////////////////////////////////////////////////////////////////
+//initialize the global "argument" with its default values
+//
+//the matrix named by matrixtype is loaded (nuc_simple or gonnet_160) and the
+//gap scores that belong to it are selected; when GAPOPEN or GAPEXT is not
+//zero, both defaults are replaced by the negated GAPOPEN / GAPEXT values.
+//an unknown matrix type prints an error and terminates with exit(1).
+//////////////////////////////////////////////////////////////////////////////////
+void init_arguments() {
+    float openScore = 0;
+    float extendScore = 0;
+    const int matrixCode = matrixtype_to_int();
+
+    //beta must be in place before a matrix is read
+    argument.N = 1;
+    strcpy(argument.input, "tempin");
+    argument.matrix = matrixCode;
+    argument.gapopen = GAPOPEN;
+    argument.gapext = GAPEXT;
+    argument.T = TEMPERATURE;
+    argument.beta = 1.0 / TEMPERATURE;
+    argument.opt = 'P';
+
+    switch (matrixCode) {
+    case 4: //NUC OPTION :default is nuc_simple
+        read_matrix(nuc_simple);
+        openScore = -4;
+        extendScore = -0.25;
+        break;
+    case 160: //PROT option: default is gonnet_160
+        if (TRACE)
+            printf("read matrix\n");
+        read_matrix(gonnet_160);
+        openScore = -22;
+        extendScore = -1;
+        break;
+    case 1000: //Error handling
+        printf("Error: enter a valid matrix type\n");
+        exit(1);
+        //additional matrices can only be lower triangular
+        break;
+    default:
+        break;
+    }
+
+    //now override the gapopen and gapext
+    const bool keepDefaults = (argument.gapopen == 0.0
+            && argument.gapext == 0.00);
+    if (!keepDefaults) {
+        openScore = -argument.gapopen;
+        extendScore = -argument.gapext;
+    }
+
+    if (TRACE)
+        printf("%f %f %f %d\n", argument.T, openScore, extendScore,
+                matrixCode);
+
+    argument.gapopen = openScore;
+    argument.gapext = extendScore;
+    argument.opt = 'P';
+
+}
--- /dev/null
+/////////////////////////////////////////////////////////////////
+// MSAReadMatrix.h
+//
+// Specifies the scoring matrices and their structure.
+//
+//
+//
+/////////////////////////////////////////////////////////////////
+
+#ifndef _MSA_READ_MATRIX_H
+#define _MSA_READ_MATRIX_H
+
+typedef struct {
+ char monomers[26]; /* amino or nucleic acid order (NUL-terminated string, one letter per matrix row) */
+ float matrix[676]; /* entries of the score matrix, stored as a lower triangle row by row (row i holds columns 0..i); capacity 26*26=676 */
+} score_matrix;
+
+//default protein sequence scoring matrix as well as default scoring matrix of the PROBALIGN
+//also used when -prot option is used (loaded by init_arguments for matrix type 160)
+
+score_matrix gonnet_160 = { "ABCDEFGHIKLMNPQRSTVWXYZ", //NOTE(review): a non-inline variable defined in a header; including this header from a second source file would define it twice
+
+{ 4.6, 0.0, 0.0, 0.3, 0.0, 13.5, -1.1, 0.0, -5.3, 7.0, -0.4, 0.0, -5.2, 3.4,
+ 5.9, -3.8, 0.0, -1.8, -7.0, -6.2, 9.1, 0.2, 0.0, -3.4, -0.7, -2.1, -7.6,
+ 8.2, -1.8, 0.0, -2.3, -0.1, -0.1, -0.7, -2.7, 9.3, -1.8, 0.0, -2.5,
+ -6.2, -4.3, 0.3, -7.0, -3.7, 5.9, -1.2, 0.0, -4.8, -0.1, 1.3, -5.3,
+ -2.4, 0.2, -3.5, 5.5, -2.2, 0.0, -2.9, -6.5, -4.5, 1.9, -6.7, -3.2, 3.0,
+ -3.4, 5.7, -1.2, 0.0, -1.9, -5.0, -3.1, 1.4, -5.2, -2.1, 2.9, -2.1, 3.4,
+ 7.6, -1.2, 0.0, -3.1, 2.6, 0.5, -4.7, -0.2, 1.5, -4.4, 0.8, -4.8, -3.6,
+ 6.5, -0.1, 0.0, -5.2, -1.9, -1.4, -5.8, -3.0, -2.2, -4.3, -1.6, -3.5,
+ -4.2, -2.2, 9.6, -0.7, 0.0, -4.2, 0.6, 2.3, -4.1, -2.1, 1.7, -3.2, 2.0,
+ -2.4, -1.2, 0.5, -0.8, 5.6, -1.6, 0.0, -3.5, -1.6, -0.3, -5.3, -2.1,
+ 0.3, -4.1, 3.5, -3.5, -2.9, -0.4, -2.1, 1.7, 7.1, 1.6, 0.0, -0.2, 0.0,
+ -0.3, -4.5, -0.1, -0.8, -3.3, -0.4, -3.6, -2.3, 1.1, 0.0, -0.2, -0.9,
+ 4.4, 0.5, 0.0, -1.4, -0.6, -0.8, -3.6, -2.4, -0.8, -1.2, -0.2, -2.4,
+ -1.1, 0.3, -0.4, -0.4, -0.9, 2.3, 5.0, 0.1, 0.0, -0.6, -4.9, -3.0, -0.8,
+ -5.2, -3.5, 4.0, -3.0, 1.7, 1.4, -3.8, -3.2, -2.7, -3.4, -2.0, 0.0, 5.3,
+ -5.5, 0.0, -2.1, -7.8, -6.4, 3.2, -5.5, -1.9, -3.4, -5.4, -2.0, -2.2,
+ -5.5, -7.4, -4.0, -2.4, -4.7, -5.4, -4.5, 15.8, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 0.0, 0.0, -3.7, 0.0, -1.3, -4.2, -4.4, 5.6, -6.0, 2.7, -2.0, -3.5, -1.1,
+ -1.3, -2.2, -4.8, -2.9, -2.9, -2.8, -3.2, -2.4, 3.8, 0.0, 10.0, 0.0,
+ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }
+
+};
+
+//default nucleotide sequence scoring matrix
+//used when -nuc option is used (loaded by init_arguments for matrix type 4)
+score_matrix nuc_simple = { //NOTE(review): a non-inline variable defined in a header; including this header from a second source file would define it twice
+
+"ABCDGHKMNRSTUVWXY",
+
+{ 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0 }
+
+//Ribosum85-60 (alternative score table, kept disabled)
+ /*
+ {
+ 2.22,
+ 0, 0,
+ -1.86, 0, 1.16,
+ 0, 0, 0, 0,
+ -1.46, 0, -2.48, 0, 1.03,
+ 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ -1.39, 0, -1.05, 0, -1.74, 0, 0, 0, 0, 0, 0, 1.65,
+ -1.39, 0, -1.05, 0, -1.74, 0, 0, 0, 0, 0, 0, 0, 1.65,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ }
+ */
+
+ };
+
+#endif
--- /dev/null
+
+# Object files that are linked into the executable.
+CXXOBJS = MSA.o MSAGuideTree.o MSAClusterTree.o MSAPartProbs.o MSAReadMatrix.o main.o
+
+# Compiler settings. CXXFLAGS (including the OpenMP switch) is used on the
+# link line below as well as by make's built-in .cpp -> .o rule.
+OPENMP = -fopenmp
+CXX = g++
+COMMON_FLAGS = -O3 $(OPENMP) -Wall -funroll-loops -I . -I /usr/include
+CXXFLAGS = $(COMMON_FLAGS)
+
+EXEC = msaprobs
+
+# "all" and "clean" name actions, not files. Without this declaration a stray
+# file called "all" or "clean" would stop the matching recipe from running.
+.PHONY: all clean
+
+# NVCCOBJS and NVCCLIBS are not defined in this Makefile; they expand to
+# nothing unless supplied on the command line or through the environment.
+all: $(CXXOBJS)
+	$(CXX) $(CXXFLAGS) -o $(EXEC) $(CXXOBJS) $(NVCCOBJS) $(NVCCLIBS)
+	strip $(EXEC)
+clean:
+	rm -rf *.o $(EXEC)
+
--- /dev/null
+////////////////////////////////////////////////////////////////
+// MultiSequence.h
+//
+// Utilities for reading/writing multiple sequence data.
+/////////////////////////////////////////////////////////////////
+
+#ifndef MULTISEQUENCE_H
+#define MULTISEQUENCE_H
+
+#include <cctype>
+#include <string>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <algorithm>
+#include <set>
+#include "SafeVector.h"
+#include "Sequence.h"
+#include "FileBuffer.h"
+
+#define VERSION "0.9.7"
+/////////////////////////////////////////////////////////////////
+// MultiSequence
+//
+// Class for multiple sequence alignment input/output.
+/////////////////////////////////////////////////////////////////
+
+class MultiSequence {
+
+ SafeVector<Sequence *> *sequences;
+
+public:
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::MultiSequence()
+ //
+ // Default constructor.
+ /////////////////////////////////////////////////////////////////
+
+ // Starts empty: no sequence container exists until something is loaded or added.
+ MultiSequence() : sequences(NULL) {}
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::MultiSequence()
+ //
+ // Constructor. Load MFA from a FileBuffer object.
+ /////////////////////////////////////////////////////////////////
+
+ // Starts empty, then fills the collection from an already opened buffer.
+ MultiSequence(FileBuffer &infile) : sequences(NULL) {
+     LoadMFA(infile);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::MultiSequence()
+ //
+ // Constructor. Load MFA from a filename.
+ /////////////////////////////////////////////////////////////////
+
+ // Starts empty, then fills the collection from the named file.
+ MultiSequence(const string &filename) : sequences(NULL) {
+     LoadMFA(filename);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::~MultiSequence()
+ //
+ // Destructor. Gets rid of sequence objects contained in the
+ // multiple alignment.
+ /////////////////////////////////////////////////////////////////
+
+ ~MultiSequence() {
+
+     // Nothing was ever allocated: nothing to release.
+     if (sequences == NULL)
+         return;
+
+     // Destroy each owned Sequence and clear its slot...
+     const int count = (int) sequences->size();
+     for (int k = 0; k < count; k++) {
+         Sequence *&slot = (*sequences)[k];
+         assert(slot);
+         delete slot;
+         slot = NULL;
+     }
+
+     // ...then release the container itself.
+     delete sequences;
+     sequences = NULL;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::LoadMFA()
+ //
+ // Load MFA from a filename.
+ /////////////////////////////////////////////////////////////////
+
+ void LoadMFA(const string &filename, bool stripGaps = false) {
+
+     FileBuffer infile(filename.c_str());
+
+     // Opened cleanly: delegate to the buffer-based loader and close the file.
+     if (!infile.fail()) {
+         LoadMFA(infile, stripGaps);
+         infile.close();
+         return;
+     }
+
+     // Otherwise report the failure and terminate the program.
+     cerr << "ERROR: Could not open file '" << filename
+             << "' for reading." << endl;
+     exit(1);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::ParseMSF()
+ //
+ // Load MSF or CLUSTALW-style alignment data from a FileBuffer object.
+ /////////////////////////////////////////////////////////////////
+
+ // Parses alignment text that is not in MFA form. `header` is the line the
+ // caller has already consumed from `infile`. With stripGaps set, gap
+ // characters are dropped instead of stored. On success `sequences` holds
+ // one Sequence per name found; on malformed input the program exits.
+ void ParseMSF(FileBuffer &infile, string header, bool stripGaps = false) {
+
+     // Parallel arrays, one entry per sequence name seen so far.
+     SafeVector<SafeVector<char> *> seqData; // residues; element 0 is an '@' placeholder
+     SafeVector<string> seqNames;
+     SafeVector<int> seqLengths; // residues stored (placeholder not counted)
+
+     istringstream in;
+     bool valid = true;
+     bool missingHeader = false;
+     bool clustalW = false;
+
+     // Skip the preamble: stop at a line containing "..", at a
+     // CLUSTAL/MSAPROBS banner, or at a "//" separator.
+     while (!infile.eof() && header.find("..", 0) == string::npos) {
+         if (header.find("CLUSTAL", 0) == 0
+                 || header.find("MSAPROBS", 0) == 0) {
+             clustalW = true;
+             break;
+         }
+         infile.GetLine(header);
+         if (header.find("//", 0) != string::npos) {
+             missingHeader = true;
+             break;
+         }
+     }
+
+     // read until end-of-file
+     while (valid) {
+         infile.GetLine(header);
+         if (infile.eof())
+             break;
+
+         string word;
+         in.clear();
+         in.str(header);
+
+         // nothing on this line
+         if (!(in >> word))
+             continue;
+
+         // CLUSTALW-style input: a not-yet-seen word that starts in the
+         // first column introduces a new sequence. The cast keeps isspace()
+         // defined for bytes >= 0x80 on platforms where char is signed.
+         if (clustalW && !isspace((unsigned char) header[0])
+                 && find(seqNames.begin(), seqNames.end(), word)
+                         == seqNames.end()) {
+             seqNames.push_back(word);
+             seqData.push_back(new SafeVector<char>());
+             seqLengths.push_back(0);
+             seqData[(int) seqData.size() - 1]->push_back('@');
+         }
+
+         // Decide which sequence (if any) the rest of the line belongs to.
+         int index = -1;
+         if (word == string("Name:")) {
+             // MSF header entry: declares a sequence, carries no residues
+             if (in >> word) {
+                 seqNames.push_back(word);
+                 seqData.push_back(new SafeVector<char>());
+                 seqLengths.push_back(0);
+                 seqData[(int) seqData.size() - 1]->push_back('@');
+             } else
+                 valid = false;
+         } else {
+             const int known = (int) (find(seqNames.begin(), seqNames.end(),
+                     word) - seqNames.begin());
+             if (known < (int) seqNames.size()) {
+                 // data line for a sequence that is already registered
+                 index = known;
+             } else if (missingHeader) {
+                 // no "Name:" section was present: register the sequence now
+                 seqNames.push_back(word);
+                 seqData.push_back(new SafeVector<char>());
+                 seqLengths.push_back(0);
+                 seqData[(int) seqData.size() - 1]->push_back('@');
+                 index = (int) seqNames.size() - 1;
+             }
+         }
+         if (index < 0)
+             continue;
+
+         // Read all remaining characters on the line. operator>> already
+         // skips whitespace, so no separate whitespace test is needed.
+         char ch;
+         while (in >> ch) {
+             if (ch >= 'a' && ch <= 'z')
+                 ch = ch - 'a' + 'A';
+             if (ch == '.')
+                 ch = '-';
+             if (stripGaps && ch == '-')
+                 continue;
+             if (!((ch >= 'A' && ch <= 'Z') || ch == '*' || ch == '-')) {
+                 cerr << "ERROR: Unknown character encountered: "
+                         << ch << endl;
+                 exit(1);
+             }
+
+             // everything's ok so far, so just store this character.
+             seqData[index]->push_back(ch);
+             seqLengths[index]++;
+         }
+     }
+
+     // check for errors
+     if (seqNames.size() == 0) {
+         cerr << "ERROR: No sequences read!" << endl;
+         exit(1);
+     }
+
+     // Wrap the collected data; sequence i gets i for both trailing
+     // constructor arguments.
+     assert(!sequences);
+     sequences = new SafeVector<Sequence *>;
+     for (int i = 0; i < (int) seqNames.size(); i++) {
+         if (seqLengths[i] == 0) {
+             cerr << "ERROR: Sequence of zero length!" << endl;
+             exit(1);
+         }
+         Sequence *seq = new Sequence(seqData[i], seqNames[i], seqLengths[i],
+                 i, i);
+         sequences->push_back(seq);
+     }
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::LoadMFA()
+ //
+ // Load MFA from a FileBuffer object.
+ /////////////////////////////////////////////////////////////////
+
+ // Reads sequences from `infile` until one fails to load. If the very first
+ // record fails and its header does not start with '>', the input is handed
+ // to ParseMSF() instead. Exits the program when nothing could be read.
+ void LoadMFA(FileBuffer &infile, bool stripGaps = false) {
+
+     // check to make sure that file reading is ok
+     if (infile.fail()) {
+         cerr << "ERROR: Error reading file." << endl;
+         exit(1);
+     }
+
+     // read all sequences
+     while (true) {
+
+         // the label of the next sequence is the current number of
+         // sequences (labels are zero-based)
+         int index = (!sequences) ? 0 : sequences->size();
+
+         // read the sequence
+         Sequence *seq = new Sequence(infile, stripGaps);
+         if (seq->Fail()) {
+
+             // Copy what is still needed and release the failed object
+             // first, so that neither exit path below leaks it.
+             string header;
+             if (index == 0)
+                 header = seq->GetHeader();
+             delete seq;
+
+             // first record is not MFA: try the MSF parser, honouring the
+             // caller's gap-stripping request
+             if (header.length() > 0 && header[0] != '>')
+                 ParseMSF(infile, header, stripGaps);
+             break;
+         }
+         seq->SetLabel(index);
+
+         // add the sequence to the list of current sequences
+         if (!sequences)
+             sequences = new SafeVector<Sequence *>;
+         sequences->push_back(seq);
+     }
+
+     // make sure at least one sequence was read
+     if (!sequences) {
+         cerr << "ERROR: No sequences read." << endl;
+         exit(1);
+     }
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::AddSequence()
+ //
+ // Add another sequence to an existing sequence list
+ /////////////////////////////////////////////////////////////////
+
+ void AddSequence(Sequence *sequence) {
+     // A null or failed sequence must never enter the collection.
+     assert(sequence);
+     assert(!sequence->Fail());
+
+     // Create the container on first use, then append.
+     if (sequences == NULL)
+         sequences = new SafeVector<Sequence *>;
+
+     sequences->push_back(sequence);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::RemoveSequence()
+ //
+ // Remove a sequence from the MultiSequence
+ /////////////////////////////////////////////////////////////////
+
+ void RemoveSequence(int index) {
+     assert(sequences);
+     assert(0 <= index && index < (int) sequences->size());
+
+     // Destroy the owned object first, then drop its slot from the vector.
+     SafeVector<Sequence *>::iterator victim = sequences->begin() + index;
+     delete *victim;
+     sequences->erase(victim);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::WriteMFA()
+ //
+ // Write MFA to the outfile. Allows the user to specify the
+ // number of columns for the output. Also, useIndices determines
+ // whether or not the actual sequence comments will be printed
+ // out or whether the artificially assigned sequence labels will
+ // be used instead.
+ /////////////////////////////////////////////////////////////////
+
+ void WriteMFA(ostream &outfile, int numColumns = 60,
+         bool useIndices = false) {
+
+     // GetNumSequences() is 0 when no container exists, so an empty
+     // object simply writes nothing.
+     const int count = GetNumSequences();
+
+     // each sequence writes itself, in stored order
+     for (int k = 0; k < count; k++)
+         (*sequences)[k]->WriteMFA(outfile, numColumns, useIndices);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::GetAnnotationChar()
+ //
+ // Return CLUSTALW annotation for column.
+ /////////////////////////////////////////////////////////////////
+
+ // Returns the annotation symbol for one alignment column:
+ //   '*'  every non-gap character in the column is the same,
+ //   ':'  all non-gap characters fall within one of the first nine groups,
+ //   '.'  all non-gap characters fall within one of the remaining groups,
+ //   ' '  otherwise, and whenever fewer than two non-gap characters exist.
+ // Characters are compared case-insensitively; '-' is the gap character.
+ char GetAnnotationChar(SafeVector<char> &column) {
+
+     // Residue groups, tested in exactly this order.
+     static const struct {
+         const char *members;
+         char symbol;
+     } groups[] = {
+         { "STA", ':' }, { "NEQK", ':' }, { "NHQK", ':' }, { "NDEQ", ':' },
+         { "QHRK", ':' }, { "MILV", ':' }, { "MILF", ':' }, { "HY", ':' },
+         { "FYW", ':' },
+         { "CSA", '.' }, { "ATV", '.' }, { "SAG", '.' }, { "STNK", '.' },
+         { "STPA", '.' }, { "SGND", '.' }, { "SNDEQK", '.' },
+         { "NDEQHK", '.' }, { "NEQHRK", '.' }, { "FVLIM", '.' },
+         { "HFY", '.' }
+     };
+     const int numGroups = (int) (sizeof(groups) / sizeof(groups[0]));
+
+     // Histogram of the upper-cased column. The inner cast keeps toupper()
+     // defined for bytes >= 0x80 on platforms where char is signed.
+     int counts[256] = { 0 };
+     int allChars = (int) column.size();
+     for (int i = 0; i < allChars; i++)
+         counts[(unsigned char) toupper((unsigned char) column[i])]++;
+
+     // Only non-gap characters take part in the comparison. With none or
+     // one of them there is nothing to compare; in particular an all-gap
+     // column must not be reported as conserved.
+     allChars -= counts[(unsigned char) '-'];
+     if (allChars <= 1)
+         return ' ';
+
+     // a single character accounts for the whole column
+     for (int c = 0; c < 256; c++)
+         if (c != '-' && counts[c] == allChars)
+             return '*';
+
+     // the members of one group account for the whole column
+     for (int g = 0; g < numGroups; g++) {
+         int covered = 0;
+         for (const char *p = groups[g].members; *p != '\0'; ++p)
+             covered += counts[(unsigned char) *p];
+         if (covered == allChars)
+             return groups[g].symbol;
+     }
+
+     return ' ';
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::WriteALN()
+ //
+ // Write ALN to the outfile. Allows the user to specify the
+ // number of columns for the output.
+ /////////////////////////////////////////////////////////////////
+
+ // Writes the alignment in blocks of `numColumns` columns: a banner line,
+ // then for each block one line per sequence (name, padding, residues)
+ // followed by an annotation line built with GetAnnotationChar().
+ // A non-positive numColumns falls back to 60.
+ void WriteALN(ostream &outfile, int numColumns = 60) {
+     if (!sequences)
+         return;
+
+     // With a width <= 0 the block loop below would never advance.
+     if (numColumns <= 0)
+         numColumns = 60;
+
+     outfile << "MSAPROBS version " << VERSION
+             << " multiple sequence alignment" << endl;
+
+     // Cache per-sequence data once instead of re-querying it per block.
+     const int numSeqs = GetNumSequences();
+     SafeVector<SafeVector<char>::iterator> ptrs(numSeqs);
+     SafeVector<int> lengths(numSeqs);
+     SafeVector<string> names(numSeqs);
+     int longestComment = 0;
+     for (int i = 0; i < numSeqs; i++) {
+         ptrs[i] = GetSequence(i)->GetDataPtr();
+         lengths[i] = GetSequence(i)->GetLength();
+         names[i] = GetSequence(i)->GetName();
+         longestComment = max(longestComment, (int) names[i].length());
+     }
+     longestComment += 4; // at least four blanks between name and residues
+
+     int writtenChars = 0;
+     bool allDone = false;
+
+     while (!allDone) {
+         outfile << endl;
+         allDone = true;
+
+         // one line per sequence that still has residues left
+         for (int i = 0; i < numSeqs; i++) {
+             if (writtenChars >= lengths[i])
+                 continue;
+
+             outfile << names[i]
+                     << string(longestComment - (int) names[i].length(), ' ');
+
+             // residues are stored from index 1 (index 0 is a placeholder)
+             const int stop = min(lengths[i], writtenChars + numColumns);
+             for (int pos = writtenChars; pos < stop; pos++)
+                 outfile << ptrs[i][pos + 1];
+
+             outfile << endl;
+
+             if (writtenChars + numColumns < lengths[i])
+                 allDone = false;
+         }
+
+         // write annotation line
+         outfile << string(longestComment, ' ');
+
+         for (int j = 0; j < numColumns; j++) {
+             SafeVector<char> column;
+
+             for (int i = 0; i < numSeqs; i++)
+                 if (writtenChars + j < lengths[i])
+                     column.push_back(ptrs[i][writtenChars + j + 1]);
+
+             if (column.size() > 0)
+                 outfile << GetAnnotationChar(column);
+         }
+
+         outfile << endl;
+         writtenChars += numColumns;
+     }
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::GetSequence()
+ //
+ // Retrieve a sequence from the MultiSequence object.
+ /////////////////////////////////////////////////////////////////
+
+ Sequence* GetSequence(int i) {
+     // The container must exist and i must address one of its slots.
+     assert(sequences != NULL);
+     assert(i >= 0 && i < (int) sequences->size());
+
+     SafeVector<Sequence *> &all = *sequences;
+     return all[i];
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::GetSequence()
+ //
+ // Retrieve a sequence from the MultiSequence object
+ // (const version).
+ /////////////////////////////////////////////////////////////////
+
+ const Sequence* GetSequence(int i) const {
+     // The container must exist and i must address one of its slots.
+     assert(sequences != NULL);
+     assert(i >= 0 && i < (int) sequences->size());
+
+     SafeVector<Sequence *> &all = *sequences;
+     return all[i];
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::GetNumSequences()
+ //
+ // Returns the number of sequences in the MultiSequence.
+ /////////////////////////////////////////////////////////////////
+
+ int GetNumSequences() const {
+     // An object without a container holds zero sequences.
+     return sequences ? (int) sequences->size() : 0;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::SortByHeader()
+ //
+ // Organizes the sequences according to their sequence headers
+ // in ascending order.
+ /////////////////////////////////////////////////////////////////
+
+ void SortByHeader() {
+     assert(sequences);
+
+     SafeVector<Sequence *> &seqs = *sequences;
+     const int count = (int) seqs.size();
+
+     // Exchange sort: pass `lo` leaves the smallest header found in slots
+     // lo..count-1 in slot `lo`.
+     for (int lo = 0; lo < count - 1; lo++)
+         for (int hi = lo + 1; hi < count; hi++)
+             if (seqs[lo]->GetHeader() > seqs[hi]->GetHeader())
+                 swap(seqs[lo], seqs[hi]);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::SortByLabel()
+ //
+ // Organizes the sequences according to their sequence labels
+ // in ascending order.
+ /////////////////////////////////////////////////////////////////
+
+ void SortByLabel() {
+     assert(sequences);
+
+     SafeVector<Sequence *> &seqs = *sequences;
+     const int count = (int) seqs.size();
+
+     // Exchange sort: pass `lo` leaves the smallest sort label found in
+     // slots lo..count-1 in slot `lo`.
+     for (int lo = 0; lo < count - 1; lo++)
+         for (int hi = lo + 1; hi < count; hi++)
+             if (seqs[lo]->GetSortLabel() > seqs[hi]->GetSortLabel())
+                 swap(seqs[lo], seqs[hi]);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::SaveOrdering()
+ //
+ // Relabels sequences so as to preserve the current ordering.
+ /////////////////////////////////////////////////////////////////
+
+ void SaveOrdering() {
+     assert(sequences);
+
+     // Each sequence's sort label becomes its current position.
+     const int count = (int) sequences->size();
+     for (int rank = 0; rank < count; rank++)
+         (*sequences)[rank]->SetSortLabel(rank);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MultiSequence::Project()
+ //
+ // Given a set of indices, extract all sequences from the current
+ // MultiSequence object whose index is included in the set.
+ // Then, project the multiple alignments down to the desired
+ // subset, and return the projection as a new MultiSequence
+ // object.
+ /////////////////////////////////////////////////////////////////
+
+ MultiSequence *Project(const set<int> &indices) {
+     assert(indices.size() != 0);
+
+     const int numSelected = (int) indices.size();
+     SafeVector<SafeVector<char>::iterator> oldPtrs(indices.size());
+     SafeVector<SafeVector<char> *> newPtrs(indices.size());
+
+     // For each selected sequence: remember where its data starts and
+     // begin its projected copy with the '@' placeholder in slot 0.
+     int slot = 0;
+     for (set<int>::const_iterator it = indices.begin();
+             it != indices.end(); ++it, ++slot) {
+         oldPtrs[slot] = GetSequence(*it)->GetDataPtr();
+         newPtrs[slot] = new SafeVector<char>();
+         assert(newPtrs[slot]);
+         newPtrs[slot]->push_back('@');
+     }
+
+     // Walk the columns once (data is 1-based). A column is kept when at
+     // least one selected sequence has a non-gap character in it; each
+     // kept column is copied and counted towards the new length.
+     const int oldLength = GetSequence(*indices.begin())->GetLength();
+     int newLength = 0;
+     for (int col = 1; col <= oldLength; col++) {
+         bool keep = false;
+         for (int s = 0; !keep && s < numSelected; s++)
+             keep = (oldPtrs[s][col] != '-');
+
+         if (!keep)
+             continue;
+
+         for (int s = 0; s < numSelected; s++)
+             newPtrs[s]->push_back(oldPtrs[s][col]);
+         newLength++;
+     }
+
+     // Wrap the projected data in a new MultiSequence, carrying over each
+     // source sequence's header, sort label and label.
+     MultiSequence *ret = new MultiSequence();
+     slot = 0;
+     for (set<int>::const_iterator it = indices.begin();
+             it != indices.end(); ++it, ++slot) {
+         Sequence *source = GetSequence(*it);
+         ret->AddSequence(
+                 new Sequence(newPtrs[slot], source->GetHeader(), newLength,
+                         source->GetSortLabel(), source->GetLabel()));
+     }
+
+     return ret;
+ }
+};
+
+#endif
--- /dev/null
+/////////////////////////////////////////////////////////////////
+// ProbabilisticModel.h
+//
+// Routines for (1) posterior probability computations
+// (2) chained anchoring
+// (3) maximum weight trace alignment
+/////////////////////////////////////////////////////////////////
+
+#ifndef PROBABILISTICMODEL_H
+#define PROBABILISTICMODEL_H
+
+#include <list>
+#include <cmath>
+#include <cstdio>
+#include "SafeVector.h"
+#include "ScoreType.h"
+#include "SparseMatrix.h"
+#include "MultiSequence.h"
+
+using namespace std;
+
+const int NumMatchStates = 1; // note that in this version the number
+// of match states is fixed at 1...will
+// change in future versions
+const int NumInsertStates = 2; // number of insert-state pairs; pair k uses state indices 2k+1 and 2k+2
+const int NumMatrixTypes = NumMatchStates + NumInsertStates * 2; // total number of states (1 + 2*2 = 5); state 0 is the match state
+
+/////////////////////////////////////////////////////////////////
+// ProbabilisticModel
+//
+// Class for storing the parameters of a probabilistic model and
+// performing different computations based on those parameters.
+// In particular, this class handles the computation of
+// posterior probabilities that may be used in alignment.
+/////////////////////////////////////////////////////////////////
+
+class ProbabilisticModel {
+
+ float initialDistribution[NumMatrixTypes]; // holds the initial probabilities for each state
+ float transProb[NumMatrixTypes][NumMatrixTypes]; // holds all state-to-state transition probabilities
+ float matchProb[256][256]; // emission probabilities for match states
+ float insProb[256][NumMatrixTypes]; // emission probabilities for insert states
+
+public:
+
+ /////////////////////////////////////////////////////////////////
+ // ProbabilisticModel::ProbabilisticModel()
+ //
+ // Constructor. Builds a new probabilistic model using the
+ // given parameters.
+ /////////////////////////////////////////////////////////////////
+
+ ProbabilisticModel(const VF &initDistribMat, const VF &gapOpen,
+         const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle) {
+
+     // Assemble the transition matrix in plain probabilities first.
+     // State 0 is the match state; insert pair p occupies states 2p+1 and
+     // 2p+2 and takes its parameters from entries 2p and 2p+1 of the
+     // gapOpen / gapExtend vectors.
+     VVF transMat(NumMatrixTypes, VF(NumMatrixTypes, 0.0f));
+     transMat[0][0] = 1;
+     for (int pair = 0; pair < NumInsertStates; pair++) {
+         const int first = 2 * pair;       // parameter index of the pair's first state
+         const int second = 2 * pair + 1;  // parameter index of the pair's second state
+         const int insA = 2 * pair + 1;    // state index of the pair's first state
+         const int insB = 2 * pair + 2;    // state index of the pair's second state
+
+         // leaving the match state: open a gap, or stay with what is left
+         transMat[0][insA] = gapOpen[first];
+         transMat[0][insB] = gapOpen[second];
+         transMat[0][0] -= (gapOpen[first] + gapOpen[second]);
+         assert(transMat[0][0] > 0);
+
+         // inside an insert state: extend it, or return to match; the two
+         // states of a pair never lead into each other
+         transMat[insA][insA] = gapExtend[first];
+         transMat[insB][insB] = gapExtend[second];
+         transMat[insA][insB] = 0;
+         transMat[insB][insA] = 0;
+         transMat[insA][0] = 1 - gapExtend[first];
+         transMat[insB][0] = 1 - gapExtend[second];
+     }
+
+     // Store the initial distribution and the transitions through LOG().
+     for (int s = 0; s < NumMatrixTypes; s++)
+         initialDistribution[s] = LOG(initDistribMat[s]);
+     for (int from = 0; from < NumMatrixTypes; from++)
+         for (int to = 0; to < NumMatrixTypes; to++)
+             transProb[from][to] = LOG(transMat[from][to]);
+
+     // Store the emission tables through LOG(); every column of insProb
+     // receives the same single-character value.
+     for (int c = 0; c < 256; c++) {
+         for (int col = 0; col < NumMatrixTypes; col++)
+             insProb[c][col] = LOG(emitSingle[c]);
+         for (int d = 0; d < 256; d++)
+             matchProb[c][d] = LOG(emitPairs[c][d]);
+     }
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // ProbabilisticModel::ComputeForwardMatrix()
+ //
+ // Computes a set of forward probability matrices for aligning
+ // seq1 and seq2.
+ //
+ // For efficiency reasons, a single-dimensional floating-point
+ // array is used here, with the following indexing scheme:
+ //
+ // forward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
+ // refers to the probability of aligning through j characters
+ // of the first sequence, k characters of the second sequence,
+ // and ending in state i.
+ /////////////////////////////////////////////////////////////////
+
+ VF *ComputeForwardMatrix(Sequence *seq1, Sequence *seq2) const {
+ // Fills and returns a matrix allocated with new below; it is not freed in this function.
+ assert(seq1);
+ assert(seq2);
+ // NOTE(review): iter1[1], iter2[1] and cell (1,1) are accessed unconditionally below, so both sequences are presumably non-empty -- confirm with callers.
+ const int seq1Length = seq1->GetLength();
+ const int seq2Length = seq2->GetLength();
+
+ // retrieve the pointers to the beginning of each sequence (characters are read from index 1)
+ SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
+ SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
+
+ // create matrix: NumMatrixTypes entries per (i, j) cell, every entry starting at LOG_ZERO
+ VF *forwardPtr = new VF(
+ NumMatrixTypes * (seq1Length + 1) * (seq2Length + 1), LOG_ZERO);
+ assert(forwardPtr);
+ VF &forward = *forwardPtr;
+
+ // initialization condition: match state at cell (1,1)
+ forward[0 + NumMatrixTypes * (1 * (seq2Length + 1) + 1)] =
+ initialDistribution[0]
+ + matchProb[(unsigned char) iter1[1]][(unsigned char) iter2[1]];
+ // insert states: 2k+1 at cell (1,0) emits the first character of seq1, 2k+2 at cell (0,1) emits the first character of seq2
+ for (int k = 0; k < NumInsertStates; k++) {
+ forward[2 * k + 1 + NumMatrixTypes * (1 * (seq2Length + 1) + 0)] =
+ initialDistribution[2 * k + 1]
+ + insProb[(unsigned char) iter1[1]][k];
+ forward[2 * k + 2 + NumMatrixTypes * (0 * (seq2Length + 1) + 1)] =
+ initialDistribution[2 * k + 2]
+ + insProb[(unsigned char) iter2[1]][k];
+ }
+
+ // remember offset for each index combination (in cells here, scaled to vector offsets just below)
+ int ij = 0; // current cell (i, j)
+ int i1j = -seq2Length - 1; // cell (i-1, j); negative until i > 0
+ int ij1 = -1; // cell (i, j-1); negative until the second cell
+ int i1j1 = -seq2Length - 2; // cell (i-1, j-1)
+ // convert the cell offsets into offsets into the flat vector
+ ij *= NumMatrixTypes;
+ i1j *= NumMatrixTypes;
+ ij1 *= NumMatrixTypes;
+ i1j1 *= NumMatrixTypes;
+
+ // compute forward scores, visiting the cells in row-major order so the four offsets advance in lock-step
+ for (int i = 0; i <= seq1Length; i++) {
+ unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i]; // '~' is a placeholder; c1 is only used under i > 0
+ for (int j = 0; j <= seq2Length; j++) {
+ unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j]; // '~' is a placeholder; c2 is only used under j > 0
+ // cells (0,0), (0,1), (1,0) and (1,1) are skipped, so the initialization above is kept
+ if (i > 1 || j > 1) {
+ if (i > 0 && j > 0) { // match state: come from any state at (i-1, j-1), then emit the pair (c1, c2)
+ forward[0 + ij] = forward[0 + i1j1] + transProb[0][0];
+ for (int k = 1; k < NumMatrixTypes; k++)
+ LOG_PLUS_EQUALS(forward[0 + ij],
+ forward[k + i1j1] + transProb[k][0]);
+ forward[0 + ij] += matchProb[c1][c2];
+ }
+ if (i > 0) { // states 2k+1: come from match or the same state at (i-1, j), then emit c1
+ for (int k = 0; k < NumInsertStates; k++)
+ forward[2 * k + 1 + ij] = insProb[c1][k]
+ + LOG_ADD(
+ forward[0 + i1j]
+ + transProb[0][2 * k + 1],
+ forward[2 * k + 1 + i1j]
+ + transProb[2 * k + 1][2 * k
+ + 1]);
+ }
+ if (j > 0) { // states 2k+2: come from match or the same state at (i, j-1), then emit c2
+ for (int k = 0; k < NumInsertStates; k++)
+ forward[2 * k + 2 + ij] = insProb[c2][k]
+ + LOG_ADD(
+ forward[0 + ij1]
+ + transProb[0][2 * k + 2],
+ forward[2 * k + 2 + ij1]
+ + transProb[2 * k + 2][2 * k
+ + 2]);
+ }
+ }
+ // step all four offsets to the next cell
+ ij += NumMatrixTypes;
+ i1j += NumMatrixTypes;
+ ij1 += NumMatrixTypes;
+ i1j1 += NumMatrixTypes;
+ }
+ }
+
+ return forwardPtr;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // ProbabilisticModel::ComputeBackwardMatrix()
+ //
+ // Computes a set of backward probability matrices for aligning
+ // seq1 and seq2.
+ //
+ // For efficiency reasons, a single-dimensional floating-point
+ // array is used here, with the following indexing scheme:
+ //
+ // backward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
+ // refers to the probability of starting in state i and
+ // aligning from character j+1 to the end of the first
+ // sequence and from character k+1 to the end of the second
+ // sequence.
+ /////////////////////////////////////////////////////////////////
+
+ VF *ComputeBackwardMatrix(Sequence *seq1, Sequence *seq2) const {
+ // Fills and returns a matrix allocated with new below; it is not freed in this function.
+ assert(seq1);
+ assert(seq2);
+
+ const int seq1Length = seq1->GetLength();
+ const int seq2Length = seq2->GetLength();
+ SafeVector<char>::iterator iter1 = seq1->GetDataPtr(); // characters are read from index 1
+ SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
+
+ // create matrix: NumMatrixTypes entries per (i, j) cell, every entry starting at LOG_ZERO
+ VF *backwardPtr = new VF(
+ NumMatrixTypes * (seq1Length + 1) * (seq2Length + 1), LOG_ZERO);
+ assert(backwardPtr);
+ VF &backward = *backwardPtr;
+
+ // initialization condition: every state of the last cell (seq1Length, seq2Length) gets its initialDistribution entry
+ for (int k = 0; k < NumMatrixTypes; k++)
+ backward[NumMatrixTypes * ((seq1Length + 1) * (seq2Length + 1) - 1)
+ + k] = initialDistribution[k];
+
+ // remember offset for each index combination (in cells here, scaled to vector offsets just below)
+ int ij = (seq1Length + 1) * (seq2Length + 1) - 1; // current cell, starting at the last one
+ int i1j = ij + seq2Length + 1; // cell (i+1, j); only read when i < seq1Length
+ int ij1 = ij + 1; // cell (i, j+1); only read when j < seq2Length
+ int i1j1 = ij + seq2Length + 2; // cell (i+1, j+1); only read when both hold
+ // convert the cell offsets into offsets into the flat vector
+ ij *= NumMatrixTypes;
+ i1j *= NumMatrixTypes;
+ ij1 *= NumMatrixTypes;
+ i1j1 *= NumMatrixTypes;
+
+ // compute backward scores, visiting the cells in reverse row-major order
+ for (int i = seq1Length; i >= 0; i--) {
+ unsigned char c1 =
+ (i == seq1Length) ? '~' : (unsigned char) iter1[i + 1]; // next character of seq1; '~' is a placeholder that is never used as an index
+ for (int j = seq2Length; j >= 0; j--) {
+ unsigned char c2 =
+ (j == seq2Length) ? '~' : (unsigned char) iter2[j + 1]; // next character of seq2; '~' is a placeholder that is never used as an index
+ // move to the match state at (i+1, j+1), emitting the pair (c1, c2)
+ if (i < seq1Length && j < seq2Length) {
+ const float ProbXY = backward[0 + i1j1] + matchProb[c1][c2];
+ for (int k = 0; k < NumMatrixTypes; k++)
+ LOG_PLUS_EQUALS(backward[k + ij],
+ ProbXY + transProb[k][0]);
+ }
+ if (i < seq1Length) { // move to state 2k+1 at (i+1, j), emitting c1; reachable from match and from itself
+ for (int k = 0; k < NumInsertStates; k++) {
+ LOG_PLUS_EQUALS(backward[0 + ij],
+ backward[2 * k + 1 + i1j] + insProb[c1][k]
+ + transProb[0][2 * k + 1]);
+ LOG_PLUS_EQUALS(backward[2 * k + 1 + ij],
+ backward[2 * k + 1 + i1j] + insProb[c1][k]
+ + transProb[2 * k + 1][2 * k + 1]);
+ }
+ }
+ if (j < seq2Length) { // move to state 2k+2 at (i, j+1), emitting c2; reachable from match and from itself
+ for (int k = 0; k < NumInsertStates; k++) {
+ LOG_PLUS_EQUALS(backward[0 + ij],
+ backward[2 * k + 2 + ij1] + insProb[c2][k]
+ + transProb[0][2 * k + 2]);
+ LOG_PLUS_EQUALS(backward[2 * k + 2 + ij],
+ backward[2 * k + 2 + ij1] + insProb[c2][k]
+ + transProb[2 * k + 2][2 * k + 2]);
+ }
+ }
+ // step all four offsets back to the previous cell
+ ij -= NumMatrixTypes;
+ i1j -= NumMatrixTypes;
+ ij1 -= NumMatrixTypes;
+ i1j1 -= NumMatrixTypes;
+ }
+ }
+
+ return backwardPtr;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // ProbabilisticModel::ComputeTotalProbability()
+ //
+ // Computes the total probability of an alignment given
+ // the forward and backward matrices.
+ /////////////////////////////////////////////////////////////////
+
+ float ComputeTotalProbability(int seq1Length, int seq2Length,
+         const VF &forward, const VF &backward) const {
+
+     // Vector offsets of the cells that are read below.
+     const int rowStride = seq2Length + 1;
+     const int lastCell = NumMatrixTypes * ((seq1Length + 1) * rowStride - 1);
+     const int cell11 = NumMatrixTypes * (1 * rowStride + 1);
+     const int cell10 = NumMatrixTypes * (1 * rowStride + 0);
+     const int cell01 = NumMatrixTypes * (0 * rowStride + 1);
+
+     // First estimate: combine forward + backward over every state of the
+     // last cell.
+     float totalForwardProb = LOG_ZERO;
+     for (int k = 0; k < NumMatrixTypes; k++) {
+         LOG_PLUS_EQUALS(totalForwardProb,
+                 forward[k + lastCell] + backward[k + lastCell]);
+     }
+
+     // Second estimate: combine forward + backward over the start
+     // entries -- the match state at (1,1), states 2k+1 at (1,0) and
+     // states 2k+2 at (0,1).
+     float totalBackwardProb = forward[0 + cell11] + backward[0 + cell11];
+     for (int k = 0; k < NumInsertStates; k++) {
+         LOG_PLUS_EQUALS(totalBackwardProb,
+                 forward[2 * k + 1 + cell10] + backward[2 * k + 1 + cell10]);
+         LOG_PLUS_EQUALS(totalBackwardProb,
+                 forward[2 * k + 2 + cell01] + backward[2 * k + 2 + cell01]);
+     }
+
+     // cerr << totalForwardProb << " " << totalBackwardProb << endl;
+
+     // report the mean of the two estimates
+     return (totalForwardProb + totalBackwardProb) / 2;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // ProbabilisticModel::ComputePosteriorMatrix()
+ //
+ // Computes the posterior probability matrix based on
+ // the forward and backward matrices.
+ /////////////////////////////////////////////////////////////////
+
+ VF *ComputePosteriorMatrix(Sequence *seq1, Sequence *seq2,
+         const VF &forward, const VF &backward) const {
+
+     assert(seq1);
+     assert(seq2);
+
+     const int seq1Length = seq1->GetLength();
+     const int seq2Length = seq2->GetLength();
+     const int numCells = (seq1Length + 1) * (seq2Length + 1);
+
+     // Normalising term shared by every cell.
+     // NOTE(review): an exact 0 is replaced by 1.0f before use; the reason
+     // is not visible here -- confirm the intent.
+     float totalProb = ComputeTotalProbability(seq1Length, seq2Length,
+             forward, backward);
+     if (totalProb == 0) {
+         totalProb = 1.0f;
+     }
+
+     // One value per (i, j) cell, in the same row-major order as the
+     // forward/backward matrices.
+     VF *posteriorPtr = new VF(numCells);
+     assert(posteriorPtr);
+     VF &posterior = *posteriorPtr;
+
+     for (int cell = 0; cell < numCells; cell++) {
+         // entry 0 of each cell is the match state
+         const int ij = cell * NumMatrixTypes;
+         posterior[cell] = EXP(
+                 min(LOG_ONE, forward[ij] + backward[ij] - totalProb));
+     }
+
+     // cell (0,0) is always reported as 0
+     posterior[0] = 0;
+
+     return posteriorPtr;
+ }
+
+ /*
+ /////////////////////////////////////////////////////////////////
+ // ProbabilisticModel::ComputeExpectedCounts()
+ //
+ // Computes the expected counts for the various transitions.
+ /////////////////////////////////////////////////////////////////
+
+ VVF *ComputeExpectedCounts () const {
+
+ assert (seq1);
+ assert (seq2);
+
+ const int seq1Length = seq1->GetLength();
+ const int seq2Length = seq2->GetLength();
+ SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
+ SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
+
+ // compute total probability
+ float totalProb = ComputeTotalProbability (seq1Length, seq2Length,
+ forward, backward);
+
+ // initialize expected counts
+ VVF *countsPtr = new VVF(NumMatrixTypes + 1, VF(NumMatrixTypes, LOG_ZERO)); assert (countsPtr);
+ VVF &counts = *countsPtr;
+
+ // remember offset for each index combination
+ int ij = 0;
+ int i1j = -seq2Length - 1;
+ int ij1 = -1;
+ int i1j1 = -seq2Length - 2;
+
+ ij *= NumMatrixTypes;
+ i1j *= NumMatrixTypes;
+ ij1 *= NumMatrixTypes;
+ i1j1 *= NumMatrixTypes;
+
+ // compute expected counts
+ for (int i = 0; i <= seq1Length; i++){
+ unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
+ for (int j = 0; j <= seq2Length; j++){
+ unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
+
+ if (i > 0 && j > 0){
+ for (int k = 0; k < NumMatrixTypes; k++)
+ LOG_PLUS_EQUALS (counts[k][0],
+ forward[k + i1j1] + transProb[k][0] +
+ matchProb[c1][c2] + backward[0 + ij]);
+ }
+ if (i > 0){
+ for (int k = 0; k < NumInsertStates; k++){
+ LOG_PLUS_EQUALS (counts[0][2*k+1],
+ forward[0 + i1j] + transProb[0][2*k+1] +
+ insProb[c1][k] + backward[2*k+1 + ij]);
+ LOG_PLUS_EQUALS (counts[2*k+1][2*k+1],
+ forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +
+ insProb[c1][k] + backward[2*k+1 + ij]);
+ }
+ }
+ if (j > 0){
+ for (int k = 0; k < NumInsertStates; k++){
+ LOG_PLUS_EQUALS (counts[0][2*k+2],
+ forward[0 + ij1] + transProb[0][2*k+2] +
+ insProb[c2][k] + backward[2*k+2 + ij]);
+ LOG_PLUS_EQUALS (counts[2*k+2][2*k+2],
+ forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +
+ insProb[c2][k] + backward[2*k+2 + ij]);
+ }
+ }
+
+ ij += NumMatrixTypes;
+ i1j += NumMatrixTypes;
+ ij1 += NumMatrixTypes;
+ i1j1 += NumMatrixTypes;
+ }
+ }
+
+ // scale all expected counts appropriately
+ for (int i = 0; i < NumMatrixTypes; i++)
+ for (int j = 0; j < NumMatrixTypes; j++)
+ counts[i][j] -= totalProb;
+
+ }
+ */
+
+ /////////////////////////////////////////////////////////////////
+ // ProbabilisticModel::ComputeNewParameters()
+ //
+ // Computes a new parameter set based on the expected counts
+ // given.
+ /////////////////////////////////////////////////////////////////
+ // Re-estimates the model parameters from the forward/backward tables of
+ // a single sequence pair.
+ //
+ // seq1, seq2            sequences of the pair (must be non-null); residues
+ //                       are upper-cased before indexing emission tables
+ // forward, backward     tables indexed as state + NumMatrixTypes * cell,
+ //                       with cell = i * (seq2Length + 1) + j
+ // initDistribMat        [out] new initial distribution
+ // gapOpen, gapExtend    [out] new gap parameters, two entries per value
+ //                       of k in [0, NumInsertStates)
+ // emitPairs, emitSingle [out] new emission tables; written only when
+ //                       enableTrainEmissions is true
+ // enableTrainEmissions  whether emission counts are gathered at all
+ //
+ // All counts are accumulated in log space and exponentiated at the end.
+ void ComputeNewParameters(Sequence *seq1, Sequence *seq2, const VF &forward,
+         const VF &backward, VF &initDistribMat, VF &gapOpen, VF &gapExtend,
+         VVF &emitPairs, VF &emitSingle, bool enableTrainEmissions) const {
+
+     assert(seq1);
+     assert(seq2);
+
+     const int seq1Length = seq1->GetLength();
+     const int seq2Length = seq2->GetLength();
+     SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
+     SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
+
+     // compute total probability
+     const float totalProb = ComputeTotalProbability(seq1Length, seq2Length,
+             forward, backward);
+
+     // initialize expected counts
+     VVF transCounts(NumMatrixTypes, VF(NumMatrixTypes, LOG_ZERO));
+     VF initCounts(NumMatrixTypes, LOG_ZERO);
+     VVF pairCounts(256, VF(256, LOG_ZERO));
+     VF singleCounts(256, LOG_ZERO);
+
+     // table offsets of the cells used for the initial distribution
+     const int cell11 = NumMatrixTypes * (1 * (seq2Length + 1) + 1);
+     const int cell10 = NumMatrixTypes * (1 * (seq2Length + 1) + 0);
+     const int cell01 = NumMatrixTypes * (0 * (seq2Length + 1) + 1);
+     const int cellLast = NumMatrixTypes
+             * ((seq1Length + 1) * (seq2Length + 1) - 1);
+
+     // compute initial distribution posteriors: each state is counted once
+     // at its first reachable cell and once at the final cell
+     initCounts[0] = LOG_ADD(forward[0 + cell11] + backward[0 + cell11],
+             forward[0 + cellLast] + backward[0 + cellLast]);
+     for (int k = 0; k < NumInsertStates; k++) {
+         const int xState = 2 * k + 1;
+         const int yState = 2 * k + 2;
+         initCounts[xState] = LOG_ADD(
+                 forward[xState + cell10] + backward[xState + cell10],
+                 forward[xState + cellLast] + backward[xState + cellLast]);
+         initCounts[yState] = LOG_ADD(
+                 forward[yState + cell01] + backward[yState + cell01],
+                 forward[yState + cellLast] + backward[yState + cellLast]);
+     }
+
+     // running offsets of cell (i, j) and its three predecessors; the
+     // predecessor offsets are negative until i / j become positive and
+     // are only dereferenced behind the matching i > 0 / j > 0 guards
+     int ij = 0;
+     int i1j = -(seq2Length + 1) * NumMatrixTypes;
+     int ij1 = -NumMatrixTypes;
+     int i1j1 = -(seq2Length + 2) * NumMatrixTypes;
+
+     // compute expected counts
+     for (int i = 0; i <= seq1Length; i++) {
+         // <cctype> functions require a value in unsigned char range, so
+         // the residue is converted before it is passed to toupper()
+         const unsigned char c1 =
+                 (i == 0) ?
+                         '~' :
+                         (unsigned char) toupper((unsigned char) iter1[i]);
+         for (int j = 0; j <= seq2Length; j++) {
+             const unsigned char c2 =
+                     (j == 0) ?
+                             '~' :
+                             (unsigned char) toupper(
+                                     (unsigned char) iter2[j]);
+
+             // transitions into the match state
+             if (i > 0 && j > 0) {
+                 if (enableTrainEmissions && i == 1 && j == 1) {
+                     // first cell: entered from the initial distribution
+                     LOG_PLUS_EQUALS(pairCounts[c1][c2],
+                             initialDistribution[0] + matchProb[c1][c2]
+                                     + backward[0 + ij]);
+                     LOG_PLUS_EQUALS(pairCounts[c2][c1],
+                             initialDistribution[0] + matchProb[c2][c1]
+                                     + backward[0 + ij]);
+                 }
+
+                 for (int k = 0; k < NumMatrixTypes; k++) {
+                     LOG_PLUS_EQUALS(transCounts[k][0],
+                             forward[k + i1j1] + transProb[k][0]
+                                     + matchProb[c1][c2] + backward[0 + ij]);
+                     if (enableTrainEmissions && (i != 1 || j != 1)) {
+                         LOG_PLUS_EQUALS(pairCounts[c1][c2],
+                                 forward[k + i1j1] + transProb[k][0]
+                                         + matchProb[c1][c2]
+                                         + backward[0 + ij]);
+                         LOG_PLUS_EQUALS(pairCounts[c2][c1],
+                                 forward[k + i1j1] + transProb[k][0]
+                                         + matchProb[c2][c1]
+                                         + backward[0 + ij]);
+                     }
+                 }
+             }
+
+             // transitions into the insert states that consume seq1
+             if (i > 0) {
+                 for (int k = 0; k < NumInsertStates; k++) {
+                     const int xState = 2 * k + 1;
+                     // gap opened from the match state
+                     const float openTerm = forward[0 + i1j]
+                             + transProb[0][xState] + insProb[c1][k]
+                             + backward[xState + ij];
+                     // gap extended from the same insert state
+                     const float extendTerm = forward[xState + i1j]
+                             + transProb[xState][xState] + insProb[c1][k]
+                             + backward[xState + ij];
+
+                     LOG_PLUS_EQUALS(transCounts[0][xState], openTerm);
+                     LOG_PLUS_EQUALS(transCounts[xState][xState], extendTerm);
+
+                     if (enableTrainEmissions) {
+                         if (i == 1 && j == 0) {
+                             LOG_PLUS_EQUALS(singleCounts[c1],
+                                     initialDistribution[xState]
+                                             + insProb[c1][k]
+                                             + backward[xState + ij]);
+                         } else {
+                             LOG_PLUS_EQUALS(singleCounts[c1], openTerm);
+                             LOG_PLUS_EQUALS(singleCounts[c1], extendTerm);
+                         }
+                     }
+                 }
+             }
+
+             // transitions into the insert states that consume seq2
+             if (j > 0) {
+                 for (int k = 0; k < NumInsertStates; k++) {
+                     const int yState = 2 * k + 2;
+                     const float openTerm = forward[0 + ij1]
+                             + transProb[0][yState] + insProb[c2][k]
+                             + backward[yState + ij];
+                     const float extendTerm = forward[yState + ij1]
+                             + transProb[yState][yState] + insProb[c2][k]
+                             + backward[yState + ij];
+
+                     LOG_PLUS_EQUALS(transCounts[0][yState], openTerm);
+                     LOG_PLUS_EQUALS(transCounts[yState][yState], extendTerm);
+
+                     if (enableTrainEmissions) {
+                         if (i == 0 && j == 1) {
+                             LOG_PLUS_EQUALS(singleCounts[c2],
+                                     initialDistribution[yState]
+                                             + insProb[c2][k]
+                                             + backward[yState + ij]);
+                         } else {
+                             LOG_PLUS_EQUALS(singleCounts[c2], openTerm);
+                             LOG_PLUS_EQUALS(singleCounts[c2], extendTerm);
+                         }
+                     }
+                 }
+             }
+
+             ij += NumMatrixTypes;
+             i1j += NumMatrixTypes;
+             ij1 += NumMatrixTypes;
+             i1j1 += NumMatrixTypes;
+         }
+     }
+
+     // scale all expected counts appropriately (division in log space)
+     for (int i = 0; i < NumMatrixTypes; i++) {
+         initCounts[i] -= totalProb;
+         for (int j = 0; j < NumMatrixTypes; j++)
+             transCounts[i][j] -= totalProb;
+     }
+     if (enableTrainEmissions) {
+         for (int i = 0; i < 256; i++) {
+             for (int j = 0; j < 256; j++)
+                 pairCounts[i][j] -= totalProb;
+             singleCounts[i] -= totalProb;
+         }
+     }
+
+     // compute new initial distribution
+     float totalInitDistribCounts = 0;
+     for (int i = 0; i < NumMatrixTypes; i++)
+         totalInitDistribCounts += exp(initCounts[i]); // should be 2
+     initDistribMat[0] = min(1.0f,
+             max(0.0f, (float) exp(initCounts[0]) / totalInitDistribCounts));
+     for (int k = 0; k < NumInsertStates; k++) {
+         // both insert states of a pair share the average of their counts
+         const float val = (exp(initCounts[2 * k + 1])
+                 + exp(initCounts[2 * k + 2])) / 2;
+         initDistribMat[2 * k + 1] = initDistribMat[2 * k + 2] = min(1.0f,
+                 max(0.0f, val / totalInitDistribCounts));
+     }
+
+     // compute total counts for match state
+     float inMatchStateCounts = 0;
+     for (int i = 0; i < NumMatrixTypes; i++)
+         inMatchStateCounts += exp(transCounts[0][i]);
+
+     for (int i = 0; i < NumInsertStates; i++) {
+         const int xState = 2 * i + 1;
+         const int yState = 2 * i + 2;
+
+         // compute total counts for gap state
+         const float inGapStateCounts = exp(transCounts[xState][0])
+                 + exp(transCounts[xState][xState])
+                 + exp(transCounts[yState][0])
+                 + exp(transCounts[yState][yState]);
+
+         gapOpen[2 * i] = gapOpen[2 * i + 1] = (exp(transCounts[0][xState])
+                 + exp(transCounts[0][yState])) / (2 * inMatchStateCounts);
+
+         gapExtend[2 * i] = gapExtend[2 * i + 1] = (exp(
+                 transCounts[xState][xState])
+                 + exp(transCounts[yState][yState])) / inGapStateCounts;
+     }
+
+     if (enableTrainEmissions) {
+         // normalisers: upper triangle of the pair counts, all single counts
+         float totalPairCounts = 0;
+         float totalSingleCounts = 0;
+         for (int i = 0; i < 256; i++) {
+             for (int j = 0; j <= i; j++)
+                 totalPairCounts += exp(pairCounts[j][i]);
+             totalSingleCounts += exp(singleCounts[i]);
+         }
+
+         // i and j stay in [0, 255], which is a valid argument range for
+         // islower()/tolower(); casting them to plain char first would
+         // produce negative arguments (undefined behaviour) for i >= 128
+         for (int i = 0; i < 256; i++) {
+             if (islower(i))
+                 continue;
+             const int li = tolower(i);
+
+             for (int j = 0; j <= i; j++) {
+                 if (islower(j))
+                     continue;
+                 const int lj = tolower(j);
+
+                 // one value is shared by every case/order combination
+                 const float pairProb = exp(pairCounts[j][i])
+                         / totalPairCounts;
+                 emitPairs[lj][li] = pairProb;
+                 emitPairs[lj][i] = pairProb;
+                 emitPairs[j][li] = pairProb;
+                 emitPairs[j][i] = pairProb;
+                 emitPairs[li][lj] = pairProb;
+                 emitPairs[li][j] = pairProb;
+                 emitPairs[i][lj] = pairProb;
+                 emitPairs[i][j] = pairProb;
+             }
+
+             emitSingle[i] = emitSingle[li] = exp(singleCounts[i])
+                     / totalSingleCounts;
+         }
+     }
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // ProbabilisticModel::ComputeAlignment()
+ //
+ // Computes an alignment based on the given posterior matrix.
+ // This is done by finding the maximum summing path (or
+ // maximum weight trace) through the posterior matrix. The
+ // final alignment is returned as a pair consisting of:
+ // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and Y's
+ // denote insertions in one of the two sequences and
+ // B's denote that both sequences are present (i.e.
+ // matches).
+ // (2) a float indicating the sum achieved
+ /////////////////////////////////////////////////////////////////
+
+ // Finds the maximum-sum path through the posterior matrix.
+ //
+ // seq1Length, seq2Length  lengths of the two sequences
+ // posterior               (seq1Length + 1) x (seq2Length + 1) row-major
+ //                         matrix; row 0 and column 0 are never read
+ //
+ // Returns (alignment string of 'X' / 'Y' / 'B', score of the best path).
+ // The alignment vector is heap-allocated and owned by the caller.
+ pair<SafeVector<char> *, float> ComputeAlignment(int seq1Length,
+         int seq2Length, const VF &posterior) const {
+
+     const int rowWidth = seq2Length + 1;
+
+     // Two rolling score rows. They live in a vector so they are released
+     // on every exit path, including a throwing allocation further down.
+     // Zero fill doubles as the initialisation of row 0.
+     std::vector<float> rowStorage(2 * rowWidth, 0.0f);
+     float *oldRow = &rowStorage[0];
+     float *newRow = oldRow + rowWidth;
+
+     // One traceback character per cell. Row 0 keeps the 'L' fill value
+     // (only leftward moves are possible there), so writing starts at row 1.
+     std::vector<char> tracebackMatrix((seq1Length + 1) * rowWidth, 'L');
+     char *tracebackPtr = &tracebackMatrix[0] + rowWidth;
+
+     // skip row 0 of the posterior matrix
+     VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
+
+     // fill in matrix
+     for (int i = 1; i <= seq1Length; i++) {
+
+         // initialize left column: only upward moves are possible
+         newRow[0] = 0;
+         posteriorPtr++;
+         *(tracebackPtr++) = 'U';
+
+         // fill in rest of row: diagonal (adds the posterior), left, or up
+         for (int j = 1; j <= seq2Length; j++) {
+             ChooseBestOfThree(*(posteriorPtr++) + oldRow[j - 1],
+                     newRow[j - 1], oldRow[j], 'D', 'L', 'U', &newRow[j],
+                     tracebackPtr++);
+         }
+
+         // the row just filled becomes the previous row
+         std::swap(oldRow, newRow);
+     }
+
+     // store best score
+     const float total = oldRow[seq2Length];
+
+     // compute traceback from the bottom-right corner back to (0, 0)
+     SafeVector<char> *alignment = new SafeVector<char>;
+     int r = seq1Length, c = seq2Length;
+     while (r != 0 || c != 0) {
+         const char ch = tracebackMatrix[r * rowWidth + c];
+         switch (ch) {
+         case 'L':
+             c--;
+             alignment->push_back('Y');
+             break;
+         case 'U':
+             r--;
+             alignment->push_back('X');
+             break;
+         case 'D':
+             c--;
+             r--;
+             alignment->push_back('B');
+             break;
+         default:
+             assert(false);
+         }
+     }
+
+     // moves were collected end-to-start
+     reverse(alignment->begin(), alignment->end());
+
+     return make_pair(alignment, total);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // ProbabilisticModel::ComputeAlignmentWithGapPenalties()
+ //
+ // Similar to ComputeAlignment() except with gap penalties.
+ /////////////////////////////////////////////////////////////////
+
+ // Aligns two groups of already-aligned sequences with a three-state
+ // (match / insert-X / insert-Y) dynamic program that adds gap penalties
+ // to the posterior scores.
+ //
+ // align1, align2      the two groups; column counts are taken from
+ //                     sequence 0 of each group
+ // posterior           (seq1Length + 1) x (seq2Length + 1) row-major matrix
+ // numSeqs1, numSeqs2  sizes used for the penalty tables; per-column
+ //                     counts from align1 / align2 index those tables, so
+ //                     they must not exceed numSeqs1 / numSeqs2
+ // gapOpenPenalty, gapContinuePenalty  per-sequence-pair penalties
+ //
+ // Returns (alignment string of 'X' / 'Y' / 'B', 1.0f). The best score is
+ // computed but not returned. The alignment vector is owned by the caller.
+ pair<SafeVector<char> *, float> ComputeAlignmentWithGapPenalties(
+         MultiSequence *align1, MultiSequence *align2, const VF &posterior,
+         int numSeqs1, int numSeqs2, float gapOpenPenalty,
+         float gapContinuePenalty) const {
+     const int seq1Length = align1->GetSequence(0)->GetLength();
+     const int seq2Length = align2->GetSequence(0)->GetLength();
+     const int rowWidth = seq2Length + 1;
+
+     // Per-column statistics (all entries start at 0; index 0 is unused).
+     //   numActive   - number of non-gap characters in the column
+     //   numGapOpens - number of non-gap characters in the column (other
+     //                 than column 1) whose predecessor is also non-gap
+     // NOTE(review): the original comment described numGapOpens as counting
+     // gap characters; the code counts what is stated above -- confirm.
+     SafeVector<int> numActive1(seq1Length + 1), numGapOpens1(
+             seq1Length + 1);
+     SafeVector<int> numActive2(seq2Length + 1), numGapOpens2(
+             seq2Length + 1);
+
+     // compute number of active sequences and gap opens for each group
+     for (int i = 0; i < align1->GetNumSequences(); i++) {
+         SafeVector<char>::iterator dataPtr =
+                 align1->GetSequence(i)->GetDataPtr();
+         for (int j = 1; j <= seq1Length; j++) {
+             if (dataPtr[j] != '-') {
+                 numActive1[j]++;
+                 numGapOpens1[j] += (j != 1 && dataPtr[j - 1] != '-');
+             }
+         }
+     }
+     for (int i = 0; i < align2->GetNumSequences(); i++) {
+         SafeVector<char>::iterator dataPtr =
+                 align2->GetSequence(i)->GetDataPtr();
+         for (int j = 1; j <= seq2Length; j++) {
+             if (dataPtr[j] != '-') {
+                 numActive2[j]++;
+                 numGapOpens2[j] += (j != 1 && dataPtr[j - 1] != '-');
+             }
+         }
+     }
+
+     // Penalty tables. openingPenalty1 is indexed [count in align1][count
+     // in align2]; openingPenalty2 is indexed the other way round, so its
+     // dimensions are (numSeqs2 + 1) x (numSeqs1 + 1). Allocating it as
+     // (numSeqs1 + 1) x (numSeqs2 + 1) overruns the table whenever the two
+     // groups differ in size.
+     VVF openingPenalty1(numSeqs1 + 1, VF(numSeqs2 + 1));
+     VF continuingPenalty1(numSeqs1 + 1);
+     VVF openingPenalty2(numSeqs2 + 1, VF(numSeqs1 + 1));
+     VF continuingPenalty2(numSeqs2 + 1);
+
+     // precompute penalties
+     for (int i = 0; i <= numSeqs1; i++)
+         for (int j = 0; j <= numSeqs2; j++)
+             openingPenalty1[i][j] = i
+                     * (gapOpenPenalty * j
+                             + gapContinuePenalty * (numSeqs2 - j));
+     for (int i = 0; i <= numSeqs1; i++)
+         continuingPenalty1[i] = i * gapContinuePenalty * numSeqs2;
+     for (int i = 0; i <= numSeqs2; i++)
+         for (int j = 0; j <= numSeqs1; j++)
+             openingPenalty2[i][j] = i
+                     * (gapOpenPenalty * j
+                             + gapContinuePenalty * (numSeqs1 - j));
+     for (int i = 0; i <= numSeqs2; i++)
+         continuingPenalty2[i] = i * gapContinuePenalty * numSeqs1;
+
+     // Six rolling rows (old/new for each of the three states), held in a
+     // vector so they are released on every exit path.
+     std::vector<float> rowStorage(6 * rowWidth);
+     float *oldRowMatch = &rowStorage[0];
+     float *newRowMatch = oldRowMatch + rowWidth;
+     float *oldRowInsertX = oldRowMatch + 2 * rowWidth;
+     float *newRowInsertX = oldRowMatch + 3 * rowWidth;
+     float *oldRowInsertY = oldRowMatch + 4 * rowWidth;
+     float *newRowInsertY = oldRowMatch + 5 * rowWidth;
+
+     // Three traceback characters per cell, in the order match, insert-X,
+     // insert-Y. Row 0 keeps the 'Y' fill value, so writing starts at row 1.
+     std::vector<char> tracebackMatrix(3 * (seq1Length + 1) * rowWidth, 'Y');
+     char *tracebackPtr = &tracebackMatrix[0] + 3 * rowWidth;
+
+     // skip row 0 of the posterior matrix
+     VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
+
+     // initialization of row 0: only a run of insert-Y moves can reach it
+     for (int i = 0; i <= seq2Length; i++) {
+         oldRowMatch[i] = oldRowInsertX[i] = (i == 0) ? 0 : LOG_ZERO;
+         oldRowInsertY[i] =
+                 (i == 0) ?
+                         0 :
+                         oldRowInsertY[i - 1]
+                                 + continuingPenalty2[numActive2[i]];
+     }
+
+     // fill in matrix
+     for (int i = 1; i <= seq1Length; i++) {
+
+         // initialize left column: only a run of insert-X moves reaches it
+         newRowMatch[0] = newRowInsertY[0] = LOG_ZERO;
+         newRowInsertX[0] = oldRowInsertX[0]
+                 + continuingPenalty1[numActive1[i]];
+         posteriorPtr++;
+         *(tracebackPtr) = *(tracebackPtr + 1) = *(tracebackPtr + 2) = 'X';
+         tracebackPtr += 3;
+
+         // fill in rest of row
+         for (int j = 1; j <= seq2Length; j++) {
+
+             // going to MATCH state
+             ChooseBestOfThree(oldRowMatch[j - 1], oldRowInsertX[j - 1],
+                     oldRowInsertY[j - 1], 'M', 'X', 'Y', &newRowMatch[j],
+                     tracebackPtr++);
+             newRowMatch[j] += *(posteriorPtr++);
+
+             // going to INSERT X state
+             ChooseBestOfThree(
+                     oldRowMatch[j]
+                             + openingPenalty1[numActive1[i]][numGapOpens2[j]],
+                     oldRowInsertX[j] + continuingPenalty1[numActive1[i]],
+                     oldRowInsertY[j]
+                             + openingPenalty1[numActive1[i]][numGapOpens2[j]],
+                     'M', 'X', 'Y', &newRowInsertX[j], tracebackPtr++);
+
+             // going to INSERT Y state
+             ChooseBestOfThree(
+                     newRowMatch[j - 1]
+                             + openingPenalty2[numActive2[j]][numGapOpens1[i]],
+                     newRowInsertX[j - 1]
+                             + openingPenalty2[numActive2[j]][numGapOpens1[i]],
+                     newRowInsertY[j - 1]
+                             + continuingPenalty2[numActive2[j]], 'M', 'X',
+                     'Y', &newRowInsertY[j], tracebackPtr++);
+         }
+
+         // the rows just filled become the previous rows
+         std::swap(oldRowMatch, newRowMatch);
+         std::swap(oldRowInsertX, newRowInsertX);
+         std::swap(oldRowInsertY, newRowInsertY);
+     }
+
+     // pick the best final state; `total` receives the score but only the
+     // chosen state is used below
+     float total;
+     char matrix;
+     ChooseBestOfThree(oldRowMatch[seq2Length], oldRowInsertX[seq2Length],
+             oldRowInsertY[seq2Length], 'M', 'X', 'Y', &total, &matrix);
+
+     // compute traceback from the bottom-right corner back to (0, 0)
+     SafeVector<char> *alignment = new SafeVector<char>;
+     int r = seq1Length, c = seq2Length;
+     while (r != 0 || c != 0) {
+
+         // read the predecessor state before moving
+         const int offset = (matrix == 'M') ? 0 : (matrix == 'X') ? 1 : 2;
+         const char ch = tracebackMatrix[(r * rowWidth + c) * 3 + offset];
+         switch (matrix) {
+         case 'Y':
+             c--;
+             alignment->push_back('Y');
+             break;
+         case 'X':
+             r--;
+             alignment->push_back('X');
+             break;
+         case 'M':
+             c--;
+             r--;
+             alignment->push_back('B');
+             break;
+         default:
+             assert(false);
+         }
+         matrix = ch;
+     }
+
+     // moves were collected end-to-start
+     reverse(alignment->begin(), alignment->end());
+
+     return make_pair(alignment, 1.0f);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // ProbabilisticModel::ComputeViterbiAlignment()
+ //
+ // Computes the highest probability pairwise alignment using the
+ // probabilistic model. The final alignment is returned as a
+ // pair consisting of:
+ // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and Y's
+ // denote insertions in one of the two sequences and
+ // B's denote that both sequences are present (i.e.
+ // matches).
+ // (2) a float containing the log probability of the best
+ // alignment (not used)
+ /////////////////////////////////////////////////////////////////
+
+ // Runs the Viterbi recursion over all states and cells, then traces the
+ // best path back from the final cell.
+ //
+ // seq1, seq2  sequences to align (must be non-null)
+ //
+ // Returns (alignment string of 'X' / 'Y' / 'B', log score of the best
+ // path). The alignment vector is heap-allocated and owned by the caller.
+ pair<SafeVector<char> *, float> ComputeViterbiAlignment(Sequence *seq1,
+         Sequence *seq2) const {
+
+     assert(seq1);
+     assert(seq2);
+
+     const int seq1Length = seq1->GetLength();
+     const int seq2Length = seq2->GetLength();
+
+     // retrieve the pointers to the beginning of each sequence
+     SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
+     SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
+
+     // Both tables interleave the states of a cell:
+     // index = state + NumMatrixTypes * (i * (seq2Length + 1) + j)
+     const int rowStride = NumMatrixTypes * (seq2Length + 1);
+     const int tableSize = NumMatrixTypes * (seq1Length + 1)
+             * (seq2Length + 1);
+
+     // scores start at LOG_ZERO, predecessor states at -1 (none)
+     VF viterbi(tableSize, LOG_ZERO);
+     VI traceback(tableSize, -1);
+
+     // initialization condition: cell (0, 0) holds the initial distribution
+     for (int k = 0; k < NumMatrixTypes; k++)
+         viterbi[k] = initialDistribution[k];
+
+     // compute viterbi scores
+     for (int i = 0; i <= seq1Length; i++) {
+         const unsigned char c1 =
+                 (i == 0) ? '~' : (unsigned char) iter1[i];
+
+         for (int j = 0; j <= seq2Length; j++) {
+             const unsigned char c2 =
+                     (j == 0) ? '~' : (unsigned char) iter2[j];
+
+             // offsets of this cell and its neighbours; the neighbour
+             // offsets are only dereferenced behind the i > 0 / j > 0 guards
+             const int here = rowStride * i + NumMatrixTypes * j;
+             const int up = here - rowStride;
+             const int left = here - NumMatrixTypes;
+             const int diag = up - NumMatrixTypes;
+
+             // match state: best predecessor among all states diagonally
+             if (i > 0 && j > 0) {
+                 for (int k = 0; k < NumMatrixTypes; k++) {
+                     const float candidate = viterbi[k + diag]
+                             + transProb[k][0] + matchProb[c1][c2];
+                     if (viterbi[here] < candidate) {
+                         viterbi[here] = candidate;
+                         traceback[here] = k;
+                     }
+                 }
+             }
+
+             // insert states consuming seq1: reached from the match state
+             // or from the same insert state in the cell above
+             if (i > 0) {
+                 for (int k = 0; k < NumInsertStates; k++) {
+                     const int xState = 2 * k + 1;
+                     const float viaMatch = insProb[c1][k] + viterbi[0 + up]
+                             + transProb[0][xState];
+                     const float viaGap = insProb[c1][k]
+                             + viterbi[xState + up]
+                             + transProb[xState][xState];
+                     // ties go to the match state
+                     const bool fromMatch = (viaMatch >= viaGap);
+                     viterbi[xState + here] = fromMatch ? viaMatch : viaGap;
+                     traceback[xState + here] = fromMatch ? 0 : xState;
+                 }
+             }
+
+             // insert states consuming seq2: same, from the cell to the left
+             if (j > 0) {
+                 for (int k = 0; k < NumInsertStates; k++) {
+                     const int yState = 2 * k + 2;
+                     const float viaMatch = insProb[c2][k]
+                             + viterbi[0 + left] + transProb[0][yState];
+                     const float viaGap = insProb[c2][k]
+                             + viterbi[yState + left]
+                             + transProb[yState][yState];
+                     const bool fromMatch = (viaMatch >= viaGap);
+                     viterbi[yState + here] = fromMatch ? viaMatch : viaGap;
+                     traceback[yState + here] = fromMatch ? 0 : yState;
+                 }
+             }
+         }
+     }
+
+     // figure out best terminating cell
+     const int lastCell = NumMatrixTypes
+             * ((seq1Length + 1) * (seq2Length + 1) - 1);
+     float bestProb = LOG_ZERO;
+     int state = -1;
+     for (int k = 0; k < NumMatrixTypes; k++) {
+         const float thisProb = viterbi[k + lastCell]
+                 + initialDistribution[k];
+         if (bestProb < thisProb) {
+             bestProb = thisProb;
+             state = k;
+         }
+     }
+     assert(state != -1);
+
+     // compute traceback from the bottom-right corner back to (0, 0)
+     SafeVector<char> *alignment = new SafeVector<char>;
+     assert(alignment);
+     int r = seq1Length, c = seq2Length;
+     while (r != 0 || c != 0) {
+         // read the predecessor state before moving
+         const int prevState = traceback[state
+                 + NumMatrixTypes * (r * (seq2Length + 1) + c)];
+
+         if (state == 0) {
+             // match: consumes one residue of each sequence
+             c--;
+             r--;
+             alignment->push_back('B');
+         } else if (state % 2 == 1) {
+             // odd states consume seq1 only
+             r--;
+             alignment->push_back('X');
+         } else {
+             // even non-zero states consume seq2 only
+             c--;
+             alignment->push_back('Y');
+         }
+
+         state = prevState;
+     }
+
+     // moves were collected end-to-start
+     reverse(alignment->begin(), alignment->end());
+
+     return make_pair(alignment, bestProb);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // ProbabilisticModel::BuildPosterior()
+ //
+ // Builds a posterior probability matrix needed to align a pair
+ // of alignments. Mathematically, the returned matrix M is
+ // defined as follows:
+ // M[i,j] = sum sum f(s,t,i,j)
+ // s in align1 t in align2
+ // where
+ // [ P(s[i'] <--> t[j'])
+ // [ if s[i'] is a letter in the ith column of align1 and
+ // [ t[j'] is a letter in the jth column of align2
+ // f(s,t,i,j) = [
+ // [ 0 otherwise
+ //
+ /////////////////////////////////////////////////////////////////
+
+ // Sums the pairwise sparse posteriors of every (s in align1, t in align2)
+ // pair into one dense matrix over the columns of the two alignments.
+ //
+ // align1, align2  the two alignments; column counts are taken from
+ //                 sequence 0 of each
+ // sparseMatrices  pairwise matrices, looked up with the smaller sequence
+ //                 label first
+ // cutoff          value subtracted from mapped cells per matrix row
+ //
+ // Returns a heap-allocated (seq1Length + 1) x (seq2Length + 1) row-major
+ // matrix; the caller owns it.
+ VF *BuildPosterior(MultiSequence *align1, MultiSequence *align2,
+         const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+         float cutoff = 0.0f) const {
+     const int seq1Length = align1->GetSequence(0)->GetLength();
+     const int seq2Length = align2->GetSequence(0)->GetLength();
+     const int rowStride = seq2Length + 1;
+
+     VF *result = new VF((seq1Length + 1) * (seq2Length + 1), 0);
+     assert(result);
+     VF &dense = *result;
+
+     // for each s in align1
+     for (int s = 0; s < align1->GetNumSequences(); s++) {
+         const int label1 = align1->GetSequence(s)->GetLabel();
+         // maps positions of s to alignment columns; freed below
+         SafeVector<int> *mapping1 = align1->GetSequence(s)->GetMapping();
+         SafeVector<int> &cols1 = *mapping1;
+
+         // for each t in align2
+         for (int t = 0; t < align2->GetNumSequences(); t++) {
+             const int label2 = align2->GetSequence(t)->GetLabel();
+             SafeVector<int> *mapping2 =
+                     align2->GetSequence(t)->GetMapping();
+             SafeVector<int> &cols2 = *mapping2;
+
+             if (label1 < label2) {
+
+                 // matrix rows follow s, matrix columns follow t
+                 SparseMatrix *sparse = sparseMatrices[label1][label2];
+
+                 for (int pos1 = 1; pos1 <= sparse->GetSeq1Length(); pos1++) {
+                     SafeVector<PIF>::iterator entries = sparse->GetRowPtr(
+                             pos1);
+                     const int rowBase = cols1[pos1] * rowStride;
+                     const int numEntries = sparse->GetRowSize(pos1);
+
+                     // add in all relevant values
+                     for (int e = 0; e < numEntries; e++)
+                         dense[rowBase + cols2[entries[e].first]] +=
+                                 entries[e].second;
+
+                     // subtract cutoff
+                     // NOTE(review): this loop runs over 0 .. length-1
+                     // while positions above start at 1 -- confirm intent.
+                     for (int pos2 = 0; pos2 < sparse->GetSeq2Length();
+                             pos2++)
+                         dense[rowBase + cols2[pos2]] -= cutoff;
+                 }
+
+             } else {
+
+                 // stored the other way round: rows follow t, columns s
+                 SparseMatrix *sparse = sparseMatrices[label2][label1];
+
+                 for (int pos2 = 1; pos2 <= sparse->GetSeq1Length(); pos2++) {
+                     SafeVector<PIF>::iterator entries = sparse->GetRowPtr(
+                             pos2);
+                     const int colBase = cols2[pos2];
+                     const int numEntries = sparse->GetRowSize(pos2);
+
+                     // add in all relevant values
+                     for (int e = 0; e < numEntries; e++)
+                         dense[colBase
+                                 + cols1[entries[e].first] * (seq2Length + 1)] +=
+                                 entries[e].second;
+
+                     // subtract cutoff
+                     for (int pos1 = 0; pos1 < sparse->GetSeq2Length();
+                             pos1++)
+                         dense[colBase + cols1[pos1] * (seq2Length + 1)] -=
+                                 cutoff;
+                 }
+
+             }
+
+             delete mapping2;
+         }
+
+         delete mapping1;
+     }
+
+     return result;
+ }
+ //added by Liu Yongchao.Feb 23, 2010
+ // Weighted variant: every (s, t) pair contributes its sparse posterior
+ // scaled by weight(s) * weight(t) / (sum of those products over all pairs).
+ //
+ // seqsWeights     integer weights indexed by sequence label
+ // align1, align2  the two alignments; column counts are taken from
+ //                 sequence 0 of each
+ // sparseMatrices  pairwise matrices, looked up with the smaller sequence
+ //                 label first
+ // cutoff          value (scaled by the pair weight) subtracted from mapped
+ //                 cells per matrix row
+ //
+ // Returns a heap-allocated (seq1Length + 1) x (seq2Length + 1) row-major
+ // matrix; the caller owns it.
+ VF *BuildPosterior(int* seqsWeights, MultiSequence *align1,
+         MultiSequence *align2,
+         const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+         float cutoff = 0.0f) const {
+     const int seq1Length = align1->GetSequence(0)->GetLength();
+     const int seq2Length = align2->GetSequence(0)->GetLength();
+     const int rowStride = seq2Length + 1;
+
+     VF *result = new VF((seq1Length + 1) * (seq2Length + 1), 0);
+     assert(result);
+     VF &dense = *result;
+
+     // compute the total sum of all weights
+     float totalWeights = 0;
+     for (int s = 0; s < align1->GetNumSequences(); s++) {
+         const int weight1 = seqsWeights[align1->GetSequence(s)->GetLabel()];
+         for (int t = 0; t < align2->GetNumSequences(); t++) {
+             const int weight2 =
+                     seqsWeights[align2->GetSequence(t)->GetLabel()];
+             totalWeights += weight1 * weight2;
+         }
+     }
+
+     // for each s in align1
+     for (int s = 0; s < align1->GetNumSequences(); s++) {
+         const int label1 = align1->GetSequence(s)->GetLabel();
+         const int weight1 = seqsWeights[label1];
+         // maps positions of s to alignment columns; freed below
+         SafeVector<int> *mapping1 = align1->GetSequence(s)->GetMapping();
+         SafeVector<int> &cols1 = *mapping1;
+
+         // for each t in align2
+         for (int t = 0; t < align2->GetNumSequences(); t++) {
+             const int label2 = align2->GetSequence(t)->GetLabel();
+             const int weight2 = seqsWeights[label2];
+             SafeVector<int> *mapping2 =
+                     align2->GetSequence(t)->GetMapping();
+             SafeVector<int> &cols2 = *mapping2;
+
+             // normalised weight of this sequence pair
+             const float pairWeight = (float) (weight1 * weight2)
+                     / totalWeights;
+
+             if (label1 < label2) {
+
+                 // matrix rows follow s, matrix columns follow t
+                 SparseMatrix *sparse = sparseMatrices[label1][label2];
+
+                 for (int pos1 = 1; pos1 <= sparse->GetSeq1Length(); pos1++) {
+                     SafeVector<PIF>::iterator entries = sparse->GetRowPtr(
+                             pos1);
+                     const int rowBase = cols1[pos1] * rowStride;
+                     const int numEntries = sparse->GetRowSize(pos1);
+
+                     // add in all relevant values
+                     for (int e = 0; e < numEntries; e++)
+                         dense[rowBase + cols2[entries[e].first]] +=
+                                 pairWeight * entries[e].second;
+
+                     // subtract cutoff
+                     // NOTE(review): this loop runs over 0 .. length-1
+                     // while positions above start at 1 -- confirm intent.
+                     for (int pos2 = 0; pos2 < sparse->GetSeq2Length();
+                             pos2++)
+                         dense[rowBase + cols2[pos2]] -= pairWeight * cutoff;
+                 }
+
+             } else {
+
+                 // stored the other way round: rows follow t, columns s
+                 SparseMatrix *sparse = sparseMatrices[label2][label1];
+
+                 for (int pos2 = 1; pos2 <= sparse->GetSeq1Length(); pos2++) {
+                     SafeVector<PIF>::iterator entries = sparse->GetRowPtr(
+                             pos2);
+                     const int colBase = cols2[pos2];
+                     const int numEntries = sparse->GetRowSize(pos2);
+
+                     // add in all relevant values
+                     for (int e = 0; e < numEntries; e++)
+                         dense[colBase
+                                 + cols1[entries[e].first] * (seq2Length + 1)] +=
+                                 pairWeight * entries[e].second;
+
+                     // subtract cutoff
+                     for (int pos1 = 0; pos1 < sparse->GetSeq2Length();
+                             pos1++)
+                         dense[colBase + cols1[pos1] * (seq2Length + 1)] -=
+                                 pairWeight * cutoff;
+                 }
+
+             }
+
+             delete mapping2;
+         }
+
+         delete mapping1;
+     }
+
+     return result;
+ }
+};
+
+#endif
--- /dev/null
+/////////////////////////////////////////////////////////////////
+// SafeVector.h
+//
+// STL vector with array bounds checking. To enable bounds
+// checking, #define ENABLE_CHECKS.
+/////////////////////////////////////////////////////////////////
+
+#ifndef SAFEVECTOR_H
+#define SAFEVECTOR_H
+
+#include <cassert>
+#include <vector>
+using namespace std;
+
+/////////////////////////////////////////////////////////////////
+// SafeVector
+//
+// Class derived from the STL std::vector for bounds checking.
+/////////////////////////////////////////////////////////////////
+
+// std::vector with optional bounds-checked operator[].
+//
+// Without ENABLE_CHECKS this is a plain std::vector<TYPE> with forwarding
+// constructors. With ENABLE_CHECKS, operator[] takes an int index and
+// asserts that it lies in [0, size()).
+//
+// std::vector has no virtual destructor, so a SafeVector must not be
+// deleted through a pointer to std::vector.
+template<class TYPE>
+class SafeVector: public std::vector<TYPE> {
+public:
+
+    // miscellaneous constructors, each forwarding to std::vector
+    SafeVector() :
+            std::vector<TYPE>() {
+    }
+    SafeVector(size_t size) :
+            std::vector<TYPE>(size) {
+    }
+    SafeVector(size_t size, const TYPE &value) :
+            std::vector<TYPE>(size, value) {
+    }
+    SafeVector(const SafeVector &source) :
+            std::vector<TYPE>(source) {
+    }
+
+#ifdef ENABLE_CHECKS
+
+    // [] array bounds checking
+    //
+    // size() lives in a dependent base class, so it must be reached through
+    // this->; an unqualified size() is not found during template name
+    // lookup and fails to compile.
+    TYPE &operator[](int index) {
+        assert(index >= 0 && index < static_cast<int>(this->size()));
+        return std::vector<TYPE>::operator[](static_cast<size_t>(index));
+    }
+
+    // [] const array bounds checking
+    const TYPE &operator[](int index) const {
+        assert(index >= 0 && index < static_cast<int>(this->size()));
+        return std::vector<TYPE>::operator[](static_cast<size_t>(index));
+    }
+
+#endif
+
+};
+
+// some commonly used vector types
+typedef SafeVector<int> VI; // vector of int
+typedef SafeVector<VI> VVI; // vector of VI (two levels of int)
+typedef SafeVector<VVI> VVVI; // vector of VVI (three levels of int)
+typedef SafeVector<float> VF; // vector of float
+typedef SafeVector<VF> VVF; // vector of VF (two levels of float)
+typedef SafeVector<VVF> VVVF; // vector of VVF (three levels of float)
+
+#endif
--- /dev/null
+/////////////////////////////////////////////////////////////////
+// ScoreType.h
+//
+// Routines for doing math operations in MSAPROBS
+/////////////////////////////////////////////////////////////////
+
+#ifndef SCORETYPE_H
+#define SCORETYPE_H
+
+#include <cmath>
+#include <algorithm>
+#include <cfloat>
+#include <assert.h>
+
+typedef float ScoreType; // scalar type taken and returned by LOG() and EXP()
+
+const float LOG_ZERO = -2e20; // large negative finite value; by its name, the stand-in for log(0)
+const float LOG_ONE = 0.0; // log(1)
+
+/////////////////////////////////////////////////////////////////
+// LOG()
+//
+// Compute the logarithm of x.
+/////////////////////////////////////////////////////////////////
+
+inline ScoreType LOG(ScoreType x) {
+    // thin wrapper around the math library's logarithm
+    const ScoreType logarithm = log(x);
+    return logarithm;
+}
+
+/////////////////////////////////////////////////////////////////
+// EXP()
+//
+// Computes exp(x).
+/////////////////////////////////////////////////////////////////
+
+inline ScoreType EXP(ScoreType x) {
+    // Positive arguments use the library exp(). Non-positive arguments are
+    // handled by one quartic polynomial per interval, evaluated in Horner
+    // form. Anything not greater than -16 yields 0.
+
+    if (x > 0)
+        return exp(x);
+
+    // (-0.5, 0]
+    if (x > -0.5)
+        return (((0.03254409303190190000 * x + 0.16280432765779600000) * x
+                + 0.49929760485974900000) * x + 0.99995149601363700000) * x
+                + 0.99999925508501600000;
+
+    // (-1, -0.5]
+    if (x > -1)
+        return (((0.01973899026052090000 * x + 0.13822379685007000000) * x
+                + 0.48056651562365000000) * x + 0.99326940370383500000) * x
+                + 0.99906756856399500000;
+
+    // (-2, -1]
+    if (x > -2)
+        return (((0.00940528203591384000 * x + 0.09414963667859410000) * x
+                + 0.40825793595877300000) * x + 0.93933625499130400000) * x
+                + 0.98369508190545300000;
+
+    // (-4, -2]
+    if (x > -4)
+        return (((0.00217245711583303000 * x + 0.03484829428350620000) * x
+                + 0.22118199801337800000) * x + 0.67049462206469500000) * x
+                + 0.83556950223398500000;
+
+    // (-8, -4]
+    if (x > -8)
+        return (((0.00012398771025456900 * x + 0.00349155785951272000) * x
+                + 0.03727721426017900000) * x + 0.17974997741536900000) * x
+                + 0.33249299994217400000;
+
+    // (-16, -8]
+    if (x > -16)
+        return (((0.00000051741713416603 * x + 0.00002721456879608080) * x
+                + 0.00053418601865636800) * x + 0.00464101989351936000) * x
+                + 0.01507447981459420000;
+
+    return 0;
+}
+
+/*
+ /////////////////////////////////////////////////////////////////
+ // LOOKUP()
+ //
+ // Computes log (exp (x) + 1), for 0 <= x <= 7.5.
+ /////////////////////////////////////////////////////////////////
+
+ inline ScoreType LOOKUP (ScoreType x){
+ //return log (exp(x) + 1);
+ if (x < 2){
+ if (x < 0.5){
+ if (x < 0)
+ return log (exp(x) + 1);
+ return (((-0.00486373205785640000*x - 0.00020245408813934800)*x + 0.12504222666029800000)*x + 0.49999685320563000000)*x + 0.69314723138948900000;
+ }
+ if (x < 1)
+ return (((-0.00278634205460548000*x - 0.00458097251248546000)*x + 0.12865849880472500000)*x + 0.49862228499205200000)*x + 0.69334810088688000000;
+ return (((0.00059633755154209200*x - 0.01918996666063320000)*x + 0.15288232492093800000)*x + 0.48039958825756900000)*x + 0.69857578503189200000;
+ }
+ if (x < 8){
+ if (x < 4)
+ return (((0.00135958539181047000*x - 0.02329807659316430000)*x + 0.15885799609532100000)*x + 0.48167498563270800000)*x + 0.69276185058669200000;
+ return (((0.00011992394456683500*x - 0.00338464503306568000)*x + 0.03622746366545470000)*x + 0.82481250248383700000)*x + 0.32507892994863100000;
+ }
+ if (x < 16)
+ return (((0.00000051726300753785*x - 0.00002720671238876090)*x + 0.00053403733818413500)*x + 0.99536021775747900000)*x + 0.01507065715532010000;
+ return x;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // LOOKUP_SLOW()
+ //
+ // Computes log (exp (x) + 1).
+ /////////////////////////////////////////////////////////////////
+
+ inline ScoreType LOOKUP_SLOW (ScoreType x){
+ return log (exp (x) + 1);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // MAX()
+ //
+ // Compute max of three numbers
+ /////////////////////////////////////////////////////////////////
+
+ inline ScoreType MAX (ScoreType x, ScoreType y, ScoreType z){
+ if (x >= y){
+ if (x >= z)
+ return x;
+ return z;
+ }
+ if (y >= z)
+ return y;
+ return z;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // LOG_PLUS_EQUALS()
+ //
+ // Add two log probabilities and store in the first argument
+ /////////////////////////////////////////////////////////////////
+
+ inline void LOG_PLUS_EQUALS (ScoreType &x, ScoreType y){
+ if (x < y)
+ x = (x <= LOG_ZERO) ? y : LOOKUP(y-x) + x;
+ else
+ x = (y <= LOG_ZERO) ? x : LOOKUP(x-y) + y;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // LOG_PLUS_EQUALS_SLOW()
+ //
+ // Add two log probabilities and store in the first argument
+ /////////////////////////////////////////////////////////////////
+
+ inline void LOG_PLUS_EQUALS_SLOW (ScoreType &x, ScoreType y){
+ if (x < y)
+ x = (x <= LOG_ZERO) ? y : LOOKUP_SLOW(y-x) + x;
+ else
+ x = (y <= LOG_ZERO) ? x : LOOKUP_SLOW(x-y) + y;
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // LOG_ADD()
+ //
+ // Add two log probabilities
+ /////////////////////////////////////////////////////////////////
+
+ inline ScoreType LOG_ADD (ScoreType x, ScoreType y){
+ if (x < y) return (x <= LOG_ZERO) ? y : LOOKUP(y-x) + x;
+ return (y <= LOG_ZERO) ? x : LOOKUP(x-y) + y;
+ }
+ */
+
+/*
+ /////////////////////////////////////////////////////////////////
+ // LOG()
+ //
+ // Compute the logarithm of x.
+ /////////////////////////////////////////////////////////////////
+
+ inline float LOG (float x){
+ return log (x);
+ }
+
+ /////////////////////////////////////////////////////////////////
+ // EXP()
+ //
+ // Computes exp(x), for -4.6 <= x <= 0.
+ /////////////////////////////////////////////////////////////////
+
+ inline float EXP (float x){
+ assert (x <= 0.00f);
+ if (x < EXP_UNDERFLOW_THRESHOLD) return 0.0f;
+ return (((0.006349841068584 * x + 0.080775412572352) * x + 0.397982026296272) * x + 0.95279335963787f) * x + 0.995176455837312f;
+ //return (((0.00681169825657f * x + 0.08386267698832f) * x + 0.40413983195844f) * x + 0.95656674979767f) * x + 0.99556744049130f;
+ }
+ */
+
+const float EXP_UNDERFLOW_THRESHOLD = -4.6; // lower cutoff of the retired EXP() kept in the comment block above; no live code in this header reads it
+const float LOG_UNDERFLOW_THRESHOLD = 7.5; // gap between two log values at or beyond which LOG_ADD() returns the larger one unchanged; also the upper bound asserted by LOOKUP()
+
+/////////////////////////////////////////////////////////////////
+// LOOKUP()
+//
+// Approximates log (exp (x) + 1) for 0 <= x <= 7.5 with one of
+// four cubic polynomials, selected by the interval that contains
+// x. The range is only enforced by assert().
+/////////////////////////////////////////////////////////////////
+
+inline float LOOKUP(float x) {
+    assert(x >= 0.00f);
+    assert(x <= LOG_UNDERFLOW_THRESHOLD);
+
+    // One row per interval, highest-degree coefficient first.
+    static const float kCubic[4][4] = {
+        { -0.009350833524763f, 0.130659527668286f,
+          0.498799810682272f, 0.693203116424741f },   // x <= 1
+        { -0.014532321752540f, 0.139942324101744f,
+          0.495635523139337f, 0.692140569840976f },   // 1 < x <= 2.5
+        { -0.004605031767994f, 0.063427417320019f,
+          0.695956496475118f, 0.514272634594009f },   // 2.5 < x <= 4.5
+        { -0.000458661602210f, 0.009695946122598f,
+          0.930734667215156f, 0.168037164329057f }    // everything else
+    };
+
+    int piece = 3;
+    if (x <= 1.00f)
+        piece = 0;
+    else if (x <= 2.50f)
+        piece = 1;
+    else if (x <= 4.50f)
+        piece = 2;
+
+    // Horner evaluation in single precision.
+    const float *k = kCubic[piece];
+    return ((k[0] * x + k[1]) * x + k[2]) * x + k[3];
+}
+
+/////////////////////////////////////////////////////////////////
+// LOOKUP_SLOW()
+//
+// Computes log (exp (x) + 1) with the library functions.
+//
+// For x above 88 the result is x itself. Single-precision exp()
+// overflows to +infinity shortly past that point, which would
+// turn the result into +infinity, while the "+ 1" term is
+// already far below float resolution there.
+/////////////////////////////////////////////////////////////////
+
+inline float LOOKUP_SLOW(float x) {
+    // keep exp() from overflowing; log (exp (x) + 1) rounds to x in this range
+    if (x > 88.0f)
+        return x;
+    return log(exp(x) + 1);
+}
+
+/////////////////////////////////////////////////////////////////
+// MAX()
+//
+// Returns the largest of x, y and z. When values compare equal
+// the earlier argument is the one returned.
+/////////////////////////////////////////////////////////////////
+
+inline float MAX(float x, float y, float z) {
+    // winner of the first pair, then compared against z
+    const float firstTwo = (x >= y) ? x : y;
+    return (firstTwo >= z) ? firstTwo : z;
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_PLUS_EQUALS()
+//
+// Replaces x with log (exp (x) + exp (y)), using the LOOKUP()
+// approximation. The larger operand is kept unchanged when the
+// smaller one equals LOG_ZERO or lies LOG_UNDERFLOW_THRESHOLD or
+// more below it.
+/////////////////////////////////////////////////////////////////
+
+inline void LOG_PLUS_EQUALS(float &x, float y) {
+    // order the operands; unless x < y holds, x is treated as the larger one
+    const bool xIsSmaller = x < y;
+    const float larger = xIsSmaller ? y : x;
+    const float smaller = xIsSmaller ? x : y;
+
+    if (smaller == LOG_ZERO || larger - smaller >= LOG_UNDERFLOW_THRESHOLD)
+        x = larger; // the smaller term contributes nothing
+    else
+        x = LOOKUP(larger - smaller) + smaller;
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_PLUS_EQUALS_SLOW()
+//
+// Replaces x with log (exp (x) + exp (y)), using LOOKUP_SLOW().
+// An operand equal to LOG_ZERO is ignored.
+/////////////////////////////////////////////////////////////////
+
+inline void LOG_PLUS_EQUALS_SLOW(float &x, float y) {
+    if (x < y) {
+        if (x == LOG_ZERO)
+            x = y;
+        else
+            x = LOOKUP_SLOW(y - x) + x;
+        return;
+    }
+
+    // x is the larger (or equal) operand from here on
+    if (y == LOG_ZERO)
+        return; // x already holds the sum
+    x = LOOKUP_SLOW(x - y) + y;
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_ADD()
+//
+// Returns log (exp (x) + exp (y)), using the LOOKUP()
+// approximation. The larger operand is returned unchanged when
+// the smaller one equals LOG_ZERO or lies
+// LOG_UNDERFLOW_THRESHOLD or more below it.
+/////////////////////////////////////////////////////////////////
+
+inline float LOG_ADD(float x, float y) {
+    // unless x < y holds, x is treated as the larger operand
+    float larger = x;
+    float smaller = y;
+    if (x < y) {
+        larger = y;
+        smaller = x;
+    }
+
+    if (smaller == LOG_ZERO)
+        return larger;
+
+    const float gap = larger - smaller;
+    if (gap >= LOG_UNDERFLOW_THRESHOLD)
+        return larger;
+    return LOOKUP(gap) + smaller;
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_ADD()
+//
+// Add three log probabilities, combining x2 and x3 first.
+/////////////////////////////////////////////////////////////////
+
+inline float LOG_ADD(float x1, float x2, float x3) {
+    const float tail = LOG_ADD(x2, x3);
+    return LOG_ADD(x1, tail);
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_ADD()
+//
+// Add four log probabilities, combining them pairwise from the
+// last argument towards the first.
+/////////////////////////////////////////////////////////////////
+
+inline float LOG_ADD(float x1, float x2, float x3, float x4) {
+    const float terms[4] = { x1, x2, x3, x4 };
+    float sum = terms[3];
+    for (int i = 2; i >= 0; --i)
+        sum = LOG_ADD(terms[i], sum); // right-to-left, like LOG_ADD(x1, LOG_ADD(x2, ...))
+    return sum;
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_ADD()
+//
+// Add five log probabilities, combining them pairwise from the
+// last argument towards the first.
+/////////////////////////////////////////////////////////////////
+
+inline float LOG_ADD(float x1, float x2, float x3, float x4, float x5) {
+    const float terms[5] = { x1, x2, x3, x4, x5 };
+    float sum = terms[4];
+    for (int i = 3; i >= 0; --i)
+        sum = LOG_ADD(terms[i], sum); // right-to-left, like LOG_ADD(x1, LOG_ADD(x2, ...))
+    return sum;
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_ADD()
+//
+// Add six log probabilities, combining them pairwise from the
+// last argument towards the first.
+/////////////////////////////////////////////////////////////////
+
+inline float LOG_ADD(float x1, float x2, float x3, float x4, float x5,
+        float x6) {
+    const float terms[6] = { x1, x2, x3, x4, x5, x6 };
+    float sum = terms[5];
+    for (int i = 4; i >= 0; --i)
+        sum = LOG_ADD(terms[i], sum); // right-to-left, like LOG_ADD(x1, LOG_ADD(x2, ...))
+    return sum;
+}
+
+/////////////////////////////////////////////////////////////////
+// LOG_ADD()
+//
+// Add seven log probabilities, combining them pairwise from the
+// last argument towards the first.
+/////////////////////////////////////////////////////////////////
+
+inline float LOG_ADD(float x1, float x2, float x3, float x4, float x5, float x6,
+        float x7) {
+    const float terms[7] = { x1, x2, x3, x4, x5, x6, x7 };
+    float sum = terms[6];
+    for (int i = 5; i >= 0; --i)
+        sum = LOG_ADD(terms[i], sum); // right-to-left, like LOG_ADD(x1, LOG_ADD(x2, ...))
+    return sum;
+}
+
+/////////////////////////////////////////////////////////////////
+// ChooseBestOfThree()
+//
+// Writes the largest of x1, x2 and x3 to *x and the tag that
+// belongs to it (b1, b2 or b3) to *b. When values compare equal
+// the earlier argument wins.
+/////////////////////////////////////////////////////////////////
+
+inline void ChooseBestOfThree(float x1, float x2, float x3, char b1, char b2,
+        char b3, float *x, char *b) {
+    // start with the first candidate and let later ones displace it
+    float bestValue = x1;
+    char bestTag = b1;
+
+    if (!(x1 >= x2)) {
+        bestValue = x2;
+        bestTag = b2;
+    }
+    if (!(bestValue >= x3)) {
+        bestValue = x3;
+        bestTag = b3;
+    }
+
+    *x = bestValue;
+    *b = bestTag;
+}
+
+#endif
--- /dev/null
+/////////////////////////////////////////////////////////////////
+// Sequence.h
+//
+// Class for reading/manipulating single sequence character data.
+/////////////////////////////////////////////////////////////////
+
+#ifndef SEQUENCE_H
+#define SEQUENCE_H
+
+#include <string>
+#include <fstream>
+#include <iostream>
+#include <cctype>
+#include <cstdlib>
+#include "SafeVector.h"
+#include "FileBuffer.h"
+
+/////////////////////////////////////////////////////////////////
+// Sequence
+//
+// Class for storing sequence information: the header text and
+// the characters of one sequence. The characters live in a
+// heap-allocated SafeVector<char> that this object owns and
+// releases in its destructor.
+//
+// No copy constructor or assignment operator is declared, so a
+// compiler-generated copy would share `data` with its source;
+// use Clone() to duplicate a sequence.
+/////////////////////////////////////////////////////////////////
+
+class Sequence {
+
+    bool isValid; // a boolean indicating whether the sequence data is valid or not
+    string header; // header text; the FileBuffer constructor stores it without the leading '>' and surrounding white space
+    SafeVector<char> *data; // owned character data, one-based: (*data)[0] is always '@'
+    int length; // number of sequence characters (the '@' slot is not counted)
+    int sequenceLabel; // integer sequence label, typically to indicate the ordering of sequences
+                       // in a Multi-FASTA file
+    int inputLabel; // position of sequence in original input
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::Sequence()
+    //
+    // Default constructor. Creates an invalid, empty sequence; used
+    // by Clone(), GetRange() and AddGaps(), which fill in the fields.
+    /////////////////////////////////////////////////////////////////
+
+    Sequence() :
+            isValid(false), header(""), data(NULL), length(0), sequenceLabel(0),
+            inputLabel(0) {
+    }
+
+public:
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::Sequence()
+    //
+    // Constructor. Reads the sequence from a FileBuffer.
+    //
+    // The first non-blank line must start with '>'; otherwise the
+    // object stays invalid (see Fail()). Characters are then read up
+    // to the next '>' or the end of input. White space is skipped,
+    // '.' is turned into '-', and '-' is dropped when stripGaps is
+    // set. Letters are stored in upper case. Any other character,
+    // including a '-' that was not stripped, prints an error and
+    // terminates the program with exit(1). A record without any
+    // sequence characters leaves the object invalid.
+    /////////////////////////////////////////////////////////////////
+
+    Sequence(FileBuffer &infile, bool stripGaps = false) :
+            isValid(false), header("~"), data(NULL), length(0), sequenceLabel(0),
+            inputLabel(0) {
+
+        // read until the first non-blank line
+        while (!infile.eof()) {
+            infile.GetLine(header);
+            if (header.length() != 0)
+                break;
+        }
+
+        // check to make sure that it is a correct header line; the
+        // emptiness test keeps header[0] from being read on an empty string
+        if (header.empty() || header[0] != '>')
+            return;
+
+        // remove the leading ">"
+        header = header.substr(1);
+
+        // remove any leading or trailing white space in the header comment;
+        // the <cctype> functions are undefined for negative char values,
+        // hence the casts to unsigned char
+        while (header.length() > 0
+                && isspace(static_cast<unsigned char>(header[0])))
+            header = header.substr(1);
+        while (header.length() > 0
+                && isspace(static_cast<unsigned char>(header[header.length() - 1])))
+            header = header.substr(0, header.length() - 1);
+
+        // get ready to read the data[] array; note that data[0] is always '@'
+        char ch;
+        data = new SafeVector<char>;
+        assert(data);
+        data->push_back('@');
+
+        // get a character from the file
+        while (infile.Get(ch)) {
+
+            // if we've reached a new comment line, put the character back and stop
+            if (ch == '>') {
+                infile.UnGet();
+                break;
+            }
+
+            // skip whitespace
+            if (isspace(static_cast<unsigned char>(ch)))
+                continue;
+
+            // substitute gap character
+            if (ch == '.')
+                ch = '-';
+            if (stripGaps && ch == '-')
+                continue;
+
+            // check for known characters (letters only)
+            if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))) {
+                cerr << "ERROR: Unknown character encountered: " << ch
+                        << endl;
+                exit(1);
+            }
+
+            // change to upper case. fixed by Liu Yongchao, May 21, 2010
+            if (ch >= 'a' && ch <= 'z') {
+                ch = ch - 'a' + 'A';
+            }
+
+            // everything's ok so far, so just store this character.
+            data->push_back(ch);
+            ++length;
+        }
+
+        // sequence must contain data in order to be valid
+        isValid = length > 0;
+        if (!isValid) {
+            delete data;
+            data = NULL;
+        }
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::Sequence()
+    //
+    // Constructor. Builds a sequence from existing data and takes
+    // ownership of it (the destructor deletes it). Note that the
+    // data must use one-based indexing where data[0] should be set
+    // to '@'.
+    /////////////////////////////////////////////////////////////////
+
+    Sequence(SafeVector<char> *data, string header, int length,
+            int sequenceLabel, int inputLabel) :
+            isValid(data != NULL), header(header), data(data), length(length),
+            sequenceLabel(sequenceLabel), inputLabel(inputLabel) {
+        assert(data);
+        assert((*data)[0] == '@');
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::~Sequence()
+    //
+    // Destructor. Release allocated memory.
+    /////////////////////////////////////////////////////////////////
+
+    ~Sequence() {
+        if (data) {
+            assert(isValid);
+            delete data;
+            data = NULL;
+            isValid = false;
+        }
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::GetHeader()
+    //
+    // Return the string comment associated with this sequence.
+    /////////////////////////////////////////////////////////////////
+
+    string GetHeader() const {
+        return header;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::GetName()
+    //
+    // Return the first word of the string comment associated with
+    // this sequence, i.e. the characters up to the first white
+    // space. Returns an empty string when the header holds no word.
+    /////////////////////////////////////////////////////////////////
+
+    string GetName() const {
+        // Scan the string directly: a fixed-size buffer would overflow
+        // on a long first word and stay unset for an empty header.
+        const char *blanks = " \t\n\v\f\r";
+        const string::size_type first = header.find_first_not_of(blanks);
+        if (first == string::npos)
+            return string();
+        const string::size_type stop = header.find_first_of(blanks, first);
+        if (stop == string::npos)
+            return header.substr(first);
+        return header.substr(first, stop - first);
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::GetDataPtr()
+    //
+    // Return the iterator to data associated with this sequence.
+    // The iterator points at the '@' placeholder in slot 0.
+    /////////////////////////////////////////////////////////////////
+
+    SafeVector<char>::iterator GetDataPtr() {
+        assert(isValid);
+        assert(data);
+        return data->begin();
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::GetPosition()
+    //
+    // Return the character at position i. Recall that the character
+    // data is stored with one-based indexing.
+    /////////////////////////////////////////////////////////////////
+
+    char GetPosition(int i) const {
+        assert(isValid);
+        assert(data);
+        assert(i >= 1 && i <= length);
+        return (*data)[i];
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::SetLabel()
+    //
+    // Sets both the sequence (sorting) label and the input label to i.
+    /////////////////////////////////////////////////////////////////
+
+    void SetLabel(int i) {
+        assert(isValid);
+        sequenceLabel = i;
+        inputLabel = i;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::SetSortLabel()
+    //
+    // Sets the sequence sorting label to i.
+    /////////////////////////////////////////////////////////////////
+
+    void SetSortLabel(int i) {
+        assert(isValid);
+        sequenceLabel = i;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::GetLabel()
+    //
+    // Retrieves the input label.
+    /////////////////////////////////////////////////////////////////
+
+    int GetLabel() const {
+        assert(isValid);
+        return inputLabel;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::GetSortLabel()
+    //
+    // Retrieves the sorting label.
+    /////////////////////////////////////////////////////////////////
+
+    int GetSortLabel() const {
+        assert(isValid);
+        return sequenceLabel;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::Fail()
+    //
+    // Checks to see if the sequence successfully loaded. Returns
+    // true when it did NOT.
+    /////////////////////////////////////////////////////////////////
+
+    bool Fail() const {
+        return !isValid;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::GetLength()
+    //
+    // Returns the length of the sequence.
+    /////////////////////////////////////////////////////////////////
+
+    int GetLength() const {
+        assert(isValid);
+        assert(data);
+        return length;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::WriteMFA()
+    //
+    // Writes the sequence to outfile in MFA format. Uses numColumns
+    // columns per line (numColumns must be positive). If useIndex
+    // is set to false, then the header is printed as normal, but if
+    // useIndex is true, then ">S###" is printed where ### represents
+    // the sequence label.
+    /////////////////////////////////////////////////////////////////
+
+    void WriteMFA(ostream &outfile, int numColumns,
+            bool useIndex = false) const {
+        assert(isValid);
+        assert(data);
+        assert(!outfile.fail());
+        assert(numColumns > 0); // used as a modulus below
+
+        // print out heading
+        if (useIndex)
+            outfile << ">S" << GetLabel() << endl;
+        else
+            outfile << ">" << header << endl;
+
+        // print out character data
+        int ct = 1;
+        for (; ct <= length; ct++) {
+            outfile << (*data)[ct];
+            if (ct % numColumns == 0)
+                outfile << endl;
+        }
+        // terminate the last line unless the loop just did so
+        if ((ct - 1) % numColumns != 0)
+            outfile << endl;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::Clone()
+    //
+    // Returns a new deep copy of the sequence. The caller owns the
+    // returned object. Cloning an invalid sequence yields another
+    // invalid sequence without character data.
+    /////////////////////////////////////////////////////////////////
+
+    Sequence *Clone() const {
+        Sequence *ret = new Sequence();
+        assert(ret);
+
+        ret->isValid = isValid;
+        ret->header = header;
+        // an invalid sequence has data == NULL, so there is nothing to copy
+        if (data) {
+            ret->data = new SafeVector<char>;
+            assert(ret->data);
+            *(ret->data) = *data;
+        }
+        ret->length = length;
+        ret->sequenceLabel = sequenceLabel;
+        ret->inputLabel = inputLabel;
+
+        return ret;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::GetRange()
+    //
+    // Returns a new sequence object consisting of a range of
+    // characters from the current sequence. start and end are
+    // one-based and inclusive. The caller owns the returned object.
+    /////////////////////////////////////////////////////////////////
+
+    Sequence *GetRange(int start, int end) const {
+        assert(data);
+        assert(start >= 1 && start <= length);
+        assert(end >= 1 && end <= length);
+        assert(start <= end);
+
+        Sequence *ret = new Sequence();
+        assert(ret);
+
+        ret->isValid = isValid;
+        ret->header = header;
+        ret->data = new SafeVector<char>;
+        assert(ret->data);
+        ret->data->push_back('@');
+        for (int i = start; i <= end; i++)
+            ret->data->push_back((*data)[i]);
+        ret->length = end - start + 1;
+        ret->sequenceLabel = sequenceLabel;
+        ret->inputLabel = inputLabel;
+
+        return ret;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::AddGaps()
+    //
+    // Given a SafeVector<char> containing the skeleton for an
+    // alignment and the identity of the current character, this
+    // routine will create a new sequence with all necessary gaps added.
+    // For instance,
+    //    alignment = "XXXBBYYYBBYYXX"
+    //    id = 'X'
+    // will perform the transformation
+    //    "ATGCAGTCA" --> "ATGCC---GT--CA"
+    //                    (XXXBBYYYBBYYXX)
+    //
+    // Every 'B' or id in the skeleton consumes one character of this
+    // sequence, so the skeleton must not contain more of them than
+    // the sequence has characters. The caller owns the returned
+    // object.
+    /////////////////////////////////////////////////////////////////
+
+    Sequence *AddGaps(SafeVector<char> *alignment, char id) {
+        assert(alignment);
+        assert(data);
+
+        Sequence *ret = new Sequence();
+        assert(ret);
+
+        ret->isValid = isValid;
+        ret->header = header;
+        ret->data = new SafeVector<char>;
+        assert(ret->data);
+        ret->length = (int) alignment->size();
+        ret->sequenceLabel = sequenceLabel;
+        ret->inputLabel = inputLabel;
+        ret->data->push_back('@');
+
+        SafeVector<char>::iterator dataIter = data->begin() + 1;
+        for (SafeVector<char>::iterator iter = alignment->begin();
+                iter != alignment->end(); ++iter) {
+            if (*iter == 'B' || *iter == id) {
+                // the skeleton must not ask for more characters than exist
+                assert(dataIter != data->end());
+                ret->data->push_back(*dataIter);
+                ++dataIter;
+            } else
+                ret->data->push_back('-');
+        }
+
+        return ret;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::GetString()
+    //
+    // Returns the sequence as a string with gaps removed.
+    /////////////////////////////////////////////////////////////////
+
+    string GetString() {
+        string s = "";
+        for (int i = 1; i <= length; i++) {
+            if ((*data)[i] != '-')
+                s += (*data)[i];
+        }
+        return s;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::GetMapping()
+    //
+    // Returns a SafeVector<int> containing the indices of every
+    // character in the sequence. For instance, if the data is
+    // "ATGCC---GT--CA", the method returns {1,2,3,4,5,9,10,13,14}.
+    // The returned vector carries an extra 0 in slot 0, ahead of the
+    // indices. The caller owns the returned vector.
+    /////////////////////////////////////////////////////////////////
+
+    SafeVector<int> *GetMapping() const {
+        SafeVector<int> *ret = new SafeVector<int>(1, 0);
+        for (int i = 1; i <= length; i++) {
+            if ((*data)[i] != '-')
+                ret->push_back(i);
+        }
+        return ret;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // Sequence::Highlight()
+    //
+    // Changes all positions with score >= cutoff to upper case and
+    // all positions with score < cutoff to lower case. scores is
+    // zero-based: scores[i - 1] belongs to sequence position i.
+    /////////////////////////////////////////////////////////////////
+
+    void Highlight(const SafeVector<float> &scores, const float cutoff) {
+        for (int i = 1; i <= length; i++) {
+            // cast first: toupper/tolower are undefined for negative char values
+            const unsigned char current = static_cast<unsigned char>((*data)[i]);
+            if (scores[i - 1] >= cutoff)
+                (*data)[i] = static_cast<char>(toupper(current));
+            else
+                (*data)[i] = static_cast<char>(tolower(current));
+        }
+    }
+};
+
+#endif
--- /dev/null
+/////////////////////////////////////////////////////////////////
+// SparseMatrix.h
+//
+// Sparse matrix computations
+/////////////////////////////////////////////////////////////////
+
+#ifndef SPARSEMATRIX_H
+#define SPARSEMATRIX_H
+
+#include <iostream>
+
+using namespace std;
+
+const float POSTERIOR_CUTOFF = 0.01; // minimum posterior probability
+// value that is maintained in the
+// sparse matrix representation (smaller cells are skipped by the SparseMatrix constructor)
+
+typedef pair<int, float> PIF; // Sparse matrix entry type
+// first --> column (numbered from 1, as written by the SparseMatrix constructor)
+// second --> value (the posterior probability copied from the dense matrix)
+
+/////////////////////////////////////////////////////////////////
+// SparseMatrix
+//
+// Class for sparse matrix computations. Stores, row after row,
+// only those cells of a posterior matrix whose value is at least
+// POSTERIOR_CUTOFF. Rows and columns are numbered from 1.
+//
+// rowPtrs holds iterators into this object's own `data` and no
+// copy operations are defined, so a compiler-generated copy
+// would keep pointing into the storage of its source.
+/////////////////////////////////////////////////////////////////
+
+class SparseMatrix {
+
+    int seq1Length, seq2Length; // dimensions of matrix
+    VI rowSize; // rowSize[i] = # of cells in row i (slot 0 is set to -1)
+    SafeVector<PIF> data; // data values, stored row after row
+    SafeVector<SafeVector<PIF>::iterator> rowPtrs; // pointers to the beginning of each row (slot 0 is data.end())
+
+    /////////////////////////////////////////////////////////////////
+    // SparseMatrix::SparseMatrix()
+    //
+    // Private constructor. Creates an empty 0 x 0 matrix for
+    // ComputeTranspose() to fill in.
+    /////////////////////////////////////////////////////////////////
+
+    SparseMatrix() :
+            seq1Length(0), seq2Length(0) {
+    }
+
+public:
+
+    /////////////////////////////////////////////////////////////////
+    // SparseMatrix::SparseMatrix()
+    //
+    // Constructor. Builds a sparse matrix from a posterior matrix.
+    // Note that the expected format for the posterior matrix is as
+    // a (seq1Length+1) x (seq2Length+1) matrix, stored row by row,
+    // where the 0th row and 0th column are ignored (they should
+    // contain all zeroes).
+    /////////////////////////////////////////////////////////////////
+
+    SparseMatrix(int seq1Length, int seq2Length, const VF &posterior) :
+            seq1Length(seq1Length), seq2Length(seq2Length) {
+
+        int numCells = 0;
+
+        assert(seq1Length > 0);
+        assert(seq2Length > 0);
+        // both passes below walk (seq1Length+1) * (seq2Length+1) entries
+        assert((int) posterior.size() >= (seq1Length + 1) * (seq2Length + 1));
+
+        // calculate memory required; count the number of cells in the
+        // posterior matrix above the threshold
+        VF::const_iterator postPtr = posterior.begin();
+        for (int i = 0; i <= seq1Length; i++) {
+            for (int j = 0; j <= seq2Length; j++) {
+                if (*(postPtr++) >= POSTERIOR_CUTOFF) {
+                    assert(i != 0 && j != 0);
+                    numCells++;
+                }
+            }
+        }
+
+        // allocate memory
+        data.resize(numCells);
+        rowSize.resize(seq1Length + 1);
+        rowSize[0] = -1;
+        rowPtrs.resize(seq1Length + 1);
+        rowPtrs[0] = data.end();
+
+        // build sparse matrix
+        postPtr = posterior.begin() + seq2Length + 1; // note that we're skipping the first row here
+        SafeVector<PIF>::iterator dataPtr = data.begin();
+        for (int i = 1; i <= seq1Length; i++) {
+            postPtr++; // and skipping the first column of each row
+            rowPtrs[i] = dataPtr;
+            for (int j = 1; j <= seq2Length; j++) {
+                if (*postPtr >= POSTERIOR_CUTOFF) {
+                    dataPtr->first = j;
+                    dataPtr->second = *postPtr;
+                    dataPtr++;
+                }
+                postPtr++;
+            }
+            rowSize[i] = dataPtr - rowPtrs[i];
+        }
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // SparseMatrix::GetRowPtr()
+    //
+    // Returns the pointer to a particular row in the sparse matrix.
+    // GetRowSize(row) entries can be read starting from it.
+    /////////////////////////////////////////////////////////////////
+
+    SafeVector<PIF>::iterator GetRowPtr(int row) const {
+        assert(row >= 1 && row <= seq1Length);
+        return rowPtrs[row];
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // SparseMatrix::GetValue()
+    //
+    // Returns value at a particular row, column, found by a linear
+    // scan of the row. Cells that are not stored read as 0.
+    /////////////////////////////////////////////////////////////////
+
+    float GetValue(int row, int col) const {
+        assert(row >= 1 && row <= seq1Length);
+        assert(col >= 1 && col <= seq2Length);
+        for (int i = 0; i < rowSize[row]; i++) {
+            if (rowPtrs[row][i].first == col)
+                return rowPtrs[row][i].second;
+        }
+        return 0;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // SparseMatrix::GetRowSize()
+    //
+    // Returns the number of entries in a particular row.
+    /////////////////////////////////////////////////////////////////
+
+    int GetRowSize(int row) const {
+        assert(row >= 1 && row <= seq1Length);
+        return rowSize[row];
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // SparseMatrix::GetSeq1Length()
+    //
+    // Returns the first dimension of the matrix.
+    /////////////////////////////////////////////////////////////////
+
+    int GetSeq1Length() const {
+        return seq1Length;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // SparseMatrix::GetSeq2Length()
+    //
+    // Returns the second dimension of the matrix.
+    /////////////////////////////////////////////////////////////////
+
+    int GetSeq2Length() const {
+        return seq2Length;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // SparseMatrix::GetNumCells()
+    //
+    // Returns the total number of entries stored in the matrix.
+    /////////////////////////////////////////////////////////////////
+
+    int GetNumCells() const {
+        return data.size();
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // SparseMatrix::Print()
+    //
+    // Prints out a sparse matrix, one row per line, as a list of
+    // (column,value) pairs.
+    /////////////////////////////////////////////////////////////////
+
+    void Print(ostream &outfile) const {
+        outfile << "Sparse Matrix:" << endl;
+        for (int i = 1; i <= seq1Length; i++) {
+            outfile << " " << i << ":";
+            for (int j = 0; j < rowSize[i]; j++) {
+                outfile << " (" << rowPtrs[i][j].first << ","
+                        << rowPtrs[i][j].second << ")";
+            }
+            outfile << endl;
+        }
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // SparseMatrix::ComputeTranspose()
+    //
+    // Returns a new sparse matrix containing the transpose of the
+    // current matrix. The caller owns the returned object.
+    /////////////////////////////////////////////////////////////////
+
+    SparseMatrix *ComputeTranspose() const {
+
+        // create a new sparse matrix
+        SparseMatrix *ret = new SparseMatrix();
+        int numCells = data.size();
+
+        ret->seq1Length = seq2Length;
+        ret->seq2Length = seq1Length;
+
+        // allocate memory
+        ret->data.resize(numCells);
+        ret->rowSize.resize(seq2Length + 1);
+        ret->rowSize[0] = -1;
+        ret->rowPtrs.resize(seq2Length + 1);
+        ret->rowPtrs[0] = ret->data.end();
+
+        // compute row sizes: every stored column index becomes a row index
+        for (int i = 1; i <= seq2Length; i++)
+            ret->rowSize[i] = 0;
+        for (int i = 0; i < numCells; i++)
+            ret->rowSize[data[i].first]++;
+
+        // compute row ptrs: each row starts where the previous one ends
+        for (int i = 1; i <= seq2Length; i++) {
+            ret->rowPtrs[i] =
+                    (i == 1) ?
+                            ret->data.begin() :
+                            ret->rowPtrs[i - 1] + ret->rowSize[i - 1];
+        }
+
+        // now fill in data; currPtrs[c] is the next free slot of
+        // transposed row c
+        SafeVector<SafeVector<PIF>::iterator> currPtrs = ret->rowPtrs;
+
+        for (int i = 1; i <= seq1Length; i++) {
+            SafeVector<PIF>::iterator row = rowPtrs[i];
+            for (int j = 0; j < rowSize[i]; j++) {
+                currPtrs[row[j].first]->first = i;
+                currPtrs[row[j].first]->second = row[j].second;
+                currPtrs[row[j].first]++;
+            }
+        }
+
+        return ret;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // SparseMatrix::GetPosterior()
+    //
+    // Return the posterior representation of the sparse matrix: a
+    // new (seq1Length+1) x (seq2Length+1) matrix, stored row by row,
+    // with 0 in every cell that is not stored. The caller owns the
+    // returned vector.
+    /////////////////////////////////////////////////////////////////
+
+    VF *GetPosterior() const {
+
+        // create a new posterior matrix
+        VF *posteriorPtr = new VF((seq1Length + 1) * (seq2Length + 1));
+        assert(posteriorPtr);
+        VF &posterior = *posteriorPtr;
+
+        // build the posterior matrix
+        for (int i = 0; i < (seq1Length + 1) * (seq2Length + 1); i++)
+            posterior[i] = 0;
+        for (int i = 1; i <= seq1Length; i++) {
+            // start of dense row i, so the stored column index can be used directly
+            VF::iterator postPtr = posterior.begin() + i * (seq2Length + 1);
+            for (int j = 0; j < rowSize[i]; j++) {
+                postPtr[rowPtrs[i][j].first] = rowPtrs[i][j].second;
+            }
+        }
+
+        return posteriorPtr;
+    }
+
+};
+
+#endif
--- /dev/null
+/***********************************************
+ * # Copyright 2009-2010. Liu Yongchao
+ * # Contact: Liu Yongchao, School of Computer Engineering,
+ * # Nanyang Technological University.
+ * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
+ * #
+ * # GPL version 3.0 applies.
+ * #
+ * ************************************************/
+#include "MSA.h"
+
+int main(int argc, char** argv) {
+    // The MSA object receives the raw command line. Nothing else is
+    // invoked on it here, so the program's work takes place while the
+    // object is constructed and destroyed.
+    MSA aligner(argc, argv);
+
+    return 0;
+}
--- /dev/null
+\r
+MSAPROBS is a new and practical protein multiple sequence alignment\r
+algorithm based on pair hidden Markov models and partition function\r
+posterior probabilities. Assessed on BAliBASE 3.0, PREFAB 4.0,\r
+SABMARK 1.65, and OXBENCH, MSAProbs achieves statistically the \r
+highest alignment accuracy compared to ClustalW 2.0.10, MAFFT 6.717\r
+(using L-INS-i with --maxiterate = 1000), MUSCLE 3.8.31, ProbCons 1.12,\r
+and Probalign 1.3 (current version 0.9.3, March 17, 2010).\r
+\r
+\r
+To use this software, please cite the following paper:\r
+/******************************************************\r
+Yongchao Liu, Bertil Schmidt, Douglas L. Maskell:\r
+\r
+"MSAProbs: multiple sequence alignment based on \r
+pair hidden Markov models and partition function posterior probabilities",\r
+\r
+Bioinformatics 2010, 26(16): 1958-1964\r
+\r
+*******************************************************/\r
+\r
+This software was developed by Liu Yongchao, School of Computer Engineering,\r
+Nanyang Technological University. If you have any comments or problems, \r
+please contact Liu Yongchao directly using either of the following email \r
+addresses: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com.\r
+\r
+MSAPROBS is open-source software released under the GNU General Public \r
+License (GPL) version 3.0. MSAPROBS is distributed WITHOUT WARRANTY, express or\r
+implied. The authors accept NO LEGAL LIABILITY OR RESPONSIBILITY for\r
+loss due to reliance on the program.\r
+\r
+(1) Linux and Windows are supported, with a Makefile and a Visual Studio 2005\r
+project co-existing in the source code tarball.\r
+\r
+The Makefile for Linux can be found in the sub-directory MSAProbs.\r
+\r
+(2) The default compile options enable OpenMP support to fully utilize the \r
+compute capability of multi-core CPUs, as multi-core CPUs have become commonplace.\r
+\r
+Typical Usage:\r
+ (1) "./msaprobs -help" or "./msaprobs -?"\r
+ Get the command line options\r
+\r
+ (2) "./msaprobs infile >outfile" or "./msaprobs infile -o outfile"\r
+ The alignments are printed out into file "outfile" in FASTA format\r
+\r
+ (3) ./msaprobs infile -o outfile -num_threads 4\r
+ Use four threads to accelerate the execution\r
echo "DONE"
cd ..
+echo "Compiling GLProbs ..."
+# Build inside a subshell: the script's own working directory never
+# changes, so a missing directory can neither run make in the wrong place
+# nor leave later steps one level too high.
+( cd GLProbs-1.0 && { make clean; make; } )
+echo "DONE"
+
+
+echo "Compiling MSAProbs-0.9.7 ..."
+# Same pattern as above; make still runs even when "make clean" fails.
+( cd MSAProbs-0.9.7/MSAProbs && { make clean; make; } )
+echo "DONE"
+
echo "Setting executable flag for Jpred..."
chmod +x jpred/i686/* jpred/x86_64/*
+
+echo "Setting executable flag for ViennaRNA..."
+chmod +x ViennaRNA/Progs/RNAalifold
+
+echo "Setting executable flag for MSAProbs-0.9.7..."
+chmod +x MSAProbs-0.9.7/MSAProbs/msaprobs # presumably the binary produced by the make step in MSAProbs-0.9.7/MSAProbs -- confirm against its Makefile
+
+echo "Setting executable flag for GLProbs-1.0..."
+chmod +x GLProbs-1.0/glprobs # presumably the binary produced by the make step in GLProbs-1.0 -- confirm against its Makefile