From: Sasha Sherstnev Date: Tue, 24 Sep 2013 14:08:57 +0000 (+0100) Subject: Add GLprobs and MSAprobs to binaries X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=e3dd02911f34876e22c551ff21482a278a0399b5;p=jabaws.git Add GLprobs and MSAprobs to binaries --- diff --git a/binaries/src/GLProbs-1.0/Defaults.h b/binaries/src/GLProbs-1.0/Defaults.h new file mode 100644 index 0000000..953cdac --- /dev/null +++ b/binaries/src/GLProbs-1.0/Defaults.h @@ -0,0 +1,120 @@ +///////////////////////////////////////////////////////////////// +// Defaults.h +// +// Default constants for use in MSAPROBS. The emission +// probabilities were computed using the program used to build +// the BLOSUM62 matrix from the BLOCKS 5.0 dataset. Transition +// parameters were obtained via unsupervised EM training on the +// BALIBASE 2.0 benchmark alignment database. +///////////////////////////////////////////////////////////////// + +#ifndef DEFAULTS_H +#define DEFAULTS_H + +#include + +using namespace std; + +/* + float initDistrib1Default[] = { 0.3202854395, 0.3398572505, 0.3398572505 }; + float gapOpen1Default[] = { 0.1375414133, 0.1375414133 }; + float gapExtend1Default[] = { 0.7832147479, 0.7832147479 }; + */ +/* +float initDistrib1Default[] = { 0.6080327034f, 0.1959836632f, 0.1959836632f }; +float gapOpen1Default[] = { 0.01993141696f, 0.01993141696f }; +float gapExtend1Default[] = { 0.7943345308f, 0.7943345308f }; +*/ + +float initDistrib1Default[] = { 0.06188, 0.93812, 0.1959836632f }; +float gapOpen1Default[] = { 0.01993141696f, 0.01993141696f }; +float gapExtend1Default[] = { 0.7943345308f, 0.7943345308f }; + +/* +float initDistrib1Default[] = { 0.2031769156f, 0.7968229055f, 0.05529401079f }; +float gapOpen1Default[] = { 0.006541831419f, 0.006541831419f }; +float gapExtend1Default[] = { 0.3042867482f, 0.3042867482f }; +*/ +/* +float initDistrib1Default[] = { 0.109684445f, 0.8903156519f, 0.01231110841f }; +float gapOpen1Default[] = { 0.01968936995f, 0.01968936995f }; 
+float gapExtend1Default[] = { 0.5699355602f, 0.5699355602f }; +*/ +float initDistrib2Default[] = { 0.6814756989f, 8.615339902e-05f, + 0.700645f, 0.1591759622f, 0.1591759622 }; +float gapOpen2Default[] = { 0.0119511066f, 0.01993141696f, 0.008008334786f, + 0.008008334786 }; +float gapExtend2Default[] = { 0.3965826333f, 0.7943345308f, 0.8988758326f, + 0.8988758326 }; + +string alphabetDefault = "ARNDCQEGHILKMFPSTWYV"; +float emitSingleDefault[20] = { 0.07831005f, 0.05246024f, 0.04433257f, + 0.05130349f, 0.02189704f, 0.03585766f, 0.05615771f, 0.07783433f, + 0.02601093f, 0.06511648f, 0.09716489f, 0.05877077f, 0.02438117f, + 0.04463228f, 0.03940142f, 0.05849916f, 0.05115306f, 0.01203523f, + 0.03124726f, 0.07343426f }; + +float emitPairsDefault[20][20] = { { 0.02373072f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f }, { 0.00244502f, 0.01775118f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f }, { 0.00210228f, 0.00207782f, 0.01281864f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f }, { 0.00223549f, 0.00161657f, 0.00353540f, 0.01911178f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f }, { 0.00145515f, 0.00044701f, 0.00042479f, + 0.00036798f, 0.01013470f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }, { 0.00219102f, + 0.00253532f, 0.00158223f, 0.00176784f, 0.00032102f, 0.00756604f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f }, { 0.00332218f, 0.00268865f, 0.00224738f, 0.00496800f, + 0.00037956f, 0.00345128f, 0.01676565f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }, { 0.00597898f, + 0.00194865f, 0.00288882f, 0.00235249f, 0.00071206f, 0.00142432f, + 0.00214860f, 0.04062876f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 
0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }, { 0.00114353f, 0.00132105f, 0.00141205f, + 0.00097077f, 0.00026421f, 0.00113901f, 0.00131767f, 0.00103704f, + 0.00867996f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f }, + { 0.00318853f, 0.00138145f, 0.00104273f, 0.00105355f, 0.00094040f, + 0.00100883f, 0.00124207f, 0.00142520f, 0.00059716f, 0.01778263f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }, { + 0.00449576f, 0.00246811f, 0.00160275f, 0.00161966f, 0.00138494f, + 0.00180553f, 0.00222063f, 0.00212853f, 0.00111754f, 0.01071834f, + 0.03583921f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f }, { 0.00331693f, 0.00595650f, 0.00257310f, 0.00252518f, + 0.00046951f, 0.00312308f, 0.00428420f, 0.00259311f, 0.00121376f, + 0.00157852f, 0.00259626f, 0.01612228f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f }, { 0.00148878f, 0.00076734f, + 0.00063401f, 0.00047808f, 0.00037421f, 0.00075546f, 0.00076105f, + 0.00066504f, 0.00042237f, 0.00224097f, 0.00461939f, 0.00096120f, + 0.00409522f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }, { + 0.00165004f, 0.00090768f, 0.00084658f, 0.00069041f, 0.00052274f, + 0.00059248f, 0.00078814f, 0.00115204f, 0.00072545f, 0.00279948f, + 0.00533369f, 0.00087222f, 0.00116111f, 0.01661038f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f }, { 0.00230618f, 0.00106268f, + 0.00100282f, 0.00125381f, 0.00034766f, 0.00090111f, 0.00151550f, + 0.00155601f, 0.00049078f, 0.00103767f, 0.00157310f, 0.00154836f, + 0.00046718f, 0.00060701f, 0.01846071f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f }, { 0.00631752f, 0.00224540f, 0.00301397f, 0.00285226f, + 0.00094867f, 0.00191155f, 0.00293898f, 0.00381962f, 0.00116422f, + 0.00173565f, 0.00250962f, 0.00312633f, 0.00087787f, 0.00119036f, + 0.00180037f, 0.01346609f, 0.0f, 0.0f, 0.0f, 0.0f }, { + 0.00389995f, 0.00186053f, 0.00220144f, 0.00180488f, 0.00073798f, + 0.00154526f, 0.00216760f, 0.00214841f, 0.00077747f, 0.00248968f, + 0.00302273f, 0.00250862f, 0.00093371f, 0.00107595f, 0.00147982f, + 
0.00487295f, 0.01299436f, 0.0f, 0.0f, 0.0f }, { 0.00039119f, + 0.00029139f, 0.00021006f, 0.00016015f, 0.00010666f, 0.00020592f, + 0.00023815f, 0.00038786f, 0.00019097f, 0.00039549f, 0.00076736f, + 0.00028448f, 0.00016253f, 0.00085751f, 0.00015674f, 0.00026525f, + 0.00024961f, 0.00563625f, 0.0f, 0.0f }, { 0.00131840f, + 0.00099430f, 0.00074960f, 0.00066005f, 0.00036626f, 0.00070192f, + 0.00092548f, 0.00089301f, 0.00131038f, 0.00127857f, 0.00219713f, + 0.00100817f, 0.00054105f, 0.00368739f, 0.00047608f, 0.00102648f, + 0.00094759f, 0.00069226f, 0.00999315f, 0.0f }, { 0.00533241f, + 0.00169359f, 0.00136609f, 0.00127915f, 0.00119152f, 0.00132844f, + 0.00178697f, 0.00194579f, 0.00071553f, 0.01117956f, 0.00914460f, + 0.00210897f, 0.00197461f, 0.00256159f, 0.00135781f, 0.00241601f, + 0.00343452f, 0.00038538f, 0.00148001f, 0.02075171f } }; + +#endif diff --git a/binaries/src/GLProbs-1.0/FileBuffer.h b/binaries/src/GLProbs-1.0/FileBuffer.h new file mode 100644 index 0000000..06af54b --- /dev/null +++ b/binaries/src/GLProbs-1.0/FileBuffer.h @@ -0,0 +1,117 @@ +///////////////////////////////////////////////////////////////// +// FileBuffer.h +// +// Buffered file reading. +///////////////////////////////////////////////////////////////// + +#ifndef FILEBUFFER_H +#define FILEBUFFER_H + +#include +#include +#include + +using namespace std; + +const int BufferSize = 1000; + +///////////////////////////////////////////////////////////////// +// FileBuffer +// +// Class for buffering file reading. 
+///////////////////////////////////////////////////////////////// + +class FileBuffer { + ifstream file; + char buffer[BufferSize]; + int currPos; + int size; + bool isEOF; + bool isValid; + bool canUnget; + +public: + + // Some common routines + + FileBuffer(const char *filename) : + file(filename), currPos(0), size(0), isEOF(false), isValid( + !file.fail()), canUnget(false) { + } + ~FileBuffer() { + close(); + } + bool fail() const { + return !isValid; + } + bool eof() const { + return (!isValid || isEOF); + } + void close() { + file.close(); + isValid = false; + } + + ///////////////////////////////////////////////////////////////// + // FileBuffer::Get() + // + // Retrieve a character from the file buffer. Returns true if + // and only if a character is read. + ///////////////////////////////////////////////////////////////// + + bool Get(char &ch) { + + // check to make sure that there's more stuff in the file + if (!isValid || isEOF) + return false; + + // if the buffer is empty, it's time to reload it + if (currPos == size) { + file.read(buffer, BufferSize); + size = file.gcount(); + isEOF = (size == 0); + currPos = 0; + if (isEOF) + return false; + } + + // store the read character + ch = buffer[currPos++]; + canUnget = true; + return true; + } + + ///////////////////////////////////////////////////////////////// + // FileBuffer::UnGet() + // + // Unretrieve the most recently read character from the file + // buffer. Note that this allows only a one-level undo. + ///////////////////////////////////////////////////////////////// + + void UnGet() { + assert(canUnget); + assert(isValid); + assert(currPos > 0); + currPos--; + assert(currPos < size); + isEOF = false; + canUnget = false; + } + + ///////////////////////////////////////////////////////////////// + // FileBuffer::GetLine() + // + // Retrieve characters of text until a newline character is + // encountered. Terminates properly on end-of-file condition. 
+ ///////////////////////////////////////////////////////////////// + + void GetLine(string &s) { + char ch; + s = ""; + while (Get(ch) && ch != '\n') + s += ch; + } + +}; + +#endif diff --git a/binaries/src/GLProbs-1.0/MSA.cpp b/binaries/src/GLProbs-1.0/MSA.cpp new file mode 100644 index 0000000..0144492 --- /dev/null +++ b/binaries/src/GLProbs-1.0/MSA.cpp @@ -0,0 +1,1541 @@ +/*********************************************** + * # Copyright 2009-2010. Liu Yongchao + * # Contact: Liu Yongchao, School of Computer Engineering, + * # Nanyang Technological University. + * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com + * # + * # GPL version 3.0 applies. + * # + * ************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "MSA.h" +#include "MSAClusterTree.h" +#include "Defaults.h" + +#ifdef _OPENMP +#include +#endif + +string parametersInputFilename = ""; +string parametersOutputFilename = "no training"; +string annotationFilename = ""; + +bool enableVerbose = false; +bool enableAnnotation = false; +bool enableClustalWOutput = false; +bool enableAlignOrder = false; +int numConsistencyReps = 2; +int numPreTrainingReps = 0; +int numIterativeRefinementReps = 100; + +float cutoff = 0; + +VF initDistrib(NumMatrixTypes); +VF gapOpen(2 * NumInsertStates); +VF gapExtend(2 * NumInsertStates); +VVF emitPairs(256, VF(256, 1e-10)); +VF emitSingle(256, 1e-5); + +string alphabet = alphabetDefault; + +const int MIN_PRETRAINING_REPS = 0; +const int MAX_PRETRAINING_REPS = 20; +const int MIN_CONSISTENCY_REPS = 0; +const int MAX_CONSISTENCY_REPS = 5; +const int MIN_ITERATIVE_REFINEMENT_REPS = 0; +const int MAX_ITERATIVE_REFINEMENT_REPS = 1000; + +string posteriorProbsFilename = ""; +bool allscores = true; +string infilename; + +int flag_gui = 0; //0: no gui related o/p +//1: gui related o/p generated +int flag_ppscore = 0; //0: no pp score sequence added to o/p 
fasta alignment +//1: pp score seq added to o/p fasta alignment + +/////////////////////////////// +// global scoring matrix variables +////////////////////////////// +float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2; +char *aminos, *bases, matrixtype[20] = "gonnet_160"; +int subst_index[26]; + +double sub_matrix[26][26]; +double normalized_matrix[26][26];// add by YE Yongtao +int firstread = 0; //this makes sure that matrices are read only once + +float TEMPERATURE = 5; +int MATRIXTYPE = 160; +int prot_nuc = 0; //0=prot, 1=nucleotide + +float GAPOPEN = 0; +float GAPEXT = 0; +int numThreads = 0; + +//argument support +typedef struct { + char input[30]; + int matrix; + int N; + float T; + float beta; + char opt; //can be 'P' or 'M' + float gapopen; + float gapext; +} argument_decl; + +argument_decl argument; + +extern inline void read_sustitution_matrix(char *fileName); +extern void setmatrixtype(int le); +extern inline int matrixtype_to_int(); +extern inline void read_dna_matrix(); +extern inline void read_vtml_la_matrix(); +extern void init_arguments(); + +MSA::MSA(int argc, char* argv[]) { + //parse program parameters + SafeVector sequenceNames = ParseParams(argc, argv); + + //initialize arguments for partition function + init_arguments(); + + ReadParameters(); + //PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL); + + //read the input sequences + MultiSequence *sequences = new MultiSequence(); + assert(sequences); + for (int i = 0; i < (int) sequenceNames.size(); i++) { + cerr << "Loading sequence file: " << sequenceNames[i] << endl; + sequences->LoadMFA(sequenceNames[i], true); + } + //allocate space for sequence weights + this->seqsWeights = new int[sequences->GetNumSequences()]; + //initilaize parameters for OPENMP +#ifdef _OPENMP + if(numThreads <= 0) { + numThreads = omp_get_num_procs(); +// cerr << "Automatically detected " << numThreads << " CPU cores" << endl; + } +// cerr <<"Enabling OpenMP 
(with "<WriteALN(*alignOutFile); + } else { + alignment->WriteMFA(*alignOutFile); + } + + //release resources + delete[] this->seqsWeights; + delete alignment; + delete sequences; +} +MSA::~MSA() { + /*close the output file*/ + if (alignOutFileName.length() > 0) { + ((std::ofstream*) alignOutFile)->close(); + } +} +///////////////////////////////////////////////////////////////// +// PrintParameters() +// +// Prints MSAPROBS parameters to STDERR. If a filename is +// specified, then the parameters are also written to the file. +///////////////////////////////////////////////////////////////// + +void MSA::PrintParameters(const char *message, const VF &initDistrib, + const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs, + const VF &emitSingle, const char *filename) { + + // print parameters to the screen + cerr << message << endl << " initDistrib[] = { "; + for (int i = 0; i < NumMatrixTypes; i++) + cerr << setprecision(10) << initDistrib[i] << " "; + cerr << "}" << endl << " gapOpen[] = { "; + for (int i = 0; i < NumInsertStates * 2; i++) + cerr << setprecision(10) << gapOpen[i] << " "; + cerr << "}" << endl << " gapExtend[] = { "; + for (int i = 0; i < NumInsertStates * 2; i++) + cerr << setprecision(10) << gapExtend[i] << " "; + cerr << "}" << endl << endl; + + // if a file name is specified + if (filename) { + + // attempt to open the file for writing + FILE *file = fopen(filename, "w"); + if (!file) { + cerr << "ERROR: Unable to write parameter file: " << filename + << endl; + exit(1); + } + + // if successful, then write the parameters to the file + for (int i = 0; i < NumMatrixTypes; i++) + fprintf(file, "%.10f ", initDistrib[i]); + fprintf(file, "\n"); + for (int i = 0; i < 2 * NumInsertStates; i++) + fprintf(file, "%.10f ", gapOpen[i]); + fprintf(file, "\n"); + for (int i = 0; i < 2 * NumInsertStates; i++) + fprintf(file, "%.10f ", gapExtend[i]); + fprintf(file, "\n"); + fprintf(file, "%s\n", alphabet.c_str()); + for (int i = 0; i < (int) 
alphabet.size(); i++) { + for (int j = 0; j <= i; j++) + fprintf(file, "%.10f ", + emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]); + fprintf(file, "\n"); + } + for (int i = 0; i < (int) alphabet.size(); i++) + fprintf(file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]); + fprintf(file, "\n"); + fclose(file); + } +} + +///////////////////////////////////////////////////////////////// +// doAlign() +// +// First computes all pairwise posterior probability matrices. +// Then, computes new parameters if training, or a final +// alignment, otherwise. +///////////////////////////////////////////////////////////////// +extern VF *ComputePostProbs(int a, int b, string seq1, string seq2); +MultiSequence* MSA::doAlign(MultiSequence *sequences, + const ProbabilisticModel &model, int levelid) { + assert(sequences); + + //get the number of sequences + const int numSeqs = sequences->GetNumSequences(); + //create distance matrix + VVF distances(numSeqs, VF(numSeqs, 0)); + //creat sparseMatrices + SafeVector > sparseMatrices(numSeqs, + SafeVector(numSeqs, NULL)); + +#ifdef _OPENMP + //calculate sequence pairs for openmp model + int pairIdx = 0; + numPairs = (numSeqs - 1) * numSeqs / 2; + seqsPairs = new SeqsPair[numPairs]; + for(int a = 0; a < numSeqs; a++) { + for(int b = a + 1; b < numSeqs; b++) { + seqsPairs[pairIdx].seq1 = a; + seqsPairs[pairIdx].seq2 = b; + pairIdx++; + } + } +#endif + // do all pairwise alignments for posterior probability matrices +#ifdef _OPENMP +#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic) + for(pairIdx = 0; pairIdx < numPairs; pairIdx++) { + int a= seqsPairs[pairIdx].seq1; + int b = seqsPairs[pairIdx].seq2; + if(enableVerbose) { +#pragma omp critical + cerr <<"tid "<GetSequence(a); + Sequence *seq2 = sequences->GetSequence(b); + + //posterior probability matrix + VF* posterior; + +//low similarity use local model + if(levelid == 1){ + + VF *forward = model.ComputeForwardMatrix(seq1, seq2,false); + 
assert(forward); + VF *backward = model.ComputeBackwardMatrix(seq1, seq2,false); + assert(backward); + posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false); + delete forward; + delete backward; + + } +//high similarity use global model + else if(levelid >= 2) posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString()); + +//extreme low or extreme high similarity use combined model + else{ + +//probcons + // compute forward and backward probabilities + VF *forward = model.ComputeForwardMatrix(seq1, seq2); + assert(forward); + VF *backward = model.ComputeBackwardMatrix(seq1, seq2); + assert(backward); + // compute posterior probability matrix from HMM + VF *probcons_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward); + assert(probcons_posterior); + delete forward; + delete backward; + +//probalign + VF *probalign_posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString()); + assert(probalign_posterior); +//local + forward = model.ComputeForwardMatrix(seq1, seq2,false); + assert(forward); + backward = model.ComputeBackwardMatrix(seq1, seq2,false); + assert(backward); + posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false); + assert(posterior); + delete forward; + delete backward; +//combined model + //merge probalign + local + probcons + VF::iterator ptr1 = probcons_posterior->begin(); + VF::iterator ptr2 = probalign_posterior->begin(); + VF::iterator ptr = posterior->begin(); + for (int i = 0; i <= seq1->GetLength(); i++) { + for (int j = 0; j <= seq2->GetLength(); j++) { + float v1 = *ptr1; + float v2 = *ptr2; + float v3 = *ptr; + *ptr = sqrt((v1*v1 + v2*v2 + v3*v3)/3); + ptr1++; + ptr2++; + ptr++; + } + } + delete probcons_posterior; + delete probalign_posterior; + } + + assert(posterior); + // perform the pairwise sequence alignment + pair *, float> alignment = model.ComputeAlignment( + seq1->GetLength(), seq2->GetLength(), *posterior); + + //compute expected 
accuracy + distances[a][b] = distances[b][a] = 1.0f - alignment.second + / min(seq1->GetLength(), seq2->GetLength()); + + // compute sparse representations + sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(), + seq2->GetLength(), *posterior); + sparseMatrices[b][a] = NULL; + + delete posterior; + delete alignment.first; +#ifndef _OPENMP + } +#endif + } + + //create the guide tree + this->tree = new MSAClusterTree(this, distances, numSeqs); + this->tree->create(); + + // perform the consistency transformation the desired number of times + float* fweights = new float[numSeqs]; + for (int r = 0; r < numSeqs; r++) { + fweights[r] = ((float) seqsWeights[r]) / INT_MULTIPLY; + fweights[r] *= 10; + } + for (int r = 0; r < numConsistencyReps; r++) { + SafeVector > newSparseMatrices = + DoRelaxation(fweights, sequences, sparseMatrices); + + // now replace the old posterior matrices + for (int i = 0; i < numSeqs; i++) { + for (int j = 0; j < numSeqs; j++) { + delete sparseMatrices[i][j]; + sparseMatrices[i][j] = newSparseMatrices[i][j]; + } + } + } + delete[] fweights; +#ifdef _OPENMP + delete [] seqsPairs; +#endif + + //compute the final multiple sequence alignment + MultiSequence *finalAlignment = ComputeFinalAlignment(this->tree, sequences, + sparseMatrices, model,levelid); + + // build annotation + if (enableAnnotation) { + WriteAnnotation(finalAlignment, sparseMatrices); + } + //destroy the guide tree + delete this->tree; + this->tree = 0; + + // delete sparse matrices + for (int a = 0; a < numSeqs - 1; a++) { + for (int b = a + 1; b < numSeqs; b++) { + delete sparseMatrices[a][b]; + delete sparseMatrices[b][a]; + } + } + + return finalAlignment; +} + +///////////////////////////////////////////////////////////////// +// GetInteger() +// +// Attempts to parse an integer from the character string given. +// Returns true only if no parsing error occurs. 
/////////////////////////////////////////////////////////////////

// Parses the whole of `data` as an integer and stores it in *val.
//
// The conversion uses strtol with base 0, so decimal, "0x" hexadecimal and
// leading-"0" octal spellings are accepted. Returns false, leaving *val
// untouched, when:
//   - data is NULL or no number could be converted,
//   - strtol reported a range error through errno,
//   - the value does not fit in an int,
//   - anything other than blanks follows the number (e.g. "12abc").
bool GetInteger(char *data, int *val) {
	assert(val);

	if (data == NULL)
		return false;

	char *endPtr = data;
	errno = 0;
	const long int retVal = strtol(data, &endPtr, 0);

	// nothing was converted: empty string or no leading number
	if (endPtr == data)
		return false;
	// strtol signals overflow/underflow through errno
	if (errno != 0)
		return false;
	// long may be wider than int, so check the int range explicitly
	if (retVal < (long) INT_MIN || retVal > (long) INT_MAX)
		return false;

	// tolerate trailing blanks, but reject any other left-over characters
	while (*endPtr == ' ' || *endPtr == '\t' || *endPtr == '\n'
			|| *endPtr == '\r')
		endPtr++;
	if (*endPtr != '\0')
		return false;

	*val = (int) retVal;
	return true;
}

/////////////////////////////////////////////////////////////////
// GetFloat()
//
// Attempts to parse a float from the character string given.
// Returns true only if no parsing error occurs.
//
// The whole string must be a number (trailing blanks are allowed).
// NaN and infinities are rejected, as are values for which strtod
// reports a range error that collapsed the result to zero or pushed
// it beyond +/-1000000. *val is written only on success.
/////////////////////////////////////////////////////////////////

bool GetFloat(char *data, float *val) {
	assert(val);

	if (data == NULL)
		return false;

	char *endPtr = data;
	errno = 0;
	const double retVal = strtod(data, &endPtr);

	// nothing was converted: empty string or no leading number
	if (endPtr == data)
		return false;
	// range error: underflow to zero, or overflow to a huge magnitude
	if (errno != 0
			&& (retVal == 0 || retVal >= 1000000.0 || retVal <= -1000000.0))
		return false;
	// x - x is 0 only for finite x; it is NaN for NaN and for +/-infinity.
	// This rejects inputs such as "nan" or "inf", which would otherwise pass
	// any later "lo <= value <= hi" style range test in a surprising way.
	if (!(retVal - retVal == 0.0))
		return false;

	// tolerate trailing blanks, but reject any other left-over characters
	while (*endPtr == ' ' || *endPtr == '\t' || *endPtr == '\n'
			|| *endPtr == '\r')
		endPtr++;
	if (*endPtr != '\0')
		return false;

	// a finite double can still be too large for a float
	const float narrowed = (float) retVal;
	if (!(narrowed - narrowed == 0.0f))
		return false;

	*val = narrowed;
	return true;
}

/////////////////////////////////////////////////////////////////
// ReadParameters()
//
// Read initial distribution, transition, and emission
// parameters from a file.
+///////////////////////////////////////////////////////////////// + +void MSA::ReadParameters() { + + ifstream data; + + emitPairs = VVF(256, VF(256, 1e-10)); + emitSingle = VF(256, 1e-5); + + // read initial state distribution and transition parameters + if (parametersInputFilename == string("")) { + if (NumInsertStates == 1) { + for (int i = 0; i < NumMatrixTypes; i++) + initDistrib[i] = initDistrib1Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapOpen[i] = gapOpen1Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapExtend[i] = gapExtend1Default[i]; + } else if (NumInsertStates == 2) { + for (int i = 0; i < NumMatrixTypes; i++) + initDistrib[i] = initDistrib2Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapOpen[i] = gapOpen2Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapExtend[i] = gapExtend2Default[i]; + } else { + cerr + << "ERROR: No default initial distribution/parameter settings exist" + << endl << " for " << NumInsertStates + << " pairs of insert states. Use --paramfile." 
<< endl; + exit(1); + } + + alphabet = alphabetDefault; + + for (int i = 0; i < (int) alphabet.length(); i++) { + emitSingle[(unsigned char) tolower(alphabet[i])] = + emitSingleDefault[i]; + emitSingle[(unsigned char) toupper(alphabet[i])] = + emitSingleDefault[i]; + for (int j = 0; j <= i; j++) { + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = emitPairsDefault[i][j]; + } + } + } else { + data.open(parametersInputFilename.c_str()); + if (data.fail()) { + cerr << "ERROR: Unable to read parameter file: " + << parametersInputFilename << endl; + exit(1); + } + + string line[3]; + for (int i = 0; i < 3; i++) { + if (!getline(data, line[i])) { + cerr + << "ERROR: Unable to read transition parameters from parameter file: " + << parametersInputFilename << endl; + exit(1); + } + } + istringstream data2; + data2.clear(); + data2.str(line[0]); + for (int i = 0; i < NumMatrixTypes; i++) + data2 >> initDistrib[i]; + data2.clear(); + data2.str(line[1]); + for (int i = 0; i < 2 * NumInsertStates; i++) + data2 >> gapOpen[i]; + data2.clear(); + data2.str(line[2]); + for (int i = 0; i < 2 * NumInsertStates; i++) + data2 >> 
gapExtend[i]; + + if (!getline(data, line[0])) { + cerr << "ERROR: Unable to read alphabet from scoring matrix file: " + << parametersInputFilename << endl; + exit(1); + } + + // read alphabet as concatenation of all characters on alphabet line + alphabet = ""; + string token; + data2.clear(); + data2.str(line[0]); + while (data2 >> token) + alphabet += token; + + for (int i = 0; i < (int) alphabet.size(); i++) { + for (int j = 0; j <= i; j++) { + float val; + data >> val; + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = val; + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = val; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = val; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = val; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = val; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = val; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = val; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = val; + } + } + + for (int i = 0; i < (int) alphabet.size(); i++) { + float val; + data >> val; + emitSingle[(unsigned char) tolower(alphabet[i])] = val; + emitSingle[(unsigned char) toupper(alphabet[i])] = val; + } + data.close(); + } +} + +///////////////////////////////////////////////////////////////// +// ParseParams() +// +// Parse all command-line options. 
+///////////////////////////////////////////////////////////////// +void MSA::printUsage() { + cerr + << "************************************************************************" + << endl + << "\tMSAPROBS is a open-source protein multiple sequence alignment algorithm" + << endl + << "\tbased on pair hidden markov model and partition function postirior" + << endl + << "\tprobabilities. If any comments or problems, please contact" + << endl + << "\tLiu Yongchao(liuy0039@ntu.edu.sg or nkcslyc@hotmail.com)" + << endl + << "*************************************************************************" + << endl << "Usage:" << endl + << " msaprobs [OPTION]... [infile]..." << endl << endl + << "Description:" << endl + << " Align sequences in multi-FASTA format" << endl << endl + << " -o, --outfile " << endl + << " specify the output file name (STDOUT by default)" + << endl << " -num_threads " << endl + << " specify the number of threads used, and otherwise detect automatically" + << endl << " -clustalw" << endl + << " use CLUSTALW output format instead of FASTA format" + << endl << endl << " -c, --consistency REPS" << endl + << " use " << MIN_CONSISTENCY_REPS << " <= REPS <= " + << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps + << ") passes of consistency transformation" << endl << endl + << " -ir, --iterative-refinement REPS" << endl + << " use " << MIN_ITERATIVE_REFINEMENT_REPS + << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS << " (default: " + << numIterativeRefinementReps << ") passes of iterative-refinement" + << endl << endl << " -v, --verbose" << endl + << " report progress while aligning (default: " + << (enableVerbose ? "on" : "off") << ")" << endl << endl + << " -annot FILENAME" << endl + << " write annotation for multiple alignment to FILENAME" + << endl << endl << " -a, --alignment-order" << endl + << " print sequences in alignment order rather than input order (default: " + << (enableAlignOrder ? 
"on" : "off") << ")" << endl + << " -version " << endl + << " print out version of MSAPROBS " << endl << endl; +} +SafeVector MSA::ParseParams(int argc, char **argv) { + if (argc < 2) { + printUsage(); + exit(1); + } + SafeVector sequenceNames; + int tempInt; + float tempFloat; + + for (int i = 1; i < argc; i++) { + if (argv[i][0] == '-') { + //help + if (!strcmp(argv[i], "-help") || !strcmp(argv[i], "-?")) { + printUsage(); + exit(1); + //output file name + } else if (!strcmp(argv[i], "-o") + || !strcmp(argv[i], "--outfile")) { + if (i < argc - 1) { + alignOutFileName = argv[++i]; //get the file name + } else { + cerr << "ERROR: String expected for option " << argv[i] + << endl; + exit(1); + } + // parameter file + } else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){ + if (i < argc - 1) + parametersInputFilename = string (argv[++i]); + else { + cerr << "ERROR: Filename expected for option " << argv[i] << endl; + exit (1); + } + //number of threads used + } else if (!strcmp(argv[i], "-p") + || !strcmp(argv[i], "-num_threads")) { + if (i < argc - 1) { + if (!GetInteger(argv[++i], &tempInt)) { + cerr << " ERROR: invalid integer following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempInt < 0) { + tempInt = 0; + } + numThreads = tempInt; + } + } else { + cerr << "ERROR: Integer expected for option " << argv[i] + << endl; + exit(1); + } + // number of consistency transformations + } else if (!strcmp(argv[i], "-c") + || !strcmp(argv[i], "--consistency")) { + if (i < argc - 1) { + if (!GetInteger(argv[++i], &tempInt)) { + cerr << "ERROR: Invalid integer following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempInt < MIN_CONSISTENCY_REPS + || tempInt > MAX_CONSISTENCY_REPS) { + cerr << "ERROR: For option " << argv[i - 1] + << ", integer must be between " + << MIN_CONSISTENCY_REPS << " and " + << MAX_CONSISTENCY_REPS << "." 
<< endl; + exit(1); + } else { + numConsistencyReps = tempInt; + } + } + } else { + cerr << "ERROR: Integer expected for option " << argv[i] + << endl; + exit(1); + } + } + + // number of randomized partitioning iterative refinement passes + else if (!strcmp(argv[i], "-ir") + || !strcmp(argv[i], "--iterative-refinement")) { + if (i < argc - 1) { + if (!GetInteger(argv[++i], &tempInt)) { + cerr << "ERROR: Invalid integer following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS + || tempInt > MAX_ITERATIVE_REFINEMENT_REPS) { + cerr << "ERROR: For option " << argv[i - 1] + << ", integer must be between " + << MIN_ITERATIVE_REFINEMENT_REPS << " and " + << MAX_ITERATIVE_REFINEMENT_REPS << "." + << endl; + exit(1); + } else + numIterativeRefinementReps = tempInt; + } + } else { + cerr << "ERROR: Integer expected for option " << argv[i] + << endl; + exit(1); + } + } + + // annotation files + else if (!strcmp(argv[i], "-annot")) { + enableAnnotation = true; + if (i < argc - 1) { + annotationFilename = argv[++i]; + } else { + cerr << "ERROR: FILENAME expected for option " << argv[i] + << endl; + exit(1); + } + } + + // clustalw output format + else if (!strcmp(argv[i], "-clustalw")) { + enableClustalWOutput = true; + } + + // cutoff + else if (!strcmp(argv[i], "-co") || !strcmp(argv[i], "--cutoff")) { + if (i < argc - 1) { + if (!GetFloat(argv[++i], &tempFloat)) { + cerr + << "ERROR: Invalid floating-point value following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempFloat < 0 || tempFloat > 1) { + cerr << "ERROR: For option " << argv[i - 1] + << ", floating-point value must be between 0 and 1." 
+ << endl; + exit(1); + } else + cutoff = tempFloat; + } + } else { + cerr << "ERROR: Floating-point value expected for option " + << argv[i] << endl; + exit(1); + } + } + + // verbose reporting + else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--verbose")) { + enableVerbose = true; + } + + // alignment order + else if (!strcmp(argv[i], "-a") + || !strcmp(argv[i], "--alignment-order")) { + enableAlignOrder = true; + } + + //print out version + else if (!strcmp(argv[i], "-version")) { + cerr << "MSAPROBS version " << VERSION << endl; + exit(1); + } + // bad arguments + else { + cerr << "ERROR: Unrecognized option: " << argv[i] << endl; + exit(1); + } + } else { + sequenceNames.push_back(string(argv[i])); + } + } + + /*check the output file name*/ + cerr << "-------------------------------------" << endl; + if (alignOutFileName.length() == 0) { + cerr << "The final alignments will be printed out to STDOUT" << endl; + alignOutFile = &std::cout; + } else { + cerr << "Open the output file " << alignOutFileName << endl; + alignOutFile = new ofstream(alignOutFileName.c_str(), + ios::binary | ios::out | ios::trunc); + } + cerr << "-------------------------------------" << endl; + return sequenceNames; +} + +///////////////////////////////////////////////////////////////// +// ProcessTree() +// +// Process the tree recursively. Returns the aligned sequences +// corresponding to a node or leaf of the tree. 
+/////////////////////////////////////////////////////////////////
+MultiSequence* MSA::ProcessTree(TreeNode *tree, MultiSequence *sequences,
+		const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+		const ProbabilisticModel &model) {
+	// tree           - current node or leaf of the guide tree
+	// sequences      - input sequences; a leaf selects its own by tree->idx
+	// sparseMatrices - pairwise sparse matrices, handed on to AlignAlignments()
+	// model          - handed on to AlignAlignments()
+	//
+	// The returned MultiSequence is allocated with new and owned by the
+	// caller; the recursive calls below delete their sub-results once the
+	// two halves have been merged.
+
+	MultiSequence *result;
+
+	// check if this is a node of the alignment tree
+	//if (tree->GetSequenceLabel() == -1){
+	if (tree->leaf == NODE) {
+		MultiSequence *alignLeft = ProcessTree(tree->left, sequences,
+				sparseMatrices, model);
+		MultiSequence *alignRight = ProcessTree(tree->right, sequences,
+				sparseMatrices, model);
+
+		assert(alignLeft);
+		assert(alignRight);
+
+		result = AlignAlignments(alignLeft, alignRight, sparseMatrices, model);
+		assert(result);
+
+		// the merged alignment holds its own sequences, so the halves can go
+		delete alignLeft;
+		delete alignRight;
+	}
+
+	// otherwise, this is a leaf of the alignment tree
+	else {
+		result = new MultiSequence();
+		assert(result);
+		//result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
+		// clone, so that deleting the result never touches the input set
+		result->AddSequence(sequences->GetSequence(tree->idx)->Clone());
+	}
+
+	return result;
+}
+
+/////////////////////////////////////////////////////////////////
+// ComputeFinalAlignment()
+//
+// Compute the final alignment by calling ProcessTree(), then
+// performing iterative refinement as needed.
+/////////////////////////////////////////////////////////////////
+
+MultiSequence* MSA::ComputeFinalAlignment(MSAGuideTree*tree,
+		MultiSequence *sequences,
+		const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+		const ProbabilisticModel &model, int levelid) {
+	// tree           - guide tree; its root is handed to ProcessTree()
+	// sequences      - input sequences
+	// sparseMatrices - pairwise sparse matrices used while aligning/refining
+	// model          - probabilistic model used while aligning/refining
+	// levelid        - not used by the live code; it is only referenced in
+	//                  the commented-out experiments kept below
+	//
+	// Returns the alignment produced by ProcessTree() after
+	// numIterativeRefinementReps calls to DoIterativeRefinement().
+	MultiSequence *alignment = ProcessTree(tree->getRoot(), sequences,
+			sparseMatrices, model);
+
+	// When alignment order is requested, remember the sort labels the
+	// sequences have right after tree processing and restore them at the end.
+	// NOTE: this clears the global enableAlignOrder flag for the rest of the
+	// run, so later AlignAlignments() calls sort their result by label.
+	SafeVector<int> oldOrdering;
+	int numSeqs = alignment->GetNumSequences();
+	if (enableAlignOrder) {
+		for (int i = 0; i < numSeqs; i++)
+			oldOrdering.push_back(alignment->GetSequence(i)->GetSortLabel());
+		alignment->SaveOrdering();
+		enableAlignOrder = false;
+	}
+
+	// tree-based refinement
+	// TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree);
+	/*
+	 int numSeqs = alignment->GetNumSequences();
+	 //if(numSeqs < numIterativeRefinementReps){
+	 for(int iter = 0; iter < 5; iter ++){
+	 for(int i = 0; i < numSeqs - 1; i++){
+	 DoIterativeRefinementTreeNode(sparseMatrices, model, alignment, i);
+	 }
+	 }
+	 //}*/
+/*
+	//DoIterativeRefinement() return 1,2: this refinement unsuccessful
+	if(levelid == 3) numIterativeRefinementReps=10;
+	int ineffectiveness = 0;
+	for (int i = 0; i < numIterativeRefinementReps; i++){
+		int flag = DoIterativeRefinement(sparseMatrices, model, alignment);
+		if(numSeqs > 35 && levelid < 3){
+			if(flag > 0){
+				if(numIterativeRefinementReps < 10*numSeqs)
+					numIterativeRefinementReps ++;
+				if(flag == 1) ineffectiveness ++;
+			}
+			//else ineffectiveness = 0;
+			if(ineffectiveness > numSeqs && i >100 ) break;
+		}
+	}
+*/
+
+	// randomized iterative refinement; the status code of each round is
+	// deliberately ignored here
+	//if(levelid == 3) numIterativeRefinementReps=10;
+	for (int i = 0; i < numIterativeRefinementReps; i++)
+		DoIterativeRefinement(sparseMatrices, model, alignment);
+
+	cerr << endl;
+
+	// put back the sort labels saved above (no-op when nothing was saved)
+	if (oldOrdering.size() > 0) {
+		for (int i = 0; i < (int) oldOrdering.size(); i++) {
+			alignment->GetSequence(i)->SetSortLabel(oldOrdering[i]);
+		}
+	}
+
+	// return final alignment
+	return alignment;
+}
+
+/////////////////////////////////////////////////////////////////
+// AlignAlignments()
+//
+// Returns the alignment of two MultiSequence objects.
+//
+// A posterior matrix is built for the two groups, the alignment
+// path is computed from it, and every sequence of both groups is
+// copied into a new MultiSequence with gaps added along that path
+// ('X' for align1, 'Y' for align2). The inputs are not freed here;
+// the caller owns both the inputs and the returned object.
+/////////////////////////////////////////////////////////////////
+
+MultiSequence* MSA::AlignAlignments(MultiSequence *align1,
+		MultiSequence *align2,
+		const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+		const ProbabilisticModel &model) {
+
+	// print some info about the alignment
+	if (enableVerbose) {
+		for (int i = 0; i < align1->GetNumSequences(); i++)
+			cerr << ((i == 0) ? "[" : ",")
+					<< align1->GetSequence(i)->GetLabel();
+		cerr << "] vs. ";
+		for (int i = 0; i < align2->GetNumSequences(); i++)
+			cerr << ((i == 0) ? "[" : ",")
+					<< align2->GetSequence(i)->GetLabel();
+		cerr << "]: ";
+	}
+	// the active branch is the overload that takes the sequence weights
+#if 0
+	VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
+#else
+	VF *posterior = model.BuildPosterior(getSeqsWeights(), align1, align2,
+			sparseMatrices, cutoff);
+#endif
+
+	// first: alignment path (freed below), second: its score
+	pair<SafeVector<char> *, float> alignment;
+	//perform alignment
+	alignment = model.ComputeAlignment(align1->GetSequence(0)->GetLength(),
+			align2->GetSequence(0)->GetLength(), *posterior);
+
+	delete posterior;
+
+	if (enableVerbose) {
+
+		// compute total length of sequences
+		int totLength = 0;
+		for (int i = 0; i < align1->GetNumSequences(); i++)
+			for (int j = 0; j < align2->GetNumSequences(); j++)
+				totLength += min(align1->GetSequence(i)->GetLength(),
+						align2->GetSequence(j)->GetLength());
+
+		// give an "accuracy" measure for the alignment
+		cerr << alignment.second / totLength << endl;
+	}
+
+	// now build final alignment
+	MultiSequence *result = new MultiSequence();
+	for (int i = 0; i < align1->GetNumSequences(); i++)
+		result->AddSequence(
+				align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
+	for (int i = 0; i < align2->GetNumSequences(); i++)
+		result->AddSequence(
+				align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
+	if (!enableAlignOrder)
+		result->SortByLabel();
+
+	// free temporary alignment
+	delete alignment.first;
+
+	return result;
+}
+
+///////////////////////////////////////////////////////////////// +// DoRelaxation() +// +// Performs one round of the weighted probabilistic consistency transformation. +// 1 +///////////////////////////////////////////////////////////////// + +SafeVector > MSA::DoRelaxation(float* seqsWeights, + MultiSequence *sequences, + SafeVector > &sparseMatrices) { + const int numSeqs = sequences->GetNumSequences(); + + SafeVector > newSparseMatrices(numSeqs, + SafeVector(numSeqs, NULL)); + + // for every pair of sequences +#ifdef _OPENMP + int pairIdx; +#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic) + for(pairIdx = 0; pairIdx < numPairs; pairIdx++) { + int i = seqsPairs[pairIdx].seq1; + int j = seqsPairs[pairIdx].seq2; + float wi = seqsWeights[i]; + float wj = seqsWeights[j]; +#else + for (int i = 0; i < numSeqs; i++) { + float wi = seqsWeights[i]; + for (int j = i + 1; j < numSeqs; j++) { + float wj = seqsWeights[j]; +#endif + Sequence *seq1 = sequences->GetSequence(i); + Sequence *seq2 = sequences->GetSequence(j); + + if (enableVerbose) { +#ifdef _OPENMP +#pragma omp critical +#endif + cerr << "Relaxing (" << i + 1 << ") " << seq1->GetHeader() + << " vs. 
" << "(" << j + 1 << ") " << seq2->GetHeader() + << ": "; + } + // get the original posterior matrix + VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); + assert(posteriorPtr); + VF &posterior = *posteriorPtr; + + const int seq1Length = seq1->GetLength(); + const int seq2Length = seq2->GetLength(); + + // contribution from the summation where z = x and z = y + float w = wi * wi * wj + wi * wj * wj; + float sumW = w; + for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) { + //posterior[k] = w*posterior[k]; + posterior[k] += posterior[k]; + } + + if (enableVerbose) + cerr << sparseMatrices[i][j]->GetNumCells() << " --> "; + + // contribution from all other sequences + for (int k = 0; k < numSeqs; k++) { + if (k != i && k != j) { + float wk = seqsWeights[k]; + float w = wi * wj * wk; + sumW += w; + if (k < i) + Relax1(w, sparseMatrices[k][i], sparseMatrices[k][j], + posterior); + else if (k > i && k < j) + Relax(w, sparseMatrices[i][k], sparseMatrices[k][j], + posterior); + else { + SparseMatrix *temp = + sparseMatrices[j][k]->ComputeTranspose(); + Relax(w, sparseMatrices[i][k], temp, posterior); + delete temp; + } + } + } + //cerr<<"sumW "<::iterator XYptr = matXY->GetRowPtr(x); + SafeVector::iterator XYend = XYptr + matXY->GetRowSize(x); + VF::iterator base = posterior.begin() + x * (seq2Length + 1); + int curr = 0; + while (XYptr != XYend) { + + // zero out all cells until the first filled column + while (curr < XYptr->first) { + base[curr] = 0; + curr++; + } + + // now, skip over this column + curr++; + ++XYptr; + } + + // zero out cells after last column + while (curr <= seq2Length) { + base[curr] = 0; + curr++; + } + } + + // save the new posterior matrix + newSparseMatrices[i][j] = new SparseMatrix(seq1->GetLength(), + seq2->GetLength(), posterior); + newSparseMatrices[j][i] = NULL; + + if (enableVerbose) + cerr << newSparseMatrices[i][j]->GetNumCells() << " -- "; + + delete posteriorPtr; + + if (enableVerbose) + cerr << "done." 
<< endl; +#ifndef _OPENMP + } +#endif + } + + return newSparseMatrices; +} + +///////////////////////////////////////////////////////////////// +// Relax() +// +// Computes the consistency transformation for a single sequence +// z, and adds the transformed matrix to "posterior". +///////////////////////////////////////////////////////////////// + +void MSA::Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY, + VF &posterior) { + + assert(matXZ); + assert(matZY); + + int lengthX = matXZ->GetSeq1Length(); + int lengthY = matZY->GetSeq2Length(); + assert(matXZ->GetSeq2Length() == matZY->GetSeq1Length()); + + // for every x[i] + for (int i = 1; i <= lengthX; i++) { + SafeVector::iterator XZptr = matXZ->GetRowPtr(i); + SafeVector::iterator XZend = XZptr + matXZ->GetRowSize(i); + + VF::iterator base = posterior.begin() + i * (lengthY + 1); + + // iterate through all x[i]-z[k] + while (XZptr != XZend) { + SafeVector::iterator ZYptr = matZY->GetRowPtr(XZptr->first); + SafeVector::iterator ZYend = ZYptr + + matZY->GetRowSize(XZptr->first); + const float XZval = XZptr->second; + + // iterate through all z[k]-y[j] + while (ZYptr != ZYend) { + //base[ZYptr->first] += weight * XZval * ZYptr->second; + base[ZYptr->first] += XZval * ZYptr->second; + ZYptr++; + } + XZptr++; + } + } +} + +///////////////////////////////////////////////////////////////// +// Relax1() +// +// Computes the consistency transformation for a single sequence +// z, and adds the transformed matrix to "posterior". 
+/////////////////////////////////////////////////////////////////
+
+void MSA::Relax1(float weight, SparseMatrix *matZX, SparseMatrix *matZY,
+		VF &posterior) {
+	// Both matrices are indexed by z in their rows (matZX: z-x, matZY: z-y),
+	// so no transpose is needed; "posterior" is addressed as
+	// x * (lengthY + 1) + y.
+	// NOTE: "weight" is currently unused - the weighted update survives only
+	// as the commented-out line in the inner loop.
+	// NOTE(review): the row element type was reconstructed from how the
+	// iterators are used (first = index, second = float value); confirm it
+	// against the declaration of SparseMatrix::GetRowPtr().
+
+	assert(matZX);
+	assert(matZY);
+
+	int lengthZ = matZX->GetSeq1Length();
+	int lengthY = matZY->GetSeq2Length();
+
+	// for every z[k]
+	for (int k = 1; k <= lengthZ; k++) {
+		SafeVector<pair<int, float> >::iterator ZXptr = matZX->GetRowPtr(k);
+		SafeVector<pair<int, float> >::iterator ZXend = ZXptr
+				+ matZX->GetRowSize(k);
+
+		// iterate through all z[k]-x[i]
+		while (ZXptr != ZXend) {
+			SafeVector<pair<int, float> >::iterator ZYptr = matZY->GetRowPtr(k);
+			SafeVector<pair<int, float> >::iterator ZYend = ZYptr
+					+ matZY->GetRowSize(k);
+			const float ZXval = ZXptr->second;
+			// start of the posterior row that belongs to x[i]
+			VF::iterator base = posterior.begin()
+					+ ZXptr->first * (lengthY + 1);
+
+			// iterate through all z[k]-y[j]
+			while (ZYptr != ZYend) {
+				//base[ZYptr->first] += weight * ZXval * ZYptr->second;
+				base[ZYptr->first] += ZXval * ZYptr->second;
+				ZYptr++;
+			}
+			ZXptr++;
+		}
+	}
+}
+/////////////////////////////////////////////////////////////////
+// DoIterativeRefinement()
+//
+// Performs a single round of randomized partitioning iterative
+// refinement.
+// return 0: successful refinement, 1: ineffective refinement, 2: random problem
+//
+//   0 - the sequences were realigned and the new score differs from the
+//       score of the old alignment under the same posterior
+//   1 - the sequences were realigned but the score is exactly the old one
+//   2 - the random split left one group empty; "alignment" is untouched
+//
+// In cases 0 and 1 the object behind "alignment" is deleted and replaced.
+/////////////////////////////////////////////////////////////////
+int MSA::DoIterativeRefinement(
+		const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+		const ProbabilisticModel &model, MultiSequence* &alignment) {
+	set<int> groupOne, groupTwo;
+	int numSeqs = alignment->GetNumSequences();
+	int i;
+	// create two separate groups
+	for (i = 0; i < numSeqs; i++) {
+		int index = rand();
+		if (index % 2) {
+			groupOne.insert(i);
+		} else {
+			groupTwo.insert(i);
+		}
+	}
+	if (groupOne.empty() || groupTwo.empty()) return 2;
+
+	// project into the two groups
+	MultiSequence *groupOneSeqs = alignment->Project(groupOne);
+	assert(groupOneSeqs);
+	MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
+	assert(groupTwoSeqs);
+
+//start add by Yongtao
+	// the active branch is the overload without sequence weights
+#if 1
+	VF *posterior = model.BuildPosterior (groupOneSeqs, groupTwoSeqs, sparseMatrices, cutoff);
+#else
+	VF *posterior = model.BuildPosterior(getSeqsWeights(), groupOneSeqs, groupTwoSeqs,
+			sparseMatrices, cutoff);
+#endif
+	// compute an "accuracy" measure for the alignment before refinement
+	SafeVector<SafeVector<char>::iterator> oldOnePtrs(groupOne.size());
+	SafeVector<SafeVector<char>::iterator> oldTwoPtrs(groupTwo.size());
+	i = 0;
+	for (set<int>::const_iterator iter = groupOne.begin();
+			iter != groupOne.end(); ++iter) {
+		oldOnePtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr();
+	}
+	i = 0;
+	for (set<int>::const_iterator iter = groupTwo.begin();
+			iter != groupTwo.end(); ++iter) {
+		oldTwoPtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr();
+	}
+
+	VF &posteriorArr = *posterior;
+	int oldLength = alignment->GetSequence(0)->GetLength();
+	// running column positions inside the two projected groups
+	int groupOneindex = 0;
+	int groupTwoindex = 0;
+	float accuracy_before = 0;
+	int j;
+	for (i = 1; i <= oldLength; i++) {
+		// check to see if there is a gap in every sequence of the set
+		bool foundOne = false;
+		for (j = 0; !foundOne && j < (int) groupOne.size(); j++)
+			foundOne = (oldOnePtrs[j][i] != '-');
+		// if not, then this column counts towards the sequence length
+		if (foundOne) groupOneindex++;
+		bool foundTwo = false;
+		for (j = 0; !foundTwo && j < (int) groupTwo.size(); j++)
+			foundTwo = (oldTwoPtrs[j][i] != '-');
+		if (foundTwo) groupTwoindex++;
+		// a column occupied in both groups contributes its posterior cell
+		if (foundOne && foundTwo)
+			accuracy_before += posteriorArr[groupOneindex
+					* (groupTwoSeqs->GetSequence(0)->GetLength() + 1)
+					+ groupTwoindex];
+	}
+
+	// first: alignment path (freed below), second: its score
+	pair<SafeVector<char> *, float> refinealignment;
+	//perform alignment
+	refinealignment = model.ComputeAlignment(groupOneSeqs->GetSequence(0)->GetLength(),
+			groupTwoSeqs->GetSequence(0)->GetLength(), *posterior);
+	delete posterior;
+	// now build final alignment
+	MultiSequence *result = new MultiSequence();
+	for (int s = 0; s < groupOneSeqs->GetNumSequences(); s++)
+		result->AddSequence(
+				groupOneSeqs->GetSequence(s)->AddGaps(refinealignment.first, 'X'));
+	for (int s = 0; s < groupTwoSeqs->GetNumSequences(); s++)
+		result->AddSequence(
+				groupTwoSeqs->GetSequence(s)->AddGaps(refinealignment.first, 'Y'));
+	// free temporary alignment
+	delete refinealignment.first;
+	delete alignment;
+	alignment = result;
+	delete groupOneSeqs;
+	delete groupTwoSeqs;
+	// only the path was freed above; the score is still valid
+	if (accuracy_before == refinealignment.second) return 1;
+	else return 0;
+}
+
+
+// Realigns the sequences listed under guide-tree entry "nodeIndex" (its
+// left and right leaves together) against all remaining sequences.
+// Does nothing when either side of that split is empty; otherwise the
+// object behind "alignment" is deleted and replaced.
+void MSA::DoIterativeRefinementTreeNode(
+		const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
+		const ProbabilisticModel &model, MultiSequence* &alignment,
+		int nodeIndex) {
+	set<int> groupOne, groupTwo;
+	int numSeqs = alignment->GetNumSequences();
+
+	vector<bool> inGroup1(numSeqs, false);
+
+	AlignmentOrder* orders = this->tree->getAlignOrders();
+	AlignmentOrder* order = &orders[nodeIndex];
+	// both children of the entry go into group one
+	for (int i = 0; i < order->leftNum; i++) {
+		int si = order->leftLeafs[i];
+		inGroup1[si] = true;
+	}
+	for (int i = 0; i < order->rightNum; i++) {
+		int si = order->rightLeafs[i];
+		inGroup1[si] = true;
+	}
+	// create two separate groups
+	for (int i = 0; i < numSeqs; i++) {
+		if (inGroup1[i]) {
+			groupOne.insert(i);
+		} else {
+			groupTwo.insert(i);
+		}
+	}
+	if (groupOne.empty() || groupTwo.empty())
+		return;
+
+	// project into the two groups
+	MultiSequence *groupOneSeqs = alignment->Project(groupOne);
+	assert(groupOneSeqs);
+	MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
+	assert(groupTwoSeqs);
+	delete alignment;
+
+	// realign
+	alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices,
+			model);
+
+	delete groupOneSeqs;
+	delete groupTwoSeqs;
+}
+
+/////////////////////////////////////////////////////////////////
+// WriteAnnotation()
+//
+// Computes annotation for multiple alignment and writes values
+// to a file (one ComputeScore() value per alignment column).
+// Exits with status 1 if the annotation file cannot be opened.
+/////////////////////////////////////////////////////////////////
+
+void MSA::WriteAnnotation(MultiSequence *alignment,
+		const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
+	ofstream outfile(annotationFilename.c_str());
+
+	if (outfile.fail()) {
+		cerr << "ERROR: Unable to write annotation file." << endl;
+		exit(1);
+	}
+
+	const int alignLength = alignment->GetSequence(0)->GetLength();
+	const int numSeqs = alignment->GetNumSequences();
+
+	// position[j]: number of residues of sequence j seen so far
+	SafeVector<int> position(numSeqs, 0);
+	SafeVector<SafeVector<char>::iterator> seqs(numSeqs);
+	for (int i = 0; i < numSeqs; i++)
+		seqs[i] = alignment->GetSequence(i)->GetDataPtr();
+	// (sort label, residue position) of every non-gap cell in a column
+	SafeVector<pair<int, int> > active;
+	active.reserve(numSeqs);
+
+	SafeVector<int> lab;
+	for (int i = 0; i < numSeqs; i++)
+		lab.push_back(alignment->GetSequence(i)->GetSortLabel());
+
+	// for every column
+	for (int i = 1; i <= alignLength; i++) {
+
+		// find all aligned residues in this particular column
+		active.clear();
+		for (int j = 0; j < numSeqs; j++) {
+			if (seqs[j][i] != '-') {
+				active.push_back(make_pair(lab[j], ++position[j]));
+			}
+		}
+
+		// sorted by label before ComputeScore() pairs the entries up
+		sort(active.begin(), active.end());
+		outfile << setw(4) << ComputeScore(active, sparseMatrices) << endl;
+	}
+
+	outfile.close();
+}
+
+/////////////////////////////////////////////////////////////////
+// ComputeScore()
+//
+// Computes the annotation score for a particular column.
+///////////////////////////////////////////////////////////////// + +int MSA::ComputeScore(const SafeVector > &active, + const SafeVector > &sparseMatrices) { + + if (active.size() <= 1) + return 0; + + // ALTERNATIVE #1: Compute the average alignment score. + + float val = 0; + for (int i = 0; i < (int) active.size(); i++) { + for (int j = i + 1; j < (int) active.size(); j++) { + val += sparseMatrices[active[i].first][active[j].first]->GetValue( + active[i].second, active[j].second); + } + } + + return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1))); + +} + +///////////////////////////////////////////////////////////////// +// ComputeSimilarity () +// +// Computes the average similarity for a particular family. +// extreme low similarity(<=25%) return 0 +// low similarity(<=40%) return 1 +// high similarity(<=70%) return 2 +// extreme high similarity(>70%) return 3 +///////////////////////////////////////////////////////////////// +int MSA::AdjustmentTest(MultiSequence *sequences,const ProbabilisticModel &model){ + assert(sequences); + + //get the number of sequences + const int numSeqs = sequences->GetNumSequences(); + //average identity for all sequences + float identity = 0; + +#ifdef _OPENMP + //calculate sequence pairs for openmp model + int pairIdx = 0; + numPairs = (numSeqs - 1) * numSeqs / 2; + seqsPairs = new SeqsPair[numPairs]; + for(int a = 0; a < numSeqs; a++) { + for(int b = a + 1; b < numSeqs; b++) { + seqsPairs[pairIdx].seq1 = a; + seqsPairs[pairIdx].seq2 = b; + pairIdx++; + } + } +#endif + + // do all pairwise alignments for family similarity +#ifdef _OPENMP +#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic) + for(pairIdx = 0; pairIdx < numPairs; pairIdx++) { + int a= seqsPairs[pairIdx].seq1; + int b = seqsPairs[pairIdx].seq2; + if(enableVerbose) { +#pragma omp critical + cerr <<"tid "<GetSequence(a); + Sequence *seq2 = sequences->GetSequence(b); + pair *, float> alignment = 
model.ComputeViterbiAlignment(seq1,seq2); + SafeVector::iterator iter1 = seq1->GetDataPtr(); + SafeVector::iterator iter2 = seq2->GetDataPtr(); + + float N_correct_match = 0; + //float N_alignment = 0; + int i = 1;int j = 1; + for (SafeVector::iterator iter = alignment.first->begin(); + iter != alignment.first->end(); ++iter){ + //N_alignment += 1; + if (*iter == 'B'){ + unsigned char c1 = (unsigned char) iter1[i++]; + unsigned char c2 = (unsigned char) iter2[j++]; + if(c1==c2) N_correct_match += 1; + } + else if(*iter == 'X') i++; + else if(*iter == 'Y') j++; + } + if(i!= seq1->GetLength()+1 || j!= seq2->GetLength() + 1 ) cerr << "similarity error"<< endl; + identity += N_correct_match / alignment.first->size(); + delete alignment.first; +#ifndef _OPENMP + } +#endif + } + identity /= numPairs; +/* +FILE *fi = fopen ("accuracy", "a"); +fprintf (fi, " %.10f ", similarity); fprintf (fi, "\n"); +fclose (fi); +*/ + + //adapative + if( identity <= 0.15 ) initDistrib[2] = 0.143854; + else if( identity <= 0.2 ) initDistrib[2] = 0.191948; + else if( identity <= 0.25 ) initDistrib[2] = 0.170705; + else if( identity <= 0.3 ) initDistrib[2] = 0.100675; + else if( identity <= 0.35 ) initDistrib[2] = 0.090755; + else if( identity <= 0.4 ) initDistrib[2] = 0.146188; + else if( identity <= 0.45 ) initDistrib[2] = 0.167858; + else if( identity <= 0.5) initDistrib[2] = 0.250769; + + + if( identity <= 0.25 ) return 0; + else if( identity <= 0.4) return 1; + else if( identity <= 0.7) return 2; + else return 3; + +} diff --git a/binaries/src/GLProbs-1.0/MSA.h b/binaries/src/GLProbs-1.0/MSA.h new file mode 100644 index 0000000..2e37e85 --- /dev/null +++ b/binaries/src/GLProbs-1.0/MSA.h @@ -0,0 +1,95 @@ +#ifndef _MSA_H +#define _MSA_H +#include "MSADef.h" +#include "MSAGuideTree.h" + +#include "SafeVector.h" +#include "MultiSequence.h" +#include "ScoreType.h" +#include "ProbabilisticModel.h" +#include "SparseMatrix.h" +#include +using namespace std; + +class MSAGuideTree; +struct 
TreeNode; +class MSA { +public: + MSA(int argc, char* argv[]); + ~MSA(); + + static void getSysTime(double * dtime); + MSAGuideTree* getGuideTree() { + return tree; + } + int * getSeqsWeights() { + return seqsWeights; + } +private: + //print usage + void printUsage(); + //do multiple sequence alignment + void doAlign(); + + //for sequence weights + void createSeqsWeights(int seqsNum); + void releaseSeqsWeights(); + + //weights of sequences + int * seqsWeights; + //guide tree + MSAGuideTree* tree; + //output file + string alignOutFileName; + std::ostream* alignOutFile; +private: + SafeVector ParseParams(int argc, char *argv[]); + void PrintParameters(const char *message, const VF &initDistrib, + const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs, + const VF &emitSingle, const char *filename); + + SafeVector PostProbsParseParams(int argc, char **argv); + MultiSequence *doAlign(MultiSequence *sequence, + const ProbabilisticModel &model, int levelid); + void ReadParameters(); + MultiSequence* ProcessTree(TreeNode *tree, MultiSequence *sequences, + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model); + MultiSequence *ComputeFinalAlignment(MSAGuideTree *tree, + MultiSequence *sequences, + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model,int levelid); + MultiSequence *AlignAlignments(MultiSequence *align1, MultiSequence *align2, + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model); + SafeVector > DoRelaxation(float* seqsWeights, + MultiSequence *sequences, + SafeVector > &sparseMatrices); + void Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY, + VF &posterior); + void Relax1(float weight, SparseMatrix *matXZ, SparseMatrix *matZY, + VF &posterior); + int DoIterativeRefinement( + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model, MultiSequence* &alignment); + void DoIterativeRefinementTreeNode( + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model, 
MultiSequence* &alignment, + int nodeIndex); + void WriteAnnotation(MultiSequence *alignment, + const SafeVector > &sparseMatrices); + int ComputeScore(const SafeVector > &active, + const SafeVector > &sparseMatrices); + int AdjustmentTest(MultiSequence *sequences,const ProbabilisticModel &model); +#ifdef _OPENMP + //private struct + struct SeqsPair { + int seq1; + int seq2; + }; + int numPairs; + SeqsPair* seqsPairs; +#endif +}; + +#endif diff --git a/binaries/src/GLProbs-1.0/MSA2.cpp b/binaries/src/GLProbs-1.0/MSA2.cpp new file mode 100644 index 0000000..f27a7bc --- /dev/null +++ b/binaries/src/GLProbs-1.0/MSA2.cpp @@ -0,0 +1,1562 @@ +/*********************************************** + * # Copyright 2009-2010. Liu Yongchao + * # Contact: Liu Yongchao, School of Computer Engineering, + * # Nanyang Technological University. + * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com + * # + * # GPL version 3.0 applies. + * # + * ************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "MSA.h" +#include "MSAClusterTree.h" +#include "Defaults.h" + +#ifdef _OPENMP +#include +#endif + +string parametersInputFilename = ""; +string parametersOutputFilename = "no training"; +string annotationFilename = ""; + +bool enableVerbose = false; +bool enableAnnotation = false; +bool enableClustalWOutput = false; +bool enableAlignOrder = false; +int numConsistencyReps = 2; +int numPreTrainingReps = 0; +int numIterativeRefinementReps = 100; + +float cutoff = 0; + +VF initDistrib(NumMatrixTypes); +VF gapOpen(2 * NumInsertStates); +VF gapExtend(2 * NumInsertStates); +VVF emitPairs(256, VF(256, 1e-10)); +VF emitSingle(256, 1e-5); + +string alphabet = alphabetDefault; + +const int MIN_PRETRAINING_REPS = 0; +const int MAX_PRETRAINING_REPS = 20; +const int MIN_CONSISTENCY_REPS = 0; +const int MAX_CONSISTENCY_REPS = 5; +const int MIN_ITERATIVE_REFINEMENT_REPS = 0; +const 
int MAX_ITERATIVE_REFINEMENT_REPS = 1000; + +string posteriorProbsFilename = ""; +bool allscores = true; +string infilename; + +int flag_gui = 0; //0: no gui related o/p +//1: gui related o/p generated +int flag_ppscore = 0; //0: no pp score sequence added to o/p fasta alignment +//1: pp score seq added to o/p fasta alignment + +/////////////////////////////// +// global scoring matrix variables +////////////////////////////// +float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2; +char *aminos, *bases, matrixtype[20] = "gonnet_160"; +int subst_index[26]; + +double sub_matrix[26][26]; +double normalized_matrix[26][26];// add by YE Yongtao +int firstread = 0; //this makes sure that matrices are read only once + +float TEMPERATURE = 5; +int MATRIXTYPE = 160; +int prot_nuc = 0; //0=prot, 1=nucleotide + +float GAPOPEN = 0; +float GAPEXT = 0; +int numThreads = 0; + +//argument support +typedef struct { + char input[30]; + int matrix; + int N; + float T; + float beta; + char opt; //can be 'P' or 'M' + float gapopen; + float gapext; +} argument_decl; + +argument_decl argument; + +extern inline void read_sustitution_matrix(char *fileName); +extern void setmatrixtype(int le); +extern inline int matrixtype_to_int(); +extern inline void read_dna_matrix(); +extern inline void read_vtml_la_matrix(); +extern void init_arguments(); + +MSA::MSA(int argc, char* argv[]) { + //parse program parameters + SafeVector sequenceNames = ParseParams(argc, argv); + + //initialize arguments for partition function + init_arguments(); + + ReadParameters(); + //PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL); + + //read the input sequences + MultiSequence *sequences = new MultiSequence(); + assert(sequences); + for (int i = 0; i < (int) sequenceNames.size(); i++) { + cerr << "Loading sequence file: " << sequenceNames[i] << endl; + sequences->LoadMFA(sequenceNames[i], true); + } + //allocate space for sequence weights + this->seqsWeights = 
new int[sequences->GetNumSequences()]; + //initilaize parameters for OPENMP +#ifdef _OPENMP + if(numThreads <= 0) { + numThreads = omp_get_num_procs(); + cerr << "Automatically detected " << numThreads << " CPU cores" << endl; + } + cerr <<"Enabling OpenMP (with "<WriteALN(*alignOutFile); + } else { + alignment->WriteMFA(*alignOutFile); + } + //release resources + delete[] this->seqsWeights; + delete alignment; +*/ + delete sequences; +} +MSA::~MSA() { + /*close the output file*/ + if (alignOutFileName.length() > 0) { + ((std::ofstream*) alignOutFile)->close(); + } +} +///////////////////////////////////////////////////////////////// +// PrintParameters() +// +// Prints MSAPROBS parameters to STDERR. If a filename is +// specified, then the parameters are also written to the file. +///////////////////////////////////////////////////////////////// + +void MSA::PrintParameters(const char *message, const VF &initDistrib, + const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs, + const VF &emitSingle, const char *filename) { + + // print parameters to the screen + cerr << message << endl << " initDistrib[] = { "; + for (int i = 0; i < NumMatrixTypes; i++) + cerr << setprecision(10) << initDistrib[i] << " "; + cerr << "}" << endl << " gapOpen[] = { "; + for (int i = 0; i < NumInsertStates * 2; i++) + cerr << setprecision(10) << gapOpen[i] << " "; + cerr << "}" << endl << " gapExtend[] = { "; + for (int i = 0; i < NumInsertStates * 2; i++) + cerr << setprecision(10) << gapExtend[i] << " "; + cerr << "}" << endl << endl; + + // if a file name is specified + if (filename) { + + // attempt to open the file for writing + FILE *file = fopen(filename, "w"); + if (!file) { + cerr << "ERROR: Unable to write parameter file: " << filename + << endl; + exit(1); + } + + // if successful, then write the parameters to the file + for (int i = 0; i < NumMatrixTypes; i++) + fprintf(file, "%.10f ", initDistrib[i]); + fprintf(file, "\n"); + for (int i = 0; i < 2 * NumInsertStates; 
i++) + fprintf(file, "%.10f ", gapOpen[i]); + fprintf(file, "\n"); + for (int i = 0; i < 2 * NumInsertStates; i++) + fprintf(file, "%.10f ", gapExtend[i]); + fprintf(file, "\n"); + fprintf(file, "%s\n", alphabet.c_str()); + for (int i = 0; i < (int) alphabet.size(); i++) { + for (int j = 0; j <= i; j++) + fprintf(file, "%.10f ", + emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]); + fprintf(file, "\n"); + } + for (int i = 0; i < (int) alphabet.size(); i++) + fprintf(file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]); + fprintf(file, "\n"); + fclose(file); + } +} + +///////////////////////////////////////////////////////////////// +// doAlign() +// +// First computes all pairwise posterior probability matrices. +// Then, computes new parameters if training, or a final +// alignment, otherwise. +///////////////////////////////////////////////////////////////// +extern VF *ComputePostProbs(int a, int b, string seq1, string seq2); +MultiSequence* MSA::doAlign(MultiSequence *sequences, + const ProbabilisticModel &model, int levelid) { + assert(sequences); + + //get the number of sequences + const int numSeqs = sequences->GetNumSequences(); + //create distance matrix + VVF distances(numSeqs, VF(numSeqs, 0)); + //creat sparseMatrices + SafeVector > sparseMatrices(numSeqs, + SafeVector(numSeqs, NULL)); + +#ifdef _OPENMP + //calculate sequence pairs for openmp model + int pairIdx = 0; + numPairs = (numSeqs - 1) * numSeqs / 2; + seqsPairs = new SeqsPair[numPairs]; + for(int a = 0; a < numSeqs; a++) { + for(int b = a + 1; b < numSeqs; b++) { + seqsPairs[pairIdx].seq1 = a; + seqsPairs[pairIdx].seq2 = b; + pairIdx++; + } + } +#endif + // do all pairwise alignments for posterior probability matrices +#ifdef _OPENMP +#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic) + for(pairIdx = 0; pairIdx < numPairs; pairIdx++) { + int a= seqsPairs[pairIdx].seq1; + int b = seqsPairs[pairIdx].seq2; + if(enableVerbose) { +#pragma omp critical + 
cerr <<"tid "<GetSequence(a); + Sequence *seq2 = sequences->GetSequence(b); + + //posterior probability matrix + VF* posterior; + + +//high similarity use global model + //if(levelid == 2) + if(1) posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString()); + +/* +//low similarity use local model + else if(levelid == 1){ + VF *forward = model.ComputeForwardMatrix(seq1, seq2,false); + assert(forward); + VF *backward = model.ComputeBackwardMatrix(seq1, seq2,false); + assert(backward); + posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false); + delete forward; + delete backward; + } + +//extreme low or extreme high similarity use combined model + else{ + +//probcons + // compute forward and backward probabilities + VF *forward = model.ComputeForwardMatrix(seq1, seq2); + assert(forward); + VF *backward = model.ComputeBackwardMatrix(seq1, seq2); + assert(backward); + // compute posterior probability matrix from HMM + VF *probcons_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward); + assert(probcons_posterior); + delete forward; + delete backward; + +//probalign + VF *probalign_posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString()); + assert(probalign_posterior); +//local + forward = model.ComputeForwardMatrix(seq1, seq2,false); + assert(forward); + backward = model.ComputeBackwardMatrix(seq1, seq2,false); + assert(backward); + posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false); + assert(posterior); + delete forward; + delete backward; +//combined model + //merge probalign + local + probcons + VF::iterator ptr1 = probcons_posterior->begin(); + VF::iterator ptr2 = probalign_posterior->begin(); + VF::iterator ptr = posterior->begin(); + for (int i = 0; i <= seq1->GetLength(); i++) { + for (int j = 0; j <= seq2->GetLength(); j++) { + float v1 = *ptr1; + float v2 = *ptr2; + float v3 = *ptr; + *ptr = sqrt((v1*v1 + v2*v2 + v3*v3)/3); + ptr1++; + ptr2++; + ptr++; + } + } 
+ delete probcons_posterior; + delete probalign_posterior; + } +*/ + assert(posterior); + // perform the pairwise sequence alignment + pair *, float> alignment = model.ComputeAlignment( + seq1->GetLength(), seq2->GetLength(), *posterior); + + //compute expected accuracy + distances[a][b] = distances[b][a] = 1.0f - alignment.second + / min(seq1->GetLength(), seq2->GetLength()); + + // compute sparse representations + sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(), + seq2->GetLength(), *posterior); + sparseMatrices[b][a] = NULL; + + delete posterior; + delete alignment.first; +#ifndef _OPENMP + } +#endif + } + + //create the guide tree + this->tree = new MSAClusterTree(this, distances, numSeqs); + this->tree->create(); + + // perform the consistency transformation the desired number of times + float* fweights = new float[numSeqs]; + for (int r = 0; r < numSeqs; r++) { + fweights[r] = ((float) seqsWeights[r]) / INT_MULTIPLY; + fweights[r] *= 10; + } + for (int r = 0; r < numConsistencyReps; r++) { + SafeVector > newSparseMatrices = + DoRelaxation(fweights, sequences, sparseMatrices); + + // now replace the old posterior matrices + for (int i = 0; i < numSeqs; i++) { + for (int j = 0; j < numSeqs; j++) { + delete sparseMatrices[i][j]; + sparseMatrices[i][j] = newSparseMatrices[i][j]; + } + } + } + delete[] fweights; +#ifdef _OPENMP + delete [] seqsPairs; +#endif + + //compute the final multiple sequence alignment + MultiSequence *finalAlignment = ComputeFinalAlignment(this->tree, sequences, + sparseMatrices, model); + + // build annotation + if (enableAnnotation) { + WriteAnnotation(finalAlignment, sparseMatrices); + } + //destroy the guide tree + delete this->tree; + this->tree = 0; + + // delete sparse matrices + for (int a = 0; a < numSeqs - 1; a++) { + for (int b = a + 1; b < numSeqs; b++) { + delete sparseMatrices[a][b]; + delete sparseMatrices[b][a]; + } + } + + return finalAlignment; +} + 
/////////////////////////////////////////////////////////////////
// GetInteger()
//
// Attempts to parse an integer from the character string given.
// Returns true only if no parsing error occurs.
//
// data: NUL-terminated text to parse. Leading whitespace and the
//       usual strtol() base prefixes ("0x..." hex, "0..." octal)
//       are accepted; the number must extend to the end of the
//       string.
// val:  receives the parsed value; left untouched on failure.
/////////////////////////////////////////////////////////////////

bool GetInteger(char *data, int *val) {
	assert(val);

	// a missing or empty string contains no number
	if (data == NULL || *data == '\0')
		return false;

	char *endPtr = NULL;
	errno = 0;
	const long int retVal = strtol(data, &endPtr, 0);

	// reject input with no digits at all, and input with characters left
	// over after the number (e.g. "12abc"), which is a parsing error too
	if (endPtr == data || *endPtr != '\0')
		return false;
	// strtol() reports values outside the range of long through errno
	if (errno != 0)
		return false;
	// long may be wider than int
	if (retVal < (long) INT_MIN || retVal > (long) INT_MAX)
		return false;

	*val = (int) retVal;
	return true;
}

/////////////////////////////////////////////////////////////////
// GetFloat()
//
// Attempts to parse a float from the character string given.
// Returns true only if no parsing error occurs.
//
// data: NUL-terminated text to parse; the number must extend to
//       the end of the string.
// val:  receives the parsed value; left untouched on failure.
/////////////////////////////////////////////////////////////////

bool GetFloat(char *data, float *val) {
	assert(val);

	// a missing or empty string contains no number
	if (data == NULL || *data == '\0')
		return false;

	char *endPtr = NULL;
	errno = 0;
	const double retVal = strtod(data, &endPtr);

	// reject input with no number at all and input with trailing characters
	if (endPtr == data || *endPtr != '\0')
		return false;
	// range errors: a result that collapsed to zero or a very large magnitude
	if (errno != 0
			&& (retVal == 0 || retVal >= 1000000.0 || retVal <= -1000000.0))
		return false;

	*val = (float) retVal;
	return true;
}

/////////////////////////////////////////////////////////////////
// ReadParameters()
//
// Read initial distribution, transition, and emission
// parameters from a file.
+///////////////////////////////////////////////////////////////// + +void MSA::ReadParameters() { + + ifstream data; + + emitPairs = VVF(256, VF(256, 1e-10)); + emitSingle = VF(256, 1e-5); + + // read initial state distribution and transition parameters + if (parametersInputFilename == string("")) { + if (NumInsertStates == 1) { + for (int i = 0; i < NumMatrixTypes; i++) + initDistrib[i] = initDistrib1Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapOpen[i] = gapOpen1Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapExtend[i] = gapExtend1Default[i]; + } else if (NumInsertStates == 2) { + for (int i = 0; i < NumMatrixTypes; i++) + initDistrib[i] = initDistrib2Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapOpen[i] = gapOpen2Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapExtend[i] = gapExtend2Default[i]; + } else { + cerr + << "ERROR: No default initial distribution/parameter settings exist" + << endl << " for " << NumInsertStates + << " pairs of insert states. Use --paramfile." 
<< endl; + exit(1); + } + + alphabet = alphabetDefault; + + for (int i = 0; i < (int) alphabet.length(); i++) { + emitSingle[(unsigned char) tolower(alphabet[i])] = + emitSingleDefault[i]; + emitSingle[(unsigned char) toupper(alphabet[i])] = + emitSingleDefault[i]; + for (int j = 0; j <= i; j++) { + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = emitPairsDefault[i][j]; + } + } + } else { + data.open(parametersInputFilename.c_str()); + if (data.fail()) { + cerr << "ERROR: Unable to read parameter file: " + << parametersInputFilename << endl; + exit(1); + } + + string line[3]; + for (int i = 0; i < 3; i++) { + if (!getline(data, line[i])) { + cerr + << "ERROR: Unable to read transition parameters from parameter file: " + << parametersInputFilename << endl; + exit(1); + } + } + istringstream data2; + data2.clear(); + data2.str(line[0]); + for (int i = 0; i < NumMatrixTypes; i++) + data2 >> initDistrib[i]; + data2.clear(); + data2.str(line[1]); + for (int i = 0; i < 2 * NumInsertStates; i++) + data2 >> gapOpen[i]; + data2.clear(); + data2.str(line[2]); + for (int i = 0; i < 2 * NumInsertStates; i++) + data2 >> 
gapExtend[i]; + + if (!getline(data, line[0])) { + cerr << "ERROR: Unable to read alphabet from scoring matrix file: " + << parametersInputFilename << endl; + exit(1); + } + + // read alphabet as concatenation of all characters on alphabet line + alphabet = ""; + string token; + data2.clear(); + data2.str(line[0]); + while (data2 >> token) + alphabet += token; + + for (int i = 0; i < (int) alphabet.size(); i++) { + for (int j = 0; j <= i; j++) { + float val; + data >> val; + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = val; + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = val; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = val; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = val; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = val; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = val; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = val; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = val; + } + } + + for (int i = 0; i < (int) alphabet.size(); i++) { + float val; + data >> val; + emitSingle[(unsigned char) tolower(alphabet[i])] = val; + emitSingle[(unsigned char) toupper(alphabet[i])] = val; + } + data.close(); + } +} + +///////////////////////////////////////////////////////////////// +// ParseParams() +// +// Parse all command-line options. 
+///////////////////////////////////////////////////////////////// +void MSA::printUsage() { + cerr + << "************************************************************************" + << endl + << "\tMSAPROBS is a open-source protein multiple sequence alignment algorithm" + << endl + << "\tbased on pair hidden markov model and partition function postirior" + << endl + << "\tprobabilities. If any comments or problems, please contact" + << endl + << "\tLiu Yongchao(liuy0039@ntu.edu.sg or nkcslyc@hotmail.com)" + << endl + << "*************************************************************************" + << endl << "Usage:" << endl + << " msaprobs [OPTION]... [infile]..." << endl << endl + << "Description:" << endl + << " Align sequences in multi-FASTA format" << endl << endl + << " -o, --outfile " << endl + << " specify the output file name (STDOUT by default)" + << endl << " -num_threads " << endl + << " specify the number of threads used, and otherwise detect automatically" + << endl << " -clustalw" << endl + << " use CLUSTALW output format instead of FASTA format" + << endl << endl << " -c, --consistency REPS" << endl + << " use " << MIN_CONSISTENCY_REPS << " <= REPS <= " + << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps + << ") passes of consistency transformation" << endl << endl + << " -ir, --iterative-refinement REPS" << endl + << " use " << MIN_ITERATIVE_REFINEMENT_REPS + << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS << " (default: " + << numIterativeRefinementReps << ") passes of iterative-refinement" + << endl << endl << " -v, --verbose" << endl + << " report progress while aligning (default: " + << (enableVerbose ? "on" : "off") << ")" << endl << endl + << " -annot FILENAME" << endl + << " write annotation for multiple alignment to FILENAME" + << endl << endl << " -a, --alignment-order" << endl + << " print sequences in alignment order rather than input order (default: " + << (enableAlignOrder ? 
"on" : "off") << ")" << endl + << " -version " << endl + << " print out version of MSAPROBS " << endl << endl; +} +SafeVector MSA::ParseParams(int argc, char **argv) { + if (argc < 2) { + printUsage(); + exit(1); + } + SafeVector sequenceNames; + int tempInt; + float tempFloat; + + for (int i = 1; i < argc; i++) { + if (argv[i][0] == '-') { + //help + if (!strcmp(argv[i], "-help") || !strcmp(argv[i], "-?")) { + printUsage(); + exit(1); + //output file name + } else if (!strcmp(argv[i], "-o") + || !strcmp(argv[i], "--outfile")) { + if (i < argc - 1) { + alignOutFileName = argv[++i]; //get the file name + } else { + cerr << "ERROR: String expected for option " << argv[i] + << endl; + exit(1); + } + // parameter file + } else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){ + if (i < argc - 1) + parametersInputFilename = string (argv[++i]); + else { + cerr << "ERROR: Filename expected for option " << argv[i] << endl; + exit (1); + } + //number of threads used + } else if (!strcmp(argv[i], "-p") + || !strcmp(argv[i], "-num_threads")) { + if (i < argc - 1) { + if (!GetInteger(argv[++i], &tempInt)) { + cerr << " ERROR: invalid integer following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempInt < 0) { + tempInt = 0; + } + numThreads = tempInt; + } + } else { + cerr << "ERROR: Integer expected for option " << argv[i] + << endl; + exit(1); + } + // number of consistency transformations + } else if (!strcmp(argv[i], "-c") + || !strcmp(argv[i], "--consistency")) { + if (i < argc - 1) { + if (!GetInteger(argv[++i], &tempInt)) { + cerr << "ERROR: Invalid integer following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempInt < MIN_CONSISTENCY_REPS + || tempInt > MAX_CONSISTENCY_REPS) { + cerr << "ERROR: For option " << argv[i - 1] + << ", integer must be between " + << MIN_CONSISTENCY_REPS << " and " + << MAX_CONSISTENCY_REPS << "." 
<< endl; + exit(1); + } else { + numConsistencyReps = tempInt; + } + } + } else { + cerr << "ERROR: Integer expected for option " << argv[i] + << endl; + exit(1); + } + } + + // number of randomized partitioning iterative refinement passes + else if (!strcmp(argv[i], "-ir") + || !strcmp(argv[i], "--iterative-refinement")) { + if (i < argc - 1) { + if (!GetInteger(argv[++i], &tempInt)) { + cerr << "ERROR: Invalid integer following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS + || tempInt > MAX_ITERATIVE_REFINEMENT_REPS) { + cerr << "ERROR: For option " << argv[i - 1] + << ", integer must be between " + << MIN_ITERATIVE_REFINEMENT_REPS << " and " + << MAX_ITERATIVE_REFINEMENT_REPS << "." + << endl; + exit(1); + } else + numIterativeRefinementReps = tempInt; + } + } else { + cerr << "ERROR: Integer expected for option " << argv[i] + << endl; + exit(1); + } + } + + // annotation files + else if (!strcmp(argv[i], "-annot")) { + enableAnnotation = true; + if (i < argc - 1) { + annotationFilename = argv[++i]; + } else { + cerr << "ERROR: FILENAME expected for option " << argv[i] + << endl; + exit(1); + } + } + + // clustalw output format + else if (!strcmp(argv[i], "-clustalw")) { + enableClustalWOutput = true; + } + + // cutoff + else if (!strcmp(argv[i], "-co") || !strcmp(argv[i], "--cutoff")) { + if (i < argc - 1) { + if (!GetFloat(argv[++i], &tempFloat)) { + cerr + << "ERROR: Invalid floating-point value following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempFloat < 0 || tempFloat > 1) { + cerr << "ERROR: For option " << argv[i - 1] + << ", floating-point value must be between 0 and 1." 
+ << endl; + exit(1); + } else + cutoff = tempFloat; + } + } else { + cerr << "ERROR: Floating-point value expected for option " + << argv[i] << endl; + exit(1); + } + } + + // verbose reporting + else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--verbose")) { + enableVerbose = true; + } + + // alignment order + else if (!strcmp(argv[i], "-a") + || !strcmp(argv[i], "--alignment-order")) { + enableAlignOrder = true; + } + + //print out version + else if (!strcmp(argv[i], "-version")) { + cerr << "MSAPROBS version " << VERSION << endl; + exit(1); + } + // bad arguments + else { + cerr << "ERROR: Unrecognized option: " << argv[i] << endl; + exit(1); + } + } else { + sequenceNames.push_back(string(argv[i])); + } + } + + /*check the output file name*/ + cerr << "-------------------------------------" << endl; + if (alignOutFileName.length() == 0) { + cerr << "The final alignments will be printed out to STDOUT" << endl; + alignOutFile = &std::cout; + } else { + cerr << "Open the output file " << alignOutFileName << endl; + alignOutFile = new ofstream(alignOutFileName.c_str(), + ios::binary | ios::out | ios::trunc); + } + cerr << "-------------------------------------" << endl; + return sequenceNames; +} + +///////////////////////////////////////////////////////////////// +// ProcessTree() +// +// Process the tree recursively. Returns the aligned sequences +// corresponding to a node or leaf of the tree. 
+///////////////////////////////////////////////////////////////// +MultiSequence* MSA::ProcessTree(TreeNode *tree, MultiSequence *sequences, + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model) { + + MultiSequence *result; + + // check if this is a node of the alignment tree + //if (tree->GetSequenceLabel() == -1){ + if (tree->leaf == NODE) { + MultiSequence *alignLeft = ProcessTree(tree->left, sequences, + sparseMatrices, model); + MultiSequence *alignRight = ProcessTree(tree->right, sequences, + sparseMatrices, model); + + assert(alignLeft); + assert(alignRight); + + result = AlignAlignments(alignLeft, alignRight, sparseMatrices, model); + assert(result); + + delete alignLeft; + delete alignRight; + } + + // otherwise, this is a leaf of the alignment tree + else { + result = new MultiSequence(); + assert(result); + //result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone()); + result->AddSequence(sequences->GetSequence(tree->idx)->Clone()); + } + + return result; +} + +///////////////////////////////////////////////////////////////// +// ComputeFinalAlignment() +// +// Compute the final alignment by calling ProcessTree(), then +// performing iterative refinement as needed. 
+///////////////////////////////////////////////////////////////// + +MultiSequence* MSA::ComputeFinalAlignment(MSAGuideTree*tree, + MultiSequence *sequences, + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model) { + MultiSequence *alignment = ProcessTree(tree->getRoot(), sequences, + sparseMatrices, model); + + SafeVector oldOrdering; + int numSeqs = alignment->GetNumSequences(); + if (enableAlignOrder) { + for (int i = 0; i < numSeqs; i++) + oldOrdering.push_back(alignment->GetSequence(i)->GetSortLabel()); + alignment->SaveOrdering(); + enableAlignOrder = false; + } + + // tree-based refinement + // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree); + /* + int numSeqs = alignment->GetNumSequences(); + //if(numSeqs < numIterativeRefinementReps){ + for(int iter = 0; iter < 5; iter ++){ + for(int i = 0; i < numSeqs - 1; i++){ + DoIterativeRefinementTreeNode(sparseMatrices, model, alignment, i); + } + } + //}*/ + //DoIterativeRefinement() return 1,2: this refinement unsuccessful +/* + int ineffectiveness = 0; + for (int i = 0; i < numIterativeRefinementReps; i++){ + int flag = DoIterativeRefinement(sparseMatrices, model, alignment); + if(numSeqs > 25){ + if(flag > 0){ + if(numIterativeRefinementReps < 20*numSeqs) + numIterativeRefinementReps ++; + if(flag == 1) ineffectiveness ++; + } + //else ineffectiveness = 0; + if(ineffectiveness > 2*numSeqs && i >100 ) break; + } + } +*/ + + for (int i = 0; i < numIterativeRefinementReps; i++) + DoIterativeRefinement(sparseMatrices, model, alignment); + + cerr << endl; + + if (oldOrdering.size() > 0) { + for (int i = 0; i < (int) oldOrdering.size(); i++) { + alignment->GetSequence(i)->SetSortLabel(oldOrdering[i]); + } + } + + // return final alignment + return alignment; +} + +///////////////////////////////////////////////////////////////// +// AlignAlignments() +// +// Returns the alignment of two MultiSequence objects. 
+///////////////////////////////////////////////////////////////// + +MultiSequence* MSA::AlignAlignments(MultiSequence *align1, + MultiSequence *align2, + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model) { + + // print some info about the alignment + if (enableVerbose) { + for (int i = 0; i < align1->GetNumSequences(); i++) + cerr << ((i == 0) ? "[" : ",") + << align1->GetSequence(i)->GetLabel(); + cerr << "] vs. "; + for (int i = 0; i < align2->GetNumSequences(); i++) + cerr << ((i == 0) ? "[" : ",") + << align2->GetSequence(i)->GetLabel(); + cerr << "]: "; + } +#if 0 + VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff); +#else + VF *posterior = model.BuildPosterior(getSeqsWeights(), align1, align2, + sparseMatrices, cutoff); +#endif + // compute an "accuracy" measure for the alignment before refinement + + pair *, float> alignment; + //perform alignment + alignment = model.ComputeAlignment(align1->GetSequence(0)->GetLength(), + align2->GetSequence(0)->GetLength(), *posterior); + + delete posterior; + + if (enableVerbose) { + + // compute total length of sequences + int totLength = 0; + for (int i = 0; i < align1->GetNumSequences(); i++) + for (int j = 0; j < align2->GetNumSequences(); j++) + totLength += min(align1->GetSequence(i)->GetLength(), + align2->GetSequence(j)->GetLength()); + + // give an "accuracy" measure for the alignment + cerr << alignment.second / totLength << endl; + } + + // now build final alignment + MultiSequence *result = new MultiSequence(); + for (int i = 0; i < align1->GetNumSequences(); i++) + result->AddSequence( + align1->GetSequence(i)->AddGaps(alignment.first, 'X')); + for (int i = 0; i < align2->GetNumSequences(); i++) + result->AddSequence( + align2->GetSequence(i)->AddGaps(alignment.first, 'Y')); + if (!enableAlignOrder) + result->SortByLabel(); + + // free temporary alignment + delete alignment.first; + + return result; +} + 
+///////////////////////////////////////////////////////////////// +// DoRelaxation() +// +// Performs one round of the weighted probabilistic consistency transformation. +// 1 +///////////////////////////////////////////////////////////////// + +SafeVector > MSA::DoRelaxation(float* seqsWeights, + MultiSequence *sequences, + SafeVector > &sparseMatrices) { + const int numSeqs = sequences->GetNumSequences(); + + SafeVector > newSparseMatrices(numSeqs, + SafeVector(numSeqs, NULL)); + + // for every pair of sequences +#ifdef _OPENMP + int pairIdx; +#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic) + for(pairIdx = 0; pairIdx < numPairs; pairIdx++) { + int i = seqsPairs[pairIdx].seq1; + int j = seqsPairs[pairIdx].seq2; + float wi = seqsWeights[i]; + float wj = seqsWeights[j]; +#else + for (int i = 0; i < numSeqs; i++) { + float wi = seqsWeights[i]; + for (int j = i + 1; j < numSeqs; j++) { + float wj = seqsWeights[j]; +#endif + Sequence *seq1 = sequences->GetSequence(i); + Sequence *seq2 = sequences->GetSequence(j); + + if (enableVerbose) { +#ifdef _OPENMP +#pragma omp critical +#endif + cerr << "Relaxing (" << i + 1 << ") " << seq1->GetHeader() + << " vs. 
" << "(" << j + 1 << ") " << seq2->GetHeader() + << ": "; + } + // get the original posterior matrix + VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); + assert(posteriorPtr); + VF &posterior = *posteriorPtr; + + const int seq1Length = seq1->GetLength(); + const int seq2Length = seq2->GetLength(); + + // contribution from the summation where z = x and z = y + float w = wi * wi * wj + wi * wj * wj; + float sumW = w; + for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) { + //posterior[k] = w*posterior[k]; + posterior[k] += posterior[k]; + } + + if (enableVerbose) + cerr << sparseMatrices[i][j]->GetNumCells() << " --> "; + + // contribution from all other sequences + for (int k = 0; k < numSeqs; k++) { + if (k != i && k != j) { + float wk = seqsWeights[k]; + float w = wi * wj * wk; + sumW += w; + if (k < i) + Relax1(w, sparseMatrices[k][i], sparseMatrices[k][j], + posterior); + else if (k > i && k < j) + Relax(w, sparseMatrices[i][k], sparseMatrices[k][j], + posterior); + else { + SparseMatrix *temp = + sparseMatrices[j][k]->ComputeTranspose(); + Relax(w, sparseMatrices[i][k], temp, posterior); + delete temp; + } + } + } + //cerr<<"sumW "<::iterator XYptr = matXY->GetRowPtr(x); + SafeVector::iterator XYend = XYptr + matXY->GetRowSize(x); + VF::iterator base = posterior.begin() + x * (seq2Length + 1); + int curr = 0; + while (XYptr != XYend) { + + // zero out all cells until the first filled column + while (curr < XYptr->first) { + base[curr] = 0; + curr++; + } + + // now, skip over this column + curr++; + ++XYptr; + } + + // zero out cells after last column + while (curr <= seq2Length) { + base[curr] = 0; + curr++; + } + } + + // save the new posterior matrix + newSparseMatrices[i][j] = new SparseMatrix(seq1->GetLength(), + seq2->GetLength(), posterior); + newSparseMatrices[j][i] = NULL; + + if (enableVerbose) + cerr << newSparseMatrices[i][j]->GetNumCells() << " -- "; + + delete posteriorPtr; + + if (enableVerbose) + cerr << "done." 
<< endl; +#ifndef _OPENMP + } +#endif + } + + return newSparseMatrices; +} + +///////////////////////////////////////////////////////////////// +// Relax() +// +// Computes the consistency transformation for a single sequence +// z, and adds the transformed matrix to "posterior". +///////////////////////////////////////////////////////////////// + +void MSA::Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY, + VF &posterior) { + + assert(matXZ); + assert(matZY); + + int lengthX = matXZ->GetSeq1Length(); + int lengthY = matZY->GetSeq2Length(); + assert(matXZ->GetSeq2Length() == matZY->GetSeq1Length()); + + // for every x[i] + for (int i = 1; i <= lengthX; i++) { + SafeVector::iterator XZptr = matXZ->GetRowPtr(i); + SafeVector::iterator XZend = XZptr + matXZ->GetRowSize(i); + + VF::iterator base = posterior.begin() + i * (lengthY + 1); + + // iterate through all x[i]-z[k] + while (XZptr != XZend) { + SafeVector::iterator ZYptr = matZY->GetRowPtr(XZptr->first); + SafeVector::iterator ZYend = ZYptr + + matZY->GetRowSize(XZptr->first); + const float XZval = XZptr->second; + + // iterate through all z[k]-y[j] + while (ZYptr != ZYend) { + //base[ZYptr->first] += weight * XZval * ZYptr->second; + base[ZYptr->first] += XZval * ZYptr->second; + ZYptr++; + } + XZptr++; + } + } +} + +///////////////////////////////////////////////////////////////// +// Relax1() +// +// Computes the consistency transformation for a single sequence +// z, and adds the transformed matrix to "posterior". 
+///////////////////////////////////////////////////////////////// + +void MSA::Relax1(float weight, SparseMatrix *matZX, SparseMatrix *matZY, + VF &posterior) { + + assert(matZX); + assert(matZY); + + int lengthZ = matZX->GetSeq1Length(); + int lengthY = matZY->GetSeq2Length(); + + // for every z[k] + for (int k = 1; k <= lengthZ; k++) { + SafeVector::iterator ZXptr = matZX->GetRowPtr(k); + SafeVector::iterator ZXend = ZXptr + matZX->GetRowSize(k); + + // iterate through all z[k]-x[i] + while (ZXptr != ZXend) { + SafeVector::iterator ZYptr = matZY->GetRowPtr(k); + SafeVector::iterator ZYend = ZYptr + matZY->GetRowSize(k); + const float ZXval = ZXptr->second; + VF::iterator base = posterior.begin() + + ZXptr->first * (lengthY + 1); + + // iterate through all z[k]-y[j] + while (ZYptr != ZYend) { + //base[ZYptr->first] += weight * ZXval * ZYptr->second; + base[ZYptr->first] += ZXval * ZYptr->second; + ZYptr++; + } + ZXptr++; + } + } +} +///////////////////////////////////////////////////////////////// +// DoIterativeRefinement() +// +// Performs a single round of randomized partionining iterative +// refinement. 
+// return 0: successful refinement, 1: ineffective refinement, 2: random problem +///////////////////////////////////////////////////////////////// +int MSA::DoIterativeRefinement( + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model, MultiSequence* &alignment) { + set groupOne, groupTwo; + int numSeqs = alignment->GetNumSequences(); + int i; + // create two separate groups + for (i = 0; i < numSeqs; i++) { + int index = rand(); + if (index % 2) { + groupOne.insert(i); + } else { + groupTwo.insert(i); + } + } + if (groupOne.empty() || groupTwo.empty()) return 2; + + // project into the two groups + MultiSequence *groupOneSeqs = alignment->Project(groupOne); + assert(groupOneSeqs); + MultiSequence *groupTwoSeqs = alignment->Project(groupTwo); + assert(groupTwoSeqs); + +//start add by Yongtao +#if 1 + VF *posterior = model.BuildPosterior (groupOneSeqs, groupTwoSeqs, sparseMatrices, cutoff); +#else + VF *posterior = model.BuildPosterior(getSeqsWeights(), groupOneSeqs, groupTwoSeqs, + sparseMatrices, cutoff); +#endif + // compute an "accuracy" measure for the alignment before refinement + SafeVector::iterator> oldOnePtrs(groupOne.size()); + SafeVector::iterator> oldTwoPtrs(groupTwo.size()); + i=0; + for (set::const_iterator iter = groupOne.begin(); + iter != groupOne.end(); ++iter) { + oldOnePtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr(); + } + i=0; + for (set::const_iterator iter = groupTwo.begin(); + iter != groupTwo.end(); ++iter) { + oldTwoPtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr(); + } + + VF &posteriorArr = *posterior; + int oldLength = alignment->GetSequence(0)->GetLength(); + int groupOneindex=0; int groupTwoindex=0; + float accuracy_before = 0; + int j; + for (i = 1; i <= oldLength; i++) { + // check to see if there is a gap in every sequence of the set + bool foundOne = false; + for (j = 0; !foundOne && j < (int) groupOne.size(); j++) + foundOne = (oldOnePtrs[j][i] != '-'); + // if not, then this column counts towards 
the sequence length + if (foundOne) groupOneindex ++; + bool foundTwo = false; + for (j = 0; !foundTwo && j < (int) groupTwo.size(); j++) + foundTwo = (oldTwoPtrs[j][i] != '-'); + if (foundTwo) groupTwoindex ++; + if(foundOne && foundTwo) accuracy_before += + posteriorArr[groupOneindex * (groupTwoSeqs->GetSequence(0)->GetLength() + 1) + groupTwoindex]; + } + + pair *, float> refinealignment; + //perform alignment + refinealignment = model.ComputeAlignment(groupOneSeqs->GetSequence(0)->GetLength(), + groupTwoSeqs->GetSequence(0)->GetLength(), *posterior); + delete posterior; + // now build final alignment + MultiSequence *result = new MultiSequence(); + for (int i = 0; i < groupOneSeqs->GetNumSequences(); i++) + result->AddSequence( + groupOneSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'X')); + for (int i = 0; i < groupTwoSeqs->GetNumSequences(); i++) + result->AddSequence( + groupTwoSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'Y')); + // free temporary alignment + delete refinealignment.first; + delete alignment; + alignment = result; + delete groupOneSeqs; + delete groupTwoSeqs; + if(accuracy_before == refinealignment.second) return 1; + else return 0; +} + + +void MSA::DoIterativeRefinementTreeNode( + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model, MultiSequence* &alignment, + int nodeIndex) { + set groupOne, groupTwo; + int numSeqs = alignment->GetNumSequences(); + + vector inGroup1; + inGroup1.resize(numSeqs); + for (int i = 0; i < numSeqs; i++) { + inGroup1[i] = false; + } + + AlignmentOrder* orders = this->tree->getAlignOrders(); + AlignmentOrder* order = &orders[nodeIndex]; + for (int i = 0; i < order->leftNum; i++) { + int si = order->leftLeafs[i]; + inGroup1[si] = true; + } + for (int i = 0; i < order->rightNum; i++) { + int si = order->rightLeafs[i]; + inGroup1[si] = true; + } + // create two separate groups + for (int i = 0; i < numSeqs; i++) { + if (inGroup1[i]) { + groupOne.insert(i); + } else { + 
groupTwo.insert(i); + } + } + if (groupOne.empty() || groupTwo.empty()) + return; + + // project into the two groups + MultiSequence *groupOneSeqs = alignment->Project(groupOne); + assert(groupOneSeqs); + MultiSequence *groupTwoSeqs = alignment->Project(groupTwo); + assert(groupTwoSeqs); + delete alignment; + + // realign + alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices, + model); + + delete groupOneSeqs; + delete groupTwoSeqs; +} + +///////////////////////////////////////////////////////////////// +// WriteAnnotation() +// +// Computes annotation for multiple alignment and write values +// to a file. +///////////////////////////////////////////////////////////////// + +void MSA::WriteAnnotation(MultiSequence *alignment, + const SafeVector > &sparseMatrices) { + ofstream outfile(annotationFilename.c_str()); + + if (outfile.fail()) { + cerr << "ERROR: Unable to write annotation file." << endl; + exit(1); + } + + const int alignLength = alignment->GetSequence(0)->GetLength(); + const int numSeqs = alignment->GetNumSequences(); + + SafeVector position(numSeqs, 0); + SafeVector::iterator> seqs(numSeqs); + for (int i = 0; i < numSeqs; i++) + seqs[i] = alignment->GetSequence(i)->GetDataPtr(); + SafeVector > active; + active.reserve(numSeqs); + + SafeVector lab; + for (int i = 0; i < numSeqs; i++) + lab.push_back(alignment->GetSequence(i)->GetSortLabel()); + + // for every column + for (int i = 1; i <= alignLength; i++) { + + // find all aligned residues in this particular column + active.clear(); + for (int j = 0; j < numSeqs; j++) { + if (seqs[j][i] != '-') { + active.push_back(make_pair(lab[j], ++position[j])); + } + } + + sort(active.begin(), active.end()); + outfile << setw(4) << ComputeScore(active, sparseMatrices) << endl; + } + + outfile.close(); +} + +///////////////////////////////////////////////////////////////// +// ComputeScore() +// +// Computes the annotation score for a particular column. 
+///////////////////////////////////////////////////////////////// + +int MSA::ComputeScore(const SafeVector > &active, + const SafeVector > &sparseMatrices) { + + if (active.size() <= 1) + return 0; + + // ALTERNATIVE #1: Compute the average alignment score. + + float val = 0; + for (int i = 0; i < (int) active.size(); i++) { + for (int j = i + 1; j < (int) active.size(); j++) { + val += sparseMatrices[active[i].first][active[j].first]->GetValue( + active[i].second, active[j].second); + } + } + + return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1))); + +} + +///////////////////////////////////////////////////////////////// +// ComputeSimilarity () +// +// Computes the average similarity for a particular family. +// extreme low or extreme high similarity(<=20% or >80%) return 0 +// low similarity(20%-50%) return 1 +// high similarity(50%-80%) return 2 +///////////////////////////////////////////////////////////////// +extern pair *, float> partViterbi(string seq1, string seq2); +extern float computeS(string seq1, string seq2, SafeVector * alignment); + +int MSA::ComputeSimilarity (MultiSequence *sequences,const ProbabilisticModel &model){ + assert(sequences); + + //get the number of sequences + const int numSeqs = sequences->GetNumSequences(); + //average identity for all sequences + float identity = 0; + +#ifdef _OPENMP + //calculate sequence pairs for openmp model + int pairIdx = 0; + numPairs = (numSeqs - 1) * numSeqs / 2; + seqsPairs = new SeqsPair[numPairs]; + for(int a = 0; a < numSeqs; a++) { + for(int b = a + 1; b < numSeqs; b++) { + seqsPairs[pairIdx].seq1 = a; + seqsPairs[pairIdx].seq2 = b; + pairIdx++; + } + } +#endif + + // do all pairwise alignments for family similarity +#ifdef _OPENMP +#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic) + for(pairIdx = 0; pairIdx < numPairs; pairIdx++) { + int a= seqsPairs[pairIdx].seq1; + int b = seqsPairs[pairIdx].seq2; + if(enableVerbose) { +#pragma omp critical + 
cerr <<"tid "<GetSequence(a); + Sequence *seq2 = sequences->GetSequence(b); + + //pair *, float> alignment = ::partViterbi(seq1->GetString(),seq2->GetString()); + //cerr << alignment.second / alignment.first->size(); + //cerr << computeS(seq1->GetString(),seq2->GetString(),alignment.first)<< endl; + pair *, float> alignment = model.ComputeViterbiAlignment(seq1,seq2); +/* + VF* posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString()); + pair *, float> alignment = model.ComputeAlignment( + seq1->GetLength(), seq2->GetLength(), *posterior); +*/ +/* + SafeVector::iterator iter1 = seq1->GetDataPtr(); + SafeVector::iterator iter2 = seq2->GetDataPtr(); + + float N_correct_match = 0; + //float N_match; + //float N_column = 0; + //float N_alignment = 0; + int i = 1;int j = 1; + //bool start = false; bool end = false; + for (SafeVector::iterator iter = alignment.first->begin(); + iter != alignment.first->end(); ++iter){ + if (*iter == 'B'){ + //N_match += 1; + //start = true; + //if(i==seq1->GetLength() || j==seq2->GetLength()) end = true; + unsigned char c1 = (unsigned char) iter1[i++]; + unsigned char c2 = (unsigned char) iter2[j++]; + if(c1==c2) N_correct_match += 1; + } + else if(*iter == 'X') i++; + else if(*iter == 'Y') j++; + //if(start && !end) N_column += 1; + N_alignment += 1; + } + if(i!= seq1->GetLength()+1 || j!= seq2->GetLength() + 1 ) cerr << "similarity error"<< endl; + identity += N_correct_match / N_alignment; + // +*/ + identity += alignment.second / alignment.first->size(); + delete alignment.first; +#ifndef _OPENMP + } +#endif + } + identity /= numPairs; + +FILE *fi = fopen ("accuracy", "a"); +fprintf (fi, " %.10f ", identity); fprintf (fi, "\n"); +fclose (fi); + +/* + //adapative + if(identity <= 0.15) initDistrib[2] = 0.143854; + else if(identity <= 0.2) initDistrib[2] = 0.191948; + else if(identity <= 0.25) initDistrib[2] = 0.170705; + else if(identity <= 0.3) initDistrib[2] = 0.100675; + else if(identity <= 0.35) initDistrib[2] = 
0.090755; + else if(identity <= 0.4) initDistrib[2] = 0.146188; + else if(identity <= 0.45) initDistrib[2] = 0.167858; + else if(identity <= 0.5) initDistrib[2] = 0.250769; + //else if(identity <= 0.6) initDistrib[2] = 0.500829; + //else if(identity <= 0.7) initDistrib[2] = 0.259622; +*/ + if( identity<= 0.25 || identity > 0.8 ) return 0; + else if(identity > 0.2 && identity<= 0.4) return 1; + else return 2; + +} diff --git a/binaries/src/GLProbs-1.0/MSAClusterTree.cpp b/binaries/src/GLProbs-1.0/MSAClusterTree.cpp new file mode 100644 index 0000000..3bf34a1 --- /dev/null +++ b/binaries/src/GLProbs-1.0/MSAClusterTree.cpp @@ -0,0 +1,153 @@ +/*********************************************** + * # Copyright 2009-2010. Liu Yongchao + * # Contact: Liu Yongchao, School of Computer Engineering, + * # Nanyang Technological University. + * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com + * # + * # GPL version 3.0 applies. + * # + * ************************************************/ + +#include "MSAClusterTree.h" +MSAClusterTree::MSAClusterTree(MSA* msa, VVF& distMatrix, int numSeqs) : + MSAGuideTree(msa, distMatrix, numSeqs) { +} +MSAClusterTree::~MSAClusterTree() { +} +void MSAClusterTree::create() { + //generate the neighbor-joining tree + this->generateClusterTree(); + + //calculate sequence weights + this->getSeqsWeights(); + + //construct the alignment orders + this->createAlignmentOrders(); +} +void MSAClusterTree::generateClusterTree() { + int i; + ValidNode* validNodes, *headValidNodes; + ValidNode* miniPtr, *minjPtr, *ivalid, *jvalid; + int mini, minj; + float* joins; + unsigned int* clusterLeafs; + + //initialize the valid nodes link list + validNodes = new ValidNode[leafsNum + 1]; + joins = new float[leafsNum + 1]; + clusterLeafs = new unsigned int[nodesNum + 1]; + if (!validNodes || !joins || !clusterLeafs) { + cerr << "Out of memory of the reconstruction of cluster tree" << endl; + } + //initialize cluster size + for (i = 0; i < this->leafsNum; i++) { + 
clusterLeafs[i] = 1; + } + + headValidNodes = &validNodes[0]; + headValidNodes->next = &validNodes[1]; + headValidNodes->n = -1; + headValidNodes->node = -1; + headValidNodes->prev = NULL; + + //build an initial link list + ValidNode* curr = &validNodes[1]; + ValidNode* prev = headValidNodes; + ValidNode* next = &validNodes[2]; + for (i = 0; i < leafsNum; i++) { + curr->n = i; + curr->node = i; + curr->prev = prev; + curr->next = next; + prev = curr; + curr = next; + next++; + } + prev->next = NULL; + + //to generate the cluster tree + int nodeIdx; //the index of an internal node + int firstNode = leafsNum; //the index of the first internal node + int lastNode = firstNode + leafsNum - 1;//the index of the last internal node + + for (nodeIdx = firstNode; nodeIdx < lastNode; nodeIdx++) { + //find closest pair of clusters + float minDist = 1.1f; + miniPtr = headValidNodes; + minjPtr = headValidNodes; + + for (ivalid = headValidNodes->next; ivalid != NULL; + ivalid = ivalid->next) { + mini = ivalid->n; + + for (jvalid = headValidNodes->next; + jvalid != NULL && jvalid->n < mini; jvalid = jvalid->next) { + minj = jvalid->n; + float dist = (*distMatrix)[mini][minj]; + if (dist < 0) { + cerr + << "ERROR: It is impossible to have distance value less than zero" + << endl; + dist = 0; + } + if (dist < minDist) { + minDist = dist; + miniPtr = ivalid; + minjPtr = jvalid; + } + //printf("dist %g mini %d minj %d\n", dist, ivalid->node, jvalid->node); + } + } + //printf("**** mini %d minj %d minDist %g *****\n", miniPtr->node, minjPtr->node, minDist); + //check the validity of miniPtr and minjPtr; + if (miniPtr == headValidNodes || minjPtr == headValidNodes) { + cerr << "OOPS: Error occurred while constructing the cluster tree\n" + << endl; + exit(-1); + } + //computing branch length and join the two nodes + float branchLength = minDist * 0.5f; + this->connectNodes(&nodes[nodeIdx], nodeIdx, &nodes[miniPtr->node], + branchLength, &nodes[minjPtr->node], branchLength); + 
clusterLeafs[nodeIdx] = clusterLeafs[miniPtr->node] + + clusterLeafs[minjPtr->node]; + + //remove the valid node minjPtr from the list + minjPtr->prev->next = minjPtr->next; + if (minjPtr->next != NULL) { + minjPtr->next->prev = minjPtr->prev; + } + minjPtr->prev = minjPtr->next = NULL; + + //compute the distance of each remaining valid node to the new node + for (ivalid = headValidNodes->next; ivalid != NULL; + ivalid = ivalid->next) { + int idx = ivalid->n; + + float idist = (*distMatrix)[miniPtr->n][idx]; + float jdist = (*distMatrix)[minjPtr->n][idx]; + + unsigned int isize = clusterLeafs[miniPtr->node]; + unsigned int jsize = clusterLeafs[minjPtr->node]; + joins[idx] = (idist * isize + jdist * jsize) / (isize + jsize); + //joins[idx] = (idist + jdist )/ 2; + } + //update the distance to the new node + miniPtr->node = nodeIdx; + mini = miniPtr->n; + for (jvalid = headValidNodes->next; jvalid != NULL; + jvalid = jvalid->next) { + minj = jvalid->n; + + float dist = joins[minj]; + (*distMatrix)[mini][minj] = dist; + (*distMatrix)[minj][mini] = dist; + } + } + //add a pseudo root to this unrooted NJ tree + this->root = &nodes[lastNode - 1]; + + delete[] validNodes; + delete[] joins; + delete[] clusterLeafs; +} diff --git a/binaries/src/GLProbs-1.0/MSAClusterTree.h b/binaries/src/GLProbs-1.0/MSAClusterTree.h new file mode 100644 index 0000000..30bce05 --- /dev/null +++ b/binaries/src/GLProbs-1.0/MSAClusterTree.h @@ -0,0 +1,27 @@ +/*********************************************** + * # Copyright 2009-2010. Liu Yongchao + * # Contact: Liu Yongchao, School of Computer Engineering, + * # Nanyang Technological University. + * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com + * # + * # GPL version 3.0 applies. 
+ * # + * ************************************************/ + +#ifndef _MSA_CLUSTER_TREE_H +#define _MSA_CLUSTER_TREE_H + +#include "MSAGuideTree.h" + +class MSAClusterTree: public MSAGuideTree { +public: + MSAClusterTree(MSA* msa, VVF& distMatrix, int numSeqs); + ~MSAClusterTree(); + + //construct the cluster tree + void create(); +private: + //generate the cluster tree + void generateClusterTree(); +}; +#endif diff --git a/binaries/src/GLProbs-1.0/MSADef.h b/binaries/src/GLProbs-1.0/MSADef.h new file mode 100644 index 0000000..6a3d178 --- /dev/null +++ b/binaries/src/GLProbs-1.0/MSADef.h @@ -0,0 +1,26 @@ +#ifndef _MSA_DEF_H +#define _MSA_DEF_H +#include +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + +//maximum number +#define MAX_INT_NUM 0x7FFFFFFF +#define MAX_FLOAT_NUM FLT_MAX +#define INT_MULTIPLY 1000 + +#define SUBMATRIX_INT_SCALE 100 + +//a tree node is a leaf or a node +enum { + NONE, NODE, LEAF +}; + +#endif + diff --git a/binaries/src/GLProbs-1.0/MSAGuideTree.cpp b/binaries/src/GLProbs-1.0/MSAGuideTree.cpp new file mode 100644 index 0000000..ec9a5e8 --- /dev/null +++ b/binaries/src/GLProbs-1.0/MSAGuideTree.cpp @@ -0,0 +1,327 @@ +/*********************************************** + * # Copyright 2009-2010. Liu Yongchao + * # Contact: Liu Yongchao, School of Computer Engineering, + * # Nanyang Technological University. + * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com + * # + * # GPL version 3.0 applies. 
+ * # + * ************************************************/ +#include "MSAGuideTree.h" +#include "MSA.h" +MSAGuideTree::MSAGuideTree(MSA* msa, VVF& distances, int numSeqs) { + int i; + TreeNode* node; + //system configuration + this->msa = msa; + this->distMatrix = &distances; + this->numSeqs = numSeqs; + this->seqsWeights = msa->getSeqsWeights(); + + //tree structure + this->nodesSize = this->numSeqs * 2 + 1; + this->nodes = new TreeNode[this->nodesSize]; + if (!this->nodes) { + cerr << "TreeNodes memory allocation failed" << endl; + exit(-1); + } + //initialize all the tree nodes + this->leafs = this->nodes; + this->leafsNum = this->numSeqs; + this->nodesNum = 2 * this->leafsNum - 1; + for (i = 0; i < this->nodesSize; i++) { + node = &nodes[i]; + node->left = 0; + node->right = 0; + node->parent = 0; + node->leftIdx = -1; + node->rightIdx = -1; + node->parentIdx = -1; + node->idx = -1; + node->dist = 0; + node->leaf = NODE; //setted to be NODE, by default + node->order = 0; + node->depth = 0; + } + //initialize the leaf nodes + for (i = 0; i < this->leafsNum; i++) { + node = &this->leafs[i]; + node->idx = i; + node->leaf = LEAF; + } +} +MSAGuideTree::~MSAGuideTree() { + //release tree nodes + delete[] this->nodes; + + //release alignment orders + releaseAlignmentOrders(); + +} +//get the tree nodes +TreeNode* MSAGuideTree::getNodes() { + return nodes; +} +//get the leaf nodes +TreeNode* MSAGuideTree::getLeafs() { + return leafs; +} +//get the number of nodes; +int MSAGuideTree::getNodesNum() { + return nodesNum; +} +//get the number of leaf nodes +int MSAGuideTree::getLeafsNum() { + return leafsNum; +} +//get the alignment orders +AlignmentOrder* MSAGuideTree::getAlignOrders() { + return alignOrders; +} +int MSAGuideTree::getAlignOrdersNum() { + return alignOrdersNum; +} +/**************************************************** + create the evolutionary relationship + ****************************************************/ +void MSAGuideTree::connectNodes(TreeNode* 
parent, int parentIdx, + TreeNode* leftChild, float leftDist, TreeNode* rightChild, + float rightDist) { + //save the parents index for each child + leftChild->parent = parent; + leftChild->parentIdx = parentIdx; + rightChild->parent = parent; + rightChild->parentIdx = parentIdx; + + //save the branch lengths (i.e. distance) from each child to its parent + leftChild->dist = leftDist; + rightChild->dist = rightDist; + + //save the indices of itself and its children for this new tree node + parent->idx = parentIdx; + parent->left = leftChild; + parent->leftIdx = leftChild->idx; + parent->right = rightChild; + parent->rightIdx = rightChild->idx; +} +/***************************************** + compute the alignment order of the phylogentic tree + *****************************************/ +void MSAGuideTree::createAlignmentOrders() { + int i; + + AlignmentOrder* order; + //allocate memory space for alignment orders vector + this->alignOrdersNum = 0;//for alignment orders, it starts from 1 instead of 0 + this->alignOrdersSize = numSeqs;//the number of internal nodes of the phylogentic tree + 1 + this->alignOrders = new AlignmentOrder[this->alignOrdersSize]; + if (!this->alignOrders) { + cerr << "OOPS: Alignment orders memory allocation failed" << endl; + exit(-1); + } + //initialize the alignment orders vector + for (i = 0; i < this->alignOrdersSize; i++) { + order = &this->alignOrders[i]; + order->leftOrder = 0; + order->rightOrder = 0; + order->leftLeafs = 0; + order->leftNum = 0; + order->rightLeafs = 0; + order->rightNum = 0; + } + //starting out constructing the alignment orders + int subLeafsNum; + int nodeDepth = 1; + int subOrder = recursiveCreateAlignmentOrders(this->root, 0, subLeafsNum, + nodeDepth); + + //check whether the function works well + if (subLeafsNum != numSeqs || this->alignOrdersNum != subOrder) { + fprintf(stderr, + "The alignment orders constructed were wrong (subLeafsNum %d, alignOrdersNum %d, subOrder %d)\n", + subLeafsNum, alignOrdersNum, 
subOrder); + } + +} +int MSAGuideTree::recursiveCreateAlignmentOrders(TreeNode* subRoot, + int* subLeafs, int& subLeafsNum, int nodeDepth) { + int leftNum, rightNum; + int leftOrder, rightOrder; + int* leftLeafs, *rightLeafs; + + if (subRoot->leaf == LEAF) { + subLeafs[0] = subRoot->idx; + subLeafsNum = 1; + + return 0; //if it is a leaf, return the index 0 + } + leftOrder = rightOrder = 0; + leftNum = rightNum = 0; + leftLeafs = new int[numSeqs]; + rightLeafs = new int[numSeqs]; + + //check the left subtree + if (subRoot->left) { + //recursively tranverse the left subtree + leftOrder = recursiveCreateAlignmentOrders(subRoot->left, leftLeafs, + leftNum, nodeDepth + 1); + } + //check the right subtree + if (subRoot->right) { + rightOrder = recursiveCreateAlignmentOrders(subRoot->right, rightLeafs, + rightNum, nodeDepth + 1); + } + //save the leafs in the left and right subtrees of the current subtree + if (this->alignOrdersNum > this->alignOrdersSize) { + fprintf(stderr, "the alignment order function works bad\n");\ + exit(-1); + } + + AlignmentOrder* order = &this->alignOrders[++this->alignOrdersNum]; + order->nodeDepth = nodeDepth; + order->leftOrder = leftOrder; + order->rightOrder = rightOrder; + order->leftNum = leftNum; + order->rightNum = rightNum; + order->leftLeafs = new int[order->leftNum]; + order->rightLeafs = new int[order->rightNum]; + if (!order->leftLeafs || !order->rightLeafs) { + fprintf(stderr, + "memory allocation failed while recursively constructing alignment orders\n"); + exit(-1); + } + memcpy(order->leftLeafs, leftLeafs, order->leftNum * sizeof(int)); + memcpy(order->rightLeafs, rightLeafs, order->rightNum * sizeof(int)); + + delete[] leftLeafs; + delete[] rightLeafs; + + //for the root of the tree, subLeafs buffer is set to 0 + if (subLeafs) { + //copy the results to the parent tree node + memcpy(subLeafs, order->leftLeafs, order->leftNum * sizeof(int)); + memcpy(subLeafs + order->leftNum, order->rightLeafs, + order->rightNum * 
sizeof(int)); + } + //compute the total number of leafs in this subtree + subLeafsNum = order->leftNum + order->rightNum; + + return this->alignOrdersNum;//return the index of itself, starting from 1, instead of 0 +} +void MSAGuideTree::releaseAlignmentOrders() { + if (!this->alignOrders) { + return; + } + for (int i = 0; i < this->alignOrdersNum; i++) { + AlignmentOrder* order = &this->alignOrders[i]; + if (order->leftLeafs) { + delete[] order->leftLeafs; + } + if (order->rightLeafs) { + delete[] order->rightLeafs; + } + } + delete[] alignOrders; +} +/******************************** + display the alignment orders + ********************************/ +void MSAGuideTree::displayAlignmentOrders() { + int i, j; + AlignmentOrder* order; + fprintf(stderr, "************DISPLAY ALIGNMENT ORDER***************\n"); + for (i = 1; i <= this->alignOrdersNum; i++) { + order = &this->alignOrders[i]; + + fprintf(stderr, "GROUP (%d depth %d):\n---LEFT ORDER: %d\n", i, + order->nodeDepth, order->leftOrder); + fprintf(stderr, "---LEFT: "); + for (j = 0; j < order->leftNum; j++) { + fprintf(stderr, "%d ", order->leftLeafs[j]); + } + + fprintf(stderr, "\n---RIGHT ORDER: %d\n", order->rightOrder); + fprintf(stderr, "\n---RIGHT: "); + for (j = 0; j < order->rightNum; j++) { + fprintf(stderr, "%d ", order->rightLeafs[j]); + } + fprintf(stderr, "\n"); + } + fprintf(stderr, "*******************************************\n"); +} +/********************************* + display the tree + *********************************/ +void MSAGuideTree::displayTree() { + fprintf(stderr, "**************DISPLAY TREE*********************\n"); + for (int i = 0; i < nodesNum; i++) { + TreeNode* node = &nodes[i]; + + fprintf(stderr, + "%d(%p): left(%p) %d, right(%p) %d, parent(%p) %d, dist %f\n", + (node == &nodes[node->idx]) ? node->idx : -2, node, node->left, + (!node->left || node->left == &nodes[node->leftIdx]) ? + node->leftIdx : -2, node->right, + (!node->right || node->right == &nodes[node->rightIdx]) ? 
+ node->rightIdx : -2, node->parent, + (!node->parent || node->parent == &nodes[node->parentIdx]) ? + node->parentIdx : -2, node->dist); + } + fprintf(stderr, "*******************************************\n"); +} +/********************************* + compute the sequence weights + *********************************/ +void MSAGuideTree::getSeqsWeights() { + int i; + TreeNode* curr; + + //compute the order of each node, which represents the number of leaf nodes in the substree rooting from it. + for (i = 0; i < leafsNum; i++) { + //for each leaf nodes + curr = &this->leafs[i]; + while (curr != 0) { + curr->order++; + + curr = curr->parent; + } + } + //compute the weight of each sequence, which corresponds to a leaf node + for (i = 0; i < numSeqs; i++) { + //compute the weight of each sequence + float weights = 0; + curr = &this->leafs[i]; + while (curr->parent != 0) { + weights += curr->dist / curr->order; + curr = curr->parent; + //printf("order:%d weights: %f\n", curr->order, weights); + } + //save the weight of this sequence + seqsWeights[i] = (int) (100 * weights); + //printf("%d\n", seqsWeights[i]); + } + //normalize the weights + int wsum = 0; + for (i = 0; i < numSeqs; i++) { + wsum += seqsWeights[i]; + } + if (wsum == 0) { + //in this case, every sequence is assumed to have an identical weight + for (i = 0; i < numSeqs; i++) { + seqsWeights[i] = 1; + } + wsum = numSeqs; + } + //printf("wsum:%d \n", wsum); + for (i = 0; i < numSeqs; i++) { + seqsWeights[i] = (seqsWeights[i] * INT_MULTIPLY) / wsum; + if (seqsWeights[i] < 1) { + seqsWeights[i] = 1; + } + //printf("%d \n", seqsWeights[i]); + } +} +void MSAGuideTree::create() { + //do nothing +} + diff --git a/binaries/src/GLProbs-1.0/MSAGuideTree.h b/binaries/src/GLProbs-1.0/MSAGuideTree.h new file mode 100644 index 0000000..97d538a --- /dev/null +++ b/binaries/src/GLProbs-1.0/MSAGuideTree.h @@ -0,0 +1,119 @@ +/*********************************************** + * # Copyright 2009-2010. 
Liu Yongchao + * # Contact: Liu Yongchao, School of Computer Engineering, + * # Nanyang Technological University. + * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com + * # + * # GPL version 3.0 applies. + * # + * ************************************************/ + +#ifndef _MSA_GUIDE_TREE_H +#define _MSA_GUIDE_TREE_H +#include "MSADef.h" +#include "MSA.h" + +#include "SafeVector.h" +#include "MultiSequence.h" +#include "ScoreType.h" +#include "ProbabilisticModel.h" +#include "SparseMatrix.h" + +class MSA; +struct ValidNode { + ValidNode* prev; + ValidNode* next; + int n; //the index in the distance matrix + int node; //the index in the tree node entries +}; + +struct TreeNode { + struct TreeNode *left; //the pointer to its left child + struct TreeNode *right; //the pointer to its right child + struct TreeNode *parent; //the pointer to its parent + int leftIdx; //the index of the left child + int rightIdx; //the index of the right child + int parentIdx; //the index of its parent + int idx; //the index of itself + float dist; //the distance to its parent + int leaf; //LEAF for a leaf node, NODE otherwise + int order; //the number of leaf nodes in the subtree rooted at this node (filled in by getSeqsWeights) + int depth; //the depth of the node +}; +struct AlignmentOrder { + int nodeDepth; //the depth of the internal node + int leftOrder; //the order number of the left child + int rightOrder; //the order number of the right child + int* leftLeafs; //the indices of leafs in the left subtree + int leftNum; //the number of leafs in the left subtree + int* rightLeafs; //the indices of leafs in the right subtree + int rightNum; //the number of leafs in the right subtree +}; + +class MSAGuideTree { +public: + MSAGuideTree(MSA* msa, VVF& distMatrix, int numSeqs); + virtual ~MSAGuideTree() = 0; //abstract class + + //get the tree nodes + TreeNode* getNodes(); + //get the leaf nodes + TreeNode* getLeafs(); + //get the number of nodes; + int getNodesNum(); + //get the number of leaf nodes + int getLeafsNum(); + 
//get the root of the tree + TreeNode* getRoot() { + return this->root; + } + //get the alignment orders + AlignmentOrder* getAlignOrders(); + int getAlignOrdersNum(); + //construct the alignment orders + void createAlignmentOrders(); + + //construct the guide tree + virtual void create(); + //calculate the sequence weights + virtual void getSeqsWeights(); + + /**********DEBUGGING****************/ + //display the tree + void displayTree(); + //display the alignment orders + void displayAlignmentOrders(); + +protected: + //join two nodes + void connectNodes(TreeNode* parent, int parentIdx, TreeNode* leftChild, + float leftDist, TreeNode* rightChild, float rightDist); + //release the alignment orders vector + void releaseAlignmentOrders(); + //recursive implementation of constructing the alignment orders + int recursiveCreateAlignmentOrders(TreeNode* subRoot, int* subLeafs, + int& subLeafsNum, int nodeDepth); + + //system configurations + MSA* msa; + VVF* distMatrix; + int numSeqs; + int* seqsWeights; + + //all the tree nodes + TreeNode* nodes; + int nodesNum; + int nodesSize; + //the root tree node + TreeNode* root; + //leaf node + TreeNode* leafs; + int leafsNum; + + //alignment orders (entries are filled from index 1; entry 0 stays empty) + AlignmentOrder* alignOrders; + int alignOrdersNum; + int alignOrdersSize; +}; +#endif + diff --git a/binaries/src/GLProbs-1.0/MSAPartProbs.cpp b/binaries/src/GLProbs-1.0/MSAPartProbs.cpp new file mode 100644 index 0000000..b234588 --- /dev/null +++ b/binaries/src/GLProbs-1.0/MSAPartProbs.cpp @@ -0,0 +1,1023 @@ +/*********************************************** + * # Copyright 2009-2010. Liu Yongchao + * # Contact: Liu Yongchao, School of Computer Engineering, + * # Nanyang Technological University. + * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com + * # + * # GPL version 3.0 applies. 
+ * # + * ************************************************/ +#include "SafeVector.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include "MultiSequence.h" +#include "ScoreType.h" + +#define TRACE 0 // 0: NOTRACE 1: TRACE +//proba like settings +#define endgaps 1 // 1: engap penaties enabled 0: disabled +#define PART_FULL_MEMORY 0 //0: LOW MEM OPTION +#define REVPART_FULL_MEMORY 0 //0: LOW MEM OPTION +using namespace std; + +#ifdef _WIN32 +#define OS_HUGE_VALL HUGE_VAL +#else +#define OS_HUGE_VALL HUGE_VALL +#endif + +typedef struct { + char input[30]; + int matrix; + int N; + float T; + float beta; + char opt; //can be 'P' or 'M' + float gapopen; + float gapext; +} argument_decl; + +typedef struct sequence { + char *title; + char *text; + int length; +} fasta; + +typedef struct alignment { + char *title; + char *text; + int length; +} align; + +//////////////////////////////////////////////////////// +//externs related to scoring matrix and input arguments +/////////////////////////////////////////////////////////// +extern float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2; +extern char aminos[26], matrixtype[20], bases[26]; + +extern double sub_matrix[26][26]; +extern double normalized_matrix[26][26]; // add by YE Yongtao +extern int subst_index[26]; + +extern float TEMPERATURE; +extern int MATRIXTYPE; + +extern float GAPOPEN; +extern float GAPEXT; +extern argument_decl argument; + +////////////////////////////////////////////////////////////////////////////// +//calculates reverse partition function values based on z matrices +//and also simulaneously calculates the propability of each basepair +//or aminoacid residue pair i,j +////////////////////////////////////////////////////////////////////////////// + +VF *revers_partf(fasta sequences[2], const double termgapopen, + const double termgapextend, long double **Zfm, const double d, + const double e) { + // printf("revpart\n"); + //rest of the declarations + int i, j; + 
long double **Zm = NULL; + long double **Ze = NULL; + long double **Zf = NULL; + int len0, len1; + float probability; + long double tempvar; + int Si, Tj; + double endgapopen, endgapextend; + FILE *fo; + + //Init lengths of sequences + len0 = strlen(sequences[0].text); + len1 = strlen(sequences[1].text); + + //Safe vector declared + VF *posteriorPtr = new VF((len0 + 1) * (len1 + 1)); + VF & posterior = *posteriorPtr; + VF::iterator ptr = posterior.begin(); + + if (TRACE) //open the trace file + fo = fopen("revpartdump", "a"); + + //default: + endgapopen = termgapopen; + endgapextend = termgapextend; + + //instantiate the z matrix + if (REVPART_FULL_MEMORY) { + + Ze = new long double *[sequences[1].length + 1]; + Zf = new long double *[sequences[1].length + 1]; + Zm = new long double *[sequences[1].length + 1]; + + if (TRACE) + printf("\n\n %e %e\n", d, e); + + //DYNAMICALLY GROW 2D Zm Zf Ze MARICES (long double) + for (i = 0; i <= sequences[1].length; i++) { + Ze[i] = new long double[sequences[0].length + 1]; + Zf[i] = new long double[sequences[0].length + 1]; + Zm[i] = new long double[sequences[0].length + 1]; + } + } else { + Zm = new long double *[2]; + Ze = new long double *[2]; + Zf = new long double *[2]; + for (i = 0; i <= 1; i++) { + Zm[i] = new long double[sequences[0].length + 1]; + Ze[i] = new long double[sequences[0].length + 1]; + Zf[i] = new long double[sequences[0].length + 1]; + } + + } + + if (TRACE) { + printf("in rev partf---"); + printf("\n\n"); + } + + if (REVPART_FULL_MEMORY) { + for (i = 0; i <= len1; i++) + for (j = 0; j <= len0; j++) { + Zm[i][j] = 0.0; + Zf[i][j] = 0.0; + Ze[i][j] = 0.0; + } + } else { + + for (j = 0; j <= len0; j++) { + Zm[0][j] = 0; + Zf[0][j] = 0; + Ze[0][j] = 0; + Zf[1][j] = 0; + Ze[1][j] = 0; + Zm[1][j] = 0; + } + } + + //fill the probability matrix with 0s + for (i = 0; i <= len1; i++) + for (j = 0; j <= len0; j++) + ptr[j * (len1 + 1) + i] = 0; + + if (endgaps == 0) { + Zm[len1][len0] = 1; + Ze[len1][len0] = 
Zf[len1][len0] = 0; + Zf[len1 - 1][len0] = Zm[len1][len0] * d; + Ze[len1][len0 - 1] = Zm[len1][len0] * d; + + //>=2ND ROW INIT + if (REVPART_FULL_MEMORY) { + for (i = len1 - 2; i >= 0; i--) { + Zf[i][len0] = Zf[i + 1][len0] * e; + } + } + + //>=2ND COL INIT + if (REVPART_FULL_MEMORY) { + for (j = len0 - 2; j >= 0; j--) { + Ze[len1][j] = Ze[len1][j + 1] * e; + } + } else { + for (j = len0 - 2; j >= 0; j--) { + Ze[0][j] = Ze[0][j + 1] * e; + } + } + } else { + + if (REVPART_FULL_MEMORY) { + + Zm[len1][len0] = 1; + Ze[len1][len0] = Zf[len1][len0] = 0; + Zf[len1 - 1][len0] = Zm[len1][len0] * endgapopen; + Ze[len1][len0 - 1] = Zm[len1][len0] * endgapopen; + + //>=2ND ROW INIT + for (i = len1 - 2; i >= 0; i--) { + Zf[i][len0] = Zf[i + 1][len0] * endgapextend; + } + + //M Iy= d+j*e + + //>=2ND COL INIT + for (j = len0 - 2; j >= 0; j--) { + Ze[len1][j] = Ze[len1][j + 1] * endgapextend; + } + + } else { + //in Zm + //let: + // Zm(0) be the current row being filled/computed + // Zm(1) be the previous row + + Zm[1][len0] = 1; + Ze[0][len0] = Zf[0][len0] = 0; + Zf[1][len0] = Zm[1][len0] * endgapopen; + Ze[0][len0 - 1] = Zm[1][len0] * endgapopen; + + //>=2ND COL INIT + for (j = len0 - 2; j >= 0; j--) { + Ze[0][j] = Ze[0][j + 1] * endgapextend; + } + + } //END ELSE + + } //END FULL MEMORY and GAP enablement IF STATEMENT + + double scorez, zz = 0; + + for (i = len1 - 1; i >= 0; i--) { + + for (j = len0 - 1; j >= 0; j--) { + Si = subst_index[sequences[1].text[i] - 'A']; + Tj = subst_index[sequences[0].text[j] - 'A']; + scorez = sub_matrix[Si][Tj]; + + //endgaps modification aug 10 + double open0, extend0, open1, extend1; + + open0 = open1 = d; + extend0 = extend1 = e; + + if (endgaps == 1) { + + //check to see if one of the 2 sequences or both reach the end + + if (i == 0) { + open0 = endgapopen; + extend0 = endgapextend; + + } + + if (j == 0) { + open1 = endgapopen; + extend1 = endgapextend; + } + + } + + if (REVPART_FULL_MEMORY) { + //z computation + + Ze[i][j] = Zm[i][j + 1] * 
open0 + Ze[i][j + 1] * extend0; + Zf[i][j] = Zm[i + 1][j] * open1 + Zf[i + 1][j] * extend1; + Zm[i][j] = (Zm[i + 1][j + 1] + Zf[i + 1][j + 1] + + Ze[i + 1][j + 1]) * scorez; + zz = Zm[i][j] + Zf[i][j] + Ze[i][j]; + + } else { + + //2 ROW zE zF ALGORITHM GOES...: + //Ze[1][j] =Zm[i][j + 1] * exp(beta * open0) + Ze[1][j + 1] *exp(beta * extend0); + //Zf[1][j] = Zm[i + 1][j] * exp(beta * open1) + Zf[0][j] * exp(beta * extend1); + //Zm[i][j] = (Zm[i + 1][j + 1] + Zf[0][j + 1] + Ze[0][j + 1]) * exp(beta * scorez); + //zz = Zm[0][j] + Zf[1][j] + Ze[1][j]; + + //lowmem code for merging probability calculating module + //Here we make use of Zm as a 2 row matrix + + Zf[1][j] = Zm[1][j] * open1 + Zf[0][j] * extend1; + Ze[1][j] = Zm[0][j + 1] * open0 + Ze[1][j + 1] * extend0; + Zm[0][j] = (Zm[1][j + 1] + Zf[0][j + 1] + Ze[0][j + 1]) + * scorez; + + tempvar = Zfm[i + 1][j + 1] * Zm[0][j]; + //divide P(i,j) i.e. pairwise probability by denominator + tempvar /= (scorez * Zfm[0][0]); + probability = (float) tempvar; + + //store only noticable probabilities + //if (probability <= 1 && probability >= 0.001) { + //algorithm goes... 
+ //validprob[i + 1][j + 1] = probability; + ptr[(j + 1) * (len1 + 1) + (i + 1)] = probability; + //} + //lowmem code ends here + + } + + } //end of for + + if (REVPART_FULL_MEMORY == 0) { + for (int t = 0; t <= sequences[0].length; t++) { + Ze[0][t] = Ze[1][t]; + Ze[1][t] = 0; + + Zf[0][t] = Zf[1][t]; + Zf[1][t] = 0; + + Zm[1][t] = Zm[0][t]; + Zm[0][t] = 0; + + } + Zf[0][len0] = 1; + + } + + } //end of for + + if (TRACE) { + printf("\n\nrM:....\n\n"); + if (REVPART_FULL_MEMORY) { + for (i = 0; i <= len1; i++) { + for (j = 0; j <= len0; j++) + printf("%.2Le ", Zm[i][j]); + printf("\n"); + } + + printf("\n\nrE:....\n\n"); + for (i = 0; i <= len1; i++) { + for (j = 0; j <= len0; j++) + printf("%.2Le ", Ze[i][j]); + printf("\n"); + + } + + printf("\n\nrF:....\n\n"); + for (i = 0; i <= len1; i++) { + for (j = 0; j <= len0; j++) + printf("%.2Le ", Zf[i][j]); + printf("\n"); + + } + + } + + } + + if (TRACE) { + fprintf(fo, "\n"); + fclose(fo); + } + + //delete unused memory + + if (REVPART_FULL_MEMORY) { + for (i = 0; i <= len1; i++) { + delete (Zm[i]); + delete (Zf[i]); + delete (Ze[i]); + } + } else { + delete (Zf[0]); + delete (Ze[0]); + delete (Zm[0]); + + delete (Zm[1]); + delete (Zf[1]); + delete (Ze[1]); + } + + for (i = 0; i <= len1; i++) { + delete (Zfm[i]); + } + + if (Zf != NULL) + delete (Zf); + + if (Ze != NULL) + delete (Ze); + + if (Zm != NULL) + delete (Zm); + + if (Zfm != NULL) + delete (Zfm); + + posterior[0] = 0; + return (posteriorPtr); + +} + +////////////////////////////////////////////////////////////// +//forward partition function +///////////////////////////////////////////////////////////// + +long double **partf(fasta sequences[2], const double termgapopen, + const double termgapextend, const double d, const double e) { + //printf("partf\n"); + int i, j, len1, len0; + long double **Zm = NULL, **Zf = NULL, **Ze = NULL, zz = 0; + double endgapopen, endgapextend; + + //default: + endgapopen = termgapopen; + endgapextend = termgapextend; + + //the 
flag endgaps is set at the #define section + if (PART_FULL_MEMORY) { + + Zf = new long double *[sequences[1].length + 1]; + Ze = new long double *[sequences[1].length + 1]; + Zm = new long double *[sequences[1].length + 1]; + + //comment + if (TRACE) + printf("\nPARTF:====\n"); + + //DYNAMICALLY GROW 2D M,IX,IY,PIX,PIY MARICES + for (i = 0; i <= sequences[1].length; i++) { + Zf[i] = new long double[sequences[0].length + 1]; + Ze[i] = new long double[sequences[0].length + 1]; + Zm[i] = new long double[sequences[0].length + 1]; + } + } else { + Zm = new long double *[sequences[1].length + 1]; + Ze = new long double *[2]; + Zf = new long double *[2]; + for (i = 0; i <= sequences[1].length; i++) { + Zm[i] = new long double[sequences[0].length + 1]; + } + Ze[0] = new long double[sequences[0].length + 1]; + Zf[0] = new long double[sequences[0].length + 1]; + Ze[1] = new long double[sequences[0].length + 1]; + Zf[1] = new long double[sequences[0].length + 1]; + } + + len0 = strlen(sequences[0].text); + len1 = strlen(sequences[1].text); + + if (PART_FULL_MEMORY) { + for (i = 0; i <= sequences[1].length; i++) + for (j = 0; j <= sequences[0].length; j++) { + Zm[i][j] = 0.00; + Zf[i][j] = 0.00; + Ze[i][j] = 0.00; + } + } else { + for (i = 0; i <= len1; i++) { + for (j = 0; j <= len0; j++) { + Zm[i][j] = 0; + } + } + for (j = 0; j <= len0; j++) { + Zf[0][j] = 0; + Ze[0][j] = 0; + Zf[1][j] = 0; + Ze[1][j] = 0; + } + } + + //INTITIALIZE THE DP + + if (endgaps == 0) { + Zm[0][0] = 1.00; + + Zf[0][0] = Ze[0][0] = 0; + Zf[1][0] = Zm[0][0] * d; + Ze[0][1] = Zm[0][0] * d; + + //>=2ND ROW INIT + if (PART_FULL_MEMORY) { + for (i = 2; i <= sequences[1].length; i++) { + Zf[i][0] = Zf[i - 1][0] * e; + } + } + + //>=2ND COL INIT + for (j = 2; j <= sequences[0].length; j++) { + Ze[0][j] = Ze[0][j - 1] * e; + } + } else { + //init z + Zm[0][0] = 1.00; + Zf[0][0] = Ze[0][0] = 0; + Zf[1][0] = Zm[0][0] * endgapopen; + Ze[0][1] = Zm[0][0] * endgapopen; + + //>=2ND ROW INIT + if 
(PART_FULL_MEMORY) { + for (i = 2; i <= sequences[1].length; i++) { + Zf[i][0] = Zf[i - 1][0] * endgapextend; + } + } + + //>=2ND COL INIT + for (j = 2; j <= sequences[0].length; j++) { + Ze[0][j] = Ze[0][j - 1] * endgapextend; + } + } + + //1ST ROW/COL INIT + + int Si, Tj; + double score; + + for (i = 1; i <= sequences[1].length; i++) { + + for (j = 1; j <= sequences[0].length; j++) { + + Si = subst_index[sequences[1].text[i - 1] - 'A']; + Tj = subst_index[sequences[0].text[j - 1] - 'A']; + + score = sub_matrix[Si][Tj]; + + double open0, extend0, open1, extend1; + + open0 = open1 = d; + extend0 = extend1 = e; + + if (endgaps == 1) { + //check to see if one of the 2 sequences or both reach the end + + if (i == sequences[1].length) { + open0 = endgapopen; + extend0 = endgapextend; + + } + + if (j == sequences[0].length) { + open1 = endgapopen; + extend1 = endgapextend; + } + } + + // + //z computation using open and extend temp vars + //open0 is gap open in seq0 and open1 is gap open in seq1 + //entend0 is gap extend in seq0 and extend1 is gap extend in seq1 + + if (PART_FULL_MEMORY) { + Ze[i][j] = Zm[i][j - 1] * open0 + Ze[i][j - 1] * extend0; + + if (Ze[i][j] >= OS_HUGE_VALL) { + printf("ERROR: huge val error for Ze\n"); + exit(1); + } + + Zf[i][j] = Zm[i - 1][j] * open1 + Zf[i - 1][j] * extend1; + + if (Zf[i][j] >= OS_HUGE_VALL) { + printf("ERROR: huge val error for Zf\n"); + exit(1); + } + + Zm[i][j] = (Zm[i - 1][j - 1] + Ze[i - 1][j - 1] + + Zf[i - 1][j - 1]) * score; + + if (Zm[i][j] >= OS_HUGE_VALL) { + printf("ERROR: huge val error for Zm\n"); + exit(1); + } + + zz = Zm[i][j] + Ze[i][j] + Zf[i][j]; + } else { + Ze[1][j] = Zm[i][j - 1] * open0 + Ze[1][j - 1] * extend0; + + if (Ze[1][j] >= OS_HUGE_VALL) { + printf("ERROR: huge val error for zE\n"); + exit(1); + } + + Zf[1][j] = Zm[i - 1][j] * open1 + Zf[0][j] * extend1; + + if (Zf[1][j] >= OS_HUGE_VALL) { + printf("ERROR: huge val error for zF\n"); + exit(1); + } + + Zm[i][j] = (Zm[i - 1][j - 1] + Ze[0][j - 1] 
+ Zf[0][j - 1]) + * score; + + if (Zm[i][j] >= OS_HUGE_VALL) { + printf("ERROR: huge val error for zM\n"); + exit(1); + } + + zz = Zm[i][j] + Ze[1][j] + Zf[1][j]; + } + + } //end for + + if (!PART_FULL_MEMORY) { + for (int t = 0; t <= sequences[0].length; t++) { + Ze[0][t] = Ze[1][t]; + Ze[1][t] = 0; + + Zf[0][t] = Zf[1][t]; + Zf[1][t] = 0; + } + + Zf[1][0] = 1; + + } + + } //end for + + //store the sum of zm zf ze (m,n)s in zm's 0,0 th position + Zm[0][0] = zz; + + if (TRACE) { + //debug code aug 3 + //print the 3 Z matrices namely Zm Zf and Ze + + printf("\n\nFINAL Zm:\n"); + for (i = 0; i <= sequences[1].length; i++) { + for (j = 0; j <= sequences[0].length; j++) + printf("%.2Le ", Zm[i][j]); + printf("\n"); + } + + printf("FINAL Zf \n"); + for (i = 0; i <= sequences[1].length; i++) { + for (j = 0; j <= sequences[0].length; j++) + printf("%.2Le ", Zf[i][j]); + printf("\n"); + } + + printf("FINAL Ze \n"); + for (i = 0; i <= sequences[1].length; i++) { + for (j = 0; j <= sequences[0].length; j++) + printf("%.2Le ", Ze[i][j]); + printf("\n"); + } + + //end debug dump code + + } + + if (PART_FULL_MEMORY) { + for (i = 0; i <= sequences[1].length; i++) { + delete (Zf[i]); + delete (Ze[i]); + } + } else { + delete (Zf[0]); + delete (Ze[0]); + delete (Zf[1]); + delete (Ze[1]); + } + + delete (Zf); + delete (Ze); + + return Zm; + +} //end of forward partition function + +///////////////////////////////////////////////////////////////////////////////////////// +//entry point (was the main function) , returns the posterior probability safe vector +//////////////////////////////////////////////////////////////////////////////////////// +VF *ComputePostProbs(int a, int b, string seq1, string seq2) { + //printf("probamod\n"); + double gap_open = -22, gap_ext = -1, beta = 0.2;//T = 5, beta = 1/T = 0.2, by default + int stock_loop = 1; + int le = 160; + double termgapopen = 1.0f; //exp(0) + double termgapextend = 1.0f; //exp(0) + + //initialize the sequence structure + fasta 
sequences[2]; + + sequences[0].length = strlen((char *) seq1.c_str()); + sequences[0].text = (char *) seq1.c_str(); + sequences[0].title = new char[10]; + strcpy(sequences[0].title, "seq0"); + sequences[1].length = strlen((char *) seq2.c_str()); + sequences[1].text = (char *) seq2.c_str(); + sequences[1].title = new char[10]; + strcpy(sequences[1].title, "seq1"); + + if (TRACE) + + { + printf("%d %d %s\n%d %d %s\n--\n", a, sequences[0].length, + sequences[0].text, b, sequences[1].length, sequences[1].text); + printf("after init\n"); + + FILE *dump1 = fopen("dump1", "a"); + fprintf(dump1, "%d %d %s\n%d %d %s\n--\n", a, sequences[0].length, + sequences[0].text, b, sequences[1].length, sequences[1].text); + fclose(dump1); + } + + gap_open = argument.gapopen; + gap_ext = argument.gapext; + beta = argument.beta; + + stock_loop = argument.N; + le = argument.matrix; + + //compute the values of exp(beta * ?) + termgapopen = exp(beta * 0.0); + termgapextend = exp(beta * 0.0); + gap_open = exp(beta * gap_open); + gap_ext = exp(beta * gap_ext); + + if (TRACE) + printf("%f %f %f %d\n", gap_open, gap_ext, beta, le); + + //call for calculating the posterior probabilities + // 1. call partition function partf + // 2. calculate revpartition using revers_parf + // 3. calculate probabilities + /// MODIFICATION... 
POPULATE SAFE VECTOR + + long double **MAT1; + + MAT1 = partf(sequences, termgapopen, termgapextend, gap_open, gap_ext); + + return revers_partf(sequences, termgapopen, termgapextend, MAT1, gap_open, + gap_ext); + +} + +////////////////////////////////////////////////////////////// +//Compute Viterbi Alignment +// Added by YE Yongtao +///////////////////////////////////////////////////////////// + +pair *, float> partViterbi(string seq1, string seq2) { + + + double gap_open = -12, gap_ext = -1, beta = 0.2;//T = 5, beta = 1/T = 0.2, by default + int stock_loop = 1; + int le = 160; + //double termgapopen = 1.0f; //exp(0) + //double termgapextend = 1.0f; //exp(0) + + //initialize the sequence structure + fasta sequences[2]; + sequences[0].length = strlen((char *) seq1.c_str()); + sequences[0].text = (char *) seq1.c_str(); + sequences[0].title = new char[10]; + strcpy(sequences[0].title, "seq0"); + sequences[1].length = strlen((char *) seq2.c_str()); + sequences[1].text = (char *) seq2.c_str(); + sequences[1].title = new char[10]; + strcpy(sequences[1].title, "seq1"); + + gap_open = argument.gapopen; + gap_ext = argument.gapext; + beta = argument.beta; + + stock_loop = argument.N; + le = argument.matrix; + + //compute the values of exp(beta * ?) 
+ double endgapopen = exp(beta * 0.0); + double endgapextend = exp(beta * 0.0); + double d = exp(beta * gap_open); + double e = exp(beta * gap_ext); + + int i, j, len1, len0; + long double **Zm = NULL, **Zf = NULL, **Ze = NULL; + int **traceZm = NULL, **traceZf = NULL, **traceZe = NULL; + + //the flag endgaps is set at the #define section + Zf = new long double *[sequences[1].length + 1]; + Ze = new long double *[sequences[1].length + 1]; + Zm = new long double *[sequences[1].length + 1]; + + traceZf = new int *[sequences[1].length + 1]; + traceZe = new int *[sequences[1].length + 1]; + traceZm = new int *[sequences[1].length + 1]; + + //DYNAMICALLY GROW 2D M,IX,IY,PIX,PIY MARICES + for (i = 0; i <= sequences[1].length; i++) { + Zf[i] = new long double[sequences[0].length + 1]; + Ze[i] = new long double[sequences[0].length + 1]; + Zm[i] = new long double[sequences[0].length + 1]; + + traceZf[i] = new int[sequences[0].length + 1]; + traceZe[i] = new int[sequences[0].length + 1]; + traceZm[i] = new int[sequences[0].length + 1]; + } + + len0 = strlen(sequences[0].text); + len1 = strlen(sequences[1].text); + + + for (i = 0; i <= sequences[1].length; i++) + for (j = 0; j <= sequences[0].length; j++) { + Zm[i][j] = 0.00; + Zf[i][j] = 0.00; + Ze[i][j] = 0.00; + + traceZm[i][j] = -1; + traceZf[i][j] = -1; + traceZe[i][j] = -1; + } + + + //INTITIALIZE THE DP + if (endgaps == 0) { + Zm[0][0] = 1.00; + + Zf[0][0] = Ze[0][0] = 0; + Zf[1][0] = Zm[0][0] * d; + Ze[0][1] = Zm[0][0] * d; + + //>=2ND ROW INIT + + for (i = 2; i <= sequences[1].length; i++) { + Zf[i][0] = Zf[i - 1][0] * e; + traceZf[i][0] = 2; + } + + + //>=2ND COL INIT + for (j = 2; j <= sequences[0].length; j++) { + Ze[0][j] = Ze[0][j - 1] * e; + traceZe[0][j] = 1; + } + } else { + //init z + Zm[0][0] = 1.00; + Zf[0][0] = Ze[0][0] = 0; + Zf[1][0] = Zm[0][0] * endgapopen; + Ze[0][1] = Zm[0][0] * endgapopen; + + //>=2ND ROW INIT + + for (i = 2; i <= sequences[1].length; i++) { + Zf[i][0] = Zf[i - 1][0] * endgapextend; 
+ traceZf[i][0] = 2; + } + //>=2ND COL INIT + for (j = 2; j <= sequences[0].length; j++) { + Ze[0][j] = Ze[0][j - 1] * endgapextend; + traceZe[0][j] = 1; + } + } + + //1ST ROW/COL INIT + + int Si, Tj; + double score; + + for (i = 1; i <= sequences[1].length; i++) { + + for (j = 1; j <= sequences[0].length; j++) { + + Si = subst_index[sequences[1].text[i - 1] - 'A']; + Tj = subst_index[sequences[0].text[j - 1] - 'A']; + + score = sub_matrix[Si][Tj]; + + double open0, extend0, open1, extend1; + + open0 = open1 = d; + extend0 = extend1 = e; + + if (endgaps == 1) { + //check to see if one of the 2 sequences or both reach the end + + if (i == sequences[1].length) { + open0 = endgapopen; + extend0 = endgapextend; + + } + + if (j == sequences[0].length) { + open1 = endgapopen; + extend1 = endgapextend; + } + } + + // + //z computation using open and extend temp vars + //open0 is gap open in seq0 and open1 is gap open in seq1 + //entend0 is gap extend in seq0 and extend1 is gap extend in seq1 + Zf[i][j] = Zf[i - 1][j] * extend1; + traceZf[i][j] = 2; + + if(Zm[i - 1][j] * open1 > Zf[i][j]){ + Zf[i][j] = Zm[i - 1][j] * open1; + traceZf[i][j] = 0; + } + if (Zf[i][j] >= OS_HUGE_VALL) { + printf("ERROR: huge val error for Zf\n"); + exit(1); + } + Ze[i][j] = Ze[i][j - 1] * extend0; + traceZe[i][j] = 1; + if(Zm[i][j - 1] * open0 > Ze[i][j]){ + Ze[i][j] = Zm[i][j - 1] * open0; + traceZe[i][j] = 0; + } + + if (Ze[i][j] >= OS_HUGE_VALL) { + printf("ERROR: huge val error for Ze\n"); + exit(1); + } + + Zm[i][j] = Zm[i - 1][j - 1] * score; + traceZm[i][j] = 0; + if(Zf[i - 1][j - 1] * score > Zm[i][j]){ + Zm[i][j] = Zf[i - 1][j - 1] * score; + traceZm[i][j] = 2; + } + if(Ze[i - 1][j - 1] * score > Zm[i][j]){ + Zm[i][j] = Ze[i - 1][j - 1] * score; + traceZm[i][j] = 1; + } + if (Zm[i][j] >= OS_HUGE_VALL) { + printf("ERROR: huge val error for Zm\n"); + exit(1); + } + + }//end for + }//end for + // figure out best terminating cell + + float bestProb = 
Zm[sequences[1].length][sequences[0].length]; + int state = 0; + if( bestProb < Zf[sequences[1].length][sequences[0].length]){ + bestProb = Zf[sequences[1].length][sequences[0].length]; + state = 2; + } + if( bestProb < Ze[sequences[1].length][sequences[0].length]){ + bestProb = Ze[sequences[1].length][sequences[0].length]; + state = 1; + } + assert (state != -1); + + // compute traceback + SafeVector *alignment = new SafeVector; assert (alignment); + int c = sequences[1].length, r = sequences[0].length; + while (r != 0 || c != 0){ + int newState; + if(state == 0){ + newState = traceZm[c][r]; + c--; r--; alignment->push_back ('B'); + } + else if(state == 1){ + newState = traceZe[c][r]; + r--; alignment->push_back ('X'); + } + else{ + newState = traceZf[c][r]; + c--; alignment->push_back ('Y'); + } + state = newState; + } + + reverse (alignment->begin(), alignment->end()); + + for (i = 0; i <= sequences[1].length; i++) { + delete (Zf[i]); + delete (Ze[i]); + delete (Zm[i]); + delete (traceZf[i]); + delete (traceZe[i]); + delete (traceZm[i]); + } + + delete (Zf); + delete (Ze); + delete (Zm); + delete (traceZf); + delete (traceZe); + delete (traceZm); + + return make_pair(alignment, bestProb); +} + +////////////////////////////////////////////////////////////// +// Compute two sequences' similarity defined as the normalized alignment score without gap penalties +// Added by YE Yongtao +///////////////////////////////////////////////////////////// + +float computeSimilarity(string seq1, string seq2, SafeVector * alignment) { + + //initialize the sequence structure + fasta sequences[2]; + sequences[0].length = strlen((char *) seq1.c_str()); + sequences[0].text = (char *) seq1.c_str(); + sequences[0].title = new char[10]; + strcpy(sequences[0].title, "seq0"); + sequences[1].length = strlen((char *) seq2.c_str()); + sequences[1].text = (char *) seq2.c_str(); + sequences[1].title = new char[10]; + strcpy(sequences[1].title, "seq1"); + + float bestProb = 0; + int Si, Tj; + 
double score; + int i = 1;int j = 1; + for (SafeVector::iterator iter = alignment->begin(); + iter != alignment->end(); ++iter){ + if (*iter == 'B'){ + Si = subst_index[sequences[1].text[j - 1] - 'A']; + Tj = subst_index[sequences[0].text[i - 1] - 'A']; + score = normalized_matrix[Si][Tj]; + bestProb += score; + i++; j++; + } + else if(*iter == 'X') i++; + else if(*iter == 'Y') j++; + } + if(i!= sequences[0].length + 1 || j!= sequences[1].length + 1 ) cerr << "similarity error"<< endl; + bestProb /= alignment->size(); + //bestProb /= min(sequences[0].length, sequences[1].length); + return bestProb; +} +//end of posterior probability module diff --git a/binaries/src/GLProbs-1.0/MSAProbs.vcproj b/binaries/src/GLProbs-1.0/MSAProbs.vcproj new file mode 100644 index 0000000..5212610 --- /dev/null +++ b/binaries/src/GLProbs-1.0/MSAProbs.vcproj @@ -0,0 +1,272 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/binaries/src/GLProbs-1.0/MSAProbs.vcproj.STUDENT.LIUY0039.user b/binaries/src/GLProbs-1.0/MSAProbs.vcproj.STUDENT.LIUY0039.user new file mode 100644 index 0000000..99168da --- /dev/null +++ b/binaries/src/GLProbs-1.0/MSAProbs.vcproj.STUDENT.LIUY0039.user @@ -0,0 +1,65 @@ + + + + + + + + + + + diff --git a/binaries/src/GLProbs-1.0/MSAReadMatrix.cpp b/binaries/src/GLProbs-1.0/MSAReadMatrix.cpp new file mode 100644 index 0000000..e978eb4 --- /dev/null +++ b/binaries/src/GLProbs-1.0/MSAReadMatrix.cpp @@ -0,0 +1,215 @@ +/*********************************************** + * # Copyright 2009-2010. Liu Yongchao + * # Contact: Liu Yongchao, School of Computer Engineering, + * # Nanyang Technological University. + * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com + * # + * # GPL version 3.0 applies. 
+ * # + * ************************************************/ + +#include +#include +#include +#include +#include "MSAReadMatrix.h" + +#define TRACE 0 + +//////////////////////////////////////////////////////////// +// extern variables for scoring matrix data +//////////////////////////////////////////////////////////// +extern float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2; +extern char *aminos, *bases, matrixtype[20]; +extern int subst_index[26]; + +extern double sub_matrix[26][26]; +extern double normalized_matrix[26][26]; + +extern float TEMPERATURE; +extern int MATRIXTYPE; + +extern float GAPOPEN; +extern float GAPEXT; + +typedef struct { + char input[30]; + int matrix; + int N; + float T; + float beta; + char opt; //can be 'P' or 'M' + float gapopen; + float gapext; +} argument_decl; + +//argument support +extern argument_decl argument; + +///////////////////////////////////////////////////////// +//sets substitution matrix type +//////////////////////////////////////////////////////// +void setmatrixtype(int le) { + switch (le) { + case 160: + strcpy(matrixtype, "gonnet_160"); + break; + case 4: + strcpy(matrixtype, "nuc_simple"); + break; + default: + strcpy(matrixtype, "CUSTOM"); + break; + + }; + +} + +/////////////////////////////////////////////////////////////////// +//sets matrix flag +/////////////////////////////////////////////////////////////////// +inline int matrixtype_to_int() { + + if (!strcmp(matrixtype, "nuc_simple")) + return 4; + else if (!strcmp(matrixtype, "gonnet_160")) + return 160; + else + return 1000; + +} + +///////////////////////////////////////////////////////////////// +// +// Can read any scoring matrix as long as it is defined in Matrix.h +// AND it is a lower triangular +// AND the order of amino acids/bases is mentioned +///////////////////////////////////////////////////////////////// + +inline void read_matrix(score_matrix matrx) { + int i, j, basecount, position = 0; + + bases = (char *) matrx.monomers; + + 
basecount = strlen(bases); + + for (i = 0; i < basecount; i++) + subst_index[i] = -1; + + for (i = 0; i < basecount; i++) + subst_index[bases[i] - 'A'] = i; + + if (TRACE == 1) + printf("\nbases read: %d\n", basecount); + + for (i = 0; i < basecount; i++) + for (j = 0; j <= i; j++) { + + double value = exp(argument.beta * matrx.matrix[position++]); + sub_matrix[i][j] = value; + sub_matrix[j][i] = value; + } + + if (TRACE) + for (i = 0; i < basecount; i++) { + for (j = 0; j < basecount; j++) + printf(" %g ", sub_matrix[i][j]); + printf("\n"); + } + +} + +///////////////////////////////////////////////////////////////// +// read normalized residue exchange matrix +// compute sequence similarity +// add by YE Yongtao +///////////////////////////////////////////////////////////////// + +inline void read_normalized_matrix(score_matrix matrx) { + int i, j, basecount, position = 0; + + bases = (char *) matrx.monomers; + + basecount = strlen(bases); + + for (i = 0; i < basecount; i++) + subst_index[i] = -1; + + for (i = 0; i < basecount; i++) + subst_index[bases[i] - 'A'] = i; + + if (TRACE == 1) + printf("\nbases read: %d\n", basecount); + + for (i = 0; i < basecount; i++) + for (j = 0; j <= i; j++) { + + double value = matrx.matrix[position++]; + normalized_matrix[i][j] = value; + normalized_matrix[j][i] = value; + } + + if (TRACE) + for (i = 0; i < basecount; i++) { + for (j = 0; j < basecount; j++) + printf(" %g ", normalized_matrix[i][j]); + printf("\n"); + } + +} +////////////////////////////////////////////////////////////////////////////////// +//intialize the arguments (default values) +////////////////////////////////////////////////////////////////////////////////// +void init_arguments() { + float gap_open = 0, gap_ext = 0; + int le; + + le = matrixtype_to_int(); + + argument.N = 1; + strcpy(argument.input, "tempin"); + argument.matrix = le; + argument.gapopen = GAPOPEN; + argument.gapext = GAPEXT; + argument.T = TEMPERATURE; + argument.beta = 1.0 / 
TEMPERATURE; + argument.opt = 'P'; + + if (le == 4) //NUC OPTION :default is nuc_simple + { + read_matrix(nuc_simple); + gap_open = -4; + gap_ext = -0.25; + } + + else if (le == 160) //PROT option: default is gonnet_160 + { + if (TRACE) + printf("read matrix\n"); + read_matrix(gonnet_160); + gap_open = -22; + gap_ext = -1; + + read_normalized_matrix(normalized_blosum_30); // add by YE Yongtao + } else if (le == 1000) { //Error handling + printf("Error: enter a valid matrix type\n"); + exit(1); + //additional matrices can only be lower triangular + } + + //now override the gapopen and gapext + if (argument.gapopen != 0.0 || argument.gapext != 0.00) + + { + gap_open = -argument.gapopen; + gap_ext = -argument.gapext; + } + + if (TRACE) + printf("%f %f %f %d\n", argument.T, gap_open, gap_ext, le); + + argument.gapopen = gap_open; + argument.gapext = gap_ext; + argument.opt = 'P'; + +} diff --git a/binaries/src/GLProbs-1.0/MSAReadMatrix.h b/binaries/src/GLProbs-1.0/MSAReadMatrix.h new file mode 100644 index 0000000..0f0ad00 --- /dev/null +++ b/binaries/src/GLProbs-1.0/MSAReadMatrix.h @@ -0,0 +1,151 @@ +///////////////////////////////////////////////////////////////// +// Matrix.h +// +// Specifies scoring matrices and their structure +// +// +// +///////////////////////////////////////////////////////////////// + +#ifndef _MSA_READ_MATRIX_H +#define _MSA_READ_MATRIX_H + +typedef struct { + char monomers[26]; /* amino or nucleic acid order */ + float matrix[676]; /* entries of the score matix, 26*26=676 */ +} score_matrix; + +//default protein sequence scoring matrix as well as default scoring matrix of the PROBALIGN +//also used when -prot option is used + +score_matrix gonnet_160 = { "ABCDEFGHIKLMNPQRSTVWXYZ", + +{ 4.6, 0.0, 0.0, 0.3, 0.0, 13.5, -1.1, 0.0, -5.3, 7.0, -0.4, 0.0, -5.2, 3.4, + 5.9, -3.8, 0.0, -1.8, -7.0, -6.2, 9.1, 0.2, 0.0, -3.4, -0.7, -2.1, -7.6, + 8.2, -1.8, 0.0, -2.3, -0.1, -0.1, -0.7, -2.7, 9.3, -1.8, 0.0, -2.5, + -6.2, -4.3, 0.3, -7.0, -3.7, 5.9, 
-1.2, 0.0, -4.8, -0.1, 1.3, -5.3, + -2.4, 0.2, -3.5, 5.5, -2.2, 0.0, -2.9, -6.5, -4.5, 1.9, -6.7, -3.2, 3.0, + -3.4, 5.7, -1.2, 0.0, -1.9, -5.0, -3.1, 1.4, -5.2, -2.1, 2.9, -2.1, 3.4, + 7.6, -1.2, 0.0, -3.1, 2.6, 0.5, -4.7, -0.2, 1.5, -4.4, 0.8, -4.8, -3.6, + 6.5, -0.1, 0.0, -5.2, -1.9, -1.4, -5.8, -3.0, -2.2, -4.3, -1.6, -3.5, + -4.2, -2.2, 9.6, -0.7, 0.0, -4.2, 0.6, 2.3, -4.1, -2.1, 1.7, -3.2, 2.0, + -2.4, -1.2, 0.5, -0.8, 5.6, -1.6, 0.0, -3.5, -1.6, -0.3, -5.3, -2.1, + 0.3, -4.1, 3.5, -3.5, -2.9, -0.4, -2.1, 1.7, 7.1, 1.6, 0.0, -0.2, 0.0, + -0.3, -4.5, -0.1, -0.8, -3.3, -0.4, -3.6, -2.3, 1.1, 0.0, -0.2, -0.9, + 4.4, 0.5, 0.0, -1.4, -0.6, -0.8, -3.6, -2.4, -0.8, -1.2, -0.2, -2.4, + -1.1, 0.3, -0.4, -0.4, -0.9, 2.3, 5.0, 0.1, 0.0, -0.6, -4.9, -3.0, -0.8, + -5.2, -3.5, 4.0, -3.0, 1.7, 1.4, -3.8, -3.2, -2.7, -3.4, -2.0, 0.0, 5.3, + -5.5, 0.0, -2.1, -7.8, -6.4, 3.2, -5.5, -1.9, -3.4, -5.4, -2.0, -2.2, + -5.5, -7.4, -4.0, -2.4, -4.7, -5.4, -4.5, 15.8, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, -3.7, 0.0, -1.3, -4.2, -4.4, 5.6, -6.0, 2.7, -2.0, -3.5, -1.1, + -1.3, -2.2, -4.8, -2.9, -2.9, -2.8, -3.2, -2.4, 3.8, 0.0, 10.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 } + +}; + +//normalized blosum_62 scoring matrix for computing protein sequence similarity +score_matrix normalized_blosum_62 = { + +"ABCDEFGHIKLMNPQRSTVWXYZ", +{ +0.533333333, +0.133333333,0.533333333, +0.266666667,0.066666667,0.866666667, +0.133333333,0.533333333,0.066666667,0.666666667, +0.2,0.333333333,0,0.4,0.6, +0.133333333,0.066666667 ,0.133333333,0.066666667,0.066666667,0.666666667, +0.266666667,0.2 ,0.066666667,0.2,0.133333333,0.066666667,0.666666667, +0.133333333,0.266666667 ,0.066666667,0.2,0.266666667,0.2,0.133333333,0.8, +0.2,0.066666667 ,0.2,0.066666667,0.066666667,0.266666667,0,0.066666667,0.533333333, +0.2,0.266666667 
,0.066666667,0.2,0.333333333,0.066666667,0.133333333,0.2,0.066666667,0.6, +0.2,0,0.2,0,0.066666667,0.266666667 ,0,0.066666667 ,0.4,0.133333333,0.533333333, +0.2,0.066666667 ,0.2,0.066666667,0.133333333,0.266666667,0.066666667 ,0.133333333,0.333333333,0.2,0.4,0.6, +0.133333333,0.466666667 ,0.066666667,0.333333333,0.266666667,0.066666667,0.266666667,0.333333333,0.066666667, 0.266666667,0.066666667 ,0.133333333,0.666666667, +0.2,0.133333333 ,0.066666667,0.2,0.2, 0,0.133333333, 0.133333333,0.066666667 ,0.2,0.066666667,0.133333333, 0.133333333,0.733333333 , +0.2,0.266666667 ,0.066666667,0.266666667,0.4,0.066666667,0.133333333, 0.266666667,0.066666667 ,0.333333333, 0.133333333,0.266666667 ,0.266666667,0.2,0.6, +0.2,0.2 ,0.066666667,0.133333333,0.266666667,0.066666667,0.133333333,0.266666667,0.066666667,0.4, 0.133333333,0.2,0.266666667,0.133333333,0.333333333,0.6 , +0.333333333,0.266666667 ,0.2,0.266666667,0.266666667,0.133333333,0.266666667,0.2,0.133333333, 0.266666667,0.133333333 ,0.2,0.333333333,0.2,0.266666667,0.2,0.533333333, +0.266666667, 0.2,0.2,0.2, 0.2, 0.133333333,0.133333333,0.133333333,0.2 ,0.2,0.2,0.2,0.266666667,0.2, 0.2,0.2 ,0.333333333,0.6, +0.266666667,0.066666667 ,0.2,0.066666667,0.133333333,0.2,0.066666667,0.066666667,0.466666667,0.133333333, 0.333333333,0.333333333, 0.066666667,0.133333333,0.133333333 ,0.066666667, 0.133333333, 0.266666667, 0.533333333, +0.066666667,0,0.133333333,0,0.066666667 ,0.333333333 , 0.133333333, 0.133333333,0.066666667 , 0.066666667, 0.133333333 ,0.2 , 0, 0,0.133333333 ,0.066666667 , 0.066666667 , 0.133333333 , 0.066666667 , 1, +0.266666667,0.2 ,0.133333333 , 0.2 , 0.2 , 0.2 , 0.2,0.2 , 0.2,0.2 ,0.2 , 0.2 , 0.2 , 0.133333333, 0.2 ,0.2 , 0.266666667 , 0.266666667,0.2 ,0.133333333, 0.2 , +0.133333333,0.066666667 , 0.133333333 , 0.066666667 , 0.133333333 , 0.466666667, 0.066666667,0.4 ,0.2, 0.133333333 , 0.2 , 0.2, 0.133333333 , 0.066666667, 0.2, 0.133333333,0.133333333 ,0.133333333 , 0.2 , 0.4,0.2 , 0.733333333, +0.2,0.333333333 
,0.066666667 ,0.333333333 , 0.533333333 , 0.066666667, 0.133333333, 0.266666667,0.066666667 ,0.333333333 , 0.066666667, 0.2, 0.266666667, 0.2 , 0.466666667,0.266666667 ,0.266666667, 0.2 , 0.133333333, 0.066666667 , 0.2,0.133333333 ,0.533333333 + +} +}; + +//normalized blosum_30 scoring matrix for computing protein sequence similarity +score_matrix normalized_blosum_30 = { + +"ABCDEFGHIKLMNPQRSTVWXYZ", +{ +0.407407407 , +0.259259259 , 0.444444444 , +0.148148148 , 0.185185185 , 0.888888889 , +0.259259259 , 0.444444444 , 0.148148148 , 0.592592593 , +0.259259259 , 0.259259259 , 0.296296296 , 0.296296296 , 0.481481481 , +0.185185185 , 0.148148148 , 0.148148148 , 0.074074074 , 0.111111111 , 0.62962963 , +0.259259259 , 0.259259259 , 0.111111111 , 0.222222222 , 0.185185185 , 0.148148148 , 0.555555556 , +0.185185185 , 0.185185185 , 0.074074074 , 0.185185185 , 0.259259259 , 0.148148148 , 0.148148148 , 0.777777778 , +0.259259259 , 0.185185185 , 0.185185185 , 0.111111111 , 0.148148148 , 0.259259259 , 0.222222222 , 0.185185185 , 0.481481481 , +0.259259259 , 0.259259259 , 0.148148148 , 0.259259259 , 0.333333333 , 0.222222222 , 0.222222222 , 0.185185185 , 0.185185185 , 0.407407407 , +0.222222222 , 0.222222222 , 0.259259259 , 0.222222222 , 0.222222222 , 0.333333333 , 0.185185185 , 0.222222222 , 0.333333333 , 0.185185185 , 0.407407407 , +0.296296296 , 0.185185185 , 0.185185185 , 0.148148148 , 0.222222222 , 0.185185185 , 0.185185185 , 0.333333333 , 0.296296296 , 0.333333333 , 0.333333333 , 0.481481481 , +0.259259259 , 0.407407407 , 0.222222222 , 0.296296296 , 0.222222222 , 0.222222222 , 0.259259259 , 0.222222222 , 0.259259259 , 0.259259259 , 0.185185185 , 0.259259259 , 0.555555556 , +0.222222222 , 0.185185185 , 0.148148148 , 0.222222222 , 0.296296296 , 0.111111111 , 0.222222222 , 0.296296296 , 0.148148148 , 0.296296296 , 0.148148148 , 0.111111111 , 0.148148148 , 0.666666667 , +0.296296296 , 0.222222222 , 0.185185185 , 0.222222222 , 0.333333333 , 0.148148148 , 0.185185185 , 
0.259259259 , 0.185185185 , 0.259259259 , 0.185185185 , 0.222222222 , 0.222222222 , 0.259259259 , 0.555555556 , +0.222222222 , 0.185185185 , 0.185185185 , 0.222222222 , 0.222222222 , 0.222222222 , 0.185185185 , 0.222222222 , 0.148148148 , 0.296296296 , 0.185185185 , 0.259259259 , 0.185185185 , 0.222222222 , 0.37037037 , 0.555555556 , +0.296296296 , 0.259259259 , 0.185185185 , 0.259259259 , 0.259259259 , 0.222222222 , 0.259259259 , 0.222222222 , 0.222222222 , 0.259259259 , 0.185185185 , 0.185185185 , 0.259259259 , 0.222222222 , 0.222222222 , 0.222222222 , 0.407407407 , +0.296296296 , 0.259259259 , 0.185185185 , 0.222222222 , 0.185185185 , 0.185185185 , 0.185185185 , 0.185185185 , 0.259259259 , 0.222222222 , 0.259259259 , 0.259259259 , 0.296296296 , 0.259259259 , 0.259259259 , 0.148148148 , 0.333333333 , 0.444444444 , +0.296296296 , 0.185185185 , 0.185185185 , 0.185185185 , 0.148148148 , 0.296296296 , 0.148148148 , 0.148148148 , 0.407407407 , 0.185185185 , 0.296296296 , 0.259259259 , 0.185185185 , 0.111111111 , 0.148148148 , 0.222222222 , 0.222222222 , 0.296296296 , 0.444444444 , +0.074074074 , 0.074074074 , 0.185185185 , 0.111111111 , 0.222222222 , 0.296296296 , 0.296296296 , 0.074074074 , 0.148148148 , 0.185185185 , 0.185185185 , 0.148148148 , 0 , 0.148148148 , 0.222222222 , 0.259259259 , 0.148148148 , 0.074074074 , 0.148148148 , 1 , +0.259259259 , 0.222222222 , 0.185185185 , 0.222222222 , 0.222222222 , 0.222222222 , 0.222222222 , 0.222222222 , 0.259259259 , 0.259259259 , 0.259259259 , 0.259259259 , 0.259259259 , 0.222222222 , 0.259259259 , 0.222222222 , 0.259259259 , 0.259259259 , 0.259259259 , 0.185185185 , 0.222222222 , +0.111111111 , 0.148148148 , 0.037037037 , 0.222222222 , 0.185185185 , 0.37037037 , 0.148148148 , 0.259259259 , 0.222222222 , 0.222222222 , 0.37037037 , 0.222222222 , 0.111111111 , 0.185185185 , 0.222222222 , 0.259259259 , 0.185185185 , 0.222222222 , 0.296296296 , 0.444444444 , 0.222222222 , 0.592592593 , +0.259259259 , 0.259259259 , 0.259259259 
, 0.259259259 , 0.444444444 , 0.111111111 , 0.185185185 , 0.259259259 , 0.148148148 , 0.296296296 , 0.222222222 , 0.222222222 , 0.222222222 , 0.259259259 , 0.407407407 , 0.259259259 , 0.222222222 , 0.222222222 , 0.148148148 , 0.222222222 , 0.259259259 , 0.185185185 , 0.407407407 + +} +}; + +//default nucleotide sequence scoring matrix +//used when -nuc option is used +score_matrix nuc_simple = { + +"ABCDGHKMNRSTUVWXY", + +{ 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0 } + +//Ribosum85-60 + /* + { + 2.22, + 0, 0, + -1.86, 0, 1.16, + 0, 0, 0, 0, + -1.46, 0, -2.48, 0, 1.03, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -1.39, 0, -1.05, 0, -1.74, 0, 0, 0, 0, 0, 0, 1.65, + -1.39, 0, -1.05, 0, -1.74, 0, 0, 0, 0, 0, 0, 0, 1.65, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + } + */ + + }; + +#endif diff --git a/binaries/src/GLProbs-1.0/MSAdiv3.cpp b/binaries/src/GLProbs-1.0/MSAdiv3.cpp new file mode 100644 index 0000000..2d0411d --- /dev/null +++ b/binaries/src/GLProbs-1.0/MSAdiv3.cpp @@ -0,0 +1,1472 @@ +/*********************************************** + * # Copyright 2009-2010. Liu Yongchao + * # Contact: Liu Yongchao, School of Computer Engineering, + * # Nanyang Technological University. + * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com + * # + * # GPL version 3.0 applies. 
+ * # + * ************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "MSA.h" +#include "MSAClusterTree.h" +#include "Defaults.h" + +#ifdef _OPENMP +#include +#endif + +string parametersInputFilename = ""; +string parametersOutputFilename = "no training"; +string annotationFilename = ""; + +bool enableVerbose = false; +bool enableAnnotation = false; +bool enableClustalWOutput = false; +bool enableAlignOrder = false; +int numConsistencyReps = 2; +int numPreTrainingReps = 0; +int numIterativeRefinementReps = 100; + +float cutoff = 0; + +VF initDistrib(NumMatrixTypes); +VF gapOpen(2 * NumInsertStates); +VF gapExtend(2 * NumInsertStates); +VVF emitPairs(256, VF(256, 1e-10)); +VF emitSingle(256, 1e-5); + +string alphabet = alphabetDefault; + +const int MIN_PRETRAINING_REPS = 0; +const int MAX_PRETRAINING_REPS = 20; +const int MIN_CONSISTENCY_REPS = 0; +const int MAX_CONSISTENCY_REPS = 5; +const int MIN_ITERATIVE_REFINEMENT_REPS = 0; +const int MAX_ITERATIVE_REFINEMENT_REPS = 1000; + +string posteriorProbsFilename = ""; +bool allscores = true; +string infilename; + +int flag_gui = 0; //0: no gui related o/p +//1: gui related o/p generated +int flag_ppscore = 0; //0: no pp score sequence added to o/p fasta alignment +//1: pp score seq added to o/p fasta alignment + +/////////////////////////////// +// global scoring matrix variables +////////////////////////////// +float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2; +char *aminos, *bases, matrixtype[20] = "gonnet_160"; +int subst_index[26]; + +double sub_matrix[26][26]; +int firstread = 0; //this makes sure that matrices are read only once + +float TEMPERATURE = 5; +int MATRIXTYPE = 160; +int prot_nuc = 0; //0=prot, 1=nucleotide + +float GAPOPEN = 0; +float GAPEXT = 0; +int numThreads = 0; + +//argument support +typedef struct { + char input[30]; + int matrix; + int N; + float T; + float beta; + 
char opt; //can be 'P' or 'M' + float gapopen; + float gapext; +} argument_decl; + +argument_decl argument; + +extern inline void read_sustitution_matrix(char *fileName); +extern void setmatrixtype(int le); +extern inline int matrixtype_to_int(); +extern inline void read_dna_matrix(); +extern inline void read_vtml_la_matrix(); +extern void init_arguments(); + +MSA::MSA(int argc, char* argv[]) { + //parse program parameters + SafeVector sequenceNames = ParseParams(argc, argv); + + //initialize arguments for partition function + init_arguments(); + + ReadParameters(); + //PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL); + + //read the input sequences + MultiSequence *sequences = new MultiSequence(); + assert(sequences); + for (int i = 0; i < (int) sequenceNames.size(); i++) { + cerr << "Loading sequence file: " << sequenceNames[i] << endl; + sequences->LoadMFA(sequenceNames[i], true); + } + //allocate space for sequence weights + this->seqsWeights = new int[sequences->GetNumSequences()]; + //initilaize parameters for OPENMP +#ifdef _OPENMP + if(numThreads <= 0) { + numThreads = omp_get_num_procs(); + cerr << "Automatically detected " << numThreads << " CPU cores" << endl; + } + cerr <<"Enabling OpenMP (with "<WriteALN(*alignOutFile); + } else { + alignment->WriteMFA(*alignOutFile); + } + //release resources + delete[] this->seqsWeights; + delete alignment; + delete sequences; +} +MSA::~MSA() { + /*close the output file*/ + if (alignOutFileName.length() > 0) { + ((std::ofstream*) alignOutFile)->close(); + } +} +///////////////////////////////////////////////////////////////// +// PrintParameters() +// +// Prints MSAPROBS parameters to STDERR. If a filename is +// specified, then the parameters are also written to the file. 
+///////////////////////////////////////////////////////////////// + +void MSA::PrintParameters(const char *message, const VF &initDistrib, + const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs, + const VF &emitSingle, const char *filename) { + + // print parameters to the screen + cerr << message << endl << " initDistrib[] = { "; + for (int i = 0; i < NumMatrixTypes; i++) + cerr << setprecision(10) << initDistrib[i] << " "; + cerr << "}" << endl << " gapOpen[] = { "; + for (int i = 0; i < NumInsertStates * 2; i++) + cerr << setprecision(10) << gapOpen[i] << " "; + cerr << "}" << endl << " gapExtend[] = { "; + for (int i = 0; i < NumInsertStates * 2; i++) + cerr << setprecision(10) << gapExtend[i] << " "; + cerr << "}" << endl << endl; + + /* + for (int i = 0; i < 5; i++){ + for (int j = 0; j <= i; j++){ + cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " "; + } + cerr << endl; + }*/ + + // if a file name is specified + if (filename) { + + // attempt to open the file for writing + FILE *file = fopen(filename, "w"); + if (!file) { + cerr << "ERROR: Unable to write parameter file: " << filename + << endl; + exit(1); + } + + // if successful, then write the parameters to the file + for (int i = 0; i < NumMatrixTypes; i++) + fprintf(file, "%.10f ", initDistrib[i]); + fprintf(file, "\n"); + for (int i = 0; i < 2 * NumInsertStates; i++) + fprintf(file, "%.10f ", gapOpen[i]); + fprintf(file, "\n"); + for (int i = 0; i < 2 * NumInsertStates; i++) + fprintf(file, "%.10f ", gapExtend[i]); + fprintf(file, "\n"); + fprintf(file, "%s\n", alphabet.c_str()); + for (int i = 0; i < (int) alphabet.size(); i++) { + for (int j = 0; j <= i; j++) + fprintf(file, "%.10f ", + emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]); + fprintf(file, "\n"); + } + for (int i = 0; i < (int) alphabet.size(); i++) + fprintf(file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]); + fprintf(file, "\n"); + fclose(file); + } +} + 
+///////////////////////////////////////////////////////////////// +// doAlign() +// +// First computes all pairwise posterior probability matrices. +// Then, computes new parameters if training, or a final +// alignment, otherwise. +///////////////////////////////////////////////////////////////// +extern VF *ComputePostProbs(int a, int b, string seq1, string seq2); +MultiSequence* MSA::doAlign(MultiSequence *sequences, + const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, + VF &gapExtend, VVF &emitPairs, VF &emitSingle) { + assert(sequences); + + //get the number of sequences + const int numSeqs = sequences->GetNumSequences(); + + //create distance matrix + VVF probalign_distances(numSeqs, VF(numSeqs, 0)); + VVF distances(numSeqs, VF(numSeqs, 0));//msa + + float gl_accuracy = 0; + //creat sparseMatrices + SafeVector > probalign_sparseMatrices(numSeqs, + SafeVector(numSeqs, NULL)); + SafeVector > sparseMatrices(numSeqs, + SafeVector(numSeqs, NULL)); // msa + +#ifdef _OPENMP + //calculate sequence pairs for openmp model + int pairIdx = 0; + numPairs = (numSeqs - 1) * numSeqs / 2; + seqsPairs = new SeqsPair[numPairs]; + for(int a = 0; a < numSeqs; a++) { + for(int b = a + 1; b < numSeqs; b++) { + seqsPairs[pairIdx].seq1 = a; + seqsPairs[pairIdx].seq2 = b; + pairIdx++; + } + } +#endif + // do all pairwise alignments for posterior probability matrices +#ifdef _OPENMP +#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic) + for(pairIdx = 0; pairIdx < numPairs; pairIdx++) { + int a= seqsPairs[pairIdx].seq1; + int b = seqsPairs[pairIdx].seq2; + if(enableVerbose) { +#pragma omp critical + cerr <<"tid "<GetSequence(a); + Sequence *seq2 = sequences->GetSequence(b); + + // verbose output + if (enableVerbose) { + cerr << "Computing posterior matrix: (" << a + 1 << ") " + << seq1->GetHeader() << " vs. 
" << "(" << b + 1 << ") " + << seq2->GetHeader() << " -- "; + } + +//probcons + // compute forward and backward probabilities + VF *forward = model.ComputeForwardMatrix(seq1, seq2); + assert(forward); + VF *backward = model.ComputeBackwardMatrix(seq1, seq2); + assert(backward); + // compute posterior probability matrix from HMM + VF *probcons_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward); + assert(probcons_posterior); + delete forward; + delete backward; + +//probalign + VF *probalign_posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString()); + assert(probalign_posterior); + probalign_sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),seq2->GetLength(), *probalign_posterior); + probalign_sparseMatrices[b][a] = NULL; + pair *, float> probalign_alignment = model.ComputeAlignment( + seq1->GetLength(), seq2->GetLength(), *probalign_posterior); + probalign_distances[a][b] =1.0f - probalign_alignment.second / min(seq1->GetLength(), seq2->GetLength()); + delete probalign_alignment.first; + +//local + forward = model.ComputeForwardMatrix(seq1, seq2,false); + assert(forward); + backward = model.ComputeBackwardMatrix(seq1, seq2,false); + assert(backward); + VF* local_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false); + delete forward; + delete backward; + +//GL + //merge probalign + local + probcons + VF::iterator ptr1 = probcons_posterior->begin(); + VF::iterator ptr2 = probalign_posterior->begin(); + VF::iterator ptr3 = local_posterior->begin(); + VF* posterior = new VF((seq1->GetLength()+1) * (seq2->GetLength()+1)); assert (posterior); //msa + VF::iterator ptr = posterior->begin(); + for (int i = 0; i <= seq1->GetLength(); i++) { + for (int j = 0; j <= seq2->GetLength(); j++) { + float v1 = *ptr1; + float v2 = *ptr2; + float v3 = *ptr3; + *ptr = sqrt((v1*v1 + v2*v2 + v3*v3)/3); + ptr1++; + ptr2++; + ptr3++; + ptr++; + } + } + // perform the pairwise sequence alignment + pair *, float> 
gl_alignment = model.ComputeAlignment( + seq1->GetLength(), seq2->GetLength(), *posterior); + + //compute expected accuracy + distances[a][b] = distances[b][a] = 1.0f - gl_alignment.second + / min(seq1->GetLength(), seq2->GetLength()); + + // compute sparse representations + sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(), + seq2->GetLength(), *posterior); + sparseMatrices[b][a] = NULL; + // + SafeVector::iterator iter1 = seq1->GetDataPtr(); + SafeVector::iterator iter2 = seq2->GetDataPtr(); + float N_correct_match = 0; + int i = 1;int j = 1; + for (SafeVector::iterator iter = gl_alignment.first->begin(); + iter != gl_alignment.first->end(); ++iter){ + if (*iter == 'B'){ + unsigned char c1 = (unsigned char) iter1[i++]; + unsigned char c2 = (unsigned char) iter2[j++]; + if(c1==c2) N_correct_match += 1; + } + else if(*iter == 'X') i++; + else if(*iter == 'Y') j++; + } + if(i!= seq1->GetLength()+1 || j!= seq2->GetLength() + 1 ) cerr << "GL"<< endl; + gl_accuracy += N_correct_match / min(seq1->GetLength(), seq2->GetLength()); + // + delete probcons_posterior; + delete probalign_posterior; + delete local_posterior; + delete posterior; + +#ifndef _OPENMP + } +#endif + } + +/* +//self-adaptive + gl_accuracy /= numPairs; + if(gl_accuracy > 0.4){ + for (int a = 0; a < numSeqs - 1; a++) + for (int b = a + 1; b < numSeqs; b++) { + distances[a][b] = distances[b][a] = probalign_distances[a][b]; + sparseMatrices[a][b] = probalign_sparseMatrices[a][b]; + sparseMatrices[b][a] = NULL; + } + } +*/ + //create the guide tree + this->tree = new MSAClusterTree(this, distances, numSeqs); + this->tree->create(); + + // perform the consistency transformation the desired number of times + float* fweights = new float[numSeqs]; + for (int r = 0; r < numSeqs; r++) { + fweights[r] = ((float) seqsWeights[r]) / INT_MULTIPLY; + fweights[r] *= 10; + } + for (int r = 0; r < numConsistencyReps; r++) { + SafeVector > newSparseMatrices = + DoRelaxation(fweights, sequences, sparseMatrices); + 
+ // now replace the old posterior matrices + for (int i = 0; i < numSeqs; i++) { + for (int j = 0; j < numSeqs; j++) { + delete sparseMatrices[i][j]; + sparseMatrices[i][j] = newSparseMatrices[i][j]; + } + } + } + delete[] fweights; +#ifdef _OPENMP + delete [] seqsPairs; +#endif + + //compute the final multiple sequence alignment + MultiSequence *finalAlignment = ComputeFinalAlignment(this->tree, sequences, + sparseMatrices, model); + + // build annotation + if (enableAnnotation) { + WriteAnnotation(finalAlignment, sparseMatrices); + } + //destroy the guide tree + delete this->tree; + this->tree = 0; + + // delete sparse matrices + for (int a = 0; a < numSeqs - 1; a++) { + for (int b = a + 1; b < numSeqs; b++) { + delete sparseMatrices[a][b]; + delete sparseMatrices[b][a]; + } + } + + return finalAlignment; +} + +///////////////////////////////////////////////////////////////// +// GetInteger() +// +// Attempts to parse an integer from the character string given. +// Returns true only if no parsing error occurs. +///////////////////////////////////////////////////////////////// + +bool GetInteger(char *data, int *val) { + char *endPtr; + long int retVal; + + assert(val); + + errno = 0; + retVal = strtol(data, &endPtr, 0); + if (retVal == 0 && (errno != 0 || data == endPtr)) + return false; + if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN)) + return false; + if (retVal < (long) INT_MIN || retVal > (long) INT_MAX) + return false; + *val = (int) retVal; + return true; +} + +///////////////////////////////////////////////////////////////// +// GetFloat() +// +// Attempts to parse a float from the character string given. +// Returns true only if no parsing error occurs. 
+///////////////////////////////////////////////////////////////// + +bool GetFloat(char *data, float *val) { + char *endPtr; + double retVal; + + assert(val); + + errno = 0; + retVal = strtod(data, &endPtr); + if (retVal == 0 && (errno != 0 || data == endPtr)) + return false; + if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0)) + return false; + *val = (float) retVal; + return true; +} + +///////////////////////////////////////////////////////////////// +// ReadParameters() +// +// Read initial distribution, transition, and emission +// parameters from a file. +///////////////////////////////////////////////////////////////// + +void MSA::ReadParameters() { + + ifstream data; + + emitPairs = VVF(256, VF(256, 1e-10)); + emitSingle = VF(256, 1e-5); + + // read initial state distribution and transition parameters + if (parametersInputFilename == string("")) { + if (NumInsertStates == 1) { + for (int i = 0; i < NumMatrixTypes; i++) + initDistrib[i] = initDistrib1Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapOpen[i] = gapOpen1Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapExtend[i] = gapExtend1Default[i]; + } else if (NumInsertStates == 2) { + for (int i = 0; i < NumMatrixTypes; i++) + initDistrib[i] = initDistrib2Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapOpen[i] = gapOpen2Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapExtend[i] = gapExtend2Default[i]; + } else { + cerr + << "ERROR: No default initial distribution/parameter settings exist" + << endl << " for " << NumInsertStates + << " pairs of insert states. Use --paramfile." 
<< endl; + exit(1); + } + + alphabet = alphabetDefault; + + for (int i = 0; i < (int) alphabet.length(); i++) { + emitSingle[(unsigned char) tolower(alphabet[i])] = + emitSingleDefault[i]; + emitSingle[(unsigned char) toupper(alphabet[i])] = + emitSingleDefault[i]; + for (int j = 0; j <= i; j++) { + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = emitPairsDefault[i][j]; + } + } + } else { + data.open(parametersInputFilename.c_str()); + if (data.fail()) { + cerr << "ERROR: Unable to read parameter file: " + << parametersInputFilename << endl; + exit(1); + } + + string line[3]; + for (int i = 0; i < 3; i++) { + if (!getline(data, line[i])) { + cerr + << "ERROR: Unable to read transition parameters from parameter file: " + << parametersInputFilename << endl; + exit(1); + } + } + istringstream data2; + data2.clear(); + data2.str(line[0]); + for (int i = 0; i < NumMatrixTypes; i++) + data2 >> initDistrib[i]; + data2.clear(); + data2.str(line[1]); + for (int i = 0; i < 2 * NumInsertStates; i++) + data2 >> gapOpen[i]; + data2.clear(); + data2.str(line[2]); + for (int i = 0; i < 2 * NumInsertStates; i++) + data2 >> 
gapExtend[i]; + + if (!getline(data, line[0])) { + cerr << "ERROR: Unable to read alphabet from scoring matrix file: " + << parametersInputFilename << endl; + exit(1); + } + + // read alphabet as concatenation of all characters on alphabet line + alphabet = ""; + string token; + data2.clear(); + data2.str(line[0]); + while (data2 >> token) + alphabet += token; + + for (int i = 0; i < (int) alphabet.size(); i++) { + for (int j = 0; j <= i; j++) { + float val; + data >> val; + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = val; + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = val; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = val; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = val; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = val; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = val; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = val; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = val; + } + } + + for (int i = 0; i < (int) alphabet.size(); i++) { + float val; + data >> val; + emitSingle[(unsigned char) tolower(alphabet[i])] = val; + emitSingle[(unsigned char) toupper(alphabet[i])] = val; + } + data.close(); + } +} + +///////////////////////////////////////////////////////////////// +// ParseParams() +// +// Parse all command-line options. 
+///////////////////////////////////////////////////////////////// +void MSA::printUsage() { + cerr + << "************************************************************************" + << endl + << "\tMSAPROBS is a open-source protein multiple sequence alignment algorithm" + << endl + << "\tbased on pair hidden markov model and partition function postirior" + << endl + << "\tprobabilities. If any comments or problems, please contact" + << endl + << "\tLiu Yongchao(liuy0039@ntu.edu.sg or nkcslyc@hotmail.com)" + << endl + << "*************************************************************************" + << endl << "Usage:" << endl + << " msaprobs [OPTION]... [infile]..." << endl << endl + << "Description:" << endl + << " Align sequences in multi-FASTA format" << endl << endl + << " -o, --outfile " << endl + << " specify the output file name (STDOUT by default)" + << endl << " -num_threads " << endl + << " specify the number of threads used, and otherwise detect automatically" + << endl << " -clustalw" << endl + << " use CLUSTALW output format instead of FASTA format" + << endl << endl << " -c, --consistency REPS" << endl + << " use " << MIN_CONSISTENCY_REPS << " <= REPS <= " + << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps + << ") passes of consistency transformation" << endl << endl + << " -ir, --iterative-refinement REPS" << endl + << " use " << MIN_ITERATIVE_REFINEMENT_REPS + << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS << " (default: " + << numIterativeRefinementReps << ") passes of iterative-refinement" + << endl << endl << " -v, --verbose" << endl + << " report progress while aligning (default: " + << (enableVerbose ? "on" : "off") << ")" << endl << endl + << " -annot FILENAME" << endl + << " write annotation for multiple alignment to FILENAME" + << endl << endl << " -a, --alignment-order" << endl + << " print sequences in alignment order rather than input order (default: " + << (enableAlignOrder ? 
"on" : "off") << ")" << endl + << " -version " << endl + << " print out version of MSAPROBS " << endl << endl; +} +SafeVector MSA::ParseParams(int argc, char **argv) { + if (argc < 2) { + printUsage(); + exit(1); + } + SafeVector sequenceNames; + int tempInt; + float tempFloat; + + for (int i = 1; i < argc; i++) { + if (argv[i][0] == '-') { + //help + if (!strcmp(argv[i], "-help") || !strcmp(argv[i], "-?")) { + printUsage(); + exit(1); + //output file name + } else if (!strcmp(argv[i], "-o") + || !strcmp(argv[i], "--outfile")) { + if (i < argc - 1) { + alignOutFileName = argv[++i]; //get the file name + } else { + cerr << "ERROR: String expected for option " << argv[i] + << endl; + exit(1); + } + // parameter file + } else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){ + if (i < argc - 1) + parametersInputFilename = string (argv[++i]); + else { + cerr << "ERROR: Filename expected for option " << argv[i] << endl; + exit (1); + } + //number of threads used + } else if (!strcmp(argv[i], "-p") + || !strcmp(argv[i], "-num_threads")) { + if (i < argc - 1) { + if (!GetInteger(argv[++i], &tempInt)) { + cerr << " ERROR: invalid integer following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempInt < 0) { + tempInt = 0; + } + numThreads = tempInt; + } + } else { + cerr << "ERROR: Integer expected for option " << argv[i] + << endl; + exit(1); + } + // number of consistency transformations + } else if (!strcmp(argv[i], "-c") + || !strcmp(argv[i], "--consistency")) { + if (i < argc - 1) { + if (!GetInteger(argv[++i], &tempInt)) { + cerr << "ERROR: Invalid integer following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempInt < MIN_CONSISTENCY_REPS + || tempInt > MAX_CONSISTENCY_REPS) { + cerr << "ERROR: For option " << argv[i - 1] + << ", integer must be between " + << MIN_CONSISTENCY_REPS << " and " + << MAX_CONSISTENCY_REPS << "." 
<< endl; + exit(1); + } else { + numConsistencyReps = tempInt; + } + } + } else { + cerr << "ERROR: Integer expected for option " << argv[i] + << endl; + exit(1); + } + } + + // number of randomized partitioning iterative refinement passes + else if (!strcmp(argv[i], "-ir") + || !strcmp(argv[i], "--iterative-refinement")) { + if (i < argc - 1) { + if (!GetInteger(argv[++i], &tempInt)) { + cerr << "ERROR: Invalid integer following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS + || tempInt > MAX_ITERATIVE_REFINEMENT_REPS) { + cerr << "ERROR: For option " << argv[i - 1] + << ", integer must be between " + << MIN_ITERATIVE_REFINEMENT_REPS << " and " + << MAX_ITERATIVE_REFINEMENT_REPS << "." + << endl; + exit(1); + } else + numIterativeRefinementReps = tempInt; + } + } else { + cerr << "ERROR: Integer expected for option " << argv[i] + << endl; + exit(1); + } + } + + // annotation files + else if (!strcmp(argv[i], "-annot")) { + enableAnnotation = true; + if (i < argc - 1) { + annotationFilename = argv[++i]; + } else { + cerr << "ERROR: FILENAME expected for option " << argv[i] + << endl; + exit(1); + } + } + + // clustalw output format + else if (!strcmp(argv[i], "-clustalw")) { + enableClustalWOutput = true; + } + + // cutoff + else if (!strcmp(argv[i], "-co") || !strcmp(argv[i], "--cutoff")) { + if (i < argc - 1) { + if (!GetFloat(argv[++i], &tempFloat)) { + cerr + << "ERROR: Invalid floating-point value following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempFloat < 0 || tempFloat > 1) { + cerr << "ERROR: For option " << argv[i - 1] + << ", floating-point value must be between 0 and 1." 
+ << endl; + exit(1); + } else + cutoff = tempFloat; + } + } else { + cerr << "ERROR: Floating-point value expected for option " + << argv[i] << endl; + exit(1); + } + } + + // verbose reporting + else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--verbose")) { + enableVerbose = true; + } + + // alignment order + else if (!strcmp(argv[i], "-a") + || !strcmp(argv[i], "--alignment-order")) { + enableAlignOrder = true; + } + + //print out version + else if (!strcmp(argv[i], "-version")) { + cerr << "MSAPROBS version " << VERSION << endl; + exit(1); + } + // bad arguments + else { + cerr << "ERROR: Unrecognized option: " << argv[i] << endl; + exit(1); + } + } else { + sequenceNames.push_back(string(argv[i])); + } + } + + /*check the output file name*/ + cerr << "-------------------------------------" << endl; + if (alignOutFileName.length() == 0) { + cerr << "The final alignments will be printed out to STDOUT" << endl; + alignOutFile = &std::cout; + } else { + cerr << "Open the output file " << alignOutFileName << endl; + alignOutFile = new ofstream(alignOutFileName.c_str(), + ios::binary | ios::out | ios::trunc); + } + cerr << "-------------------------------------" << endl; + return sequenceNames; +} + +///////////////////////////////////////////////////////////////// +// ProcessTree() +// +// Process the tree recursively. Returns the aligned sequences +// corresponding to a node or leaf of the tree. 
+///////////////////////////////////////////////////////////////// +MultiSequence* MSA::ProcessTree(TreeNode *tree, MultiSequence *sequences, + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model) { + + MultiSequence *result; + + // check if this is a node of the alignment tree + //if (tree->GetSequenceLabel() == -1){ + if (tree->leaf == NODE) { + MultiSequence *alignLeft = ProcessTree(tree->left, sequences, + sparseMatrices, model); + MultiSequence *alignRight = ProcessTree(tree->right, sequences, + sparseMatrices, model); + + assert(alignLeft); + assert(alignRight); + + result = AlignAlignments(alignLeft, alignRight, sparseMatrices, model); + assert(result); + + delete alignLeft; + delete alignRight; + } + + // otherwise, this is a leaf of the alignment tree + else { + result = new MultiSequence(); + assert(result); + //result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone()); + result->AddSequence(sequences->GetSequence(tree->idx)->Clone()); + } + + return result; +} + +///////////////////////////////////////////////////////////////// +// ComputeFinalAlignment() +// +// Compute the final alignment by calling ProcessTree(), then +// performing iterative refinement as needed. 
+///////////////////////////////////////////////////////////////// + +MultiSequence* MSA::ComputeFinalAlignment(MSAGuideTree*tree, + MultiSequence *sequences, + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model) { + MultiSequence *alignment = ProcessTree(tree->getRoot(), sequences, + sparseMatrices, model); + + SafeVector oldOrdering; + if (enableAlignOrder) { + for (int i = 0; i < alignment->GetNumSequences(); i++) + oldOrdering.push_back(alignment->GetSequence(i)->GetSortLabel()); + alignment->SaveOrdering(); + enableAlignOrder = false; + } + + // tree-based refinement + // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree); + /* + int numSeqs = alignment->GetNumSequences(); + //if(numSeqs < numIterativeRefinementReps){ + for(int iter = 0; iter < 5; iter ++){ + for(int i = 0; i < numSeqs - 1; i++){ + DoIterativeRefinementTreeNode(sparseMatrices, model, alignment, i); + } + } + //}*/ + //Refinement return false:no improvement + for (int i = 0; i < numIterativeRefinementReps; i++) { + DoIterativeRefinement(sparseMatrices, model, alignment); + } + cerr << endl; + + if (oldOrdering.size() > 0) { + for (int i = 0; i < (int) oldOrdering.size(); i++) { + alignment->GetSequence(i)->SetSortLabel(oldOrdering[i]); + } + } + + // return final alignment + return alignment; +} + +///////////////////////////////////////////////////////////////// +// AlignAlignments() +// +// Returns the alignment of two MultiSequence objects. +///////////////////////////////////////////////////////////////// + +MultiSequence* MSA::AlignAlignments(MultiSequence *align1, + MultiSequence *align2, + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model) { + + // print some info about the alignment + if (enableVerbose) { + for (int i = 0; i < align1->GetNumSequences(); i++) + cerr << ((i == 0) ? "[" : ",") + << align1->GetSequence(i)->GetLabel(); + cerr << "] vs. "; + for (int i = 0; i < align2->GetNumSequences(); i++) + cerr << ((i == 0) ? 
"[" : ",") + << align2->GetSequence(i)->GetLabel(); + cerr << "]: "; + } +#if 1 + VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff); +#else + VF *posterior = model.BuildPosterior(getSeqsWeights(), align1, align2, + sparseMatrices, cutoff); +#endif + // compute an "accuracy" measure for the alignment before refinement + + pair *, float> alignment; + //perform alignment + alignment = model.ComputeAlignment(align1->GetSequence(0)->GetLength(), + align2->GetSequence(0)->GetLength(), *posterior); + + delete posterior; + + if (enableVerbose) { + + // compute total length of sequences + int totLength = 0; + for (int i = 0; i < align1->GetNumSequences(); i++) + for (int j = 0; j < align2->GetNumSequences(); j++) + totLength += min(align1->GetSequence(i)->GetLength(), + align2->GetSequence(j)->GetLength()); + + // give an "accuracy" measure for the alignment + cerr << alignment.second / totLength << endl; + } + + // now build final alignment + MultiSequence *result = new MultiSequence(); + for (int i = 0; i < align1->GetNumSequences(); i++) + result->AddSequence( + align1->GetSequence(i)->AddGaps(alignment.first, 'X')); + for (int i = 0; i < align2->GetNumSequences(); i++) + result->AddSequence( + align2->GetSequence(i)->AddGaps(alignment.first, 'Y')); + if (!enableAlignOrder) + result->SortByLabel(); + + // free temporary alignment + delete alignment.first; + + return result; +} + +///////////////////////////////////////////////////////////////// +// DoRelaxation() +// +// Performs one round of the weighted probabilistic consistency transformation. 
+// 1 +///////////////////////////////////////////////////////////////// + +SafeVector > MSA::DoRelaxation(float* seqsWeights, + MultiSequence *sequences, + SafeVector > &sparseMatrices) { + const int numSeqs = sequences->GetNumSequences(); + + SafeVector > newSparseMatrices(numSeqs, + SafeVector(numSeqs, NULL)); + + // for every pair of sequences +#ifdef _OPENMP + int pairIdx; +#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic) + for(pairIdx = 0; pairIdx < numPairs; pairIdx++) { + int i = seqsPairs[pairIdx].seq1; + int j = seqsPairs[pairIdx].seq2; + float wi = seqsWeights[i]; + float wj = seqsWeights[j]; +#else + for (int i = 0; i < numSeqs; i++) { + float wi = seqsWeights[i]; + for (int j = i + 1; j < numSeqs; j++) { + float wj = seqsWeights[j]; +#endif + Sequence *seq1 = sequences->GetSequence(i); + Sequence *seq2 = sequences->GetSequence(j); + + if (enableVerbose) { +#ifdef _OPENMP +#pragma omp critical +#endif + cerr << "Relaxing (" << i + 1 << ") " << seq1->GetHeader() + << " vs. 
" << "(" << j + 1 << ") " << seq2->GetHeader() + << ": "; + } + // get the original posterior matrix + VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); + assert(posteriorPtr); + VF &posterior = *posteriorPtr; + + const int seq1Length = seq1->GetLength(); + const int seq2Length = seq2->GetLength(); + + // contribution from the summation where z = x and z = y + float w = wi * wi * wj + wi * wj * wj; + float sumW = w; + for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) { + //posterior[k] = w*posterior[k]; + posterior[k] += posterior[k]; + } + + if (enableVerbose) + cerr << sparseMatrices[i][j]->GetNumCells() << " --> "; + + // contribution from all other sequences + for (int k = 0; k < numSeqs; k++) { + if (k != i && k != j) { + float wk = seqsWeights[k]; + float w = wi * wj * wk; + sumW += w; + if (k < i) + Relax1(w, sparseMatrices[k][i], sparseMatrices[k][j], + posterior); + else if (k > i && k < j) + Relax(w, sparseMatrices[i][k], sparseMatrices[k][j], + posterior); + else { + SparseMatrix *temp = + sparseMatrices[j][k]->ComputeTranspose(); + Relax(w, sparseMatrices[i][k], temp, posterior); + delete temp; + } + } + } + //cerr<<"sumW "<::iterator XYptr = matXY->GetRowPtr(x); + SafeVector::iterator XYend = XYptr + matXY->GetRowSize(x); + VF::iterator base = posterior.begin() + x * (seq2Length + 1); + int curr = 0; + while (XYptr != XYend) { + + // zero out all cells until the first filled column + while (curr < XYptr->first) { + base[curr] = 0; + curr++; + } + + // now, skip over this column + curr++; + ++XYptr; + } + + // zero out cells after last column + while (curr <= seq2Length) { + base[curr] = 0; + curr++; + } + } + + // save the new posterior matrix + newSparseMatrices[i][j] = new SparseMatrix(seq1->GetLength(), + seq2->GetLength(), posterior); + newSparseMatrices[j][i] = NULL; + + if (enableVerbose) + cerr << newSparseMatrices[i][j]->GetNumCells() << " -- "; + + delete posteriorPtr; + + if (enableVerbose) + cerr << "done." 
<< endl; +#ifndef _OPENMP + } +#endif + } + + return newSparseMatrices; +} + +///////////////////////////////////////////////////////////////// +// Relax() +// +// Computes the consistency transformation for a single sequence +// z, and adds the transformed matrix to "posterior". +///////////////////////////////////////////////////////////////// + +void MSA::Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY, + VF &posterior) { + + assert(matXZ); + assert(matZY); + + int lengthX = matXZ->GetSeq1Length(); + int lengthY = matZY->GetSeq2Length(); + assert(matXZ->GetSeq2Length() == matZY->GetSeq1Length()); + + // for every x[i] + for (int i = 1; i <= lengthX; i++) { + SafeVector::iterator XZptr = matXZ->GetRowPtr(i); + SafeVector::iterator XZend = XZptr + matXZ->GetRowSize(i); + + VF::iterator base = posterior.begin() + i * (lengthY + 1); + + // iterate through all x[i]-z[k] + while (XZptr != XZend) { + SafeVector::iterator ZYptr = matZY->GetRowPtr(XZptr->first); + SafeVector::iterator ZYend = ZYptr + + matZY->GetRowSize(XZptr->first); + const float XZval = XZptr->second; + + // iterate through all z[k]-y[j] + while (ZYptr != ZYend) { + //base[ZYptr->first] += weight * XZval * ZYptr->second; + base[ZYptr->first] += XZval * ZYptr->second; + ZYptr++; + } + XZptr++; + } + } +} + +///////////////////////////////////////////////////////////////// +// Relax1() +// +// Computes the consistency transformation for a single sequence +// z, and adds the transformed matrix to "posterior". 
+///////////////////////////////////////////////////////////////// + +void MSA::Relax1(float weight, SparseMatrix *matZX, SparseMatrix *matZY, + VF &posterior) { + + assert(matZX); + assert(matZY); + + int lengthZ = matZX->GetSeq1Length(); + int lengthY = matZY->GetSeq2Length(); + + // for every z[k] + for (int k = 1; k <= lengthZ; k++) { + SafeVector::iterator ZXptr = matZX->GetRowPtr(k); + SafeVector::iterator ZXend = ZXptr + matZX->GetRowSize(k); + + // iterate through all z[k]-x[i] + while (ZXptr != ZXend) { + SafeVector::iterator ZYptr = matZY->GetRowPtr(k); + SafeVector::iterator ZYend = ZYptr + matZY->GetRowSize(k); + const float ZXval = ZXptr->second; + VF::iterator base = posterior.begin() + + ZXptr->first * (lengthY + 1); + + // iterate through all z[k]-y[j] + while (ZYptr != ZYend) { + //base[ZYptr->first] += weight * ZXval * ZYptr->second; + base[ZYptr->first] += ZXval * ZYptr->second; + ZYptr++; + } + ZXptr++; + } + } +} +///////////////////////////////////////////////////////////////// +// DoIterativeRefinement() +// +// Performs a single round of randomized partionining iterative +// refinement. 
+///////////////////////////////////////////////////////////////// + +void MSA::DoIterativeRefinement( + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model, MultiSequence* &alignment) { + set groupOne, groupTwo; + int numSeqs = alignment->GetNumSequences(); + + // create two separate groups + for (int i = 0; i < numSeqs; i++) { + int index = rand(); + if (index % 2) { + groupOne.insert(i); + } else { + groupTwo.insert(i); + } + } + if (groupOne.empty() || groupTwo.empty()) return; + + // project into the two groups + MultiSequence *groupOneSeqs = alignment->Project(groupOne); + assert(groupOneSeqs); + MultiSequence *groupTwoSeqs = alignment->Project(groupTwo); + assert(groupTwoSeqs); +/* +//start add by Yongtao +#if 0 + VF *posterior = model.BuildPosterior (groupOneSeqs, groupTwoSeqs, sparseMatrices, cutoff); +#else + VF *posterior = model.BuildPosterior(getSeqsWeights(), groupOneSeqs, groupTwoSeqs, + sparseMatrices, cutoff); +#endif + + // compute an "accuracy" measure for the alignment before refinement + SafeVector::iterator> oldOnePtrs(groupOne.size()); + SafeVector::iterator> oldTwoPtrs(groupTwo.size()); + int i=0; + for (set::const_iterator iter = groupOne.begin(); + iter != groupOne.end(); ++iter) { + oldOnePtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr(); + } + i=0; + for (set::const_iterator iter = groupTwo.begin(); + iter != groupTwo.end(); ++iter) { + oldTwoPtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr(); + } + + VF &posteriorArr = *posterior; + int oldLength = alignment->GetSequence(0)->GetLength(); + int groupOneindex=0; int groupTwoindex=0; + float accuracy_before = 0; + for (int i = 1; i <= oldLength; i++) { + // check to see if there is a gap in every sequence of the set + bool foundOne = false; + for (int j = 0; !foundOne && j < (int) groupOne.size(); j++) + foundOne = (oldOnePtrs[j][i] != '-'); + // if not, then this column counts towards the sequence length + if (foundOne) groupOneindex ++; + bool foundTwo = 
false; + for (int j = 0; !foundTwo && j < (int) groupTwo.size(); j++) + foundTwo = (oldTwoPtrs[j][i] != '-'); + // if not, then this column counts towards the sequence length + if (foundTwo) groupTwoindex ++; + if(foundOne && foundTwo) accuracy_before += + posteriorArr[groupOneindex * (groupTwoSeqs->GetSequence(0)->GetLength() + 1) + groupTwoindex]; + } + + pair *, float> refinealignment; + //perform alignment + refinealignment = model.ComputeAlignment(groupOneSeqs->GetSequence(0)->GetLength(), + groupTwoSeqs->GetSequence(0)->GetLength(), *posterior); + delete posterior; + // now build final alignment + MultiSequence *result = new MultiSequence(); + //compare accuracy measure before and after refinement + //if (refinealignment.second > accuracy_before) { + //cerr<<"Before:" << accuracy_before<<" after: "<< refinealignment.second<< endl; + for (int i = 0; i < groupOneSeqs->GetNumSequences(); i++) + result->AddSequence( + groupOneSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'X')); + for (int i = 0; i < groupTwoSeqs->GetNumSequences(); i++) + result->AddSequence( + groupTwoSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'Y')); + // free temporary alignment + delete refinealignment.first; + delete alignment; + alignment = result; + + } + else{ + if(numIterativeRefinementReps < 8*numSeqs) numIterativeRefinementReps++; + delete groupOneSeqs; + delete groupTwoSeqs; + return false; + } + */ +//end add by yongtao + + delete alignment; + // realign + alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices, model); //original + delete groupOneSeqs; + delete groupTwoSeqs; + +} + +void MSA::DoIterativeRefinementTreeNode( + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model, MultiSequence* &alignment, + int nodeIndex) { + set groupOne, groupTwo; + int numSeqs = alignment->GetNumSequences(); + + vector inGroup1; + inGroup1.resize(numSeqs); + for (int i = 0; i < numSeqs; i++) { + inGroup1[i] = false; + } + + AlignmentOrder* orders = 
this->tree->getAlignOrders(); + AlignmentOrder* order = &orders[nodeIndex]; + for (int i = 0; i < order->leftNum; i++) { + int si = order->leftLeafs[i]; + inGroup1[si] = true; + } + for (int i = 0; i < order->rightNum; i++) { + int si = order->rightLeafs[i]; + inGroup1[si] = true; + } + // create two separate groups + for (int i = 0; i < numSeqs; i++) { + if (inGroup1[i]) { + groupOne.insert(i); + } else { + groupTwo.insert(i); + } + } + if (groupOne.empty() || groupTwo.empty()) + return; + + // project into the two groups + MultiSequence *groupOneSeqs = alignment->Project(groupOne); + assert(groupOneSeqs); + MultiSequence *groupTwoSeqs = alignment->Project(groupTwo); + assert(groupTwoSeqs); + delete alignment; + + // realign + alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices, + model); + + delete groupOneSeqs; + delete groupTwoSeqs; +} + +///////////////////////////////////////////////////////////////// +// WriteAnnotation() +// +// Computes annotation for multiple alignment and write values +// to a file. +///////////////////////////////////////////////////////////////// + +void MSA::WriteAnnotation(MultiSequence *alignment, + const SafeVector > &sparseMatrices) { + ofstream outfile(annotationFilename.c_str()); + + if (outfile.fail()) { + cerr << "ERROR: Unable to write annotation file." 
<< endl; + exit(1); + } + + const int alignLength = alignment->GetSequence(0)->GetLength(); + const int numSeqs = alignment->GetNumSequences(); + + SafeVector position(numSeqs, 0); + SafeVector::iterator> seqs(numSeqs); + for (int i = 0; i < numSeqs; i++) + seqs[i] = alignment->GetSequence(i)->GetDataPtr(); + SafeVector > active; + active.reserve(numSeqs); + + SafeVector lab; + for (int i = 0; i < numSeqs; i++) + lab.push_back(alignment->GetSequence(i)->GetSortLabel()); + + // for every column + for (int i = 1; i <= alignLength; i++) { + + // find all aligned residues in this particular column + active.clear(); + for (int j = 0; j < numSeqs; j++) { + if (seqs[j][i] != '-') { + active.push_back(make_pair(lab[j], ++position[j])); + } + } + + sort(active.begin(), active.end()); + outfile << setw(4) << ComputeScore(active, sparseMatrices) << endl; + } + + outfile.close(); +} + +///////////////////////////////////////////////////////////////// +// ComputeScore() +// +// Computes the annotation score for a particular column. +///////////////////////////////////////////////////////////////// + +int MSA::ComputeScore(const SafeVector > &active, + const SafeVector > &sparseMatrices) { + + if (active.size() <= 1) + return 0; + + // ALTERNATIVE #1: Compute the average alignment score. + + float val = 0; + for (int i = 0; i < (int) active.size(); i++) { + for (int j = i + 1; j < (int) active.size(); j++) { + val += sparseMatrices[active[i].first][active[j].first]->GetValue( + active[i].second, active[j].second); + } + } + + return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1))); + +} diff --git a/binaries/src/GLProbs-1.0/MSAfull.cpp b/binaries/src/GLProbs-1.0/MSAfull.cpp new file mode 100644 index 0000000..efe4dc5 --- /dev/null +++ b/binaries/src/GLProbs-1.0/MSAfull.cpp @@ -0,0 +1,1471 @@ +/*********************************************** + * # Copyright 2009-2010. 
Liu Yongchao + * # Contact: Liu Yongchao, School of Computer Engineering, + * # Nanyang Technological University. + * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com + * # + * # GPL version 3.0 applies. + * # + * ************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "MSA.h" +#include "MSAClusterTree.h" +#include "Defaults.h" + +#ifdef _OPENMP +#include +#endif + +string parametersInputFilename = ""; +string parametersOutputFilename = "no training"; +string annotationFilename = ""; + +bool enableVerbose = false; +bool enableAnnotation = false; +bool enableClustalWOutput = false; +bool enableAlignOrder = false; +int numConsistencyReps = 2; +int numPreTrainingReps = 0; +int numIterativeRefinementReps = 100; + +float cutoff = 0; + +VF initDistrib(NumMatrixTypes); +VF gapOpen(2 * NumInsertStates); +VF gapExtend(2 * NumInsertStates); +VVF emitPairs(256, VF(256, 1e-10)); +VF emitSingle(256, 1e-5); + +string alphabet = alphabetDefault; + +const int MIN_PRETRAINING_REPS = 0; +const int MAX_PRETRAINING_REPS = 20; +const int MIN_CONSISTENCY_REPS = 0; +const int MAX_CONSISTENCY_REPS = 5; +const int MIN_ITERATIVE_REFINEMENT_REPS = 0; +const int MAX_ITERATIVE_REFINEMENT_REPS = 1000; + +string posteriorProbsFilename = ""; +bool allscores = true; +string infilename; + +int flag_gui = 0; //0: no gui related o/p +//1: gui related o/p generated +int flag_ppscore = 0; //0: no pp score sequence added to o/p fasta alignment +//1: pp score seq added to o/p fasta alignment + +/////////////////////////////// +// global scoring matrix variables +////////////////////////////// +float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2; +char *aminos, *bases, matrixtype[20] = "gonnet_160"; +int subst_index[26]; + +double sub_matrix[26][26]; +int firstread = 0; //this makes sure that matrices are read only once + +float TEMPERATURE = 5; +int MATRIXTYPE = 160; +int 
prot_nuc = 0; //0=prot, 1=nucleotide + +float GAPOPEN = 0; +float GAPEXT = 0; +int numThreads = 0; + +//argument support +typedef struct { + char input[30]; + int matrix; + int N; + float T; + float beta; + char opt; //can be 'P' or 'M' + float gapopen; + float gapext; +} argument_decl; + +argument_decl argument; + +extern inline void read_sustitution_matrix(char *fileName); +extern void setmatrixtype(int le); +extern inline int matrixtype_to_int(); +extern inline void read_dna_matrix(); +extern inline void read_vtml_la_matrix(); +extern void init_arguments(); + +MSA::MSA(int argc, char* argv[]) { + //parse program parameters + SafeVector sequenceNames = ParseParams(argc, argv); + + //initialize arguments for partition function + init_arguments(); + + ReadParameters(); + //PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL); + + //read the input sequences + MultiSequence *sequences = new MultiSequence(); + assert(sequences); + for (int i = 0; i < (int) sequenceNames.size(); i++) { + cerr << "Loading sequence file: " << sequenceNames[i] << endl; + sequences->LoadMFA(sequenceNames[i], true); + } + //allocate space for sequence weights + this->seqsWeights = new int[sequences->GetNumSequences()]; + //initilaize parameters for OPENMP +#ifdef _OPENMP + if(numThreads <= 0) { + numThreads = omp_get_num_procs(); + cerr << "Automatically detected " << numThreads << " CPU cores" << endl; + } + cerr <<"Enabling OpenMP (with "<WriteALN(*alignOutFile); + } else { + alignment->WriteMFA(*alignOutFile); + } + //release resources + delete[] this->seqsWeights; + delete alignment; + delete sequences; +} +MSA::~MSA() { + /*close the output file*/ + if (alignOutFileName.length() > 0) { + ((std::ofstream*) alignOutFile)->close(); + } +} +///////////////////////////////////////////////////////////////// +// PrintParameters() +// +// Prints MSAPROBS parameters to STDERR. 
If a filename is +// specified, then the parameters are also written to the file. +///////////////////////////////////////////////////////////////// + +void MSA::PrintParameters(const char *message, const VF &initDistrib, + const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs, + const VF &emitSingle, const char *filename) { + + // print parameters to the screen + cerr << message << endl << " initDistrib[] = { "; + for (int i = 0; i < NumMatrixTypes; i++) + cerr << setprecision(10) << initDistrib[i] << " "; + cerr << "}" << endl << " gapOpen[] = { "; + for (int i = 0; i < NumInsertStates * 2; i++) + cerr << setprecision(10) << gapOpen[i] << " "; + cerr << "}" << endl << " gapExtend[] = { "; + for (int i = 0; i < NumInsertStates * 2; i++) + cerr << setprecision(10) << gapExtend[i] << " "; + cerr << "}" << endl << endl; + + /* + for (int i = 0; i < 5; i++){ + for (int j = 0; j <= i; j++){ + cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " "; + } + cerr << endl; + }*/ + + // if a file name is specified + if (filename) { + + // attempt to open the file for writing + FILE *file = fopen(filename, "w"); + if (!file) { + cerr << "ERROR: Unable to write parameter file: " << filename + << endl; + exit(1); + } + + // if successful, then write the parameters to the file + for (int i = 0; i < NumMatrixTypes; i++) + fprintf(file, "%.10f ", initDistrib[i]); + fprintf(file, "\n"); + for (int i = 0; i < 2 * NumInsertStates; i++) + fprintf(file, "%.10f ", gapOpen[i]); + fprintf(file, "\n"); + for (int i = 0; i < 2 * NumInsertStates; i++) + fprintf(file, "%.10f ", gapExtend[i]); + fprintf(file, "\n"); + fprintf(file, "%s\n", alphabet.c_str()); + for (int i = 0; i < (int) alphabet.size(); i++) { + for (int j = 0; j <= i; j++) + fprintf(file, "%.10f ", + emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]); + fprintf(file, "\n"); + } + for (int i = 0; i < (int) alphabet.size(); i++) + fprintf(file, "%.10f ", emitSingle[(unsigned 
char) alphabet[i]]); + fprintf(file, "\n"); + fclose(file); + } +} + +///////////////////////////////////////////////////////////////// +// doAlign() +// +// First computes all pairwise posterior probability matrices. +// Then, computes new parameters if training, or a final +// alignment, otherwise. +///////////////////////////////////////////////////////////////// +extern VF *ComputePostProbs(int a, int b, string seq1, string seq2); +MultiSequence* MSA::doAlign(MultiSequence *sequences, + const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, + VF &gapExtend, VVF &emitPairs, VF &emitSingle) { + assert(sequences); + + //get the number of sequences + const int numSeqs = sequences->GetNumSequences(); + + //create distance matrix + VVF probalign_distances(numSeqs, VF(numSeqs, 0)); + VVF distances(numSeqs, VF(numSeqs, 0));//msa + + float gl_accuracy = 0; + //creat sparseMatrices + SafeVector > probalign_sparseMatrices(numSeqs, + SafeVector(numSeqs, NULL)); + SafeVector > sparseMatrices(numSeqs, + SafeVector(numSeqs, NULL)); // msa + +#ifdef _OPENMP + //calculate sequence pairs for openmp model + int pairIdx = 0; + numPairs = (numSeqs - 1) * numSeqs / 2; + seqsPairs = new SeqsPair[numPairs]; + for(int a = 0; a < numSeqs; a++) { + for(int b = a + 1; b < numSeqs; b++) { + seqsPairs[pairIdx].seq1 = a; + seqsPairs[pairIdx].seq2 = b; + pairIdx++; + } + } +#endif + // do all pairwise alignments for posterior probability matrices +#ifdef _OPENMP +#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic) + for(pairIdx = 0; pairIdx < numPairs; pairIdx++) { + int a= seqsPairs[pairIdx].seq1; + int b = seqsPairs[pairIdx].seq2; + if(enableVerbose) { +#pragma omp critical + cerr <<"tid "<GetSequence(a); + Sequence *seq2 = sequences->GetSequence(b); + + // verbose output + if (enableVerbose) { + cerr << "Computing posterior matrix: (" << a + 1 << ") " + << seq1->GetHeader() << " vs. 
" << "(" << b + 1 << ") " + << seq2->GetHeader() << " -- "; + } + +//probcons + // compute forward and backward probabilities + VF *forward = model.ComputeForwardMatrix(seq1, seq2); + assert(forward); + VF *backward = model.ComputeBackwardMatrix(seq1, seq2); + assert(backward); + // compute posterior probability matrix from HMM + VF *probcons_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward); + assert(probcons_posterior); + delete forward; + delete backward; + +//probalign + VF *probalign_posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString()); + assert(probalign_posterior); + probalign_sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),seq2->GetLength(), *probalign_posterior); + probalign_sparseMatrices[b][a] = NULL; + pair *, float> probalign_alignment = model.ComputeAlignment( + seq1->GetLength(), seq2->GetLength(), *probalign_posterior); + probalign_distances[a][b] =1.0f - probalign_alignment.second / min(seq1->GetLength(), seq2->GetLength()); + delete probalign_alignment.first; + +//local + forward = model.ComputeForwardMatrix(seq1, seq2,false); + assert(forward); + backward = model.ComputeBackwardMatrix(seq1, seq2,false); + assert(backward); + VF* local_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false); + delete forward; + delete backward; + +//GL + //merge probalign + local + probcons + VF::iterator ptr1 = probcons_posterior->begin(); + VF::iterator ptr2 = probalign_posterior->begin(); + VF::iterator ptr3 = local_posterior->begin(); + VF* posterior = new VF((seq1->GetLength()+1) * (seq2->GetLength()+1)); assert (posterior); //msa + VF::iterator ptr = posterior->begin(); + for (int i = 0; i <= seq1->GetLength(); i++) { + for (int j = 0; j <= seq2->GetLength(); j++) { + float v1 = *ptr1; + float v2 = *ptr2; + float v3 = *ptr3; + *ptr = sqrt(v1*v1 + v2*v2 + v3*v3); + ptr1++; + ptr2++; + ptr3++; + ptr++; + } + } + // perform the pairwise sequence alignment + pair *, float> gl_alignment 
= model.ComputeAlignment( + seq1->GetLength(), seq2->GetLength(), *posterior); + + //compute expected accuracy + distances[a][b] = distances[b][a] = 1.0f - gl_alignment.second + / (3*min(seq1->GetLength(), seq2->GetLength())); + + // compute sparse representations + sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(), + seq2->GetLength(), *posterior); + sparseMatrices[b][a] = NULL; + // + SafeVector::iterator iter1 = seq1->GetDataPtr(); + SafeVector::iterator iter2 = seq2->GetDataPtr(); + float N_correct_match = 0; + int i = 1;int j = 1; + for (SafeVector::iterator iter = gl_alignment.first->begin(); + iter != gl_alignment.first->end(); ++iter){ + if (*iter == 'B'){ + unsigned char c1 = (unsigned char) iter1[i++]; + unsigned char c2 = (unsigned char) iter2[j++]; + if(c1==c2) N_correct_match += 1; + } + else if(*iter == 'X') i++; + else if(*iter == 'Y') j++; + } + if(i!= seq1->GetLength()+1 || j!= seq2->GetLength() + 1 ) cerr << "GL"<< endl; + gl_accuracy += N_correct_match / min(seq1->GetLength(), seq2->GetLength()); + // + delete probcons_posterior; + delete probalign_posterior; + delete local_posterior; + delete posterior; + +#ifndef _OPENMP + } +#endif + } + +//self-adaptive + gl_accuracy /= numPairs; + if(gl_accuracy > 0.4){ + for (int a = 0; a < numSeqs - 1; a++) + for (int b = a + 1; b < numSeqs; b++) { + distances[a][b] = distances[b][a] = probalign_distances[a][b]; + sparseMatrices[a][b] = probalign_sparseMatrices[a][b]; + sparseMatrices[b][a] = NULL; + } + } + + //create the guide tree + this->tree = new MSAClusterTree(this, distances, numSeqs); + this->tree->create(); + + // perform the consistency transformation the desired number of times + float* fweights = new float[numSeqs]; + for (int r = 0; r < numSeqs; r++) { + fweights[r] = ((float) seqsWeights[r]) / INT_MULTIPLY; + fweights[r] *= 10; + } + for (int r = 0; r < numConsistencyReps; r++) { + SafeVector > newSparseMatrices = + DoRelaxation(fweights, sequences, sparseMatrices); + + // now 
replace the old posterior matrices + for (int i = 0; i < numSeqs; i++) { + for (int j = 0; j < numSeqs; j++) { + delete sparseMatrices[i][j]; + sparseMatrices[i][j] = newSparseMatrices[i][j]; + } + } + } + delete[] fweights; +#ifdef _OPENMP + delete [] seqsPairs; +#endif + + //compute the final multiple sequence alignment + MultiSequence *finalAlignment = ComputeFinalAlignment(this->tree, sequences, + sparseMatrices, model); + + // build annotation + if (enableAnnotation) { + WriteAnnotation(finalAlignment, sparseMatrices); + } + //destroy the guide tree + delete this->tree; + this->tree = 0; + + // delete sparse matrices + for (int a = 0; a < numSeqs - 1; a++) { + for (int b = a + 1; b < numSeqs; b++) { + delete sparseMatrices[a][b]; + delete sparseMatrices[b][a]; + } + } + + return finalAlignment; +} + +///////////////////////////////////////////////////////////////// +// GetInteger() +// +// Attempts to parse an integer from the character string given. +// Returns true only if no parsing error occurs. +///////////////////////////////////////////////////////////////// + +bool GetInteger(char *data, int *val) { + char *endPtr; + long int retVal; + + assert(val); + + errno = 0; + retVal = strtol(data, &endPtr, 0); + if (retVal == 0 && (errno != 0 || data == endPtr)) + return false; + if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN)) + return false; + if (retVal < (long) INT_MIN || retVal > (long) INT_MAX) + return false; + *val = (int) retVal; + return true; +} + +///////////////////////////////////////////////////////////////// +// GetFloat() +// +// Attempts to parse a float from the character string given. +// Returns true only if no parsing error occurs. 
+///////////////////////////////////////////////////////////////// + +bool GetFloat(char *data, float *val) { + char *endPtr; + double retVal; + + assert(val); + + errno = 0; + retVal = strtod(data, &endPtr); + if (retVal == 0 && (errno != 0 || data == endPtr)) + return false; + if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0)) + return false; + *val = (float) retVal; + return true; +} + +///////////////////////////////////////////////////////////////// +// ReadParameters() +// +// Read initial distribution, transition, and emission +// parameters from a file. +///////////////////////////////////////////////////////////////// + +void MSA::ReadParameters() { + + ifstream data; + + emitPairs = VVF(256, VF(256, 1e-10)); + emitSingle = VF(256, 1e-5); + + // read initial state distribution and transition parameters + if (parametersInputFilename == string("")) { + if (NumInsertStates == 1) { + for (int i = 0; i < NumMatrixTypes; i++) + initDistrib[i] = initDistrib1Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapOpen[i] = gapOpen1Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapExtend[i] = gapExtend1Default[i]; + } else if (NumInsertStates == 2) { + for (int i = 0; i < NumMatrixTypes; i++) + initDistrib[i] = initDistrib2Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapOpen[i] = gapOpen2Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapExtend[i] = gapExtend2Default[i]; + } else { + cerr + << "ERROR: No default initial distribution/parameter settings exist" + << endl << " for " << NumInsertStates + << " pairs of insert states. Use --paramfile." 
<< endl; + exit(1); + } + + alphabet = alphabetDefault; + + for (int i = 0; i < (int) alphabet.length(); i++) { + emitSingle[(unsigned char) tolower(alphabet[i])] = + emitSingleDefault[i]; + emitSingle[(unsigned char) toupper(alphabet[i])] = + emitSingleDefault[i]; + for (int j = 0; j <= i; j++) { + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = emitPairsDefault[i][j]; + } + } + } else { + data.open(parametersInputFilename.c_str()); + if (data.fail()) { + cerr << "ERROR: Unable to read parameter file: " + << parametersInputFilename << endl; + exit(1); + } + + string line[3]; + for (int i = 0; i < 3; i++) { + if (!getline(data, line[i])) { + cerr + << "ERROR: Unable to read transition parameters from parameter file: " + << parametersInputFilename << endl; + exit(1); + } + } + istringstream data2; + data2.clear(); + data2.str(line[0]); + for (int i = 0; i < NumMatrixTypes; i++) + data2 >> initDistrib[i]; + data2.clear(); + data2.str(line[1]); + for (int i = 0; i < 2 * NumInsertStates; i++) + data2 >> gapOpen[i]; + data2.clear(); + data2.str(line[2]); + for (int i = 0; i < 2 * NumInsertStates; i++) + data2 >> 
gapExtend[i]; + + if (!getline(data, line[0])) { + cerr << "ERROR: Unable to read alphabet from scoring matrix file: " + << parametersInputFilename << endl; + exit(1); + } + + // read alphabet as concatenation of all characters on alphabet line + alphabet = ""; + string token; + data2.clear(); + data2.str(line[0]); + while (data2 >> token) + alphabet += token; + + for (int i = 0; i < (int) alphabet.size(); i++) { + for (int j = 0; j <= i; j++) { + float val; + data >> val; + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = val; + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = val; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = val; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = val; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = val; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = val; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = val; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = val; + } + } + + for (int i = 0; i < (int) alphabet.size(); i++) { + float val; + data >> val; + emitSingle[(unsigned char) tolower(alphabet[i])] = val; + emitSingle[(unsigned char) toupper(alphabet[i])] = val; + } + data.close(); + } +} + +///////////////////////////////////////////////////////////////// +// ParseParams() +// +// Parse all command-line options. 
+///////////////////////////////////////////////////////////////// +void MSA::printUsage() { + cerr + << "************************************************************************" + << endl + << "\tMSAPROBS is a open-source protein multiple sequence alignment algorithm" + << endl + << "\tbased on pair hidden markov model and partition function postirior" + << endl + << "\tprobabilities. If any comments or problems, please contact" + << endl + << "\tLiu Yongchao(liuy0039@ntu.edu.sg or nkcslyc@hotmail.com)" + << endl + << "*************************************************************************" + << endl << "Usage:" << endl + << " msaprobs [OPTION]... [infile]..." << endl << endl + << "Description:" << endl + << " Align sequences in multi-FASTA format" << endl << endl + << " -o, --outfile " << endl + << " specify the output file name (STDOUT by default)" + << endl << " -num_threads " << endl + << " specify the number of threads used, and otherwise detect automatically" + << endl << " -clustalw" << endl + << " use CLUSTALW output format instead of FASTA format" + << endl << endl << " -c, --consistency REPS" << endl + << " use " << MIN_CONSISTENCY_REPS << " <= REPS <= " + << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps + << ") passes of consistency transformation" << endl << endl + << " -ir, --iterative-refinement REPS" << endl + << " use " << MIN_ITERATIVE_REFINEMENT_REPS + << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS << " (default: " + << numIterativeRefinementReps << ") passes of iterative-refinement" + << endl << endl << " -v, --verbose" << endl + << " report progress while aligning (default: " + << (enableVerbose ? "on" : "off") << ")" << endl << endl + << " -annot FILENAME" << endl + << " write annotation for multiple alignment to FILENAME" + << endl << endl << " -a, --alignment-order" << endl + << " print sequences in alignment order rather than input order (default: " + << (enableAlignOrder ? 
"on" : "off") << ")" << endl + << " -version " << endl + << " print out version of MSAPROBS " << endl << endl; +} +SafeVector MSA::ParseParams(int argc, char **argv) { + if (argc < 2) { + printUsage(); + exit(1); + } + SafeVector sequenceNames; + int tempInt; + float tempFloat; + + for (int i = 1; i < argc; i++) { + if (argv[i][0] == '-') { + //help + if (!strcmp(argv[i], "-help") || !strcmp(argv[i], "-?")) { + printUsage(); + exit(1); + //output file name + } else if (!strcmp(argv[i], "-o") + || !strcmp(argv[i], "--outfile")) { + if (i < argc - 1) { + alignOutFileName = argv[++i]; //get the file name + } else { + cerr << "ERROR: String expected for option " << argv[i] + << endl; + exit(1); + } + // parameter file + } else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){ + if (i < argc - 1) + parametersInputFilename = string (argv[++i]); + else { + cerr << "ERROR: Filename expected for option " << argv[i] << endl; + exit (1); + } + //number of threads used + } else if (!strcmp(argv[i], "-p") + || !strcmp(argv[i], "-num_threads")) { + if (i < argc - 1) { + if (!GetInteger(argv[++i], &tempInt)) { + cerr << " ERROR: invalid integer following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempInt < 0) { + tempInt = 0; + } + numThreads = tempInt; + } + } else { + cerr << "ERROR: Integer expected for option " << argv[i] + << endl; + exit(1); + } + // number of consistency transformations + } else if (!strcmp(argv[i], "-c") + || !strcmp(argv[i], "--consistency")) { + if (i < argc - 1) { + if (!GetInteger(argv[++i], &tempInt)) { + cerr << "ERROR: Invalid integer following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempInt < MIN_CONSISTENCY_REPS + || tempInt > MAX_CONSISTENCY_REPS) { + cerr << "ERROR: For option " << argv[i - 1] + << ", integer must be between " + << MIN_CONSISTENCY_REPS << " and " + << MAX_CONSISTENCY_REPS << "." 
<< endl; + exit(1); + } else { + numConsistencyReps = tempInt; + } + } + } else { + cerr << "ERROR: Integer expected for option " << argv[i] + << endl; + exit(1); + } + } + + // number of randomized partitioning iterative refinement passes + else if (!strcmp(argv[i], "-ir") + || !strcmp(argv[i], "--iterative-refinement")) { + if (i < argc - 1) { + if (!GetInteger(argv[++i], &tempInt)) { + cerr << "ERROR: Invalid integer following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS + || tempInt > MAX_ITERATIVE_REFINEMENT_REPS) { + cerr << "ERROR: For option " << argv[i - 1] + << ", integer must be between " + << MIN_ITERATIVE_REFINEMENT_REPS << " and " + << MAX_ITERATIVE_REFINEMENT_REPS << "." + << endl; + exit(1); + } else + numIterativeRefinementReps = tempInt; + } + } else { + cerr << "ERROR: Integer expected for option " << argv[i] + << endl; + exit(1); + } + } + + // annotation files + else if (!strcmp(argv[i], "-annot")) { + enableAnnotation = true; + if (i < argc - 1) { + annotationFilename = argv[++i]; + } else { + cerr << "ERROR: FILENAME expected for option " << argv[i] + << endl; + exit(1); + } + } + + // clustalw output format + else if (!strcmp(argv[i], "-clustalw")) { + enableClustalWOutput = true; + } + + // cutoff + else if (!strcmp(argv[i], "-co") || !strcmp(argv[i], "--cutoff")) { + if (i < argc - 1) { + if (!GetFloat(argv[++i], &tempFloat)) { + cerr + << "ERROR: Invalid floating-point value following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempFloat < 0 || tempFloat > 1) { + cerr << "ERROR: For option " << argv[i - 1] + << ", floating-point value must be between 0 and 1." 
+ << endl; + exit(1); + } else + cutoff = tempFloat; + } + } else { + cerr << "ERROR: Floating-point value expected for option " + << argv[i] << endl; + exit(1); + } + } + + // verbose reporting + else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--verbose")) { + enableVerbose = true; + } + + // alignment order + else if (!strcmp(argv[i], "-a") + || !strcmp(argv[i], "--alignment-order")) { + enableAlignOrder = true; + } + + //print out version + else if (!strcmp(argv[i], "-version")) { + cerr << "MSAPROBS version " << VERSION << endl; + exit(1); + } + // bad arguments + else { + cerr << "ERROR: Unrecognized option: " << argv[i] << endl; + exit(1); + } + } else { + sequenceNames.push_back(string(argv[i])); + } + } + + /*check the output file name*/ + cerr << "-------------------------------------" << endl; + if (alignOutFileName.length() == 0) { + cerr << "The final alignments will be printed out to STDOUT" << endl; + alignOutFile = &std::cout; + } else { + cerr << "Open the output file " << alignOutFileName << endl; + alignOutFile = new ofstream(alignOutFileName.c_str(), + ios::binary | ios::out | ios::trunc); + } + cerr << "-------------------------------------" << endl; + return sequenceNames; +} + +///////////////////////////////////////////////////////////////// +// ProcessTree() +// +// Process the tree recursively. Returns the aligned sequences +// corresponding to a node or leaf of the tree. 
+///////////////////////////////////////////////////////////////// +MultiSequence* MSA::ProcessTree(TreeNode *tree, MultiSequence *sequences, + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model) { + + MultiSequence *result; + + // check if this is a node of the alignment tree + //if (tree->GetSequenceLabel() == -1){ + if (tree->leaf == NODE) { + MultiSequence *alignLeft = ProcessTree(tree->left, sequences, + sparseMatrices, model); + MultiSequence *alignRight = ProcessTree(tree->right, sequences, + sparseMatrices, model); + + assert(alignLeft); + assert(alignRight); + + result = AlignAlignments(alignLeft, alignRight, sparseMatrices, model); + assert(result); + + delete alignLeft; + delete alignRight; + } + + // otherwise, this is a leaf of the alignment tree + else { + result = new MultiSequence(); + assert(result); + //result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone()); + result->AddSequence(sequences->GetSequence(tree->idx)->Clone()); + } + + return result; +} + +///////////////////////////////////////////////////////////////// +// ComputeFinalAlignment() +// +// Compute the final alignment by calling ProcessTree(), then +// performing iterative refinement as needed. 
+///////////////////////////////////////////////////////////////// + +MultiSequence* MSA::ComputeFinalAlignment(MSAGuideTree*tree, + MultiSequence *sequences, + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model) { + MultiSequence *alignment = ProcessTree(tree->getRoot(), sequences, + sparseMatrices, model); + + SafeVector oldOrdering; + if (enableAlignOrder) { + for (int i = 0; i < alignment->GetNumSequences(); i++) + oldOrdering.push_back(alignment->GetSequence(i)->GetSortLabel()); + alignment->SaveOrdering(); + enableAlignOrder = false; + } + + // tree-based refinement + // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree); + /* + int numSeqs = alignment->GetNumSequences(); + //if(numSeqs < numIterativeRefinementReps){ + for(int iter = 0; iter < 5; iter ++){ + for(int i = 0; i < numSeqs - 1; i++){ + DoIterativeRefinementTreeNode(sparseMatrices, model, alignment, i); + } + } + //}*/ + //Refinement return false:no improvement + for (int i = 0; i < numIterativeRefinementReps; i++) { + DoIterativeRefinement(sparseMatrices, model, alignment); + } + cerr << endl; + + if (oldOrdering.size() > 0) { + for (int i = 0; i < (int) oldOrdering.size(); i++) { + alignment->GetSequence(i)->SetSortLabel(oldOrdering[i]); + } + } + + // return final alignment + return alignment; +} + +///////////////////////////////////////////////////////////////// +// AlignAlignments() +// +// Returns the alignment of two MultiSequence objects. +///////////////////////////////////////////////////////////////// + +MultiSequence* MSA::AlignAlignments(MultiSequence *align1, + MultiSequence *align2, + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model) { + + // print some info about the alignment + if (enableVerbose) { + for (int i = 0; i < align1->GetNumSequences(); i++) + cerr << ((i == 0) ? "[" : ",") + << align1->GetSequence(i)->GetLabel(); + cerr << "] vs. "; + for (int i = 0; i < align2->GetNumSequences(); i++) + cerr << ((i == 0) ? 
"[" : ",") + << align2->GetSequence(i)->GetLabel(); + cerr << "]: "; + } +#if 0 + VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff); +#else + VF *posterior = model.BuildPosterior(getSeqsWeights(), align1, align2, + sparseMatrices, cutoff); +#endif + // compute an "accuracy" measure for the alignment before refinement + + pair *, float> alignment; + //perform alignment + alignment = model.ComputeAlignment(align1->GetSequence(0)->GetLength(), + align2->GetSequence(0)->GetLength(), *posterior); + + delete posterior; + + if (enableVerbose) { + + // compute total length of sequences + int totLength = 0; + for (int i = 0; i < align1->GetNumSequences(); i++) + for (int j = 0; j < align2->GetNumSequences(); j++) + totLength += min(align1->GetSequence(i)->GetLength(), + align2->GetSequence(j)->GetLength()); + + // give an "accuracy" measure for the alignment + cerr << alignment.second / totLength << endl; + } + + // now build final alignment + MultiSequence *result = new MultiSequence(); + for (int i = 0; i < align1->GetNumSequences(); i++) + result->AddSequence( + align1->GetSequence(i)->AddGaps(alignment.first, 'X')); + for (int i = 0; i < align2->GetNumSequences(); i++) + result->AddSequence( + align2->GetSequence(i)->AddGaps(alignment.first, 'Y')); + if (!enableAlignOrder) + result->SortByLabel(); + + // free temporary alignment + delete alignment.first; + + return result; +} + +///////////////////////////////////////////////////////////////// +// DoRelaxation() +// +// Performs one round of the weighted probabilistic consistency transformation. 
+// 1 +///////////////////////////////////////////////////////////////// + +SafeVector > MSA::DoRelaxation(float* seqsWeights, + MultiSequence *sequences, + SafeVector > &sparseMatrices) { + const int numSeqs = sequences->GetNumSequences(); + + SafeVector > newSparseMatrices(numSeqs, + SafeVector(numSeqs, NULL)); + + // for every pair of sequences +#ifdef _OPENMP + int pairIdx; +#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic) + for(pairIdx = 0; pairIdx < numPairs; pairIdx++) { + int i = seqsPairs[pairIdx].seq1; + int j = seqsPairs[pairIdx].seq2; + float wi = seqsWeights[i]; + float wj = seqsWeights[j]; +#else + for (int i = 0; i < numSeqs; i++) { + float wi = seqsWeights[i]; + for (int j = i + 1; j < numSeqs; j++) { + float wj = seqsWeights[j]; +#endif + Sequence *seq1 = sequences->GetSequence(i); + Sequence *seq2 = sequences->GetSequence(j); + + if (enableVerbose) { +#ifdef _OPENMP +#pragma omp critical +#endif + cerr << "Relaxing (" << i + 1 << ") " << seq1->GetHeader() + << " vs. 
" << "(" << j + 1 << ") " << seq2->GetHeader() + << ": "; + } + // get the original posterior matrix + VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); + assert(posteriorPtr); + VF &posterior = *posteriorPtr; + + const int seq1Length = seq1->GetLength(); + const int seq2Length = seq2->GetLength(); + + // contribution from the summation where z = x and z = y + float w = wi * wi * wj + wi * wj * wj; + float sumW = w; + for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) { + //posterior[k] = w*posterior[k]; + posterior[k] += posterior[k]; + } + + if (enableVerbose) + cerr << sparseMatrices[i][j]->GetNumCells() << " --> "; + + // contribution from all other sequences + for (int k = 0; k < numSeqs; k++) { + if (k != i && k != j) { + float wk = seqsWeights[k]; + float w = wi * wj * wk; + sumW += w; + if (k < i) + Relax1(w, sparseMatrices[k][i], sparseMatrices[k][j], + posterior); + else if (k > i && k < j) + Relax(w, sparseMatrices[i][k], sparseMatrices[k][j], + posterior); + else { + SparseMatrix *temp = + sparseMatrices[j][k]->ComputeTranspose(); + Relax(w, sparseMatrices[i][k], temp, posterior); + delete temp; + } + } + } + //cerr<<"sumW "<::iterator XYptr = matXY->GetRowPtr(x); + SafeVector::iterator XYend = XYptr + matXY->GetRowSize(x); + VF::iterator base = posterior.begin() + x * (seq2Length + 1); + int curr = 0; + while (XYptr != XYend) { + + // zero out all cells until the first filled column + while (curr < XYptr->first) { + base[curr] = 0; + curr++; + } + + // now, skip over this column + curr++; + ++XYptr; + } + + // zero out cells after last column + while (curr <= seq2Length) { + base[curr] = 0; + curr++; + } + } + + // save the new posterior matrix + newSparseMatrices[i][j] = new SparseMatrix(seq1->GetLength(), + seq2->GetLength(), posterior); + newSparseMatrices[j][i] = NULL; + + if (enableVerbose) + cerr << newSparseMatrices[i][j]->GetNumCells() << " -- "; + + delete posteriorPtr; + + if (enableVerbose) + cerr << "done." 
<< endl; +#ifndef _OPENMP + } +#endif + } + + return newSparseMatrices; +} + +///////////////////////////////////////////////////////////////// +// Relax() +// +// Computes the consistency transformation for a single sequence +// z, and adds the transformed matrix to "posterior". +///////////////////////////////////////////////////////////////// + +void MSA::Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY, + VF &posterior) { + + assert(matXZ); + assert(matZY); + + int lengthX = matXZ->GetSeq1Length(); + int lengthY = matZY->GetSeq2Length(); + assert(matXZ->GetSeq2Length() == matZY->GetSeq1Length()); + + // for every x[i] + for (int i = 1; i <= lengthX; i++) { + SafeVector::iterator XZptr = matXZ->GetRowPtr(i); + SafeVector::iterator XZend = XZptr + matXZ->GetRowSize(i); + + VF::iterator base = posterior.begin() + i * (lengthY + 1); + + // iterate through all x[i]-z[k] + while (XZptr != XZend) { + SafeVector::iterator ZYptr = matZY->GetRowPtr(XZptr->first); + SafeVector::iterator ZYend = ZYptr + + matZY->GetRowSize(XZptr->first); + const float XZval = XZptr->second; + + // iterate through all z[k]-y[j] + while (ZYptr != ZYend) { + //base[ZYptr->first] += weight * XZval * ZYptr->second; + base[ZYptr->first] += XZval * ZYptr->second; + ZYptr++; + } + XZptr++; + } + } +} + +///////////////////////////////////////////////////////////////// +// Relax1() +// +// Computes the consistency transformation for a single sequence +// z, and adds the transformed matrix to "posterior". 
+///////////////////////////////////////////////////////////////// + +void MSA::Relax1(float weight, SparseMatrix *matZX, SparseMatrix *matZY, + VF &posterior) { + + assert(matZX); + assert(matZY); + + int lengthZ = matZX->GetSeq1Length(); + int lengthY = matZY->GetSeq2Length(); + + // for every z[k] + for (int k = 1; k <= lengthZ; k++) { + SafeVector::iterator ZXptr = matZX->GetRowPtr(k); + SafeVector::iterator ZXend = ZXptr + matZX->GetRowSize(k); + + // iterate through all z[k]-x[i] + while (ZXptr != ZXend) { + SafeVector::iterator ZYptr = matZY->GetRowPtr(k); + SafeVector::iterator ZYend = ZYptr + matZY->GetRowSize(k); + const float ZXval = ZXptr->second; + VF::iterator base = posterior.begin() + + ZXptr->first * (lengthY + 1); + + // iterate through all z[k]-y[j] + while (ZYptr != ZYend) { + //base[ZYptr->first] += weight * ZXval * ZYptr->second; + base[ZYptr->first] += ZXval * ZYptr->second; + ZYptr++; + } + ZXptr++; + } + } +} +///////////////////////////////////////////////////////////////// +// DoIterativeRefinement() +// +// Performs a single round of randomized partionining iterative +// refinement. 
+///////////////////////////////////////////////////////////////// + +void MSA::DoIterativeRefinement( + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model, MultiSequence* &alignment) { + set groupOne, groupTwo; + int numSeqs = alignment->GetNumSequences(); + + // create two separate groups + for (int i = 0; i < numSeqs; i++) { + int index = rand(); + if (index % 2) { + groupOne.insert(i); + } else { + groupTwo.insert(i); + } + } + if (groupOne.empty() || groupTwo.empty()) return; + + // project into the two groups + MultiSequence *groupOneSeqs = alignment->Project(groupOne); + assert(groupOneSeqs); + MultiSequence *groupTwoSeqs = alignment->Project(groupTwo); + assert(groupTwoSeqs); +/* +//start add by Yongtao +#if 1 + VF *posterior = model.BuildPosterior (groupOneSeqs, groupTwoSeqs, sparseMatrices, cutoff); +#else + VF *posterior = model.BuildPosterior(getSeqsWeights(), groupOneSeqs, groupTwoSeqs, + sparseMatrices, cutoff); +#endif + + // compute an "accuracy" measure for the alignment before refinement + SafeVector::iterator> oldOnePtrs(groupOne.size()); + SafeVector::iterator> oldTwoPtrs(groupTwo.size()); + int i=0; + for (set::const_iterator iter = groupOne.begin(); + iter != groupOne.end(); ++iter) { + oldOnePtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr(); + } + i=0; + for (set::const_iterator iter = groupTwo.begin(); + iter != groupTwo.end(); ++iter) { + oldTwoPtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr(); + } + + VF &posteriorArr = *posterior; + int oldLength = alignment->GetSequence(0)->GetLength(); + int groupOneindex=0; int groupTwoindex=0; + float accuracy_before = 0; + for (int i = 1; i <= oldLength; i++) { + // check to see if there is a gap in every sequence of the set + bool foundOne = false; + for (int j = 0; !foundOne && j < (int) groupOne.size(); j++) + foundOne = (oldOnePtrs[j][i] != '-'); + // if not, then this column counts towards the sequence length + if (foundOne) groupOneindex ++; + bool foundTwo = 
false; + for (int j = 0; !foundTwo && j < (int) groupTwo.size(); j++) + foundTwo = (oldTwoPtrs[j][i] != '-'); + // if not, then this column counts towards the sequence length + if (foundTwo) groupTwoindex ++; + if(foundOne && foundTwo) accuracy_before += + posteriorArr[groupOneindex * (groupTwoSeqs->GetSequence(0)->GetLength() + 1) + groupTwoindex]; + } + + pair *, float> refinealignment; + //perform alignment + refinealignment = model.ComputeAlignment(groupOneSeqs->GetSequence(0)->GetLength(), + groupTwoSeqs->GetSequence(0)->GetLength(), *posterior); + delete posterior; + // now build final alignment + MultiSequence *result = new MultiSequence(); + //compare accuracy measure before and after refinement + //if (refinealignment.second > accuracy_before) { + //cerr<<"Before:" << accuracy_before<<" after: "<< refinealignment.second<< endl; + for (int i = 0; i < groupOneSeqs->GetNumSequences(); i++) + result->AddSequence( + groupOneSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'X')); + for (int i = 0; i < groupTwoSeqs->GetNumSequences(); i++) + result->AddSequence( + groupTwoSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'Y')); + // free temporary alignment + delete refinealignment.first; + delete alignment; + alignment = result; + + } + else{ + if(numIterativeRefinementReps < 8*numSeqs) numIterativeRefinementReps++; + delete groupOneSeqs; + delete groupTwoSeqs; + return false; + } + */ +//end add by yongtao + + //delete alignment; + // realign + alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices, model); //original + delete groupOneSeqs; + delete groupTwoSeqs; + +} + +void MSA::DoIterativeRefinementTreeNode( + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model, MultiSequence* &alignment, + int nodeIndex) { + set groupOne, groupTwo; + int numSeqs = alignment->GetNumSequences(); + + vector inGroup1; + inGroup1.resize(numSeqs); + for (int i = 0; i < numSeqs; i++) { + inGroup1[i] = false; + } + + AlignmentOrder* orders = 
this->tree->getAlignOrders(); + AlignmentOrder* order = &orders[nodeIndex]; + for (int i = 0; i < order->leftNum; i++) { + int si = order->leftLeafs[i]; + inGroup1[si] = true; + } + for (int i = 0; i < order->rightNum; i++) { + int si = order->rightLeafs[i]; + inGroup1[si] = true; + } + // create two separate groups + for (int i = 0; i < numSeqs; i++) { + if (inGroup1[i]) { + groupOne.insert(i); + } else { + groupTwo.insert(i); + } + } + if (groupOne.empty() || groupTwo.empty()) + return; + + // project into the two groups + MultiSequence *groupOneSeqs = alignment->Project(groupOne); + assert(groupOneSeqs); + MultiSequence *groupTwoSeqs = alignment->Project(groupTwo); + assert(groupTwoSeqs); + delete alignment; + + // realign + alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices, + model); + + delete groupOneSeqs; + delete groupTwoSeqs; +} + +///////////////////////////////////////////////////////////////// +// WriteAnnotation() +// +// Computes annotation for multiple alignment and write values +// to a file. +///////////////////////////////////////////////////////////////// + +void MSA::WriteAnnotation(MultiSequence *alignment, + const SafeVector > &sparseMatrices) { + ofstream outfile(annotationFilename.c_str()); + + if (outfile.fail()) { + cerr << "ERROR: Unable to write annotation file." 
<< endl; + exit(1); + } + + const int alignLength = alignment->GetSequence(0)->GetLength(); + const int numSeqs = alignment->GetNumSequences(); + + SafeVector position(numSeqs, 0); + SafeVector::iterator> seqs(numSeqs); + for (int i = 0; i < numSeqs; i++) + seqs[i] = alignment->GetSequence(i)->GetDataPtr(); + SafeVector > active; + active.reserve(numSeqs); + + SafeVector lab; + for (int i = 0; i < numSeqs; i++) + lab.push_back(alignment->GetSequence(i)->GetSortLabel()); + + // for every column + for (int i = 1; i <= alignLength; i++) { + + // find all aligned residues in this particular column + active.clear(); + for (int j = 0; j < numSeqs; j++) { + if (seqs[j][i] != '-') { + active.push_back(make_pair(lab[j], ++position[j])); + } + } + + sort(active.begin(), active.end()); + outfile << setw(4) << ComputeScore(active, sparseMatrices) << endl; + } + + outfile.close(); +} + +///////////////////////////////////////////////////////////////// +// ComputeScore() +// +// Computes the annotation score for a particular column. +///////////////////////////////////////////////////////////////// + +int MSA::ComputeScore(const SafeVector > &active, + const SafeVector > &sparseMatrices) { + + if (active.size() <= 1) + return 0; + + // ALTERNATIVE #1: Compute the average alignment score. + + float val = 0; + for (int i = 0; i < (int) active.size(); i++) { + for (int j = i + 1; j < (int) active.size(); j++) { + val += sparseMatrices[active[i].first][active[j].first]->GetValue( + active[i].second, active[j].second); + } + } + + return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1))); + +} diff --git a/binaries/src/GLProbs-1.0/MSAgl+l+p+gl.cpp b/binaries/src/GLProbs-1.0/MSAgl+l+p+gl.cpp new file mode 100644 index 0000000..713acbe --- /dev/null +++ b/binaries/src/GLProbs-1.0/MSAgl+l+p+gl.cpp @@ -0,0 +1,1512 @@ +/*********************************************** + * # Copyright 2009-2010. 
Liu Yongchao + * # Contact: Liu Yongchao, School of Computer Engineering, + * # Nanyang Technological University. + * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com + * # + * # GPL version 3.0 applies. + * # + * ************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "MSA.h" +#include "MSAClusterTree.h" +#include "Defaults.h" + +#ifdef _OPENMP +#include +#endif + +string parametersInputFilename = ""; +string parametersOutputFilename = "no training"; +string annotationFilename = ""; + +bool enableVerbose = false; +bool enableAnnotation = false; +bool enableClustalWOutput = false; +bool enableAlignOrder = false; +int numConsistencyReps = 2; +int numPreTrainingReps = 0; +int numIterativeRefinementReps = 100; + +float cutoff = 0; + +VF initDistrib(NumMatrixTypes); +VF gapOpen(2 * NumInsertStates); +VF gapExtend(2 * NumInsertStates); +VVF emitPairs(256, VF(256, 1e-10)); +VF emitSingle(256, 1e-5); + +string alphabet = alphabetDefault; + +const int MIN_PRETRAINING_REPS = 0; +const int MAX_PRETRAINING_REPS = 20; +const int MIN_CONSISTENCY_REPS = 0; +const int MAX_CONSISTENCY_REPS = 5; +const int MIN_ITERATIVE_REFINEMENT_REPS = 0; +const int MAX_ITERATIVE_REFINEMENT_REPS = 1000; + +string posteriorProbsFilename = ""; +bool allscores = true; +string infilename; + +int flag_gui = 0; //0: no gui related o/p +//1: gui related o/p generated +int flag_ppscore = 0; //0: no pp score sequence added to o/p fasta alignment +//1: pp score seq added to o/p fasta alignment + +/////////////////////////////// +// global scoring matrix variables +////////////////////////////// +float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2; +char *aminos, *bases, matrixtype[20] = "gonnet_160"; +int subst_index[26]; + +double sub_matrix[26][26]; +int firstread = 0; //this makes sure that matrices are read only once + +float TEMPERATURE = 5; +int MATRIXTYPE = 160; +int 
prot_nuc = 0; //0=prot, 1=nucleotide + +float GAPOPEN = 0; +float GAPEXT = 0; +int numThreads = 0; + +//argument support +typedef struct { + char input[30]; + int matrix; + int N; + float T; + float beta; + char opt; //can be 'P' or 'M' + float gapopen; + float gapext; +} argument_decl; + +argument_decl argument; + +extern inline void read_sustitution_matrix(char *fileName); +extern void setmatrixtype(int le); +extern inline int matrixtype_to_int(); +extern inline void read_dna_matrix(); +extern inline void read_vtml_la_matrix(); +extern void init_arguments(); + +MSA::MSA(int argc, char* argv[]) { + //parse program parameters + SafeVector sequenceNames = ParseParams(argc, argv); + + //initialize arguments for partition function + init_arguments(); + + ReadParameters(); + //PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL); + + //read the input sequences + MultiSequence *sequences = new MultiSequence(); + assert(sequences); + for (int i = 0; i < (int) sequenceNames.size(); i++) { + cerr << "Loading sequence file: " << sequenceNames[i] << endl; + sequences->LoadMFA(sequenceNames[i], true); + } + //allocate space for sequence weights + this->seqsWeights = new int[sequences->GetNumSequences()]; + //initilaize parameters for OPENMP +#ifdef _OPENMP + if(numThreads <= 0) { + numThreads = omp_get_num_procs(); + cerr << "Automatically detected " << numThreads << " CPU cores" << endl; + } + cerr <<"Enabling OpenMP (with "<WriteALN(*alignOutFile); + } else { + alignment->WriteMFA(*alignOutFile); + } + //release resources + delete[] this->seqsWeights; + delete alignment; + delete sequences; +} +MSA::~MSA() { + /*close the output file*/ + if (alignOutFileName.length() > 0) { + ((std::ofstream*) alignOutFile)->close(); + } +} +///////////////////////////////////////////////////////////////// +// PrintParameters() +// +// Prints MSAPROBS parameters to STDERR. 
If a filename is +// specified, then the parameters are also written to the file. +///////////////////////////////////////////////////////////////// + +void MSA::PrintParameters(const char *message, const VF &initDistrib, + const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs, + const VF &emitSingle, const char *filename) { + + // print parameters to the screen + cerr << message << endl << " initDistrib[] = { "; + for (int i = 0; i < NumMatrixTypes; i++) + cerr << setprecision(10) << initDistrib[i] << " "; + cerr << "}" << endl << " gapOpen[] = { "; + for (int i = 0; i < NumInsertStates * 2; i++) + cerr << setprecision(10) << gapOpen[i] << " "; + cerr << "}" << endl << " gapExtend[] = { "; + for (int i = 0; i < NumInsertStates * 2; i++) + cerr << setprecision(10) << gapExtend[i] << " "; + cerr << "}" << endl << endl; + + // if a file name is specified + if (filename) { + + // attempt to open the file for writing + FILE *file = fopen(filename, "w"); + if (!file) { + cerr << "ERROR: Unable to write parameter file: " << filename + << endl; + exit(1); + } + + // if successful, then write the parameters to the file + for (int i = 0; i < NumMatrixTypes; i++) + fprintf(file, "%.10f ", initDistrib[i]); + fprintf(file, "\n"); + for (int i = 0; i < 2 * NumInsertStates; i++) + fprintf(file, "%.10f ", gapOpen[i]); + fprintf(file, "\n"); + for (int i = 0; i < 2 * NumInsertStates; i++) + fprintf(file, "%.10f ", gapExtend[i]); + fprintf(file, "\n"); + fprintf(file, "%s\n", alphabet.c_str()); + for (int i = 0; i < (int) alphabet.size(); i++) { + for (int j = 0; j <= i; j++) + fprintf(file, "%.10f ", + emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]); + fprintf(file, "\n"); + } + for (int i = 0; i < (int) alphabet.size(); i++) + fprintf(file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]); + fprintf(file, "\n"); + fclose(file); + } +} + +///////////////////////////////////////////////////////////////// +// doAlign() +// +// First computes all 
pairwise posterior probability matrices. +// Then, computes new parameters if training, or a final +// alignment, otherwise. +///////////////////////////////////////////////////////////////// +extern VF *ComputePostProbs(int a, int b, string seq1, string seq2); +MultiSequence* MSA::doAlign(MultiSequence *sequences, + const ProbabilisticModel &model, int levelid) { + assert(sequences); + + //get the number of sequences + const int numSeqs = sequences->GetNumSequences(); + //create distance matrix + VVF distances(numSeqs, VF(numSeqs, 0)); + //creat sparseMatrices + SafeVector > sparseMatrices(numSeqs, + SafeVector(numSeqs, NULL)); + +#ifdef _OPENMP + //calculate sequence pairs for openmp model + int pairIdx = 0; + numPairs = (numSeqs - 1) * numSeqs / 2; + seqsPairs = new SeqsPair[numPairs]; + for(int a = 0; a < numSeqs; a++) { + for(int b = a + 1; b < numSeqs; b++) { + seqsPairs[pairIdx].seq1 = a; + seqsPairs[pairIdx].seq2 = b; + pairIdx++; + } + } +#endif + // do all pairwise alignments for posterior probability matrices +#ifdef _OPENMP +#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic) + for(pairIdx = 0; pairIdx < numPairs; pairIdx++) { + int a= seqsPairs[pairIdx].seq1; + int b = seqsPairs[pairIdx].seq2; + if(enableVerbose) { +#pragma omp critical + cerr <<"tid "<GetSequence(a); + Sequence *seq2 = sequences->GetSequence(b); + + //posterior probability matrix + VF* posterior; + +//high similarity use global model + if(levelid == 2) posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString()); + +//low similarity use local model + else if(levelid == 1){ + VF *forward = model.ComputeForwardMatrix(seq1, seq2,false); + assert(forward); + VF *backward = model.ComputeBackwardMatrix(seq1, seq2,false); + assert(backward); + posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false); + delete forward; + delete backward; + } + +//extreme low or extreme high similarity use combined model + else{ +//probcons + // 
compute forward and backward probabilities + VF *forward = model.ComputeForwardMatrix(seq1, seq2); + assert(forward); + VF *backward = model.ComputeBackwardMatrix(seq1, seq2); + assert(backward); + // compute posterior probability matrix from HMM + VF *probcons_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward); + assert(probcons_posterior); + delete forward; + delete backward; + +//probalign + VF *probalign_posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString()); + assert(probalign_posterior); +//local + forward = model.ComputeForwardMatrix(seq1, seq2,false); + assert(forward); + backward = model.ComputeBackwardMatrix(seq1, seq2,false); + assert(backward); + posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false); + assert(posterior); + delete forward; + delete backward; +//combined model + //merge probalign + local + probcons + VF::iterator ptr1 = probcons_posterior->begin(); + VF::iterator ptr2 = probalign_posterior->begin(); + VF::iterator ptr = posterior->begin(); + for (int i = 0; i <= seq1->GetLength(); i++) { + for (int j = 0; j <= seq2->GetLength(); j++) { + float v1 = *ptr1; + float v2 = *ptr2; + float v3 = *ptr; + *ptr = sqrt((v1*v1 + v2*v2 + v3*v3)/3); + ptr1++; + ptr2++; + ptr++; + } + } + delete probcons_posterior; + delete probalign_posterior; + } + assert(posterior); + // perform the pairwise sequence alignment + pair *, float> alignment = model.ComputeAlignment( + seq1->GetLength(), seq2->GetLength(), *posterior); + + //compute expected accuracy + distances[a][b] = distances[b][a] = 1.0f - alignment.second + / min(seq1->GetLength(), seq2->GetLength()); + + // compute sparse representations + sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(), + seq2->GetLength(), *posterior); + sparseMatrices[b][a] = NULL; + + delete posterior; + delete alignment.first; +#ifndef _OPENMP + } +#endif + } + + //create the guide tree + this->tree = new MSAClusterTree(this, distances, numSeqs); + 
this->tree->create(); + + // perform the consistency transformation the desired number of times + float* fweights = new float[numSeqs]; + for (int r = 0; r < numSeqs; r++) { + fweights[r] = ((float) seqsWeights[r]) / INT_MULTIPLY; + fweights[r] *= 10; + } + for (int r = 0; r < numConsistencyReps; r++) { + SafeVector > newSparseMatrices = + DoRelaxation(fweights, sequences, sparseMatrices); + + // now replace the old posterior matrices + for (int i = 0; i < numSeqs; i++) { + for (int j = 0; j < numSeqs; j++) { + delete sparseMatrices[i][j]; + sparseMatrices[i][j] = newSparseMatrices[i][j]; + } + } + } + delete[] fweights; +#ifdef _OPENMP + delete [] seqsPairs; +#endif + + //compute the final multiple sequence alignment + MultiSequence *finalAlignment = ComputeFinalAlignment(this->tree, sequences, + sparseMatrices, model); + + // build annotation + if (enableAnnotation) { + WriteAnnotation(finalAlignment, sparseMatrices); + } + //destroy the guide tree + delete this->tree; + this->tree = 0; + + // delete sparse matrices + for (int a = 0; a < numSeqs - 1; a++) { + for (int b = a + 1; b < numSeqs; b++) { + delete sparseMatrices[a][b]; + delete sparseMatrices[b][a]; + } + } + + return finalAlignment; +} + +///////////////////////////////////////////////////////////////// +// GetInteger() +// +// Attempts to parse an integer from the character string given. +// Returns true only if no parsing error occurs. 
/////////////////////////////////////////////////////////////////

// Parses the whole of `data` as an integer with strtol() in base 0, so
// decimal, 0x-prefixed hexadecimal and 0-prefixed octal are accepted.
//
//   data  NUL-terminated text to parse (NULL is rejected).
//   val   receives the parsed value; must not be NULL.
//
// Returns true and stores the value in *val on success.  Returns false,
// leaving *val untouched, when no digits are present, when characters
// follow the number (e.g. "12abc", or "08" which is not valid octal),
// or when the value does not fit in an int.
bool GetInteger(char *data, int *val) {
    assert(val);

    if (data == NULL)
        return false;

    char *endPtr = NULL;
    errno = 0;
    const long int retVal = strtol(data, &endPtr, 0);

    // nothing was converted (empty string, "abc", ...)
    if (endPtr == data)
        return false;
    // the number must be the entire argument, not just a prefix of it
    if (*endPtr != '\0')
        return false;
    // strtol() reports overflow of long through errno
    if (errno != 0)
        return false;
    if (retVal < (long) INT_MIN || retVal > (long) INT_MAX)
        return false;

    *val = (int) retVal;
    return true;
}

/////////////////////////////////////////////////////////////////
// GetFloat()
//
// Attempts to parse a float from the character string given.
// Returns true only if no parsing error occurs.
/////////////////////////////////////////////////////////////////

// Parses the whole of `data` as a floating-point number with strtod().
//
//   data  NUL-terminated text to parse (NULL is rejected).
//   val   receives the parsed value; must not be NULL.
//
// Returns true and stores the value in *val on success.  Returns false,
// leaving *val untouched, when no number is present, when characters
// follow the number, or when the magnitude is too large for a float.
bool GetFloat(char *data, float *val) {
    assert(val);

    if (data == NULL)
        return false;

    char *endPtr = NULL;
    errno = 0;
    const double retVal = strtod(data, &endPtr);

    // nothing was converted
    if (endPtr == data)
        return false;
    // the number must be the entire argument, not just a prefix of it
    if (*endPtr != '\0')
        return false;
    // overflow reported by strtod() (underflow to a tiny value is tolerated)
    if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0))
        return false;

    // largest finite IEEE-754 single-precision value; converting a double
    // beyond it to float is undefined behaviour, so reject it instead
    const double maxFloat = 3.402823466e+38;
    if (retVal > maxFloat || retVal < -maxFloat)
        return false;

    *val = (float) retVal;
    return true;
}

/////////////////////////////////////////////////////////////////
// ReadParameters()
//
// Read initial distribution, transition, and emission
// parameters from a file.
+///////////////////////////////////////////////////////////////// + +void MSA::ReadParameters() { + + ifstream data; + + emitPairs = VVF(256, VF(256, 1e-10)); + emitSingle = VF(256, 1e-5); + + // read initial state distribution and transition parameters + if (parametersInputFilename == string("")) { + if (NumInsertStates == 1) { + for (int i = 0; i < NumMatrixTypes; i++) + initDistrib[i] = initDistrib1Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapOpen[i] = gapOpen1Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapExtend[i] = gapExtend1Default[i]; + } else if (NumInsertStates == 2) { + for (int i = 0; i < NumMatrixTypes; i++) + initDistrib[i] = initDistrib2Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapOpen[i] = gapOpen2Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapExtend[i] = gapExtend2Default[i]; + } else { + cerr + << "ERROR: No default initial distribution/parameter settings exist" + << endl << " for " << NumInsertStates + << " pairs of insert states. Use --paramfile." 
<< endl; + exit(1); + } + + alphabet = alphabetDefault; + + for (int i = 0; i < (int) alphabet.length(); i++) { + emitSingle[(unsigned char) tolower(alphabet[i])] = + emitSingleDefault[i]; + emitSingle[(unsigned char) toupper(alphabet[i])] = + emitSingleDefault[i]; + for (int j = 0; j <= i; j++) { + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = emitPairsDefault[i][j]; + } + } + } else { + data.open(parametersInputFilename.c_str()); + if (data.fail()) { + cerr << "ERROR: Unable to read parameter file: " + << parametersInputFilename << endl; + exit(1); + } + + string line[3]; + for (int i = 0; i < 3; i++) { + if (!getline(data, line[i])) { + cerr + << "ERROR: Unable to read transition parameters from parameter file: " + << parametersInputFilename << endl; + exit(1); + } + } + istringstream data2; + data2.clear(); + data2.str(line[0]); + for (int i = 0; i < NumMatrixTypes; i++) + data2 >> initDistrib[i]; + data2.clear(); + data2.str(line[1]); + for (int i = 0; i < 2 * NumInsertStates; i++) + data2 >> gapOpen[i]; + data2.clear(); + data2.str(line[2]); + for (int i = 0; i < 2 * NumInsertStates; i++) + data2 >> 
gapExtend[i]; + + if (!getline(data, line[0])) { + cerr << "ERROR: Unable to read alphabet from scoring matrix file: " + << parametersInputFilename << endl; + exit(1); + } + + // read alphabet as concatenation of all characters on alphabet line + alphabet = ""; + string token; + data2.clear(); + data2.str(line[0]); + while (data2 >> token) + alphabet += token; + + for (int i = 0; i < (int) alphabet.size(); i++) { + for (int j = 0; j <= i; j++) { + float val; + data >> val; + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = val; + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = val; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = val; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = val; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = val; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = val; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = val; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = val; + } + } + + for (int i = 0; i < (int) alphabet.size(); i++) { + float val; + data >> val; + emitSingle[(unsigned char) tolower(alphabet[i])] = val; + emitSingle[(unsigned char) toupper(alphabet[i])] = val; + } + data.close(); + } +} + +///////////////////////////////////////////////////////////////// +// ParseParams() +// +// Parse all command-line options. 
+///////////////////////////////////////////////////////////////// +void MSA::printUsage() { + cerr + << "************************************************************************" + << endl + << "\tMSAPROBS is a open-source protein multiple sequence alignment algorithm" + << endl + << "\tbased on pair hidden markov model and partition function postirior" + << endl + << "\tprobabilities. If any comments or problems, please contact" + << endl + << "\tLiu Yongchao(liuy0039@ntu.edu.sg or nkcslyc@hotmail.com)" + << endl + << "*************************************************************************" + << endl << "Usage:" << endl + << " msaprobs [OPTION]... [infile]..." << endl << endl + << "Description:" << endl + << " Align sequences in multi-FASTA format" << endl << endl + << " -o, --outfile " << endl + << " specify the output file name (STDOUT by default)" + << endl << " -num_threads " << endl + << " specify the number of threads used, and otherwise detect automatically" + << endl << " -clustalw" << endl + << " use CLUSTALW output format instead of FASTA format" + << endl << endl << " -c, --consistency REPS" << endl + << " use " << MIN_CONSISTENCY_REPS << " <= REPS <= " + << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps + << ") passes of consistency transformation" << endl << endl + << " -ir, --iterative-refinement REPS" << endl + << " use " << MIN_ITERATIVE_REFINEMENT_REPS + << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS << " (default: " + << numIterativeRefinementReps << ") passes of iterative-refinement" + << endl << endl << " -v, --verbose" << endl + << " report progress while aligning (default: " + << (enableVerbose ? "on" : "off") << ")" << endl << endl + << " -annot FILENAME" << endl + << " write annotation for multiple alignment to FILENAME" + << endl << endl << " -a, --alignment-order" << endl + << " print sequences in alignment order rather than input order (default: " + << (enableAlignOrder ? 
"on" : "off") << ")" << endl + << " -version " << endl + << " print out version of MSAPROBS " << endl << endl; +} +SafeVector MSA::ParseParams(int argc, char **argv) { + if (argc < 2) { + printUsage(); + exit(1); + } + SafeVector sequenceNames; + int tempInt; + float tempFloat; + + for (int i = 1; i < argc; i++) { + if (argv[i][0] == '-') { + //help + if (!strcmp(argv[i], "-help") || !strcmp(argv[i], "-?")) { + printUsage(); + exit(1); + //output file name + } else if (!strcmp(argv[i], "-o") + || !strcmp(argv[i], "--outfile")) { + if (i < argc - 1) { + alignOutFileName = argv[++i]; //get the file name + } else { + cerr << "ERROR: String expected for option " << argv[i] + << endl; + exit(1); + } + // parameter file + } else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){ + if (i < argc - 1) + parametersInputFilename = string (argv[++i]); + else { + cerr << "ERROR: Filename expected for option " << argv[i] << endl; + exit (1); + } + //number of threads used + } else if (!strcmp(argv[i], "-p") + || !strcmp(argv[i], "-num_threads")) { + if (i < argc - 1) { + if (!GetInteger(argv[++i], &tempInt)) { + cerr << " ERROR: invalid integer following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempInt < 0) { + tempInt = 0; + } + numThreads = tempInt; + } + } else { + cerr << "ERROR: Integer expected for option " << argv[i] + << endl; + exit(1); + } + // number of consistency transformations + } else if (!strcmp(argv[i], "-c") + || !strcmp(argv[i], "--consistency")) { + if (i < argc - 1) { + if (!GetInteger(argv[++i], &tempInt)) { + cerr << "ERROR: Invalid integer following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempInt < MIN_CONSISTENCY_REPS + || tempInt > MAX_CONSISTENCY_REPS) { + cerr << "ERROR: For option " << argv[i - 1] + << ", integer must be between " + << MIN_CONSISTENCY_REPS << " and " + << MAX_CONSISTENCY_REPS << "." 
<< endl; + exit(1); + } else { + numConsistencyReps = tempInt; + } + } + } else { + cerr << "ERROR: Integer expected for option " << argv[i] + << endl; + exit(1); + } + } + + // number of randomized partitioning iterative refinement passes + else if (!strcmp(argv[i], "-ir") + || !strcmp(argv[i], "--iterative-refinement")) { + if (i < argc - 1) { + if (!GetInteger(argv[++i], &tempInt)) { + cerr << "ERROR: Invalid integer following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS + || tempInt > MAX_ITERATIVE_REFINEMENT_REPS) { + cerr << "ERROR: For option " << argv[i - 1] + << ", integer must be between " + << MIN_ITERATIVE_REFINEMENT_REPS << " and " + << MAX_ITERATIVE_REFINEMENT_REPS << "." + << endl; + exit(1); + } else + numIterativeRefinementReps = tempInt; + } + } else { + cerr << "ERROR: Integer expected for option " << argv[i] + << endl; + exit(1); + } + } + + // annotation files + else if (!strcmp(argv[i], "-annot")) { + enableAnnotation = true; + if (i < argc - 1) { + annotationFilename = argv[++i]; + } else { + cerr << "ERROR: FILENAME expected for option " << argv[i] + << endl; + exit(1); + } + } + + // clustalw output format + else if (!strcmp(argv[i], "-clustalw")) { + enableClustalWOutput = true; + } + + // cutoff + else if (!strcmp(argv[i], "-co") || !strcmp(argv[i], "--cutoff")) { + if (i < argc - 1) { + if (!GetFloat(argv[++i], &tempFloat)) { + cerr + << "ERROR: Invalid floating-point value following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempFloat < 0 || tempFloat > 1) { + cerr << "ERROR: For option " << argv[i - 1] + << ", floating-point value must be between 0 and 1." 
+ << endl; + exit(1); + } else + cutoff = tempFloat; + } + } else { + cerr << "ERROR: Floating-point value expected for option " + << argv[i] << endl; + exit(1); + } + } + + // verbose reporting + else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--verbose")) { + enableVerbose = true; + } + + // alignment order + else if (!strcmp(argv[i], "-a") + || !strcmp(argv[i], "--alignment-order")) { + enableAlignOrder = true; + } + + //print out version + else if (!strcmp(argv[i], "-version")) { + cerr << "MSAPROBS version " << VERSION << endl; + exit(1); + } + // bad arguments + else { + cerr << "ERROR: Unrecognized option: " << argv[i] << endl; + exit(1); + } + } else { + sequenceNames.push_back(string(argv[i])); + } + } + + /*check the output file name*/ + cerr << "-------------------------------------" << endl; + if (alignOutFileName.length() == 0) { + cerr << "The final alignments will be printed out to STDOUT" << endl; + alignOutFile = &std::cout; + } else { + cerr << "Open the output file " << alignOutFileName << endl; + alignOutFile = new ofstream(alignOutFileName.c_str(), + ios::binary | ios::out | ios::trunc); + } + cerr << "-------------------------------------" << endl; + return sequenceNames; +} + +///////////////////////////////////////////////////////////////// +// ProcessTree() +// +// Process the tree recursively. Returns the aligned sequences +// corresponding to a node or leaf of the tree. 
+///////////////////////////////////////////////////////////////// +MultiSequence* MSA::ProcessTree(TreeNode *tree, MultiSequence *sequences, + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model) { + + MultiSequence *result; + + // check if this is a node of the alignment tree + //if (tree->GetSequenceLabel() == -1){ + if (tree->leaf == NODE) { + MultiSequence *alignLeft = ProcessTree(tree->left, sequences, + sparseMatrices, model); + MultiSequence *alignRight = ProcessTree(tree->right, sequences, + sparseMatrices, model); + + assert(alignLeft); + assert(alignRight); + + result = AlignAlignments(alignLeft, alignRight, sparseMatrices, model); + assert(result); + + delete alignLeft; + delete alignRight; + } + + // otherwise, this is a leaf of the alignment tree + else { + result = new MultiSequence(); + assert(result); + //result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone()); + result->AddSequence(sequences->GetSequence(tree->idx)->Clone()); + } + + return result; +} + +///////////////////////////////////////////////////////////////// +// ComputeFinalAlignment() +// +// Compute the final alignment by calling ProcessTree(), then +// performing iterative refinement as needed. 
+///////////////////////////////////////////////////////////////// + +MultiSequence* MSA::ComputeFinalAlignment(MSAGuideTree*tree, + MultiSequence *sequences, + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model) { + MultiSequence *alignment = ProcessTree(tree->getRoot(), sequences, + sparseMatrices, model); + + SafeVector oldOrdering; + if (enableAlignOrder) { + for (int i = 0; i < alignment->GetNumSequences(); i++) + oldOrdering.push_back(alignment->GetSequence(i)->GetSortLabel()); + alignment->SaveOrdering(); + enableAlignOrder = false; + } + + // tree-based refinement + // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree); + /* + int numSeqs = alignment->GetNumSequences(); + //if(numSeqs < numIterativeRefinementReps){ + for(int iter = 0; iter < 5; iter ++){ + for(int i = 0; i < numSeqs - 1; i++){ + DoIterativeRefinementTreeNode(sparseMatrices, model, alignment, i); + } + } + //}*/ + //Refinement return false:no improvement + for (int i = 0; i < numIterativeRefinementReps; i++) { + DoIterativeRefinement(sparseMatrices, model, alignment); + } + cerr << endl; + + if (oldOrdering.size() > 0) { + for (int i = 0; i < (int) oldOrdering.size(); i++) { + alignment->GetSequence(i)->SetSortLabel(oldOrdering[i]); + } + } + + // return final alignment + return alignment; +} + +///////////////////////////////////////////////////////////////// +// AlignAlignments() +// +// Returns the alignment of two MultiSequence objects. +///////////////////////////////////////////////////////////////// + +MultiSequence* MSA::AlignAlignments(MultiSequence *align1, + MultiSequence *align2, + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model) { + + // print some info about the alignment + if (enableVerbose) { + for (int i = 0; i < align1->GetNumSequences(); i++) + cerr << ((i == 0) ? "[" : ",") + << align1->GetSequence(i)->GetLabel(); + cerr << "] vs. "; + for (int i = 0; i < align2->GetNumSequences(); i++) + cerr << ((i == 0) ? 
"[" : ",") + << align2->GetSequence(i)->GetLabel(); + cerr << "]: "; + } +#if 0 + VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff); +#else + VF *posterior = model.BuildPosterior(getSeqsWeights(), align1, align2, + sparseMatrices, cutoff); +#endif + // compute an "accuracy" measure for the alignment before refinement + + pair *, float> alignment; + //perform alignment + alignment = model.ComputeAlignment(align1->GetSequence(0)->GetLength(), + align2->GetSequence(0)->GetLength(), *posterior); + + delete posterior; + + if (enableVerbose) { + + // compute total length of sequences + int totLength = 0; + for (int i = 0; i < align1->GetNumSequences(); i++) + for (int j = 0; j < align2->GetNumSequences(); j++) + totLength += min(align1->GetSequence(i)->GetLength(), + align2->GetSequence(j)->GetLength()); + + // give an "accuracy" measure for the alignment + cerr << alignment.second / totLength << endl; + } + + // now build final alignment + MultiSequence *result = new MultiSequence(); + for (int i = 0; i < align1->GetNumSequences(); i++) + result->AddSequence( + align1->GetSequence(i)->AddGaps(alignment.first, 'X')); + for (int i = 0; i < align2->GetNumSequences(); i++) + result->AddSequence( + align2->GetSequence(i)->AddGaps(alignment.first, 'Y')); + if (!enableAlignOrder) + result->SortByLabel(); + + // free temporary alignment + delete alignment.first; + + return result; +} + +///////////////////////////////////////////////////////////////// +// DoRelaxation() +// +// Performs one round of the weighted probabilistic consistency transformation. 
+// 1 +///////////////////////////////////////////////////////////////// + +SafeVector > MSA::DoRelaxation(float* seqsWeights, + MultiSequence *sequences, + SafeVector > &sparseMatrices) { + const int numSeqs = sequences->GetNumSequences(); + + SafeVector > newSparseMatrices(numSeqs, + SafeVector(numSeqs, NULL)); + + // for every pair of sequences +#ifdef _OPENMP + int pairIdx; +#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic) + for(pairIdx = 0; pairIdx < numPairs; pairIdx++) { + int i = seqsPairs[pairIdx].seq1; + int j = seqsPairs[pairIdx].seq2; + float wi = seqsWeights[i]; + float wj = seqsWeights[j]; +#else + for (int i = 0; i < numSeqs; i++) { + float wi = seqsWeights[i]; + for (int j = i + 1; j < numSeqs; j++) { + float wj = seqsWeights[j]; +#endif + Sequence *seq1 = sequences->GetSequence(i); + Sequence *seq2 = sequences->GetSequence(j); + + if (enableVerbose) { +#ifdef _OPENMP +#pragma omp critical +#endif + cerr << "Relaxing (" << i + 1 << ") " << seq1->GetHeader() + << " vs. 
" << "(" << j + 1 << ") " << seq2->GetHeader() + << ": "; + } + // get the original posterior matrix + VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); + assert(posteriorPtr); + VF &posterior = *posteriorPtr; + + const int seq1Length = seq1->GetLength(); + const int seq2Length = seq2->GetLength(); + + // contribution from the summation where z = x and z = y + float w = wi * wi * wj + wi * wj * wj; + float sumW = w; + for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) { + //posterior[k] = w*posterior[k]; + posterior[k] += posterior[k]; + } + + if (enableVerbose) + cerr << sparseMatrices[i][j]->GetNumCells() << " --> "; + + // contribution from all other sequences + for (int k = 0; k < numSeqs; k++) { + if (k != i && k != j) { + float wk = seqsWeights[k]; + float w = wi * wj * wk; + sumW += w; + if (k < i) + Relax1(w, sparseMatrices[k][i], sparseMatrices[k][j], + posterior); + else if (k > i && k < j) + Relax(w, sparseMatrices[i][k], sparseMatrices[k][j], + posterior); + else { + SparseMatrix *temp = + sparseMatrices[j][k]->ComputeTranspose(); + Relax(w, sparseMatrices[i][k], temp, posterior); + delete temp; + } + } + } + //cerr<<"sumW "<::iterator XYptr = matXY->GetRowPtr(x); + SafeVector::iterator XYend = XYptr + matXY->GetRowSize(x); + VF::iterator base = posterior.begin() + x * (seq2Length + 1); + int curr = 0; + while (XYptr != XYend) { + + // zero out all cells until the first filled column + while (curr < XYptr->first) { + base[curr] = 0; + curr++; + } + + // now, skip over this column + curr++; + ++XYptr; + } + + // zero out cells after last column + while (curr <= seq2Length) { + base[curr] = 0; + curr++; + } + } + + // save the new posterior matrix + newSparseMatrices[i][j] = new SparseMatrix(seq1->GetLength(), + seq2->GetLength(), posterior); + newSparseMatrices[j][i] = NULL; + + if (enableVerbose) + cerr << newSparseMatrices[i][j]->GetNumCells() << " -- "; + + delete posteriorPtr; + + if (enableVerbose) + cerr << "done." 
<< endl; +#ifndef _OPENMP + } +#endif + } + + return newSparseMatrices; +} + +///////////////////////////////////////////////////////////////// +// Relax() +// +// Computes the consistency transformation for a single sequence +// z, and adds the transformed matrix to "posterior". +///////////////////////////////////////////////////////////////// + +void MSA::Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY, + VF &posterior) { + + assert(matXZ); + assert(matZY); + + int lengthX = matXZ->GetSeq1Length(); + int lengthY = matZY->GetSeq2Length(); + assert(matXZ->GetSeq2Length() == matZY->GetSeq1Length()); + + // for every x[i] + for (int i = 1; i <= lengthX; i++) { + SafeVector::iterator XZptr = matXZ->GetRowPtr(i); + SafeVector::iterator XZend = XZptr + matXZ->GetRowSize(i); + + VF::iterator base = posterior.begin() + i * (lengthY + 1); + + // iterate through all x[i]-z[k] + while (XZptr != XZend) { + SafeVector::iterator ZYptr = matZY->GetRowPtr(XZptr->first); + SafeVector::iterator ZYend = ZYptr + + matZY->GetRowSize(XZptr->first); + const float XZval = XZptr->second; + + // iterate through all z[k]-y[j] + while (ZYptr != ZYend) { + //base[ZYptr->first] += weight * XZval * ZYptr->second; + base[ZYptr->first] += XZval * ZYptr->second; + ZYptr++; + } + XZptr++; + } + } +} + +///////////////////////////////////////////////////////////////// +// Relax1() +// +// Computes the consistency transformation for a single sequence +// z, and adds the transformed matrix to "posterior". 
+///////////////////////////////////////////////////////////////// + +void MSA::Relax1(float weight, SparseMatrix *matZX, SparseMatrix *matZY, + VF &posterior) { + + assert(matZX); + assert(matZY); + + int lengthZ = matZX->GetSeq1Length(); + int lengthY = matZY->GetSeq2Length(); + + // for every z[k] + for (int k = 1; k <= lengthZ; k++) { + SafeVector::iterator ZXptr = matZX->GetRowPtr(k); + SafeVector::iterator ZXend = ZXptr + matZX->GetRowSize(k); + + // iterate through all z[k]-x[i] + while (ZXptr != ZXend) { + SafeVector::iterator ZYptr = matZY->GetRowPtr(k); + SafeVector::iterator ZYend = ZYptr + matZY->GetRowSize(k); + const float ZXval = ZXptr->second; + VF::iterator base = posterior.begin() + + ZXptr->first * (lengthY + 1); + + // iterate through all z[k]-y[j] + while (ZYptr != ZYend) { + //base[ZYptr->first] += weight * ZXval * ZYptr->second; + base[ZYptr->first] += ZXval * ZYptr->second; + ZYptr++; + } + ZXptr++; + } + } +} +///////////////////////////////////////////////////////////////// +// DoIterativeRefinement() +// +// Performs a single round of randomized partionining iterative +// refinement. 
+// return 0: successful refinement, 1: ineffective refinement, 2: random problem +///////////////////////////////////////////////////////////////// +int MSA::DoIterativeRefinement( + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model, MultiSequence* &alignment) { + set groupOne, groupTwo; + int numSeqs = alignment->GetNumSequences(); + int i; + // create two separate groups + for (i = 0; i < numSeqs; i++) { + int index = rand(); + if (index % 2) { + groupOne.insert(i); + } else { + groupTwo.insert(i); + } + } + if (groupOne.empty() || groupTwo.empty()) return 2; + + // project into the two groups + MultiSequence *groupOneSeqs = alignment->Project(groupOne); + assert(groupOneSeqs); + MultiSequence *groupTwoSeqs = alignment->Project(groupTwo); + assert(groupTwoSeqs); + +//start add by Yongtao +#if 1 + VF *posterior = model.BuildPosterior (groupOneSeqs, groupTwoSeqs, sparseMatrices, cutoff); +#else + VF *posterior = model.BuildPosterior(getSeqsWeights(), groupOneSeqs, groupTwoSeqs, + sparseMatrices, cutoff); +#endif + // compute an "accuracy" measure for the alignment before refinement + SafeVector::iterator> oldOnePtrs(groupOne.size()); + SafeVector::iterator> oldTwoPtrs(groupTwo.size()); + i=0; + for (set::const_iterator iter = groupOne.begin(); + iter != groupOne.end(); ++iter) { + oldOnePtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr(); + } + i=0; + for (set::const_iterator iter = groupTwo.begin(); + iter != groupTwo.end(); ++iter) { + oldTwoPtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr(); + } + + VF &posteriorArr = *posterior; + int oldLength = alignment->GetSequence(0)->GetLength(); + int groupOneindex=0; int groupTwoindex=0; + float accuracy_before = 0; + int j; + for (i = 1; i <= oldLength; i++) { + // check to see if there is a gap in every sequence of the set + bool foundOne = false; + for (j = 0; !foundOne && j < (int) groupOne.size(); j++) + foundOne = (oldOnePtrs[j][i] != '-'); + // if not, then this column counts towards 
the sequence length + if (foundOne) groupOneindex ++; + bool foundTwo = false; + for (j = 0; !foundTwo && j < (int) groupTwo.size(); j++) + foundTwo = (oldTwoPtrs[j][i] != '-'); + if (foundTwo) groupTwoindex ++; + if(foundOne && foundTwo) accuracy_before += + posteriorArr[groupOneindex * (groupTwoSeqs->GetSequence(0)->GetLength() + 1) + groupTwoindex]; + } + + pair *, float> refinealignment; + //perform alignment + refinealignment = model.ComputeAlignment(groupOneSeqs->GetSequence(0)->GetLength(), + groupTwoSeqs->GetSequence(0)->GetLength(), *posterior); + delete posterior; + // now build final alignment + MultiSequence *result = new MultiSequence(); + for (int i = 0; i < groupOneSeqs->GetNumSequences(); i++) + result->AddSequence( + groupOneSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'X')); + for (int i = 0; i < groupTwoSeqs->GetNumSequences(); i++) + result->AddSequence( + groupTwoSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'Y')); + // free temporary alignment + delete refinealignment.first; + delete alignment; + alignment = result; + delete groupOneSeqs; + delete groupTwoSeqs; + if(accuracy_before == refinealignment.second) return 1; + else return 0; +} + + +void MSA::DoIterativeRefinementTreeNode( + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model, MultiSequence* &alignment, + int nodeIndex) { + set groupOne, groupTwo; + int numSeqs = alignment->GetNumSequences(); + + vector inGroup1; + inGroup1.resize(numSeqs); + for (int i = 0; i < numSeqs; i++) { + inGroup1[i] = false; + } + + AlignmentOrder* orders = this->tree->getAlignOrders(); + AlignmentOrder* order = &orders[nodeIndex]; + for (int i = 0; i < order->leftNum; i++) { + int si = order->leftLeafs[i]; + inGroup1[si] = true; + } + for (int i = 0; i < order->rightNum; i++) { + int si = order->rightLeafs[i]; + inGroup1[si] = true; + } + // create two separate groups + for (int i = 0; i < numSeqs; i++) { + if (inGroup1[i]) { + groupOne.insert(i); + } else { + 
groupTwo.insert(i); + } + } + if (groupOne.empty() || groupTwo.empty()) + return; + + // project into the two groups + MultiSequence *groupOneSeqs = alignment->Project(groupOne); + assert(groupOneSeqs); + MultiSequence *groupTwoSeqs = alignment->Project(groupTwo); + assert(groupTwoSeqs); + delete alignment; + + // realign + alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices, + model); + + delete groupOneSeqs; + delete groupTwoSeqs; +} + +///////////////////////////////////////////////////////////////// +// WriteAnnotation() +// +// Computes annotation for multiple alignment and write values +// to a file. +///////////////////////////////////////////////////////////////// + +void MSA::WriteAnnotation(MultiSequence *alignment, + const SafeVector > &sparseMatrices) { + ofstream outfile(annotationFilename.c_str()); + + if (outfile.fail()) { + cerr << "ERROR: Unable to write annotation file." << endl; + exit(1); + } + + const int alignLength = alignment->GetSequence(0)->GetLength(); + const int numSeqs = alignment->GetNumSequences(); + + SafeVector position(numSeqs, 0); + SafeVector::iterator> seqs(numSeqs); + for (int i = 0; i < numSeqs; i++) + seqs[i] = alignment->GetSequence(i)->GetDataPtr(); + SafeVector > active; + active.reserve(numSeqs); + + SafeVector lab; + for (int i = 0; i < numSeqs; i++) + lab.push_back(alignment->GetSequence(i)->GetSortLabel()); + + // for every column + for (int i = 1; i <= alignLength; i++) { + + // find all aligned residues in this particular column + active.clear(); + for (int j = 0; j < numSeqs; j++) { + if (seqs[j][i] != '-') { + active.push_back(make_pair(lab[j], ++position[j])); + } + } + + sort(active.begin(), active.end()); + outfile << setw(4) << ComputeScore(active, sparseMatrices) << endl; + } + + outfile.close(); +} + +///////////////////////////////////////////////////////////////// +// ComputeScore() +// +// Computes the annotation score for a particular column. 
+/////////////////////////////////////////////////////////////////
+
+// "active" holds one (sequence label, residue position) pair per
+// non-gap cell of the column; the result is
+// 200 * sum / (n * (n - 1)) over all unordered pairs, i.e. 100 times
+// the mean pairwise value, truncated to int. Columns with fewer than
+// two residues score 0.
+// NOTE(review): the template arguments of both parameters were lost
+// when this file was extracted ("SafeVector >"); the declaration is
+// reproduced as found.
+int MSA::ComputeScore(const SafeVector > &active,
+        const SafeVector > &sparseMatrices) {
+
+    if (active.size() <= 1)
+        return 0;
+
+    // ALTERNATIVE #1: Compute the average alignment score.
+
+    float val = 0;
+    for (int i = 0; i < (int) active.size(); i++) {
+        for (int j = i + 1; j < (int) active.size(); j++) {
+            val += sparseMatrices[active[i].first][active[j].first]->GetValue(
+                    active[i].second, active[j].second);
+        }
+    }
+
+    return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));
+
+}
+
+/////////////////////////////////////////////////////////////////
+// ComputeSimilarity ()
+//
+// Computes the average similarity for a particular family.
+// extreme low or extreme high similarity(<=20% or >80%) return 0
+// low similarity(20%-50%) return 1
+// high similarity(50%-80%) return 2
+//
+// Side effect: overwrites initDistrib[2] from a fixed lookup table
+// when the average identity is <= 0.7 (left unchanged above that).
+// "identity" per pair is (identical matched residues) / (number of
+// Viterbi alignment columns).
+//
+// NOTE(review): numPairs and seqsPairs are not declared in this
+// function -- presumably MSA members or globals; confirm.
+/////////////////////////////////////////////////////////////////
+int MSA::ComputeSimilarity (MultiSequence *sequences,const ProbabilisticModel &model){
+    assert(sequences);
+
+    //get the number of sequences
+    const int numSeqs = sequences->GetNumSequences();
+    //average identity for all sequences
+    float identity = 0;
+
+#ifdef _OPENMP
+    //calculate sequence pairs for openmp model
+    int pairIdx = 0;
+    numPairs = (numSeqs - 1) * numSeqs / 2;
+    // NOTE(review): allocated with new[] and not released anywhere in
+    // this function -- check that the owner frees (or reuses) it.
+    seqsPairs = new SeqsPair[numPairs];
+    for(int a = 0; a < numSeqs; a++) {
+        for(int b = a + 1; b < numSeqs; b++) {
+            seqsPairs[pairIdx].seq1 = a;
+            seqsPairs[pairIdx].seq2 = b;
+            pairIdx++;
+        }
+    }
+#endif
+
+    // do all pairwise alignments for family similarity
+#ifdef _OPENMP
+#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
+    for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
+        int a= seqsPairs[pairIdx].seq1;
+        int b = seqsPairs[pairIdx].seq2;
+        if(enableVerbose) {
+#pragma omp critical
+            // NOTE(review): the next statement is damaged -- text between
+            // '<' and '>' was lost in extraction. The remainder of the
+            // verbose print, the non-OpenMP loop headers that the trailing
+            // "#ifndef _OPENMP }" closes, and the declaration of seq1
+            // must be restored from the upstream MSA.cpp.
+            cerr <<"tid "<GetSequence(a);
+            Sequence *seq2 = sequences->GetSequence(b);
+            // NOTE(review): template arguments stripped here and below
+            // ("pair *, float>", "SafeVector::iterator").
+            pair *, float> alignment = model.ComputeViterbiAlignment(seq1,seq2);
+            //
+            SafeVector::iterator iter1 = seq1->GetDataPtr();
+            SafeVector::iterator iter2 = seq2->GetDataPtr();
+            float N_correct_match = 0;
+            //float N_match;
+            //float N_column = 0;
+            float N_alignment = 0;
+            // i / j: 1-based cursors into seq1 / seq2
+            int i = 1;int j = 1;
+            //bool start = false; bool end = false;
+            for (SafeVector::iterator iter = alignment.first->begin();
+                    iter != alignment.first->end(); ++iter){
+                // 'B': both sequences advance (match column);
+                // 'X' / 'Y': only seq1 / seq2 advances (gap column)
+                if (*iter == 'B'){
+                    //N_match += 1;
+                    //start = true;
+                    //if(i==seq1->GetLength() || j==seq2->GetLength()) end = true;
+                    unsigned char c1 = (unsigned char) iter1[i++];
+                    unsigned char c2 = (unsigned char) iter2[j++];
+                    if(c1==c2) N_correct_match += 1;
+                }
+                else if(*iter == 'X') i++;
+                else if(*iter == 'Y') j++;
+                //if(start && !end) N_column += 1;
+                N_alignment += 1;
+            }
+            // both cursors must have consumed their whole sequence
+            if(i!= seq1->GetLength()+1 || j!= seq2->GetLength() + 1 ) cerr << "similarity error"<< endl;
+            // NOTE(review): under _OPENMP this update of the shared
+            // variable "identity" runs inside the parallel-for above with
+            // default(shared) and no reduction/atomic/critical visible --
+            // looks like a data race; confirm against upstream.
+            identity += N_correct_match / N_alignment;
+            //
+            delete alignment.first;
+#ifndef _OPENMP
+        }
+#endif
+    }
+    // NOTE(review): with fewer than two sequences the OpenMP path sets
+    // numPairs to 0, so this is 0/0 (NaN); every comparison below is
+    // then false and the function falls through to "return 2".
+    identity /= numPairs;
+    //adaptive
+    if(identity <= 0.15) initDistrib[2] = 0.143854;
+    else if(identity <= 0.2) initDistrib[2] = 0.191948;
+    else if(identity <= 0.25) initDistrib[2] = 0.170705;
+    else if(identity <= 0.3) initDistrib[2] = 0.100675;
+    else if(identity <= 0.35) initDistrib[2] = 0.090755;
+    else if(identity <= 0.4) initDistrib[2] = 0.146188;
+    else if(identity <= 0.45) initDistrib[2] = 0.167858;
+    else if(identity <= 0.5) initDistrib[2] = 0.250769;
+    else if(identity <= 0.6) initDistrib[2] = 0.500829;
+    else if(identity <= 0.7) initDistrib[2] = 0.259622;
+
+    if( identity<= 0.2 || identity > 0.8) return 0;
+    else if(identity > 0.2 && identity<= 0.5) return 1;
+    else return 2;
+}
diff --git a/binaries/src/GLProbs-1.0/Makefile b/binaries/src/GLProbs-1.0/Makefile
new file mode 100644
index 0000000..9128fbc
--- /dev/null
+++ b/binaries/src/GLProbs-1.0/Makefile
@@ -0,0 +1,16 @@
+
+CXXOBJS = MSA.o MSAGuideTree.o MSAClusterTree.o MSAPartProbs.o MSAReadMatrix.o main.o
+
+OPENMP = 
-fopenmp +CXX = g++ +COMMON_FLAGS = -O3 $(OPENMP) -Wall -funroll-loops -I . -I /usr/include +CXXFLAGS = $(COMMON_FLAGS) + +EXEC = glprobs + +all: $(CXXOBJS) + $(CXX) $(CXXFLAGS) -o $(EXEC) $(CXXOBJS) $(NVCCOBJS) $(NVCCLIBS) + strip $(EXEC) +clean: + rm -rf *.o $(EXEC) + diff --git a/binaries/src/GLProbs-1.0/MultiSequence.h b/binaries/src/GLProbs-1.0/MultiSequence.h new file mode 100644 index 0000000..96a61f5 --- /dev/null +++ b/binaries/src/GLProbs-1.0/MultiSequence.h @@ -0,0 +1,735 @@ +//////////////////////////////////////////////////////////////// +// MultiSequence.h +// +// Utilities for reading/writing multiple sequence data. +///////////////////////////////////////////////////////////////// + +#ifndef MULTISEQUENCE_H +#define MULTISEQUENCE_H + +#include +#include +#include +#include +#include +#include +#include +#include "SafeVector.h" +#include "Sequence.h" +#include "FileBuffer.h" + +#define VERSION "0.9.7" +///////////////////////////////////////////////////////////////// +// MultiSequence +// +// Class for multiple sequence alignment input/output. +///////////////////////////////////////////////////////////////// + +class MultiSequence { + + SafeVector *sequences; + +public: + + ///////////////////////////////////////////////////////////////// + // MultiSequence::MultiSequence() + // + // Default constructor. + ///////////////////////////////////////////////////////////////// + + MultiSequence() : + sequences(NULL) { + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::MultiSequence() + // + // Constructor. Load MFA from a FileBuffer object. + ///////////////////////////////////////////////////////////////// + + MultiSequence(FileBuffer &infile) : + sequences(NULL) { + LoadMFA(infile); + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::MultiSequence() + // + // Constructor. Load MFA from a filename. 
+ ///////////////////////////////////////////////////////////////// + + MultiSequence(const string &filename) : + sequences(NULL) { + LoadMFA(filename); + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::~MultiSequence() + // + // Destructor. Gets rid of sequence objects contained in the + // multiple alignment. + ///////////////////////////////////////////////////////////////// + + ~MultiSequence() { + + // if sequences allocated + if (sequences) { + + // free all sequences + for (SafeVector::iterator iter = sequences->begin(); + iter != sequences->end(); ++iter) { + assert(*iter); + delete *iter; + *iter = NULL; + } + + // free sequence vector + delete sequences; + sequences = NULL; + } + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::LoadMFA() + // + // Load MFA from a filename. + ///////////////////////////////////////////////////////////////// + + void LoadMFA(const string &filename, bool stripGaps = false) { + + // try opening file + FileBuffer infile(filename.c_str()); + + if (infile.fail()) { + cerr << "ERROR: Could not open file '" << filename + << "' for reading." << endl; + exit(1); + } + + // if successful, then load using other LoadMFA() routine + LoadMFA(infile, stripGaps); + + infile.close(); + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::LoadMFA() + // + // Load MSF from a FileBuffer object. 
+ ///////////////////////////////////////////////////////////////// + + void ParseMSF(FileBuffer &infile, string header, bool stripGaps = false) { + + SafeVector *> seqData; + SafeVector seqNames; + SafeVector seqLengths; + + istringstream in; + bool valid = true; + bool missingHeader = false; + bool clustalW = false; + + // read until data starts + while (!infile.eof() && header.find("..", 0) == string::npos) { + if (header.find("CLUSTAL", 0) == 0 + || header.find("MSAPROBS", 0) == 0) { + clustalW = true; + break; + } + infile.GetLine(header); + if (header.find("//", 0) != string::npos) { + missingHeader = true; + break; + } + } + + // read until end-of-file + while (valid) { + infile.GetLine(header); + if (infile.eof()) + break; + + string word; + in.clear(); + in.str(header); + + // check if there's anything on this line + if (in >> word) { + + // clustalw name parsing + if (clustalW) { + if (!isspace(header[0]) + && find(seqNames.begin(), seqNames.end(), word) + == seqNames.end()) { + seqNames.push_back(word); + seqData.push_back(new SafeVector()); + seqLengths.push_back(0); + seqData[(int) seqData.size() - 1]->push_back('@'); + } + } + + // look for new sequence label + if (word == string("Name:")) { + if (in >> word) { + seqNames.push_back(word); + seqData.push_back(new SafeVector()); + seqLengths.push_back(0); + seqData[(int) seqData.size() - 1]->push_back('@'); + } else + valid = false; + } + + // check if this is sequence data + else if (find(seqNames.begin(), seqNames.end(), word) + != seqNames.end()) { + int index = find(seqNames.begin(), seqNames.end(), word) + - seqNames.begin(); + + // read all remaining characters on the line + char ch; + while (in >> ch) { + if (isspace(ch)) + continue; + if (ch >= 'a' && ch <= 'z') + ch = ch - 'a' + 'A'; + if (ch == '.') + ch = '-'; + if (stripGaps && ch == '-') + continue; + if (!((ch >= 'A' && ch <= 'Z') || ch == '*' || ch == '-')) { + cerr << "ERROR: Unknown character encountered: " + << ch << endl; + exit(1); + 
} + + // everything's ok so far, so just store this character. + seqData[index]->push_back(ch); + seqLengths[index]++; + } + } else if (missingHeader) { + seqNames.push_back(word); + seqData.push_back(new SafeVector()); + seqLengths.push_back(0); + seqData[(int) seqData.size() - 1]->push_back('@'); + + int index = (int) seqNames.size() - 1; + + // read all remaining characters on the line + char ch; + while (in >> ch) { + if (isspace(ch)) + continue; + if (ch >= 'a' && ch <= 'z') + ch = ch - 'a' + 'A'; + if (ch == '.') + ch = '-'; + if (stripGaps && ch == '-') + continue; + if (!((ch >= 'A' && ch <= 'Z') || ch == '*' || ch == '-')) { + cerr << "ERROR: Unknown character encountered: " + << ch << endl; + exit(1); + } + + // everything's ok so far, so just store this character. + seqData[index]->push_back(ch); + seqLengths[index]++; + } + } + } + } + + // check for errors + if (seqNames.size() == 0) { + cerr << "ERROR: No sequences read!" << endl; + exit(1); + } + + assert(!sequences); + sequences = new SafeVector; + for (int i = 0; i < (int) seqNames.size(); i++) { + if (seqLengths[i] == 0) { + cerr << "ERROR: Sequence of zero length!" << endl; + exit(1); + } + Sequence *seq = new Sequence(seqData[i], seqNames[i], seqLengths[i], + i, i); + sequences->push_back(seq); + } + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::LoadMFA() + // + // Load MFA from a FileBuffer object. + ///////////////////////////////////////////////////////////////// + + void LoadMFA(FileBuffer &infile, bool stripGaps = false) { + + // check to make sure that file reading is ok + if (infile.fail()) { + cerr << "ERROR: Error reading file." << endl; + exit(1); + } + + // read all sequences + while (true) { + + // get the sequence label as being the current # of sequences + // NOTE: sequence labels here are zero-based + int index = (!sequences) ? 
0 : sequences->size(); + + // read the sequence + Sequence *seq = new Sequence(infile, stripGaps); + if (seq->Fail()) { + + // check if alternative file format (i.e. not MFA) + if (index == 0) { + string header = seq->GetHeader(); + if (header.length() > 0 && header[0] != '>') { + + // try MSF format + ParseMSF(infile, header); + break; + } + } + + delete seq; + break; + } + seq->SetLabel(index); + + // add the sequence to the list of current sequences + if (!sequences) + sequences = new SafeVector; + sequences->push_back(seq); + } + + // make sure at least one sequence was read + if (!sequences) { + cerr << "ERROR: No sequences read." << endl; + exit(1); + } + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::AddSequence() + // + // Add another sequence to an existing sequence list + ///////////////////////////////////////////////////////////////// + + void AddSequence(Sequence *sequence) { + assert(sequence); + assert(!sequence->Fail()); + + // add sequence + if (!sequences) + sequences = new SafeVector; + sequences->push_back(sequence); + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::RemoveSequence() + // + // Remove a sequence from the MultiSequence + ///////////////////////////////////////////////////////////////// + + void RemoveSequence(int index) { + assert(sequences); + + assert(index >= 0 && index < (int) sequences->size()); + delete (*sequences)[index]; + + sequences->erase(sequences->begin() + index); + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::WriteMFA() + // + // Write MFA to the outfile. Allows the user to specify the + // number of columns for the output. Also, useIndices determines + // whether or not the actual sequence comments will be printed + // out or whether the artificially assigned sequence labels will + // be used instead. 
+ ///////////////////////////////////////////////////////////////// + + void WriteMFA(ostream &outfile, int numColumns = 60, + bool useIndices = false) { + if (!sequences) + return; + + // loop through all sequences and write them out + for (SafeVector::iterator iter = sequences->begin(); + iter != sequences->end(); ++iter) { + (*iter)->WriteMFA(outfile, numColumns, useIndices); + } + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::GetAnnotationChar() + // + // Return CLUSTALW annotation for column. + ///////////////////////////////////////////////////////////////// + + char GetAnnotationChar(SafeVector &column) { + SafeVector counts(256, 0); + int allChars = (int) column.size(); + + for (int i = 0; i < allChars; i++) { + counts[(unsigned char) toupper(column[i])]++; + } + + allChars -= counts[(unsigned char) '-']; + if (allChars == 1) + return ' '; + + for (int i = 0; i < 256; i++) + if ((char) i != '-' && counts[i] == allChars) + return '*'; + + if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'T'] + + counts[(unsigned char) 'A'] == allChars) + return ':'; + + if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'E'] + + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'K'] + == allChars) + return ':'; + + if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'H'] + + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'K'] + == allChars) + return ':'; + + if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'D'] + + counts[(unsigned char) 'E'] + counts[(unsigned char) 'Q'] + == allChars) + return ':'; + + if (counts[(unsigned char) 'Q'] + counts[(unsigned char) 'H'] + + counts[(unsigned char) 'R'] + counts[(unsigned char) 'K'] + == allChars) + return ':'; + + if (counts[(unsigned char) 'M'] + counts[(unsigned char) 'I'] + + counts[(unsigned char) 'L'] + counts[(unsigned char) 'V'] + == allChars) + return ':'; + + if (counts[(unsigned char) 'M'] + counts[(unsigned char) 'I'] + + counts[(unsigned 
char) 'L'] + counts[(unsigned char) 'F'] + == allChars) + return ':'; + + if (counts[(unsigned char) 'H'] + counts[(unsigned char) 'Y'] + == allChars) + return ':'; + + if (counts[(unsigned char) 'F'] + counts[(unsigned char) 'Y'] + + counts[(unsigned char) 'W'] == allChars) + return ':'; + + if (counts[(unsigned char) 'C'] + counts[(unsigned char) 'S'] + + counts[(unsigned char) 'A'] == allChars) + return '.'; + + if (counts[(unsigned char) 'A'] + counts[(unsigned char) 'T'] + + counts[(unsigned char) 'V'] == allChars) + return '.'; + + if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'A'] + + counts[(unsigned char) 'G'] == allChars) + return '.'; + + if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'T'] + + counts[(unsigned char) 'N'] + counts[(unsigned char) 'K'] + == allChars) + return '.'; + + if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'T'] + + counts[(unsigned char) 'P'] + counts[(unsigned char) 'A'] + == allChars) + return '.'; + + if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'G'] + + counts[(unsigned char) 'N'] + counts[(unsigned char) 'D'] + == allChars) + return '.'; + + if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'N'] + + counts[(unsigned char) 'D'] + counts[(unsigned char) 'E'] + + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'K'] + == allChars) + return '.'; + + if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'D'] + + counts[(unsigned char) 'E'] + counts[(unsigned char) 'Q'] + + counts[(unsigned char) 'H'] + counts[(unsigned char) 'K'] + == allChars) + return '.'; + + if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'E'] + + counts[(unsigned char) 'H'] + counts[(unsigned char) 'Q'] + + counts[(unsigned char) 'R'] + counts[(unsigned char) 'K'] + == allChars) + return '.'; + + if (counts[(unsigned char) 'F'] + counts[(unsigned char) 'V'] + + counts[(unsigned char) 'L'] + counts[(unsigned char) 'I'] + + counts[(unsigned char) 'M'] == allChars) + return '.'; + + if 
(counts[(unsigned char) 'H'] + counts[(unsigned char) 'F'] + + counts[(unsigned char) 'Y'] == allChars) + return '.'; + + return ' '; + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::WriteALN() + // + // Write ALN to the outfile. Allows the user to specify the + // number of columns for the output. + ///////////////////////////////////////////////////////////////// + + void WriteALN(ostream &outfile, int numColumns = 60) { + if (!sequences) + return; + + outfile << "CLUSTAL for GLPROBS version " << VERSION << " multiple sequence alignment" << endl; +// +// outfile << "//"<::iterator> ptrs(GetNumSequences()); + SafeVector lengths(GetNumSequences()); + for (int i = 0; i < GetNumSequences(); i++) { + ptrs[i] = GetSequence(i)->GetDataPtr(); + lengths[i] = GetSequence(i)->GetLength(); + longestComment = max(longestComment, + (int) GetSequence(i)->GetName().length()); + } + longestComment += 4; + + int writtenChars = 0; + bool allDone = false; + + while (!allDone) { + outfile << endl; + allDone = true; + + // loop through all sequences and write them out + for (int i = 0; i < GetNumSequences(); i++) { + + if (writtenChars < lengths[i]) { + outfile << GetSequence(i)->GetName(); + for (int j = 0; + j + < longestComment + - (int) GetSequence(i)->GetName().length(); + j++) + outfile << ' '; + + for (int j = 0; j < numColumns; j++) { + if (writtenChars + j < lengths[i]) + outfile << ptrs[i][writtenChars + j + 1]; + else + break; + } + + outfile << endl; + + if (writtenChars + numColumns < lengths[i]) + allDone = false; + } + } + + // write annotation line +/* + for (int j = 0; j < longestComment; j++) + outfile << ' '; + + for (int j = 0; j < numColumns; j++) { + SafeVector column; + + for (int i = 0; i < GetNumSequences(); i++) + if (writtenChars + j < lengths[i]) + column.push_back(ptrs[i][writtenChars + j + 1]); + + if (column.size() > 0) + outfile << GetAnnotationChar(column); + } +*/ + outfile << endl; + writtenChars += 
numColumns; + } + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::GetSequence() + // + // Retrieve a sequence from the MultiSequence object. + ///////////////////////////////////////////////////////////////// + + Sequence* GetSequence(int i) { + assert(sequences); + assert(0 <= i && i < (int) sequences->size()); + + return (*sequences)[i]; + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::GetSequence() + // + // Retrieve a sequence from the MultiSequence object + // (const version). + ///////////////////////////////////////////////////////////////// + + const Sequence* GetSequence(int i) const { + assert(sequences); + assert(0 <= i && i < (int) sequences->size()); + + return (*sequences)[i]; + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::GetNumSequences() + // + // Returns the number of sequences in the MultiSequence. + ///////////////////////////////////////////////////////////////// + + int GetNumSequences() const { + if (!sequences) + return 0; + return (int) sequences->size(); + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::SortByHeader() + // + // Organizes the sequences according to their sequence headers + // in ascending order. + ///////////////////////////////////////////////////////////////// + + void SortByHeader() { + assert(sequences); + + // a quick and easy O(n^2) sort + for (int i = 0; i < (int) sequences->size() - 1; i++) { + for (int j = i + 1; j < (int) sequences->size(); j++) { + if ((*sequences)[i]->GetHeader() > (*sequences)[j]->GetHeader()) + swap((*sequences)[i], (*sequences)[j]); + } + } + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::SortByLabel() + // + // Organizes the sequences according to their sequence labels + // in ascending order. 
+ ///////////////////////////////////////////////////////////////// + + void SortByLabel() { + assert(sequences); + + // a quick and easy O(n^2) sort + for (int i = 0; i < (int) sequences->size() - 1; i++) { + for (int j = i + 1; j < (int) sequences->size(); j++) { + if ((*sequences)[i]->GetSortLabel() + > (*sequences)[j]->GetSortLabel()) + swap((*sequences)[i], (*sequences)[j]); + } + } + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::SaveOrdering() + // + // Relabels sequences so as to preserve the current ordering. + ///////////////////////////////////////////////////////////////// + + void SaveOrdering() { + assert(sequences); + + for (int i = 0; i < (int) sequences->size(); i++) + (*sequences)[i]->SetSortLabel(i); + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::Project() + // + // Given a set of indices, extract all sequences from the current + // MultiSequence object whose index is included in the set. + // Then, project the multiple alignments down to the desired + // subset, and return the projection as a new MultiSequence + // object. 
+ ///////////////////////////////////////////////////////////////// + + MultiSequence *Project(const set &indices) { + SafeVector::iterator> oldPtrs(indices.size()); + SafeVector *> newPtrs(indices.size()); + + assert(indices.size() != 0); + + // grab old data + int i = 0; + for (set::const_iterator iter = indices.begin(); + iter != indices.end(); ++iter) { + oldPtrs[i++] = GetSequence(*iter)->GetDataPtr(); + } + + // compute new length + int oldLength = GetSequence(*indices.begin())->GetLength(); + int newLength = 0; + for (i = 1; i <= oldLength; i++) { + + // check to see if there is a gap in every sequence of the set + bool found = false; + for (int j = 0; !found && j < (int) indices.size(); j++) + found = (oldPtrs[j][i] != '-'); + + // if not, then this column counts towards the sequence length + if (found) + newLength++; + } + + // build new alignments + for (i = 0; i < (int) indices.size(); i++) { + newPtrs[i] = new SafeVector(); + assert(newPtrs[i]); + newPtrs[i]->push_back('@'); + } + + // add all needed columns + for (i = 1; i <= oldLength; i++) { + + // make sure column is not gapped in all sequences in the set + bool found = false; + for (int j = 0; !found && j < (int) indices.size(); j++) + found = (oldPtrs[j][i] != '-'); + + // if not, then add it + if (found) { + for (int j = 0; j < (int) indices.size(); j++) + newPtrs[j]->push_back(oldPtrs[j][i]); + } + } + + // wrap sequences in MultiSequence object + MultiSequence *ret = new MultiSequence(); + i = 0; + for (set::const_iterator iter = indices.begin(); + iter != indices.end(); ++iter) { + ret->AddSequence( + new Sequence(newPtrs[i++], GetSequence(*iter)->GetHeader(), + newLength, GetSequence(*iter)->GetSortLabel(), + GetSequence(*iter)->GetLabel())); + } + + return ret; + } +}; + +#endif diff --git a/binaries/src/GLProbs-1.0/ProbabilisticModel.h b/binaries/src/GLProbs-1.0/ProbabilisticModel.h new file mode 100644 index 0000000..6c7ab1b --- /dev/null +++ b/binaries/src/GLProbs-1.0/ProbabilisticModel.h 
@@ -0,0 +1,1338 @@ +///////////////////////////////////////////////////////////////// +// ProbabilisticModel.h +// +// Routines for (1) posterior probability computations +// (2) chained anchoring +// (3) maximum weight trace alignment +///////////////////////////////////////////////////////////////// + +#ifndef PROBABILISTICMODEL_H +#define PROBABILISTICMODEL_H + +#include +#include +#include +#include "SafeVector.h" +#include "ScoreType.h" +#include "SparseMatrix.h" +#include "MultiSequence.h" + +using namespace std; + +const int NumMatchStates = 1; // note that in this version the number + // of match states is fixed at 1...will +const int NumInsertStates = 2; // change in future versions +const int NumMatrixTypes = NumMatchStates + NumInsertStates * 2; + +///////////////////////////////////////////////////////////////// +// ProbabilisticModel +// +// Class for storing the parameters of a probabilistic model and +// performing different computations based on those parameters. +// In particular, this class handles the computation of +// posterior probabilities that may be used in alignment. +///////////////////////////////////////////////////////////////// + +class ProbabilisticModel { + + float initialDistribution[NumMatrixTypes]; // holds the initial probabilities for each state + float transProb[NumMatrixTypes][NumMatrixTypes]; // holds all state-to-state transition probabilities + float matchProb[256][256]; // emission probabilities for match states + float insProb[256][NumMatrixTypes]; // emission probabilities for insert states + float local_transProb[3][3]; + float random_transProb[2]; + + public: + + ///////////////////////////////////////////////////////////////// + // ProbabilisticModel::ProbabilisticModel() + // + // Constructor. Builds a new probabilistic model using the + // given parameters. 
+  /////////////////////////////////////////////////////////////////
+
+  // Fills every parameter table of the model, storing LOG(...) of each
+  // probability:
+  //   - initialDistribution / transProb / insProb / matchProb for the
+  //     NumMatrixTypes-state model ("Probcons model" below);
+  //   - local_transProb for the 3-state "Local model";
+  //   - random_transProb from initDistribMat[2] ("sigma").
+  //
+  //   initDistribMat  entries 0..NumMatrixTypes-1 are read; entry 1 is
+  //                   additionally reused for initialDistribution[2] and
+  //                   entry 2 as sigma.
+  //   gapOpen         gapOpen[2*i] (i < NumInsertStates) feeds the first
+  //                   model, gapOpen[1] the local model.
+  //   gapExtend       read at the same indices as gapOpen.
+  //   emitPairs       indexed [0..255][0..255].
+  //   emitSingle      indexed [0..255].
+  //
+  // NOTE(review): the first loop over initDistribMat reads
+  // NumMatrixTypes entries -- confirm every caller passes a vector at
+  // least that long.
+  ProbabilisticModel (const VF &initDistribMat, const VF &gapOpen, const VF &gapExtend,
+                      const VVF &emitPairs, const VF &emitSingle){
+
+//Probcons model
+    // build transition matrix
+    // state 0 is the match state; states 2*i+1 and 2*i+2 are the two
+    // insert states of insert pair i
+    VVF transMat (NumMatrixTypes, VF (NumMatrixTypes, 0.0f));
+    transMat[0][0] = 1;
+    for (int i = 0; i < NumInsertStates; i++){
+      transMat[0][2*i+1] = gapOpen[2*i];
+      transMat[0][2*i+2] = gapOpen[2*i];
+      // whatever is not spent on opening gaps stays in the match state
+      transMat[0][0] -= (gapOpen[2*i] + gapOpen[2*i]);
+      assert (transMat[0][0] > 0);
+      transMat[2*i+1][2*i+1] = gapExtend[2*i];
+      transMat[2*i+2][2*i+2] = gapExtend[2*i];
+      // no direct transition between the two insert states of a pair
+      transMat[2*i+1][2*i+2] = 0;
+      transMat[2*i+2][2*i+1] = 0;
+      transMat[2*i+1][0] = 1 - gapExtend[2*i];
+      transMat[2*i+2][0] = 1 - gapExtend[2*i];
+    }
+
+    // create initial and transition probability matrices
+    for (int i = 0; i < NumMatrixTypes; i++){
+      initialDistribution[i] = LOG (initDistribMat[i]);
+      for (int j = 0; j < NumMatrixTypes; j++)
+        transProb[i][j] = LOG (transMat[i][j]);
+    }
+//due to Local model parameters' initialization, need to correct initialDistribution[2]
+    initialDistribution[2] = LOG (initDistribMat[1]);
+
+    // create insertion and match probability matrices
+    // (every column j of insProb receives the same single-letter value)
+    for (int i = 0; i < 256; i++){
+      for (int j = 0; j < NumMatrixTypes; j++)
+        insProb[i][j] = LOG (emitSingle[i]);
+      for (int j = 0; j < 256; j++)
+        matchProb[i][j] = LOG (emitPairs[i][j]);
+    }
+
+//Local model
+    // build transition matrix (3 states: 0 match, 1 and 2 insert),
+    // parameterised by gapOpen[1] / gapExtend[1]
+    VVF ltransMat (3, VF (3, 0.0f));
+    ltransMat[0][0] = 1;
+
+    ltransMat[0][1] = gapOpen[1];
+    ltransMat[0][2] = gapOpen[1];
+    ltransMat[0][0] -= (gapOpen[1] + gapOpen[1]);
+    assert (ltransMat[0][0] > 0);
+    ltransMat[1][1] = gapExtend[1];
+    ltransMat[2][2] = gapExtend[1];
+    ltransMat[1][2] = 0;
+    ltransMat[2][1] = 0;
+    ltransMat[1][0] = 1 - gapExtend[1];
+    ltransMat[2][0] = 1 - gapExtend[1];
+
+    // create the local transition probability matrix
+    for (int i = 0; i < 3; i++){
+      for (int j = 0; j < 3; j++)
+        local_transProb[i][j] = LOG (ltransMat[i][j]);
+    }
+
+    // create the random-model terms from initDistribMat[2]
+    random_transProb[0] = LOG (initDistribMat[2]);//sigma
+    random_transProb[1] = LOG (1-initDistribMat[2]);//1-sigma
+
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // ProbabilisticModel::ComputeForwardMatrix()
+  //
+  // Computes a set of forward probability matrices for aligning
+  // seq1 and seq2.
+  //
+  // For efficiency reasons, a single-dimensional floating-point
+  // array is used here, with the following indexing scheme:
+  //
+  //    forward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
+  //    refers to the probability of aligning through j characters
+  //    of the first sequence, k characters of the second sequence,
+  //    and ending in state i.
+  // flag: 1 probcons, 0 local
+  //
+  // With flag == false the per-cell stride is 3 instead of
+  // NumMatrixTypes and the local_transProb / random_transProb tables
+  // are used. The matrix is allocated with new and returned as a raw
+  // pointer; presumably the caller deletes it -- confirm at call sites.
+  // Sequence data is read from index 1 (index 0 is never dereferenced;
+  // '~' is substituted for it).
+  // NOTE(review): "SafeVector::iterator" below has evidently lost its
+  // template argument (expected SafeVector<char>::iterator) -- confirm
+  // against the original header.
+  /////////////////////////////////////////////////////////////////
+
+  VF *ComputeForwardMatrix (Sequence *seq1, Sequence *seq2, bool flag=true) const {
+
+    assert (seq1);
+    assert (seq2);
+
+    const int seq1Length = seq1->GetLength();
+    const int seq2Length = seq2->GetLength();
+
+    // retrieve the points to the beginning of each sequence
+    SafeVector::iterator iter1 = seq1->GetDataPtr();
+    SafeVector::iterator iter2 = seq2->GetDataPtr();
+
+    // create matrix (every cell starts at LOG_ZERO)
+    VF *forwardPtr;
+    if(flag) forwardPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO);
+    else forwardPtr = new VF (3 * (seq1Length+1) * (seq2Length+1), LOG_ZERO);
+    assert (forwardPtr);
+    VF &forward = *forwardPtr;
+
+    // initialization condition: seed cells (1,1), (1,0) and (0,1) from
+    // the initial distribution (first model only)
+    if(flag){
+      forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] =
+        initialDistribution[0] + matchProb[(unsigned char) iter1[1]][(unsigned char) iter2[1]];
+
+      for (int k = 0; k < NumInsertStates; k++){
+        forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] =
+          initialDistribution[2*k+1] + insProb[(unsigned char) iter1[1]][k];
+        forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] =
+          initialDistribution[2*k+2] + insProb[(unsigned char) iter2[1]][k];
+      }
+    }
+
+    // remember offset for each index combination:
+    //   ij = cell (i,j), i1j = (i-1,j), ij1 = (i,j-1), i1j1 = (i-1,j-1).
+    // The "previous" offsets start out negative; each is only used
+    // under the i > 0 / j > 0 guards below.
+    int ij = 0;
+    int i1j = -seq2Length - 1;
+    int ij1 = -1;
+    int i1j1 = -seq2Length - 2;
+
+    // scale cell offsets by the number of states stored per cell
+    if(flag){
+      ij *= NumMatrixTypes;
+      i1j *= NumMatrixTypes;
+      ij1 *= NumMatrixTypes;
+      i1j1 *= NumMatrixTypes;
+    }
+    else{
+      ij *= 3;
+      i1j *= 3;
+      ij1 *= 3;
+      i1j1 *= 3;
+    }
+
+    // compute forward scores
+    for (int i = 0; i <= seq1Length; i++){
+      unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
+      for (int j = 0; j <= seq2Length; j++){
+        unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
+        //local: cell (1,1) is seeded here rather than in the
+        //initialization block above
+        if(i == 1 && j == 1 && !flag) forward[0 + ij] =
+          matchProb[c1][c2] - insProb[c1][0] - insProb[c2][0] - 2*random_transProb[1];
+
+        // cells (0,0), (0,1), (1,0), (1,1) are left as initialised
+        if (i > 1 || j > 1){
+          if (i > 0 && j > 0){
+            if(flag){
+              // match state: sum over all predecessor states at
+              // (i-1,j-1), then add the pair emission
+              forward[0 + ij] = forward[0 + i1j1] + transProb[0][0];
+              for (int k = 1; k < NumMatrixTypes; k++)
+                LOG_PLUS_EQUALS (forward[0 + ij], forward[k + i1j1] + transProb[k][0]);
+              forward[0 + ij] += matchProb[c1][c2];
+            }
+            //local: the emission is expressed relative to the single
+            //letter terms and random_transProb[1]; the first assignment
+            //is the term that does not depend on any predecessor cell
+            else{
+              forward[0 + ij] = matchProb[c1][c2] - insProb[c1][0] - insProb[c2][0] - 2*random_transProb[1];
+              for (int k = 0; k < 3; k++)
+                LOG_PLUS_EQUALS (forward[0 + ij], matchProb[c1][c2] - insProb[c1][0] - insProb[c2][0] +
+                                 forward[k + i1j1] + local_transProb[k][0] - 2*random_transProb[1]);
+            }
+          }
+          if (i > 0){
+            if(flag){
+              // insert in seq1: reached from match or from the same
+              // insert state at (i-1,j)
+              for (int k = 0; k < NumInsertStates; k++)
+                forward[2*k+1 + ij] = insProb[c1][k] +
+                  LOG_ADD (forward[0 + i1j] + transProb[0][2*k+1],
+                           forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1]);
+            }
+
+            //local
+            else{
+              forward[1 + ij] = LOG_ADD (forward[0 + i1j] + local_transProb[0][1] - random_transProb[1],
+                                         forward[1 + i1j] + local_transProb[1][1] - random_transProb[1]);
+            }
+
+          }
+          if (j > 0){
+            if(flag){
+              // insert in seq2: reached from match or from the same
+              // insert state at (i,j-1)
+              for (int k = 0; k < NumInsertStates; k++)
+                forward[2*k+2 + ij] = insProb[c2][k] +
+                  LOG_ADD (forward[0 + ij1] + transProb[0][2*k+2],
+                           forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2]);
+            }
+            //local
+            else{
+              forward[2 + ij] = LOG_ADD (forward[0 + ij1] + local_transProb[0][2] - random_transProb[1],
+                                         forward[2 + ij1] + local_transProb[2][2] - random_transProb[1]);
+            }
+          }
+        }
+        // advance all four offsets to the next cell
+        if(flag){
+          ij += NumMatrixTypes;
+          i1j += NumMatrixTypes;
+          ij1 += NumMatrixTypes;
+          i1j1 += NumMatrixTypes;
+        }
+        else{
+          ij += 3;
+          i1j += 3;
+          ij1 += 3;
+          i1j1 += 3;
+        }
+      }
+    }
+
+    return forwardPtr;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // ProbabilisticModel::ComputeBackwardMatrix()
+  //
+  // Computes a set of backward probability matrices for aligning
+  // seq1 and seq2.
+  //
+  // For efficiency reasons, a single-dimensional floating-point
+  // array is used here, with the following indexing scheme:
+  //
+  //    backward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
+  //    refers to the probability of starting in state i and
+  //    aligning from character j+1 to the end of the first
+  //    sequence and from character k+1 to the end of the second
+  //    sequence.
+  //
+  // flag selects the model exactly as in ComputeForwardMatrix()
+  // (true: NumMatrixTypes states per cell, false: 3 states per cell).
+  // The matrix is allocated with new and returned as a raw pointer.
+  // NOTE(review): "SafeVector::iterator" below has evidently lost its
+  // template argument (expected SafeVector<char>::iterator).
+  /////////////////////////////////////////////////////////////////
+
+  VF *ComputeBackwardMatrix (Sequence *seq1, Sequence *seq2, bool flag=true) const {
+
+    assert (seq1);
+    assert (seq2);
+
+    const int seq1Length = seq1->GetLength();
+    const int seq2Length = seq2->GetLength();
+    SafeVector::iterator iter1 = seq1->GetDataPtr();
+    SafeVector::iterator iter2 = seq2->GetDataPtr();
+
+    // create matrix (every cell starts at LOG_ZERO)
+    VF *backwardPtr;
+    if(flag) backwardPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO);
+    else backwardPtr = new VF (3 * (seq1Length+1) * (seq2Length+1), LOG_ZERO);
+    assert (backwardPtr);
+    VF &backward = *backwardPtr;
+
+    // initialization condition: the last cell is seeded with the
+    // initial distribution (first model only)
+    if(flag){
+      for (int k = 0; k < NumMatrixTypes; k++)
+        backward[NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1) + k] = initialDistribution[k];
+    }
+    // remember offset for each index combination:
+    //   ij = cell (i,j), i1j = (i+1,j), ij1 = (i,j+1), i1j1 = (i+1,j+1).
+    // The "next" offsets start past the end of the matrix; each is only
+    // used under the i < seq1Length / j < seq2Length guards below.
+    int ij = (seq1Length+1) * (seq2Length+1) - 1;
+    int i1j = ij + seq2Length + 1;
+    int ij1 = ij + 1;
+    int i1j1 = ij + seq2Length + 2;
+
+    // scale cell offsets by the number of states stored per cell
+    if(flag){
+      ij *= NumMatrixTypes;
+      i1j *= NumMatrixTypes;
+      ij1 *= NumMatrixTypes;
+      i1j1 *= NumMatrixTypes;
+    }
+    else{
+      ij *= 3;
+      i1j *= 3;
+      ij1 *= 3;
+      i1j1 *= 3;
+    }
+
+    // compute backward scores, walking from the last cell to the first;
+    // c1 / c2 are the characters consumed when moving forward out of
+    // cell (i,j)
+    for (int i = seq1Length; i >= 0; i--){
+      unsigned char c1 = (i == seq1Length) ? '~' : (unsigned char) iter1[i+1];
+      for (int j = seq2Length; j >= 0; j--){
+        unsigned char c2 = (j == seq2Length) ? '~' : (unsigned char) iter2[j+1];
+
+        // local model: the match state of every cell starts at LOG_ONE
+        if(!flag) backward[0 + ij] = LOG_ONE;//local
+        if (i < seq1Length && j < seq2Length){
+          if(flag){
+            // continue with a match at (i+1,j+1) from any state k
+            const float ProbXY = backward[0 + i1j1] + matchProb[c1][c2];
+            for (int k = 0; k < NumMatrixTypes; k++)
+              LOG_PLUS_EQUALS (backward[k + ij], ProbXY + transProb[k][0]);
+          }
+          //local
+          else{
+            const float ProbXY = backward[0 + i1j1] + matchProb[c1][c2] - insProb[c1][0] - insProb[c2][0];
+            for (int k = 0; k < 3; k++)
+              LOG_PLUS_EQUALS (backward[k + ij], ProbXY + local_transProb[k][0] - 2*random_transProb[1] );
+          }
+        }
+        if (i < seq1Length){
+          if(flag){
+            // continue with an insertion in seq1, from match or from
+            // the same insert state
+            for (int k = 0; k < NumInsertStates; k++){
+              LOG_PLUS_EQUALS (backward[0 + ij], backward[2*k+1 + i1j] + insProb[c1][k] + transProb[0][2*k+1]);
+              LOG_PLUS_EQUALS (backward[2*k+1 + ij], backward[2*k+1 + i1j] + insProb[c1][k] + transProb[2*k+1][2*k+1]);
+            }
+          }
+          //local
+          else{
+            LOG_PLUS_EQUALS (backward[0 + ij], backward[1 + i1j] + local_transProb[0][1] - random_transProb[1]);
+            LOG_PLUS_EQUALS (backward[1 + ij], backward[1 + i1j] + local_transProb[1][1] - random_transProb[1]);
+          }
+        }
+        if (j < seq2Length){
+          if(flag){
+            // continue with an insertion in seq2, from match or from
+            // the same insert state
+            for (int k = 0; k < NumInsertStates; k++){
+              LOG_PLUS_EQUALS (backward[0 + ij], backward[2*k+2 + ij1] + insProb[c2][k] + transProb[0][2*k+2]);
+              LOG_PLUS_EQUALS (backward[2*k+2 + ij], backward[2*k+2 + ij1] + insProb[c2][k] + transProb[2*k+2][2*k+2]);
+            }
+          }
+          //local
+          else{
+            LOG_PLUS_EQUALS (backward[0 + ij], backward[2 + ij1] + local_transProb[0][2] - random_transProb[1]);
+            LOG_PLUS_EQUALS (backward[2 + ij], backward[2 + ij1] + local_transProb[2][2] - random_transProb[1]);
+          }
+        }
+        // step all four offsets back to the previous cell
+        if(flag){
+          ij -= NumMatrixTypes;
+          i1j -= NumMatrixTypes;
+          ij1 -= NumMatrixTypes;
+          i1j1 -= NumMatrixTypes;
+        }
+        else{
+          ij -= 3;
+          i1j -= 3;
+          ij1 -= 3;
+          i1j1 -= 3;
+        }
+      }
+    }
+
+    return backwardPtr;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // ProbabilisticModel::ComputeTotalProbability()
+  //
+  // Computes the total probability of an alignment given
+  // the forward and backward matrices.
+  // flag: 1 probcons, 0 local
+  //
+  // Two totals are accumulated and their average is returned. With
+  // flag == true one is read at the last cell (summed over all
+  // states) and the other at the three seed cells (1,1), (1,0),
+  // (0,1). With flag == false both are sums over every cell with
+  // i > 0 and j > 0, using the state-0 entry of each cell.
+  // forward / backward must have been built with the same flag.
+  // seq1 / seq2 are dereferenced without a null check here.
+  // NOTE(review): "SafeVector::iterator" below has evidently lost its
+  // template argument (expected SafeVector<char>::iterator).
+  /////////////////////////////////////////////////////////////////
+
+  float ComputeTotalProbability (Sequence *seq1, Sequence *seq2,
+                                 const VF &forward, const VF &backward, bool flag=true) const {
+
+    // compute total probability
+    float totalForwardProb = LOG_ZERO;
+    float totalBackwardProb = LOG_ZERO;
+    const int seq1Length = seq1->GetLength();
+    const int seq2Length = seq2->GetLength();
+
+    if(flag){
+      // total read at the final cell, summed over all states
+      for (int k = 0; k < NumMatrixTypes; k++){
+        LOG_PLUS_EQUALS (totalForwardProb,
+                         forward[k + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] +
+                         backward[k + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
+      }
+
+      // total read at the three seed cells
+      totalBackwardProb =
+        forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] +
+        backward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)];
+
+      for (int k = 0; k < NumInsertStates; k++){
+        LOG_PLUS_EQUALS (totalBackwardProb,
+                         forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] +
+                         backward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)]);
+        LOG_PLUS_EQUALS (totalBackwardProb,
+                         forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] +
+                         backward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)]);
+      }
+    }
+    else{
+      SafeVector::iterator iter1 = seq1->GetDataPtr();
+      SafeVector::iterator iter2 = seq2->GetDataPtr();
+      // ij walks the state-0 entry of every cell (stride 3)
+      int ij = 0;
+      for (int i = 0; i <= seq1Length; i++){
+        unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
+        for (int j = 0; j <= seq2Length; j++){
+          unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
+          if(i>0&&j>0) {
+            LOG_PLUS_EQUALS (totalForwardProb,forward[ij]);
+            LOG_PLUS_EQUALS (totalBackwardProb,backward[ij] + matchProb[c1][c2]
+                             - insProb[c1][0] - insProb[c2][0] - 2*random_transProb[1]);
+          }
+          ij += 3;
+        }
+      }
+
+    }
+
+    return (totalForwardProb + totalBackwardProb) / 2;
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // ProbabilisticModel::ComputePosteriorMatrix()
+  //
+  // Computes the posterior probability matrix based on
+  // the forward and backward matrices.
+  // flag: 1 probcons, 0 local
+  //
+  // The result holds one value per cell, (seq1Length+1) *
+  // (seq2Length+1) in total, in row-major order: EXP of the state-0
+  // forward + backward entry minus the total, capped at LOG_ONE
+  // before exponentiation. Entry 0 is forced to 0. The vector is
+  // allocated with new and returned as a raw pointer.
+  /////////////////////////////////////////////////////////////////
+
+  VF *ComputePosteriorMatrix (Sequence *seq1, Sequence *seq2,
+                              const VF &forward, const VF &backward, bool flag=true) const {
+
+    assert (seq1);
+    assert (seq2);
+
+    const int seq1Length = seq1->GetLength();
+    const int seq2Length = seq2->GetLength();
+
+    float totalProb = ComputeTotalProbability (seq1, seq2,forward, backward, flag);
+
+    // compute posterior matrices
+    VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1)); assert (posteriorPtr);
+    VF &posterior = *posteriorPtr;
+
+    // ij indexes the state-0 entry of the current cell in forward /
+    // backward; ptr walks the (one value per cell) output
+    int ij = 0;
+    VF::iterator ptr = posterior.begin();
+
+    for (int i = 0; i <= seq1Length; i++){
+      for (int j = 0; j <= seq2Length; j++){
+        *(ptr++) = EXP (min (LOG_ONE, forward[ij] + backward[ij] - totalProb));
+        if(flag) ij += NumMatrixTypes;
+        else ij += 3;
+      }
+    }
+
+    posterior[0] = 0;
+
+    return posteriorPtr;
+  }
+
+  /*
+  /////////////////////////////////////////////////////////////////
+  // ProbabilisticModel::ComputeExpectedCounts()
+  //
+  // Computes the expected counts for the various transitions.
+ ///////////////////////////////////////////////////////////////// + + VVF *ComputeExpectedCounts () const { + + assert (seq1); + assert (seq2); + + const int seq1Length = seq1->GetLength(); + const int seq2Length = seq2->GetLength(); + SafeVector::iterator iter1 = seq1->GetDataPtr(); + SafeVector::iterator iter2 = seq2->GetDataPtr(); + + // compute total probability + float totalProb = ComputeTotalProbability (seq1Length, seq2Length, + forward, backward); + + // initialize expected counts + VVF *countsPtr = new VVF(NumMatrixTypes + 1, VF(NumMatrixTypes, LOG_ZERO)); assert (countsPtr); + VVF &counts = *countsPtr; + + // remember offset for each index combination + int ij = 0; + int i1j = -seq2Length - 1; + int ij1 = -1; + int i1j1 = -seq2Length - 2; + + ij *= NumMatrixTypes; + i1j *= NumMatrixTypes; + ij1 *= NumMatrixTypes; + i1j1 *= NumMatrixTypes; + + // compute expected counts + for (int i = 0; i <= seq1Length; i++){ + unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i]; + for (int j = 0; j <= seq2Length; j++){ + unsigned char c2 = (j == 0) ? 
'~' : (unsigned char) iter2[j]; + + if (i > 0 && j > 0){ + for (int k = 0; k < NumMatrixTypes; k++) + LOG_PLUS_EQUALS (counts[k][0], + forward[k + i1j1] + transProb[k][0] + + matchProb[c1][c2] + backward[0 + ij]); + } + if (i > 0){ + for (int k = 0; k < NumInsertStates; k++){ + LOG_PLUS_EQUALS (counts[0][2*k+1], + forward[0 + i1j] + transProb[0][2*k+1] + + insProb[c1][k] + backward[2*k+1 + ij]); + LOG_PLUS_EQUALS (counts[2*k+1][2*k+1], + forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] + + insProb[c1][k] + backward[2*k+1 + ij]); + } + } + if (j > 0){ + for (int k = 0; k < NumInsertStates; k++){ + LOG_PLUS_EQUALS (counts[0][2*k+2], + forward[0 + ij1] + transProb[0][2*k+2] + + insProb[c2][k] + backward[2*k+2 + ij]); + LOG_PLUS_EQUALS (counts[2*k+2][2*k+2], + forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] + + insProb[c2][k] + backward[2*k+2 + ij]); + } + } + + ij += NumMatrixTypes; + i1j += NumMatrixTypes; + ij1 += NumMatrixTypes; + i1j1 += NumMatrixTypes; + } + } + + // scale all expected counts appropriately + for (int i = 0; i < NumMatrixTypes; i++) + for (int j = 0; j < NumMatrixTypes; j++) + counts[i][j] -= totalProb; + + } + */ + + ///////////////////////////////////////////////////////////////// + // ProbabilisticModel::ComputeNewParameters() + // + // Computes a new parameter set based on the expected counts + // given. 
+  /////////////////////////////////////////////////////////////////
+
+  // Re-estimates the model parameters from one sequence pair.
+  //
+  //   seq1, seq2            the two sequences (asserted non-null);
+  //                         characters are upper-cased before use.
+  //   forward, backward     matrices in the NumMatrixTypes-per-cell
+  //                         layout (the total is obtained from
+  //                         ComputeTotalProbability() with its default
+  //                         flag).
+  //   initDistribMat        out: entry 0 and entries 2k+1 / 2k+2 for
+  //                         k < NumInsertStates are overwritten.
+  //   gapOpen, gapExtend    out: entries 2i and 2i+1 for
+  //                         i < NumInsertStates are overwritten.
+  //   emitPairs, emitSingle out: overwritten only when
+  //                         enableTrainEmissions is true.
+  //   enableTrainEmissions  whether emission counts are collected.
+  //
+  // All counts are accumulated with LOG_PLUS_EQUALS and converted with
+  // exp() at the end.
+  // NOTE(review): the divisions by inMatchStateCounts,
+  // inGapStateCounts, totalPairCounts and totalSingleCounts are not
+  // guarded against a zero denominator.
+  void ComputeNewParameters (Sequence *seq1, Sequence *seq2,
+                             const VF &forward, const VF &backward,
+                             VF &initDistribMat, VF &gapOpen,
+                             VF &gapExtend, VVF &emitPairs, VF &emitSingle, bool enableTrainEmissions) const {
+
+    assert (seq1);
+    assert (seq2);
+
+    const int seq1Length = seq1->GetLength();
+    const int seq2Length = seq2->GetLength();
+    SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
+    SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
+
+    // compute total probability
+    float totalProb = ComputeTotalProbability (seq1, seq2,
+                                               forward, backward);
+
+    // initialize expected counts
+    VVF transCounts (NumMatrixTypes, VF (NumMatrixTypes, LOG_ZERO));
+    VF initCounts (NumMatrixTypes, LOG_ZERO);
+    VVF pairCounts (256, VF (256, LOG_ZERO));
+    VF singleCounts (256, LOG_ZERO);
+
+    // remember offset for each index combination:
+    //   ij = cell (i,j), i1j = (i-1,j), ij1 = (i,j-1), i1j1 = (i-1,j-1)
+    int ij = 0;
+    int i1j = -seq2Length - 1;
+    int ij1 = -1;
+    int i1j1 = -seq2Length - 2;
+
+    ij *= NumMatrixTypes;
+    i1j *= NumMatrixTypes;
+    ij1 *= NumMatrixTypes;
+    i1j1 *= NumMatrixTypes;
+
+    // offsets of the cells the initial distribution is read from
+    const int lastCell = NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1);
+    const int cell11 = NumMatrixTypes * (1 * (seq2Length+1) + 1);
+    const int cell10 = NumMatrixTypes * (1 * (seq2Length+1) + 0);
+    const int cell01 = NumMatrixTypes * (0 * (seq2Length+1) + 1);
+
+    // compute initial distribution posteriors
+    initCounts[0] = LOG_ADD (forward[0 + cell11] + backward[0 + cell11],
+                             forward[0 + lastCell] + backward[0 + lastCell]);
+    for (int k = 0; k < NumInsertStates; k++){
+      initCounts[2*k+1] = LOG_ADD (forward[2*k+1 + cell10] + backward[2*k+1 + cell10],
+                                   forward[2*k+1 + lastCell] + backward[2*k+1 + lastCell]);
+      initCounts[2*k+2] = LOG_ADD (forward[2*k+2 + cell01] + backward[2*k+2 + cell01],
+                                   forward[2*k+2 + lastCell] + backward[2*k+2 + lastCell]);
+    }
+
+    // compute expected counts
+    // (the characters go through unsigned char before toupper(): the
+    // <cctype> functions are undefined for negative values other than
+    // EOF)
+    for (int i = 0; i <= seq1Length; i++){
+      unsigned char c1 = (i == 0) ? '~' : (unsigned char) toupper ((unsigned char) iter1[i]);
+      for (int j = 0; j <= seq2Length; j++){
+        unsigned char c2 = (j == 0) ? '~' : (unsigned char) toupper ((unsigned char) iter2[j]);
+
+        if (i > 0 && j > 0){
+          // cell (1,1) is entered from the initial distribution
+          if (enableTrainEmissions && i == 1 && j == 1){
+            LOG_PLUS_EQUALS (pairCounts[c1][c2],
+                             initialDistribution[0] + matchProb[c1][c2] + backward[0 + ij]);
+            LOG_PLUS_EQUALS (pairCounts[c2][c1],
+                             initialDistribution[0] + matchProb[c2][c1] + backward[0 + ij]);
+          }
+
+          // every other match cell is entered through a transition.
+          // The original condition read
+          //   enableTrainEmissions && i != 1 || j != 1
+          // which groups as (A && B) || C and therefore accumulated
+          // pair counts for every j != 1 cell even with emission
+          // training disabled.
+          const bool countPairs = enableTrainEmissions && (i != 1 || j != 1);
+
+          for (int k = 0; k < NumMatrixTypes; k++){
+            LOG_PLUS_EQUALS (transCounts[k][0],
+                             forward[k + i1j1] + transProb[k][0] +
+                             matchProb[c1][c2] + backward[0 + ij]);
+            if (countPairs){
+              LOG_PLUS_EQUALS (pairCounts[c1][c2],
+                               forward[k + i1j1] + transProb[k][0] +
+                               matchProb[c1][c2] + backward[0 + ij]);
+              LOG_PLUS_EQUALS (pairCounts[c2][c1],
+                               forward[k + i1j1] + transProb[k][0] +
+                               matchProb[c2][c1] + backward[0 + ij]);
+            }
+          }
+        }
+        if (i > 0){
+          for (int k = 0; k < NumInsertStates; k++){
+            LOG_PLUS_EQUALS (transCounts[0][2*k+1],
+                             forward[0 + i1j] + transProb[0][2*k+1] +
+                             insProb[c1][k] + backward[2*k+1 + ij]);
+            LOG_PLUS_EQUALS (transCounts[2*k+1][2*k+1],
+                             forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +
+                             insProb[c1][k] + backward[2*k+1 + ij]);
+            if (enableTrainEmissions){
+              if (i == 1 && j == 0){
+                LOG_PLUS_EQUALS (singleCounts[c1],
+                                 initialDistribution[2*k+1] + insProb[c1][k] + backward[2*k+1 + ij]);
+              }
+              else {
+                LOG_PLUS_EQUALS (singleCounts[c1],
+                                 forward[0 + i1j] + transProb[0][2*k+1] +
+                                 insProb[c1][k] + backward[2*k+1 + ij]);
+                LOG_PLUS_EQUALS (singleCounts[c1],
+                                 forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] +
+                                 insProb[c1][k] + backward[2*k+1 + ij]);
+              }
+            }
+          }
+        }
+        if (j > 0){
+          for (int k = 0; k < NumInsertStates; k++){
+            LOG_PLUS_EQUALS (transCounts[0][2*k+2],
+                             forward[0 + ij1] + transProb[0][2*k+2] +
+                             insProb[c2][k] + backward[2*k+2 + ij]);
+            LOG_PLUS_EQUALS (transCounts[2*k+2][2*k+2],
+                             forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +
+                             insProb[c2][k] + backward[2*k+2 + ij]);
+            if (enableTrainEmissions){
+              if (i == 0 && j == 1){
+                LOG_PLUS_EQUALS (singleCounts[c2],
+                                 initialDistribution[2*k+2] + insProb[c2][k] + backward[2*k+2 + ij]);
+              }
+              else {
+                LOG_PLUS_EQUALS (singleCounts[c2],
+                                 forward[0 + ij1] + transProb[0][2*k+2] +
+                                 insProb[c2][k] + backward[2*k+2 + ij]);
+                LOG_PLUS_EQUALS (singleCounts[c2],
+                                 forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] +
+                                 insProb[c2][k] + backward[2*k+2 + ij]);
+              }
+            }
+          }
+        }
+
+        ij += NumMatrixTypes;
+        i1j += NumMatrixTypes;
+        ij1 += NumMatrixTypes;
+        i1j1 += NumMatrixTypes;
+      }
+    }
+
+    // scale all expected counts appropriately
+    for (int i = 0; i < NumMatrixTypes; i++){
+      initCounts[i] -= totalProb;
+      for (int j = 0; j < NumMatrixTypes; j++)
+        transCounts[i][j] -= totalProb;
+    }
+    if (enableTrainEmissions){
+      for (int i = 0; i < 256; i++){
+        for (int j = 0; j < 256; j++)
+          pairCounts[i][j] -= totalProb;
+        singleCounts[i] -= totalProb;
+      }
+    }
+
+    // compute new initial distribution
+    float totalInitDistribCounts = 0;
+    for (int i = 0; i < NumMatrixTypes; i++)
+      totalInitDistribCounts += exp (initCounts[i]); // should be 2
+    initDistribMat[0] = min (1.0f, max (0.0f, (float) exp (initCounts[0]) / totalInitDistribCounts));
+    for (int k = 0; k < NumInsertStates; k++){
+      // the two insert states of a pair share one averaged value
+      float val = (exp (initCounts[2*k+1]) + exp (initCounts[2*k+2])) / 2;
+      initDistribMat[2*k+1] = initDistribMat[2*k+2] = min (1.0f, max (0.0f, val / totalInitDistribCounts));
+    }
+
+    // compute total counts for match state
+    float inMatchStateCounts = 0;
+    for (int i = 0; i < NumMatrixTypes; i++)
+      inMatchStateCounts += exp (transCounts[0][i]);
+    for (int i = 0; i < NumInsertStates; i++){
+
+      // compute total counts for gap state
+      float inGapStateCounts =
+        exp (transCounts[2*i+1][0]) +
+        exp (transCounts[2*i+1][2*i+1]) +
+        exp (transCounts[2*i+2][0]) +
+        exp (transCounts[2*i+2][2*i+2]);
+
+      gapOpen[2*i] = gapOpen[2*i+1] =
+        (exp (transCounts[0][2*i+1]) +
+         exp (transCounts[0][2*i+2])) /
+        (2 * inMatchStateCounts);
+
+      gapExtend[2*i] = gapExtend[2*i+1] =
+        (exp (transCounts[2*i+1][2*i+1]) +
+         exp (transCounts[2*i+2][2*i+2])) /
+        inGapStateCounts;
+    }
+
+    if (enableTrainEmissions){
+      float totalPairCounts = 0;
+      float totalSingleCounts = 0;
+      for (int i = 0; i < 256; i++){
+        for (int j = 0; j <= i; j++)
+          totalPairCounts += exp (pairCounts[j][i]);
+        totalSingleCounts += exp (singleCounts[i]);
+      }
+
+      // write each value for a character and for its lower-case form.
+      // i and j are passed to islower()/tolower() as ints in 0..255;
+      // the original cast them to char first, which is negative for
+      // i >= 128 where char is signed.
+      for (int i = 0; i < 256; i++) if (!islower (i)){
+        int li = (int)((unsigned char) tolower (i));
+        for (int j = 0; j <= i; j++) if (!islower (j)){
+          int lj = (int)((unsigned char) tolower (j));
+          emitPairs[i][j] = emitPairs[i][lj] = emitPairs[li][j] = emitPairs[li][lj] =
+            emitPairs[j][i] = emitPairs[j][li] = emitPairs[lj][i] = emitPairs[lj][li] = exp(pairCounts[j][i]) / totalPairCounts;
+        }
+        emitSingle[i] = emitSingle[li] = exp(singleCounts[i]) / totalSingleCounts;
+      }
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // ProbabilisticModel::ComputeAlignment()
+  //
+  // Computes an alignment based on the given posterior matrix.
+  // This is done by finding the maximum summing path (or
+  // maximum weight trace) through the posterior matrix.  The
+  // final alignment is returned as a pair consisting of:
+  //    (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
+  //        denote insertions in one of the two sequences and
+  //        B's denote that both sequences are present (i.e.
+  //        matches).
+  //    (2) a float indicating the sum achieved
+  /////////////////////////////////////////////////////////////////
+
+  // seq1Length / seq2Length are the two sequence lengths; posterior
+  // holds (seq1Length+1) * (seq2Length+1) values in row-major order
+  // and is read from row 1, column 1 onwards. The returned vector is
+  // allocated with new; presumably the caller deletes it -- confirm at
+  // call sites. Working buffers are containers, so they are released
+  // on every exit path.
+  pair<SafeVector<char> *, float> ComputeAlignment (int seq1Length, int seq2Length,
+                                                     const VF &posterior) const {
+
+    const int rowWidth = seq2Length + 1;
+
+    // only two score rows are kept alive at any time
+    VF rowStorage (2 * rowWidth);
+    float *oldRow = &rowStorage[0];
+    float *newRow = oldRow + rowWidth;
+
+    // one traceback character per cell: 'L' (left), 'U' (up), 'D' (diagonal)
+    SafeVector<char> tracebackMatrix ((seq1Length+1) * rowWidth);
+    char *tracebackPtr = &tracebackMatrix[0];
+
+    // skip row 0 of the posterior matrix
+    VF::const_iterator posteriorPtr = posterior.begin() + rowWidth;
+
+    // initialization: row 0 can only be reached by moving left
+    for (int i = 0; i <= seq2Length; i++){
+      oldRow[i] = 0;
+      *(tracebackPtr++) = 'L';
+    }
+
+    // fill in matrix
+    for (int i = 1; i <= seq1Length; i++){
+
+      // initialize left column (reached only by moving up); the
+      // posterior value of column 0 is skipped
+      newRow[0] = 0;
+      posteriorPtr++;
+      *(tracebackPtr++) = 'U';
+
+      // fill in rest of row
+      for (int j = 1; j <= seq2Length; j++){
+        ChooseBestOfThree (*(posteriorPtr++) + oldRow[j-1], newRow[j-1], oldRow[j],
+                           'D', 'L', 'U', &newRow[j], tracebackPtr++);
+      }
+
+      // swap rows
+      float *temp = oldRow;
+      oldRow = newRow;
+      newRow = temp;
+    }
+
+    // store best score (after the final swap the last row is oldRow)
+    float total = oldRow[seq2Length];
+
+    // compute traceback from the bottom-right cell back to (0,0)
+    SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
+    int r = seq1Length, c = seq2Length;
+    while (r != 0 || c != 0){
+      char ch = tracebackMatrix[r*rowWidth + c];
+      switch (ch){
+      case 'L': c--; alignment->push_back ('Y'); break;
+      case 'U': r--; alignment->push_back ('X'); break;
+      case 'D': c--; r--; alignment->push_back ('B'); break;
+      default: assert (false);
+      }
+    }
+
+    // the path was collected end-to-start
+    reverse (alignment->begin(), alignment->end());
+
+    return make_pair(alignment, total);
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // ProbabilisticModel::ComputeAlignmentWithGapPenalties()
+  //
+  // Similar to ComputeAlignment() except with gap penalties.
+  /////////////////////////////////////////////////////////////////
+
+  // align1 / align2 are the two groups of already aligned sequences;
+  // the length of sequence 0 of each group is used as the group's
+  // column count. posterior holds (seq1Length+1) * (seq2Length+1)
+  // values in row-major order. numSeqs1 / numSeqs2 size the penalty
+  // tables, so they must be at least the number of sequences in the
+  // respective group.
+  //
+  // Returns the alignment string (allocated with new) paired with the
+  // constant 1.0f; the best score is computed but not returned.
+  pair<SafeVector<char> *, float> ComputeAlignmentWithGapPenalties (MultiSequence *align1,
+                                                                     MultiSequence *align2,
+                                                                     const VF &posterior, int numSeqs1,
+                                                                     int numSeqs2,
+                                                                     float gapOpenPenalty,
+                                                                     float gapContinuePenalty) const {
+    int seq1Length = align1->GetSequence(0)->GetLength();
+    int seq2Length = align2->GetSequence(0)->GetLength();
+
+    // the number of active sequences at any given column is defined to be the
+    // number of non-gap characters in that column; the number of gap opens at
+    // any given column is defined to be the number of gap characters in that
+    // column where the previous character in the respective sequence was not
+    // a gap
+    // NOTE(review): the loops below count, for each column, the non-gap
+    // characters whose predecessor is also a non-gap, which does not
+    // match the definition above. Behaviour is left unchanged; confirm
+    // which one is intended.
+    SafeVector<int> numActive1 (seq1Length+1), numGapOpens1 (seq1Length+1);
+    SafeVector<int> numActive2 (seq2Length+1), numGapOpens2 (seq2Length+1);
+
+    // compute number of active sequences and gap opens for each group
+    for (int i = 0; i < align1->GetNumSequences(); i++){
+      SafeVector<char>::iterator dataPtr = align1->GetSequence(i)->GetDataPtr();
+      numActive1[0] = numGapOpens1[0] = 0;
+      for (int j = 1; j <= seq1Length; j++){
+        if (dataPtr[j] != '-'){
+          numActive1[j]++;
+          numGapOpens1[j] += (j != 1 && dataPtr[j-1] != '-');
+        }
+      }
+    }
+    for (int i = 0; i < align2->GetNumSequences(); i++){
+      SafeVector<char>::iterator dataPtr = align2->GetSequence(i)->GetDataPtr();
+      numActive2[0] = numGapOpens2[0] = 0;
+      for (int j = 1; j <= seq2Length; j++){
+        if (dataPtr[j] != '-'){
+          numActive2[j]++;
+          numGapOpens2[j] += (j != 1 && dataPtr[j-1] != '-');
+        }
+      }
+    }
+
+    // openingPenalty1 is indexed [group-1 count][group-2 count],
+    // openingPenalty2 is indexed [group-2 count][group-1 count]; the
+    // latter was previously allocated with the dimensions of the
+    // former, which overran the table whenever numSeqs2 != numSeqs1.
+    VVF openingPenalty1 (numSeqs1+1, VF (numSeqs2+1));
+    VF continuingPenalty1 (numSeqs1+1);
+    VVF openingPenalty2 (numSeqs2+1, VF (numSeqs1+1));
+    VF continuingPenalty2 (numSeqs2+1);
+
+    // precompute penalties
+    for (int i = 0; i <= numSeqs1; i++)
+      for (int j = 0; j <= numSeqs2; j++)
+        openingPenalty1[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs2 - j));
+    for (int i = 0; i <= numSeqs1; i++)
+      continuingPenalty1[i] = i * gapContinuePenalty * numSeqs2;
+    for (int i = 0; i <= numSeqs2; i++)
+      for (int j = 0; j <= numSeqs1; j++)
+        openingPenalty2[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs1 - j));
+    for (int i = 0; i <= numSeqs2; i++)
+      continuingPenalty2[i] = i * gapContinuePenalty * numSeqs1;
+
+    // six score rows (old/new for MATCH, INSERT X, INSERT Y) carved out
+    // of one container so they are released on every exit path
+    const int rowWidth = seq2Length + 1;
+    VF rowStorage (6 * rowWidth);
+    float *twoRows = &rowStorage[0];
+    float *oldRowMatch = twoRows;
+    float *newRowMatch = twoRows + rowWidth;
+    float *oldRowInsertX = twoRows + 2*rowWidth;
+    float *newRowInsertX = twoRows + 3*rowWidth;
+    float *oldRowInsertY = twoRows + 4*rowWidth;
+    float *newRowInsertY = twoRows + 5*rowWidth;
+
+    // three traceback characters per cell, in the order MATCH, INSERT X,
+    // INSERT Y; each names the state the best path came from
+    SafeVector<char> tracebackMatrix (3 * (seq1Length+1) * rowWidth);
+    char *tracebackPtr = &tracebackMatrix[0];
+
+    // skip row 0 of the posterior matrix
+    VF::const_iterator posteriorPtr = posterior.begin() + rowWidth;
+
+    // initialization: row 0 can only be reached through INSERT Y
+    for (int i = 0; i <= seq2Length; i++){
+      oldRowMatch[i] = oldRowInsertX[i] = (i == 0) ? 0 : LOG_ZERO;
+      oldRowInsertY[i] = (i == 0) ? 0 : oldRowInsertY[i-1] + continuingPenalty2[numActive2[i]];
+      *(tracebackPtr) = *(tracebackPtr+1) = *(tracebackPtr+2) = 'Y';
+      tracebackPtr += 3;
+    }
+
+    // fill in matrix
+    for (int i = 1; i <= seq1Length; i++){
+
+      // initialize left column: reachable only through INSERT X
+      newRowMatch[0] = newRowInsertY[0] = LOG_ZERO;
+      newRowInsertX[0] = oldRowInsertX[0] + continuingPenalty1[numActive1[i]];
+      posteriorPtr++;
+      *(tracebackPtr) = *(tracebackPtr+1) = *(tracebackPtr+2) = 'X';
+      tracebackPtr += 3;
+
+      // fill in rest of row
+      for (int j = 1; j <= seq2Length; j++){
+
+        // going to MATCH state
+        ChooseBestOfThree (oldRowMatch[j-1],
+                           oldRowInsertX[j-1],
+                           oldRowInsertY[j-1],
+                           'M', 'X', 'Y', &newRowMatch[j], tracebackPtr++);
+        newRowMatch[j] += *(posteriorPtr++);
+
+        // going to INSERT X state
+        ChooseBestOfThree (oldRowMatch[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]],
+                           oldRowInsertX[j] + continuingPenalty1[numActive1[i]],
+                           oldRowInsertY[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]],
+                           'M', 'X', 'Y', &newRowInsertX[j], tracebackPtr++);
+
+        // going to INSERT Y state
+        ChooseBestOfThree (newRowMatch[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]],
+                           newRowInsertX[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]],
+                           newRowInsertY[j-1] + continuingPenalty2[numActive2[j]],
+                           'M', 'X', 'Y', &newRowInsertY[j], tracebackPtr++);
+      }
+
+      // swap rows
+      float *temp;
+      temp = oldRowMatch; oldRowMatch = newRowMatch; newRowMatch = temp;
+      temp = oldRowInsertX; oldRowInsertX = newRowInsertX; newRowInsertX = temp;
+      temp = oldRowInsertY; oldRowInsertY = newRowInsertY; newRowInsertY = temp;
+    }
+
+    // pick the best final state; `total` is only needed as the output
+    // slot of ChooseBestOfThree, `matrix` starts the traceback
+    float total;
+    char matrix;
+    ChooseBestOfThree (oldRowMatch[seq2Length], oldRowInsertX[seq2Length], oldRowInsertY[seq2Length],
+                       'M', 'X', 'Y', &total, &matrix);
+
+    // compute traceback
+    SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
+    int r = seq1Length, c = seq2Length;
+    while (r != 0 || c != 0){
+
+      // read where the current state came from before stepping
+      int offset = (matrix == 'M') ? 0 : (matrix == 'X') ? 1 : 2;
+      char ch = tracebackMatrix[(r*rowWidth + c) * 3 + offset];
+      switch (matrix){
+      case 'Y': c--; alignment->push_back ('Y'); break;
+      case 'X': r--; alignment->push_back ('X'); break;
+      case 'M': c--; r--; alignment->push_back ('B'); break;
+      default: assert (false);
+      }
+      matrix = ch;
+    }
+
+    // the path was collected end-to-start
+    reverse (alignment->begin(), alignment->end());
+
+    return make_pair(alignment, 1.0f);
+  }
+
+  /////////////////////////////////////////////////////////////////
+  // ProbabilisticModel::ComputeViterbiAlignment()
+  //
+  // Computes the highest probability pairwise alignment using the
+  // probabilistic model.  The final alignment is returned as a
+  //  pair consisting of:
+  //    (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
+  //        denote insertions in one of the two sequences and
+  //        B's denote that both sequences are present (i.e.
+  //        matches).
+  //    (2) a float containing the log probability of the best
+  //        alignment (not used)
+  /////////////////////////////////////////////////////////////////
+
+
+  pair *, float> ComputeViterbiAlignment (Sequence *seq1, Sequence *seq2) const {
+
+    assert (seq1);
+    assert (seq2);
+
+    const int seq1Length = seq1->GetLength();
+    const int seq2Length = seq2->GetLength();
+
+    // retrieve the points to the beginning of each sequence
+    SafeVector::iterator iter1 = seq1->GetDataPtr();
+    SafeVector::iterator iter2 = seq2->GetDataPtr();
+
+    // create viterbi matrix
+    VF *viterbiPtr = new VF (3 * (seq1Length+1) * (seq2Length+1), LOG_ZERO);
+    assert (viterbiPtr);
+    VF &viterbi = *viterbiPtr;
+
+    // create traceback matrix
+    VI *tracebackPtr = new VI (3 * (seq1Length+1) * (seq2Length+1), -1);
+    assert (tracebackPtr);
+    VI &traceback = *tracebackPtr;
+
+    // initialization condition
+/*
+    for (int k = 0; k < NumMatrixTypes; k++)
+      viterbi[k] = initialDistribution[k];
+*/
+    viterbi[0] = LOG(0.6080327034);
+    viterbi[1] = LOG(0.1959836632);
+    viterbi[2] = LOG(0.1959836632);
+
+    //
remember offset for each index combination + int ij = 0; + int i1j = -seq2Length - 1; + int ij1 = -1; + int i1j1 = -seq2Length - 2; + + ij *= 3; + i1j *= 3; + ij1 *= 3; + i1j1 *= 3; + + // compute viterbi scores + for (int i = 0; i <= seq1Length; i++){ + unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i]; + for (int j = 0; j <= seq2Length; j++){ + unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j]; + + if (i > 0 && j > 0){ + for (int k = 0; k < 3; k++){ + float newVal = viterbi[k + i1j1] + local_transProb[k][0] + matchProb[c1][c2]; + if (viterbi[0 + ij] < newVal){ + viterbi[0 + ij] = newVal; + traceback[0 + ij] = k; + } + } + } + if (i > 0){ + for (int k = 0; k < 1; k++){ + float valFromMatch = insProb[c1][k] + viterbi[0 + i1j] + local_transProb[0][2*k+1]; + float valFromIns = insProb[c1][k] + viterbi[2*k+1 + i1j] + local_transProb[2*k+1][2*k+1]; + if (valFromMatch >= valFromIns){ + viterbi[2*k+1 + ij] = valFromMatch; + traceback[2*k+1 + ij] = 0; + } + else { + viterbi[2*k+1 + ij] = valFromIns; + traceback[2*k+1 + ij] = 2*k+1; + } + } + } + if (j > 0){ + for (int k = 0; k < 1; k++){ + float valFromMatch = insProb[c2][k] + viterbi[0 + ij1] + local_transProb[0][2*k+2]; + float valFromIns = insProb[c2][k] + viterbi[2*k+2 + ij1] + local_transProb[2*k+2][2*k+2]; + if (valFromMatch >= valFromIns){ + viterbi[2*k+2 + ij] = valFromMatch; + traceback[2*k+2 + ij] = 0; + } + else { + viterbi[2*k+2 + ij] = valFromIns; + traceback[2*k+2 + ij] = 2*k+2; + } + } + } + + ij += 3; + i1j += 3; + ij1 += 3; + i1j1 += 3; + } + } + + // figure out best terminating cell + float bestProb = LOG_ZERO; + int state = -1; + viterbi[0] = LOG(0.6080327034); + viterbi[1] = LOG(0.1959836632); + viterbi[2] = LOG(0.1959836632); + + for (int k = 0; k < 3; k++){ + float thisProb = viterbi[k + 3 * ((seq1Length+1)*(seq2Length+1) - 1)] + viterbi[k]; + if (bestProb < thisProb){ + bestProb = thisProb; + state = k; + } + } + assert (state != -1); + + delete viterbiPtr; + + // compute 
traceback + SafeVector *alignment = new SafeVector; assert (alignment); + int r = seq1Length, c = seq2Length; + while (r != 0 || c != 0){ + int newState = traceback[state + 3 * (r * (seq2Length+1) + c)]; + if (state == 0){ c--; r--; alignment->push_back ('B');} + else if (state % 2 == 1){ r--; alignment->push_back ('X'); } + else { c--; alignment->push_back ('Y'); } + state = newState; + } + + delete tracebackPtr; + + reverse (alignment->begin(), alignment->end()); + + return make_pair(alignment, bestProb); + } + + ///////////////////////////////////////////////////////////////// + // ProbabilisticModel::BuildPosterior() + // + // Builds a posterior probability matrix needed to align a pair + // of alignments. Mathematically, the returned matrix M is + // defined as follows: + // M[i,j] = sum sum f(s,t,i,j) + // s in align1 t in align2 + // where + // [ P(s[i'] <--> t[j']) + // [ if s[i'] is a letter in the ith column of align1 and + // [ t[j'] it a letter in the jth column of align2 + // f(s,t,i,j) = [ + // [ 0 otherwise + // + ///////////////////////////////////////////////////////////////// + + VF *BuildPosterior (MultiSequence *align1, MultiSequence *align2, + const SafeVector > &sparseMatrices, + float cutoff = 0.0f) const { + const int seq1Length = align1->GetSequence(0)->GetLength(); + const int seq2Length = align2->GetSequence(0)->GetLength(); + + VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1), 0); assert (posteriorPtr); + VF &posterior = *posteriorPtr; + VF::iterator postPtr = posterior.begin(); + + // for each s in align1 + for (int i = 0; i < align1->GetNumSequences(); i++){ + int first = align1->GetSequence(i)->GetLabel(); + SafeVector *mapping1 = align1->GetSequence(i)->GetMapping(); + + // for each t in align2 + for (int j = 0; j < align2->GetNumSequences(); j++){ + int second = align2->GetSequence(j)->GetLabel(); + SafeVector *mapping2 = align2->GetSequence(j)->GetMapping(); + + if (first < second){ + + // get the associated sparse matrix 
+ SparseMatrix *matrix = sparseMatrices[first][second]; + + for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++){ + SafeVector::iterator row = matrix->GetRowPtr(ii); + int base = (*mapping1)[ii] * (seq2Length+1); + int rowSize = matrix->GetRowSize(ii); + + // add in all relevant values + for (int jj = 0; jj < rowSize; jj++) + posterior[base + (*mapping2)[row[jj].first]] += row[jj].second; + + // subtract cutoff + for (int jj = 0; jj < matrix->GetSeq2Length(); jj++) + posterior[base + (*mapping2)[jj]] -= cutoff; + } + + } else { + + // get the associated sparse matrix + SparseMatrix *matrix = sparseMatrices[second][first]; + + for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++){ + SafeVector::iterator row = matrix->GetRowPtr(jj); + int base = (*mapping2)[jj]; + int rowSize = matrix->GetRowSize(jj); + + // add in all relevant values + for (int ii = 0; ii < rowSize; ii++) + posterior[base + (*mapping1)[row[ii].first] * (seq2Length + 1)] += row[ii].second; + + // subtract cutoff + for (int ii = 0; ii < matrix->GetSeq2Length(); ii++) + posterior[base + (*mapping1)[ii] * (seq2Length + 1)] -= cutoff; + } + + } + + + delete mapping2; + } + + delete mapping1; + } + + return posteriorPtr; + } + + //added by Liu Yongchao.Feb 23, 2010 + VF *BuildPosterior(int* seqsWeights, MultiSequence *align1, + MultiSequence *align2, + const SafeVector > &sparseMatrices, + float cutoff = 0.0f) const { + const int seq1Length = align1->GetSequence(0)->GetLength(); + const int seq2Length = align2->GetSequence(0)->GetLength(); + + VF *posteriorPtr = new VF((seq1Length + 1) * (seq2Length + 1), 0); + assert(posteriorPtr); + VF &posterior = *posteriorPtr; + VF::iterator postPtr = posterior.begin(); + + //compute the total sum of all weights + float totalWeights = 0; + for (int i = 0; i < align1->GetNumSequences(); i++) { + int first = align1->GetSequence(i)->GetLabel(); + int w1 = seqsWeights[first]; + for (int j = 0; j < align2->GetNumSequences(); j++) { + int second = 
align2->GetSequence(j)->GetLabel(); + int w2 = seqsWeights[second]; + + totalWeights += w1 * w2; + } + } + // for each s in align1 + for (int i = 0; i < align1->GetNumSequences(); i++) { + int first = align1->GetSequence(i)->GetLabel(); + int w1 = seqsWeights[first]; + SafeVector *mapping1 = align1->GetSequence(i)->GetMapping(); + // for each t in align2 + for (int j = 0; j < align2->GetNumSequences(); j++) { + int second = align2->GetSequence(j)->GetLabel(); + int w2 = seqsWeights[second]; + SafeVector *mapping2 = + align2->GetSequence(j)->GetMapping(); + + float w = (float) (w1 * w2) / totalWeights; + if (first < second) { + + // get the associated sparse matrix + SparseMatrix *matrix = sparseMatrices[first][second]; + + for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++) { + SafeVector::iterator row = matrix->GetRowPtr(ii); + int base = (*mapping1)[ii] * (seq2Length + 1); + int rowSize = matrix->GetRowSize(ii); + + // add in all relevant values + for (int jj = 0; jj < rowSize; jj++) + posterior[base + (*mapping2)[row[jj].first]] += w + * row[jj].second; + + // subtract cutoff + for (int jj = 0; jj < matrix->GetSeq2Length(); jj++) + posterior[base + (*mapping2)[jj]] -= w * cutoff; + } + + } else { + + // get the associated sparse matrix + SparseMatrix *matrix = sparseMatrices[second][first]; + + for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++) { + SafeVector::iterator row = matrix->GetRowPtr(jj); + int base = (*mapping2)[jj]; + int rowSize = matrix->GetRowSize(jj); + + // add in all relevant values + for (int ii = 0; ii < rowSize; ii++) + posterior[base + + (*mapping1)[row[ii].first] + * (seq2Length + 1)] += w + * row[ii].second; + + // subtract cutoff + for (int ii = 0; ii < matrix->GetSeq2Length(); ii++) + posterior[base + (*mapping1)[ii] * (seq2Length + 1)] -= + w * cutoff; + } + + } + + delete mapping2; + } + + delete mapping1; + } + + return posteriorPtr; + } +}; + +#endif diff --git a/binaries/src/GLProbs-1.0/SafeVector.h 
b/binaries/src/GLProbs-1.0/SafeVector.h new file mode 100644 index 0000000..f42c2da --- /dev/null +++ b/binaries/src/GLProbs-1.0/SafeVector.h @@ -0,0 +1,65 @@ +///////////////////////////////////////////////////////////////// +// SafeVector.h +// +// STL vector with array bounds checking. To enable bounds +// checking, #define ENABLE_CHECKS. +///////////////////////////////////////////////////////////////// + +#ifndef SAFEVECTOR_H +#define SAFEVECTOR_H + +#include +#include +using namespace std; + +///////////////////////////////////////////////////////////////// +// SafeVector +// +// Class derived from the STL std::vector for bounds checking. +///////////////////////////////////////////////////////////////// + +template +class SafeVector: public std::vector { +public: + + // miscellaneous constructors + SafeVector() : + std::vector() { + } + SafeVector(size_t size) : + std::vector(size) { + } + SafeVector(size_t size, const TYPE &value) : + std::vector(size, value) { + } + SafeVector(const SafeVector &source) : + std::vector(source) { + } + +#ifdef ENABLE_CHECKS + + // [] array bounds checking + TYPE &operator[](int index) { + assert (index >= 0 && index < (int) size()); + return std::vector::operator[] ((size_t) index); + } + + // [] const array bounds checking + const TYPE &operator[] (int index) const { + assert (index >= 0 && index < (int) size()); + return std::vector::operator[] ((size_t) index); + } + +#endif + +}; + +// some commonly used vector types +typedef SafeVector VI; +typedef SafeVector VVI; +typedef SafeVector VVVI; +typedef SafeVector VF; +typedef SafeVector VVF; +typedef SafeVector VVVF; + +#endif diff --git a/binaries/src/GLProbs-1.0/ScoreType.h b/binaries/src/GLProbs-1.0/ScoreType.h new file mode 100644 index 0000000..47de13d --- /dev/null +++ b/binaries/src/GLProbs-1.0/ScoreType.h @@ -0,0 +1,368 @@ +///////////////////////////////////////////////////////////////// +// ScoreType.h +// +// Routines for doing math operations in MSAPROBS 
+///////////////////////////////////////////////////////////////// + +#ifndef SCORETYPE_H +#define SCORETYPE_H + +#include +#include +#include +#include + +typedef float ScoreType; + +const float LOG_ZERO = -2e20; +const float LOG_ONE = 0.0; + +///////////////////////////////////////////////////////////////// +// LOG() +// +// Compute the logarithm of x. +///////////////////////////////////////////////////////////////// + +inline ScoreType LOG(ScoreType x) { + return log(x); +} + +///////////////////////////////////////////////////////////////// +// EXP() +// +// Computes exp(x). +///////////////////////////////////////////////////////////////// + +inline ScoreType EXP(ScoreType x) { + //return exp(x); + if (x > -2) { + if (x > -0.5) { + if (x > 0) + return exp(x); + return (((0.03254409303190190000 * x + 0.16280432765779600000) * x + + 0.49929760485974900000) * x + 0.99995149601363700000) * x + + 0.99999925508501600000; + } + if (x > -1) + return (((0.01973899026052090000 * x + 0.13822379685007000000) * x + + 0.48056651562365000000) * x + 0.99326940370383500000) * x + + 0.99906756856399500000; + return (((0.00940528203591384000 * x + 0.09414963667859410000) * x + + 0.40825793595877300000) * x + 0.93933625499130400000) * x + + 0.98369508190545300000; + } + if (x > -8) { + if (x > -4) + return (((0.00217245711583303000 * x + 0.03484829428350620000) * x + + 0.22118199801337800000) * x + 0.67049462206469500000) * x + + 0.83556950223398500000; + return (((0.00012398771025456900 * x + 0.00349155785951272000) * x + + 0.03727721426017900000) * x + 0.17974997741536900000) * x + + 0.33249299994217400000; + } + if (x > -16) + return (((0.00000051741713416603 * x + 0.00002721456879608080) * x + + 0.00053418601865636800) * x + 0.00464101989351936000) * x + + 0.01507447981459420000; + return 0; +} + +/* + ///////////////////////////////////////////////////////////////// + // LOOKUP() + // + // Computes log (exp (x) + 1), for 0 <= x <= 7.5. 
+ ///////////////////////////////////////////////////////////////// + + inline ScoreType LOOKUP (ScoreType x){ + //return log (exp(x) + 1); + if (x < 2){ + if (x < 0.5){ + if (x < 0) + return log (exp(x) + 1); + return (((-0.00486373205785640000*x - 0.00020245408813934800)*x + 0.12504222666029800000)*x + 0.49999685320563000000)*x + 0.69314723138948900000; + } + if (x < 1) + return (((-0.00278634205460548000*x - 0.00458097251248546000)*x + 0.12865849880472500000)*x + 0.49862228499205200000)*x + 0.69334810088688000000; + return (((0.00059633755154209200*x - 0.01918996666063320000)*x + 0.15288232492093800000)*x + 0.48039958825756900000)*x + 0.69857578503189200000; + } + if (x < 8){ + if (x < 4) + return (((0.00135958539181047000*x - 0.02329807659316430000)*x + 0.15885799609532100000)*x + 0.48167498563270800000)*x + 0.69276185058669200000; + return (((0.00011992394456683500*x - 0.00338464503306568000)*x + 0.03622746366545470000)*x + 0.82481250248383700000)*x + 0.32507892994863100000; + } + if (x < 16) + return (((0.00000051726300753785*x - 0.00002720671238876090)*x + 0.00053403733818413500)*x + 0.99536021775747900000)*x + 0.01507065715532010000; + return x; + } + + ///////////////////////////////////////////////////////////////// + // LOOKUP_SLOW() + // + // Computes log (exp (x) + 1). 
+ ///////////////////////////////////////////////////////////////// + + inline ScoreType LOOKUP_SLOW (ScoreType x){ + return log (exp (x) + 1); + } + + ///////////////////////////////////////////////////////////////// + // MAX() + // + // Compute max of three numbers + ///////////////////////////////////////////////////////////////// + + inline ScoreType MAX (ScoreType x, ScoreType y, ScoreType z){ + if (x >= y){ + if (x >= z) + return x; + return z; + } + if (y >= z) + return y; + return z; + } + + ///////////////////////////////////////////////////////////////// + // LOG_PLUS_EQUALS() + // + // Add two log probabilities and store in the first argument + ///////////////////////////////////////////////////////////////// + + inline void LOG_PLUS_EQUALS (ScoreType &x, ScoreType y){ + if (x < y) + x = (x <= LOG_ZERO) ? y : LOOKUP(y-x) + x; + else + x = (y <= LOG_ZERO) ? x : LOOKUP(x-y) + y; + } + + ///////////////////////////////////////////////////////////////// + // LOG_PLUS_EQUALS_SLOW() + // + // Add two log probabilities and store in the first argument + ///////////////////////////////////////////////////////////////// + + inline void LOG_PLUS_EQUALS_SLOW (ScoreType &x, ScoreType y){ + if (x < y) + x = (x <= LOG_ZERO) ? y : LOOKUP_SLOW(y-x) + x; + else + x = (y <= LOG_ZERO) ? x : LOOKUP_SLOW(x-y) + y; + } + + ///////////////////////////////////////////////////////////////// + // LOG_ADD() + // + // Add two log probabilities + ///////////////////////////////////////////////////////////////// + + inline ScoreType LOG_ADD (ScoreType x, ScoreType y){ + if (x < y) return (x <= LOG_ZERO) ? y : LOOKUP(y-x) + x; + return (y <= LOG_ZERO) ? x : LOOKUP(x-y) + y; + } + */ + +/* + ///////////////////////////////////////////////////////////////// + // LOG() + // + // Compute the logarithm of x. 
+ ///////////////////////////////////////////////////////////////// + + inline float LOG (float x){ + return log (x); + } + + ///////////////////////////////////////////////////////////////// + // EXP() + // + // Computes exp(x), fr -4.6 <= x <= 0. + ///////////////////////////////////////////////////////////////// + + inline float EXP (float x){ + assert (x <= 0.00f); + if (x < EXP_UNDERFLOW_THRESHOLD) return 0.0f; + return (((0.006349841068584 * x + 0.080775412572352) * x + 0.397982026296272) * x + 0.95279335963787f) * x + 0.995176455837312f; + //return (((0.00681169825657f * x + 0.08386267698832f) * x + 0.40413983195844f) * x + 0.95656674979767f) * x + 0.99556744049130f; + } + */ + +const float EXP_UNDERFLOW_THRESHOLD = -4.6; +const float LOG_UNDERFLOW_THRESHOLD = 7.5; + +///////////////////////////////////////////////////////////////// +// LOOKUP() +// +// Computes log (exp (x) + 1), for 0 <= x <= 7.5. +///////////////////////////////////////////////////////////////// + +inline float LOOKUP(float x) { + assert(x >= 0.00f); + assert(x <= LOG_UNDERFLOW_THRESHOLD); + //return ((-0.00653779113685f * x + 0.09537236626558f) * x + 0.55317574459331f) * x + 0.68672959851568f; + if (x <= 1.00f) + return ((-0.009350833524763f * x + 0.130659527668286f) * x + + 0.498799810682272f) * x + 0.693203116424741f; + if (x <= 2.50f) + return ((-0.014532321752540f * x + 0.139942324101744f) * x + + 0.495635523139337f) * x + 0.692140569840976f; + if (x <= 4.50f) + return ((-0.004605031767994f * x + 0.063427417320019f) * x + + 0.695956496475118f) * x + 0.514272634594009f; + assert(x <= LOG_UNDERFLOW_THRESHOLD); + return ((-0.000458661602210f * x + 0.009695946122598f) * x + + 0.930734667215156f) * x + 0.168037164329057f; + + //return (((0.00089738532761f * x - 0.01859488697982f) * x + 0.14415772028626f) * x + 0.49515490689159f) * x + 0.69311928966454f; +} + +///////////////////////////////////////////////////////////////// +// LOOKUP_SLOW() +// +// Computes log (exp (x) + 1). 
+///////////////////////////////////////////////////////////////// + +inline float LOOKUP_SLOW(float x) { + return log(exp(x) + 1); +} + +///////////////////////////////////////////////////////////////// +// MAX() +// +// Compute max of three numbers +///////////////////////////////////////////////////////////////// + +inline float MAX(float x, float y, float z) { + if (x >= y) { + if (x >= z) + return x; + return z; + } + if (y >= z) + return y; + return z; +} + +///////////////////////////////////////////////////////////////// +// LOG_PLUS_EQUALS() +// +// Add two log probabilities and store in the first argument +///////////////////////////////////////////////////////////////// + +inline void LOG_PLUS_EQUALS(float &x, float y) { + if (x < y) + x = (x == LOG_ZERO || y - x >= LOG_UNDERFLOW_THRESHOLD) ? + y : LOOKUP(y - x) + x; + else + x = (y == LOG_ZERO || x - y >= LOG_UNDERFLOW_THRESHOLD) ? + x : LOOKUP(x - y) + y; +} + +///////////////////////////////////////////////////////////////// +// LOG_PLUS_EQUALS_SLOW() +// +// Add two log probabilities and store in the first argument +///////////////////////////////////////////////////////////////// + +inline void LOG_PLUS_EQUALS_SLOW(float &x, float y) { + if (x < y) + x = (x == LOG_ZERO) ? y : LOOKUP_SLOW(y - x) + x; + else + x = (y == LOG_ZERO) ? x : LOOKUP_SLOW(x - y) + y; +} + +///////////////////////////////////////////////////////////////// +// LOG_ADD() +// +// Add two log probabilities +///////////////////////////////////////////////////////////////// + +inline float LOG_ADD(float x, float y) { + if (x < y) + return (x == LOG_ZERO || y - x >= LOG_UNDERFLOW_THRESHOLD) ? + y : LOOKUP(y - x) + x; + return (y == LOG_ZERO || x - y >= LOG_UNDERFLOW_THRESHOLD) ? 
+ x : LOOKUP(x - y) + y; +} + +///////////////////////////////////////////////////////////////// +// LOG_ADD() +// +// Add three log probabilities +///////////////////////////////////////////////////////////////// + +inline float LOG_ADD(float x1, float x2, float x3) { + return LOG_ADD(x1, LOG_ADD(x2, x3)); +} + +///////////////////////////////////////////////////////////////// +// LOG_ADD() +// +// Add four log probabilities +///////////////////////////////////////////////////////////////// + +inline float LOG_ADD(float x1, float x2, float x3, float x4) { + return LOG_ADD(x1, LOG_ADD(x2, LOG_ADD(x3, x4))); +} + +///////////////////////////////////////////////////////////////// +// LOG_ADD() +// +// Add five log probabilities +///////////////////////////////////////////////////////////////// + +inline float LOG_ADD(float x1, float x2, float x3, float x4, float x5) { + return LOG_ADD(x1, LOG_ADD(x2, LOG_ADD(x3, LOG_ADD(x4, x5)))); +} + +///////////////////////////////////////////////////////////////// +// LOG_ADD() +// +// Add siz log probabilities +///////////////////////////////////////////////////////////////// + +inline float LOG_ADD(float x1, float x2, float x3, float x4, float x5, + float x6) { + return LOG_ADD(x1, LOG_ADD(x2, LOG_ADD(x3, LOG_ADD(x4, LOG_ADD(x5, x6))))); +} + +///////////////////////////////////////////////////////////////// +// LOG_ADD() +// +// Add seven log probabilities +///////////////////////////////////////////////////////////////// + +inline float LOG_ADD(float x1, float x2, float x3, float x4, float x5, float x6, + float x7) { + return LOG_ADD(x1, + LOG_ADD(x2, LOG_ADD(x3, LOG_ADD(x4, LOG_ADD(x5, LOG_ADD(x6, x7)))))); +} + +///////////////////////////////////////////////////////////////// +// ChooseBestOfThree() +// +// Store the largest of three values x1, x2, and x3 in *x. Also +// if xi is the largest value, then store bi in *b. 
+///////////////////////////////////////////////////////////////// + +inline void ChooseBestOfThree(float x1, float x2, float x3, char b1, char b2, + char b3, float *x, char *b) { + if (x1 >= x2) { + if (x1 >= x3) { + *x = x1; + *b = b1; + return; + } + *x = x3; + *b = b3; + return; + } + if (x2 >= x3) { + *x = x2; + *b = b2; + return; + } + *x = x3; + *b = b3; +} + +#endif diff --git a/binaries/src/GLProbs-1.0/Sequence.h b/binaries/src/GLProbs-1.0/Sequence.h new file mode 100644 index 0000000..5bd1ef9 --- /dev/null +++ b/binaries/src/GLProbs-1.0/Sequence.h @@ -0,0 +1,444 @@ +///////////////////////////////////////////////////////////////// +// Sequence.h +// +// Class for reading/manipulating single sequence character data. +///////////////////////////////////////////////////////////////// + +#ifndef SEQUENCE_H +#define SEQUENCE_H + +#include +#include +#include +#include +#include +#include "SafeVector.h" +#include "FileBuffer.h" + +///////////////////////////////////////////////////////////////// +// Sequence +// +// Class for storing sequence information. +///////////////////////////////////////////////////////////////// + +class Sequence { + + bool isValid; // a boolean indicating whether the sequence data is valid or not + string header; // string containing the comment line of the FASTA file + SafeVector *data; // pointer to character data + int length; // length of the sequence + int sequenceLabel; // integer sequence label, typically to indicate the ordering of sequences + // in a Multi-FASTA file + int inputLabel; // position of sequence in original input + + ///////////////////////////////////////////////////////////////// + // Sequence::Sequence() + // + // Default constructor. Does nothing. 
+ ///////////////////////////////////////////////////////////////// + + Sequence() : + isValid(false), header(""), data(NULL), length(0), sequenceLabel(0), inputLabel( + 0) { + } + +public: + + ///////////////////////////////////////////////////////////////// + // Sequence::Sequence() + // + // Constructor. Reads the sequence from a FileBuffer. + ///////////////////////////////////////////////////////////////// + + Sequence(FileBuffer &infile, bool stripGaps = false) : + isValid(false), header("~"), data(NULL), length(0), sequenceLabel( + 0), inputLabel(0) { + + // read until the first non-blank line + while (!infile.eof()) { + infile.GetLine(header); + if (header.length() != 0) + break; + } + + // check to make sure that it is a correct header line + if (header[0] == '>') { + + // if so, remove the leading ">" + header = header.substr(1); + + // remove any leading or trailing white space in the header comment + while (header.length() > 0 && isspace(header[0])) + header = header.substr(1); + while (header.length() > 0 && isspace(header[header.length() - 1])) + header = header.substr(0, header.length() - 1); + + // get ready to read the data[] array; note that data[0] is always '@' + char ch; + data = new SafeVector; + assert(data); + data->push_back('@'); + + // get a character from the file + while (infile.Get(ch)) { + + // if we've reached a new comment line, put the character back and stop + if (ch == '>') { + infile.UnGet(); + break; + } + + // skip whitespace + if (isspace(ch)) + continue; + + // substitute gap character + if (ch == '.') + ch = '-'; + if (stripGaps && ch == '-') + continue; + + // check for known characters + if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))) { + cerr << "ERROR: Unknown character encountered: " << ch + << endl; + exit(1); + } + + // everything's ok so far, so just store this character. + if (ch >= 'a' && ch <= 'z') { + ch = ch - 'a' + 'A'; + } //change to upper case. 
fixed by Liu Yongchao, May 21, 2010 + + data->push_back(ch); + ++length; + } + + // sequence must contain data in order to be valid + isValid = length > 0; + if (!isValid) { + delete data; + data = NULL; + } + } + } + + ///////////////////////////////////////////////////////////////// + // Sequence::Sequence() + // + // Constructor. Builds a sequence from existing data. Note + // that the data must use one-based indexing where data[0] should + // be set to '@'. + ///////////////////////////////////////////////////////////////// + + Sequence(SafeVector *data, string header, int length, + int sequenceLabel, int inputLabel) : + isValid(data != NULL), header(header), data(data), length(length), sequenceLabel( + sequenceLabel), inputLabel(inputLabel) { + assert(data); + assert((*data)[0] == '@'); + } + + ///////////////////////////////////////////////////////////////// + // Sequence::Sequence() + // + // Destructor. Release allocated memory. + ///////////////////////////////////////////////////////////////// + + ~Sequence() { + if (data) { + assert(isValid); + delete data; + data = NULL; + isValid = false; + } + } + + ///////////////////////////////////////////////////////////////// + // Sequence::GetHeader() + // + // Return the string comment associated with this sequence. + ///////////////////////////////////////////////////////////////// + + string GetHeader() const { + return header; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::GetName() + // + // Return the first word of the string comment associated with this sequence. + ///////////////////////////////////////////////////////////////// + + string GetName() const { + char name[1024]; + sscanf(header.c_str(), "%s", name); + return string(name); + } + + ///////////////////////////////////////////////////////////////// + // Sequence::GetDataPtr() + // + // Return the iterator to data associated with this sequence. 
+ ///////////////////////////////////////////////////////////////// + + SafeVector::iterator GetDataPtr() { + assert(isValid); + assert(data); + return data->begin(); + } + + ///////////////////////////////////////////////////////////////// + // Sequence::GetPosition() + // + // Return the character at position i. Recall that the character + // data is stored with one-based indexing. + ///////////////////////////////////////////////////////////////// + + char GetPosition(int i) const { + assert(isValid); + assert(data); + assert(i >= 1 && i <= length); + return (*data)[i]; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::SetLabel() + // + // Sets the sequence label to i. + ///////////////////////////////////////////////////////////////// + + void SetLabel(int i) { + assert(isValid); + sequenceLabel = i; + inputLabel = i; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::SetSortLabel() + // + // Sets the sequence sorting label to i. + ///////////////////////////////////////////////////////////////// + + void SetSortLabel(int i) { + assert(isValid); + sequenceLabel = i; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::GetLabel() + // + // Retrieves the input label. + ///////////////////////////////////////////////////////////////// + + int GetLabel() const { + assert(isValid); + return inputLabel; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::GetSortLabel() + // + // Retrieves the sorting label. + ///////////////////////////////////////////////////////////////// + + int GetSortLabel() const { + assert(isValid); + return sequenceLabel; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::Fail() + // + // Checks to see if the sequence successfully loaded. 
+ ///////////////////////////////////////////////////////////////// + + bool Fail() const { + return !isValid; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::Length() + // + // Returns the length of the sequence. + ///////////////////////////////////////////////////////////////// + + int GetLength() const { + assert(isValid); + assert(data); + return length; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::WriteMFA() + // + // Writes the sequence to outfile in MFA format. Uses numColumns + // columns per line. If useIndex is set to false, then the + // header is printed as normal, but if useIndex is true, then + // ">S###" is printed where ### represents the sequence label. + ///////////////////////////////////////////////////////////////// + + void WriteMFA(ostream &outfile, int numColumns, + bool useIndex = false) const { + assert(isValid); + assert(data); + assert(!outfile.fail()); + + // print out heading + if (useIndex) + outfile << ">S" << GetLabel() << endl; + else + outfile << ">" << header << endl; + + // print out character data + int ct = 1; + for (; ct <= length; ct++) { + outfile << (*data)[ct]; + if (ct % numColumns == 0) + outfile << endl; + } + if ((ct - 1) % numColumns != 0) + outfile << endl; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::Clone() + // + // Returns a new deep copy of the seqeuence. 
+ ///////////////////////////////////////////////////////////////// + + Sequence *Clone() const { + Sequence *ret = new Sequence(); + assert(ret); + + ret->isValid = isValid; + ret->header = header; + ret->data = new SafeVector; + assert(ret->data); + *(ret->data) = *data; + ret->length = length; + ret->sequenceLabel = sequenceLabel; + ret->inputLabel = inputLabel; + + return ret; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::GetRange() + // + // Returns a new sequence object consisting of a range of + // characters from the current seuquence. + ///////////////////////////////////////////////////////////////// + + Sequence *GetRange(int start, int end) const { + Sequence *ret = new Sequence(); + assert(ret); + + assert(start >= 1 && start <= length); + assert(end >= 1 && end <= length); + assert(start <= end); + + ret->isValid = isValid; + ret->header = header; + ret->data = new SafeVector; + assert(ret->data); + ret->data->push_back('@'); + for (int i = start; i <= end; i++) + ret->data->push_back((*data)[i]); + ret->length = end - start + 1; + ret->sequenceLabel = sequenceLabel; + ret->inputLabel = inputLabel; + + return ret; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::AddGaps() + // + // Given an SafeVector containing the skeleton for an + // alignment and the identity of the current character, this + // routine will create a new sequence with all necesssary gaps added. 
+ // For instance, + // alignment = "XXXBBYYYBBYYXX" + // id = 'X' + // will perform the transformation + // "ATGCCGTCA" --> "ATGCC---GT--CA" + // (XXXBBYYYBBYYXX) + ///////////////////////////////////////////////////////////////// + + Sequence *AddGaps(SafeVector *alignment, char id) { + Sequence *ret = new Sequence(); + assert(ret); + + ret->isValid = isValid; + ret->header = header; + ret->data = new SafeVector; + assert(ret->data); + ret->length = (int) alignment->size(); + ret->sequenceLabel = sequenceLabel; + ret->inputLabel = inputLabel; + ret->data->push_back('@'); + + SafeVector::iterator dataIter = data->begin() + 1; + for (SafeVector::iterator iter = alignment->begin(); + iter != alignment->end(); ++iter) { + if (*iter == 'B' || *iter == id) { + ret->data->push_back(*dataIter); + ++dataIter; + } else + ret->data->push_back('-'); + } + + return ret; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::GetString() + // + // Returns the sequence as a string with gaps removed. + ///////////////////////////////////////////////////////////////// + + string GetString() { + string s = ""; + for (int i = 1; i <= length; i++) { + if ((*data)[i] != '-') + s += (*data)[i]; + } + return s; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::GetMapping() + // + // Returns a SafeVector containing the indices of every + // character in the sequence. For instance, if the data is + // "ATGCC---GT--CA", the method returns {1,2,3,4,5,9,10,13,14}. 
+ ///////////////////////////////////////////////////////////////// + + SafeVector *GetMapping() const { + SafeVector *ret = new SafeVector(1, 0); + for (int i = 1; i <= length; i++) { + if ((*data)[i] != '-') + ret->push_back(i); + } + return ret; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::Highlight() + // + // Changes all positions with score >= cutoff to upper case and + // all positions with score < cutoff to lower case. + ///////////////////////////////////////////////////////////////// + + void Highlight(const SafeVector &scores, const float cutoff) { + for (int i = 1; i <= length; i++) { + if (scores[i - 1] >= cutoff) + (*data)[i] = toupper((*data)[i]); + else + (*data)[i] = tolower((*data)[i]); + } + } +}; + +#endif diff --git a/binaries/src/GLProbs-1.0/SparseMatrix.h b/binaries/src/GLProbs-1.0/SparseMatrix.h new file mode 100644 index 0000000..51b273d --- /dev/null +++ b/binaries/src/GLProbs-1.0/SparseMatrix.h @@ -0,0 +1,266 @@ +///////////////////////////////////////////////////////////////// +// SparseMatrix.h +// +// Sparse matrix computations +///////////////////////////////////////////////////////////////// + +#ifndef SPARSEMATRIX_H +#define SPARSEMATRIX_H + +#include + +using namespace std; + +const float POSTERIOR_CUTOFF = 0.01; // minimum posterior probability +// value that is maintained in the +// sparse matrix representation + +typedef pair PIF; // Sparse matrix entry type +// first --> column +// second --> value + +///////////////////////////////////////////////////////////////// +// SparseMatrix +// +// Class for sparse matrix computations +///////////////////////////////////////////////////////////////// + +class SparseMatrix { + + int seq1Length, seq2Length; // dimensions of matrix + VI rowSize; // rowSize[i] = # of cells in row i + SafeVector data; // data values + SafeVector::iterator> rowPtrs; // pointers to the beginning of each row + + 
///////////////////////////////////////////////////////////////// + // SparseMatrix::SparseMatrix() + // + // Private constructor. + ///////////////////////////////////////////////////////////////// + + SparseMatrix() { + } + +public: + + ///////////////////////////////////////////////////////////////// + // SparseMatrix::SparseMatrix() + // + // Constructor. Builds a sparse matrix from a posterior matrix. + // Note that the expected format for the posterior matrix is as + // a (seq1Length+1) x (seq2Length+1) matrix where the 0th row + // and 0th column are ignored (they should contain all zeroes). + ///////////////////////////////////////////////////////////////// + + SparseMatrix(int seq1Length, int seq2Length, const VF &posterior) : + seq1Length(seq1Length), seq2Length(seq2Length) { + + int numCells = 0; + + assert(seq1Length > 0); + assert(seq2Length > 0); + + // calculate memory required; count the number of cells in the + // posterior matrix above the threshold + VF::const_iterator postPtr = posterior.begin(); + for (int i = 0; i <= seq1Length; i++) { + for (int j = 0; j <= seq2Length; j++) { + if (*(postPtr++) >= POSTERIOR_CUTOFF) { + assert(i != 0 && j != 0); + numCells++; + } + } + } + + // allocate memory + data.resize(numCells); + rowSize.resize(seq1Length + 1); + rowSize[0] = -1; + rowPtrs.resize(seq1Length + 1); + rowPtrs[0] = data.end(); + + // build sparse matrix + postPtr = posterior.begin() + seq2Length + 1; // note that we're skipping the first row here + SafeVector::iterator dataPtr = data.begin(); + for (int i = 1; i <= seq1Length; i++) { + postPtr++; // and skipping the first column of each row + rowPtrs[i] = dataPtr; + for (int j = 1; j <= seq2Length; j++) { + if (*postPtr >= POSTERIOR_CUTOFF) { + dataPtr->first = j; + dataPtr->second = *postPtr; + dataPtr++; + } + postPtr++; + } + rowSize[i] = dataPtr - rowPtrs[i]; + } + } + + ///////////////////////////////////////////////////////////////// + // SparseMatrix::GetRowPtr() + // + // Returns 
the pointer to a particular row in the sparse matrix. + ///////////////////////////////////////////////////////////////// + + SafeVector::iterator GetRowPtr(int row) const { + assert(row >= 1 && row <= seq1Length); + return rowPtrs[row]; + } + + ///////////////////////////////////////////////////////////////// + // SparseMatrix::GetValue() + // + // Returns the value at a particular row, column, or 0 if that + // cell is not stored in the sparse matrix. + ///////////////////////////////////////////////////////////////// + + float GetValue(int row, int col) { + assert(row >= 1 && row <= seq1Length); + assert(col >= 1 && col <= seq2Length); + for (int i = 0; i < rowSize[row]; i++) { + if (rowPtrs[row][i].first == col) + return rowPtrs[row][i].second; + } + return 0; + } + + ///////////////////////////////////////////////////////////////// + // SparseMatrix::GetRowSize() + // + // Returns the number of entries in a particular row. + ///////////////////////////////////////////////////////////////// + + int GetRowSize(int row) const { + assert(row >= 1 && row <= seq1Length); + return rowSize[row]; + } + + ///////////////////////////////////////////////////////////////// + // SparseMatrix::GetSeq1Length() + // + // Returns the first dimension of the matrix. + ///////////////////////////////////////////////////////////////// + + int GetSeq1Length() const { + return seq1Length; + } + + ///////////////////////////////////////////////////////////////// + // SparseMatrix::GetSeq2Length() + // + // Returns the second dimension of the matrix. + ///////////////////////////////////////////////////////////////// + + int GetSeq2Length() const { + return seq2Length; + } + + ///////////////////////////////////////////////////////////////// + // SparseMatrix::GetNumCells() + // + // Returns the number of entries stored in the sparse matrix. 
+ ///////////////////////////////////////////////////////////////// + + int GetNumCells() const { + return data.size(); + } + + ///////////////////////////////////////////////////////////////// + // SparseMatrix::Print() + // + // Prints out a sparse matrix. + ///////////////////////////////////////////////////////////////// + + void Print(ostream &outfile) const { + outfile << "Sparse Matrix:" << endl; + for (int i = 1; i <= seq1Length; i++) { + outfile << " " << i << ":"; + for (int j = 0; j < rowSize[i]; j++) { + outfile << " (" << rowPtrs[i][j].first << "," + << rowPtrs[i][j].second << ")"; + } + outfile << endl; + } + } + + ///////////////////////////////////////////////////////////////// + // SparseMatrix::ComputeTranspose() + // + // Returns a new sparse matrix containing the transpose of the + // current matrix. + ///////////////////////////////////////////////////////////////// + + SparseMatrix *ComputeTranspose() const { + + // create a new sparse matrix + SparseMatrix *ret = new SparseMatrix(); + int numCells = data.size(); + + ret->seq1Length = seq2Length; + ret->seq2Length = seq1Length; + + // allocate memory + ret->data.resize(numCells); + ret->rowSize.resize(seq2Length + 1); + ret->rowSize[0] = -1; + ret->rowPtrs.resize(seq2Length + 1); + ret->rowPtrs[0] = ret->data.end(); + + // compute row sizes + for (int i = 1; i <= seq2Length; i++) + ret->rowSize[i] = 0; + for (int i = 0; i < numCells; i++) + ret->rowSize[data[i].first]++; + + // compute row ptrs + for (int i = 1; i <= seq2Length; i++) { + ret->rowPtrs[i] = + (i == 1) ? 
+ ret->data.begin() : + ret->rowPtrs[i - 1] + ret->rowSize[i - 1]; + } + + // now fill in data + SafeVector::iterator> currPtrs = ret->rowPtrs; + + for (int i = 1; i <= seq1Length; i++) { + SafeVector::iterator row = rowPtrs[i]; + for (int j = 0; j < rowSize[i]; j++) { + currPtrs[row[j].first]->first = i; + currPtrs[row[j].first]->second = row[j].second; + currPtrs[row[j].first]++; + } + } + + return ret; + } + + ///////////////////////////////////////////////////////////////// + // SparseMatrix::GetPosterior() + // + // Return the posterior representation of the sparse matrix. + ///////////////////////////////////////////////////////////////// + + VF *GetPosterior() const { + + // create a new posterior matrix + VF *posteriorPtr = new VF((seq1Length + 1) * (seq2Length + 1)); + assert(posteriorPtr); + VF &posterior = *posteriorPtr; + + // build the posterior matrix + for (int i = 0; i < (seq1Length + 1) * (seq2Length + 1); i++) + posterior[i] = 0; + for (int i = 1; i <= seq1Length; i++) { + VF::iterator postPtr = posterior.begin() + i * (seq2Length + 1); + for (int j = 0; j < rowSize[i]; j++) { + postPtr[rowPtrs[i][j].first] = rowPtrs[i][j].second; + } + } + + return posteriorPtr; + } + +}; + +#endif diff --git a/binaries/src/GLProbs-1.0/glprobs b/binaries/src/GLProbs-1.0/glprobs new file mode 100755 index 0000000..238ac42 Binary files /dev/null and b/binaries/src/GLProbs-1.0/glprobs differ diff --git a/binaries/src/GLProbs-1.0/main.cpp b/binaries/src/GLProbs-1.0/main.cpp new file mode 100644 index 0000000..6fd1934 --- /dev/null +++ b/binaries/src/GLProbs-1.0/main.cpp @@ -0,0 +1,16 @@ +/*********************************************** + * # Copyright 2009-2010. Liu Yongchao + * # Contact: Liu Yongchao, School of Computer Engineering, + * # Nanyang Technological University. + * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com + * # + * # GPL version 3.0 applies. 
+ * # + * ************************************************/ +#include "MSA.h" + +int main(int argc, char* argv[]) { + MSA msa(argc, argv); + + return 0; +} diff --git a/binaries/src/MSAProbs-0.9.7/ChangeLog b/binaries/src/MSAProbs-0.9.7/ChangeLog new file mode 100644 index 0000000..c34e041 --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/ChangeLog @@ -0,0 +1,9 @@ +(1) 23 Aug, 2010 + Add an option "-num_threads" to allow users to specify the number of + threads used +(2) 12 April, 2012 + GCC 4.6 can successfully compile it. + +(3) 3 July, 2012 + Add a new option "-o" (or "--outfile") to allow users to specify the output file name. + By default, it will output to STDOUT diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs.ncb b/binaries/src/MSAProbs-0.9.7/MSAProbs.ncb new file mode 100644 index 0000000..248d356 Binary files /dev/null and b/binaries/src/MSAProbs-0.9.7/MSAProbs.ncb differ diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs.sln b/binaries/src/MSAProbs-0.9.7/MSAProbs.sln new file mode 100644 index 0000000..fc350a8 --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/MSAProbs.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 9.00 +# Visual Studio 2005 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MSAProbs", "MSAProbs\MSAProbs.vcproj", "{671563E4-93A2-419E-8B41-48DDF71DD144}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Release|Win32 = Release|Win32 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {671563E4-93A2-419E-8B41-48DDF71DD144}.Debug|Win32.ActiveCfg = Debug|Win32 + {671563E4-93A2-419E-8B41-48DDF71DD144}.Debug|Win32.Build.0 = Debug|Win32 + {671563E4-93A2-419E-8B41-48DDF71DD144}.Release|Win32.ActiveCfg = Release|Win32 + {671563E4-93A2-419E-8B41-48DDF71DD144}.Release|Win32.Build.0 = Release|Win32 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff 
--git a/binaries/src/MSAProbs-0.9.7/MSAProbs.suo b/binaries/src/MSAProbs-0.9.7/MSAProbs.suo new file mode 100644 index 0000000..08a75bc Binary files /dev/null and b/binaries/src/MSAProbs-0.9.7/MSAProbs.suo differ diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/Defaults.h b/binaries/src/MSAProbs-0.9.7/MSAProbs/Defaults.h new file mode 100644 index 0000000..b108cf3 --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/MSAProbs/Defaults.h @@ -0,0 +1,105 @@ +///////////////////////////////////////////////////////////////// +// Defaults.h +// +// Default constants for use in MSAPROBS. The emission +// probabilities were computed using the program used to build +// the BLOSUM62 matrix from the BLOCKS 5.0 dataset. Transition +// parameters were obtained via unsupervised EM training on the +// BALIBASE 2.0 benchmark alignment database. +///////////////////////////////////////////////////////////////// + +#ifndef DEFAULTS_H +#define DEFAULTS_H + +#include + +using namespace std; + +/* + float initDistrib1Default[] = { 0.3202854395, 0.3398572505, 0.3398572505 }; + float gapOpen1Default[] = { 0.1375414133, 0.1375414133 }; + float gapExtend1Default[] = { 0.7832147479, 0.7832147479 }; + */ + +float initDistrib1Default[] = { 0.6080327034f, 0.1959836632f, 0.1959836632f }; +float gapOpen1Default[] = { 0.01993141696f, 0.01993141696f }; +float gapExtend1Default[] = { 0.7943345308f, 0.7943345308f }; + +float initDistrib2Default[] = { 0.6814756989f, 8.615339902e-05f, + 8.615339902e-05f, 0.1591759622f, 0.1591759622 }; +float gapOpen2Default[] = { 0.0119511066f, 0.0119511066f, 0.008008334786f, + 0.008008334786 }; +float gapExtend2Default[] = { 0.3965826333f, 0.3965826333f, 0.8988758326f, + 0.8988758326 }; + +string alphabetDefault = "ARNDCQEGHILKMFPSTWYV"; +float emitSingleDefault[20] = { 0.07831005f, 0.05246024f, 0.04433257f, + 0.05130349f, 0.02189704f, 0.03585766f, 0.05615771f, 0.07783433f, + 0.02601093f, 0.06511648f, 0.09716489f, 0.05877077f, 0.02438117f, + 0.04463228f, 0.03940142f, 
0.05849916f, 0.05115306f, 0.01203523f, + 0.03124726f, 0.07343426f }; + +float emitPairsDefault[20][20] = { { 0.02373072f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f }, { 0.00244502f, 0.01775118f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f }, { 0.00210228f, 0.00207782f, 0.01281864f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f }, { 0.00223549f, 0.00161657f, 0.00353540f, 0.01911178f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f }, { 0.00145515f, 0.00044701f, 0.00042479f, + 0.00036798f, 0.01013470f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }, { 0.00219102f, + 0.00253532f, 0.00158223f, 0.00176784f, 0.00032102f, 0.00756604f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f }, { 0.00332218f, 0.00268865f, 0.00224738f, 0.00496800f, + 0.00037956f, 0.00345128f, 0.01676565f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }, { 0.00597898f, + 0.00194865f, 0.00288882f, 0.00235249f, 0.00071206f, 0.00142432f, + 0.00214860f, 0.04062876f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }, { 0.00114353f, 0.00132105f, 0.00141205f, + 0.00097077f, 0.00026421f, 0.00113901f, 0.00131767f, 0.00103704f, + 0.00867996f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f }, + { 0.00318853f, 0.00138145f, 0.00104273f, 0.00105355f, 0.00094040f, + 0.00100883f, 0.00124207f, 0.00142520f, 0.00059716f, 0.01778263f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }, { + 0.00449576f, 0.00246811f, 0.00160275f, 0.00161966f, 0.00138494f, + 0.00180553f, 0.00222063f, 0.00212853f, 0.00111754f, 0.01071834f, + 0.03583921f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f }, { 
0.00331693f, 0.00595650f, 0.00257310f, 0.00252518f, + 0.00046951f, 0.00312308f, 0.00428420f, 0.00259311f, 0.00121376f, + 0.00157852f, 0.00259626f, 0.01612228f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f }, { 0.00148878f, 0.00076734f, + 0.00063401f, 0.00047808f, 0.00037421f, 0.00075546f, 0.00076105f, + 0.00066504f, 0.00042237f, 0.00224097f, 0.00461939f, 0.00096120f, + 0.00409522f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }, { + 0.00165004f, 0.00090768f, 0.00084658f, 0.00069041f, 0.00052274f, + 0.00059248f, 0.00078814f, 0.00115204f, 0.00072545f, 0.00279948f, + 0.00533369f, 0.00087222f, 0.00116111f, 0.01661038f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f }, { 0.00230618f, 0.00106268f, + 0.00100282f, 0.00125381f, 0.00034766f, 0.00090111f, 0.00151550f, + 0.00155601f, 0.00049078f, 0.00103767f, 0.00157310f, 0.00154836f, + 0.00046718f, 0.00060701f, 0.01846071f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f }, { 0.00631752f, 0.00224540f, 0.00301397f, 0.00285226f, + 0.00094867f, 0.00191155f, 0.00293898f, 0.00381962f, 0.00116422f, + 0.00173565f, 0.00250962f, 0.00312633f, 0.00087787f, 0.00119036f, + 0.00180037f, 0.01346609f, 0.0f, 0.0f, 0.0f, 0.0f }, { + 0.00389995f, 0.00186053f, 0.00220144f, 0.00180488f, 0.00073798f, + 0.00154526f, 0.00216760f, 0.00214841f, 0.00077747f, 0.00248968f, + 0.00302273f, 0.00250862f, 0.00093371f, 0.00107595f, 0.00147982f, + 0.00487295f, 0.01299436f, 0.0f, 0.0f, 0.0f }, { 0.00039119f, + 0.00029139f, 0.00021006f, 0.00016015f, 0.00010666f, 0.00020592f, + 0.00023815f, 0.00038786f, 0.00019097f, 0.00039549f, 0.00076736f, + 0.00028448f, 0.00016253f, 0.00085751f, 0.00015674f, 0.00026525f, + 0.00024961f, 0.00563625f, 0.0f, 0.0f }, { 0.00131840f, + 0.00099430f, 0.00074960f, 0.00066005f, 0.00036626f, 0.00070192f, + 0.00092548f, 0.00089301f, 0.00131038f, 0.00127857f, 0.00219713f, + 0.00100817f, 0.00054105f, 0.00368739f, 0.00047608f, 0.00102648f, + 0.00094759f, 0.00069226f, 0.00999315f, 0.0f }, { 0.00533241f, + 0.00169359f, 0.00136609f, 0.00127915f, 0.00119152f, 0.00132844f, + 
0.00178697f, 0.00194579f, 0.00071553f, 0.01117956f, 0.00914460f, + 0.00210897f, 0.00197461f, 0.00256159f, 0.00135781f, 0.00241601f, + 0.00343452f, 0.00038538f, 0.00148001f, 0.02075171f } }; + +#endif diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/FileBuffer.h b/binaries/src/MSAProbs-0.9.7/MSAProbs/FileBuffer.h new file mode 100644 index 0000000..06af54b --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/MSAProbs/FileBuffer.h @@ -0,0 +1,117 @@ +///////////////////////////////////////////////////////////////// +// FileBuffer.h +// +// Buffered file reading. +///////////////////////////////////////////////////////////////// + +#ifndef FILEBUFFER_H +#define FILEBUFFER_H + +#include +#include +#include + +using namespace std; + +const int BufferSize = 1000; + +///////////////////////////////////////////////////////////////// +// FileBuffer +// +// Class for buffering file reading. +///////////////////////////////////////////////////////////////// + +class FileBuffer { + ifstream file; + char buffer[BufferSize]; + int currPos; + int size; + bool isEOF; + bool isValid; + bool canUnget; + +public: + + // Some common routines + + FileBuffer(const char *filename) : + file(filename), currPos(0), size(0), isEOF(false), isValid( + !file.fail()), canUnget(false) { + } + ~FileBuffer() { + close(); + } + bool fail() const { + return !isValid; + } + bool eof() const { + return (!isValid || isEOF); + } + void close() { + file.close(); + isValid = false; + } + + ///////////////////////////////////////////////////////////////// + // FileBuffer::Get() + // + // Retrieve a character from the file buffer. Returns true if + // and only if a character is read. 
+ ///////////////////////////////////////////////////////////////// + + bool Get(char &ch) { + + // check to make sure that there's more stuff in the file + if (!isValid || isEOF) + return false; + + // if the buffer is empty, it's time to reload it + if (currPos == size) { + file.read(buffer, BufferSize); + size = file.gcount(); + isEOF = (size == 0); + currPos = 0; + if (isEOF) + return false; + } + + // store the read character + ch = buffer[currPos++]; + canUnget = true; + return true; + } + + ///////////////////////////////////////////////////////////////// + // FileBuffer::UnGet() + // + // Unretrieve the most recently read character from the file + // buffer. Note that this allows only a one-level undo. + ///////////////////////////////////////////////////////////////// + + void UnGet() { + assert(canUnget); + assert(isValid); + assert(currPos > 0); + currPos--; + assert(currPos < size); + isEOF = false; + canUnget = false; + } + + ///////////////////////////////////////////////////////////////// + // FileBuffer::GetLine() + // + // Retrieve characters of text until a newline character is + // encountered. Terminates properly on end-of-file condition. + ///////////////////////////////////////////////////////////////// + + void GetLine(string &s) { + char ch; + s = ""; + while (Get(ch) && ch != '\n') + s += ch; + } + +}; + +#endif diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/MSA.cpp b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSA.cpp new file mode 100644 index 0000000..db1550e --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSA.cpp @@ -0,0 +1,1349 @@ +/*********************************************** + * # Copyright 2009-2010. Liu Yongchao + * # Contact: Liu Yongchao, School of Computer Engineering, + * # Nanyang Technological University. + * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com + * # + * # GPL version 3.0 applies. 
+ * # + * ************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "MSA.h" +#include "MSAClusterTree.h" +#include "Defaults.h" + +#ifdef _OPENMP +#include +#endif + +string parametersInputFilename = ""; +string parametersOutputFilename = "no training"; +string annotationFilename = ""; + +bool enableVerbose = false; +bool enableAnnotation = false; +bool enableClustalWOutput = false; +bool enableAlignOrder = false; +int numConsistencyReps = 2; +int numPreTrainingReps = 0; +int numIterativeRefinementReps = 10; + +float cutoff = 0; + +VF initDistrib(NumMatrixTypes); +VF gapOpen(2 * NumInsertStates); +VF gapExtend(2 * NumInsertStates); +VVF emitPairs(256, VF(256, 1e-10)); +VF emitSingle(256, 1e-5); + +string alphabet = alphabetDefault; + +const int MIN_PRETRAINING_REPS = 0; +const int MAX_PRETRAINING_REPS = 20; +const int MIN_CONSISTENCY_REPS = 0; +const int MAX_CONSISTENCY_REPS = 5; +const int MIN_ITERATIVE_REFINEMENT_REPS = 0; +const int MAX_ITERATIVE_REFINEMENT_REPS = 1000; + +string posteriorProbsFilename = ""; +bool allscores = true; +string infilename; + +int flag_gui = 0; //0: no gui related o/p +//1: gui related o/p generated +int flag_ppscore = 0; //0: no pp score sequence added to o/p fasta alignment +//1: pp score seq added to o/p fasta alignment + +/////////////////////////////// +// global scoring matrix variables +////////////////////////////// +float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2; +char *aminos, *bases, matrixtype[20] = "gonnet_160"; +int subst_index[26]; + +double sub_matrix[26][26]; +int firstread = 0; //this makes sure that matrices are read only once + +float TEMPERATURE = 5; +int MATRIXTYPE = 160; +int prot_nuc = 0; //0=prot, 1=nucleotide + +float GAPOPEN = 0; +float GAPEXT = 0; +int numThreads = 0; + +//argument support +typedef struct { + char input[30]; + int matrix; + int N; + float T; + float beta; + 
char opt; //can be 'P' or 'M' + float gapopen; + float gapext; +} argument_decl; + +argument_decl argument; + +extern inline void read_sustitution_matrix(char *fileName); +extern void setmatrixtype(int le); +extern inline int matrixtype_to_int(); +extern inline void read_dna_matrix(); +extern inline void read_vtml_la_matrix(); +extern void init_arguments(); + +MSA::MSA(int argc, char* argv[]) { + //parse program parameters + SafeVector sequenceNames = ParseParams(argc, argv); + + //initialize arguments for partition function + init_arguments(); + + ReadParameters(); + //PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL); + + //read the input sequences + MultiSequence *sequences = new MultiSequence(); + assert(sequences); + for (int i = 0; i < (int) sequenceNames.size(); i++) { + cerr << "Loading sequence file: " << sequenceNames[i] << endl; + sequences->LoadMFA(sequenceNames[i], true); + } + //allocate space for sequence weights + this->seqsWeights = new int[sequences->GetNumSequences()]; + //initilaize parameters for OPENMP +#ifdef _OPENMP + if(numThreads <= 0) { + numThreads = omp_get_num_procs(); + cerr << "Automatically detected " << numThreads << " CPU cores" << endl; + } + cerr <<"Enabling OpenMP (with "<WriteALN(*alignOutFile); + } else { + alignment->WriteMFA(*alignOutFile); + } + //release resources + delete[] this->seqsWeights; + delete alignment; + delete sequences; +} +MSA::~MSA() { + /*close the output file*/ + if (alignOutFileName.length() > 0) { + ((std::ofstream*) alignOutFile)->close(); + } +} +///////////////////////////////////////////////////////////////// +// PrintParameters() +// +// Prints MSAPROBS parameters to STDERR. If a filename is +// specified, then the parameters are also written to the file. 
+///////////////////////////////////////////////////////////////// + +void MSA::PrintParameters(const char *message, const VF &initDistrib, + const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs, + const VF &emitSingle, const char *filename) { + + // print parameters to the screen + cerr << message << endl << " initDistrib[] = { "; + for (int i = 0; i < NumMatrixTypes; i++) + cerr << setprecision(10) << initDistrib[i] << " "; + cerr << "}" << endl << " gapOpen[] = { "; + for (int i = 0; i < NumInsertStates * 2; i++) + cerr << setprecision(10) << gapOpen[i] << " "; + cerr << "}" << endl << " gapExtend[] = { "; + for (int i = 0; i < NumInsertStates * 2; i++) + cerr << setprecision(10) << gapExtend[i] << " "; + cerr << "}" << endl << endl; + + /* + for (int i = 0; i < 5; i++){ + for (int j = 0; j <= i; j++){ + cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " "; + } + cerr << endl; + }*/ + + // if a file name is specified + if (filename) { + + // attempt to open the file for writing + FILE *file = fopen(filename, "w"); + if (!file) { + cerr << "ERROR: Unable to write parameter file: " << filename + << endl; + exit(1); + } + + // if successful, then write the parameters to the file + for (int i = 0; i < NumMatrixTypes; i++) + fprintf(file, "%.10f ", initDistrib[i]); + fprintf(file, "\n"); + for (int i = 0; i < 2 * NumInsertStates; i++) + fprintf(file, "%.10f ", gapOpen[i]); + fprintf(file, "\n"); + for (int i = 0; i < 2 * NumInsertStates; i++) + fprintf(file, "%.10f ", gapExtend[i]); + fprintf(file, "\n"); + fprintf(file, "%s\n", alphabet.c_str()); + for (int i = 0; i < (int) alphabet.size(); i++) { + for (int j = 0; j <= i; j++) + fprintf(file, "%.10f ", + emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]); + fprintf(file, "\n"); + } + for (int i = 0; i < (int) alphabet.size(); i++) + fprintf(file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]); + fprintf(file, "\n"); + fclose(file); + } +} + 
+///////////////////////////////////////////////////////////////// +// doAlign() +// +// First computes all pairwise posterior probability matrices. +// Then, computes new parameters if training, or a final +// alignment, otherwise. +///////////////////////////////////////////////////////////////// +extern VF *ComputePostProbs(int a, int b, string seq1, string seq2); +MultiSequence* MSA::doAlign(MultiSequence *sequences, + const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, + VF &gapExtend, VVF &emitPairs, VF &emitSingle) { + assert(sequences); + + //get the number of sequences + const int numSeqs = sequences->GetNumSequences(); + + //create distance matrix + VVF distances(numSeqs, VF(numSeqs, 0)); + SafeVector > sparseMatrices(numSeqs, + SafeVector(numSeqs, NULL)); + +#ifdef _OPENMP + //calculate sequence pairs for openmp model + int pairIdx = 0; + numPairs = (numSeqs - 1) * numSeqs / 2; + seqsPairs = new SeqsPair[numPairs]; + for(int a = 0; a < numSeqs; a++) { + for(int b = a + 1; b < numSeqs; b++) { + seqsPairs[pairIdx].seq1 = a; + seqsPairs[pairIdx].seq2 = b; + pairIdx++; + } + } +#endif + // do all pairwise alignments for posterior probability matrices +#ifdef _OPENMP +#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic) + for(pairIdx = 0; pairIdx < numPairs; pairIdx++) { + int a= seqsPairs[pairIdx].seq1; + int b = seqsPairs[pairIdx].seq2; + if(enableVerbose) { +#pragma omp critical + cerr <<"tid "<GetSequence(a); + Sequence *seq2 = sequences->GetSequence(b); + + // verbose output + if (enableVerbose) { + cerr << "Computing posterior matrix: (" << a + 1 << ") " + << seq1->GetHeader() << " vs. 
" << "(" << b + 1 << ") " + << seq2->GetHeader() << " -- "; + } + + // compute forward and backward probabilities + VF *forward = model.ComputeForwardMatrix(seq1, seq2); + assert(forward); + VF *backward = model.ComputeBackwardMatrix(seq1, seq2); + assert(backward); + + // compute posterior probability matrix from HMM + VF *posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward, + *backward); + assert(posterior); + delete forward; + delete backward; + + //compute posterior probability matrix from partition function + VF* part_posterior = ::ComputePostProbs(a, b, seq1->GetString(), + seq2->GetString()); + assert(part_posterior); + + //merge the two posterior matrices + VF::iterator ptr1 = posterior->begin(); + VF::iterator ptr2 = part_posterior->begin(); + for (int i = 0; i <= seq1->GetLength(); i++) { + for (int j = 0; j <= seq2->GetLength(); j++) { + float v1 = *ptr1; + float v2 = *ptr2; + + *ptr1 = sqrt((v1 * v1 + v2 * v2) * 0.5f); + ptr1++; + ptr2++; + } + } + delete part_posterior; + + // compute sparse representations + sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(), + seq2->GetLength(), *posterior); + sparseMatrices[b][a] = NULL; + + // perform the pairwise sequence alignment + pair *, float> alignment = model.ComputeAlignment( + seq1->GetLength(), seq2->GetLength(), *posterior); + + //compute the pairwise distance using expected accuracy + float accuracy = alignment.second + / min(seq1->GetLength(), seq2->GetLength()); + distances[a][b] = distances[b][a] = 1.0f - accuracy; + + if (enableVerbose) { + cerr << setprecision(10) << accuracy << endl; + } + delete alignment.first; + delete posterior; +#ifndef _OPENMP + } +#endif + } + //create the guide tree + this->tree = new MSAClusterTree(this, distances, numSeqs); + this->tree->create(); + + // perform the consistency transformation the desired number of times + float* fweights = new float[numSeqs]; + for (int r = 0; r < numSeqs; r++) { + fweights[r] = ((float) seqsWeights[r]) / INT_MULTIPLY; 
+ fweights[r] *= 10; + } + for (int r = 0; r < numConsistencyReps; r++) { + SafeVector > newSparseMatrices = + DoRelaxation(fweights, sequences, sparseMatrices); + + // now replace the old posterior matrices + for (int i = 0; i < numSeqs; i++) { + for (int j = 0; j < numSeqs; j++) { + delete sparseMatrices[i][j]; + sparseMatrices[i][j] = newSparseMatrices[i][j]; + } + } + } + delete[] fweights; +#ifdef _OPENMP + delete [] seqsPairs; +#endif + + //compute the final multiple sequence alignment + MultiSequence *finalAlignment = ComputeFinalAlignment(this->tree, sequences, + sparseMatrices, model); + + // build annotation + if (enableAnnotation) { + WriteAnnotation(finalAlignment, sparseMatrices); + } + //destroy the guide tree + delete this->tree; + this->tree = 0; + + // delete sparse matrices + for (int a = 0; a < numSeqs - 1; a++) { + for (int b = a + 1; b < numSeqs; b++) { + delete sparseMatrices[a][b]; + delete sparseMatrices[b][a]; + } + } + + return finalAlignment; +} + +///////////////////////////////////////////////////////////////// +// GetInteger() +// +// Attempts to parse an integer from the character string given. +// Returns true only if no parsing error occurs. +///////////////////////////////////////////////////////////////// + +bool GetInteger(char *data, int *val) { + char *endPtr; + long int retVal; + + assert(val); + + errno = 0; + retVal = strtol(data, &endPtr, 0); + if (retVal == 0 && (errno != 0 || data == endPtr)) + return false; + if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN)) + return false; + if (retVal < (long) INT_MIN || retVal > (long) INT_MAX) + return false; + *val = (int) retVal; + return true; +} + +///////////////////////////////////////////////////////////////// +// GetFloat() +// +// Attempts to parse a float from the character string given. +// Returns true only if no parsing error occurs. 
+///////////////////////////////////////////////////////////////// + +bool GetFloat(char *data, float *val) { + char *endPtr; + double retVal; + + assert(val); + + errno = 0; + retVal = strtod(data, &endPtr); + if (retVal == 0 && (errno != 0 || data == endPtr)) + return false; + if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0)) + return false; + *val = (float) retVal; + return true; +} + +///////////////////////////////////////////////////////////////// +// ReadParameters() +// +// Read initial distribution, transition, and emission +// parameters from a file. +///////////////////////////////////////////////////////////////// + +void MSA::ReadParameters() { + + ifstream data; + + emitPairs = VVF(256, VF(256, 1e-10)); + emitSingle = VF(256, 1e-5); + + // read initial state distribution and transition parameters + if (parametersInputFilename == string("")) { + if (NumInsertStates == 1) { + for (int i = 0; i < NumMatrixTypes; i++) + initDistrib[i] = initDistrib1Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapOpen[i] = gapOpen1Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapExtend[i] = gapExtend1Default[i]; + } else if (NumInsertStates == 2) { + for (int i = 0; i < NumMatrixTypes; i++) + initDistrib[i] = initDistrib2Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapOpen[i] = gapOpen2Default[i]; + for (int i = 0; i < 2 * NumInsertStates; i++) + gapExtend[i] = gapExtend2Default[i]; + } else { + cerr + << "ERROR: No default initial distribution/parameter settings exist" + << endl << " for " << NumInsertStates + << " pairs of insert states. Use --paramfile." 
<< endl; + exit(1); + } + + alphabet = alphabetDefault; + + for (int i = 0; i < (int) alphabet.length(); i++) { + emitSingle[(unsigned char) tolower(alphabet[i])] = + emitSingleDefault[i]; + emitSingle[(unsigned char) toupper(alphabet[i])] = + emitSingleDefault[i]; + for (int j = 0; j <= i; j++) { + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = emitPairsDefault[i][j]; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = emitPairsDefault[i][j]; + } + } + } else { + data.open(parametersInputFilename.c_str()); + if (data.fail()) { + cerr << "ERROR: Unable to read parameter file: " + << parametersInputFilename << endl; + exit(1); + } + + string line[3]; + for (int i = 0; i < 3; i++) { + if (!getline(data, line[i])) { + cerr + << "ERROR: Unable to read transition parameters from parameter file: " + << parametersInputFilename << endl; + exit(1); + } + } + istringstream data2; + data2.clear(); + data2.str(line[0]); + for (int i = 0; i < NumMatrixTypes; i++) + data2 >> initDistrib[i]; + data2.clear(); + data2.str(line[1]); + for (int i = 0; i < 2 * NumInsertStates; i++) + data2 >> gapOpen[i]; + data2.clear(); + data2.str(line[2]); + for (int i = 0; i < 2 * NumInsertStates; i++) + data2 >> 
gapExtend[i]; + + if (!getline(data, line[0])) { + cerr << "ERROR: Unable to read alphabet from scoring matrix file: " + << parametersInputFilename << endl; + exit(1); + } + + // read alphabet as concatenation of all characters on alphabet line + alphabet = ""; + string token; + data2.clear(); + data2.str(line[0]); + while (data2 >> token) + alphabet += token; + + for (int i = 0; i < (int) alphabet.size(); i++) { + for (int j = 0; j <= i; j++) { + float val; + data >> val; + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = val; + emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = val; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower( + alphabet[j])] = val; + emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper( + alphabet[j])] = val; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = val; + emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = val; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower( + alphabet[i])] = val; + emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper( + alphabet[i])] = val; + } + } + + for (int i = 0; i < (int) alphabet.size(); i++) { + float val; + data >> val; + emitSingle[(unsigned char) tolower(alphabet[i])] = val; + emitSingle[(unsigned char) toupper(alphabet[i])] = val; + } + data.close(); + } +} + +///////////////////////////////////////////////////////////////// +// ParseParams() +// +// Parse all command-line options. 
+///////////////////////////////////////////////////////////////// +void MSA::printUsage() { + cerr + << "************************************************************************" + << endl + << "\tMSAPROBS is a open-source protein multiple sequence alignment algorithm" + << endl + << "\tbased on pair hidden markov model and partition function postirior" + << endl + << "\tprobabilities. If any comments or problems, please contact" + << endl + << "\tLiu Yongchao(liuy0039@ntu.edu.sg or nkcslyc@hotmail.com)" + << endl + << "*************************************************************************" + << endl << "Usage:" << endl + << " msaprobs [OPTION]... [infile]..." << endl << endl + << "Description:" << endl + << " Align sequences in multi-FASTA format" << endl << endl + << " -o, --outfile " << endl + << " specify the output file name (STDOUT by default)" + << endl << " -num_threads " << endl + << " specify the number of threads used, and otherwise detect automatically" + << endl << " -clustalw" << endl + << " use CLUSTALW output format instead of FASTA format" + << endl << endl << " -c, --consistency REPS" << endl + << " use " << MIN_CONSISTENCY_REPS << " <= REPS <= " + << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps + << ") passes of consistency transformation" << endl << endl + << " -ir, --iterative-refinement REPS" << endl + << " use " << MIN_ITERATIVE_REFINEMENT_REPS + << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS << " (default: " + << numIterativeRefinementReps << ") passes of iterative-refinement" + << endl << endl << " -v, --verbose" << endl + << " report progress while aligning (default: " + << (enableVerbose ? "on" : "off") << ")" << endl << endl + << " -annot FILENAME" << endl + << " write annotation for multiple alignment to FILENAME" + << endl << endl << " -a, --alignment-order" << endl + << " print sequences in alignment order rather than input order (default: " + << (enableAlignOrder ? 
"on" : "off") << ")" << endl + << " -version " << endl + << " print out version of MSAPROBS " << endl << endl; +} +SafeVector MSA::ParseParams(int argc, char **argv) { + if (argc < 2) { + printUsage(); + exit(1); + } + SafeVector sequenceNames; + int tempInt; + float tempFloat; + + for (int i = 1; i < argc; i++) { + if (argv[i][0] == '-') { + //help + if (!strcmp(argv[i], "-help") || !strcmp(argv[i], "-?")) { + printUsage(); + exit(1); + //output file name + } else if (!strcmp(argv[i], "-o") + || !strcmp(argv[i], "--outfile")) { + if (i < argc - 1) { + alignOutFileName = argv[++i]; //get the file name + } else { + cerr << "ERROR: String expected for option " << argv[i] + << endl; + exit(1); + } + //number of threads used + } else if (!strcmp(argv[i], "-p") + || !strcmp(argv[i], "-num_threads")) { + if (i < argc - 1) { + if (!GetInteger(argv[++i], &tempInt)) { + cerr << " ERROR: invalid integer following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempInt < 0) { + tempInt = 0; + } + numThreads = tempInt; + } + } else { + cerr << "ERROR: Integer expected for option " << argv[i] + << endl; + exit(1); + } + // number of consistency transformations + } else if (!strcmp(argv[i], "-c") + || !strcmp(argv[i], "--consistency")) { + if (i < argc - 1) { + if (!GetInteger(argv[++i], &tempInt)) { + cerr << "ERROR: Invalid integer following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempInt < MIN_CONSISTENCY_REPS + || tempInt > MAX_CONSISTENCY_REPS) { + cerr << "ERROR: For option " << argv[i - 1] + << ", integer must be between " + << MIN_CONSISTENCY_REPS << " and " + << MAX_CONSISTENCY_REPS << "." 
<< endl; + exit(1); + } else { + numConsistencyReps = tempInt; + } + } + } else { + cerr << "ERROR: Integer expected for option " << argv[i] + << endl; + exit(1); + } + } + + // number of randomized partitioning iterative refinement passes + else if (!strcmp(argv[i], "-ir") + || !strcmp(argv[i], "--iterative-refinement")) { + if (i < argc - 1) { + if (!GetInteger(argv[++i], &tempInt)) { + cerr << "ERROR: Invalid integer following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS + || tempInt > MAX_ITERATIVE_REFINEMENT_REPS) { + cerr << "ERROR: For option " << argv[i - 1] + << ", integer must be between " + << MIN_ITERATIVE_REFINEMENT_REPS << " and " + << MAX_ITERATIVE_REFINEMENT_REPS << "." + << endl; + exit(1); + } else + numIterativeRefinementReps = tempInt; + } + } else { + cerr << "ERROR: Integer expected for option " << argv[i] + << endl; + exit(1); + } + } + + // annotation files + else if (!strcmp(argv[i], "-annot")) { + enableAnnotation = true; + if (i < argc - 1) { + annotationFilename = argv[++i]; + } else { + cerr << "ERROR: FILENAME expected for option " << argv[i] + << endl; + exit(1); + } + } + + // clustalw output format + else if (!strcmp(argv[i], "-clustalw")) { + enableClustalWOutput = true; + } + + // cutoff + else if (!strcmp(argv[i], "-co") || !strcmp(argv[i], "--cutoff")) { + if (i < argc - 1) { + if (!GetFloat(argv[++i], &tempFloat)) { + cerr + << "ERROR: Invalid floating-point value following option " + << argv[i - 1] << ": " << argv[i] << endl; + exit(1); + } else { + if (tempFloat < 0 || tempFloat > 1) { + cerr << "ERROR: For option " << argv[i - 1] + << ", floating-point value must be between 0 and 1." 
+ << endl; + exit(1); + } else + cutoff = tempFloat; + } + } else { + cerr << "ERROR: Floating-point value expected for option " + << argv[i] << endl; + exit(1); + } + } + + // verbose reporting + else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--verbose")) { + enableVerbose = true; + } + + // alignment order + else if (!strcmp(argv[i], "-a") + || !strcmp(argv[i], "--alignment-order")) { + enableAlignOrder = true; + } + + //print out version + else if (!strcmp(argv[i], "-version")) { + cerr << "MSAPROBS version " << VERSION << endl; + exit(1); + } + // bad arguments + else { + cerr << "ERROR: Unrecognized option: " << argv[i] << endl; + exit(1); + } + } else { + sequenceNames.push_back(string(argv[i])); + } + } + + /*check the output file name*/ + cerr << "-------------------------------------" << endl; + if (alignOutFileName.length() == 0) { + cerr << "The final alignments will be printed out to STDOUT" << endl; + alignOutFile = &std::cout; + } else { + cerr << "Open the output file " << alignOutFileName << endl; + alignOutFile = new ofstream(alignOutFileName.c_str(), + ios::binary | ios::out | ios::trunc); + } + cerr << "-------------------------------------" << endl; + return sequenceNames; +} + +///////////////////////////////////////////////////////////////// +// ProcessTree() +// +// Process the tree recursively. Returns the aligned sequences +// corresponding to a node or leaf of the tree. 
+///////////////////////////////////////////////////////////////// +MultiSequence* MSA::ProcessTree(TreeNode *tree, MultiSequence *sequences, + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model) { + + MultiSequence *result; + + // check if this is a node of the alignment tree + //if (tree->GetSequenceLabel() == -1){ + if (tree->leaf == NODE) { + MultiSequence *alignLeft = ProcessTree(tree->left, sequences, + sparseMatrices, model); + MultiSequence *alignRight = ProcessTree(tree->right, sequences, + sparseMatrices, model); + + assert(alignLeft); + assert(alignRight); + + result = AlignAlignments(alignLeft, alignRight, sparseMatrices, model); + assert(result); + + delete alignLeft; + delete alignRight; + } + + // otherwise, this is a leaf of the alignment tree + else { + result = new MultiSequence(); + assert(result); + //result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone()); + result->AddSequence(sequences->GetSequence(tree->idx)->Clone()); + } + + return result; +} + +///////////////////////////////////////////////////////////////// +// ComputeFinalAlignment() +// +// Compute the final alignment by calling ProcessTree(), then +// performing iterative refinement as needed. 
+///////////////////////////////////////////////////////////////// + +MultiSequence* MSA::ComputeFinalAlignment(MSAGuideTree*tree, + MultiSequence *sequences, + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model) { + MultiSequence *alignment = ProcessTree(tree->getRoot(), sequences, + sparseMatrices, model); + + SafeVector oldOrdering; + if (enableAlignOrder) { + for (int i = 0; i < alignment->GetNumSequences(); i++) + oldOrdering.push_back(alignment->GetSequence(i)->GetSortLabel()); + alignment->SaveOrdering(); + enableAlignOrder = false; + } + + // tree-based refinement + // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree); + /*int numSeqs = alignment->GetNumSequences(); + if(numSeqs < numIterativeRefinementReps){ + for(int iter = 0; iter < 1; iter ++){ + for(int i = 0; i < numSeqs - 1; i++){ + DoIterativeRefinementTreeNode(sparseMatrices, model, alignment, i); + } + } + }*/ + for (int i = 0; i < numIterativeRefinementReps; i++) { + DoIterativeRefinement(sparseMatrices, model, alignment, i); + } + cerr << endl; + + if (oldOrdering.size() > 0) { + for (int i = 0; i < (int) oldOrdering.size(); i++) { + alignment->GetSequence(i)->SetSortLabel(oldOrdering[i]); + } + } + + // return final alignment + return alignment; +} + +///////////////////////////////////////////////////////////////// +// AlignAlignments() +// +// Returns the alignment of two MultiSequence objects. +///////////////////////////////////////////////////////////////// + +MultiSequence* MSA::AlignAlignments(MultiSequence *align1, + MultiSequence *align2, + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model) { + + // print some info about the alignment + if (enableVerbose) { + for (int i = 0; i < align1->GetNumSequences(); i++) + cerr << ((i == 0) ? "[" : ",") + << align1->GetSequence(i)->GetLabel(); + cerr << "] vs. "; + for (int i = 0; i < align2->GetNumSequences(); i++) + cerr << ((i == 0) ? 
"[" : ",") + << align2->GetSequence(i)->GetLabel(); + cerr << "]: "; + } +#if 0 + VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff); +#else + VF *posterior = model.BuildPosterior(getSeqsWeights(), align1, align2, + sparseMatrices, cutoff); +#endif + pair *, float> alignment; + + //perform alignment + alignment = model.ComputeAlignment(align1->GetSequence(0)->GetLength(), + align2->GetSequence(0)->GetLength(), *posterior); + + delete posterior; + + if (enableVerbose) { + + // compute total length of sequences + int totLength = 0; + for (int i = 0; i < align1->GetNumSequences(); i++) + for (int j = 0; j < align2->GetNumSequences(); j++) + totLength += min(align1->GetSequence(i)->GetLength(), + align2->GetSequence(j)->GetLength()); + + // give an "accuracy" measure for the alignment + cerr << alignment.second / totLength << endl; + } + + // now build final alignment + MultiSequence *result = new MultiSequence(); + for (int i = 0; i < align1->GetNumSequences(); i++) + result->AddSequence( + align1->GetSequence(i)->AddGaps(alignment.first, 'X')); + for (int i = 0; i < align2->GetNumSequences(); i++) + result->AddSequence( + align2->GetSequence(i)->AddGaps(alignment.first, 'Y')); + if (!enableAlignOrder) + result->SortByLabel(); + + // free temporary alignment + delete alignment.first; + + return result; +} + +///////////////////////////////////////////////////////////////// +// DoRelaxation() +// +// Performs one round of the weighted probabilistic consistency transformation. 
+// 1 +///////////////////////////////////////////////////////////////// + +SafeVector > MSA::DoRelaxation(float* seqsWeights, + MultiSequence *sequences, + SafeVector > &sparseMatrices) { + const int numSeqs = sequences->GetNumSequences(); + + SafeVector > newSparseMatrices(numSeqs, + SafeVector(numSeqs, NULL)); + + // for every pair of sequences +#ifdef _OPENMP + int pairIdx; +#pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic) + for(pairIdx = 0; pairIdx < numPairs; pairIdx++) { + int i = seqsPairs[pairIdx].seq1; + int j = seqsPairs[pairIdx].seq2; + float wi = seqsWeights[i]; + float wj = seqsWeights[j]; +#else + for (int i = 0; i < numSeqs; i++) { + float wi = seqsWeights[i]; + for (int j = i + 1; j < numSeqs; j++) { + float wj = seqsWeights[j]; +#endif + Sequence *seq1 = sequences->GetSequence(i); + Sequence *seq2 = sequences->GetSequence(j); + + if (enableVerbose) { +#ifdef _OPENMP +#pragma omp critical +#endif + cerr << "Relaxing (" << i + 1 << ") " << seq1->GetHeader() + << " vs. 
" << "(" << j + 1 << ") " << seq2->GetHeader() + << ": "; + } + // get the original posterior matrix + VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); + assert(posteriorPtr); + VF &posterior = *posteriorPtr; + + const int seq1Length = seq1->GetLength(); + const int seq2Length = seq2->GetLength(); + + // contribution from the summation where z = x and z = y + float w = wi * wi * wj + wi * wj * wj; + float sumW = w; + for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) { + posterior[k] = w * posterior[k]; + } + + if (enableVerbose) + cerr << sparseMatrices[i][j]->GetNumCells() << " --> "; + + // contribution from all other sequences + for (int k = 0; k < numSeqs; k++) { + if (k != i && k != j) { + float wk = seqsWeights[k]; + float w = wi * wj * wk; + sumW += w; + if (k < i) + Relax1(w, sparseMatrices[k][i], sparseMatrices[k][j], + posterior); + else if (k > i && k < j) + Relax(w, sparseMatrices[i][k], sparseMatrices[k][j], + posterior); + else { + SparseMatrix *temp = + sparseMatrices[j][k]->ComputeTranspose(); + Relax(w, sparseMatrices[i][k], temp, posterior); + delete temp; + } + } + } + //cerr<<"sumW "<::iterator XYptr = matXY->GetRowPtr(x); + SafeVector::iterator XYend = XYptr + matXY->GetRowSize(x); + VF::iterator base = posterior.begin() + x * (seq2Length + 1); + int curr = 0; + while (XYptr != XYend) { + + // zero out all cells until the first filled column + while (curr < XYptr->first) { + base[curr] = 0; + curr++; + } + + // now, skip over this column + curr++; + ++XYptr; + } + + // zero out cells after last column + while (curr <= seq2Length) { + base[curr] = 0; + curr++; + } + } + + // save the new posterior matrix + newSparseMatrices[i][j] = new SparseMatrix(seq1->GetLength(), + seq2->GetLength(), posterior); + newSparseMatrices[j][i] = NULL; + + if (enableVerbose) + cerr << newSparseMatrices[i][j]->GetNumCells() << " -- "; + + delete posteriorPtr; + + if (enableVerbose) + cerr << "done." 
<< endl; +#ifndef _OPENMP + } +#endif + } + + return newSparseMatrices; +} + +///////////////////////////////////////////////////////////////// +// Relax() +// +// Computes the consistency transformation for a single sequence +// z, and adds the transformed matrix to "posterior". +///////////////////////////////////////////////////////////////// + +void MSA::Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY, + VF &posterior) { + + assert(matXZ); + assert(matZY); + + int lengthX = matXZ->GetSeq1Length(); + int lengthY = matZY->GetSeq2Length(); + assert(matXZ->GetSeq2Length() == matZY->GetSeq1Length()); + + // for every x[i] + for (int i = 1; i <= lengthX; i++) { + SafeVector::iterator XZptr = matXZ->GetRowPtr(i); + SafeVector::iterator XZend = XZptr + matXZ->GetRowSize(i); + + VF::iterator base = posterior.begin() + i * (lengthY + 1); + + // iterate through all x[i]-z[k] + while (XZptr != XZend) { + SafeVector::iterator ZYptr = matZY->GetRowPtr(XZptr->first); + SafeVector::iterator ZYend = ZYptr + + matZY->GetRowSize(XZptr->first); + const float XZval = XZptr->second; + + // iterate through all z[k]-y[j] + while (ZYptr != ZYend) { + base[ZYptr->first] += weight * XZval * ZYptr->second; + ZYptr++; + } + XZptr++; + } + } +} + +///////////////////////////////////////////////////////////////// +// Relax1() +// +// Computes the consistency transformation for a single sequence +// z, and adds the transformed matrix to "posterior". 
+///////////////////////////////////////////////////////////////// + +void MSA::Relax1(float weight, SparseMatrix *matZX, SparseMatrix *matZY, + VF &posterior) { + + assert(matZX); + assert(matZY); + + int lengthZ = matZX->GetSeq1Length(); + int lengthY = matZY->GetSeq2Length(); + + // for every z[k] + for (int k = 1; k <= lengthZ; k++) { + SafeVector::iterator ZXptr = matZX->GetRowPtr(k); + SafeVector::iterator ZXend = ZXptr + matZX->GetRowSize(k); + + // iterate through all z[k]-x[i] + while (ZXptr != ZXend) { + SafeVector::iterator ZYptr = matZY->GetRowPtr(k); + SafeVector::iterator ZYend = ZYptr + matZY->GetRowSize(k); + const float ZXval = ZXptr->second; + VF::iterator base = posterior.begin() + + ZXptr->first * (lengthY + 1); + + // iterate through all z[k]-y[j] + while (ZYptr != ZYend) { + base[ZYptr->first] += weight * ZXval * ZYptr->second; + ZYptr++; + } + ZXptr++; + } + } +} +///////////////////////////////////////////////////////////////// +// DoIterativeRefinement() +// +// Performs a single round of randomized partionining iterative +// refinement. 
+///////////////////////////////////////////////////////////////// + +int MSA::GenRandom(int m, int seed, bool init) { + static const int a = 5, b = 3, n = 7; + static int rand0; + if (init == true) { + rand0 = seed; + } + m *= 19; + int rand1; + for (int i = 0; i < n; i++) { + rand1 = (a * rand0 + b) % m; + rand0 = rand1; + } + return rand1; +} + +void MSA::DoIterativeRefinement( + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model, MultiSequence* &alignment, int si) { + set groupOne, groupTwo; + int numSeqs = alignment->GetNumSequences(); + + int index = GenRandom(numSeqs, si, true); + // create two separate groups + for (int i = 0; i < numSeqs; i++) { + index = GenRandom(numSeqs, si); + if (index % 2) { + groupOne.insert(i); + } else { + groupTwo.insert(i); + } + } + if (groupOne.empty() || groupTwo.empty()) + return; + + // project into the two groups + MultiSequence *groupOneSeqs = alignment->Project(groupOne); + assert(groupOneSeqs); + MultiSequence *groupTwoSeqs = alignment->Project(groupTwo); + assert(groupTwoSeqs); + delete alignment; + + // realign + alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices, + model); + + delete groupOneSeqs; + delete groupTwoSeqs; +} +void MSA::DoIterativeRefinementTreeNode( + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model, MultiSequence* &alignment, + int nodeIndex) { + set groupOne, groupTwo; + int numSeqs = alignment->GetNumSequences(); + + vector inGroup1; + inGroup1.resize(numSeqs); + for (int i = 0; i < numSeqs; i++) { + inGroup1[i] = false; + } + + AlignmentOrder* orders = this->tree->getAlignOrders(); + AlignmentOrder* order = &orders[nodeIndex]; + for (int i = 0; i < order->leftNum; i++) { + int si = order->leftLeafs[i]; + inGroup1[si] = true; + } + for (int i = 0; i < order->rightNum; i++) { + int si = order->rightLeafs[i]; + inGroup1[si] = true; + } + // create two separate groups + for (int i = 0; i < numSeqs; i++) { + if (inGroup1[i]) { + 
groupOne.insert(i); + } else { + groupTwo.insert(i); + } + } + if (groupOne.empty() || groupTwo.empty()) + return; + + // project into the two groups + MultiSequence *groupOneSeqs = alignment->Project(groupOne); + assert(groupOneSeqs); + MultiSequence *groupTwoSeqs = alignment->Project(groupTwo); + assert(groupTwoSeqs); + delete alignment; + + // realign + alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices, + model); + + delete groupOneSeqs; + delete groupTwoSeqs; +} + +///////////////////////////////////////////////////////////////// +// WriteAnnotation() +// +// Computes annotation for multiple alignment and write values +// to a file. +///////////////////////////////////////////////////////////////// + +void MSA::WriteAnnotation(MultiSequence *alignment, + const SafeVector > &sparseMatrices) { + ofstream outfile(annotationFilename.c_str()); + + if (outfile.fail()) { + cerr << "ERROR: Unable to write annotation file." << endl; + exit(1); + } + + const int alignLength = alignment->GetSequence(0)->GetLength(); + const int numSeqs = alignment->GetNumSequences(); + + SafeVector position(numSeqs, 0); + SafeVector::iterator> seqs(numSeqs); + for (int i = 0; i < numSeqs; i++) + seqs[i] = alignment->GetSequence(i)->GetDataPtr(); + SafeVector > active; + active.reserve(numSeqs); + + SafeVector lab; + for (int i = 0; i < numSeqs; i++) + lab.push_back(alignment->GetSequence(i)->GetSortLabel()); + + // for every column + for (int i = 1; i <= alignLength; i++) { + + // find all aligned residues in this particular column + active.clear(); + for (int j = 0; j < numSeqs; j++) { + if (seqs[j][i] != '-') { + active.push_back(make_pair(lab[j], ++position[j])); + } + } + + sort(active.begin(), active.end()); + outfile << setw(4) << ComputeScore(active, sparseMatrices) << endl; + } + + outfile.close(); +} + +///////////////////////////////////////////////////////////////// +// ComputeScore() +// +// Computes the annotation score for a particular column. 
+///////////////////////////////////////////////////////////////// + +int MSA::ComputeScore(const SafeVector > &active, + const SafeVector > &sparseMatrices) { + + if (active.size() <= 1) + return 0; + + // ALTERNATIVE #1: Compute the average alignment score. + + float val = 0; + for (int i = 0; i < (int) active.size(); i++) { + for (int j = i + 1; j < (int) active.size(); j++) { + val += sparseMatrices[active[i].first][active[j].first]->GetValue( + active[i].second, active[j].second); + } + } + + return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1))); + +} diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/MSA.h b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSA.h new file mode 100644 index 0000000..9d4ef7c --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSA.h @@ -0,0 +1,100 @@ +#ifndef _MSA_H +#define _MSA_H +#include "MSADef.h" +#include "MSAGuideTree.h" + +#include "SafeVector.h" +#include "MultiSequence.h" +#include "ScoreType.h" +#include "ProbabilisticModel.h" +#include "SparseMatrix.h" +#include +using namespace std; + +class MSAGuideTree; +struct TreeNode; +class MSA { +public: + MSA(int argc, char* argv[]); + ~MSA(); + + static void getSysTime(double * dtime); + MSAGuideTree* getGuideTree() { + return tree; + } + int * getSeqsWeights() { + return seqsWeights; + } +private: + //print usage + void printUsage(); + //do multiple sequence alignment + void doAlign(); + + //for sequence weights + void createSeqsWeights(int seqsNum); + void releaseSeqsWeights(); + + //weights of sequences + int * seqsWeights; + //guide tree + MSAGuideTree* tree; + //output file + string alignOutFileName; + std::ostream* alignOutFile; +private: + SafeVector ParseParams(int argc, char *argv[]); + void PrintParameters(const char *message, const VF &initDistrib, + const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs, + const VF &emitSingle, const char *filename); + + SafeVector PostProbsParseParams(int argc, char **argv); + MultiSequence 
*doAlign(MultiSequence *sequence, + const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, + VF &gapExtend, VVF &emitPairs, VF &emitSingle); + void ReadParameters(); + MultiSequence* ProcessTree(TreeNode *tree, MultiSequence *sequences, + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model); + MultiSequence *ComputeFinalAlignment(MSAGuideTree *tree, + MultiSequence *sequences, + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model); + MultiSequence *AlignAlignments(MultiSequence *align1, MultiSequence *align2, + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model); + SafeVector > DoRelaxation(float* seqsWeights, + MultiSequence *sequences, + SafeVector > &sparseMatrices); + void Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY, + VF &posterior); + void Relax1(float weight, SparseMatrix *matXZ, SparseMatrix *matZY, + VF &posterior); + + int GenRandom(int m, int seed, bool init = false); + void DoIterativeRefinement( + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model, MultiSequence* &alignment, int si); + void DoIterativeRefinement( + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model, MultiSequence* &alignment); + void DoIterativeRefinementTreeNode( + const SafeVector > &sparseMatrices, + const ProbabilisticModel &model, MultiSequence* &alignment, + int nodeIndex); + void WriteAnnotation(MultiSequence *alignment, + const SafeVector > &sparseMatrices); + int ComputeScore(const SafeVector > &active, + const SafeVector > &sparseMatrices); +#ifdef _OPENMP + //private struct + struct SeqsPair { + int seq1; + int seq2; + }; + int numPairs; + SeqsPair* seqsPairs; +#endif +}; + +#endif diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAClusterTree.cpp b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAClusterTree.cpp new file mode 100644 index 0000000..a95efe0 --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAClusterTree.cpp @@ -0,0 +1,151 @@ 
+/*********************************************** + * # Copyright 2009-2010. Liu Yongchao + * # Contact: Liu Yongchao, School of Computer Engineering, + * # Nanyang Technological University. + * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com + * # + * # GPL version 3.0 applies. + * # + * ************************************************/ + +#include "MSAClusterTree.h" +MSAClusterTree::MSAClusterTree(MSA* msa, VVF& distMatrix, int numSeqs) : + MSAGuideTree(msa, distMatrix, numSeqs) { +} +MSAClusterTree::~MSAClusterTree() { +} +void MSAClusterTree::create() { + //generate the neighbor-joining tree + this->generateClusterTree(); + + //calculate sequence weights + this->getSeqsWeights(); + + //construct the alignment orders + this->createAlignmentOrders(); +} +void MSAClusterTree::generateClusterTree() { + int i; + ValidNode* validNodes, *headValidNodes; + ValidNode* miniPtr, *minjPtr, *ivalid, *jvalid; + int mini, minj; + float* joins; + unsigned int* clusterLeafs; + + //initialize the valid nodes link list + validNodes = new ValidNode[leafsNum + 1]; + joins = new float[leafsNum + 1]; + clusterLeafs = new unsigned int[nodesNum + 1]; + if (!validNodes || !joins || !clusterLeafs) { + cerr << "Out of memory of the reconstruction of cluster tree" << endl; + } + //initialize cluster size + for (i = 0; i < this->leafsNum; i++) { + clusterLeafs[i] = 1; + } + + headValidNodes = &validNodes[0]; + headValidNodes->next = &validNodes[1]; + headValidNodes->n = -1; + headValidNodes->node = -1; + headValidNodes->prev = NULL; + + //build an initial link list + ValidNode* curr = &validNodes[1]; + ValidNode* prev = headValidNodes; + ValidNode* next = &validNodes[2]; + for (i = 0; i < leafsNum; i++) { + curr->n = i; + curr->node = i; + curr->prev = prev; + curr->next = next; + prev = curr; + curr = next; + next++; + } + prev->next = NULL; + + //to generate the cluster tree + int nodeIdx; //the index of an internal node + int firstNode = leafsNum; //the index of the first internal 
node + int lastNode = firstNode + leafsNum - 1;//the index of the last internal node + + for (nodeIdx = firstNode; nodeIdx < lastNode; nodeIdx++) { + //find closest pair of clusters + float minDist = 2.0f; + miniPtr = headValidNodes; + minjPtr = headValidNodes; + + for (ivalid = headValidNodes->next; ivalid != NULL; + ivalid = ivalid->next) { + mini = ivalid->n; + for (jvalid = headValidNodes->next; + jvalid != NULL && jvalid->n < mini; jvalid = jvalid->next) { + minj = jvalid->n; + float dist = (*distMatrix)[mini][minj]; + if (dist < 0) { + cerr + << "ERROR: It is impossible to have distance value less than zero" + << endl; + dist = 0; + } + if (dist < minDist) { + minDist = dist; + miniPtr = ivalid; + minjPtr = jvalid; + } + //printf("dist %g mini %d minj %d\n", dist, ivalid->node, jvalid->node); + } + } + //printf("**** mini %d minj %d minDist %g *****\n", miniPtr->node, minjPtr->node, minDist); + //check the validity of miniPtr and minjPtr; + if (miniPtr == headValidNodes || minjPtr == headValidNodes) { + cerr << "OOPS: Error occurred while constructing the cluster tree\n" + << endl; + exit(-1); + } + //computing branch length and join the two nodes + float branchLength = minDist * 0.5f; + this->connectNodes(&nodes[nodeIdx], nodeIdx, &nodes[miniPtr->node], + branchLength, &nodes[minjPtr->node], branchLength); + clusterLeafs[nodeIdx] = clusterLeafs[miniPtr->node] + + clusterLeafs[minjPtr->node]; + + //remove the valid node minjPtr from the list + minjPtr->prev->next = minjPtr->next; + if (minjPtr->next != NULL) { + minjPtr->next->prev = minjPtr->prev; + } + minjPtr->prev = minjPtr->next = NULL; + + //compute the distance of each remaining valid node to the new node + for (ivalid = headValidNodes->next; ivalid != NULL; + ivalid = ivalid->next) { + int idx = ivalid->n; + + float idist = (*distMatrix)[miniPtr->n][idx]; + float jdist = (*distMatrix)[minjPtr->n][idx]; + + unsigned int isize = clusterLeafs[miniPtr->node]; + unsigned int jsize = 
clusterLeafs[minjPtr->node]; + joins[idx] = (idist * isize + jdist * jsize) / (isize + jsize); + } + //update the distance to the new node + miniPtr->node = nodeIdx; + mini = miniPtr->n; + for (jvalid = headValidNodes->next; jvalid != NULL; + jvalid = jvalid->next) { + minj = jvalid->n; + + float dist = joins[minj]; + (*distMatrix)[mini][minj] = dist; + (*distMatrix)[minj][mini] = dist; + } + } + //add a pseudo root to this unrooted NJ tree + this->root = &nodes[lastNode - 1]; + + delete[] validNodes; + delete[] joins; + delete[] clusterLeafs; +} diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAClusterTree.h b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAClusterTree.h new file mode 100644 index 0000000..30bce05 --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAClusterTree.h @@ -0,0 +1,27 @@ +/*********************************************** + * # Copyright 2009-2010. Liu Yongchao + * # Contact: Liu Yongchao, School of Computer Engineering, + * # Nanyang Technological University. + * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com + * # + * # GPL version 3.0 applies. 
+ * # + * ************************************************/ + +#ifndef _MSA_CLUSTER_TREE_H +#define _MSA_CLUSTER_TREE_H + +#include "MSAGuideTree.h" + +class MSAClusterTree: public MSAGuideTree { +public: + MSAClusterTree(MSA* msa, VVF& distMatrix, int numSeqs); + ~MSAClusterTree(); + + //construct the cluster tree + void create(); +private: + //generate the cluster tree + void generateClusterTree(); +}; +#endif diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/MSADef.h b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSADef.h new file mode 100644 index 0000000..6a3d178 --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSADef.h @@ -0,0 +1,26 @@ +#ifndef _MSA_DEF_H +#define _MSA_DEF_H +#include +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + +//maximum number +#define MAX_INT_NUM 0x7FFFFFFF +#define MAX_FLOAT_NUM FLT_MAX +#define INT_MULTIPLY 1000 + +#define SUBMATRIX_INT_SCALE 100 + +//a tree node is a leaf or a node +enum { + NONE, NODE, LEAF +}; + +#endif + diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAGuideTree.cpp b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAGuideTree.cpp new file mode 100644 index 0000000..207d25b --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAGuideTree.cpp @@ -0,0 +1,327 @@ +/*********************************************** + * # Copyright 2009-2010. Liu Yongchao + * # Contact: Liu Yongchao, School of Computer Engineering, + * # Nanyang Technological University. + * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com + * # + * # GPL version 3.0 applies. 
+ * # + * ************************************************/ +#include "MSAGuideTree.h" +#include "MSA.h" +MSAGuideTree::MSAGuideTree(MSA* msa, VVF& distances, int numSeqs) { + int i; + TreeNode* node; + //system configuration + this->msa = msa; + this->distMatrix = &distances; + this->numSeqs = numSeqs; + this->seqsWeights = msa->getSeqsWeights(); + + //tree structure + this->nodesSize = this->numSeqs * 2 + 1; + this->nodes = new TreeNode[this->nodesSize]; + if (!this->nodes) { + cerr << "TreeNodes memory allocation failed" << endl; + exit(-1); + } + //initialize all the tree nodes + this->leafs = this->nodes; + this->leafsNum = this->numSeqs; + this->nodesNum = 2 * this->leafsNum - 1; + for (i = 0; i < this->nodesSize; i++) { + node = &nodes[i]; + node->left = 0; + node->right = 0; + node->parent = 0; + node->leftIdx = -1; + node->rightIdx = -1; + node->parentIdx = -1; + node->idx = -1; + node->dist = 0; + node->leaf = NODE; //setted to be NODE, by default + node->order = 0; + node->depth = 0; + } + //initialize the leaf nodes + for (i = 0; i < this->leafsNum; i++) { + node = &this->leafs[i]; + node->idx = i; + node->leaf = LEAF; + } +} +MSAGuideTree::~MSAGuideTree() { + //release tree nodes + delete[] this->nodes; + + //release alignment orders + releaseAlignmentOrders(); + +} +//get the tree nodes +TreeNode* MSAGuideTree::getNodes() { + return nodes; +} +//get the leaf nodes +TreeNode* MSAGuideTree::getLeafs() { + return leafs; +} +//get the number of nodes; +int MSAGuideTree::getNodesNum() { + return nodesNum; +} +//get the number of leaf nodes +int MSAGuideTree::getLeafsNum() { + return leafsNum; +} +//get the alignment orders +AlignmentOrder* MSAGuideTree::getAlignOrders() { + return alignOrders; +} +int MSAGuideTree::getAlignOrdersNum() { + return alignOrdersNum; +} +/**************************************************** + create the evolutionary relationship + ****************************************************/ +void MSAGuideTree::connectNodes(TreeNode* 
parent, int parentIdx, + TreeNode* leftChild, float leftDist, TreeNode* rightChild, + float rightDist) { + //save the parents index for each child + leftChild->parent = parent; + leftChild->parentIdx = parentIdx; + rightChild->parent = parent; + rightChild->parentIdx = parentIdx; + + //save the branch lengths (i.e. distance) from each child to its parent + leftChild->dist = leftDist; + rightChild->dist = rightDist; + + //save the indices of itself and its children for this new tree node + parent->idx = parentIdx; + parent->left = leftChild; + parent->leftIdx = leftChild->idx; + parent->right = rightChild; + parent->rightIdx = rightChild->idx; +} +/***************************************** + compute the alignment order of the phylogentic tree + *****************************************/ +void MSAGuideTree::createAlignmentOrders() { + int i; + + AlignmentOrder* order; + //allocate memory space for alignment orders vector + this->alignOrdersNum = 0;//for alignment orders, it starts from 1 instead of 0 + this->alignOrdersSize = numSeqs;//the number of internal nodes of the phylogentic tree + 1 + this->alignOrders = new AlignmentOrder[this->alignOrdersSize]; + if (!this->alignOrders) { + cerr << "OOPS: Alignment orders memory allocation failed" << endl; + exit(-1); + } + //initialize the alignment orders vector + for (i = 0; i < this->alignOrdersSize; i++) { + order = &this->alignOrders[i]; + order->leftOrder = 0; + order->rightOrder = 0; + order->leftLeafs = 0; + order->leftNum = 0; + order->rightLeafs = 0; + order->rightNum = 0; + } + //starting out constructing the alignment orders + int subLeafsNum; + int nodeDepth = 1; + int subOrder = recursiveCreateAlignmentOrders(this->root, 0, subLeafsNum, + nodeDepth); + + //check whether the function works well + if (subLeafsNum != numSeqs || this->alignOrdersNum != subOrder) { + fprintf(stderr, + "The alignment orders constructed were wrong (subLeafsNum %d, alignOrdersNum %d, subOrder %d)\n", + subLeafsNum, alignOrdersNum, 
subOrder); + } + +} +int MSAGuideTree::recursiveCreateAlignmentOrders(TreeNode* subRoot, + int* subLeafs, int& subLeafsNum, int nodeDepth) { + int leftNum, rightNum; + int leftOrder, rightOrder; + int* leftLeafs, *rightLeafs; + + if (subRoot->leaf == LEAF) { + subLeafs[0] = subRoot->idx; + subLeafsNum = 1; + + return 0; //if it is a leaf, return the index 0 + } + leftOrder = rightOrder = 0; + leftNum = rightNum = 0; + leftLeafs = new int[numSeqs]; + rightLeafs = new int[numSeqs]; + + //check the left subtree + if (subRoot->left) { + //recursively tranverse the left subtree + leftOrder = recursiveCreateAlignmentOrders(subRoot->left, leftLeafs, + leftNum, nodeDepth + 1); + } + //check the right subtree + if (subRoot->right) { + rightOrder = recursiveCreateAlignmentOrders(subRoot->right, rightLeafs, + rightNum, nodeDepth + 1); + } + //save the leafs in the left and right subtrees of the current subtree + if (this->alignOrdersNum > this->alignOrdersSize) { + fprintf(stderr, "the alignment order function works bad\n");\ + exit(-1); + } + + AlignmentOrder* order = &this->alignOrders[++this->alignOrdersNum]; + order->nodeDepth = nodeDepth; + order->leftOrder = leftOrder; + order->rightOrder = rightOrder; + order->leftNum = leftNum; + order->rightNum = rightNum; + order->leftLeafs = new int[order->leftNum]; + order->rightLeafs = new int[order->rightNum]; + if (!order->leftLeafs || !order->rightLeafs) { + fprintf(stderr, + "memory allocation failed while recursively constructing alignment orders\n"); + exit(-1); + } + memcpy(order->leftLeafs, leftLeafs, order->leftNum * sizeof(int)); + memcpy(order->rightLeafs, rightLeafs, order->rightNum * sizeof(int)); + + delete[] leftLeafs; + delete[] rightLeafs; + + //for the root of the tree, subLeafs buffer is set to 0 + if (subLeafs) { + //copy the results to the parent tree node + memcpy(subLeafs, order->leftLeafs, order->leftNum * sizeof(int)); + memcpy(subLeafs + order->leftNum, order->rightLeafs, + order->rightNum * 
sizeof(int)); + } + //compute the total number of leafs in this subtree + subLeafsNum = order->leftNum + order->rightNum; + + return this->alignOrdersNum;//return the index of itself, starting from 1, instead of 0 +} +void MSAGuideTree::releaseAlignmentOrders() { + if (!this->alignOrders) { + return; + } + for (int i = 0; i < this->alignOrdersNum; i++) { + AlignmentOrder* order = &this->alignOrders[i]; + if (order->leftLeafs) { + delete[] order->leftLeafs; + } + if (order->rightLeafs) { + delete[] order->rightLeafs; + } + } + delete[] alignOrders; +} +/******************************** + display the alignment orders + ********************************/ +void MSAGuideTree::displayAlignmentOrders() { + int i, j; + AlignmentOrder* order; + fprintf(stderr, "************DISPLAY ALIGNMENT ORDER***************\n"); + for (i = 1; i <= this->alignOrdersNum; i++) { + order = &this->alignOrders[i]; + + fprintf(stderr, "GROUP (%d depth %d):\n---LEFT ORDER: %d\n", i, + order->nodeDepth, order->leftOrder); + fprintf(stderr, "---LEFT: "); + for (j = 0; j < order->leftNum; j++) { + fprintf(stderr, "%d ", order->leftLeafs[j]); + } + + fprintf(stderr, "\n---RIGHT ORDER: %d\n", order->rightOrder); + fprintf(stderr, "\n---RIGHT: "); + for (j = 0; j < order->rightNum; j++) { + fprintf(stderr, "%d ", order->rightLeafs[j]); + } + fprintf(stderr, "\n"); + } + fprintf(stderr, "*******************************************\n"); +} +/********************************* + display the tree + *********************************/ +void MSAGuideTree::displayTree() { + fprintf(stderr, "**************DISPLAY TREE*********************\n"); + for (int i = 0; i < nodesNum; i++) { + TreeNode* node = &nodes[i]; + + fprintf(stderr, + "%d(%p): left(%p) %d, right(%p) %d, parent(%p) %d, dist %f\n", + (node == &nodes[node->idx]) ? node->idx : -2, node, node->left, + (!node->left || node->left == &nodes[node->leftIdx]) ? + node->leftIdx : -2, node->right, + (!node->right || node->right == &nodes[node->rightIdx]) ? 
+ node->rightIdx : -2, node->parent, + (!node->parent || node->parent == &nodes[node->parentIdx]) ? + node->parentIdx : -2, node->dist); + } + fprintf(stderr, "*******************************************\n"); +} +/********************************* + compute the sequence weights + *********************************/ +void MSAGuideTree::getSeqsWeights() { + int i; + TreeNode* curr; + + //compute the order of each node, which represents the number of leaf nodes in the substree rooting from it. + for (i = 0; i < leafsNum; i++) { + //for each leaf nodes + curr = &this->leafs[i]; + while (curr != 0) { + curr->order++; + + curr = curr->parent; + } + } + //compute the weight of each sequence, which corresponds to a leaf node + for (i = 0; i < numSeqs; i++) { + //compute the weight of each sequence + float weights = 0; + curr = &this->leafs[i]; + while (curr->parent != 0) { + weights += curr->dist / curr->order; + curr = curr->parent; + //printf("order:%d weights: %f\n", curr->order, weights); + } + //save the weight of this sequence + seqsWeights[i] = (int) (100 * weights); + //printf("%d\n", seqsWeights[i]); + } + //normalize the weights + int wsum = 0; + for (i = 0; i < numSeqs; i++) { + wsum += seqsWeights[i]; + } + if (wsum == 0) { + //in this case, every sequence is assumed to have an identical weight + for (i = 0; i < numSeqs; i++) { + seqsWeights[i] = 1; + } + wsum = numSeqs; + } + //printf("wsum:%d \n", wsum); + for (i = 0; i < numSeqs; i++) { + seqsWeights[i] = (seqsWeights[i] * INT_MULTIPLY) / wsum; + if (seqsWeights[i] < 1) { + seqsWeights[i] = 1; + } + //printf("%d \n", seqsWeights[i]); + } +} +void MSAGuideTree::create() { + //do nothing +} + diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAGuideTree.h b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAGuideTree.h new file mode 100644 index 0000000..97d538a --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAGuideTree.h @@ -0,0 +1,119 @@ +/*********************************************** + * # Copyright 
2009-2010. Liu Yongchao + * # Contact: Liu Yongchao, School of Computer Engineering, + * # Nanyang Technological University. + * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com + * # + * # GPL version 3.0 applies. + * # + * ************************************************/ + +#ifndef _MSA_GUIDE_TREE_H +#define _MSA_GUIDE_TREE_H +#include "MSADef.h" +#include "MSA.h" + +#include "SafeVector.h" +#include "MultiSequence.h" +#include "ScoreType.h" +#include "ProbabilisticModel.h" +#include "SparseMatrix.h" + +class MSA; +struct ValidNode { + ValidNode* prev; + ValidNode* next; + int n; //the index in the distance matrix + int node; //the index in the tree node entries +}; + +struct TreeNode { + struct TreeNode *left; //the pointer to its left child + struct TreeNode *right; //the pointer to its right child + struct TreeNode *parent; //the pointer to its parent + int leftIdx; //the index of the left child + int rightIdx; //the index of the right child + int parentIdx; //the index of its parent + int idx; //the index of itself + float dist; //the distance to its parent + int leaf; //whether it is a leaf node or not + int order; //the number of generations dating back to its ancestor + int depth; //the depth of the node +}; +struct AlignmentOrder { + int nodeDepth; //the depth of the internal node + int leftOrder; //the order number of the right child + int rightOrder; //the order number of the left child + int* leftLeafs; //the indices of leafs in the left subtree + int leftNum; //the number of leafs in the left subtree + int* rightLeafs; //the indices of leafs in the right subtree + int rightNum; //the number of leafs in the right substree +}; + +class MSAGuideTree { +public: + MSAGuideTree(MSA* msa, VVF& distMatrix, int numSeqs); + virtual ~MSAGuideTree() = 0; //abstract class + + //get the tree nodes + TreeNode* getNodes(); + //get the leaf nodes + TreeNode* getLeafs(); + //get the number of nodes; + int getNodesNum(); + //get the number of leaf nodes + int 
getLeafsNum(); + //get the root of the tree + TreeNode* getRoot() { + return this->root; + } + //get the alignment orders + AlignmentOrder* getAlignOrders(); + int getAlignOrdersNum(); + //construct the alignment orders + void createAlignmentOrders(); + + //construct the guide tree + virtual void create(); + //calculate the sequence weights + virtual void getSeqsWeights(); + + /**********DEBUGING****************/ + //display the tree + void displayTree(); + //display the alignment orders + void displayAlignmentOrders(); + +protected: + //join two nodes + void connectNodes(TreeNode* parent, int parentIdx, TreeNode* leftChild, + float leftDist, TreeNode* rightChild, float rightDist); + //release the alignment orders vector + void releaseAlignmentOrders(); + //recursive implemenation of constructing the alignment orders + int recursiveCreateAlignmentOrders(TreeNode* subRoot, int* subLeafs, + int& subLeafsNum, int nodeDepth); + + //system configurations + MSA* msa; + VVF* distMatrix; + int numSeqs; + int* seqsWeights; + + //all the tree nodes + TreeNode* nodes; + int nodesNum; + int nodesSize; + //the root tree node + TreeNode* root; + //leaf node + TreeNode* leafs; + int leafsNum; + + //alignment order + AlignmentOrder* alignOrders; + int alignOrdersNum; + int alignOrdersSize; +}; +#endif + diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAPartProbs.cpp b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAPartProbs.cpp new file mode 100644 index 0000000..f9c3f25 --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAPartProbs.cpp @@ -0,0 +1,728 @@ +/*********************************************** + * # Copyright 2009-2010. Liu Yongchao + * # Contact: Liu Yongchao, School of Computer Engineering, + * # Nanyang Technological University. + * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com + * # + * # GPL version 3.0 applies. 
+ * # + * ************************************************/ +#include "SafeVector.h" +#include +#include +#include +#include +#include +#include +#include +#include +#define TRACE 0 // 0: NOTRACE 1: TRACE +//proba like settings +#define endgaps 1 // 1: engap penaties enabled 0: disabled +#define PART_FULL_MEMORY 0 //0: LOW MEM OPTION +#define REVPART_FULL_MEMORY 0 //0: LOW MEM OPTION +using namespace std; + +#ifdef _WIN32 +#define OS_HUGE_VALL HUGE_VAL +#else +#define OS_HUGE_VALL HUGE_VALL +#endif + +typedef struct { + char input[30]; + int matrix; + int N; + float T; + float beta; + char opt; //can be 'P' or 'M' + float gapopen; + float gapext; +} argument_decl; + +typedef struct sequence { + char *title; + char *text; + int length; +} fasta; + +typedef struct alignment { + char *title; + char *text; + int length; +} align; + +//////////////////////////////////////////////////////// +//externs related to scoring matrix and input arguments +/////////////////////////////////////////////////////////// +extern float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2; +extern char aminos[26], matrixtype[20], bases[26]; + +extern double sub_matrix[26][26]; +extern int subst_index[26]; + +extern float TEMPERATURE; +extern int MATRIXTYPE; + +extern float GAPOPEN; +extern float GAPEXT; +extern argument_decl argument; + +////////////////////////////////////////////////////////////////////////////// +//calculates reverse partition function values based on z matrices +//and also simulaneously calculates the propability of each basepair +//or aminoacid residue pair i,j +////////////////////////////////////////////////////////////////////////////// + +VF *revers_partf(fasta sequences[2], const double termgapopen, + const double termgapextend, long double **Zfm, const double d, + const double e) { + // printf("revpart\n"); + //rest of the declarations + int i, j; + long double **Zm = NULL; + long double **Ze = NULL; + long double **Zf = NULL; + int len0, len1; + float 
probability; + long double tempvar; + int Si, Tj; + double endgapopen, endgapextend; + FILE *fo; + + //Init lengths of sequences + len0 = strlen(sequences[0].text); + len1 = strlen(sequences[1].text); + + //Safe vector declared + VF *posteriorPtr = new VF((len0 + 1) * (len1 + 1)); + VF & posterior = *posteriorPtr; + VF::iterator ptr = posterior.begin(); + + if (TRACE) //open the trace file + fo = fopen("revpartdump", "a"); + + //default: + endgapopen = termgapopen; + endgapextend = termgapextend; + + //instantiate the z matrix + if (REVPART_FULL_MEMORY) { + + Ze = new long double *[sequences[1].length + 1]; + Zf = new long double *[sequences[1].length + 1]; + Zm = new long double *[sequences[1].length + 1]; + + if (TRACE) + printf("\n\n %e %e\n", d, e); + + //DYNAMICALLY GROW 2D Zm Zf Ze MARICES (long double) + for (i = 0; i <= sequences[1].length; i++) { + Ze[i] = new long double[sequences[0].length + 1]; + Zf[i] = new long double[sequences[0].length + 1]; + Zm[i] = new long double[sequences[0].length + 1]; + } + } else { + Zm = new long double *[2]; + Ze = new long double *[2]; + Zf = new long double *[2]; + for (i = 0; i <= 1; i++) { + Zm[i] = new long double[sequences[0].length + 1]; + Ze[i] = new long double[sequences[0].length + 1]; + Zf[i] = new long double[sequences[0].length + 1]; + } + + } + + if (TRACE) { + printf("in rev partf---"); + printf("\n\n"); + } + + if (REVPART_FULL_MEMORY) { + for (i = 0; i <= len1; i++) + for (j = 0; j <= len0; j++) { + Zm[i][j] = 0.0; + Zf[i][j] = 0.0; + Ze[i][j] = 0.0; + } + } else { + + for (j = 0; j <= len0; j++) { + Zm[0][j] = 0; + Zf[0][j] = 0; + Ze[0][j] = 0; + Zf[1][j] = 0; + Ze[1][j] = 0; + Zm[1][j] = 0; + } + } + + //fill the probability matrix with 0s + for (i = 0; i <= len1; i++) + for (j = 0; j <= len0; j++) + ptr[j * (len1 + 1) + i] = 0; + + if (endgaps == 0) { + Zm[len1][len0] = 1; + Ze[len1][len0] = Zf[len1][len0] = 0; + Zf[len1 - 1][len0] = Zm[len1][len0] * d; + Ze[len1][len0 - 1] = Zm[len1][len0] * d; + + 
//>=2ND ROW INIT + if (REVPART_FULL_MEMORY) { + for (i = len1 - 2; i >= 0; i--) { + Zf[i][len0] = Zf[i + 1][len0] * e; + } + } + + //>=2ND COL INIT + if (REVPART_FULL_MEMORY) { + for (j = len0 - 2; j >= 0; j--) { + Ze[len1][j] = Ze[len1][j + 1] * e; + } + } else { + for (j = len0 - 2; j >= 0; j--) { + Ze[0][j] = Ze[0][j + 1] * e; + } + } + } else { + + if (REVPART_FULL_MEMORY) { + + Zm[len1][len0] = 1; + Ze[len1][len0] = Zf[len1][len0] = 0; + Zf[len1 - 1][len0] = Zm[len1][len0] * endgapopen; + Ze[len1][len0 - 1] = Zm[len1][len0] * endgapopen; + + //>=2ND ROW INIT + for (i = len1 - 2; i >= 0; i--) { + Zf[i][len0] = Zf[i + 1][len0] * endgapextend; + } + + //M Iy= d+j*e + + //>=2ND COL INIT + for (j = len0 - 2; j >= 0; j--) { + Ze[len1][j] = Ze[len1][j + 1] * endgapextend; + } + + } else { + //in Zm + //let: + // Zm(0) be the current row being filled/computed + // Zm(1) be the previous row + + Zm[1][len0] = 1; + Ze[0][len0] = Zf[0][len0] = 0; + Zf[1][len0] = Zm[1][len0] * endgapopen; + Ze[0][len0 - 1] = Zm[1][len0] * endgapopen; + + //>=2ND COL INIT + for (j = len0 - 2; j >= 0; j--) { + Ze[0][j] = Ze[0][j + 1] * endgapextend; + } + + } //END ELSE + + } //END FULL MEMORY and GAP enablement IF STATEMENT + + double scorez, zz = 0; + + for (i = len1 - 1; i >= 0; i--) { + + for (j = len0 - 1; j >= 0; j--) { + Si = subst_index[sequences[1].text[i] - 'A']; + Tj = subst_index[sequences[0].text[j] - 'A']; + scorez = sub_matrix[Si][Tj]; + + //endgaps modification aug 10 + double open0, extend0, open1, extend1; + + open0 = open1 = d; + extend0 = extend1 = e; + + if (endgaps == 1) { + + //check to see if one of the 2 sequences or both reach the end + + if (i == 0) { + open0 = endgapopen; + extend0 = endgapextend; + + } + + if (j == 0) { + open1 = endgapopen; + extend1 = endgapextend; + } + + } + + if (REVPART_FULL_MEMORY) { + //z computation + + Ze[i][j] = Zm[i][j + 1] * open0 + Ze[i][j + 1] * extend0; + Zf[i][j] = Zm[i + 1][j] * open1 + Zf[i + 1][j] * extend1; + Zm[i][j] = (Zm[i 
+ 1][j + 1] + Zf[i + 1][j + 1] + + Ze[i + 1][j + 1]) * scorez; + zz = Zm[i][j] + Zf[i][j] + Ze[i][j]; + + } else { + + //2 ROW zE zF ALGORITHM GOES...: + //Ze[1][j] =Zm[i][j + 1] * exp(beta * open0) + Ze[1][j + 1] *exp(beta * extend0); + //Zf[1][j] = Zm[i + 1][j] * exp(beta * open1) + Zf[0][j] * exp(beta * extend1); + //Zm[i][j] = (Zm[i + 1][j + 1] + Zf[0][j + 1] + Ze[0][j + 1]) * exp(beta * scorez); + //zz = Zm[0][j] + Zf[1][j] + Ze[1][j]; + + //lowmem code for merging probability calculating module + //Here we make use of Zm as a 2 row matrix + + Zf[1][j] = Zm[1][j] * open1 + Zf[0][j] * extend1; + Ze[1][j] = Zm[0][j + 1] * open0 + Ze[1][j + 1] * extend0; + Zm[0][j] = (Zm[1][j + 1] + Zf[0][j + 1] + Ze[0][j + 1]) + * scorez; + + tempvar = Zfm[i + 1][j + 1] * Zm[0][j]; + //divide P(i,j) i.e. pairwise probability by denominator + tempvar /= (scorez * Zfm[0][0]); + probability = (float) tempvar; + + //store only noticable probabilities + if (probability <= 1 && probability >= 0.001) { + //algorithm goes... 
+ //validprob[i + 1][j + 1] = probability; + ptr[(j + 1) * (len1 + 1) + (i + 1)] = probability; + } + //lowmem code ends here + + } + + } //end of for + + if (REVPART_FULL_MEMORY == 0) { + for (int t = 0; t <= sequences[0].length; t++) { + Ze[0][t] = Ze[1][t]; + Ze[1][t] = 0; + + Zf[0][t] = Zf[1][t]; + Zf[1][t] = 0; + + Zm[1][t] = Zm[0][t]; + Zm[0][t] = 0; + + } + Zf[0][len0] = 1; + + } + + } //end of for + + if (TRACE) { + printf("\n\nrM:....\n\n"); + if (REVPART_FULL_MEMORY) { + for (i = 0; i <= len1; i++) { + for (j = 0; j <= len0; j++) + printf("%.2Le ", Zm[i][j]); + printf("\n"); + } + + printf("\n\nrE:....\n\n"); + for (i = 0; i <= len1; i++) { + for (j = 0; j <= len0; j++) + printf("%.2Le ", Ze[i][j]); + printf("\n"); + + } + + printf("\n\nrF:....\n\n"); + for (i = 0; i <= len1; i++) { + for (j = 0; j <= len0; j++) + printf("%.2Le ", Zf[i][j]); + printf("\n"); + + } + + } + + } + + if (TRACE) { + fprintf(fo, "\n"); + fclose(fo); + } + + //delete unused memory + + if (REVPART_FULL_MEMORY) { + for (i = 0; i <= len1; i++) { + delete (Zm[i]); + delete (Zf[i]); + delete (Ze[i]); + } + } else { + delete (Zf[0]); + delete (Ze[0]); + delete (Zm[0]); + + delete (Zm[1]); + delete (Zf[1]); + delete (Ze[1]); + } + + for (i = 0; i <= len1; i++) { + delete (Zfm[i]); + } + + if (Zf != NULL) + delete (Zf); + + if (Ze != NULL) + delete (Ze); + + if (Zm != NULL) + delete (Zm); + + if (Zfm != NULL) + delete (Zfm); + + posterior[0] = 0; + return (posteriorPtr); + +} + +////////////////////////////////////////////////////////////// +//forward partition function +///////////////////////////////////////////////////////////// + +long double **partf(fasta sequences[2], const double termgapopen, + const double termgapextend, const double d, const double e) { + //printf("partf\n"); + int i, j, len1, len0; + long double **Zm = NULL, **Zf = NULL, **Ze = NULL, zz = 0; + double endgapopen, endgapextend; + + //default: + endgapopen = termgapopen; + endgapextend = termgapextend; + + //the 
flag endgaps is set at the #define section + if (PART_FULL_MEMORY) { + + Zf = new long double *[sequences[1].length + 1]; + Ze = new long double *[sequences[1].length + 1]; + Zm = new long double *[sequences[1].length + 1]; + + //comment + if (TRACE) + printf("\nPARTF:====\n"); + + //DYNAMICALLY GROW 2D M,IX,IY,PIX,PIY MARICES + for (i = 0; i <= sequences[1].length; i++) { + Zf[i] = new long double[sequences[0].length + 1]; + Ze[i] = new long double[sequences[0].length + 1]; + Zm[i] = new long double[sequences[0].length + 1]; + } + } else { + Zm = new long double *[sequences[1].length + 1]; + Ze = new long double *[2]; + Zf = new long double *[2]; + for (i = 0; i <= sequences[1].length; i++) { + Zm[i] = new long double[sequences[0].length + 1]; + } + Ze[0] = new long double[sequences[0].length + 1]; + Zf[0] = new long double[sequences[0].length + 1]; + Ze[1] = new long double[sequences[0].length + 1]; + Zf[1] = new long double[sequences[0].length + 1]; + } + + len0 = strlen(sequences[0].text); + len1 = strlen(sequences[1].text); + + if (PART_FULL_MEMORY) { + for (i = 0; i <= sequences[1].length; i++) + for (j = 0; j <= sequences[0].length; j++) { + Zm[i][j] = 0.00; + Zf[i][j] = 0.00; + Ze[i][j] = 0.00; + } + } else { + for (i = 0; i <= len1; i++) { + for (j = 0; j <= len0; j++) { + Zm[i][j] = 0; + } + } + for (j = 0; j <= len0; j++) { + Zf[0][j] = 0; + Ze[0][j] = 0; + Zf[1][j] = 0; + Ze[1][j] = 0; + } + } + + //INTITIALIZE THE DP + + if (endgaps == 0) { + Zm[0][0] = 1.00; + + Zf[0][0] = Ze[0][0] = 0; + Zf[1][0] = Zm[0][0] * d; + Ze[0][1] = Zm[0][0] * d; + + //>=2ND ROW INIT + if (PART_FULL_MEMORY) { + for (i = 2; i <= sequences[1].length; i++) { + Zf[i][0] = Zf[i - 1][0] * e; + } + } + + //>=2ND COL INIT + for (j = 2; j <= sequences[0].length; j++) { + Ze[0][j] = Ze[0][j - 1] * e; + } + } else { + //init z + Zm[0][0] = 1.00; + Zf[0][0] = Ze[0][0] = 0; + Zf[1][0] = Zm[0][0] * endgapopen; + Ze[0][1] = Zm[0][0] * endgapopen; + + //>=2ND ROW INIT + if 
(PART_FULL_MEMORY) { + for (i = 2; i <= sequences[1].length; i++) { + Zf[i][0] = Zf[i - 1][0] * endgapextend; + } + } + + //>=2ND COL INIT + for (j = 2; j <= sequences[0].length; j++) { + Ze[0][j] = Ze[0][j - 1] * endgapextend; + } + } + + //1ST ROW/COL INIT + + int Si, Tj; + double score; + + for (i = 1; i <= sequences[1].length; i++) { + + for (j = 1; j <= sequences[0].length; j++) { + + Si = subst_index[sequences[1].text[i - 1] - 'A']; + Tj = subst_index[sequences[0].text[j - 1] - 'A']; + + score = sub_matrix[Si][Tj]; + + double open0, extend0, open1, extend1; + + open0 = open1 = d; + extend0 = extend1 = e; + + if (endgaps == 1) { + //check to see if one of the 2 sequences or both reach the end + + if (i == sequences[1].length) { + open0 = endgapopen; + extend0 = endgapextend; + + } + + if (j == sequences[0].length) { + open1 = endgapopen; + extend1 = endgapextend; + } + } + + // + //z computation using open and extend temp vars + //open0 is gap open in seq0 and open1 is gap open in seq1 + //entend0 is gap extend in seq0 and extend1 is gap extend in seq1 + + if (PART_FULL_MEMORY) { + Ze[i][j] = Zm[i][j - 1] * open0 + Ze[i][j - 1] * extend0; + + if (Ze[i][j] >= OS_HUGE_VALL) { + printf("ERROR: huge val error for Ze\n"); + exit(1); + } + + Zf[i][j] = Zm[i - 1][j] * open1 + Zf[i - 1][j] * extend1; + + if (Zf[i][j] >= OS_HUGE_VALL) { + printf("ERROR: huge val error for Zf\n"); + exit(1); + } + + Zm[i][j] = (Zm[i - 1][j - 1] + Ze[i - 1][j - 1] + + Zf[i - 1][j - 1]) * score; + + if (Zm[i][j] >= OS_HUGE_VALL) { + printf("ERROR: huge val error for Zm\n"); + exit(1); + } + + zz = Zm[i][j] + Ze[i][j] + Zf[i][j]; + } else { + Ze[1][j] = Zm[i][j - 1] * open0 + Ze[1][j - 1] * extend0; + + if (Ze[1][j] >= OS_HUGE_VALL) { + printf("ERROR: huge val error for zE\n"); + exit(1); + } + + Zf[1][j] = Zm[i - 1][j] * open1 + Zf[0][j] * extend1; + + if (Zf[1][j] >= OS_HUGE_VALL) { + printf("ERROR: huge val error for zF\n"); + exit(1); + } + + Zm[i][j] = (Zm[i - 1][j - 1] + Ze[0][j - 1] 
+ Zf[0][j - 1]) + * score; + + if (Zm[i][j] >= OS_HUGE_VALL) { + printf("ERROR: huge val error for zM\n"); + exit(1); + } + + zz = Zm[i][j] + Ze[1][j] + Zf[1][j]; + } + + } //end for + + if (!PART_FULL_MEMORY) { + for (int t = 0; t <= sequences[0].length; t++) { + Ze[0][t] = Ze[1][t]; + Ze[1][t] = 0; + + Zf[0][t] = Zf[1][t]; + Zf[1][t] = 0; + } + + Zf[1][0] = 1; + + } + + } //end for + + //store the sum of zm zf ze (m,n)s in zm's 0,0 th position + Zm[0][0] = zz; + + if (TRACE) { + //debug code aug 3 + //print the 3 Z matrices namely Zm Zf and Ze + + printf("\n\nFINAL Zm:\n"); + for (i = 0; i <= sequences[1].length; i++) { + for (j = 0; j <= sequences[0].length; j++) + printf("%.2Le ", Zm[i][j]); + printf("\n"); + } + + printf("FINAL Zf \n"); + for (i = 0; i <= sequences[1].length; i++) { + for (j = 0; j <= sequences[0].length; j++) + printf("%.2Le ", Zf[i][j]); + printf("\n"); + } + + printf("FINAL Ze \n"); + for (i = 0; i <= sequences[1].length; i++) { + for (j = 0; j <= sequences[0].length; j++) + printf("%.2Le ", Ze[i][j]); + printf("\n"); + } + + //end debug dump code + + } + + if (PART_FULL_MEMORY) { + for (i = 0; i <= sequences[1].length; i++) { + delete (Zf[i]); + delete (Ze[i]); + } + } else { + delete (Zf[0]); + delete (Ze[0]); + delete (Zf[1]); + delete (Ze[1]); + } + + delete (Zf); + delete (Ze); + + return Zm; + +} //end of forward partition function + +///////////////////////////////////////////////////////////////////////////////////////// +//entry point (was the main function) , returns the posterior probability safe vector +//////////////////////////////////////////////////////////////////////////////////////// +VF *ComputePostProbs(int a, int b, string seq1, string seq2) { + //printf("probamod\n"); + double gap_open = -22, gap_ext = -1, beta = 0.2;//T = 5, beta = 1/T = 0.2, by default + int stock_loop = 1; + int le = 160; + double termgapopen = 1.0f; //exp(0) + double termgapextend = 1.0f; //exp(0) + + //initialize the sequence structure + fasta 
sequences[2]; + + sequences[0].length = strlen((char *) seq1.c_str()); + sequences[0].text = (char *) seq1.c_str(); + sequences[0].title = new char[10]; + strcpy(sequences[0].title, "seq0"); + sequences[1].length = strlen((char *) seq2.c_str()); + sequences[1].text = (char *) seq2.c_str(); + sequences[1].title = new char[10]; + strcpy(sequences[1].title, "seq1"); + + if (TRACE) + + { + printf("%d %d %s\n%d %d %s\n--\n", a, sequences[0].length, + sequences[0].text, b, sequences[1].length, sequences[1].text); + printf("after init\n"); + + FILE *dump1 = fopen("dump1", "a"); + fprintf(dump1, "%d %d %s\n%d %d %s\n--\n", a, sequences[0].length, + sequences[0].text, b, sequences[1].length, sequences[1].text); + fclose(dump1); + } + + gap_open = argument.gapopen; + gap_ext = argument.gapext; + beta = argument.beta; + + stock_loop = argument.N; + le = argument.matrix; + + //compute the values of exp(beta * ?) + termgapopen = exp(beta * 0.0); + termgapextend = exp(beta * 0.0); + gap_open = exp(beta * gap_open); + gap_ext = exp(beta * gap_ext); + + if (TRACE) + printf("%f %f %f %d\n", gap_open, gap_ext, beta, le); + + //call for calculating the posterior probabilities + // 1. call partition function partf + // 2. calculate revpartition using revers_parf + // 3. calculate probabilities + /// MODIFICATION... 
POPULATE SAFE VECTOR + + long double **MAT1; + + MAT1 = partf(sequences, termgapopen, termgapextend, gap_open, gap_ext); + + return revers_partf(sequences, termgapopen, termgapextend, MAT1, gap_open, + gap_ext); + +} + +//end of posterior probability module diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAProbs.vcproj b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAProbs.vcproj new file mode 100644 index 0000000..5212610 --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAProbs.vcproj @@ -0,0 +1,272 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAProbs.vcproj.STUDENT.LIUY0039.user b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAProbs.vcproj.STUDENT.LIUY0039.user new file mode 100644 index 0000000..99168da --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAProbs.vcproj.STUDENT.LIUY0039.user @@ -0,0 +1,65 @@ + + + + + + + + + + + diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAReadMatrix.cpp b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAReadMatrix.cpp new file mode 100644 index 0000000..6ff0643 --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAReadMatrix.cpp @@ -0,0 +1,174 @@ +/*********************************************** + * # Copyright 2009-2010. Liu Yongchao + * # Contact: Liu Yongchao, School of Computer Engineering, + * # Nanyang Technological University. + * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com + * # + * # GPL version 3.0 applies. 
+ * # + * ************************************************/ + +#include +#include +#include +#include +#include "MSAReadMatrix.h" + +#define TRACE 0 + +//////////////////////////////////////////////////////////// +// extern variables for scoring matrix data +//////////////////////////////////////////////////////////// +extern float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2; +extern char *aminos, *bases, matrixtype[20]; +extern int subst_index[26]; + +extern double sub_matrix[26][26]; + +extern float TEMPERATURE; +extern int MATRIXTYPE; + +extern float GAPOPEN; +extern float GAPEXT; + +typedef struct { + char input[30]; + int matrix; + int N; + float T; + float beta; + char opt; //can be 'P' or 'M' + float gapopen; + float gapext; +} argument_decl; + +//argument support +extern argument_decl argument; + +///////////////////////////////////////////////////////// +//sets substitution matrix type +//////////////////////////////////////////////////////// +void setmatrixtype(int le) { + switch (le) { + case 160: + strcpy(matrixtype, "gonnet_160"); + break; + case 4: + strcpy(matrixtype, "nuc_simple"); + break; + default: + strcpy(matrixtype, "CUSTOM"); + break; + + }; + +} + +/////////////////////////////////////////////////////////////////// +//sets matrix flag +/////////////////////////////////////////////////////////////////// +inline int matrixtype_to_int() { + + if (!strcmp(matrixtype, "nuc_simple")) + return 4; + else if (!strcmp(matrixtype, "gonnet_160")) + return 160; + else + return 1000; + +} + +///////////////////////////////////////////////////////////////// +// +// Can read any scoring matrix as long as it is defined in Matrix.h +// AND it is a lower triangular +// AND the order of amino acids/bases is mentioned +///////////////////////////////////////////////////////////////// + +inline void read_matrix(score_matrix matrx) { + int i, j, basecount, position = 0; + + bases = (char *) matrx.monomers; + + basecount = strlen(bases); + + for (i = 0; i < 
basecount; i++) + subst_index[i] = -1; + + for (i = 0; i < basecount; i++) + subst_index[bases[i] - 'A'] = i; + + if (TRACE == 1) + printf("\nbases read: %d\n", basecount); + + for (i = 0; i < basecount; i++) + for (j = 0; j <= i; j++) { + + double value = exp(argument.beta * matrx.matrix[position++]); + sub_matrix[i][j] = value; + sub_matrix[j][i] = value; + } + + if (TRACE) + for (i = 0; i < basecount; i++) { + for (j = 0; j < basecount; j++) + printf(" %g ", sub_matrix[i][j]); + printf("\n"); + } + +} + +////////////////////////////////////////////////////////////////////////////////// +//intialize the arguments (default values) +////////////////////////////////////////////////////////////////////////////////// +void init_arguments() { + float gap_open = 0, gap_ext = 0; + int le; + + le = matrixtype_to_int(); + + argument.N = 1; + strcpy(argument.input, "tempin"); + argument.matrix = le; + argument.gapopen = GAPOPEN; + argument.gapext = GAPEXT; + argument.T = TEMPERATURE; + argument.beta = 1.0 / TEMPERATURE; + argument.opt = 'P'; + + if (le == 4) //NUC OPTION :default is nuc_simple + { + read_matrix(nuc_simple); + gap_open = -4; + gap_ext = -0.25; + } + + else if (le == 160) //PROT option: default is gonnet_160 + { + if (TRACE) + printf("read matrix\n"); + read_matrix(gonnet_160); + gap_open = -22; + gap_ext = -1; + } else if (le == 1000) { //Error handling + printf("Error: enter a valid matrix type\n"); + exit(1); + //additional matrices can only be lower triangular + } + + //now override the gapopen and gapext + if (argument.gapopen != 0.0 || argument.gapext != 0.00) + + { + gap_open = -argument.gapopen; + gap_ext = -argument.gapext; + } + + if (TRACE) + printf("%f %f %f %d\n", argument.T, gap_open, gap_ext, le); + + argument.gapopen = gap_open; + argument.gapext = gap_ext; + argument.opt = 'P'; + +} diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAReadMatrix.h b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAReadMatrix.h new file mode 100644 index 
0000000..8d15ce5 --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/MSAProbs/MSAReadMatrix.h @@ -0,0 +1,87 @@ +///////////////////////////////////////////////////////////////// +// Matrix.h +// +// Specifies scoring matrices and their structure +// +// +// +///////////////////////////////////////////////////////////////// + +#ifndef _MSA_READ_MATRIX_H +#define _MSA_READ_MATRIX_H + +typedef struct { + char monomers[26]; /* amino or nucleic acid order */ + float matrix[676]; /* entries of the score matix, 26*26=676 */ +} score_matrix; + +//default protein sequence scoring matrix as well as default scoring matrix of the PROBALIGN +//also used when -prot option is used + +score_matrix gonnet_160 = { "ABCDEFGHIKLMNPQRSTVWXYZ", + +{ 4.6, 0.0, 0.0, 0.3, 0.0, 13.5, -1.1, 0.0, -5.3, 7.0, -0.4, 0.0, -5.2, 3.4, + 5.9, -3.8, 0.0, -1.8, -7.0, -6.2, 9.1, 0.2, 0.0, -3.4, -0.7, -2.1, -7.6, + 8.2, -1.8, 0.0, -2.3, -0.1, -0.1, -0.7, -2.7, 9.3, -1.8, 0.0, -2.5, + -6.2, -4.3, 0.3, -7.0, -3.7, 5.9, -1.2, 0.0, -4.8, -0.1, 1.3, -5.3, + -2.4, 0.2, -3.5, 5.5, -2.2, 0.0, -2.9, -6.5, -4.5, 1.9, -6.7, -3.2, 3.0, + -3.4, 5.7, -1.2, 0.0, -1.9, -5.0, -3.1, 1.4, -5.2, -2.1, 2.9, -2.1, 3.4, + 7.6, -1.2, 0.0, -3.1, 2.6, 0.5, -4.7, -0.2, 1.5, -4.4, 0.8, -4.8, -3.6, + 6.5, -0.1, 0.0, -5.2, -1.9, -1.4, -5.8, -3.0, -2.2, -4.3, -1.6, -3.5, + -4.2, -2.2, 9.6, -0.7, 0.0, -4.2, 0.6, 2.3, -4.1, -2.1, 1.7, -3.2, 2.0, + -2.4, -1.2, 0.5, -0.8, 5.6, -1.6, 0.0, -3.5, -1.6, -0.3, -5.3, -2.1, + 0.3, -4.1, 3.5, -3.5, -2.9, -0.4, -2.1, 1.7, 7.1, 1.6, 0.0, -0.2, 0.0, + -0.3, -4.5, -0.1, -0.8, -3.3, -0.4, -3.6, -2.3, 1.1, 0.0, -0.2, -0.9, + 4.4, 0.5, 0.0, -1.4, -0.6, -0.8, -3.6, -2.4, -0.8, -1.2, -0.2, -2.4, + -1.1, 0.3, -0.4, -0.4, -0.9, 2.3, 5.0, 0.1, 0.0, -0.6, -4.9, -3.0, -0.8, + -5.2, -3.5, 4.0, -3.0, 1.7, 1.4, -3.8, -3.2, -2.7, -3.4, -2.0, 0.0, 5.3, + -5.5, 0.0, -2.1, -7.8, -6.4, 3.2, -5.5, -1.9, -3.4, -5.4, -2.0, -2.2, + -5.5, -7.4, -4.0, -2.4, -4.7, -5.4, -4.5, 15.8, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, -3.7, 0.0, -1.3, -4.2, -4.4, 5.6, -6.0, 2.7, -2.0, -3.5, -1.1, + -1.3, -2.2, -4.8, -2.9, -2.9, -2.8, -3.2, -2.4, 3.8, 0.0, 10.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 } + +}; + +//default nucleotide sequence scoring matrix +//used when -nuc option is used +score_matrix nuc_simple = { + +"ABCDGHKMNRSTUVWXY", + +{ 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0 } + +//Ribosum85-60 + /* + { + 2.22, + 0, 0, + -1.86, 0, 1.16, + 0, 0, 0, 0, + -1.46, 0, -2.48, 0, 1.03, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -1.39, 0, -1.05, 0, -1.74, 0, 0, 0, 0, 0, 0, 1.65, + -1.39, 0, -1.05, 0, -1.74, 0, 0, 0, 0, 0, 0, 0, 1.65, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + } + */ + + }; + +#endif diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/Makefile b/binaries/src/MSAProbs-0.9.7/MSAProbs/Makefile new file mode 100644 index 0000000..8dc8450 --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/MSAProbs/Makefile @@ -0,0 +1,16 @@ + +CXXOBJS = MSA.o MSAGuideTree.o MSAClusterTree.o MSAPartProbs.o MSAReadMatrix.o main.o + +OPENMP = -fopenmp +CXX = g++ +COMMON_FLAGS = -O3 $(OPENMP) -Wall -funroll-loops -I . 
-I /usr/include +CXXFLAGS = $(COMMON_FLAGS) + +EXEC = msaprobs + +all: $(CXXOBJS) + $(CXX) $(CXXFLAGS) -o $(EXEC) $(CXXOBJS) $(NVCCOBJS) $(NVCCLIBS) + strip $(EXEC) +clean: + rm -rf *.o $(EXEC) + diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/MultiSequence.h b/binaries/src/MSAProbs-0.9.7/MSAProbs/MultiSequence.h new file mode 100644 index 0000000..b31af08 --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/MSAProbs/MultiSequence.h @@ -0,0 +1,733 @@ +//////////////////////////////////////////////////////////////// +// MultiSequence.h +// +// Utilities for reading/writing multiple sequence data. +///////////////////////////////////////////////////////////////// + +#ifndef MULTISEQUENCE_H +#define MULTISEQUENCE_H + +#include +#include +#include +#include +#include +#include +#include +#include "SafeVector.h" +#include "Sequence.h" +#include "FileBuffer.h" + +#define VERSION "0.9.7" +///////////////////////////////////////////////////////////////// +// MultiSequence +// +// Class for multiple sequence alignment input/output. +///////////////////////////////////////////////////////////////// + +class MultiSequence { + + SafeVector *sequences; + +public: + + ///////////////////////////////////////////////////////////////// + // MultiSequence::MultiSequence() + // + // Default constructor. + ///////////////////////////////////////////////////////////////// + + MultiSequence() : + sequences(NULL) { + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::MultiSequence() + // + // Constructor. Load MFA from a FileBuffer object. + ///////////////////////////////////////////////////////////////// + + MultiSequence(FileBuffer &infile) : + sequences(NULL) { + LoadMFA(infile); + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::MultiSequence() + // + // Constructor. Load MFA from a filename. 
+ ///////////////////////////////////////////////////////////////// + + MultiSequence(const string &filename) : + sequences(NULL) { + LoadMFA(filename); + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::~MultiSequence() + // + // Destructor. Gets rid of sequence objects contained in the + // multiple alignment. + ///////////////////////////////////////////////////////////////// + + ~MultiSequence() { + + // if sequences allocated + if (sequences) { + + // free all sequences + for (SafeVector::iterator iter = sequences->begin(); + iter != sequences->end(); ++iter) { + assert(*iter); + delete *iter; + *iter = NULL; + } + + // free sequence vector + delete sequences; + sequences = NULL; + } + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::LoadMFA() + // + // Load MFA from a filename. + ///////////////////////////////////////////////////////////////// + + void LoadMFA(const string &filename, bool stripGaps = false) { + + // try opening file + FileBuffer infile(filename.c_str()); + + if (infile.fail()) { + cerr << "ERROR: Could not open file '" << filename + << "' for reading." << endl; + exit(1); + } + + // if successful, then load using other LoadMFA() routine + LoadMFA(infile, stripGaps); + + infile.close(); + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::LoadMFA() + // + // Load MSF from a FileBuffer object. 
+ ///////////////////////////////////////////////////////////////// + + void ParseMSF(FileBuffer &infile, string header, bool stripGaps = false) { + + SafeVector *> seqData; + SafeVector seqNames; + SafeVector seqLengths; + + istringstream in; + bool valid = true; + bool missingHeader = false; + bool clustalW = false; + + // read until data starts + while (!infile.eof() && header.find("..", 0) == string::npos) { + if (header.find("CLUSTAL", 0) == 0 + || header.find("MSAPROBS", 0) == 0) { + clustalW = true; + break; + } + infile.GetLine(header); + if (header.find("//", 0) != string::npos) { + missingHeader = true; + break; + } + } + + // read until end-of-file + while (valid) { + infile.GetLine(header); + if (infile.eof()) + break; + + string word; + in.clear(); + in.str(header); + + // check if there's anything on this line + if (in >> word) { + + // clustalw name parsing + if (clustalW) { + if (!isspace(header[0]) + && find(seqNames.begin(), seqNames.end(), word) + == seqNames.end()) { + seqNames.push_back(word); + seqData.push_back(new SafeVector()); + seqLengths.push_back(0); + seqData[(int) seqData.size() - 1]->push_back('@'); + } + } + + // look for new sequence label + if (word == string("Name:")) { + if (in >> word) { + seqNames.push_back(word); + seqData.push_back(new SafeVector()); + seqLengths.push_back(0); + seqData[(int) seqData.size() - 1]->push_back('@'); + } else + valid = false; + } + + // check if this is sequence data + else if (find(seqNames.begin(), seqNames.end(), word) + != seqNames.end()) { + int index = find(seqNames.begin(), seqNames.end(), word) + - seqNames.begin(); + + // read all remaining characters on the line + char ch; + while (in >> ch) { + if (isspace(ch)) + continue; + if (ch >= 'a' && ch <= 'z') + ch = ch - 'a' + 'A'; + if (ch == '.') + ch = '-'; + if (stripGaps && ch == '-') + continue; + if (!((ch >= 'A' && ch <= 'Z') || ch == '*' || ch == '-')) { + cerr << "ERROR: Unknown character encountered: " + << ch << endl; + exit(1); + 
} + + // everything's ok so far, so just store this character. + seqData[index]->push_back(ch); + seqLengths[index]++; + } + } else if (missingHeader) { + seqNames.push_back(word); + seqData.push_back(new SafeVector()); + seqLengths.push_back(0); + seqData[(int) seqData.size() - 1]->push_back('@'); + + int index = (int) seqNames.size() - 1; + + // read all remaining characters on the line + char ch; + while (in >> ch) { + if (isspace(ch)) + continue; + if (ch >= 'a' && ch <= 'z') + ch = ch - 'a' + 'A'; + if (ch == '.') + ch = '-'; + if (stripGaps && ch == '-') + continue; + if (!((ch >= 'A' && ch <= 'Z') || ch == '*' || ch == '-')) { + cerr << "ERROR: Unknown character encountered: " + << ch << endl; + exit(1); + } + + // everything's ok so far, so just store this character. + seqData[index]->push_back(ch); + seqLengths[index]++; + } + } + } + } + + // check for errors + if (seqNames.size() == 0) { + cerr << "ERROR: No sequences read!" << endl; + exit(1); + } + + assert(!sequences); + sequences = new SafeVector; + for (int i = 0; i < (int) seqNames.size(); i++) { + if (seqLengths[i] == 0) { + cerr << "ERROR: Sequence of zero length!" << endl; + exit(1); + } + Sequence *seq = new Sequence(seqData[i], seqNames[i], seqLengths[i], + i, i); + sequences->push_back(seq); + } + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::LoadMFA() + // + // Load MFA from a FileBuffer object. + ///////////////////////////////////////////////////////////////// + + void LoadMFA(FileBuffer &infile, bool stripGaps = false) { + + // check to make sure that file reading is ok + if (infile.fail()) { + cerr << "ERROR: Error reading file." << endl; + exit(1); + } + + // read all sequences + while (true) { + + // get the sequence label as being the current # of sequences + // NOTE: sequence labels here are zero-based + int index = (!sequences) ? 
0 : sequences->size(); + + // read the sequence + Sequence *seq = new Sequence(infile, stripGaps); + if (seq->Fail()) { + + // check if alternative file format (i.e. not MFA) + if (index == 0) { + string header = seq->GetHeader(); + if (header.length() > 0 && header[0] != '>') { + + // try MSF format + ParseMSF(infile, header); + break; + } + } + + delete seq; + break; + } + seq->SetLabel(index); + + // add the sequence to the list of current sequences + if (!sequences) + sequences = new SafeVector; + sequences->push_back(seq); + } + + // make sure at least one sequence was read + if (!sequences) { + cerr << "ERROR: No sequences read." << endl; + exit(1); + } + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::AddSequence() + // + // Add another sequence to an existing sequence list + ///////////////////////////////////////////////////////////////// + + void AddSequence(Sequence *sequence) { + assert(sequence); + assert(!sequence->Fail()); + + // add sequence + if (!sequences) + sequences = new SafeVector; + sequences->push_back(sequence); + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::RemoveSequence() + // + // Remove a sequence from the MultiSequence + ///////////////////////////////////////////////////////////////// + + void RemoveSequence(int index) { + assert(sequences); + + assert(index >= 0 && index < (int) sequences->size()); + delete (*sequences)[index]; + + sequences->erase(sequences->begin() + index); + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::WriteMFA() + // + // Write MFA to the outfile. Allows the user to specify the + // number of columns for the output. Also, useIndices determines + // whether or not the actual sequence comments will be printed + // out or whether the artificially assigned sequence labels will + // be used instead. 
+ ///////////////////////////////////////////////////////////////// + + void WriteMFA(ostream &outfile, int numColumns = 60, + bool useIndices = false) { + if (!sequences) + return; + + // loop through all sequences and write them out + for (SafeVector::iterator iter = sequences->begin(); + iter != sequences->end(); ++iter) { + (*iter)->WriteMFA(outfile, numColumns, useIndices); + } + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::GetAnnotationChar() + // + // Return CLUSTALW annotation for column. + ///////////////////////////////////////////////////////////////// + + char GetAnnotationChar(SafeVector &column) { + SafeVector counts(256, 0); + int allChars = (int) column.size(); + + for (int i = 0; i < allChars; i++) { + counts[(unsigned char) toupper(column[i])]++; + } + + allChars -= counts[(unsigned char) '-']; + if (allChars == 1) + return ' '; + + for (int i = 0; i < 256; i++) + if ((char) i != '-' && counts[i] == allChars) + return '*'; + + if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'T'] + + counts[(unsigned char) 'A'] == allChars) + return ':'; + + if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'E'] + + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'K'] + == allChars) + return ':'; + + if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'H'] + + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'K'] + == allChars) + return ':'; + + if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'D'] + + counts[(unsigned char) 'E'] + counts[(unsigned char) 'Q'] + == allChars) + return ':'; + + if (counts[(unsigned char) 'Q'] + counts[(unsigned char) 'H'] + + counts[(unsigned char) 'R'] + counts[(unsigned char) 'K'] + == allChars) + return ':'; + + if (counts[(unsigned char) 'M'] + counts[(unsigned char) 'I'] + + counts[(unsigned char) 'L'] + counts[(unsigned char) 'V'] + == allChars) + return ':'; + + if (counts[(unsigned char) 'M'] + counts[(unsigned char) 'I'] + + counts[(unsigned 
char) 'L'] + counts[(unsigned char) 'F'] + == allChars) + return ':'; + + if (counts[(unsigned char) 'H'] + counts[(unsigned char) 'Y'] + == allChars) + return ':'; + + if (counts[(unsigned char) 'F'] + counts[(unsigned char) 'Y'] + + counts[(unsigned char) 'W'] == allChars) + return ':'; + + if (counts[(unsigned char) 'C'] + counts[(unsigned char) 'S'] + + counts[(unsigned char) 'A'] == allChars) + return '.'; + + if (counts[(unsigned char) 'A'] + counts[(unsigned char) 'T'] + + counts[(unsigned char) 'V'] == allChars) + return '.'; + + if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'A'] + + counts[(unsigned char) 'G'] == allChars) + return '.'; + + if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'T'] + + counts[(unsigned char) 'N'] + counts[(unsigned char) 'K'] + == allChars) + return '.'; + + if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'T'] + + counts[(unsigned char) 'P'] + counts[(unsigned char) 'A'] + == allChars) + return '.'; + + if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'G'] + + counts[(unsigned char) 'N'] + counts[(unsigned char) 'D'] + == allChars) + return '.'; + + if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'N'] + + counts[(unsigned char) 'D'] + counts[(unsigned char) 'E'] + + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'K'] + == allChars) + return '.'; + + if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'D'] + + counts[(unsigned char) 'E'] + counts[(unsigned char) 'Q'] + + counts[(unsigned char) 'H'] + counts[(unsigned char) 'K'] + == allChars) + return '.'; + + if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'E'] + + counts[(unsigned char) 'H'] + counts[(unsigned char) 'Q'] + + counts[(unsigned char) 'R'] + counts[(unsigned char) 'K'] + == allChars) + return '.'; + + if (counts[(unsigned char) 'F'] + counts[(unsigned char) 'V'] + + counts[(unsigned char) 'L'] + counts[(unsigned char) 'I'] + + counts[(unsigned char) 'M'] == allChars) + return '.'; + + if 
(counts[(unsigned char) 'H'] + counts[(unsigned char) 'F'] + + counts[(unsigned char) 'Y'] == allChars) + return '.'; + + return ' '; + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::WriteALN() + // + // Write ALN to the outfile. Allows the user to specify the + // number of columns for the output. + ///////////////////////////////////////////////////////////////// + + void WriteALN(ostream &outfile, int numColumns = 60) { + if (!sequences) + return; + + outfile << "MSAPROBS version " << VERSION + << " multiple sequence alignment" << endl; + + int longestComment = 0; + SafeVector::iterator> ptrs(GetNumSequences()); + SafeVector lengths(GetNumSequences()); + for (int i = 0; i < GetNumSequences(); i++) { + ptrs[i] = GetSequence(i)->GetDataPtr(); + lengths[i] = GetSequence(i)->GetLength(); + longestComment = max(longestComment, + (int) GetSequence(i)->GetName().length()); + } + longestComment += 4; + + int writtenChars = 0; + bool allDone = false; + + while (!allDone) { + outfile << endl; + allDone = true; + + // loop through all sequences and write them out + for (int i = 0; i < GetNumSequences(); i++) { + + if (writtenChars < lengths[i]) { + outfile << GetSequence(i)->GetName(); + for (int j = 0; + j + < longestComment + - (int) GetSequence(i)->GetName().length(); + j++) + outfile << ' '; + + for (int j = 0; j < numColumns; j++) { + if (writtenChars + j < lengths[i]) + outfile << ptrs[i][writtenChars + j + 1]; + else + break; + } + + outfile << endl; + + if (writtenChars + numColumns < lengths[i]) + allDone = false; + } + } + + // write annotation line + for (int j = 0; j < longestComment; j++) + outfile << ' '; + + for (int j = 0; j < numColumns; j++) { + SafeVector column; + + for (int i = 0; i < GetNumSequences(); i++) + if (writtenChars + j < lengths[i]) + column.push_back(ptrs[i][writtenChars + j + 1]); + + if (column.size() > 0) + outfile << GetAnnotationChar(column); + } + + outfile << endl; + writtenChars += 
numColumns; + } + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::GetSequence() + // + // Retrieve a sequence from the MultiSequence object. + ///////////////////////////////////////////////////////////////// + + Sequence* GetSequence(int i) { + assert(sequences); + assert(0 <= i && i < (int) sequences->size()); + + return (*sequences)[i]; + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::GetSequence() + // + // Retrieve a sequence from the MultiSequence object + // (const version). + ///////////////////////////////////////////////////////////////// + + const Sequence* GetSequence(int i) const { + assert(sequences); + assert(0 <= i && i < (int) sequences->size()); + + return (*sequences)[i]; + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::GetNumSequences() + // + // Returns the number of sequences in the MultiSequence. + ///////////////////////////////////////////////////////////////// + + int GetNumSequences() const { + if (!sequences) + return 0; + return (int) sequences->size(); + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::SortByHeader() + // + // Organizes the sequences according to their sequence headers + // in ascending order. + ///////////////////////////////////////////////////////////////// + + void SortByHeader() { + assert(sequences); + + // a quick and easy O(n^2) sort + for (int i = 0; i < (int) sequences->size() - 1; i++) { + for (int j = i + 1; j < (int) sequences->size(); j++) { + if ((*sequences)[i]->GetHeader() > (*sequences)[j]->GetHeader()) + swap((*sequences)[i], (*sequences)[j]); + } + } + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::SortByLabel() + // + // Organizes the sequences according to their sequence labels + // in ascending order. 
+ ///////////////////////////////////////////////////////////////// + + void SortByLabel() { + assert(sequences); + + // a quick and easy O(n^2) sort + for (int i = 0; i < (int) sequences->size() - 1; i++) { + for (int j = i + 1; j < (int) sequences->size(); j++) { + if ((*sequences)[i]->GetSortLabel() + > (*sequences)[j]->GetSortLabel()) + swap((*sequences)[i], (*sequences)[j]); + } + } + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::SaveOrdering() + // + // Relabels sequences so as to preserve the current ordering. + ///////////////////////////////////////////////////////////////// + + void SaveOrdering() { + assert(sequences); + + for (int i = 0; i < (int) sequences->size(); i++) + (*sequences)[i]->SetSortLabel(i); + } + + ///////////////////////////////////////////////////////////////// + // MultiSequence::Project() + // + // Given a set of indices, extract all sequences from the current + // MultiSequence object whose index is included in the set. + // Then, project the multiple alignments down to the desired + // subset, and return the projection as a new MultiSequence + // object. 
+ ///////////////////////////////////////////////////////////////// + + MultiSequence *Project(const set &indices) { + SafeVector::iterator> oldPtrs(indices.size()); + SafeVector *> newPtrs(indices.size()); + + assert(indices.size() != 0); + + // grab old data + int i = 0; + for (set::const_iterator iter = indices.begin(); + iter != indices.end(); ++iter) { + oldPtrs[i++] = GetSequence(*iter)->GetDataPtr(); + } + + // compute new length + int oldLength = GetSequence(*indices.begin())->GetLength(); + int newLength = 0; + for (i = 1; i <= oldLength; i++) { + + // check to see if there is a gap in every sequence of the set + bool found = false; + for (int j = 0; !found && j < (int) indices.size(); j++) + found = (oldPtrs[j][i] != '-'); + + // if not, then this column counts towards the sequence length + if (found) + newLength++; + } + + // build new alignments + for (i = 0; i < (int) indices.size(); i++) { + newPtrs[i] = new SafeVector(); + assert(newPtrs[i]); + newPtrs[i]->push_back('@'); + } + + // add all needed columns + for (i = 1; i <= oldLength; i++) { + + // make sure column is not gapped in all sequences in the set + bool found = false; + for (int j = 0; !found && j < (int) indices.size(); j++) + found = (oldPtrs[j][i] != '-'); + + // if not, then add it + if (found) { + for (int j = 0; j < (int) indices.size(); j++) + newPtrs[j]->push_back(oldPtrs[j][i]); + } + } + + // wrap sequences in MultiSequence object + MultiSequence *ret = new MultiSequence(); + i = 0; + for (set::const_iterator iter = indices.begin(); + iter != indices.end(); ++iter) { + ret->AddSequence( + new Sequence(newPtrs[i++], GetSequence(*iter)->GetHeader(), + newLength, GetSequence(*iter)->GetSortLabel(), + GetSequence(*iter)->GetLabel())); + } + + return ret; + } +}; + +#endif diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/ProbabilisticModel.h b/binaries/src/MSAProbs-0.9.7/MSAProbs/ProbabilisticModel.h new file mode 100644 index 0000000..dd5dbd3 --- /dev/null +++ 
b/binaries/src/MSAProbs-0.9.7/MSAProbs/ProbabilisticModel.h @@ -0,0 +1,1365 @@ +///////////////////////////////////////////////////////////////// +// ProbabilisticModel.h +// +// Routines for (1) posterior probability computations +// (2) chained anchoring +// (3) maximum weight trace alignment +///////////////////////////////////////////////////////////////// + +#ifndef PROBABILISTICMODEL_H +#define PROBABILISTICMODEL_H + +#include +#include +#include +#include "SafeVector.h" +#include "ScoreType.h" +#include "SparseMatrix.h" +#include "MultiSequence.h" + +using namespace std; + +const int NumMatchStates = 1; // note that in this version the number +// of match states is fixed at 1...will +// change in future versions +const int NumInsertStates = 2; +const int NumMatrixTypes = NumMatchStates + NumInsertStates * 2; + +///////////////////////////////////////////////////////////////// +// ProbabilisticModel +// +// Class for storing the parameters of a probabilistic model and +// performing different computations based on those parameters. +// In particular, this class handles the computation of +// posterior probabilities that may be used in alignment. +///////////////////////////////////////////////////////////////// + +class ProbabilisticModel { + + float initialDistribution[NumMatrixTypes]; // holds the initial probabilities for each state + float transProb[NumMatrixTypes][NumMatrixTypes]; // holds all state-to-state transition probabilities + float matchProb[256][256]; // emission probabilities for match states + float insProb[256][NumMatrixTypes]; // emission probabilities for insert states + +public: + + ///////////////////////////////////////////////////////////////// + // ProbabilisticModel::ProbabilisticModel() + // + // Constructor. Builds a new probabilistic model using the + // given parameters. 
+    /////////////////////////////////////////////////////////////////
+
+    ProbabilisticModel(const VF &initDistribMat, const VF &gapOpen,
+            const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle) {
+
+        // Assemble the transition matrix in probability space first.
+        // State 0 is the match state; insert-state pair p occupies
+        // states 2p+1 and 2p+2.
+        VVF transMat(NumMatrixTypes, VF(NumMatrixTypes, 0.0f));
+        transMat[0][0] = 1;
+        for (int p = 0; p < NumInsertStates; p++) {
+            const int first = 2 * p + 1;
+            const int second = 2 * p + 2;
+
+            // match -> insert; the same mass is removed from the
+            // match self-transition, which must stay positive
+            transMat[0][first] = gapOpen[2 * p];
+            transMat[0][second] = gapOpen[2 * p + 1];
+            transMat[0][0] -= (gapOpen[2 * p] + gapOpen[2 * p + 1]);
+            assert(transMat[0][0] > 0);
+
+            // insert -> same insert; the two states of a pair never
+            // transition into each other
+            transMat[first][first] = gapExtend[2 * p];
+            transMat[second][second] = gapExtend[2 * p + 1];
+            transMat[first][second] = 0;
+            transMat[second][first] = 0;
+
+            // insert -> match takes whatever the extension leaves
+            transMat[first][0] = 1 - gapExtend[2 * p];
+            transMat[second][0] = 1 - gapExtend[2 * p + 1];
+        }
+
+        // store the initial and transition probabilities as logs
+        for (int from = 0; from < NumMatrixTypes; from++) {
+            initialDistribution[from] = LOG(initDistribMat[from]);
+            for (int to = 0; to < NumMatrixTypes; to++)
+                transProb[from][to] = LOG(transMat[from][to]);
+        }
+
+        // store the emission tables as logs; every column of insProb
+        // receives the same single-character emission value, and both
+        // emitSingle and emitPairs are read for all 256 byte values
+        for (int a = 0; a < 256; a++) {
+            for (int state = 0; state < NumMatrixTypes; state++)
+                insProb[a][state] = LOG(emitSingle[a]);
+            for (int b = 0; b < 256; b++)
+                matchProb[a][b] = LOG(emitPairs[a][b]);
+        }
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // ProbabilisticModel::ComputeForwardMatrix()
+    //
+    // Computes a set of forward probability matrices for aligning
+    // seq1 and seq2.
+    //
+    // For efficiency reasons, a single-dimensional floating-point
+    // array is used here, with the following indexing scheme:
+    //
+    //    forward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
+    //    refers to the probability of aligning through j characters
+    //    of the first sequence, k characters of the second sequence,
+    //    and ending in state i.
+    /////////////////////////////////////////////////////////////////
+
+    // The matrix is allocated with new and returned; this function
+    // never frees it.  Cell (1,1) is written and iter1[1] / iter2[1]
+    // are read unconditionally, so both sequences need at least one
+    // character.
+    VF *ComputeForwardMatrix(Sequence *seq1, Sequence *seq2) const {
+
+        assert(seq1);
+        assert(seq2);
+
+        const int seq1Length = seq1->GetLength();
+        const int seq2Length = seq2->GetLength();
+
+        // retrieve the pointers to the beginning of each sequence
+        // (characters are read at indices 1..length)
+        SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
+        SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
+
+        // create matrix
+        VF *forwardPtr = new VF(
+                NumMatrixTypes * (seq1Length + 1) * (seq2Length + 1), LOG_ZERO);
+        assert(forwardPtr);
+        VF &forward = *forwardPtr;
+
+        // initialization condition: the three cells an alignment can
+        // start in -- (1,1) by a match, (1,0) by an insertion that
+        // consumes seq1, (0,1) by an insertion that consumes seq2
+        forward[0 + NumMatrixTypes * (1 * (seq2Length + 1) + 1)] =
+                initialDistribution[0]
+                        + matchProb[(unsigned char) iter1[1]][(unsigned char) iter2[1]];
+
+        for (int k = 0; k < NumInsertStates; k++) {
+            forward[2 * k + 1 + NumMatrixTypes * (1 * (seq2Length + 1) + 0)] =
+                    initialDistribution[2 * k + 1]
+                            + insProb[(unsigned char) iter1[1]][k];
+            forward[2 * k + 2 + NumMatrixTypes * (0 * (seq2Length + 1) + 1)] =
+                    initialDistribution[2 * k + 2]
+                            + insProb[(unsigned char) iter2[1]][k];
+        }
+
+        // remember offset for each index combination: ij is cell
+        // (i,j), i1j is (i-1,j), ij1 is (i,j-1), i1j1 is (i-1,j-1).
+        // The predecessor offsets start negative and are only used
+        // once the corresponding index is > 0.
+        int ij = 0;
+        int i1j = -seq2Length - 1;
+        int ij1 = -1;
+        int i1j1 = -seq2Length - 2;
+
+        ij *= NumMatrixTypes;
+        i1j *= NumMatrixTypes;
+        ij1 *= NumMatrixTypes;
+        i1j1 *= NumMatrixTypes;
+
+        // compute forward scores
+        for (int i = 0; i <= seq1Length; i++) {
+            unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
+            for (int j = 0; j <= seq2Length; j++) {
+                unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
+
+                // cells (0,0), (0,1), (1,0) and (1,1) keep the values
+                // set by the initialization above
+                if (i > 1 || j > 1) {
+                    if (i > 0 && j > 0) {
+                        // match: sum over all predecessor states in
+                        // the diagonal cell, then emit the pair
+                        forward[0 + ij] = forward[0 + i1j1] + transProb[0][0];
+                        for (int k = 1; k < NumMatrixTypes; k++)
+                            LOG_PLUS_EQUALS(forward[0 + ij],
+                                    forward[k + i1j1] + transProb[k][0]);
+                        forward[0 + ij] += matchProb[c1][c2];
+                    }
+                    if (i > 0) {
+                        // insertion consuming seq1[i]: reached from
+                        // match or from the same insert state
+                        for (int k = 0; k < NumInsertStates; k++)
+                            forward[2 * k + 1 + ij] = insProb[c1][k]
+                                    + LOG_ADD(
+                                            forward[0 + i1j]
+                                                    + transProb[0][2 * k + 1],
+                                            forward[2 * k + 1 + i1j]
+                                                    + transProb[2 * k + 1][2 * k
+                                                            + 1]);
+                    }
+                    if (j > 0) {
+                        // insertion consuming seq2[j]
+                        for (int k = 0; k < NumInsertStates; k++)
+                            forward[2 * k + 2 + ij] = insProb[c2][k]
+                                    + LOG_ADD(
+                                            forward[0 + ij1]
+                                                    + transProb[0][2 * k + 2],
+                                            forward[2 * k + 2 + ij1]
+                                                    + transProb[2 * k + 2][2 * k
+                                                            + 2]);
+                    }
+                }
+
+                ij += NumMatrixTypes;
+                i1j += NumMatrixTypes;
+                ij1 += NumMatrixTypes;
+                i1j1 += NumMatrixTypes;
+            }
+        }
+
+        return forwardPtr;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // ProbabilisticModel::ComputeBackwardMatrix()
+    //
+    // Computes a set of backward probability matrices for aligning
+    // seq1 and seq2.
+    //
+    // For efficiency reasons, a single-dimensional floating-point
+    // array is used here, with the following indexing scheme:
+    //
+    //    backward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
+    //    refers to the probability of starting in state i and
+    //    aligning from character j+1 to the end of the first
+    //    sequence and from character k+1 to the end of the second
+    //    sequence.
+    /////////////////////////////////////////////////////////////////
+
+    // The matrix is allocated with new and returned; this function
+    // never frees it.
+    VF *ComputeBackwardMatrix(Sequence *seq1, Sequence *seq2) const {
+
+        assert(seq1);
+        assert(seq2);
+
+        const int seq1Length = seq1->GetLength();
+        const int seq2Length = seq2->GetLength();
+        SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
+        SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
+
+        // create matrix
+        VF *backwardPtr = new VF(
+                NumMatrixTypes * (seq1Length + 1) * (seq2Length + 1), LOG_ZERO);
+        assert(backwardPtr);
+        VF &backward = *backwardPtr;
+
+        // initialization condition: the last cell is seeded with the
+        // initial distribution for every state
+        for (int k = 0; k < NumMatrixTypes; k++)
+            backward[NumMatrixTypes * ((seq1Length + 1) * (seq2Length + 1) - 1)
+                    + k] = initialDistribution[k];
+
+        // remember offset for each index combination: ij is cell
+        // (i,j), i1j is (i+1,j), ij1 is (i,j+1), i1j1 is (i+1,j+1).
+        // The successor offsets start past the end of the matrix and
+        // are only used once the corresponding index is below its
+        // sequence length.
+        int ij = (seq1Length + 1) * (seq2Length + 1) - 1;
+        int i1j = ij + seq2Length + 1;
+        int ij1 = ij + 1;
+        int i1j1 = ij + seq2Length + 2;
+
+        ij *= NumMatrixTypes;
+        i1j *= NumMatrixTypes;
+        ij1 *= NumMatrixTypes;
+        i1j1 *= NumMatrixTypes;
+
+        // compute backward scores, walking the matrix from the last
+        // cell to the first
+        for (int i = seq1Length; i >= 0; i--) {
+            unsigned char c1 =
+                    (i == seq1Length) ? '~' : (unsigned char) iter1[i + 1];
+            for (int j = seq2Length; j >= 0; j--) {
+                unsigned char c2 =
+                        (j == seq2Length) ? '~' : (unsigned char) iter2[j + 1];
+
+                if (i < seq1Length && j < seq2Length) {
+                    // every state may move into the match state of
+                    // the diagonal successor
+                    const float ProbXY = backward[0 + i1j1] + matchProb[c1][c2];
+                    for (int k = 0; k < NumMatrixTypes; k++)
+                        LOG_PLUS_EQUALS(backward[k + ij],
+                                ProbXY + transProb[k][0]);
+                }
+                if (i < seq1Length) {
+                    // next step consumes seq1[i+1]: reachable from
+                    // match and from the same insert state
+                    for (int k = 0; k < NumInsertStates; k++) {
+                        LOG_PLUS_EQUALS(backward[0 + ij],
+                                backward[2 * k + 1 + i1j] + insProb[c1][k]
+                                        + transProb[0][2 * k + 1]);
+                        LOG_PLUS_EQUALS(backward[2 * k + 1 + ij],
+                                backward[2 * k + 1 + i1j] + insProb[c1][k]
+                                        + transProb[2 * k + 1][2 * k + 1]);
+                    }
+                }
+                if (j < seq2Length) {
+                    // next step consumes seq2[j+1]
+                    for (int k = 0; k < NumInsertStates; k++) {
+                        LOG_PLUS_EQUALS(backward[0 + ij],
+                                backward[2 * k + 2 + ij1] + insProb[c2][k]
+                                        + transProb[0][2 * k + 2]);
+                        LOG_PLUS_EQUALS(backward[2 * k + 2 + ij],
+                                backward[2 * k + 2 + ij1] + insProb[c2][k]
+                                        + transProb[2 * k + 2][2 * k + 2]);
+                    }
+                }
+
+                ij -= NumMatrixTypes;
+                i1j -= NumMatrixTypes;
+                ij1 -= NumMatrixTypes;
+                i1j1 -= NumMatrixTypes;
+            }
+        }
+
+        return backwardPtr;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // ProbabilisticModel::ComputeTotalProbability()
+    //
+    // Computes the total probability of an alignment given
+    // the forward and backward matrices.
+    /////////////////////////////////////////////////////////////////
+
+    float ComputeTotalProbability(int seq1Length, int seq2Length,
+            const VF &forward, const VF &backward) const {
+
+        // flat offsets of the four cells that are inspected: the last
+        // cell of the matrix and the three possible starting cells
+        const int lastCell = NumMatrixTypes
+                * ((seq1Length + 1) * (seq2Length + 1) - 1);
+        const int cell11 = NumMatrixTypes * (1 * (seq2Length + 1) + 1);
+        const int cell10 = NumMatrixTypes * (1 * (seq2Length + 1) + 0);
+        const int cell01 = NumMatrixTypes * (0 * (seq2Length + 1) + 1);
+
+        // first estimate: sum over all states in the last cell
+        float totalForwardProb = LOG_ZERO;
+        for (int state = 0; state < NumMatrixTypes; state++)
+            LOG_PLUS_EQUALS(totalForwardProb,
+                    forward[state + lastCell] + backward[state + lastCell]);
+
+        // second estimate: match state of cell (1,1) plus, for each
+        // insert pair, state 2p+1 of cell (1,0) and state 2p+2 of
+        // cell (0,1)
+        float totalBackwardProb = forward[0 + cell11] + backward[0 + cell11];
+        for (int p = 0; p < NumInsertStates; p++) {
+            const int seq1Insert = 2 * p + 1;
+            const int seq2Insert = 2 * p + 2;
+            LOG_PLUS_EQUALS(totalBackwardProb,
+                    forward[seq1Insert + cell10] + backward[seq1Insert + cell10]);
+            LOG_PLUS_EQUALS(totalBackwardProb,
+                    forward[seq2Insert + cell01] + backward[seq2Insert + cell01]);
+        }
+
+        //    cerr << totalForwardProb << " " << totalBackwardProb << endl;
+
+        // the result is the mean of the two log-space estimates
+        return (totalForwardProb + totalBackwardProb) / 2;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // ProbabilisticModel::ComputePosteriorMatrix()
+    //
+    // Computes the posterior probability matrix based on
+    // the forward and backward matrices.
+    /////////////////////////////////////////////////////////////////
+
+    VF *ComputePosteriorMatrix(Sequence *seq1, Sequence *seq2,
+            const VF &forward, const VF &backward) const {
+
+        assert(seq1);
+        assert(seq2);
+
+        const int seq1Length = seq1->GetLength();
+        const int seq2Length = seq2->GetLength();
+        const int numCells = (seq1Length + 1) * (seq2Length + 1);
+
+        // normaliser; a value of exactly 0 is replaced by 1.0f
+        float totalProb = ComputeTotalProbability(seq1Length, seq2Length,
+                forward, backward);
+        if (totalProb == 0)
+            totalProb = 1.0f;
+
+        // one entry per (i,j) cell, allocated with new and returned
+        VF *posteriorPtr = new VF(numCells);
+        assert(posteriorPtr);
+        VF &posterior = *posteriorPtr;
+
+        // only the match state (state 0) of each cell contributes;
+        // the log value is capped at LOG_ONE before exponentiation
+        for (int cell = 0; cell < numCells; cell++) {
+            const int matchIndex = cell * NumMatrixTypes;
+            posterior[cell] = EXP(
+                    min(LOG_ONE,
+                            forward[matchIndex] + backward[matchIndex]
+                                    - totalProb));
+        }
+
+        posterior[0] = 0;
+
+        return posteriorPtr;
+    }
+
+    /*
+    /////////////////////////////////////////////////////////////////
+    // ProbabilisticModel::ComputeExpectedCounts()
+    //
+    // Computes the expected counts for the various transitions.
+ ///////////////////////////////////////////////////////////////// + + VVF *ComputeExpectedCounts () const { + + assert (seq1); + assert (seq2); + + const int seq1Length = seq1->GetLength(); + const int seq2Length = seq2->GetLength(); + SafeVector::iterator iter1 = seq1->GetDataPtr(); + SafeVector::iterator iter2 = seq2->GetDataPtr(); + + // compute total probability + float totalProb = ComputeTotalProbability (seq1Length, seq2Length, + forward, backward); + + // initialize expected counts + VVF *countsPtr = new VVF(NumMatrixTypes + 1, VF(NumMatrixTypes, LOG_ZERO)); assert (countsPtr); + VVF &counts = *countsPtr; + + // remember offset for each index combination + int ij = 0; + int i1j = -seq2Length - 1; + int ij1 = -1; + int i1j1 = -seq2Length - 2; + + ij *= NumMatrixTypes; + i1j *= NumMatrixTypes; + ij1 *= NumMatrixTypes; + i1j1 *= NumMatrixTypes; + + // compute expected counts + for (int i = 0; i <= seq1Length; i++){ + unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i]; + for (int j = 0; j <= seq2Length; j++){ + unsigned char c2 = (j == 0) ? 
'~' : (unsigned char) iter2[j]; + + if (i > 0 && j > 0){ + for (int k = 0; k < NumMatrixTypes; k++) + LOG_PLUS_EQUALS (counts[k][0], + forward[k + i1j1] + transProb[k][0] + + matchProb[c1][c2] + backward[0 + ij]); + } + if (i > 0){ + for (int k = 0; k < NumInsertStates; k++){ + LOG_PLUS_EQUALS (counts[0][2*k+1], + forward[0 + i1j] + transProb[0][2*k+1] + + insProb[c1][k] + backward[2*k+1 + ij]); + LOG_PLUS_EQUALS (counts[2*k+1][2*k+1], + forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] + + insProb[c1][k] + backward[2*k+1 + ij]); + } + } + if (j > 0){ + for (int k = 0; k < NumInsertStates; k++){ + LOG_PLUS_EQUALS (counts[0][2*k+2], + forward[0 + ij1] + transProb[0][2*k+2] + + insProb[c2][k] + backward[2*k+2 + ij]); + LOG_PLUS_EQUALS (counts[2*k+2][2*k+2], + forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] + + insProb[c2][k] + backward[2*k+2 + ij]); + } + } + + ij += NumMatrixTypes; + i1j += NumMatrixTypes; + ij1 += NumMatrixTypes; + i1j1 += NumMatrixTypes; + } + } + + // scale all expected counts appropriately + for (int i = 0; i < NumMatrixTypes; i++) + for (int j = 0; j < NumMatrixTypes; j++) + counts[i][j] -= totalProb; + + } + */ + + ///////////////////////////////////////////////////////////////// + // ProbabilisticModel::ComputeNewParameters() + // + // Computes a new parameter set based on the expected counts + // given. 
+    /////////////////////////////////////////////////////////////////
+
+    // Re-estimates the model parameters from one sequence pair.
+    //
+    // forward / backward: matrices laid out as in
+    //     ComputeForwardMatrix() / ComputeBackwardMatrix().
+    // initDistribMat, gapOpen, gapExtend: overwritten with the new
+    //     estimates; the two states of every insert pair receive the
+    //     same (averaged) value.
+    // emitPairs, emitSingle: overwritten only when
+    //     enableTrainEmissions is true; they are indexed by byte
+    //     value (0..255) and upper- and lower-case entries are tied.
+    //
+    // Characters are passed to the <cctype> functions as unsigned
+    // char values; passing a negative char is undefined behaviour.
+    void ComputeNewParameters(Sequence *seq1, Sequence *seq2, const VF &forward,
+            const VF &backward, VF &initDistribMat, VF &gapOpen, VF &gapExtend,
+            VVF &emitPairs, VF &emitSingle, bool enableTrainEmissions) const {
+
+        assert(seq1);
+        assert(seq2);
+
+        const int seq1Length = seq1->GetLength();
+        const int seq2Length = seq2->GetLength();
+        SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
+        SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
+
+        // compute total probability
+        float totalProb = ComputeTotalProbability(seq1Length, seq2Length,
+                forward, backward);
+
+        // initialize expected counts (all accumulated in log space)
+        VVF transCounts(NumMatrixTypes, VF(NumMatrixTypes, LOG_ZERO));
+        VF initCounts(NumMatrixTypes, LOG_ZERO);
+        VVF pairCounts(256, VF(256, LOG_ZERO));
+        VF singleCounts(256, LOG_ZERO);
+
+        // remember offset for each index combination: ij is cell
+        // (i,j), i1j is (i-1,j), ij1 is (i,j-1), i1j1 is (i-1,j-1)
+        int ij = 0;
+        int i1j = -seq2Length - 1;
+        int ij1 = -1;
+        int i1j1 = -seq2Length - 2;
+
+        ij *= NumMatrixTypes;
+        i1j *= NumMatrixTypes;
+        ij1 *= NumMatrixTypes;
+        i1j1 *= NumMatrixTypes;
+
+        // flat offsets of the three starting cells and of the last cell
+        const int cell11 = NumMatrixTypes * (1 * (seq2Length + 1) + 1);
+        const int cell10 = NumMatrixTypes * (1 * (seq2Length + 1) + 0);
+        const int cell01 = NumMatrixTypes * (0 * (seq2Length + 1) + 1);
+        const int lastCell = NumMatrixTypes
+                * ((seq1Length + 1) * (seq2Length + 1) - 1);
+
+        // compute initial distribution posteriors: each state is
+        // counted once at its starting cell and once at the last cell
+        initCounts[0] = LOG_ADD(
+                forward[0 + cell11] + backward[0 + cell11],
+                forward[0 + lastCell] + backward[0 + lastCell]);
+        for (int k = 0; k < NumInsertStates; k++) {
+            initCounts[2 * k + 1] = LOG_ADD(
+                    forward[2 * k + 1 + cell10] + backward[2 * k + 1 + cell10],
+                    forward[2 * k + 1 + lastCell]
+                            + backward[2 * k + 1 + lastCell]);
+            initCounts[2 * k + 2] = LOG_ADD(
+                    forward[2 * k + 2 + cell01] + backward[2 * k + 2 + cell01],
+                    forward[2 * k + 2 + lastCell]
+                            + backward[2 * k + 2 + lastCell]);
+        }
+
+        // compute expected counts
+        for (int i = 0; i <= seq1Length; i++) {
+            unsigned char c1 =
+                    (i == 0) ? '~' :
+                            (unsigned char) toupper((unsigned char) iter1[i]);
+            for (int j = 0; j <= seq2Length; j++) {
+                unsigned char c2 =
+                        (j == 0) ? '~' :
+                                (unsigned char) toupper((unsigned char) iter2[j]);
+
+                if (i > 0 && j > 0) {
+                    // cell (1,1) is entered from the initial
+                    // distribution rather than from a predecessor
+                    if (enableTrainEmissions && i == 1 && j == 1) {
+                        LOG_PLUS_EQUALS(pairCounts[c1][c2],
+                                initialDistribution[0] + matchProb[c1][c2]
+                                        + backward[0 + ij]);
+                        LOG_PLUS_EQUALS(pairCounts[c2][c1],
+                                initialDistribution[0] + matchProb[c2][c1]
+                                        + backward[0 + ij]);
+                    }
+
+                    for (int k = 0; k < NumMatrixTypes; k++) {
+                        LOG_PLUS_EQUALS(transCounts[k][0],
+                                forward[k + i1j1] + transProb[k][0]
+                                        + matchProb[c1][c2] + backward[0 + ij]);
+                        if (enableTrainEmissions && (i != 1 || j != 1)) {//adding parentheses by Liu Yongchao, 5 Mar, 2010
+                            LOG_PLUS_EQUALS(pairCounts[c1][c2],
+                                    forward[k + i1j1] + transProb[k][0]
+                                            + matchProb[c1][c2]
+                                            + backward[0 + ij]);
+                            LOG_PLUS_EQUALS(pairCounts[c2][c1],
+                                    forward[k + i1j1] + transProb[k][0]
+                                            + matchProb[c2][c1]
+                                            + backward[0 + ij]);
+                        }
+                    }
+                }
+                if (i > 0) {
+                    for (int k = 0; k < NumInsertStates; k++) {
+                        LOG_PLUS_EQUALS(transCounts[0][2 * k + 1],
+                                forward[0 + i1j] + transProb[0][2 * k + 1]
+                                        + insProb[c1][k]
+                                        + backward[2 * k + 1 + ij]);
+                        LOG_PLUS_EQUALS(transCounts[2 * k + 1][2 * k + 1],
+                                forward[2 * k + 1 + i1j]
+                                        + transProb[2 * k + 1][2 * k + 1]
+                                        + insProb[c1][k]
+                                        + backward[2 * k + 1 + ij]);
+                        if (enableTrainEmissions) {
+                            if (i == 1 && j == 0) {
+                                // starting cell for this insert state
+                                LOG_PLUS_EQUALS(singleCounts[c1],
+                                        initialDistribution[2 * k + 1]
+                                                + insProb[c1][k]
+                                                + backward[2 * k + 1 + ij]);
+                            } else {
+                                LOG_PLUS_EQUALS(singleCounts[c1],
+                                        forward[0 + i1j]
+                                                + transProb[0][2 * k + 1]
+                                                + insProb[c1][k]
+                                                + backward[2 * k + 1 + ij]);
+                                LOG_PLUS_EQUALS(singleCounts[c1],
+                                        forward[2 * k + 1 + i1j]
+                                                + transProb[2 * k + 1][2 * k + 1]
+                                                + insProb[c1][k]
+                                                + backward[2 * k + 1 + ij]);
+                            }
+                        }
+                    }
+                }
+                if (j > 0) {
+                    for (int k = 0; k < NumInsertStates; k++) {
+                        LOG_PLUS_EQUALS(transCounts[0][2 * k + 2],
+                                forward[0 + ij1] + transProb[0][2 * k + 2]
+                                        + insProb[c2][k]
+                                        + backward[2 * k + 2 + ij]);
+                        LOG_PLUS_EQUALS(transCounts[2 * k + 2][2 * k + 2],
+                                forward[2 * k + 2 + ij1]
+                                        + transProb[2 * k + 2][2 * k + 2]
+                                        + insProb[c2][k]
+                                        + backward[2 * k + 2 + ij]);
+                        if (enableTrainEmissions) {
+                            if (i == 0 && j == 1) {
+                                // starting cell for this insert state
+                                LOG_PLUS_EQUALS(singleCounts[c2],
+                                        initialDistribution[2 * k + 2]
+                                                + insProb[c2][k]
+                                                + backward[2 * k + 2 + ij]);
+                            } else {
+                                LOG_PLUS_EQUALS(singleCounts[c2],
+                                        forward[0 + ij1]
+                                                + transProb[0][2 * k + 2]
+                                                + insProb[c2][k]
+                                                + backward[2 * k + 2 + ij]);
+                                LOG_PLUS_EQUALS(singleCounts[c2],
+                                        forward[2 * k + 2 + ij1]
+                                                + transProb[2 * k + 2][2 * k + 2]
+                                                + insProb[c2][k]
+                                                + backward[2 * k + 2 + ij]);
+                            }
+                        }
+                    }
+                }
+
+                ij += NumMatrixTypes;
+                i1j += NumMatrixTypes;
+                ij1 += NumMatrixTypes;
+                i1j1 += NumMatrixTypes;
+            }
+        }
+
+        // scale all expected counts appropriately (division by the
+        // total probability is a subtraction in log space)
+        for (int i = 0; i < NumMatrixTypes; i++) {
+            initCounts[i] -= totalProb;
+            for (int j = 0; j < NumMatrixTypes; j++)
+                transCounts[i][j] -= totalProb;
+        }
+        if (enableTrainEmissions) {
+            for (int i = 0; i < 256; i++) {
+                for (int j = 0; j < 256; j++)
+                    pairCounts[i][j] -= totalProb;
+                singleCounts[i] -= totalProb;
+            }
+        }
+
+        // compute new initial distribution
+        float totalInitDistribCounts = 0;
+        for (int i = 0; i < NumMatrixTypes; i++)
+            totalInitDistribCounts += exp(initCounts[i]); // should be 2
+        initDistribMat[0] = min(1.0f,
+                max(0.0f, (float) exp(initCounts[0]) / totalInitDistribCounts));
+        for (int k = 0; k < NumInsertStates; k++) {
+            float val =
+                    (exp(initCounts[2 * k + 1]) + exp(initCounts[2 * k + 2]))
+                            / 2;
+            initDistribMat[2 * k + 1] = initDistribMat[2 * k + 2] = min(1.0f,
+                    max(0.0f, val / totalInitDistribCounts));
+        }
+
+        // compute total counts for match state
+        float inMatchStateCounts = 0;
+        for (int i = 0; i < NumMatrixTypes; i++)
+            inMatchStateCounts += exp(transCounts[0][i]);
+        for (int i = 0; i < NumInsertStates; i++) {
+
+            // compute total counts for gap state
+            float inGapStateCounts = exp(transCounts[2 * i + 1][0])
+                    + exp(transCounts[2 * i + 1][2 * i + 1])
+                    + exp(transCounts[2 * i + 2][0])
+                    + exp(transCounts[2 * i + 2][2 * i + 2]);
+
+            gapOpen[2 * i] = gapOpen[2 * i + 1] = (exp(
+                    transCounts[0][2 * i + 1]) + exp(transCounts[0][2 * i + 2]))
+                    / (2 * inMatchStateCounts);
+
+            gapExtend[2 * i] = gapExtend[2 * i + 1] = (exp(
+                    transCounts[2 * i + 1][2 * i + 1])
+                    + exp(transCounts[2 * i + 2][2 * i + 2]))
+                    / inGapStateCounts;
+        }
+
+        if (enableTrainEmissions) {
+            float totalPairCounts = 0;
+            float totalSingleCounts = 0;
+            for (int i = 0; i < 256; i++) {
+                for (int j = 0; j <= i; j++)
+                    totalPairCounts += exp(pairCounts[j][i]);
+                totalSingleCounts += exp(singleCounts[i]);
+            }
+
+            // i and j already lie in 0..255, the range the <cctype>
+            // functions accept, so they are passed without a cast to
+            // char (which could turn values >= 128 negative)
+            for (int i = 0; i < 256; i++)
+                if (!islower(i)) {
+                    int li = (int) ((unsigned char) tolower(i));
+                    for (int j = 0; j <= i; j++)
+                        if (!islower(j)) {
+                            int lj = (int) ((unsigned char) tolower(j));
+                            // one value for every case combination
+                            // and for both orders of the pair
+                            emitPairs[i][j] =
+                                    emitPairs[i][lj] =
+                                            emitPairs[li][j] =
+                                                    emitPairs[li][lj] =
+                                                            emitPairs[j][i] =
+                                                                    emitPairs[j][li] =
+                                                                            emitPairs[lj][i] =
+                                                                                    emitPairs[lj][li] =
+                                                                                            exp(
+                                                                                                    pairCounts[j][i])
+                                                                                                    / totalPairCounts;
+                        }
+                    emitSingle[i] = emitSingle[li] = exp(singleCounts[i])
+                            / totalSingleCounts;
+                }
+        }
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // ProbabilisticModel::ComputeAlignment()
+    //
+    // Computes an alignment based on the given posterior matrix.
+    // This is done by finding the maximum summing path (or
+    // maximum weight trace) through the posterior matrix.
The
+    // final alignment is returned as a pair consisting of:
+    //    (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and Y's
+    //        denote insertions in one of the two sequences and
+    //        B's denote that both sequences are present (i.e.
+    //        matches).
+    //    (2) a float indicating the sum achieved
+    //
+    // The returned vector is allocated with new and is not freed here.
+    /////////////////////////////////////////////////////////////////
+
+    pair<SafeVector<char> *, float> ComputeAlignment(int seq1Length,
+            int seq2Length, const VF &posterior) const {
+
+        // two rolling rows of scores; the full traceback is kept
+        float *twoRows = new float[(seq2Length + 1) * 2];
+        assert(twoRows);
+        float *oldRow = twoRows;
+        float *newRow = twoRows + seq2Length + 1;
+
+        char *tracebackMatrix = new char[(seq1Length + 1) * (seq2Length + 1)];
+        assert(tracebackMatrix);
+        char *tracebackPtr = tracebackMatrix;
+
+        // row 0 of the posterior matrix is skipped
+        VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
+
+        // initialization
+        for (int i = 0; i <= seq2Length; i++) {
+            oldRow[i] = 0;
+            *(tracebackPtr++) = 'L';
+        }
+
+        // fill in matrix
+        for (int i = 1; i <= seq1Length; i++) {
+
+            // initialize left column
+            newRow[0] = 0;
+            posteriorPtr++;
+            *(tracebackPtr++) = 'U';
+
+            // fill in rest of row
+            for (int j = 1; j <= seq2Length; j++) {
+                ChooseBestOfThree(*(posteriorPtr++) + oldRow[j - 1],
+                        newRow[j - 1], oldRow[j], 'D', 'L', 'U', &newRow[j],
+                        tracebackPtr++);
+            }
+
+            // swap rows
+            float *temp = oldRow;
+            oldRow = newRow;
+            newRow = temp;
+        }
+
+        // store best score
+        float total = oldRow[seq2Length];
+        delete[] twoRows;
+
+        // compute traceback
+        SafeVector<char> *alignment = new SafeVector<char>;
+        assert(alignment);
+        int r = seq1Length, c = seq2Length;
+        while (r != 0 || c != 0) {
+            char ch = tracebackMatrix[r * (seq2Length + 1) + c];
+            switch (ch) {
+            case 'L':
+                c--;
+                alignment->push_back('Y');
+                break;
+            case 'U':
+                r--;
+                alignment->push_back('X');
+                break;
+            case 'D':
+                c--;
+                r--;
+                alignment->push_back('B');
+                break;
+            default:
+                assert(false);
+            }
+        }
+
+        delete[] tracebackMatrix;
+
+        reverse(alignment->begin(), alignment->end());
+
+        return make_pair(alignment, total);
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // ProbabilisticModel::ComputeAlignmentWithGapPenalties()
+    //
+    // Similar to ComputeAlignment() except with gap penalties.
+    //
+    // align1 / align2: the two groups of sequences being aligned; the
+    //     length of each group is taken from its first sequence.
+    // numSeqs1 / numSeqs2: sizes used for the penalty tables.  The
+    //     tables are indexed by per-column counts of the sequences in
+    //     align1 / align2, so these presumably equal
+    //     GetNumSequences() of the respective group -- confirm at the
+    //     call site.
+    // Returns the alignment string (allocated with new, not freed
+    //     here) paired with the constant 1.0f; the best score is only
+    //     used to pick the matrix in which the traceback starts.
+    /////////////////////////////////////////////////////////////////
+
+    pair<SafeVector<char> *, float> ComputeAlignmentWithGapPenalties(
+            MultiSequence *align1, MultiSequence *align2, const VF &posterior,
+            int numSeqs1, int numSeqs2, float gapOpenPenalty,
+            float gapContinuePenalty) const {
+        int seq1Length = align1->GetSequence(0)->GetLength();
+        int seq2Length = align2->GetSequence(0)->GetLength();
+
+        // the number of active sequences at any given column is defined to be the
+        // number of non-gap characters in that column; the number of gap opens at
+        // any given column is defined to be the number of gap characters in that
+        // column where the previous character in the respective sequence was not
+        // a gap
+        // NOTE(review): the loops below increment numGapOpens for a
+        // non-gap character whose predecessor is also a non-gap
+        // (columns after the first), which does not match the
+        // definition above -- left as is, confirm the intent.
+        SafeVector<int> numActive1(seq1Length + 1), numGapOpens1(
+                seq1Length + 1);
+        SafeVector<int> numActive2(seq2Length + 1), numGapOpens2(
+                seq2Length + 1);
+
+        // compute number of active sequences and gap opens for each group
+        for (int i = 0; i < align1->GetNumSequences(); i++) {
+            SafeVector<char>::iterator dataPtr =
+                    align1->GetSequence(i)->GetDataPtr();
+            numActive1[0] = numGapOpens1[0] = 0;
+            for (int j = 1; j <= seq1Length; j++) {
+                if (dataPtr[j] != '-') {
+                    numActive1[j]++;
+                    numGapOpens1[j] += (j != 1 && dataPtr[j - 1] != '-');
+                }
+            }
+        }
+        for (int i = 0; i < align2->GetNumSequences(); i++) {
+            SafeVector<char>::iterator dataPtr =
+                    align2->GetSequence(i)->GetDataPtr();
+            numActive2[0] = numGapOpens2[0] = 0;
+            for (int j = 1; j <= seq2Length; j++) {
+                if (dataPtr[j] != '-') {
+                    numActive2[j]++;
+                    numGapOpens2[j] += (j != 1 && dataPtr[j - 1] != '-');
+                }
+            }
+        }
+
+        // openingPenalty1 is indexed [count in group 1][count in
+        // group 2]; openingPenalty2 is indexed the other way round,
+        // [count in group 2][count in group 1], so its dimensions are
+        // swapped.  Sizing both tables identically overruns
+        // openingPenalty2 whenever numSeqs1 != numSeqs2.
+        VVF openingPenalty1(numSeqs1 + 1, VF(numSeqs2 + 1));
+        VF continuingPenalty1(numSeqs1 + 1);
+        VVF openingPenalty2(numSeqs2 + 1, VF(numSeqs1 + 1));
+        VF continuingPenalty2(numSeqs2 + 1);
+
+        // precompute penalties
+        for (int i = 0; i <= numSeqs1; i++)
+            for (int j = 0; j <= numSeqs2; j++)
+                openingPenalty1[i][j] = i
+                        * (gapOpenPenalty * j
+                                + gapContinuePenalty * (numSeqs2 - j));
+        for (int i = 0; i <= numSeqs1; i++)
+            continuingPenalty1[i] = i * gapContinuePenalty * numSeqs2;
+        for (int i = 0; i <= numSeqs2; i++)
+            for (int j = 0; j <= numSeqs1; j++)
+                openingPenalty2[i][j] = i
+                        * (gapOpenPenalty * j
+                                + gapContinuePenalty * (numSeqs1 - j));
+        for (int i = 0; i <= numSeqs2; i++)
+            continuingPenalty2[i] = i * gapContinuePenalty * numSeqs1;
+
+        // one buffer holding an old and a new row for each of the
+        // three matrices (match, insert X, insert Y)
+        float *twoRows = new float[6 * (seq2Length + 1)];
+        assert(twoRows);
+        float *oldRowMatch = twoRows;
+        float *newRowMatch = twoRows + (seq2Length + 1);
+        float *oldRowInsertX = twoRows + 2 * (seq2Length + 1);
+        float *newRowInsertX = twoRows + 3 * (seq2Length + 1);
+        float *oldRowInsertY = twoRows + 4 * (seq2Length + 1);
+        float *newRowInsertY = twoRows + 5 * (seq2Length + 1);
+
+        // three traceback entries per cell, in the order M, X, Y
+        char *tracebackMatrix =
+                new char[3 * (seq1Length + 1) * (seq2Length + 1)];
+        assert(tracebackMatrix);
+        char *tracebackPtr = tracebackMatrix;
+
+        // row 0 of the posterior matrix is skipped
+        VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
+
+        // initialization
+        for (int i = 0; i <= seq2Length; i++) {
+            oldRowMatch[i] = oldRowInsertX[i] = (i == 0) ? 0 : LOG_ZERO;
+            oldRowInsertY[i] =
+                    (i == 0) ?
+                            0 :
+                            oldRowInsertY[i - 1]
+                                    + continuingPenalty2[numActive2[i]];
+            *(tracebackPtr) = *(tracebackPtr + 1) = *(tracebackPtr + 2) = 'Y';
+            tracebackPtr += 3;
+        }
+
+        // fill in matrix
+        for (int i = 1; i <= seq1Length; i++) {
+
+            // initialize left column
+            newRowMatch[0] = newRowInsertY[0] = LOG_ZERO;
+            newRowInsertX[0] = oldRowInsertX[0]
+                    + continuingPenalty1[numActive1[i]];
+            posteriorPtr++;
+            *(tracebackPtr) = *(tracebackPtr + 1) = *(tracebackPtr + 2) = 'X';
+            tracebackPtr += 3;
+
+            // fill in rest of row
+            for (int j = 1; j <= seq2Length; j++) {
+
+                // going to MATCH state
+                ChooseBestOfThree(oldRowMatch[j - 1], oldRowInsertX[j - 1],
+                        oldRowInsertY[j - 1], 'M', 'X', 'Y', &newRowMatch[j],
+                        tracebackPtr++);
+                newRowMatch[j] += *(posteriorPtr++);
+
+                // going to INSERT X state
+                ChooseBestOfThree(
+                        oldRowMatch[j]
+                                + openingPenalty1[numActive1[i]][numGapOpens2[j]],
+                        oldRowInsertX[j] + continuingPenalty1[numActive1[i]],
+                        oldRowInsertY[j]
+                                + openingPenalty1[numActive1[i]][numGapOpens2[j]],
+                        'M', 'X', 'Y', &newRowInsertX[j], tracebackPtr++);
+
+                // going to INSERT Y state
+                ChooseBestOfThree(
+                        newRowMatch[j - 1]
+                                + openingPenalty2[numActive2[j]][numGapOpens1[i]],
+                        newRowInsertX[j - 1]
+                                + openingPenalty2[numActive2[j]][numGapOpens1[i]],
+                        newRowInsertY[j - 1]
+                                + continuingPenalty2[numActive2[j]], 'M', 'X',
+                        'Y', &newRowInsertY[j], tracebackPtr++);
+            }
+
+            // swap rows
+            float *temp;
+            temp = oldRowMatch;
+            oldRowMatch = newRowMatch;
+            newRowMatch = temp;
+            temp = oldRowInsertX;
+            oldRowInsertX = newRowInsertX;
+            newRowInsertX = temp;
+            temp = oldRowInsertY;
+            oldRowInsertY = newRowInsertY;
+            newRowInsertY = temp;
+        }
+
+        // store best score; `matrix` is the matrix the traceback
+        // starts in
+        float total;
+        char matrix;
+        ChooseBestOfThree(oldRowMatch[seq2Length], oldRowInsertX[seq2Length],
+                oldRowInsertY[seq2Length], 'M', 'X', 'Y', &total, &matrix);
+
+        delete[] twoRows;
+
+        // compute traceback
+        SafeVector<char> *alignment = new SafeVector<char>;
+        assert(alignment);
+        int r = seq1Length, c = seq2Length;
+        while (r != 0 || c != 0) {
+
+            // read the predecessor matrix before r and c are changed
+            int offset = (matrix == 'M') ? 0 : (matrix == 'X') ? 1 : 2;
+            char ch = tracebackMatrix[(r * (seq2Length + 1) + c) * 3 + offset];
+            switch (matrix) {
+            case 'Y':
+                c--;
+                alignment->push_back('Y');
+                break;
+            case 'X':
+                r--;
+                alignment->push_back('X');
+                break;
+            case 'M':
+                c--;
+                r--;
+                alignment->push_back('B');
+                break;
+            default:
+                assert(false);
+            }
+            matrix = ch;
+        }
+
+        delete[] tracebackMatrix;
+
+        reverse(alignment->begin(), alignment->end());
+
+        return make_pair(alignment, 1.0f);
+    }
+
+ ///////////////////////////////////////////////////////////////// + // ProbabilisticModel::ComputeViterbiAlignment() + // + // Computes the highest probability pairwise alignment using the + // probabilistic model. The final alignment is returned as a + // pair consisting of: + // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and + // denote insertions in one of the two sequences and + // B's denote that both sequences are present (i.e. + // matches). + // (2) a float containing the log probability of the best + // alignment (not used) + ///////////////////////////////////////////////////////////////// + + pair *, float> ComputeViterbiAlignment(Sequence *seq1, + Sequence *seq2) const { + + assert(seq1); + assert(seq2); + + const int seq1Length = seq1->GetLength(); + const int seq2Length = seq2->GetLength(); + + // retrieve the points to the beginning of each sequence + SafeVector::iterator iter1 = seq1->GetDataPtr(); + SafeVector::iterator iter2 = seq2->GetDataPtr(); + + // create viterbi matrix + VF *viterbiPtr = new VF( + NumMatrixTypes * (seq1Length + 1) * (seq2Length + 1), LOG_ZERO); + assert(viterbiPtr); + VF &viterbi = *viterbiPtr; + + // create traceback matrix + VI *tracebackPtr = new VI( + NumMatrixTypes * (seq1Length + 1) * (seq2Length + 1), -1); + assert(tracebackPtr); + VI &traceback = *tracebackPtr; + + // initialization condition + for (int k = 0; k < NumMatrixTypes; k++) + viterbi[k] = initialDistribution[k]; + + //
remember offset for each index combination + int ij = 0; + int i1j = -seq2Length - 1; + int ij1 = -1; + int i1j1 = -seq2Length - 2; + + ij *= NumMatrixTypes; + i1j *= NumMatrixTypes; + ij1 *= NumMatrixTypes; + i1j1 *= NumMatrixTypes; + + // compute viterbi scores + for (int i = 0; i <= seq1Length; i++) { + unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i]; + for (int j = 0; j <= seq2Length; j++) { + unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j]; + + if (i > 0 && j > 0) { + for (int k = 0; k < NumMatrixTypes; k++) { + float newVal = viterbi[k + i1j1] + transProb[k][0] + + matchProb[c1][c2]; + if (viterbi[0 + ij] < newVal) { + viterbi[0 + ij] = newVal; + traceback[0 + ij] = k; + } + } + } + if (i > 0) { + for (int k = 0; k < NumInsertStates; k++) { + float valFromMatch = insProb[c1][k] + viterbi[0 + i1j] + + transProb[0][2 * k + 1]; + float valFromIns = insProb[c1][k] + + viterbi[2 * k + 1 + i1j] + + transProb[2 * k + 1][2 * k + 1]; + if (valFromMatch >= valFromIns) { + viterbi[2 * k + 1 + ij] = valFromMatch; + traceback[2 * k + 1 + ij] = 0; + } else { + viterbi[2 * k + 1 + ij] = valFromIns; + traceback[2 * k + 1 + ij] = 2 * k + 1; + } + } + } + if (j > 0) { + for (int k = 0; k < NumInsertStates; k++) { + float valFromMatch = insProb[c2][k] + viterbi[0 + ij1] + + transProb[0][2 * k + 2]; + float valFromIns = insProb[c2][k] + + viterbi[2 * k + 2 + ij1] + + transProb[2 * k + 2][2 * k + 2]; + if (valFromMatch >= valFromIns) { + viterbi[2 * k + 2 + ij] = valFromMatch; + traceback[2 * k + 2 + ij] = 0; + } else { + viterbi[2 * k + 2 + ij] = valFromIns; + traceback[2 * k + 2 + ij] = 2 * k + 2; + } + } + } + + ij += NumMatrixTypes; + i1j += NumMatrixTypes; + ij1 += NumMatrixTypes; + i1j1 += NumMatrixTypes; + } + } + + // figure out best terminating cell + float bestProb = LOG_ZERO; + int state = -1; + for (int k = 0; k < NumMatrixTypes; k++) { + float thisProb = + viterbi[k + + NumMatrixTypes + * ((seq1Length + 1) * (seq2Length + 1) - 1)] + + 
initialDistribution[k]; + if (bestProb < thisProb) { + bestProb = thisProb; + state = k; + } + } + assert(state != -1); + + delete viterbiPtr; + + // compute traceback + SafeVector *alignment = new SafeVector; + assert(alignment); + int r = seq1Length, c = seq2Length; + while (r != 0 || c != 0) { + int newState = traceback[state + + NumMatrixTypes * (r * (seq2Length + 1) + c)]; + + if (state == 0) { + c--; + r--; + alignment->push_back('B'); + } else if (state % 2 == 1) { + r--; + alignment->push_back('X'); + } else { + c--; + alignment->push_back('Y'); + } + + state = newState; + } + + delete tracebackPtr; + + reverse(alignment->begin(), alignment->end()); + + return make_pair(alignment, bestProb); + } + + ///////////////////////////////////////////////////////////////// + // ProbabilisticModel::BuildPosterior() + // + // Builds a posterior probability matrix needed to align a pair + // of alignments. Mathematically, the returned matrix M is + // defined as follows: + // M[i,j] = sum sum f(s,t,i,j) + // s in align1 t in align2 + // where + // [ P(s[i'] <--> t[j']) + // [ if s[i'] is a letter in the ith column of align1 and + // [ t[j'] it a letter in the jth column of align2 + // f(s,t,i,j) = [ + // [ 0 otherwise + // + ///////////////////////////////////////////////////////////////// + + VF *BuildPosterior(MultiSequence *align1, MultiSequence *align2, + const SafeVector > &sparseMatrices, + float cutoff = 0.0f) const { + const int seq1Length = align1->GetSequence(0)->GetLength(); + const int seq2Length = align2->GetSequence(0)->GetLength(); + + VF *posteriorPtr = new VF((seq1Length + 1) * (seq2Length + 1), 0); + assert(posteriorPtr); + VF &posterior = *posteriorPtr; + VF::iterator postPtr = posterior.begin(); + + // for each s in align1 + for (int i = 0; i < align1->GetNumSequences(); i++) { + int first = align1->GetSequence(i)->GetLabel(); + SafeVector *mapping1 = align1->GetSequence(i)->GetMapping(); + + // for each t in align2 + for (int j = 0; j < 
align2->GetNumSequences(); j++) { + int second = align2->GetSequence(j)->GetLabel(); + SafeVector *mapping2 = + align2->GetSequence(j)->GetMapping(); + + if (first < second) { + + // get the associated sparse matrix + SparseMatrix *matrix = sparseMatrices[first][second]; + + for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++) { + SafeVector::iterator row = matrix->GetRowPtr(ii); + int base = (*mapping1)[ii] * (seq2Length + 1); + int rowSize = matrix->GetRowSize(ii); + + // add in all relevant values + for (int jj = 0; jj < rowSize; jj++) + posterior[base + (*mapping2)[row[jj].first]] += + row[jj].second; + + // subtract cutoff + for (int jj = 0; jj < matrix->GetSeq2Length(); jj++) + posterior[base + (*mapping2)[jj]] -= cutoff; + } + + } else { + + // get the associated sparse matrix + SparseMatrix *matrix = sparseMatrices[second][first]; + + for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++) { + SafeVector::iterator row = matrix->GetRowPtr(jj); + int base = (*mapping2)[jj]; + int rowSize = matrix->GetRowSize(jj); + + // add in all relevant values + for (int ii = 0; ii < rowSize; ii++) + posterior[base + + (*mapping1)[row[ii].first] + * (seq2Length + 1)] += + row[ii].second; + + // subtract cutoff + for (int ii = 0; ii < matrix->GetSeq2Length(); ii++) + posterior[base + (*mapping1)[ii] * (seq2Length + 1)] -= + cutoff; + } + + } + + delete mapping2; + } + + delete mapping1; + } + + return posteriorPtr; + } + //added by Liu Yongchao.Feb 23, 2010 + VF *BuildPosterior(int* seqsWeights, MultiSequence *align1, + MultiSequence *align2, + const SafeVector > &sparseMatrices, + float cutoff = 0.0f) const { + const int seq1Length = align1->GetSequence(0)->GetLength(); + const int seq2Length = align2->GetSequence(0)->GetLength(); + + VF *posteriorPtr = new VF((seq1Length + 1) * (seq2Length + 1), 0); + assert(posteriorPtr); + VF &posterior = *posteriorPtr; + VF::iterator postPtr = posterior.begin(); + + //compute the total sum of all weights + float totalWeights = 0; + for 
(int i = 0; i < align1->GetNumSequences(); i++) { + int first = align1->GetSequence(i)->GetLabel(); + int w1 = seqsWeights[first]; + for (int j = 0; j < align2->GetNumSequences(); j++) { + int second = align2->GetSequence(j)->GetLabel(); + int w2 = seqsWeights[second]; + + totalWeights += w1 * w2; + } + } + // for each s in align1 + for (int i = 0; i < align1->GetNumSequences(); i++) { + int first = align1->GetSequence(i)->GetLabel(); + int w1 = seqsWeights[first]; + SafeVector *mapping1 = align1->GetSequence(i)->GetMapping(); + // for each t in align2 + for (int j = 0; j < align2->GetNumSequences(); j++) { + int second = align2->GetSequence(j)->GetLabel(); + int w2 = seqsWeights[second]; + SafeVector *mapping2 = + align2->GetSequence(j)->GetMapping(); + + float w = (float) (w1 * w2) / totalWeights; + if (first < second) { + + // get the associated sparse matrix + SparseMatrix *matrix = sparseMatrices[first][second]; + + for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++) { + SafeVector::iterator row = matrix->GetRowPtr(ii); + int base = (*mapping1)[ii] * (seq2Length + 1); + int rowSize = matrix->GetRowSize(ii); + + // add in all relevant values + for (int jj = 0; jj < rowSize; jj++) + posterior[base + (*mapping2)[row[jj].first]] += w + * row[jj].second; + + // subtract cutoff + for (int jj = 0; jj < matrix->GetSeq2Length(); jj++) + posterior[base + (*mapping2)[jj]] -= w * cutoff; + } + + } else { + + // get the associated sparse matrix + SparseMatrix *matrix = sparseMatrices[second][first]; + + for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++) { + SafeVector::iterator row = matrix->GetRowPtr(jj); + int base = (*mapping2)[jj]; + int rowSize = matrix->GetRowSize(jj); + + // add in all relevant values + for (int ii = 0; ii < rowSize; ii++) + posterior[base + + (*mapping1)[row[ii].first] + * (seq2Length + 1)] += w + * row[ii].second; + + // subtract cutoff + for (int ii = 0; ii < matrix->GetSeq2Length(); ii++) + posterior[base + (*mapping1)[ii] * (seq2Length + 
1)] -= + w * cutoff; + } + + } + + delete mapping2; + } + + delete mapping1; + } + + return posteriorPtr; + } +}; + +#endif diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/SafeVector.h b/binaries/src/MSAProbs-0.9.7/MSAProbs/SafeVector.h new file mode 100644 index 0000000..f42c2da --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/MSAProbs/SafeVector.h @@ -0,0 +1,65 @@ +///////////////////////////////////////////////////////////////// +// SafeVector.h +// +// STL vector with array bounds checking. To enable bounds +// checking, #define ENABLE_CHECKS. +///////////////////////////////////////////////////////////////// + +#ifndef SAFEVECTOR_H +#define SAFEVECTOR_H + +#include +#include +using namespace std; + +///////////////////////////////////////////////////////////////// +// SafeVector +// +// Class derived from the STL std::vector for bounds checking. +///////////////////////////////////////////////////////////////// + +template +class SafeVector: public std::vector { +public: + + // miscellaneous constructors + SafeVector() : + std::vector() { + } + SafeVector(size_t size) : + std::vector(size) { + } + SafeVector(size_t size, const TYPE &value) : + std::vector(size, value) { + } + SafeVector(const SafeVector &source) : + std::vector(source) { + } + +#ifdef ENABLE_CHECKS + + // [] array bounds checking + TYPE &operator[](int index) { + assert (index >= 0 && index < (int) size()); + return std::vector::operator[] ((size_t) index); + } + + // [] const array bounds checking + const TYPE &operator[] (int index) const { + assert (index >= 0 && index < (int) size()); + return std::vector::operator[] ((size_t) index); + } + +#endif + +}; + +// some commonly used vector types +typedef SafeVector VI; +typedef SafeVector VVI; +typedef SafeVector VVVI; +typedef SafeVector VF; +typedef SafeVector VVF; +typedef SafeVector VVVF; + +#endif diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/ScoreType.h b/binaries/src/MSAProbs-0.9.7/MSAProbs/ScoreType.h new file mode 100644 index 
0000000..47de13d --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/MSAProbs/ScoreType.h @@ -0,0 +1,368 @@ +///////////////////////////////////////////////////////////////// +// ScoreType.h +// +// Routines for doing math operations in MSAPROBS +///////////////////////////////////////////////////////////////// + +#ifndef SCORETYPE_H +#define SCORETYPE_H + +#include +#include +#include +#include + +typedef float ScoreType; + +const float LOG_ZERO = -2e20; +const float LOG_ONE = 0.0; + +///////////////////////////////////////////////////////////////// +// LOG() +// +// Compute the logarithm of x. +///////////////////////////////////////////////////////////////// + +inline ScoreType LOG(ScoreType x) { + return log(x); +} + +///////////////////////////////////////////////////////////////// +// EXP() +// +// Computes exp(x). +///////////////////////////////////////////////////////////////// + +inline ScoreType EXP(ScoreType x) { + //return exp(x); + if (x > -2) { + if (x > -0.5) { + if (x > 0) + return exp(x); + return (((0.03254409303190190000 * x + 0.16280432765779600000) * x + + 0.49929760485974900000) * x + 0.99995149601363700000) * x + + 0.99999925508501600000; + } + if (x > -1) + return (((0.01973899026052090000 * x + 0.13822379685007000000) * x + + 0.48056651562365000000) * x + 0.99326940370383500000) * x + + 0.99906756856399500000; + return (((0.00940528203591384000 * x + 0.09414963667859410000) * x + + 0.40825793595877300000) * x + 0.93933625499130400000) * x + + 0.98369508190545300000; + } + if (x > -8) { + if (x > -4) + return (((0.00217245711583303000 * x + 0.03484829428350620000) * x + + 0.22118199801337800000) * x + 0.67049462206469500000) * x + + 0.83556950223398500000; + return (((0.00012398771025456900 * x + 0.00349155785951272000) * x + + 0.03727721426017900000) * x + 0.17974997741536900000) * x + + 0.33249299994217400000; + } + if (x > -16) + return (((0.00000051741713416603 * x + 0.00002721456879608080) * x + + 0.00053418601865636800) * x + 
0.00464101989351936000) * x + + 0.01507447981459420000; + return 0; +} + +/* + ///////////////////////////////////////////////////////////////// + // LOOKUP() + // + // Computes log (exp (x) + 1), for 0 <= x <= 7.5. + ///////////////////////////////////////////////////////////////// + + inline ScoreType LOOKUP (ScoreType x){ + //return log (exp(x) + 1); + if (x < 2){ + if (x < 0.5){ + if (x < 0) + return log (exp(x) + 1); + return (((-0.00486373205785640000*x - 0.00020245408813934800)*x + 0.12504222666029800000)*x + 0.49999685320563000000)*x + 0.69314723138948900000; + } + if (x < 1) + return (((-0.00278634205460548000*x - 0.00458097251248546000)*x + 0.12865849880472500000)*x + 0.49862228499205200000)*x + 0.69334810088688000000; + return (((0.00059633755154209200*x - 0.01918996666063320000)*x + 0.15288232492093800000)*x + 0.48039958825756900000)*x + 0.69857578503189200000; + } + if (x < 8){ + if (x < 4) + return (((0.00135958539181047000*x - 0.02329807659316430000)*x + 0.15885799609532100000)*x + 0.48167498563270800000)*x + 0.69276185058669200000; + return (((0.00011992394456683500*x - 0.00338464503306568000)*x + 0.03622746366545470000)*x + 0.82481250248383700000)*x + 0.32507892994863100000; + } + if (x < 16) + return (((0.00000051726300753785*x - 0.00002720671238876090)*x + 0.00053403733818413500)*x + 0.99536021775747900000)*x + 0.01507065715532010000; + return x; + } + + ///////////////////////////////////////////////////////////////// + // LOOKUP_SLOW() + // + // Computes log (exp (x) + 1). 
+ ///////////////////////////////////////////////////////////////// + + inline ScoreType LOOKUP_SLOW (ScoreType x){ + return log (exp (x) + 1); + } + + ///////////////////////////////////////////////////////////////// + // MAX() + // + // Compute max of three numbers + ///////////////////////////////////////////////////////////////// + + inline ScoreType MAX (ScoreType x, ScoreType y, ScoreType z){ + if (x >= y){ + if (x >= z) + return x; + return z; + } + if (y >= z) + return y; + return z; + } + + ///////////////////////////////////////////////////////////////// + // LOG_PLUS_EQUALS() + // + // Add two log probabilities and store in the first argument + ///////////////////////////////////////////////////////////////// + + inline void LOG_PLUS_EQUALS (ScoreType &x, ScoreType y){ + if (x < y) + x = (x <= LOG_ZERO) ? y : LOOKUP(y-x) + x; + else + x = (y <= LOG_ZERO) ? x : LOOKUP(x-y) + y; + } + + ///////////////////////////////////////////////////////////////// + // LOG_PLUS_EQUALS_SLOW() + // + // Add two log probabilities and store in the first argument + ///////////////////////////////////////////////////////////////// + + inline void LOG_PLUS_EQUALS_SLOW (ScoreType &x, ScoreType y){ + if (x < y) + x = (x <= LOG_ZERO) ? y : LOOKUP_SLOW(y-x) + x; + else + x = (y <= LOG_ZERO) ? x : LOOKUP_SLOW(x-y) + y; + } + + ///////////////////////////////////////////////////////////////// + // LOG_ADD() + // + // Add two log probabilities + ///////////////////////////////////////////////////////////////// + + inline ScoreType LOG_ADD (ScoreType x, ScoreType y){ + if (x < y) return (x <= LOG_ZERO) ? y : LOOKUP(y-x) + x; + return (y <= LOG_ZERO) ? x : LOOKUP(x-y) + y; + } + */ + +/* + ///////////////////////////////////////////////////////////////// + // LOG() + // + // Compute the logarithm of x. 
+ ///////////////////////////////////////////////////////////////// + + inline float LOG (float x){ + return log (x); + } + + ///////////////////////////////////////////////////////////////// + // EXP() + // + // Computes exp(x), fr -4.6 <= x <= 0. + ///////////////////////////////////////////////////////////////// + + inline float EXP (float x){ + assert (x <= 0.00f); + if (x < EXP_UNDERFLOW_THRESHOLD) return 0.0f; + return (((0.006349841068584 * x + 0.080775412572352) * x + 0.397982026296272) * x + 0.95279335963787f) * x + 0.995176455837312f; + //return (((0.00681169825657f * x + 0.08386267698832f) * x + 0.40413983195844f) * x + 0.95656674979767f) * x + 0.99556744049130f; + } + */ + +const float EXP_UNDERFLOW_THRESHOLD = -4.6; +const float LOG_UNDERFLOW_THRESHOLD = 7.5; + +///////////////////////////////////////////////////////////////// +// LOOKUP() +// +// Computes log (exp (x) + 1), for 0 <= x <= 7.5. +///////////////////////////////////////////////////////////////// + +inline float LOOKUP(float x) { + assert(x >= 0.00f); + assert(x <= LOG_UNDERFLOW_THRESHOLD); + //return ((-0.00653779113685f * x + 0.09537236626558f) * x + 0.55317574459331f) * x + 0.68672959851568f; + if (x <= 1.00f) + return ((-0.009350833524763f * x + 0.130659527668286f) * x + + 0.498799810682272f) * x + 0.693203116424741f; + if (x <= 2.50f) + return ((-0.014532321752540f * x + 0.139942324101744f) * x + + 0.495635523139337f) * x + 0.692140569840976f; + if (x <= 4.50f) + return ((-0.004605031767994f * x + 0.063427417320019f) * x + + 0.695956496475118f) * x + 0.514272634594009f; + assert(x <= LOG_UNDERFLOW_THRESHOLD); + return ((-0.000458661602210f * x + 0.009695946122598f) * x + + 0.930734667215156f) * x + 0.168037164329057f; + + //return (((0.00089738532761f * x - 0.01859488697982f) * x + 0.14415772028626f) * x + 0.49515490689159f) * x + 0.69311928966454f; +} + +///////////////////////////////////////////////////////////////// +// LOOKUP_SLOW() +// +// Computes log (exp (x) + 1). 
+///////////////////////////////////////////////////////////////// + +inline float LOOKUP_SLOW(float x) { + return log(exp(x) + 1); +} + +///////////////////////////////////////////////////////////////// +// MAX() +// +// Compute max of three numbers +///////////////////////////////////////////////////////////////// + +inline float MAX(float x, float y, float z) { + if (x >= y) { + if (x >= z) + return x; + return z; + } + if (y >= z) + return y; + return z; +} + +///////////////////////////////////////////////////////////////// +// LOG_PLUS_EQUALS() +// +// Add two log probabilities and store in the first argument +///////////////////////////////////////////////////////////////// + +inline void LOG_PLUS_EQUALS(float &x, float y) { + if (x < y) + x = (x == LOG_ZERO || y - x >= LOG_UNDERFLOW_THRESHOLD) ? + y : LOOKUP(y - x) + x; + else + x = (y == LOG_ZERO || x - y >= LOG_UNDERFLOW_THRESHOLD) ? + x : LOOKUP(x - y) + y; +} + +///////////////////////////////////////////////////////////////// +// LOG_PLUS_EQUALS_SLOW() +// +// Add two log probabilities and store in the first argument +///////////////////////////////////////////////////////////////// + +inline void LOG_PLUS_EQUALS_SLOW(float &x, float y) { + if (x < y) + x = (x == LOG_ZERO) ? y : LOOKUP_SLOW(y - x) + x; + else + x = (y == LOG_ZERO) ? x : LOOKUP_SLOW(x - y) + y; +} + +///////////////////////////////////////////////////////////////// +// LOG_ADD() +// +// Add two log probabilities +///////////////////////////////////////////////////////////////// + +inline float LOG_ADD(float x, float y) { + if (x < y) + return (x == LOG_ZERO || y - x >= LOG_UNDERFLOW_THRESHOLD) ? + y : LOOKUP(y - x) + x; + return (y == LOG_ZERO || x - y >= LOG_UNDERFLOW_THRESHOLD) ? 
+ x : LOOKUP(x - y) + y; +} + +///////////////////////////////////////////////////////////////// +// LOG_ADD() +// +// Add three log probabilities +///////////////////////////////////////////////////////////////// + +inline float LOG_ADD(float x1, float x2, float x3) { + return LOG_ADD(x1, LOG_ADD(x2, x3)); +} + +///////////////////////////////////////////////////////////////// +// LOG_ADD() +// +// Add four log probabilities +///////////////////////////////////////////////////////////////// + +inline float LOG_ADD(float x1, float x2, float x3, float x4) { + return LOG_ADD(x1, LOG_ADD(x2, LOG_ADD(x3, x4))); +} + +///////////////////////////////////////////////////////////////// +// LOG_ADD() +// +// Add five log probabilities +///////////////////////////////////////////////////////////////// + +inline float LOG_ADD(float x1, float x2, float x3, float x4, float x5) { + return LOG_ADD(x1, LOG_ADD(x2, LOG_ADD(x3, LOG_ADD(x4, x5)))); +} + +///////////////////////////////////////////////////////////////// +// LOG_ADD() +// +// Add siz log probabilities +///////////////////////////////////////////////////////////////// + +inline float LOG_ADD(float x1, float x2, float x3, float x4, float x5, + float x6) { + return LOG_ADD(x1, LOG_ADD(x2, LOG_ADD(x3, LOG_ADD(x4, LOG_ADD(x5, x6))))); +} + +///////////////////////////////////////////////////////////////// +// LOG_ADD() +// +// Add seven log probabilities +///////////////////////////////////////////////////////////////// + +inline float LOG_ADD(float x1, float x2, float x3, float x4, float x5, float x6, + float x7) { + return LOG_ADD(x1, + LOG_ADD(x2, LOG_ADD(x3, LOG_ADD(x4, LOG_ADD(x5, LOG_ADD(x6, x7)))))); +} + +///////////////////////////////////////////////////////////////// +// ChooseBestOfThree() +// +// Store the largest of three values x1, x2, and x3 in *x. Also +// if xi is the largest value, then store bi in *b. 
+///////////////////////////////////////////////////////////////// + +inline void ChooseBestOfThree(float x1, float x2, float x3, char b1, char b2, + char b3, float *x, char *b) { + if (x1 >= x2) { + if (x1 >= x3) { + *x = x1; + *b = b1; + return; + } + *x = x3; + *b = b3; + return; + } + if (x2 >= x3) { + *x = x2; + *b = b2; + return; + } + *x = x3; + *b = b3; +} + +#endif diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/Sequence.h b/binaries/src/MSAProbs-0.9.7/MSAProbs/Sequence.h new file mode 100644 index 0000000..5bd1ef9 --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/MSAProbs/Sequence.h @@ -0,0 +1,444 @@ +///////////////////////////////////////////////////////////////// +// Sequence.h +// +// Class for reading/manipulating single sequence character data. +///////////////////////////////////////////////////////////////// + +#ifndef SEQUENCE_H +#define SEQUENCE_H + +#include +#include +#include +#include +#include +#include "SafeVector.h" +#include "FileBuffer.h" + +///////////////////////////////////////////////////////////////// +// Sequence +// +// Class for storing sequence information. +///////////////////////////////////////////////////////////////// + +class Sequence { + + bool isValid; // a boolean indicating whether the sequence data is valid or not + string header; // string containing the comment line of the FASTA file + SafeVector *data; // pointer to character data + int length; // length of the sequence + int sequenceLabel; // integer sequence label, typically to indicate the ordering of sequences + // in a Multi-FASTA file + int inputLabel; // position of sequence in original input + + ///////////////////////////////////////////////////////////////// + // Sequence::Sequence() + // + // Default constructor. Does nothing. 
+ ///////////////////////////////////////////////////////////////// + + Sequence() : + isValid(false), header(""), data(NULL), length(0), sequenceLabel(0), inputLabel( + 0) { + } + +public: + + ///////////////////////////////////////////////////////////////// + // Sequence::Sequence() + // + // Constructor. Reads the sequence from a FileBuffer. + ///////////////////////////////////////////////////////////////// + + Sequence(FileBuffer &infile, bool stripGaps = false) : + isValid(false), header("~"), data(NULL), length(0), sequenceLabel( + 0), inputLabel(0) { + + // read until the first non-blank line + while (!infile.eof()) { + infile.GetLine(header); + if (header.length() != 0) + break; + } + + // check to make sure that it is a correct header line + if (header[0] == '>') { + + // if so, remove the leading ">" + header = header.substr(1); + + // remove any leading or trailing white space in the header comment + while (header.length() > 0 && isspace(header[0])) + header = header.substr(1); + while (header.length() > 0 && isspace(header[header.length() - 1])) + header = header.substr(0, header.length() - 1); + + // get ready to read the data[] array; note that data[0] is always '@' + char ch; + data = new SafeVector; + assert(data); + data->push_back('@'); + + // get a character from the file + while (infile.Get(ch)) { + + // if we've reached a new comment line, put the character back and stop + if (ch == '>') { + infile.UnGet(); + break; + } + + // skip whitespace + if (isspace(ch)) + continue; + + // substitute gap character + if (ch == '.') + ch = '-'; + if (stripGaps && ch == '-') + continue; + + // check for known characters + if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))) { + cerr << "ERROR: Unknown character encountered: " << ch + << endl; + exit(1); + } + + // everything's ok so far, so just store this character. + if (ch >= 'a' && ch <= 'z') { + ch = ch - 'a' + 'A'; + } //change to upper case. 
fixed by Liu Yongchao, May 21, 2010 + + data->push_back(ch); + ++length; + } + + // sequence must contain data in order to be valid + isValid = length > 0; + if (!isValid) { + delete data; + data = NULL; + } + } + } + + ///////////////////////////////////////////////////////////////// + // Sequence::Sequence() + // + // Constructor. Builds a sequence from existing data. Note + // that the data must use one-based indexing where data[0] should + // be set to '@'. + ///////////////////////////////////////////////////////////////// + + Sequence(SafeVector *data, string header, int length, + int sequenceLabel, int inputLabel) : + isValid(data != NULL), header(header), data(data), length(length), sequenceLabel( + sequenceLabel), inputLabel(inputLabel) { + assert(data); + assert((*data)[0] == '@'); + } + + ///////////////////////////////////////////////////////////////// + // Sequence::Sequence() + // + // Destructor. Release allocated memory. + ///////////////////////////////////////////////////////////////// + + ~Sequence() { + if (data) { + assert(isValid); + delete data; + data = NULL; + isValid = false; + } + } + + ///////////////////////////////////////////////////////////////// + // Sequence::GetHeader() + // + // Return the string comment associated with this sequence. + ///////////////////////////////////////////////////////////////// + + string GetHeader() const { + return header; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::GetName() + // + // Return the first word of the string comment associated with this sequence. + ///////////////////////////////////////////////////////////////// + + string GetName() const { + char name[1024]; + sscanf(header.c_str(), "%s", name); + return string(name); + } + + ///////////////////////////////////////////////////////////////// + // Sequence::GetDataPtr() + // + // Return the iterator to data associated with this sequence. 
+ ///////////////////////////////////////////////////////////////// + + SafeVector::iterator GetDataPtr() { + assert(isValid); + assert(data); + return data->begin(); + } + + ///////////////////////////////////////////////////////////////// + // Sequence::GetPosition() + // + // Return the character at position i. Recall that the character + // data is stored with one-based indexing. + ///////////////////////////////////////////////////////////////// + + char GetPosition(int i) const { + assert(isValid); + assert(data); + assert(i >= 1 && i <= length); + return (*data)[i]; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::SetLabel() + // + // Sets the sequence label to i. + ///////////////////////////////////////////////////////////////// + + void SetLabel(int i) { + assert(isValid); + sequenceLabel = i; + inputLabel = i; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::SetSortLabel() + // + // Sets the sequence sorting label to i. + ///////////////////////////////////////////////////////////////// + + void SetSortLabel(int i) { + assert(isValid); + sequenceLabel = i; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::GetLabel() + // + // Retrieves the input label. + ///////////////////////////////////////////////////////////////// + + int GetLabel() const { + assert(isValid); + return inputLabel; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::GetSortLabel() + // + // Retrieves the sorting label. + ///////////////////////////////////////////////////////////////// + + int GetSortLabel() const { + assert(isValid); + return sequenceLabel; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::Fail() + // + // Checks to see if the sequence successfully loaded. 
+ ///////////////////////////////////////////////////////////////// + + bool Fail() const { + return !isValid; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::Length() + // + // Returns the length of the sequence. + ///////////////////////////////////////////////////////////////// + + int GetLength() const { + assert(isValid); + assert(data); + return length; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::WriteMFA() + // + // Writes the sequence to outfile in MFA format. Uses numColumns + // columns per line. If useIndex is set to false, then the + // header is printed as normal, but if useIndex is true, then + // ">S###" is printed where ### represents the sequence label. + ///////////////////////////////////////////////////////////////// + + void WriteMFA(ostream &outfile, int numColumns, + bool useIndex = false) const { + assert(isValid); + assert(data); + assert(!outfile.fail()); + + // print out heading + if (useIndex) + outfile << ">S" << GetLabel() << endl; + else + outfile << ">" << header << endl; + + // print out character data + int ct = 1; + for (; ct <= length; ct++) { + outfile << (*data)[ct]; + if (ct % numColumns == 0) + outfile << endl; + } + if ((ct - 1) % numColumns != 0) + outfile << endl; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::Clone() + // + // Returns a new deep copy of the seqeuence. 
+ ///////////////////////////////////////////////////////////////// + + Sequence *Clone() const { + Sequence *ret = new Sequence(); + assert(ret); + + ret->isValid = isValid; + ret->header = header; + ret->data = new SafeVector; + assert(ret->data); + *(ret->data) = *data; + ret->length = length; + ret->sequenceLabel = sequenceLabel; + ret->inputLabel = inputLabel; + + return ret; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::GetRange() + // + // Returns a new sequence object consisting of a range of + // characters from the current seuquence. + ///////////////////////////////////////////////////////////////// + + Sequence *GetRange(int start, int end) const { + Sequence *ret = new Sequence(); + assert(ret); + + assert(start >= 1 && start <= length); + assert(end >= 1 && end <= length); + assert(start <= end); + + ret->isValid = isValid; + ret->header = header; + ret->data = new SafeVector; + assert(ret->data); + ret->data->push_back('@'); + for (int i = start; i <= end; i++) + ret->data->push_back((*data)[i]); + ret->length = end - start + 1; + ret->sequenceLabel = sequenceLabel; + ret->inputLabel = inputLabel; + + return ret; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::AddGaps() + // + // Given an SafeVector containing the skeleton for an + // alignment and the identity of the current character, this + // routine will create a new sequence with all necesssary gaps added. 
+ // For instance, + // alignment = "XXXBBYYYBBYYXX" + // id = 'X' + // will perform the transformation + // "ATGCAGTCA" --> "ATGCC---GT--CA" + // (XXXBBYYYBBYYXX) + ///////////////////////////////////////////////////////////////// + + Sequence *AddGaps(SafeVector *alignment, char id) { + Sequence *ret = new Sequence(); + assert(ret); + + ret->isValid = isValid; + ret->header = header; + ret->data = new SafeVector; + assert(ret->data); + ret->length = (int) alignment->size(); + ret->sequenceLabel = sequenceLabel; + ret->inputLabel = inputLabel; + ret->data->push_back('@'); + + SafeVector::iterator dataIter = data->begin() + 1; + for (SafeVector::iterator iter = alignment->begin(); + iter != alignment->end(); ++iter) { + if (*iter == 'B' || *iter == id) { + ret->data->push_back(*dataIter); + ++dataIter; + } else + ret->data->push_back('-'); + } + + return ret; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::GetString() + // + // Returns the sequence as a string with gaps removed. + ///////////////////////////////////////////////////////////////// + + string GetString() { + string s = ""; + for (int i = 1; i <= length; i++) { + if ((*data)[i] != '-') + s += (*data)[i]; + } + return s; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::GetMapping() + // + // Returns a SafeVector containing the indices of every + // character in the sequence. For instance, if the data is + // "ATGCC---GT--CA", the method returns {1,2,3,4,5,9,10,13,14}. 
+ ///////////////////////////////////////////////////////////////// + + SafeVector *GetMapping() const { + SafeVector *ret = new SafeVector(1, 0); + for (int i = 1; i <= length; i++) { + if ((*data)[i] != '-') + ret->push_back(i); + } + return ret; + } + + ///////////////////////////////////////////////////////////////// + // Sequence::Highlight() + // + // Changes all positions with score >= cutoff to upper case and + // all positions with score < cutoff to lower case. + ///////////////////////////////////////////////////////////////// + + void Highlight(const SafeVector &scores, const float cutoff) { + for (int i = 1; i <= length; i++) { + if (scores[i - 1] >= cutoff) + (*data)[i] = toupper((*data)[i]); + else + (*data)[i] = tolower((*data)[i]); + } + } +}; + +#endif diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/SparseMatrix.h b/binaries/src/MSAProbs-0.9.7/MSAProbs/SparseMatrix.h new file mode 100644 index 0000000..51b273d --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/MSAProbs/SparseMatrix.h @@ -0,0 +1,266 @@ +///////////////////////////////////////////////////////////////// +// SparseMatrix.h +// +// Sparse matrix computations +///////////////////////////////////////////////////////////////// + +#ifndef SPARSEMATRIX_H +#define SPARSEMATRIX_H + +#include + +using namespace std; + +const float POSTERIOR_CUTOFF = 0.01; // minimum posterior probability +// value that is maintained in the +// sparse matrix representation + +typedef pair PIF; // Sparse matrix entry type +// first --> column +// second --> value + +///////////////////////////////////////////////////////////////// +// SparseMatrix +// +// Class for sparse matrix computations +///////////////////////////////////////////////////////////////// + +class SparseMatrix { + + int seq1Length, seq2Length; // dimensions of matrix + VI rowSize; // rowSize[i] = # of cells in row i + SafeVector data; // data values + SafeVector::iterator> rowPtrs; // pointers to the beginning of each row + + 
///////////////////////////////////////////////////////////////// + // SparseMatrix::SparseMatrix() + // + // Private default constructor; used by ComputeTranspose(). + ///////////////////////////////////////////////////////////////// + + SparseMatrix() { + } + +public: + + ///////////////////////////////////////////////////////////////// + // SparseMatrix::SparseMatrix() + // + // Constructor. Builds a sparse matrix from a posterior matrix. + // Note that the expected format for the posterior matrix is as + // a (seq1Length+1) x (seq2Length+1) matrix where the 0th row + // and 0th column are ignored (they should contain all zeroes). + ///////////////////////////////////////////////////////////////// + + SparseMatrix(int seq1Length, int seq2Length, const VF &posterior) : + seq1Length(seq1Length), seq2Length(seq2Length) { + + int numCells = 0; + + assert(seq1Length > 0); + assert(seq2Length > 0); + + // calculate memory required; count the number of cells in the + // posterior matrix at or above the threshold (POSTERIOR_CUTOFF) + VF::const_iterator postPtr = posterior.begin(); + for (int i = 0; i <= seq1Length; i++) { + for (int j = 0; j <= seq2Length; j++) { + if (*(postPtr++) >= POSTERIOR_CUTOFF) { + assert(i != 0 && j != 0); + numCells++; + } + } + } + + // allocate memory (row 0 is unused: its size is -1 and its pointer is data.end()) + data.resize(numCells); + rowSize.resize(seq1Length + 1); + rowSize[0] = -1; + rowPtrs.resize(seq1Length + 1); + rowPtrs[0] = data.end(); + + // build sparse matrix + postPtr = posterior.begin() + seq2Length + 1; // note that we're skipping the first row here + SafeVector::iterator dataPtr = data.begin(); + for (int i = 1; i <= seq1Length; i++) { + postPtr++; // and skipping the first column of each row + rowPtrs[i] = dataPtr; + for (int j = 1; j <= seq2Length; j++) { + if (*postPtr >= POSTERIOR_CUTOFF) { + dataPtr->first = j; + dataPtr->second = *postPtr; + dataPtr++; + } + postPtr++; + } + rowSize[i] = dataPtr - rowPtrs[i]; + } + } + + ///////////////////////////////////////////////////////////////// + // SparseMatrix::GetRowPtr() + // + // Returns 
the pointer to a particular row in the sparse matrix. + ///////////////////////////////////////////////////////////////// + + SafeVector::iterator GetRowPtr(int row) const { + assert(row >= 1 && row <= seq1Length); + return rowPtrs[row]; + } + + ///////////////////////////////////////////////////////////////// + // SparseMatrix::GetValue() + // + // Returns value at a particular row, column (0 if no entry is stored there). + ///////////////////////////////////////////////////////////////// + + float GetValue(int row, int col) { + assert(row >= 1 && row <= seq1Length); + assert(col >= 1 && col <= seq2Length); + for (int i = 0; i < rowSize[row]; i++) { + if (rowPtrs[row][i].first == col) + return rowPtrs[row][i].second; + } + return 0; + } + + ///////////////////////////////////////////////////////////////// + // SparseMatrix::GetRowSize() + // + // Returns the number of entries in a particular row. + ///////////////////////////////////////////////////////////////// + + int GetRowSize(int row) const { + assert(row >= 1 && row <= seq1Length); + return rowSize[row]; + } + + ///////////////////////////////////////////////////////////////// + // SparseMatrix::GetSeq1Length() + // + // Returns the first dimension of the matrix. + ///////////////////////////////////////////////////////////////// + + int GetSeq1Length() const { + return seq1Length; + } + + ///////////////////////////////////////////////////////////////// + // SparseMatrix::GetSeq2Length() + // + // Returns the second dimension of the matrix. + ///////////////////////////////////////////////////////////////// + + int GetSeq2Length() const { + return seq2Length; + } + + ///////////////////////////////////////////////////////////////// + // SparseMatrix::GetNumCells() + // + // Returns the total number of cells stored in the sparse matrix. 
+ ///////////////////////////////////////////////////////////////// + + int GetNumCells() const { + return data.size(); + } + + ///////////////////////////////////////////////////////////////// + // SparseMatrix::Print() + // + // Prints out a sparse matrix, one row per line, as (column,value) pairs. + ///////////////////////////////////////////////////////////////// + + void Print(ostream &outfile) const { + outfile << "Sparse Matrix:" << endl; + for (int i = 1; i <= seq1Length; i++) { + outfile << " " << i << ":"; + for (int j = 0; j < rowSize[i]; j++) { + outfile << " (" << rowPtrs[i][j].first << "," + << rowPtrs[i][j].second << ")"; + } + outfile << endl; + } + } + + ///////////////////////////////////////////////////////////////// + // SparseMatrix::ComputeTranspose() + // + // Returns a new sparse matrix containing the transpose of the + // current matrix (allocated with new). + ///////////////////////////////////////////////////////////////// + + SparseMatrix *ComputeTranspose() const { + + // create a new sparse matrix + SparseMatrix *ret = new SparseMatrix(); + int numCells = data.size(); + + ret->seq1Length = seq2Length; + ret->seq2Length = seq1Length; + + // allocate memory + ret->data.resize(numCells); + ret->rowSize.resize(seq2Length + 1); + ret->rowSize[0] = -1; + ret->rowPtrs.resize(seq2Length + 1); + ret->rowPtrs[0] = ret->data.end(); + + // compute row sizes (each stored column index becomes a row of the transpose) + for (int i = 1; i <= seq2Length; i++) + ret->rowSize[i] = 0; + for (int i = 0; i < numCells; i++) + ret->rowSize[data[i].first]++; + + // compute row ptrs + for (int i = 1; i <= seq2Length; i++) { + ret->rowPtrs[i] = + (i == 1) ? 
+ ret->data.begin() : + ret->rowPtrs[i - 1] + ret->rowSize[i - 1]; + } + + // now fill in data; currPtrs[c] is the next free slot of transposed row c + SafeVector::iterator> currPtrs = ret->rowPtrs; + + for (int i = 1; i <= seq1Length; i++) { + SafeVector::iterator row = rowPtrs[i]; + for (int j = 0; j < rowSize[i]; j++) { + currPtrs[row[j].first]->first = i; + currPtrs[row[j].first]->second = row[j].second; + currPtrs[row[j].first]++; + } + } + + return ret; + } + + ///////////////////////////////////////////////////////////////// + // SparseMatrix::GetPosterior() + // + // Returns a newly allocated (seq1Length+1) x (seq2Length+1) posterior matrix built from the sparse matrix; cells without a stored entry are 0. + ///////////////////////////////////////////////////////////////// + + VF *GetPosterior() const { + + // create a new posterior matrix + VF *posteriorPtr = new VF((seq1Length + 1) * (seq2Length + 1)); + assert(posteriorPtr); + VF &posterior = *posteriorPtr; + + // build the posterior matrix + for (int i = 0; i < (seq1Length + 1) * (seq2Length + 1); i++) + posterior[i] = 0; + for (int i = 1; i <= seq1Length; i++) { + VF::iterator postPtr = posterior.begin() + i * (seq2Length + 1); + for (int j = 0; j < rowSize[i]; j++) { + postPtr[rowPtrs[i][j].first] = rowPtrs[i][j].second; + } + } + + return posteriorPtr; + } + +}; + +#endif diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/main.cpp b/binaries/src/MSAProbs-0.9.7/MSAProbs/main.cpp new file mode 100644 index 0000000..6fd1934 --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/MSAProbs/main.cpp @@ -0,0 +1,16 @@ +/*********************************************** + * # Copyright 2009-2010. Liu Yongchao + * # Contact: Liu Yongchao, School of Computer Engineering, + * # Nanyang Technological University. + * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com + * # + * # GPL version 3.0 applies. 
+ * # + * ************************************************/ +#include "MSA.h" + +int main(int argc, char* argv[]) { + MSA msa(argc, argv); + + return 0; +} diff --git a/binaries/src/MSAProbs-0.9.7/MSAProbs/msaprobs b/binaries/src/MSAProbs-0.9.7/MSAProbs/msaprobs new file mode 100755 index 0000000..e7e06a4 Binary files /dev/null and b/binaries/src/MSAProbs-0.9.7/MSAProbs/msaprobs differ diff --git a/binaries/src/MSAProbs-0.9.7/README b/binaries/src/MSAProbs-0.9.7/README new file mode 100644 index 0000000..5114c85 --- /dev/null +++ b/binaries/src/MSAProbs-0.9.7/README @@ -0,0 +1,48 @@ + +MSAPROBS is a new and practical protein multiple sequence alignment +algorithm based on pair hidden Markov models and partition function +posterior probabilities. Assessed on BAliBASE 3.0, PREFAB 4.0, +SABMARK 1.65, and OXBENCH, MSAProbs achieves the statistically +highest alignment accuracy, compared to ClustalW 2.0.10, MAFFT 6.717( +using L-INS-i with --maxiterate = 1000), MUSCLE 3.8.31, ProbCons 1.12, +and Probalign 1.3. (current version 0.9.3, March 17, 2010). + + +To use this software, please cite the following paper: +/****************************************************** +Yongchao Liu, Bertil Schmidt, Douglas L. Maskell: + +"MSAProbs: multiple sequence alignment based on +pair hidden Markov models and partition function posterior probabilities", + +Bioinformatics 2010, 26(16): 1958-1964 + +*******************************************************/ + +This software is developed by Liu Yongchao, School of Computer Engineering, +Nanyang Technological University. If you have any comments or problems, +please contact Liu Yongchao directly using either of the following email +addresses: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com. + +MSAPROBS is an open-source software, complying with General Public +Licence (GPL) version 3.0. MSAPROBS is distributed WITHOUT WARRANTY, express or +implied. The authors accept NO LEGAL LIABILITY OR RESPONSIBILITY for +loss due to reliance on the program. 
+ +(1) Linux and Windows are supported, with a Makefile and a Visual Studio 2005 +project co-existing in the source code tarball. + +Change to the MSAProbs sub-directory, where the Makefile for Linux can be found. + +(2) The default compile options enable OpenMP support to fully utilize the +compute capability of multi-core CPUs, as multi-core CPUs have become commonplace. + +Typical Usage: + (1) "./msaprobs -help" or "./msaprobs -?" + Get the command line options + + (2) "./msaprobs infile >outfile" or "./msaprobs infile -o outfile" + The alignments are printed out into file "outfile" in FASTA format + + (3) ./msaprobs infile -o outfile -num_threads 4 + Use four threads to accelerate the execution diff --git a/binaries/src/compilebin.sh b/binaries/src/compilebin.sh index 87ce823..0eee21d 100755 --- a/binaries/src/compilebin.sh +++ b/binaries/src/compilebin.sh @@ -93,3 +93,18 @@ make echo "DONE" cd .. +echo "Compiling GLProbs ..." +cd GLProbs-1.0 +make clean +make +echo "DONE" +cd .. + + +echo "Compiling MSAProbs-0.9.7 ..." +cd MSAProbs-0.9.7/MSAProbs +make clean +make +echo "DONE" +cd ../.. + diff --git a/binaries/src/setexecflag.sh b/binaries/src/setexecflag.sh index 7be938c..38ad0fa 100644 --- a/binaries/src/setexecflag.sh +++ b/binaries/src/setexecflag.sh @@ -34,3 +34,12 @@ chmod +x iupred/iupred echo "Setting executable flag for Jpred..." chmod +x jpred/i686/* jpred/x86_64/* + +echo "Setting executable flag for ViennaRNA..." +chmod +x ViennaRNA/Progs/RNAalifold + +echo "Setting executable flag for MSAProbs-0.9.7..." +chmod +x MSAProbs-0.9.7/MSAProbs/msaprobs + +echo "Setting executable flag for GLProbs-1.0..." +chmod +x GLProbs-1.0/glprobs