binaries/src/GLProbs-1.0/MSAfull.cpp

   1 /***********************************************
   2  * # Copyright 2009-2010. Liu Yongchao
   3  * # Contact: Liu Yongchao, School of Computer Engineering,
   4  * #                     Nanyang Technological University.
   5  * # Emails:     liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
   6  * #
   7  * # GPL version 3.0 applies.
   8  * #
   9  * ************************************************/
  10
  11 #include <string>
  12 #include <sstream>
  13 #include <iomanip>
  14 #include <iostream>
  15 #include <list>
  16 #include <set>
  17 #include <algorithm>
  18 #include <climits>
  19 #include <cstdio>
  20 #include <cstdlib>
  21 #include <cerrno>
  22 #include <iomanip>
  23 #include "MSA.h"
  24 #include "MSAClusterTree.h"
  25 #include "Defaults.h"
  26
  27 #ifdef _OPENMP
  28 #include <omp.h>
  29 #endif
  30
  31 string parametersInputFilename = "";
  32 string parametersOutputFilename = "no training";
  33 string annotationFilename = "";
  34
  35 bool enableVerbose = false;
  36 bool enableAnnotation = false;
  37 bool enableClustalWOutput = false;
  38 bool enableAlignOrder = false;
  39 int numConsistencyReps = 2;
  40 int numPreTrainingReps = 0;
  41 int numIterativeRefinementReps = 100;
  42
  43 float cutoff = 0;
  44
  45 VF initDistrib(NumMatrixTypes);
  46 VF gapOpen(2 * NumInsertStates);
  47 VF gapExtend(2 * NumInsertStates);
  48 VVF emitPairs(256, VF(256, 1e-10));
  49 VF emitSingle(256, 1e-5);
  50
  51 string alphabet = alphabetDefault;
  52
  53 const int MIN_PRETRAINING_REPS = 0;
  54 const int MAX_PRETRAINING_REPS = 20;
  55 const int MIN_CONSISTENCY_REPS = 0;
  56 const int MAX_CONSISTENCY_REPS = 5;
  57 const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
  58 const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
  59
  60 string posteriorProbsFilename = "";
  61 bool allscores = true;
  62 string infilename;
  63
  64 int flag_gui = 0;   //0: no gui related o/p
  65 //1: gui related o/p generated
  66 int flag_ppscore = 0; //0: no pp score sequence added to o/p fasta alignment
  67 //1: pp score seq added to o/p fasta alignment
  68
  69 ///////////////////////////////
  70 // global scoring matrix variables
  71 //////////////////////////////
  72 float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2;
  73 char *aminos, *bases, matrixtype[20] = "gonnet_160";
  74 int subst_index[26];
  75
  76 double sub_matrix[26][26];
  77 int firstread = 0;              //this makes sure that matrices are read only once
  78
  79 float TEMPERATURE = 5;
  80 int MATRIXTYPE = 160;
  81 int prot_nuc = 0;               //0=prot, 1=nucleotide
  82
  83 float GAPOPEN = 0;
  84 float GAPEXT = 0;
  85 int numThreads = 0;
  86
  87 //argument support
  88 typedef struct {
  89         char input[30];
  90         int matrix;
  91         int N;
  92         float T;
  93         float beta;
  94         char opt;                       //can be 'P' or 'M'
  95         float gapopen;
  96         float gapext;
  97 } argument_decl;
  98
  99 argument_decl argument;
 100
 101 extern inline void read_sustitution_matrix(char *fileName);
 102 extern void setmatrixtype(int le);
 103 extern inline int matrixtype_to_int();
 104 extern inline void read_dna_matrix();
 105 extern inline void read_vtml_la_matrix();
 106 extern void init_arguments();
 107
 108 MSA::MSA(int argc, char* argv[]) {
 109         //parse program parameters
 110         SafeVector<string> sequenceNames = ParseParams(argc, argv);
 111
 112         //initialize arguments for partition function
 113         init_arguments();
 114
 115         ReadParameters();
 116         //PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
 117
 118         //read the input sequences
 119         MultiSequence *sequences = new MultiSequence();
 120         assert(sequences);
 121         for (int i = 0; i < (int) sequenceNames.size(); i++) {
 122                 cerr << "Loading sequence file: " << sequenceNames[i] << endl;
 123                 sequences->LoadMFA(sequenceNames[i], true);
 124         }
 125         //allocate space for sequence weights
 126         this->seqsWeights = new int[sequences->GetNumSequences()];
 127         //initilaize parameters for OPENMP
 128 #ifdef _OPENMP
 129         if(numThreads <= 0) {
 130                 numThreads = omp_get_num_procs();
 131                 cerr << "Automatically detected " << numThreads << " CPU cores" << endl;
 132         }
 133         cerr <<"Enabling OpenMP (with "<<numThreads<<" threads)"<<endl;
 134
 135         //set OpenMP to use dynamic number of threads which is equal to the number of processor cores on the host
 136         omp_set_num_threads(numThreads);
 137 #endif
 138
 139         // now, we can perform the alignments and write them out
 140         MultiSequence *alignment = doAlign(sequences,
 141                         ProbabilisticModel(initDistrib, gapOpen, gapExtend, emitPairs,
 142                                         emitSingle), initDistrib, gapOpen, gapExtend, emitPairs,
 143                         emitSingle);
 144
 145         //write the alignment results to standard output
 146         if (enableClustalWOutput) {
 147                 alignment->WriteALN(*alignOutFile);
 148         } else {
 149                 alignment->WriteMFA(*alignOutFile);
 150         }
 151         //release resources
 152         delete[] this->seqsWeights;
 153         delete alignment;
 154         delete sequences;
 155 }
 156 MSA::~MSA() {
 157         /*close the output file*/
 158         if (alignOutFileName.length() > 0) {
 159                 ((std::ofstream*) alignOutFile)->close();
 160         }
 161 }
 162 /////////////////////////////////////////////////////////////////
 163 // PrintParameters()
 164 //
 165 // Prints MSAPROBS parameters to STDERR.  If a filename is
 166 // specified, then the parameters are also written to the file.
 167 /////////////////////////////////////////////////////////////////
 168
 169 void MSA::PrintParameters(const char *message, const VF &initDistrib,
 170                 const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs,
 171                 const VF &emitSingle, const char *filename) {
 172
 173         // print parameters to the screen
 174         cerr << message << endl << "    initDistrib[] = { ";
 175         for (int i = 0; i < NumMatrixTypes; i++)
 176                 cerr << setprecision(10) << initDistrib[i] << " ";
 177         cerr << "}" << endl << "        gapOpen[] = { ";
 178         for (int i = 0; i < NumInsertStates * 2; i++)
 179                 cerr << setprecision(10) << gapOpen[i] << " ";
 180         cerr << "}" << endl << "      gapExtend[] = { ";
 181         for (int i = 0; i < NumInsertStates * 2; i++)
 182                 cerr << setprecision(10) << gapExtend[i] << " ";
 183         cerr << "}" << endl << endl;
 184
 185         /*
 186          for (int i = 0; i < 5; i++){
 187          for (int j = 0; j <= i; j++){
 188          cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " ";
 189          }
 190          cerr << endl;
 191          }*/
 192
 193         // if a file name is specified
 194         if (filename) {
 195
 196                 // attempt to open the file for writing
 197                 FILE *file = fopen(filename, "w");
 198                 if (!file) {
 199                         cerr << "ERROR: Unable to write parameter file: " << filename
 200                                         << endl;
 201                         exit(1);
 202                 }
 203
 204                 // if successful, then write the parameters to the file
 205                 for (int i = 0; i < NumMatrixTypes; i++)
 206                         fprintf(file, "%.10f ", initDistrib[i]);
 207                 fprintf(file, "\n");
 208                 for (int i = 0; i < 2 * NumInsertStates; i++)
 209                         fprintf(file, "%.10f ", gapOpen[i]);
 210                 fprintf(file, "\n");
 211                 for (int i = 0; i < 2 * NumInsertStates; i++)
 212                         fprintf(file, "%.10f ", gapExtend[i]);
 213                 fprintf(file, "\n");
 214                 fprintf(file, "%s\n", alphabet.c_str());
 215                 for (int i = 0; i < (int) alphabet.size(); i++) {
 216                         for (int j = 0; j <= i; j++)
 217                                 fprintf(file, "%.10f ",
 218                                                 emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);
 219                         fprintf(file, "\n");
 220                 }
 221                 for (int i = 0; i < (int) alphabet.size(); i++)
 222                         fprintf(file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);
 223                 fprintf(file, "\n");
 224                 fclose(file);
 225         }
 226 }
 227
 228 /////////////////////////////////////////////////////////////////
 229 // doAlign()
 230 //
 231 // First computes all pairwise posterior probability matrices.
 232 // Then, computes new parameters if training, or a final
 233 // alignment, otherwise.
 234 /////////////////////////////////////////////////////////////////
 235 extern VF *ComputePostProbs(int a, int b, string seq1, string seq2);
 236 MultiSequence* MSA::doAlign(MultiSequence *sequences,
 237                 const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen,
 238                 VF &gapExtend, VVF &emitPairs, VF &emitSingle) {
 239         assert(sequences);
 240
 241         //get the number of sequences
 242         const int numSeqs = sequences->GetNumSequences();
 243
 244         //create distance matrix
 245         VVF probalign_distances(numSeqs, VF(numSeqs, 0));
 246         VVF distances(numSeqs, VF(numSeqs, 0));//msa
 247
 248         float gl_accuracy = 0;
 249         //creat sparseMatrices
 250         SafeVector<SafeVector<SparseMatrix *> > probalign_sparseMatrices(numSeqs,
 251                         SafeVector<SparseMatrix *>(numSeqs, NULL));
 252         SafeVector<SafeVector<SparseMatrix *> > sparseMatrices(numSeqs,
 253                         SafeVector<SparseMatrix *>(numSeqs, NULL)); // msa
 254
 255 #ifdef _OPENMP
 256         //calculate sequence pairs for openmp model
 257         int pairIdx = 0;
 258         numPairs = (numSeqs - 1) * numSeqs / 2;
 259         seqsPairs = new SeqsPair[numPairs];
 260         for(int a = 0; a < numSeqs; a++) {
 261                 for(int b = a + 1; b < numSeqs; b++) {
 262                         seqsPairs[pairIdx].seq1 = a;
 263                         seqsPairs[pairIdx].seq2 = b;
 264                         pairIdx++;
 265                 }
 266         }
 267 #endif
 268         // do all pairwise alignments for posterior probability matrices
 269 #ifdef _OPENMP
 270 #pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
 271         for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
 272                 int a= seqsPairs[pairIdx].seq1;
 273                 int b = seqsPairs[pairIdx].seq2;
 274                 if(enableVerbose) {
 275 #pragma omp critical
 276                         cerr <<"tid "<<omp_get_thread_num()<<" a "<<a<<" b "<<b<<endl;
 277                 }
 278 #else
 279         for (int a = 0; a < numSeqs - 1; a++) {
 280                 for (int b = a + 1; b < numSeqs; b++) {
 281 #endif
 282                         Sequence *seq1 = sequences->GetSequence(a);
 283                         Sequence *seq2 = sequences->GetSequence(b);
 284
 285                         // verbose output
 286                         if (enableVerbose) {
 287                                 cerr << "Computing posterior matrix: (" << a + 1 << ") "
 288                                                 << seq1->GetHeader() << " vs. " << "(" << b + 1 << ") "
 289                                                 << seq2->GetHeader() << " -- ";
 290                         }
 291
 292 //probcons
 293                         // compute forward and backward probabilities
 294                         VF *forward = model.ComputeForwardMatrix(seq1, seq2);
 295                         assert(forward);
 296                         VF *backward = model.ComputeBackwardMatrix(seq1, seq2);
 297                         assert(backward);
 298                         // compute posterior probability matrix from HMM
 299                         VF *probcons_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward);
 300                         assert(probcons_posterior);
 301                         delete forward;
 302                         delete backward;
 303
 304 //probalign
 305                         VF *probalign_posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString());
 306                         assert(probalign_posterior);
 307                         probalign_sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),seq2->GetLength(),                                                                  *probalign_posterior);
 308                         probalign_sparseMatrices[b][a] = NULL;
 309                         pair<SafeVector<char> *, float> probalign_alignment = model.ComputeAlignment(
 310                                         seq1->GetLength(), seq2->GetLength(), *probalign_posterior);
 311                         probalign_distances[a][b] =1.0f - probalign_alignment.second / min(seq1->GetLength(), seq2->GetLength());
 312                         delete probalign_alignment.first;
 313
 314 //local
 315                         forward = model.ComputeForwardMatrix(seq1, seq2,false);
 316                         assert(forward);
 317                         backward = model.ComputeBackwardMatrix(seq1, seq2,false);
 318                         assert(backward);
 319                         VF* local_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false);
 320                         delete forward;
 321                         delete backward;
 322
 323 //GL
 324                         //merge probalign + local + probcons
 325                         VF::iterator ptr1 = probcons_posterior->begin();
 326                         VF::iterator ptr2 = probalign_posterior->begin();
 327                         VF::iterator ptr3 = local_posterior->begin();
 328                         VF* posterior = new VF((seq1->GetLength()+1) * (seq2->GetLength()+1)); assert (posterior);  //msa
 329                         VF::iterator ptr = posterior->begin();
 330                         for (int i = 0; i <= seq1->GetLength(); i++) {
 331                                 for (int j = 0; j <= seq2->GetLength(); j++) {
 332                                         float v1 = *ptr1;
 333                                         float v2 = *ptr2;
 334                                         float v3 = *ptr3;
 335                                         *ptr = sqrt(v1*v1 + v2*v2 + v3*v3);
 336                                         ptr1++;
 337                                         ptr2++;
 338                                         ptr3++;
 339                                         ptr++;
 340                                 }
 341                         }
 342                         // perform the pairwise sequence alignment
 343                         pair<SafeVector<char> *, float> gl_alignment = model.ComputeAlignment(
 344                                         seq1->GetLength(), seq2->GetLength(), *posterior);
 345
 346                         //compute expected accuracy
 347                         distances[a][b] = distances[b][a] = 1.0f - gl_alignment.second
 348                                         / (3*min(seq1->GetLength(), seq2->GetLength()));
 349
 350                         // compute sparse representations
 351                         sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),
 352                         seq2->GetLength(), *posterior);
 353                         sparseMatrices[b][a] = NULL;
 354                         //
 355                         SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
 356                         SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
 357                         float N_correct_match = 0;
 358                         int i = 1;int j = 1;
 359                         for (SafeVector<char>::iterator iter = gl_alignment.first->begin();
 360                                 iter != gl_alignment.first->end(); ++iter){
 361                                 if (*iter == 'B'){
 362                                         unsigned char c1 = (unsigned char) iter1[i++];
 363                                         unsigned char c2 = (unsigned char) iter2[j++];
 364                                         if(c1==c2) N_correct_match += 1;
 365                                 }
 366                                 else if(*iter == 'X') i++;
 367                                 else if(*iter == 'Y') j++;
 368                         }
 369                         if(i!= seq1->GetLength()+1 || j!= seq2->GetLength() + 1 ) cerr << "GL"<< endl;
 370                         gl_accuracy += N_correct_match / min(seq1->GetLength(), seq2->GetLength());
 371                         //
 372                         delete probcons_posterior;
 373                         delete probalign_posterior;
 374                         delete local_posterior;
 375                         delete posterior;
 376
 377 #ifndef _OPENMP
 378                 }
 379 #endif
 380         }
 381
 382 //self-adaptive
 383         gl_accuracy /= numPairs;
 384         if(gl_accuracy > 0.4){
 385                 for (int a = 0; a < numSeqs - 1; a++)
 386                         for (int b = a + 1; b < numSeqs; b++) {
 387                                 distances[a][b] = distances[b][a] = probalign_distances[a][b];
 388                                 sparseMatrices[a][b] = probalign_sparseMatrices[a][b];
 389                                 sparseMatrices[b][a] = NULL;
 390                         }
 391         }
 392
 393         //create the guide tree
 394         this->tree = new MSAClusterTree(this, distances, numSeqs);
 395         this->tree->create();
 396
 397         // perform the consistency transformation the desired number of times
 398         float* fweights = new float[numSeqs];
 399         for (int r = 0; r < numSeqs; r++) {
 400                 fweights[r] = ((float) seqsWeights[r]) / INT_MULTIPLY;
 401                 fweights[r] *= 10;
 402         }
 403         for (int r = 0; r < numConsistencyReps; r++) {
 404                 SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices =
 405                                 DoRelaxation(fweights, sequences, sparseMatrices);
 406
 407                 // now replace the old posterior matrices
 408                 for (int i = 0; i < numSeqs; i++) {
 409                         for (int j = 0; j < numSeqs; j++) {
 410                                 delete sparseMatrices[i][j];
 411                                 sparseMatrices[i][j] = newSparseMatrices[i][j];
 412                         }
 413                 }
 414         }
 415         delete[] fweights;
 416 #ifdef _OPENMP
 417         delete [] seqsPairs;
 418 #endif
 419
 420         //compute the final multiple sequence alignment
 421         MultiSequence *finalAlignment = ComputeFinalAlignment(this->tree, sequences,
 422                         sparseMatrices, model);
 423
 424         // build annotation
 425         if (enableAnnotation) {
 426                 WriteAnnotation(finalAlignment, sparseMatrices);
 427         }
 428         //destroy the guide tree
 429         delete this->tree;
 430         this->tree = 0;
 431
 432         // delete sparse matrices
 433         for (int a = 0; a < numSeqs - 1; a++) {
 434                 for (int b = a + 1; b < numSeqs; b++) {
 435                         delete sparseMatrices[a][b];
 436                         delete sparseMatrices[b][a];
 437                 }
 438         }
 439
 440         return finalAlignment;
 441 }
 442
 443 /////////////////////////////////////////////////////////////////
 444 // GetInteger()
 445 //
 446 // Attempts to parse an integer from the character string given.
 447 // Returns true only if no parsing error occurs.
 448 /////////////////////////////////////////////////////////////////
 449
 450 bool GetInteger(char *data, int *val) {
 451         char *endPtr;
 452         long int retVal;
 453
 454         assert(val);
 455
 456         errno = 0;
 457         retVal = strtol(data, &endPtr, 0);
 458         if (retVal == 0 && (errno != 0 || data == endPtr))
 459                 return false;
 460         if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN))
 461                 return false;
 462         if (retVal < (long) INT_MIN || retVal > (long) INT_MAX)
 463                 return false;
 464         *val = (int) retVal;
 465         return true;
 466 }
 467
 468 /////////////////////////////////////////////////////////////////
 469 // GetFloat()
 470 //
 471 // Attempts to parse a float from the character string given.
 472 // Returns true only if no parsing error occurs.
 473 /////////////////////////////////////////////////////////////////
 474
 475 bool GetFloat(char *data, float *val) {
 476         char *endPtr;
 477         double retVal;
 478
 479         assert(val);
 480
 481         errno = 0;
 482         retVal = strtod(data, &endPtr);
 483         if (retVal == 0 && (errno != 0 || data == endPtr))
 484                 return false;
 485         if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0))
 486                 return false;
 487         *val = (float) retVal;
 488         return true;
 489 }
 490
 491 /////////////////////////////////////////////////////////////////
 492 // ReadParameters()
 493 //
 494 // Read initial distribution, transition, and emission
 495 // parameters from a file.
 496 /////////////////////////////////////////////////////////////////
 497
 498 void MSA::ReadParameters() {
 499
 500         ifstream data;
 501
 502         emitPairs = VVF(256, VF(256, 1e-10));
 503         emitSingle = VF(256, 1e-5);
 504
 505         // read initial state distribution and transition parameters
 506         if (parametersInputFilename == string("")) {
 507                 if (NumInsertStates == 1) {
 508                         for (int i = 0; i < NumMatrixTypes; i++)
 509                                 initDistrib[i] = initDistrib1Default[i];
 510                         for (int i = 0; i < 2 * NumInsertStates; i++)
 511                                 gapOpen[i] = gapOpen1Default[i];
 512                         for (int i = 0; i < 2 * NumInsertStates; i++)
 513                                 gapExtend[i] = gapExtend1Default[i];
 514                 } else if (NumInsertStates == 2) {
 515                         for (int i = 0; i < NumMatrixTypes; i++)
 516                                 initDistrib[i] = initDistrib2Default[i];
 517                         for (int i = 0; i < 2 * NumInsertStates; i++)
 518                                 gapOpen[i] = gapOpen2Default[i];
 519                         for (int i = 0; i < 2 * NumInsertStates; i++)
 520                                 gapExtend[i] = gapExtend2Default[i];
 521                 } else {
 522                         cerr
 523                                         << "ERROR: No default initial distribution/parameter settings exist"
 524                                         << endl << "       for " << NumInsertStates
 525                                         << " pairs of insert states.  Use --paramfile." << endl;
 526                         exit(1);
 527                 }
 528
 529                 alphabet = alphabetDefault;
 530
 531                 for (int i = 0; i < (int) alphabet.length(); i++) {
 532                         emitSingle[(unsigned char) tolower(alphabet[i])] =
 533                                         emitSingleDefault[i];
 534                         emitSingle[(unsigned char) toupper(alphabet[i])] =
 535                                         emitSingleDefault[i];
 536                         for (int j = 0; j <= i; j++) {
 537                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(
 538                                                 alphabet[j])] = emitPairsDefault[i][j];
 539                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(
 540                                                 alphabet[j])] = emitPairsDefault[i][j];
 541                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(
 542                                                 alphabet[j])] = emitPairsDefault[i][j];
 543                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(
 544                                                 alphabet[j])] = emitPairsDefault[i][j];
 545                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(
 546                                                 alphabet[i])] = emitPairsDefault[i][j];
 547                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(
 548                                                 alphabet[i])] = emitPairsDefault[i][j];
 549                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(
 550                                                 alphabet[i])] = emitPairsDefault[i][j];
 551                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(
 552                                                 alphabet[i])] = emitPairsDefault[i][j];
 553                         }
 554                 }
 555         } else {
 556                 data.open(parametersInputFilename.c_str());
 557                 if (data.fail()) {
 558                         cerr << "ERROR: Unable to read parameter file: "
 559                                         << parametersInputFilename << endl;
 560                         exit(1);
 561                 }
 562
 563                 string line[3];
 564                 for (int i = 0; i < 3; i++) {
 565                         if (!getline(data, line[i])) {
 566                                 cerr
 567                                                 << "ERROR: Unable to read transition parameters from parameter file: "
 568                                                 << parametersInputFilename << endl;
 569                                 exit(1);
 570                         }
 571                 }
 572                 istringstream data2;
 573                 data2.clear();
 574                 data2.str(line[0]);
 575                 for (int i = 0; i < NumMatrixTypes; i++)
 576                         data2 >> initDistrib[i];
 577                 data2.clear();
 578                 data2.str(line[1]);
 579                 for (int i = 0; i < 2 * NumInsertStates; i++)
 580                         data2 >> gapOpen[i];
 581                 data2.clear();
 582                 data2.str(line[2]);
 583                 for (int i = 0; i < 2 * NumInsertStates; i++)
 584                         data2 >> gapExtend[i];
 585
 586                 if (!getline(data, line[0])) {
 587                         cerr << "ERROR: Unable to read alphabet from scoring matrix file: "
 588                                         << parametersInputFilename << endl;
 589                         exit(1);
 590                 }
 591
 592                 // read alphabet as concatenation of all characters on alphabet line
 593                 alphabet = "";
 594                 string token;
 595                 data2.clear();
 596                 data2.str(line[0]);
 597                 while (data2 >> token)
 598                         alphabet += token;
 599
 600                 for (int i = 0; i < (int) alphabet.size(); i++) {
 601                         for (int j = 0; j <= i; j++) {
 602                                 float val;
 603                                 data >> val;
 604                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(
 605                                                 alphabet[j])] = val;
 606                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(
 607                                                 alphabet[j])] = val;
 608                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(
 609                                                 alphabet[j])] = val;
 610                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(
 611                                                 alphabet[j])] = val;
 612                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(
 613                                                 alphabet[i])] = val;
 614                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(
 615                                                 alphabet[i])] = val;
 616                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(
 617                                                 alphabet[i])] = val;
 618                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(
 619                                                 alphabet[i])] = val;
 620                         }
 621                 }
 622
 623                 for (int i = 0; i < (int) alphabet.size(); i++) {
 624                         float val;
 625                         data >> val;
 626                         emitSingle[(unsigned char) tolower(alphabet[i])] = val;
 627                         emitSingle[(unsigned char) toupper(alphabet[i])] = val;
 628                 }
 629                 data.close();
 630         }
 631 }
 632
 633 /////////////////////////////////////////////////////////////////
 634 // ParseParams()
 635 //
 636 // Parse all command-line options.
 637 /////////////////////////////////////////////////////////////////
 638 void MSA::printUsage() {
 639         cerr
 640                         << "************************************************************************"
 641                         << endl
 642                         << "\tMSAPROBS is a open-source protein multiple sequence alignment algorithm"
 643                         << endl
 644                         << "\tbased on pair hidden markov model and partition function postirior"
 645                         << endl
 646                         << "\tprobabilities. If any comments or problems, please contact"
 647                         << endl
 648                         << "\tLiu Yongchao(liuy0039@ntu.edu.sg or nkcslyc@hotmail.com)"
 649                         << endl
 650                         << "*************************************************************************"
 651                         << endl << "Usage:" << endl
 652                         << "       msaprobs [OPTION]... [infile]..." << endl << endl
 653                         << "Description:" << endl
 654                         << "       Align sequences in multi-FASTA format" << endl << endl
 655                         << "       -o, --outfile <string>" << endl
 656                         << "              specify the output file name (STDOUT by default)"
 657                         << endl << "       -num_threads <integer>" << endl
 658                         << "              specify the number of threads used, and otherwise detect automatically"
 659                         << endl << "       -clustalw" << endl
 660                         << "              use CLUSTALW output format instead of FASTA format"
 661                         << endl << endl << "       -c, --consistency REPS" << endl
 662                         << "              use " << MIN_CONSISTENCY_REPS << " <= REPS <= "
 663                         << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps
 664                         << ") passes of consistency transformation" << endl << endl
 665                         << "       -ir, --iterative-refinement REPS" << endl
 666                         << "              use " << MIN_ITERATIVE_REFINEMENT_REPS
 667                         << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS << " (default: "
 668                         << numIterativeRefinementReps << ") passes of iterative-refinement"
 669                         << endl << endl << "       -v, --verbose" << endl
 670                         << "              report progress while aligning (default: "
 671                         << (enableVerbose ? "on" : "off") << ")" << endl << endl
 672                         << "       -annot FILENAME" << endl
 673                         << "              write annotation for multiple alignment to FILENAME"
 674                         << endl << endl << "       -a, --alignment-order" << endl
 675                         << "              print sequences in alignment order rather than input order (default: "
 676                         << (enableAlignOrder ? "on" : "off") << ")" << endl
 677                         << "       -version " << endl
 678                         << "              print out version of MSAPROBS " << endl << endl;
 679 }
 680 SafeVector<string> MSA::ParseParams(int argc, char **argv) {
 681         if (argc < 2) {
 682                 printUsage();
 683                 exit(1);
 684         }
 685         SafeVector<string> sequenceNames;
 686         int tempInt;
 687         float tempFloat;
 688
 689         for (int i = 1; i < argc; i++) {
 690                 if (argv[i][0] == '-') {
 691                         //help
 692                         if (!strcmp(argv[i], "-help") || !strcmp(argv[i], "-?")) {
 693                                 printUsage();
 694                                 exit(1);
 695                                 //output file name
 696                         } else if (!strcmp(argv[i], "-o")
 697                                         || !strcmp(argv[i], "--outfile")) {
 698                                 if (i < argc - 1) {
 699                                         alignOutFileName = argv[++i];   //get the file name
 700                                 } else {
 701                                         cerr << "ERROR: String expected for option " << argv[i]
 702                                                         << endl;
 703                                         exit(1);
 704                                 }
 705                                 // parameter file
 706                         } else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){
 707                                 if (i < argc - 1)
 708                                         parametersInputFilename = string (argv[++i]);
 709                                 else {
 710                                         cerr << "ERROR: Filename expected for option " << argv[i] << endl;
 711                                         exit (1);
 712                                 }
 713                                 //number of threads used
 714                         } else if (!strcmp(argv[i], "-p")
 715                                         || !strcmp(argv[i], "-num_threads")) {
 716                                 if (i < argc - 1) {
 717                                         if (!GetInteger(argv[++i], &tempInt)) {
 718                                                 cerr << " ERROR: invalid integer following option "
 719                                                                 << argv[i - 1] << ": " << argv[i] << endl;
 720                                                 exit(1);
 721                                         } else {
 722                                                 if (tempInt < 0) {
 723                                                         tempInt = 0;
 724                                                 }
 725                                                 numThreads = tempInt;
 726                                         }
 727                                 } else {
 728                                         cerr << "ERROR: Integer expected for option " << argv[i]
 729                                                         << endl;
 730                                         exit(1);
 731                                 }
 732                                 // number of consistency transformations
 733                         } else if (!strcmp(argv[i], "-c")
 734                                         || !strcmp(argv[i], "--consistency")) {
 735                                 if (i < argc - 1) {
 736                                         if (!GetInteger(argv[++i], &tempInt)) {
 737                                                 cerr << "ERROR: Invalid integer following option "
 738                                                                 << argv[i - 1] << ": " << argv[i] << endl;
 739                                                 exit(1);
 740                                         } else {
 741                                                 if (tempInt < MIN_CONSISTENCY_REPS
 742                                                                 || tempInt > MAX_CONSISTENCY_REPS) {
 743                                                         cerr << "ERROR: For option " << argv[i - 1]
 744                                                                         << ", integer must be between "
 745                                                                         << MIN_CONSISTENCY_REPS << " and "
 746                                                                         << MAX_CONSISTENCY_REPS << "." << endl;
 747                                                         exit(1);
 748                                                 } else {
 749                                                         numConsistencyReps = tempInt;
 750                                                 }
 751                                         }
 752                                 } else {
 753                                         cerr << "ERROR: Integer expected for option " << argv[i]
 754                                                         << endl;
 755                                         exit(1);
 756                                 }
 757                         }
 758
 759                         // number of randomized partitioning iterative refinement passes
 760                         else if (!strcmp(argv[i], "-ir")
 761                                         || !strcmp(argv[i], "--iterative-refinement")) {
 762                                 if (i < argc - 1) {
 763                                         if (!GetInteger(argv[++i], &tempInt)) {
 764                                                 cerr << "ERROR: Invalid integer following option "
 765                                                                 << argv[i - 1] << ": " << argv[i] << endl;
 766                                                 exit(1);
 767                                         } else {
 768                                                 if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS
 769                                                                 || tempInt > MAX_ITERATIVE_REFINEMENT_REPS) {
 770                                                         cerr << "ERROR: For option " << argv[i - 1]
 771                                                                         << ", integer must be between "
 772                                                                         << MIN_ITERATIVE_REFINEMENT_REPS << " and "
 773                                                                         << MAX_ITERATIVE_REFINEMENT_REPS << "."
 774                                                                         << endl;
 775                                                         exit(1);
 776                                                 } else
 777                                                         numIterativeRefinementReps = tempInt;
 778                                         }
 779                                 } else {
 780                                         cerr << "ERROR: Integer expected for option " << argv[i]
 781                                                         << endl;
 782                                         exit(1);
 783                                 }
 784                         }
 785
 786                         // annotation files
 787                         else if (!strcmp(argv[i], "-annot")) {
 788                                 enableAnnotation = true;
 789                                 if (i < argc - 1) {
 790                                         annotationFilename = argv[++i];
 791                                 } else {
 792                                         cerr << "ERROR: FILENAME expected for option " << argv[i]
 793                                                         << endl;
 794                                         exit(1);
 795                                 }
 796                         }
 797
 798                         // clustalw output format
 799                         else if (!strcmp(argv[i], "-clustalw")) {
 800                                 enableClustalWOutput = true;
 801                         }
 802
 803                         // cutoff
 804                         else if (!strcmp(argv[i], "-co") || !strcmp(argv[i], "--cutoff")) {
 805                                 if (i < argc - 1) {
 806                                         if (!GetFloat(argv[++i], &tempFloat)) {
 807                                                 cerr
 808                                                                 << "ERROR: Invalid floating-point value following option "
 809                                                                 << argv[i - 1] << ": " << argv[i] << endl;
 810                                                 exit(1);
 811                                         } else {
 812                                                 if (tempFloat < 0 || tempFloat > 1) {
 813                                                         cerr << "ERROR: For option " << argv[i - 1]
 814                                                                         << ", floating-point value must be between 0 and 1."
 815                                                                         << endl;
 816                                                         exit(1);
 817                                                 } else
 818                                                         cutoff = tempFloat;
 819                                         }
 820                                 } else {
 821                                         cerr << "ERROR: Floating-point value expected for option "
 822                                                         << argv[i] << endl;
 823                                         exit(1);
 824                                 }
 825                         }
 826
 827                         // verbose reporting
 828                         else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--verbose")) {
 829                                 enableVerbose = true;
 830                         }
 831
 832                         // alignment order
 833                         else if (!strcmp(argv[i], "-a")
 834                                         || !strcmp(argv[i], "--alignment-order")) {
 835                                 enableAlignOrder = true;
 836                         }
 837
 838                         //print out version
 839                         else if (!strcmp(argv[i], "-version")) {
 840                                 cerr << "MSAPROBS version " << VERSION << endl;
 841                                 exit(1);
 842                         }
 843                         // bad arguments
 844                         else {
 845                                 cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
 846                                 exit(1);
 847                         }
 848                 } else {
 849                         sequenceNames.push_back(string(argv[i]));
 850                 }
 851         }
 852
 853         /*check the output file name*/
 854         cerr << "-------------------------------------" << endl;
 855         if (alignOutFileName.length() == 0) {
 856                 cerr << "The final alignments will be printed out to STDOUT" << endl;
 857                 alignOutFile = &std::cout;
 858         } else {
 859                 cerr << "Open the output file " << alignOutFileName << endl;
 860                 alignOutFile = new ofstream(alignOutFileName.c_str(),
 861                                 ios::binary | ios::out | ios::trunc);
 862         }
 863         cerr << "-------------------------------------" << endl;
 864         return sequenceNames;
 865 }
 866
 867 /////////////////////////////////////////////////////////////////
 868 // ProcessTree()
 869 //
 870 // Process the tree recursively.  Returns the aligned sequences
 871 // corresponding to a node or leaf of the tree.
 872 /////////////////////////////////////////////////////////////////
 873 MultiSequence* MSA::ProcessTree(TreeNode *tree, MultiSequence *sequences,
 874                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 875                 const ProbabilisticModel &model) {
 876
 877         MultiSequence *result;
 878
 879         // check if this is a node of the alignment tree
 880         //if (tree->GetSequenceLabel() == -1){
 881         if (tree->leaf == NODE) {
 882                 MultiSequence *alignLeft = ProcessTree(tree->left, sequences,
 883                                 sparseMatrices, model);
 884                 MultiSequence *alignRight = ProcessTree(tree->right, sequences,
 885                                 sparseMatrices, model);
 886
 887                 assert(alignLeft);
 888                 assert(alignRight);
 889
 890                 result = AlignAlignments(alignLeft, alignRight, sparseMatrices, model);
 891                 assert(result);
 892
 893                 delete alignLeft;
 894                 delete alignRight;
 895         }
 896
 897         // otherwise, this is a leaf of the alignment tree
 898         else {
 899                 result = new MultiSequence();
 900                 assert(result);
 901                 //result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
 902                 result->AddSequence(sequences->GetSequence(tree->idx)->Clone());
 903         }
 904
 905         return result;
 906 }
 907
 908 /////////////////////////////////////////////////////////////////
 909 // ComputeFinalAlignment()
 910 //
 911 // Compute the final alignment by calling ProcessTree(), then
 912 // performing iterative refinement as needed.
 913 /////////////////////////////////////////////////////////////////
 914
 915 MultiSequence* MSA::ComputeFinalAlignment(MSAGuideTree*tree,
 916                 MultiSequence *sequences,
 917                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 918                 const ProbabilisticModel &model) {
 919         MultiSequence *alignment = ProcessTree(tree->getRoot(), sequences,
 920                         sparseMatrices, model);
 921
 922         SafeVector<int> oldOrdering;
 923         if (enableAlignOrder) {
 924                 for (int i = 0; i < alignment->GetNumSequences(); i++)
 925                         oldOrdering.push_back(alignment->GetSequence(i)->GetSortLabel());
 926                 alignment->SaveOrdering();
 927                 enableAlignOrder = false;
 928         }
 929
 930         // tree-based refinement
 931         // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree);
 932         /*
 933          int numSeqs = alignment->GetNumSequences();
 934          //if(numSeqs < numIterativeRefinementReps){
 935          for(int iter = 0; iter < 5; iter ++){
 936                 for(int i = 0; i < numSeqs - 1; i++){
 937                         DoIterativeRefinementTreeNode(sparseMatrices, model, alignment, i);
 938                  }
 939          }
 940          //}*/
 941         //Refinement return false:no improvement
 942         for (int i = 0; i < numIterativeRefinementReps; i++) {
 943                 DoIterativeRefinement(sparseMatrices, model, alignment);
 944         }
 945         cerr << endl;
 946
 947         if (oldOrdering.size() > 0) {
 948                 for (int i = 0; i < (int) oldOrdering.size(); i++) {
 949                         alignment->GetSequence(i)->SetSortLabel(oldOrdering[i]);
 950                 }
 951         }
 952
 953         // return final alignment
 954         return alignment;
 955 }
 956
 957 /////////////////////////////////////////////////////////////////
 958 // AlignAlignments()
 959 //
 960 // Returns the alignment of two MultiSequence objects.
 961 /////////////////////////////////////////////////////////////////
 962
 963 MultiSequence* MSA::AlignAlignments(MultiSequence *align1,
 964                 MultiSequence *align2,
 965                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 966                 const ProbabilisticModel &model) {
 967
 968         // print some info about the alignment
 969         if (enableVerbose) {
 970                 for (int i = 0; i < align1->GetNumSequences(); i++)
 971                         cerr << ((i == 0) ? "[" : ",")
 972                                         << align1->GetSequence(i)->GetLabel();
 973                 cerr << "] vs. ";
 974                 for (int i = 0; i < align2->GetNumSequences(); i++)
 975                         cerr << ((i == 0) ? "[" : ",")
 976                                         << align2->GetSequence(i)->GetLabel();
 977                 cerr << "]: ";
 978         }
 979 #if 0
 980         VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
 981 #else
 982         VF *posterior = model.BuildPosterior(getSeqsWeights(), align1, align2,
 983                         sparseMatrices, cutoff);
 984 #endif
 985         // compute an "accuracy" measure for the alignment before refinement
 986
 987         pair<SafeVector<char> *, float> alignment;
 988         //perform alignment
 989         alignment = model.ComputeAlignment(align1->GetSequence(0)->GetLength(),
 990                         align2->GetSequence(0)->GetLength(), *posterior);
 991
 992         delete posterior;
 993
 994         if (enableVerbose) {
 995
 996                 // compute total length of sequences
 997                 int totLength = 0;
 998                 for (int i = 0; i < align1->GetNumSequences(); i++)
 999                         for (int j = 0; j < align2->GetNumSequences(); j++)
1000                                 totLength += min(align1->GetSequence(i)->GetLength(),
1001                                                 align2->GetSequence(j)->GetLength());
1002
1003                 // give an "accuracy" measure for the alignment
1004                 cerr << alignment.second / totLength << endl;
1005         }
1006
1007         // now build final alignment
1008         MultiSequence *result = new MultiSequence();
1009         for (int i = 0; i < align1->GetNumSequences(); i++)
1010                 result->AddSequence(
1011                                 align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
1012         for (int i = 0; i < align2->GetNumSequences(); i++)
1013                 result->AddSequence(
1014                                 align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
1015         if (!enableAlignOrder)
1016                 result->SortByLabel();
1017
1018         // free temporary alignment
1019         delete alignment.first;
1020
1021         return result;
1022 }
1023
1024 /////////////////////////////////////////////////////////////////
1025 // DoRelaxation()
1026 //
1027 // Performs one round of the weighted probabilistic consistency transformation.
1028 //                     1
1029 /////////////////////////////////////////////////////////////////
1030
1031 SafeVector<SafeVector<SparseMatrix *> > MSA::DoRelaxation(float* seqsWeights,
1032                 MultiSequence *sequences,
1033                 SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1034         const int numSeqs = sequences->GetNumSequences();
1035
1036         SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices(numSeqs,
1037                         SafeVector<SparseMatrix *>(numSeqs, NULL));
1038
1039         // for every pair of sequences
1040 #ifdef _OPENMP
1041         int pairIdx;
1042 #pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
1043         for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
1044                 int i = seqsPairs[pairIdx].seq1;
1045                 int j = seqsPairs[pairIdx].seq2;
1046                 float wi = seqsWeights[i];
1047                 float wj = seqsWeights[j];
1048 #else
1049         for (int i = 0; i < numSeqs; i++) {
1050                 float wi = seqsWeights[i];
1051                 for (int j = i + 1; j < numSeqs; j++) {
1052                         float wj = seqsWeights[j];
1053 #endif
1054                         Sequence *seq1 = sequences->GetSequence(i);
1055                         Sequence *seq2 = sequences->GetSequence(j);
1056
1057                         if (enableVerbose) {
1058 #ifdef _OPENMP
1059 #pragma omp critical
1060 #endif
1061                                 cerr << "Relaxing (" << i + 1 << ") " << seq1->GetHeader()
1062                                                 << " vs. " << "(" << j + 1 << ") " << seq2->GetHeader()
1063                                                 << ": ";
1064                         }
1065                         // get the original posterior matrix
1066                         VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior();
1067                         assert(posteriorPtr);
1068                         VF &posterior = *posteriorPtr;
1069
1070                         const int seq1Length = seq1->GetLength();
1071                         const int seq2Length = seq2->GetLength();
1072
1073                         // contribution from the summation where z = x and z = y
1074                         float w = wi * wi * wj + wi * wj * wj;
1075                         float sumW = w;
1076                         for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
1077                                 //posterior[k] = w*posterior[k];
1078                                 posterior[k] += posterior[k];
1079                         }
1080
1081                         if (enableVerbose)
1082                                 cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
1083
1084                         // contribution from all other sequences
1085                         for (int k = 0; k < numSeqs; k++) {
1086                                 if (k != i && k != j) {
1087                                         float wk = seqsWeights[k];
1088                                         float w = wi * wj * wk;
1089                                         sumW += w;
1090                                         if (k < i)
1091                                                 Relax1(w, sparseMatrices[k][i], sparseMatrices[k][j],
1092                                                                 posterior);
1093                                         else if (k > i && k < j)
1094                                                 Relax(w, sparseMatrices[i][k], sparseMatrices[k][j],
1095                                                                 posterior);
1096                                         else {
1097                                                 SparseMatrix *temp =
1098                                                                 sparseMatrices[j][k]->ComputeTranspose();
1099                                                 Relax(w, sparseMatrices[i][k], temp, posterior);
1100                                                 delete temp;
1101                                         }
1102                                 }
1103                         }
1104                         //cerr<<"sumW "<<sumW<<endl;
1105                         for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
1106                                 //posterior[k] /= sumW;
1107                                 posterior[k] /= numSeqs;
1108                         }
1109                         // mask out positions not originally in the posterior matrix
1110                         SparseMatrix *matXY = sparseMatrices[i][j];
1111                         for (int y = 0; y <= seq2Length; y++)
1112                                 posterior[y] = 0;
1113                         for (int x = 1; x <= seq1Length; x++) {
1114                                 SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
1115                                 SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
1116                                 VF::iterator base = posterior.begin() + x * (seq2Length + 1);
1117                                 int curr = 0;
1118                                 while (XYptr != XYend) {
1119
1120                                         // zero out all cells until the first filled column
1121                                         while (curr < XYptr->first) {
1122                                                 base[curr] = 0;
1123                                                 curr++;
1124                                         }
1125
1126                                         // now, skip over this column
1127                                         curr++;
1128                                         ++XYptr;
1129                                 }
1130
1131                                 // zero out cells after last column
1132                                 while (curr <= seq2Length) {
1133                                         base[curr] = 0;
1134                                         curr++;
1135                                 }
1136                         }
1137
1138                         // save the new posterior matrix
1139                         newSparseMatrices[i][j] = new SparseMatrix(seq1->GetLength(),
1140                                         seq2->GetLength(), posterior);
1141                         newSparseMatrices[j][i] = NULL;
1142
1143                         if (enableVerbose)
1144                                 cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
1145
1146                         delete posteriorPtr;
1147
1148                         if (enableVerbose)
1149                                 cerr << "done." << endl;
1150 #ifndef _OPENMP
1151                 }
1152 #endif
1153         }
1154
1155         return newSparseMatrices;
1156 }
1157
1158 /////////////////////////////////////////////////////////////////
1159 // Relax()
1160 //
1161 // Computes the consistency transformation for a single sequence
1162 // z, and adds the transformed matrix to "posterior".
1163 /////////////////////////////////////////////////////////////////
1164
1165 void MSA::Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY,
1166                 VF &posterior) {
1167
1168         assert(matXZ);
1169         assert(matZY);
1170
1171         int lengthX = matXZ->GetSeq1Length();
1172         int lengthY = matZY->GetSeq2Length();
1173         assert(matXZ->GetSeq2Length() == matZY->GetSeq1Length());
1174
1175         // for every x[i]
1176         for (int i = 1; i <= lengthX; i++) {
1177                 SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
1178                 SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
1179
1180                 VF::iterator base = posterior.begin() + i * (lengthY + 1);
1181
1182                 // iterate through all x[i]-z[k]
1183                 while (XZptr != XZend) {
1184                         SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
1185                         SafeVector<PIF>::iterator ZYend = ZYptr
1186                                         + matZY->GetRowSize(XZptr->first);
1187                         const float XZval = XZptr->second;
1188
1189                         // iterate through all z[k]-y[j]
1190                         while (ZYptr != ZYend) {
1191                                 //base[ZYptr->first] += weight * XZval * ZYptr->second;
1192                                 base[ZYptr->first] += XZval * ZYptr->second;
1193                                 ZYptr++;
1194                         }
1195                         XZptr++;
1196                 }
1197         }
1198 }
1199
1200 /////////////////////////////////////////////////////////////////
1201 // Relax1()
1202 //
1203 // Computes the consistency transformation for a single sequence
1204 // z, and adds the transformed matrix to "posterior".
1205 /////////////////////////////////////////////////////////////////
1206
1207 void MSA::Relax1(float weight, SparseMatrix *matZX, SparseMatrix *matZY,
1208                 VF &posterior) {
1209
1210         assert(matZX);
1211         assert(matZY);
1212
1213         int lengthZ = matZX->GetSeq1Length();
1214         int lengthY = matZY->GetSeq2Length();
1215
1216         // for every z[k]
1217         for (int k = 1; k <= lengthZ; k++) {
1218                 SafeVector<PIF>::iterator ZXptr = matZX->GetRowPtr(k);
1219                 SafeVector<PIF>::iterator ZXend = ZXptr + matZX->GetRowSize(k);
1220
1221                 // iterate through all z[k]-x[i]
1222                 while (ZXptr != ZXend) {
1223                         SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(k);
1224                         SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(k);
1225                         const float ZXval = ZXptr->second;
1226                         VF::iterator base = posterior.begin()
1227                                         + ZXptr->first * (lengthY + 1);
1228
1229                         // iterate through all z[k]-y[j]
1230                         while (ZYptr != ZYend) {
1231                                 //base[ZYptr->first] += weight * ZXval * ZYptr->second;
1232                                 base[ZYptr->first] += ZXval * ZYptr->second;
1233                                 ZYptr++;
1234                         }
1235                         ZXptr++;
1236                 }
1237         }
1238 }
1239 /////////////////////////////////////////////////////////////////
1240 // DoIterativeRefinement()
1241 //
1242 // Performs a single round of randomized partionining iterative
1243 // refinement.
1244 /////////////////////////////////////////////////////////////////
1245
1246 void MSA::DoIterativeRefinement(
1247                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1248                 const ProbabilisticModel &model, MultiSequence* &alignment) {
1249         set<int> groupOne, groupTwo;
1250         int numSeqs = alignment->GetNumSequences();
1251
1252         // create two separate groups
1253         for (int i = 0; i < numSeqs; i++) {
1254                 int index = rand();
1255                 if (index % 2) {
1256                         groupOne.insert(i);
1257                 } else {
1258                         groupTwo.insert(i);
1259                 }
1260         }
1261         if (groupOne.empty() || groupTwo.empty()) return;
1262
1263         // project into the two groups
1264         MultiSequence *groupOneSeqs = alignment->Project(groupOne);
1265         assert(groupOneSeqs);
1266         MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
1267         assert(groupTwoSeqs);
1268 /*
1269 //start add by Yongtao
1270 #if 1
1271         VF *posterior = model.BuildPosterior (groupOneSeqs, groupTwoSeqs, sparseMatrices, cutoff);
1272 #else
1273         VF *posterior = model.BuildPosterior(getSeqsWeights(), groupOneSeqs, groupTwoSeqs,
1274                         sparseMatrices, cutoff);
1275 #endif
1276
1277         // compute an "accuracy" measure for the alignment before refinement
1278         SafeVector<SafeVector<char>::iterator> oldOnePtrs(groupOne.size());
1279         SafeVector<SafeVector<char>::iterator> oldTwoPtrs(groupTwo.size());
1280         int i=0;
1281         for (set<int>::const_iterator iter = groupOne.begin();
1282                         iter != groupOne.end(); ++iter) {
1283                 oldOnePtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr();
1284         }
1285         i=0;
1286         for (set<int>::const_iterator iter = groupTwo.begin();
1287                         iter != groupTwo.end(); ++iter) {
1288                 oldTwoPtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr();
1289         }
1290
1291         VF &posteriorArr = *posterior;
1292         int oldLength = alignment->GetSequence(0)->GetLength();
1293         int groupOneindex=0; int groupTwoindex=0;
1294         float accuracy_before = 0;
1295         for (int i = 1; i <= oldLength; i++) {
1296                 // check to see if there is a gap in every sequence of the set
1297                 bool foundOne = false;
1298                 for (int j = 0; !foundOne && j < (int) groupOne.size(); j++)
1299                         foundOne = (oldOnePtrs[j][i] != '-');
1300                 // if not, then this column counts towards the sequence length
1301                 if (foundOne) groupOneindex ++;
1302                 bool foundTwo = false;
1303                 for (int j = 0; !foundTwo && j < (int) groupTwo.size(); j++)
1304                         foundTwo = (oldTwoPtrs[j][i] != '-');
1305                 // if not, then this column counts towards the sequence length
1306                 if (foundTwo) groupTwoindex ++;
1307                 if(foundOne && foundTwo) accuracy_before +=
1308                                 posteriorArr[groupOneindex * (groupTwoSeqs->GetSequence(0)->GetLength() + 1) + groupTwoindex];
1309         }
1310
1311         pair<SafeVector<char> *, float> refinealignment;
1312         //perform alignment
1313         refinealignment = model.ComputeAlignment(groupOneSeqs->GetSequence(0)->GetLength(),
1314                         groupTwoSeqs->GetSequence(0)->GetLength(), *posterior);
1315         delete posterior;
1316         // now build final alignment
1317         MultiSequence *result = new MultiSequence();
1318         //compare accuracy measure before and after refinement
1319         //if (refinealignment.second > accuracy_before) {
1320                 //cerr<<"Before:" << accuracy_before<<" after: "<< refinealignment.second<< endl;
1321                 for (int i = 0; i < groupOneSeqs->GetNumSequences(); i++)
1322                         result->AddSequence(
1323                                 groupOneSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'X'));
1324                 for (int i = 0; i < groupTwoSeqs->GetNumSequences(); i++)
1325                         result->AddSequence(
1326                                 groupTwoSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'Y'));
1327                 // free temporary alignment
1328                 delete refinealignment.first;
1329                 delete alignment;
1330                 alignment = result;
1331
1332         }
1333         else{
1334                 if(numIterativeRefinementReps < 8*numSeqs) numIterativeRefinementReps++;
1335                 delete groupOneSeqs;
1336                 delete groupTwoSeqs;
1337                 return false;
1338         }
1339   */
1340 //end add by yongtao
1341
1342         //delete alignment;
1343         // realign
1344         alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices, model); //original
1345                 delete groupOneSeqs;
1346                 delete groupTwoSeqs;
1347
1348 }
1349
1350 void MSA::DoIterativeRefinementTreeNode(
1351                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1352                 const ProbabilisticModel &model, MultiSequence* &alignment,
1353                 int nodeIndex) {
1354         set<int> groupOne, groupTwo;
1355         int numSeqs = alignment->GetNumSequences();
1356
1357         vector<bool> inGroup1;
1358         inGroup1.resize(numSeqs);
1359         for (int i = 0; i < numSeqs; i++) {
1360                 inGroup1[i] = false;
1361         }
1362
1363         AlignmentOrder* orders = this->tree->getAlignOrders();
1364         AlignmentOrder* order = &orders[nodeIndex];
1365         for (int i = 0; i < order->leftNum; i++) {
1366                 int si = order->leftLeafs[i];
1367                 inGroup1[si] = true;
1368         }
1369         for (int i = 0; i < order->rightNum; i++) {
1370                 int si = order->rightLeafs[i];
1371                 inGroup1[si] = true;
1372         }
1373         // create two separate groups
1374         for (int i = 0; i < numSeqs; i++) {
1375                 if (inGroup1[i]) {
1376                         groupOne.insert(i);
1377                 } else {
1378                         groupTwo.insert(i);
1379                 }
1380         }
1381         if (groupOne.empty() || groupTwo.empty())
1382                 return;
1383
1384         // project into the two groups
1385         MultiSequence *groupOneSeqs = alignment->Project(groupOne);
1386         assert(groupOneSeqs);
1387         MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
1388         assert(groupTwoSeqs);
1389         delete alignment;
1390
1391         // realign
1392         alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices,
1393                         model);
1394
1395         delete groupOneSeqs;
1396         delete groupTwoSeqs;
1397 }
1398
1399 /////////////////////////////////////////////////////////////////
1400 // WriteAnnotation()
1401 //
1402 // Computes annotation for multiple alignment and write values
1403 // to a file.
1404 /////////////////////////////////////////////////////////////////
1405
1406 void MSA::WriteAnnotation(MultiSequence *alignment,
1407                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1408         ofstream outfile(annotationFilename.c_str());
1409
1410         if (outfile.fail()) {
1411                 cerr << "ERROR: Unable to write annotation file." << endl;
1412                 exit(1);
1413         }
1414
1415         const int alignLength = alignment->GetSequence(0)->GetLength();
1416         const int numSeqs = alignment->GetNumSequences();
1417
1418         SafeVector<int> position(numSeqs, 0);
1419         SafeVector<SafeVector<char>::iterator> seqs(numSeqs);
1420         for (int i = 0; i < numSeqs; i++)
1421                 seqs[i] = alignment->GetSequence(i)->GetDataPtr();
1422         SafeVector<pair<int, int> > active;
1423         active.reserve(numSeqs);
1424
1425         SafeVector<int> lab;
1426         for (int i = 0; i < numSeqs; i++)
1427                 lab.push_back(alignment->GetSequence(i)->GetSortLabel());
1428
1429         // for every column
1430         for (int i = 1; i <= alignLength; i++) {
1431
1432                 // find all aligned residues in this particular column
1433                 active.clear();
1434                 for (int j = 0; j < numSeqs; j++) {
1435                         if (seqs[j][i] != '-') {
1436                                 active.push_back(make_pair(lab[j], ++position[j]));
1437                         }
1438                 }
1439
1440                 sort(active.begin(), active.end());
1441                 outfile << setw(4) << ComputeScore(active, sparseMatrices) << endl;
1442         }
1443
1444         outfile.close();
1445 }
1446
1447 /////////////////////////////////////////////////////////////////
1448 // ComputeScore()
1449 //
1450 // Computes the annotation score for a particular column.
1451 /////////////////////////////////////////////////////////////////
1452
1453 int MSA::ComputeScore(const SafeVector<pair<int, int> > &active,
1454                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1455
1456         if (active.size() <= 1)
1457                 return 0;
1458
1459         // ALTERNATIVE #1: Compute the average alignment score.
1460
1461         float val = 0;
1462         for (int i = 0; i < (int) active.size(); i++) {
1463                 for (int j = i + 1; j < (int) active.size(); j++) {
1464                         val += sparseMatrices[active[i].first][active[j].first]->GetValue(
1465                                         active[i].second, active[j].second);
1466                 }
1467         }
1468
1469         return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));
1470
1471 }