binaries/src/MSAProbs-0.9.7/MSAProbs/MSA.cpp

   1 /***********************************************
   2  * # Copyright 2009-2010. Liu Yongchao
   3  * # Contact: Liu Yongchao, School of Computer Engineering,
   4  * #                     Nanyang Technological University.
   5  * # Emails:     liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
   6  * #
   7  * # GPL version 3.0 applies.
   8  * #
   9  * ************************************************/
  10
  11 #include <string>
  12 #include <sstream>
  13 #include <iomanip>
  14 #include <iostream>
  15 #include <list>
  16 #include <set>
  17 #include <algorithm>
  18 #include <climits>
  19 #include <cstdio>
  20 #include <cstdlib>
  21 #include <cerrno>
  22 #include <iomanip>
  23 #include "MSA.h"
  24 #include "MSAClusterTree.h"
  25 #include "Defaults.h"
  26
  27 #ifdef _OPENMP
  28 #include <omp.h>
  29 #endif
  30
  31 string parametersInputFilename = "";
  32 string parametersOutputFilename = "no training";
  33 string annotationFilename = "";
  34
  35 bool enableVerbose = false;
  36 bool enableAnnotation = false;
  37 bool enableClustalWOutput = false;
  38 bool enableAlignOrder = false;
  39 int numConsistencyReps = 2;
  40 int numPreTrainingReps = 0;
  41 int numIterativeRefinementReps = 10;
  42
  43 float cutoff = 0;
  44
  45 VF initDistrib(NumMatrixTypes);
  46 VF gapOpen(2 * NumInsertStates);
  47 VF gapExtend(2 * NumInsertStates);
  48 VVF emitPairs(256, VF(256, 1e-10));
  49 VF emitSingle(256, 1e-5);
  50
  51 string alphabet = alphabetDefault;
  52
  53 const int MIN_PRETRAINING_REPS = 0;
  54 const int MAX_PRETRAINING_REPS = 20;
  55 const int MIN_CONSISTENCY_REPS = 0;
  56 const int MAX_CONSISTENCY_REPS = 5;
  57 const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
  58 const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
  59
  60 string posteriorProbsFilename = "";
  61 bool allscores = true;
  62 string infilename;
  63
  64 int flag_gui = 0;   //0: no gui related o/p
  65 //1: gui related o/p generated
  66 int flag_ppscore = 0; //0: no pp score sequence added to o/p fasta alignment
  67 //1: pp score seq added to o/p fasta alignment
  68
  69 ///////////////////////////////
  70 // global scoring matrix variables
  71 //////////////////////////////
  72 float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2;
  73 char *aminos, *bases, matrixtype[20] = "gonnet_160";
  74 int subst_index[26];
  75
  76 double sub_matrix[26][26];
  77 int firstread = 0;              //this makes sure that matrices are read only once
  78
  79 float TEMPERATURE = 5;
  80 int MATRIXTYPE = 160;
  81 int prot_nuc = 0;               //0=prot, 1=nucleotide
  82
  83 float GAPOPEN = 0;
  84 float GAPEXT = 0;
  85 int numThreads = 0;
  86
  87 //argument support
  88 typedef struct {
  89         char input[30];
  90         int matrix;
  91         int N;
  92         float T;
  93         float beta;
  94         char opt;                       //can be 'P' or 'M'
  95         float gapopen;
  96         float gapext;
  97 } argument_decl;
  98
  99 argument_decl argument;
 100
 101 extern inline void read_sustitution_matrix(char *fileName);
 102 extern void setmatrixtype(int le);
 103 extern inline int matrixtype_to_int();
 104 extern inline void read_dna_matrix();
 105 extern inline void read_vtml_la_matrix();
 106 extern void init_arguments();
 107
 108 MSA::MSA(int argc, char* argv[]) {
 109         //parse program parameters
 110         SafeVector<string> sequenceNames = ParseParams(argc, argv);
 111
 112         //initialize arguments for partition function
 113         init_arguments();
 114
 115         ReadParameters();
 116         //PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
 117
 118         //read the input sequences
 119         MultiSequence *sequences = new MultiSequence();
 120         assert(sequences);
 121         for (int i = 0; i < (int) sequenceNames.size(); i++) {
 122                 cerr << "Loading sequence file: " << sequenceNames[i] << endl;
 123                 sequences->LoadMFA(sequenceNames[i], true);
 124         }
 125         //allocate space for sequence weights
 126         this->seqsWeights = new int[sequences->GetNumSequences()];
 127         //initilaize parameters for OPENMP
 128 #ifdef _OPENMP
 129         if(numThreads <= 0) {
 130                 numThreads = omp_get_num_procs();
 131                 cerr << "Automatically detected " << numThreads << " CPU cores" << endl;
 132         }
 133         cerr <<"Enabling OpenMP (with "<<numThreads<<" threads)"<<endl;
 134
 135         //set OpenMP to use dynamic number of threads which is equal to the number of processor cores on the host
 136         omp_set_num_threads(numThreads);
 137 #endif
 138
 139         // now, we can perform the alignments and write them out
 140         MultiSequence *alignment = doAlign(sequences,
 141                         ProbabilisticModel(initDistrib, gapOpen, gapExtend, emitPairs,
 142                                         emitSingle), initDistrib, gapOpen, gapExtend, emitPairs,
 143                         emitSingle);
 144
 145         //write the alignment results to standard output
 146         if (enableClustalWOutput) {
 147                 alignment->WriteALN(*alignOutFile);
 148         } else {
 149                 alignment->WriteMFA(*alignOutFile);
 150         }
 151         //release resources
 152         delete[] this->seqsWeights;
 153         delete alignment;
 154         delete sequences;
 155 }
 156 MSA::~MSA() {
 157         /*close the output file*/
 158         if (alignOutFileName.length() > 0) {
 159                 ((std::ofstream*) alignOutFile)->close();
 160         }
 161 }
 162 /////////////////////////////////////////////////////////////////
 163 // PrintParameters()
 164 //
 165 // Prints MSAPROBS parameters to STDERR.  If a filename is
 166 // specified, then the parameters are also written to the file.
 167 /////////////////////////////////////////////////////////////////
 168
 169 void MSA::PrintParameters(const char *message, const VF &initDistrib,
 170                 const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs,
 171                 const VF &emitSingle, const char *filename) {
 172
 173         // print parameters to the screen
 174         cerr << message << endl << "    initDistrib[] = { ";
 175         for (int i = 0; i < NumMatrixTypes; i++)
 176                 cerr << setprecision(10) << initDistrib[i] << " ";
 177         cerr << "}" << endl << "        gapOpen[] = { ";
 178         for (int i = 0; i < NumInsertStates * 2; i++)
 179                 cerr << setprecision(10) << gapOpen[i] << " ";
 180         cerr << "}" << endl << "      gapExtend[] = { ";
 181         for (int i = 0; i < NumInsertStates * 2; i++)
 182                 cerr << setprecision(10) << gapExtend[i] << " ";
 183         cerr << "}" << endl << endl;
 184
 185         /*
 186          for (int i = 0; i < 5; i++){
 187          for (int j = 0; j <= i; j++){
 188          cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " ";
 189          }
 190          cerr << endl;
 191          }*/
 192
 193         // if a file name is specified
 194         if (filename) {
 195
 196                 // attempt to open the file for writing
 197                 FILE *file = fopen(filename, "w");
 198                 if (!file) {
 199                         cerr << "ERROR: Unable to write parameter file: " << filename
 200                                         << endl;
 201                         exit(1);
 202                 }
 203
 204                 // if successful, then write the parameters to the file
 205                 for (int i = 0; i < NumMatrixTypes; i++)
 206                         fprintf(file, "%.10f ", initDistrib[i]);
 207                 fprintf(file, "\n");
 208                 for (int i = 0; i < 2 * NumInsertStates; i++)
 209                         fprintf(file, "%.10f ", gapOpen[i]);
 210                 fprintf(file, "\n");
 211                 for (int i = 0; i < 2 * NumInsertStates; i++)
 212                         fprintf(file, "%.10f ", gapExtend[i]);
 213                 fprintf(file, "\n");
 214                 fprintf(file, "%s\n", alphabet.c_str());
 215                 for (int i = 0; i < (int) alphabet.size(); i++) {
 216                         for (int j = 0; j <= i; j++)
 217                                 fprintf(file, "%.10f ",
 218                                                 emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);
 219                         fprintf(file, "\n");
 220                 }
 221                 for (int i = 0; i < (int) alphabet.size(); i++)
 222                         fprintf(file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);
 223                 fprintf(file, "\n");
 224                 fclose(file);
 225         }
 226 }
 227
 228 /////////////////////////////////////////////////////////////////
 229 // doAlign()
 230 //
 231 // First computes all pairwise posterior probability matrices.
 232 // Then, computes new parameters if training, or a final
 233 // alignment, otherwise.
 234 /////////////////////////////////////////////////////////////////
 235 extern VF *ComputePostProbs(int a, int b, string seq1, string seq2);
 236 MultiSequence* MSA::doAlign(MultiSequence *sequences,
 237                 const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen,
 238                 VF &gapExtend, VVF &emitPairs, VF &emitSingle) {
 239         assert(sequences);
 240
 241         //get the number of sequences
 242         const int numSeqs = sequences->GetNumSequences();
 243
 244         //create distance matrix
 245         VVF distances(numSeqs, VF(numSeqs, 0));
 246         SafeVector<SafeVector<SparseMatrix *> > sparseMatrices(numSeqs,
 247                         SafeVector<SparseMatrix *>(numSeqs, NULL));
 248
 249 #ifdef _OPENMP
 250         //calculate sequence pairs for openmp model
 251         int pairIdx = 0;
 252         numPairs = (numSeqs - 1) * numSeqs / 2;
 253         seqsPairs = new SeqsPair[numPairs];
 254         for(int a = 0; a < numSeqs; a++) {
 255                 for(int b = a + 1; b < numSeqs; b++) {
 256                         seqsPairs[pairIdx].seq1 = a;
 257                         seqsPairs[pairIdx].seq2 = b;
 258                         pairIdx++;
 259                 }
 260         }
 261 #endif
 262         // do all pairwise alignments for posterior probability matrices
 263 #ifdef _OPENMP
 264 #pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
 265         for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
 266                 int a= seqsPairs[pairIdx].seq1;
 267                 int b = seqsPairs[pairIdx].seq2;
 268                 if(enableVerbose) {
 269 #pragma omp critical
 270                         cerr <<"tid "<<omp_get_thread_num()<<" a "<<a<<" b "<<b<<endl;
 271                 }
 272 #else
 273         for (int a = 0; a < numSeqs - 1; a++) {
 274                 for (int b = a + 1; b < numSeqs; b++) {
 275 #endif
 276                         Sequence *seq1 = sequences->GetSequence(a);
 277                         Sequence *seq2 = sequences->GetSequence(b);
 278
 279                         // verbose output
 280                         if (enableVerbose) {
 281                                 cerr << "Computing posterior matrix: (" << a + 1 << ") "
 282                                                 << seq1->GetHeader() << " vs. " << "(" << b + 1 << ") "
 283                                                 << seq2->GetHeader() << " -- ";
 284                         }
 285
 286                         // compute forward and backward probabilities
 287                         VF *forward = model.ComputeForwardMatrix(seq1, seq2);
 288                         assert(forward);
 289                         VF *backward = model.ComputeBackwardMatrix(seq1, seq2);
 290                         assert(backward);
 291
 292                         // compute posterior probability matrix from HMM
 293                         VF *posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,
 294                                         *backward);
 295                         assert(posterior);
 296                         delete forward;
 297                         delete backward;
 298
 299                         //compute posterior probability matrix from partition function
 300                         VF* part_posterior = ::ComputePostProbs(a, b, seq1->GetString(),
 301                                         seq2->GetString());
 302                         assert(part_posterior);
 303
 304                         //merge the two posterior matrices
 305                         VF::iterator ptr1 = posterior->begin();
 306                         VF::iterator ptr2 = part_posterior->begin();
 307                         for (int i = 0; i <= seq1->GetLength(); i++) {
 308                                 for (int j = 0; j <= seq2->GetLength(); j++) {
 309                                         float v1 = *ptr1;
 310                                         float v2 = *ptr2;
 311
 312                                         *ptr1 = sqrt((v1 * v1 + v2 * v2) * 0.5f);
 313                                         ptr1++;
 314                                         ptr2++;
 315                                 }
 316                         }
 317                         delete part_posterior;
 318
 319                         // compute sparse representations
 320                         sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),
 321                                         seq2->GetLength(), *posterior);
 322                         sparseMatrices[b][a] = NULL;
 323
 324                         // perform the pairwise sequence alignment
 325                         pair<SafeVector<char> *, float> alignment = model.ComputeAlignment(
 326                                         seq1->GetLength(), seq2->GetLength(), *posterior);
 327
 328                         //compute the pairwise distance using expected accuracy
 329                         float accuracy = alignment.second
 330                                         / min(seq1->GetLength(), seq2->GetLength());
 331                         distances[a][b] = distances[b][a] = 1.0f - accuracy;
 332
 333                         if (enableVerbose) {
 334                                 cerr << setprecision(10) << accuracy << endl;
 335                         }
 336                         delete alignment.first;
 337                         delete posterior;
 338 #ifndef _OPENMP
 339                 }
 340 #endif
 341         }
 342         //create the guide tree
 343         this->tree = new MSAClusterTree(this, distances, numSeqs);
 344         this->tree->create();
 345
 346         // perform the consistency transformation the desired number of times
 347         float* fweights = new float[numSeqs];
 348         for (int r = 0; r < numSeqs; r++) {
 349                 fweights[r] = ((float) seqsWeights[r]) / INT_MULTIPLY;
 350                 fweights[r] *= 10;
 351         }
 352         for (int r = 0; r < numConsistencyReps; r++) {
 353                 SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices =
 354                                 DoRelaxation(fweights, sequences, sparseMatrices);
 355
 356                 // now replace the old posterior matrices
 357                 for (int i = 0; i < numSeqs; i++) {
 358                         for (int j = 0; j < numSeqs; j++) {
 359                                 delete sparseMatrices[i][j];
 360                                 sparseMatrices[i][j] = newSparseMatrices[i][j];
 361                         }
 362                 }
 363         }
 364         delete[] fweights;
 365 #ifdef _OPENMP
 366         delete [] seqsPairs;
 367 #endif
 368
 369         //compute the final multiple sequence alignment
 370         MultiSequence *finalAlignment = ComputeFinalAlignment(this->tree, sequences,
 371                         sparseMatrices, model);
 372
 373         // build annotation
 374         if (enableAnnotation) {
 375                 WriteAnnotation(finalAlignment, sparseMatrices);
 376         }
 377         //destroy the guide tree
 378         delete this->tree;
 379         this->tree = 0;
 380
 381         // delete sparse matrices
 382         for (int a = 0; a < numSeqs - 1; a++) {
 383                 for (int b = a + 1; b < numSeqs; b++) {
 384                         delete sparseMatrices[a][b];
 385                         delete sparseMatrices[b][a];
 386                 }
 387         }
 388
 389         return finalAlignment;
 390 }
 391
 392 /////////////////////////////////////////////////////////////////
 393 // GetInteger()
 394 //
 395 // Attempts to parse an integer from the character string given.
 396 // Returns true only if no parsing error occurs.
 397 /////////////////////////////////////////////////////////////////
 398
 399 bool GetInteger(char *data, int *val) {
 400         char *endPtr;
 401         long int retVal;
 402
 403         assert(val);
 404
 405         errno = 0;
 406         retVal = strtol(data, &endPtr, 0);
 407         if (retVal == 0 && (errno != 0 || data == endPtr))
 408                 return false;
 409         if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN))
 410                 return false;
 411         if (retVal < (long) INT_MIN || retVal > (long) INT_MAX)
 412                 return false;
 413         *val = (int) retVal;
 414         return true;
 415 }
 416
 417 /////////////////////////////////////////////////////////////////
 418 // GetFloat()
 419 //
 420 // Attempts to parse a float from the character string given.
 421 // Returns true only if no parsing error occurs.
 422 /////////////////////////////////////////////////////////////////
 423
 424 bool GetFloat(char *data, float *val) {
 425         char *endPtr;
 426         double retVal;
 427
 428         assert(val);
 429
 430         errno = 0;
 431         retVal = strtod(data, &endPtr);
 432         if (retVal == 0 && (errno != 0 || data == endPtr))
 433                 return false;
 434         if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0))
 435                 return false;
 436         *val = (float) retVal;
 437         return true;
 438 }
 439
 440 /////////////////////////////////////////////////////////////////
 441 // ReadParameters()
 442 //
 443 // Read initial distribution, transition, and emission
 444 // parameters from a file.
 445 /////////////////////////////////////////////////////////////////
 446
 447 void MSA::ReadParameters() {
 448
 449         ifstream data;
 450
 451         emitPairs = VVF(256, VF(256, 1e-10));
 452         emitSingle = VF(256, 1e-5);
 453
 454         // read initial state distribution and transition parameters
 455         if (parametersInputFilename == string("")) {
 456                 if (NumInsertStates == 1) {
 457                         for (int i = 0; i < NumMatrixTypes; i++)
 458                                 initDistrib[i] = initDistrib1Default[i];
 459                         for (int i = 0; i < 2 * NumInsertStates; i++)
 460                                 gapOpen[i] = gapOpen1Default[i];
 461                         for (int i = 0; i < 2 * NumInsertStates; i++)
 462                                 gapExtend[i] = gapExtend1Default[i];
 463                 } else if (NumInsertStates == 2) {
 464                         for (int i = 0; i < NumMatrixTypes; i++)
 465                                 initDistrib[i] = initDistrib2Default[i];
 466                         for (int i = 0; i < 2 * NumInsertStates; i++)
 467                                 gapOpen[i] = gapOpen2Default[i];
 468                         for (int i = 0; i < 2 * NumInsertStates; i++)
 469                                 gapExtend[i] = gapExtend2Default[i];
 470                 } else {
 471                         cerr
 472                                         << "ERROR: No default initial distribution/parameter settings exist"
 473                                         << endl << "       for " << NumInsertStates
 474                                         << " pairs of insert states.  Use --paramfile." << endl;
 475                         exit(1);
 476                 }
 477
 478                 alphabet = alphabetDefault;
 479
 480                 for (int i = 0; i < (int) alphabet.length(); i++) {
 481                         emitSingle[(unsigned char) tolower(alphabet[i])] =
 482                                         emitSingleDefault[i];
 483                         emitSingle[(unsigned char) toupper(alphabet[i])] =
 484                                         emitSingleDefault[i];
 485                         for (int j = 0; j <= i; j++) {
 486                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(
 487                                                 alphabet[j])] = emitPairsDefault[i][j];
 488                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(
 489                                                 alphabet[j])] = emitPairsDefault[i][j];
 490                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(
 491                                                 alphabet[j])] = emitPairsDefault[i][j];
 492                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(
 493                                                 alphabet[j])] = emitPairsDefault[i][j];
 494                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(
 495                                                 alphabet[i])] = emitPairsDefault[i][j];
 496                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(
 497                                                 alphabet[i])] = emitPairsDefault[i][j];
 498                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(
 499                                                 alphabet[i])] = emitPairsDefault[i][j];
 500                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(
 501                                                 alphabet[i])] = emitPairsDefault[i][j];
 502                         }
 503                 }
 504         } else {
 505                 data.open(parametersInputFilename.c_str());
 506                 if (data.fail()) {
 507                         cerr << "ERROR: Unable to read parameter file: "
 508                                         << parametersInputFilename << endl;
 509                         exit(1);
 510                 }
 511
 512                 string line[3];
 513                 for (int i = 0; i < 3; i++) {
 514                         if (!getline(data, line[i])) {
 515                                 cerr
 516                                                 << "ERROR: Unable to read transition parameters from parameter file: "
 517                                                 << parametersInputFilename << endl;
 518                                 exit(1);
 519                         }
 520                 }
 521                 istringstream data2;
 522                 data2.clear();
 523                 data2.str(line[0]);
 524                 for (int i = 0; i < NumMatrixTypes; i++)
 525                         data2 >> initDistrib[i];
 526                 data2.clear();
 527                 data2.str(line[1]);
 528                 for (int i = 0; i < 2 * NumInsertStates; i++)
 529                         data2 >> gapOpen[i];
 530                 data2.clear();
 531                 data2.str(line[2]);
 532                 for (int i = 0; i < 2 * NumInsertStates; i++)
 533                         data2 >> gapExtend[i];
 534
 535                 if (!getline(data, line[0])) {
 536                         cerr << "ERROR: Unable to read alphabet from scoring matrix file: "
 537                                         << parametersInputFilename << endl;
 538                         exit(1);
 539                 }
 540
 541                 // read alphabet as concatenation of all characters on alphabet line
 542                 alphabet = "";
 543                 string token;
 544                 data2.clear();
 545                 data2.str(line[0]);
 546                 while (data2 >> token)
 547                         alphabet += token;
 548
 549                 for (int i = 0; i < (int) alphabet.size(); i++) {
 550                         for (int j = 0; j <= i; j++) {
 551                                 float val;
 552                                 data >> val;
 553                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(
 554                                                 alphabet[j])] = val;
 555                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(
 556                                                 alphabet[j])] = val;
 557                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(
 558                                                 alphabet[j])] = val;
 559                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(
 560                                                 alphabet[j])] = val;
 561                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(
 562                                                 alphabet[i])] = val;
 563                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(
 564                                                 alphabet[i])] = val;
 565                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(
 566                                                 alphabet[i])] = val;
 567                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(
 568                                                 alphabet[i])] = val;
 569                         }
 570                 }
 571
 572                 for (int i = 0; i < (int) alphabet.size(); i++) {
 573                         float val;
 574                         data >> val;
 575                         emitSingle[(unsigned char) tolower(alphabet[i])] = val;
 576                         emitSingle[(unsigned char) toupper(alphabet[i])] = val;
 577                 }
 578                 data.close();
 579         }
 580 }
 581
 582 /////////////////////////////////////////////////////////////////
 583 // ParseParams()
 584 //
 585 // Parse all command-line options.
 586 /////////////////////////////////////////////////////////////////
 587 void MSA::printUsage() {
 588         cerr
 589                         << "************************************************************************"
 590                         << endl
 591                         << "\tMSAPROBS is a open-source protein multiple sequence alignment algorithm"
 592                         << endl
 593                         << "\tbased on pair hidden markov model and partition function postirior"
 594                         << endl
 595                         << "\tprobabilities. If any comments or problems, please contact"
 596                         << endl
 597                         << "\tLiu Yongchao(liuy0039@ntu.edu.sg or nkcslyc@hotmail.com)"
 598                         << endl
 599                         << "*************************************************************************"
 600                         << endl << "Usage:" << endl
 601                         << "       msaprobs [OPTION]... [infile]..." << endl << endl
 602                         << "Description:" << endl
 603                         << "       Align sequences in multi-FASTA format" << endl << endl
 604                         << "       -o, --outfile <string>" << endl
 605                         << "              specify the output file name (STDOUT by default)"
 606                         << endl << "       -num_threads <integer>" << endl
 607                         << "              specify the number of threads used, and otherwise detect automatically"
 608                         << endl << "       -clustalw" << endl
 609                         << "              use CLUSTALW output format instead of FASTA format"
 610                         << endl << endl << "       -c, --consistency REPS" << endl
 611                         << "              use " << MIN_CONSISTENCY_REPS << " <= REPS <= "
 612                         << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps
 613                         << ") passes of consistency transformation" << endl << endl
 614                         << "       -ir, --iterative-refinement REPS" << endl
 615                         << "              use " << MIN_ITERATIVE_REFINEMENT_REPS
 616                         << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS << " (default: "
 617                         << numIterativeRefinementReps << ") passes of iterative-refinement"
 618                         << endl << endl << "       -v, --verbose" << endl
 619                         << "              report progress while aligning (default: "
 620                         << (enableVerbose ? "on" : "off") << ")" << endl << endl
 621                         << "       -annot FILENAME" << endl
 622                         << "              write annotation for multiple alignment to FILENAME"
 623                         << endl << endl << "       -a, --alignment-order" << endl
 624                         << "              print sequences in alignment order rather than input order (default: "
 625                         << (enableAlignOrder ? "on" : "off") << ")" << endl
 626                         << "       -version " << endl
 627                         << "              print out version of MSAPROBS " << endl << endl;
 628 }
 629 SafeVector<string> MSA::ParseParams(int argc, char **argv) {
 630         if (argc < 2) {
 631                 printUsage();
 632                 exit(1);
 633         }
 634         SafeVector<string> sequenceNames;
 635         int tempInt;
 636         float tempFloat;
 637
 638         for (int i = 1; i < argc; i++) {
 639                 if (argv[i][0] == '-') {
 640                         //help
 641                         if (!strcmp(argv[i], "-help") || !strcmp(argv[i], "-?")) {
 642                                 printUsage();
 643                                 exit(1);
 644                                 //output file name
 645                         } else if (!strcmp(argv[i], "-o")
 646                                         || !strcmp(argv[i], "--outfile")) {
 647                                 if (i < argc - 1) {
 648                                         alignOutFileName = argv[++i];   //get the file name
 649                                 } else {
 650                                         cerr << "ERROR: String expected for option " << argv[i]
 651                                                         << endl;
 652                                         exit(1);
 653                                 }
 654                                 //number of threads used
 655                         } else if (!strcmp(argv[i], "-p")
 656                                         || !strcmp(argv[i], "-num_threads")) {
 657                                 if (i < argc - 1) {
 658                                         if (!GetInteger(argv[++i], &tempInt)) {
 659                                                 cerr << " ERROR: invalid integer following option "
 660                                                                 << argv[i - 1] << ": " << argv[i] << endl;
 661                                                 exit(1);
 662                                         } else {
 663                                                 if (tempInt < 0) {
 664                                                         tempInt = 0;
 665                                                 }
 666                                                 numThreads = tempInt;
 667                                         }
 668                                 } else {
 669                                         cerr << "ERROR: Integer expected for option " << argv[i]
 670                                                         << endl;
 671                                         exit(1);
 672                                 }
 673                                 // number of consistency transformations
 674                         } else if (!strcmp(argv[i], "-c")
 675                                         || !strcmp(argv[i], "--consistency")) {
 676                                 if (i < argc - 1) {
 677                                         if (!GetInteger(argv[++i], &tempInt)) {
 678                                                 cerr << "ERROR: Invalid integer following option "
 679                                                                 << argv[i - 1] << ": " << argv[i] << endl;
 680                                                 exit(1);
 681                                         } else {
 682                                                 if (tempInt < MIN_CONSISTENCY_REPS
 683                                                                 || tempInt > MAX_CONSISTENCY_REPS) {
 684                                                         cerr << "ERROR: For option " << argv[i - 1]
 685                                                                         << ", integer must be between "
 686                                                                         << MIN_CONSISTENCY_REPS << " and "
 687                                                                         << MAX_CONSISTENCY_REPS << "." << endl;
 688                                                         exit(1);
 689                                                 } else {
 690                                                         numConsistencyReps = tempInt;
 691                                                 }
 692                                         }
 693                                 } else {
 694                                         cerr << "ERROR: Integer expected for option " << argv[i]
 695                                                         << endl;
 696                                         exit(1);
 697                                 }
 698                         }
 699
 700                         // number of randomized partitioning iterative refinement passes
 701                         else if (!strcmp(argv[i], "-ir")
 702                                         || !strcmp(argv[i], "--iterative-refinement")) {
 703                                 if (i < argc - 1) {
 704                                         if (!GetInteger(argv[++i], &tempInt)) {
 705                                                 cerr << "ERROR: Invalid integer following option "
 706                                                                 << argv[i - 1] << ": " << argv[i] << endl;
 707                                                 exit(1);
 708                                         } else {
 709                                                 if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS
 710                                                                 || tempInt > MAX_ITERATIVE_REFINEMENT_REPS) {
 711                                                         cerr << "ERROR: For option " << argv[i - 1]
 712                                                                         << ", integer must be between "
 713                                                                         << MIN_ITERATIVE_REFINEMENT_REPS << " and "
 714                                                                         << MAX_ITERATIVE_REFINEMENT_REPS << "."
 715                                                                         << endl;
 716                                                         exit(1);
 717                                                 } else
 718                                                         numIterativeRefinementReps = tempInt;
 719                                         }
 720                                 } else {
 721                                         cerr << "ERROR: Integer expected for option " << argv[i]
 722                                                         << endl;
 723                                         exit(1);
 724                                 }
 725                         }
 726
 727                         // annotation files
 728                         else if (!strcmp(argv[i], "-annot")) {
 729                                 enableAnnotation = true;
 730                                 if (i < argc - 1) {
 731                                         annotationFilename = argv[++i];
 732                                 } else {
 733                                         cerr << "ERROR: FILENAME expected for option " << argv[i]
 734                                                         << endl;
 735                                         exit(1);
 736                                 }
 737                         }
 738
 739                         // clustalw output format
 740                         else if (!strcmp(argv[i], "-clustalw")) {
 741                                 enableClustalWOutput = true;
 742                         }
 743
 744                         // cutoff
 745                         else if (!strcmp(argv[i], "-co") || !strcmp(argv[i], "--cutoff")) {
 746                                 if (i < argc - 1) {
 747                                         if (!GetFloat(argv[++i], &tempFloat)) {
 748                                                 cerr
 749                                                                 << "ERROR: Invalid floating-point value following option "
 750                                                                 << argv[i - 1] << ": " << argv[i] << endl;
 751                                                 exit(1);
 752                                         } else {
 753                                                 if (tempFloat < 0 || tempFloat > 1) {
 754                                                         cerr << "ERROR: For option " << argv[i - 1]
 755                                                                         << ", floating-point value must be between 0 and 1."
 756                                                                         << endl;
 757                                                         exit(1);
 758                                                 } else
 759                                                         cutoff = tempFloat;
 760                                         }
 761                                 } else {
 762                                         cerr << "ERROR: Floating-point value expected for option "
 763                                                         << argv[i] << endl;
 764                                         exit(1);
 765                                 }
 766                         }
 767
 768                         // verbose reporting
 769                         else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--verbose")) {
 770                                 enableVerbose = true;
 771                         }
 772
 773                         // alignment order
 774                         else if (!strcmp(argv[i], "-a")
 775                                         || !strcmp(argv[i], "--alignment-order")) {
 776                                 enableAlignOrder = true;
 777                         }
 778
 779                         //print out version
 780                         else if (!strcmp(argv[i], "-version")) {
 781                                 cerr << "MSAPROBS version " << VERSION << endl;
 782                                 exit(1);
 783                         }
 784                         // bad arguments
 785                         else {
 786                                 cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
 787                                 exit(1);
 788                         }
 789                 } else {
 790                         sequenceNames.push_back(string(argv[i]));
 791                 }
 792         }
 793
 794         /*check the output file name*/
 795         cerr << "-------------------------------------" << endl;
 796         if (alignOutFileName.length() == 0) {
 797                 cerr << "The final alignments will be printed out to STDOUT" << endl;
 798                 alignOutFile = &std::cout;
 799         } else {
 800                 cerr << "Open the output file " << alignOutFileName << endl;
 801                 alignOutFile = new ofstream(alignOutFileName.c_str(),
 802                                 ios::binary | ios::out | ios::trunc);
 803         }
 804         cerr << "-------------------------------------" << endl;
 805         return sequenceNames;
 806 }
 807
 808 /////////////////////////////////////////////////////////////////
 809 // ProcessTree()
 810 //
 811 // Process the tree recursively.  Returns the aligned sequences
 812 // corresponding to a node or leaf of the tree.
 813 /////////////////////////////////////////////////////////////////
 814 MultiSequence* MSA::ProcessTree(TreeNode *tree, MultiSequence *sequences,
 815                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 816                 const ProbabilisticModel &model) {
 817
 818         MultiSequence *result;
 819
 820         // check if this is a node of the alignment tree
 821         //if (tree->GetSequenceLabel() == -1){
 822         if (tree->leaf == NODE) {
 823                 MultiSequence *alignLeft = ProcessTree(tree->left, sequences,
 824                                 sparseMatrices, model);
 825                 MultiSequence *alignRight = ProcessTree(tree->right, sequences,
 826                                 sparseMatrices, model);
 827
 828                 assert(alignLeft);
 829                 assert(alignRight);
 830
 831                 result = AlignAlignments(alignLeft, alignRight, sparseMatrices, model);
 832                 assert(result);
 833
 834                 delete alignLeft;
 835                 delete alignRight;
 836         }
 837
 838         // otherwise, this is a leaf of the alignment tree
 839         else {
 840                 result = new MultiSequence();
 841                 assert(result);
 842                 //result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
 843                 result->AddSequence(sequences->GetSequence(tree->idx)->Clone());
 844         }
 845
 846         return result;
 847 }
 848
 849 /////////////////////////////////////////////////////////////////
 850 // ComputeFinalAlignment()
 851 //
 852 // Compute the final alignment by calling ProcessTree(), then
 853 // performing iterative refinement as needed.
 854 /////////////////////////////////////////////////////////////////
 855
 856 MultiSequence* MSA::ComputeFinalAlignment(MSAGuideTree*tree,
 857                 MultiSequence *sequences,
 858                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 859                 const ProbabilisticModel &model) {
 860         MultiSequence *alignment = ProcessTree(tree->getRoot(), sequences,
 861                         sparseMatrices, model);
 862
 863         SafeVector<int> oldOrdering;
 864         if (enableAlignOrder) {
 865                 for (int i = 0; i < alignment->GetNumSequences(); i++)
 866                         oldOrdering.push_back(alignment->GetSequence(i)->GetSortLabel());
 867                 alignment->SaveOrdering();
 868                 enableAlignOrder = false;
 869         }
 870
 871         // tree-based refinement
 872         // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree);
 873         /*int numSeqs = alignment->GetNumSequences();
 874          if(numSeqs < numIterativeRefinementReps){
 875          for(int iter = 0; iter < 1; iter ++){
 876          for(int i = 0; i < numSeqs - 1; i++){
 877          DoIterativeRefinementTreeNode(sparseMatrices, model, alignment, i);
 878          }
 879          }
 880          }*/
 881         for (int i = 0; i < numIterativeRefinementReps; i++) {
 882                 DoIterativeRefinement(sparseMatrices, model, alignment, i);
 883         }
 884         cerr << endl;
 885
 886         if (oldOrdering.size() > 0) {
 887                 for (int i = 0; i < (int) oldOrdering.size(); i++) {
 888                         alignment->GetSequence(i)->SetSortLabel(oldOrdering[i]);
 889                 }
 890         }
 891
 892         // return final alignment
 893         return alignment;
 894 }
 895
 896 /////////////////////////////////////////////////////////////////
 897 // AlignAlignments()
 898 //
 899 // Returns the alignment of two MultiSequence objects.
 900 /////////////////////////////////////////////////////////////////
 901
 902 MultiSequence* MSA::AlignAlignments(MultiSequence *align1,
 903                 MultiSequence *align2,
 904                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 905                 const ProbabilisticModel &model) {
 906
 907         // print some info about the alignment
 908         if (enableVerbose) {
 909                 for (int i = 0; i < align1->GetNumSequences(); i++)
 910                         cerr << ((i == 0) ? "[" : ",")
 911                                         << align1->GetSequence(i)->GetLabel();
 912                 cerr << "] vs. ";
 913                 for (int i = 0; i < align2->GetNumSequences(); i++)
 914                         cerr << ((i == 0) ? "[" : ",")
 915                                         << align2->GetSequence(i)->GetLabel();
 916                 cerr << "]: ";
 917         }
 918 #if 0
 919         VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
 920 #else
 921         VF *posterior = model.BuildPosterior(getSeqsWeights(), align1, align2,
 922                         sparseMatrices, cutoff);
 923 #endif
 924         pair<SafeVector<char> *, float> alignment;
 925
 926         //perform alignment
 927         alignment = model.ComputeAlignment(align1->GetSequence(0)->GetLength(),
 928                         align2->GetSequence(0)->GetLength(), *posterior);
 929
 930         delete posterior;
 931
 932         if (enableVerbose) {
 933
 934                 // compute total length of sequences
 935                 int totLength = 0;
 936                 for (int i = 0; i < align1->GetNumSequences(); i++)
 937                         for (int j = 0; j < align2->GetNumSequences(); j++)
 938                                 totLength += min(align1->GetSequence(i)->GetLength(),
 939                                                 align2->GetSequence(j)->GetLength());
 940
 941                 // give an "accuracy" measure for the alignment
 942                 cerr << alignment.second / totLength << endl;
 943         }
 944
 945         // now build final alignment
 946         MultiSequence *result = new MultiSequence();
 947         for (int i = 0; i < align1->GetNumSequences(); i++)
 948                 result->AddSequence(
 949                                 align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
 950         for (int i = 0; i < align2->GetNumSequences(); i++)
 951                 result->AddSequence(
 952                                 align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
 953         if (!enableAlignOrder)
 954                 result->SortByLabel();
 955
 956         // free temporary alignment
 957         delete alignment.first;
 958
 959         return result;
 960 }
 961
 962 /////////////////////////////////////////////////////////////////
 963 // DoRelaxation()
 964 //
 965 // Performs one round of the weighted probabilistic consistency transformation.
 966 //                     1
 967 /////////////////////////////////////////////////////////////////
 968
 969 SafeVector<SafeVector<SparseMatrix *> > MSA::DoRelaxation(float* seqsWeights,
 970                 MultiSequence *sequences,
 971                 SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
 972         const int numSeqs = sequences->GetNumSequences();
 973
 974         SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices(numSeqs,
 975                         SafeVector<SparseMatrix *>(numSeqs, NULL));
 976
 977         // for every pair of sequences
 978 #ifdef _OPENMP
 979         int pairIdx;
 980 #pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
 981         for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
 982                 int i = seqsPairs[pairIdx].seq1;
 983                 int j = seqsPairs[pairIdx].seq2;
 984                 float wi = seqsWeights[i];
 985                 float wj = seqsWeights[j];
 986 #else
 987         for (int i = 0; i < numSeqs; i++) {
 988                 float wi = seqsWeights[i];
 989                 for (int j = i + 1; j < numSeqs; j++) {
 990                         float wj = seqsWeights[j];
 991 #endif
 992                         Sequence *seq1 = sequences->GetSequence(i);
 993                         Sequence *seq2 = sequences->GetSequence(j);
 994
 995                         if (enableVerbose) {
 996 #ifdef _OPENMP
 997 #pragma omp critical
 998 #endif
 999                                 cerr << "Relaxing (" << i + 1 << ") " << seq1->GetHeader()
1000                                                 << " vs. " << "(" << j + 1 << ") " << seq2->GetHeader()
1001                                                 << ": ";
1002                         }
1003                         // get the original posterior matrix
1004                         VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior();
1005                         assert(posteriorPtr);
1006                         VF &posterior = *posteriorPtr;
1007
1008                         const int seq1Length = seq1->GetLength();
1009                         const int seq2Length = seq2->GetLength();
1010
1011                         // contribution from the summation where z = x and z = y
1012                         float w = wi * wi * wj + wi * wj * wj;
1013                         float sumW = w;
1014                         for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
1015                                 posterior[k] = w * posterior[k];
1016                         }
1017
1018                         if (enableVerbose)
1019                                 cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
1020
1021                         // contribution from all other sequences
1022                         for (int k = 0; k < numSeqs; k++) {
1023                                 if (k != i && k != j) {
1024                                         float wk = seqsWeights[k];
1025                                         float w = wi * wj * wk;
1026                                         sumW += w;
1027                                         if (k < i)
1028                                                 Relax1(w, sparseMatrices[k][i], sparseMatrices[k][j],
1029                                                                 posterior);
1030                                         else if (k > i && k < j)
1031                                                 Relax(w, sparseMatrices[i][k], sparseMatrices[k][j],
1032                                                                 posterior);
1033                                         else {
1034                                                 SparseMatrix *temp =
1035                                                                 sparseMatrices[j][k]->ComputeTranspose();
1036                                                 Relax(w, sparseMatrices[i][k], temp, posterior);
1037                                                 delete temp;
1038                                         }
1039                                 }
1040                         }
1041                         //cerr<<"sumW "<<sumW<<endl;
1042                         for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
1043                                 posterior[k] /= sumW;
1044                         }
1045                         // mask out positions not originally in the posterior matrix
1046                         SparseMatrix *matXY = sparseMatrices[i][j];
1047                         for (int y = 0; y <= seq2Length; y++)
1048                                 posterior[y] = 0;
1049                         for (int x = 1; x <= seq1Length; x++) {
1050                                 SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
1051                                 SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
1052                                 VF::iterator base = posterior.begin() + x * (seq2Length + 1);
1053                                 int curr = 0;
1054                                 while (XYptr != XYend) {
1055
1056                                         // zero out all cells until the first filled column
1057                                         while (curr < XYptr->first) {
1058                                                 base[curr] = 0;
1059                                                 curr++;
1060                                         }
1061
1062                                         // now, skip over this column
1063                                         curr++;
1064                                         ++XYptr;
1065                                 }
1066
1067                                 // zero out cells after last column
1068                                 while (curr <= seq2Length) {
1069                                         base[curr] = 0;
1070                                         curr++;
1071                                 }
1072                         }
1073
1074                         // save the new posterior matrix
1075                         newSparseMatrices[i][j] = new SparseMatrix(seq1->GetLength(),
1076                                         seq2->GetLength(), posterior);
1077                         newSparseMatrices[j][i] = NULL;
1078
1079                         if (enableVerbose)
1080                                 cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
1081
1082                         delete posteriorPtr;
1083
1084                         if (enableVerbose)
1085                                 cerr << "done." << endl;
1086 #ifndef _OPENMP
1087                 }
1088 #endif
1089         }
1090
1091         return newSparseMatrices;
1092 }
1093
1094 /////////////////////////////////////////////////////////////////
1095 // Relax()
1096 //
1097 // Computes the consistency transformation for a single sequence
1098 // z, and adds the transformed matrix to "posterior".
1099 /////////////////////////////////////////////////////////////////
1100
1101 void MSA::Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY,
1102                 VF &posterior) {
1103
1104         assert(matXZ);
1105         assert(matZY);
1106
1107         int lengthX = matXZ->GetSeq1Length();
1108         int lengthY = matZY->GetSeq2Length();
1109         assert(matXZ->GetSeq2Length() == matZY->GetSeq1Length());
1110
1111         // for every x[i]
1112         for (int i = 1; i <= lengthX; i++) {
1113                 SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
1114                 SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
1115
1116                 VF::iterator base = posterior.begin() + i * (lengthY + 1);
1117
1118                 // iterate through all x[i]-z[k]
1119                 while (XZptr != XZend) {
1120                         SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
1121                         SafeVector<PIF>::iterator ZYend = ZYptr
1122                                         + matZY->GetRowSize(XZptr->first);
1123                         const float XZval = XZptr->second;
1124
1125                         // iterate through all z[k]-y[j]
1126                         while (ZYptr != ZYend) {
1127                                 base[ZYptr->first] += weight * XZval * ZYptr->second;
1128                                 ZYptr++;
1129                         }
1130                         XZptr++;
1131                 }
1132         }
1133 }
1134
1135 /////////////////////////////////////////////////////////////////
1136 // Relax1()
1137 //
1138 // Computes the consistency transformation for a single sequence
1139 // z, and adds the transformed matrix to "posterior".
1140 /////////////////////////////////////////////////////////////////
1141
1142 void MSA::Relax1(float weight, SparseMatrix *matZX, SparseMatrix *matZY,
1143                 VF &posterior) {
1144
1145         assert(matZX);
1146         assert(matZY);
1147
1148         int lengthZ = matZX->GetSeq1Length();
1149         int lengthY = matZY->GetSeq2Length();
1150
1151         // for every z[k]
1152         for (int k = 1; k <= lengthZ; k++) {
1153                 SafeVector<PIF>::iterator ZXptr = matZX->GetRowPtr(k);
1154                 SafeVector<PIF>::iterator ZXend = ZXptr + matZX->GetRowSize(k);
1155
1156                 // iterate through all z[k]-x[i]
1157                 while (ZXptr != ZXend) {
1158                         SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(k);
1159                         SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(k);
1160                         const float ZXval = ZXptr->second;
1161                         VF::iterator base = posterior.begin()
1162                                         + ZXptr->first * (lengthY + 1);
1163
1164                         // iterate through all z[k]-y[j]
1165                         while (ZYptr != ZYend) {
1166                                 base[ZYptr->first] += weight * ZXval * ZYptr->second;
1167                                 ZYptr++;
1168                         }
1169                         ZXptr++;
1170                 }
1171         }
1172 }
1173 /////////////////////////////////////////////////////////////////
1174 // DoIterativeRefinement()
1175 //
1176 // Performs a single round of randomized partionining iterative
1177 // refinement.
1178 /////////////////////////////////////////////////////////////////
1179
1180 int MSA::GenRandom(int m, int seed, bool init) {
1181         static const int a = 5, b = 3, n = 7;
1182         static int rand0;
1183         if (init == true) {
1184                 rand0 = seed;
1185         }
1186         m *= 19;
1187         int rand1;
1188         for (int i = 0; i < n; i++) {
1189                 rand1 = (a * rand0 + b) % m;
1190                 rand0 = rand1;
1191         }
1192         return rand1;
1193 }
1194
1195 void MSA::DoIterativeRefinement(
1196                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1197                 const ProbabilisticModel &model, MultiSequence* &alignment, int si) {
1198         set<int> groupOne, groupTwo;
1199         int numSeqs = alignment->GetNumSequences();
1200
1201         int index = GenRandom(numSeqs, si, true);
1202         // create two separate groups
1203         for (int i = 0; i < numSeqs; i++) {
1204                 index = GenRandom(numSeqs, si);
1205                 if (index % 2) {
1206                         groupOne.insert(i);
1207                 } else {
1208                         groupTwo.insert(i);
1209                 }
1210         }
1211         if (groupOne.empty() || groupTwo.empty())
1212                 return;
1213
1214         // project into the two groups
1215         MultiSequence *groupOneSeqs = alignment->Project(groupOne);
1216         assert(groupOneSeqs);
1217         MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
1218         assert(groupTwoSeqs);
1219         delete alignment;
1220
1221         // realign
1222         alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices,
1223                         model);
1224
1225         delete groupOneSeqs;
1226         delete groupTwoSeqs;
1227 }
1228 void MSA::DoIterativeRefinementTreeNode(
1229                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1230                 const ProbabilisticModel &model, MultiSequence* &alignment,
1231                 int nodeIndex) {
1232         set<int> groupOne, groupTwo;
1233         int numSeqs = alignment->GetNumSequences();
1234
1235         vector<bool> inGroup1;
1236         inGroup1.resize(numSeqs);
1237         for (int i = 0; i < numSeqs; i++) {
1238                 inGroup1[i] = false;
1239         }
1240
1241         AlignmentOrder* orders = this->tree->getAlignOrders();
1242         AlignmentOrder* order = &orders[nodeIndex];
1243         for (int i = 0; i < order->leftNum; i++) {
1244                 int si = order->leftLeafs[i];
1245                 inGroup1[si] = true;
1246         }
1247         for (int i = 0; i < order->rightNum; i++) {
1248                 int si = order->rightLeafs[i];
1249                 inGroup1[si] = true;
1250         }
1251         // create two separate groups
1252         for (int i = 0; i < numSeqs; i++) {
1253                 if (inGroup1[i]) {
1254                         groupOne.insert(i);
1255                 } else {
1256                         groupTwo.insert(i);
1257                 }
1258         }
1259         if (groupOne.empty() || groupTwo.empty())
1260                 return;
1261
1262         // project into the two groups
1263         MultiSequence *groupOneSeqs = alignment->Project(groupOne);
1264         assert(groupOneSeqs);
1265         MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
1266         assert(groupTwoSeqs);
1267         delete alignment;
1268
1269         // realign
1270         alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices,
1271                         model);
1272
1273         delete groupOneSeqs;
1274         delete groupTwoSeqs;
1275 }
1276
1277 /////////////////////////////////////////////////////////////////
1278 // WriteAnnotation()
1279 //
1280 // Computes annotation for multiple alignment and write values
1281 // to a file.
1282 /////////////////////////////////////////////////////////////////
1283
1284 void MSA::WriteAnnotation(MultiSequence *alignment,
1285                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1286         ofstream outfile(annotationFilename.c_str());
1287
1288         if (outfile.fail()) {
1289                 cerr << "ERROR: Unable to write annotation file." << endl;
1290                 exit(1);
1291         }
1292
1293         const int alignLength = alignment->GetSequence(0)->GetLength();
1294         const int numSeqs = alignment->GetNumSequences();
1295
1296         SafeVector<int> position(numSeqs, 0);
1297         SafeVector<SafeVector<char>::iterator> seqs(numSeqs);
1298         for (int i = 0; i < numSeqs; i++)
1299                 seqs[i] = alignment->GetSequence(i)->GetDataPtr();
1300         SafeVector<pair<int, int> > active;
1301         active.reserve(numSeqs);
1302
1303         SafeVector<int> lab;
1304         for (int i = 0; i < numSeqs; i++)
1305                 lab.push_back(alignment->GetSequence(i)->GetSortLabel());
1306
1307         // for every column
1308         for (int i = 1; i <= alignLength; i++) {
1309
1310                 // find all aligned residues in this particular column
1311                 active.clear();
1312                 for (int j = 0; j < numSeqs; j++) {
1313                         if (seqs[j][i] != '-') {
1314                                 active.push_back(make_pair(lab[j], ++position[j]));
1315                         }
1316                 }
1317
1318                 sort(active.begin(), active.end());
1319                 outfile << setw(4) << ComputeScore(active, sparseMatrices) << endl;
1320         }
1321
1322         outfile.close();
1323 }
1324
1325 /////////////////////////////////////////////////////////////////
1326 // ComputeScore()
1327 //
1328 // Computes the annotation score for a particular column.
1329 /////////////////////////////////////////////////////////////////
1330
1331 int MSA::ComputeScore(const SafeVector<pair<int, int> > &active,
1332                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1333
1334         if (active.size() <= 1)
1335                 return 0;
1336
1337         // ALTERNATIVE #1: Compute the average alignment score.
1338
1339         float val = 0;
1340         for (int i = 0; i < (int) active.size(); i++) {
1341                 for (int j = i + 1; j < (int) active.size(); j++) {
1342                         val += sparseMatrices[active[i].first][active[j].first]->GetValue(
1343                                         active[i].second, active[j].second);
1344                 }
1345         }
1346
1347         return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));
1348
1349 }