binaries/src/GLProbs-1.0/MSA.cpp

   1 /***********************************************
   2  * # Copyright 2009-2010. Liu Yongchao
   3  * # Contact: Liu Yongchao, School of Computer Engineering,
   4  * #                     Nanyang Technological University.
   5  * # Emails:     liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
   6  * #
   7  * # GPL version 3.0 applies.
   8  * #
   9  * ************************************************/
  10
  11 #include <string>
  12 #include <sstream>
  13 #include <iomanip>
  14 #include <iostream>
  15 #include <list>
  16 #include <set>
  17 #include <algorithm>
  18 #include <climits>
  19 #include <cstdio>
  20 #include <cstdlib>
  21 #include <cerrno>
  22 #include <iomanip>
  23 #include "MSA.h"
  24 #include "MSAClusterTree.h"
  25 #include "Defaults.h"
  26
  27 #ifdef _OPENMP
  28 #include <omp.h>
  29 #endif
  30
  31 string parametersInputFilename = "";
  32 string parametersOutputFilename = "no training";
  33 string annotationFilename = "";
  34
  35 bool enableVerbose = false;
  36 bool enableAnnotation = false;
  37 bool enableClustalWOutput = false;
  38 bool enableAlignOrder = false;
  39 int numConsistencyReps = 2;
  40 int numPreTrainingReps = 0;
  41 int numIterativeRefinementReps = 100;
  42
  43 float cutoff = 0;
  44
  45 VF initDistrib(NumMatrixTypes);
  46 VF gapOpen(2 * NumInsertStates);
  47 VF gapExtend(2 * NumInsertStates);
  48 VVF emitPairs(256, VF(256, 1e-10));
  49 VF emitSingle(256, 1e-5);
  50
  51 string alphabet = alphabetDefault;
  52
  53 const int MIN_PRETRAINING_REPS = 0;
  54 const int MAX_PRETRAINING_REPS = 20;
  55 const int MIN_CONSISTENCY_REPS = 0;
  56 const int MAX_CONSISTENCY_REPS = 5;
  57 const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
  58 const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
  59
  60 string posteriorProbsFilename = "";
  61 bool allscores = true;
  62 string infilename;
  63
  64 int flag_gui = 0;   //0: no gui related o/p
  65 //1: gui related o/p generated
  66 int flag_ppscore = 0; //0: no pp score sequence added to o/p fasta alignment
  67 //1: pp score seq added to o/p fasta alignment
  68
  69 ///////////////////////////////
  70 // global scoring matrix variables
  71 //////////////////////////////
  72 float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2;
  73 char *aminos, *bases, matrixtype[20] = "gonnet_160";
  74 int subst_index[26];
  75
  76 double sub_matrix[26][26];
  77 double normalized_matrix[26][26];// add by YE Yongtao
  78 int firstread = 0;              //this makes sure that matrices are read only once
  79
  80 float TEMPERATURE = 5;
  81 int MATRIXTYPE = 160;
  82 int prot_nuc = 0;               //0=prot, 1=nucleotide
  83
  84 float GAPOPEN = 0;
  85 float GAPEXT = 0;
  86 int numThreads = 0;
  87
  88 //argument support
  89 typedef struct {
  90         char input[30];
  91         int matrix;
  92         int N;
  93         float T;
  94         float beta;
  95         char opt;                       //can be 'P' or 'M'
  96         float gapopen;
  97         float gapext;
  98 } argument_decl;
  99
 100 argument_decl argument;
 101
 102 extern inline void read_sustitution_matrix(char *fileName);
 103 extern void setmatrixtype(int le);
 104 extern inline int matrixtype_to_int();
 105 extern inline void read_dna_matrix();
 106 extern inline void read_vtml_la_matrix();
 107 extern void init_arguments();
 108
 109 MSA::MSA(int argc, char* argv[]) {
 110         //parse program parameters
 111         SafeVector<string> sequenceNames = ParseParams(argc, argv);
 112
 113         //initialize arguments for partition function
 114         init_arguments();
 115
 116         ReadParameters();
 117         //PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
 118
 119         //read the input sequences
 120         MultiSequence *sequences = new MultiSequence();
 121         assert(sequences);
 122         for (int i = 0; i < (int) sequenceNames.size(); i++) {
 123                 cerr << "Loading sequence file: " << sequenceNames[i] << endl;
 124                 sequences->LoadMFA(sequenceNames[i], true);
 125         }
 126         //allocate space for sequence weights
 127         this->seqsWeights = new int[sequences->GetNumSequences()];
 128         //initilaize parameters for OPENMP
 129 #ifdef _OPENMP
 130         if(numThreads <= 0) {
 131                 numThreads = omp_get_num_procs();
 132 //              cerr << "Automatically detected " << numThreads << " CPU cores" << endl;
 133         }
 134 //      cerr <<"Enabling OpenMP (with "<<numThreads<<" threads)"<<endl;
 135
 136         //set OpenMP to use dynamic number of threads which is equal to the number of processor cores on the host
 137         omp_set_num_threads(numThreads);
 138 #endif
 139 /*
 140 FILE *fi = fopen ("accuracy", "a");
 141 fprintf (fi, "%s ", argv[1]);
 142 fclose (fi);
 143 */
 144         int levelid = AdjustmentTest(sequences,ProbabilisticModel(initDistrib, gapOpen, gapExtend, emitPairs,emitSingle));
 145         //cerr<<levelid<<endl;
 146         // now, we can perform the alignments and write them out
 147         MultiSequence *alignment = doAlign(sequences,
 148                         ProbabilisticModel(initDistrib, gapOpen, gapExtend, emitPairs,
 149                                         emitSingle), levelid);
 150
 151         //write the alignment results to standard output
 152         if (enableClustalWOutput) {
 153                 alignment->WriteALN(*alignOutFile);
 154         } else {
 155                 alignment->WriteMFA(*alignOutFile);
 156         }
 157
 158         //release resources
 159         delete[] this->seqsWeights;
 160         delete alignment;
 161         delete sequences;
 162 }
 163 MSA::~MSA() {
 164         /*close the output file*/
 165         if (alignOutFileName.length() > 0) {
 166                 ((std::ofstream*) alignOutFile)->close();
 167         }
 168 }
 169 /////////////////////////////////////////////////////////////////
 170 // PrintParameters()
 171 //
 172 // Prints MSAPROBS parameters to STDERR.  If a filename is
 173 // specified, then the parameters are also written to the file.
 174 /////////////////////////////////////////////////////////////////
 175
 176 void MSA::PrintParameters(const char *message, const VF &initDistrib,
 177                 const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs,
 178                 const VF &emitSingle, const char *filename) {
 179
 180         // print parameters to the screen
 181         cerr << message << endl << "    initDistrib[] = { ";
 182         for (int i = 0; i < NumMatrixTypes; i++)
 183                 cerr << setprecision(10) << initDistrib[i] << " ";
 184         cerr << "}" << endl << "        gapOpen[] = { ";
 185         for (int i = 0; i < NumInsertStates * 2; i++)
 186                 cerr << setprecision(10) << gapOpen[i] << " ";
 187         cerr << "}" << endl << "      gapExtend[] = { ";
 188         for (int i = 0; i < NumInsertStates * 2; i++)
 189                 cerr << setprecision(10) << gapExtend[i] << " ";
 190         cerr << "}" << endl << endl;
 191
 192         // if a file name is specified
 193         if (filename) {
 194
 195                 // attempt to open the file for writing
 196                 FILE *file = fopen(filename, "w");
 197                 if (!file) {
 198                         cerr << "ERROR: Unable to write parameter file: " << filename
 199                                         << endl;
 200                         exit(1);
 201                 }
 202
 203                 // if successful, then write the parameters to the file
 204                 for (int i = 0; i < NumMatrixTypes; i++)
 205                         fprintf(file, "%.10f ", initDistrib[i]);
 206                 fprintf(file, "\n");
 207                 for (int i = 0; i < 2 * NumInsertStates; i++)
 208                         fprintf(file, "%.10f ", gapOpen[i]);
 209                 fprintf(file, "\n");
 210                 for (int i = 0; i < 2 * NumInsertStates; i++)
 211                         fprintf(file, "%.10f ", gapExtend[i]);
 212                 fprintf(file, "\n");
 213                 fprintf(file, "%s\n", alphabet.c_str());
 214                 for (int i = 0; i < (int) alphabet.size(); i++) {
 215                         for (int j = 0; j <= i; j++)
 216                                 fprintf(file, "%.10f ",
 217                                                 emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);
 218                         fprintf(file, "\n");
 219                 }
 220                 for (int i = 0; i < (int) alphabet.size(); i++)
 221                         fprintf(file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);
 222                 fprintf(file, "\n");
 223                 fclose(file);
 224         }
 225 }
 226
 227 /////////////////////////////////////////////////////////////////
 228 // doAlign()
 229 //
 230 // First computes all pairwise posterior probability matrices.
 231 // Then, computes new parameters if training, or a final
 232 // alignment, otherwise.
 233 /////////////////////////////////////////////////////////////////
 234 extern VF *ComputePostProbs(int a, int b, string seq1, string seq2);
 235 MultiSequence* MSA::doAlign(MultiSequence *sequences,
 236                 const ProbabilisticModel &model, int levelid) {
 237         assert(sequences);
 238
 239         //get the number of sequences
 240         const int numSeqs = sequences->GetNumSequences();
 241         //create distance matrix
 242         VVF distances(numSeqs, VF(numSeqs, 0));
 243         //creat sparseMatrices
 244         SafeVector<SafeVector<SparseMatrix *> > sparseMatrices(numSeqs,
 245                         SafeVector<SparseMatrix *>(numSeqs, NULL));
 246
 247 #ifdef _OPENMP
 248         //calculate sequence pairs for openmp model
 249         int pairIdx = 0;
 250         numPairs = (numSeqs - 1) * numSeqs / 2;
 251         seqsPairs = new SeqsPair[numPairs];
 252         for(int a = 0; a < numSeqs; a++) {
 253                 for(int b = a + 1; b < numSeqs; b++) {
 254                         seqsPairs[pairIdx].seq1 = a;
 255                         seqsPairs[pairIdx].seq2 = b;
 256                         pairIdx++;
 257                 }
 258         }
 259 #endif
 260         // do all pairwise alignments for posterior probability matrices
 261 #ifdef _OPENMP
 262 #pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
 263         for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
 264                 int a= seqsPairs[pairIdx].seq1;
 265                 int b = seqsPairs[pairIdx].seq2;
 266                 if(enableVerbose) {
 267 #pragma omp critical
 268                         cerr <<"tid "<<omp_get_thread_num()<<" a "<<a<<" b "<<b<<endl;
 269                 }
 270 #else
 271         for (int a = 0; a < numSeqs - 1; a++) {
 272                 for (int b = a + 1; b < numSeqs; b++) {
 273 #endif
 274                         Sequence *seq1 = sequences->GetSequence(a);
 275                         Sequence *seq2 = sequences->GetSequence(b);
 276
 277                         //posterior probability matrix
 278                         VF* posterior;
 279
 280 //low similarity use local model
 281                         if(levelid == 1){
 282
 283                                 VF *forward = model.ComputeForwardMatrix(seq1, seq2,false);
 284                                 assert(forward);
 285                                 VF *backward = model.ComputeBackwardMatrix(seq1, seq2,false);
 286                                 assert(backward);
 287                                 posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false);
 288                                 delete forward;
 289                                 delete backward;
 290
 291                         }
 292 //high similarity use global model
 293                         else if(levelid >= 2) posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString());
 294
 295 //extreme low or extreme high similarity use combined model
 296                         else{
 297
 298 //probcons
 299                                 // compute forward and backward probabilities
 300                                 VF *forward = model.ComputeForwardMatrix(seq1, seq2);
 301                                 assert(forward);
 302                                 VF *backward = model.ComputeBackwardMatrix(seq1, seq2);
 303                                 assert(backward);
 304                                 // compute posterior probability matrix from HMM
 305                                 VF *probcons_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward);
 306                                 assert(probcons_posterior);
 307                                 delete forward;
 308                                 delete backward;
 309
 310 //probalign
 311                                 VF *probalign_posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString());
 312                                 assert(probalign_posterior);
 313 //local
 314                                 forward = model.ComputeForwardMatrix(seq1, seq2,false);
 315                                 assert(forward);
 316                                 backward = model.ComputeBackwardMatrix(seq1, seq2,false);
 317                                 assert(backward);
 318                                 posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false);
 319                                 assert(posterior);
 320                                 delete forward;
 321                                 delete backward;
 322 //combined model
 323                                 //merge probalign + local + probcons
 324                                 VF::iterator ptr1 = probcons_posterior->begin();
 325                                 VF::iterator ptr2 = probalign_posterior->begin();
 326                                 VF::iterator ptr = posterior->begin();
 327                                 for (int i = 0; i <= seq1->GetLength(); i++) {
 328                                         for (int j = 0; j <= seq2->GetLength(); j++) {
 329                                                 float v1 = *ptr1;
 330                                                 float v2 = *ptr2;
 331                                                 float v3 = *ptr;
 332                                                 *ptr = sqrt((v1*v1 + v2*v2 + v3*v3)/3);
 333                                                 ptr1++;
 334                                                 ptr2++;
 335                                                 ptr++;
 336                                         }
 337                                 }
 338                                 delete probcons_posterior;
 339                                 delete probalign_posterior;
 340                         }
 341
 342                         assert(posterior);
 343                         // perform the pairwise sequence alignment
 344                         pair<SafeVector<char> *, float> alignment = model.ComputeAlignment(
 345                                         seq1->GetLength(), seq2->GetLength(), *posterior);
 346
 347                         //compute expected accuracy
 348                         distances[a][b] = distances[b][a] = 1.0f - alignment.second
 349                                         / min(seq1->GetLength(), seq2->GetLength());
 350
 351                         // compute sparse representations
 352                         sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),
 353                         seq2->GetLength(), *posterior);
 354                         sparseMatrices[b][a] = NULL;
 355
 356                         delete posterior;
 357                         delete alignment.first;
 358 #ifndef _OPENMP
 359                 }
 360 #endif
 361         }
 362
 363         //create the guide tree
 364         this->tree = new MSAClusterTree(this, distances, numSeqs);
 365         this->tree->create();
 366
 367         // perform the consistency transformation the desired number of times
 368         float* fweights = new float[numSeqs];
 369         for (int r = 0; r < numSeqs; r++) {
 370                 fweights[r] = ((float) seqsWeights[r]) / INT_MULTIPLY;
 371                 fweights[r] *= 10;
 372         }
 373         for (int r = 0; r < numConsistencyReps; r++) {
 374                 SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices =
 375                                 DoRelaxation(fweights, sequences, sparseMatrices);
 376
 377                 // now replace the old posterior matrices
 378                 for (int i = 0; i < numSeqs; i++) {
 379                         for (int j = 0; j < numSeqs; j++) {
 380                                 delete sparseMatrices[i][j];
 381                                 sparseMatrices[i][j] = newSparseMatrices[i][j];
 382                         }
 383                 }
 384         }
 385         delete[] fweights;
 386 #ifdef _OPENMP
 387         delete [] seqsPairs;
 388 #endif
 389
 390         //compute the final multiple sequence alignment
 391         MultiSequence *finalAlignment = ComputeFinalAlignment(this->tree, sequences,
 392                         sparseMatrices, model,levelid);
 393
 394         // build annotation
 395         if (enableAnnotation) {
 396                 WriteAnnotation(finalAlignment, sparseMatrices);
 397         }
 398         //destroy the guide tree
 399         delete this->tree;
 400         this->tree = 0;
 401
 402         // delete sparse matrices
 403         for (int a = 0; a < numSeqs - 1; a++) {
 404                 for (int b = a + 1; b < numSeqs; b++) {
 405                         delete sparseMatrices[a][b];
 406                         delete sparseMatrices[b][a];
 407                 }
 408         }
 409
 410         return finalAlignment;
 411 }
 412
 413 /////////////////////////////////////////////////////////////////
 414 // GetInteger()
 415 //
 416 // Attempts to parse an integer from the character string given.
 417 // Returns true only if no parsing error occurs.
 418 /////////////////////////////////////////////////////////////////
 419
 420 bool GetInteger(char *data, int *val) {
 421         char *endPtr;
 422         long int retVal;
 423
 424         assert(val);
 425
 426         errno = 0;
 427         retVal = strtol(data, &endPtr, 0);
 428         if (retVal == 0 && (errno != 0 || data == endPtr))
 429                 return false;
 430         if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN))
 431                 return false;
 432         if (retVal < (long) INT_MIN || retVal > (long) INT_MAX)
 433                 return false;
 434         *val = (int) retVal;
 435         return true;
 436 }
 437
 438 /////////////////////////////////////////////////////////////////
 439 // GetFloat()
 440 //
 441 // Attempts to parse a float from the character string given.
 442 // Returns true only if no parsing error occurs.
 443 /////////////////////////////////////////////////////////////////
 444
 445 bool GetFloat(char *data, float *val) {
 446         char *endPtr;
 447         double retVal;
 448
 449         assert(val);
 450
 451         errno = 0;
 452         retVal = strtod(data, &endPtr);
 453         if (retVal == 0 && (errno != 0 || data == endPtr))
 454                 return false;
 455         if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0))
 456                 return false;
 457         *val = (float) retVal;
 458         return true;
 459 }
 460
 461 /////////////////////////////////////////////////////////////////
 462 // ReadParameters()
 463 //
 464 // Read initial distribution, transition, and emission
 465 // parameters from a file.
 466 /////////////////////////////////////////////////////////////////
 467
 468 void MSA::ReadParameters() {
 469
 470         ifstream data;
 471
 472         emitPairs = VVF(256, VF(256, 1e-10));
 473         emitSingle = VF(256, 1e-5);
 474
 475         // read initial state distribution and transition parameters
 476         if (parametersInputFilename == string("")) {
 477                 if (NumInsertStates == 1) {
 478                         for (int i = 0; i < NumMatrixTypes; i++)
 479                                 initDistrib[i] = initDistrib1Default[i];
 480                         for (int i = 0; i < 2 * NumInsertStates; i++)
 481                                 gapOpen[i] = gapOpen1Default[i];
 482                         for (int i = 0; i < 2 * NumInsertStates; i++)
 483                                 gapExtend[i] = gapExtend1Default[i];
 484                 } else if (NumInsertStates == 2) {
 485                         for (int i = 0; i < NumMatrixTypes; i++)
 486                                 initDistrib[i] = initDistrib2Default[i];
 487                         for (int i = 0; i < 2 * NumInsertStates; i++)
 488                                 gapOpen[i] = gapOpen2Default[i];
 489                         for (int i = 0; i < 2 * NumInsertStates; i++)
 490                                 gapExtend[i] = gapExtend2Default[i];
 491                 } else {
 492                         cerr
 493                                         << "ERROR: No default initial distribution/parameter settings exist"
 494                                         << endl << "       for " << NumInsertStates
 495                                         << " pairs of insert states.  Use --paramfile." << endl;
 496                         exit(1);
 497                 }
 498
 499                 alphabet = alphabetDefault;
 500
 501                 for (int i = 0; i < (int) alphabet.length(); i++) {
 502                         emitSingle[(unsigned char) tolower(alphabet[i])] =
 503                                         emitSingleDefault[i];
 504                         emitSingle[(unsigned char) toupper(alphabet[i])] =
 505                                         emitSingleDefault[i];
 506                         for (int j = 0; j <= i; j++) {
 507                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(
 508                                                 alphabet[j])] = emitPairsDefault[i][j];
 509                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(
 510                                                 alphabet[j])] = emitPairsDefault[i][j];
 511                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(
 512                                                 alphabet[j])] = emitPairsDefault[i][j];
 513                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(
 514                                                 alphabet[j])] = emitPairsDefault[i][j];
 515                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(
 516                                                 alphabet[i])] = emitPairsDefault[i][j];
 517                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(
 518                                                 alphabet[i])] = emitPairsDefault[i][j];
 519                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(
 520                                                 alphabet[i])] = emitPairsDefault[i][j];
 521                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(
 522                                                 alphabet[i])] = emitPairsDefault[i][j];
 523                         }
 524                 }
 525         } else {
 526                 data.open(parametersInputFilename.c_str());
 527                 if (data.fail()) {
 528                         cerr << "ERROR: Unable to read parameter file: "
 529                                         << parametersInputFilename << endl;
 530                         exit(1);
 531                 }
 532
 533                 string line[3];
 534                 for (int i = 0; i < 3; i++) {
 535                         if (!getline(data, line[i])) {
 536                                 cerr
 537                                                 << "ERROR: Unable to read transition parameters from parameter file: "
 538                                                 << parametersInputFilename << endl;
 539                                 exit(1);
 540                         }
 541                 }
 542                 istringstream data2;
 543                 data2.clear();
 544                 data2.str(line[0]);
 545                 for (int i = 0; i < NumMatrixTypes; i++)
 546                         data2 >> initDistrib[i];
 547                 data2.clear();
 548                 data2.str(line[1]);
 549                 for (int i = 0; i < 2 * NumInsertStates; i++)
 550                         data2 >> gapOpen[i];
 551                 data2.clear();
 552                 data2.str(line[2]);
 553                 for (int i = 0; i < 2 * NumInsertStates; i++)
 554                         data2 >> gapExtend[i];
 555
 556                 if (!getline(data, line[0])) {
 557                         cerr << "ERROR: Unable to read alphabet from scoring matrix file: "
 558                                         << parametersInputFilename << endl;
 559                         exit(1);
 560                 }
 561
 562                 // read alphabet as concatenation of all characters on alphabet line
 563                 alphabet = "";
 564                 string token;
 565                 data2.clear();
 566                 data2.str(line[0]);
 567                 while (data2 >> token)
 568                         alphabet += token;
 569
 570                 for (int i = 0; i < (int) alphabet.size(); i++) {
 571                         for (int j = 0; j <= i; j++) {
 572                                 float val;
 573                                 data >> val;
 574                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(
 575                                                 alphabet[j])] = val;
 576                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(
 577                                                 alphabet[j])] = val;
 578                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(
 579                                                 alphabet[j])] = val;
 580                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(
 581                                                 alphabet[j])] = val;
 582                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(
 583                                                 alphabet[i])] = val;
 584                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(
 585                                                 alphabet[i])] = val;
 586                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(
 587                                                 alphabet[i])] = val;
 588                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(
 589                                                 alphabet[i])] = val;
 590                         }
 591                 }
 592
 593                 for (int i = 0; i < (int) alphabet.size(); i++) {
 594                         float val;
 595                         data >> val;
 596                         emitSingle[(unsigned char) tolower(alphabet[i])] = val;
 597                         emitSingle[(unsigned char) toupper(alphabet[i])] = val;
 598                 }
 599                 data.close();
 600         }
 601 }
 602
 603 /////////////////////////////////////////////////////////////////
 604 // ParseParams()
 605 //
 606 // Parse all command-line options.
 607 /////////////////////////////////////////////////////////////////
 608 void MSA::printUsage() {
 609         cerr
 610                         << "************************************************************************"
 611                         << endl
 612                         << "\tMSAPROBS is a open-source protein multiple sequence alignment algorithm"
 613                         << endl
 614                         << "\tbased on pair hidden markov model and partition function postirior"
 615                         << endl
 616                         << "\tprobabilities. If any comments or problems, please contact"
 617                         << endl
 618                         << "\tLiu Yongchao(liuy0039@ntu.edu.sg or nkcslyc@hotmail.com)"
 619                         << endl
 620                         << "*************************************************************************"
 621                         << endl << "Usage:" << endl
 622                         << "       msaprobs [OPTION]... [infile]..." << endl << endl
 623                         << "Description:" << endl
 624                         << "       Align sequences in multi-FASTA format" << endl << endl
 625                         << "       -o, --outfile <string>" << endl
 626                         << "              specify the output file name (STDOUT by default)"
 627                         << endl << "       -num_threads <integer>" << endl
 628                         << "              specify the number of threads used, and otherwise detect automatically"
 629                         << endl << "       -clustalw" << endl
 630                         << "              use CLUSTALW output format instead of FASTA format"
 631                         << endl << endl << "       -c, --consistency REPS" << endl
 632                         << "              use " << MIN_CONSISTENCY_REPS << " <= REPS <= "
 633                         << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps
 634                         << ") passes of consistency transformation" << endl << endl
 635                         << "       -ir, --iterative-refinement REPS" << endl
 636                         << "              use " << MIN_ITERATIVE_REFINEMENT_REPS
 637                         << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS << " (default: "
 638                         << numIterativeRefinementReps << ") passes of iterative-refinement"
 639                         << endl << endl << "       -v, --verbose" << endl
 640                         << "              report progress while aligning (default: "
 641                         << (enableVerbose ? "on" : "off") << ")" << endl << endl
 642                         << "       -annot FILENAME" << endl
 643                         << "              write annotation for multiple alignment to FILENAME"
 644                         << endl << endl << "       -a, --alignment-order" << endl
 645                         << "              print sequences in alignment order rather than input order (default: "
 646                         << (enableAlignOrder ? "on" : "off") << ")" << endl
 647                         << "       -version " << endl
 648                         << "              print out version of MSAPROBS " << endl << endl;
 649 }
 650 SafeVector<string> MSA::ParseParams(int argc, char **argv) {
 651         if (argc < 2) {
 652                 printUsage();
 653                 exit(1);
 654         }
 655         SafeVector<string> sequenceNames;
 656         int tempInt;
 657         float tempFloat;
 658
 659         for (int i = 1; i < argc; i++) {
 660                 if (argv[i][0] == '-') {
 661                         //help
 662                         if (!strcmp(argv[i], "-help") || !strcmp(argv[i], "-?")) {
 663                                 printUsage();
 664                                 exit(1);
 665                                 //output file name
 666                         } else if (!strcmp(argv[i], "-o")
 667                                         || !strcmp(argv[i], "--outfile")) {
 668                                 if (i < argc - 1) {
 669                                         alignOutFileName = argv[++i];   //get the file name
 670                                 } else {
 671                                         cerr << "ERROR: String expected for option " << argv[i]
 672                                                         << endl;
 673                                         exit(1);
 674                                 }
 675                                 // parameter file
 676                         } else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){
 677                                 if (i < argc - 1)
 678                                         parametersInputFilename = string (argv[++i]);
 679                                 else {
 680                                         cerr << "ERROR: Filename expected for option " << argv[i] << endl;
 681                                         exit (1);
 682                                 }
 683                                 //number of threads used
 684                         } else if (!strcmp(argv[i], "-p")
 685                                         || !strcmp(argv[i], "-num_threads")) {
 686                                 if (i < argc - 1) {
 687                                         if (!GetInteger(argv[++i], &tempInt)) {
 688                                                 cerr << " ERROR: invalid integer following option "
 689                                                                 << argv[i - 1] << ": " << argv[i] << endl;
 690                                                 exit(1);
 691                                         } else {
 692                                                 if (tempInt < 0) {
 693                                                         tempInt = 0;
 694                                                 }
 695                                                 numThreads = tempInt;
 696                                         }
 697                                 } else {
 698                                         cerr << "ERROR: Integer expected for option " << argv[i]
 699                                                         << endl;
 700                                         exit(1);
 701                                 }
 702                                 // number of consistency transformations
 703                         } else if (!strcmp(argv[i], "-c")
 704                                         || !strcmp(argv[i], "--consistency")) {
 705                                 if (i < argc - 1) {
 706                                         if (!GetInteger(argv[++i], &tempInt)) {
 707                                                 cerr << "ERROR: Invalid integer following option "
 708                                                                 << argv[i - 1] << ": " << argv[i] << endl;
 709                                                 exit(1);
 710                                         } else {
 711                                                 if (tempInt < MIN_CONSISTENCY_REPS
 712                                                                 || tempInt > MAX_CONSISTENCY_REPS) {
 713                                                         cerr << "ERROR: For option " << argv[i - 1]
 714                                                                         << ", integer must be between "
 715                                                                         << MIN_CONSISTENCY_REPS << " and "
 716                                                                         << MAX_CONSISTENCY_REPS << "." << endl;
 717                                                         exit(1);
 718                                                 } else {
 719                                                         numConsistencyReps = tempInt;
 720                                                 }
 721                                         }
 722                                 } else {
 723                                         cerr << "ERROR: Integer expected for option " << argv[i]
 724                                                         << endl;
 725                                         exit(1);
 726                                 }
 727                         }
 728
 729                         // number of randomized partitioning iterative refinement passes
 730                         else if (!strcmp(argv[i], "-ir")
 731                                         || !strcmp(argv[i], "--iterative-refinement")) {
 732                                 if (i < argc - 1) {
 733                                         if (!GetInteger(argv[++i], &tempInt)) {
 734                                                 cerr << "ERROR: Invalid integer following option "
 735                                                                 << argv[i - 1] << ": " << argv[i] << endl;
 736                                                 exit(1);
 737                                         } else {
 738                                                 if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS
 739                                                                 || tempInt > MAX_ITERATIVE_REFINEMENT_REPS) {
 740                                                         cerr << "ERROR: For option " << argv[i - 1]
 741                                                                         << ", integer must be between "
 742                                                                         << MIN_ITERATIVE_REFINEMENT_REPS << " and "
 743                                                                         << MAX_ITERATIVE_REFINEMENT_REPS << "."
 744                                                                         << endl;
 745                                                         exit(1);
 746                                                 } else
 747                                                         numIterativeRefinementReps = tempInt;
 748                                         }
 749                                 } else {
 750                                         cerr << "ERROR: Integer expected for option " << argv[i]
 751                                                         << endl;
 752                                         exit(1);
 753                                 }
 754                         }
 755
 756                         // annotation files
 757                         else if (!strcmp(argv[i], "-annot")) {
 758                                 enableAnnotation = true;
 759                                 if (i < argc - 1) {
 760                                         annotationFilename = argv[++i];
 761                                 } else {
 762                                         cerr << "ERROR: FILENAME expected for option " << argv[i]
 763                                                         << endl;
 764                                         exit(1);
 765                                 }
 766                         }
 767
 768                         // clustalw output format
 769                         else if (!strcmp(argv[i], "-clustalw")) {
 770                                 enableClustalWOutput = true;
 771                         }
 772
 773                         // cutoff
 774                         else if (!strcmp(argv[i], "-co") || !strcmp(argv[i], "--cutoff")) {
 775                                 if (i < argc - 1) {
 776                                         if (!GetFloat(argv[++i], &tempFloat)) {
 777                                                 cerr
 778                                                                 << "ERROR: Invalid floating-point value following option "
 779                                                                 << argv[i - 1] << ": " << argv[i] << endl;
 780                                                 exit(1);
 781                                         } else {
 782                                                 if (tempFloat < 0 || tempFloat > 1) {
 783                                                         cerr << "ERROR: For option " << argv[i - 1]
 784                                                                         << ", floating-point value must be between 0 and 1."
 785                                                                         << endl;
 786                                                         exit(1);
 787                                                 } else
 788                                                         cutoff = tempFloat;
 789                                         }
 790                                 } else {
 791                                         cerr << "ERROR: Floating-point value expected for option "
 792                                                         << argv[i] << endl;
 793                                         exit(1);
 794                                 }
 795                         }
 796
 797                         // verbose reporting
 798                         else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--verbose")) {
 799                                 enableVerbose = true;
 800                         }
 801
 802                         // alignment order
 803                         else if (!strcmp(argv[i], "-a")
 804                                         || !strcmp(argv[i], "--alignment-order")) {
 805                                 enableAlignOrder = true;
 806                         }
 807
 808                         //print out version
 809                         else if (!strcmp(argv[i], "-version")) {
 810                                 cerr << "MSAPROBS version " << VERSION << endl;
 811                                 exit(1);
 812                         }
 813                         // bad arguments
 814                         else {
 815                                 cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
 816                                 exit(1);
 817                         }
 818                 } else {
 819                         sequenceNames.push_back(string(argv[i]));
 820                 }
 821         }
 822
 823         /*check the output file name*/
 824         cerr << "-------------------------------------" << endl;
 825         if (alignOutFileName.length() == 0) {
 826                 cerr << "The final alignments will be printed out to STDOUT" << endl;
 827                 alignOutFile = &std::cout;
 828         } else {
 829                 cerr << "Open the output file " << alignOutFileName << endl;
 830                 alignOutFile = new ofstream(alignOutFileName.c_str(),
 831                                 ios::binary | ios::out | ios::trunc);
 832         }
 833         cerr << "-------------------------------------" << endl;
 834         return sequenceNames;
 835 }
 836
 837 /////////////////////////////////////////////////////////////////
 838 // ProcessTree()
 839 //
 840 // Process the tree recursively.  Returns the aligned sequences
 841 // corresponding to a node or leaf of the tree.
 842 /////////////////////////////////////////////////////////////////
 843 MultiSequence* MSA::ProcessTree(TreeNode *tree, MultiSequence *sequences,
 844                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 845                 const ProbabilisticModel &model) {
 846
 847         MultiSequence *result;
 848
 849         // check if this is a node of the alignment tree
 850         //if (tree->GetSequenceLabel() == -1){
 851         if (tree->leaf == NODE) {
 852                 MultiSequence *alignLeft = ProcessTree(tree->left, sequences,
 853                                 sparseMatrices, model);
 854                 MultiSequence *alignRight = ProcessTree(tree->right, sequences,
 855                                 sparseMatrices, model);
 856
 857                 assert(alignLeft);
 858                 assert(alignRight);
 859
 860                 result = AlignAlignments(alignLeft, alignRight, sparseMatrices, model);
 861                 assert(result);
 862
 863                 delete alignLeft;
 864                 delete alignRight;
 865         }
 866
 867         // otherwise, this is a leaf of the alignment tree
 868         else {
 869                 result = new MultiSequence();
 870                 assert(result);
 871                 //result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
 872                 result->AddSequence(sequences->GetSequence(tree->idx)->Clone());
 873         }
 874
 875         return result;
 876 }
 877
 878 /////////////////////////////////////////////////////////////////
 879 // ComputeFinalAlignment()
 880 //
 881 // Compute the final alignment by calling ProcessTree(), then
 882 // performing iterative refinement as needed.
 883 /////////////////////////////////////////////////////////////////
 884
 885 MultiSequence* MSA::ComputeFinalAlignment(MSAGuideTree*tree,
 886                 MultiSequence *sequences,
 887                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 888                 const ProbabilisticModel &model, int levelid) {
 889         MultiSequence *alignment = ProcessTree(tree->getRoot(), sequences,
 890                         sparseMatrices, model);
 891
 892         SafeVector<int> oldOrdering;
 893         int numSeqs = alignment->GetNumSequences();
 894         if (enableAlignOrder) {
 895                 for (int i = 0; i < numSeqs; i++)
 896                         oldOrdering.push_back(alignment->GetSequence(i)->GetSortLabel());
 897                 alignment->SaveOrdering();
 898                 enableAlignOrder = false;
 899         }
 900
 901         // tree-based refinement
 902         // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree);
 903         /*
 904          int numSeqs = alignment->GetNumSequences();
 905          //if(numSeqs < numIterativeRefinementReps){
 906          for(int iter = 0; iter < 5; iter ++){
 907                 for(int i = 0; i < numSeqs - 1; i++){
 908                         DoIterativeRefinementTreeNode(sparseMatrices, model, alignment, i);
 909                  }
 910          }
 911          //}*/
 912 /*
 913         //DoIterativeRefinement() return 1,2: this refinement unsuccessful
 914         if(levelid == 3) numIterativeRefinementReps=10;
 915         int ineffectiveness = 0;
 916         for (int i = 0; i < numIterativeRefinementReps; i++){
 917                 int flag = DoIterativeRefinement(sparseMatrices, model, alignment);
 918                 if(numSeqs > 35 && levelid < 3){
 919                         if(flag > 0){
 920                             if(numIterativeRefinementReps < 10*numSeqs)
 921                                 numIterativeRefinementReps ++;
 922                             if(flag == 1) ineffectiveness ++;
 923                         }
 924                         //else ineffectiveness = 0;
 925                         if(ineffectiveness > numSeqs && i >100 ) break;
 926                 }
 927         }
 928 */
 929
 930         //if(levelid == 3) numIterativeRefinementReps=10;
 931         for (int i = 0; i < numIterativeRefinementReps; i++)
 932                 DoIterativeRefinement(sparseMatrices, model, alignment);
 933
 934         cerr << endl;
 935
 936         if (oldOrdering.size() > 0) {
 937                 for (int i = 0; i < (int) oldOrdering.size(); i++) {
 938                         alignment->GetSequence(i)->SetSortLabel(oldOrdering[i]);
 939                 }
 940         }
 941
 942         // return final alignment
 943         return alignment;
 944 }
 945
 946 /////////////////////////////////////////////////////////////////
 947 // AlignAlignments()
 948 //
 949 // Returns the alignment of two MultiSequence objects.
 950 /////////////////////////////////////////////////////////////////
 951
 952 MultiSequence* MSA::AlignAlignments(MultiSequence *align1,
 953                 MultiSequence *align2,
 954                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 955                 const ProbabilisticModel &model) {
 956
 957         // print some info about the alignment
 958         if (enableVerbose) {
 959                 for (int i = 0; i < align1->GetNumSequences(); i++)
 960                         cerr << ((i == 0) ? "[" : ",")
 961                                         << align1->GetSequence(i)->GetLabel();
 962                 cerr << "] vs. ";
 963                 for (int i = 0; i < align2->GetNumSequences(); i++)
 964                         cerr << ((i == 0) ? "[" : ",")
 965                                         << align2->GetSequence(i)->GetLabel();
 966                 cerr << "]: ";
 967         }
 968 #if 0
 969         VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
 970 #else
 971         VF *posterior = model.BuildPosterior(getSeqsWeights(), align1, align2,
 972                         sparseMatrices, cutoff);
 973 #endif
 974         // compute an "accuracy" measure for the alignment before refinement
 975
 976         pair<SafeVector<char> *, float> alignment;
 977         //perform alignment
 978         alignment = model.ComputeAlignment(align1->GetSequence(0)->GetLength(),
 979                         align2->GetSequence(0)->GetLength(), *posterior);
 980
 981         delete posterior;
 982
 983         if (enableVerbose) {
 984
 985                 // compute total length of sequences
 986                 int totLength = 0;
 987                 for (int i = 0; i < align1->GetNumSequences(); i++)
 988                         for (int j = 0; j < align2->GetNumSequences(); j++)
 989                                 totLength += min(align1->GetSequence(i)->GetLength(),
 990                                                 align2->GetSequence(j)->GetLength());
 991
 992                 // give an "accuracy" measure for the alignment
 993                 cerr << alignment.second / totLength << endl;
 994         }
 995
 996         // now build final alignment
 997         MultiSequence *result = new MultiSequence();
 998         for (int i = 0; i < align1->GetNumSequences(); i++)
 999                 result->AddSequence(
1000                                 align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
1001         for (int i = 0; i < align2->GetNumSequences(); i++)
1002                 result->AddSequence(
1003                                 align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
1004         if (!enableAlignOrder)
1005                 result->SortByLabel();
1006
1007         // free temporary alignment
1008         delete alignment.first;
1009
1010         return result;
1011 }
1012
1013 /////////////////////////////////////////////////////////////////
1014 // DoRelaxation()
1015 //
1016 // Performs one round of the weighted probabilistic consistency transformation.
1017 //                     1
1018 /////////////////////////////////////////////////////////////////
1019
1020 SafeVector<SafeVector<SparseMatrix *> > MSA::DoRelaxation(float* seqsWeights,
1021                 MultiSequence *sequences,
1022                 SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1023         const int numSeqs = sequences->GetNumSequences();
1024
1025         SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices(numSeqs,
1026                         SafeVector<SparseMatrix *>(numSeqs, NULL));
1027
1028         // for every pair of sequences
1029 #ifdef _OPENMP
1030         int pairIdx;
1031 #pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
1032         for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
1033                 int i = seqsPairs[pairIdx].seq1;
1034                 int j = seqsPairs[pairIdx].seq2;
1035                 float wi = seqsWeights[i];
1036                 float wj = seqsWeights[j];
1037 #else
1038         for (int i = 0; i < numSeqs; i++) {
1039                 float wi = seqsWeights[i];
1040                 for (int j = i + 1; j < numSeqs; j++) {
1041                         float wj = seqsWeights[j];
1042 #endif
1043                         Sequence *seq1 = sequences->GetSequence(i);
1044                         Sequence *seq2 = sequences->GetSequence(j);
1045
1046                         if (enableVerbose) {
1047 #ifdef _OPENMP
1048 #pragma omp critical
1049 #endif
1050                                 cerr << "Relaxing (" << i + 1 << ") " << seq1->GetHeader()
1051                                                 << " vs. " << "(" << j + 1 << ") " << seq2->GetHeader()
1052                                                 << ": ";
1053                         }
1054                         // get the original posterior matrix
1055                         VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior();
1056                         assert(posteriorPtr);
1057                         VF &posterior = *posteriorPtr;
1058
1059                         const int seq1Length = seq1->GetLength();
1060                         const int seq2Length = seq2->GetLength();
1061
1062                         // contribution from the summation where z = x and z = y
1063                         float w = wi * wi * wj + wi * wj * wj;
1064                         float sumW = w;
1065                         for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
1066                                 //posterior[k] = w*posterior[k];
1067                                 posterior[k] += posterior[k];
1068                         }
1069
1070                         if (enableVerbose)
1071                                 cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
1072
1073                         // contribution from all other sequences
1074                         for (int k = 0; k < numSeqs; k++) {
1075                                 if (k != i && k != j) {
1076                                         float wk = seqsWeights[k];
1077                                         float w = wi * wj * wk;
1078                                         sumW += w;
1079                                         if (k < i)
1080                                                 Relax1(w, sparseMatrices[k][i], sparseMatrices[k][j],
1081                                                                 posterior);
1082                                         else if (k > i && k < j)
1083                                                 Relax(w, sparseMatrices[i][k], sparseMatrices[k][j],
1084                                                                 posterior);
1085                                         else {
1086                                                 SparseMatrix *temp =
1087                                                                 sparseMatrices[j][k]->ComputeTranspose();
1088                                                 Relax(w, sparseMatrices[i][k], temp, posterior);
1089                                                 delete temp;
1090                                         }
1091                                 }
1092                         }
1093                         //cerr<<"sumW "<<sumW<<endl;
1094                         for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
1095                                 //posterior[k] /= sumW;
1096                                 posterior[k] /= numSeqs;
1097                         }
1098                         // mask out positions not originally in the posterior matrix
1099                         SparseMatrix *matXY = sparseMatrices[i][j];
1100                         for (int y = 0; y <= seq2Length; y++)
1101                                 posterior[y] = 0;
1102                         for (int x = 1; x <= seq1Length; x++) {
1103                                 SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
1104                                 SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
1105                                 VF::iterator base = posterior.begin() + x * (seq2Length + 1);
1106                                 int curr = 0;
1107                                 while (XYptr != XYend) {
1108
1109                                         // zero out all cells until the first filled column
1110                                         while (curr < XYptr->first) {
1111                                                 base[curr] = 0;
1112                                                 curr++;
1113                                         }
1114
1115                                         // now, skip over this column
1116                                         curr++;
1117                                         ++XYptr;
1118                                 }
1119
1120                                 // zero out cells after last column
1121                                 while (curr <= seq2Length) {
1122                                         base[curr] = 0;
1123                                         curr++;
1124                                 }
1125                         }
1126
1127                         // save the new posterior matrix
1128                         newSparseMatrices[i][j] = new SparseMatrix(seq1->GetLength(),
1129                                         seq2->GetLength(), posterior);
1130                         newSparseMatrices[j][i] = NULL;
1131
1132                         if (enableVerbose)
1133                                 cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
1134
1135                         delete posteriorPtr;
1136
1137                         if (enableVerbose)
1138                                 cerr << "done." << endl;
1139 #ifndef _OPENMP
1140                 }
1141 #endif
1142         }
1143
1144         return newSparseMatrices;
1145 }
1146
1147 /////////////////////////////////////////////////////////////////
1148 // Relax()
1149 //
1150 // Computes the consistency transformation for a single sequence
1151 // z, and adds the transformed matrix to "posterior".
1152 /////////////////////////////////////////////////////////////////
1153
1154 void MSA::Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY,
1155                 VF &posterior) {
1156
1157         assert(matXZ);
1158         assert(matZY);
1159
1160         int lengthX = matXZ->GetSeq1Length();
1161         int lengthY = matZY->GetSeq2Length();
1162         assert(matXZ->GetSeq2Length() == matZY->GetSeq1Length());
1163
1164         // for every x[i]
1165         for (int i = 1; i <= lengthX; i++) {
1166                 SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
1167                 SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
1168
1169                 VF::iterator base = posterior.begin() + i * (lengthY + 1);
1170
1171                 // iterate through all x[i]-z[k]
1172                 while (XZptr != XZend) {
1173                         SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
1174                         SafeVector<PIF>::iterator ZYend = ZYptr
1175                                         + matZY->GetRowSize(XZptr->first);
1176                         const float XZval = XZptr->second;
1177
1178                         // iterate through all z[k]-y[j]
1179                         while (ZYptr != ZYend) {
1180                                 //base[ZYptr->first] += weight * XZval * ZYptr->second;
1181                                 base[ZYptr->first] += XZval * ZYptr->second;
1182                                 ZYptr++;
1183                         }
1184                         XZptr++;
1185                 }
1186         }
1187 }
1188
1189 /////////////////////////////////////////////////////////////////
1190 // Relax1()
1191 //
1192 // Computes the consistency transformation for a single sequence
1193 // z, and adds the transformed matrix to "posterior".
1194 /////////////////////////////////////////////////////////////////
1195
1196 void MSA::Relax1(float weight, SparseMatrix *matZX, SparseMatrix *matZY,
1197                 VF &posterior) {
1198
1199         assert(matZX);
1200         assert(matZY);
1201
1202         int lengthZ = matZX->GetSeq1Length();
1203         int lengthY = matZY->GetSeq2Length();
1204
1205         // for every z[k]
1206         for (int k = 1; k <= lengthZ; k++) {
1207                 SafeVector<PIF>::iterator ZXptr = matZX->GetRowPtr(k);
1208                 SafeVector<PIF>::iterator ZXend = ZXptr + matZX->GetRowSize(k);
1209
1210                 // iterate through all z[k]-x[i]
1211                 while (ZXptr != ZXend) {
1212                         SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(k);
1213                         SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(k);
1214                         const float ZXval = ZXptr->second;
1215                         VF::iterator base = posterior.begin()
1216                                         + ZXptr->first * (lengthY + 1);
1217
1218                         // iterate through all z[k]-y[j]
1219                         while (ZYptr != ZYend) {
1220                                 //base[ZYptr->first] += weight * ZXval * ZYptr->second;
1221                                 base[ZYptr->first] += ZXval * ZYptr->second;
1222                                 ZYptr++;
1223                         }
1224                         ZXptr++;
1225                 }
1226         }
1227 }
1228 /////////////////////////////////////////////////////////////////
1229 // DoIterativeRefinement()
1230 //
1231 // Performs a single round of randomized partionining iterative
1232 // refinement.
1233 // return 0: successful refinement, 1: ineffective refinement, 2: random problem
1234 /////////////////////////////////////////////////////////////////
1235 int MSA::DoIterativeRefinement(
1236                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1237                 const ProbabilisticModel &model, MultiSequence* &alignment) {
1238         set<int> groupOne, groupTwo;
1239         int numSeqs = alignment->GetNumSequences();
1240         int i;
1241         // create two separate groups
1242         for (i = 0; i < numSeqs; i++) {
1243                 int index = rand();
1244                 if (index % 2) {
1245                         groupOne.insert(i);
1246                 } else {
1247                         groupTwo.insert(i);
1248                 }
1249         }
1250         if (groupOne.empty() || groupTwo.empty()) return 2;
1251
1252         // project into the two groups
1253         MultiSequence *groupOneSeqs = alignment->Project(groupOne);
1254         assert(groupOneSeqs);
1255         MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
1256         assert(groupTwoSeqs);
1257
1258 //start add by Yongtao
1259 #if 1
1260         VF *posterior = model.BuildPosterior (groupOneSeqs, groupTwoSeqs, sparseMatrices, cutoff);
1261 #else
1262         VF *posterior = model.BuildPosterior(getSeqsWeights(), groupOneSeqs, groupTwoSeqs,
1263                         sparseMatrices, cutoff);
1264 #endif
1265         // compute an "accuracy" measure for the alignment before refinement
1266         SafeVector<SafeVector<char>::iterator> oldOnePtrs(groupOne.size());
1267         SafeVector<SafeVector<char>::iterator> oldTwoPtrs(groupTwo.size());
1268         i=0;
1269         for (set<int>::const_iterator iter = groupOne.begin();
1270                         iter != groupOne.end(); ++iter) {
1271                 oldOnePtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr();
1272         }
1273         i=0;
1274         for (set<int>::const_iterator iter = groupTwo.begin();
1275                         iter != groupTwo.end(); ++iter) {
1276                 oldTwoPtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr();
1277         }
1278
1279         VF &posteriorArr = *posterior;
1280         int oldLength = alignment->GetSequence(0)->GetLength();
1281         int groupOneindex=0; int groupTwoindex=0;
1282         float accuracy_before = 0;
1283         int j;
1284         for (i = 1; i <= oldLength; i++) {
1285                 // check to see if there is a gap in every sequence of the set
1286                 bool foundOne = false;
1287                 for (j = 0; !foundOne && j < (int) groupOne.size(); j++)
1288                         foundOne = (oldOnePtrs[j][i] != '-');
1289                 // if not, then this column counts towards the sequence length
1290                 if (foundOne) groupOneindex ++;
1291                 bool foundTwo = false;
1292                 for (j = 0; !foundTwo && j < (int) groupTwo.size(); j++)
1293                         foundTwo = (oldTwoPtrs[j][i] != '-');
1294                 if (foundTwo) groupTwoindex ++;
1295                 if(foundOne && foundTwo) accuracy_before +=
1296                                 posteriorArr[groupOneindex * (groupTwoSeqs->GetSequence(0)->GetLength() + 1) + groupTwoindex];
1297         }
1298
1299         pair<SafeVector<char> *, float> refinealignment;
1300         //perform alignment
1301         refinealignment = model.ComputeAlignment(groupOneSeqs->GetSequence(0)->GetLength(),
1302                         groupTwoSeqs->GetSequence(0)->GetLength(), *posterior);
1303         delete posterior;
1304         // now build final alignment
1305         MultiSequence *result = new MultiSequence();
1306         for (int i = 0; i < groupOneSeqs->GetNumSequences(); i++)
1307                 result->AddSequence(
1308                         groupOneSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'X'));
1309         for (int i = 0; i < groupTwoSeqs->GetNumSequences(); i++)
1310                 result->AddSequence(
1311                         groupTwoSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'Y'));
1312         // free temporary alignment
1313         delete refinealignment.first;
1314         delete alignment;
1315         alignment = result;
1316         delete groupOneSeqs;
1317         delete groupTwoSeqs;
1318         if(accuracy_before == refinealignment.second) return 1;
1319         else return 0;
1320 }
1321
1322
1323 void MSA::DoIterativeRefinementTreeNode(
1324                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1325                 const ProbabilisticModel &model, MultiSequence* &alignment,
1326                 int nodeIndex) {
1327         set<int> groupOne, groupTwo;
1328         int numSeqs = alignment->GetNumSequences();
1329
1330         vector<bool> inGroup1;
1331         inGroup1.resize(numSeqs);
1332         for (int i = 0; i < numSeqs; i++) {
1333                 inGroup1[i] = false;
1334         }
1335
1336         AlignmentOrder* orders = this->tree->getAlignOrders();
1337         AlignmentOrder* order = &orders[nodeIndex];
1338         for (int i = 0; i < order->leftNum; i++) {
1339                 int si = order->leftLeafs[i];
1340                 inGroup1[si] = true;
1341         }
1342         for (int i = 0; i < order->rightNum; i++) {
1343                 int si = order->rightLeafs[i];
1344                 inGroup1[si] = true;
1345         }
1346         // create two separate groups
1347         for (int i = 0; i < numSeqs; i++) {
1348                 if (inGroup1[i]) {
1349                         groupOne.insert(i);
1350                 } else {
1351                         groupTwo.insert(i);
1352                 }
1353         }
1354         if (groupOne.empty() || groupTwo.empty())
1355                 return;
1356
1357         // project into the two groups
1358         MultiSequence *groupOneSeqs = alignment->Project(groupOne);
1359         assert(groupOneSeqs);
1360         MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
1361         assert(groupTwoSeqs);
1362         delete alignment;
1363
1364         // realign
1365         alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices,
1366                         model);
1367
1368         delete groupOneSeqs;
1369         delete groupTwoSeqs;
1370 }
1371
1372 /////////////////////////////////////////////////////////////////
1373 // WriteAnnotation()
1374 //
1375 // Computes annotation for multiple alignment and write values
1376 // to a file.
1377 /////////////////////////////////////////////////////////////////
1378
1379 void MSA::WriteAnnotation(MultiSequence *alignment,
1380                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1381         ofstream outfile(annotationFilename.c_str());
1382
1383         if (outfile.fail()) {
1384                 cerr << "ERROR: Unable to write annotation file." << endl;
1385                 exit(1);
1386         }
1387
1388         const int alignLength = alignment->GetSequence(0)->GetLength();
1389         const int numSeqs = alignment->GetNumSequences();
1390
1391         SafeVector<int> position(numSeqs, 0);
1392         SafeVector<SafeVector<char>::iterator> seqs(numSeqs);
1393         for (int i = 0; i < numSeqs; i++)
1394                 seqs[i] = alignment->GetSequence(i)->GetDataPtr();
1395         SafeVector<pair<int, int> > active;
1396         active.reserve(numSeqs);
1397
1398         SafeVector<int> lab;
1399         for (int i = 0; i < numSeqs; i++)
1400                 lab.push_back(alignment->GetSequence(i)->GetSortLabel());
1401
1402         // for every column
1403         for (int i = 1; i <= alignLength; i++) {
1404
1405                 // find all aligned residues in this particular column
1406                 active.clear();
1407                 for (int j = 0; j < numSeqs; j++) {
1408                         if (seqs[j][i] != '-') {
1409                                 active.push_back(make_pair(lab[j], ++position[j]));
1410                         }
1411                 }
1412
1413                 sort(active.begin(), active.end());
1414                 outfile << setw(4) << ComputeScore(active, sparseMatrices) << endl;
1415         }
1416
1417         outfile.close();
1418 }
1419
1420 /////////////////////////////////////////////////////////////////
1421 // ComputeScore()
1422 //
1423 // Computes the annotation score for a particular column.
1424 /////////////////////////////////////////////////////////////////
1425
1426 int MSA::ComputeScore(const SafeVector<pair<int, int> > &active,
1427                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1428
1429         if (active.size() <= 1)
1430                 return 0;
1431
1432         // ALTERNATIVE #1: Compute the average alignment score.
1433
1434         float val = 0;
1435         for (int i = 0; i < (int) active.size(); i++) {
1436                 for (int j = i + 1; j < (int) active.size(); j++) {
1437                         val += sparseMatrices[active[i].first][active[j].first]->GetValue(
1438                                         active[i].second, active[j].second);
1439                 }
1440         }
1441
1442         return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));
1443
1444 }
1445
1446 /////////////////////////////////////////////////////////////////
1447 // ComputeSimilarity ()
1448 //
1449 // Computes the average similarity for a particular family.
1450 // extreme low similarity(<=25%) return 0
1451 // low similarity(<=40%) return 1
1452 // high similarity(<=70%) return 2
1453 // extreme high similarity(>70%) return 3
1454 /////////////////////////////////////////////////////////////////
1455 int MSA::AdjustmentTest(MultiSequence *sequences,const ProbabilisticModel &model){
1456         assert(sequences);
1457
1458         //get the number of sequences
1459         const int numSeqs = sequences->GetNumSequences();
1460         //average identity for all sequences
1461         float identity = 0;
1462
1463 #ifdef _OPENMP
1464         //calculate sequence pairs for openmp model
1465         int pairIdx = 0;
1466         numPairs = (numSeqs - 1) * numSeqs / 2;
1467         seqsPairs = new SeqsPair[numPairs];
1468         for(int a = 0; a < numSeqs; a++) {
1469                 for(int b = a + 1; b < numSeqs; b++) {
1470                         seqsPairs[pairIdx].seq1 = a;
1471                         seqsPairs[pairIdx].seq2 = b;
1472                         pairIdx++;
1473                 }
1474         }
1475 #endif
1476
1477         // do all pairwise alignments for family similarity
1478 #ifdef _OPENMP
1479 #pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
1480         for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
1481                 int a= seqsPairs[pairIdx].seq1;
1482                 int b = seqsPairs[pairIdx].seq2;
1483                 if(enableVerbose) {
1484 #pragma omp critical
1485                         cerr <<"tid "<<omp_get_thread_num()<<" a "<<a<<" b "<<b<<endl;
1486                 }
1487 #else
1488         for (int a = 0; a < numSeqs - 1; a++) {
1489                 for (int b = a + 1; b < numSeqs; b++) {
1490 #endif
1491                         Sequence *seq1 = sequences->GetSequence(a);
1492                         Sequence *seq2 = sequences->GetSequence(b);
1493                         pair<SafeVector<char> *, float> alignment = model.ComputeViterbiAlignment(seq1,seq2);
1494                         SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
1495                         SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
1496
1497                         float N_correct_match = 0;
1498                         //float N_alignment = 0;
1499                         int i = 1;int j = 1;
1500                         for (SafeVector<char>::iterator iter = alignment.first->begin();
1501                                 iter != alignment.first->end(); ++iter){
1502                                 //N_alignment += 1;
1503                                 if (*iter == 'B'){
1504                                         unsigned char c1 = (unsigned char) iter1[i++];
1505                                         unsigned char c2 = (unsigned char) iter2[j++];
1506                                         if(c1==c2) N_correct_match += 1;
1507                                 }
1508                                 else if(*iter == 'X') i++;
1509                                 else if(*iter == 'Y') j++;
1510                         }
1511                         if(i!= seq1->GetLength()+1 || j!= seq2->GetLength() + 1 ) cerr << "similarity error"<< endl;
1512                         identity += N_correct_match / alignment.first->size();
1513                         delete alignment.first;
1514 #ifndef _OPENMP
1515                 }
1516 #endif
1517         }
1518         identity /= numPairs;
1519 /*
1520 FILE *fi = fopen ("accuracy", "a");
1521 fprintf (fi, " %.10f ", similarity); fprintf (fi, "\n");
1522 fclose (fi);
1523 */
1524
1525         //adapative
1526         if( identity <= 0.15 ) initDistrib[2] = 0.143854;
1527         else if( identity <= 0.2 ) initDistrib[2] = 0.191948;
1528         else if( identity <= 0.25 ) initDistrib[2] = 0.170705;
1529         else if( identity <= 0.3 ) initDistrib[2] = 0.100675;
1530         else if( identity <= 0.35 ) initDistrib[2] = 0.090755;
1531         else if( identity <= 0.4 ) initDistrib[2] = 0.146188;
1532         else if( identity <= 0.45 ) initDistrib[2] = 0.167858;
1533         else if( identity <= 0.5) initDistrib[2] = 0.250769;
1534
1535
1536         if( identity <= 0.25 ) return 0;
1537         else if( identity <= 0.4) return 1;
1538         else if( identity <= 0.7) return 2;
1539         else return 3;
1540
1541 }