binaries/src/GLProbs-1.0/MSA2.cpp

   1 /***********************************************
   2  * # Copyright 2009-2010. Liu Yongchao
   3  * # Contact: Liu Yongchao, School of Computer Engineering,
   4  * #                     Nanyang Technological University.
   5  * # Emails:     liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
   6  * #
   7  * # GPL version 3.0 applies.
   8  * #
   9  * ************************************************/
  10
  11 #include <string>
  12 #include <sstream>
  13 #include <iomanip>
  14 #include <iostream>
  15 #include <list>
  16 #include <set>
  17 #include <algorithm>
  18 #include <climits>
  19 #include <cstdio>
  20 #include <cstdlib>
  21 #include <cerrno>
  22 #include <iomanip>
  23 #include "MSA.h"
  24 #include "MSAClusterTree.h"
  25 #include "Defaults.h"
  26
  27 #ifdef _OPENMP
  28 #include <omp.h>
  29 #endif
  30
  31 string parametersInputFilename = "";
  32 string parametersOutputFilename = "no training";
  33 string annotationFilename = "";
  34
  35 bool enableVerbose = false;
  36 bool enableAnnotation = false;
  37 bool enableClustalWOutput = false;
  38 bool enableAlignOrder = false;
  39 int numConsistencyReps = 2;
  40 int numPreTrainingReps = 0;
  41 int numIterativeRefinementReps = 100;
  42
  43 float cutoff = 0;
  44
  45 VF initDistrib(NumMatrixTypes);
  46 VF gapOpen(2 * NumInsertStates);
  47 VF gapExtend(2 * NumInsertStates);
  48 VVF emitPairs(256, VF(256, 1e-10));
  49 VF emitSingle(256, 1e-5);
  50
  51 string alphabet = alphabetDefault;
  52
  53 const int MIN_PRETRAINING_REPS = 0;
  54 const int MAX_PRETRAINING_REPS = 20;
  55 const int MIN_CONSISTENCY_REPS = 0;
  56 const int MAX_CONSISTENCY_REPS = 5;
  57 const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
  58 const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
  59
  60 string posteriorProbsFilename = "";
  61 bool allscores = true;
  62 string infilename;
  63
  64 int flag_gui = 0;   //0: no gui related o/p
  65 //1: gui related o/p generated
  66 int flag_ppscore = 0; //0: no pp score sequence added to o/p fasta alignment
  67 //1: pp score seq added to o/p fasta alignment
  68
  69 ///////////////////////////////
  70 // global scoring matrix variables
  71 //////////////////////////////
  72 float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2;
  73 char *aminos, *bases, matrixtype[20] = "gonnet_160";
  74 int subst_index[26];
  75
  76 double sub_matrix[26][26];
  77 double normalized_matrix[26][26];// add by YE Yongtao
  78 int firstread = 0;              //this makes sure that matrices are read only once
  79
  80 float TEMPERATURE = 5;
  81 int MATRIXTYPE = 160;
  82 int prot_nuc = 0;               //0=prot, 1=nucleotide
  83
  84 float GAPOPEN = 0;
  85 float GAPEXT = 0;
  86 int numThreads = 0;
  87
  88 //argument support
  89 typedef struct {
  90         char input[30];
  91         int matrix;
  92         int N;
  93         float T;
  94         float beta;
  95         char opt;                       //can be 'P' or 'M'
  96         float gapopen;
  97         float gapext;
  98 } argument_decl;
  99
 100 argument_decl argument;
 101
 102 extern inline void read_sustitution_matrix(char *fileName);
 103 extern void setmatrixtype(int le);
 104 extern inline int matrixtype_to_int();
 105 extern inline void read_dna_matrix();
 106 extern inline void read_vtml_la_matrix();
 107 extern void init_arguments();
 108
 109 MSA::MSA(int argc, char* argv[]) {
 110         //parse program parameters
 111         SafeVector<string> sequenceNames = ParseParams(argc, argv);
 112
 113         //initialize arguments for partition function
 114         init_arguments();
 115
 116         ReadParameters();
 117         //PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
 118
 119         //read the input sequences
 120         MultiSequence *sequences = new MultiSequence();
 121         assert(sequences);
 122         for (int i = 0; i < (int) sequenceNames.size(); i++) {
 123                 cerr << "Loading sequence file: " << sequenceNames[i] << endl;
 124                 sequences->LoadMFA(sequenceNames[i], true);
 125         }
 126         //allocate space for sequence weights
 127         this->seqsWeights = new int[sequences->GetNumSequences()];
 128         //initilaize parameters for OPENMP
 129 #ifdef _OPENMP
 130         if(numThreads <= 0) {
 131                 numThreads = omp_get_num_procs();
 132                 cerr << "Automatically detected " << numThreads << " CPU cores" << endl;
 133         }
 134         cerr <<"Enabling OpenMP (with "<<numThreads<<" threads)"<<endl;
 135
 136         //set OpenMP to use dynamic number of threads which is equal to the number of processor cores on the host
 137         omp_set_num_threads(numThreads);
 138 #endif
 139 FILE *fi = fopen ("accuracy", "a");
 140 fprintf (fi, "%s ", argv[1]);
 141 fclose (fi);
 142
 143         int levelid = ComputeSimilarity (sequences,ProbabilisticModel(initDistrib, gapOpen, gapExtend, emitPairs,emitSingle));
 144 /*
 145         // now, we can perform the alignments and write them out
 146         MultiSequence *alignment = doAlign(sequences,
 147                         ProbabilisticModel(initDistrib, gapOpen, gapExtend, emitPairs,
 148                                         emitSingle), levelid);
 149
 150         //write the alignment results to standard output
 151         if (enableClustalWOutput) {
 152                 alignment->WriteALN(*alignOutFile);
 153         } else {
 154                 alignment->WriteMFA(*alignOutFile);
 155         }
 156         //release resources
 157         delete[] this->seqsWeights;
 158         delete alignment;
 159 */
 160         delete sequences;
 161 }
 162 MSA::~MSA() {
 163         /*close the output file*/
 164         if (alignOutFileName.length() > 0) {
 165                 ((std::ofstream*) alignOutFile)->close();
 166         }
 167 }
 168 /////////////////////////////////////////////////////////////////
 169 // PrintParameters()
 170 //
 171 // Prints MSAPROBS parameters to STDERR.  If a filename is
 172 // specified, then the parameters are also written to the file.
 173 /////////////////////////////////////////////////////////////////
 174
 175 void MSA::PrintParameters(const char *message, const VF &initDistrib,
 176                 const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs,
 177                 const VF &emitSingle, const char *filename) {
 178
 179         // print parameters to the screen
 180         cerr << message << endl << "    initDistrib[] = { ";
 181         for (int i = 0; i < NumMatrixTypes; i++)
 182                 cerr << setprecision(10) << initDistrib[i] << " ";
 183         cerr << "}" << endl << "        gapOpen[] = { ";
 184         for (int i = 0; i < NumInsertStates * 2; i++)
 185                 cerr << setprecision(10) << gapOpen[i] << " ";
 186         cerr << "}" << endl << "      gapExtend[] = { ";
 187         for (int i = 0; i < NumInsertStates * 2; i++)
 188                 cerr << setprecision(10) << gapExtend[i] << " ";
 189         cerr << "}" << endl << endl;
 190
 191         // if a file name is specified
 192         if (filename) {
 193
 194                 // attempt to open the file for writing
 195                 FILE *file = fopen(filename, "w");
 196                 if (!file) {
 197                         cerr << "ERROR: Unable to write parameter file: " << filename
 198                                         << endl;
 199                         exit(1);
 200                 }
 201
 202                 // if successful, then write the parameters to the file
 203                 for (int i = 0; i < NumMatrixTypes; i++)
 204                         fprintf(file, "%.10f ", initDistrib[i]);
 205                 fprintf(file, "\n");
 206                 for (int i = 0; i < 2 * NumInsertStates; i++)
 207                         fprintf(file, "%.10f ", gapOpen[i]);
 208                 fprintf(file, "\n");
 209                 for (int i = 0; i < 2 * NumInsertStates; i++)
 210                         fprintf(file, "%.10f ", gapExtend[i]);
 211                 fprintf(file, "\n");
 212                 fprintf(file, "%s\n", alphabet.c_str());
 213                 for (int i = 0; i < (int) alphabet.size(); i++) {
 214                         for (int j = 0; j <= i; j++)
 215                                 fprintf(file, "%.10f ",
 216                                                 emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);
 217                         fprintf(file, "\n");
 218                 }
 219                 for (int i = 0; i < (int) alphabet.size(); i++)
 220                         fprintf(file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);
 221                 fprintf(file, "\n");
 222                 fclose(file);
 223         }
 224 }
 225
 226 /////////////////////////////////////////////////////////////////
 227 // doAlign()
 228 //
 229 // First computes all pairwise posterior probability matrices.
 230 // Then, computes new parameters if training, or a final
 231 // alignment, otherwise.
 232 /////////////////////////////////////////////////////////////////
 233 extern VF *ComputePostProbs(int a, int b, string seq1, string seq2);
 234 MultiSequence* MSA::doAlign(MultiSequence *sequences,
 235                 const ProbabilisticModel &model, int levelid) {
 236         assert(sequences);
 237
 238         //get the number of sequences
 239         const int numSeqs = sequences->GetNumSequences();
 240         //create distance matrix
 241         VVF distances(numSeqs, VF(numSeqs, 0));
 242         //creat sparseMatrices
 243         SafeVector<SafeVector<SparseMatrix *> > sparseMatrices(numSeqs,
 244                         SafeVector<SparseMatrix *>(numSeqs, NULL));
 245
 246 #ifdef _OPENMP
 247         //calculate sequence pairs for openmp model
 248         int pairIdx = 0;
 249         numPairs = (numSeqs - 1) * numSeqs / 2;
 250         seqsPairs = new SeqsPair[numPairs];
 251         for(int a = 0; a < numSeqs; a++) {
 252                 for(int b = a + 1; b < numSeqs; b++) {
 253                         seqsPairs[pairIdx].seq1 = a;
 254                         seqsPairs[pairIdx].seq2 = b;
 255                         pairIdx++;
 256                 }
 257         }
 258 #endif
 259         // do all pairwise alignments for posterior probability matrices
 260 #ifdef _OPENMP
 261 #pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
 262         for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
 263                 int a= seqsPairs[pairIdx].seq1;
 264                 int b = seqsPairs[pairIdx].seq2;
 265                 if(enableVerbose) {
 266 #pragma omp critical
 267                         cerr <<"tid "<<omp_get_thread_num()<<" a "<<a<<" b "<<b<<endl;
 268                 }
 269 #else
 270         for (int a = 0; a < numSeqs - 1; a++) {
 271                 for (int b = a + 1; b < numSeqs; b++) {
 272 #endif
 273                         Sequence *seq1 = sequences->GetSequence(a);
 274                         Sequence *seq2 = sequences->GetSequence(b);
 275
 276                         //posterior probability matrix
 277                         VF* posterior;
 278
 279
 280 //high similarity use global model
 281                         //if(levelid == 2)
 282                         if(1) posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString());
 283
 284 /*
 285 //low similarity use local model
 286                         else if(levelid == 1){
 287                                 VF *forward = model.ComputeForwardMatrix(seq1, seq2,false);
 288                                 assert(forward);
 289                                 VF *backward = model.ComputeBackwardMatrix(seq1, seq2,false);
 290                                 assert(backward);
 291                                 posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false);
 292                                 delete forward;
 293                                 delete backward;
 294                         }
 295
 296 //extreme low or extreme high similarity use combined model
 297                         else{
 298
 299 //probcons
 300                                 // compute forward and backward probabilities
 301                                 VF *forward = model.ComputeForwardMatrix(seq1, seq2);
 302                                 assert(forward);
 303                                 VF *backward = model.ComputeBackwardMatrix(seq1, seq2);
 304                                 assert(backward);
 305                                 // compute posterior probability matrix from HMM
 306                                 VF *probcons_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward);
 307                                 assert(probcons_posterior);
 308                                 delete forward;
 309                                 delete backward;
 310
 311 //probalign
 312                                 VF *probalign_posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString());
 313                                 assert(probalign_posterior);
 314 //local
 315                                 forward = model.ComputeForwardMatrix(seq1, seq2,false);
 316                                 assert(forward);
 317                                 backward = model.ComputeBackwardMatrix(seq1, seq2,false);
 318                                 assert(backward);
 319                                 posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false);
 320                                 assert(posterior);
 321                                 delete forward;
 322                                 delete backward;
 323 //combined model
 324                                 //merge probalign + local + probcons
 325                                 VF::iterator ptr1 = probcons_posterior->begin();
 326                                 VF::iterator ptr2 = probalign_posterior->begin();
 327                                 VF::iterator ptr = posterior->begin();
 328                                 for (int i = 0; i <= seq1->GetLength(); i++) {
 329                                         for (int j = 0; j <= seq2->GetLength(); j++) {
 330                                                 float v1 = *ptr1;
 331                                                 float v2 = *ptr2;
 332                                                 float v3 = *ptr;
 333                                                 *ptr = sqrt((v1*v1 + v2*v2 + v3*v3)/3);
 334                                                 ptr1++;
 335                                                 ptr2++;
 336                                                 ptr++;
 337                                         }
 338                                 }
 339                                 delete probcons_posterior;
 340                                 delete probalign_posterior;
 341                         }
 342 */
 343                         assert(posterior);
 344                         // perform the pairwise sequence alignment
 345                         pair<SafeVector<char> *, float> alignment = model.ComputeAlignment(
 346                                         seq1->GetLength(), seq2->GetLength(), *posterior);
 347
 348                         //compute expected accuracy
 349                         distances[a][b] = distances[b][a] = 1.0f - alignment.second
 350                                         / min(seq1->GetLength(), seq2->GetLength());
 351
 352                         // compute sparse representations
 353                         sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),
 354                         seq2->GetLength(), *posterior);
 355                         sparseMatrices[b][a] = NULL;
 356
 357                         delete posterior;
 358                         delete alignment.first;
 359 #ifndef _OPENMP
 360                 }
 361 #endif
 362         }
 363
 364         //create the guide tree
 365         this->tree = new MSAClusterTree(this, distances, numSeqs);
 366         this->tree->create();
 367
 368         // perform the consistency transformation the desired number of times
 369         float* fweights = new float[numSeqs];
 370         for (int r = 0; r < numSeqs; r++) {
 371                 fweights[r] = ((float) seqsWeights[r]) / INT_MULTIPLY;
 372                 fweights[r] *= 10;
 373         }
 374         for (int r = 0; r < numConsistencyReps; r++) {
 375                 SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices =
 376                                 DoRelaxation(fweights, sequences, sparseMatrices);
 377
 378                 // now replace the old posterior matrices
 379                 for (int i = 0; i < numSeqs; i++) {
 380                         for (int j = 0; j < numSeqs; j++) {
 381                                 delete sparseMatrices[i][j];
 382                                 sparseMatrices[i][j] = newSparseMatrices[i][j];
 383                         }
 384                 }
 385         }
 386         delete[] fweights;
 387 #ifdef _OPENMP
 388         delete [] seqsPairs;
 389 #endif
 390
 391         //compute the final multiple sequence alignment
 392         MultiSequence *finalAlignment = ComputeFinalAlignment(this->tree, sequences,
 393                         sparseMatrices, model);
 394
 395         // build annotation
 396         if (enableAnnotation) {
 397                 WriteAnnotation(finalAlignment, sparseMatrices);
 398         }
 399         //destroy the guide tree
 400         delete this->tree;
 401         this->tree = 0;
 402
 403         // delete sparse matrices
 404         for (int a = 0; a < numSeqs - 1; a++) {
 405                 for (int b = a + 1; b < numSeqs; b++) {
 406                         delete sparseMatrices[a][b];
 407                         delete sparseMatrices[b][a];
 408                 }
 409         }
 410
 411         return finalAlignment;
 412 }
 413
 414 /////////////////////////////////////////////////////////////////
 415 // GetInteger()
 416 //
 417 // Attempts to parse an integer from the character string given.
 418 // Returns true only if no parsing error occurs.
 419 /////////////////////////////////////////////////////////////////
 420
 421 bool GetInteger(char *data, int *val) {
 422         char *endPtr;
 423         long int retVal;
 424
 425         assert(val);
 426
 427         errno = 0;
 428         retVal = strtol(data, &endPtr, 0);
 429         if (retVal == 0 && (errno != 0 || data == endPtr))
 430                 return false;
 431         if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN))
 432                 return false;
 433         if (retVal < (long) INT_MIN || retVal > (long) INT_MAX)
 434                 return false;
 435         *val = (int) retVal;
 436         return true;
 437 }
 438
 439 /////////////////////////////////////////////////////////////////
 440 // GetFloat()
 441 //
 442 // Attempts to parse a float from the character string given.
 443 // Returns true only if no parsing error occurs.
 444 /////////////////////////////////////////////////////////////////
 445
 446 bool GetFloat(char *data, float *val) {
 447         char *endPtr;
 448         double retVal;
 449
 450         assert(val);
 451
 452         errno = 0;
 453         retVal = strtod(data, &endPtr);
 454         if (retVal == 0 && (errno != 0 || data == endPtr))
 455                 return false;
 456         if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0))
 457                 return false;
 458         *val = (float) retVal;
 459         return true;
 460 }
 461
 462 /////////////////////////////////////////////////////////////////
 463 // ReadParameters()
 464 //
 465 // Read initial distribution, transition, and emission
 466 // parameters from a file.
 467 /////////////////////////////////////////////////////////////////
 468
 469 void MSA::ReadParameters() {
 470
 471         ifstream data;
 472
 473         emitPairs = VVF(256, VF(256, 1e-10));
 474         emitSingle = VF(256, 1e-5);
 475
 476         // read initial state distribution and transition parameters
 477         if (parametersInputFilename == string("")) {
 478                 if (NumInsertStates == 1) {
 479                         for (int i = 0; i < NumMatrixTypes; i++)
 480                                 initDistrib[i] = initDistrib1Default[i];
 481                         for (int i = 0; i < 2 * NumInsertStates; i++)
 482                                 gapOpen[i] = gapOpen1Default[i];
 483                         for (int i = 0; i < 2 * NumInsertStates; i++)
 484                                 gapExtend[i] = gapExtend1Default[i];
 485                 } else if (NumInsertStates == 2) {
 486                         for (int i = 0; i < NumMatrixTypes; i++)
 487                                 initDistrib[i] = initDistrib2Default[i];
 488                         for (int i = 0; i < 2 * NumInsertStates; i++)
 489                                 gapOpen[i] = gapOpen2Default[i];
 490                         for (int i = 0; i < 2 * NumInsertStates; i++)
 491                                 gapExtend[i] = gapExtend2Default[i];
 492                 } else {
 493                         cerr
 494                                         << "ERROR: No default initial distribution/parameter settings exist"
 495                                         << endl << "       for " << NumInsertStates
 496                                         << " pairs of insert states.  Use --paramfile." << endl;
 497                         exit(1);
 498                 }
 499
 500                 alphabet = alphabetDefault;
 501
 502                 for (int i = 0; i < (int) alphabet.length(); i++) {
 503                         emitSingle[(unsigned char) tolower(alphabet[i])] =
 504                                         emitSingleDefault[i];
 505                         emitSingle[(unsigned char) toupper(alphabet[i])] =
 506                                         emitSingleDefault[i];
 507                         for (int j = 0; j <= i; j++) {
 508                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(
 509                                                 alphabet[j])] = emitPairsDefault[i][j];
 510                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(
 511                                                 alphabet[j])] = emitPairsDefault[i][j];
 512                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(
 513                                                 alphabet[j])] = emitPairsDefault[i][j];
 514                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(
 515                                                 alphabet[j])] = emitPairsDefault[i][j];
 516                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(
 517                                                 alphabet[i])] = emitPairsDefault[i][j];
 518                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(
 519                                                 alphabet[i])] = emitPairsDefault[i][j];
 520                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(
 521                                                 alphabet[i])] = emitPairsDefault[i][j];
 522                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(
 523                                                 alphabet[i])] = emitPairsDefault[i][j];
 524                         }
 525                 }
 526         } else {
 527                 data.open(parametersInputFilename.c_str());
 528                 if (data.fail()) {
 529                         cerr << "ERROR: Unable to read parameter file: "
 530                                         << parametersInputFilename << endl;
 531                         exit(1);
 532                 }
 533
 534                 string line[3];
 535                 for (int i = 0; i < 3; i++) {
 536                         if (!getline(data, line[i])) {
 537                                 cerr
 538                                                 << "ERROR: Unable to read transition parameters from parameter file: "
 539                                                 << parametersInputFilename << endl;
 540                                 exit(1);
 541                         }
 542                 }
 543                 istringstream data2;
 544                 data2.clear();
 545                 data2.str(line[0]);
 546                 for (int i = 0; i < NumMatrixTypes; i++)
 547                         data2 >> initDistrib[i];
 548                 data2.clear();
 549                 data2.str(line[1]);
 550                 for (int i = 0; i < 2 * NumInsertStates; i++)
 551                         data2 >> gapOpen[i];
 552                 data2.clear();
 553                 data2.str(line[2]);
 554                 for (int i = 0; i < 2 * NumInsertStates; i++)
 555                         data2 >> gapExtend[i];
 556
 557                 if (!getline(data, line[0])) {
 558                         cerr << "ERROR: Unable to read alphabet from scoring matrix file: "
 559                                         << parametersInputFilename << endl;
 560                         exit(1);
 561                 }
 562
 563                 // read alphabet as concatenation of all characters on alphabet line
 564                 alphabet = "";
 565                 string token;
 566                 data2.clear();
 567                 data2.str(line[0]);
 568                 while (data2 >> token)
 569                         alphabet += token;
 570
 571                 for (int i = 0; i < (int) alphabet.size(); i++) {
 572                         for (int j = 0; j <= i; j++) {
 573                                 float val;
 574                                 data >> val;
 575                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(
 576                                                 alphabet[j])] = val;
 577                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(
 578                                                 alphabet[j])] = val;
 579                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(
 580                                                 alphabet[j])] = val;
 581                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(
 582                                                 alphabet[j])] = val;
 583                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(
 584                                                 alphabet[i])] = val;
 585                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(
 586                                                 alphabet[i])] = val;
 587                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(
 588                                                 alphabet[i])] = val;
 589                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(
 590                                                 alphabet[i])] = val;
 591                         }
 592                 }
 593
 594                 for (int i = 0; i < (int) alphabet.size(); i++) {
 595                         float val;
 596                         data >> val;
 597                         emitSingle[(unsigned char) tolower(alphabet[i])] = val;
 598                         emitSingle[(unsigned char) toupper(alphabet[i])] = val;
 599                 }
 600                 data.close();
 601         }
 602 }
 603
 604 /////////////////////////////////////////////////////////////////
 605 // ParseParams()
 606 //
 607 // Parse all command-line options.
 608 /////////////////////////////////////////////////////////////////
 609 void MSA::printUsage() {
 610         cerr
 611                         << "************************************************************************"
 612                         << endl
 613                         << "\tMSAPROBS is a open-source protein multiple sequence alignment algorithm"
 614                         << endl
 615                         << "\tbased on pair hidden markov model and partition function postirior"
 616                         << endl
 617                         << "\tprobabilities. If any comments or problems, please contact"
 618                         << endl
 619                         << "\tLiu Yongchao(liuy0039@ntu.edu.sg or nkcslyc@hotmail.com)"
 620                         << endl
 621                         << "*************************************************************************"
 622                         << endl << "Usage:" << endl
 623                         << "       msaprobs [OPTION]... [infile]..." << endl << endl
 624                         << "Description:" << endl
 625                         << "       Align sequences in multi-FASTA format" << endl << endl
 626                         << "       -o, --outfile <string>" << endl
 627                         << "              specify the output file name (STDOUT by default)"
 628                         << endl << "       -num_threads <integer>" << endl
 629                         << "              specify the number of threads used, and otherwise detect automatically"
 630                         << endl << "       -clustalw" << endl
 631                         << "              use CLUSTALW output format instead of FASTA format"
 632                         << endl << endl << "       -c, --consistency REPS" << endl
 633                         << "              use " << MIN_CONSISTENCY_REPS << " <= REPS <= "
 634                         << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps
 635                         << ") passes of consistency transformation" << endl << endl
 636                         << "       -ir, --iterative-refinement REPS" << endl
 637                         << "              use " << MIN_ITERATIVE_REFINEMENT_REPS
 638                         << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS << " (default: "
 639                         << numIterativeRefinementReps << ") passes of iterative-refinement"
 640                         << endl << endl << "       -v, --verbose" << endl
 641                         << "              report progress while aligning (default: "
 642                         << (enableVerbose ? "on" : "off") << ")" << endl << endl
 643                         << "       -annot FILENAME" << endl
 644                         << "              write annotation for multiple alignment to FILENAME"
 645                         << endl << endl << "       -a, --alignment-order" << endl
 646                         << "              print sequences in alignment order rather than input order (default: "
 647                         << (enableAlignOrder ? "on" : "off") << ")" << endl
 648                         << "       -version " << endl
 649                         << "              print out version of MSAPROBS " << endl << endl;
 650 }
 651 SafeVector<string> MSA::ParseParams(int argc, char **argv) {
 652         if (argc < 2) {
 653                 printUsage();
 654                 exit(1);
 655         }
 656         SafeVector<string> sequenceNames;
 657         int tempInt;
 658         float tempFloat;
 659
 660         for (int i = 1; i < argc; i++) {
 661                 if (argv[i][0] == '-') {
 662                         //help
 663                         if (!strcmp(argv[i], "-help") || !strcmp(argv[i], "-?")) {
 664                                 printUsage();
 665                                 exit(1);
 666                                 //output file name
 667                         } else if (!strcmp(argv[i], "-o")
 668                                         || !strcmp(argv[i], "--outfile")) {
 669                                 if (i < argc - 1) {
 670                                         alignOutFileName = argv[++i];   //get the file name
 671                                 } else {
 672                                         cerr << "ERROR: String expected for option " << argv[i]
 673                                                         << endl;
 674                                         exit(1);
 675                                 }
 676                                 // parameter file
 677                         } else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){
 678                                 if (i < argc - 1)
 679                                         parametersInputFilename = string (argv[++i]);
 680                                 else {
 681                                         cerr << "ERROR: Filename expected for option " << argv[i] << endl;
 682                                         exit (1);
 683                                 }
 684                                 //number of threads used
 685                         } else if (!strcmp(argv[i], "-p")
 686                                         || !strcmp(argv[i], "-num_threads")) {
 687                                 if (i < argc - 1) {
 688                                         if (!GetInteger(argv[++i], &tempInt)) {
 689                                                 cerr << " ERROR: invalid integer following option "
 690                                                                 << argv[i - 1] << ": " << argv[i] << endl;
 691                                                 exit(1);
 692                                         } else {
 693                                                 if (tempInt < 0) {
 694                                                         tempInt = 0;
 695                                                 }
 696                                                 numThreads = tempInt;
 697                                         }
 698                                 } else {
 699                                         cerr << "ERROR: Integer expected for option " << argv[i]
 700                                                         << endl;
 701                                         exit(1);
 702                                 }
 703                                 // number of consistency transformations
 704                         } else if (!strcmp(argv[i], "-c")
 705                                         || !strcmp(argv[i], "--consistency")) {
 706                                 if (i < argc - 1) {
 707                                         if (!GetInteger(argv[++i], &tempInt)) {
 708                                                 cerr << "ERROR: Invalid integer following option "
 709                                                                 << argv[i - 1] << ": " << argv[i] << endl;
 710                                                 exit(1);
 711                                         } else {
 712                                                 if (tempInt < MIN_CONSISTENCY_REPS
 713                                                                 || tempInt > MAX_CONSISTENCY_REPS) {
 714                                                         cerr << "ERROR: For option " << argv[i - 1]
 715                                                                         << ", integer must be between "
 716                                                                         << MIN_CONSISTENCY_REPS << " and "
 717                                                                         << MAX_CONSISTENCY_REPS << "." << endl;
 718                                                         exit(1);
 719                                                 } else {
 720                                                         numConsistencyReps = tempInt;
 721                                                 }
 722                                         }
 723                                 } else {
 724                                         cerr << "ERROR: Integer expected for option " << argv[i]
 725                                                         << endl;
 726                                         exit(1);
 727                                 }
 728                         }
 729
 730                         // number of randomized partitioning iterative refinement passes
 731                         else if (!strcmp(argv[i], "-ir")
 732                                         || !strcmp(argv[i], "--iterative-refinement")) {
 733                                 if (i < argc - 1) {
 734                                         if (!GetInteger(argv[++i], &tempInt)) {
 735                                                 cerr << "ERROR: Invalid integer following option "
 736                                                                 << argv[i - 1] << ": " << argv[i] << endl;
 737                                                 exit(1);
 738                                         } else {
 739                                                 if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS
 740                                                                 || tempInt > MAX_ITERATIVE_REFINEMENT_REPS) {
 741                                                         cerr << "ERROR: For option " << argv[i - 1]
 742                                                                         << ", integer must be between "
 743                                                                         << MIN_ITERATIVE_REFINEMENT_REPS << " and "
 744                                                                         << MAX_ITERATIVE_REFINEMENT_REPS << "."
 745                                                                         << endl;
 746                                                         exit(1);
 747                                                 } else
 748                                                         numIterativeRefinementReps = tempInt;
 749                                         }
 750                                 } else {
 751                                         cerr << "ERROR: Integer expected for option " << argv[i]
 752                                                         << endl;
 753                                         exit(1);
 754                                 }
 755                         }
 756
 757                         // annotation files
 758                         else if (!strcmp(argv[i], "-annot")) {
 759                                 enableAnnotation = true;
 760                                 if (i < argc - 1) {
 761                                         annotationFilename = argv[++i];
 762                                 } else {
 763                                         cerr << "ERROR: FILENAME expected for option " << argv[i]
 764                                                         << endl;
 765                                         exit(1);
 766                                 }
 767                         }
 768
 769                         // clustalw output format
 770                         else if (!strcmp(argv[i], "-clustalw")) {
 771                                 enableClustalWOutput = true;
 772                         }
 773
 774                         // cutoff
 775                         else if (!strcmp(argv[i], "-co") || !strcmp(argv[i], "--cutoff")) {
 776                                 if (i < argc - 1) {
 777                                         if (!GetFloat(argv[++i], &tempFloat)) {
 778                                                 cerr
 779                                                                 << "ERROR: Invalid floating-point value following option "
 780                                                                 << argv[i - 1] << ": " << argv[i] << endl;
 781                                                 exit(1);
 782                                         } else {
 783                                                 if (tempFloat < 0 || tempFloat > 1) {
 784                                                         cerr << "ERROR: For option " << argv[i - 1]
 785                                                                         << ", floating-point value must be between 0 and 1."
 786                                                                         << endl;
 787                                                         exit(1);
 788                                                 } else
 789                                                         cutoff = tempFloat;
 790                                         }
 791                                 } else {
 792                                         cerr << "ERROR: Floating-point value expected for option "
 793                                                         << argv[i] << endl;
 794                                         exit(1);
 795                                 }
 796                         }
 797
 798                         // verbose reporting
 799                         else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--verbose")) {
 800                                 enableVerbose = true;
 801                         }
 802
 803                         // alignment order
 804                         else if (!strcmp(argv[i], "-a")
 805                                         || !strcmp(argv[i], "--alignment-order")) {
 806                                 enableAlignOrder = true;
 807                         }
 808
 809                         //print out version
 810                         else if (!strcmp(argv[i], "-version")) {
 811                                 cerr << "MSAPROBS version " << VERSION << endl;
 812                                 exit(1);
 813                         }
 814                         // bad arguments
 815                         else {
 816                                 cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
 817                                 exit(1);
 818                         }
 819                 } else {
 820                         sequenceNames.push_back(string(argv[i]));
 821                 }
 822         }
 823
 824         /*check the output file name*/
 825         cerr << "-------------------------------------" << endl;
 826         if (alignOutFileName.length() == 0) {
 827                 cerr << "The final alignments will be printed out to STDOUT" << endl;
 828                 alignOutFile = &std::cout;
 829         } else {
 830                 cerr << "Open the output file " << alignOutFileName << endl;
 831                 alignOutFile = new ofstream(alignOutFileName.c_str(),
 832                                 ios::binary | ios::out | ios::trunc);
 833         }
 834         cerr << "-------------------------------------" << endl;
 835         return sequenceNames;
 836 }
 837
 838 /////////////////////////////////////////////////////////////////
 839 // ProcessTree()
 840 //
 841 // Process the tree recursively.  Returns the aligned sequences
 842 // corresponding to a node or leaf of the tree.
 843 /////////////////////////////////////////////////////////////////
 844 MultiSequence* MSA::ProcessTree(TreeNode *tree, MultiSequence *sequences,
 845                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 846                 const ProbabilisticModel &model) {
 847
 848         MultiSequence *result;
 849
 850         // check if this is a node of the alignment tree
 851         //if (tree->GetSequenceLabel() == -1){
 852         if (tree->leaf == NODE) {
 853                 MultiSequence *alignLeft = ProcessTree(tree->left, sequences,
 854                                 sparseMatrices, model);
 855                 MultiSequence *alignRight = ProcessTree(tree->right, sequences,
 856                                 sparseMatrices, model);
 857
 858                 assert(alignLeft);
 859                 assert(alignRight);
 860
 861                 result = AlignAlignments(alignLeft, alignRight, sparseMatrices, model);
 862                 assert(result);
 863
 864                 delete alignLeft;
 865                 delete alignRight;
 866         }
 867
 868         // otherwise, this is a leaf of the alignment tree
 869         else {
 870                 result = new MultiSequence();
 871                 assert(result);
 872                 //result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
 873                 result->AddSequence(sequences->GetSequence(tree->idx)->Clone());
 874         }
 875
 876         return result;
 877 }
 878
 879 /////////////////////////////////////////////////////////////////
 880 // ComputeFinalAlignment()
 881 //
 882 // Compute the final alignment by calling ProcessTree(), then
 883 // performing iterative refinement as needed.
 884 /////////////////////////////////////////////////////////////////
 885
 886 MultiSequence* MSA::ComputeFinalAlignment(MSAGuideTree*tree,
 887                 MultiSequence *sequences,
 888                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 889                 const ProbabilisticModel &model) {
 890         MultiSequence *alignment = ProcessTree(tree->getRoot(), sequences,
 891                         sparseMatrices, model);
 892
 893         SafeVector<int> oldOrdering;
 894         int numSeqs = alignment->GetNumSequences();
 895         if (enableAlignOrder) {
 896                 for (int i = 0; i < numSeqs; i++)
 897                         oldOrdering.push_back(alignment->GetSequence(i)->GetSortLabel());
 898                 alignment->SaveOrdering();
 899                 enableAlignOrder = false;
 900         }
 901
 902         // tree-based refinement
 903         // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree);
 904         /*
 905          int numSeqs = alignment->GetNumSequences();
 906          //if(numSeqs < numIterativeRefinementReps){
 907          for(int iter = 0; iter < 5; iter ++){
 908                 for(int i = 0; i < numSeqs - 1; i++){
 909                         DoIterativeRefinementTreeNode(sparseMatrices, model, alignment, i);
 910                  }
 911          }
 912          //}*/
 913         //DoIterativeRefinement() return 1,2: this refinement unsuccessful
 914 /*
 915         int ineffectiveness = 0;
 916         for (int i = 0; i < numIterativeRefinementReps; i++){
 917                 int flag = DoIterativeRefinement(sparseMatrices, model, alignment);
 918                 if(numSeqs > 25){
 919                         if(flag > 0){
 920                             if(numIterativeRefinementReps < 20*numSeqs)
 921                                 numIterativeRefinementReps ++;
 922                             if(flag == 1) ineffectiveness ++;
 923                         }
 924                         //else ineffectiveness = 0;
 925                         if(ineffectiveness > 2*numSeqs && i >100 ) break;
 926                 }
 927         }
 928 */
 929
 930         for (int i = 0; i < numIterativeRefinementReps; i++)
 931                 DoIterativeRefinement(sparseMatrices, model, alignment);
 932
 933         cerr << endl;
 934
 935         if (oldOrdering.size() > 0) {
 936                 for (int i = 0; i < (int) oldOrdering.size(); i++) {
 937                         alignment->GetSequence(i)->SetSortLabel(oldOrdering[i]);
 938                 }
 939         }
 940
 941         // return final alignment
 942         return alignment;
 943 }
 944
 945 /////////////////////////////////////////////////////////////////
 946 // AlignAlignments()
 947 //
 948 // Returns the alignment of two MultiSequence objects.
 949 /////////////////////////////////////////////////////////////////
 950
 951 MultiSequence* MSA::AlignAlignments(MultiSequence *align1,
 952                 MultiSequence *align2,
 953                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 954                 const ProbabilisticModel &model) {
 955
 956         // print some info about the alignment
 957         if (enableVerbose) {
 958                 for (int i = 0; i < align1->GetNumSequences(); i++)
 959                         cerr << ((i == 0) ? "[" : ",")
 960                                         << align1->GetSequence(i)->GetLabel();
 961                 cerr << "] vs. ";
 962                 for (int i = 0; i < align2->GetNumSequences(); i++)
 963                         cerr << ((i == 0) ? "[" : ",")
 964                                         << align2->GetSequence(i)->GetLabel();
 965                 cerr << "]: ";
 966         }
 967 #if 0
 968         VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
 969 #else
 970         VF *posterior = model.BuildPosterior(getSeqsWeights(), align1, align2,
 971                         sparseMatrices, cutoff);
 972 #endif
 973         // compute an "accuracy" measure for the alignment before refinement
 974
 975         pair<SafeVector<char> *, float> alignment;
 976         //perform alignment
 977         alignment = model.ComputeAlignment(align1->GetSequence(0)->GetLength(),
 978                         align2->GetSequence(0)->GetLength(), *posterior);
 979
 980         delete posterior;
 981
 982         if (enableVerbose) {
 983
 984                 // compute total length of sequences
 985                 int totLength = 0;
 986                 for (int i = 0; i < align1->GetNumSequences(); i++)
 987                         for (int j = 0; j < align2->GetNumSequences(); j++)
 988                                 totLength += min(align1->GetSequence(i)->GetLength(),
 989                                                 align2->GetSequence(j)->GetLength());
 990
 991                 // give an "accuracy" measure for the alignment
 992                 cerr << alignment.second / totLength << endl;
 993         }
 994
 995         // now build final alignment
 996         MultiSequence *result = new MultiSequence();
 997         for (int i = 0; i < align1->GetNumSequences(); i++)
 998                 result->AddSequence(
 999                                 align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
1000         for (int i = 0; i < align2->GetNumSequences(); i++)
1001                 result->AddSequence(
1002                                 align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
1003         if (!enableAlignOrder)
1004                 result->SortByLabel();
1005
1006         // free temporary alignment
1007         delete alignment.first;
1008
1009         return result;
1010 }
1011
1012 /////////////////////////////////////////////////////////////////
1013 // DoRelaxation()
1014 //
1015 // Performs one round of the weighted probabilistic consistency transformation.
1016 //                     1
1017 /////////////////////////////////////////////////////////////////
1018
1019 SafeVector<SafeVector<SparseMatrix *> > MSA::DoRelaxation(float* seqsWeights,
1020                 MultiSequence *sequences,
1021                 SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1022         const int numSeqs = sequences->GetNumSequences();
1023
1024         SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices(numSeqs,
1025                         SafeVector<SparseMatrix *>(numSeqs, NULL));
1026
1027         // for every pair of sequences
1028 #ifdef _OPENMP
1029         int pairIdx;
1030 #pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
1031         for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
1032                 int i = seqsPairs[pairIdx].seq1;
1033                 int j = seqsPairs[pairIdx].seq2;
1034                 float wi = seqsWeights[i];
1035                 float wj = seqsWeights[j];
1036 #else
1037         for (int i = 0; i < numSeqs; i++) {
1038                 float wi = seqsWeights[i];
1039                 for (int j = i + 1; j < numSeqs; j++) {
1040                         float wj = seqsWeights[j];
1041 #endif
1042                         Sequence *seq1 = sequences->GetSequence(i);
1043                         Sequence *seq2 = sequences->GetSequence(j);
1044
1045                         if (enableVerbose) {
1046 #ifdef _OPENMP
1047 #pragma omp critical
1048 #endif
1049                                 cerr << "Relaxing (" << i + 1 << ") " << seq1->GetHeader()
1050                                                 << " vs. " << "(" << j + 1 << ") " << seq2->GetHeader()
1051                                                 << ": ";
1052                         }
1053                         // get the original posterior matrix
1054                         VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior();
1055                         assert(posteriorPtr);
1056                         VF &posterior = *posteriorPtr;
1057
1058                         const int seq1Length = seq1->GetLength();
1059                         const int seq2Length = seq2->GetLength();
1060
1061                         // contribution from the summation where z = x and z = y
1062                         float w = wi * wi * wj + wi * wj * wj;
1063                         float sumW = w;
1064                         for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
1065                                 //posterior[k] = w*posterior[k];
1066                                 posterior[k] += posterior[k];
1067                         }
1068
1069                         if (enableVerbose)
1070                                 cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
1071
1072                         // contribution from all other sequences
1073                         for (int k = 0; k < numSeqs; k++) {
1074                                 if (k != i && k != j) {
1075                                         float wk = seqsWeights[k];
1076                                         float w = wi * wj * wk;
1077                                         sumW += w;
1078                                         if (k < i)
1079                                                 Relax1(w, sparseMatrices[k][i], sparseMatrices[k][j],
1080                                                                 posterior);
1081                                         else if (k > i && k < j)
1082                                                 Relax(w, sparseMatrices[i][k], sparseMatrices[k][j],
1083                                                                 posterior);
1084                                         else {
1085                                                 SparseMatrix *temp =
1086                                                                 sparseMatrices[j][k]->ComputeTranspose();
1087                                                 Relax(w, sparseMatrices[i][k], temp, posterior);
1088                                                 delete temp;
1089                                         }
1090                                 }
1091                         }
1092                         //cerr<<"sumW "<<sumW<<endl;
1093                         for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
1094                                 //posterior[k] /= sumW;
1095                                 posterior[k] /= numSeqs;
1096                         }
1097                         // mask out positions not originally in the posterior matrix
1098                         SparseMatrix *matXY = sparseMatrices[i][j];
1099                         for (int y = 0; y <= seq2Length; y++)
1100                                 posterior[y] = 0;
1101                         for (int x = 1; x <= seq1Length; x++) {
1102                                 SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
1103                                 SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
1104                                 VF::iterator base = posterior.begin() + x * (seq2Length + 1);
1105                                 int curr = 0;
1106                                 while (XYptr != XYend) {
1107
1108                                         // zero out all cells until the first filled column
1109                                         while (curr < XYptr->first) {
1110                                                 base[curr] = 0;
1111                                                 curr++;
1112                                         }
1113
1114                                         // now, skip over this column
1115                                         curr++;
1116                                         ++XYptr;
1117                                 }
1118
1119                                 // zero out cells after last column
1120                                 while (curr <= seq2Length) {
1121                                         base[curr] = 0;
1122                                         curr++;
1123                                 }
1124                         }
1125
1126                         // save the new posterior matrix
1127                         newSparseMatrices[i][j] = new SparseMatrix(seq1->GetLength(),
1128                                         seq2->GetLength(), posterior);
1129                         newSparseMatrices[j][i] = NULL;
1130
1131                         if (enableVerbose)
1132                                 cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
1133
1134                         delete posteriorPtr;
1135
1136                         if (enableVerbose)
1137                                 cerr << "done." << endl;
1138 #ifndef _OPENMP
1139                 }
1140 #endif
1141         }
1142
1143         return newSparseMatrices;
1144 }
1145
1146 /////////////////////////////////////////////////////////////////
1147 // Relax()
1148 //
1149 // Computes the consistency transformation for a single sequence
1150 // z, and adds the transformed matrix to "posterior".
1151 /////////////////////////////////////////////////////////////////
1152
1153 void MSA::Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY,
1154                 VF &posterior) {
1155
1156         assert(matXZ);
1157         assert(matZY);
1158
1159         int lengthX = matXZ->GetSeq1Length();
1160         int lengthY = matZY->GetSeq2Length();
1161         assert(matXZ->GetSeq2Length() == matZY->GetSeq1Length());
1162
1163         // for every x[i]
1164         for (int i = 1; i <= lengthX; i++) {
1165                 SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
1166                 SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
1167
1168                 VF::iterator base = posterior.begin() + i * (lengthY + 1);
1169
1170                 // iterate through all x[i]-z[k]
1171                 while (XZptr != XZend) {
1172                         SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
1173                         SafeVector<PIF>::iterator ZYend = ZYptr
1174                                         + matZY->GetRowSize(XZptr->first);
1175                         const float XZval = XZptr->second;
1176
1177                         // iterate through all z[k]-y[j]
1178                         while (ZYptr != ZYend) {
1179                                 //base[ZYptr->first] += weight * XZval * ZYptr->second;
1180                                 base[ZYptr->first] += XZval * ZYptr->second;
1181                                 ZYptr++;
1182                         }
1183                         XZptr++;
1184                 }
1185         }
1186 }
1187
1188 /////////////////////////////////////////////////////////////////
1189 // Relax1()
1190 //
1191 // Computes the consistency transformation for a single sequence
1192 // z, and adds the transformed matrix to "posterior".
1193 /////////////////////////////////////////////////////////////////
1194
1195 void MSA::Relax1(float weight, SparseMatrix *matZX, SparseMatrix *matZY,
1196                 VF &posterior) {
1197
1198         assert(matZX);
1199         assert(matZY);
1200
1201         int lengthZ = matZX->GetSeq1Length();
1202         int lengthY = matZY->GetSeq2Length();
1203
1204         // for every z[k]
1205         for (int k = 1; k <= lengthZ; k++) {
1206                 SafeVector<PIF>::iterator ZXptr = matZX->GetRowPtr(k);
1207                 SafeVector<PIF>::iterator ZXend = ZXptr + matZX->GetRowSize(k);
1208
1209                 // iterate through all z[k]-x[i]
1210                 while (ZXptr != ZXend) {
1211                         SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(k);
1212                         SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(k);
1213                         const float ZXval = ZXptr->second;
1214                         VF::iterator base = posterior.begin()
1215                                         + ZXptr->first * (lengthY + 1);
1216
1217                         // iterate through all z[k]-y[j]
1218                         while (ZYptr != ZYend) {
1219                                 //base[ZYptr->first] += weight * ZXval * ZYptr->second;
1220                                 base[ZYptr->first] += ZXval * ZYptr->second;
1221                                 ZYptr++;
1222                         }
1223                         ZXptr++;
1224                 }
1225         }
1226 }
1227 /////////////////////////////////////////////////////////////////
1228 // DoIterativeRefinement()
1229 //
1230 // Performs a single round of randomized partionining iterative
1231 // refinement.
1232 // return 0: successful refinement, 1: ineffective refinement, 2: random problem
1233 /////////////////////////////////////////////////////////////////
1234 int MSA::DoIterativeRefinement(
1235                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1236                 const ProbabilisticModel &model, MultiSequence* &alignment) {
1237         set<int> groupOne, groupTwo;
1238         int numSeqs = alignment->GetNumSequences();
1239         int i;
1240         // create two separate groups
1241         for (i = 0; i < numSeqs; i++) {
1242                 int index = rand();
1243                 if (index % 2) {
1244                         groupOne.insert(i);
1245                 } else {
1246                         groupTwo.insert(i);
1247                 }
1248         }
1249         if (groupOne.empty() || groupTwo.empty()) return 2;
1250
1251         // project into the two groups
1252         MultiSequence *groupOneSeqs = alignment->Project(groupOne);
1253         assert(groupOneSeqs);
1254         MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
1255         assert(groupTwoSeqs);
1256
1257 //start add by Yongtao
1258 #if 1
1259         VF *posterior = model.BuildPosterior (groupOneSeqs, groupTwoSeqs, sparseMatrices, cutoff);
1260 #else
1261         VF *posterior = model.BuildPosterior(getSeqsWeights(), groupOneSeqs, groupTwoSeqs,
1262                         sparseMatrices, cutoff);
1263 #endif
1264         // compute an "accuracy" measure for the alignment before refinement
1265         SafeVector<SafeVector<char>::iterator> oldOnePtrs(groupOne.size());
1266         SafeVector<SafeVector<char>::iterator> oldTwoPtrs(groupTwo.size());
1267         i=0;
1268         for (set<int>::const_iterator iter = groupOne.begin();
1269                         iter != groupOne.end(); ++iter) {
1270                 oldOnePtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr();
1271         }
1272         i=0;
1273         for (set<int>::const_iterator iter = groupTwo.begin();
1274                         iter != groupTwo.end(); ++iter) {
1275                 oldTwoPtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr();
1276         }
1277
1278         VF &posteriorArr = *posterior;
1279         int oldLength = alignment->GetSequence(0)->GetLength();
1280         int groupOneindex=0; int groupTwoindex=0;
1281         float accuracy_before = 0;
1282         int j;
1283         for (i = 1; i <= oldLength; i++) {
1284                 // check to see if there is a gap in every sequence of the set
1285                 bool foundOne = false;
1286                 for (j = 0; !foundOne && j < (int) groupOne.size(); j++)
1287                         foundOne = (oldOnePtrs[j][i] != '-');
1288                 // if not, then this column counts towards the sequence length
1289                 if (foundOne) groupOneindex ++;
1290                 bool foundTwo = false;
1291                 for (j = 0; !foundTwo && j < (int) groupTwo.size(); j++)
1292                         foundTwo = (oldTwoPtrs[j][i] != '-');
1293                 if (foundTwo) groupTwoindex ++;
1294                 if(foundOne && foundTwo) accuracy_before +=
1295                                 posteriorArr[groupOneindex * (groupTwoSeqs->GetSequence(0)->GetLength() + 1) + groupTwoindex];
1296         }
1297
1298         pair<SafeVector<char> *, float> refinealignment;
1299         //perform alignment
1300         refinealignment = model.ComputeAlignment(groupOneSeqs->GetSequence(0)->GetLength(),
1301                         groupTwoSeqs->GetSequence(0)->GetLength(), *posterior);
1302         delete posterior;
1303         // now build final alignment
1304         MultiSequence *result = new MultiSequence();
1305         for (int i = 0; i < groupOneSeqs->GetNumSequences(); i++)
1306                 result->AddSequence(
1307                         groupOneSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'X'));
1308         for (int i = 0; i < groupTwoSeqs->GetNumSequences(); i++)
1309                 result->AddSequence(
1310                         groupTwoSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'Y'));
1311         // free temporary alignment
1312         delete refinealignment.first;
1313         delete alignment;
1314         alignment = result;
1315         delete groupOneSeqs;
1316         delete groupTwoSeqs;
1317         if(accuracy_before == refinealignment.second) return 1;
1318         else return 0;
1319 }
1320
1321
1322 void MSA::DoIterativeRefinementTreeNode(
1323                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1324                 const ProbabilisticModel &model, MultiSequence* &alignment,
1325                 int nodeIndex) {
1326         set<int> groupOne, groupTwo;
1327         int numSeqs = alignment->GetNumSequences();
1328
1329         vector<bool> inGroup1;
1330         inGroup1.resize(numSeqs);
1331         for (int i = 0; i < numSeqs; i++) {
1332                 inGroup1[i] = false;
1333         }
1334
1335         AlignmentOrder* orders = this->tree->getAlignOrders();
1336         AlignmentOrder* order = &orders[nodeIndex];
1337         for (int i = 0; i < order->leftNum; i++) {
1338                 int si = order->leftLeafs[i];
1339                 inGroup1[si] = true;
1340         }
1341         for (int i = 0; i < order->rightNum; i++) {
1342                 int si = order->rightLeafs[i];
1343                 inGroup1[si] = true;
1344         }
1345         // create two separate groups
1346         for (int i = 0; i < numSeqs; i++) {
1347                 if (inGroup1[i]) {
1348                         groupOne.insert(i);
1349                 } else {
1350                         groupTwo.insert(i);
1351                 }
1352         }
1353         if (groupOne.empty() || groupTwo.empty())
1354                 return;
1355
1356         // project into the two groups
1357         MultiSequence *groupOneSeqs = alignment->Project(groupOne);
1358         assert(groupOneSeqs);
1359         MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
1360         assert(groupTwoSeqs);
1361         delete alignment;
1362
1363         // realign
1364         alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices,
1365                         model);
1366
1367         delete groupOneSeqs;
1368         delete groupTwoSeqs;
1369 }
1370
1371 /////////////////////////////////////////////////////////////////
1372 // WriteAnnotation()
1373 //
1374 // Computes annotation for multiple alignment and write values
1375 // to a file.
1376 /////////////////////////////////////////////////////////////////
1377
1378 void MSA::WriteAnnotation(MultiSequence *alignment,
1379                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1380         ofstream outfile(annotationFilename.c_str());
1381
1382         if (outfile.fail()) {
1383                 cerr << "ERROR: Unable to write annotation file." << endl;
1384                 exit(1);
1385         }
1386
1387         const int alignLength = alignment->GetSequence(0)->GetLength();
1388         const int numSeqs = alignment->GetNumSequences();
1389
1390         SafeVector<int> position(numSeqs, 0);
1391         SafeVector<SafeVector<char>::iterator> seqs(numSeqs);
1392         for (int i = 0; i < numSeqs; i++)
1393                 seqs[i] = alignment->GetSequence(i)->GetDataPtr();
1394         SafeVector<pair<int, int> > active;
1395         active.reserve(numSeqs);
1396
1397         SafeVector<int> lab;
1398         for (int i = 0; i < numSeqs; i++)
1399                 lab.push_back(alignment->GetSequence(i)->GetSortLabel());
1400
1401         // for every column
1402         for (int i = 1; i <= alignLength; i++) {
1403
1404                 // find all aligned residues in this particular column
1405                 active.clear();
1406                 for (int j = 0; j < numSeqs; j++) {
1407                         if (seqs[j][i] != '-') {
1408                                 active.push_back(make_pair(lab[j], ++position[j]));
1409                         }
1410                 }
1411
1412                 sort(active.begin(), active.end());
1413                 outfile << setw(4) << ComputeScore(active, sparseMatrices) << endl;
1414         }
1415
1416         outfile.close();
1417 }
1418
1419 /////////////////////////////////////////////////////////////////
1420 // ComputeScore()
1421 //
1422 // Computes the annotation score for a particular column.
1423 /////////////////////////////////////////////////////////////////
1424
1425 int MSA::ComputeScore(const SafeVector<pair<int, int> > &active,
1426                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1427
1428         if (active.size() <= 1)
1429                 return 0;
1430
1431         // ALTERNATIVE #1: Compute the average alignment score.
1432
1433         float val = 0;
1434         for (int i = 0; i < (int) active.size(); i++) {
1435                 for (int j = i + 1; j < (int) active.size(); j++) {
1436                         val += sparseMatrices[active[i].first][active[j].first]->GetValue(
1437                                         active[i].second, active[j].second);
1438                 }
1439         }
1440
1441         return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));
1442
1443 }
1444
1445 /////////////////////////////////////////////////////////////////
1446 // ComputeSimilarity ()
1447 //
1448 // Computes the average similarity for a particular family.
1449 // extreme low or extreme high similarity(<=20% or >80%) return 0
1450 // low similarity(20%-50%) return 1
1451 // high similarity(50%-80%) return 2
1452 /////////////////////////////////////////////////////////////////
1453 extern pair<SafeVector<char> *, float> partViterbi(string seq1, string seq2);
1454 extern float computeS(string seq1, string seq2, SafeVector<char> * alignment);
1455
1456 int MSA::ComputeSimilarity (MultiSequence *sequences,const ProbabilisticModel &model){
1457         assert(sequences);
1458
1459         //get the number of sequences
1460         const int numSeqs = sequences->GetNumSequences();
1461         //average identity for all sequences
1462         float identity = 0;
1463
1464 #ifdef _OPENMP
1465         //calculate sequence pairs for openmp model
1466         int pairIdx = 0;
1467         numPairs = (numSeqs - 1) * numSeqs / 2;
1468         seqsPairs = new SeqsPair[numPairs];
1469         for(int a = 0; a < numSeqs; a++) {
1470                 for(int b = a + 1; b < numSeqs; b++) {
1471                         seqsPairs[pairIdx].seq1 = a;
1472                         seqsPairs[pairIdx].seq2 = b;
1473                         pairIdx++;
1474                 }
1475         }
1476 #endif
1477
1478         // do all pairwise alignments for family similarity
1479 #ifdef _OPENMP
1480 #pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
1481         for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
1482                 int a= seqsPairs[pairIdx].seq1;
1483                 int b = seqsPairs[pairIdx].seq2;
1484                 if(enableVerbose) {
1485 #pragma omp critical
1486                         cerr <<"tid "<<omp_get_thread_num()<<" a "<<a<<" b "<<b<<endl;
1487                 }
1488 #else
1489         for (int a = 0; a < numSeqs - 1; a++) {
1490                 for (int b = a + 1; b < numSeqs; b++) {
1491 #endif
1492                         Sequence *seq1 = sequences->GetSequence(a);
1493                         Sequence *seq2 = sequences->GetSequence(b);
1494
1495                         //pair<SafeVector<char> *, float> alignment = ::partViterbi(seq1->GetString(),seq2->GetString());
1496                         //cerr << alignment.second / alignment.first->size();
1497                         //cerr << computeS(seq1->GetString(),seq2->GetString(),alignment.first)<< endl;
1498                         pair<SafeVector<char> *, float> alignment = model.ComputeViterbiAlignment(seq1,seq2);
1499 /*
1500                         VF* posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString());
1501                         pair<SafeVector<char> *, float> alignment = model.ComputeAlignment(
1502                                         seq1->GetLength(), seq2->GetLength(), *posterior);
1503 */
1504 /*
1505                         SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
1506                         SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
1507
1508                         float N_correct_match = 0;
1509                         //float N_match;
1510                         //float N_column = 0;
1511                         //float N_alignment = 0;
1512                         int i = 1;int j = 1;
1513                         //bool start = false; bool end = false;
1514                         for (SafeVector<char>::iterator iter = alignment.first->begin();
1515                                 iter != alignment.first->end(); ++iter){
1516                                 if (*iter == 'B'){
1517                                         //N_match += 1;
1518                                         //start = true;
1519                                         //if(i==seq1->GetLength() || j==seq2->GetLength()) end = true;
1520                                         unsigned char c1 = (unsigned char) iter1[i++];
1521                                         unsigned char c2 = (unsigned char) iter2[j++];
1522                                         if(c1==c2) N_correct_match += 1;
1523                                 }
1524                                 else if(*iter == 'X') i++;
1525                                 else if(*iter == 'Y') j++;
1526                                 //if(start && !end) N_column += 1;
1527                                 N_alignment += 1;
1528                         }
1529                         if(i!= seq1->GetLength()+1 || j!= seq2->GetLength() + 1 ) cerr << "similarity error"<< endl;
1530                         identity += N_correct_match / N_alignment;
1531                         //
1532 */
1533                         identity += alignment.second / alignment.first->size();
1534                         delete alignment.first;
1535 #ifndef _OPENMP
1536                 }
1537 #endif
1538         }
1539         identity /= numPairs;
1540
1541 FILE *fi = fopen ("accuracy", "a");
1542 fprintf (fi, " %.10f ", identity); fprintf (fi, "\n");
1543 fclose (fi);
1544
1545 /*
1546         //adapative
1547         if(identity <= 0.15) initDistrib[2] = 0.143854;
1548         else if(identity <= 0.2) initDistrib[2] = 0.191948;
1549         else if(identity <= 0.25) initDistrib[2] = 0.170705;
1550         else if(identity <= 0.3) initDistrib[2] = 0.100675;
1551         else if(identity <= 0.35) initDistrib[2] = 0.090755;
1552         else if(identity <= 0.4) initDistrib[2] = 0.146188;
1553         else if(identity <= 0.45) initDistrib[2] = 0.167858;
1554         else if(identity <= 0.5) initDistrib[2] = 0.250769;
1555         //else if(identity <= 0.6) initDistrib[2] = 0.500829;
1556         //else if(identity <= 0.7) initDistrib[2] = 0.259622;
1557 */
1558         if( identity<= 0.25 || identity > 0.8 ) return 0;
1559         else if(identity > 0.2 && identity<= 0.4) return 1;
1560         else return 2;
1561
1562 }