binaries/src/GLProbs-1.0/MSAdiv3.cpp

   1 /***********************************************
   2  * # Copyright 2009-2010. Liu Yongchao
   3  * # Contact: Liu Yongchao, School of Computer Engineering,
   4  * #                     Nanyang Technological University.
   5  * # Emails:     liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
   6  * #
   7  * # GPL version 3.0 applies.
   8  * #
   9  * ************************************************/
  10
  11 #include <string>
  12 #include <sstream>
  13 #include <iomanip>
  14 #include <iostream>
  15 #include <list>
  16 #include <set>
  17 #include <algorithm>
  18 #include <climits>
  19 #include <cstdio>
  20 #include <cstdlib>
  21 #include <cerrno>
  22 #include <iomanip>
  23 #include "MSA.h"
  24 #include "MSAClusterTree.h"
  25 #include "Defaults.h"
  26
  27 #ifdef _OPENMP
  28 #include <omp.h>
  29 #endif
  30
  31 string parametersInputFilename = "";
  32 string parametersOutputFilename = "no training";
  33 string annotationFilename = "";
  34
  35 bool enableVerbose = false;
  36 bool enableAnnotation = false;
  37 bool enableClustalWOutput = false;
  38 bool enableAlignOrder = false;
  39 int numConsistencyReps = 2;
  40 int numPreTrainingReps = 0;
  41 int numIterativeRefinementReps = 100;
  42
  43 float cutoff = 0;
  44
  45 VF initDistrib(NumMatrixTypes);
  46 VF gapOpen(2 * NumInsertStates);
  47 VF gapExtend(2 * NumInsertStates);
  48 VVF emitPairs(256, VF(256, 1e-10));
  49 VF emitSingle(256, 1e-5);
  50
  51 string alphabet = alphabetDefault;
  52
  53 const int MIN_PRETRAINING_REPS = 0;
  54 const int MAX_PRETRAINING_REPS = 20;
  55 const int MIN_CONSISTENCY_REPS = 0;
  56 const int MAX_CONSISTENCY_REPS = 5;
  57 const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
  58 const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
  59
  60 string posteriorProbsFilename = "";
  61 bool allscores = true;
  62 string infilename;
  63
  64 int flag_gui = 0;   //0: no gui related o/p
  65 //1: gui related o/p generated
  66 int flag_ppscore = 0; //0: no pp score sequence added to o/p fasta alignment
  67 //1: pp score seq added to o/p fasta alignment
  68
  69 ///////////////////////////////
  70 // global scoring matrix variables
  71 //////////////////////////////
  72 float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2;
  73 char *aminos, *bases, matrixtype[20] = "gonnet_160";
  74 int subst_index[26];
  75
  76 double sub_matrix[26][26];
  77 int firstread = 0;              //this makes sure that matrices are read only once
  78
  79 float TEMPERATURE = 5;
  80 int MATRIXTYPE = 160;
  81 int prot_nuc = 0;               //0=prot, 1=nucleotide
  82
  83 float GAPOPEN = 0;
  84 float GAPEXT = 0;
  85 int numThreads = 0;
  86
  87 //argument support
  88 typedef struct {
  89         char input[30];
  90         int matrix;
  91         int N;
  92         float T;
  93         float beta;
  94         char opt;                       //can be 'P' or 'M'
  95         float gapopen;
  96         float gapext;
  97 } argument_decl;
  98
  99 argument_decl argument;
 100
 101 extern inline void read_sustitution_matrix(char *fileName);
 102 extern void setmatrixtype(int le);
 103 extern inline int matrixtype_to_int();
 104 extern inline void read_dna_matrix();
 105 extern inline void read_vtml_la_matrix();
 106 extern void init_arguments();
 107
 108 MSA::MSA(int argc, char* argv[]) {
 109         //parse program parameters
 110         SafeVector<string> sequenceNames = ParseParams(argc, argv);
 111
 112         //initialize arguments for partition function
 113         init_arguments();
 114
 115         ReadParameters();
 116         //PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
 117
 118         //read the input sequences
 119         MultiSequence *sequences = new MultiSequence();
 120         assert(sequences);
 121         for (int i = 0; i < (int) sequenceNames.size(); i++) {
 122                 cerr << "Loading sequence file: " << sequenceNames[i] << endl;
 123                 sequences->LoadMFA(sequenceNames[i], true);
 124         }
 125         //allocate space for sequence weights
 126         this->seqsWeights = new int[sequences->GetNumSequences()];
 127         //initilaize parameters for OPENMP
 128 #ifdef _OPENMP
 129         if(numThreads <= 0) {
 130                 numThreads = omp_get_num_procs();
 131                 cerr << "Automatically detected " << numThreads << " CPU cores" << endl;
 132         }
 133         cerr <<"Enabling OpenMP (with "<<numThreads<<" threads)"<<endl;
 134
 135         //set OpenMP to use dynamic number of threads which is equal to the number of processor cores on the host
 136         omp_set_num_threads(numThreads);
 137 #endif
 138
 139         // now, we can perform the alignments and write them out
 140         MultiSequence *alignment = doAlign(sequences,
 141                         ProbabilisticModel(initDistrib, gapOpen, gapExtend, emitPairs,
 142                                         emitSingle), initDistrib, gapOpen, gapExtend, emitPairs,
 143                         emitSingle);
 144
 145         //write the alignment results to standard output
 146         if (enableClustalWOutput) {
 147                 alignment->WriteALN(*alignOutFile);
 148         } else {
 149                 alignment->WriteMFA(*alignOutFile);
 150         }
 151         //release resources
 152         delete[] this->seqsWeights;
 153         delete alignment;
 154         delete sequences;
 155 }
 156 MSA::~MSA() {
 157         /*close the output file*/
 158         if (alignOutFileName.length() > 0) {
 159                 ((std::ofstream*) alignOutFile)->close();
 160         }
 161 }
 162 /////////////////////////////////////////////////////////////////
 163 // PrintParameters()
 164 //
 165 // Prints MSAPROBS parameters to STDERR.  If a filename is
 166 // specified, then the parameters are also written to the file.
 167 /////////////////////////////////////////////////////////////////
 168
 169 void MSA::PrintParameters(const char *message, const VF &initDistrib,
 170                 const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs,
 171                 const VF &emitSingle, const char *filename) {
 172
 173         // print parameters to the screen
 174         cerr << message << endl << "    initDistrib[] = { ";
 175         for (int i = 0; i < NumMatrixTypes; i++)
 176                 cerr << setprecision(10) << initDistrib[i] << " ";
 177         cerr << "}" << endl << "        gapOpen[] = { ";
 178         for (int i = 0; i < NumInsertStates * 2; i++)
 179                 cerr << setprecision(10) << gapOpen[i] << " ";
 180         cerr << "}" << endl << "      gapExtend[] = { ";
 181         for (int i = 0; i < NumInsertStates * 2; i++)
 182                 cerr << setprecision(10) << gapExtend[i] << " ";
 183         cerr << "}" << endl << endl;
 184
 185         /*
 186          for (int i = 0; i < 5; i++){
 187          for (int j = 0; j <= i; j++){
 188          cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " ";
 189          }
 190          cerr << endl;
 191          }*/
 192
 193         // if a file name is specified
 194         if (filename) {
 195
 196                 // attempt to open the file for writing
 197                 FILE *file = fopen(filename, "w");
 198                 if (!file) {
 199                         cerr << "ERROR: Unable to write parameter file: " << filename
 200                                         << endl;
 201                         exit(1);
 202                 }
 203
 204                 // if successful, then write the parameters to the file
 205                 for (int i = 0; i < NumMatrixTypes; i++)
 206                         fprintf(file, "%.10f ", initDistrib[i]);
 207                 fprintf(file, "\n");
 208                 for (int i = 0; i < 2 * NumInsertStates; i++)
 209                         fprintf(file, "%.10f ", gapOpen[i]);
 210                 fprintf(file, "\n");
 211                 for (int i = 0; i < 2 * NumInsertStates; i++)
 212                         fprintf(file, "%.10f ", gapExtend[i]);
 213                 fprintf(file, "\n");
 214                 fprintf(file, "%s\n", alphabet.c_str());
 215                 for (int i = 0; i < (int) alphabet.size(); i++) {
 216                         for (int j = 0; j <= i; j++)
 217                                 fprintf(file, "%.10f ",
 218                                                 emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);
 219                         fprintf(file, "\n");
 220                 }
 221                 for (int i = 0; i < (int) alphabet.size(); i++)
 222                         fprintf(file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);
 223                 fprintf(file, "\n");
 224                 fclose(file);
 225         }
 226 }
 227
 228 /////////////////////////////////////////////////////////////////
 229 // doAlign()
 230 //
 231 // First computes all pairwise posterior probability matrices.
 232 // Then, computes new parameters if training, or a final
 233 // alignment, otherwise.
 234 /////////////////////////////////////////////////////////////////
 235 extern VF *ComputePostProbs(int a, int b, string seq1, string seq2);
 236 MultiSequence* MSA::doAlign(MultiSequence *sequences,
 237                 const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen,
 238                 VF &gapExtend, VVF &emitPairs, VF &emitSingle) {
 239         assert(sequences);
 240
 241         //get the number of sequences
 242         const int numSeqs = sequences->GetNumSequences();
 243
 244         //create distance matrix
 245         VVF probalign_distances(numSeqs, VF(numSeqs, 0));
 246         VVF distances(numSeqs, VF(numSeqs, 0));//msa
 247
 248         float gl_accuracy = 0;
 249         //creat sparseMatrices
 250         SafeVector<SafeVector<SparseMatrix *> > probalign_sparseMatrices(numSeqs,
 251                         SafeVector<SparseMatrix *>(numSeqs, NULL));
 252         SafeVector<SafeVector<SparseMatrix *> > sparseMatrices(numSeqs,
 253                         SafeVector<SparseMatrix *>(numSeqs, NULL)); // msa
 254
 255 #ifdef _OPENMP
 256         //calculate sequence pairs for openmp model
 257         int pairIdx = 0;
 258         numPairs = (numSeqs - 1) * numSeqs / 2;
 259         seqsPairs = new SeqsPair[numPairs];
 260         for(int a = 0; a < numSeqs; a++) {
 261                 for(int b = a + 1; b < numSeqs; b++) {
 262                         seqsPairs[pairIdx].seq1 = a;
 263                         seqsPairs[pairIdx].seq2 = b;
 264                         pairIdx++;
 265                 }
 266         }
 267 #endif
 268         // do all pairwise alignments for posterior probability matrices
 269 #ifdef _OPENMP
 270 #pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
 271         for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
 272                 int a= seqsPairs[pairIdx].seq1;
 273                 int b = seqsPairs[pairIdx].seq2;
 274                 if(enableVerbose) {
 275 #pragma omp critical
 276                         cerr <<"tid "<<omp_get_thread_num()<<" a "<<a<<" b "<<b<<endl;
 277                 }
 278 #else
 279         for (int a = 0; a < numSeqs - 1; a++) {
 280                 for (int b = a + 1; b < numSeqs; b++) {
 281 #endif
 282                         Sequence *seq1 = sequences->GetSequence(a);
 283                         Sequence *seq2 = sequences->GetSequence(b);
 284
 285                         // verbose output
 286                         if (enableVerbose) {
 287                                 cerr << "Computing posterior matrix: (" << a + 1 << ") "
 288                                                 << seq1->GetHeader() << " vs. " << "(" << b + 1 << ") "
 289                                                 << seq2->GetHeader() << " -- ";
 290                         }
 291
 292 //probcons
 293                         // compute forward and backward probabilities
 294                         VF *forward = model.ComputeForwardMatrix(seq1, seq2);
 295                         assert(forward);
 296                         VF *backward = model.ComputeBackwardMatrix(seq1, seq2);
 297                         assert(backward);
 298                         // compute posterior probability matrix from HMM
 299                         VF *probcons_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward);
 300                         assert(probcons_posterior);
 301                         delete forward;
 302                         delete backward;
 303
 304 //probalign
 305                         VF *probalign_posterior = ::ComputePostProbs(a, b, seq1->GetString(),seq2->GetString());
 306                         assert(probalign_posterior);
 307                         probalign_sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),seq2->GetLength(),                                                                  *probalign_posterior);
 308                         probalign_sparseMatrices[b][a] = NULL;
 309                         pair<SafeVector<char> *, float> probalign_alignment = model.ComputeAlignment(
 310                                         seq1->GetLength(), seq2->GetLength(), *probalign_posterior);
 311                         probalign_distances[a][b] =1.0f - probalign_alignment.second / min(seq1->GetLength(), seq2->GetLength());
 312                         delete probalign_alignment.first;
 313
 314 //local
 315                         forward = model.ComputeForwardMatrix(seq1, seq2,false);
 316                         assert(forward);
 317                         backward = model.ComputeBackwardMatrix(seq1, seq2,false);
 318                         assert(backward);
 319                         VF* local_posterior = model.ComputePosteriorMatrix(seq1, seq2, *forward,*backward, false);
 320                         delete forward;
 321                         delete backward;
 322
 323 //GL
 324                         //merge probalign + local + probcons
 325                         VF::iterator ptr1 = probcons_posterior->begin();
 326                         VF::iterator ptr2 = probalign_posterior->begin();
 327                         VF::iterator ptr3 = local_posterior->begin();
 328                         VF* posterior = new VF((seq1->GetLength()+1) * (seq2->GetLength()+1)); assert (posterior);  //msa
 329                         VF::iterator ptr = posterior->begin();
 330                         for (int i = 0; i <= seq1->GetLength(); i++) {
 331                                 for (int j = 0; j <= seq2->GetLength(); j++) {
 332                                         float v1 = *ptr1;
 333                                         float v2 = *ptr2;
 334                                         float v3 = *ptr3;
 335                                         *ptr = sqrt((v1*v1 + v2*v2 + v3*v3)/3);
 336                                         ptr1++;
 337                                         ptr2++;
 338                                         ptr3++;
 339                                         ptr++;
 340                                 }
 341                         }
 342                         // perform the pairwise sequence alignment
 343                         pair<SafeVector<char> *, float> gl_alignment = model.ComputeAlignment(
 344                                         seq1->GetLength(), seq2->GetLength(), *posterior);
 345
 346                         //compute expected accuracy
 347                         distances[a][b] = distances[b][a] = 1.0f - gl_alignment.second
 348                                         / min(seq1->GetLength(), seq2->GetLength());
 349
 350                         // compute sparse representations
 351                         sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(),
 352                         seq2->GetLength(), *posterior);
 353                         sparseMatrices[b][a] = NULL;
 354                         //
 355                         SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
 356                         SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
 357                         float N_correct_match = 0;
 358                         int i = 1;int j = 1;
 359                         for (SafeVector<char>::iterator iter = gl_alignment.first->begin();
 360                                 iter != gl_alignment.first->end(); ++iter){
 361                                 if (*iter == 'B'){
 362                                         unsigned char c1 = (unsigned char) iter1[i++];
 363                                         unsigned char c2 = (unsigned char) iter2[j++];
 364                                         if(c1==c2) N_correct_match += 1;
 365                                 }
 366                                 else if(*iter == 'X') i++;
 367                                 else if(*iter == 'Y') j++;
 368                         }
 369                         if(i!= seq1->GetLength()+1 || j!= seq2->GetLength() + 1 ) cerr << "GL"<< endl;
 370                         gl_accuracy += N_correct_match / min(seq1->GetLength(), seq2->GetLength());
 371                         //
 372                         delete probcons_posterior;
 373                         delete probalign_posterior;
 374                         delete local_posterior;
 375                         delete posterior;
 376
 377 #ifndef _OPENMP
 378                 }
 379 #endif
 380         }
 381
 382 /*
 383 //self-adaptive
 384         gl_accuracy /= numPairs;
 385         if(gl_accuracy > 0.4){
 386                 for (int a = 0; a < numSeqs - 1; a++)
 387                         for (int b = a + 1; b < numSeqs; b++) {
 388                                 distances[a][b] = distances[b][a] = probalign_distances[a][b];
 389                                 sparseMatrices[a][b] = probalign_sparseMatrices[a][b];
 390                                 sparseMatrices[b][a] = NULL;
 391                         }
 392         }
 393 */
 394         //create the guide tree
 395         this->tree = new MSAClusterTree(this, distances, numSeqs);
 396         this->tree->create();
 397
 398         // perform the consistency transformation the desired number of times
 399         float* fweights = new float[numSeqs];
 400         for (int r = 0; r < numSeqs; r++) {
 401                 fweights[r] = ((float) seqsWeights[r]) / INT_MULTIPLY;
 402                 fweights[r] *= 10;
 403         }
 404         for (int r = 0; r < numConsistencyReps; r++) {
 405                 SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices =
 406                                 DoRelaxation(fweights, sequences, sparseMatrices);
 407
 408                 // now replace the old posterior matrices
 409                 for (int i = 0; i < numSeqs; i++) {
 410                         for (int j = 0; j < numSeqs; j++) {
 411                                 delete sparseMatrices[i][j];
 412                                 sparseMatrices[i][j] = newSparseMatrices[i][j];
 413                         }
 414                 }
 415         }
 416         delete[] fweights;
 417 #ifdef _OPENMP
 418         delete [] seqsPairs;
 419 #endif
 420
 421         //compute the final multiple sequence alignment
 422         MultiSequence *finalAlignment = ComputeFinalAlignment(this->tree, sequences,
 423                         sparseMatrices, model);
 424
 425         // build annotation
 426         if (enableAnnotation) {
 427                 WriteAnnotation(finalAlignment, sparseMatrices);
 428         }
 429         //destroy the guide tree
 430         delete this->tree;
 431         this->tree = 0;
 432
 433         // delete sparse matrices
 434         for (int a = 0; a < numSeqs - 1; a++) {
 435                 for (int b = a + 1; b < numSeqs; b++) {
 436                         delete sparseMatrices[a][b];
 437                         delete sparseMatrices[b][a];
 438                 }
 439         }
 440
 441         return finalAlignment;
 442 }
 443
 444 /////////////////////////////////////////////////////////////////
 445 // GetInteger()
 446 //
 447 // Attempts to parse an integer from the character string given.
 448 // Returns true only if no parsing error occurs.
 449 /////////////////////////////////////////////////////////////////
 450
 451 bool GetInteger(char *data, int *val) {
 452         char *endPtr;
 453         long int retVal;
 454
 455         assert(val);
 456
 457         errno = 0;
 458         retVal = strtol(data, &endPtr, 0);
 459         if (retVal == 0 && (errno != 0 || data == endPtr))
 460                 return false;
 461         if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN))
 462                 return false;
 463         if (retVal < (long) INT_MIN || retVal > (long) INT_MAX)
 464                 return false;
 465         *val = (int) retVal;
 466         return true;
 467 }
 468
 469 /////////////////////////////////////////////////////////////////
 470 // GetFloat()
 471 //
 472 // Attempts to parse a float from the character string given.
 473 // Returns true only if no parsing error occurs.
 474 /////////////////////////////////////////////////////////////////
 475
 476 bool GetFloat(char *data, float *val) {
 477         char *endPtr;
 478         double retVal;
 479
 480         assert(val);
 481
 482         errno = 0;
 483         retVal = strtod(data, &endPtr);
 484         if (retVal == 0 && (errno != 0 || data == endPtr))
 485                 return false;
 486         if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0))
 487                 return false;
 488         *val = (float) retVal;
 489         return true;
 490 }
 491
 492 /////////////////////////////////////////////////////////////////
 493 // ReadParameters()
 494 //
 495 // Read initial distribution, transition, and emission
 496 // parameters from a file.
 497 /////////////////////////////////////////////////////////////////
 498
 499 void MSA::ReadParameters() {
 500
 501         ifstream data;
 502
 503         emitPairs = VVF(256, VF(256, 1e-10));
 504         emitSingle = VF(256, 1e-5);
 505
 506         // read initial state distribution and transition parameters
 507         if (parametersInputFilename == string("")) {
 508                 if (NumInsertStates == 1) {
 509                         for (int i = 0; i < NumMatrixTypes; i++)
 510                                 initDistrib[i] = initDistrib1Default[i];
 511                         for (int i = 0; i < 2 * NumInsertStates; i++)
 512                                 gapOpen[i] = gapOpen1Default[i];
 513                         for (int i = 0; i < 2 * NumInsertStates; i++)
 514                                 gapExtend[i] = gapExtend1Default[i];
 515                 } else if (NumInsertStates == 2) {
 516                         for (int i = 0; i < NumMatrixTypes; i++)
 517                                 initDistrib[i] = initDistrib2Default[i];
 518                         for (int i = 0; i < 2 * NumInsertStates; i++)
 519                                 gapOpen[i] = gapOpen2Default[i];
 520                         for (int i = 0; i < 2 * NumInsertStates; i++)
 521                                 gapExtend[i] = gapExtend2Default[i];
 522                 } else {
 523                         cerr
 524                                         << "ERROR: No default initial distribution/parameter settings exist"
 525                                         << endl << "       for " << NumInsertStates
 526                                         << " pairs of insert states.  Use --paramfile." << endl;
 527                         exit(1);
 528                 }
 529
 530                 alphabet = alphabetDefault;
 531
 532                 for (int i = 0; i < (int) alphabet.length(); i++) {
 533                         emitSingle[(unsigned char) tolower(alphabet[i])] =
 534                                         emitSingleDefault[i];
 535                         emitSingle[(unsigned char) toupper(alphabet[i])] =
 536                                         emitSingleDefault[i];
 537                         for (int j = 0; j <= i; j++) {
 538                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(
 539                                                 alphabet[j])] = emitPairsDefault[i][j];
 540                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(
 541                                                 alphabet[j])] = emitPairsDefault[i][j];
 542                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(
 543                                                 alphabet[j])] = emitPairsDefault[i][j];
 544                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(
 545                                                 alphabet[j])] = emitPairsDefault[i][j];
 546                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(
 547                                                 alphabet[i])] = emitPairsDefault[i][j];
 548                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(
 549                                                 alphabet[i])] = emitPairsDefault[i][j];
 550                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(
 551                                                 alphabet[i])] = emitPairsDefault[i][j];
 552                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(
 553                                                 alphabet[i])] = emitPairsDefault[i][j];
 554                         }
 555                 }
 556         } else {
 557                 data.open(parametersInputFilename.c_str());
 558                 if (data.fail()) {
 559                         cerr << "ERROR: Unable to read parameter file: "
 560                                         << parametersInputFilename << endl;
 561                         exit(1);
 562                 }
 563
 564                 string line[3];
 565                 for (int i = 0; i < 3; i++) {
 566                         if (!getline(data, line[i])) {
 567                                 cerr
 568                                                 << "ERROR: Unable to read transition parameters from parameter file: "
 569                                                 << parametersInputFilename << endl;
 570                                 exit(1);
 571                         }
 572                 }
 573                 istringstream data2;
 574                 data2.clear();
 575                 data2.str(line[0]);
 576                 for (int i = 0; i < NumMatrixTypes; i++)
 577                         data2 >> initDistrib[i];
 578                 data2.clear();
 579                 data2.str(line[1]);
 580                 for (int i = 0; i < 2 * NumInsertStates; i++)
 581                         data2 >> gapOpen[i];
 582                 data2.clear();
 583                 data2.str(line[2]);
 584                 for (int i = 0; i < 2 * NumInsertStates; i++)
 585                         data2 >> gapExtend[i];
 586
 587                 if (!getline(data, line[0])) {
 588                         cerr << "ERROR: Unable to read alphabet from scoring matrix file: "
 589                                         << parametersInputFilename << endl;
 590                         exit(1);
 591                 }
 592
 593                 // read alphabet as concatenation of all characters on alphabet line
 594                 alphabet = "";
 595                 string token;
 596                 data2.clear();
 597                 data2.str(line[0]);
 598                 while (data2 >> token)
 599                         alphabet += token;
 600
 601                 for (int i = 0; i < (int) alphabet.size(); i++) {
 602                         for (int j = 0; j <= i; j++) {
 603                                 float val;
 604                                 data >> val;
 605                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(
 606                                                 alphabet[j])] = val;
 607                                 emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(
 608                                                 alphabet[j])] = val;
 609                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(
 610                                                 alphabet[j])] = val;
 611                                 emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(
 612                                                 alphabet[j])] = val;
 613                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(
 614                                                 alphabet[i])] = val;
 615                                 emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(
 616                                                 alphabet[i])] = val;
 617                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(
 618                                                 alphabet[i])] = val;
 619                                 emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(
 620                                                 alphabet[i])] = val;
 621                         }
 622                 }
 623
 624                 for (int i = 0; i < (int) alphabet.size(); i++) {
 625                         float val;
 626                         data >> val;
 627                         emitSingle[(unsigned char) tolower(alphabet[i])] = val;
 628                         emitSingle[(unsigned char) toupper(alphabet[i])] = val;
 629                 }
 630                 data.close();
 631         }
 632 }
 633
 634 /////////////////////////////////////////////////////////////////
 635 // ParseParams()
 636 //
 637 // Parse all command-line options.
 638 /////////////////////////////////////////////////////////////////
 639 void MSA::printUsage() {
 640         cerr
 641                         << "************************************************************************"
 642                         << endl
 643                         << "\tMSAPROBS is a open-source protein multiple sequence alignment algorithm"
 644                         << endl
 645                         << "\tbased on pair hidden markov model and partition function postirior"
 646                         << endl
 647                         << "\tprobabilities. If any comments or problems, please contact"
 648                         << endl
 649                         << "\tLiu Yongchao(liuy0039@ntu.edu.sg or nkcslyc@hotmail.com)"
 650                         << endl
 651                         << "*************************************************************************"
 652                         << endl << "Usage:" << endl
 653                         << "       msaprobs [OPTION]... [infile]..." << endl << endl
 654                         << "Description:" << endl
 655                         << "       Align sequences in multi-FASTA format" << endl << endl
 656                         << "       -o, --outfile <string>" << endl
 657                         << "              specify the output file name (STDOUT by default)"
 658                         << endl << "       -num_threads <integer>" << endl
 659                         << "              specify the number of threads used, and otherwise detect automatically"
 660                         << endl << "       -clustalw" << endl
 661                         << "              use CLUSTALW output format instead of FASTA format"
 662                         << endl << endl << "       -c, --consistency REPS" << endl
 663                         << "              use " << MIN_CONSISTENCY_REPS << " <= REPS <= "
 664                         << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps
 665                         << ") passes of consistency transformation" << endl << endl
 666                         << "       -ir, --iterative-refinement REPS" << endl
 667                         << "              use " << MIN_ITERATIVE_REFINEMENT_REPS
 668                         << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS << " (default: "
 669                         << numIterativeRefinementReps << ") passes of iterative-refinement"
 670                         << endl << endl << "       -v, --verbose" << endl
 671                         << "              report progress while aligning (default: "
 672                         << (enableVerbose ? "on" : "off") << ")" << endl << endl
 673                         << "       -annot FILENAME" << endl
 674                         << "              write annotation for multiple alignment to FILENAME"
 675                         << endl << endl << "       -a, --alignment-order" << endl
 676                         << "              print sequences in alignment order rather than input order (default: "
 677                         << (enableAlignOrder ? "on" : "off") << ")" << endl
 678                         << "       -version " << endl
 679                         << "              print out version of MSAPROBS " << endl << endl;
 680 }
 681 SafeVector<string> MSA::ParseParams(int argc, char **argv) {
 682         if (argc < 2) {
 683                 printUsage();
 684                 exit(1);
 685         }
 686         SafeVector<string> sequenceNames;
 687         int tempInt;
 688         float tempFloat;
 689
 690         for (int i = 1; i < argc; i++) {
 691                 if (argv[i][0] == '-') {
 692                         //help
 693                         if (!strcmp(argv[i], "-help") || !strcmp(argv[i], "-?")) {
 694                                 printUsage();
 695                                 exit(1);
 696                                 //output file name
 697                         } else if (!strcmp(argv[i], "-o")
 698                                         || !strcmp(argv[i], "--outfile")) {
 699                                 if (i < argc - 1) {
 700                                         alignOutFileName = argv[++i];   //get the file name
 701                                 } else {
 702                                         cerr << "ERROR: String expected for option " << argv[i]
 703                                                         << endl;
 704                                         exit(1);
 705                                 }
 706                                 // parameter file
 707                         } else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){
 708                                 if (i < argc - 1)
 709                                         parametersInputFilename = string (argv[++i]);
 710                                 else {
 711                                         cerr << "ERROR: Filename expected for option " << argv[i] << endl;
 712                                         exit (1);
 713                                 }
 714                                 //number of threads used
 715                         } else if (!strcmp(argv[i], "-p")
 716                                         || !strcmp(argv[i], "-num_threads")) {
 717                                 if (i < argc - 1) {
 718                                         if (!GetInteger(argv[++i], &tempInt)) {
 719                                                 cerr << " ERROR: invalid integer following option "
 720                                                                 << argv[i - 1] << ": " << argv[i] << endl;
 721                                                 exit(1);
 722                                         } else {
 723                                                 if (tempInt < 0) {
 724                                                         tempInt = 0;
 725                                                 }
 726                                                 numThreads = tempInt;
 727                                         }
 728                                 } else {
 729                                         cerr << "ERROR: Integer expected for option " << argv[i]
 730                                                         << endl;
 731                                         exit(1);
 732                                 }
 733                                 // number of consistency transformations
 734                         } else if (!strcmp(argv[i], "-c")
 735                                         || !strcmp(argv[i], "--consistency")) {
 736                                 if (i < argc - 1) {
 737                                         if (!GetInteger(argv[++i], &tempInt)) {
 738                                                 cerr << "ERROR: Invalid integer following option "
 739                                                                 << argv[i - 1] << ": " << argv[i] << endl;
 740                                                 exit(1);
 741                                         } else {
 742                                                 if (tempInt < MIN_CONSISTENCY_REPS
 743                                                                 || tempInt > MAX_CONSISTENCY_REPS) {
 744                                                         cerr << "ERROR: For option " << argv[i - 1]
 745                                                                         << ", integer must be between "
 746                                                                         << MIN_CONSISTENCY_REPS << " and "
 747                                                                         << MAX_CONSISTENCY_REPS << "." << endl;
 748                                                         exit(1);
 749                                                 } else {
 750                                                         numConsistencyReps = tempInt;
 751                                                 }
 752                                         }
 753                                 } else {
 754                                         cerr << "ERROR: Integer expected for option " << argv[i]
 755                                                         << endl;
 756                                         exit(1);
 757                                 }
 758                         }
 759
 760                         // number of randomized partitioning iterative refinement passes
 761                         else if (!strcmp(argv[i], "-ir")
 762                                         || !strcmp(argv[i], "--iterative-refinement")) {
 763                                 if (i < argc - 1) {
 764                                         if (!GetInteger(argv[++i], &tempInt)) {
 765                                                 cerr << "ERROR: Invalid integer following option "
 766                                                                 << argv[i - 1] << ": " << argv[i] << endl;
 767                                                 exit(1);
 768                                         } else {
 769                                                 if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS
 770                                                                 || tempInt > MAX_ITERATIVE_REFINEMENT_REPS) {
 771                                                         cerr << "ERROR: For option " << argv[i - 1]
 772                                                                         << ", integer must be between "
 773                                                                         << MIN_ITERATIVE_REFINEMENT_REPS << " and "
 774                                                                         << MAX_ITERATIVE_REFINEMENT_REPS << "."
 775                                                                         << endl;
 776                                                         exit(1);
 777                                                 } else
 778                                                         numIterativeRefinementReps = tempInt;
 779                                         }
 780                                 } else {
 781                                         cerr << "ERROR: Integer expected for option " << argv[i]
 782                                                         << endl;
 783                                         exit(1);
 784                                 }
 785                         }
 786
 787                         // annotation files
 788                         else if (!strcmp(argv[i], "-annot")) {
 789                                 enableAnnotation = true;
 790                                 if (i < argc - 1) {
 791                                         annotationFilename = argv[++i];
 792                                 } else {
 793                                         cerr << "ERROR: FILENAME expected for option " << argv[i]
 794                                                         << endl;
 795                                         exit(1);
 796                                 }
 797                         }
 798
 799                         // clustalw output format
 800                         else if (!strcmp(argv[i], "-clustalw")) {
 801                                 enableClustalWOutput = true;
 802                         }
 803
 804                         // cutoff
 805                         else if (!strcmp(argv[i], "-co") || !strcmp(argv[i], "--cutoff")) {
 806                                 if (i < argc - 1) {
 807                                         if (!GetFloat(argv[++i], &tempFloat)) {
 808                                                 cerr
 809                                                                 << "ERROR: Invalid floating-point value following option "
 810                                                                 << argv[i - 1] << ": " << argv[i] << endl;
 811                                                 exit(1);
 812                                         } else {
 813                                                 if (tempFloat < 0 || tempFloat > 1) {
 814                                                         cerr << "ERROR: For option " << argv[i - 1]
 815                                                                         << ", floating-point value must be between 0 and 1."
 816                                                                         << endl;
 817                                                         exit(1);
 818                                                 } else
 819                                                         cutoff = tempFloat;
 820                                         }
 821                                 } else {
 822                                         cerr << "ERROR: Floating-point value expected for option "
 823                                                         << argv[i] << endl;
 824                                         exit(1);
 825                                 }
 826                         }
 827
 828                         // verbose reporting
 829                         else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--verbose")) {
 830                                 enableVerbose = true;
 831                         }
 832
 833                         // alignment order
 834                         else if (!strcmp(argv[i], "-a")
 835                                         || !strcmp(argv[i], "--alignment-order")) {
 836                                 enableAlignOrder = true;
 837                         }
 838
 839                         //print out version
 840                         else if (!strcmp(argv[i], "-version")) {
 841                                 cerr << "MSAPROBS version " << VERSION << endl;
 842                                 exit(1);
 843                         }
 844                         // bad arguments
 845                         else {
 846                                 cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
 847                                 exit(1);
 848                         }
 849                 } else {
 850                         sequenceNames.push_back(string(argv[i]));
 851                 }
 852         }
 853
 854         /*check the output file name*/
 855         cerr << "-------------------------------------" << endl;
 856         if (alignOutFileName.length() == 0) {
 857                 cerr << "The final alignments will be printed out to STDOUT" << endl;
 858                 alignOutFile = &std::cout;
 859         } else {
 860                 cerr << "Open the output file " << alignOutFileName << endl;
 861                 alignOutFile = new ofstream(alignOutFileName.c_str(),
 862                                 ios::binary | ios::out | ios::trunc);
 863         }
 864         cerr << "-------------------------------------" << endl;
 865         return sequenceNames;
 866 }
 867
 868 /////////////////////////////////////////////////////////////////
 869 // ProcessTree()
 870 //
 871 // Process the tree recursively.  Returns the aligned sequences
 872 // corresponding to a node or leaf of the tree.
 873 /////////////////////////////////////////////////////////////////
 874 MultiSequence* MSA::ProcessTree(TreeNode *tree, MultiSequence *sequences,
 875                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 876                 const ProbabilisticModel &model) {
 877
 878         MultiSequence *result;
 879
 880         // check if this is a node of the alignment tree
 881         //if (tree->GetSequenceLabel() == -1){
 882         if (tree->leaf == NODE) {
 883                 MultiSequence *alignLeft = ProcessTree(tree->left, sequences,
 884                                 sparseMatrices, model);
 885                 MultiSequence *alignRight = ProcessTree(tree->right, sequences,
 886                                 sparseMatrices, model);
 887
 888                 assert(alignLeft);
 889                 assert(alignRight);
 890
 891                 result = AlignAlignments(alignLeft, alignRight, sparseMatrices, model);
 892                 assert(result);
 893
 894                 delete alignLeft;
 895                 delete alignRight;
 896         }
 897
 898         // otherwise, this is a leaf of the alignment tree
 899         else {
 900                 result = new MultiSequence();
 901                 assert(result);
 902                 //result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
 903                 result->AddSequence(sequences->GetSequence(tree->idx)->Clone());
 904         }
 905
 906         return result;
 907 }
 908
 909 /////////////////////////////////////////////////////////////////
 910 // ComputeFinalAlignment()
 911 //
 912 // Compute the final alignment by calling ProcessTree(), then
 913 // performing iterative refinement as needed.
 914 /////////////////////////////////////////////////////////////////
 915
 916 MultiSequence* MSA::ComputeFinalAlignment(MSAGuideTree*tree,
 917                 MultiSequence *sequences,
 918                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 919                 const ProbabilisticModel &model) {
 920         MultiSequence *alignment = ProcessTree(tree->getRoot(), sequences,
 921                         sparseMatrices, model);
 922
 923         SafeVector<int> oldOrdering;
 924         if (enableAlignOrder) {
 925                 for (int i = 0; i < alignment->GetNumSequences(); i++)
 926                         oldOrdering.push_back(alignment->GetSequence(i)->GetSortLabel());
 927                 alignment->SaveOrdering();
 928                 enableAlignOrder = false;
 929         }
 930
 931         // tree-based refinement
 932         // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree);
 933         /*
 934          int numSeqs = alignment->GetNumSequences();
 935          //if(numSeqs < numIterativeRefinementReps){
 936          for(int iter = 0; iter < 5; iter ++){
 937                 for(int i = 0; i < numSeqs - 1; i++){
 938                         DoIterativeRefinementTreeNode(sparseMatrices, model, alignment, i);
 939                  }
 940          }
 941          //}*/
 942         //Refinement return false:no improvement
 943         for (int i = 0; i < numIterativeRefinementReps; i++) {
 944                 DoIterativeRefinement(sparseMatrices, model, alignment);
 945         }
 946         cerr << endl;
 947
 948         if (oldOrdering.size() > 0) {
 949                 for (int i = 0; i < (int) oldOrdering.size(); i++) {
 950                         alignment->GetSequence(i)->SetSortLabel(oldOrdering[i]);
 951                 }
 952         }
 953
 954         // return final alignment
 955         return alignment;
 956 }
 957
 958 /////////////////////////////////////////////////////////////////
 959 // AlignAlignments()
 960 //
 961 // Returns the alignment of two MultiSequence objects.
 962 /////////////////////////////////////////////////////////////////
 963
 964 MultiSequence* MSA::AlignAlignments(MultiSequence *align1,
 965                 MultiSequence *align2,
 966                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 967                 const ProbabilisticModel &model) {
 968
 969         // print some info about the alignment
 970         if (enableVerbose) {
 971                 for (int i = 0; i < align1->GetNumSequences(); i++)
 972                         cerr << ((i == 0) ? "[" : ",")
 973                                         << align1->GetSequence(i)->GetLabel();
 974                 cerr << "] vs. ";
 975                 for (int i = 0; i < align2->GetNumSequences(); i++)
 976                         cerr << ((i == 0) ? "[" : ",")
 977                                         << align2->GetSequence(i)->GetLabel();
 978                 cerr << "]: ";
 979         }
 980 #if 1
 981         VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
 982 #else
 983         VF *posterior = model.BuildPosterior(getSeqsWeights(), align1, align2,
 984                         sparseMatrices, cutoff);
 985 #endif
 986         // compute an "accuracy" measure for the alignment before refinement
 987
 988         pair<SafeVector<char> *, float> alignment;
 989         //perform alignment
 990         alignment = model.ComputeAlignment(align1->GetSequence(0)->GetLength(),
 991                         align2->GetSequence(0)->GetLength(), *posterior);
 992
 993         delete posterior;
 994
 995         if (enableVerbose) {
 996
 997                 // compute total length of sequences
 998                 int totLength = 0;
 999                 for (int i = 0; i < align1->GetNumSequences(); i++)
1000                         for (int j = 0; j < align2->GetNumSequences(); j++)
1001                                 totLength += min(align1->GetSequence(i)->GetLength(),
1002                                                 align2->GetSequence(j)->GetLength());
1003
1004                 // give an "accuracy" measure for the alignment
1005                 cerr << alignment.second / totLength << endl;
1006         }
1007
1008         // now build final alignment
1009         MultiSequence *result = new MultiSequence();
1010         for (int i = 0; i < align1->GetNumSequences(); i++)
1011                 result->AddSequence(
1012                                 align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
1013         for (int i = 0; i < align2->GetNumSequences(); i++)
1014                 result->AddSequence(
1015                                 align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
1016         if (!enableAlignOrder)
1017                 result->SortByLabel();
1018
1019         // free temporary alignment
1020         delete alignment.first;
1021
1022         return result;
1023 }
1024
1025 /////////////////////////////////////////////////////////////////
1026 // DoRelaxation()
1027 //
1028 // Performs one round of the weighted probabilistic consistency transformation.
1029 //                     1
1030 /////////////////////////////////////////////////////////////////
1031
1032 SafeVector<SafeVector<SparseMatrix *> > MSA::DoRelaxation(float* seqsWeights,
1033                 MultiSequence *sequences,
1034                 SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1035         const int numSeqs = sequences->GetNumSequences();
1036
1037         SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices(numSeqs,
1038                         SafeVector<SparseMatrix *>(numSeqs, NULL));
1039
1040         // for every pair of sequences
1041 #ifdef _OPENMP
1042         int pairIdx;
1043 #pragma omp parallel for private(pairIdx) default(shared) schedule(dynamic)
1044         for(pairIdx = 0; pairIdx < numPairs; pairIdx++) {
1045                 int i = seqsPairs[pairIdx].seq1;
1046                 int j = seqsPairs[pairIdx].seq2;
1047                 float wi = seqsWeights[i];
1048                 float wj = seqsWeights[j];
1049 #else
1050         for (int i = 0; i < numSeqs; i++) {
1051                 float wi = seqsWeights[i];
1052                 for (int j = i + 1; j < numSeqs; j++) {
1053                         float wj = seqsWeights[j];
1054 #endif
1055                         Sequence *seq1 = sequences->GetSequence(i);
1056                         Sequence *seq2 = sequences->GetSequence(j);
1057
1058                         if (enableVerbose) {
1059 #ifdef _OPENMP
1060 #pragma omp critical
1061 #endif
1062                                 cerr << "Relaxing (" << i + 1 << ") " << seq1->GetHeader()
1063                                                 << " vs. " << "(" << j + 1 << ") " << seq2->GetHeader()
1064                                                 << ": ";
1065                         }
1066                         // get the original posterior matrix
1067                         VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior();
1068                         assert(posteriorPtr);
1069                         VF &posterior = *posteriorPtr;
1070
1071                         const int seq1Length = seq1->GetLength();
1072                         const int seq2Length = seq2->GetLength();
1073
1074                         // contribution from the summation where z = x and z = y
1075                         float w = wi * wi * wj + wi * wj * wj;
1076                         float sumW = w;
1077                         for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
1078                                 //posterior[k] = w*posterior[k];
1079                                 posterior[k] += posterior[k];
1080                         }
1081
1082                         if (enableVerbose)
1083                                 cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
1084
1085                         // contribution from all other sequences
1086                         for (int k = 0; k < numSeqs; k++) {
1087                                 if (k != i && k != j) {
1088                                         float wk = seqsWeights[k];
1089                                         float w = wi * wj * wk;
1090                                         sumW += w;
1091                                         if (k < i)
1092                                                 Relax1(w, sparseMatrices[k][i], sparseMatrices[k][j],
1093                                                                 posterior);
1094                                         else if (k > i && k < j)
1095                                                 Relax(w, sparseMatrices[i][k], sparseMatrices[k][j],
1096                                                                 posterior);
1097                                         else {
1098                                                 SparseMatrix *temp =
1099                                                                 sparseMatrices[j][k]->ComputeTranspose();
1100                                                 Relax(w, sparseMatrices[i][k], temp, posterior);
1101                                                 delete temp;
1102                                         }
1103                                 }
1104                         }
1105                         //cerr<<"sumW "<<sumW<<endl;
1106                         for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++) {
1107                                 //posterior[k] /= sumW;
1108                                 posterior[k] /= numSeqs;
1109                         }
1110                         // mask out positions not originally in the posterior matrix
1111                         SparseMatrix *matXY = sparseMatrices[i][j];
1112                         for (int y = 0; y <= seq2Length; y++)
1113                                 posterior[y] = 0;
1114                         for (int x = 1; x <= seq1Length; x++) {
1115                                 SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
1116                                 SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
1117                                 VF::iterator base = posterior.begin() + x * (seq2Length + 1);
1118                                 int curr = 0;
1119                                 while (XYptr != XYend) {
1120
1121                                         // zero out all cells until the first filled column
1122                                         while (curr < XYptr->first) {
1123                                                 base[curr] = 0;
1124                                                 curr++;
1125                                         }
1126
1127                                         // now, skip over this column
1128                                         curr++;
1129                                         ++XYptr;
1130                                 }
1131
1132                                 // zero out cells after last column
1133                                 while (curr <= seq2Length) {
1134                                         base[curr] = 0;
1135                                         curr++;
1136                                 }
1137                         }
1138
1139                         // save the new posterior matrix
1140                         newSparseMatrices[i][j] = new SparseMatrix(seq1->GetLength(),
1141                                         seq2->GetLength(), posterior);
1142                         newSparseMatrices[j][i] = NULL;
1143
1144                         if (enableVerbose)
1145                                 cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
1146
1147                         delete posteriorPtr;
1148
1149                         if (enableVerbose)
1150                                 cerr << "done." << endl;
1151 #ifndef _OPENMP
1152                 }
1153 #endif
1154         }
1155
1156         return newSparseMatrices;
1157 }
1158
1159 /////////////////////////////////////////////////////////////////
1160 // Relax()
1161 //
1162 // Computes the consistency transformation for a single sequence
1163 // z, and adds the transformed matrix to "posterior".
1164 /////////////////////////////////////////////////////////////////
1165
1166 void MSA::Relax(float weight, SparseMatrix *matXZ, SparseMatrix *matZY,
1167                 VF &posterior) {
1168
1169         assert(matXZ);
1170         assert(matZY);
1171
1172         int lengthX = matXZ->GetSeq1Length();
1173         int lengthY = matZY->GetSeq2Length();
1174         assert(matXZ->GetSeq2Length() == matZY->GetSeq1Length());
1175
1176         // for every x[i]
1177         for (int i = 1; i <= lengthX; i++) {
1178                 SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
1179                 SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
1180
1181                 VF::iterator base = posterior.begin() + i * (lengthY + 1);
1182
1183                 // iterate through all x[i]-z[k]
1184                 while (XZptr != XZend) {
1185                         SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
1186                         SafeVector<PIF>::iterator ZYend = ZYptr
1187                                         + matZY->GetRowSize(XZptr->first);
1188                         const float XZval = XZptr->second;
1189
1190                         // iterate through all z[k]-y[j]
1191                         while (ZYptr != ZYend) {
1192                                 //base[ZYptr->first] += weight * XZval * ZYptr->second;
1193                                 base[ZYptr->first] += XZval * ZYptr->second;
1194                                 ZYptr++;
1195                         }
1196                         XZptr++;
1197                 }
1198         }
1199 }
1200
1201 /////////////////////////////////////////////////////////////////
1202 // Relax1()
1203 //
1204 // Computes the consistency transformation for a single sequence
1205 // z, and adds the transformed matrix to "posterior".
1206 /////////////////////////////////////////////////////////////////
1207
1208 void MSA::Relax1(float weight, SparseMatrix *matZX, SparseMatrix *matZY,
1209                 VF &posterior) {
1210
1211         assert(matZX);
1212         assert(matZY);
1213
1214         int lengthZ = matZX->GetSeq1Length();
1215         int lengthY = matZY->GetSeq2Length();
1216
1217         // for every z[k]
1218         for (int k = 1; k <= lengthZ; k++) {
1219                 SafeVector<PIF>::iterator ZXptr = matZX->GetRowPtr(k);
1220                 SafeVector<PIF>::iterator ZXend = ZXptr + matZX->GetRowSize(k);
1221
1222                 // iterate through all z[k]-x[i]
1223                 while (ZXptr != ZXend) {
1224                         SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(k);
1225                         SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(k);
1226                         const float ZXval = ZXptr->second;
1227                         VF::iterator base = posterior.begin()
1228                                         + ZXptr->first * (lengthY + 1);
1229
1230                         // iterate through all z[k]-y[j]
1231                         while (ZYptr != ZYend) {
1232                                 //base[ZYptr->first] += weight * ZXval * ZYptr->second;
1233                                 base[ZYptr->first] += ZXval * ZYptr->second;
1234                                 ZYptr++;
1235                         }
1236                         ZXptr++;
1237                 }
1238         }
1239 }
1240 /////////////////////////////////////////////////////////////////
1241 // DoIterativeRefinement()
1242 //
1243 // Performs a single round of randomized partionining iterative
1244 // refinement.
1245 /////////////////////////////////////////////////////////////////
1246
1247 void MSA::DoIterativeRefinement(
1248                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1249                 const ProbabilisticModel &model, MultiSequence* &alignment) {
1250         set<int> groupOne, groupTwo;
1251         int numSeqs = alignment->GetNumSequences();
1252
1253         // create two separate groups
1254         for (int i = 0; i < numSeqs; i++) {
1255                 int index = rand();
1256                 if (index % 2) {
1257                         groupOne.insert(i);
1258                 } else {
1259                         groupTwo.insert(i);
1260                 }
1261         }
1262         if (groupOne.empty() || groupTwo.empty()) return;
1263
1264         // project into the two groups
1265         MultiSequence *groupOneSeqs = alignment->Project(groupOne);
1266         assert(groupOneSeqs);
1267         MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
1268         assert(groupTwoSeqs);
1269 /*
1270 //start add by Yongtao
1271 #if 0
1272         VF *posterior = model.BuildPosterior (groupOneSeqs, groupTwoSeqs, sparseMatrices, cutoff);
1273 #else
1274         VF *posterior = model.BuildPosterior(getSeqsWeights(), groupOneSeqs, groupTwoSeqs,
1275                         sparseMatrices, cutoff);
1276 #endif
1277
1278         // compute an "accuracy" measure for the alignment before refinement
1279         SafeVector<SafeVector<char>::iterator> oldOnePtrs(groupOne.size());
1280         SafeVector<SafeVector<char>::iterator> oldTwoPtrs(groupTwo.size());
1281         int i=0;
1282         for (set<int>::const_iterator iter = groupOne.begin();
1283                         iter != groupOne.end(); ++iter) {
1284                 oldOnePtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr();
1285         }
1286         i=0;
1287         for (set<int>::const_iterator iter = groupTwo.begin();
1288                         iter != groupTwo.end(); ++iter) {
1289                 oldTwoPtrs[i++] = alignment->GetSequence(*iter)->GetDataPtr();
1290         }
1291
1292         VF &posteriorArr = *posterior;
1293         int oldLength = alignment->GetSequence(0)->GetLength();
1294         int groupOneindex=0; int groupTwoindex=0;
1295         float accuracy_before = 0;
1296         for (int i = 1; i <= oldLength; i++) {
1297                 // check to see if there is a gap in every sequence of the set
1298                 bool foundOne = false;
1299                 for (int j = 0; !foundOne && j < (int) groupOne.size(); j++)
1300                         foundOne = (oldOnePtrs[j][i] != '-');
1301                 // if not, then this column counts towards the sequence length
1302                 if (foundOne) groupOneindex ++;
1303                 bool foundTwo = false;
1304                 for (int j = 0; !foundTwo && j < (int) groupTwo.size(); j++)
1305                         foundTwo = (oldTwoPtrs[j][i] != '-');
1306                 // if not, then this column counts towards the sequence length
1307                 if (foundTwo) groupTwoindex ++;
1308                 if(foundOne && foundTwo) accuracy_before +=
1309                                 posteriorArr[groupOneindex * (groupTwoSeqs->GetSequence(0)->GetLength() + 1) + groupTwoindex];
1310         }
1311
1312         pair<SafeVector<char> *, float> refinealignment;
1313         //perform alignment
1314         refinealignment = model.ComputeAlignment(groupOneSeqs->GetSequence(0)->GetLength(),
1315                         groupTwoSeqs->GetSequence(0)->GetLength(), *posterior);
1316         delete posterior;
1317         // now build final alignment
1318         MultiSequence *result = new MultiSequence();
1319         //compare accuracy measure before and after refinement
1320         //if (refinealignment.second > accuracy_before) {
1321                 //cerr<<"Before:" << accuracy_before<<" after: "<< refinealignment.second<< endl;
1322                 for (int i = 0; i < groupOneSeqs->GetNumSequences(); i++)
1323                         result->AddSequence(
1324                                 groupOneSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'X'));
1325                 for (int i = 0; i < groupTwoSeqs->GetNumSequences(); i++)
1326                         result->AddSequence(
1327                                 groupTwoSeqs->GetSequence(i)->AddGaps(refinealignment.first, 'Y'));
1328                 // free temporary alignment
1329                 delete refinealignment.first;
1330                 delete alignment;
1331                 alignment = result;
1332
1333         }
1334         else{
1335                 if(numIterativeRefinementReps < 8*numSeqs) numIterativeRefinementReps++;
1336                 delete groupOneSeqs;
1337                 delete groupTwoSeqs;
1338                 return false;
1339         }
1340   */
1341 //end add by yongtao
1342
1343         delete alignment;
1344         // realign
1345         alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices, model); //original
1346         delete groupOneSeqs;
1347         delete groupTwoSeqs;
1348
1349 }
1350
1351 void MSA::DoIterativeRefinementTreeNode(
1352                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1353                 const ProbabilisticModel &model, MultiSequence* &alignment,
1354                 int nodeIndex) {
1355         set<int> groupOne, groupTwo;
1356         int numSeqs = alignment->GetNumSequences();
1357
1358         vector<bool> inGroup1;
1359         inGroup1.resize(numSeqs);
1360         for (int i = 0; i < numSeqs; i++) {
1361                 inGroup1[i] = false;
1362         }
1363
1364         AlignmentOrder* orders = this->tree->getAlignOrders();
1365         AlignmentOrder* order = &orders[nodeIndex];
1366         for (int i = 0; i < order->leftNum; i++) {
1367                 int si = order->leftLeafs[i];
1368                 inGroup1[si] = true;
1369         }
1370         for (int i = 0; i < order->rightNum; i++) {
1371                 int si = order->rightLeafs[i];
1372                 inGroup1[si] = true;
1373         }
1374         // create two separate groups
1375         for (int i = 0; i < numSeqs; i++) {
1376                 if (inGroup1[i]) {
1377                         groupOne.insert(i);
1378                 } else {
1379                         groupTwo.insert(i);
1380                 }
1381         }
1382         if (groupOne.empty() || groupTwo.empty())
1383                 return;
1384
1385         // project into the two groups
1386         MultiSequence *groupOneSeqs = alignment->Project(groupOne);
1387         assert(groupOneSeqs);
1388         MultiSequence *groupTwoSeqs = alignment->Project(groupTwo);
1389         assert(groupTwoSeqs);
1390         delete alignment;
1391
1392         // realign
1393         alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices,
1394                         model);
1395
1396         delete groupOneSeqs;
1397         delete groupTwoSeqs;
1398 }
1399
1400 /////////////////////////////////////////////////////////////////
1401 // WriteAnnotation()
1402 //
1403 // Computes annotation for multiple alignment and write values
1404 // to a file.
1405 /////////////////////////////////////////////////////////////////
1406
1407 void MSA::WriteAnnotation(MultiSequence *alignment,
1408                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1409         ofstream outfile(annotationFilename.c_str());
1410
1411         if (outfile.fail()) {
1412                 cerr << "ERROR: Unable to write annotation file." << endl;
1413                 exit(1);
1414         }
1415
1416         const int alignLength = alignment->GetSequence(0)->GetLength();
1417         const int numSeqs = alignment->GetNumSequences();
1418
1419         SafeVector<int> position(numSeqs, 0);
1420         SafeVector<SafeVector<char>::iterator> seqs(numSeqs);
1421         for (int i = 0; i < numSeqs; i++)
1422                 seqs[i] = alignment->GetSequence(i)->GetDataPtr();
1423         SafeVector<pair<int, int> > active;
1424         active.reserve(numSeqs);
1425
1426         SafeVector<int> lab;
1427         for (int i = 0; i < numSeqs; i++)
1428                 lab.push_back(alignment->GetSequence(i)->GetSortLabel());
1429
1430         // for every column
1431         for (int i = 1; i <= alignLength; i++) {
1432
1433                 // find all aligned residues in this particular column
1434                 active.clear();
1435                 for (int j = 0; j < numSeqs; j++) {
1436                         if (seqs[j][i] != '-') {
1437                                 active.push_back(make_pair(lab[j], ++position[j]));
1438                         }
1439                 }
1440
1441                 sort(active.begin(), active.end());
1442                 outfile << setw(4) << ComputeScore(active, sparseMatrices) << endl;
1443         }
1444
1445         outfile.close();
1446 }
1447
1448 /////////////////////////////////////////////////////////////////
1449 // ComputeScore()
1450 //
1451 // Computes the annotation score for a particular column.
1452 /////////////////////////////////////////////////////////////////
1453
1454 int MSA::ComputeScore(const SafeVector<pair<int, int> > &active,
1455                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices) {
1456
1457         if (active.size() <= 1)
1458                 return 0;
1459
1460         // ALTERNATIVE #1: Compute the average alignment score.
1461
1462         float val = 0;
1463         for (int i = 0; i < (int) active.size(); i++) {
1464                 for (int j = i + 1; j < (int) active.size(); j++) {
1465                         val += sparseMatrices[active[i].first][active[j].first]->GetValue(
1466                                         active[i].second, active[j].second);
1467                 }
1468         }
1469
1470         return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));
1471
1472 }