binaries/src/probcons/Main.cc

   1 /////////////////////////////////////////////////////////////////
   2 // Main.cc
   3 //
   4 // Main routines for PROBCONS program.
   5 /////////////////////////////////////////////////////////////////
   6
   7 #include "SafeVector.h"
   8 #include "MultiSequence.h"
   9 #include "Defaults.h"
  10 #include "ScoreType.h"
  11 #include "ProbabilisticModel.h"
  12 #include "EvolutionaryTree.h"
  13 #include "SparseMatrix.h"
  14 #include <string>
  15 #include <sstream>
  16 #include <iomanip>
  17 #include <iostream>
  18 #include <list>
  19 #include <set>
  20 #include <algorithm>
  21 #include <climits>
  22 #include <cstdio>
  23 #include <cstdlib>
  24 #include <cerrno>
  25 #include <iomanip>
  26
  27 string parametersInputFilename = "";
  28 string parametersOutputFilename = "no training";
  29 string annotationFilename = "";
  30
  31 bool enableTraining = false;
  32 bool enableVerbose = false;
  33 bool enableAllPairs = false;
  34 bool enableAnnotation = false;
  35 bool enableViterbi = false;
  36 bool enableClustalWOutput = false;
  37 bool enableTrainEmissions = false;
  38 bool enableAlignOrder = false;
  39 int numConsistencyReps = 2;
  40 int numPreTrainingReps = 0;
  41 int numIterativeRefinementReps = 100;
  42
  43 float cutoff = 0;
  44 float gapOpenPenalty = 0;
  45 float gapContinuePenalty = 0;
  46
  47 VF initDistrib (NumMatrixTypes);
  48 VF gapOpen (2*NumInsertStates);
  49 VF gapExtend (2*NumInsertStates);
  50 VVF emitPairs (256, VF (256, 1e-10));
  51 VF emitSingle (256, 1e-5);
  52
  53 string alphabet = alphabetDefault;
  54
  55 const int MIN_PRETRAINING_REPS = 0;
  56 const int MAX_PRETRAINING_REPS = 20;
  57 const int MIN_CONSISTENCY_REPS = 0;
  58 const int MAX_CONSISTENCY_REPS = 5;
  59 const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
  60 const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
  61
  62 /////////////////////////////////////////////////////////////////
  63 // Function prototypes
  64 /////////////////////////////////////////////////////////////////
  65
  66 void PrintHeading();
  67 void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
  68                       const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename);
  69 MultiSequence *DoAlign (MultiSequence *sequence, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, VF &gapExtend,
  70                         VVF &emitPairs, VF &emitSingle);
  71 SafeVector<string> ParseParams (int argc, char **argv);
  72 void ReadParameters ();
  73 MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
  74                                       const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
  75                                       const ProbabilisticModel &model);
  76 MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
  77                                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
  78                                 const ProbabilisticModel &model);
  79 SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences,
  80                                                       SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
  81 void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);
  82 void Relax1 (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);
  83
  84 set<int> GetSubtree (const TreeNode *tree);
  85 void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
  86                               const ProbabilisticModel &model, MultiSequence* &alignment,
  87                               const TreeNode *tree);
  88 void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
  89                             const ProbabilisticModel &model, MultiSequence* &alignment);
  90 void WriteAnnotation (MultiSequence *alignment,
  91                       const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
  92 int ComputeScore (const SafeVector<pair<int, int> > &active,
  93                   const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
  94
  95
  96 /////////////////////////////////////////////////////////////////
  97 // main()
  98 //
  99 // Calls all initialization routines and runs the PROBCONS
 100 // aligner.
 101 /////////////////////////////////////////////////////////////////
 102
 103 int main (int argc, char **argv){
 104
 105   // print PROBCONS heading
 106   PrintHeading();
 107
 108   // parse program parameters
 109   SafeVector<string> sequenceNames = ParseParams (argc, argv);
 110   ReadParameters();
 111   PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
 112
 113   // now, we'll process all the files given as input.  If we are given
 114   // several filenames as input, then we'll load all of those sequences
 115   // simultaneously, as long as we're not training.  On the other hand,
 116   // if we are training, then we'll treat each file as a separate
 117   // training instance
 118
 119   // if we are training
 120   if (enableTraining){
 121
 122     // build new model for aligning
 123     ProbabilisticModel model (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
 124
 125     // prepare to average parameters
 126     for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
 127     for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
 128     for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
 129     if (enableTrainEmissions){
 130       for (int i = 0; i < (int) emitPairs.size(); i++)
 131         for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
 132       for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
 133     }
 134
 135     // align each file individually
 136     for (int i = 0; i < (int) sequenceNames.size(); i++){
 137
 138       VF thisInitDistrib (NumMatrixTypes);
 139       VF thisGapOpen (2*NumInsertStates);
 140       VF thisGapExtend (2*NumInsertStates);
 141       VVF thisEmitPairs (256, VF (256, 1e-10));
 142       VF thisEmitSingle (256, 1e-5);
 143
 144       // load sequence file
 145       MultiSequence *sequences = new MultiSequence(); assert (sequences);
 146       cerr << "Loading sequence file: " << sequenceNames[i] << endl;
 147       sequences->LoadMFA (sequenceNames[i], true);
 148
 149       // align sequences
 150       DoAlign (sequences, model, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle);
 151
 152       // add in contribution of the derived parameters
 153       for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
 154       for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
 155       for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
 156       if (enableTrainEmissions){
 157         for (int i = 0; i < (int) emitPairs.size(); i++)
 158           for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
 159         for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
 160       }
 161
 162       delete sequences;
 163     }
 164
 165     // compute new parameters and print them out
 166     for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= (int) sequenceNames.size();
 167     for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= (int) sequenceNames.size();
 168     for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= (int) sequenceNames.size();
 169     if (enableTrainEmissions){
 170       for (int i = 0; i < (int) emitPairs.size(); i++)
 171         for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= (int) sequenceNames.size();
 172       for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= sequenceNames.size();
 173     }
 174
 175     PrintParameters ("Trained parameter set:",
 176                      initDistrib, gapOpen, gapExtend, emitPairs, emitSingle,
 177                      parametersOutputFilename.c_str());
 178   }
 179
 180   // if we are not training, we must simply want to align some sequences
 181   else {
 182
 183     // load all files together
 184     MultiSequence *sequences = new MultiSequence(); assert (sequences);
 185     for (int i = 0; i < (int) sequenceNames.size(); i++){
 186       cerr << "Loading sequence file: " << sequenceNames[i] << endl;
 187       sequences->LoadMFA (sequenceNames[i], true);
 188     }
 189
 190     // do all "pre-training" repetitions first
 191     for (int ct = 0; ct < numPreTrainingReps; ct++){
 192       enableTraining = true;
 193
 194       // build new model for aligning
 195       ProbabilisticModel model (initDistrib, gapOpen, gapExtend,
 196                                 emitPairs, emitSingle);
 197
 198       // do initial alignments
 199       DoAlign (sequences, model, initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
 200
 201       // print new parameters
 202       PrintParameters ("Recomputed parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
 203
 204       enableTraining = false;
 205     }
 206
 207     // now, we can perform the alignments and write them out
 208     MultiSequence *alignment = DoAlign (sequences,
 209                                         ProbabilisticModel (initDistrib, gapOpen, gapExtend,  emitPairs, emitSingle),
 210                                         initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
 211
 212     if (!enableAllPairs){
 213       if (enableClustalWOutput)
 214         alignment->WriteALN (cout);
 215       else
 216         alignment->WriteMFA (cout);
 217     }
 218     delete alignment;
 219     delete sequences;
 220
 221   }
 222 }
 223
 224 /////////////////////////////////////////////////////////////////
 225 // PrintHeading()
 226 //
 227 // Prints heading for PROBCONS program.
 228 /////////////////////////////////////////////////////////////////
 229
 230 void PrintHeading (){
 231   cerr << endl
 232        << "PROBCONS version " << VERSION << " - align multiple protein sequences and print to standard output" << endl
 233        << "Written by Chuong Do" << endl
 234        << endl;
 235 }
 236
 237 /////////////////////////////////////////////////////////////////
 238 // PrintParameters()
 239 //
 240 // Prints PROBCONS parameters to STDERR.  If a filename is
 241 // specified, then the parameters are also written to the file.
 242 /////////////////////////////////////////////////////////////////
 243
 244 void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
 245                       const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename){
 246
 247   // print parameters to the screen
 248   cerr << message << endl
 249        << "    initDistrib[] = { ";
 250   for (int i = 0; i < NumMatrixTypes; i++) cerr << setprecision (10) << initDistrib[i] << " ";
 251   cerr << "}" << endl
 252        << "        gapOpen[] = { ";
 253   for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapOpen[i] << " ";
 254   cerr << "}" << endl
 255        << "      gapExtend[] = { ";
 256   for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapExtend[i] << " ";
 257   cerr << "}" << endl
 258        << endl;
 259
 260   /*
 261   for (int i = 0; i < 5; i++){
 262     for (int j = 0; j <= i; j++){
 263       cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " ";
 264     }
 265     cerr << endl;
 266     }*/
 267
 268   // if a file name is specified
 269   if (filename){
 270
 271     // attempt to open the file for writing
 272     FILE *file = fopen (filename, "w");
 273     if (!file){
 274       cerr << "ERROR: Unable to write parameter file: " << filename << endl;
 275       exit (1);
 276     }
 277
 278     // if successful, then write the parameters to the file
 279     for (int i = 0; i < NumMatrixTypes; i++) fprintf (file, "%.10f ", initDistrib[i]); fprintf (file, "\n");
 280     for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapOpen[i]); fprintf (file, "\n");
 281     for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapExtend[i]); fprintf (file, "\n");
 282     fprintf (file, "%s\n", alphabet.c_str());
 283     for (int i = 0; i < (int) alphabet.size(); i++){
 284       for (int j = 0; j <= i; j++)
 285         fprintf (file, "%.10f ", emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);
 286       fprintf (file, "\n");
 287     }
 288     for (int i = 0; i < (int) alphabet.size(); i++)
 289       fprintf (file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);
 290     fprintf (file, "\n");
 291     fclose (file);
 292   }
 293 }
 294
 295 /////////////////////////////////////////////////////////////////
 296 // DoAlign()
 297 //
 298 // First computes all pairwise posterior probability matrices.
 299 // Then, computes new parameters if training, or a final
 300 // alignment, otherwise.
 301 /////////////////////////////////////////////////////////////////
 302
 303 MultiSequence *DoAlign (MultiSequence *sequences, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen,
 304                         VF &gapExtend, VVF &emitPairs, VF &emitSingle){
 305
 306   assert (sequences);
 307
 308   const int numSeqs = sequences->GetNumSequences();
 309   VVF distances (numSeqs, VF (numSeqs, 0));
 310   SafeVector<SafeVector<SparseMatrix *> > sparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
 311
 312
 313
 314   if (enableTraining){
 315     // prepare to average parameters
 316     for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
 317     for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
 318     for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
 319     if (enableTrainEmissions){
 320       for (int i = 0; i < (int) emitPairs.size(); i++)
 321         for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
 322       for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
 323     }
 324   }
 325
 326   // skip posterior calculations if we just want to do Viterbi alignments
 327   if (!enableViterbi){
 328
 329     // do all pairwise alignments for posterior probability matrices
 330     for (int a = 0; a < numSeqs-1; a++){
 331       for (int b = a+1; b < numSeqs; b++){
 332         Sequence *seq1 = sequences->GetSequence (a);
 333         Sequence *seq2 = sequences->GetSequence (b);
 334
 335         // verbose output
 336         if (enableVerbose)
 337           cerr << "Computing posterior matrix: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
 338                << "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
 339
 340         // compute forward and backward probabilities
 341         VF *forward = model.ComputeForwardMatrix (seq1, seq2); assert (forward);
 342         VF *backward = model.ComputeBackwardMatrix (seq1, seq2); assert (backward);
 343
 344         // if we are training, then we'll simply want to compute the
 345         // expected counts for each region within the matrix separately;
 346         // otherwise, we'll need to put all of the regions together and
 347         // assemble a posterior probability match matrix
 348
 349         // so, if we're training
 350         if (enableTraining){
 351
 352           // compute new parameters
 353           VF thisInitDistrib (NumMatrixTypes);
 354           VF thisGapOpen (2*NumInsertStates);
 355           VF thisGapExtend (2*NumInsertStates);
 356           VVF thisEmitPairs (256, VF (256, 1e-10));
 357           VF thisEmitSingle (256, 1e-5);
 358
 359           model.ComputeNewParameters (seq1, seq2, *forward, *backward, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle, enableTrainEmissions);
 360
 361           // add in contribution of the derived parameters
 362           for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
 363           for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
 364           for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
 365           if (enableTrainEmissions){
 366             for (int i = 0; i < (int) emitPairs.size(); i++)
 367               for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
 368             for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
 369           }
 370
 371           // let us know that we're done.
 372           if (enableVerbose) cerr << "done." << endl;
 373         }
 374         else {
 375
 376           // compute posterior probability matrix
 377           VF *posterior = model.ComputePosteriorMatrix (seq1, seq2, *forward, *backward); assert (posterior);
 378
 379           // compute sparse representations
 380           sparseMatrices[a][b] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), *posterior);
 381           sparseMatrices[b][a] = NULL;
 382
 383           if (!enableAllPairs){
 384             // perform the pairwise sequence alignment
 385             pair<SafeVector<char> *, float> alignment = model.ComputeAlignment (seq1->GetLength(),
 386                                                                                 seq2->GetLength(),
 387                                                                                 *posterior);
 388
 389             // compute "expected accuracy" distance for evolutionary tree computation
 390             float distance = alignment.second / min (seq1->GetLength(), seq2->GetLength());
 391             distances[a][b] = distances[b][a] = distance;
 392
 393             if (enableVerbose)
 394               cerr << setprecision (10) << distance << endl;
 395
 396               delete alignment.first;
 397           }
 398           else {
 399             // let us know that we're done.
 400             if (enableVerbose) cerr << "done." << endl;
 401           }
 402
 403           delete posterior;
 404         }
 405
 406         delete forward;
 407         delete backward;
 408       }
 409     }
 410   }
 411
 412   // now average out parameters derived
 413   if (enableTraining){
 414
 415     // compute new parameters
 416     for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= numSeqs * (numSeqs - 1) / 2;
 417     for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= numSeqs * (numSeqs - 1) / 2;
 418     for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= numSeqs * (numSeqs - 1) / 2;
 419
 420     if (enableTrainEmissions){
 421       for (int i = 0; i < (int) emitPairs.size(); i++)
 422         for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= numSeqs * (numSeqs - 1) / 2;
 423       for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= numSeqs * (numSeqs - 1) / 2;
 424     }
 425   }
 426
 427   // see if we still want to do some alignments
 428   else {
 429
 430     if (!enableViterbi){
 431
 432       // perform the consistency transformation the desired number of times
 433       for (int r = 0; r < numConsistencyReps; r++){
 434         SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices = DoRelaxation (sequences, sparseMatrices);
 435
 436         // now replace the old posterior matrices
 437         for (int i = 0; i < numSeqs; i++){
 438           for (int j = 0; j < numSeqs; j++){
 439             delete sparseMatrices[i][j];
 440             sparseMatrices[i][j] = newSparseMatrices[i][j];
 441           }
 442         }
 443       }
 444     }
 445
 446     MultiSequence *finalAlignment = NULL;
 447
 448     if (enableAllPairs){
 449       for (int a = 0; a < numSeqs-1; a++){
 450         for (int b = a+1; b < numSeqs; b++){
 451           Sequence *seq1 = sequences->GetSequence (a);
 452           Sequence *seq2 = sequences->GetSequence (b);
 453
 454           if (enableVerbose)
 455             cerr << "Performing pairwise alignment: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
 456                  << "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
 457
 458
 459           // perform the pairwise sequence alignment
 460           pair<SafeVector<char> *, float> alignment;
 461           if (enableViterbi)
 462             alignment = model.ComputeViterbiAlignment (seq1, seq2);
 463           else {
 464
 465             // build posterior matrix
 466             VF *posterior = sparseMatrices[a][b]->GetPosterior(); assert (posterior);
 467             int length = (seq1->GetLength() + 1) * (seq2->GetLength() + 1);
 468             for (int i = 0; i < length; i++) (*posterior)[i] -= cutoff;
 469
 470             alignment = model.ComputeAlignment (seq1->GetLength(), seq2->GetLength(), *posterior);
 471             delete posterior;
 472           }
 473
 474           // write pairwise alignments
 475           string name = seq1->GetHeader() + "-" + seq2->GetHeader() + (enableClustalWOutput ? ".aln" : ".fasta");
 476           ofstream outfile (name.c_str());
 477
 478           MultiSequence *result = new MultiSequence();
 479           result->AddSequence (seq1->AddGaps(alignment.first, 'X'));
 480           result->AddSequence (seq2->AddGaps(alignment.first, 'Y'));
 481           if (enableClustalWOutput)
 482             result->WriteALN (outfile);
 483           else
 484             result->WriteMFA (outfile);
 485
 486           outfile.close();
 487
 488           delete alignment.first;
 489         }
 490       }
 491     }
 492
 493     // now if we still need to do a final multiple alignment
 494     else {
 495
 496       if (enableVerbose)
 497         cerr << endl;
 498
 499       // compute the evolutionary tree
 500       TreeNode *tree = TreeNode::ComputeTree (distances);
 501
 502       tree->Print (cerr, sequences);
 503       cerr << endl;
 504
 505       // make the final alignment
 506       finalAlignment = ComputeFinalAlignment (tree, sequences, sparseMatrices, model);
 507
 508       // build annotation
 509       if (enableAnnotation){
 510         WriteAnnotation (finalAlignment, sparseMatrices);
 511       }
 512
 513       delete tree;
 514     }
 515
 516     if (!enableViterbi){
 517       // delete sparse matrices
 518       for (int a = 0; a < numSeqs-1; a++){
 519         for (int b = a+1; b < numSeqs; b++){
 520           delete sparseMatrices[a][b];
 521           delete sparseMatrices[b][a];
 522         }
 523       }
 524     }
 525
 526     return finalAlignment;
 527   }
 528
 529   return NULL;
 530 }
 531
 532 /////////////////////////////////////////////////////////////////
 533 // GetInteger()
 534 //
 535 // Attempts to parse an integer from the character string given.
 536 // Returns true only if no parsing error occurs.
 537 /////////////////////////////////////////////////////////////////
 538
 539 bool GetInteger (char *data, int *val){
 540   char *endPtr;
 541   long int retVal;
 542
 543   assert (val);
 544
 545   errno = 0;
 546   retVal = strtol (data, &endPtr, 0);
 547   if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
 548   if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN)) return false;
 549   if (retVal < (long) INT_MIN || retVal > (long) INT_MAX) return false;
 550   *val = (int) retVal;
 551   return true;
 552 }
 553
 554 /////////////////////////////////////////////////////////////////
 555 // GetFloat()
 556 //
 557 // Attempts to parse a float from the character string given.
 558 // Returns true only if no parsing error occurs.
 559 /////////////////////////////////////////////////////////////////
 560
 561 bool GetFloat (char *data, float *val){
 562   char *endPtr;
 563   double retVal;
 564
 565   assert (val);
 566
 567   errno = 0;
 568   retVal = strtod (data, &endPtr);
 569   if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
 570   if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0)) return false;
 571   *val = (float) retVal;
 572   return true;
 573 }
 574
 575 /////////////////////////////////////////////////////////////////
 576 // ParseParams()
 577 //
 578 // Parse all command-line options.
 579 /////////////////////////////////////////////////////////////////
 580
 581 SafeVector<string> ParseParams (int argc, char **argv){
 582
 583   if (argc < 2){
 584
 585     cerr << "PROBCONS comes with ABSOLUTELY NO WARRANTY.  This is free software, and" << endl
 586          << "you are welcome to redistribute it under certain conditions.  See the" << endl
 587          << "file COPYING.txt for details." << endl
 588          << endl
 589          << "Usage:" << endl
 590          << "       probcons [OPTION]... [MFAFILE]..." << endl
 591          << endl
 592          << "Description:" << endl
 593          << "       Align sequences in MFAFILE(s) and print result to standard output" << endl
 594          << endl
 595          << "       -clustalw" << endl
 596          << "              use CLUSTALW output format instead of MFA" << endl
 597          << endl
 598          << "       -c, --consistency REPS" << endl
 599          << "              use " << MIN_CONSISTENCY_REPS << " <= REPS <= " << MAX_CONSISTENCY_REPS
 600          << " (default: " << numConsistencyReps << ") passes of consistency transformation" << endl
 601          << endl
 602          << "       -ir, --iterative-refinement REPS" << endl
 603          << "              use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS
 604          << " (default: " << numIterativeRefinementReps << ") passes of iterative-refinement" << endl
 605          << endl
 606          << "       -pre, --pre-training REPS" << endl
 607          << "              use " << MIN_PRETRAINING_REPS << " <= REPS <= " << MAX_PRETRAINING_REPS
 608          << " (default: " << numPreTrainingReps << ") rounds of pretraining" << endl
 609          << endl
 610          << "       -pairs" << endl
 611          << "              generate all-pairs pairwise alignments" << endl
 612          << endl
 613          << "       -viterbi" << endl
 614          << "              use Viterbi algorithm to generate all pairs (automatically enables -pairs)" << endl
 615          << endl
 616          << "       -v, --verbose" << endl
 617          << "              report progress while aligning (default: " << (enableVerbose ? "on" : "off") << ")" << endl
 618          << endl
 619          << "       -annot FILENAME" << endl
 620          << "              write annotation for multiple alignment to FILENAME" << endl
 621          << endl
 622          << "       -t, --train FILENAME" << endl
 623          << "              compute EM transition probabilities, store in FILENAME (default: "
 624          << parametersOutputFilename << ")" << endl
 625          << endl
 626          << "       -e, --emissions" << endl
 627          << "              also reestimate emission probabilities (default: "
 628          << (enableTrainEmissions ? "on" : "off") << ")" << endl
 629          << endl
 630          << "       -p, --paramfile FILENAME" << endl
 631          << "              read parameters from FILENAME (default: "
 632          << parametersInputFilename << ")" << endl
 633          << endl
 634          << "       -a, --alignment-order" << endl
 635          << "              print sequences in alignment order rather than input order (default: "
 636          << (enableAlignOrder ? "on" : "off") << ")" << endl
 637          << endl;
 638     //           << "       -go, --gap-open VALUE" << endl
 639     //           << "              gap opening penalty of VALUE <= 0 (default: " << gapOpenPenalty << ")" << endl
 640     //   << endl
 641     //   << "       -ge, --gap-extension VALUE" << endl
 642     //   << "              gap extension penalty of VALUE <= 0 (default: " << gapContinuePenalty << ")" << endl
 643     //   << endl
 644     //         << "       -co, --cutoff CUTOFF" << endl
 645     //         << "              subtract 0 <= CUTOFF <= 1 (default: " << cutoff << ") from all posterior values before final alignment" << endl
 646     //         << endl
 647
 648     exit (1);
 649   }
 650
 651   SafeVector<string> sequenceNames;
 652   int tempInt;
 653   float tempFloat;
 654
 655   for (int i = 1; i < argc; i++){
 656     if (argv[i][0] == '-'){
 657
 658       // training
 659       if (!strcmp (argv[i], "-t") || !strcmp (argv[i], "--train")){
 660         enableTraining = true;
 661         if (i < argc - 1)
 662           parametersOutputFilename = string (argv[++i]);
 663         else {
 664           cerr << "ERROR: Filename expected for option " << argv[i] << endl;
 665           exit (1);
 666         }
 667       }
 668
 669       // emission training
 670       else if (!strcmp (argv[i], "-e") || !strcmp (argv[i], "--emissions")){
 671         enableTrainEmissions = true;
 672       }
 673
 674       // parameter file
 675       else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){
 676         if (i < argc - 1)
 677           parametersInputFilename = string (argv[++i]);
 678         else {
 679           cerr << "ERROR: Filename expected for option " << argv[i] << endl;
 680           exit (1);
 681         }
 682       }
 683
 684       // number of consistency transformations
 685       else if (!strcmp (argv[i], "-c") || !strcmp (argv[i], "--consistency")){
 686         if (i < argc - 1){
 687           if (!GetInteger (argv[++i], &tempInt)){
 688             cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
 689             exit (1);
 690           }
 691           else {
 692             if (tempInt < MIN_CONSISTENCY_REPS || tempInt > MAX_CONSISTENCY_REPS){
 693               cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
 694                    << MIN_CONSISTENCY_REPS << " and " << MAX_CONSISTENCY_REPS << "." << endl;
 695               exit (1);
 696             }
 697             else
 698               numConsistencyReps = tempInt;
 699           }
 700         }
 701         else {
 702           cerr << "ERROR: Integer expected for option " << argv[i] << endl;
 703           exit (1);
 704         }
 705       }
 706
 707       // number of randomized partitioning iterative refinement passes
 708       else if (!strcmp (argv[i], "-ir") || !strcmp (argv[i], "--iterative-refinement")){
 709         if (i < argc - 1){
 710           if (!GetInteger (argv[++i], &tempInt)){
 711             cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
 712             exit (1);
 713           }
 714           else {
 715             if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS || tempInt > MAX_ITERATIVE_REFINEMENT_REPS){
 716               cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
 717                    << MIN_ITERATIVE_REFINEMENT_REPS << " and " << MAX_ITERATIVE_REFINEMENT_REPS << "." << endl;
 718               exit (1);
 719             }
 720             else
 721               numIterativeRefinementReps = tempInt;
 722           }
 723         }
 724         else {
 725           cerr << "ERROR: Integer expected for option " << argv[i] << endl;
 726           exit (1);
 727         }
 728       }
 729
 730       // number of EM pre-training rounds
 731       else if (!strcmp (argv[i], "-pre") || !strcmp (argv[i], "--pre-training")){
 732         if (i < argc - 1){
 733           if (!GetInteger (argv[++i], &tempInt)){
 734             cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
 735             exit (1);
 736           }
 737           else {
 738             if (tempInt < MIN_PRETRAINING_REPS || tempInt > MAX_PRETRAINING_REPS){
 739               cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
 740                    << MIN_PRETRAINING_REPS << " and " << MAX_PRETRAINING_REPS << "." << endl;
 741               exit (1);
 742             }
 743             else
 744               numPreTrainingReps = tempInt;
 745           }
 746         }
 747         else {
 748           cerr << "ERROR: Integer expected for option " << argv[i] << endl;
 749           exit (1);
 750         }
 751       }
 752
 753       // gap open penalty
 754       else if (!strcmp (argv[i], "-go") || !strcmp (argv[i], "--gap-open")){
 755         if (i < argc - 1){
 756           if (!GetFloat (argv[++i], &tempFloat)){
 757             cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
 758             exit (1);
 759           }
 760           else {
 761             if (tempFloat > 0){
 762               cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
 763               exit (1);
 764             }
 765             else
 766               gapOpenPenalty = tempFloat;
 767           }
 768         }
 769         else {
 770           cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
 771           exit (1);
 772         }
 773       }
 774
 775       // gap extension penalty
 776       else if (!strcmp (argv[i], "-ge") || !strcmp (argv[i], "--gap-extension")){
 777         if (i < argc - 1){
 778           if (!GetFloat (argv[++i], &tempFloat)){
 779             cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
 780             exit (1);
 781           }
 782           else {
 783             if (tempFloat > 0){
 784               cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
 785               exit (1);
 786             }
 787             else
 788               gapContinuePenalty = tempFloat;
 789           }
 790         }
 791         else {
 792           cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
 793           exit (1);
 794         }
 795       }
 796
 797       // all-pairs pairwise alignments
 798       else if (!strcmp (argv[i], "-pairs")){
 799         enableAllPairs = true;
 800       }
 801
 802       // all-pairs pairwise Viterbi alignments
 803       else if (!strcmp (argv[i], "-viterbi")){
 804         enableAllPairs = true;
 805         enableViterbi = true;
 806       }
 807
 808       // annotation files
 809       else if (!strcmp (argv[i], "-annot")){
 810         enableAnnotation = true;
 811         if (i < argc - 1)
 812           annotationFilename = argv[++i];
 813         else {
 814           cerr << "ERROR: FILENAME expected for option " << argv[i] << endl;
 815           exit (1);
 816         }
 817       }
 818
 819       // clustalw output format
 820       else if (!strcmp (argv[i], "-clustalw")){
 821         enableClustalWOutput = true;
 822       }
 823
 824       // cutoff
 825       else if (!strcmp (argv[i], "-co") || !strcmp (argv[i], "--cutoff")){
 826         if (i < argc - 1){
 827           if (!GetFloat (argv[++i], &tempFloat)){
 828             cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
 829             exit (1);
 830           }
 831           else {
 832             if (tempFloat < 0 || tempFloat > 1){
 833               cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must be between 0 and 1." << endl;
 834               exit (1);
 835             }
 836             else
 837               cutoff = tempFloat;
 838           }
 839         }
 840         else {
 841           cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
 842           exit (1);
 843         }
 844       }
 845
 846       // verbose reporting
 847       else if (!strcmp (argv[i], "-v") || !strcmp (argv[i], "--verbose")){
 848         enableVerbose = true;
 849       }
 850
 851       // alignment order
 852       else if (!strcmp (argv[i], "-a") || !strcmp (argv[i], "--alignment-order")){
 853         enableAlignOrder = true;
 854       }
 855
 856       // bad arguments
 857       else {
 858         cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
 859         exit (1);
 860       }
 861     }
 862     else {
 863       sequenceNames.push_back (string (argv[i]));
 864     }
 865   }
 866
 867   if (enableTrainEmissions && !enableTraining){
 868     cerr << "ERROR: Training emissions (-e) requires training (-t)" << endl;
 869     exit (1);
 870   }
 871
 872   return sequenceNames;
 873 }
 874
 875 /////////////////////////////////////////////////////////////////
 876 // ReadParameters()
 877 //
 878 // Read initial distribution, transition, and emission
 879 // parameters from a file.
 880 /////////////////////////////////////////////////////////////////
 881
 882 void ReadParameters (){
 883
 884   ifstream data;
 885
 886   emitPairs = VVF (256, VF (256, 1e-10));
 887   emitSingle = VF (256, 1e-5);
 888
 889   // read initial state distribution and transition parameters
 890   if (parametersInputFilename == string ("")){
 891     if (NumInsertStates == 1){
 892       for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib1Default[i];
 893       for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen1Default[i];
 894       for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend1Default[i];
 895     }
 896     else if (NumInsertStates == 2){
 897       for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib2Default[i];
 898       for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen2Default[i];
 899       for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend2Default[i];
 900     }
 901     else {
 902       cerr << "ERROR: No default initial distribution/parameter settings exist" << endl
 903            << "       for " << NumInsertStates << " pairs of insert states.  Use --paramfile." << endl;
 904       exit (1);
 905     }
 906
 907     alphabet = alphabetDefault;
 908
 909     for (int i = 0; i < (int) alphabet.length(); i++){
 910       emitSingle[(unsigned char) tolower(alphabet[i])] = emitSingleDefault[i];
 911       emitSingle[(unsigned char) toupper(alphabet[i])] = emitSingleDefault[i];
 912       for (int j = 0; j <= i; j++){
 913         emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
 914         emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
 915         emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
 916         emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
 917         emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
 918         emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
 919         emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
 920         emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
 921       }
 922     }
 923   }
 924   else {
 925     data.open (parametersInputFilename.c_str());
 926     if (data.fail()){
 927       cerr << "ERROR: Unable to read parameter file: " << parametersInputFilename << endl;
 928       exit (1);
 929     }
 930
 931     string line[3];
 932     for (int i = 0; i < 3; i++){
 933       if (!getline (data, line[i])){
 934         cerr << "ERROR: Unable to read transition parameters from parameter file: " << parametersInputFilename << endl;
 935         exit (1);
 936       }
 937     }
 938     istringstream data2;
 939     data2.clear(); data2.str (line[0]); for (int i = 0; i < NumMatrixTypes; i++) data2 >> initDistrib[i];
 940     data2.clear(); data2.str (line[1]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapOpen[i];
 941     data2.clear(); data2.str (line[2]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapExtend[i];
 942
 943     if (!getline (data, line[0])){
 944       cerr << "ERROR: Unable to read alphabet from scoring matrix file: " << parametersInputFilename << endl;
 945       exit (1);
 946     }
 947
 948     // read alphabet as concatenation of all characters on alphabet line
 949     alphabet = "";
 950     string token;
 951     data2.clear(); data2.str (line[0]); while (data2 >> token) alphabet += token;
 952
 953     for (int i = 0; i < (int) alphabet.size(); i++){
 954       for (int j = 0; j <= i; j++){
 955         float val;
 956         data >> val;
 957         emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
 958         emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
 959         emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
 960         emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
 961         emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
 962         emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
 963         emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
 964         emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
 965       }
 966     }
 967
 968     for (int i = 0; i < (int) alphabet.size(); i++){
 969       float val;
 970       data >> val;
 971       emitSingle[(unsigned char) tolower(alphabet[i])] = val;
 972       emitSingle[(unsigned char) toupper(alphabet[i])] = val;
 973     }
 974     data.close();
 975   }
 976 }
 977
 978 /////////////////////////////////////////////////////////////////
 979 // ProcessTree()
 980 //
 981 // Process the tree recursively.  Returns the aligned sequences
 982 // corresponding to a node or leaf of the tree.
 983 /////////////////////////////////////////////////////////////////
 984
 985 MultiSequence *ProcessTree (const TreeNode *tree, MultiSequence *sequences,
 986                             const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 987                             const ProbabilisticModel &model){
 988   MultiSequence *result;
 989
 990   // check if this is a node of the alignment tree
 991   if (tree->GetSequenceLabel() == -1){
 992     MultiSequence *alignLeft = ProcessTree (tree->GetLeftChild(), sequences, sparseMatrices, model);
 993     MultiSequence *alignRight = ProcessTree (tree->GetRightChild(), sequences, sparseMatrices, model);
 994
 995     assert (alignLeft);
 996     assert (alignRight);
 997
 998     result = AlignAlignments (alignLeft, alignRight, sparseMatrices, model);
 999     assert (result);
1000
1001     delete alignLeft;
1002     delete alignRight;
1003   }
1004
1005   // otherwise, this is a leaf of the alignment tree
1006   else {
1007     result = new MultiSequence(); assert (result);
1008     result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
1009   }
1010
1011   return result;
1012 }
1013
1014 /////////////////////////////////////////////////////////////////
1015 // ComputeFinalAlignment()
1016 //
1017 // Compute the final alignment by calling ProcessTree(), then
1018 // performing iterative refinement as needed.
1019 /////////////////////////////////////////////////////////////////
1020
1021 MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
1022                                       const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1023                                       const ProbabilisticModel &model){
1024
1025   MultiSequence *alignment = ProcessTree (tree, sequences, sparseMatrices, model);
1026
1027   SafeVector<int> oldOrdering;
1028   if (enableAlignOrder){
1029     for (int i = 0; i < alignment->GetNumSequences(); i++)
1030       oldOrdering.push_back (alignment->GetSequence(i)->GetSortLabel());
1031     alignment->SaveOrdering();
1032     enableAlignOrder = false;
1033   }
1034
1035   // tree-based refinement
1036   // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree);
1037
1038   // iterative refinement
1039   for (int i = 0; i < numIterativeRefinementReps; i++)
1040     DoIterativeRefinement (sparseMatrices, model, alignment);
1041
1042   cerr << endl;
1043
1044   if (oldOrdering.size() > 0){
1045     for (int i = 0; i < (int) oldOrdering.size(); i++){
1046       alignment->GetSequence(i)->SetSortLabel(oldOrdering[i]);
1047     }
1048   }
1049
1050   // return final alignment
1051   return alignment;
1052 }
1053
1054 /////////////////////////////////////////////////////////////////
1055 // AlignAlignments()
1056 //
1057 // Returns the alignment of two MultiSequence objects.
1058 /////////////////////////////////////////////////////////////////
1059
1060 MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
1061                                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1062                                 const ProbabilisticModel &model){
1063
1064   // print some info about the alignment
1065   if (enableVerbose){
1066     for (int i = 0; i < align1->GetNumSequences(); i++)
1067       cerr << ((i==0) ? "[" : ",") << align1->GetSequence(i)->GetLabel();
1068     cerr << "] vs. ";
1069     for (int i = 0; i < align2->GetNumSequences(); i++)
1070       cerr << ((i==0) ? "[" : ",") << align2->GetSequence(i)->GetLabel();
1071     cerr << "]: ";
1072   }
1073
1074   VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
1075   pair<SafeVector<char> *, float> alignment;
1076
1077   // choose the alignment routine depending on the "cosmetic" gap penalties used
1078   if (gapOpenPenalty == 0 && gapContinuePenalty == 0)
1079     alignment = model.ComputeAlignment (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior);
1080   else
1081     alignment = model.ComputeAlignmentWithGapPenalties (align1, align2,
1082                                                         *posterior, align1->GetNumSequences(), align2->GetNumSequences(),
1083                                                         gapOpenPenalty, gapContinuePenalty);
1084
1085   delete posterior;
1086
1087   if (enableVerbose){
1088
1089     // compute total length of sequences
1090     int totLength = 0;
1091     for (int i = 0; i < align1->GetNumSequences(); i++)
1092       for (int j = 0; j < align2->GetNumSequences(); j++)
1093         totLength += min (align1->GetSequence(i)->GetLength(), align2->GetSequence(j)->GetLength());
1094
1095     // give an "accuracy" measure for the alignment
1096     cerr << alignment.second / totLength << endl;
1097   }
1098
1099   // now build final alignment
1100   MultiSequence *result = new MultiSequence();
1101   for (int i = 0; i < align1->GetNumSequences(); i++)
1102     result->AddSequence (align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
1103   for (int i = 0; i < align2->GetNumSequences(); i++)
1104     result->AddSequence (align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
1105   if (!enableAlignOrder)
1106     result->SortByLabel();
1107
1108   // free temporary alignment
1109   delete alignment.first;
1110
1111   return result;
1112 }
1113
1114 /////////////////////////////////////////////////////////////////
1115 // DoRelaxation()
1116 //
1117 // Performs one round of the consistency transformation.  The
1118 // formula used is:
1119 //                     1
1120 //    P'(x[i]-y[j]) = ---  sum   sum P(x[i]-z[k]) P(z[k]-y[j])
1121 //                    |S| z in S  k
1122 //
1123 // where S = {x, y, all other sequences...}
1124 //
1125 /////////////////////////////////////////////////////////////////
1126
1127 SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences,
1128                                                       SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1129   const int numSeqs = sequences->GetNumSequences();
1130
1131   SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
1132
1133   // for every pair of sequences
1134   for (int i = 0; i < numSeqs; i++){
1135     for (int j = i+1; j < numSeqs; j++){
1136       Sequence *seq1 = sequences->GetSequence (i);
1137       Sequence *seq2 = sequences->GetSequence (j);
1138
1139       if (enableVerbose)
1140         cerr << "Relaxing (" << i+1 << ") " << seq1->GetHeader() << " vs. "
1141              << "(" << j+1 << ") " << seq2->GetHeader() << ": ";
1142
1143       // get the original posterior matrix
1144       VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); assert (posteriorPtr);
1145       VF &posterior = *posteriorPtr;
1146
1147       const int seq1Length = seq1->GetLength();
1148       const int seq2Length = seq2->GetLength();
1149
1150       // contribution from the summation where z = x and z = y
1151       for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] += posterior[k];
1152
1153       if (enableVerbose)
1154         cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
1155
1156       // contribution from all other sequences
1157       for (int k = 0; k < numSeqs; k++) if (k != i && k != j){
1158         if (k < i)
1159           Relax1 (sparseMatrices[k][i], sparseMatrices[k][j], posterior);
1160         else if (k > i && k < j)
1161           Relax (sparseMatrices[i][k], sparseMatrices[k][j], posterior);
1162         else {
1163           SparseMatrix *temp = sparseMatrices[j][k]->ComputeTranspose();
1164           Relax (sparseMatrices[i][k], temp, posterior);
1165           delete temp;
1166         }
1167       }
1168
1169       // now renormalization
1170       for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] /= numSeqs;
1171
1172       // mask out positions not originally in the posterior matrix
1173       SparseMatrix *matXY = sparseMatrices[i][j];
1174       for (int y = 0; y <= seq2Length; y++) posterior[y] = 0;
1175       for (int x = 1; x <= seq1Length; x++){
1176         SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
1177         SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
1178         VF::iterator base = posterior.begin() + x * (seq2Length + 1);
1179         int curr = 0;
1180         while (XYptr != XYend){
1181
1182           // zero out all cells until the first filled column
1183           while (curr < XYptr->first){
1184             base[curr] = 0;
1185             curr++;
1186           }
1187
1188           // now, skip over this column
1189           curr++;
1190           ++XYptr;
1191         }
1192
1193         // zero out cells after last column
1194         while (curr <= seq2Length){
1195           base[curr] = 0;
1196           curr++;
1197         }
1198       }
1199
1200       // save the new posterior matrix
1201       newSparseMatrices[i][j] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), posterior);
1202       newSparseMatrices[j][i] = NULL;
1203
1204       if (enableVerbose)
1205         cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
1206
1207       delete posteriorPtr;
1208
1209       if (enableVerbose)
1210         cerr << "done." << endl;
1211     }
1212   }
1213
1214   return newSparseMatrices;
1215 }
1216
1217 /////////////////////////////////////////////////////////////////
1218 // Relax()
1219 //
1220 // Computes the consistency transformation for a single sequence
1221 // z, and adds the transformed matrix to "posterior".
1222 /////////////////////////////////////////////////////////////////
1223
1224 void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior){
1225
1226   assert (matXZ);
1227   assert (matZY);
1228
1229   int lengthX = matXZ->GetSeq1Length();
1230   int lengthY = matZY->GetSeq2Length();
1231   assert (matXZ->GetSeq2Length() == matZY->GetSeq1Length());
1232
1233   // for every x[i]
1234   for (int i = 1; i <= lengthX; i++){
1235     SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
1236     SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
1237
1238     VF::iterator base = posterior.begin() + i * (lengthY + 1);
1239
1240     // iterate through all x[i]-z[k]
1241     while (XZptr != XZend){
1242       SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
1243       SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(XZptr->first);
1244       const float XZval = XZptr->second;
1245
1246       // iterate through all z[k]-y[j]
1247       while (ZYptr != ZYend){
1248         base[ZYptr->first] += XZval * ZYptr->second;
1249         ZYptr++;
1250       }
1251       XZptr++;
1252     }
1253   }
1254 }
1255
1256 /////////////////////////////////////////////////////////////////
1257 // Relax1()
1258 //
1259 // Computes the consistency transformation for a single sequence
1260 // z, and adds the transformed matrix to "posterior".
1261 /////////////////////////////////////////////////////////////////
1262
1263 void Relax1 (SparseMatrix *matZX, SparseMatrix *matZY, VF &posterior){
1264
1265   assert (matZX);
1266   assert (matZY);
1267
1268   int lengthZ = matZX->GetSeq1Length();
1269   int lengthY = matZY->GetSeq2Length();
1270
1271   // for every z[k]
1272   for (int k = 1; k <= lengthZ; k++){
1273     SafeVector<PIF>::iterator ZXptr = matZX->GetRowPtr(k);
1274     SafeVector<PIF>::iterator ZXend = ZXptr + matZX->GetRowSize(k);
1275
1276     // iterate through all z[k]-x[i]
1277     while (ZXptr != ZXend){
1278       SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(k);
1279       SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(k);
1280       const float ZXval = ZXptr->second;
1281       VF::iterator base = posterior.begin() + ZXptr->first * (lengthY + 1);
1282
1283       // iterate through all z[k]-y[j]
1284       while (ZYptr != ZYend){
1285         base[ZYptr->first] += ZXval * ZYptr->second;
1286         ZYptr++;
1287       }
1288       ZXptr++;
1289     }
1290   }
1291 }
1292
1293 /////////////////////////////////////////////////////////////////
1294 // GetSubtree
1295 //
1296 // Returns set containing all leaf labels of the current subtree.
1297 /////////////////////////////////////////////////////////////////
1298
1299 set<int> GetSubtree (const TreeNode *tree){
1300   set<int> s;
1301
1302   if (tree->GetSequenceLabel() == -1){
1303     s = GetSubtree (tree->GetLeftChild());
1304     set<int> t = GetSubtree (tree->GetRightChild());
1305
1306     for (set<int>::iterator iter = t.begin(); iter != t.end(); ++iter)
1307       s.insert (*iter);
1308   }
1309   else {
1310     s.insert (tree->GetSequenceLabel());
1311   }
1312
1313   return s;
1314 }
1315
1316 /////////////////////////////////////////////////////////////////
1317 // TreeBasedBiPartitioning
1318 //
1319 // Uses the iterative refinement scheme from MUSCLE.
1320 /////////////////////////////////////////////////////////////////
1321
1322 void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1323                               const ProbabilisticModel &model, MultiSequence* &alignment,
1324                               const TreeNode *tree){
1325   // check if this is a node of the alignment tree
1326   if (tree->GetSequenceLabel() == -1){
1327     TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetLeftChild());
1328     TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetRightChild());
1329
1330     set<int> leftSubtree = GetSubtree (tree->GetLeftChild());
1331     set<int> rightSubtree = GetSubtree (tree->GetRightChild());
1332     set<int> leftSubtreeComplement, rightSubtreeComplement;
1333
1334     // calculate complement of each subtree
1335     for (int i = 0; i < alignment->GetNumSequences(); i++){
1336       if (leftSubtree.find(i) == leftSubtree.end()) leftSubtreeComplement.insert (i);
1337       if (rightSubtree.find(i) == rightSubtree.end()) rightSubtreeComplement.insert (i);
1338     }
1339
1340     // perform realignments for edge to left child
1341     if (!leftSubtree.empty() && !leftSubtreeComplement.empty()){
1342       MultiSequence *groupOneSeqs = alignment->Project (leftSubtree); assert (groupOneSeqs);
1343       MultiSequence *groupTwoSeqs = alignment->Project (leftSubtreeComplement); assert (groupTwoSeqs);
1344       delete alignment;
1345       alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1346     }
1347
1348     // perform realignments for edge to right child
1349     if (!rightSubtree.empty() && !rightSubtreeComplement.empty()){
1350       MultiSequence *groupOneSeqs = alignment->Project (rightSubtree); assert (groupOneSeqs);
1351       MultiSequence *groupTwoSeqs = alignment->Project (rightSubtreeComplement); assert (groupTwoSeqs);
1352       delete alignment;
1353       alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1354     }
1355   }
1356 }
1357
1358 /////////////////////////////////////////////////////////////////
1359 // DoIterativeRefinement()
1360 //
1361 // Performs a single round of randomized partionining iterative
1362 // refinement.
1363 /////////////////////////////////////////////////////////////////
1364
1365 void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1366                             const ProbabilisticModel &model, MultiSequence* &alignment){
1367   set<int> groupOne, groupTwo;
1368
1369   // create two separate groups
1370   for (int i = 0; i < alignment->GetNumSequences(); i++){
1371     if (rand() % 2)
1372       groupOne.insert (i);
1373     else
1374       groupTwo.insert (i);
1375   }
1376
1377   if (groupOne.empty() || groupTwo.empty()) return;
1378
1379   // project into the two groups
1380   MultiSequence *groupOneSeqs = alignment->Project (groupOne); assert (groupOneSeqs);
1381   MultiSequence *groupTwoSeqs = alignment->Project (groupTwo); assert (groupTwoSeqs);
1382   delete alignment;
1383
1384   // realign
1385   alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1386
1387   delete groupOneSeqs;
1388   delete groupTwoSeqs;
1389 }
1390
1391 /////////////////////////////////////////////////////////////////
1392 // WriteAnnotation()
1393 //
1394 // Computes annotation for multiple alignment and write values
1395 // to a file.
1396 /////////////////////////////////////////////////////////////////
1397
1398 void WriteAnnotation (MultiSequence *alignment,
1399                       const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1400   ofstream outfile (annotationFilename.c_str());
1401
1402   if (outfile.fail()){
1403     cerr << "ERROR: Unable to write annotation file." << endl;
1404     exit (1);
1405   }
1406
1407   const int alignLength = alignment->GetSequence(0)->GetLength();
1408   const int numSeqs = alignment->GetNumSequences();
1409
1410   SafeVector<int> position (numSeqs, 0);
1411   SafeVector<SafeVector<char>::iterator> seqs (numSeqs);
1412   for (int i = 0; i < numSeqs; i++) seqs[i] = alignment->GetSequence(i)->GetDataPtr();
1413   SafeVector<pair<int,int> > active;
1414   active.reserve (numSeqs);
1415
1416   SafeVector<int> lab;
1417   for (int i = 0; i < numSeqs; i++) lab.push_back(alignment->GetSequence(i)->GetSortLabel());
1418
1419   // for every column
1420   for (int i = 1; i <= alignLength; i++){
1421
1422     // find all aligned residues in this particular column
1423     active.clear();
1424     for (int j = 0; j < numSeqs; j++){
1425       if (seqs[j][i] != '-'){
1426         active.push_back (make_pair(lab[j], ++position[j]));
1427       }
1428     }
1429
1430     sort (active.begin(), active.end());
1431     outfile << setw(4) << ComputeScore (active, sparseMatrices) << endl;
1432   }
1433
1434   outfile.close();
1435 }
1436
1437 /////////////////////////////////////////////////////////////////
1438 // ComputeScore()
1439 //
1440 // Computes the annotation score for a particular column.
1441 /////////////////////////////////////////////////////////////////
1442
1443 int ComputeScore (const SafeVector<pair<int, int> > &active,
1444                   const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1445
1446   if (active.size() <= 1) return 0;
1447
1448   // ALTERNATIVE #1: Compute the average alignment score.
1449
1450   float val = 0;
1451   for (int i = 0; i < (int) active.size(); i++){
1452     for (int j = i+1; j < (int) active.size(); j++){
1453       val += sparseMatrices[active[i].first][active[j].first]->GetValue(active[i].second, active[j].second);
1454     }
1455   }
1456
1457   return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));
1458
1459 }