binaries/src/probcons/Main.cc

   1 /////////////////////////////////////////////////////////////////
   2 // Main.cc
   3 //
   4 // Main routines for PROBCONS program.
   5 /////////////////////////////////////////////////////////////////
   6
   7 #include "SafeVector.h"
   8 #include "MultiSequence.h"
   9 #include "Defaults.h"
  10 #include "ScoreType.h"
  11 #include "ProbabilisticModel.h"
  12 #include "EvolutionaryTree.h"
  13 #include "SparseMatrix.h"
  14 #include <string>
  15 #include <sstream>
  16 #include <iomanip>
  17 #include <iostream>
  18 #include <list>
  19 #include <set>
  20 #include <algorithm>
  21 #include <climits>
  22 #include <cstdio>
  23 #include <cstdlib>
  24 #include <cerrno>
  25 #include <iomanip>
  26 #include <string.h>
  27
  28 string parametersInputFilename = "";
  29 string parametersOutputFilename = "no training";
  30 string annotationFilename = "";
  31
  32 bool enableTraining = false;
  33 bool enableVerbose = false;
  34 bool enableAllPairs = false;
  35 bool enableAnnotation = false;
  36 bool enableViterbi = false;
  37 bool enableClustalWOutput = false;
  38 bool enableTrainEmissions = false;
  39 bool enableAlignOrder = false;
  40 int numConsistencyReps = 2;
  41 int numPreTrainingReps = 0;
  42 int numIterativeRefinementReps = 100;
  43
  44 float cutoff = 0;
  45 float gapOpenPenalty = 0;
  46 float gapContinuePenalty = 0;
  47
  48 VF initDistrib (NumMatrixTypes);
  49 VF gapOpen (2*NumInsertStates);
  50 VF gapExtend (2*NumInsertStates);
  51 VVF emitPairs (256, VF (256, 1e-10));
  52 VF emitSingle (256, 1e-5);
  53
  54 string alphabet = alphabetDefault;
  55
  56 const int MIN_PRETRAINING_REPS = 0;
  57 const int MAX_PRETRAINING_REPS = 20;
  58 const int MIN_CONSISTENCY_REPS = 0;
  59 const int MAX_CONSISTENCY_REPS = 5;
  60 const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
  61 const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
  62
  63 /////////////////////////////////////////////////////////////////
  64 // Function prototypes
  65 /////////////////////////////////////////////////////////////////
  66
  67 void PrintHeading();
  68 void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
  69                       const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename);
  70 MultiSequence *DoAlign (MultiSequence *sequence, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, VF &gapExtend,
  71                         VVF &emitPairs, VF &emitSingle);
  72 SafeVector<string> ParseParams (int argc, char **argv);
  73 void ReadParameters ();
  74 MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
  75                                       const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
  76                                       const ProbabilisticModel &model);
  77 MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
  78                                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
  79                                 const ProbabilisticModel &model);
  80 SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences,
  81                                                       SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
  82 void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);
  83 void Relax1 (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);
  84
  85 set<int> GetSubtree (const TreeNode *tree);
  86 void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
  87                               const ProbabilisticModel &model, MultiSequence* &alignment,
  88                               const TreeNode *tree);
  89 void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
  90                             const ProbabilisticModel &model, MultiSequence* &alignment);
  91 void WriteAnnotation (MultiSequence *alignment,
  92                       const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
  93 int ComputeScore (const SafeVector<pair<int, int> > &active,
  94                   const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
  95
  96
  97 /////////////////////////////////////////////////////////////////
  98 // main()
  99 //
 100 // Calls all initialization routines and runs the PROBCONS
 101 // aligner.
 102 /////////////////////////////////////////////////////////////////
 103
 104 int main (int argc, char **argv){
 105
 106   // print PROBCONS heading
 107   PrintHeading();
 108
 109   // parse program parameters
 110   SafeVector<string> sequenceNames = ParseParams (argc, argv);
 111   ReadParameters();
 112   PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
 113
 114   // now, we'll process all the files given as input.  If we are given
 115   // several filenames as input, then we'll load all of those sequences
 116   // simultaneously, as long as we're not training.  On the other hand,
 117   // if we are training, then we'll treat each file as a separate
 118   // training instance
 119
 120   // if we are training
 121   if (enableTraining){
 122
 123     // build new model for aligning
 124     ProbabilisticModel model (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
 125
 126     // prepare to average parameters
 127     for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
 128     for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
 129     for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
 130     if (enableTrainEmissions){
 131       for (int i = 0; i < (int) emitPairs.size(); i++)
 132         for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
 133       for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
 134     }
 135
 136     // align each file individually
 137     for (int i = 0; i < (int) sequenceNames.size(); i++){
 138
 139       VF thisInitDistrib (NumMatrixTypes);
 140       VF thisGapOpen (2*NumInsertStates);
 141       VF thisGapExtend (2*NumInsertStates);
 142       VVF thisEmitPairs (256, VF (256, 1e-10));
 143       VF thisEmitSingle (256, 1e-5);
 144
 145       // load sequence file
 146       MultiSequence *sequences = new MultiSequence(); assert (sequences);
 147       cerr << "Loading sequence file: " << sequenceNames[i] << endl;
 148       sequences->LoadMFA (sequenceNames[i], true);
 149
 150       // align sequences
 151       DoAlign (sequences, model, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle);
 152
 153       // add in contribution of the derived parameters
 154       for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
 155       for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
 156       for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
 157       if (enableTrainEmissions){
 158         for (int i = 0; i < (int) emitPairs.size(); i++)
 159           for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
 160         for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
 161       }
 162
 163       delete sequences;
 164     }
 165
 166     // compute new parameters and print them out
 167     for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= (int) sequenceNames.size();
 168     for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= (int) sequenceNames.size();
 169     for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= (int) sequenceNames.size();
 170     if (enableTrainEmissions){
 171       for (int i = 0; i < (int) emitPairs.size(); i++)
 172         for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= (int) sequenceNames.size();
 173       for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= sequenceNames.size();
 174     }
 175
 176     PrintParameters ("Trained parameter set:",
 177                      initDistrib, gapOpen, gapExtend, emitPairs, emitSingle,
 178                      parametersOutputFilename.c_str());
 179   }
 180
 181   // if we are not training, we must simply want to align some sequences
 182   else {
 183
 184     // load all files together
 185     MultiSequence *sequences = new MultiSequence(); assert (sequences);
 186     for (int i = 0; i < (int) sequenceNames.size(); i++){
 187       cerr << "Loading sequence file: " << sequenceNames[i] << endl;
 188       sequences->LoadMFA (sequenceNames[i], true);
 189     }
 190
 191     // do all "pre-training" repetitions first
 192     for (int ct = 0; ct < numPreTrainingReps; ct++){
 193       enableTraining = true;
 194
 195       // build new model for aligning
 196       ProbabilisticModel model (initDistrib, gapOpen, gapExtend,
 197                                 emitPairs, emitSingle);
 198
 199       // do initial alignments
 200       DoAlign (sequences, model, initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
 201
 202       // print new parameters
 203       PrintParameters ("Recomputed parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
 204
 205       enableTraining = false;
 206     }
 207
 208     // now, we can perform the alignments and write them out
 209     MultiSequence *alignment = DoAlign (sequences,
 210                                         ProbabilisticModel (initDistrib, gapOpen, gapExtend,  emitPairs, emitSingle),
 211                                         initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
 212
 213     if (!enableAllPairs){
 214       if (enableClustalWOutput)
 215         alignment->WriteALN (cout);
 216       else
 217         alignment->WriteMFA (cout);
 218     }
 219     delete alignment;
 220     delete sequences;
 221
 222   }
 223 }
 224
 225 /////////////////////////////////////////////////////////////////
 226 // PrintHeading()
 227 //
 228 // Prints heading for PROBCONS program.
 229 /////////////////////////////////////////////////////////////////
 230
 231 void PrintHeading (){
 232   cerr << endl
 233        << "PROBCONS version " << VERSION << " - align multiple protein sequences and print to standard output" << endl
 234        << "Written by Chuong Do" << endl
 235        << endl;
 236 }
 237
 238 /////////////////////////////////////////////////////////////////
 239 // PrintParameters()
 240 //
 241 // Prints PROBCONS parameters to STDERR.  If a filename is
 242 // specified, then the parameters are also written to the file.
 243 /////////////////////////////////////////////////////////////////
 244
 245 void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
 246                       const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename){
 247
 248   // print parameters to the screen
 249   cerr << message << endl
 250        << "    initDistrib[] = { ";
 251   for (int i = 0; i < NumMatrixTypes; i++) cerr << setprecision (10) << initDistrib[i] << " ";
 252   cerr << "}" << endl
 253        << "        gapOpen[] = { ";
 254   for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapOpen[i] << " ";
 255   cerr << "}" << endl
 256        << "      gapExtend[] = { ";
 257   for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapExtend[i] << " ";
 258   cerr << "}" << endl
 259        << endl;
 260
 261   /*
 262   for (int i = 0; i < 5; i++){
 263     for (int j = 0; j <= i; j++){
 264       cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " ";
 265     }
 266     cerr << endl;
 267     }*/
 268
 269   // if a file name is specified
 270   if (filename){
 271
 272     // attempt to open the file for writing
 273     FILE *file = fopen (filename, "w");
 274     if (!file){
 275       cerr << "ERROR: Unable to write parameter file: " << filename << endl;
 276       exit (1);
 277     }
 278
 279     // if successful, then write the parameters to the file
 280     for (int i = 0; i < NumMatrixTypes; i++) fprintf (file, "%.10f ", initDistrib[i]); fprintf (file, "\n");
 281     for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapOpen[i]); fprintf (file, "\n");
 282     for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapExtend[i]); fprintf (file, "\n");
 283     fprintf (file, "%s\n", alphabet.c_str());
 284     for (int i = 0; i < (int) alphabet.size(); i++){
 285       for (int j = 0; j <= i; j++)
 286         fprintf (file, "%.10f ", emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);
 287       fprintf (file, "\n");
 288     }
 289     for (int i = 0; i < (int) alphabet.size(); i++)
 290       fprintf (file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);
 291     fprintf (file, "\n");
 292     fclose (file);
 293   }
 294 }
 295
 296 /////////////////////////////////////////////////////////////////
 297 // DoAlign()
 298 //
 299 // First computes all pairwise posterior probability matrices.
 300 // Then, computes new parameters if training, or a final
 301 // alignment, otherwise.
 302 /////////////////////////////////////////////////////////////////
 303
 304 MultiSequence *DoAlign (MultiSequence *sequences, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen,
 305                         VF &gapExtend, VVF &emitPairs, VF &emitSingle){
 306
 307   assert (sequences);
 308
 309   const int numSeqs = sequences->GetNumSequences();
 310   VVF distances (numSeqs, VF (numSeqs, 0));
 311   SafeVector<SafeVector<SparseMatrix *> > sparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
 312
 313
 314
 315   if (enableTraining){
 316     // prepare to average parameters
 317     for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
 318     for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
 319     for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
 320     if (enableTrainEmissions){
 321       for (int i = 0; i < (int) emitPairs.size(); i++)
 322         for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
 323       for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
 324     }
 325   }
 326
 327   // skip posterior calculations if we just want to do Viterbi alignments
 328   if (!enableViterbi){
 329
 330     // do all pairwise alignments for posterior probability matrices
 331     for (int a = 0; a < numSeqs-1; a++){
 332       for (int b = a+1; b < numSeqs; b++){
 333         Sequence *seq1 = sequences->GetSequence (a);
 334         Sequence *seq2 = sequences->GetSequence (b);
 335
 336         // verbose output
 337         if (enableVerbose)
 338           cerr << "Computing posterior matrix: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
 339                << "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
 340
 341         // compute forward and backward probabilities
 342         VF *forward = model.ComputeForwardMatrix (seq1, seq2); assert (forward);
 343         VF *backward = model.ComputeBackwardMatrix (seq1, seq2); assert (backward);
 344
 345         // if we are training, then we'll simply want to compute the
 346         // expected counts for each region within the matrix separately;
 347         // otherwise, we'll need to put all of the regions together and
 348         // assemble a posterior probability match matrix
 349
 350         // so, if we're training
 351         if (enableTraining){
 352
 353           // compute new parameters
 354           VF thisInitDistrib (NumMatrixTypes);
 355           VF thisGapOpen (2*NumInsertStates);
 356           VF thisGapExtend (2*NumInsertStates);
 357           VVF thisEmitPairs (256, VF (256, 1e-10));
 358           VF thisEmitSingle (256, 1e-5);
 359
 360           model.ComputeNewParameters (seq1, seq2, *forward, *backward, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle, enableTrainEmissions);
 361
 362           // add in contribution of the derived parameters
 363           for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
 364           for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
 365           for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
 366           if (enableTrainEmissions){
 367             for (int i = 0; i < (int) emitPairs.size(); i++)
 368               for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
 369             for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
 370           }
 371
 372           // let us know that we're done.
 373           if (enableVerbose) cerr << "done." << endl;
 374         }
 375         else {
 376
 377           // compute posterior probability matrix
 378           VF *posterior = model.ComputePosteriorMatrix (seq1, seq2, *forward, *backward); assert (posterior);
 379
 380           // compute sparse representations
 381           sparseMatrices[a][b] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), *posterior);
 382           sparseMatrices[b][a] = NULL;
 383
 384           if (!enableAllPairs){
 385             // perform the pairwise sequence alignment
 386             pair<SafeVector<char> *, float> alignment = model.ComputeAlignment (seq1->GetLength(),
 387                                                                                 seq2->GetLength(),
 388                                                                                 *posterior);
 389
 390             // compute "expected accuracy" distance for evolutionary tree computation
 391             float distance = alignment.second / min (seq1->GetLength(), seq2->GetLength());
 392             distances[a][b] = distances[b][a] = distance;
 393
 394             if (enableVerbose)
 395               cerr << setprecision (10) << distance << endl;
 396
 397               delete alignment.first;
 398           }
 399           else {
 400             // let us know that we're done.
 401             if (enableVerbose) cerr << "done." << endl;
 402           }
 403
 404           delete posterior;
 405         }
 406
 407         delete forward;
 408         delete backward;
 409       }
 410     }
 411   }
 412
 413   // now average out parameters derived
 414   if (enableTraining){
 415
 416     // compute new parameters
 417     for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= numSeqs * (numSeqs - 1) / 2;
 418     for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= numSeqs * (numSeqs - 1) / 2;
 419     for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= numSeqs * (numSeqs - 1) / 2;
 420
 421     if (enableTrainEmissions){
 422       for (int i = 0; i < (int) emitPairs.size(); i++)
 423         for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= numSeqs * (numSeqs - 1) / 2;
 424       for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= numSeqs * (numSeqs - 1) / 2;
 425     }
 426   }
 427
 428   // see if we still want to do some alignments
 429   else {
 430
 431     if (!enableViterbi){
 432
 433       // perform the consistency transformation the desired number of times
 434       for (int r = 0; r < numConsistencyReps; r++){
 435         SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices = DoRelaxation (sequences, sparseMatrices);
 436
 437         // now replace the old posterior matrices
 438         for (int i = 0; i < numSeqs; i++){
 439           for (int j = 0; j < numSeqs; j++){
 440             delete sparseMatrices[i][j];
 441             sparseMatrices[i][j] = newSparseMatrices[i][j];
 442           }
 443         }
 444       }
 445     }
 446
 447     MultiSequence *finalAlignment = NULL;
 448
 449     if (enableAllPairs){
 450       for (int a = 0; a < numSeqs-1; a++){
 451         for (int b = a+1; b < numSeqs; b++){
 452           Sequence *seq1 = sequences->GetSequence (a);
 453           Sequence *seq2 = sequences->GetSequence (b);
 454
 455           if (enableVerbose)
 456             cerr << "Performing pairwise alignment: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
 457                  << "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
 458
 459
 460           // perform the pairwise sequence alignment
 461           pair<SafeVector<char> *, float> alignment;
 462           if (enableViterbi)
 463             alignment = model.ComputeViterbiAlignment (seq1, seq2);
 464           else {
 465
 466             // build posterior matrix
 467             VF *posterior = sparseMatrices[a][b]->GetPosterior(); assert (posterior);
 468             int length = (seq1->GetLength() + 1) * (seq2->GetLength() + 1);
 469             for (int i = 0; i < length; i++) (*posterior)[i] -= cutoff;
 470
 471             alignment = model.ComputeAlignment (seq1->GetLength(), seq2->GetLength(), *posterior);
 472             delete posterior;
 473           }
 474
 475           // write pairwise alignments
 476           string name = seq1->GetHeader() + "-" + seq2->GetHeader() + (enableClustalWOutput ? ".aln" : ".fasta");
 477           ofstream outfile (name.c_str());
 478
 479           MultiSequence *result = new MultiSequence();
 480           result->AddSequence (seq1->AddGaps(alignment.first, 'X'));
 481           result->AddSequence (seq2->AddGaps(alignment.first, 'Y'));
 482           if (enableClustalWOutput)
 483             result->WriteALN (outfile);
 484           else
 485             result->WriteMFA (outfile);
 486
 487           outfile.close();
 488
 489           delete alignment.first;
 490         }
 491       }
 492     }
 493
 494     // now if we still need to do a final multiple alignment
 495     else {
 496
 497       if (enableVerbose)
 498         cerr << endl;
 499
 500       // compute the evolutionary tree
 501       TreeNode *tree = TreeNode::ComputeTree (distances);
 502
 503       tree->Print (cerr, sequences);
 504       cerr << endl;
 505
 506       // make the final alignment
 507       finalAlignment = ComputeFinalAlignment (tree, sequences, sparseMatrices, model);
 508
 509       // build annotation
 510       if (enableAnnotation){
 511         WriteAnnotation (finalAlignment, sparseMatrices);
 512       }
 513
 514       delete tree;
 515     }
 516
 517     if (!enableViterbi){
 518       // delete sparse matrices
 519       for (int a = 0; a < numSeqs-1; a++){
 520         for (int b = a+1; b < numSeqs; b++){
 521           delete sparseMatrices[a][b];
 522           delete sparseMatrices[b][a];
 523         }
 524       }
 525     }
 526
 527     return finalAlignment;
 528   }
 529
 530   return NULL;
 531 }
 532
 533 /////////////////////////////////////////////////////////////////
 534 // GetInteger()
 535 //
 536 // Attempts to parse an integer from the character string given.
 537 // Returns true only if no parsing error occurs.
 538 /////////////////////////////////////////////////////////////////
 539
 540 bool GetInteger (char *data, int *val){
 541   char *endPtr;
 542   long int retVal;
 543
 544   assert (val);
 545
 546   errno = 0;
 547   retVal = strtol (data, &endPtr, 0);
 548   if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
 549   if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN)) return false;
 550   if (retVal < (long) INT_MIN || retVal > (long) INT_MAX) return false;
 551   *val = (int) retVal;
 552   return true;
 553 }
 554
 555 /////////////////////////////////////////////////////////////////
 556 // GetFloat()
 557 //
 558 // Attempts to parse a float from the character string given.
 559 // Returns true only if no parsing error occurs.
 560 /////////////////////////////////////////////////////////////////
 561
 562 bool GetFloat (char *data, float *val){
 563   char *endPtr;
 564   double retVal;
 565
 566   assert (val);
 567
 568   errno = 0;
 569   retVal = strtod (data, &endPtr);
 570   if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
 571   if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0)) return false;
 572   *val = (float) retVal;
 573   return true;
 574 }
 575
 576 /////////////////////////////////////////////////////////////////
 577 // ParseParams()
 578 //
 579 // Parse all command-line options.
 580 /////////////////////////////////////////////////////////////////
 581
 582 SafeVector<string> ParseParams (int argc, char **argv){
 583
 584   if (argc < 2){
 585
 586     cerr << "PROBCONS comes with ABSOLUTELY NO WARRANTY.  This is free software, and" << endl
 587          << "you are welcome to redistribute it under certain conditions.  See the" << endl
 588          << "file COPYING.txt for details." << endl
 589          << endl
 590          << "Usage:" << endl
 591          << "       probcons [OPTION]... [MFAFILE]..." << endl
 592          << endl
 593          << "Description:" << endl
 594          << "       Align sequences in MFAFILE(s) and print result to standard output" << endl
 595          << endl
 596          << "       -clustalw" << endl
 597          << "              use CLUSTALW output format instead of MFA" << endl
 598          << endl
 599          << "       -c, --consistency REPS" << endl
 600          << "              use " << MIN_CONSISTENCY_REPS << " <= REPS <= " << MAX_CONSISTENCY_REPS
 601          << " (default: " << numConsistencyReps << ") passes of consistency transformation" << endl
 602          << endl
 603          << "       -ir, --iterative-refinement REPS" << endl
 604          << "              use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS
 605          << " (default: " << numIterativeRefinementReps << ") passes of iterative-refinement" << endl
 606          << endl
 607          << "       -pre, --pre-training REPS" << endl
 608          << "              use " << MIN_PRETRAINING_REPS << " <= REPS <= " << MAX_PRETRAINING_REPS
 609          << " (default: " << numPreTrainingReps << ") rounds of pretraining" << endl
 610          << endl
 611          << "       -pairs" << endl
 612          << "              generate all-pairs pairwise alignments" << endl
 613          << endl
 614          << "       -viterbi" << endl
 615          << "              use Viterbi algorithm to generate all pairs (automatically enables -pairs)" << endl
 616          << endl
 617          << "       -v, --verbose" << endl
 618          << "              report progress while aligning (default: " << (enableVerbose ? "on" : "off") << ")" << endl
 619          << endl
 620          << "       -annot FILENAME" << endl
 621          << "              write annotation for multiple alignment to FILENAME" << endl
 622          << endl
 623          << "       -t, --train FILENAME" << endl
 624          << "              compute EM transition probabilities, store in FILENAME (default: "
 625          << parametersOutputFilename << ")" << endl
 626          << endl
 627          << "       -e, --emissions" << endl
 628          << "              also reestimate emission probabilities (default: "
 629          << (enableTrainEmissions ? "on" : "off") << ")" << endl
 630          << endl
 631          << "       -p, --paramfile FILENAME" << endl
 632          << "              read parameters from FILENAME (default: "
 633          << parametersInputFilename << ")" << endl
 634          << endl
 635          << "       -a, --alignment-order" << endl
 636          << "              print sequences in alignment order rather than input order (default: "
 637          << (enableAlignOrder ? "on" : "off") << ")" << endl
 638          << endl;
 639     //           << "       -go, --gap-open VALUE" << endl
 640     //           << "              gap opening penalty of VALUE <= 0 (default: " << gapOpenPenalty << ")" << endl
 641     //   << endl
 642     //   << "       -ge, --gap-extension VALUE" << endl
 643     //   << "              gap extension penalty of VALUE <= 0 (default: " << gapContinuePenalty << ")" << endl
 644     //   << endl
 645     //         << "       -co, --cutoff CUTOFF" << endl
 646     //         << "              subtract 0 <= CUTOFF <= 1 (default: " << cutoff << ") from all posterior values before final alignment" << endl
 647     //         << endl
 648
 649     exit (1);
 650   }
 651
 652   SafeVector<string> sequenceNames;
 653   int tempInt;
 654   float tempFloat;
 655
 656   for (int i = 1; i < argc; i++){
 657     if (argv[i][0] == '-'){
 658
 659       // training
 660       if (!strcmp (argv[i], "-t") || !strcmp (argv[i], "--train")){
 661         enableTraining = true;
 662         if (i < argc - 1)
 663           parametersOutputFilename = string (argv[++i]);
 664         else {
 665           cerr << "ERROR: Filename expected for option " << argv[i] << endl;
 666           exit (1);
 667         }
 668       }
 669
 670       // emission training
 671       else if (!strcmp (argv[i], "-e") || !strcmp (argv[i], "--emissions")){
 672         enableTrainEmissions = true;
 673       }
 674
 675       // parameter file
 676       else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){
 677         if (i < argc - 1)
 678           parametersInputFilename = string (argv[++i]);
 679         else {
 680           cerr << "ERROR: Filename expected for option " << argv[i] << endl;
 681           exit (1);
 682         }
 683       }
 684
 685       // number of consistency transformations
 686       else if (!strcmp (argv[i], "-c") || !strcmp (argv[i], "--consistency")){
 687         if (i < argc - 1){
 688           if (!GetInteger (argv[++i], &tempInt)){
 689             cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
 690             exit (1);
 691           }
 692           else {
 693             if (tempInt < MIN_CONSISTENCY_REPS || tempInt > MAX_CONSISTENCY_REPS){
 694               cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
 695                    << MIN_CONSISTENCY_REPS << " and " << MAX_CONSISTENCY_REPS << "." << endl;
 696               exit (1);
 697             }
 698             else
 699               numConsistencyReps = tempInt;
 700           }
 701         }
 702         else {
 703           cerr << "ERROR: Integer expected for option " << argv[i] << endl;
 704           exit (1);
 705         }
 706       }
 707
 708       // number of randomized partitioning iterative refinement passes
 709       else if (!strcmp (argv[i], "-ir") || !strcmp (argv[i], "--iterative-refinement")){
 710         if (i < argc - 1){
 711           if (!GetInteger (argv[++i], &tempInt)){
 712             cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
 713             exit (1);
 714           }
 715           else {
 716             if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS || tempInt > MAX_ITERATIVE_REFINEMENT_REPS){
 717               cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
 718                    << MIN_ITERATIVE_REFINEMENT_REPS << " and " << MAX_ITERATIVE_REFINEMENT_REPS << "." << endl;
 719               exit (1);
 720             }
 721             else
 722               numIterativeRefinementReps = tempInt;
 723           }
 724         }
 725         else {
 726           cerr << "ERROR: Integer expected for option " << argv[i] << endl;
 727           exit (1);
 728         }
 729       }
 730
 731       // number of EM pre-training rounds
 732       else if (!strcmp (argv[i], "-pre") || !strcmp (argv[i], "--pre-training")){
 733         if (i < argc - 1){
 734           if (!GetInteger (argv[++i], &tempInt)){
 735             cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
 736             exit (1);
 737           }
 738           else {
 739             if (tempInt < MIN_PRETRAINING_REPS || tempInt > MAX_PRETRAINING_REPS){
 740               cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
 741                    << MIN_PRETRAINING_REPS << " and " << MAX_PRETRAINING_REPS << "." << endl;
 742               exit (1);
 743             }
 744             else
 745               numPreTrainingReps = tempInt;
 746           }
 747         }
 748         else {
 749           cerr << "ERROR: Integer expected for option " << argv[i] << endl;
 750           exit (1);
 751         }
 752       }
 753
 754       // gap open penalty
 755       else if (!strcmp (argv[i], "-go") || !strcmp (argv[i], "--gap-open")){
 756         if (i < argc - 1){
 757           if (!GetFloat (argv[++i], &tempFloat)){
 758             cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
 759             exit (1);
 760           }
 761           else {
 762             if (tempFloat > 0){
 763               cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
 764               exit (1);
 765             }
 766             else
 767               gapOpenPenalty = tempFloat;
 768           }
 769         }
 770         else {
 771           cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
 772           exit (1);
 773         }
 774       }
 775
 776       // gap extension penalty
 777       else if (!strcmp (argv[i], "-ge") || !strcmp (argv[i], "--gap-extension")){
 778         if (i < argc - 1){
 779           if (!GetFloat (argv[++i], &tempFloat)){
 780             cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
 781             exit (1);
 782           }
 783           else {
 784             if (tempFloat > 0){
 785               cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
 786               exit (1);
 787             }
 788             else
 789               gapContinuePenalty = tempFloat;
 790           }
 791         }
 792         else {
 793           cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
 794           exit (1);
 795         }
 796       }
 797
 798       // all-pairs pairwise alignments
 799       else if (!strcmp (argv[i], "-pairs")){
 800         enableAllPairs = true;
 801       }
 802
 803       // all-pairs pairwise Viterbi alignments
 804       else if (!strcmp (argv[i], "-viterbi")){
 805         enableAllPairs = true;
 806         enableViterbi = true;
 807       }
 808
 809       // annotation files
 810       else if (!strcmp (argv[i], "-annot")){
 811         enableAnnotation = true;
 812         if (i < argc - 1)
 813           annotationFilename = argv[++i];
 814         else {
 815           cerr << "ERROR: FILENAME expected for option " << argv[i] << endl;
 816           exit (1);
 817         }
 818       }
 819
 820       // clustalw output format
 821       else if (!strcmp (argv[i], "-clustalw")){
 822         enableClustalWOutput = true;
 823       }
 824
 825       // cutoff
 826       else if (!strcmp (argv[i], "-co") || !strcmp (argv[i], "--cutoff")){
 827         if (i < argc - 1){
 828           if (!GetFloat (argv[++i], &tempFloat)){
 829             cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
 830             exit (1);
 831           }
 832           else {
 833             if (tempFloat < 0 || tempFloat > 1){
 834               cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must be between 0 and 1." << endl;
 835               exit (1);
 836             }
 837             else
 838               cutoff = tempFloat;
 839           }
 840         }
 841         else {
 842           cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
 843           exit (1);
 844         }
 845       }
 846
 847       // verbose reporting
 848       else if (!strcmp (argv[i], "-v") || !strcmp (argv[i], "--verbose")){
 849         enableVerbose = true;
 850       }
 851
 852       // alignment order
 853       else if (!strcmp (argv[i], "-a") || !strcmp (argv[i], "--alignment-order")){
 854         enableAlignOrder = true;
 855       }
 856
 857       // bad arguments
 858       else {
 859         cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
 860         exit (1);
 861       }
 862     }
 863     else {
 864       sequenceNames.push_back (string (argv[i]));
 865     }
 866   }
 867
 868   if (enableTrainEmissions && !enableTraining){
 869     cerr << "ERROR: Training emissions (-e) requires training (-t)" << endl;
 870     exit (1);
 871   }
 872
 873   return sequenceNames;
 874 }
 875
 876 /////////////////////////////////////////////////////////////////
 877 // ReadParameters()
 878 //
 879 // Read initial distribution, transition, and emission
 880 // parameters from a file.
 881 /////////////////////////////////////////////////////////////////
 882
 883 void ReadParameters (){
 884
 885   ifstream data;
 886
 887   emitPairs = VVF (256, VF (256, 1e-10));
 888   emitSingle = VF (256, 1e-5);
 889
 890   // read initial state distribution and transition parameters
 891   if (parametersInputFilename == string ("")){
 892     if (NumInsertStates == 1){
 893       for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib1Default[i];
 894       for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen1Default[i];
 895       for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend1Default[i];
 896     }
 897     else if (NumInsertStates == 2){
 898       for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib2Default[i];
 899       for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen2Default[i];
 900       for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend2Default[i];
 901     }
 902     else {
 903       cerr << "ERROR: No default initial distribution/parameter settings exist" << endl
 904            << "       for " << NumInsertStates << " pairs of insert states.  Use --paramfile." << endl;
 905       exit (1);
 906     }
 907
 908     alphabet = alphabetDefault;
 909
 910     for (int i = 0; i < (int) alphabet.length(); i++){
 911       emitSingle[(unsigned char) tolower(alphabet[i])] = emitSingleDefault[i];
 912       emitSingle[(unsigned char) toupper(alphabet[i])] = emitSingleDefault[i];
 913       for (int j = 0; j <= i; j++){
 914         emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
 915         emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
 916         emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
 917         emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
 918         emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
 919         emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
 920         emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
 921         emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
 922       }
 923     }
 924   }
 925   else {
 926     data.open (parametersInputFilename.c_str());
 927     if (data.fail()){
 928       cerr << "ERROR: Unable to read parameter file: " << parametersInputFilename << endl;
 929       exit (1);
 930     }
 931
 932     string line[3];
 933     for (int i = 0; i < 3; i++){
 934       if (!getline (data, line[i])){
 935         cerr << "ERROR: Unable to read transition parameters from parameter file: " << parametersInputFilename << endl;
 936         exit (1);
 937       }
 938     }
 939     istringstream data2;
 940     data2.clear(); data2.str (line[0]); for (int i = 0; i < NumMatrixTypes; i++) data2 >> initDistrib[i];
 941     data2.clear(); data2.str (line[1]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapOpen[i];
 942     data2.clear(); data2.str (line[2]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapExtend[i];
 943
 944     if (!getline (data, line[0])){
 945       cerr << "ERROR: Unable to read alphabet from scoring matrix file: " << parametersInputFilename << endl;
 946       exit (1);
 947     }
 948
 949     // read alphabet as concatenation of all characters on alphabet line
 950     alphabet = "";
 951     string token;
 952     data2.clear(); data2.str (line[0]); while (data2 >> token) alphabet += token;
 953
 954     for (int i = 0; i < (int) alphabet.size(); i++){
 955       for (int j = 0; j <= i; j++){
 956         float val;
 957         data >> val;
 958         emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
 959         emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
 960         emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
 961         emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
 962         emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
 963         emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
 964         emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
 965         emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
 966       }
 967     }
 968
 969     for (int i = 0; i < (int) alphabet.size(); i++){
 970       float val;
 971       data >> val;
 972       emitSingle[(unsigned char) tolower(alphabet[i])] = val;
 973       emitSingle[(unsigned char) toupper(alphabet[i])] = val;
 974     }
 975     data.close();
 976   }
 977 }
 978
 979 /////////////////////////////////////////////////////////////////
 980 // ProcessTree()
 981 //
 982 // Process the tree recursively.  Returns the aligned sequences
 983 // corresponding to a node or leaf of the tree.
 984 /////////////////////////////////////////////////////////////////
 985
 986 MultiSequence *ProcessTree (const TreeNode *tree, MultiSequence *sequences,
 987                             const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 988                             const ProbabilisticModel &model){
 989   MultiSequence *result;
 990
 991   // check if this is a node of the alignment tree
 992   if (tree->GetSequenceLabel() == -1){
 993     MultiSequence *alignLeft = ProcessTree (tree->GetLeftChild(), sequences, sparseMatrices, model);
 994     MultiSequence *alignRight = ProcessTree (tree->GetRightChild(), sequences, sparseMatrices, model);
 995
 996     assert (alignLeft);
 997     assert (alignRight);
 998
 999     result = AlignAlignments (alignLeft, alignRight, sparseMatrices, model);
1000     assert (result);
1001
1002     delete alignLeft;
1003     delete alignRight;
1004   }
1005
1006   // otherwise, this is a leaf of the alignment tree
1007   else {
1008     result = new MultiSequence(); assert (result);
1009     result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
1010   }
1011
1012   return result;
1013 }
1014
1015 /////////////////////////////////////////////////////////////////
1016 // ComputeFinalAlignment()
1017 //
1018 // Compute the final alignment by calling ProcessTree(), then
1019 // performing iterative refinement as needed.
1020 /////////////////////////////////////////////////////////////////
1021
1022 MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
1023                                       const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1024                                       const ProbabilisticModel &model){
1025
1026   MultiSequence *alignment = ProcessTree (tree, sequences, sparseMatrices, model);
1027
1028   SafeVector<int> oldOrdering;
1029   if (enableAlignOrder){
1030     for (int i = 0; i < alignment->GetNumSequences(); i++)
1031       oldOrdering.push_back (alignment->GetSequence(i)->GetSortLabel());
1032     alignment->SaveOrdering();
1033     enableAlignOrder = false;
1034   }
1035
1036   // tree-based refinement
1037   // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree);
1038
1039   // iterative refinement
1040   for (int i = 0; i < numIterativeRefinementReps; i++)
1041     DoIterativeRefinement (sparseMatrices, model, alignment);
1042
1043   cerr << endl;
1044
1045   if (oldOrdering.size() > 0){
1046     for (int i = 0; i < (int) oldOrdering.size(); i++){
1047       alignment->GetSequence(i)->SetSortLabel(oldOrdering[i]);
1048     }
1049   }
1050
1051   // return final alignment
1052   return alignment;
1053 }
1054
1055 /////////////////////////////////////////////////////////////////
1056 // AlignAlignments()
1057 //
1058 // Returns the alignment of two MultiSequence objects.
1059 /////////////////////////////////////////////////////////////////
1060
1061 MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
1062                                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1063                                 const ProbabilisticModel &model){
1064
1065   // print some info about the alignment
1066   if (enableVerbose){
1067     for (int i = 0; i < align1->GetNumSequences(); i++)
1068       cerr << ((i==0) ? "[" : ",") << align1->GetSequence(i)->GetLabel();
1069     cerr << "] vs. ";
1070     for (int i = 0; i < align2->GetNumSequences(); i++)
1071       cerr << ((i==0) ? "[" : ",") << align2->GetSequence(i)->GetLabel();
1072     cerr << "]: ";
1073   }
1074
1075   VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
1076   pair<SafeVector<char> *, float> alignment;
1077
1078   // choose the alignment routine depending on the "cosmetic" gap penalties used
1079   if (gapOpenPenalty == 0 && gapContinuePenalty == 0)
1080     alignment = model.ComputeAlignment (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior);
1081   else
1082     alignment = model.ComputeAlignmentWithGapPenalties (align1, align2,
1083                                                         *posterior, align1->GetNumSequences(), align2->GetNumSequences(),
1084                                                         gapOpenPenalty, gapContinuePenalty);
1085
1086   delete posterior;
1087
1088   if (enableVerbose){
1089
1090     // compute total length of sequences
1091     int totLength = 0;
1092     for (int i = 0; i < align1->GetNumSequences(); i++)
1093       for (int j = 0; j < align2->GetNumSequences(); j++)
1094         totLength += min (align1->GetSequence(i)->GetLength(), align2->GetSequence(j)->GetLength());
1095
1096     // give an "accuracy" measure for the alignment
1097     cerr << alignment.second / totLength << endl;
1098   }
1099
1100   // now build final alignment
1101   MultiSequence *result = new MultiSequence();
1102   for (int i = 0; i < align1->GetNumSequences(); i++)
1103     result->AddSequence (align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
1104   for (int i = 0; i < align2->GetNumSequences(); i++)
1105     result->AddSequence (align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
1106   if (!enableAlignOrder)
1107     result->SortByLabel();
1108
1109   // free temporary alignment
1110   delete alignment.first;
1111
1112   return result;
1113 }
1114
1115 /////////////////////////////////////////////////////////////////
1116 // DoRelaxation()
1117 //
1118 // Performs one round of the consistency transformation.  The
1119 // formula used is:
1120 //                     1
1121 //    P'(x[i]-y[j]) = ---  sum   sum P(x[i]-z[k]) P(z[k]-y[j])
1122 //                    |S| z in S  k
1123 //
1124 // where S = {x, y, all other sequences...}
1125 //
1126 /////////////////////////////////////////////////////////////////
1127
1128 SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences,
1129                                                       SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1130   const int numSeqs = sequences->GetNumSequences();
1131
1132   SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
1133
1134   // for every pair of sequences
1135   for (int i = 0; i < numSeqs; i++){
1136     for (int j = i+1; j < numSeqs; j++){
1137       Sequence *seq1 = sequences->GetSequence (i);
1138       Sequence *seq2 = sequences->GetSequence (j);
1139
1140       if (enableVerbose)
1141         cerr << "Relaxing (" << i+1 << ") " << seq1->GetHeader() << " vs. "
1142              << "(" << j+1 << ") " << seq2->GetHeader() << ": ";
1143
1144       // get the original posterior matrix
1145       VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); assert (posteriorPtr);
1146       VF &posterior = *posteriorPtr;
1147
1148       const int seq1Length = seq1->GetLength();
1149       const int seq2Length = seq2->GetLength();
1150
1151       // contribution from the summation where z = x and z = y
1152       for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] += posterior[k];
1153
1154       if (enableVerbose)
1155         cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
1156
1157       // contribution from all other sequences
1158       for (int k = 0; k < numSeqs; k++) if (k != i && k != j){
1159         if (k < i)
1160           Relax1 (sparseMatrices[k][i], sparseMatrices[k][j], posterior);
1161         else if (k > i && k < j)
1162           Relax (sparseMatrices[i][k], sparseMatrices[k][j], posterior);
1163         else {
1164           SparseMatrix *temp = sparseMatrices[j][k]->ComputeTranspose();
1165           Relax (sparseMatrices[i][k], temp, posterior);
1166           delete temp;
1167         }
1168       }
1169
1170       // now renormalization
1171       for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] /= numSeqs;
1172
1173       // mask out positions not originally in the posterior matrix
1174       SparseMatrix *matXY = sparseMatrices[i][j];
1175       for (int y = 0; y <= seq2Length; y++) posterior[y] = 0;
1176       for (int x = 1; x <= seq1Length; x++){
1177         SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
1178         SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
1179         VF::iterator base = posterior.begin() + x * (seq2Length + 1);
1180         int curr = 0;
1181         while (XYptr != XYend){
1182
1183           // zero out all cells until the first filled column
1184           while (curr < XYptr->first){
1185             base[curr] = 0;
1186             curr++;
1187           }
1188
1189           // now, skip over this column
1190           curr++;
1191           ++XYptr;
1192         }
1193
1194         // zero out cells after last column
1195         while (curr <= seq2Length){
1196           base[curr] = 0;
1197           curr++;
1198         }
1199       }
1200
1201       // save the new posterior matrix
1202       newSparseMatrices[i][j] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), posterior);
1203       newSparseMatrices[j][i] = NULL;
1204
1205       if (enableVerbose)
1206         cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
1207
1208       delete posteriorPtr;
1209
1210       if (enableVerbose)
1211         cerr << "done." << endl;
1212     }
1213   }
1214
1215   return newSparseMatrices;
1216 }
1217
1218 /////////////////////////////////////////////////////////////////
1219 // Relax()
1220 //
1221 // Computes the consistency transformation for a single sequence
1222 // z, and adds the transformed matrix to "posterior".
1223 /////////////////////////////////////////////////////////////////
1224
1225 void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior){
1226
1227   assert (matXZ);
1228   assert (matZY);
1229
1230   int lengthX = matXZ->GetSeq1Length();
1231   int lengthY = matZY->GetSeq2Length();
1232   assert (matXZ->GetSeq2Length() == matZY->GetSeq1Length());
1233
1234   // for every x[i]
1235   for (int i = 1; i <= lengthX; i++){
1236     SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
1237     SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
1238
1239     VF::iterator base = posterior.begin() + i * (lengthY + 1);
1240
1241     // iterate through all x[i]-z[k]
1242     while (XZptr != XZend){
1243       SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
1244       SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(XZptr->first);
1245       const float XZval = XZptr->second;
1246
1247       // iterate through all z[k]-y[j]
1248       while (ZYptr != ZYend){
1249         base[ZYptr->first] += XZval * ZYptr->second;
1250         ZYptr++;
1251       }
1252       XZptr++;
1253     }
1254   }
1255 }
1256
1257 /////////////////////////////////////////////////////////////////
1258 // Relax1()
1259 //
1260 // Computes the consistency transformation for a single sequence
1261 // z, and adds the transformed matrix to "posterior".
1262 /////////////////////////////////////////////////////////////////
1263
1264 void Relax1 (SparseMatrix *matZX, SparseMatrix *matZY, VF &posterior){
1265
1266   assert (matZX);
1267   assert (matZY);
1268
1269   int lengthZ = matZX->GetSeq1Length();
1270   int lengthY = matZY->GetSeq2Length();
1271
1272   // for every z[k]
1273   for (int k = 1; k <= lengthZ; k++){
1274     SafeVector<PIF>::iterator ZXptr = matZX->GetRowPtr(k);
1275     SafeVector<PIF>::iterator ZXend = ZXptr + matZX->GetRowSize(k);
1276
1277     // iterate through all z[k]-x[i]
1278     while (ZXptr != ZXend){
1279       SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(k);
1280       SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(k);
1281       const float ZXval = ZXptr->second;
1282       VF::iterator base = posterior.begin() + ZXptr->first * (lengthY + 1);
1283
1284       // iterate through all z[k]-y[j]
1285       while (ZYptr != ZYend){
1286         base[ZYptr->first] += ZXval * ZYptr->second;
1287         ZYptr++;
1288       }
1289       ZXptr++;
1290     }
1291   }
1292 }
1293
1294 /////////////////////////////////////////////////////////////////
1295 // GetSubtree
1296 //
1297 // Returns set containing all leaf labels of the current subtree.
1298 /////////////////////////////////////////////////////////////////
1299
1300 set<int> GetSubtree (const TreeNode *tree){
1301   set<int> s;
1302
1303   if (tree->GetSequenceLabel() == -1){
1304     s = GetSubtree (tree->GetLeftChild());
1305     set<int> t = GetSubtree (tree->GetRightChild());
1306
1307     for (set<int>::iterator iter = t.begin(); iter != t.end(); ++iter)
1308       s.insert (*iter);
1309   }
1310   else {
1311     s.insert (tree->GetSequenceLabel());
1312   }
1313
1314   return s;
1315 }
1316
1317 /////////////////////////////////////////////////////////////////
1318 // TreeBasedBiPartitioning
1319 //
1320 // Uses the iterative refinement scheme from MUSCLE.
1321 /////////////////////////////////////////////////////////////////
1322
1323 void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1324                               const ProbabilisticModel &model, MultiSequence* &alignment,
1325                               const TreeNode *tree){
1326   // check if this is a node of the alignment tree
1327   if (tree->GetSequenceLabel() == -1){
1328     TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetLeftChild());
1329     TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetRightChild());
1330
1331     set<int> leftSubtree = GetSubtree (tree->GetLeftChild());
1332     set<int> rightSubtree = GetSubtree (tree->GetRightChild());
1333     set<int> leftSubtreeComplement, rightSubtreeComplement;
1334
1335     // calculate complement of each subtree
1336     for (int i = 0; i < alignment->GetNumSequences(); i++){
1337       if (leftSubtree.find(i) == leftSubtree.end()) leftSubtreeComplement.insert (i);
1338       if (rightSubtree.find(i) == rightSubtree.end()) rightSubtreeComplement.insert (i);
1339     }
1340
1341     // perform realignments for edge to left child
1342     if (!leftSubtree.empty() && !leftSubtreeComplement.empty()){
1343       MultiSequence *groupOneSeqs = alignment->Project (leftSubtree); assert (groupOneSeqs);
1344       MultiSequence *groupTwoSeqs = alignment->Project (leftSubtreeComplement); assert (groupTwoSeqs);
1345       delete alignment;
1346       alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1347     }
1348
1349     // perform realignments for edge to right child
1350     if (!rightSubtree.empty() && !rightSubtreeComplement.empty()){
1351       MultiSequence *groupOneSeqs = alignment->Project (rightSubtree); assert (groupOneSeqs);
1352       MultiSequence *groupTwoSeqs = alignment->Project (rightSubtreeComplement); assert (groupTwoSeqs);
1353       delete alignment;
1354       alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1355     }
1356   }
1357 }
1358
1359 /////////////////////////////////////////////////////////////////
1360 // DoIterativeRefinement()
1361 //
1362 // Performs a single round of randomized partionining iterative
1363 // refinement.
1364 /////////////////////////////////////////////////////////////////
1365
1366 void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1367                             const ProbabilisticModel &model, MultiSequence* &alignment){
1368   set<int> groupOne, groupTwo;
1369
1370   // create two separate groups
1371   for (int i = 0; i < alignment->GetNumSequences(); i++){
1372     if (rand() % 2)
1373       groupOne.insert (i);
1374     else
1375       groupTwo.insert (i);
1376   }
1377
1378   if (groupOne.empty() || groupTwo.empty()) return;
1379
1380   // project into the two groups
1381   MultiSequence *groupOneSeqs = alignment->Project (groupOne); assert (groupOneSeqs);
1382   MultiSequence *groupTwoSeqs = alignment->Project (groupTwo); assert (groupTwoSeqs);
1383   delete alignment;
1384
1385   // realign
1386   alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1387
1388   delete groupOneSeqs;
1389   delete groupTwoSeqs;
1390 }
1391
1392 /////////////////////////////////////////////////////////////////
1393 // WriteAnnotation()
1394 //
1395 // Computes annotation for multiple alignment and write values
1396 // to a file.
1397 /////////////////////////////////////////////////////////////////
1398
1399 void WriteAnnotation (MultiSequence *alignment,
1400                       const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1401   ofstream outfile (annotationFilename.c_str());
1402
1403   if (outfile.fail()){
1404     cerr << "ERROR: Unable to write annotation file." << endl;
1405     exit (1);
1406   }
1407
1408   const int alignLength = alignment->GetSequence(0)->GetLength();
1409   const int numSeqs = alignment->GetNumSequences();
1410
1411   SafeVector<int> position (numSeqs, 0);
1412   SafeVector<SafeVector<char>::iterator> seqs (numSeqs);
1413   for (int i = 0; i < numSeqs; i++) seqs[i] = alignment->GetSequence(i)->GetDataPtr();
1414   SafeVector<pair<int,int> > active;
1415   active.reserve (numSeqs);
1416
1417   SafeVector<int> lab;
1418   for (int i = 0; i < numSeqs; i++) lab.push_back(alignment->GetSequence(i)->GetSortLabel());
1419
1420   // for every column
1421   for (int i = 1; i <= alignLength; i++){
1422
1423     // find all aligned residues in this particular column
1424     active.clear();
1425     for (int j = 0; j < numSeqs; j++){
1426       if (seqs[j][i] != '-'){
1427         active.push_back (make_pair(lab[j], ++position[j]));
1428       }
1429     }
1430
1431     sort (active.begin(), active.end());
1432     outfile << setw(4) << ComputeScore (active, sparseMatrices) << endl;
1433   }
1434
1435   outfile.close();
1436 }
1437
1438 /////////////////////////////////////////////////////////////////
1439 // ComputeScore()
1440 //
1441 // Computes the annotation score for a particular column.
1442 /////////////////////////////////////////////////////////////////
1443
1444 int ComputeScore (const SafeVector<pair<int, int> > &active,
1445                   const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1446
1447   if (active.size() <= 1) return 0;
1448
1449   // ALTERNATIVE #1: Compute the average alignment score.
1450
1451   float val = 0;
1452   for (int i = 0; i < (int) active.size(); i++){
1453     for (int j = i+1; j < (int) active.size(); j++){
1454       val += sparseMatrices[active[i].first][active[j].first]->GetValue(active[i].second, active[j].second);
1455     }
1456   }
1457
1458   return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));
1459
1460 }