binaries/src/mafft/extensions/mxscarna_src/probconsRNA/FixRef.cc

   1 /////////////////////////////////////////////////////////////////
   2 // Main.cc
   3 /////////////////////////////////////////////////////////////////
   4
   5 #include "SafeVector.h"
   6 #include "MultiSequence.h"
   7 #include "Defaults.h"
   8 #include "ScoreType.h"
   9 #include "ProbabilisticModel.h"
  10 #include "EvolutionaryTree.h"
  11 #include "SparseMatrix.h"
  12 #include <string>
  13 #include <iomanip>
  14 #include <iostream>
  15 #include <list>
  16 #include <set>
  17 #include <algorithm>
  18 #include <cstdio>
  19 #include <cstdlib>
  20 #include <cerrno>
  21 #include <iomanip>
  22
  23 string matrixFilename = "";
  24 string parametersInputFilename = "";
  25 string parametersOutputFilename = "no training";
  26
  27 bool enableTraining = false;
  28 bool enableVerbose = false;
  29 int numConsistencyReps = 2;
  30 int numPreTrainingReps = 0;
  31 int numIterativeRefinementReps = 100;
  32
  33 float gapOpenPenalty = 0;
  34 float gapContinuePenalty = 0;
  35 VF initDistrib (NumMatrixTypes);
  36 VF gapOpen (2*NumInsertStates);
  37 VF gapExtend (2*NumInsertStates);
  38 SafeVector<char> alphabet;
  39 VVF emitPairs;
  40 VF emitSingle;
  41
  42 const int MIN_PRETRAINING_REPS = 0;
  43 const int MAX_PRETRAINING_REPS = 20;
  44 const int MIN_CONSISTENCY_REPS = 0;
  45 const int MAX_CONSISTENCY_REPS = 5;
  46 const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
  47 const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
  48
  49 /////////////////////////////////////////////////////////////////
  50 // Function prototypes
  51 /////////////////////////////////////////////////////////////////
  52
  53 void PrintHeading();
  54 void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
  55                       const VF &gapExtend, const char *filename);
  56 MultiSequence *DoAlign (MultiSequence *sequence, const ProbabilisticModel &model);
  57 SafeVector<string> ParseParams (int argc, char **argv);
  58 void ReadParameters ();
  59 MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
  60                                       const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
  61                                       const ProbabilisticModel &model);
  62 MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
  63                                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
  64                                 const ProbabilisticModel &model);
  65 void DoRelaxation (MultiSequence *sequences, SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
  66 void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);
  67 void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
  68                             const ProbabilisticModel &model, MultiSequence* &alignment);
  69 //float ScoreAlignment (MultiSequence *alignment, MultiSequence *sequences, SparseMatrix **sparseMatrices, const int numSeqs);
  70
  71 /////////////////////////////////////////////////////////////////
  72 // main()
  73 //
  74 // Calls all initialization routines and runs the PROBCONS
  75 // aligner.
  76 /////////////////////////////////////////////////////////////////
  77
  78 int main (int argc, char **argv){
  79
  80   if (argc != 3){
  81     cerr << "Usage: FixRef inputfile reffile" << endl;
  82     exit (1);
  83   }
  84
  85   string inputFilename = string (argv[1]);
  86   string refFilename = string (argv[2]);
  87
  88   ReadParameters();
  89
  90   // build new model for aligning
  91   ProbabilisticModel model (initDistrib, gapOpen, gapExtend,
  92                             alphabet, emitPairs, emitSingle);
  93
  94   MultiSequence *inputSeq = new MultiSequence(); inputSeq->LoadMFA (inputFilename);
  95   MultiSequence *refSeq = new MultiSequence(); refSeq->LoadMFA (refFilename);
  96
  97   SafeVector<char> *ali = new SafeVector<char>;
  98
  99   if (refSeq->GetNumSequences() != 2){
 100     cerr << "ERROR: Expected two sequences in reference alignment." << endl;
 101     exit (1);
 102   }
 103   set<int> s; s.insert (0);
 104   MultiSequence *ref1 = refSeq->Project (s);
 105   s.clear(); s.insert (1);
 106   MultiSequence *ref2 = refSeq->Project (s);
 107
 108   for (int i = 0; i < inputSeq->GetNumSequences(); i++){
 109     if (inputSeq->GetSequence(i)->GetHeader() == ref1->GetSequence(0)->GetHeader()){
 110       ref1->AddSequence (inputSeq->GetSequence(i)->Clone());
 111     }
 112     if (inputSeq->GetSequence(i)->GetHeader() == ref2->GetSequence(0)->GetHeader())
 113       ref2->AddSequence (inputSeq->GetSequence(i)->Clone());
 114   }
 115   if (ref1->GetNumSequences() != 2){
 116     cerr << "ERROR: Expected two sequences in reference1 alignment." << endl;
 117     exit (1);
 118   }
 119   if (ref2->GetNumSequences() != 2){
 120     cerr << "ERROR: Expected two sequences in reference2 alignment." << endl;
 121     exit (1);
 122   }
 123
 124   ref1->GetSequence(0)->SetLabel(0);
 125   ref2->GetSequence(0)->SetLabel(0);
 126   ref1->GetSequence(1)->SetLabel(1);
 127   ref2->GetSequence(1)->SetLabel(1);
 128
 129   //  cerr << "Aligning..." << endl;
 130
 131   // now, we can perform the alignments and write them out
 132   MultiSequence *alignment1 = DoAlign (ref1,
 133                                        ProbabilisticModel (initDistrib, gapOpen, gapExtend,
 134                                                            alphabet, emitPairs, emitSingle));
 135
 136   //cerr << "Aligning second..." << endl;
 137   MultiSequence *alignment2 = DoAlign (ref2,
 138                                        ProbabilisticModel (initDistrib, gapOpen, gapExtend,
 139                                                            alphabet, emitPairs, emitSingle));
 140
 141   SafeVector<char>::iterator iter1 = alignment1->GetSequence(0)->GetDataPtr();
 142   SafeVector<char>::iterator iter2 = alignment1->GetSequence(1)->GetDataPtr();
 143   for (int i = 1; i <= alignment1->GetSequence(0)->GetLength(); i++){
 144     if (islower(iter1[i])) iter2[i] = tolower(iter2[i]);
 145     if (isupper(iter1[i])) iter2[i] = toupper(iter2[i]);
 146   }
 147   iter1 = alignment2->GetSequence(0)->GetDataPtr();
 148   iter2 = alignment2->GetSequence(1)->GetDataPtr();
 149   for (int i = 1; i <= alignment2->GetSequence(0)->GetLength(); i++){
 150     if (islower(iter1[i])) iter2[i] = tolower(iter2[i]);
 151     if (isupper(iter1[i])) iter2[i] = toupper(iter2[i]);
 152   }
 153   //alignment1->WriteMFA (cout);
 154   //alignment2->WriteMFA (cout);
 155
 156   int a1 = 0, a = 0;
 157   int b1 = 0, b = 0;
 158
 159   for (int i = 1; i <= refSeq->GetSequence(0)->GetLength(); i++){
 160
 161     // catch up in filler sequences
 162     if (refSeq->GetSequence(0)->GetPosition(i) != '-'){
 163       while (true){
 164         a++;
 165         if (alignment1->GetSequence(0)->GetPosition(a) != '-') break;
 166         ali->push_back ('X');
 167       }
 168     }
 169     if (refSeq->GetSequence(1)->GetPosition(i) != '-'){
 170       while (true){
 171         b++;
 172         if (alignment2->GetSequence(0)->GetPosition(b) != '-') break;
 173         ali->push_back ('Y');
 174       }
 175     }
 176
 177     if (refSeq->GetSequence(0)->GetPosition(i) != '-' &&
 178         refSeq->GetSequence(1)->GetPosition(i) != '-'){
 179       //cerr << "M: " << refSeq->GetSequence(0)->GetPosition(i) << refSeq->GetSequence(1)->GetPosition(i) << endl;
 180       ali->push_back ('B');
 181     }
 182     else if (refSeq->GetSequence(0)->GetPosition(i) != '-'){
 183       //cerr << "X" << endl;
 184       ali->push_back ('X');
 185     }
 186     else if (refSeq->GetSequence(1)->GetPosition(i) != '-'){
 187       //cerr << "Y" << endl;
 188       ali->push_back ('Y');
 189     }
 190   }
 191
 192   while (a < alignment1->GetSequence(0)->GetLength()){
 193     a++;
 194     ali->push_back ('X');
 195     if (alignment1->GetSequence(0)->GetPosition(a) != '-') a1++;
 196   }
 197   while (b < alignment2->GetSequence(0)->GetLength()){
 198     b++;
 199     ali->push_back ('Y');
 200     if (alignment2->GetSequence(0)->GetPosition(b) != '-') b1++;
 201   }
 202
 203   Sequence *seq1 = alignment1->GetSequence(1)->AddGaps (ali, 'X');
 204   Sequence *seq2 = alignment2->GetSequence(1)->AddGaps (ali, 'Y');
 205   seq1->WriteMFA (cout, 60);
 206   seq2->WriteMFA (cout, 60);
 207
 208   delete seq1;
 209   delete seq2;
 210
 211   delete ali;
 212   delete alignment1;
 213   delete alignment2;
 214   delete inputSeq;
 215   delete refSeq;
 216 }
 217
 218 /////////////////////////////////////////////////////////////////
 219 // PrintHeading()
 220 //
 221 // Prints heading for PROBCONS program.
 222 /////////////////////////////////////////////////////////////////
 223
 224 void PrintHeading (){
 225   cerr << endl
 226        << "PROBCONS version 1.02 - align multiple protein sequences and print to standard output" << endl
 227        << "Copyright (C) 2004  Chuong Ba Do" << endl
 228        << endl;
 229 }
 230
 231 /////////////////////////////////////////////////////////////////
 232 // PrintParameters()
 233 //
 234 // Prints PROBCONS parameters to STDERR.  If a filename is
 235 // specified, then the parameters are also written to the file.
 236 /////////////////////////////////////////////////////////////////
 237
 238 void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
 239                       const VF &gapExtend, const char *filename){
 240
 241   // print parameters to the screen
 242   cerr << message << endl
 243        << "    initDistrib[] = { ";
 244   for (int i = 0; i < NumMatrixTypes; i++) cerr << setprecision (10) << initDistrib[i] << " ";
 245   cerr << "}" << endl
 246        << "        gapOpen[] = { ";
 247   for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapOpen[i] << " ";
 248   cerr << "}" << endl
 249        << "      gapExtend[] = { ";
 250   for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapExtend[i] << " ";
 251   cerr << "}" << endl
 252        << endl;
 253
 254   // if a file name is specified
 255   if (filename){
 256
 257     // attempt to open the file for writing
 258     FILE *file = fopen (filename, "w");
 259     if (!file){
 260       cerr << "ERROR: Unable to write parameter file: " << filename << endl;
 261       exit (1);
 262     }
 263
 264     // if successful, then write the parameters to the file
 265     for (int i = 0; i < NumMatrixTypes; i++) fprintf (file, "%.10f ", initDistrib[i]); fprintf (file, "\n");
 266     for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapOpen[i]); fprintf (file, "\n");
 267     for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapExtend[i]); fprintf (file, "\n");
 268     fclose (file);
 269   }
 270 }
 271
 272 /////////////////////////////////////////////////////////////////
 273 // DoAlign()
 274 //
 275 // First computes all pairwise posterior probability matrices.
 276 // Then, computes new parameters if training, or a final
 277 // alignment, otherwise.
 278 /////////////////////////////////////////////////////////////////
 279
 280 MultiSequence *DoAlign (MultiSequence *sequences, const ProbabilisticModel &model){
 281
 282   assert (sequences);
 283
 284   const int numSeqs = sequences->GetNumSequences();
 285   VVF distances (numSeqs, VF (numSeqs, 0));
 286   SafeVector<SafeVector<SparseMatrix *> > sparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
 287
 288   // do all pairwise alignments
 289   for (int a = 0; a < numSeqs-1; a++){
 290     for (int b = a+1; b < numSeqs; b++){
 291       Sequence *seq1 = sequences->GetSequence (a);
 292       Sequence *seq2 = sequences->GetSequence (b);
 293
 294       // verbose output
 295       if (enableVerbose)
 296         cerr << "(" << a+1 << ") " << seq1->GetHeader() << " vs. "
 297              << "(" << b+1 << ") " << seq2->GetHeader() << ": ";
 298
 299       // compute forward and backward probabilities
 300       VF *forward = model.ComputeForwardMatrix (seq1, seq2); assert (forward);
 301       VF *backward = model.ComputeBackwardMatrix (seq1, seq2); assert (backward);
 302
 303       // if we are training, then we'll simply want to compute the
 304       // expected counts for each region within the matrix separately;
 305       // otherwise, we'll need to put all of the regions together and
 306       // assemble a posterior probability match matrix
 307
 308       // compute posterior probability matrix
 309       VF *posterior = model.ComputePosteriorMatrix (seq1, seq2, *forward, *backward); assert (posterior);
 310
 311       // compute "expected accuracy" distance for evolutionary tree computation
 312       pair<SafeVector<char> *, float> alignment = model.ComputeAlignment (seq1->GetLength(),
 313                                                                           seq2->GetLength(),
 314                                                                           *posterior);
 315
 316       float distance = alignment.second / min (seq1->GetLength(), seq2->GetLength());
 317
 318       if (enableVerbose)
 319         cerr << setprecision (10) << distance << endl;
 320
 321       // save posterior probability matrices in sparse format
 322       distances[a][b] = distances[b][a] = distance;
 323       sparseMatrices[a][b] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), *posterior);
 324       sparseMatrices[b][a] = sparseMatrices[a][b]->ComputeTranspose();
 325
 326       delete alignment.first;
 327       delete posterior;
 328
 329       delete forward;
 330       delete backward;
 331     }
 332   }
 333
 334   if (!enableTraining){
 335     if (enableVerbose)
 336       cerr << endl;
 337
 338     // now, perform the consistency transformation the desired number of times
 339     for (int i = 0; i < numConsistencyReps; i++)
 340       DoRelaxation (sequences, sparseMatrices);
 341
 342     // compute the evolutionary tree
 343     TreeNode *tree = TreeNode::ComputeTree (distances);
 344
 345     //tree->Print (cerr, sequences);
 346     //cerr << endl;
 347
 348     // make the final alignment
 349     MultiSequence *alignment = ComputeFinalAlignment (tree, sequences, sparseMatrices, model);
 350     delete tree;
 351
 352     return alignment;
 353   }
 354
 355   return NULL;
 356 }
 357
 358 /////////////////////////////////////////////////////////////////
 359 // GetInteger()
 360 //
 361 // Attempts to parse an integer from the character string given.
 362 // Returns true only if no parsing error occurs.
 363 /////////////////////////////////////////////////////////////////
 364
 365 bool GetInteger (char *data, int *val){
 366   char *endPtr;
 367   long int retVal;
 368
 369   assert (val);
 370
 371   errno = 0;
 372   retVal = strtol (data, &endPtr, 0);
 373   if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
 374   if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN)) return false;
 375   if (retVal < (long) INT_MIN || retVal > (long) INT_MAX) return false;
 376   *val = (int) retVal;
 377   return true;
 378 }
 379
 380 /////////////////////////////////////////////////////////////////
 381 // GetFloat()
 382 //
 383 // Attempts to parse a float from the character string given.
 384 // Returns true only if no parsing error occurs.
 385 /////////////////////////////////////////////////////////////////
 386
 387 bool GetFloat (char *data, float *val){
 388   char *endPtr;
 389   double retVal;
 390
 391   assert (val);
 392
 393   errno = 0;
 394   retVal = strtod (data, &endPtr);
 395   if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
 396   if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0)) return false;
 397   *val = (float) retVal;
 398   return true;
 399 }
 400
 401 /////////////////////////////////////////////////////////////////
 402 // ParseParams()
 403 //
 404 // Parse all command-line options.
 405 /////////////////////////////////////////////////////////////////
 406
 407 SafeVector<string> ParseParams (int argc, char **argv){
 408
 409   if (argc < 2){
 410
 411     cerr << "PROBCONS comes with ABSOLUTELY NO WARRANTY.  This is free software, and" << endl
 412          << "you are welcome to redistribute it under certain conditions.  See the" << endl
 413          << "file COPYING.txt for details." << endl
 414          << endl
 415          << "Usage:" << endl
 416          << "       probcons [OPTION]... [MFAFILE]..." << endl
 417          << endl
 418          << "Description:" << endl
 419          << "       Align sequences in MFAFILE(s) and print result to standard output" << endl
 420          << endl
 421          << "       -t, --train FILENAME" << endl
 422          << "              compute EM transition probabilities, store in FILENAME (default: "
 423          << parametersOutputFilename << ")" << endl
 424          << endl
 425          << "       -m, --matrixfile FILENAME" << endl
 426          << "              read transition parameters from FILENAME (default: "
 427          << matrixFilename << ")" << endl
 428          << endl
 429          << "       -p, --paramfile FILENAME" << endl
 430          << "              read scoring matrix probabilities from FILENAME (default: "
 431          << parametersInputFilename << ")" << endl
 432          << endl
 433          << "       -c, --consistency REPS" << endl
 434          << "              use " << MIN_CONSISTENCY_REPS << " <= REPS <= " << MAX_CONSISTENCY_REPS
 435          << " (default: " << numConsistencyReps << ") passes of consistency transformation" << endl
 436          << endl
 437          << "       -ir, --iterative-refinement REPS" << endl
 438          << "              use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS
 439          << " (default: " << numIterativeRefinementReps << ") passes of iterative-refinement" << endl
 440          << endl
 441          << "       -pre, --pre-training REPS" << endl
 442          << "              use " << MIN_PRETRAINING_REPS << " <= REPS <= " << MAX_PRETRAINING_REPS
 443          << " (default: " << numPreTrainingReps << ") rounds of pretraining" << endl
 444          << endl
 445          << "       -go, --gap-open VALUE" << endl
 446          << "              gap opening penalty of VALUE <= 0 (default: " << gapOpenPenalty << ")" << endl
 447          << endl
 448          << "       -ge, --gap-extension VALUE" << endl
 449          << "              gap extension penalty of VALUE <= 0 (default: " << gapContinuePenalty << ")" << endl
 450          << endl
 451          << "       -v, --verbose" << endl
 452          << "              report progress while aligning (default: " << (enableVerbose ? "on" : "off") << ")" << endl
 453          << endl;
 454
 455     exit (1);
 456   }
 457
 458   SafeVector<string> sequenceNames;
 459   int tempInt;
 460   float tempFloat;
 461
 462   for (int i = 1; i < argc; i++){
 463     if (argv[i][0] == '-'){
 464
 465       // training
 466       if (!strcmp (argv[i], "-t") || !strcmp (argv[i], "--train")){
 467         enableTraining = true;
 468         if (i < argc - 1)
 469           parametersOutputFilename = string (argv[++i]);
 470         else {
 471           cerr << "ERROR: Filename expected for option " << argv[i] << endl;
 472           exit (1);
 473         }
 474       }
 475
 476       // scoring matrix file
 477       else if (!strcmp (argv[i], "-m") || !strcmp (argv[i], "--matrixfile")){
 478         if (i < argc - 1)
 479           matrixFilename = string (argv[++i]);
 480         else {
 481           cerr << "ERROR: Filename expected for option " << argv[i] << endl;
 482           exit (1);
 483         }
 484       }
 485
 486       // transition/initial distribution parameter file
 487       else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){
 488         if (i < argc - 1)
 489           parametersInputFilename = string (argv[++i]);
 490         else {
 491           cerr << "ERROR: Filename expected for option " << argv[i] << endl;
 492           exit (1);
 493         }
 494       }
 495
 496       // number of consistency transformations
 497       else if (!strcmp (argv[i], "-c") || !strcmp (argv[i], "--consistency")){
 498         if (i < argc - 1){
 499           if (!GetInteger (argv[++i], &tempInt)){
 500             cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
 501             exit (1);
 502           }
 503           else {
 504             if (tempInt < MIN_CONSISTENCY_REPS || tempInt > MAX_CONSISTENCY_REPS){
 505               cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
 506                    << MIN_CONSISTENCY_REPS << " and " << MAX_CONSISTENCY_REPS << "." << endl;
 507               exit (1);
 508             }
 509             else
 510               numConsistencyReps = tempInt;
 511           }
 512         }
 513         else {
 514           cerr << "ERROR: Integer expected for option " << argv[i] << endl;
 515           exit (1);
 516         }
 517       }
 518
 519       // number of randomized partitioning iterative refinement passes
 520       else if (!strcmp (argv[i], "-ir") || !strcmp (argv[i], "--iterative-refinement")){
 521         if (i < argc - 1){
 522           if (!GetInteger (argv[++i], &tempInt)){
 523             cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
 524             exit (1);
 525           }
 526           else {
 527             if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS || tempInt > MAX_ITERATIVE_REFINEMENT_REPS){
 528               cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
 529                    << MIN_ITERATIVE_REFINEMENT_REPS << " and " << MAX_ITERATIVE_REFINEMENT_REPS << "." << endl;
 530               exit (1);
 531             }
 532             else
 533               numIterativeRefinementReps = tempInt;
 534           }
 535         }
 536         else {
 537           cerr << "ERROR: Integer expected for option " << argv[i] << endl;
 538           exit (1);
 539         }
 540       }
 541
 542       // number of EM pre-training rounds
 543       else if (!strcmp (argv[i], "-pre") || !strcmp (argv[i], "--pre-training")){
 544         if (i < argc - 1){
 545           if (!GetInteger (argv[++i], &tempInt)){
 546             cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
 547             exit (1);
 548           }
 549           else {
 550             if (tempInt < MIN_PRETRAINING_REPS || tempInt > MAX_PRETRAINING_REPS){
 551               cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
 552                    << MIN_PRETRAINING_REPS << " and " << MAX_PRETRAINING_REPS << "." << endl;
 553               exit (1);
 554             }
 555             else
 556               numPreTrainingReps = tempInt;
 557           }
 558         }
 559         else {
 560           cerr << "ERROR: Integer expected for option " << argv[i] << endl;
 561           exit (1);
 562         }
 563       }
 564
 565       // gap open penalty
 566       else if (!strcmp (argv[i], "-go") || !strcmp (argv[i], "--gap-open")){
 567         if (i < argc - 1){
 568           if (!GetFloat (argv[++i], &tempFloat)){
 569             cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
 570             exit (1);
 571           }
 572           else {
 573             if (tempFloat > 0){
 574               cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
 575               exit (1);
 576             }
 577             else
 578               gapOpenPenalty = tempFloat;
 579           }
 580         }
 581         else {
 582           cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
 583           exit (1);
 584         }
 585       }
 586
 587       // gap extension penalty
 588       else if (!strcmp (argv[i], "-ge") || !strcmp (argv[i], "--gap-extension")){
 589         if (i < argc - 1){
 590           if (!GetFloat (argv[++i], &tempFloat)){
 591             cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
 592             exit (1);
 593           }
 594           else {
 595             if (tempFloat > 0){
 596               cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
 597               exit (1);
 598             }
 599             else
 600               gapContinuePenalty = tempFloat;
 601           }
 602         }
 603         else {
 604           cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
 605           exit (1);
 606         }
 607       }
 608
 609       // verbose reporting
 610       else if (!strcmp (argv[i], "-v") || !strcmp (argv[i], "--verbose")){
 611         enableVerbose = true;
 612       }
 613
 614       // bad arguments
 615       else {
 616         cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
 617         exit (1);
 618       }
 619     }
 620     else {
 621       sequenceNames.push_back (string (argv[i]));
 622     }
 623   }
 624
 625   return sequenceNames;
 626 }
 627
 628 /////////////////////////////////////////////////////////////////
 629 // ReadParameters()
 630 //
 631 // Read initial distribution, transition, and emission
 632 // parameters from a file.
 633 /////////////////////////////////////////////////////////////////
 634
 635 void ReadParameters (){
 636
 637   ifstream data;
 638
 639   // read initial state distribution and transition parameters
 640   if (parametersInputFilename == string ("")){
 641     if (NumInsertStates == 1){
 642       for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib1Default[i];
 643       for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen1Default[i];
 644       for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend1Default[i];
 645     }
 646     else if (NumInsertStates == 2){
 647       for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib2Default[i];
 648       for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen2Default[i];
 649       for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend2Default[i];
 650     }
 651     else {
 652       cerr << "ERROR: No default initial distribution/parameter settings exist" << endl
 653            << "       for " << NumInsertStates << " pairs of insert states.  Use --paramfile." << endl;
 654       exit (1);
 655     }
 656   }
 657   else {
 658     data.open (parametersInputFilename.c_str());
 659     if (data.fail()){
 660       cerr << "ERROR: Unable to read parameter file: " << parametersInputFilename << endl;
 661       exit (1);
 662     }
 663     for (int i = 0; i < NumMatrixTypes; i++) data >> initDistrib[i];
 664     for (int i = 0; i < 2*NumInsertStates; i++) data >> gapOpen[i];
 665     for (int i = 0; i < 2*NumInsertStates; i++) data >> gapExtend[i];
 666     data.close();
 667   }
 668
 669   // read emission parameters
 670   int alphabetSize = 20;
 671
 672   // allocate memory
 673   alphabet = SafeVector<char>(alphabetSize);
 674   emitPairs = VVF (alphabetSize, VF (alphabetSize, 0));
 675   emitSingle = VF (alphabetSize);
 676
 677   if (matrixFilename == string ("")){
 678     for (int i = 0; i < alphabetSize; i++) alphabet[i] = alphabetDefault[i];
 679     for (int i = 0; i < alphabetSize; i++){
 680       emitSingle[i] = emitSingleDefault[i];
 681       for (int j = 0; j <= i; j++){
 682         emitPairs[i][j] = emitPairs[j][i] = (i == j);
 683       }
 684     }
 685   }
 686   else {
 687     data.open (matrixFilename.c_str());
 688     if (data.fail()){
 689       cerr << "ERROR: Unable to read scoring matrix file: " << matrixFilename << endl;
 690       exit (1);
 691     }
 692
 693     for (int i = 0; i < alphabetSize; i++) data >> alphabet[i];
 694     for (int i = 0; i < alphabetSize; i++){
 695       for (int j = 0; j <= i; j++){
 696         data >> emitPairs[i][j];
 697         emitPairs[j][i] = emitPairs[i][j];
 698       }
 699     }
 700     for (int i = 0; i < alphabetSize; i++){
 701       char ch;
 702       data >> ch;
 703       assert (ch == alphabet[i]);
 704     }
 705     for (int i = 0; i < alphabetSize; i++) data >> emitSingle[i];
 706     data.close();
 707   }
 708 }
 709
 710 /////////////////////////////////////////////////////////////////
 711 // ProcessTree()
 712 //
 713 // Process the tree recursively.  Returns the aligned sequences
 714 // corresponding to a node or leaf of the tree.
 715 /////////////////////////////////////////////////////////////////
 716
 717 MultiSequence *ProcessTree (const TreeNode *tree, MultiSequence *sequences,
 718                             const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 719                             const ProbabilisticModel &model){
 720   MultiSequence *result;
 721
 722   // check if this is a node of the alignment tree
 723   if (tree->GetSequenceLabel() == -1){
 724     MultiSequence *alignLeft = ProcessTree (tree->GetLeftChild(), sequences, sparseMatrices, model);
 725     MultiSequence *alignRight = ProcessTree (tree->GetRightChild(), sequences, sparseMatrices, model);
 726
 727     assert (alignLeft);
 728     assert (alignRight);
 729
 730     result = AlignAlignments (alignLeft, alignRight, sparseMatrices, model);
 731     assert (result);
 732
 733     delete alignLeft;
 734     delete alignRight;
 735   }
 736
 737   // otherwise, this is a leaf of the alignment tree
 738   else {
 739     result = new MultiSequence(); assert (result);
 740     result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
 741   }
 742
 743   return result;
 744 }
 745
 746 /////////////////////////////////////////////////////////////////
 747 // ComputeFinalAlignment()
 748 //
 749 // Compute the final alignment by calling ProcessTree(), then
 750 // performing iterative refinement as needed.
 751 /////////////////////////////////////////////////////////////////
 752
 753 MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
 754                                       const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 755                                       const ProbabilisticModel &model){
 756
 757   MultiSequence *alignment = ProcessTree (tree, sequences, sparseMatrices, model);
 758
 759   // iterative refinement
 760   for (int i = 0; i < numIterativeRefinementReps; i++)
 761     DoIterativeRefinement (sparseMatrices, model, alignment);
 762
 763   cerr << endl;
 764
 765   // return final alignment
 766   return alignment;
 767 }
 768
 769 /////////////////////////////////////////////////////////////////
 770 // AlignAlignments()
 771 //
 772 // Returns the alignment of two MultiSequence objects.
 773 /////////////////////////////////////////////////////////////////
 774
 775 MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
 776                                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 777                                 const ProbabilisticModel &model){
 778
 779   // print some info about the alignment
 780   if (enableVerbose){
 781     for (int i = 0; i < align1->GetNumSequences(); i++)
 782       cerr << ((i==0) ? "[" : ",") << align1->GetSequence(i)->GetLabel();
 783     cerr << "] vs. ";
 784     for (int i = 0; i < align2->GetNumSequences(); i++)
 785       cerr << ((i==0) ? "[" : ",") << align2->GetSequence(i)->GetLabel();
 786     cerr << "]: ";
 787   }
 788
 789   VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices);
 790   pair<SafeVector<char> *, float> alignment;
 791
 792   // choose the alignment routine depending on the "cosmetic" gap penalties used
 793   if (gapOpenPenalty == 0 && gapContinuePenalty == 0)
 794     alignment = model.ComputeAlignment (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior);
 795   else
 796     alignment = model.ComputeAlignmentWithGapPenalties (align1, align2,
 797                                                         *posterior, align1->GetNumSequences(), align2->GetNumSequences(),
 798                                                         gapOpenPenalty, gapContinuePenalty);
 799
 800   delete posterior;
 801
 802   if (enableVerbose){
 803
 804     // compute total length of sequences
 805     int totLength = 0;
 806     for (int i = 0; i < align1->GetNumSequences(); i++)
 807       for (int j = 0; j < align2->GetNumSequences(); j++)
 808         totLength += min (align1->GetSequence(i)->GetLength(), align2->GetSequence(j)->GetLength());
 809
 810     // give an "accuracy" measure for the alignment
 811     cerr << alignment.second / totLength << endl;
 812   }
 813
 814   // now build final alignment
 815   MultiSequence *result = new MultiSequence();
 816   for (int i = 0; i < align1->GetNumSequences(); i++)
 817     result->AddSequence (align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
 818   for (int i = 0; i < align2->GetNumSequences(); i++)
 819     result->AddSequence (align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
 820   result->SortByLabel();
 821
 822   // free temporary alignment
 823   delete alignment.first;
 824
 825   return result;
 826 }
 827
 828 /////////////////////////////////////////////////////////////////
 829 // DoRelaxation()
 830 //
 831 // Performs one round of the consistency transformation.  The
 832 // formula used is:
 833 //                     1
 834 //    P'(x[i]-y[j]) = ---  sum   sum P(x[i]-z[k]) P(z[k]-y[j])
 835 //                    |S| z in S  k
 836 //
 837 // where S = {x, y, all other sequences...}
 838 //
 839 /////////////////////////////////////////////////////////////////
 840
 841 void DoRelaxation (MultiSequence *sequences, SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
 842   const int numSeqs = sequences->GetNumSequences();
 843
 844   SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
 845
 846   // for every pair of sequences
 847   for (int i = 0; i < numSeqs; i++){
 848     for (int j = i+1; j < numSeqs; j++){
 849       Sequence *seq1 = sequences->GetSequence (i);
 850       Sequence *seq2 = sequences->GetSequence (j);
 851
 852       if (enableVerbose)
 853         cerr << "Relaxing (" << i+1 << ") " << seq1->GetHeader() << " vs. "
 854              << "(" << j+1 << ") " << seq2->GetHeader() << ": ";
 855
 856       // get the original posterior matrix
 857       VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); assert (posteriorPtr);
 858       VF &posterior = *posteriorPtr;
 859
 860       const int seq1Length = seq1->GetLength();
 861       const int seq2Length = seq2->GetLength();
 862
 863       // contribution from the summation where z = x and z = y
 864       for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] += posterior[k];
 865
 866       if (enableVerbose)
 867         cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
 868
 869       // contribution from all other sequences
 870       for (int k = 0; k < numSeqs; k++) if (k != i && k != j){
 871         Relax (sparseMatrices[i][k], sparseMatrices[k][j], posterior);
 872       }
 873
 874       // now renormalization
 875       for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] /= numSeqs;
 876
 877       // save the new posterior matrix
 878       newSparseMatrices[i][j] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), posterior);
 879       newSparseMatrices[j][i] = newSparseMatrices[i][j]->ComputeTranspose();
 880
 881       if (enableVerbose)
 882         cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
 883
 884       delete posteriorPtr;
 885
 886       if (enableVerbose)
 887         cerr << "done." << endl;
 888     }
 889   }
 890
 891   // now replace the old posterior matrices
 892   for (int i = 0; i < numSeqs; i++){
 893     for (int j = 0; j < numSeqs; j++){
 894       delete sparseMatrices[i][j];
 895       sparseMatrices[i][j] = newSparseMatrices[i][j];
 896     }
 897   }
 898 }
 899
 900 /////////////////////////////////////////////////////////////////
 901 // DoRelaxation()
 902 //
 903 // Computes the consistency transformation for a single sequence
 904 // z, and adds the transformed matrix to "posterior".
 905 /////////////////////////////////////////////////////////////////
 906
 907 void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior){
 908
 909   assert (matXZ);
 910   assert (matZY);
 911
 912   int lengthX = matXZ->GetSeq1Length();
 913   int lengthY = matZY->GetSeq2Length();
 914   assert (matXZ->GetSeq2Length() == matZY->GetSeq1Length());
 915
 916   // for every x[i]
 917   for (int i = 1; i <= lengthX; i++){
 918     SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
 919     SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
 920
 921     VF::iterator base = posterior.begin() + i * (lengthY + 1);
 922
 923     // iterate through all x[i]-z[k]
 924     while (XZptr != XZend){
 925       SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
 926       SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(XZptr->first);
 927       const float XZval = XZptr->second;
 928
 929       // iterate through all z[k]-y[j]
 930       while (ZYptr != ZYend){
 931         base[ZYptr->first] += XZval * ZYptr->second;;
 932         ZYptr++;
 933       }
 934       XZptr++;
 935     }
 936   }
 937 }
 938
 939 /////////////////////////////////////////////////////////////////
 940 // DoIterativeRefinement()
 941 //
 942 // Performs a single round of randomized partionining iterative
 943 // refinement.
 944 /////////////////////////////////////////////////////////////////
 945
 946 void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 947                             const ProbabilisticModel &model, MultiSequence* &alignment){
 948   set<int> groupOne, groupTwo;
 949
 950   // create two separate groups
 951   for (int i = 0; i < alignment->GetNumSequences(); i++){
 952     if (random() % 2)
 953       groupOne.insert (i);
 954     else
 955       groupTwo.insert (i);
 956   }
 957
 958   if (groupOne.empty() || groupTwo.empty()) return;
 959
 960   // project into the two groups
 961   MultiSequence *groupOneSeqs = alignment->Project (groupOne); assert (groupOneSeqs);
 962   MultiSequence *groupTwoSeqs = alignment->Project (groupTwo); assert (groupTwoSeqs);
 963   delete alignment;
 964
 965   // realign
 966   alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
 967 }
 968
 969 /*
 970 float ScoreAlignment (MultiSequence *alignment, MultiSequence *sequences, SparseMatrix **sparseMatrices, const int numSeqs){
 971   int totLength = 0;
 972   float score = 0;
 973
 974   for (int a = 0; a < alignment->GetNumSequences(); a++){
 975     for (int b = a+1; b < alignment->GetNumSequences(); b++){
 976       Sequence *seq1 = alignment->GetSequence(a);
 977       Sequence *seq2 = alignment->GetSequence(b);
 978
 979       const int seq1Length = sequences->GetSequence(seq1->GetLabel())->GetLength();
 980       const int seq2Length = sequences->GetSequence(seq2->GetLabel())->GetLength();
 981
 982       totLength += min (seq1Length, seq2Length);
 983
 984       int pos1 = 0, pos2 = 0;
 985       for (int i = 1; i <= seq1->GetLength(); i++){
 986         char ch1 = seq1->GetPosition(i);
 987         char ch2 = seq2->GetPosition(i);
 988
 989         if (ch1 != '-') pos1++;
 990         if (ch2 != '-') pos2++;
 991         if (ch1 != '-' && ch2 != '-'){
 992           score += sparseMatrices[a * numSeqs + b]->GetValue (pos1, pos2);
 993         }
 994       }
 995     }
 996   }
 997
 998   return score / totLength;
 999 }
1000 */