binaries/src/mafft/extensions/mxscarna_src/Main.cc

   1 /////////////////////////////////////////////////////////////////
   2 // Main.cc
   3 //
   4 // Main routines for MXSCARNA program.
   5 /////////////////////////////////////////////////////////////////
   6
   7 #include "scarna.hpp"
   8 #include "SafeVector.h"
   9 #include "MultiSequence.h"
  10 #include "Defaults.h"
  11 #include "ScoreType.h"
  12 #include "ProbabilisticModel.h"
  13 #include "EvolutionaryTree.h"
  14 #include "SparseMatrix.h"
  15 #include "BPPMatrix.hpp"
  16 #include "StemCandidate.hpp"
  17 #include "Globaldp.hpp"
  18 #include "nrutil.h"
  19 #include "AlifoldMEA.h"
  20 #include <string>
  21 #include <sstream>
  22 #include <iomanip>
  23 #include <iostream>
  24 #include <list>
  25 #include <set>
  26 #include <algorithm>
  27 #include <cstdio>
  28 #include <cstdlib>
  29 #include <cerrno>
  30 #include <iomanip>
  31 #include <fstream>
  32
  33 //#include "RfoldWrapper.hpp"
  34 //static RFOLD::Rfold folder;
  35
  36 using namespace::MXSCARNA;
  37
  38 string parametersInputFilename = "";
  39 string parametersOutputFilename = "no training";
  40 string annotationFilename = "";
  41 string weboutputFileName = "";
  42
  43 ofstream *outputFile;
  44
  45 bool enableTraining = false;
  46 bool enableVerbose = false;
  47 bool enableAllPairs = false;
  48 bool enableAnnotation = false;
  49 bool enableViterbi = false;
  50 bool enableClustalWOutput = false;
  51 bool enableTrainEmissions = false;
  52 bool enableAlignOrder = false;
  53 bool enableWebOutput  = false;
  54 bool enableStockholmOutput = false;
  55 bool enableMXSCARNAOutput = false;
  56 bool enableMcCaskillMEAMode = false;
  57 char bppmode = 's'; // by katoh
  58 int numConsistencyReps = 2;
  59 int numPreTrainingReps = 0;
  60 int numIterativeRefinementReps = 100;
  61 int scsLength = SCSLENGTH;
  62 float cutoff = 0;
  63 float gapOpenPenalty = 0;
  64 float gapContinuePenalty = 0;
  65 float threshhold = 1.0;
  66 float BaseProbThreshold = BASEPROBTHRESHOLD;
  67 float BasePairConst = BASEPAIRCONST;
  68 int   BandWidth = BANDWIDTH;
  69
  70 SafeVector<string> sequenceNames;
  71
  72 VF initDistrib (NumMatrixTypes);
  73 VF gapOpen (2*NumInsertStates);
  74 VF gapExtend (2*NumInsertStates);
  75 VVF emitPairs (256, VF (256, 1e-10));
  76 VF emitSingle (256, 1e-5);
  77
  78 string alphabet = alphabetDefault;
  79
  80 string *ssCons = NULL;
  81
  82 const int MIN_PRETRAINING_REPS = 0;
  83 const int MAX_PRETRAINING_REPS = 20;
  84 const int MIN_CONSISTENCY_REPS = 0;
  85 const int MAX_CONSISTENCY_REPS = 5;
  86 const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
  87 const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
  88
  89 /////////////////////////////////////////////////////////////////
  90 // Function prototypes
  91 /////////////////////////////////////////////////////////////////
  92
  93 void PrintHeading();
  94 void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
  95                       const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename);
  96 MultiSequence *DoAlign (MultiSequence *sequence, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, VF &gapExtend, VVF &emitPairs, VF &emitSingle);
  97 SafeVector<string> ParseParams (int argc, char **argv);
  98 void ReadParameters ();
  99 MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
 100                                       const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 101                                       const ProbabilisticModel &model,
 102                                       SafeVector<BPPMatrix*> &BPPMatrices);
 103 MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
 104                                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 105                                 const ProbabilisticModel &model, SafeVector<BPPMatrix*> &BPPMatrices, float identity);
 106 SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences,
 107                                                       SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
 108 void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);
 109 void Relax1 (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);
 110 void DoBasePairProbabilityRelaxation (MultiSequence *sequences,
 111                                       SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 112                                       SafeVector<BPPMatrix*> &BPPMatrices);
 113 set<int> GetSubtree (const TreeNode *tree);
 114 void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 115                               const ProbabilisticModel &model, MultiSequence* &alignment,
 116                               const TreeNode *tree, SafeVector<BPPMatrix*> &BPPMatrices);
 117 void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 118                             const ProbabilisticModel &model, MultiSequence* &alignment);
 119 void WriteAnnotation (MultiSequence *alignment,
 120                       const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
 121 int ComputeScore (const SafeVector<pair<int, int> > &active,
 122                   const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
 123 std::vector<StemCandidate>* seq2scs(MultiSequence *Sequences, SafeVector<BPPMatrix*> &BPPMatrices, int BandWidth);
 124 void removeConflicts(std::vector<StemCandidate> *pscs1, std::vector<StemCandidate> *pscs2, std::vector<int> *matchPSCS1, std::vector<int> *matchPSCS2);
 125
 126 struct prob {
 127     int i;
 128     int j;
 129     float p;
 130 };
 131
 132 /////////////////////////////////////////////////////////////////
 133 // main()
 134 //
 135 // Calls all initialization routines and runs the MXSCARNA
 136 // aligner.
 137 /////////////////////////////////////////////////////////////////
 138
 139
 140 int main (int argc, char **argv){
 141
 142     // print MXSCARNA heading
 143     PrintHeading();
 144
 145     // parse program parameters
 146     sequenceNames = ParseParams (argc, argv);
 147     ReadParameters();
 148     PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
 149
 150   // now, we'll process all the files given as input.  If we are given
 151   // several filenames as input, then we'll load all of those sequences
 152   // simultaneously, as long as we're not training.  On the other hand,
 153   // if we are training, then we'll treat each file as a separate
 154   // training instance
 155
 156     if (enableMcCaskillMEAMode) {
 157       MultiSequence *sequences = new MultiSequence(); assert (sequences);
 158       for (int i = 0; i < (int) sequenceNames.size(); i++){
 159           cerr << "Loading sequence file: " << sequenceNames[i] << endl;
 160           sequences->LoadMFA (sequenceNames[i], true);
 161       }
 162
 163       const int numSeqs = sequences->GetNumSequences();
 164       SafeVector<BPPMatrix*> BPPMatrices;
 165
 166       // compute the base pairing matrices for each sequences
 167       for(int i = 0; i < numSeqs; i++) {
 168           Sequence *tmpSeq = sequences->GetSequence(i);
 169           string   seq = tmpSeq->GetString();
 170           int    n_seq = tmpSeq->GetLength();
 171           BPPMatrix *bppmat = new BPPMatrix(bppmode, seq, n_seq); // modified by katoh
 172           BPPMatrices.push_back(bppmat);
 173       }
 174       if (bppmode=='w') exit( 0 );
 175
 176       AlifoldMEA alifold(sequences, BPPMatrices, BasePairConst);
 177       alifold.Run();
 178       ssCons = alifold.getSScons();
 179
 180       if (enableStockholmOutput) {
 181           sequences->WriteSTOCKHOLM (cout, ssCons);
 182       }
 183       else if (enableMXSCARNAOutput){
 184           sequences->WriteMXSCARNA (cout, ssCons);
 185       }
 186       else {
 187           sequences->WriteMFA (cout, ssCons);
 188       }
 189
 190       delete sequences;
 191   }
 192   // if we are training
 193   else if (enableTraining){
 194
 195     // build new model for aligning
 196     ProbabilisticModel model (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
 197
 198     // prepare to average parameters
 199     for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
 200     for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
 201     for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
 202     if (enableTrainEmissions){
 203       for (int i = 0; i < (int) emitPairs.size(); i++)
 204         for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
 205       for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
 206     }
 207
 208     // align each file individually
 209     for (int i = 0; i < (int) sequenceNames.size(); i++){
 210
 211       VF thisInitDistrib (NumMatrixTypes);
 212       VF thisGapOpen (2*NumInsertStates);
 213       VF thisGapExtend (2*NumInsertStates);
 214       VVF thisEmitPairs (256, VF (256, 1e-10));
 215       VF thisEmitSingle (256, 1e-5);
 216
 217       // load sequence file
 218       MultiSequence *sequences = new MultiSequence(); assert (sequences);
 219       cerr << "Loading sequence file: " << sequenceNames[i] << endl;
 220       sequences->LoadMFA (sequenceNames[i], true);
 221
 222       // align sequences
 223       DoAlign (sequences, model, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle);
 224
 225       // add in contribution of the derived parameters
 226       for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
 227       for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
 228       for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
 229       if (enableTrainEmissions){
 230         for (int i = 0; i < (int) emitPairs.size(); i++)
 231           for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
 232         for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
 233       }
 234
 235       delete sequences;
 236     }
 237
 238     // compute new parameters and print them out
 239     for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= (int) sequenceNames.size();
 240     for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= (int) sequenceNames.size();
 241     for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= (int) sequenceNames.size();
 242     if (enableTrainEmissions){
 243       for (int i = 0; i < (int) emitPairs.size(); i++)
 244         for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= (int) sequenceNames.size();
 245       for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= sequenceNames.size();
 246     }
 247
 248     PrintParameters ("Trained parameter set:",
 249                      initDistrib, gapOpen, gapExtend, emitPairs, emitSingle,
 250                      parametersOutputFilename.c_str());
 251   }
 252   // pass
 253   // if we are not training, we must simply want to align some sequences
 254   else {
 255     // load all files together
 256     MultiSequence *sequences = new MultiSequence(); assert (sequences);
 257     for (int i = 0; i < (int) sequenceNames.size(); i++){
 258       cerr << "Loading sequence file: " << sequenceNames[i] << endl;
 259
 260       sequences->LoadMFA (sequenceNames[i], true);
 261     }
 262
 263     // do all "pre-training" repetitions first
 264     // NOT execute
 265     for (int ct = 0; ct < numPreTrainingReps; ct++){
 266       enableTraining = true;
 267
 268       // build new model for aligning
 269       ProbabilisticModel model (initDistrib, gapOpen, gapExtend,
 270                                 emitPairs, emitSingle);
 271
 272       // do initial alignments
 273       DoAlign (sequences, model, initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
 274
 275       // print new parameters
 276       PrintParameters ("Recomputed parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
 277
 278       enableTraining = false;
 279     }
 280
 281     // now, we can perform the alignments and write them out
 282     if (enableWebOutput) {
 283         outputFile = new ofstream(weboutputFileName.c_str());
 284         if (!outputFile) {
 285             cerr << "cannot open output file." << weboutputFileName << endl;
 286             exit(1);
 287         }
 288         *outputFile << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" << endl;
 289         *outputFile << "<result>" << endl;
 290     }
 291         MultiSequence *alignment = DoAlign (sequences,
 292                                             ProbabilisticModel (initDistrib, gapOpen, gapExtend,  emitPairs, emitSingle),
 293                                             initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
 294
 295
 296     if (!enableAllPairs){
 297         if (enableClustalWOutput) {
 298             alignment->WriteALN (cout);
 299         }
 300         else if (enableWebOutput) {
 301             alignment->WriteWEB (*outputFile, ssCons);
 302 //          computeStructureWithAlifold ();
 303         }
 304         else if (enableStockholmOutput) {
 305             alignment->WriteSTOCKHOLM (cout, ssCons);
 306         }
 307         else if (enableMXSCARNAOutput) {
 308             alignment->WriteMXSCARNA (cout, ssCons);
 309         }
 310         else {
 311             alignment->WriteMFA (cout, ssCons);
 312         }
 313     }
 314
 315     if (enableWebOutput) {
 316         *outputFile << "</result>" << endl;
 317         delete outputFile;
 318     }
 319
 320     delete ssCons;
 321     delete alignment;
 322     delete sequences;
 323
 324   }
 325 }
 326
 327 /////////////////////////////////////////////////////////////////
 328 // PrintHeading()
 329 //
 330 // Prints heading for PROBCONS program.
 331 /////////////////////////////////////////////////////////////////
 332
 333 void PrintHeading (){
 334   cerr << endl
 335        << "Multiplex SCARNA"<< endl
 336        << "version " << VERSION << " - align multiple RNA sequences and print to standard output" << endl
 337        << "Written by Yasuo Tabei" << endl
 338        << endl;
 339 }
 340
 341 /////////////////////////////////////////////////////////////////
 342 // PrintParameters()
 343 //
 344 // Prints PROBCONS parameters to STDERR.  If a filename is
 345 // specified, then the parameters are also written to the file.
 346 /////////////////////////////////////////////////////////////////
 347
 348 void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
 349                       const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename){
 350
 351   // print parameters to the screen
 352   cerr << message << endl
 353        << "    initDistrib[] = { ";
 354   for (int i = 0; i < NumMatrixTypes; i++) cerr << setprecision (10) << initDistrib[i] << " ";
 355   cerr << "}" << endl
 356        << "        gapOpen[] = { ";
 357   for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapOpen[i] << " ";
 358   cerr << "}" << endl
 359        << "      gapExtend[] = { ";
 360   for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapExtend[i] << " ";
 361   cerr << "}" << endl
 362        << endl;
 363
 364   /*
 365   for (int i = 0; i < 5; i++){
 366     for (int j = 0; j <= i; j++){
 367       cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " ";
 368     }
 369     cerr << endl;
 370     }*/
 371
 372   // if a file name is specified
 373   if (filename){
 374
 375     // attempt to open the file for writing
 376     FILE *file = fopen (filename, "w");
 377     if (!file){
 378       cerr << "ERROR: Unable to write parameter file: " << filename << endl;
 379       exit (1);
 380     }
 381
 382     // if successful, then write the parameters to the file
 383     for (int i = 0; i < NumMatrixTypes; i++) fprintf (file, "%.10f ", initDistrib[i]); fprintf (file, "\n");
 384     for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapOpen[i]); fprintf (file, "\n");
 385     for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapExtend[i]); fprintf (file, "\n");
 386     fprintf (file, "%s\n", alphabet.c_str());
 387     for (int i = 0; i < (int) alphabet.size(); i++){
 388       for (int j = 0; j <= i; j++)
 389         fprintf (file, "%.10f ", emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);
 390       fprintf (file, "\n");
 391     }
 392     for (int i = 0; i < (int) alphabet.size(); i++)
 393       fprintf (file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);
 394     fprintf (file, "\n");
 395     fclose (file);
 396   }
 397 }
 398
 399 /////////////////////////////////////////////////////////////////
 400 // DoAlign()
 401 //
 402 // First computes all pairwise posterior probability matrices.
 403 // Then, computes new parameters if training, or a final
 404 // alignment, otherwise.
 405 /////////////////////////////////////////////////////////////////
 406 MultiSequence *DoAlign (MultiSequence *sequences, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, VF &gapExtend, VVF &emitPairs, VF &emitSingle){
 407
 408
 409   assert (sequences);
 410
 411   const int numSeqs = sequences->GetNumSequences();
 412   VVF distances (numSeqs, VF (numSeqs, 0));
 413   VVF identities (numSeqs, VF (numSeqs, 0));
 414   SafeVector<SafeVector<SparseMatrix *> > sparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
 415
 416   SafeVector<BPPMatrix*> BPPMatrices;
 417
 418   for(int i = 0; i < numSeqs; i++) {
 419     Sequence *tmpSeq = sequences->GetSequence(i);
 420     string   seq = tmpSeq->GetString();
 421     int    n_seq = tmpSeq->GetLength();
 422     BPPMatrix *bppmat = new BPPMatrix(bppmode, seq, n_seq, BASEPROBTHRESHOLD); // modified by katoh
 423     BPPMatrices.push_back(bppmat);
 424   }
 425
 426   if (enableTraining){
 427     // prepare to average parameters
 428     for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
 429     for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
 430     for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
 431     if (enableTrainEmissions){
 432       for (int i = 0; i < (int) emitPairs.size(); i++)
 433         for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
 434       for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
 435     }
 436   }
 437
 438   // skip posterior calculations if we just want to do Viterbi alignments
 439   if (!enableViterbi){
 440
 441     // do all pairwise alignments for posterior probability matrices
 442     for (int a = 0; a < numSeqs-1; a++){
 443       for (int b = a+1; b < numSeqs; b++){
 444         Sequence *seq1 = sequences->GetSequence (a);
 445         Sequence *seq2 = sequences->GetSequence (b);
 446
 447         // verbose output
 448         if (enableVerbose)
 449           cerr << "Computing posterior matrix: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
 450                << "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
 451
 452         // compute forward and backward probabilities
 453         VF *forward = model.ComputeForwardMatrix (seq1, seq2); assert (forward);
 454         VF *backward = model.ComputeBackwardMatrix (seq1, seq2); assert (backward);
 455
 456         // if we are training, then we'll simply want to compute the
 457         // expected counts for each region within the matrix separately;
 458         // otherwise, we'll need to put all of the regions together and
 459         // assemble a posterior probability match matrix
 460
 461         // so, if we're training
 462         if (enableTraining){
 463
 464           // compute new parameters
 465           VF thisInitDistrib (NumMatrixTypes);
 466           VF thisGapOpen (2*NumInsertStates);
 467           VF thisGapExtend (2*NumInsertStates);
 468           VVF thisEmitPairs (256, VF (256, 1e-10));
 469           VF thisEmitSingle (256, 1e-5);
 470
 471           model.ComputeNewParameters (seq1, seq2, *forward, *backward, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle, enableTrainEmissions);
 472
 473           // add in contribution of the derived parameters
 474           for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
 475           for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
 476           for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
 477           if (enableTrainEmissions){
 478             for (int i = 0; i < (int) emitPairs.size(); i++)
 479               for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
 480             for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
 481           }
 482
 483           // let us know that we're done.
 484           if (enableVerbose) cerr << "done." << endl;
 485         }
 486         // pass
 487         else {
 488
 489           // compute posterior probability matrix
 490           VF *posterior = model.ComputePosteriorMatrix (seq1, seq2, *forward, *backward); assert (posterior);
 491
 492           // compute sparse representations
 493           sparseMatrices[a][b] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), *posterior);
 494           sparseMatrices[b][a] = NULL;
 495
 496           if (!enableAllPairs){
 497             // perform the pairwise sequence alignment
 498             pair<SafeVector<char> *, float> alignment = model.ComputeAlignment (seq1->GetLength(),
 499                                                                                 seq2->GetLength(),
 500                                                                                 *posterior);
 501
 502             Sequence *tmpSeq1 = seq1->AddGaps (alignment.first, 'X');
 503             Sequence *tmpSeq2 = seq2->AddGaps (alignment.first, 'Y');
 504
 505             // compute sequence identity for each pair of sequenceses
 506             int length = tmpSeq1->GetLength();
 507             int matchCount = 0;
 508             int misMatchCount = 0;
 509             for (int k = 1; k <= length; k++) {
 510                 int ch1 = tmpSeq1->GetPosition(k);
 511                 int ch2 = tmpSeq2->GetPosition(k);
 512                 if (ch1 == ch2 && ch1 != '-' && ch2 != '-') { ++matchCount; }
 513                 else if (ch1 != ch2 && ch1 != '-' && ch2 != '-') { ++misMatchCount; }
 514             }
 515
 516             identities[a][b] = identities[b][a] = (float)matchCount/(float)(matchCount + misMatchCount);
 517
 518             // compute "expected accuracy" distance for evolutionary tree computation
 519             float distance = alignment.second / min (seq1->GetLength(), seq2->GetLength());
 520             distances[a][b] = distances[b][a] = distance;
 521
 522             if (enableVerbose)
 523               cerr << setprecision (10) << distance << endl;
 524
 525               delete alignment.first;
 526           }
 527           else {
 528             // let us know that we're done.
 529             if (enableVerbose) cerr << "done." << endl;
 530           }
 531
 532           delete posterior;
 533         }
 534
 535         delete forward;
 536         delete backward;
 537       }
 538     }
 539   }
 540
 541
 542   // now average out parameters derived
 543   if (enableTraining){
 544     // compute new parameters
 545     for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= numSeqs * (numSeqs - 1) / 2;
 546     for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= numSeqs * (numSeqs - 1) / 2;
 547     for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= numSeqs * (numSeqs - 1) / 2;
 548
 549     if (enableTrainEmissions){
 550       for (int i = 0; i < (int) emitPairs.size(); i++)
 551         for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= numSeqs * (numSeqs - 1) / 2;
 552       for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= numSeqs * (numSeqs - 1) / 2;
 553     }
 554   }
 555
 556   // see if we still want to do some alignments
 557   else {
 558       // pass
 559     if (!enableViterbi){
 560
 561       // perform the consistency transformation the desired number of times
 562       for (int r = 0; r < numConsistencyReps; r++){
 563         SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices = DoRelaxation (sequences, sparseMatrices);
 564
 565         // now replace the old posterior matrices
 566         for (int i = 0; i < numSeqs; i++){
 567           for (int j = 0; j < numSeqs; j++){
 568             delete sparseMatrices[i][j];
 569             sparseMatrices[i][j] = newSparseMatrices[i][j];
 570           }
 571         }
 572       }
 573       //if (numSeqs > 8) {
 574       //  for (int r = 0; r < 1; r++)
 575       //  DoBasePairProbabilityRelaxation(sequences, sparseMatrices, BPPMatrices);
 576       //}
 577     }
 578
 579     MultiSequence *finalAlignment = NULL;
 580
 581     if (enableAllPairs){
 582       for (int a = 0; a < numSeqs-1; a++){
 583         for (int b = a+1; b < numSeqs; b++){
 584           Sequence *seq1 = sequences->GetSequence (a);
 585           Sequence *seq2 = sequences->GetSequence (b);
 586
 587           if (enableVerbose)
 588             cerr << "Performing pairwise alignment: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
 589                  << "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
 590
 591
 592           // perform the pairwise sequence alignment
 593           pair<SafeVector<char> *, float> alignment;
 594           if (enableViterbi)
 595             alignment = model.ComputeViterbiAlignment (seq1, seq2);
 596           else {
 597
 598             // build posterior matrix
 599             VF *posterior = sparseMatrices[a][b]->GetPosterior(); assert (posterior);
 600             int length = (seq1->GetLength() + 1) * (seq2->GetLength() + 1);
 601             for (int i = 0; i < length; i++) (*posterior)[i] -= cutoff;
 602
 603             alignment = model.ComputeAlignment (seq1->GetLength(), seq2->GetLength(), *posterior);
 604             delete posterior;
 605           }
 606
 607
 608           // write pairwise alignments
 609           string name = seq1->GetHeader() + "-" + seq2->GetHeader() + (enableClustalWOutput ? ".aln" : ".fasta");
 610           ofstream outfile (name.c_str());
 611
 612           MultiSequence *result = new MultiSequence();
 613           result->AddSequence (seq1->AddGaps(alignment.first, 'X'));
 614           result->AddSequence (seq2->AddGaps(alignment.first, 'Y'));
 615           result->WriteMFAseq (outfile); // by katoh
 616
 617           outfile.close();
 618
 619           delete alignment.first;
 620         }
 621       }
 622         exit( 0 );
 623     }
 624
 625     // now if we still need to do a final multiple alignment
 626     else {
 627
 628       if (enableVerbose)
 629         cerr << endl;
 630
 631       // compute the evolutionary tree
 632       TreeNode *tree = TreeNode::ComputeTree (distances, identities);
 633
 634       if (enableWebOutput) {
 635           *outputFile << "<tree>" << endl;
 636           tree->Print (*outputFile, sequences);
 637           *outputFile << "</tree>" << endl;
 638       }
 639       else {
 640           tree->Print (cerr, sequences);
 641           cerr << endl;
 642       }
 643       // make the final alignment
 644       finalAlignment = ComputeFinalAlignment (tree, sequences, sparseMatrices, model, BPPMatrices);
 645
 646       // build annotation
 647       if (enableAnnotation){
 648         WriteAnnotation (finalAlignment, sparseMatrices);
 649       }
 650
 651       delete tree;
 652     }
 653
 654     if (!enableViterbi){
 655       // delete sparse matrices
 656       for (int a = 0; a < numSeqs-1; a++){
 657         for (int b = a+1; b < numSeqs; b++){
 658           delete sparseMatrices[a][b];
 659           delete sparseMatrices[b][a];
 660         }
 661       }
 662     }
 663
 664     //AlifoldMEA alifold(finalAlignment, BPPMatrices, BasePairConst);
 665     //alifold.Run();
 666     //ssCons = alifold.getSScons();
 667
 668     return finalAlignment;
 669
 670   }
 671
 672   return NULL;
 673 }
 674
 675 /////////////////////////////////////////////////////////////////
 676 // GetInteger()
 677 //
 678 // Attempts to parse an integer from the character string given.
 679 // Returns true only if no parsing error occurs.
 680 /////////////////////////////////////////////////////////////////
 681
 682 bool GetInteger (char *data, int *val){
 683   char *endPtr;
 684   long int retVal;
 685
 686   assert (val);
 687
 688   errno = 0;
 689   retVal = strtol (data, &endPtr, 0);
 690   if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
 691   if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN)) return false;
 692   if (retVal < (long) INT_MIN || retVal > (long) INT_MAX) return false;
 693   *val = (int) retVal;
 694   return true;
 695 }
 696
 697 /////////////////////////////////////////////////////////////////
 698 // GetFloat()
 699 //
 700 // Attempts to parse a float from the character string given.
 701 // Returns true only if no parsing error occurs.
 702 /////////////////////////////////////////////////////////////////
 703
 704 bool GetFloat (char *data, float *val){
 705   char *endPtr;
 706   double retVal;
 707
 708   assert (val);
 709
 710   errno = 0;
 711   retVal = strtod (data, &endPtr);
 712   if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
 713   if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0)) return false;
 714   *val = (float) retVal;
 715   return true;
 716 }
 717
 718 /////////////////////////////////////////////////////////////////
 719 // ParseParams()
 720 //
 721 // Parse all command-line options.
 722 /////////////////////////////////////////////////////////////////
 723
 724 SafeVector<string> ParseParams (int argc, char **argv){
 725
 726   if (argc < 2){
 727
 728     cerr << "MXSCARNA comes with ABSOLUTELY NO WARRANTY.  This is free software, and" << endl
 729          << "you are welcome to redistribute it under certain conditions.  See the" << endl
 730          << "file COPYING.txt for details." << endl
 731          << endl
 732          << "Usage:" << endl
 733          << "       mxscarna [OPTION]... [MFAFILE]..." << endl
 734          << endl
 735          << "Description:" << endl
 736          << "       Align sequences in MFAFILE(s) and print result to standard output" << endl
 737          << endl
 738          << "       -clustalw" << endl
 739          << "              use CLUSTALW output format instead of MFA" << endl
 740          << endl
 741          << "       -stockholm" << endl
 742          << "              use STOCKHOLM output format instead of MFA" << endl
 743          << endl
 744          << "       -mxscarna" << endl
 745          << "              use MXSCARNA output format instead of MFA" << endl
 746          << endl
 747          << "       -weboutput /<output_path>/<outputfilename>" << endl
 748          << "        use web output format" << endl
 749          << endl
 750          << "       -c, --consistency REPS" << endl
 751          << "              use " << MIN_CONSISTENCY_REPS << " <= REPS <= " << MAX_CONSISTENCY_REPS
 752          << " (default: " << numConsistencyReps << ") passes of consistency transformation" << endl
 753          << endl
 754          << "       -ir, --iterative-refinement REPS" << endl
 755          << "              use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS
 756          << " (default: " << numIterativeRefinementReps << ") passes of iterative-refinement" << endl
 757          << endl
 758          << "       -pre, --pre-training REPS" << endl
 759          << "              use " << MIN_PRETRAINING_REPS << " <= REPS <= " << MAX_PRETRAINING_REPS
 760          << " (default: " << numPreTrainingReps << ") rounds of pretraining" << endl
 761          << endl
 762          << "       -pairs" << endl
 763          << "              generate all-pairs pairwise alignments" << endl
 764          << endl
 765          << "       -viterbi" << endl
 766          << "              use Viterbi algorithm to generate all pairs (automatically enables -pairs)" << endl
 767          << endl
 768          << "       -v, --verbose" << endl
 769          << "              report progress while aligning (default: " << (enableVerbose ? "on" : "off") << ")" << endl
 770          << endl
 771          << "       -annot FILENAME" << endl
 772          << "              write annotation for multiple alignment to FILENAME" << endl
 773          << endl
 774          << "       -t, --train FILENAME" << endl
 775          << "              compute EM transition probabilities, store in FILENAME (default: "
 776          << parametersOutputFilename << ")" << endl
 777          << endl
 778          << "       -e, --emissions" << endl
 779          << "              also reestimate emission probabilities (default: "
 780          << (enableTrainEmissions ? "on" : "off") << ")" << endl
 781          << endl
 782          << "       -p, --paramfile FILENAME" << endl
 783          << "              read parameters from FILENAME (default: "
 784          << parametersInputFilename << ")" << endl
 785          << endl
 786          << "       -a, --alignment-order" << endl
 787          << "              print sequences in alignment order rather than input order (default: "
 788          << (enableAlignOrder ? "on" : "off") << ")" << endl
 789          << endl
 790          << "       -s THRESHOLD" << endl
 791          << "               the threshold of SCS alignment" << endl
 792          << endl
 793          << "               In default, for less than " << threshhold << ", the SCS aligment is applied. " << endl
 794          << "       -l SCSLENGTH" << endl
 795          << "               the length of stem candidates " << SCSLENGTH << endl
 796          << endl
 797          << "       -b BASEPROBTRHESHHOLD" << endl
 798          << "               the threshold of base pairing probability " << BASEPROBTHRESHOLD << endl
 799          << endl
 800          << "       -m, --mccaskillmea" << endl
 801          << "               McCaskill MEA MODE: input the clustalw format file and output the secondary structure predicted by McCaskill MEA" << endl
 802          << endl
 803          << "       -g BASEPAIRSCORECONST" << endl
 804          << "               the control parameter of the prediction of base pairs, default:" << BasePairConst << endl
 805          << endl
 806          << "       -w BANDWIDTH" << endl
 807          << "               the control parameter of the distance of stem candidates, default:" << BANDWIDTH << endl
 808          << endl;
 809
 810
 811     //           << "       -go, --gap-open VALUE" << endl
 812     //           << "              gap opening penalty of VALUE <= 0 (default: " << gapOpenPenalty << ")" << endl
 813     //   << endl
 814     //   << "       -ge, --gap-extension VALUE" << endl
 815     //   << "              gap extension penalty of VALUE <= 0 (default: " << gapContinuePenalty << ")" << endl
 816     //   << endl
 817     //         << "       -co, --cutoff CUTOFF" << endl
 818     //         << "              subtract 0 <= CUTOFF <= 1 (default: " << cutoff << ") from all posterior values before final alignment" << endl
 819     //         << endl
 820
 821     exit (1);
 822   }
 823
 824   SafeVector<string> sequenceNames;
 825   int tempInt;
 826   float tempFloat;
 827
 828   for (int i = 1; i < argc; i++){
 829     if (argv[i][0] == '-'){
 830
 831       // training
 832       if (!strcmp (argv[i], "-t") || !strcmp (argv[i], "--train")){
 833         enableTraining = true;
 834         if (i < argc - 1)
 835           parametersOutputFilename = string (argv[++i]);
 836         else {
 837           cerr << "ERROR: Filename expected for option " << argv[i] << endl;
 838           exit (1);
 839         }
 840       }
 841
 842       // emission training
 843       else if (!strcmp (argv[i], "-e") || !strcmp (argv[i], "--emissions")){
 844         enableTrainEmissions = true;
 845       }
 846
 847       // parameter file
 848       else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){
 849         if (i < argc - 1)
 850           parametersInputFilename = string (argv[++i]);
 851         else {
 852           cerr << "ERROR: Filename expected for option " << argv[i] << endl;
 853           exit (1);
 854         }
 855       }
 856       else if (! strcmp (argv[i], "-s")) {
 857         if (i < argc - 1){
 858           if (!GetFloat (argv[++i], &tempFloat)){
 859             cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
 860             exit (1);
 861           }
 862           else {
 863             if (tempFloat < 0){
 864               cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be nagative." << endl;
 865               exit (1);
 866             }
 867             else
 868               threshhold = tempFloat;
 869           }
 870         }
 871         else {
 872           cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
 873           exit (1);
 874         }
 875       }
 876
 877       else if (! strcmp (argv[i], "-l")) {
 878           if (i < argc - 1) {
 879               if (!GetInteger (argv[++i], &tempInt)){
 880                   cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
 881                   exit (1);
 882               }
 883               else {
 884                   if (tempInt <= 0 || 6 <= tempInt) {
 885                       cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
 886                            << "1 and 6" << "." << endl;
 887                       exit (1);
 888                   }
 889                   else
 890                       scsLength = tempInt;
 891               }
 892           }
 893       }
 894       else if (! strcmp (argv[i], "-b")) {
 895           if (i < argc - 1) {
 896               if (!GetFloat (argv[++i], &tempFloat)){
 897                   cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
 898                   exit (1);
 899               }
 900               else {
 901                   if (tempFloat < 0 && 1 < tempFloat) {
 902                       cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be nagative." << endl;
 903                       exit (1);
 904                   }
 905                   else
 906                       BaseProbThreshold = tempFloat;
 907               }
 908           }
 909       }
 910       else if (! strcmp (argv[i], "-g")) {
 911           if (i < argc - 1) {
 912               if (!GetFloat (argv[++i], &tempFloat)){
 913                   cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
 914                   exit (1);
 915               }
 916               else {
 917                   if (tempFloat < 0 && 1 < tempFloat) {
 918                       cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be nagative." << endl;
 919                       exit (1);
 920                   }
 921                   else
 922                       BasePairConst = tempFloat;
 923               }
 924           }
 925       }
 926
 927       // number of consistency transformations
 928       else if (!strcmp (argv[i], "-c") || !strcmp (argv[i], "--consistency")){
 929         if (i < argc - 1){
 930           if (!GetInteger (argv[++i], &tempInt)){
 931             cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
 932             exit (1);
 933           }
 934           else {
 935             if (tempInt < MIN_CONSISTENCY_REPS || tempInt > MAX_CONSISTENCY_REPS){
 936               cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
 937                    << MIN_CONSISTENCY_REPS << " and " << MAX_CONSISTENCY_REPS << "." << endl;
 938               exit (1);
 939             }
 940             else
 941               numConsistencyReps = tempInt;
 942           }
 943         }
 944         else {
 945           cerr << "ERROR: Integer expected for option " << argv[i] << endl;
 946           exit (1);
 947         }
 948       }
 949
 950       // number of randomized partitioning iterative refinement passes
 951       else if (!strcmp (argv[i], "-ir") || !strcmp (argv[i], "--iterative-refinement")){
 952         if (i < argc - 1){
 953           if (!GetInteger (argv[++i], &tempInt)){
 954             cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
 955             exit (1);
 956           }
 957           else {
 958             if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS || tempInt > MAX_ITERATIVE_REFINEMENT_REPS){
 959               cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
 960                    << MIN_ITERATIVE_REFINEMENT_REPS << " and " << MAX_ITERATIVE_REFINEMENT_REPS << "." << endl;
 961               exit (1);
 962             }
 963             else
 964               numIterativeRefinementReps = tempInt;
 965           }
 966         }
 967         else {
 968           cerr << "ERROR: Integer expected for option " << argv[i] << endl;
 969           exit (1);
 970         }
 971       }
 972       // number of EM pre-training rounds
 973       else if (!strcmp (argv[i], "-pre") || !strcmp (argv[i], "--pre-training")){
 974         if (i < argc - 1){
 975           if (!GetInteger (argv[++i], &tempInt)){
 976             cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
 977             exit (1);
 978           }
 979           else {
 980             if (tempInt < MIN_PRETRAINING_REPS || tempInt > MAX_PRETRAINING_REPS){
 981               cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
 982                    << MIN_PRETRAINING_REPS << " and " << MAX_PRETRAINING_REPS << "." << endl;
 983               exit (1);
 984             }
 985             else
 986               numPreTrainingReps = tempInt;
 987           }
 988         }
 989         else {
 990           cerr << "ERROR: Integer expected for option " << argv[i] << endl;
 991           exit (1);
 992         }
 993       }
 994
 995       // the distance of stem candidate
 996       else if (!strcmp (argv[i], "-w")){
 997         if (i < argc - 1){
 998           if (!GetInteger (argv[++i], &tempInt)){
 999             cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
1000             exit (1);
1001           }
1002           else {
1003               BandWidth = tempInt;
1004           }
1005         }
1006         else {
1007           cerr << "ERROR: Integer expected for option " << argv[i] << endl;
1008           exit (1);
1009         }
1010       }
1011
1012       // gap open penalty
1013       else if (!strcmp (argv[i], "-go") || !strcmp (argv[i], "--gap-open")){
1014         if (i < argc - 1){
1015           if (!GetFloat (argv[++i], &tempFloat)){
1016             cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
1017             exit (1);
1018           }
1019           else {
1020             if (tempFloat > 0){
1021               cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
1022               exit (1);
1023             }
1024             else
1025               gapOpenPenalty = tempFloat;
1026           }
1027         }
1028         else {
1029           cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
1030           exit (1);
1031         }
1032       }
1033
1034       // gap extension penalty
1035       else if (!strcmp (argv[i], "-ge") || !strcmp (argv[i], "--gap-extension")){
1036         if (i < argc - 1){
1037           if (!GetFloat (argv[++i], &tempFloat)){
1038             cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
1039             exit (1);
1040           }
1041           else {
1042             if (tempFloat > 0){
1043               cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
1044               exit (1);
1045             }
1046             else
1047               gapContinuePenalty = tempFloat;
1048           }
1049         }
1050         else {
1051           cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
1052           exit (1);
1053         }
1054       }
1055
1056       // all-pairs pairwise alignments
1057       else if (!strcmp (argv[i], "-pairs")){
1058         enableAllPairs = true;
1059       }
1060
1061       // all-pairs pairwise Viterbi alignments
1062       else if (!strcmp (argv[i], "-viterbi")){
1063         enableAllPairs = true;
1064         enableViterbi = true;
1065       }
1066
1067       // read base-pairing probability from the '_bpp' file, by katoh
1068       else if (!strcmp (argv[i], "-readbpp")){
1069         bppmode = 'r';
1070       }
1071
1072       // write base-pairing probability to stdout, by katoh
1073       else if (!strcmp (argv[i], "-writebpp")){
1074         bppmode = 'w';
1075       }
1076
1077       // annotation files
1078       else if (!strcmp (argv[i], "-annot")){
1079         enableAnnotation = true;
1080         if (i < argc - 1)
1081           annotationFilename = argv[++i];
1082         else {
1083           cerr << "ERROR: FILENAME expected for option " << argv[i] << endl;
1084           exit (1);
1085         }
1086       }
1087
1088       // clustalw output format
1089       else if (!strcmp (argv[i], "-clustalw")){
1090         enableClustalWOutput = true;
1091       }
1092       // mxscarna output format
1093       else if (!strcmp (argv[i], "-mxscarna")) {
1094           enableMXSCARNAOutput = true;
1095       }
1096       // stockholm output format
1097       else if (!strcmp (argv[i], "-stockholm")) {
1098           enableStockholmOutput = true;
1099       }
1100       // web output format
1101       else if (!strcmp (argv[i], "-weboutput")) {
1102           if (i < argc - 1) {
1103               weboutputFileName = string(argv[++i]);
1104           }
1105           else {
1106               cerr << "ERROR: Invalid following option " << argv[i-1] << ": " << argv[i] << endl;
1107               exit (1);
1108           }
1109
1110           enableWebOutput = true;
1111       }
1112
1113       // cutoff
1114       else if (!strcmp (argv[i], "-co") || !strcmp (argv[i], "--cutoff")){
1115         if (i < argc - 1){
1116           if (!GetFloat (argv[++i], &tempFloat)){
1117             cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
1118             exit (1);
1119           }
1120           else {
1121             if (tempFloat < 0 || tempFloat > 1){
1122               cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must be between 0 and 1." << endl;
1123               exit (1);
1124             }
1125             else
1126               cutoff = tempFloat;
1127           }
1128         }
1129         else {
1130           cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
1131           exit (1);
1132         }
1133       }
1134
1135       // verbose reporting
1136       else if (!strcmp (argv[i], "-v") || !strcmp (argv[i], "--verbose")){
1137         enableVerbose = true;
1138       }
1139
1140       // alignment order
1141       else if (!strcmp (argv[i], "-a") || !strcmp (argv[i], "--alignment-order")){
1142         enableAlignOrder = true;
1143       }
1144       // McCaskill MEA MODE
1145       else if (!strcmp (argv[i], "-m") || !strcmp (argv[i], "--mccaskillmea")){
1146         enableMcCaskillMEAMode = true;
1147       }
1148       // bad arguments
1149       else {
1150         cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
1151         exit (1);
1152       }
1153     }
1154     else {
1155       sequenceNames.push_back (string (argv[i]));
1156     }
1157   }
1158
1159   if (enableTrainEmissions && !enableTraining){
1160     cerr << "ERROR: Training emissions (-e) requires training (-t)" << endl;
1161     exit (1);
1162   }
1163
1164   return sequenceNames;
1165 }
1166
1167 /////////////////////////////////////////////////////////////////
1168 // ReadParameters()
1169 //
1170 // Read initial distribution, transition, and emission
1171 // parameters from a file.
1172 /////////////////////////////////////////////////////////////////
1173
1174 void ReadParameters (){
1175
1176   ifstream data;
1177
1178   emitPairs = VVF (256, VF (256, 1e-10));
1179   emitSingle = VF (256, 1e-5);
1180
1181   // read initial state distribution and transition parameters
1182   // pass
1183   if (parametersInputFilename == string ("")){
1184     if (NumInsertStates == 1){
1185       for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib1Default[i];
1186       for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen1Default[i];
1187       for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend1Default[i];
1188     }
1189     else if (NumInsertStates == 2){
1190       for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib2Default[i];
1191       for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen2Default[i];
1192       for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend2Default[i];
1193     }
1194     else {
1195       cerr << "ERROR: No default initial distribution/parameter settings exist" << endl
1196            << "       for " << NumInsertStates << " pairs of insert states.  Use --paramfile." << endl;
1197       exit (1);
1198     }
1199
1200     alphabet = alphabetDefault;
1201
1202     for (int i = 0; i < (int) alphabet.length(); i++){
1203       emitSingle[(unsigned char) tolower(alphabet[i])] = emitSingleDefault[i];
1204       emitSingle[(unsigned char) toupper(alphabet[i])] = emitSingleDefault[i];
1205       for (int j = 0; j <= i; j++){
1206         emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
1207         emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
1208         emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
1209         emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
1210         emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
1211         emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
1212         emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
1213         emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
1214       }
1215     }
1216   }
1217   else {
1218     data.open (parametersInputFilename.c_str());
1219     if (data.fail()){
1220       cerr << "ERROR: Unable to read parameter file: " << parametersInputFilename << endl;
1221       exit (1);
1222     }
1223
1224     string line[3];
1225     for (int i = 0; i < 3; i++){
1226       if (!getline (data, line[i])){
1227         cerr << "ERROR: Unable to read transition parameters from parameter file: " << parametersInputFilename << endl;
1228         exit (1);
1229       }
1230     }
1231     istringstream data2;
1232     data2.clear(); data2.str (line[0]); for (int i = 0; i < NumMatrixTypes; i++) data2 >> initDistrib[i];
1233     data2.clear(); data2.str (line[1]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapOpen[i];
1234     data2.clear(); data2.str (line[2]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapExtend[i];
1235
1236     if (!getline (data, line[0])){
1237       cerr << "ERROR: Unable to read alphabet from scoring matrix file: " << parametersInputFilename << endl;
1238       exit (1);
1239     }
1240
1241     // read alphabet as concatenation of all characters on alphabet line
1242     alphabet = "";
1243     string token;
1244     data2.clear(); data2.str (line[0]); while (data2 >> token) alphabet += token;
1245
1246     for (int i = 0; i < (int) alphabet.size(); i++){
1247       for (int j = 0; j <= i; j++){
1248         float val;
1249         data >> val;
1250         emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
1251         emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
1252         emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
1253         emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
1254         emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
1255         emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
1256         emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
1257         emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
1258       }
1259     }
1260
1261     for (int i = 0; i < (int) alphabet.size(); i++){
1262       float val;
1263       data >> val;
1264       emitSingle[(unsigned char) tolower(alphabet[i])] = val;
1265       emitSingle[(unsigned char) toupper(alphabet[i])] = val;
1266     }
1267     data.close();
1268   }
1269 }
1270
1271 /////////////////////////////////////////////////////////////////
1272 // ProcessTree()
1273 //
1274 // Process the tree recursively.  Returns the aligned sequences
1275 // corresponding to a node or leaf of the tree.
1276 /////////////////////////////////////////////////////////////////
1277 float ide;
1278 MultiSequence *ProcessTree (const TreeNode *tree, MultiSequence *sequences,
1279                             const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1280                             const ProbabilisticModel &model, SafeVector<BPPMatrix*> &BPPMatrices) {
1281   MultiSequence *result;
1282
1283   // check if this is a node of the alignment tree
1284   if (tree->GetSequenceLabel() == -1){
1285     MultiSequence *alignLeft = ProcessTree (tree->GetLeftChild(), sequences, sparseMatrices, model, BPPMatrices);
1286     MultiSequence *alignRight = ProcessTree (tree->GetRightChild(), sequences, sparseMatrices, model, BPPMatrices);
1287
1288     assert (alignLeft);
1289     assert (alignRight);
1290
1291     result = AlignAlignments (alignLeft, alignRight, sparseMatrices, model, BPPMatrices, tree->GetIdentity());
1292     assert (result);
1293
1294     delete alignLeft;
1295     delete alignRight;
1296   }
1297
1298   // otherwise, this is a leaf of the alignment tree
1299   else {
1300     result = new MultiSequence(); assert (result);
1301     result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
1302   }
1303
1304   return result;
1305 }
1306
1307 /////////////////////////////////////////////////////////////////
1308 // ComputeFinalAlignment()
1309 //
1310 // Compute the final alignment by calling ProcessTree(), then
1311 // performing iterative refinement as needed.
1312 /////////////////////////////////////////////////////////////////
1313
1314 MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
1315                                       const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1316                                       const ProbabilisticModel &model,
1317                                       SafeVector<BPPMatrix*> &BPPMatrices) {
1318
1319   MultiSequence *alignment = ProcessTree (tree, sequences, sparseMatrices, model, BPPMatrices);
1320
1321   if (enableAlignOrder){
1322     alignment->SaveOrdering();
1323     enableAlignOrder = false;
1324   }
1325
1326   // tree-based refinement
1327   // if you use the function, you can degrade the quality of the software.
1328   // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree, BPPMatrices);
1329
1330   // iterative refinement
1331 /*
1332   for (int i = 0; i < numIterativeRefinementReps; i++)
1333       DoIterativeRefinement (sparseMatrices, model, alignment);
1334
1335       cerr << endl;
1336 */
1337   // return final alignment
1338   return alignment;
1339 }
1340
1341 /////////////////////////////////////////////////////////////////
1342 // AlignAlignments()
1343 //
1344 // Returns the alignment of two MultiSequence objects.
1345 /////////////////////////////////////////////////////////////////
1346
1347 MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
1348                                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1349                                 const ProbabilisticModel &model,
1350                                 SafeVector<BPPMatrix*> &BPPMatrices, float identity){
1351
1352   // print some info about the alignment
1353   if (enableVerbose){
1354     for (int i = 0; i < align1->GetNumSequences(); i++)
1355       cerr << ((i==0) ? "[" : ",") << align1->GetSequence(i)->GetLabel();
1356     cerr << "] vs. ";
1357     for (int i = 0; i < align2->GetNumSequences(); i++)
1358       cerr << ((i==0) ? "[" : ",") << align2->GetSequence(i)->GetLabel();
1359     cerr << "]: ";
1360   }
1361
1362   VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
1363
1364   pair<SafeVector<char> *, float> alignment;
1365   // choose the alignment routine depending on the "cosmetic" gap penalties used
1366   if (gapOpenPenalty == 0 && gapContinuePenalty == 0) {
1367
1368     if(identity <= threshhold) {
1369         std::vector<StemCandidate> *pscs1, *pscs2;
1370         pscs1 = seq2scs(align1, BPPMatrices, BandWidth);
1371         pscs2 = seq2scs(align2, BPPMatrices, BandWidth);
1372         std::vector<int> *matchPSCS1 = new std::vector<int>;
1373         std::vector<int> *matchPSCS2 = new std::vector<int>;
1374
1375         Globaldp globaldp(pscs1, pscs2, align1, align2, matchPSCS1, matchPSCS2, posterior, BPPMatrices);
1376         //float scsScore = globaldp.Run();
1377
1378         globaldp.Run();
1379
1380         removeConflicts(pscs1, pscs2, matchPSCS1, matchPSCS2);
1381
1382         alignment = model.ComputeAlignment2 (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior, pscs1, pscs2, matchPSCS1, matchPSCS2);
1383         delete matchPSCS1;
1384         delete matchPSCS2;
1385     } else {
1386         alignment = model.ComputeAlignment (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior);
1387     }
1388   }
1389   else {
1390     alignment = model.ComputeAlignmentWithGapPenalties (align1, align2,
1391                                                         *posterior, align1->GetNumSequences(), align2->GetNumSequences(),
1392                                                         gapOpenPenalty, gapContinuePenalty);
1393   }
1394
1395   delete posterior;
1396
1397   if (enableVerbose){
1398
1399     // compute total length of sequences
1400     int totLength = 0;
1401     for (int i = 0; i < align1->GetNumSequences(); i++)
1402       for (int j = 0; j < align2->GetNumSequences(); j++)
1403         totLength += min (align1->GetSequence(i)->GetLength(), align2->GetSequence(j)->GetLength());
1404
1405     // give an "accuracy" measure for the alignment
1406     cerr << alignment.second / totLength << endl;
1407   }
1408
1409   // now build final alignment
1410   MultiSequence *result = new MultiSequence();
1411   for (int i = 0; i < align1->GetNumSequences(); i++)
1412     result->AddSequence (align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
1413   for (int i = 0; i < align2->GetNumSequences(); i++)
1414     result->AddSequence (align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
1415   if (!enableAlignOrder)
1416     result->SortByLabel();
1417
1418   // free temporary alignment
1419   delete alignment.first;
1420
1421   return result;
1422 }
1423
1424 /////////////////////////////////////////////////////////////////
1425 // DoRelaxation()
1426 //
1427 // Performs one round of the consistency transformation.  The
1428 // formula used is:
1429 //                     1
1430 //    P'(x[i]-y[j]) = ---  sum   sum P(x[i]-z[k]) P(z[k]-y[j])
1431 //                    |S| z in S  k
1432 //
1433 // where S = {x, y, all other sequences...}
1434 //
1435 /////////////////////////////////////////////////////////////////
1436
1437 SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences,
1438                                                       SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1439   const int numSeqs = sequences->GetNumSequences();
1440
1441   SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
1442
1443   // for every pair of sequences
1444   for (int i = 0; i < numSeqs; i++){
1445     for (int j = i+1; j < numSeqs; j++){
1446       Sequence *seq1 = sequences->GetSequence (i);
1447       Sequence *seq2 = sequences->GetSequence (j);
1448
1449       if (enableVerbose)
1450         cerr << "Relaxing (" << i+1 << ") " << seq1->GetHeader() << " vs. "
1451              << "(" << j+1 << ") " << seq2->GetHeader() << ": ";
1452
1453       // get the original posterior matrix
1454       VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); assert (posteriorPtr);
1455       VF &posterior = *posteriorPtr;
1456
1457       const int seq1Length = seq1->GetLength();
1458       const int seq2Length = seq2->GetLength();
1459
1460       // contribution from the summation where z = x and z = y
1461       for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] += posterior[k];
1462
1463       if (enableVerbose)
1464         cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
1465
1466       // contribution from all other sequences
1467       for (int k = 0; k < numSeqs; k++) if (k != i && k != j){
1468         if (k < i)
1469           Relax1 (sparseMatrices[k][i], sparseMatrices[k][j], posterior);
1470         else if (k > i && k < j)
1471           Relax (sparseMatrices[i][k], sparseMatrices[k][j], posterior);
1472         else {
1473           SparseMatrix *temp = sparseMatrices[j][k]->ComputeTranspose();
1474           Relax (sparseMatrices[i][k], temp, posterior);
1475           delete temp;
1476         }
1477       }
1478
1479       // now renormalization
1480       for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] /= numSeqs;
1481
1482       // mask out positions not originally in the posterior matrix
1483       SparseMatrix *matXY = sparseMatrices[i][j];
1484       for (int y = 0; y <= seq2Length; y++) posterior[y] = 0;
1485       for (int x = 1; x <= seq1Length; x++){
1486         SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
1487         SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
1488         VF::iterator base = posterior.begin() + x * (seq2Length + 1);
1489         int curr = 0;
1490         while (XYptr != XYend){
1491
1492           // zero out all cells until the first filled column
1493           while (curr < XYptr->first){
1494             base[curr] = 0;
1495             curr++;
1496           }
1497
1498           // now, skip over this column
1499           curr++;
1500           ++XYptr;
1501         }
1502
1503         // zero out cells after last column
1504         while (curr <= seq2Length){
1505           base[curr] = 0;
1506           curr++;
1507         }
1508       }
1509
1510       // save the new posterior matrix
1511       newSparseMatrices[i][j] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), posterior);
1512       newSparseMatrices[j][i] = NULL;
1513
1514       if (enableVerbose)
1515         cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
1516
1517       delete posteriorPtr;
1518
1519       if (enableVerbose)
1520         cerr << "done." << endl;
1521     }
1522   }
1523
1524   return newSparseMatrices;
1525 }
1526
1527 /////////////////////////////////////////////////////////////////
1528 // Relax()
1529 //
1530 // Computes the consistency transformation for a single sequence
1531 // z, and adds the transformed matrix to "posterior".
1532 /////////////////////////////////////////////////////////////////
1533
1534 void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior){
1535
1536   assert (matXZ);
1537   assert (matZY);
1538
1539   int lengthX = matXZ->GetSeq1Length();
1540   int lengthY = matZY->GetSeq2Length();
1541   assert (matXZ->GetSeq2Length() == matZY->GetSeq1Length());
1542
1543   // for every x[i]
1544   for (int i = 1; i <= lengthX; i++){
1545     SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
1546     SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
1547
1548     VF::iterator base = posterior.begin() + i * (lengthY + 1);
1549
1550     // iterate through all x[i]-z[k]
1551     while (XZptr != XZend){
1552       SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
1553       SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(XZptr->first);
1554       const float XZval = XZptr->second;
1555
1556       // iterate through all z[k]-y[j]
1557       while (ZYptr != ZYend){
1558         base[ZYptr->first] += XZval * ZYptr->second;
1559        ZYptr++;
1560       }
1561       XZptr++;
1562     }
1563   }
1564 }
1565
1566 /////////////////////////////////////////////////////////////////
1567 // Relax1()
1568 //
1569 // Computes the consistency transformation for a single sequence
1570 // z, and adds the transformed matrix to "posterior".
1571 /////////////////////////////////////////////////////////////////
1572
1573 void Relax1 (SparseMatrix *matZX, SparseMatrix *matZY, VF &posterior){
1574
1575   assert (matZX);
1576   assert (matZY);
1577
1578   int lengthZ = matZX->GetSeq1Length();
1579   int lengthY = matZY->GetSeq2Length();
1580
1581   // for every z[k]
1582   for (int k = 1; k <= lengthZ; k++){
1583     SafeVector<PIF>::iterator ZXptr = matZX->GetRowPtr(k);
1584     SafeVector<PIF>::iterator ZXend = ZXptr + matZX->GetRowSize(k);
1585
1586     // iterate through all z[k]-x[i]
1587     while (ZXptr != ZXend){
1588       SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(k);
1589       SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(k);
1590       const float ZXval = ZXptr->second;
1591       VF::iterator base = posterior.begin() + ZXptr->first * (lengthY + 1);
1592
1593       // iterate through all z[k]-y[j]
1594       while (ZYptr != ZYend){
1595         base[ZYptr->first] += ZXval * ZYptr->second;
1596         ZYptr++;
1597       }
1598       ZXptr++;
1599     }
1600   }
1601 }
1602
1603 void DoBasePairProbabilityRelaxation (MultiSequence *sequences,
1604                                       SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1605                                       SafeVector<BPPMatrix*> &BPPMatrices) {
1606     const int numSeqs = sequences->GetNumSequences();
1607
1608     for (int i = 0; i < numSeqs; i++) {
1609         Sequence *seq1 = sequences->GetSequence (i);
1610         BPPMatrix *seq1BppMatrix = BPPMatrices[seq1->GetLabel()];
1611         Trimat<float> consBppMat(seq1->GetLength() + 1);
1612         int seq1Length = seq1->GetLength();
1613
1614         for (int k = 1; k <= seq1Length; k++) {
1615             for (int l = k; l <= seq1Length; l++) {
1616                 consBppMat.ref(k, l) = seq1BppMatrix->GetProb(k, l);
1617             }
1618         }
1619
1620         for (int j = i + 1; j < numSeqs; j++) {
1621
1622    //           VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior()
1623           Sequence *seq2 = sequences->GetSequence (j);
1624           BPPMatrix *seq2BppMatrix = BPPMatrices[seq2->GetLabel()];
1625 //        int seq2Length = seq2->GetLength();
1626           SparseMatrix *matchProb = sparseMatrices[i][j];
1627
1628 //        vector<PIF2> &probs1 = seq1BppMatrix->bppMat.data2;
1629           for(int k = 1; k <= seq1Length; k++) {
1630               for(int m = k, n = k; n <= k + 200 && m >= 1 && n <= seq1Length; m--, n++) {
1631
1632 //        for (int k = 0; k < (int)probs1.size(); k++) {
1633 //            float tmpProb1 = probs1[k].prob;
1634 //            int   tmp1I    = probs1[k].i;
1635 //            int   tmp1J    = probs1[k].j;
1636
1637                   float sumProb = 0;
1638                   vector<PIF2> &probs2 = seq2BppMatrix->bppMat.data2;
1639                   for(int l = 0; l < (int)probs2.size(); l++) {
1640                       float tmpProb2 = probs2[l].prob;
1641                       int   tmp2I    = probs2[l].i;
1642                       int   tmp2J    = probs2[l].j;
1643                       sumProb += matchProb->GetValue(m, tmp2I)*matchProb->GetValue(n, tmp2J)*tmpProb2;
1644                   }
1645
1646                   consBppMat.ref(m, n) += sumProb;
1647               }
1648
1649               for(int m = k, n = k + 1; n <= k + 200 && m >= 1 && n <= seq1Length; m--, n++) {
1650
1651 //        for (int k = 0; k < (int)probs1.size(); k++) {
1652 //            float tmpProb1 = probs1[k].prob;
1653 //            int   tmp1I    = probs1[k].i;
1654 //            int   tmp1J    = probs1[k].j;
1655
1656                   float sumProb = 0;
1657                   vector<PIF2> &probs2 = seq2BppMatrix->bppMat.data2;
1658                   for(int l = 0; l < (int)probs2.size(); l++) {
1659                       float tmpProb2 = probs2[l].prob;
1660                       int   tmp2I    = probs2[l].i;
1661                       int   tmp2J    = probs2[l].j;
1662                       sumProb += matchProb->GetValue(m, tmp2I)*matchProb->GetValue(n, tmp2J)*tmpProb2;
1663                   }
1664
1665                   consBppMat.ref(m, n) += sumProb;
1666               }
1667           }
1668         }
1669
1670 /*
1671           for(int k = 1; k <= seq1Length; k++) {
1672               for(int m = k, n = k; n <= k + 30 && m >= 1 && n <= seq1Length; m--, n++) {
1673                   float tmpProb = seq1BppMatrix->GetProb(m, n);
1674                   for(int l = 1; l <= seq2Length; l++) {
1675                       for(int s = l, t = l; t <= l + 30 && s >= 1 && t <= seq2Length; s--, t++) {
1676                           tmpProb += matchProb->GetValue(m,s)*matchProb->GetValue(n,t)*seq2BppMatrix->GetProb(s,t);
1677                       }
1678                       for(int s = l, t = l + 1; t <= l + 31 && s >= 1 && t <= seq2Length; s--, t++) {
1679                           tmpProb += matchProb->GetValue(m,s)*matchProb->GetValue(n,t)*seq2BppMatrix->GetProb(s,t);
1680                       }
1681                   }
1682                   consBppMat.ref(m, n) += tmpProb;
1683               }
1684
1685               for(int m = k, n = k + 1; n <= k + 31 && m >= 1 && n <= seq1Length; m--, n++) {
1686                   float tmpProb = seq1BppMatrix->GetProb(m, n);
1687                   for(int l = 1; l <= seq2Length; l++) {
1688                       for(int s = l, t = l; t <= l + 30 && s >= 1 && t <= seq2Length; s--, t++) {
1689                           tmpProb += matchProb->GetValue(m,s)*matchProb->GetValue(n,t)*seq2BppMatrix->GetProb(s,t);
1690                       }
1691                       for(int s = l, t = l + 1; t <= l + 31 && s >= 1 && t <= seq2Length; s--, t++) {
1692                           tmpProb += matchProb->GetValue(m,s)*matchProb->GetValue(n,t)*seq2BppMatrix->GetProb(s,t);
1693                       }
1694                   }
1695                   consBppMat.ref(m,n) += tmpProb;
1696               }
1697           }
1698         }
1699 */
1700         for (int m = 1; m <= seq1Length; m++) {
1701             for (int n = m + 4; n <= seq1Length; n++) {
1702                 consBppMat.ref(m,n) = consBppMat.ref(m,n)/(float)numSeqs;
1703             }
1704         }
1705         seq1BppMatrix->updateBPPMatrix(consBppMat);
1706     }
1707 }
1708
1709 /////////////////////////////////////////////////////////////////
1710 // GetSubtree
1711 //
1712 // Returns set containing all leaf labels of the current subtree.
1713 /////////////////////////////////////////////////////////////////
1714
1715 set<int> GetSubtree (const TreeNode *tree){
1716   set<int> s;
1717
1718   if (tree->GetSequenceLabel() == -1){
1719     s = GetSubtree (tree->GetLeftChild());
1720     set<int> t = GetSubtree (tree->GetRightChild());
1721
1722     for (set<int>::iterator iter = t.begin(); iter != t.end(); ++iter)
1723       s.insert (*iter);
1724   }
1725   else {
1726     s.insert (tree->GetSequenceLabel());
1727   }
1728
1729   return s;
1730 }
1731
1732 /////////////////////////////////////////////////////////////////
1733 // TreeBasedBiPartitioning
1734 //
1735 // Uses the iterative refinement scheme from MUSCLE.
1736 /////////////////////////////////////////////////////////////////
1737
1738 void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1739                               const ProbabilisticModel &model, MultiSequence* &alignment,
1740                               const TreeNode *tree, SafeVector<BPPMatrix*> &BPPMatrices){
1741   // check if this is a node of the alignment tree
1742   if (tree->GetSequenceLabel() == -1){
1743     TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetLeftChild(), BPPMatrices);
1744     TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetRightChild(), BPPMatrices);
1745
1746     set<int> leftSubtree = GetSubtree (tree->GetLeftChild());
1747     set<int> rightSubtree = GetSubtree (tree->GetRightChild());
1748     set<int> leftSubtreeComplement, rightSubtreeComplement;
1749
1750     // calculate complement of each subtree
1751     for (int i = 0; i < alignment->GetNumSequences(); i++){
1752       if (leftSubtree.find(i) == leftSubtree.end()) leftSubtreeComplement.insert (i);
1753       if (rightSubtree.find(i) == rightSubtree.end()) rightSubtreeComplement.insert (i);
1754     }
1755
1756     // perform realignments for edge to left child
1757     if (!leftSubtree.empty() && !leftSubtreeComplement.empty()){
1758       MultiSequence *groupOneSeqs = alignment->Project (leftSubtree); assert (groupOneSeqs);
1759       MultiSequence *groupTwoSeqs = alignment->Project (leftSubtreeComplement); assert (groupTwoSeqs);
1760       delete alignment;
1761       alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model, BPPMatrices, tree->GetLeftChild()->GetIdentity());
1762     }
1763
1764     // perform realignments for edge to right child
1765     if (!rightSubtree.empty() && !rightSubtreeComplement.empty()){
1766       MultiSequence *groupOneSeqs = alignment->Project (rightSubtree); assert (groupOneSeqs);
1767       MultiSequence *groupTwoSeqs = alignment->Project (rightSubtreeComplement); assert (groupTwoSeqs);
1768       delete alignment;
1769       alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model, BPPMatrices, tree->GetRightChild()->GetIdentity());
1770     }
1771   }
1772 }
1773
1774 /////////////////////////////////////////////////////////////////
1775 // DoterativeRefinement()
1776 //
1777 // Performs a single round of randomized partionining iterative
1778 // refinement.
1779 /////////////////////////////////////////////////////////////////
1780 /*
1781 void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1782                             const ProbabilisticModel &model, MultiSequence* &alignment){
1783   set<int> groupOne, groupTwo;
1784
1785   // create two separate groups
1786   for (int i = 0; i < alignment->GetNumSequences(); i++){
1787     if (rand() % 2)
1788       groupOne.insert (i);
1789     else
1790       groupTwo.insert (i);
1791   }
1792
1793   if (groupOne.empty() || groupTwo.empty()) return;
1794
1795   // project into the two groups
1796   MultiSequence *groupOneSeqs = alignment->Project (groupOne); assert (groupOneSeqs);
1797   MultiSequence *groupTwoSeqs = alignment->Project (groupTwo); assert (groupTwoSeqs);
1798   delete alignment;
1799
1800   // realign
1801   alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1802
1803   delete groupOneSeqs;
1804   delete groupTwoSeqs;
1805 }
1806 */
1807
1808 /////////////////////////////////////////////////////////////////
1809 // WriteAnnotation()
1810 //
1811 // Computes annotation for multiple alignment and write values
1812 // to a file.
1813 /////////////////////////////////////////////////////////////////
1814
1815 void WriteAnnotation (MultiSequence *alignment,
1816                       const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1817   ofstream outfile (annotationFilename.c_str());
1818
1819   if (outfile.fail()){
1820     cerr << "ERROR: Unable to write annotation file." << endl;
1821     exit (1);
1822   }
1823
1824   const int alignLength = alignment->GetSequence(0)->GetLength();
1825   const int numSeqs = alignment->GetNumSequences();
1826
1827   SafeVector<int> position (numSeqs, 0);
1828   SafeVector<SafeVector<char>::iterator> seqs (numSeqs);
1829   for (int i = 0; i < numSeqs; i++) seqs[i] = alignment->GetSequence(i)->GetDataPtr();
1830   SafeVector<pair<int,int> > active;
1831   active.reserve (numSeqs);
1832
1833   // for every column
1834   for (int i = 1; i <= alignLength; i++){
1835
1836     // find all aligned residues in this particular column
1837     active.clear();
1838     for (int j = 0; j < numSeqs; j++){
1839       if (seqs[j][i] != '-'){
1840         active.push_back (make_pair(j, ++position[j]));
1841       }
1842     }
1843
1844     outfile << setw(4) << ComputeScore (active, sparseMatrices) << endl;
1845   }
1846
1847   outfile.close();
1848 }
1849
1850 /////////////////////////////////////////////////////////////////
1851 // ComputeScore()
1852 //
1853 // Computes the annotation score for a particular column.
1854 /////////////////////////////////////////////////////////////////
1855
1856 int ComputeScore (const SafeVector<pair<int, int> > &active,
1857                   const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1858
1859   if (active.size() <= 1) return 0;
1860
1861   // ALTERNATIVE #1: Compute the average alignment score.
1862
1863   float val = 0;
1864   for (int i = 0; i < (int) active.size(); i++){
1865     for (int j = i+1; j < (int) active.size(); j++){
1866       val += sparseMatrices[active[i].first][active[j].first]->GetValue(active[i].second, active[j].second);
1867     }
1868   }
1869
1870   return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));
1871
1872 }