binaries/src/mafft/extensions/mxscarna_src/probconsRNA/Main.cc

   1 /////////////////////////////////////////////////////////////////
   2 // Main.cc
   3 //
   4 // Main routines for MXSCARNA program.
   5 /////////////////////////////////////////////////////////////////
   6
   7 #include "scarna.hpp"
   8 #include "SafeVector.h"
   9 #include "MultiSequence.h"
  10 #include "Defaults.h"
  11 #include "ScoreType.h"
  12 #include "ProbabilisticModel.h"
  13 #include "EvolutionaryTree.h"
  14 #include "SparseMatrix.h"
  15 #include "BPPMatrix.hpp"
  16 #include "StemCandidate.hpp"
  17 #include "Globaldp.hpp"
  18 #include "nrutil.h"
  19 #include "AlifoldMEA.h"
  20 #include <string>
  21 #include <sstream>
  22 #include <iomanip>
  23 #include <iostream>
  24 #include <list>
  25 #include <set>
  26 #include <algorithm>
  27 #include <cstdio>
  28 #include <cstdlib>
  29 #include <cerrno>
  30 #include <iomanip>
  31 #include <fstream>
  32
  33 #include "RfoldWrapper.hpp"
  34 //static RFOLD::Rfold folder;
  35
  36 using namespace::MXSCARNA;
  37 using namespace::RFOLD;
  38
  39 string parametersInputFilename = "";
  40 string parametersOutputFilename = "no training";
  41 string annotationFilename = "";
  42 string weboutputFileName = "";
  43
  44 ofstream *outputFile;
  45
  46 bool enableTraining = false;
  47 bool enableVerbose = false;
  48 bool enableAllPairs = false;
  49 bool enableAnnotation = false;
  50 bool enableViterbi = false;
  51 bool enableClustalWOutput = false;
  52 bool enableTrainEmissions = false;
  53 bool enableAlignOrder = false;
  54 bool enableWebOutput  = false;
  55 bool enableStockholmOutput = false;
  56 bool enableMXSCARNAOutput = false;
  57 bool enableMcCaskillMEAMode = false;
  58 int numConsistencyReps = 2;
  59 int numPreTrainingReps = 0;
  60 int numIterativeRefinementReps = 100;
  61 int scsLength = SCSLENGTH;
  62 float cutoff = 0;
  63 float gapOpenPenalty = 0;
  64 float gapContinuePenalty = 0;
  65 float threshhold = 1.0;
  66 float BaseProbThreshold = BASEPROBTHRESHOLD;
  67 float BasePairConst = BASEPAIRCONST;
  68 int   BandWidth = BANDWIDTH;
  69 bool  useRfold = USERFOLD;
  70 SafeVector<string> sequenceNames;
  71
  72 VF initDistrib (NumMatrixTypes);
  73 VF gapOpen (2*NumInsertStates);
  74 VF gapExtend (2*NumInsertStates);
  75 VVF emitPairs (256, VF (256, 1e-10));
  76 VF emitSingle (256, 1e-5);
  77
  78 string alphabet = alphabetDefault;
  79
  80 string *ssCons = NULL;
  81
  82 const int MIN_PRETRAINING_REPS = 0;
  83 const int MAX_PRETRAINING_REPS = 20;
  84 const int MIN_CONSISTENCY_REPS = 0;
  85 const int MAX_CONSISTENCY_REPS = 5;
  86 const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
  87 const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
  88
  89 /////////////////////////////////////////////////////////////////
  90 // Function prototypes
  91 /////////////////////////////////////////////////////////////////
  92
  93 void PrintHeading();
  94 void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
  95                       const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename);
  96 MultiSequence *DoAlign (MultiSequence *sequence, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, VF &gapExtend, VVF &emitPairs, VF &emitSingle);
  97 SafeVector<string> ParseParams (int argc, char **argv);
  98 void ReadParameters ();
  99 MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
 100                                       const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 101                                       const ProbabilisticModel &model,
 102                                       SafeVector<BPPMatrix*> &BPPMatrices);
 103 MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
 104                                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 105                                 const ProbabilisticModel &model, SafeVector<BPPMatrix*> &BPPMatrices, float identity);
 106 SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences,
 107                                                       SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
 108 void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);
 109 void Relax1 (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior);
 110 void DoBasePairProbabilityRelaxation (MultiSequence *sequences,
 111                                       SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 112                                       SafeVector<BPPMatrix*> &BPPMatrices);
 113 set<int> GetSubtree (const TreeNode *tree);
 114 void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 115                               const ProbabilisticModel &model, MultiSequence* &alignment,
 116                               const TreeNode *tree, SafeVector<BPPMatrix*> &BPPMatrices);
 117 void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
 118                             const ProbabilisticModel &model, MultiSequence* &alignment);
 119 void WriteAnnotation (MultiSequence *alignment,
 120                       const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
 121 int ComputeScore (const SafeVector<pair<int, int> > &active,
 122                   const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
 123 std::vector<StemCandidate>* seq2scs(MultiSequence *Sequences, SafeVector<BPPMatrix*> &BPPMatrices, int BandWidth);
 124 void removeConflicts(std::vector<StemCandidate> *pscs1, std::vector<StemCandidate> *pscs2, std::vector<int> *matchPSCS1, std::vector<int> *matchPSCS2);
 125
 126 struct prob {
 127     int i;
 128     int j;
 129     float p;
 130 };
 131
 132 /////////////////////////////////////////////////////////////////
 133 // main()
 134 //
 135 // Calls all initialization routines and runs the MXSCARNA
 136 // aligner.
 137 /////////////////////////////////////////////////////////////////
 138
 139
 140 int main (int argc, char **argv){
 141
 142     // print MXSCARNA heading
 143     PrintHeading();
 144
 145     // parse program parameters
 146     sequenceNames = ParseParams (argc, argv);
 147     ReadParameters();
 148     PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
 149
 150   // now, we'll process all the files given as input.  If we are given
 151   // several filenames as input, then we'll load all of those sequences
 152   // simultaneously, as long as we're not training.  On the other hand,
 153   // if we are training, then we'll treat each file as a separate
 154   // training instance
 155
 156     if (enableMcCaskillMEAMode) {
 157       MultiSequence *sequences = new MultiSequence(); assert (sequences);
 158       for (int i = 0; i < (int) sequenceNames.size(); i++){
 159           cerr << "Loading sequence file: " << sequenceNames[i] << endl;
 160           sequences->LoadMFA (sequenceNames[i], true);
 161       }
 162
 163       const int numSeqs = sequences->GetNumSequences();
 164       SafeVector<BPPMatrix*> BPPMatrices;
 165
 166       // compute the base pairing matrices for each sequences
 167       for(int i = 0; i < numSeqs; i++) {
 168           Sequence *tmpSeq = sequences->GetSequence(i);
 169           string   seq = tmpSeq->GetString();
 170           int    n_seq = tmpSeq->GetLength();
 171           BPPMatrix *bppmat = new BPPMatrix(seq, n_seq);
 172           BPPMatrices.push_back(bppmat);
 173       }
 174       AlifoldMEA alifold(sequences, BPPMatrices, BasePairConst);
 175       alifold.Run();
 176       ssCons = alifold.getSScons();
 177
 178       if (enableStockholmOutput) {
 179           sequences->WriteSTOCKHOLM (cout, ssCons);
 180       }
 181       else if (enableMXSCARNAOutput){
 182           sequences->WriteMXSCARNA (cout, ssCons);
 183       }
 184       else {
 185           sequences->WriteMFA (cout, ssCons);
 186       }
 187
 188       delete sequences;
 189   }
 190   // if we are training
 191   else if (enableTraining){
 192
 193     // build new model for aligning
 194     ProbabilisticModel model (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
 195
 196     // prepare to average parameters
 197     for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
 198     for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
 199     for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
 200     if (enableTrainEmissions){
 201       for (int i = 0; i < (int) emitPairs.size(); i++)
 202         for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
 203       for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
 204     }
 205
 206     // align each file individually
 207     for (int i = 0; i < (int) sequenceNames.size(); i++){
 208
 209       VF thisInitDistrib (NumMatrixTypes);
 210       VF thisGapOpen (2*NumInsertStates);
 211       VF thisGapExtend (2*NumInsertStates);
 212       VVF thisEmitPairs (256, VF (256, 1e-10));
 213       VF thisEmitSingle (256, 1e-5);
 214
 215       // load sequence file
 216       MultiSequence *sequences = new MultiSequence(); assert (sequences);
 217       cerr << "Loading sequence file: " << sequenceNames[i] << endl;
 218       sequences->LoadMFA (sequenceNames[i], true);
 219
 220       // align sequences
 221       DoAlign (sequences, model, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle);
 222
 223       // add in contribution of the derived parameters
 224       for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
 225       for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
 226       for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
 227       if (enableTrainEmissions){
 228         for (int i = 0; i < (int) emitPairs.size(); i++)
 229           for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
 230         for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
 231       }
 232
 233       delete sequences;
 234     }
 235
 236     // compute new parameters and print them out
 237     for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= (int) sequenceNames.size();
 238     for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= (int) sequenceNames.size();
 239     for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= (int) sequenceNames.size();
 240     if (enableTrainEmissions){
 241       for (int i = 0; i < (int) emitPairs.size(); i++)
 242         for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= (int) sequenceNames.size();
 243       for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= sequenceNames.size();
 244     }
 245
 246     PrintParameters ("Trained parameter set:",
 247                      initDistrib, gapOpen, gapExtend, emitPairs, emitSingle,
 248                      parametersOutputFilename.c_str());
 249   }
 250   // pass
 251   // if we are not training, we must simply want to align some sequences
 252   else {
 253     // load all files together
 254     MultiSequence *sequences = new MultiSequence(); assert (sequences);
 255     for (int i = 0; i < (int) sequenceNames.size(); i++){
 256       cerr << "Loading sequence file: " << sequenceNames[i] << endl;
 257
 258       sequences->LoadMFA (sequenceNames[i], true);
 259     }
 260
 261     // do all "pre-training" repetitions first
 262     // NOT execute
 263     for (int ct = 0; ct < numPreTrainingReps; ct++){
 264       enableTraining = true;
 265
 266       // build new model for aligning
 267       ProbabilisticModel model (initDistrib, gapOpen, gapExtend,
 268                                 emitPairs, emitSingle);
 269
 270       // do initial alignments
 271       DoAlign (sequences, model, initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
 272
 273       // print new parameters
 274       PrintParameters ("Recomputed parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
 275
 276       enableTraining = false;
 277     }
 278
 279     // now, we can perform the alignments and write them out
 280     if (enableWebOutput) {
 281         outputFile = new ofstream(weboutputFileName.c_str());
 282         if (!outputFile) {
 283             cerr << "cannot open output file." << weboutputFileName << endl;
 284             exit(1);
 285         }
 286         *outputFile << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" << endl;
 287         *outputFile << "<result>" << endl;
 288     }
 289         MultiSequence *alignment = DoAlign (sequences,
 290                                             ProbabilisticModel (initDistrib, gapOpen, gapExtend,  emitPairs, emitSingle),
 291                                             initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
 292
 293
 294     if (!enableAllPairs){
 295         if (enableClustalWOutput) {
 296             alignment->WriteALN (cout);
 297         }
 298         else if (enableWebOutput) {
 299             alignment->WriteWEB (*outputFile, ssCons);
 300 //          computeStructureWithAlifold ();
 301         }
 302         else if (enableStockholmOutput) {
 303             alignment->WriteSTOCKHOLM (cout, ssCons);
 304         }
 305         else if (enableMXSCARNAOutput) {
 306             alignment->WriteMXSCARNA (cout, ssCons);
 307         }
 308         else {
 309             alignment->WriteMFA (cout, ssCons);
 310         }
 311     }
 312
 313     if (enableWebOutput) {
 314         *outputFile << "</result>" << endl;
 315         delete outputFile;
 316     }
 317
 318     delete ssCons;
 319     delete alignment;
 320     delete sequences;
 321
 322   }
 323 }
 324
 325 /////////////////////////////////////////////////////////////////
 326 // PrintHeading()
 327 //
 328 // Prints heading for PROBCONS program.
 329 /////////////////////////////////////////////////////////////////
 330
 331 void PrintHeading (){
 332   cerr << endl
 333        << "Multiplex SCARNA"<< endl
 334        << "version " << VERSION << " - align multiple RNA sequences and print to standard output" << endl
 335        << "Written by Yasuo Tabei" << endl
 336        << endl;
 337 }
 338
 339 /////////////////////////////////////////////////////////////////
 340 // PrintParameters()
 341 //
 342 // Prints PROBCONS parameters to STDERR.  If a filename is
 343 // specified, then the parameters are also written to the file.
 344 /////////////////////////////////////////////////////////////////
 345
 346 void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
 347                       const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename){
 348
 349   // print parameters to the screen
 350   cerr << message << endl
 351        << "    initDistrib[] = { ";
 352   for (int i = 0; i < NumMatrixTypes; i++) cerr << setprecision (10) << initDistrib[i] << " ";
 353   cerr << "}" << endl
 354        << "        gapOpen[] = { ";
 355   for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapOpen[i] << " ";
 356   cerr << "}" << endl
 357        << "      gapExtend[] = { ";
 358   for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapExtend[i] << " ";
 359   cerr << "}" << endl
 360        << endl;
 361
 362   /*
 363   for (int i = 0; i < 5; i++){
 364     for (int j = 0; j <= i; j++){
 365       cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " ";
 366     }
 367     cerr << endl;
 368     }*/
 369
 370   // if a file name is specified
 371   if (filename){
 372
 373     // attempt to open the file for writing
 374     FILE *file = fopen (filename, "w");
 375     if (!file){
 376       cerr << "ERROR: Unable to write parameter file: " << filename << endl;
 377       exit (1);
 378     }
 379
 380     // if successful, then write the parameters to the file
 381     for (int i = 0; i < NumMatrixTypes; i++) fprintf (file, "%.10f ", initDistrib[i]); fprintf (file, "\n");
 382     for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapOpen[i]); fprintf (file, "\n");
 383     for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapExtend[i]); fprintf (file, "\n");
 384     fprintf (file, "%s\n", alphabet.c_str());
 385     for (int i = 0; i < (int) alphabet.size(); i++){
 386       for (int j = 0; j <= i; j++)
 387         fprintf (file, "%.10f ", emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);
 388       fprintf (file, "\n");
 389     }
 390     for (int i = 0; i < (int) alphabet.size(); i++)
 391       fprintf (file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);
 392     fprintf (file, "\n");
 393     fclose (file);
 394   }
 395 }
 396
 397 /////////////////////////////////////////////////////////////////
 398 // DoAlign()
 399 //
 400 // First computes all pairwise posterior probability matrices.
 401 // Then, computes new parameters if training, or a final
 402 // alignment, otherwise.
 403 /////////////////////////////////////////////////////////////////
 404 MultiSequence *DoAlign (MultiSequence *sequences, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, VF &gapExtend, VVF &emitPairs, VF &emitSingle){
 405
 406   assert (sequences);
 407
 408   const int numSeqs = sequences->GetNumSequences();
 409   VVF distances (numSeqs, VF (numSeqs, 0));
 410   VVF identities (numSeqs, VF (numSeqs, 0));
 411   SafeVector<SafeVector<SparseMatrix *> > sparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
 412
 413   SafeVector<BPPMatrix*> BPPMatrices;
 414
 415   RfoldWrapper rfoldWrapper;
 416   if (useRfold) {
 417     BPPMatrices
 418       = rfoldWrapper.getProb(sequenceNames, sequences, BASEPROBTHRESHOLD, BandWidth);
 419   }
 420   else {
 421     for(int i = 0; i < numSeqs; i++) {
 422       Sequence *tmpSeq = sequences->GetSequence(i);
 423       string   seq = tmpSeq->GetString();
 424       int    n_seq = tmpSeq->GetLength();
 425       BPPMatrix *bppmat = new BPPMatrix(seq, n_seq, BASEPROBTHRESHOLD);
 426       BPPMatrices.push_back(bppmat);
 427     }
 428   }
 429
 430   if (enableTraining){
 431     // prepare to average parameters
 432     for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
 433     for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
 434     for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
 435     if (enableTrainEmissions){
 436       for (int i = 0; i < (int) emitPairs.size(); i++)
 437         for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
 438       for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
 439     }
 440   }
 441
 442   // skip posterior calculations if we just want to do Viterbi alignments
 443   if (!enableViterbi){
 444
 445     // do all pairwise alignments for posterior probability matrices
 446     for (int a = 0; a < numSeqs-1; a++){
 447       for (int b = a+1; b < numSeqs; b++){
 448         Sequence *seq1 = sequences->GetSequence (a);
 449         Sequence *seq2 = sequences->GetSequence (b);
 450
 451         // verbose output
 452         if (enableVerbose)
 453           cerr << "Computing posterior matrix: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
 454                << "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
 455
 456         // compute forward and backward probabilities
 457         VF *forward = model.ComputeForwardMatrix (seq1, seq2); assert (forward);
 458         VF *backward = model.ComputeBackwardMatrix (seq1, seq2); assert (backward);
 459
 460         // if we are training, then we'll simply want to compute the
 461         // expected counts for each region within the matrix separately;
 462         // otherwise, we'll need to put all of the regions together and
 463         // assemble a posterior probability match matrix
 464
 465         // so, if we're training
 466         if (enableTraining){
 467
 468           // compute new parameters
 469           VF thisInitDistrib (NumMatrixTypes);
 470           VF thisGapOpen (2*NumInsertStates);
 471           VF thisGapExtend (2*NumInsertStates);
 472           VVF thisEmitPairs (256, VF (256, 1e-10));
 473           VF thisEmitSingle (256, 1e-5);
 474
 475           model.ComputeNewParameters (seq1, seq2, *forward, *backward, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle, enableTrainEmissions);
 476
 477           // add in contribution of the derived parameters
 478           for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
 479           for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
 480           for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
 481           if (enableTrainEmissions){
 482             for (int i = 0; i < (int) emitPairs.size(); i++)
 483               for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
 484             for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
 485           }
 486
 487           // let us know that we're done.
 488           if (enableVerbose) cerr << "done." << endl;
 489         }
 490         // pass
 491         else {
 492
 493           // compute posterior probability matrix
 494           VF *posterior = model.ComputePosteriorMatrix (seq1, seq2, *forward, *backward); assert (posterior);
 495
 496           // compute sparse representations
 497           sparseMatrices[a][b] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), *posterior);
 498           sparseMatrices[b][a] = NULL;
 499
 500           if (!enableAllPairs){
 501             // perform the pairwise sequence alignment
 502             pair<SafeVector<char> *, float> alignment = model.ComputeAlignment (seq1->GetLength(),
 503                                                                                 seq2->GetLength(),
 504                                                                                 *posterior);
 505
 506             Sequence *tmpSeq1 = seq1->AddGaps (alignment.first, 'X');
 507             Sequence *tmpSeq2 = seq2->AddGaps (alignment.first, 'Y');
 508
 509             // compute sequence identity for each pair of sequenceses
 510             int length = tmpSeq1->GetLength();
 511             int matchCount = 0;
 512             int misMatchCount = 0;
 513             for (int k = 1; k <= length; k++) {
 514                 int ch1 = tmpSeq1->GetPosition(k);
 515                 int ch2 = tmpSeq2->GetPosition(k);
 516                 if (ch1 == ch2 && ch1 != '-' && ch2 != '-') { ++matchCount; }
 517                 else if (ch1 != ch2 && ch1 != '-' && ch2 != '-') { ++misMatchCount; }
 518             }
 519
 520             identities[a][b] = identities[b][a] = (float)matchCount/(float)(matchCount + misMatchCount);
 521
 522             // compute "expected accuracy" distance for evolutionary tree computation
 523             float distance = alignment.second / min (seq1->GetLength(), seq2->GetLength());
 524             distances[a][b] = distances[b][a] = distance;
 525
 526             if (enableVerbose)
 527               cerr << setprecision (10) << distance << endl;
 528
 529               delete alignment.first;
 530           }
 531           else {
 532             // let us know that we're done.
 533             if (enableVerbose) cerr << "done." << endl;
 534           }
 535
 536           delete posterior;
 537         }
 538
 539         delete forward;
 540         delete backward;
 541       }
 542     }
 543   }
 544
 545   // now average out parameters derived
 546   if (enableTraining){
 547     // compute new parameters
 548     for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= numSeqs * (numSeqs - 1) / 2;
 549     for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= numSeqs * (numSeqs - 1) / 2;
 550     for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= numSeqs * (numSeqs - 1) / 2;
 551
 552     if (enableTrainEmissions){
 553       for (int i = 0; i < (int) emitPairs.size(); i++)
 554         for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= numSeqs * (numSeqs - 1) / 2;
 555       for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= numSeqs * (numSeqs - 1) / 2;
 556     }
 557   }
 558
 559   // see if we still want to do some alignments
 560   else {
 561       // pass
 562     if (!enableViterbi){
 563
 564       // perform the consistency transformation the desired number of times
 565       for (int r = 0; r < numConsistencyReps; r++){
 566         SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices = DoRelaxation (sequences, sparseMatrices);
 567
 568         // now replace the old posterior matrices
 569         for (int i = 0; i < numSeqs; i++){
 570           for (int j = 0; j < numSeqs; j++){
 571             delete sparseMatrices[i][j];
 572             sparseMatrices[i][j] = newSparseMatrices[i][j];
 573           }
 574         }
 575       }
 576       if (numSeqs > 8) {
 577         for (int r = 0; r < 1; r++)
 578           DoBasePairProbabilityRelaxation(sequences, sparseMatrices, BPPMatrices);
 579       }
 580     }
 581
 582     MultiSequence *finalAlignment = NULL;
 583
 584     if (enableAllPairs){
 585       for (int a = 0; a < numSeqs-1; a++){
 586         for (int b = a+1; b < numSeqs; b++){
 587           Sequence *seq1 = sequences->GetSequence (a);
 588           Sequence *seq2 = sequences->GetSequence (b);
 589
 590           if (enableVerbose)
 591             cerr << "Performing pairwise alignment: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
 592                  << "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
 593
 594
 595           // perform the pairwise sequence alignment
 596           pair<SafeVector<char> *, float> alignment;
 597           if (enableViterbi)
 598             alignment = model.ComputeViterbiAlignment (seq1, seq2);
 599           else {
 600
 601             // build posterior matrix
 602             VF *posterior = sparseMatrices[a][b]->GetPosterior(); assert (posterior);
 603             int length = (seq1->GetLength() + 1) * (seq2->GetLength() + 1);
 604             for (int i = 0; i < length; i++) (*posterior)[i] -= cutoff;
 605
 606             alignment = model.ComputeAlignment (seq1->GetLength(), seq2->GetLength(), *posterior);
 607             delete posterior;
 608           }
 609
 610           // write pairwise alignments
 611           string name = seq1->GetHeader() + "-" + seq2->GetHeader() + (enableClustalWOutput ? ".aln" : ".fasta");
 612           ofstream outfile (name.c_str());
 613
 614           MultiSequence *result = new MultiSequence();
 615           result->AddSequence (seq1->AddGaps(alignment.first, 'X'));
 616           result->AddSequence (seq2->AddGaps(alignment.first, 'Y'));
 617           if (enableClustalWOutput)
 618             result->WriteALN (outfile);
 619           else
 620             result->WriteMFA (outfile);
 621
 622           outfile.close();
 623
 624           delete alignment.first;
 625         }
 626       }
 627     }
 628
 629     // now if we still need to do a final multiple alignment
 630     else {
 631
 632       if (enableVerbose)
 633         cerr << endl;
 634
 635       // compute the evolutionary tree
 636       TreeNode *tree = TreeNode::ComputeTree (distances, identities);
 637
 638       if (enableWebOutput) {
 639           *outputFile << "<tree>" << endl;
 640           tree->Print (*outputFile, sequences);
 641           *outputFile << "</tree>" << endl;
 642       }
 643       else {
 644           tree->Print (cerr, sequences);
 645           cerr << endl;
 646       }
 647       // make the final alignment
 648       finalAlignment = ComputeFinalAlignment (tree, sequences, sparseMatrices, model, BPPMatrices);
 649
 650       // build annotation
 651       if (enableAnnotation){
 652         WriteAnnotation (finalAlignment, sparseMatrices);
 653       }
 654
 655       delete tree;
 656     }
 657
 658     if (!enableViterbi){
 659       // delete sparse matrices
 660       for (int a = 0; a < numSeqs-1; a++){
 661         for (int b = a+1; b < numSeqs; b++){
 662           delete sparseMatrices[a][b];
 663           delete sparseMatrices[b][a];
 664         }
 665       }
 666     }
 667
 668     AlifoldMEA alifold(finalAlignment, BPPMatrices, BasePairConst);
 669     alifold.Run();
 670     ssCons = alifold.getSScons();
 671
 672     return finalAlignment;
 673
 674   }
 675
 676   return NULL;
 677 }
 678
 679 /////////////////////////////////////////////////////////////////
 680 // GetInteger()
 681 //
 682 // Attempts to parse an integer from the character string given.
 683 // Returns true only if no parsing error occurs.
 684 /////////////////////////////////////////////////////////////////
 685
 686 bool GetInteger (char *data, int *val){
 687   char *endPtr;
 688   long int retVal;
 689
 690   assert (val);
 691
 692   errno = 0;
 693   retVal = strtol (data, &endPtr, 0);
 694   if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
 695   if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN)) return false;
 696   if (retVal < (long) INT_MIN || retVal > (long) INT_MAX) return false;
 697   *val = (int) retVal;
 698   return true;
 699 }
 700
 701 /////////////////////////////////////////////////////////////////
 702 // GetFloat()
 703 //
 704 // Attempts to parse a float from the character string given.
 705 // Returns true only if no parsing error occurs.
 706 /////////////////////////////////////////////////////////////////
 707
 708 bool GetFloat (char *data, float *val){
 709   char *endPtr;
 710   double retVal;
 711
 712   assert (val);
 713
 714   errno = 0;
 715   retVal = strtod (data, &endPtr);
 716   if (retVal == 0 && (errno != 0 || data == endPtr)) return false;
 717   if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0)) return false;
 718   *val = (float) retVal;
 719   return true;
 720 }
 721
 722 /////////////////////////////////////////////////////////////////
 723 // ParseParams()
 724 //
 725 // Parse all command-line options.
 726 /////////////////////////////////////////////////////////////////
 727
 728 SafeVector<string> ParseParams (int argc, char **argv){
 729
 730   if (argc < 2){
 731
 732     cerr << "MXSCARNA comes with ABSOLUTELY NO WARRANTY.  This is free software, and" << endl
 733          << "you are welcome to redistribute it under certain conditions.  See the" << endl
 734          << "file COPYING.txt for details." << endl
 735          << endl
 736          << "Usage:" << endl
 737          << "       mxscarna [OPTION]... [MFAFILE]..." << endl
 738          << endl
 739          << "Description:" << endl
 740          << "       Align sequences in MFAFILE(s) and print result to standard output" << endl
 741          << endl
 742          << "       -clustalw" << endl
 743          << "              use CLUSTALW output format instead of MFA" << endl
 744          << endl
 745          << "       -stockholm" << endl
 746          << "              use STOCKHOLM output format instead of MFA" << endl
 747          << endl
 748          << "       -mxscarna" << endl
 749          << "              use MXSCARNA output format instead of MFA" << endl
 750          << endl
 751          << "       -weboutput /<output_path>/<outputfilename>" << endl
 752          << "        use web output format" << endl
 753          << endl
 754          << "       -c, --consistency REPS" << endl
 755          << "              use " << MIN_CONSISTENCY_REPS << " <= REPS <= " << MAX_CONSISTENCY_REPS
 756          << " (default: " << numConsistencyReps << ") passes of consistency transformation" << endl
 757          << endl
 758          << "       -ir, --iterative-refinement REPS" << endl
 759          << "              use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS
 760          << " (default: " << numIterativeRefinementReps << ") passes of iterative-refinement" << endl
 761          << endl
 762          << "       -pre, --pre-training REPS" << endl
 763          << "              use " << MIN_PRETRAINING_REPS << " <= REPS <= " << MAX_PRETRAINING_REPS
 764          << " (default: " << numPreTrainingReps << ") rounds of pretraining" << endl
 765          << endl
 766          << "       -pairs" << endl
 767          << "              generate all-pairs pairwise alignments" << endl
 768          << endl
 769          << "       -viterbi" << endl
 770          << "              use Viterbi algorithm to generate all pairs (automatically enables -pairs)" << endl
 771          << endl
 772          << "       -v, --verbose" << endl
 773          << "              report progress while aligning (default: " << (enableVerbose ? "on" : "off") << ")" << endl
 774          << endl
 775          << "       -annot FILENAME" << endl
 776          << "              write annotation for multiple alignment to FILENAME" << endl
 777          << endl
 778          << "       -t, --train FILENAME" << endl
 779          << "              compute EM transition probabilities, store in FILENAME (default: "
 780          << parametersOutputFilename << ")" << endl
 781          << endl
 782          << "       -e, --emissions" << endl
 783          << "              also reestimate emission probabilities (default: "
 784          << (enableTrainEmissions ? "on" : "off") << ")" << endl
 785          << endl
 786          << "       -p, --paramfile FILENAME" << endl
 787          << "              read parameters from FILENAME (default: "
 788          << parametersInputFilename << ")" << endl
 789          << endl
 790          << "       -a, --alignment-order" << endl
 791          << "              print sequences in alignment order rather than input order (default: "
 792          << (enableAlignOrder ? "on" : "off") << ")" << endl
 793          << endl
 794          << "       -s THRESHOLD" << endl
 795          << "               the threshold of SCS alignment" << endl
 796          << endl
 797          << "               In default, for less than " << threshhold << ", the SCS aligment is applied. " << endl
 798          << "       -l SCSLENGTH" << endl
 799          << "               the length of stem candidates " << SCSLENGTH << endl
 800          << endl
 801          << "       -b BASEPROBTRHESHHOLD" << endl
 802          << "               the threshold of base pairing probability " << BASEPROBTHRESHOLD << endl
 803          << endl
 804          << "       -m, --mccaskillmea" << endl
 805          << "               McCaskill MEA MODE: input the clustalw format file and output the secondary structure predicted by McCaskill MEA" << endl
 806          << endl
 807          << "       -g BASEPAIRSCORECONST" << endl
 808          << "               the control parameter of the prediction of base pairs, default:" << BasePairConst << endl
 809          << endl
 810          << "       -w BANDWIDTH" << endl
 811          << "               the control parameter of the distance of stem candidates, default:" << BANDWIDTH << endl
 812          << "       -rfold" << endl
 813          << "               use Rfold instead of global McCaskill algorithm to calcurate base paring probality matrices, default: (" << (useRfold? "on" : "off") << ")" << endl
 814          << endl;
 815
 816
 817     //           << "       -go, --gap-open VALUE" << endl
 818     //           << "              gap opening penalty of VALUE <= 0 (default: " << gapOpenPenalty << ")" << endl
 819     //   << endl
 820     //   << "       -ge, --gap-extension VALUE" << endl
 821     //   << "              gap extension penalty of VALUE <= 0 (default: " << gapContinuePenalty << ")" << endl
 822     //   << endl
 823     //         << "       -co, --cutoff CUTOFF" << endl
 824     //         << "              subtract 0 <= CUTOFF <= 1 (default: " << cutoff << ") from all posterior values before final alignment" << endl
 825     //         << endl
 826
 827     exit (1);
 828   }
 829
 830   SafeVector<string> sequenceNames;
 831   int tempInt;
 832   float tempFloat;
 833
 834   for (int i = 1; i < argc; i++){
 835     if (argv[i][0] == '-'){
 836
 837       // training
 838       if (!strcmp (argv[i], "-t") || !strcmp (argv[i], "--train")){
 839         enableTraining = true;
 840         if (i < argc - 1)
 841           parametersOutputFilename = string (argv[++i]);
 842         else {
 843           cerr << "ERROR: Filename expected for option " << argv[i] << endl;
 844           exit (1);
 845         }
 846       }
 847
 848       // emission training
 849       else if (!strcmp (argv[i], "-e") || !strcmp (argv[i], "--emissions")){
 850         enableTrainEmissions = true;
 851       }
 852
 853       // parameter file
 854       else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){
 855         if (i < argc - 1)
 856           parametersInputFilename = string (argv[++i]);
 857         else {
 858           cerr << "ERROR: Filename expected for option " << argv[i] << endl;
 859           exit (1);
 860         }
 861       }
 862       else if (! strcmp (argv[i], "-s")) {
 863         if (i < argc - 1){
 864           if (!GetFloat (argv[++i], &tempFloat)){
 865             cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
 866             exit (1);
 867           }
 868           else {
 869             if (tempFloat < 0){
 870               cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be nagative." << endl;
 871               exit (1);
 872             }
 873             else
 874               threshhold = tempFloat;
 875           }
 876         }
 877         else {
 878           cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
 879           exit (1);
 880         }
 881       }
 882
 883       else if (! strcmp (argv[i], "-l")) {
 884           if (i < argc - 1) {
 885               if (!GetInteger (argv[++i], &tempInt)){
 886                   cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
 887                   exit (1);
 888               }
 889               else {
 890                   if (tempInt <= 0 || 6 <= tempInt) {
 891                       cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
 892                            << "1 and 6" << "." << endl;
 893                       exit (1);
 894                   }
 895                   else
 896                       scsLength = tempInt;
 897               }
 898           }
 899       }
 900       else if (! strcmp (argv[i], "-b")) {
 901           if (i < argc - 1) {
 902               if (!GetFloat (argv[++i], &tempFloat)){
 903                   cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
 904                   exit (1);
 905               }
 906               else {
 907                   if (tempFloat < 0 && 1 < tempFloat) {
 908                       cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be nagative." << endl;
 909                       exit (1);
 910                   }
 911                   else
 912                       BaseProbThreshold = tempFloat;
 913               }
 914           }
 915       }
 916       else if (! strcmp (argv[i], "-g")) {
 917           if (i < argc - 1) {
 918               if (!GetFloat (argv[++i], &tempFloat)){
 919                   cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
 920                   exit (1);
 921               }
 922               else {
 923                   if (tempFloat < 0 && 1 < tempFloat) {
 924                       cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be nagative." << endl;
 925                       exit (1);
 926                   }
 927                   else
 928                       BasePairConst = tempFloat;
 929               }
 930           }
 931       }
 932
 933       // number of consistency transformations
 934       else if (!strcmp (argv[i], "-c") || !strcmp (argv[i], "--consistency")){
 935         if (i < argc - 1){
 936           if (!GetInteger (argv[++i], &tempInt)){
 937             cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
 938             exit (1);
 939           }
 940           else {
 941             if (tempInt < MIN_CONSISTENCY_REPS || tempInt > MAX_CONSISTENCY_REPS){
 942               cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
 943                    << MIN_CONSISTENCY_REPS << " and " << MAX_CONSISTENCY_REPS << "." << endl;
 944               exit (1);
 945             }
 946             else
 947               numConsistencyReps = tempInt;
 948           }
 949         }
 950         else {
 951           cerr << "ERROR: Integer expected for option " << argv[i] << endl;
 952           exit (1);
 953         }
 954       }
 955
 956       // number of randomized partitioning iterative refinement passes
 957       else if (!strcmp (argv[i], "-ir") || !strcmp (argv[i], "--iterative-refinement")){
 958         if (i < argc - 1){
 959           if (!GetInteger (argv[++i], &tempInt)){
 960             cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
 961             exit (1);
 962           }
 963           else {
 964             if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS || tempInt > MAX_ITERATIVE_REFINEMENT_REPS){
 965               cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
 966                    << MIN_ITERATIVE_REFINEMENT_REPS << " and " << MAX_ITERATIVE_REFINEMENT_REPS << "." << endl;
 967               exit (1);
 968             }
 969             else
 970               numIterativeRefinementReps = tempInt;
 971           }
 972         }
 973         else {
 974           cerr << "ERROR: Integer expected for option " << argv[i] << endl;
 975           exit (1);
 976         }
 977       }
 978       else if (!strcmp (argv[i], "-rfold")){
 979         useRfold = true;
 980       }
 981       // number of EM pre-training rounds
 982       else if (!strcmp (argv[i], "-pre") || !strcmp (argv[i], "--pre-training")){
 983         if (i < argc - 1){
 984           if (!GetInteger (argv[++i], &tempInt)){
 985             cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
 986             exit (1);
 987           }
 988           else {
 989             if (tempInt < MIN_PRETRAINING_REPS || tempInt > MAX_PRETRAINING_REPS){
 990               cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
 991                    << MIN_PRETRAINING_REPS << " and " << MAX_PRETRAINING_REPS << "." << endl;
 992               exit (1);
 993             }
 994             else
 995               numPreTrainingReps = tempInt;
 996           }
 997         }
 998         else {
 999           cerr << "ERROR: Integer expected for option " << argv[i] << endl;
1000           exit (1);
1001         }
1002       }
1003
1004       // the distance of stem candidate
1005       else if (!strcmp (argv[i], "-w")){
1006         if (i < argc - 1){
1007           if (!GetInteger (argv[++i], &tempInt)){
1008             cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
1009             exit (1);
1010           }
1011           else {
1012               BandWidth = tempInt;
1013           }
1014         }
1015         else {
1016           cerr << "ERROR: Integer expected for option " << argv[i] << endl;
1017           exit (1);
1018         }
1019       }
1020
1021       // gap open penalty
1022       else if (!strcmp (argv[i], "-go") || !strcmp (argv[i], "--gap-open")){
1023         if (i < argc - 1){
1024           if (!GetFloat (argv[++i], &tempFloat)){
1025             cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
1026             exit (1);
1027           }
1028           else {
1029             if (tempFloat > 0){
1030               cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
1031               exit (1);
1032             }
1033             else
1034               gapOpenPenalty = tempFloat;
1035           }
1036         }
1037         else {
1038           cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
1039           exit (1);
1040         }
1041       }
1042
1043       // gap extension penalty
1044       else if (!strcmp (argv[i], "-ge") || !strcmp (argv[i], "--gap-extension")){
1045         if (i < argc - 1){
1046           if (!GetFloat (argv[++i], &tempFloat)){
1047             cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
1048             exit (1);
1049           }
1050           else {
1051             if (tempFloat > 0){
1052               cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
1053               exit (1);
1054             }
1055             else
1056               gapContinuePenalty = tempFloat;
1057           }
1058         }
1059         else {
1060           cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
1061           exit (1);
1062         }
1063       }
1064
1065       // all-pairs pairwise alignments
1066       else if (!strcmp (argv[i], "-pairs")){
1067         enableAllPairs = true;
1068       }
1069
1070       // all-pairs pairwise Viterbi alignments
1071       else if (!strcmp (argv[i], "-viterbi")){
1072         enableAllPairs = true;
1073         enableViterbi = true;
1074       }
1075
1076       // annotation files
1077       else if (!strcmp (argv[i], "-annot")){
1078         enableAnnotation = true;
1079         if (i < argc - 1)
1080           annotationFilename = argv[++i];
1081         else {
1082           cerr << "ERROR: FILENAME expected for option " << argv[i] << endl;
1083           exit (1);
1084         }
1085       }
1086
1087       // clustalw output format
1088       else if (!strcmp (argv[i], "-clustalw")){
1089         enableClustalWOutput = true;
1090       }
1091       // mxscarna output format
1092       else if (!strcmp (argv[i], "-mxscarna")) {
1093           enableMXSCARNAOutput = true;
1094       }
1095       // stockholm output format
1096       else if (!strcmp (argv[i], "-stockholm")) {
1097           enableStockholmOutput = true;
1098       }
1099       // web output format
1100       else if (!strcmp (argv[i], "-weboutput")) {
1101           if (i < argc - 1) {
1102               weboutputFileName = string(argv[++i]);
1103           }
1104           else {
1105               cerr << "ERROR: Invalid following option " << argv[i-1] << ": " << argv[i] << endl;
1106               exit (1);
1107           }
1108
1109           enableWebOutput = true;
1110       }
1111
1112       // cutoff
1113       else if (!strcmp (argv[i], "-co") || !strcmp (argv[i], "--cutoff")){
1114         if (i < argc - 1){
1115           if (!GetFloat (argv[++i], &tempFloat)){
1116             cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
1117             exit (1);
1118           }
1119           else {
1120             if (tempFloat < 0 || tempFloat > 1){
1121               cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must be between 0 and 1." << endl;
1122               exit (1);
1123             }
1124             else
1125               cutoff = tempFloat;
1126           }
1127         }
1128         else {
1129           cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
1130           exit (1);
1131         }
1132       }
1133
1134       // verbose reporting
1135       else if (!strcmp (argv[i], "-v") || !strcmp (argv[i], "--verbose")){
1136         enableVerbose = true;
1137       }
1138
1139       // alignment order
1140       else if (!strcmp (argv[i], "-a") || !strcmp (argv[i], "--alignment-order")){
1141         enableAlignOrder = true;
1142       }
1143       // McCaskill MEA MODE
1144       else if (!strcmp (argv[i], "-m") || !strcmp (argv[i], "--mccaskillmea")){
1145         enableMcCaskillMEAMode = true;
1146       }
1147       // bad arguments
1148       else {
1149         cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
1150         exit (1);
1151       }
1152     }
1153     else {
1154       sequenceNames.push_back (string (argv[i]));
1155     }
1156   }
1157
1158   if (enableTrainEmissions && !enableTraining){
1159     cerr << "ERROR: Training emissions (-e) requires training (-t)" << endl;
1160     exit (1);
1161   }
1162
1163   return sequenceNames;
1164 }
1165
1166 /////////////////////////////////////////////////////////////////
1167 // ReadParameters()
1168 //
1169 // Read initial distribution, transition, and emission
1170 // parameters from a file.
1171 /////////////////////////////////////////////////////////////////
1172
1173 void ReadParameters (){
1174
1175   ifstream data;
1176
1177   emitPairs = VVF (256, VF (256, 1e-10));
1178   emitSingle = VF (256, 1e-5);
1179
1180   // read initial state distribution and transition parameters
1181   // pass
1182   if (parametersInputFilename == string ("")){
1183     if (NumInsertStates == 1){
1184       for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib1Default[i];
1185       for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen1Default[i];
1186       for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend1Default[i];
1187     }
1188     else if (NumInsertStates == 2){
1189       for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib2Default[i];
1190       for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen2Default[i];
1191       for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend2Default[i];
1192     }
1193     else {
1194       cerr << "ERROR: No default initial distribution/parameter settings exist" << endl
1195            << "       for " << NumInsertStates << " pairs of insert states.  Use --paramfile." << endl;
1196       exit (1);
1197     }
1198
1199     alphabet = alphabetDefault;
1200
1201     for (int i = 0; i < (int) alphabet.length(); i++){
1202       emitSingle[(unsigned char) tolower(alphabet[i])] = emitSingleDefault[i];
1203       emitSingle[(unsigned char) toupper(alphabet[i])] = emitSingleDefault[i];
1204       for (int j = 0; j <= i; j++){
1205         emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
1206         emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
1207         emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
1208         emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
1209         emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
1210         emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
1211         emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
1212         emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
1213       }
1214     }
1215   }
1216   else {
1217     data.open (parametersInputFilename.c_str());
1218     if (data.fail()){
1219       cerr << "ERROR: Unable to read parameter file: " << parametersInputFilename << endl;
1220       exit (1);
1221     }
1222
1223     string line[3];
1224     for (int i = 0; i < 3; i++){
1225       if (!getline (data, line[i])){
1226         cerr << "ERROR: Unable to read transition parameters from parameter file: " << parametersInputFilename << endl;
1227         exit (1);
1228       }
1229     }
1230     istringstream data2;
1231     data2.clear(); data2.str (line[0]); for (int i = 0; i < NumMatrixTypes; i++) data2 >> initDistrib[i];
1232     data2.clear(); data2.str (line[1]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapOpen[i];
1233     data2.clear(); data2.str (line[2]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapExtend[i];
1234
1235     if (!getline (data, line[0])){
1236       cerr << "ERROR: Unable to read alphabet from scoring matrix file: " << parametersInputFilename << endl;
1237       exit (1);
1238     }
1239
1240     // read alphabet as concatenation of all characters on alphabet line
1241     alphabet = "";
1242     string token;
1243     data2.clear(); data2.str (line[0]); while (data2 >> token) alphabet += token;
1244
1245     for (int i = 0; i < (int) alphabet.size(); i++){
1246       for (int j = 0; j <= i; j++){
1247         float val;
1248         data >> val;
1249         emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
1250         emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
1251         emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
1252         emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
1253         emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
1254         emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
1255         emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
1256         emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
1257       }
1258     }
1259
1260     for (int i = 0; i < (int) alphabet.size(); i++){
1261       float val;
1262       data >> val;
1263       emitSingle[(unsigned char) tolower(alphabet[i])] = val;
1264       emitSingle[(unsigned char) toupper(alphabet[i])] = val;
1265     }
1266     data.close();
1267   }
1268 }
1269
1270 /////////////////////////////////////////////////////////////////
1271 // ProcessTree()
1272 //
1273 // Process the tree recursively.  Returns the aligned sequences
1274 // corresponding to a node or leaf of the tree.
1275 /////////////////////////////////////////////////////////////////
1276 float ide;
1277 MultiSequence *ProcessTree (const TreeNode *tree, MultiSequence *sequences,
1278                             const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1279                             const ProbabilisticModel &model, SafeVector<BPPMatrix*> &BPPMatrices) {
1280   MultiSequence *result;
1281
1282   // check if this is a node of the alignment tree
1283   if (tree->GetSequenceLabel() == -1){
1284     MultiSequence *alignLeft = ProcessTree (tree->GetLeftChild(), sequences, sparseMatrices, model, BPPMatrices);
1285     MultiSequence *alignRight = ProcessTree (tree->GetRightChild(), sequences, sparseMatrices, model, BPPMatrices);
1286
1287     assert (alignLeft);
1288     assert (alignRight);
1289
1290     result = AlignAlignments (alignLeft, alignRight, sparseMatrices, model, BPPMatrices, tree->GetIdentity());
1291     assert (result);
1292
1293     delete alignLeft;
1294     delete alignRight;
1295   }
1296
1297   // otherwise, this is a leaf of the alignment tree
1298   else {
1299     result = new MultiSequence(); assert (result);
1300     result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
1301   }
1302
1303   return result;
1304 }
1305
1306 /////////////////////////////////////////////////////////////////
1307 // ComputeFinalAlignment()
1308 //
1309 // Compute the final alignment by calling ProcessTree(), then
1310 // performing iterative refinement as needed.
1311 /////////////////////////////////////////////////////////////////
1312
1313 MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences,
1314                                       const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1315                                       const ProbabilisticModel &model,
1316                                       SafeVector<BPPMatrix*> &BPPMatrices) {
1317
1318   MultiSequence *alignment = ProcessTree (tree, sequences, sparseMatrices, model, BPPMatrices);
1319
1320   if (enableAlignOrder){
1321     alignment->SaveOrdering();
1322     enableAlignOrder = false;
1323   }
1324
1325   // tree-based refinement
1326   // if you use the function, you can degrade the quality of the software.
1327   // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree, BPPMatrices);
1328
1329   // iterative refinement
1330 /*
1331   for (int i = 0; i < numIterativeRefinementReps; i++)
1332       DoIterativeRefinement (sparseMatrices, model, alignment);
1333
1334       cerr << endl;
1335 */
1336   // return final alignment
1337   return alignment;
1338 }
1339
1340 /////////////////////////////////////////////////////////////////
1341 // AlignAlignments()
1342 //
1343 // Returns the alignment of two MultiSequence objects.
1344 /////////////////////////////////////////////////////////////////
1345
1346 MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2,
1347                                 const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1348                                 const ProbabilisticModel &model,
1349                                 SafeVector<BPPMatrix*> &BPPMatrices, float identity){
1350
1351   // print some info about the alignment
1352   if (enableVerbose){
1353     for (int i = 0; i < align1->GetNumSequences(); i++)
1354       cerr << ((i==0) ? "[" : ",") << align1->GetSequence(i)->GetLabel();
1355     cerr << "] vs. ";
1356     for (int i = 0; i < align2->GetNumSequences(); i++)
1357       cerr << ((i==0) ? "[" : ",") << align2->GetSequence(i)->GetLabel();
1358     cerr << "]: ";
1359   }
1360
1361   VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
1362
1363   pair<SafeVector<char> *, float> alignment;
1364   // choose the alignment routine depending on the "cosmetic" gap penalties used
1365   if (gapOpenPenalty == 0 && gapContinuePenalty == 0) {
1366
1367     if(identity < threshhold) {
1368         std::vector<StemCandidate> *pscs1, *pscs2;
1369         pscs1 = seq2scs(align1, BPPMatrices, BandWidth);
1370         pscs2 = seq2scs(align2, BPPMatrices, BandWidth);
1371         std::vector<int> *matchPSCS1 = new std::vector<int>;
1372         std::vector<int> *matchPSCS2 = new std::vector<int>;
1373
1374         Globaldp globaldp(pscs1, pscs2, align1, align2, matchPSCS1, matchPSCS2, posterior, BPPMatrices);
1375         //float scsScore = globaldp.Run();
1376
1377         globaldp.Run();
1378
1379         removeConflicts(pscs1, pscs2, matchPSCS1, matchPSCS2);
1380
1381         alignment = model.ComputeAlignment2 (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior, pscs1, pscs2, matchPSCS1, matchPSCS2);
1382
1383     } else {
1384         alignment = model.ComputeAlignment (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior);
1385     }
1386   }
1387   else {
1388     alignment = model.ComputeAlignmentWithGapPenalties (align1, align2,
1389                                                         *posterior, align1->GetNumSequences(), align2->GetNumSequences(),
1390                                                         gapOpenPenalty, gapContinuePenalty);
1391   }
1392
1393   delete posterior;
1394
1395   if (enableVerbose){
1396
1397     // compute total length of sequences
1398     int totLength = 0;
1399     for (int i = 0; i < align1->GetNumSequences(); i++)
1400       for (int j = 0; j < align2->GetNumSequences(); j++)
1401         totLength += min (align1->GetSequence(i)->GetLength(), align2->GetSequence(j)->GetLength());
1402
1403     // give an "accuracy" measure for the alignment
1404     cerr << alignment.second / totLength << endl;
1405   }
1406
1407   // now build final alignment
1408   MultiSequence *result = new MultiSequence();
1409   for (int i = 0; i < align1->GetNumSequences(); i++)
1410     result->AddSequence (align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
1411   for (int i = 0; i < align2->GetNumSequences(); i++)
1412     result->AddSequence (align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
1413   if (!enableAlignOrder)
1414     result->SortByLabel();
1415
1416   // free temporary alignment
1417   delete alignment.first;
1418
1419   return result;
1420 }
1421
1422 /////////////////////////////////////////////////////////////////
1423 // DoRelaxation()
1424 //
1425 // Performs one round of the consistency transformation.  The
1426 // formula used is:
1427 //                     1
1428 //    P'(x[i]-y[j]) = ---  sum   sum P(x[i]-z[k]) P(z[k]-y[j])
1429 //                    |S| z in S  k
1430 //
1431 // where S = {x, y, all other sequences...}
1432 //
1433 /////////////////////////////////////////////////////////////////
1434
1435 SafeVector<SafeVector<SparseMatrix *> > DoRelaxation (MultiSequence *sequences,
1436                                                       SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1437   const int numSeqs = sequences->GetNumSequences();
1438
1439   SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices (numSeqs, SafeVector<SparseMatrix *>(numSeqs, NULL));
1440
1441   // for every pair of sequences
1442   for (int i = 0; i < numSeqs; i++){
1443     for (int j = i+1; j < numSeqs; j++){
1444       Sequence *seq1 = sequences->GetSequence (i);
1445       Sequence *seq2 = sequences->GetSequence (j);
1446
1447       if (enableVerbose)
1448         cerr << "Relaxing (" << i+1 << ") " << seq1->GetHeader() << " vs. "
1449              << "(" << j+1 << ") " << seq2->GetHeader() << ": ";
1450
1451       // get the original posterior matrix
1452       VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); assert (posteriorPtr);
1453       VF &posterior = *posteriorPtr;
1454
1455       const int seq1Length = seq1->GetLength();
1456       const int seq2Length = seq2->GetLength();
1457
1458       // contribution from the summation where z = x and z = y
1459       for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] += posterior[k];
1460
1461       if (enableVerbose)
1462         cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
1463
1464       // contribution from all other sequences
1465       for (int k = 0; k < numSeqs; k++) if (k != i && k != j){
1466         if (k < i)
1467           Relax1 (sparseMatrices[k][i], sparseMatrices[k][j], posterior);
1468         else if (k > i && k < j)
1469           Relax (sparseMatrices[i][k], sparseMatrices[k][j], posterior);
1470         else {
1471           SparseMatrix *temp = sparseMatrices[j][k]->ComputeTranspose();
1472           Relax (sparseMatrices[i][k], temp, posterior);
1473           delete temp;
1474         }
1475       }
1476
1477       // now renormalization
1478       for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] /= numSeqs;
1479
1480       // mask out positions not originally in the posterior matrix
1481       SparseMatrix *matXY = sparseMatrices[i][j];
1482       for (int y = 0; y <= seq2Length; y++) posterior[y] = 0;
1483       for (int x = 1; x <= seq1Length; x++){
1484         SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
1485         SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
1486         VF::iterator base = posterior.begin() + x * (seq2Length + 1);
1487         int curr = 0;
1488         while (XYptr != XYend){
1489
1490           // zero out all cells until the first filled column
1491           while (curr < XYptr->first){
1492             base[curr] = 0;
1493             curr++;
1494           }
1495
1496           // now, skip over this column
1497           curr++;
1498           ++XYptr;
1499         }
1500
1501         // zero out cells after last column
1502         while (curr <= seq2Length){
1503           base[curr] = 0;
1504           curr++;
1505         }
1506       }
1507
1508       // save the new posterior matrix
1509       newSparseMatrices[i][j] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), posterior);
1510       newSparseMatrices[j][i] = NULL;
1511
1512       if (enableVerbose)
1513         cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
1514
1515       delete posteriorPtr;
1516
1517       if (enableVerbose)
1518         cerr << "done." << endl;
1519     }
1520   }
1521
1522   return newSparseMatrices;
1523 }
1524
1525 /////////////////////////////////////////////////////////////////
1526 // Relax()
1527 //
1528 // Computes the consistency transformation for a single sequence
1529 // z, and adds the transformed matrix to "posterior".
1530 /////////////////////////////////////////////////////////////////
1531
1532 void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior){
1533
1534   assert (matXZ);
1535   assert (matZY);
1536
1537   int lengthX = matXZ->GetSeq1Length();
1538   int lengthY = matZY->GetSeq2Length();
1539   assert (matXZ->GetSeq2Length() == matZY->GetSeq1Length());
1540
1541   // for every x[i]
1542   for (int i = 1; i <= lengthX; i++){
1543     SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
1544     SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
1545
1546     VF::iterator base = posterior.begin() + i * (lengthY + 1);
1547
1548     // iterate through all x[i]-z[k]
1549     while (XZptr != XZend){
1550       SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
1551       SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(XZptr->first);
1552       const float XZval = XZptr->second;
1553
1554       // iterate through all z[k]-y[j]
1555       while (ZYptr != ZYend){
1556         base[ZYptr->first] += XZval * ZYptr->second;
1557        ZYptr++;
1558       }
1559       XZptr++;
1560     }
1561   }
1562 }
1563
1564 /////////////////////////////////////////////////////////////////
1565 // Relax1()
1566 //
1567 // Computes the consistency transformation for a single sequence
1568 // z, and adds the transformed matrix to "posterior".
1569 /////////////////////////////////////////////////////////////////
1570
1571 void Relax1 (SparseMatrix *matZX, SparseMatrix *matZY, VF &posterior){
1572
1573   assert (matZX);
1574   assert (matZY);
1575
1576   int lengthZ = matZX->GetSeq1Length();
1577   int lengthY = matZY->GetSeq2Length();
1578
1579   // for every z[k]
1580   for (int k = 1; k <= lengthZ; k++){
1581     SafeVector<PIF>::iterator ZXptr = matZX->GetRowPtr(k);
1582     SafeVector<PIF>::iterator ZXend = ZXptr + matZX->GetRowSize(k);
1583
1584     // iterate through all z[k]-x[i]
1585     while (ZXptr != ZXend){
1586       SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(k);
1587       SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(k);
1588       const float ZXval = ZXptr->second;
1589       VF::iterator base = posterior.begin() + ZXptr->first * (lengthY + 1);
1590
1591       // iterate through all z[k]-y[j]
1592       while (ZYptr != ZYend){
1593         base[ZYptr->first] += ZXval * ZYptr->second;
1594         ZYptr++;
1595       }
1596       ZXptr++;
1597     }
1598   }
1599 }
1600
1601 void DoBasePairProbabilityRelaxation (MultiSequence *sequences,
1602                                       SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1603                                       SafeVector<BPPMatrix*> &BPPMatrices) {
1604     const int numSeqs = sequences->GetNumSequences();
1605
1606     for (int i = 0; i < numSeqs; i++) {
1607         Sequence *seq1 = sequences->GetSequence (i);
1608         BPPMatrix *seq1BppMatrix = BPPMatrices[seq1->GetLabel()];
1609         Trimat<float> consBppMat(seq1->GetLength() + 1);
1610         int seq1Length = seq1->GetLength();
1611
1612         for (int k = 1; k <= seq1Length; k++) {
1613             for (int l = k; l <= seq1Length; l++) {
1614                 consBppMat.ref(k, l) = seq1BppMatrix->GetProb(k, l);
1615             }
1616         }
1617
1618         for (int j = i + 1; j < numSeqs; j++) {
1619
1620    //           VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior()
1621           Sequence *seq2 = sequences->GetSequence (j);
1622           BPPMatrix *seq2BppMatrix = BPPMatrices[seq2->GetLabel()];
1623 //        int seq2Length = seq2->GetLength();
1624           SparseMatrix *matchProb = sparseMatrices[i][j];
1625
1626 //        vector<PIF2> &probs1 = seq1BppMatrix->bppMat.data2;
1627           for(int k = 1; k <= seq1Length; k++) {
1628               for(int m = k, n = k; n <= k + 200 && m >= 1 && n <= seq1Length; m--, n++) {
1629
1630 //        for (int k = 0; k < (int)probs1.size(); k++) {
1631 //            float tmpProb1 = probs1[k].prob;
1632 //            int   tmp1I    = probs1[k].i;
1633 //            int   tmp1J    = probs1[k].j;
1634
1635                   float sumProb = 0;
1636                   vector<PIF2> &probs2 = seq2BppMatrix->bppMat.data2;
1637                   for(int l = 0; l < (int)probs2.size(); l++) {
1638                       float tmpProb2 = probs2[l].prob;
1639                       int   tmp2I    = probs2[l].i;
1640                       int   tmp2J    = probs2[l].j;
1641                       sumProb += matchProb->GetValue(m, tmp2I)*matchProb->GetValue(n, tmp2J)*tmpProb2;
1642                   }
1643
1644                   consBppMat.ref(m, n) += sumProb;
1645               }
1646
1647               for(int m = k, n = k + 1; n <= k + 200 && m >= 1 && n <= seq1Length; m--, n++) {
1648
1649 //        for (int k = 0; k < (int)probs1.size(); k++) {
1650 //            float tmpProb1 = probs1[k].prob;
1651 //            int   tmp1I    = probs1[k].i;
1652 //            int   tmp1J    = probs1[k].j;
1653
1654                   float sumProb = 0;
1655                   vector<PIF2> &probs2 = seq2BppMatrix->bppMat.data2;
1656                   for(int l = 0; l < (int)probs2.size(); l++) {
1657                       float tmpProb2 = probs2[l].prob;
1658                       int   tmp2I    = probs2[l].i;
1659                       int   tmp2J    = probs2[l].j;
1660                       sumProb += matchProb->GetValue(m, tmp2I)*matchProb->GetValue(n, tmp2J)*tmpProb2;
1661                   }
1662
1663                   consBppMat.ref(m, n) += sumProb;
1664               }
1665           }
1666         }
1667
1668 /*
1669           for(int k = 1; k <= seq1Length; k++) {
1670               for(int m = k, n = k; n <= k + 30 && m >= 1 && n <= seq1Length; m--, n++) {
1671                   float tmpProb = seq1BppMatrix->GetProb(m, n);
1672                   for(int l = 1; l <= seq2Length; l++) {
1673                       for(int s = l, t = l; t <= l + 30 && s >= 1 && t <= seq2Length; s--, t++) {
1674                           tmpProb += matchProb->GetValue(m,s)*matchProb->GetValue(n,t)*seq2BppMatrix->GetProb(s,t);
1675                       }
1676                       for(int s = l, t = l + 1; t <= l + 31 && s >= 1 && t <= seq2Length; s--, t++) {
1677                           tmpProb += matchProb->GetValue(m,s)*matchProb->GetValue(n,t)*seq2BppMatrix->GetProb(s,t);
1678                       }
1679                   }
1680                   consBppMat.ref(m, n) += tmpProb;
1681               }
1682
1683               for(int m = k, n = k + 1; n <= k + 31 && m >= 1 && n <= seq1Length; m--, n++) {
1684                   float tmpProb = seq1BppMatrix->GetProb(m, n);
1685                   for(int l = 1; l <= seq2Length; l++) {
1686                       for(int s = l, t = l; t <= l + 30 && s >= 1 && t <= seq2Length; s--, t++) {
1687                           tmpProb += matchProb->GetValue(m,s)*matchProb->GetValue(n,t)*seq2BppMatrix->GetProb(s,t);
1688                       }
1689                       for(int s = l, t = l + 1; t <= l + 31 && s >= 1 && t <= seq2Length; s--, t++) {
1690                           tmpProb += matchProb->GetValue(m,s)*matchProb->GetValue(n,t)*seq2BppMatrix->GetProb(s,t);
1691                       }
1692                   }
1693                   consBppMat.ref(m,n) += tmpProb;
1694               }
1695           }
1696         }
1697 */
1698         for (int m = 1; m <= seq1Length; m++) {
1699             for (int n = m + 4; n <= seq1Length; n++) {
1700                 consBppMat.ref(m,n) = consBppMat.ref(m,n)/(float)numSeqs;
1701             }
1702         }
1703         seq1BppMatrix->updateBPPMatrix(consBppMat);
1704     }
1705 }
1706
1707 /////////////////////////////////////////////////////////////////
1708 // GetSubtree
1709 //
1710 // Returns set containing all leaf labels of the current subtree.
1711 /////////////////////////////////////////////////////////////////
1712
1713 set<int> GetSubtree (const TreeNode *tree){
1714   set<int> s;
1715
1716   if (tree->GetSequenceLabel() == -1){
1717     s = GetSubtree (tree->GetLeftChild());
1718     set<int> t = GetSubtree (tree->GetRightChild());
1719
1720     for (set<int>::iterator iter = t.begin(); iter != t.end(); ++iter)
1721       s.insert (*iter);
1722   }
1723   else {
1724     s.insert (tree->GetSequenceLabel());
1725   }
1726
1727   return s;
1728 }
1729
1730 /////////////////////////////////////////////////////////////////
1731 // TreeBasedBiPartitioning
1732 //
1733 // Uses the iterative refinement scheme from MUSCLE.
1734 /////////////////////////////////////////////////////////////////
1735
1736 void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1737                               const ProbabilisticModel &model, MultiSequence* &alignment,
1738                               const TreeNode *tree, SafeVector<BPPMatrix*> &BPPMatrices){
1739   // check if this is a node of the alignment tree
1740   if (tree->GetSequenceLabel() == -1){
1741     TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetLeftChild(), BPPMatrices);
1742     TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetRightChild(), BPPMatrices);
1743
1744     set<int> leftSubtree = GetSubtree (tree->GetLeftChild());
1745     set<int> rightSubtree = GetSubtree (tree->GetRightChild());
1746     set<int> leftSubtreeComplement, rightSubtreeComplement;
1747
1748     // calculate complement of each subtree
1749     for (int i = 0; i < alignment->GetNumSequences(); i++){
1750       if (leftSubtree.find(i) == leftSubtree.end()) leftSubtreeComplement.insert (i);
1751       if (rightSubtree.find(i) == rightSubtree.end()) rightSubtreeComplement.insert (i);
1752     }
1753
1754     // perform realignments for edge to left child
1755     if (!leftSubtree.empty() && !leftSubtreeComplement.empty()){
1756       MultiSequence *groupOneSeqs = alignment->Project (leftSubtree); assert (groupOneSeqs);
1757       MultiSequence *groupTwoSeqs = alignment->Project (leftSubtreeComplement); assert (groupTwoSeqs);
1758       delete alignment;
1759       alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model, BPPMatrices, tree->GetLeftChild()->GetIdentity());
1760     }
1761
1762     // perform realignments for edge to right child
1763     if (!rightSubtree.empty() && !rightSubtreeComplement.empty()){
1764       MultiSequence *groupOneSeqs = alignment->Project (rightSubtree); assert (groupOneSeqs);
1765       MultiSequence *groupTwoSeqs = alignment->Project (rightSubtreeComplement); assert (groupTwoSeqs);
1766       delete alignment;
1767       alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model, BPPMatrices, tree->GetRightChild()->GetIdentity());
1768     }
1769   }
1770 }
1771
1772 /////////////////////////////////////////////////////////////////
1773 // DoterativeRefinement()
1774 //
1775 // Performs a single round of randomized partionining iterative
1776 // refinement.
1777 /////////////////////////////////////////////////////////////////
1778 /*
1779 void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1780                             const ProbabilisticModel &model, MultiSequence* &alignment){
1781   set<int> groupOne, groupTwo;
1782
1783   // create two separate groups
1784   for (int i = 0; i < alignment->GetNumSequences(); i++){
1785     if (rand() % 2)
1786       groupOne.insert (i);
1787     else
1788       groupTwo.insert (i);
1789   }
1790
1791   if (groupOne.empty() || groupTwo.empty()) return;
1792
1793   // project into the two groups
1794   MultiSequence *groupOneSeqs = alignment->Project (groupOne); assert (groupOneSeqs);
1795   MultiSequence *groupTwoSeqs = alignment->Project (groupTwo); assert (groupTwoSeqs);
1796   delete alignment;
1797
1798   // realign
1799   alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1800
1801   delete groupOneSeqs;
1802   delete groupTwoSeqs;
1803 }
1804 */
1805
1806 /////////////////////////////////////////////////////////////////
1807 // WriteAnnotation()
1808 //
1809 // Computes annotation for multiple alignment and write values
1810 // to a file.
1811 /////////////////////////////////////////////////////////////////
1812
1813 void WriteAnnotation (MultiSequence *alignment,
1814                       const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1815   ofstream outfile (annotationFilename.c_str());
1816
1817   if (outfile.fail()){
1818     cerr << "ERROR: Unable to write annotation file." << endl;
1819     exit (1);
1820   }
1821
1822   const int alignLength = alignment->GetSequence(0)->GetLength();
1823   const int numSeqs = alignment->GetNumSequences();
1824
1825   SafeVector<int> position (numSeqs, 0);
1826   SafeVector<SafeVector<char>::iterator> seqs (numSeqs);
1827   for (int i = 0; i < numSeqs; i++) seqs[i] = alignment->GetSequence(i)->GetDataPtr();
1828   SafeVector<pair<int,int> > active;
1829   active.reserve (numSeqs);
1830
1831   // for every column
1832   for (int i = 1; i <= alignLength; i++){
1833
1834     // find all aligned residues in this particular column
1835     active.clear();
1836     for (int j = 0; j < numSeqs; j++){
1837       if (seqs[j][i] != '-'){
1838         active.push_back (make_pair(j, ++position[j]));
1839       }
1840     }
1841
1842     outfile << setw(4) << ComputeScore (active, sparseMatrices) << endl;
1843   }
1844
1845   outfile.close();
1846 }
1847
1848 /////////////////////////////////////////////////////////////////
1849 // ComputeScore()
1850 //
1851 // Computes the annotation score for a particular column.
1852 /////////////////////////////////////////////////////////////////
1853
1854 int ComputeScore (const SafeVector<pair<int, int> > &active,
1855                   const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1856
1857   if (active.size() <= 1) return 0;
1858
1859   // ALTERNATIVE #1: Compute the average alignment score.
1860
1861   float val = 0;
1862   for (int i = 0; i < (int) active.size(); i++){
1863     for (int j = i+1; j < (int) active.size(); j++){
1864       val += sparseMatrices[active[i].first][active[j].first]->GetValue(active[i].second, active[j].second);
1865     }
1866   }
1867
1868   return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));
1869
1870 }