4 * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.
7 * @author Mark Larkin, Conway Institute, UCD. mark.larkin@ucd.ie
10 * Mark: 23-01-2007: There was a problem with running an alignment with only
11 * one sequence in it. I needed to make a change to the multiSeqAlign function.
12 ****************************************************************************/
17 #include "MyersMillerProfileAlign.h"
18 //#include "../phylogeneticTree/ClusterTree.h"
19 #include "../general/debuglogObject.h"
32 int MSA::multiSeqAlign(Alignment* alnPtr, DistMatrix* distMat, vector<int>* seqWeight, AlignmentSteps* progSteps, int iStart)
46 vector<int> treeWeight;
47 int i = 0, j = 0, set = 0, iseq = 0;
51 utilityObject->info("Start of Multiple Alignment\n");
53 int _numSeqs = alnPtr->getNumSeqs();
55 vector<int> newOutputIndex(_numSeqs);
57 alnPtr->addSeqWeight(seqWeight);
59 ProfileAlignAlgorithm* alignAlgorithm = new MyersMillerProfileAlign;
60 _numSteps = progSteps->getNumSteps();
61 // for each sequence, find the most closely related sequence
63 maxid = new int[_numSeqs + 1];
65 for (i = 1; i <= _numSeqs; i++)
68 for (j = 1; j <= _numSeqs; j++)
69 if (j != i && maxid[i] < (*distMat)(i, j))
71 maxid[i] = static_cast<int>((*distMat)(i, j));
75 // group the sequences according to their relative divergence
80 // start the multiple alignments.........
82 utilityObject->info("Aligning...");
83 // first pass, align closely related sequences first....
87 aligned = new int[_numSeqs + 1];
89 for (i = 0; i <= _numSeqs; i++)
94 const vector<vector<int> >* ptrToSets = progSteps->getSteps();
97 for (set = 1; set <= _numSteps; ++set)
100 for (i = 1; i <= _numSeqs; i++)
102 if (((*ptrToSets)[set][i] != 0) &&
103 (maxid[i] > userParameters->getDivergenceCutoff()))
108 if (userParameters->getOutputOrder() == INPUT)
111 newOutputIndex[i - 1] = i;
115 if(ix >= (int)newOutputIndex.size())
117 cerr << "ERROR: size = " << newOutputIndex.size()
118 << "ix = " << ix << "\n";
123 newOutputIndex[ix] = i;
136 if(logObject && DEBUGLOG)
138 logObject->logMsg("Doing profile align");
141 score = alignAlgorithm->profileAlign(alnPtr, distMat, progSteps->getStep(set),
150 // negative score means fatal error... exit now!
156 if(userParameters->getDisplayInfo())
158 if ((entries > 0) && (score > 0))
160 utilityObject->info("Group %d: Sequences:%4d Score:%d",
161 set, entries, score);
165 utilityObject->info("Group %d: Delayed", set);
173 aligned = new int[_numSeqs + 1];
175 for (i = 1; i <= iStart + 1; i++)
179 newOutputIndex[i - 1] = i;
181 for (i = iStart + 2; i <= _numSeqs; i++)
187 // second pass - align remaining, more divergent sequences.....
189 // if not all sequences were aligned, for each unaligned sequence,
190 // find it's closest pair amongst the aligned sequences.
192 group.resize(_numSeqs + 1);
193 treeWeight.resize(_numSeqs);
195 for (i = 0; i < _numSeqs; i++)
197 treeWeight[i] = (*seqWeight)[i];
200 // if we haven't aligned any sequences, in the first pass - align the
201 // two most closely related sequences now
207 for (i = 1; i <= _numSeqs; i++)
209 for (j = i + 1; j <= _numSeqs; j++)
211 if (max < (*distMat)(i, j))
213 max = static_cast<int>((*distMat)(i, j)); // Mark change 17-5-07
219 if (userParameters->getOutputOrder() == INPUT)
222 newOutputIndex[iseq - 1] = iseq;
226 newOutputIndex[ix] = iseq;
231 while (ix < _numSeqs)
233 for (i = 1; i <= _numSeqs; i++)
238 for (j = 1; j <= _numSeqs; j++)
239 if ((maxid[i] < (*distMat)(i, j)) && (aligned[j] != 0))
241 maxid[i] = static_cast<int>((*distMat)(i, j));// Mark change 17-5-07
246 // find the most closely related sequence to those already aligned
250 for (i = 1; i <= _numSeqs; i++)
252 if ((aligned[i] == 0) && (maxid[i] > max))
260 // align this sequence to the existing alignment
261 // weight sequences with percent identity with profile
262 // OR...., multiply sequence weights from tree by percent identity with new sequence
263 if (userParameters->getNoWeights() == false)
265 for (j = 0; j < _numSeqs; j++)
266 if (aligned[j + 1] != 0)
268 (*seqWeight)[j] = static_cast<int>(treeWeight[j] * (*distMat)(j + 1, iseq));
271 // Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
274 for (j = 0; j < _numSeqs; j++)
276 if (aligned[j + 1] != 0)
278 sum += (*seqWeight)[j];
284 for (j = 0; j < _numSeqs; j++)
291 for (j = 0; j < _numSeqs; j++)
293 if (aligned[j + 1] != 0)
295 (*seqWeight)[j] = ((*seqWeight)[j] * INT_SCALE_FACTOR) / sum;
296 if ((*seqWeight)[j] < 1)
305 for (j = 1; j <= _numSeqs; j++)
319 alnPtr->addSeqWeight(seqWeight);
322 score = alignAlgorithm->profileAlign(alnPtr, distMat, &group, aligned);
324 if (userParameters->getOutputOrder() == INPUT)
327 newOutputIndex[iseq - 1] = iseq;
331 newOutputIndex[ix] = iseq;
336 alnPtr->addOutputIndex(&newOutputIndex);
338 if(userParameters->getDisplayInfo())
340 int alignmentScore = alnPtr->alignScore(); // ?? check, FS, 2009-05-18
343 delete alignAlgorithm;
358 int MSA::seqsAlignToProfile(Alignment* alnPtr, DistMatrix* distMat, vector<int>* seqWeight, int iStart,
362 vector<int> treeWeight;
368 int i = 0, j = 0, iseq = 0;
369 int sum = 0, entries = 0;
371 int _numSeqs = alnPtr->getNumSeqs();
373 utilityObject->info("Start of Multiple Alignment\n");
375 ProfileAlignAlgorithm* alignAlgorithm = new MyersMillerProfileAlign;
377 // calculate sequence weights according to branch lengths of the tree -
378 // weights in global variable seq_weight normalised to sum to 100
379 vector<int> newOutputIndex(_numSeqs);
380 //groupTree.calcSeqWeights(0, _numSeqs, &seqWeight);
382 treeWeight.resize(_numSeqs);
383 for (i = 0; i < _numSeqs; i++)
385 treeWeight[i] = (*seqWeight)[i];
388 // for each sequence, find the most closely related sequence
390 maxid = new int[_numSeqs + 1];
392 for (i = 1; i <= _numSeqs; i++)
395 for (j = 1; j <= _numSeqs; j++)
397 if (maxid[i] < (*distMat)(i, j))
399 maxid[i] = static_cast<int>((*distMat)(i, j)); // Mark change 17-5-07
404 aligned = new int[_numSeqs + 1];
406 for (i = 1; i <= iStart + 1; i++)
410 newOutputIndex[i - 1] = i;
412 for (i = iStart + 2; i <= _numSeqs; i++)
417 // for each unaligned sequence, find it's closest pair amongst the
418 // aligned sequences.
420 group.resize(_numSeqs + 1);
422 while (ix < _numSeqs)
426 for (i = 1; i <= _numSeqs; i++)
431 for (j = 1; j <= _numSeqs; j++)
433 if ((maxid[i] < (*distMat)(i, j)) && (aligned[j] != 0))
435 maxid[i] = static_cast<int>((*distMat)(i, j));
442 // find the most closely related sequence to those already aligned
445 for (i = 1; i <= _numSeqs; i++)
447 if ((aligned[i] == 0) && (maxid[i] > max))
454 // align this sequence to the existing alignment
456 for (j = 1; j <= _numSeqs; j++)
472 // multiply sequence weights from tree by percent
473 // identity with new sequence
475 for (j = 0; j < _numSeqs; j++)
477 (*seqWeight)[j] = static_cast<int>(treeWeight[j] * (*distMat)(j + 1, iseq));
481 // Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
485 for (j = 0; j < _numSeqs; j++)
487 if (group[j + 1] == 1)
489 sum += (*seqWeight)[j];
495 for (j = 0; j < _numSeqs; j++)
502 for (j = 0; j < _numSeqs; j++)
504 (*seqWeight)[j] = ((*seqWeight)[j] * INT_SCALE_FACTOR) / sum;
505 if ((*seqWeight)[j] < 1)
510 // Add the seqWeights to the Alignment object!!!!
511 alnPtr->addSeqWeight(seqWeight);
513 score = alignAlgorithm->profileAlign(alnPtr, distMat, &group, aligned);
514 utilityObject->info("Sequence:%d Score:%d", iseq, score);
515 if (userParameters->getOutputOrder() == INPUT)
518 newOutputIndex[iseq - 1] = iseq;
522 newOutputIndex[ix] = iseq;
529 delete alignAlgorithm;
530 alnPtr->addOutputIndex(&newOutputIndex);
532 if(userParameters->getDisplayInfo())
534 alnPtr->alignScore();
547 int MSA::calcPairwiseForProfileAlign(Alignment* alnPtr, DistMatrix* distMat)
554 vector<int> seqWeight;
557 int _numSeqs = alnPtr->getNumSeqs();
559 seqWeight.resize(_numSeqs);
560 ProfileAlignAlgorithm* alignAlg = new MyersMillerProfileAlign;
562 utilityObject->info("Start of Initial Alignment");
563 /* calculate sequence weights according to branch lengths of the tree -
564 * weights in global variable seq_weight normalised to sum to INT_SCALE_FACTOR */
566 temp = INT_SCALE_FACTOR / _numSeqs;
567 for (i = 0; i < _numSeqs; i++)
572 userParameters->setDistanceTree(false);
574 /* do the initial alignment......... */
576 group.resize(_numSeqs + 1);
578 for (i = 1; i <= alnPtr->getProfile1NumSeqs(); ++i)
583 for (i = alnPtr->getProfile1NumSeqs() + 1; i <= _numSeqs; ++i)
590 aligned = new int[_numSeqs + 1];
592 for (i = 1; i <= _numSeqs; i++)
597 alnPtr->addSeqWeight(&seqWeight);
599 score = alignAlg->profileAlign(alnPtr, distMat, &group, aligned);
600 utilityObject->info("Sequences:%d Score:%d", entries, score);
603 for (i = 1; i <= _numSeqs; i++)
605 for (j = i + 1; j <= _numSeqs; j++)
607 dscore = alnPtr->countid(i, j);
608 (*distMat)(i, j) = ((double)100.0 - (double)dscore) / (double)100.0;
609 (*distMat)(j, i) = (*distMat)(i, j);
626 int MSA::doProfileAlign(Alignment* alnPtr, DistMatrix* distMat, vector<int>* prof1Weight, vector<int>* prof2Weight)
628 //Tree groupTree1, groupTree2;
629 int i, j, sum, entries;
634 //vector<int> prof1Weight, prof2Weight;
635 int _profile1NumSeqs = alnPtr->getProfile1NumSeqs();
636 int _numSeqs = alnPtr->getNumSeqs();
637 vector<int> _seqWeight, _outputIndex;
639 utilityObject->info("Start of Multiple Alignment\n");
641 _seqWeight.resize(_numSeqs + 1);
642 _outputIndex.resize(_numSeqs);
643 ProfileAlignAlgorithm* alignAlgorithm = new MyersMillerProfileAlign;
645 // weight sequences with max percent identity with other profile
647 maxid = new int[_numSeqs + 1];
648 for (i = 0; i < _profile1NumSeqs; i++)
651 for (j = _profile1NumSeqs + 1; j <= _numSeqs; j++)
653 if (maxid[i] < (*distMat)(i + 1, j))
655 maxid[i] = static_cast<int>((*distMat)(i + 1, j)); // Mark change 17-5-07
658 _seqWeight[i] = maxid[i] * (*prof1Weight)[i];
661 for (i = _profile1NumSeqs; i < _numSeqs; i++)
664 for (j = 1; j <= _profile1NumSeqs; j++)
666 if (maxid[i] < (*distMat)(i + 1, j))
668 maxid[i] = static_cast<int>((*distMat)(i + 1, j));// Mark change 17-5-07
671 _seqWeight[i] = maxid[i] * (*prof2Weight)[i];
674 // Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
678 for (j = 0; j < _numSeqs; j++)
680 sum += _seqWeight[j];
685 for (j = 0; j < _numSeqs; j++)
692 for (j = 0; j < _numSeqs; j++)
694 _seqWeight[j] = (_seqWeight[j] * INT_SCALE_FACTOR) / sum;
695 if (_seqWeight[j] < 1)
701 // do the alignment......... /
703 utilityObject->info("Aligning...");
704 group.resize(_numSeqs + 1);
706 for (i = 1; i <= _profile1NumSeqs; ++i)
711 for (i = _profile1NumSeqs + 1; i <= _numSeqs; ++i)
718 aligned = new int[_numSeqs + 1];
719 for (i = 1; i <= _numSeqs; i++)
724 alnPtr->addSeqWeight(&_seqWeight);
726 score = alignAlgorithm->profileAlign(alnPtr, distMat, &group, aligned);
728 utilityObject->info("Sequences:%d Score:%d", entries, score);
730 for (i = 1; i <= _numSeqs; i++)
732 _outputIndex[i - 1] = i;
735 alnPtr->addOutputIndex(&_outputIndex);
737 delete alignAlgorithm;