+++ /dev/null
-/**
- * Author: Mark Larkin
- *
- * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.
- */
-/**
- * @author Mark Larkin, Conway Institute, UCD. mark.larkin@ucd.ie
- * Changes:
- *
- * Mark: 23-01-2007: There was a problem with running an alignment with only
- * one sequence in it. I needed to make a change to the multiSeqAlign function.
- ****************************************************************************/
-#ifdef HAVE_CONFIG_H
- #include "config.h"
-#endif
-#include "MSA.h"
-#include "MyersMillerProfileAlign.h"
-//#include "../phylogeneticTree/ClusterTree.h"
-#include "../general/debuglogObject.h"
-
-namespace clustalw
-{
-
-/**
- *
- * @param alnPtr
- * @param distMat
- * @param iStart
- * @param phylipName
- * @return
- */
-int MSA::multiSeqAlign(Alignment* alnPtr, DistMatrix* distMat, vector<int>* seqWeight, AlignmentSteps* progSteps, int iStart)
-{
-
- if(!progSteps)
- {
- return 0;
- }
-
- int* aligned;
- vector<int> group;
- int ix;
-
- int* maxid;
- int max = 0, sum = 0;
- vector<int> treeWeight;
- int i = 0, j = 0, set = 0, iseq = 0;
- int entries = 0;
- int score = 0;
- int _numSteps = 0;
- utilityObject->info("Start of Multiple Alignment\n");
-
- int _numSeqs = alnPtr->getNumSeqs();
-
- vector<int> newOutputIndex(_numSeqs);
-
- alnPtr->addSeqWeight(seqWeight);
-
- ProfileAlignAlgorithm* alignAlgorithm = new MyersMillerProfileAlign;
- _numSteps = progSteps->getNumSteps();
- // for each sequence, find the most closely related sequence
-
- maxid = new int[_numSeqs + 1];
-
- for (i = 1; i <= _numSeqs; i++)
- {
- maxid[i] = -1;
- for (j = 1; j <= _numSeqs; j++)
- if (j != i && maxid[i] < (*distMat)(i, j))
- {
- maxid[i] = static_cast<int>((*distMat)(i, j));
- }
- }
-
- // group the sequences according to their relative divergence
-
- if (iStart == 0)
- {
-
- // start the multiple alignments.........
-
- utilityObject->info("Aligning...");
- // first pass, align closely related sequences first....
-
-
- ix = 0;
- aligned = new int[_numSeqs + 1];
-
- for (i = 0; i <= _numSeqs; i++)
- {
- aligned[i] = 0;
- }
-
- const vector<vector<int> >* ptrToSets = progSteps->getSteps();
-
-
- for (set = 1; set <= _numSteps; ++set)
- {
- entries = 0;
- for (i = 1; i <= _numSeqs; i++)
- {
- if (((*ptrToSets)[set][i] != 0) &&
- (maxid[i] > userParameters->getDivergenceCutoff()))
- {
- entries++;
- if (aligned[i] == 0)
- {
- if (userParameters->getOutputOrder() == INPUT)
- {
- ++ix;
- newOutputIndex[i - 1] = i;
- }
- else
- {
- if(ix >= (int)newOutputIndex.size())
- {
- cerr << "ERROR: size = " << newOutputIndex.size()
- << "ix = " << ix << "\n";
- exit(1);
- }
- else
- {
- newOutputIndex[ix] = i;
- ++ix;
- }
- }
-
- aligned[i] = 1;
- }
- }
- }
-
- if (entries > 0)
- {
- #if DEBUGFULL
- if(logObject && DEBUGLOG)
- {
- logObject->logMsg("Doing profile align");
- }
- #endif
- score = alignAlgorithm->profileAlign(alnPtr, distMat, progSteps->getStep(set),
- aligned);
- }
- else
- {
- score = 0;
- }
-
-
- // negative score means fatal error... exit now!
-
- if (score < 0)
- {
- return (-1);
- }
- if(userParameters->getDisplayInfo())
- {
- if ((entries > 0) && (score > 0))
- {
- utilityObject->info("Group %d: Sequences:%4d Score:%d",
- set, entries, score);
- }
- else
- {
- utilityObject->info("Group %d: Delayed", set);
- }
- }
- }
- }
-
- else
- {
- aligned = new int[_numSeqs + 1];
- ix = 0;
- for (i = 1; i <= iStart + 1; i++)
- {
- aligned[i] = 1;
- ++ix;
- newOutputIndex[i - 1] = i;
- }
- for (i = iStart + 2; i <= _numSeqs; i++)
- {
- aligned[i] = 0;
- }
- }
-
- // second pass - align remaining, more divergent sequences.....
-
- // if not all sequences were aligned, for each unaligned sequence,
- // find it's closest pair amongst the aligned sequences.
-
- group.resize(_numSeqs + 1);
- treeWeight.resize(_numSeqs);
-
- for (i = 0; i < _numSeqs; i++)
- {
- treeWeight[i] = (*seqWeight)[i];
- }
-
- // if we haven't aligned any sequences, in the first pass - align the
- // two most closely related sequences now
- if (ix == 0)
- {
- max = -1;
- iseq = 0;
-
- for (i = 1; i <= _numSeqs; i++)
- {
- for (j = i + 1; j <= _numSeqs; j++)
- {
- if (max < (*distMat)(i, j))
- {
- max = static_cast<int>((*distMat)(i, j)); // Mark change 17-5-07
- iseq = i;
- }
- }
- }
- aligned[iseq] = 1;
- if (userParameters->getOutputOrder() == INPUT)
- {
- ++ix;
- newOutputIndex[iseq - 1] = iseq;
- }
- else
- {
- newOutputIndex[ix] = iseq;
- ++ix;
- }
- }
-
- while (ix < _numSeqs)
- {
- for (i = 1; i <= _numSeqs; i++)
- {
- if (aligned[i] == 0)
- {
- maxid[i] = - 1;
- for (j = 1; j <= _numSeqs; j++)
- if ((maxid[i] < (*distMat)(i, j)) && (aligned[j] != 0))
- {
- maxid[i] = static_cast<int>((*distMat)(i, j));// Mark change 17-5-07
- }
-
- }
- }
- // find the most closely related sequence to those already aligned
-
- max = - 1;
- iseq = 0;
- for (i = 1; i <= _numSeqs; i++)
- {
- if ((aligned[i] == 0) && (maxid[i] > max))
- {
- max = maxid[i];
- iseq = i;
- }
- }
-
-
- // align this sequence to the existing alignment
- // weight sequences with percent identity with profile
- // OR...., multiply sequence weights from tree by percent identity with new sequence
- if (userParameters->getNoWeights() == false)
- {
- for (j = 0; j < _numSeqs; j++)
- if (aligned[j + 1] != 0)
- {
- (*seqWeight)[j] = static_cast<int>(treeWeight[j] * (*distMat)(j + 1, iseq));
- }
-
- // Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
-
- sum = 0;
- for (j = 0; j < _numSeqs; j++)
- {
- if (aligned[j + 1] != 0)
- {
- sum += (*seqWeight)[j];
- }
- }
-
- if (sum == 0)
- {
- for (j = 0; j < _numSeqs; j++)
- {
- (*seqWeight)[j] = 1;
- }
-
- sum = j;
- }
- for (j = 0; j < _numSeqs; j++)
- {
- if (aligned[j + 1] != 0)
- {
- (*seqWeight)[j] = ((*seqWeight)[j] * INT_SCALE_FACTOR) / sum;
- if ((*seqWeight)[j] < 1)
- {
- (*seqWeight)[j] = 1;
- }
- }
- }
- }
-
- entries = 0;
- for (j = 1; j <= _numSeqs; j++)
- {
- if (aligned[j] != 0)
- {
- group[j] = 1;
- entries++;
- }
- else if (iseq == j)
- {
- group[j] = 2;
- entries++;
- }
- }
-
- alnPtr->addSeqWeight(seqWeight);
- aligned[iseq] = 1;
-
- score = alignAlgorithm->profileAlign(alnPtr, distMat, &group, aligned);
-
- if (userParameters->getOutputOrder() == INPUT)
- {
- ++ix;
- newOutputIndex[iseq - 1] = iseq;
- }
- else
- {
- newOutputIndex[ix] = iseq;
- ++ix;
- }
- }
-
- alnPtr->addOutputIndex(&newOutputIndex);
-
- if(userParameters->getDisplayInfo())
- {
- int alignmentScore = alnPtr->alignScore(); // ?? check, FS, 2009-05-18
- }
-
- delete alignAlgorithm;
- delete [] aligned;
- delete [] maxid;
- return (_numSeqs);
-}
-
-
-/**
- *
- * @param alnPtr
- * @param distMat
- * @param iStart
- * @param phylipName
- * @return
- */
-int MSA::seqsAlignToProfile(Alignment* alnPtr, DistMatrix* distMat, vector<int>* seqWeight, int iStart,
- string phylipName)
-{
- int *aligned;
- vector<int> treeWeight;
- vector<int> group;
- int ix;
-
- int *maxid;
- int max = 0;
- int i = 0, j = 0, iseq = 0;
- int sum = 0, entries = 0;
- int score = 0;
- int _numSeqs = alnPtr->getNumSeqs();
-
- utilityObject->info("Start of Multiple Alignment\n");
-
- ProfileAlignAlgorithm* alignAlgorithm = new MyersMillerProfileAlign;
-
- // calculate sequence weights according to branch lengths of the tree -
- // weights in global variable seq_weight normalised to sum to 100
- vector<int> newOutputIndex(_numSeqs);
- //groupTree.calcSeqWeights(0, _numSeqs, &seqWeight);
-
- treeWeight.resize(_numSeqs);
- for (i = 0; i < _numSeqs; i++)
- {
- treeWeight[i] = (*seqWeight)[i];
- }
-
- // for each sequence, find the most closely related sequence
-
- maxid = new int[_numSeqs + 1];
-
- for (i = 1; i <= _numSeqs; i++)
- {
- maxid[i] = - 1;
- for (j = 1; j <= _numSeqs; j++)
- {
- if (maxid[i] < (*distMat)(i, j))
- {
- maxid[i] = static_cast<int>((*distMat)(i, j)); // Mark change 17-5-07
- }
- }
- }
-
- aligned = new int[_numSeqs + 1];
- ix = 0;
- for (i = 1; i <= iStart + 1; i++)
- {
- aligned[i] = 1;
- ++ix;
- newOutputIndex[i - 1] = i;
- }
- for (i = iStart + 2; i <= _numSeqs; i++)
- {
- aligned[i] = 0;
- }
-
- // for each unaligned sequence, find it's closest pair amongst the
- // aligned sequences.
-
- group.resize(_numSeqs + 1);
-
- while (ix < _numSeqs)
- {
- if (ix > 0)
- {
- for (i = 1; i <= _numSeqs; i++)
- {
- if (aligned[i] == 0)
- {
- maxid[i] = - 1;
- for (j = 1; j <= _numSeqs; j++)
- {
- if ((maxid[i] < (*distMat)(i, j)) && (aligned[j] != 0))
- {
- maxid[i] = static_cast<int>((*distMat)(i, j));
- }
- }
- }
- }
- }
-
- // find the most closely related sequence to those already aligned
-
- max = -1;
- for (i = 1; i <= _numSeqs; i++)
- {
- if ((aligned[i] == 0) && (maxid[i] > max))
- {
- max = maxid[i];
- iseq = i;
- }
- }
-
- // align this sequence to the existing alignment
- entries = 0;
- for (j = 1; j <= _numSeqs; j++)
- {
- if (aligned[j] != 0)
- {
- group[j] = 1;
- entries++;
- }
- else if (iseq == j)
- {
- group[j] = 2;
- entries++;
- }
- }
-
- aligned[iseq] = 1;
-
- // multiply sequence weights from tree by percent
- // identity with new sequence
-
- for (j = 0; j < _numSeqs; j++)
- {
- (*seqWeight)[j] = static_cast<int>(treeWeight[j] * (*distMat)(j + 1, iseq));
- }
-
- //
- // Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
- //
-
- sum = 0;
- for (j = 0; j < _numSeqs; j++)
- {
- if (group[j + 1] == 1)
- {
- sum += (*seqWeight)[j];
- }
- }
-
- if (sum == 0)
- {
- for (j = 0; j < _numSeqs; j++)
- {
- (*seqWeight)[j] = 1;
- }
-
- sum = j;
- }
- for (j = 0; j < _numSeqs; j++)
- {
- (*seqWeight)[j] = ((*seqWeight)[j] * INT_SCALE_FACTOR) / sum;
- if ((*seqWeight)[j] < 1)
- {
- (*seqWeight)[j] = 1;
- }
- }
- // Add the seqWeights to the Alignment object!!!!
- alnPtr->addSeqWeight(seqWeight);
-
- score = alignAlgorithm->profileAlign(alnPtr, distMat, &group, aligned);
- utilityObject->info("Sequence:%d Score:%d", iseq, score);
- if (userParameters->getOutputOrder() == INPUT)
- {
- ++ix;
- newOutputIndex[iseq - 1] = iseq;
- }
- else
- {
- newOutputIndex[ix] = iseq;
- ++ix;
- }
- }
-
- delete [] aligned;
- delete [] maxid;
- delete alignAlgorithm;
- alnPtr->addOutputIndex(&newOutputIndex);
-
- if(userParameters->getDisplayInfo())
- {
- alnPtr->alignScore();
- }
-
- return (_numSeqs);
-}
-
-
-/**
- *
- * @param alnPtr
- * @param distMat
- * @return
- */
-int MSA::calcPairwiseForProfileAlign(Alignment* alnPtr, DistMatrix* distMat)
-{
- //Tree groupTree;
- int i, j, temp;
- int entries;
- int* aligned;
- vector<int> group;
- vector<int> seqWeight;
- float dscore;
- int score;
- int _numSeqs = alnPtr->getNumSeqs();
-
- seqWeight.resize(_numSeqs);
- ProfileAlignAlgorithm* alignAlg = new MyersMillerProfileAlign;
-
- utilityObject->info("Start of Initial Alignment");
- /* calculate sequence weights according to branch lengths of the tree -
- * weights in global variable seq_weight normalised to sum to INT_SCALE_FACTOR */
-
- temp = INT_SCALE_FACTOR / _numSeqs;
- for (i = 0; i < _numSeqs; i++)
- {
- seqWeight[i] = temp;
- }
-
- userParameters->setDistanceTree(false);
-
- /* do the initial alignment......... */
-
- group.resize(_numSeqs + 1);
-
- for (i = 1; i <= alnPtr->getProfile1NumSeqs(); ++i)
- {
- group[i] = 1;
- }
-
- for (i = alnPtr->getProfile1NumSeqs() + 1; i <= _numSeqs; ++i)
- {
- group[i] = 2;
- }
-
- entries = _numSeqs;
-
- aligned = new int[_numSeqs + 1];
-
- for (i = 1; i <= _numSeqs; i++)
- {
- aligned[i] = 1;
- }
-
- alnPtr->addSeqWeight(&seqWeight);
-
- score = alignAlg->profileAlign(alnPtr, distMat, &group, aligned);
- utilityObject->info("Sequences:%d Score:%d", entries, score);
- delete [] aligned;
-
- for (i = 1; i <= _numSeqs; i++)
- {
- for (j = i + 1; j <= _numSeqs; j++)
- {
- dscore = alnPtr->countid(i, j);
- (*distMat)(i, j) = ((double)100.0 - (double)dscore) / (double)100.0;
- (*distMat)(j, i) = (*distMat)(i, j);
- }
- }
- delete alignAlg;
- return (_numSeqs);
-
-}
-
-
-/**
- *
- * @param alnPtr
- * @param distMat
- * @param p1TreeName
- * @param p2TreeName
- * @return
- */
-int MSA::doProfileAlign(Alignment* alnPtr, DistMatrix* distMat, vector<int>* prof1Weight, vector<int>* prof2Weight)
-{
- //Tree groupTree1, groupTree2;
- int i, j, sum, entries;
- int score;
- int *aligned;
- vector<int> group;
- int *maxid;
- //vector<int> prof1Weight, prof2Weight;
- int _profile1NumSeqs = alnPtr->getProfile1NumSeqs();
- int _numSeqs = alnPtr->getNumSeqs();
- vector<int> _seqWeight, _outputIndex;
-
- utilityObject->info("Start of Multiple Alignment\n");
-
- _seqWeight.resize(_numSeqs + 1);
- _outputIndex.resize(_numSeqs);
- ProfileAlignAlgorithm* alignAlgorithm = new MyersMillerProfileAlign;
-
- // weight sequences with max percent identity with other profile
-
- maxid = new int[_numSeqs + 1];
- for (i = 0; i < _profile1NumSeqs; i++)
- {
- maxid[i] = 0;
- for (j = _profile1NumSeqs + 1; j <= _numSeqs; j++)
- {
- if (maxid[i] < (*distMat)(i + 1, j))
- {
- maxid[i] = static_cast<int>((*distMat)(i + 1, j)); // Mark change 17-5-07
- }
- }
- _seqWeight[i] = maxid[i] * (*prof1Weight)[i];
- }
-
- for (i = _profile1NumSeqs; i < _numSeqs; i++)
- {
- maxid[i] = - 1;
- for (j = 1; j <= _profile1NumSeqs; j++)
- {
- if (maxid[i] < (*distMat)(i + 1, j))
- {
- maxid[i] = static_cast<int>((*distMat)(i + 1, j));// Mark change 17-5-07
- }
- }
- _seqWeight[i] = maxid[i] * (*prof2Weight)[i];
- }
- //
- // Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
- //
-
- sum = 0;
- for (j = 0; j < _numSeqs; j++)
- {
- sum += _seqWeight[j];
- }
-
- if (sum == 0)
- {
- for (j = 0; j < _numSeqs; j++)
- {
- _seqWeight[j] = 1;
- }
-
- sum = j;
- }
- for (j = 0; j < _numSeqs; j++)
- {
- _seqWeight[j] = (_seqWeight[j] * INT_SCALE_FACTOR) / sum;
- if (_seqWeight[j] < 1)
- {
- _seqWeight[j] = 1;
- }
- }
-
- // do the alignment......... /
-
- utilityObject->info("Aligning...");
- group.resize(_numSeqs + 1);
-
- for (i = 1; i <= _profile1NumSeqs; ++i)
- {
- group[i] = 1;
- }
-
- for (i = _profile1NumSeqs + 1; i <= _numSeqs; ++i)
- {
- group[i] = 2;
- }
-
- entries = _numSeqs;
-
- aligned = new int[_numSeqs + 1];
- for (i = 1; i <= _numSeqs; i++)
- {
- aligned[i] = 1;
- }
-
- alnPtr->addSeqWeight(&_seqWeight);
-
- score = alignAlgorithm->profileAlign(alnPtr, distMat, &group, aligned);
-
- utilityObject->info("Sequences:%d Score:%d", entries, score);
-
- for (i = 1; i <= _numSeqs; i++)
- {
- _outputIndex[i - 1] = i;
- }
-
- alnPtr->addOutputIndex(&_outputIndex);
-
- delete alignAlgorithm;
- delete [] aligned;
- delete [] maxid;
- return (_numSeqs);
- return 1;
-}
-
-}