/** * Author: Mark Larkin * * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson. */ /** * @author Mark Larkin, Conway Institute, UCD. mark.larkin@ucd.ie * Changes: * * Mark: 23-01-2007: There was a problem with running an alignment with only * one sequence in it. I needed to make a change to the multiSeqAlign function. ****************************************************************************/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "MSA.h" #include "MyersMillerProfileAlign.h" //#include "../phylogeneticTree/ClusterTree.h" #include "../general/debuglogObject.h" namespace clustalw { /** * * @param alnPtr * @param distMat * @param iStart * @param phylipName * @return */ int MSA::multiSeqAlign(Alignment* alnPtr, DistMatrix* distMat, vector* seqWeight, AlignmentSteps* progSteps, int iStart) { if(!progSteps) { return 0; } int* aligned; vector group; int ix; int* maxid; int max = 0, sum = 0; vector treeWeight; int i = 0, j = 0, set = 0, iseq = 0; int entries = 0; int score = 0; int _numSteps = 0; utilityObject->info("Start of Multiple Alignment\n"); int _numSeqs = alnPtr->getNumSeqs(); vector newOutputIndex(_numSeqs); alnPtr->addSeqWeight(seqWeight); ProfileAlignAlgorithm* alignAlgorithm = new MyersMillerProfileAlign; _numSteps = progSteps->getNumSteps(); // for each sequence, find the most closely related sequence maxid = new int[_numSeqs + 1]; for (i = 1; i <= _numSeqs; i++) { maxid[i] = -1; for (j = 1; j <= _numSeqs; j++) if (j != i && maxid[i] < (*distMat)(i, j)) { maxid[i] = static_cast((*distMat)(i, j)); } } // group the sequences according to their relative divergence if (iStart == 0) { // start the multiple alignments......... utilityObject->info("Aligning..."); // first pass, align closely related sequences first.... ix = 0; aligned = new int[_numSeqs + 1]; for (i = 0; i <= _numSeqs; i++) { aligned[i] = 0; } const vector >* ptrToSets = progSteps->getSteps(); for (set = 1; set <= _numSteps; ++set) { entries = 0; for (i = 1; i <= _numSeqs; i++) { if (((*ptrToSets)[set][i] != 0) && (maxid[i] > userParameters->getDivergenceCutoff())) { entries++; if (aligned[i] == 0) { if (userParameters->getOutputOrder() == INPUT) { ++ix; newOutputIndex[i - 1] = i; } else { if(ix >= (int)newOutputIndex.size()) { cerr << "ERROR: size = " << newOutputIndex.size() << "ix = " << ix << "\n"; exit(1); } else { newOutputIndex[ix] = i; ++ix; } } aligned[i] = 1; } } } if (entries > 0) { #if DEBUGFULL if(logObject && DEBUGLOG) { logObject->logMsg("Doing profile align"); } #endif score = alignAlgorithm->profileAlign(alnPtr, distMat, progSteps->getStep(set), aligned); } else { score = 0; } // negative score means fatal error... exit now! if (score < 0) { return (-1); } if(userParameters->getDisplayInfo()) { if ((entries > 0) && (score > 0)) { utilityObject->info("Group %d: Sequences:%4d Score:%d", set, entries, score); } else { utilityObject->info("Group %d: Delayed", set); } } } } else { aligned = new int[_numSeqs + 1]; ix = 0; for (i = 1; i <= iStart + 1; i++) { aligned[i] = 1; ++ix; newOutputIndex[i - 1] = i; } for (i = iStart + 2; i <= _numSeqs; i++) { aligned[i] = 0; } } // second pass - align remaining, more divergent sequences..... // if not all sequences were aligned, for each unaligned sequence, // find it's closest pair amongst the aligned sequences. group.resize(_numSeqs + 1); treeWeight.resize(_numSeqs); for (i = 0; i < _numSeqs; i++) { treeWeight[i] = (*seqWeight)[i]; } // if we haven't aligned any sequences, in the first pass - align the // two most closely related sequences now if (ix == 0) { max = -1; iseq = 0; for (i = 1; i <= _numSeqs; i++) { for (j = i + 1; j <= _numSeqs; j++) { if (max < (*distMat)(i, j)) { max = static_cast((*distMat)(i, j)); // Mark change 17-5-07 iseq = i; } } } aligned[iseq] = 1; if (userParameters->getOutputOrder() == INPUT) { ++ix; newOutputIndex[iseq - 1] = iseq; } else { newOutputIndex[ix] = iseq; ++ix; } } while (ix < _numSeqs) { for (i = 1; i <= _numSeqs; i++) { if (aligned[i] == 0) { maxid[i] = - 1; for (j = 1; j <= _numSeqs; j++) if ((maxid[i] < (*distMat)(i, j)) && (aligned[j] != 0)) { maxid[i] = static_cast((*distMat)(i, j));// Mark change 17-5-07 } } } // find the most closely related sequence to those already aligned max = - 1; iseq = 0; for (i = 1; i <= _numSeqs; i++) { if ((aligned[i] == 0) && (maxid[i] > max)) { max = maxid[i]; iseq = i; } } // align this sequence to the existing alignment // weight sequences with percent identity with profile // OR...., multiply sequence weights from tree by percent identity with new sequence if (userParameters->getNoWeights() == false) { for (j = 0; j < _numSeqs; j++) if (aligned[j + 1] != 0) { (*seqWeight)[j] = static_cast(treeWeight[j] * (*distMat)(j + 1, iseq)); } // Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR sum = 0; for (j = 0; j < _numSeqs; j++) { if (aligned[j + 1] != 0) { sum += (*seqWeight)[j]; } } if (sum == 0) { for (j = 0; j < _numSeqs; j++) { (*seqWeight)[j] = 1; } sum = j; } for (j = 0; j < _numSeqs; j++) { if (aligned[j + 1] != 0) { (*seqWeight)[j] = ((*seqWeight)[j] * INT_SCALE_FACTOR) / sum; if ((*seqWeight)[j] < 1) { (*seqWeight)[j] = 1; } } } } entries = 0; for (j = 1; j <= _numSeqs; j++) { if (aligned[j] != 0) { group[j] = 1; entries++; } else if (iseq == j) { group[j] = 2; entries++; } } alnPtr->addSeqWeight(seqWeight); aligned[iseq] = 1; score = alignAlgorithm->profileAlign(alnPtr, distMat, &group, aligned); if (userParameters->getOutputOrder() == INPUT) { ++ix; newOutputIndex[iseq - 1] = iseq; } else { newOutputIndex[ix] = iseq; ++ix; } } alnPtr->addOutputIndex(&newOutputIndex); if(userParameters->getDisplayInfo()) { int alignmentScore = alnPtr->alignScore(); // ?? check, FS, 2009-05-18 } delete alignAlgorithm; delete [] aligned; delete [] maxid; return (_numSeqs); } /** * * @param alnPtr * @param distMat * @param iStart * @param phylipName * @return */ int MSA::seqsAlignToProfile(Alignment* alnPtr, DistMatrix* distMat, vector* seqWeight, int iStart, string phylipName) { int *aligned; vector treeWeight; vector group; int ix; int *maxid; int max = 0; int i = 0, j = 0, iseq = 0; int sum = 0, entries = 0; int score = 0; int _numSeqs = alnPtr->getNumSeqs(); utilityObject->info("Start of Multiple Alignment\n"); ProfileAlignAlgorithm* alignAlgorithm = new MyersMillerProfileAlign; // calculate sequence weights according to branch lengths of the tree - // weights in global variable seq_weight normalised to sum to 100 vector newOutputIndex(_numSeqs); //groupTree.calcSeqWeights(0, _numSeqs, &seqWeight); treeWeight.resize(_numSeqs); for (i = 0; i < _numSeqs; i++) { treeWeight[i] = (*seqWeight)[i]; } // for each sequence, find the most closely related sequence maxid = new int[_numSeqs + 1]; for (i = 1; i <= _numSeqs; i++) { maxid[i] = - 1; for (j = 1; j <= _numSeqs; j++) { if (maxid[i] < (*distMat)(i, j)) { maxid[i] = static_cast((*distMat)(i, j)); // Mark change 17-5-07 } } } aligned = new int[_numSeqs + 1]; ix = 0; for (i = 1; i <= iStart + 1; i++) { aligned[i] = 1; ++ix; newOutputIndex[i - 1] = i; } for (i = iStart + 2; i <= _numSeqs; i++) { aligned[i] = 0; } // for each unaligned sequence, find it's closest pair amongst the // aligned sequences. group.resize(_numSeqs + 1); while (ix < _numSeqs) { if (ix > 0) { for (i = 1; i <= _numSeqs; i++) { if (aligned[i] == 0) { maxid[i] = - 1; for (j = 1; j <= _numSeqs; j++) { if ((maxid[i] < (*distMat)(i, j)) && (aligned[j] != 0)) { maxid[i] = static_cast((*distMat)(i, j)); } } } } } // find the most closely related sequence to those already aligned max = -1; for (i = 1; i <= _numSeqs; i++) { if ((aligned[i] == 0) && (maxid[i] > max)) { max = maxid[i]; iseq = i; } } // align this sequence to the existing alignment entries = 0; for (j = 1; j <= _numSeqs; j++) { if (aligned[j] != 0) { group[j] = 1; entries++; } else if (iseq == j) { group[j] = 2; entries++; } } aligned[iseq] = 1; // multiply sequence weights from tree by percent // identity with new sequence for (j = 0; j < _numSeqs; j++) { (*seqWeight)[j] = static_cast(treeWeight[j] * (*distMat)(j + 1, iseq)); } // // Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR // sum = 0; for (j = 0; j < _numSeqs; j++) { if (group[j + 1] == 1) { sum += (*seqWeight)[j]; } } if (sum == 0) { for (j = 0; j < _numSeqs; j++) { (*seqWeight)[j] = 1; } sum = j; } for (j = 0; j < _numSeqs; j++) { (*seqWeight)[j] = ((*seqWeight)[j] * INT_SCALE_FACTOR) / sum; if ((*seqWeight)[j] < 1) { (*seqWeight)[j] = 1; } } // Add the seqWeights to the Alignment object!!!! alnPtr->addSeqWeight(seqWeight); score = alignAlgorithm->profileAlign(alnPtr, distMat, &group, aligned); utilityObject->info("Sequence:%d Score:%d", iseq, score); if (userParameters->getOutputOrder() == INPUT) { ++ix; newOutputIndex[iseq - 1] = iseq; } else { newOutputIndex[ix] = iseq; ++ix; } } delete [] aligned; delete [] maxid; delete alignAlgorithm; alnPtr->addOutputIndex(&newOutputIndex); if(userParameters->getDisplayInfo()) { alnPtr->alignScore(); } return (_numSeqs); } /** * * @param alnPtr * @param distMat * @return */ int MSA::calcPairwiseForProfileAlign(Alignment* alnPtr, DistMatrix* distMat) { //Tree groupTree; int i, j, temp; int entries; int* aligned; vector group; vector seqWeight; float dscore; int score; int _numSeqs = alnPtr->getNumSeqs(); seqWeight.resize(_numSeqs); ProfileAlignAlgorithm* alignAlg = new MyersMillerProfileAlign; utilityObject->info("Start of Initial Alignment"); /* calculate sequence weights according to branch lengths of the tree - * weights in global variable seq_weight normalised to sum to INT_SCALE_FACTOR */ temp = INT_SCALE_FACTOR / _numSeqs; for (i = 0; i < _numSeqs; i++) { seqWeight[i] = temp; } userParameters->setDistanceTree(false); /* do the initial alignment......... */ group.resize(_numSeqs + 1); for (i = 1; i <= alnPtr->getProfile1NumSeqs(); ++i) { group[i] = 1; } for (i = alnPtr->getProfile1NumSeqs() + 1; i <= _numSeqs; ++i) { group[i] = 2; } entries = _numSeqs; aligned = new int[_numSeqs + 1]; for (i = 1; i <= _numSeqs; i++) { aligned[i] = 1; } alnPtr->addSeqWeight(&seqWeight); score = alignAlg->profileAlign(alnPtr, distMat, &group, aligned); utilityObject->info("Sequences:%d Score:%d", entries, score); delete [] aligned; for (i = 1; i <= _numSeqs; i++) { for (j = i + 1; j <= _numSeqs; j++) { dscore = alnPtr->countid(i, j); (*distMat)(i, j) = ((double)100.0 - (double)dscore) / (double)100.0; (*distMat)(j, i) = (*distMat)(i, j); } } delete alignAlg; return (_numSeqs); } /** * * @param alnPtr * @param distMat * @param p1TreeName * @param p2TreeName * @return */ int MSA::doProfileAlign(Alignment* alnPtr, DistMatrix* distMat, vector* prof1Weight, vector* prof2Weight) { //Tree groupTree1, groupTree2; int i, j, sum, entries; int score; int *aligned; vector group; int *maxid; //vector prof1Weight, prof2Weight; int _profile1NumSeqs = alnPtr->getProfile1NumSeqs(); int _numSeqs = alnPtr->getNumSeqs(); vector _seqWeight, _outputIndex; utilityObject->info("Start of Multiple Alignment\n"); _seqWeight.resize(_numSeqs + 1); _outputIndex.resize(_numSeqs); ProfileAlignAlgorithm* alignAlgorithm = new MyersMillerProfileAlign; // weight sequences with max percent identity with other profile maxid = new int[_numSeqs + 1]; for (i = 0; i < _profile1NumSeqs; i++) { maxid[i] = 0; for (j = _profile1NumSeqs + 1; j <= _numSeqs; j++) { if (maxid[i] < (*distMat)(i + 1, j)) { maxid[i] = static_cast((*distMat)(i + 1, j)); // Mark change 17-5-07 } } _seqWeight[i] = maxid[i] * (*prof1Weight)[i]; } for (i = _profile1NumSeqs; i < _numSeqs; i++) { maxid[i] = - 1; for (j = 1; j <= _profile1NumSeqs; j++) { if (maxid[i] < (*distMat)(i + 1, j)) { maxid[i] = static_cast((*distMat)(i + 1, j));// Mark change 17-5-07 } } _seqWeight[i] = maxid[i] * (*prof2Weight)[i]; } // // Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR // sum = 0; for (j = 0; j < _numSeqs; j++) { sum += _seqWeight[j]; } if (sum == 0) { for (j = 0; j < _numSeqs; j++) { _seqWeight[j] = 1; } sum = j; } for (j = 0; j < _numSeqs; j++) { _seqWeight[j] = (_seqWeight[j] * INT_SCALE_FACTOR) / sum; if (_seqWeight[j] < 1) { _seqWeight[j] = 1; } } // do the alignment......... / utilityObject->info("Aligning..."); group.resize(_numSeqs + 1); for (i = 1; i <= _profile1NumSeqs; ++i) { group[i] = 1; } for (i = _profile1NumSeqs + 1; i <= _numSeqs; ++i) { group[i] = 2; } entries = _numSeqs; aligned = new int[_numSeqs + 1]; for (i = 1; i <= _numSeqs; i++) { aligned[i] = 1; } alnPtr->addSeqWeight(&_seqWeight); score = alignAlgorithm->profileAlign(alnPtr, distMat, &group, aligned); utilityObject->info("Sequences:%d Score:%d", entries, score); for (i = 1; i <= _numSeqs; i++) { _outputIndex[i - 1] = i; } alnPtr->addOutputIndex(&_outputIndex); delete alignAlgorithm; delete [] aligned; delete [] maxid; return (_numSeqs); return 1; } }