/** * Author: Mark Larkin * * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson. * * Changes: * Mark 30-5-2007: Changed iterationOnTreeNode function as it was adding in extra gaps * at the end of an alignment. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "Iteration.h" #include "../alignment/ObjectiveScore.h" #include "../general/utils.h" #include "../general/userparams.h" #include "../tree/TreeInterface.h" #include "../clustalw_version.h" #include "MSA.h" namespace clustalw { bool Iteration::iterationOnTreeNode(int numSeqsProf1, int numSeqsProf2, int& prfLength1, int& prfLength2, SeqArray* seqArray) { Alignment alignmentToIterate; int numSeqsInProfiles = numSeqsProf1 + numSeqsProf2; if(numSeqsInProfiles <= 2) { return false; } SeqArray profileSeqs; profileSeqs.resize(numSeqsInProfiles + 1); // Copy the SeqArray! for (int j = 0; ((j < numSeqsProf1 + numSeqsProf2) && (j < (int)seqArray->size())); j++) { profileSeqs[j + 1].clear(); profileSeqs[j + 1].resize(prfLength1 + 1); for (int i = 0; i < prfLength1 && i < (int)(*seqArray)[j].size(); i++) { profileSeqs[j + 1][i + 1] = (*seqArray)[j][i]; } } alignmentToIterate.addSequences(&profileSeqs); //userParameters->setNumIterations(numSeqsInProfiles * 2); bool changed = false; changed = removeFirstIterate(&alignmentToIterate); if(changed) { SeqArray* iteratedSeqs = alignmentToIterate.getSeqArrayForRealloc(); string aaCodes = userParameters->getAminoAcidCodes(); int newPrf1Length = 0, newPrf2Length = 0; for (int j = 0; j < numSeqsProf1 + numSeqsProf2; j++) { if(j < numSeqsProf1) { if(alignmentToIterate.getSeqLength(j + 1) > newPrf1Length) { newPrf1Length = alignmentToIterate.getSeqLength(j + 1); } } else if(j < numSeqsProf1 + numSeqsProf2) { if(alignmentToIterate.getSeqLength(j + 1) > newPrf2Length) { newPrf2Length = alignmentToIterate.getSeqLength(j + 1); } } } prfLength1 = newPrf1Length; // mark 30-5-2007 prfLength2 = newPrf2Length; // mark 30-5-2007 for (int j = 0; j < numSeqsProf1 + numSeqsProf2; j++) { // I need to recalculate the prfLength1 and prfLength2 (*seqArray)[j].clear(); (*seqArray)[j].assign((*iteratedSeqs)[j + 1].begin() + 1, (*iteratedSeqs)[j + 1].end()); (*seqArray)[j].resize(prfLength1 + extraEndElemNum, 31); (*seqArray)[j][prfLength1] = ENDALN; } } return true; } void Iteration::printSeqArray(SeqArray* arrayToPrint) { cout << "HERE IS THE SEQARRAY\n"; // I need to use iterators for everything here. SeqArray::iterator mainBeginIt = arrayToPrint->begin(); SeqArray::iterator mainEndIt = arrayToPrint->end(); vector::iterator begin, end; string aaCodes = userParameters->getAminoAcidCodes(); for(; mainBeginIt != mainEndIt; mainBeginIt++) { if(mainBeginIt->size() > 0) { begin = mainBeginIt->begin() + 1; end = mainBeginIt->end(); for(; begin != end; begin++) { if(*begin < (int)aaCodes.size()) { cout << aaCodes[*begin]; } else { cout << "-"; } } cout << "\n"; } } cout << "\n\n"; } /** * The function removeFirstIterate is used to perform the remove first iteration * strategy on the finished alignment. It optimises the score give in alignScore. * For iter = 1 to numIterations * For seq i = 1 to numSeqs * remove seq i * if either of the profiles has all gaps, remove this column. * realign using profileAlign * if its better, keep it. If its not better, dont keep it. * @param alnPtr The alignment object. * @return true if it has been successful, false if it has not been successful. */ bool Iteration::removeFirstIterate(Alignment* alnPtr) { if(!alnPtr) { return false; } string p1TreeName; p1TreeName = ""; string p2TreeName; int nSeqs = alnPtr->getNumSeqs(); if(nSeqs <= 2) { return false; } DistMatrix distMat; distMat.ResizeRect(nSeqs + 1); ObjectiveScore scoreObj; int iterate = userParameters->getDoRemoveFirstIteration(); userParameters->setDoRemoveFirstIteration(NONE); double firstScore = scoreObj.getScore(alnPtr); //cout << "firstScore = " << firstScore << "\n"; double score = 0; double bestScore = firstScore; double dscore; int count; bool scoreImproved = false; bool scoreImprovedAnyIteration = false; int prof1NumSeqs = 1; // This will be used for removing gaps!!! vector profile1; vector profile2; profile1.resize(nSeqs + 1, 0); profile1[1] = 1; profile2.resize(nSeqs + 1, 1); profile2[0] = 0; profile2[1] = 0; vector prof1Weight, prof2Weight; int iterations = userParameters->getNumIterations(); //cout << "Max num iterations = " << iterations << "\n"; Alignment bestAlignSoFar; TreeInterface tree; // One iteration consists of removing each of the sequences, reseting all the gap // only columns. If the score is better, the new alignment is kept. for(int n = 1; n <= iterations; n++) { scoreImproved = false; cout << "ITERATION " << n << " OF " << iterations << "\n"; for(int i = 1; i <= nSeqs; i++) { vector seqVector; Alignment iterateAlign = *alnPtr; iterateAlign.setProfile1NumSeqs(1); // We remove the sequence i from the profile, and paste into the first position // This is to make it easy to do the profile alignment. vector selected; selected.resize(nSeqs + 1, 0); selected[i] = 1; seqVector = iterateAlign.cutSelectedSequencesFromAlignment(&selected); iterateAlign.pasteSequencesIntoPosition(&seqVector, 0); // Remove any gap only columns iterateAlign.removeGapOnlyColsFromSelectedSeqs(&profile1); iterateAlign.removeGapOnlyColsFromSelectedSeqs(&profile2); // Calculate a simple distance matrix. if(nSeqs - 1 >= 2) { for (int i = 1; i <= nSeqs; i++) { for (int j = i + 1; j <= nSeqs; j++) { dscore = iterateAlign.countid(i, j); distMat(i, j) = (100.0 - dscore)/100.0; } } /* temporary tree file * * can't use the safer mkstemp function here, because * we just pass down the filename :( */ char buffer[L_tmpnam]; tmpnam (buffer); p2TreeName = buffer + string(".dnd"); // should test here if file is writable } bool success = false; prof1Weight.clear(); prof1Weight.resize(prof1NumSeqs); prof2Weight.clear(); prof2Weight.resize(nSeqs); tree.getWeightsForProfileAlign(&iterateAlign, &distMat, &p1TreeName, &prof1Weight, &p2TreeName, &prof2Weight, nSeqs, prof1NumSeqs, false, false, &success); remove(p2TreeName.c_str()); if(!success) { /* returning false only means alignment hasn't * changed, but here getWeightsForProfileAlign failed, * most likely because p2TreeName couldn't be read. an * error will be printed to console. clustalw should * then exit, FIXME: clustalx users have to sit * through all error messages until someone * implements a way to return an exit code and react * appropriately */ // does anyone know how to use // (userParameters->getMenuFlag() || // !userParameters->getInteractive() instead? char buf[1024]; utilityObject->myname(buf); if (strcasecmp(buf, "clustalw")==0) { exit(EXIT_FAILURE); } else { // the next two lines were here before the exit // was added. keeping it for clustalx although it // doesnt seem to make any sens userParameters->setDoRemoveFirstIteration(iterate); return false; } } MSA* msaObj = new MSA(); iterateAlign.resetProfile1(); iterateAlign.resetProfile2(); // Do the profile alignment. count = msaObj->doProfileAlign(&iterateAlign, &distMat, &prof1Weight, &prof2Weight); delete msaObj; // Check if its better score = scoreObj.getScore(&iterateAlign); iterateAlign.setProfile1NumSeqs(0); if(score < bestScore) // Might be a problem with this. { //cout << "**********************************************\n"; //cout << "***** Better score found using iteration *****\n"; //cout << "**********************************************\n"; bestScore = score; bestAlignSoFar = iterateAlign; scoreImproved = true; scoreImprovedAnyIteration = true; } distMat.clearArray(); distMat.ResizeRect(nSeqs + 1); } if(scoreImproved == false) { cout << "Score was not improved in last iteration. Exiting...\n"; break; } } // // NOTE if we have improved it, then we need to update the sequences in alnPtr // 1) get the unique id of seq i // 2) get the sequence from new object using id // 3) update the sequence in alnPtr // if(scoreImprovedAnyIteration) // If we need to update the alnPtr object. { cout << "Iteration improved Align score: " << bestScore << "\n"; int seqId; const vector* improvedSeq; for(int i = 1; i <= nSeqs; i++) // For each seq in alnPtr { seqId = alnPtr->getUniqueId(i); improvedSeq = bestAlignSoFar.getSequenceFromUniqueId(seqId); alnPtr->updateSequence(i, improvedSeq); } } cout << "FINAL score: " << bestScore << "\n"; userParameters->setDoRemoveFirstIteration(iterate); return true; // It was successful. } }