Mac binaries
[jabaws.git] / website / archive / binaries / mac / src / clustalw / src / multipleAlign / MSA.cpp
diff --git a/website/archive/binaries/mac/src/clustalw/src/multipleAlign/MSA.cpp b/website/archive/binaries/mac/src/clustalw/src/multipleAlign/MSA.cpp
new file mode 100644 (file)
index 0000000..2499039
--- /dev/null
@@ -0,0 +1,744 @@
+/**
+ * Author: Mark Larkin
+ * 
+ * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.  
+ */
+/**
+ * @author Mark Larkin, Conway Institute, UCD. mark.larkin@ucd.ie
+ * Changes:
+ *
+ * Mark: 23-01-2007: There was a problem with running an alignment with only 
+ * one sequence in it. I needed to make a change to the multiSeqAlign function.
+ ****************************************************************************/
+#ifdef HAVE_CONFIG_H
+    #include "config.h"
+#endif
+#include "MSA.h"
+#include "MyersMillerProfileAlign.h"
+//#include "../phylogeneticTree/ClusterTree.h"
+#include "../general/debuglogObject.h"
+
+namespace clustalw
+{
+
+/**
+ * 
+ * @param alnPtr 
+ * @param distMat 
+ * @param iStart 
+ * @param phylipName 
+ * @return 
+ */
+int MSA::multiSeqAlign(Alignment* alnPtr, DistMatrix* distMat, vector<int>* seqWeight, AlignmentSteps* progSteps, int iStart)
+{
+        
+    if(!progSteps)
+    {
+        return 0;
+    }
+        
+    int* aligned;
+    vector<int> group;
+    int ix;
+    
+    int* maxid;
+    int max = 0, sum = 0;
+    vector<int> treeWeight;
+    int i = 0, j = 0, set = 0, iseq = 0;
+    int entries = 0;
+    int score = 0;
+    int _numSteps = 0;
+    utilityObject->info("Start of Multiple Alignment\n"); 
+
+    int _numSeqs = alnPtr->getNumSeqs();
+    
+    vector<int> newOutputIndex(_numSeqs);   
+   
+    alnPtr->addSeqWeight(seqWeight);
+    
+    ProfileAlignAlgorithm* alignAlgorithm = new MyersMillerProfileAlign;
+    _numSteps = progSteps->getNumSteps();
+    // for each sequence, find the most closely related sequence
+
+    maxid = new int[_numSeqs + 1];
+
+    for (i = 1; i <= _numSeqs; i++)
+    {
+        maxid[i] =  -1;
+        for (j = 1; j <= _numSeqs; j++)
+            if (j != i && maxid[i] < (*distMat)(i, j))
+            {
+                maxid[i] = static_cast<int>((*distMat)(i, j));
+            }
+    }
+
+    // group the sequences according to their relative divergence
+
+    if (iStart == 0)
+    {
+
+        // start the multiple alignments.........
+
+        utilityObject->info("Aligning...");
+        // first pass, align closely related sequences first....
+        
+        
+        ix = 0;
+        aligned = new int[_numSeqs + 1];
+        
+        for (i = 0; i <= _numSeqs; i++)
+        {
+            aligned[i] = 0;
+        }
+        
+        const vector<vector<int> >* ptrToSets = progSteps->getSteps();
+        
+        
+        for (set = 1; set <= _numSteps; ++set)
+        {
+            entries = 0;
+            for (i = 1; i <= _numSeqs; i++)
+            {
+                if (((*ptrToSets)[set][i] != 0) && 
+                    (maxid[i] > userParameters->getDivergenceCutoff()))
+                {
+                    entries++;
+                    if (aligned[i] == 0)
+                    {
+                        if (userParameters->getOutputOrder() == INPUT)
+                        {
+                            ++ix;
+                            newOutputIndex[i - 1] = i;
+                        }
+                        else
+                        {
+                            if(ix >= (int)newOutputIndex.size())
+                            {
+                                cerr << "ERROR: size = " << newOutputIndex.size() 
+                                     << "ix = " << ix << "\n";
+                                exit(1);
+                            }
+                            else
+                            {
+                                newOutputIndex[ix] = i;
+                                ++ix;
+                            }
+                        }
+
+                        aligned[i] = 1;
+                    }
+                }
+            }
+
+            if (entries > 0)
+            {
+                #if DEBUGFULL
+                    if(logObject && DEBUGLOG)
+                    {    
+                        logObject->logMsg("Doing profile align");
+                    }
+                #endif                 
+                score = alignAlgorithm->profileAlign(alnPtr, distMat, progSteps->getStep(set),
+                                                     aligned);
+            }
+            else
+            {
+                score = 0;
+            }
+
+
+            // negative score means fatal error... exit now! 
+
+            if (score < 0)
+            {
+                return (-1);
+            }
+            if(userParameters->getDisplayInfo())
+            {
+                if ((entries > 0) && (score > 0))
+                {
+                    utilityObject->info("Group %d: Sequences:%4d      Score:%d",
+                                        set, entries, score);                     
+                }
+                else
+                {
+                    utilityObject->info("Group %d:                     Delayed", set);
+                }
+            }
+        }
+    }
+
+    else
+    {
+        aligned = new int[_numSeqs + 1];
+        ix = 0;
+        for (i = 1; i <= iStart + 1; i++)
+        {
+            aligned[i] = 1;
+            ++ix;
+            newOutputIndex[i - 1] = i;
+        }
+        for (i = iStart + 2; i <= _numSeqs; i++)
+        {
+            aligned[i] = 0;
+        }
+    }
+
+    // second pass - align remaining, more divergent sequences..... 
+
+    // if not all sequences were aligned, for each unaligned sequence,
+    // find it's closest pair amongst the aligned sequences.
+
+    group.resize(_numSeqs + 1); 
+    treeWeight.resize(_numSeqs); 
+
+    for (i = 0; i < _numSeqs; i++)
+    {
+        treeWeight[i] = (*seqWeight)[i];
+    }
+
+    // if we haven't aligned any sequences, in the first pass - align the
+    // two most closely related sequences now
+    if (ix == 0)
+    {
+        max =  -1;
+        iseq = 0;
+
+        for (i = 1; i <= _numSeqs; i++)
+        {
+            for (j = i + 1; j <= _numSeqs; j++)
+            {
+                if (max < (*distMat)(i, j))
+                {
+                    max = static_cast<int>((*distMat)(i, j)); // Mark change 17-5-07
+                    iseq = i;
+                }
+            }
+        }
+        aligned[iseq] = 1;
+        if (userParameters->getOutputOrder() == INPUT)
+        {
+            ++ix;
+            newOutputIndex[iseq - 1] = iseq;
+        }
+        else
+        {
+            newOutputIndex[ix] = iseq;
+            ++ix;
+        }
+    }
+
+    while (ix < _numSeqs)
+    {
+        for (i = 1; i <= _numSeqs; i++)
+        {
+            if (aligned[i] == 0)
+            {
+                maxid[i] =  - 1;
+                for (j = 1; j <= _numSeqs; j++)
+                    if ((maxid[i] < (*distMat)(i, j)) && (aligned[j] != 0))
+                    {
+                        maxid[i] = static_cast<int>((*distMat)(i, j));// Mark change 17-5-07
+                    }
+
+            }
+        }
+        // find the most closely related sequence to those already aligned
+
+        max =  - 1;
+        iseq = 0;
+        for (i = 1; i <= _numSeqs; i++)
+        {
+            if ((aligned[i] == 0) && (maxid[i] > max))
+            {
+                max = maxid[i];
+                iseq = i;
+            }
+        }
+
+
+        // align this sequence to the existing alignment
+        // weight sequences with percent identity with profile
+        // OR...., multiply sequence weights from tree by percent identity with new sequence 
+        if (userParameters->getNoWeights() == false)
+        {
+            for (j = 0; j < _numSeqs; j++)
+                if (aligned[j + 1] != 0)
+                {
+                    (*seqWeight)[j] = static_cast<int>(treeWeight[j] * (*distMat)(j + 1, iseq));
+                }
+
+            // Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
+
+            sum = 0;
+            for (j = 0; j < _numSeqs; j++)
+            {
+                if (aligned[j + 1] != 0)
+                {
+                    sum += (*seqWeight)[j];
+                }
+            }
+
+            if (sum == 0)
+            {
+                for (j = 0; j < _numSeqs; j++)
+                {
+                    (*seqWeight)[j] = 1;
+                }
+
+                sum = j;
+            }
+            for (j = 0; j < _numSeqs; j++)
+            {
+                if (aligned[j + 1] != 0)
+                {
+                    (*seqWeight)[j] = ((*seqWeight)[j] * INT_SCALE_FACTOR) / sum;
+                    if ((*seqWeight)[j] < 1)
+                    {
+                        (*seqWeight)[j] = 1;
+                    }
+                }
+            }
+        }
+
+        entries = 0;
+        for (j = 1; j <= _numSeqs; j++)
+        {
+            if (aligned[j] != 0)
+            {
+                group[j] = 1;
+                entries++;
+            }
+            else if (iseq == j)
+            {
+                group[j] = 2;
+                entries++;
+            }
+        }
+        
+        alnPtr->addSeqWeight(seqWeight);
+        aligned[iseq] = 1;
+        
+        score = alignAlgorithm->profileAlign(alnPtr, distMat, &group, aligned);
+         
+        if (userParameters->getOutputOrder() == INPUT)
+        {
+            ++ix;
+            newOutputIndex[iseq - 1] = iseq;
+        }
+        else
+        {
+            newOutputIndex[ix] = iseq;
+            ++ix;
+        }
+    }
+        
+    alnPtr->addOutputIndex(&newOutputIndex);
+    
+    if(userParameters->getDisplayInfo())
+    {
+      int alignmentScore = alnPtr->alignScore(); // ?? check, FS, 2009-05-18
+    }
+    
+    delete alignAlgorithm;
+    delete [] aligned;
+    delete [] maxid;
+    return (_numSeqs);
+}
+
+
+/**
+ * 
+ * @param alnPtr 
+ * @param distMat 
+ * @param iStart 
+ * @param phylipName 
+ * @return 
+ */
+int MSA::seqsAlignToProfile(Alignment* alnPtr, DistMatrix* distMat, vector<int>* seqWeight, int iStart, 
+                           string phylipName)
+{
+    int *aligned;  
+    vector<int> treeWeight;
+    vector<int> group;
+    int ix;
+
+    int *maxid;
+    int max = 0;
+    int i = 0, j = 0, iseq = 0;
+    int sum = 0, entries = 0;
+    int score = 0;
+    int _numSeqs = alnPtr->getNumSeqs();
+    
+    utilityObject->info("Start of Multiple Alignment\n"); 
+       
+    ProfileAlignAlgorithm* alignAlgorithm = new MyersMillerProfileAlign;
+    
+    // calculate sequence weights according to branch lengths of the tree -
+     // weights in global variable seq_weight normalised to sum to 100
+    vector<int> newOutputIndex(_numSeqs);    
+    //groupTree.calcSeqWeights(0, _numSeqs, &seqWeight);
+
+    treeWeight.resize(_numSeqs);
+    for (i = 0; i < _numSeqs; i++)
+    {
+        treeWeight[i] = (*seqWeight)[i];
+    }
+
+    // for each sequence, find the most closely related sequence
+
+    maxid = new int[_numSeqs + 1];
+    
+    for (i = 1; i <= _numSeqs; i++)
+    {
+        maxid[i] =  - 1;
+        for (j = 1; j <= _numSeqs; j++)
+        {
+            if (maxid[i] < (*distMat)(i, j))
+            {
+                maxid[i] = static_cast<int>((*distMat)(i, j)); // Mark change 17-5-07
+            }
+        }
+    }
+
+    aligned = new int[_numSeqs + 1];
+    ix = 0;
+    for (i = 1; i <= iStart + 1; i++)
+    {
+        aligned[i] = 1;
+        ++ix;
+        newOutputIndex[i - 1] = i;
+    }
+    for (i = iStart + 2; i <= _numSeqs; i++)
+    {
+        aligned[i] = 0;
+    }
+
+    // for each unaligned sequence, find it's closest pair amongst the
+    // aligned sequences. 
+
+    group.resize(_numSeqs + 1);
+
+    while (ix < _numSeqs)
+    {
+        if (ix > 0)
+        {
+            for (i = 1; i <= _numSeqs; i++)
+            {
+                if (aligned[i] == 0)
+                {
+                    maxid[i] =  - 1;
+                    for (j = 1; j <= _numSeqs; j++)
+                    {
+                        if ((maxid[i] < (*distMat)(i, j)) && (aligned[j] != 0))
+                        {
+                            maxid[i] = static_cast<int>((*distMat)(i, j));
+                        }
+                    }
+                }
+            }
+        }
+
+        // find the most closely related sequence to those already aligned
+
+        max =  -1;
+        for (i = 1; i <= _numSeqs; i++)
+        {
+            if ((aligned[i] == 0) && (maxid[i] > max))
+            {
+                max = maxid[i];
+                iseq = i;
+            }
+        }
+
+        // align this sequence to the existing alignment 
+        entries = 0;
+        for (j = 1; j <= _numSeqs; j++)
+        {
+            if (aligned[j] != 0)
+            {
+                group[j] = 1;
+                entries++;
+            }
+            else if (iseq == j)
+            {
+                group[j] = 2;
+                entries++;
+            }
+        }
+
+        aligned[iseq] = 1;
+
+        // multiply sequence weights from tree by percent
+        // identity with new sequence 
+
+        for (j = 0; j < _numSeqs; j++)
+        {
+            (*seqWeight)[j] = static_cast<int>(treeWeight[j] * (*distMat)(j + 1, iseq));
+        }
+
+        //
+        // Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
+        //
+
+        sum = 0;
+        for (j = 0; j < _numSeqs; j++)
+        {
+            if (group[j + 1] == 1)
+            {
+                sum += (*seqWeight)[j];
+            }
+        }
+
+        if (sum == 0)
+        {
+            for (j = 0; j < _numSeqs; j++)
+            {
+                (*seqWeight)[j] = 1;
+            }
+
+            sum = j;
+        }
+        for (j = 0; j < _numSeqs; j++)
+        {
+            (*seqWeight)[j] = ((*seqWeight)[j] * INT_SCALE_FACTOR) / sum;
+            if ((*seqWeight)[j] < 1)
+            {
+                (*seqWeight)[j] = 1;
+            }
+        }
+        // Add the seqWeights to the Alignment object!!!!
+        alnPtr->addSeqWeight(seqWeight);
+        
+        score = alignAlgorithm->profileAlign(alnPtr, distMat, &group, aligned);
+        utilityObject->info("Sequence:%d     Score:%d", iseq, score);
+        if (userParameters->getOutputOrder() == INPUT)
+        {
+            ++ix;
+            newOutputIndex[iseq - 1] = iseq;
+        }
+        else
+        {
+            newOutputIndex[ix] = iseq;
+            ++ix;
+        }
+    }
+
+    delete [] aligned;
+    delete [] maxid;
+    delete alignAlgorithm;
+    alnPtr->addOutputIndex(&newOutputIndex);
+    
+    if(userParameters->getDisplayInfo())
+    {
+        alnPtr->alignScore();
+    }
+
+    return (_numSeqs);
+}
+
+
+/**
+ * 
+ * @param alnPtr 
+ * @param distMat 
+ * @return 
+ */
+int MSA::calcPairwiseForProfileAlign(Alignment* alnPtr, DistMatrix* distMat)
+{
+    //Tree groupTree;
+    int i, j, temp;
+    int entries;
+    int* aligned;  
+    vector<int> group;
+    vector<int> seqWeight;
+    float dscore;
+    int score;
+    int _numSeqs = alnPtr->getNumSeqs();
+    
+    seqWeight.resize(_numSeqs);
+    ProfileAlignAlgorithm* alignAlg = new MyersMillerProfileAlign;
+    
+    utilityObject->info("Start of Initial Alignment");
+    /* calculate sequence weights according to branch lengths of the tree -
+     * weights in global variable seq_weight normalised to sum to INT_SCALE_FACTOR */
+
+    temp = INT_SCALE_FACTOR / _numSeqs;
+    for (i = 0; i < _numSeqs; i++)
+    {
+        seqWeight[i] = temp;
+    }
+
+    userParameters->setDistanceTree(false);
+
+    /* do the initial alignment.........  */
+
+    group.resize(_numSeqs + 1);
+
+    for (i = 1; i <= alnPtr->getProfile1NumSeqs(); ++i)
+    {
+        group[i] = 1;
+    }
+
+    for (i = alnPtr->getProfile1NumSeqs() + 1; i <= _numSeqs; ++i)
+    {
+        group[i] = 2;
+    }
+
+    entries = _numSeqs;
+
+    aligned = new int[_numSeqs + 1];
+
+    for (i = 1; i <= _numSeqs; i++)
+    {
+        aligned[i] = 1;
+    }
+
+    alnPtr->addSeqWeight(&seqWeight);
+    
+    score = alignAlg->profileAlign(alnPtr, distMat, &group, aligned);
+    utilityObject->info("Sequences:%d      Score:%d", entries, score);
+    delete [] aligned;
+    
+    for (i = 1; i <= _numSeqs; i++)
+    {
+        for (j = i + 1; j <= _numSeqs; j++)
+        {
+            dscore = alnPtr->countid(i, j);
+            (*distMat)(i, j) = ((double)100.0 - (double)dscore) / (double)100.0;
+            (*distMat)(j, i) = (*distMat)(i, j);
+        }
+    }
+    delete alignAlg;
+    return (_numSeqs);
+
+}
+
+
+/**
+ * 
+ * @param alnPtr 
+ * @param distMat 
+ * @param p1TreeName 
+ * @param p2TreeName 
+ * @return 
+ */
+int MSA::doProfileAlign(Alignment* alnPtr, DistMatrix* distMat, vector<int>* prof1Weight, vector<int>* prof2Weight)
+{
+    //Tree groupTree1, groupTree2;
+    int i, j, sum, entries;
+    int score;
+    int *aligned;
+    vector<int> group;
+    int *maxid;
+    //vector<int> prof1Weight, prof2Weight;
+    int _profile1NumSeqs = alnPtr->getProfile1NumSeqs();
+    int _numSeqs = alnPtr->getNumSeqs();
+    vector<int> _seqWeight, _outputIndex;    
+    
+    utilityObject->info("Start of Multiple Alignment\n");
+        
+    _seqWeight.resize(_numSeqs + 1);
+    _outputIndex.resize(_numSeqs);
+    ProfileAlignAlgorithm* alignAlgorithm = new MyersMillerProfileAlign;
+
+    // weight sequences with max percent identity with other profile
+
+    maxid = new int[_numSeqs + 1];
+    for (i = 0; i < _profile1NumSeqs; i++)
+    {
+        maxid[i] = 0;
+        for (j = _profile1NumSeqs + 1; j <= _numSeqs; j++)
+        {
+            if (maxid[i] < (*distMat)(i + 1, j))
+            {
+                maxid[i] = static_cast<int>((*distMat)(i + 1, j)); // Mark change 17-5-07
+            } 
+        }
+        _seqWeight[i] = maxid[i] * (*prof1Weight)[i];
+    }
+
+    for (i = _profile1NumSeqs; i < _numSeqs; i++)
+    {
+        maxid[i] =  - 1;
+        for (j = 1; j <= _profile1NumSeqs; j++)
+        {
+            if (maxid[i] < (*distMat)(i + 1, j))
+            {
+                maxid[i] = static_cast<int>((*distMat)(i + 1, j));// Mark change 17-5-07
+            }
+        }
+        _seqWeight[i] = maxid[i] * (*prof2Weight)[i];
+    }
+    //
+    // Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
+    //
+
+    sum = 0;
+    for (j = 0; j < _numSeqs; j++)
+    {
+        sum += _seqWeight[j];
+    }
+
+    if (sum == 0)
+    {
+        for (j = 0; j < _numSeqs; j++)
+        {
+            _seqWeight[j] = 1;
+        }
+
+        sum = j;
+    }
+    for (j = 0; j < _numSeqs; j++)
+    {
+        _seqWeight[j] = (_seqWeight[j] * INT_SCALE_FACTOR) / sum;
+        if (_seqWeight[j] < 1)
+        {
+            _seqWeight[j] = 1;
+        }
+    }
+
+    // do the alignment.........  /
+
+    utilityObject->info("Aligning...");
+    group.resize(_numSeqs + 1);
+
+    for (i = 1; i <= _profile1NumSeqs; ++i)
+    {
+        group[i] = 1;
+    }
+
+    for (i = _profile1NumSeqs + 1; i <= _numSeqs; ++i)
+    {
+        group[i] = 2;
+    }
+
+    entries = _numSeqs;
+
+    aligned = new int[_numSeqs + 1];
+    for (i = 1; i <= _numSeqs; i++)
+    {
+        aligned[i] = 1;
+    }
+
+    alnPtr->addSeqWeight(&_seqWeight);
+
+    score = alignAlgorithm->profileAlign(alnPtr, distMat, &group, aligned);
+   
+    utilityObject->info("Sequences:%d      Score:%d", entries, score);
+    
+    for (i = 1; i <= _numSeqs; i++)
+    {
+        _outputIndex[i - 1] = i;
+    }
+
+    alnPtr->addOutputIndex(&_outputIndex);
+    
+    delete alignAlgorithm;
+    delete [] aligned;
+    delete [] maxid;
+    return (_numSeqs);
+    return 1;
+}
+}