WSTester updated to work plus hopefully all the other changes that need to go into...
[jabaws.git] / binaries / src / ViennaRNA / RNAforester / src / progressive_align.cpp
diff --git a/binaries/src/ViennaRNA/RNAforester/src/progressive_align.cpp b/binaries/src/ViennaRNA/RNAforester/src/progressive_align.cpp
new file mode 100644 (file)
index 0000000..172dfc6
--- /dev/null
@@ -0,0 +1,343 @@
+/*
+  Copyright by Matthias Hoechsmann (C) 2002-2004
+  =====================================                                   
+  You may use, copy and distribute this file freely as long as you
+  - do not change the file,
+  - leave this copyright notice in the file,
+  - do not make any profit with the distribution of this file
+  - give credit where credit is due
+  You are not allowed to copy or distribute this file otherwise
+  The commercial usage and distribution of this file is prohibited
+  Please report bugs and suggestions to <mhoechsm@TechFak.Uni-Bielefeld.DE>
+*/
+
+#include <algorithm>
+#include <fstream>
+
+#include "alignment.h"
+#include "progressive_align.h"
+
+#include "alignment.t.cpp"
+
+using namespace std;
+
+// !!! this operator is defined as > !!!
+bool operator < (pair<double,RNAProfileAlignment*> &l, pair<double,RNAProfileAlignment*> &r)
+{
+  if(l.second->getNumStructures() < r.second->getNumStructures())
+    return false;
+  else
+    return true;
+}
+
+void progressiveAlign(deque<RNAProfileAlignment*> &inputList, deque<pair<double,RNAProfileAlignment*> > &resultList, const DoubleScoreProfileAlgebraType *alg,const RNAforesterOptions &options)
+{
+    RNAProfileAliMapType inputMapProfile;
+       deque<RNAProfileAliKeyPairType> alignList;
+       RNAProfileAliMapType::iterator it,it2;
+       long x,y,bestx,besty;  // keys of stuctures in inputMapProfile
+       RNAProfileAlignment *ppfx=NULL,*ppfy=NULL,*ppf=NULL;
+       int level=1;
+       double bestScore,threshold,cutoff;
+       string clusterfilename;
+       ofstream s;
+       Matrix<double> *score_mtrx;     
+       bool local;
+
+       local=options.has(RNAforesterOptions::LocalSimilarity);
+
+       cout << "*** Calculation ***" << endl << endl;
+
+       // create inputMapProfile to access a profile by an index value
+       deque<RNAProfileAlignment*>::const_iterator inpIt;
+       long i=1;
+       for(inpIt=inputList.begin();inpIt!=inputList.end();inpIt++)
+         {
+           inputMapProfile[i]=*inpIt;
+           i++;
+         }
+       inputList.clear();
+
+       // create matrix for all against all comparison
+       bestScore=alg->worst_score();
+       score_mtrx=new Matrix<double>(inputMapProfile.size(),inputMapProfile.size());
+
+       // set threshold for the clustering algorithm
+       if(options.has(RNAforesterOptions::CalculateDistance))
+               options.get(RNAforesterOptions::ClusterThreshold,threshold,20.0);
+       else
+               options.get(RNAforesterOptions::ClusterThreshold,threshold,0.7);
+       cout << "clustering threshold is: " << threshold << endl;
+
+       // set cutoff value for clustering
+       if(options.has(RNAforesterOptions::CalculateDistance))
+               options.get(RNAforesterOptions::ClusterJoinCutoff,cutoff,100.0);
+       else
+               options.get(RNAforesterOptions::ClusterJoinCutoff,cutoff,0.0);
+       cout << "join clusters cutoff is: " << cutoff << endl << endl;
+
+       // generate dot file
+       clusterfilename=options.generateFilename(RNAforesterOptions::Help,"_cluster.dot", "cluster.dot");  // use Help as dummy
+       s.open(clusterfilename.c_str());
+       s << "digraph forest" << endl << "{" << endl;
+
+       // generate nodes for the input forests
+       for(it=inputMapProfile.begin();it!=inputMapProfile.end();it++) 
+       {
+               ppf=it->second;
+
+               s << "\"" << ppf->getName() << "\"" << "[label=\"" << ppf->getName() << "\"]" << endl;
+               s << "\"" << ppf->getName() << "\"" << "[label=\"" << ppf->getName() << "\"]" << endl;
+       }
+
+
+       // compute all pairwise distances
+       // !! NOTE !! iterating through the map is ordered by key number
+       // as i only calculate a triangle matrix this is a prerequisite
+       cout << "Computing all pairwise similarities" << endl;
+       for(it=inputMapProfile.begin();it!=inputMapProfile.end();it++)
+       {         
+               x=it->first;
+               ppfx=it->second;
+               for(it2=inputMapProfile.begin();it2->first<it->first;it2++)
+               {
+                       y=it2->first;
+                       ppfy=it2->second;
+
+                       Alignment<double,RNA_Alphabet_Profile,RNA_Alphabet_Profile> ali(ppfx,ppfy,*alg,local);
+                       if(local)
+                         score_mtrx->setAt(x-1,y-1,ali.getLocalOptimum());
+                       else
+                         score_mtrx->setAt(x-1,y-1,ali.getGlobalOptimumRelative());
+                       
+                       cout << x << "," << y << ": " << score_mtrx->getAt(x-1,y-1) << endl;
+
+                       WATCH(DBG_MULTIPLE,"progressiveAlign",x);
+                       WATCH(DBG_MULTIPLE,"progressiveAlign",y);
+                       WATCH(DBG_MULTIPLE,"progressiveAlign",ali.getGlobalOptimumRelative());
+               }
+       }
+       cout << endl;
+
+       while(inputMapProfile.size()>1)
+       {
+               // find the best score of all pairwise alignments
+               bestScore=alg->worst_score();
+               for(it=inputMapProfile.begin();it!=inputMapProfile.end();it++)
+               {
+                       x=it->first;
+                       for(it2=inputMapProfile.begin();it2->first < it->first;it2++)
+                       {
+                               double old_bestScore=bestScore;
+
+                               y=it2->first;         
+                               WATCH(DBG_MULTIPLE,"progressiveAlign",x);
+                               WATCH(DBG_MULTIPLE,"progressiveAlign",y);
+                               WATCH(DBG_MULTIPLE,"progressiveAlign",score_mtrx->getAt(x-1,y-1));
+
+                               WATCH(DBG_MULTIPLE,"progressiveAlign",bestScore);
+                               bestScore=alg->choice(bestScore,score_mtrx->getAt(x-1,y-1));
+
+                               if(bestScore!=old_bestScore)
+                               {
+                                       bestx=it->first;
+                                       besty=it2->first;
+                                       old_bestScore=bestScore;
+                               }
+                       }
+               }
+
+               WATCH(DBG_MULTIPLE,"progressiveAlign",bestScore);
+               cout << "joining alignments:" << endl;
+
+               // if threshold is set generate a list of best pairs within threshold 
+               if(alg->choice(bestScore,threshold)!= threshold)
+               {
+                       Graph graph;
+                       int *mate;
+                       int maximize;
+
+                       graph=makePairsGraph(inputMapProfile,alg,score_mtrx,threshold);
+                       if(options.has(RNAforesterOptions::CalculateDistance))
+                               maximize=0;
+                       else
+                               maximize=1;
+
+                       mate = Weighted_Match(graph,1,maximize);
+                       for(x=1;x<=score_mtrx->xDim();x++)  // !! begins at 1 !!
+                       {
+                               if(x<mate[x])
+                               {               
+                                       // if it is a best pair put it in the align list
+                                       ppfx=inputMapProfile[x];
+                                       inputMapProfile.erase(x);
+                                       alignList.push_back(make_pair(x,ppfx));
+                                       ppfy=inputMapProfile[mate[x]];
+                                       inputMapProfile.erase(mate[x]);
+                                       alignList.push_back(make_pair(mate[x],ppfy));
+                               }
+                       }
+                       free(mate);
+                       free(graph);
+               }
+               else
+               {
+                       // if there us no pair below the threshold
+                       // combine those two profile forests, that produced the best score
+                       ppfx=inputMapProfile[bestx];
+                       ppfy=inputMapProfile[besty];
+                       inputMapProfile.erase(bestx);
+                       inputMapProfile.erase(besty);
+                       alignList.push_back(make_pair(bestx,ppfx));
+                       alignList.push_back(make_pair(besty,ppfy));
+               }
+
+               // align all forests in the align list. 
+               while(alignList.size()>1)
+               {
+                       string aliName;
+
+                       x=alignList.front().first;
+                       ppfx=alignList.front().second;
+                       alignList.erase(alignList.begin());
+                       y=alignList.front().first;
+                       ppfy=alignList.front().second;
+                       alignList.erase(alignList.begin());
+
+                       // compute alignment again 
+                       Alignment<double,RNA_Alphabet_Profile,RNA_Alphabet_Profile> bestali(ppfx,ppfy,*alg,local);
+                       if(local)
+                         bestScore=bestali.getLocalOptimum();
+                       else
+                         bestScore=bestali.getGlobalOptimumRelative();
+
+                       // test, if score is worse than the cutoff value
+                       if(alg->choice(bestScore,cutoff)== cutoff)
+                       {
+                               // copy involved forests to the result list
+                               cout << x << "," << y << ": alignment is below cutoff." << endl; 
+                               if(ppfx->getNumStructures()>1)
+                                 resultList.push_back(make_pair(ppfx->maxScore(*alg),ppfx));
+                               if(ppfy->getNumStructures()>1)
+                                 resultList.push_back(make_pair(ppfy->maxScore(*alg),ppfy));                                                     
+                       }
+                       else
+                       {
+                               // calculate optimal alignment and add it to inputMapProfile
+
+                         ppf=new RNAProfileAlignment(ppfx->getNumStructures(),ppfy->getNumStructures());
+
+                         if(local)
+                           {
+                             Uint xbasepos,ybasepos;
+                             bestali.getOptLocalAlignment(*ppf,xbasepos,ybasepos);
+                           }
+                         else
+                           {
+                             bestali.getOptGlobalAlignment(*ppf);
+                           }
+                       
+                               ppf->addStrNames(ppfx->getStrNames());
+                               ppf->addStrNames(ppfy->getStrNames());
+                               aliName=ppfx->getName() + "." +ppfy->getName();
+                               ppf->setName(aliName);
+
+                               // for debug purposes
+                               //string dotfilename;
+                               //dotfilename=ppf->getName() + string(".dot");
+                               //                              ofstream s("test.dot");
+                               //                              ppf->printDot(s);
+
+                               // generate nodes in cluster file (dot format)
+                               s << "\"" << ppf->getName() << "\"" << "[shape=\"diamond\" label=\"" << bestScore << "\"]" << endl;
+                               s << "\"" << ppf->getName() << "\"" << "-> {\"" <<  ppfx->getName() << "\" \"" << ppfy->getName() << "\"}";
+                               
+                               cout << x << "," << y << ": " << bestScore << " -> " << min(x,y) << endl;
+                               
+                               delete ppfx;
+                               delete ppfy;      
+
+                               // calculate distance to all forests in the list
+                               cout << "Calculate similarities to other clusters" << endl;
+                               ppfx=ppf;
+                               // x remains x !!
+                               for(it=inputMapProfile.begin();it!=inputMapProfile.end();it++)
+                                 {
+                                   y=it->first;
+                                   ppfy=it->second;
+                                   Alignment<double,RNA_Alphabet_Profile,RNA_Alphabet_Profile> ali(ppfx,ppfy,*alg,local);
+                                   if(local)
+                                     {
+                                       score_mtrx->setAt(min(x-1,y-1),max(x-1,y-1),ali.getLocalOptimum());  // min - max = fill the upper triangle
+                                       cout << min(x,y) << "," << max(x,y) << ": " << ali.getLocalOptimum() <<  endl;                                  
+                                     }
+                                   else
+                                     {
+                                       score_mtrx->setAt(min(x-1,y-1),max(x-1,y-1),ali.getGlobalOptimumRelative());  // min - max = fill the upper triangle
+                                       cout << min(x,y) << "," << max(x,y) << ": " << ali.getGlobalOptimumRelative() <<  endl;
+                                     }
+                                 } 
+                               cout << endl;
+                               
+                               // ... and append it to the list
+                               inputMapProfile.insert(make_pair(x,ppf));
+                       }
+               }
+
+               level++;
+       }
+
+       assert(inputMapProfile.size()<2);
+
+       // copy last profile to resultList
+       if(inputMapProfile.size()==1)
+       {
+           ppf=inputMapProfile.begin()->second;
+               resultList.push_back(make_pair(ppf->maxScore(*alg),ppf));
+               inputMapProfile.clear();
+       }
+
+       // end of dot file
+       s << "}" << endl;
+
+       // sort result list
+       //      std::sort(resultList.begin(),resultList.end());
+       
+       
+       delete score_mtrx;
+}
+
+Graph makePairsGraph(const RNAProfileAliMapType &inputMapProfile, const DoubleScoreProfileAlgebraType *alg, const Matrix<double> *score_mtrx, double threshold)
+{
+       Graph graph;
+       RNAProfileAliMapType::const_iterator it,it2;
+       RNAProfileAlignment *ppfx=NULL,*ppfy=NULL;
+
+       graph = NewGraph(score_mtrx->xDim());
+
+       for(int i=0;i<score_mtrx->xDim();i++)
+       {
+               Xcoord(graph,i+1) = 0;
+               Ycoord(graph,i+1) = 0;
+       }
+
+       for(it=inputMapProfile.begin();it!=inputMapProfile.end();it++)
+       {
+               ppfx=it->second;
+               for(it2=inputMapProfile.begin();it2->first<it->first;it2++)
+               {
+                       double score;
+                       ppfy=it2->second;
+
+                       score=score_mtrx->getAt(it->first-1,it2->first-1);
+                       if(alg->choice(score,threshold) != threshold)  // is it better than the threshold ?
+                       {
+                               AddEdge (graph,it->first,it2->first,(int)(score*100.0));      
+                       }
+               }
+       }
+
+
+       WriteGraph (graph,"test.out");
+       return graph;
+}