1 ///////////////////////////////////////////////////////////////
4 // make SCS(Stem Candidate Sequence) from the profile
5 //////////////////////////////////////////////////////////////
8 #include "SafeVector.h"
9 #include "StemCandidate.hpp"
11 #include "MultiSequence.h"
12 #include "BPPMatrix.hpp"
22 using namespace::MXSCARNA;
27 #include "fold_vars.h"
29 #include "part_func.h"
31 #include "RNAstruct.h"
33 #include "stringdist.h"
34 #include "profiledist.h"
37 #include "dist_vars.h"
39 double Stacking_Energy[36] ={
40 -0.9,-2.1,-1.7,-0.5,-0.9,-1.0,
41 -1.8,-2.9,-2.0,-1.2,-1.7,-1.9,
42 -2.3,-3.4,-2.9,-1.4,-2.1,-2.1,
43 -1.1,-2.1,-1.9,-0.4,-1.0,1.5,
44 -1.1,-2.3,-1.8,-0.8,-0.9,-1.1,
45 -0.8,-1.4,-1.2,-0.2,-0.5,-0.4 };
47 static Trimat<float>* makeProfileBPPMatrix(MultiSequence *Sequences, SafeVector<BPPMatrix*> &BPPMatrices);
48 static int countSCS(MultiSequence *Sequences, Trimat<float>* consBppMat, int BandWidth);
49 static std::vector<StemCandidate>* makeProfileScs(std::vector<StemCandidate> *pscs, MultiSequence *Sequences, Trimat<float>* consBppMat, int BandWidth);
50 static void printScs(std::vector<StemCandidate> *pscs);
51 static std::vector<StemCandidate>* doubleScs(std::vector<StemCandidate> *pscs);
52 static std::vector<StemCandidate>* findRelations(std::vector<StemCandidate> *pscs);
53 static std::vector<StemCandidate>* findCorresponding(std::vector<StemCandidate>* pscs);
54 static std::vector<StemCandidate>* calculateStackingEnergy(std::vector<StemCandidate>* pscs);
56 //float alipf_fold(char **sequences, char *structure, pair_info **pi);
59 bool operator()(const StemCandidate &sc1, const StemCandidate &sc2) const {
60 if (sc1.GetPosition() > sc2.GetPosition()) return false;
61 else if (sc1.GetPosition() < sc2.GetPosition()) return true;
62 else if (sc1.GetDistance() > sc2.GetDistance()) return false;
68 vector<StemCandidate>*
69 seq2scs(MultiSequence *Sequences, SafeVector<BPPMatrix*> &BPPMatrices, int BandWidth)
72 Trimat<float> *consBppMat = makeProfileBPPMatrix(Sequences, BPPMatrices);
74 int numberScs = countSCS(Sequences, consBppMat, BandWidth);
76 std::vector<StemCandidate> *pscs = new std::vector<StemCandidate>(); // Profile Stem Candidate Sequence
77 // cout << "numberScs=" << numberScs << endl;
78 pscs->resize(numberScs+1);
80 pscs = makeProfileScs(pscs, Sequences, consBppMat, BandWidth);
82 pscs = doubleScs(pscs);
84 std::vector<StemCandidate>::iterator startIter = pscs->begin();
85 std::vector<StemCandidate>::iterator endIter = pscs->end();
87 std::sort(startIter, endIter, SortCmp());
90 pscs = findRelations(pscs);
92 pscs = findCorresponding(pscs);
94 pscs = calculateStackingEnergy(pscs);
104 static Trimat<float>*
105 makeProfileBPPMatrix(MultiSequence *Sequences, SafeVector<BPPMatrix*> &BPPMatrices)
107 int length = Sequences->GetSequence(0)->GetLength();
108 // float thr = BaseProbThreshold;
109 Trimat<float> *consBppMat = new Trimat<float>(length + 1);
110 fill(consBppMat->begin(), consBppMat->end(), 0);
113 // for(int i = 0; i <= length; i++)
114 // for(int j = i; j <= length; j++)
115 // cout << "i=" << i << " j=" << j << " " << consBppMat->ref(i,j) << endl;
116 // consBppMat->ref(i,j) = 0;
119 int number = Sequences->GetNumSequences();
120 // if( number == 1) {
121 for(int seqNum = 0; seqNum < number; seqNum++) {
122 SafeVector<int> *tmpMap = Sequences->GetSequence(seqNum)->GetMappingNumber();
123 int label = Sequences->GetSequence(seqNum)->GetLabel();
124 BPPMatrix *tmpBppMatrix = BPPMatrices[label];
126 for(int i = 1; i <= length ; i++) {
127 int originI = tmpMap->at(i);
128 for(int j = i + 3; j <= length; j++) {
129 int originJ = tmpMap->at(j);
130 if(originI != 0 && originJ != 0) {
131 float tmpProb = tmpBppMatrix->GetProb(originI, originJ);
133 // if(tmpProb >= thr) {
134 consBppMat->ref(i, j) += tmpProb;
135 // cout << i << " " << j << " " << consBppMat->ref(i,j) << endl;
142 /* compute the mean of base pairing probability */
143 for(int i = 1; i <= length; i++) {
144 for(int j = i + 3; j <= length; j++) {
145 consBppMat->ref(i,j) = consBppMat->ref(i,j)/(float)number;
146 //consBppMat->ref(i,j) = std::sqrt[number](consBppMat->ref(i,j));
147 // cout << i << " " << j << " " << consBppMat->ref(i,j) << endl;
155 Seqs = (char **) malloc(sizeof(char*) * number);
157 for(int i = 0; i < number; i++) {
158 Seqs[i] = (char *) malloc(sizeof(char) * (length + 1));
159 for(int j = 1; j <= length; j++) {
160 Seqs[i][j-1] = Sequences->GetSequence(i)->GetPosition(j);
162 Seqs[i][length] = '\0';
166 char *structure = NULL;
169 alipf_fold(Seqs, structure, &pi, number);
171 for(int iter = 0; iter < length; iter++) {
172 if(pi[iter].i == 0) break;
173 consBppMat->ref(pi[iter].i, pi[iter].j) = pi[iter].p;
176 for(int i = 0; i < number; i++) {
181 // free_alifold_arrays(void);
189 countSCS(MultiSequence *Sequences, Trimat<float> *consBppMat, int BandWidth)
192 int length = Sequences->GetSequence(0)->GetLength();
193 int Word_Length = scsLength;
200 for(k = 1; k <= length; k++) {
203 for(i = k, j = k; i >= 1 && j <= (k + BandWidth - 1) && j <= length; i--, j++) {
204 if(consBppMat->ref(i, j) >= BaseProbThreshold) {
207 else if(count >= Word_Length) {
208 for(s = 0; s <= count - Word_Length; s++)
216 if(consBppMat->ref(i, j) >= BaseProbThreshold && count >= Word_Length && (i == 1 || j == length || j == (k + BandWidth - 1))) {
217 for(s = 0; s <= count - Word_Length; s++)
225 for(i = k, j = k + 1; i >= 1 && j <= (k + BandWidth - 1) && j <= length; i--, j++) {
226 if(consBppMat->ref(i, j) >= BaseProbThreshold) {
229 else if(count >= Word_Length) {
230 for(s = 0; s <= count - Word_Length; s++) sum++;
238 if(consBppMat->ref(i, j) >= BaseProbThreshold && count >= Word_Length && (i == 1 || j == length || j == (k + BandWidth - 1))) {
239 for(s = 0; s <= count - Word_Length; s++) sum++;
249 static std::vector<StemCandidate>*
250 makeProfileScs(std::vector<StemCandidate> *pscs, MultiSequence *Sequences, Trimat<float>* consBppMat, int BandWidth)
253 int length = Sequences->GetSequence(0)->GetLength();
254 int Word_Length = scsLength;
255 float Thr = BaseProbThreshold;
256 int i, j, k, s, t, m, n, l;
261 for(k = 1; k <= length; k++) {
263 for(i = k, j = k; i >= 1 && j <= (k + BandWidth - 1) && j <= length; i--, j++) {
264 if(consBppMat->ref(i,j) >= Thr) {
267 else if(count >= Word_Length) {
268 for(s = 0; s <= count- Word_Length; s++) {
270 pscs->at(sum).SetLength(Word_Length);
271 pscs->at(sum).SetPosition(i+1+s);
272 pscs->at(sum).SetRvposition(j-count+(count-Word_Length-s));
273 pscs->at(sum).SetDistance((j-count+count-Word_Length-s) - (i+1+s+Word_Length));
274 pscs->at(sum).SetNumSeq(Sequences->GetNumSequences());
275 pscs->at(sum).SetNumSubstr(Sequences->GetNumSequences());
276 pscs->at(sum).SetNumRvstr(Sequences->GetNumSequences());
277 for(m = i + 1 + s, n = j - count + (count-Word_Length-s), l = j - 1 - s, t = 0; n < j-s; m++, n++, l--, t++) {
278 for(int num = 0; num < Sequences->GetNumSequences(); num++) {
279 Sequence *seq = Sequences->GetSequence(num);
280 // cout << num << "; " << m << ":" << seq->GetPosition(m) << " " << n << ":" << seq->GetPosition(n) << endl;
281 pscs->at(sum).AddSubstr(num, seq->GetPosition(m));
282 pscs->at(sum).AddRvstr(num, seq->GetPosition(n));
284 // assert(pr[iindx[m]-l] > Thr);
285 // cout << "prob=" << consBppMat->ref(m,l) << endl;
286 pscs->at(sum).AddScore(consBppMat->ref(m,l));
287 pscs->at(sum).AddBaseScore(consBppMat->ref(m, l));
289 for(int num = 0; num < Sequences->GetNumSequences(); num++) {
290 pscs->at(sum).AddSubstr(num, '\0');
291 pscs->at(sum).AddRvstr(num, '\0');
299 if(consBppMat->ref(i,j) >= Thr && count >= Word_Length && (i == 1 || j == length || j == (k + BandWidth - 1))) {
300 for(s = 0; s <= count- Word_Length; s++) {
302 pscs->at(sum).SetLength(Word_Length);
303 pscs->at(sum).SetPosition(i+s);
304 pscs->at(sum).SetRvposition(j-count+1+(count-Word_Length-s));
305 pscs->at(sum).SetDistance((j-count+1+count-Word_Length-s) - (i+s+Word_Length));
306 pscs->at(sum).SetNumSeq(Sequences->GetNumSequences());
307 pscs->at(sum).SetNumSubstr(Sequences->GetNumSequences());
308 pscs->at(sum).SetNumRvstr(Sequences->GetNumSequences());
309 for(m = i + s, n = j - count + 1 + (count-Word_Length-s), l = j - s, t = 0; n <= j-s; m++, n++, l--, t++) {
310 for(int num = 0; num < Sequences->GetNumSequences(); num++) {
311 Sequence *seq = Sequences->GetSequence(num);
312 pscs->at(sum).AddSubstr(num, seq->GetPosition(m));
313 pscs->at(sum).AddRvstr(num, seq->GetPosition(n));
316 pscs->at(sum).AddScore(consBppMat->ref(m,l));
317 pscs->at(sum).AddBaseScore(consBppMat->ref(m, l));
319 for(int num = 0; num < Sequences->GetNumSequences(); num++) {
320 pscs->at(sum).AddSubstr(num, '\0');
321 pscs->at(sum).AddRvstr(num, '\0');
328 for(i = k, j = k + 1; i >= 1 && j <= (k + BandWidth - 1) && j <= length; i--, j++) {
329 if(consBppMat->ref(i,j) >= Thr) {
332 else if(count >= Word_Length) {
333 for(s = 0; s <= count- Word_Length; s++) {
335 pscs->at(sum).SetLength(Word_Length);
336 pscs->at(sum).SetPosition(i+1+s);
337 pscs->at(sum).SetRvposition( j-count+(count-Word_Length-s));
338 pscs->at(sum).SetDistance((j-count+count-Word_Length-s) - (i+1+s+Word_Length));
339 pscs->at(sum).SetNumSeq(Sequences->GetNumSequences());
340 pscs->at(sum).SetNumSubstr(Sequences->GetNumSequences());
341 pscs->at(sum).SetNumRvstr(Sequences->GetNumSequences());
342 for(m = i + 1 + s, n = j - count + (count-Word_Length-s), l = j - 1 - s, t = 0; n < j-s; m++, n++, l--, t++) {
343 for(int num = 0; num < Sequences->GetNumSequences(); num++) {
344 Sequence *seq = Sequences->GetSequence(num);
345 pscs->at(sum).AddSubstr(num, seq->GetPosition(m));
346 pscs->at(sum).AddRvstr(num, seq->GetPosition(n));
349 pscs->at(sum).AddScore(consBppMat->ref(m,l));
350 pscs->at(sum).AddBaseScore(consBppMat->ref(m, l));
352 for(int num = 0; num < Sequences->GetNumSequences(); num++) {
353 pscs->at(sum).AddSubstr(num, '\0');
354 pscs->at(sum).AddRvstr(num, '\0');
362 if(consBppMat->ref(i,j) >= Thr && count >= Word_Length && (i == 1 || j == length || j == (k + BandWidth - 1))) {
363 for(s = 0; s <= count - Word_Length; s++) {
365 pscs->at(sum).SetLength(Word_Length);
366 pscs->at(sum).SetPosition(i+s);
367 pscs->at(sum).SetRvposition(j-count+1+(count-Word_Length-s));
368 pscs->at(sum).SetDistance((j-count+1+count-Word_Length-s) - (i+s+Word_Length));
369 // pscs->at(sum).SetDistance((j-count+count-Word_Length-s) - (i+1+s+Word_Length));
370 pscs->at(sum).SetNumSeq(Sequences->GetNumSequences());
371 pscs->at(sum).SetNumSubstr(Sequences->GetNumSequences());
372 pscs->at(sum).SetNumRvstr(Sequences->GetNumSequences());
373 for(m = i + s, n = j - count + 1 + (count-Word_Length-s), l = j - s, t=0; n <= j-s; m++, n++, l--, t++) {
374 for(int num = 0; num < Sequences->GetNumSequences(); num++) {
375 Sequence *seq = Sequences->GetSequence(num);
376 pscs->at(sum).AddSubstr(num, seq->GetPosition(m));
377 pscs->at(sum).AddRvstr(num, seq->GetPosition(n));
380 pscs->at(sum).AddScore(consBppMat->ref(m,l));
381 pscs->at(sum).AddBaseScore(consBppMat->ref(m, l));
383 for(int num = 0; num < Sequences->GetNumSequences(); num++) {
384 pscs->at(sum).AddSubstr(num, '\0');
385 pscs->at(sum).AddRvstr(num, '\0');
396 static std::vector<StemCandidate>*
397 doubleScs(std::vector<StemCandidate> *pscs)
399 int num = pscs->size()/2;
401 for(int i = 1; i <= num; i++) {
402 int latter = num + i;
403 //cout << i << " " << latter << endl;
404 StemCandidate &tmpScs = pscs->at(i);
405 pscs->at(latter).SetLength(tmpScs.GetLength());
406 pscs->at(latter).SetPosition(tmpScs.GetRvposition());
407 pscs->at(latter).SetRvposition(tmpScs.GetPosition());
408 pscs->at(latter).SetDistance(-tmpScs.GetDistance());
409 pscs->at(latter).SetNumSeq(tmpScs.GetNumSeq());
410 pscs->at(latter).SetNumSubstr(tmpScs.GetNumSeq());
411 pscs->at(latter).SetNumRvstr(tmpScs.GetNumSeq());
413 pscs->at(latter).SetScore(tmpScs.GetScore());
414 for(int num = 0; num < tmpScs.GetNumSeq(); num++) {
415 string tmpSubstr = tmpScs.GetSubstr(num);
416 string tmpRvstr = tmpScs.GetRvstr(num);
418 for(int k = 0; k < tmpScs.GetLength(); k++) {
419 pscs->at(latter).AddSubstr(num, tmpSubstr[k]);
420 pscs->at(latter).AddRvstr(num, tmpRvstr[k]);
423 for(int k = 0; k < tmpScs.GetLength(); k++) {
424 pscs->at(latter).AddBaseScore(tmpScs.GetBaseScore(k));
432 printScs(std::vector<StemCandidate> *pscs)
434 int num = pscs->size();
435 // std::cout << "size = " << num << endl;
436 for(int i = 1; i < num; i++) {
437 StemCandidate &sc = pscs->at(i);
439 std::cout << i << "\t" << sc.GetLength() << "\t" << sc.GetPosition() << "\t" <<
440 sc.GetRvposition() << "\t" << sc.GetDistance() << "\t" << sc.GetNumSeq() <<
441 "\t" << sc.GetScore() << "\t" << sc.GetContPos() << "\t" << sc.GetBeforePos() <<
442 "\t" << sc.GetRvscnumber() << "\t" << sc.GetStacking() << "\t" << sc.GetStemStacking() <<
443 "\t" << sc.GetBaseScore(0) << "\t" << sc.GetBaseScore(1) << endl;
444 cout << "substr:" << endl;
445 for(int k = 0; k < sc.GetNumSeq(); k++) {
446 cout << k << "\t" << sc.GetSubstr(k) << "\t" << sc.GetRvstr(k) << "\t" << endl;
453 static std::vector<StemCandidate>*
454 findRelations(std::vector<StemCandidate> *pscs)
456 int num = pscs->size();
458 for(int i = 1; i < num; i++) {
460 StemCandidate &sc = pscs->at(i);
462 while(sc.GetPosition() == pscs->at(pt).GetPosition()) { pt--; }
464 while((sc.GetPosition() == pscs->at(pt).GetPosition() + 1) && (pt > 0)) {
465 if(sc.GetRvposition() == pscs->at(pt).GetRvposition() - 1) {
471 while((sc.GetPosition() < pscs->at(pt).GetPosition() + pscs->at(pt).GetLength())&&(pt > 0)) { pt--; }
478 static std::vector<StemCandidate>*
479 findCorresponding(std::vector<StemCandidate>* pscs)
481 int num = pscs->size();
483 for(int i = 1; i < num; i++) { pscs->at(i).SetRvscnumber(0); }
485 for(int i = 1; i < num; i++) {
486 if(pscs->at(i).GetDistance() > 0) {
487 for(int j = i + 1; j < num; j++) {
488 if ( (pscs->at(j).GetPosition() == pscs->at(i).GetRvposition())
489 && (pscs->at(i).GetPosition() == pscs->at(j).GetRvposition()) ) {
490 pscs->at(i).SetRvscnumber(j);
491 pscs->at(j).SetRvscnumber(i);
496 if(pscs->at(i).GetRvscnumber() == 0) {
497 std::cerr << "error in findCorresponding" << " i=" << i << endl;
505 static std::vector<StemCandidate>*
506 calculateStackingEnergy(std::vector<StemCandidate>* pscs)
508 int num = pscs->size();
509 int wordLength = scsLength;
511 for(int i = 1; i < num; i++) {
512 StemCandidate &scI = pscs->at(i);
513 int j = pscs->at(i).GetContPos();
517 StemCandidate &scJ = pscs->at(j);
519 int profNum = scJ.GetNumSeq();
520 for(int k = 0; k < profNum; k++) {
521 string substr = scJ.GetSubstr(k);
522 string rvstr = scJ.GetRvstr(k);
524 switch(substr[wordLength - 1]) {
525 case 'A': if( rvstr[0]=='U' ) {index += 0;}
526 else{ index = -1000; }
528 case 'C': if( rvstr[0]=='G' ) {index += 6;}
529 else{ index = -1000; }
531 case 'G': if( rvstr[0]=='C'){index += 12;}
532 else if(rvstr[0]=='U'){index += 18;}
533 else{ index = -1000; }
535 case 'U': if( rvstr[0]=='A'){index += 24;}
536 else if(rvstr[0]=='G'){index += 30;}
537 else{ index = - 1000; }
540 substr = scI.GetSubstr(k);
541 rvstr = scI.GetRvstr(k);
542 switch(substr[wordLength - 1]){
543 case 'A': if( rvstr[0]=='U'){index += 0;}
544 else{ index = -1000; }
546 case 'C': if( rvstr[0]=='G'){index += 1;}
547 else{ index = -1000; }
549 case 'G': if( rvstr[0]=='C'){index += 2;}
550 else if(rvstr[0]=='U'){index += 3;}
551 else{ index = -1000; }
553 case 'U': if( rvstr[0]=='A'){index += 4;}
554 else if(rvstr[0]=='G'){index += 5;}
555 else{ index = -1000; }
559 stacking += Stacking_Energy[index];
562 scI.SetStacking(stacking/(float)profNum);
565 scI.SetStacking(1000);
569 for(int i = 1; i < num; i++) {
570 StemCandidate &sc = pscs->at(i);
571 float stemStacking = 0;
572 int profNum = sc.GetNumSeq();
573 for(int k = 0; k < profNum; k++) {
574 string substr = sc.GetSubstr(k);
575 string rvstr = sc.GetRvstr(k);
576 for(int j = 0; j < wordLength-1; j++) {
580 case 'A': if( rvstr[wordLength-1-j]=='U'){index += 0;}
581 else{ index = -1000; }
583 case 'C': if( rvstr[wordLength-1-j]=='G'){index += 6;}
584 else{ index = -1000; }
586 case 'G': if( rvstr[wordLength-1-j]=='C'){index += 12;}
587 else if(rvstr[wordLength-1-j]=='U'){index += 18;}
588 else{ index = -1000; }
590 case 'U': if( rvstr[wordLength-1-j]=='A'){index += 24;}
591 else if(rvstr[wordLength-1-j]=='G'){index += 30;}
592 else{ index = -1000; }
596 case 'A': if( rvstr[wordLength-1-(j+1)]=='U'){index += 0;}
597 else{ index = -1000; }
599 case 'C': if( rvstr[wordLength-1-(j+1)]=='G'){index += 1;}
600 else{ index = -1000; }
602 case 'G': if( rvstr[wordLength-1-(j+1)]=='C'){index += 2;}
603 else if(rvstr[wordLength-1-(j+1)]=='U'){index += 3;}
604 else{ index = -1000; }
606 case 'U': if( rvstr[wordLength-1-(j+1)]=='A'){index += 4;}
607 else if(rvstr[wordLength-1-(j+1)]=='G'){index += 5;}
608 else{ index = -1000; }
612 stemStacking += Stacking_Energy[index];
615 sc.SetStemStacking(stemStacking/(float)profNum);