1 /* Last changed Time-stamp: <2007-08-26 11:59:45 ivo> */
3 compute the duplex structure of two RNA strands,
4 allowing only inter-strand base pairs.
5 see cofold() for computing hybrid structures without
19 #include "energy_par.h"
20 #include "fold_vars.h"
26 #include "loop_energies.h"
34 static char rcsid[] UNUSED = "$Id: duplex.c,v 1.8 2007/08/26 10:08:44 ivo Exp $";
36 #define STACK_BULGE1 1 /* stacking energies for bulges of size 1 */
37 #define NEW_NINIO 1 /* new asymetry penalty */
38 #define MAXSECTORS 500 /* dimension for a backtrack array */
39 #define LOCALITY 0. /* locality parameter for base-pairs */
41 #define MINPSCORE -2 * UNIT
42 #define NONE -10000 /* score for forbidden pairs */
47 #################################
49 #################################
52 #################################
54 #################################
56 PRIVATE paramT *P = NULL;
57 PRIVATE int **c = NULL; /* energy array, given that i-j pair */
58 PRIVATE short *S1 = NULL, *SS1 = NULL, *S2 = NULL, *SS2 = NULL;
59 PRIVATE int n1,n2; /* sequence lengths */
63 /* NOTE: all variables are assumed to be uninitialized if they are declared as threadprivate
65 #pragma omp threadprivate(P, c, S1, SS1, S2, SS2, n1, n2)
70 #################################
71 # PRIVATE FUNCTION DECLARATIONS #
72 #################################
74 PRIVATE duplexT duplexfold_cu(const char *s1, const char *s2, int clean_up);
75 PRIVATE duplexT aliduplexfold_cu(const char *s1[], const char *s2[], int clean_up);
76 PRIVATE char *backtrack(int i, int j);
77 PRIVATE char *alibacktrack(int i, int j, const short **S1, const short **S2);
78 PRIVATE int compare(const void *sub1, const void *sub2);
79 PRIVATE int covscore(const int *types, int n_seq);
82 #################################
83 # BEGIN OF FUNCTION DEFINITIONS #
84 #################################
87 PUBLIC duplexT duplexfold(const char *s1, const char *s2){
88 return duplexfold_cu(s1, s2, 1);
91 PRIVATE duplexT duplexfold_cu(const char *s1, const char *s2, int clean_up){
92 int i, j, l1, Emin=INF, i_min=0, j_min=0;
96 n1 = (int) strlen(s1);
97 n2 = (int) strlen(s2);
99 if ((!P) || (fabs(P->temperature - temperature)>1e-6)) {
100 if(P) free(P); P = scale_parameters();
104 c = (int **) space(sizeof(int *) * (n1+1));
105 for (i=1; i<=n1; i++) c[i] = (int *) space(sizeof(int) * (n2+1));
107 S1 = encode_sequence(s1, 0);
108 S2 = encode_sequence(s2, 0);
109 SS1 = encode_sequence(s1, 1);
110 SS2 = encode_sequence(s2, 1);
112 for (i=1; i<=n1; i++) {
113 for (j=n2; j>0; j--) {
114 int type, type2, E, k,l;
115 type = pair[S1[i]][S2[j]];
116 c[i][j] = type ? P->DuplexInit : INF;
118 c[i][j] += E_ExtLoop(type, (i>1) ? SS1[i-1] : -1, (j<n2) ? SS2[j+1] : -1, P);
119 for (k=i-1; k>0 && k>i-MAXLOOP-2; k--) {
120 for (l=j+1; l<=n2; l++) {
121 if (i-k+l-j-2>MAXLOOP) break;
122 type2 = pair[S1[k]][S2[l]];
123 if (!type2) continue;
124 E = E_IntLoop(i-k-1, l-j-1, type2, rtype[type],
125 SS1[k+1], SS2[l-1], SS1[i-1], SS2[j+1], P);
126 c[i][j] = MIN2(c[i][j], c[k][l]+E);
130 E += E_ExtLoop(rtype[type], (j > 1) ? SS2[j-1] : -1, (i<n1) ? SS1[i+1] : -1, P);
132 Emin=E; i_min=i; j_min=j;
137 struc = backtrack(i_min, j_min);
138 if (i_min<n1) i_min++;
139 if (j_min>1 ) j_min--;
140 l1 = strchr(struc, '&')-struc;
142 printf("%s %3d,%-3d : %3d,%-3d (%5.2f)\n", struc, i_min+1-l1, i_min,
143 j_min, j_min+strlen(struc)-l1-2, Emin*0.01);
147 mfe.energy = (float) Emin/100.;
148 mfe.structure = struc;
150 for (i=1; i<=n1; i++) free(c[i]);
160 PUBLIC duplexT *duplex_subopt(const char *s1, const char *s2, int delta, int w) {
161 int i,j, n1, n2, thresh, E, n_subopt=0, n_max;
167 subopt = (duplexT *) space(n_max*sizeof(duplexT));
168 mfe = duplexfold_cu(s1, s2, 0);
171 thresh = (int) mfe.energy*100+0.1 + delta;
172 n1 = strlen(s1); n2=strlen(s2);
173 for (i=n1; i>0; i--) {
174 for (j=1; j<=n2; j++) {
176 type = pair[S2[j]][S1[i]];
179 Ed += E_ExtLoop(type, (j>1) ? SS2[j-1] : -1, (i<n1) ? SS1[i+1] : -1, P);
180 if (Ed>thresh) continue;
181 /* too keep output small, remove hits that are dominated by a
182 better one close (w) by. For simplicity we do test without
183 adding dangles, which is slightly inaccurate.
185 for (ii=MAX2(i-w,1); (ii<=MIN2(i+w,n1)) && type; ii++) {
186 for (jj=MAX2(j-w,1); jj<=MIN2(j+w,n2); jj++)
187 if (c[ii][jj]<E) {type=0; break;}
191 struc = backtrack(i,j);
192 fprintf(stderr, "%d %d %d\n", i,j,E);
193 if (n_subopt+1>=n_max) {
195 subopt = (duplexT *) xrealloc(subopt, n_max*sizeof(duplexT));
197 subopt[n_subopt].i = MIN2(i+1,n1);
198 subopt[n_subopt].j = MAX2(j-1,1);
199 subopt[n_subopt].energy = Ed * 0.01;
200 subopt[n_subopt++].structure = struc;
203 /* free all static globals */
204 for (i=1; i<=n1; i++) free(c[i]);
206 free(S1); free(S2); free(SS1); free(SS2);
208 if (subopt_sorted) qsort(subopt, n_subopt, sizeof(duplexT), compare);
209 subopt[n_subopt].i =0;
210 subopt[n_subopt].j =0;
211 subopt[n_subopt].structure = NULL;
215 PRIVATE char *backtrack(int i, int j) {
216 /* backtrack structure going backwards from i, and forwards from j
217 return structure in bracket notation with & as separator */
218 int k, l, type, type2, E, traced, i0, j0;
219 char *st1, *st2, *struc;
221 st1 = (char *) space(sizeof(char)*(n1+1));
222 st2 = (char *) space(sizeof(char)*(n2+1));
224 i0=MIN2(i+1,n1); j0=MAX2(j-1,1);
226 while (i>0 && j<=n2) {
227 E = c[i][j]; traced=0;
230 type = pair[S1[i]][S2[j]];
231 if (!type) nrerror("backtrack failed in fold duplex");
232 for (k=i-1; k>0 && k>i-MAXLOOP-2; k--) {
233 for (l=j+1; l<=n2; l++) {
235 if (i-k+l-j-2>MAXLOOP) break;
236 type2 = pair[S1[k]][S2[l]];
237 if (!type2) continue;
238 LE = E_IntLoop(i-k-1, l-j-1, type2, rtype[type],
239 SS1[k+1], SS2[l-1], SS1[i-1], SS2[j+1], P);
240 if (E == c[k][l]+LE) {
249 E -= E_ExtLoop(type, (i>1) ? SS1[i-1] : -1, (j<n2) ? SS2[j+1] : -1, P);
250 if (E != P->DuplexInit) {
251 nrerror("backtrack failed in fold duplex");
258 struc = (char *) space(i0-i+1+j-j0+1+2);
259 for (k=MAX2(i,1); k<=i0; k++) if (!st1[k-1]) st1[k-1] = '.';
260 for (k=j0; k<=j; k++) if (!st2[k-1]) st2[k-1] = '.';
261 strcpy(struc, st1+MAX2(i-1,0)); strcat(struc, "&");
262 strcat(struc, st2+j0-1);
264 /* printf("%s %3d,%-3d : %3d,%-3d\n", struc, i,i0,j0,j); */
265 free(st1); free(st2);
270 /*------------------------------------------------------------------------*/
272 PRIVATE int compare(const void *sub1, const void *sub2) {
274 if (((duplexT *) sub1)->energy > ((duplexT *) sub2)->energy)
276 if (((duplexT *) sub1)->energy < ((duplexT *) sub2)->energy)
278 d = ((duplexT *) sub1)->i - ((duplexT *) sub2)->i;
280 return ((duplexT *) sub1)->j - ((duplexT *) sub2)->j;
283 /*---------------------------------------------------------------------------*/
285 PUBLIC duplexT aliduplexfold(const char *s1[], const char *s2[]){
286 return aliduplexfold_cu(s1, s2, 1);
289 PRIVATE duplexT aliduplexfold_cu(const char *s1[], const char *s2[], int clean_up) {
290 int i, j, s, n_seq, l1, Emin=INF, i_min=0, j_min=0;
295 n1 = (int) strlen(s1[0]);
296 n2 = (int) strlen(s2[0]);
298 for (s=0; s1[s]!=NULL; s++);
300 for (s=0; s2[s]!=NULL; s++);
301 if (n_seq != s) nrerror("unequal number of sequences in aliduplexfold()\n");
303 if ((!P) || (fabs(P->temperature - temperature)>1e-6)) {
304 if(P) free(P); P = scale_parameters();
308 c = (int **) space(sizeof(int *) * (n1+1));
309 for (i=1; i<=n1; i++) c[i] = (int *) space(sizeof(int) * (n2+1));
311 S1 = (short **) space((n_seq+1)*sizeof(short *));
312 S2 = (short **) space((n_seq+1)*sizeof(short *));
313 for (s=0; s<n_seq; s++) {
314 if (strlen(s1[s]) != n1) nrerror("uneqal seqence lengths");
315 if (strlen(s2[s]) != n2) nrerror("uneqal seqence lengths");
316 S1[s] = encode_sequence(s1[s], 0);
317 S2[s] = encode_sequence(s2[s], 0);
319 type = (int *) space(n_seq*sizeof(int));
321 for (i=1; i<=n1; i++) {
322 for (j=n2; j>0; j--) {
324 for (s=0; s<n_seq; s++) {
325 type[s] = pair[S1[s][i]][S2[s][j]];
327 psc = covscore(type, n_seq);
328 for (s=0; s<n_seq; s++) if (type[s]==0) type[s]=7;
329 c[i][j] = (psc>=MINPSCORE) ? (n_seq*P->DuplexInit) : INF;
330 if (psc<MINPSCORE) continue;
331 for(s=0; s<n_seq;s++){
332 c[i][j] += E_ExtLoop(type[s], (i>1) ? S1[s][i-1] : -1, (j<n2) ? S2[s][j+1] : -1, P);
334 for (k=i-1; k>0 && k>i-MAXLOOP-2; k--) {
335 for (l=j+1; l<=n2; l++) {
337 if (i-k+l-j-2>MAXLOOP) break;
338 if (c[k][l]>INF/2) continue;
339 for (E=s=0; s<n_seq; s++) {
340 type2 = pair[S1[s][k]][S2[s][l]];
341 if (type2==0) type2=7;
342 E += E_IntLoop(i-k-1, l-j-1, type2, rtype[type[s]],
343 S1[s][k+1], S2[s][l-1], S1[s][i-1], S2[s][j+1], P);
345 c[i][j] = MIN2(c[i][j], c[k][l]+E);
350 for (s=0; s<n_seq; s++) {
351 E += E_ExtLoop(rtype[type[s]], (j>1) ? S2[s][j-1] : -1, (i<n1) ? S1[s][i+1] : -1, P);
354 Emin=E; i_min=i; j_min=j;
359 struc = alibacktrack(i_min, j_min, (const short **)S1,(const short **)S2);
360 if (i_min<n1) i_min++;
361 if (j_min>1 ) j_min--;
362 l1 = strchr(struc, '&')-struc;
364 printf("%s %3d,%-3d : %3d,%-3d (%5.2f)\n", struc, i_min+1-l1, i_min,
365 j_min, j_min+strlen(struc)-l1-2, Emin*0.01);
369 mfe.energy = (float) (Emin/(100.*n_seq));
370 mfe.structure = struc;
372 for (i=1; i<=n1; i++) free(c[i]);
375 for (s=0; s<n_seq; s++) {
376 free(S1[s]); free(S2[s]);
384 PUBLIC duplexT *aliduplex_subopt(const char *s1[], const char *s2[], int delta, int w) {
385 int i,j, n1, n2, thresh, E, n_subopt=0, n_max, s, n_seq, *type;
392 subopt = (duplexT *) space(n_max*sizeof(duplexT));
393 mfe = aliduplexfold_cu(s1, s2, 0);
396 for (s=0; s1[s]!=NULL; s++);
399 thresh = (int) ((mfe.energy*100. + delta)*n_seq +0.1);
400 n1 = strlen(s1[0]); n2=strlen(s2[0]);
401 S1 = (short **) space((n_seq+1)*sizeof(short *));
402 S2 = (short **) space((n_seq+1)*sizeof(short *));
403 for (s=0; s<n_seq; s++) {
404 if (strlen(s1[s]) != n1) nrerror("uneqal seqence lengths");
405 if (strlen(s2[s]) != n2) nrerror("uneqal seqence lengths");
406 S1[s] = encode_sequence(s1[s], 0);
407 S2[s] = encode_sequence(s2[s], 0);
409 type = (int *) space(n_seq*sizeof(int));
411 for (i=n1; i>0; i--) {
412 for (j=1; j<=n2; j++) {
413 int ii, jj, skip, Ed, psc;
415 for (s=0; s<n_seq; s++) {
416 type[s] = pair[S2[s][j]][S1[s][i]];
418 psc = covscore(type, n_seq);
419 for (s=0; s<n_seq; s++) if (type[s]==0) type[s]=7;
420 if (psc<MINPSCORE) continue;
422 for (s=0; s<n_seq; s++) {
423 Ed += E_ExtLoop(type[s], (j>1) ? S2[s][j-1] : -1, (i<n1) ? S1[s][i+1] : -1, P);
425 if (Ed>thresh) continue;
426 /* too keep output small, skip hits that are dominated by a
427 better one close (w) by. For simplicity we don't take dangels
428 into account here, thus the heuristic is somewhat inaccurate.
430 for (skip=0, ii=MAX2(i-w,1); (ii<=MIN2(i+w,n1)) && type; ii++) {
431 for (jj=MAX2(j-w,1); jj<=MIN2(j+w,n2); jj++)
432 if (c[ii][jj]<E) {skip=1; break;}
435 struc = alibacktrack(i,j,(const short **)S1, (const short **)S2);
436 fprintf(stderr, "%d %d %d\n", i,j,E);
437 if (n_subopt+1>=n_max) {
439 subopt = (duplexT *) xrealloc(subopt, n_max*sizeof(duplexT));
441 subopt[n_subopt].i = MIN2(i+1,n1);
442 subopt[n_subopt].j = MAX2(j-1,1);
443 subopt[n_subopt].energy = Ed * 0.01/n_seq;
444 subopt[n_subopt++].structure = struc;
448 for (i=1; i<=n1; i++) free(c[i]);
450 for (s=0; s<n_seq; s++) {
451 free(S1[s]); free(S2[s]);
453 free(S1); free(S2); free(type);
455 if (subopt_sorted) qsort(subopt, n_subopt, sizeof(duplexT), compare);
456 subopt[n_subopt].i =0;
457 subopt[n_subopt].j =0;
458 subopt[n_subopt].structure = NULL;
462 PRIVATE char *alibacktrack(int i, int j, const short **S1, const short **S2) {
463 /* backtrack structure going backwards from i, and forwards from j
464 return structure in bracket notation with & as separator */
465 int k, l, *type, type2, E, traced, i0, j0, s, n_seq;
466 char *st1, *st2, *struc;
471 for (s=0; S1[s]!=NULL; s++);
473 for (s=0; S2[s]!=NULL; s++);
474 if (n_seq != s) nrerror("unequal number of sequences in alibacktrack()\n");
476 st1 = (char *) space(sizeof(char)*(n1+1));
477 st2 = (char *) space(sizeof(char)*(n2+1));
478 type = (int *) space(n_seq*sizeof(int));
480 i0=MIN2(i+1,n1); j0=MAX2(j-1,1);
482 while (i>0 && j<=n2) {
484 E = c[i][j]; traced=0;
487 for (s=0; s<n_seq; s++) {
488 type[s] = pair[S1[s][i]][S2[s][j]];
490 psc = covscore(type, n_seq);
491 for (s=0; s<n_seq; s++) if (type[s]==0) type[s] = 7;
493 for (k=i-1; k>0 && k>i-MAXLOOP-2; k--) {
494 for (l=j+1; l<=n2; l++) {
496 if (i-k+l-j-2>MAXLOOP) break;
497 if (c[k][l]>INF/2) continue;
498 for (s=LE=0; s<n_seq; s++) {
499 type2 = pair[S1[s][k]][S2[s][l]];
500 if (type2==0) type2=7;
501 LE += E_IntLoop(i-k-1, l-j-1, type2, rtype[type[s]],
502 S1[s][k+1], S2[s][l-1], S1[s][i-1], S2[s][j+1], P);
504 if (E == c[k][l]+LE) {
513 for (s=0; s<n_seq; s++) {
514 E -= E_ExtLoop(type[s], (i>1) ? S1[s][i-1] : -1, (j<n2) ? S2[s][j+1] : -1, P);
516 if (E != n_seq*P->DuplexInit) {
517 nrerror("backtrack failed in aliduplex");
524 struc = (char *) space(i0-i+1+j-j0+1+2);
525 for (k=MAX2(i,1); k<=i0; k++) if (!st1[k-1]) st1[k-1] = '.';
526 for (k=j0; k<=j; k++) if (!st2[k-1]) st2[k-1] = '.';
527 strcpy(struc, st1+MAX2(i-1,0)); strcat(struc, "&");
528 strcat(struc, st2+j0-1);
530 /* printf("%s %3d,%-3d : %3d,%-3d\n", struc, i,i0,j0,j); */
531 free(st1); free(st2); free(type);
537 PRIVATE int covscore(const int *types, int n_seq) {
538 /* calculate co-variance bonus for a pair depending on */
539 /* compensatory/consistent mutations and incompatible seqs */
540 /* should be 0 for conserved pairs, >0 for good pairs */
541 int k,l,s,score, pscore;
542 int dm[7][7]={{0,0,0,0,0,0,0}, /* hamming distance between pairs */
543 {0,0,2,2,1,2,2} /* CG */,
544 {0,2,0,1,2,2,2} /* GC */,
545 {0,2,1,0,2,1,2} /* GU */,
546 {0,1,2,2,0,2,1} /* UG */,
547 {0,2,2,1,2,0,2} /* AU */,
548 {0,2,2,2,1,2,0} /* UA */};
550 int pfreq[8]={0,0,0,0,0,0,0,0};
551 for (s=0; s<n_seq; s++)
554 if (pfreq[0]*2>n_seq) return NONE;
555 for (k=1,score=0; k<=6; k++) /* ignore pairtype 7 (gap-gap) */
556 for (l=k+1; l<=6; l++)
557 /* scores for replacements between pairtypes */
558 /* consistent or compensatory mutations score 1 or 2 */
559 score += pfreq[k]*pfreq[l]*dm[k][l];
561 /* counter examples score -1, gap-gap scores -0.25 */
563 ((UNIT*score)/n_seq - nc_fact*UNIT*(pfreq[0] + pfreq[7]*0.25));