14 #include "../config.h"
15 #include "fold_vars.h"
16 #include "data_structures.h"
17 #include "energy_const.h"
24 # define INLINE inline
31 * Use this macro to loop over each G-quadruplex
32 * delimited by a and b within the subsequence [c,d]
34 #define FOR_EACH_GQUAD(a, b, c, d) \
35 for((a) = (d) - VRNA_GQUAD_MIN_BOX_SIZE + 1; (a) >= (c); (a)--)\
36 for((b) = (a) + VRNA_GQUAD_MIN_BOX_SIZE - 1;\
37 (b) <= MIN2((d), (a) + VRNA_GQUAD_MAX_BOX_SIZE - 1);\
41 * This macro does almost the same as FOR_EACH_GQUAD() but keeps
42 * the 5' delimiter fixed. 'b' is the 3' delimiter of the gquad,
43 * for gquads within subsequence [a,c] that have 5' delimiter 'a'
45 #define FOR_EACH_GQUAD_AT(a, b, c) \
46 for((b) = (a) + VRNA_GQUAD_MIN_BOX_SIZE - 1;\
47 (b) <= MIN2((c), (a) + VRNA_GQUAD_MAX_BOX_SIZE - 1);\
52 #################################
53 # PRIVATE FUNCTION DECLARATIONS #
54 #################################
59 get_g_islands(short *S);
63 get_g_islands_sub(short *S, int i, int j);
67 * If you don't know how to use this function, DONT'T USE IT!
69 * The function pointer this function takes as argument is
70 * used for individual calculations with each g-quadruplex
72 * The function it points to always receives as first 3 arguments
73 * position i, the stack size L and an array l[3] containing the
74 * individual linker sizes.
75 * The remaining 4 (void *) pointers of the callback function receive
76 * the parameters 'data', 'P', 'aux1' and 'aux2' and thus may be
77 * used to pass whatever data you like to.
78 * As the names of those parameters suggest the convention is that
79 * 'data' should be used as a pointer where data is stored into,
80 * e.g the MFE or PF and the 'P' parameter should actually be a
81 * 'paramT *' or 'pf_paramT *' type.
82 * However, what you actually pass obviously depends on the
83 * function the pointer is pointing to.
85 * Although all of this may look like an overkill, it is found
86 * to be almost as fast as implementing g-quadruplex enumeration
87 * in each individual scenario, i.e. code duplication.
88 * Using this function, however, ensures that all g-quadruplex
89 * enumerations are absolutely identical.
93 process_gquad_enumeration(int *gg,
96 void (*f)(int, int, int *,
97 void *, void *, void *, void *),
104 * MFE callback for process_gquad_enumeration()
127 * Partition function callback for process_gquad_enumeration()
140 * Partition function callback for process_gquad_enumeration()
141 * in contrast to gquad_pf() it stores the stack size L and
142 * the linker lengths l[3] of the g-quadruplex that dominates
144 * (FLT_OR_DBL *)data must be 0. on entry
157 * MFE (alifold) callback for process_gquad_enumeration()
170 * MFE (alifold) callback for process_gquad_enumeration()
171 * with seperation of free energy and penalty contribution
175 gquad_mfe_ali_en( int i,
185 gquad_interact( int i,
193 /* other useful static functions */
197 gquad_ali_penalty(int i,
204 #########################################
205 # BEGIN OF PUBLIC FUNCTION DEFINITIONS #
206 # (all available in RNAlib) #
207 #########################################
210 /********************************
211 Here are the G-quadruplex energy
212 contribution functions
213 *********************************/
215 PUBLIC int E_gquad( int L,
222 if(l[i] > VRNA_GQUAD_MAX_LINKER_LENGTH) return c;
223 if(l[i] < VRNA_GQUAD_MIN_LINKER_LENGTH) return c;
225 if(L > VRNA_GQUAD_MAX_STACK_SIZE) return c;
226 if(L < VRNA_GQUAD_MIN_STACK_SIZE) return c;
236 PUBLIC FLT_OR_DBL exp_E_gquad(int L,
244 if(l[i] > VRNA_GQUAD_MAX_LINKER_LENGTH) return q;
245 if(l[i] < VRNA_GQUAD_MIN_LINKER_LENGTH) return q;
247 if(L > VRNA_GQUAD_MAX_STACK_SIZE) return q;
248 if(L < VRNA_GQUAD_MIN_STACK_SIZE) return q;
258 PUBLIC int E_gquad_ali( int i,
266 E_gquad_ali_en(i, L, l, S, n_seq, en, P);
267 return en[0] + en[1];
271 PUBLIC void E_gquad_ali_en( int i,
283 if(l[j] > VRNA_GQUAD_MAX_LINKER_LENGTH) return;
284 if(l[j] < VRNA_GQUAD_MIN_LINKER_LENGTH) return;
286 if(L > VRNA_GQUAD_MAX_STACK_SIZE) return;
287 if(L < VRNA_GQUAD_MIN_STACK_SIZE) return;
289 gquad_mfe_ali_en( i, L, l,
296 /********************************
297 Now, the triangular matrix
298 generators for the G-quadruplex
299 contributions are following
300 *********************************/
302 PUBLIC int *get_gquad_matrix(short *S, paramT *P){
304 int n, size, i, j, *gg, *my_index, *data;
307 my_index = get_indx(n);
308 gg = get_g_islands(S);
309 size = (n * (n+1))/2 + 2;
310 data = (int *)space(sizeof(int) * size);
312 /* prefill the upper triangular matrix with INF */
313 for(i = 0; i < size; i++) data[i] = INF;
315 FOR_EACH_GQUAD(i, j, 1, n){
316 process_gquad_enumeration(gg, i, j,
318 (void *)(&(data[my_index[j]+i])),
329 PUBLIC FLT_OR_DBL *get_gquad_pf_matrix( short *S,
333 int n, size, *gg, i, j, *my_index;
338 size = (n * (n+1))/2 + 2;
339 data = (FLT_OR_DBL *)space(sizeof(FLT_OR_DBL) * size);
340 gg = get_g_islands(S);
341 my_index = get_iindx(n);
343 FOR_EACH_GQUAD(i, j, 1, n){
344 process_gquad_enumeration(gg, i, j,
346 (void *)(&(data[my_index[i]-j])),
350 data[my_index[i]-j] *= scale[j-i+1];
358 PUBLIC int *get_gquad_ali_matrix( short *S_cons,
363 int n, size, *data, *gg;
368 size = (n * (n+1))/2 + 2;
369 data = (int *)space(sizeof(int) * size);
370 gg = get_g_islands(S_cons);
371 my_index = get_indx(n);
373 /* prefill the upper triangular matrix with INF */
374 for(i=0;i<size;i++) data[i] = INF;
376 FOR_EACH_GQUAD(i, j, 1, n){
377 process_gquad_enumeration(gg, i, j,
379 (void *)(&(data[my_index[j]+i])),
390 PUBLIC int **get_gquad_L_matrix(short *S,
397 int n, i, j, k, l, *gg;
400 gg = get_g_islands_sub(S, start, MIN2(n, start + maxdist + 4));
402 if(g){ /* we just update the gquadruplex contribution for the current
403 start and rotate the rest */
405 /* we re-use the memory allocated previously */
406 data[start] = data[start + maxdist + 5];
407 data[start + maxdist + 5] = NULL;
409 /* prefill with INF */
410 for(i = 0; i < maxdist + 5; i++)
411 data[start][i] = INF;
413 /* now we compute contributions for all gquads with 5' delimiter at
416 FOR_EACH_GQUAD_AT(start, j, start + maxdist + 4){
417 process_gquad_enumeration(gg, start, j,
419 (void *)(&(data[start][j-start])),
425 } else { /* create a new matrix from scratch since this is the first
426 call to this function */
428 /* allocate memory and prefill with INF */
429 data = (int **) space(sizeof(int *) * (n+1));
430 for(k = n; (k>n-maxdist-5) && (k>=0); k--){
431 data[k] = (int *) space(sizeof(int)*(maxdist+5));
432 for(i = 0; i < maxdist+5; i++) data[k][i] = INF;
435 /* compute all contributions for the gquads in this interval */
436 FOR_EACH_GQUAD(i, j, n - maxdist - 4, n){
437 process_gquad_enumeration(gg, i, j,
439 (void *)(&(data[i][j-i])),
451 PUBLIC plist *get_plist_gquad_from_db(const char *structure, float pr){
452 int x, size, actual_size, L, n, ge, ee, gb, l[3];
458 size = strlen(structure);
459 pl = (plist *)space(n*size*sizeof(plist));
461 while((ee = parse_gquad(structure + ge, &L, l)) > 0){
463 gb = ge - L*4 - l[0] - l[1] - l[2] + 1;
464 /* add pseudo-base pair encloding gquad */
465 for(x = 0; x < L; x++){
466 if (actual_size >= n * size - 5){
468 pl = (plist *)xrealloc(pl, n * size * sizeof(plist));
470 pl[actual_size].i = gb + x;
471 pl[actual_size].j = ge + x - L + 1;
472 pl[actual_size].p = pr;
473 pl[actual_size++].type = 0;
475 pl[actual_size].i = gb + x;
476 pl[actual_size].j = gb + x + l[0] + L;
477 pl[actual_size].p = pr;
478 pl[actual_size++].type = 0;
480 pl[actual_size].i = gb + x + l[0] + L;
481 pl[actual_size].j = ge + x - 2*L - l[2] + 1;
482 pl[actual_size].p = pr;
483 pl[actual_size++].type = 0;
485 pl[actual_size].i = ge + x - 2*L - l[2] + 1;
486 pl[actual_size].j = ge + x - L + 1;
487 pl[actual_size].p = pr;
488 pl[actual_size++].type = 0;
492 pl[actual_size].i = pl[actual_size].j = 0;
493 pl[actual_size++].p = 0;
494 pl = (plist *)xrealloc(pl, actual_size * sizeof(plist));
498 PUBLIC void get_gquad_pattern_mfe(short *S,
505 int *gg = get_g_islands_sub(S, i, j);
508 process_gquad_enumeration(gg, i, j,
519 PUBLIC void get_gquad_pattern_pf( short *S,
526 int *gg = get_g_islands_sub(S, i, j);
529 process_gquad_enumeration(gg, i, j,
540 PUBLIC plist *get_plist_gquad_from_pr(short *S,
549 return get_plist_gquad_from_pr_max(S, gi, gj, G, probs, scale, &L, l, pf);
553 PUBLIC plist *get_plist_gquad_from_pr_max(short *S,
563 int n, size, *gg, counter, i, j, *my_index;
564 FLT_OR_DBL pp, *tempprobs;
568 size = (n * (n + 1))/2 + 2;
569 tempprobs = (FLT_OR_DBL *)space(sizeof(FLT_OR_DBL) * size);
570 pl = (plist *)space((S[0]*S[0])*sizeof(plist));
571 gg = get_g_islands_sub(S, gi, gj);
573 my_index = get_iindx(n);
575 process_gquad_enumeration(gg, gi, gj,
583 process_gquad_enumeration(gg, gi, gj,
590 pp = probs[my_index[gi]-gj] * scale[gj-gi+1] / G[my_index[gi]-gj];
591 for (i=gi;i<gj; i++) {
592 for (j=i; j<=gj; j++) {
593 if (tempprobs[my_index[i]-j]>0.) {
596 pl[counter++].p = pp * tempprobs[my_index[i]-j];
600 pl[counter].i = pl[counter].j = 0;
601 pl[counter++].p = 0.;
602 /* shrink memory to actual size needed */
603 pl = (plist *) xrealloc(pl, counter * sizeof(plist));
605 gg += gi - 1; free(gg);
611 PUBLIC int parse_gquad(const char *struc, int *L, int l[3]) {
612 int i, il, start, end, len;
614 for (i=0; struc[i] && struc[i]!='+'; i++);
615 if (struc[i] == '+') { /* start of gquad */
616 for (il=0; il<=3; il++) {
617 start=i; /* pos of first '+' */
618 while (struc[++i] == '+'){
619 if((il) && (i-start == *L))
622 end=i; len=end-start;
625 nrerror("unequal stack lengths in gquad");
627 while (struc[++i] == '.'); /* linker */
630 nrerror("illegal character in gquad linker region");
634 /* printf("gquad at %d %d %d %d %d\n", end, *L, l[0], l[1], l[2]); */
641 #########################################
642 # BEGIN OF PRIVATE FUNCTION DEFINITIONS #
643 # (internal use only) #
644 #########################################
647 PRIVATE int gquad_ali_penalty(int i,
657 /* check for compatibility in the alignment */
658 for(s = 0; S[s]; s++){
659 unsigned int ld = 0; /* !=0 if layer destruction was detected */
662 /* check bottom layer */
663 if(S[s][i] != 3) ld |= 1U;
664 if(S[s][i + L + l[0]] != 3) ld |= 2U;
665 if(S[s][i + 2*L + l[0] + l[1]] != 3) ld |= 4U;
666 if(S[s][i + 3*L + l[0] + l[1] + l[2]] != 3) ld |= 8U;
667 /* add 1x penalty for missing bottom layer */
668 if(ld) pen += VRNA_GQUAD_MISMATCH_PENALTY;
670 /* check top layer */
672 if(S[s][i + L - 1] != 3) ld |= 1U;
673 if(S[s][i + 2*L + l[0] - 1] != 3) ld |= 2U;
674 if(S[s][i + 3*L + l[0] + l[1] - 1] != 3) ld |= 4U;
675 if(S[s][i + 4*L + l[0] + l[1] + l[2] - 1] != 3) ld |= 8U;
676 /* add 1x penalty for missing top layer */
677 if(ld) pen += VRNA_GQUAD_MISMATCH_PENALTY;
679 /* check inner layers */
680 for(cnt=1;cnt<L-1;cnt++){
681 if(S[s][i + cnt] != 3) ld |= 1U;
682 if(S[s][i + L + l[0] + cnt] != 3) ld |= 2U;
683 if(S[s][i + 2*L + l[0] + l[1] + cnt] != 3) ld |= 4U;
684 if(S[s][i + 3*L + l[0] + l[1] + l[2] + cnt] != 3) ld |= 8U;
685 /* add 2x penalty for missing inner layer */
686 if(ld) pen += 2*VRNA_GQUAD_MISMATCH_PENALTY;
689 /* if all layers are missing, we have a complete gg mismatch */
690 if(pen >= (2*VRNA_GQUAD_MISMATCH_PENALTY * (L-1)))
693 /* add the penalty to the score */
696 /* if gg_mismatch exceeds maximum allowed, this g-quadruplex is forbidden */
697 if(gg_mismatch > VRNA_GQUAD_MISMATCH_NUM_ALI) return INF;
702 PRIVATE void gquad_mfe( int i,
710 int cc = ((paramT *)P)->gquad[L][l[0] + l[1] + l[2]];
711 if(cc < *((int *)data))
715 PRIVATE void gquad_mfe_pos( int i,
723 int cc = ((paramT *)P)->gquad[L][l[0] + l[1] + l[2]];
724 if(cc < *((int *)data)){
727 *((int *)lmfe) = l[0];
728 *(((int *)lmfe) + 1) = l[1];
729 *(((int *)lmfe) + 2) = l[2];
733 PRIVATE void gquad_pf(int i,
741 *((FLT_OR_DBL *)data) += ((pf_paramT *)pf)->expgquad[L][l[0] + l[1] + l[2]];
744 PRIVATE void gquad_pf_pos(int i,
752 FLT_OR_DBL gq = ((pf_paramT *)pf)->expgquad[L][l[0] + l[1] + l[2]];
753 if(gq > *((FLT_OR_DBL *)data)){
754 *((FLT_OR_DBL *)data) = gq;
756 *((int *)lmax) = l[0];
757 *(((int *)lmax) + 1) = l[1];
758 *(((int *)lmax) + 2) = l[2];
762 PRIVATE void gquad_mfe_ali( int i,
772 gquad_mfe_ali_en(i, L, l, (void *)(&(en[0])), P, S, n_seq);
775 if(cc < *((int *)data)) *((int *)data) = cc;
779 PRIVATE void gquad_mfe_ali_en(int i,
788 en[0] = ((paramT *)P)->gquad[L][l[0] + l[1] + l[2]] * (*(int *)n_seq);
789 en[1] = gquad_ali_penalty(i, L, l, (const short **)S, (paramT *)P);
792 dd = ((int *)data)[0] + ((int *)data)[1];
794 ((int *)data)[0] = en[0];
795 ((int *)data)[1] = en[1];
800 PRIVATE void gquad_interact(int i,
812 pp = (FLT_OR_DBL *)data;
813 gq = exp_E_gquad(L, l, (pf_paramT *)pf);
815 for(x = 0; x < L; x++){
816 pp[idx[i + x] - (i + x + 3*L + l[0] + l[1] + l[2])] += gq;
817 pp[idx[i + x] - (i + x + L + l[0])] += gq;
818 pp[idx[i + x + L + l[0]] - (i + x + 2*L + l[0] + l[1])] += gq;
819 pp[idx[i + x + 2*L + l[0] + l[1]] - (i + x + 3*L + l[0] + l[1] + l[2])] += gq;
824 PRIVATE INLINE int *get_g_islands(short *S){
825 return get_g_islands_sub(S, 1, S[0]);
828 PRIVATE INLINE int *get_g_islands_sub(short *S, int i, int j){
831 gg = (int *)space(sizeof(int)*(j-i+2));
834 if(S[j]==3) gg[j] = 1;
835 for(x = j - 1; x >= i; x--)
843 * We could've also created a macro that loops over all G-quadruplexes
844 * delimited by i and j. However, for the fun of it we use this function
845 * that receives a pointer to a callback function which in turn does the
846 * actual computation for each quadruplex found.
850 process_gquad_enumeration(int *gg,
853 void (*f)(int, int, int *,
854 void *, void *, void *, void *),
860 int L, l[3], n, max_linker, maxl0, maxl1;
864 if((n >= VRNA_GQUAD_MIN_BOX_SIZE) && (n <= VRNA_GQUAD_MAX_BOX_SIZE))
865 for(L = MIN2(gg[i], VRNA_GQUAD_MAX_STACK_SIZE);
866 L >= VRNA_GQUAD_MIN_STACK_SIZE;
870 if( (max_linker >= 3*VRNA_GQUAD_MIN_LINKER_LENGTH)
871 && (max_linker <= 3*VRNA_GQUAD_MAX_LINKER_LENGTH)){
872 maxl0 = MIN2( VRNA_GQUAD_MAX_LINKER_LENGTH,
873 max_linker - 2*VRNA_GQUAD_MIN_LINKER_LENGTH
875 for(l[0] = VRNA_GQUAD_MIN_LINKER_LENGTH;
878 if(gg[i+L+l[0]] >= L){
879 maxl1 = MIN2( VRNA_GQUAD_MAX_LINKER_LENGTH,
880 max_linker - l[0] - VRNA_GQUAD_MIN_LINKER_LENGTH
882 for(l[1] = VRNA_GQUAD_MIN_LINKER_LENGTH;
885 if(gg[i + 2*L + l[0] + l[1]] >= L){
886 l[2] = max_linker - l[0] - l[1];
887 f(i, L, &(l[0]), data, P, aux1, aux2);