/* Last changed Time-stamp: <2009-02-24 15:17:17 ivo> */ /* minimum free energy folding for a set of aligned sequences c Ivo Hofacker Vienna RNA package */ /** *** \file alifold.c **/ #include #include #include #include #include #include #include #include "fold.h" #include "utils.h" #include "energy_par.h" #include "fold_vars.h" #include "pair_mat.h" #include "params.h" #include "ribo.h" #include "aln_util.h" #include "loop_energies.h" #include "gquad.h" #include "alifold.h" #ifdef _OPENMP #include #endif #define PAREN #define STACK_BULGE1 1 /* stacking energies for bulges of size 1 */ #define NEW_NINIO 1 /* new asymetry penalty */ #define MAXSECTORS 500 /* dimension for a backtrack array */ #define LOCALITY 0. /* locality parameter for base-pairs */ #define UNIT 100 #define MINPSCORE -2 * UNIT /* ################################# # GLOBAL VARIABLES # ################################# */ PUBLIC double cv_fact=1.; /* should be made static to not interfere with other threads */ PUBLIC double nc_fact=1.; /* should be made static to not interfere with other threads */ /* ################################# # PRIVATE VARIABLES # ################################# */ PRIVATE short **S = NULL; PRIVATE short **S5 = NULL; /*S5[s][i] holds next base 5' of i in sequence s*/ PRIVATE short **S3 = NULL; /*Sl[s][i] holds next base 3' of i in sequence s*/ PRIVATE char **Ss = NULL; PRIVATE unsigned short **a2s = NULL; PRIVATE paramT *P = NULL; PRIVATE int *indx = NULL; /* index for moving in the triangle matrices c[] and fMl[]*/ PRIVATE int *c = NULL; /* energy array, given that i-j pair */ PRIVATE int *cc = NULL; /* linear array for calculating canonical structures */ PRIVATE int *cc1 = NULL; /* " " */ PRIVATE int *f5 = NULL; /* energy of 5' end */ PRIVATE int *fML = NULL; /* multi-loop auxiliary energy array */ PRIVATE int *Fmi = NULL; /* holds row i of fML (avoids jumps in memory) */ PRIVATE int *DMLi = NULL; /* DMLi[j] holds MIN(fML[i,k]+fML[k+1,j]) */ PRIVATE int *DMLi1 = NULL; /* MIN(fML[i+1,k]+fML[k+1,j]) */ PRIVATE int *DMLi2 = NULL; /* MIN(fML[i+2,k]+fML[k+1,j]) */ PRIVATE int *pscore = NULL; /* precomputed array of pair types */ PRIVATE int init_length = -1; PRIVATE sect sector[MAXSECTORS]; /* stack of partial structures for backtracking */ PRIVATE bondT *base_pair2 = NULL; PRIVATE int circular = 0; PRIVATE int with_gquad = 0; PRIVATE int *ggg = NULL; /* minimum free energies of the gquadruplexes */ PRIVATE char *cons_seq = NULL; PRIVATE short *S_cons = NULL; #ifdef _OPENMP #pragma omp threadprivate(S, S5, S3, Ss, a2s, P, indx, c, cc, cc1, f5, fML, Fmi, DMLi, DMLi1, DMLi2,\ pscore, init_length, sector, base_pair2,\ ggg, with_gquad, cons_seq, S_cons) #endif /* ################################# # PRIVATE FUNCTION DECLARATIONS # ################################# */ PRIVATE void init_alifold(int length); PRIVATE void get_arrays(unsigned int size); PRIVATE void make_pscores(const short *const *S, const char **AS, int n_seq, const char *structure); PRIVATE int fill_arrays(const char **strings); PRIVATE void backtrack(const char **strings, int s); PRIVATE void energy_of_alistruct_pt(const char **sequences,short * ptable, int n_seq, int *energy); PRIVATE void stack_energy_pt(int i, const char **sequences, short *ptable, int n_seq, int *energy); PRIVATE int ML_Energy_pt(int i, int n_seq, short *pt); PRIVATE int EL_Energy_pt(int i, int n_seq, short *pt); PRIVATE void en_corr_of_loop_gquad(int i, int j, const char **sequences, const char *structure, short *pt, int *loop_idx, int n_seq, int en[2]); /* ################################# # BEGIN OF FUNCTION DEFINITIONS # ################################# */ /* unsafe function that will be replaced by a threadsafe companion in the future */ PRIVATE void init_alifold(int length){ #ifdef _OPENMP /* Explicitly turn off dynamic threads */ omp_set_dynamic(0); #endif if (length < 1) nrerror("initialize_fold: argument must be greater 0"); free_alifold_arrays(); get_arrays((unsigned) length); init_length = length; indx = get_indx((unsigned)length); update_alifold_params(); } PRIVATE void get_arrays(unsigned int size){ if(size >= (unsigned int)sqrt((double)INT_MAX)) nrerror("get_arrays@alifold.c: sequence length exceeds addressable range"); c = (int *) space(sizeof(int)*((size*(size+1))/2+2)); fML = (int *) space(sizeof(int)*((size*(size+1))/2+2)); pscore = (int *) space(sizeof(int)*((size*(size+1))/2+2)); f5 = (int *) space(sizeof(int)*(size+2)); cc = (int *) space(sizeof(int)*(size+2)); cc1 = (int *) space(sizeof(int)*(size+2)); Fmi = (int *) space(sizeof(int)*(size+1)); DMLi = (int *) space(sizeof(int)*(size+1)); DMLi1 = (int *) space(sizeof(int)*(size+1)); DMLi2 = (int *) space(sizeof(int)*(size+1)); if(base_pair2) free(base_pair2); base_pair2 = (bondT *) space(sizeof(bondT)*(1+size/2+200)); /* add a guess of how many G's may be involved in a G quadruplex */ } PUBLIC void free_alifold_arrays(void){ if(indx) free(indx); if(c) free(c); if(fML) free(fML); if(f5) free(f5); if(cc) free(cc); if(cc1) free(cc1); if(pscore) free(pscore); if(base_pair2) free(base_pair2); if(Fmi) free(Fmi); if(DMLi) free(DMLi); if(DMLi1) free(DMLi1); if(DMLi2) free(DMLi2); if(P) free(P); if(ggg) free(ggg); if(cons_seq) free(cons_seq); if(S_cons) free(S_cons); indx = c = fML = f5 = cc = cc1 = Fmi = DMLi = DMLi1 = DMLi2 = ggg = NULL; pscore = NULL; base_pair = NULL; base_pair2 = NULL; P = NULL; init_length = 0; cons_seq = NULL; S_cons = NULL; } PUBLIC void alloc_sequence_arrays(const char **sequences, short ***S, short ***S5, short ***S3, unsigned short ***a2s, char ***Ss, int circ){ unsigned int s, n_seq, length; if(sequences[0] != NULL){ length = strlen(sequences[0]); for (s=0; sequences[s] != NULL; s++); n_seq = s; *S = (short **) space((n_seq+1) * sizeof(short *)); *S5 = (short **) space((n_seq+1) * sizeof(short *)); *S3 = (short **) space((n_seq+1) * sizeof(short *)); *a2s = (unsigned short **) space((n_seq+1) * sizeof(unsigned short *)); *Ss = (char **) space((n_seq+1) * sizeof(char *)); for (s=0; sinit_length) init_alifold(length); #endif if (fabs(P->temperature - temperature)>1e-6) update_alifold_params(); for (s=0; strings[s]!=NULL; s++); n_seq = s; with_gquad = P->model_details.gquad; alloc_sequence_arrays(strings, &S, &S5, &S3, &a2s, &Ss, circular); make_pscores((const short **) S, strings, n_seq, structure); energy = fill_arrays((const char **)strings); backtrack((const char **)strings, 0); #ifdef PAREN parenthesis_structure(structure, base_pair2, length); #else letter_structure(structure, base_pair2, length); #endif /* * Backward compatibility: * This block may be removed if deprecated functions * relying on the global variable "base_pair" vanishs from within the package! */ base_pair = base_pair2; /* { if(base_pair) free(base_pair); base_pair = (bondT *)space(sizeof(bondT) * (1+length/2)); memcpy(base_pair, base_pair2, sizeof(bondT) * (1+length/2)); } */ free_sequence_arrays(n_seq, &S, &S5, &S3, &a2s, &Ss); if (backtrack_type=='C') return (float) c[indx[length]+1]/(n_seq*100.); else if (backtrack_type=='M') return (float) fML[indx[length]+1]/(n_seq*100.); else return (float) f5[length]/(n_seq*100.); } /** *** the actual forward recursion to fill the energy arrays **/ PRIVATE int fill_arrays(const char **strings) { int i, j, k, p, q, length, energy, new_c; int decomp, MLenergy, new_fML; int s, n_seq, *type, type_2, tt; /* count number of sequences */ for (n_seq=0; strings[n_seq]!=NULL; n_seq++); type = (int *) space(n_seq*sizeof(int)); length = strlen(strings[0]); /* init energies */ if(with_gquad){ cons_seq = consensus(strings); /* make g-island annotation of the consensus */ S_cons = encode_sequence(cons_seq, 0); ggg = get_gquad_ali_matrix(S_cons, S, n_seq, P); } for (j=1; j<=length; j++){ Fmi[j]=DMLi[j]=DMLi1[j]=DMLi2[j]=INF; for (i=(j>TURN?(j-TURN):1); i= 1; i--) { /* i,j in [1..length] */ for (j = i+TURN+1; j <= length; j++) { int ij, psc, l1, maxq, minq, up, c0; ij = indx[j]+i; for (s=0; s=MINPSCORE) { /* a pair to consider */ int stackEnergy = INF; /* hairpin ----------------------------------------------*/ for (new_c=s=0; sMLclosing; new_c = MIN2(new_c, MLenergy); if(with_gquad){ decomp = 0; for(s=0;smismatchI[tt][S3[s][i]][S5[s][j]]; if(tt > 2) decomp += P->TerminalAU; } for(p = i + 2; p < j - VRNA_GQUAD_MIN_BOX_SIZE; p++){ l1 = p - i - 1; if(l1>MAXLOOP) break; if(S_cons[p] != 3) continue; minq = j - i + p - MAXLOOP - 2; c0 = p + VRNA_GQUAD_MIN_BOX_SIZE - 1; minq = MAX2(c0, minq); c0 = j - 1; maxq = p + VRNA_GQUAD_MAX_BOX_SIZE + 1; maxq = MIN2(c0, maxq); for(q = minq; q < maxq; q++){ if(S_cons[q] != 3) continue; c0 = decomp + ggg[indx[q] + p] + n_seq * P->internal_loop[l1 + j - q - 1]; new_c = MIN2(new_c, c0); } } p = i + 1; if(S_cons[p] == 3){ if(p < j - VRNA_GQUAD_MIN_BOX_SIZE){ minq = j - i + p - MAXLOOP - 2; c0 = p + VRNA_GQUAD_MIN_BOX_SIZE - 1; minq = MAX2(c0, minq); c0 = j - 3; maxq = p + VRNA_GQUAD_MAX_BOX_SIZE + 1; maxq = MIN2(c0, maxq); for(q = minq; q < maxq; q++){ if(S_cons[q] != 3) continue; c0 = decomp + ggg[indx[q] + p] + n_seq * P->internal_loop[j - q - 1]; new_c = MIN2(new_c, c0); } } } q = j - 1; if(S_cons[q] == 3) for(p = i + 4; p < j - VRNA_GQUAD_MIN_BOX_SIZE; p++){ l1 = p - i - 1; if(l1>MAXLOOP) break; if(S_cons[p] != 3) continue; c0 = decomp + ggg[indx[q] + p] + n_seq * P->internal_loop[l1]; new_c = MIN2(new_c, c0); } } new_c = MIN2(new_c, cc1[j-1]+stackEnergy); cc[j] = new_c - psc; /* add covariance bonnus/penalty */ if (noLonelyPairs) c[ij] = cc1[j-1]+stackEnergy-psc; else c[ij] = cc[j]; } /* end >> if (pair) << */ else c[ij] = INF; /* done with c[i,j], now compute fML[i,j] */ /* free ends ? -----------------------------------------*/ new_fML = fML[ij+1]+n_seq*P->MLbase; new_fML = MIN2(fML[indx[j-1]+i]+n_seq*P->MLbase, new_fML); energy = c[ij]; if(dangles){ for (s=0; s 1; i--){ if(c[indx[j]+i] 1; i--){ if (c[indx[j]+i]0 then s items have been already pushed onto the sector stack */ int i, j, k, p, q, length, energy, up, c0, l1, minq, maxq; int type_2, tt, mm; int b=0, cov_en = 0; int n_seq; int *type; length = strlen(strings[0]); for (n_seq=0; strings[n_seq]!=NULL; n_seq++); type = (int *) space(n_seq*sizeof(int)); if (s==0) { sector[++s].i = 1; sector[s].j = length; sector[s].ml = (backtrack_type=='M') ? 1 : ((backtrack_type=='C')?2:0); } while (s>0) { int ss, ml, fij, fi, cij, traced, i1, j1, d3, d5, jj=0, gq=0; int canonical = 1; /* (i,j) closes a canonical structure */ i = sector[s].i; j = sector[s].j; ml = sector[s--].ml; /* ml is a flag indicating if backtracking is to occur in the fML- (1) or in the f-array (0) */ if (ml==2) { base_pair2[++b].i = i; base_pair2[b].j = j; cov_en += pscore[indx[j]+i]; goto repeat1; } if (j < i+TURN+1) continue; /* no more pairs in this interval */ fij = (ml)? fML[indx[j]+i] : f5[j]; fi = (ml)?(fML[indx[j-1]+i]+n_seq*P->MLbase):f5[j-1]; if (fij == fi) { /* 3' end is unpaired */ sector[++s].i = i; sector[s].j = j-1; sector[s].ml = ml; continue; } if (ml == 0) { /* backtrack in f5 */ switch(dangles){ case 0: /* j or j-1 is paired. Find pairing partner */ for (i=j-TURN-1,traced=0; i>=1; i--) { int cc, en; jj = i-1; if (c[indx[j]+i]=1; i--) { int cc, en; jj = i-1; if (c[indx[j]+i]1) ? S5[ss][i]: -1, (j < length) ? S3[ss][j] : -1, P); } if (fij == en) traced=j; } if(with_gquad){ if(fij == f5[i-1] + ggg[indx[j]+i]){ /* found the decomposition */ traced = j; jj = i - 1; gq = 1; break; } } if (traced) break; } break; } if (!traced) nrerror("backtrack failed in f5"); /* push back the remaining f5 portion */ sector[++s].i = 1; sector[s].j = jj; sector[s].ml = ml; /* trace back the base pair found */ j=traced; if(with_gquad && gq){ /* goto backtrace of gquadruplex */ goto repeat_gquad; } base_pair2[++b].i = i; base_pair2[b].j = j; cov_en += pscore[indx[j]+i]; goto repeat1; } else { /* trace back in fML array */ if (fML[indx[j]+i+1]+n_seq*P->MLbase == fij) { /* 5' end is unpaired */ sector[++s].i = i+1; sector[s].j = j; sector[s].ml = ml; continue; } if(with_gquad){ if(fij == ggg[indx[j]+i] + n_seq * E_MLstem(0, -1, -1, P)){ /* go to backtracing of quadruplex */ goto repeat_gquad; } } cij = c[indx[j]+i]; if(dangles){ for(ss = 0; ss < n_seq; ss++){ tt = pair[S[ss][i]][S[ss][j]]; if(tt==0) tt=7; cij += E_MLstem(tt, S5[ss][i], S3[ss][j], P); } } else{ for(ss = 0; ss < n_seq; ss++){ tt = pair[S[ss][i]][S[ss][j]]; if(tt==0) tt=7; cij += E_MLstem(tt, -1, -1, P); } } if (fij==cij){ /* found a pair */ base_pair2[++b].i = i; base_pair2[b].j = j; cov_en += pscore[indx[j]+i]; goto repeat1; } for (k = i+1+TURN; k <= j-2-TURN; k++) if (fij == (fML[indx[k]+i]+fML[indx[j]+k+1])) break; sector[++s].i = i; sector[s].j = k; sector[s].ml = ml; sector[++s].i = k+1; sector[s].j = j; sector[s].ml = ml; if (k>j-2-TURN) nrerror("backtrack failed in fML"); continue; } repeat1: /*----- begin of "repeat:" -----*/ if (canonical) cij = c[indx[j]+i]; for (ss=0; ssstack[type[ss]][type_2]; } cij += pscore[indx[j]+i]; base_pair2[++b].i = i+1; base_pair2[b].j = j-1; cov_en += pscore[indx[j-1]+i+1]; i++; j--; canonical=0; goto repeat1; } canonical = 1; cij += pscore[indx[j]+i]; {int cc=0; for (ss=0; ss= minq; q--) { if (c[indx[q]+p]>=INF) continue; for (ss=energy=0; ssmismatchI[tt][S3[s][i]][S5[s][j]]; if(tt > 2) mm += P->TerminalAU; } for(p = i + 2; p < j - VRNA_GQUAD_MIN_BOX_SIZE; p++){ if(S_cons[p] != 3) continue; l1 = p - i - 1; if(l1>MAXLOOP) break; minq = j - i + p - MAXLOOP - 2; c0 = p + VRNA_GQUAD_MIN_BOX_SIZE - 1; minq = MAX2(c0, minq); c0 = j - 1; maxq = p + VRNA_GQUAD_MAX_BOX_SIZE + 1; maxq = MIN2(c0, maxq); for(q = minq; q < maxq; q++){ if(S_cons[q] != 3) continue; c0 = mm + ggg[indx[q] + p] + n_seq * P->internal_loop[l1 + j - q - 1]; if(cij == c0){ i=p;j=q; goto repeat_gquad; } } } p = i1; if(S_cons[p] == 3){ if(p < j - VRNA_GQUAD_MIN_BOX_SIZE){ minq = j - i + p - MAXLOOP - 2; c0 = p + VRNA_GQUAD_MIN_BOX_SIZE - 1; minq = MAX2(c0, minq); c0 = j - 3; maxq = p + VRNA_GQUAD_MAX_BOX_SIZE + 1; maxq = MIN2(c0, maxq); for(q = minq; q < maxq; q++){ if(S_cons[q] != 3) continue; if(cij == mm + ggg[indx[q] + p] + n_seq * P->internal_loop[j - q - 1]){ i = p; j=q; goto repeat_gquad; } } } } q = j1; if(S_cons[q] == 3) for(p = i1 + 3; p < j - VRNA_GQUAD_MIN_BOX_SIZE; p++){ l1 = p - i - 1; if(l1>MAXLOOP) break; if(S_cons[p] != 3) continue; if(cij == mm + ggg[indx[q] + p] + n_seq * P->internal_loop[l1]){ i = p; j = q; goto repeat_gquad; } } } mm = n_seq*P->MLclosing; if(dangles){ for(ss = 0; ss < n_seq; ss++){ tt = rtype[type[ss]]; mm += E_MLstem(tt, S5[ss][j], S3[ss][i], P); } } else{ for(ss = 0; ss < n_seq; ss++){ tt = rtype[type[ss]]; mm += E_MLstem(tt, -1, -1, P); } } sector[s+1].ml = sector[s+2].ml = 1; for (k = i1+TURN+1; k < j1-TURN-1; k++){ if(cij == fML[indx[k]+i1] + fML[indx[j1]+k+1] + mm) break; } if (k<=j-3-TURN) { /* found the decomposition */ sector[++s].i = i1; sector[s].j = k; sector[++s].i = k+1; sector[s].j = j1; } else { nrerror("backtracking failed in repeat"); } continue; /* this is a workarround to not accidentally proceed in the following block */ repeat_gquad: /* now we do some fancy stuff to backtrace the stacksize and linker lengths of the g-quadruplex that should reside within position i,j */ { int cnt1, cnt2, cnt3, cnt4, l[3], L, size; size = j-i+1; for(L=0; L < VRNA_GQUAD_MIN_STACK_SIZE;L++){ if(S_cons[i+L] != 3) break; if(S_cons[j-L] != 3) break; } if(L == VRNA_GQUAD_MIN_STACK_SIZE){ /* continue only if minimum stack size starting from i is possible */ for(; L<=VRNA_GQUAD_MAX_STACK_SIZE;L++){ if(S_cons[i+L-1] != 3) break; /* break if no more consecutive G's 5' */ if(S_cons[j-L+1] != 3) break; /* break if no more consecutive G'1 3' */ for( l[0] = VRNA_GQUAD_MIN_LINKER_LENGTH; (l[0] <= VRNA_GQUAD_MAX_LINKER_LENGTH) && (size - 4*L - 2*VRNA_GQUAD_MIN_LINKER_LENGTH - l[0] >= 0); l[0]++){ /* check whether we find the second stretch of consecutive G's */ for(cnt1 = 0; (cnt1 < L) && (S_cons[i+L+l[0]+cnt1] == 3); cnt1++); if(cnt1 < L) continue; for( l[1] = VRNA_GQUAD_MIN_LINKER_LENGTH; (l[1] <= VRNA_GQUAD_MAX_LINKER_LENGTH) && (size - 4*L - VRNA_GQUAD_MIN_LINKER_LENGTH - l[0] - l[1] >= 0); l[1]++){ /* check whether we find the third stretch of consectutive G's */ for(cnt1 = 0; (cnt1 < L) && (S_cons[i+2*L+l[0]+l[1]+cnt1] == 3); cnt1++); if(cnt1 < L) continue; /* the length of the third linker now depends on position j as well as the other linker lengths... so we do not have to loop too much */ l[2] = size - 4*L - l[0] - l[1]; if(l[2] < VRNA_GQUAD_MIN_LINKER_LENGTH) break; if(l[2] > VRNA_GQUAD_MAX_LINKER_LENGTH) continue; /* check for contribution */ if(ggg[indx[j]+i] == E_gquad_ali(i, L, l, (const short **)S, n_seq, P)){ int a; /* fill the G's of the quadruplex into base_pair2 */ for(a=0;a0; i--){ char c5; c5 = sequence[i-1]; if ((c5=='-')||(c5=='_')||(c5=='~')||(c5=='.')) continue; s5[1] = S[i]; break; } for (i=1; i<=l; i++) { char c3; c3 = sequence[i-1]; if ((c3=='-')||(c3=='_')||(c3=='~')||(c3=='.')) continue; s3[l] = S[i]; break; } } else s5[1]=s3[l]=0; for(i=1,p=0; i<=l; i++){ char c5; c5 = sequence[i-1]; if ((c5=='-')||(c5=='_')||(c5=='~')||(c5=='.')) s5[i+1]=s5[i]; else { /* no gap */ ss[p++]=sequence[i-1]; /*start at 0!!*/ s5[i+1]=S[i]; } as[i]=p; } for (i=l; i>=1; i--) { char c3; c3 = sequence[i-1]; if ((c3=='-')||(c3=='_')||(c3=='~')||(c3=='.')) s3[i-1]=s3[i]; else s3[i-1]=S[i]; } } } PRIVATE void make_pscores(const short *const* S, const char **AS, int n_seq, const char *structure) { /* calculate co-variance bonus for each pair depending on */ /* compensatory/consistent mutations and incompatible seqs */ /* should be 0 for conserved pairs, >0 for good pairs */ #define NONE -10000 /* score for forbidden pairs */ int n,i,j,k,l,s; int olddm[7][7]={{0,0,0,0,0,0,0}, /* hamming distance between pairs */ {0,0,2,2,1,2,2} /* CG */, {0,2,0,1,2,2,2} /* GC */, {0,2,1,0,2,1,2} /* GU */, {0,1,2,2,0,2,1} /* UG */, {0,2,2,1,2,0,2} /* AU */, {0,2,2,2,1,2,0} /* UA */}; float **dm; n=S[0][0]; /* length of seqs */ if (ribo) { if (RibosumFile !=NULL) dm=readribosum(RibosumFile); else dm=get_ribosum(AS,n_seq,n); } else { /*use usual matrix*/ dm=(float **)space(7*sizeof(float*)); for (i=0; i<7;i++) { dm[i]=(float *)space(7*sizeof(float)); for (j=0; j<7; j++) dm[i][j] = (float) olddm[i][j]; } } for (i=1; in_seq) { pscore[indx[j]+i] = NONE; continue;} for (k=1,score=0; k<=6; k++) /* ignore pairtype 7 (gap-gap) */ for (l=k; l<=6; l++) score += pfreq[k]*pfreq[l]*dm[k][l]; /* counter examples score -1, gap-gap scores -0.25 */ pscore[indx[j]+i] = cv_fact * ((UNIT*score)/n_seq - nc_fact*UNIT*(pfreq[0] + pfreq[7]*0.25)); } } if (noLonelyPairs) /* remove unwanted pairs */ for (k=1; k=1)&&(j<=n)) { if ((i>1)&&(j0) ? psij : 0; /* fallthrough */ case '>': /* pairs downstream */ for (l=j+TURN+1; l<=n; l++) pscore[indx[l]+j] = NONE; break; } } if (hx!=0) { fprintf(stderr, "%s\n", structure); nrerror("unbalanced brackets in constraint string"); } free(stack); free(stack2); } /*free dm */ for (i=0; i<7;i++) { free(dm[i]); } free(dm); } /*--------New scoring part-----------------------------------*/ PUBLIC int get_mpi(char *Alseq[], int n_seq, int length, int *mini) { int i, j,k; float ident=0; int pairnum=0; int sumident=0; float minimum=1.; for(j=0; j0) return (int) (sumident*100/pairnum); else return 0; } PUBLIC float **readribosum(char *name){ float **dm; char *line; FILE *fp; int i=0; int who=0; float a,b,c,d,e,f; int translator[7]={0,5,1,2,3,6,4}; fp=fopen(name,"r"); dm=(float **)space(7*sizeof(float*)); for (i=0; i<7;i++) { dm[i]=(float *)space(7*sizeof(float)); } while(1) { /*bisma hoit fertisch san*/ line=get_line(fp); if (*line=='#') continue; i=0; i=sscanf(line,"%f %f %f %f %f %f",&a,&b,&c,&d,&e,&f); if (i==0) break; dm[translator[++who]][translator[1]]=a; dm[translator[who]][translator[2]]=b; dm[translator[who]][translator[3]]=c; dm[translator[who]][translator[4]]=d; dm[translator[who]][translator[5]]=e; dm[translator[who]][translator[6]]=f; free(line); if (who==6) break; } fclose(fp); return dm; } PRIVATE void en_corr_of_loop_gquad(int i, int j, const char **sequences, const char *structure, short *pt, int *loop_idx, int n_seq, int en[2]){ int pos, energy, en_covar, p, q, r, s, u, type, type2, gq_en[2]; int L, l[3]; energy = en_covar = 0; q = i; while((pos = parse_gquad(structure + q-1, &L, l)) > 0){ q += pos-1; p = q - 4*L - l[0] - l[1] - l[2] + 1; if(q > j) break; /* we've found the first g-quadruplex at position [p,q] */ E_gquad_ali_en(p, L, l, (const short **)S, n_seq, gq_en, P); energy += gq_en[0]; en_covar += gq_en[1]; /* check if it's enclosed in a base pair */ if(loop_idx[p] == 0){ q++; continue; /* g-quad in exterior loop */} else{ energy += E_MLstem(0, -1, -1, P) * n_seq; /* find its enclosing pair */ int num_elem, num_g, elem_i, elem_j, up_mis; num_elem = 0; num_g = 1; r = p - 1; up_mis = q - p + 1; /* seek for first pairing base located 5' of the g-quad */ for(r = p - 1; !pt[r] && (r >= i); r--); if(r < i) nrerror("this should not happen"); if(r < pt[r]){ /* found the enclosing pair */ s = pt[r]; } else { num_elem++; elem_i = pt[r]; elem_j = r; r = pt[r]-1 ; /* seek for next pairing base 5' of r */ for(; !pt[r] && (r >= i); r--); if(r < i) nrerror("so nich"); if(r < pt[r]){ /* found the enclosing pair */ s = pt[r]; } else { /* hop over stems and unpaired nucleotides */ while((r > pt[r]) && (r >= i)){ if(pt[r]){ r = pt[r]; num_elem++;} r--; } if(r < i) nrerror("so nich"); s = pt[r]; /* found the enclosing pair */ } } /* now we have the enclosing pair (r,s) */ u = q+1; /* we know everything about the 5' part of this loop so check the 3' part */ while(u 0){ E_gquad_ali_en(u, L, l, (const short **)S, n_seq, gq_en, P); energy += gq_en[0] + E_MLstem(0, -1, -1, P) * n_seq; en_covar += gq_en[1]; up_mis += pos; u += pos; num_g++; } } else { /* we must have found a stem */ if(!(u < pt[u])) nrerror("wtf!"); num_elem++; elem_i = u; elem_j = pt[u]; en_corr_of_loop_gquad(u, pt[u], sequences, structure, pt, loop_idx, n_seq, gq_en); energy += gq_en[0]; en_covar += gq_en[1]; u = pt[u] + 1; } } if(u!=s) nrerror("what the ..."); else{ /* we are done since we've found no other 3' structure element */ switch(num_elem){ /* g-quad was misinterpreted as hairpin closed by (r,s) */ case 0: /*if(num_g == 1) if((p-r-1 == 0) || (s-q-1 == 0)) nrerror("too few unpaired bases"); */ { int ee = 0; int cnt; for(cnt=0;cntmismatchI[type][S3[cnt][r]][S5[cnt][s]]; if(type > 2) ee += P->TerminalAU; } energy += ee; } energy += n_seq * P->internal_loop[s-r-1-up_mis]; break; /* g-quad was misinterpreted as interior loop closed by (r,s) with enclosed pair (elem_i, elem_j) */ case 1: { int ee = 0; int cnt; for(cnt = 0; cntMLclosing + (elem_i-r-1+s-elem_j-1-up_mis) * P->MLbase) * n_seq; break; /* gquad was misinterpreted as unpaired nucleotides in a multiloop */ default: energy -= (up_mis) * P->MLbase * n_seq; break; } } q = s+1; } } en[0] = energy; en[1] = en_covar; } PUBLIC float energy_of_ali_gquad_structure( const char **sequences, const char *structure, int n_seq, float *energy){ int new=0; unsigned int s, n; short *pt; short **tempS; short **tempS5; /*S5[s][i] holds next base 5' of i in sequence s*/ short **tempS3; /*Sl[s][i] holds next base 3' of i in sequence s*/ char **tempSs; unsigned short **tempa2s; int *tempindx, *loop_idx; int *temppscore; int en_struct[2], gge[2]; if(sequences[0] != NULL){ n = (unsigned int) strlen(sequences[0]); update_alifold_params(); /*save old memory*/ tempS = S; tempS3 = S3; tempS5 = S5; tempSs = Ss; tempa2s = a2s; tempindx = indx; temppscore = pscore; alloc_sequence_arrays(sequences, &S, &S5, &S3, &a2s, &Ss, 0); pscore = (int *) space(sizeof(int)*((n+1)*(n+2)/2)); indx = get_indx(n); make_pscores((const short *const*)S, sequences, n_seq, NULL); new = 1; pt = make_pair_table(structure); energy_of_alistruct_pt(sequences,pt, n_seq, &(en_struct[0])); loop_idx = make_loop_index_pt(pt); en_corr_of_loop_gquad(1, n, sequences, structure, pt, loop_idx, n_seq, gge); en_struct[0] += gge[0]; en_struct[1] += gge[1]; free(loop_idx); free(pt); energy[0] = (float)en_struct[0]/(float)(100*n_seq); energy[1] = (float)en_struct[1]/(float)(100*n_seq); free(pscore); free(indx); free_sequence_arrays(n_seq, &S, &S5, &S3, &a2s, &Ss); /* restore old memory */ S = tempS; S3 = tempS3; S5 = tempS5; Ss = tempSs; a2s = tempa2s; indx = tempindx; pscore = temppscore; } else nrerror("energy_of_alistruct(): no sequences in alignment!"); return energy[0]; } PUBLIC float energy_of_alistruct(const char **sequences, const char *structure, int n_seq, float *energy){ int new=0; unsigned int s, n; short *pt; short **tempS; short **tempS5; /*S5[s][i] holds next base 5' of i in sequence s*/ short **tempS3; /*Sl[s][i] holds next base 3' of i in sequence s*/ char **tempSs; unsigned short **tempa2s; int *tempindx; int *temppscore; int en_struct[2]; if(sequences[0] != NULL){ n = (unsigned int) strlen(sequences[0]); update_alifold_params(); /*save old memory*/ tempS = S; tempS3 = S3; tempS5 = S5; tempSs = Ss; tempa2s = a2s; tempindx = indx; temppscore = pscore; alloc_sequence_arrays(sequences, &S, &S5, &S3, &a2s, &Ss, 0); pscore = (int *) space(sizeof(int)*((n+1)*(n+2)/2)); indx = get_indx(n); make_pscores((const short *const*)S, sequences, n_seq, NULL); new = 1; pt = make_pair_table(structure); energy_of_alistruct_pt(sequences,pt, n_seq, &(en_struct[0])); free(pt); energy[0] = (float)en_struct[0]/(float)(100*n_seq); energy[1] = (float)en_struct[1]/(float)(100*n_seq); free(pscore); free(indx); free_sequence_arrays(n_seq, &S, &S5, &S3, &a2s, &Ss); /* restore old memory */ S = tempS; S3 = tempS3; S5 = tempS5; Ss = tempSs; a2s = tempa2s; indx = tempindx; pscore = temppscore; } else nrerror("energy_of_alistruct(): no sequences in alignment!"); return energy[0]; } PRIVATE void energy_of_alistruct_pt(const char **sequences, short *pt, int n_seq, int *energy){ int i, length; length = S[0][0]; energy[0] = backtrack_type=='M' ? ML_Energy_pt(0, n_seq, pt) : EL_Energy_pt(0, n_seq, pt); energy[1] = 0; for (i=1; i<=length; i++) { if (pt[i]==0) continue; stack_energy_pt(i, sequences, pt, n_seq, energy); i=pt[i]; } } PRIVATE void stack_energy_pt(int i, const char **sequences, short *pt, int n_seq, int *energy) { /* calculate energy of substructure enclosed by (i,j) */ int ee= 0; int j, p, q, s; int *type = (int *) space(n_seq*sizeof(int)); j = pt[i]; for (s=0; sq)) break; ee=0; for (s=0; sq) { ee=0;/* hair pin */ for (s=0; s< n_seq; s++) { if ((a2s[s][j-1]-a2s[s][i])<3) ee+=600; else ee += E_Hairpin(a2s[s][j-1]-a2s[s][i], type[s], S3[s][i], S5[s][j], Ss[s]+(a2s[s][i-1]), P); } energy[0] += ee; energy[1] += pscore[indx[j]+i]; free(type); return; } /* (i,j) is exterior pair of multiloop */ energy[1] += pscore[indx[j]+i]; while (p= j) break; /* get position of pairing partner */ q = pt[p]; /* memorize number of unpaired positions? no, we just approximate here */ u += p-i1-1; for (s=0; s< n_seq; s++) { /* get type of base pair P->q */ tt = pair[S[s][p]][S[s][q]]; if (tt==0) tt=7; d5 = dangles && (a2s[s][p]>1) && (tt!=0) ? S5[s][p] : -1; d3 = dangles && (a2s[s][q] 0){ energy += P->MLclosing * n_seq; if(dangles){ for (s=0; sMLbase * n_seq; return energy; } PRIVATE int EL_Energy_pt(int i, int n_seq, short *pt){ int energy, tt, i1, j, p, q, s; short d5, d3; j = pt[0]; p = i+1; energy = 0; do{ /* walk along the backbone */ /* hop over unpaired positions */ while (p < j && pt[p]==0) p++; if(p == j) break; /* no more stems */ /* get position of pairing partner */ q = pt[p]; for (s=0; s< n_seq; s++) { /* get type of base pair P->q */ tt = pair[S[s][p]][S[s][q]]; if (tt==0) tt=7; d5 = dangles && (a2s[s][p]>1) && (tt!=0) ? S5[s][p] : -1; d3 = dangles && (a2s[s][q]