2 Fast, but crude, pairwise structural Alignments of RNA sequences
4 Possible structures of each RNA are encoded in a linear
5 "probability profile", by computing for each base the probability
6 of being unpaired, or paired upstream or downstream. These profiles
7 can be aligned using standard string alignment.
9 The is an extension of the old method in ProfileDist.c with the
11 - use sequence as well as structure profile for scoring
12 - use similarity alignment instead of distance (maybe add local alinment)
13 - use affine gap costs
15 C Ivo L Hofacker, Vienna RNA Package
17 /* Last changed Time-stamp: <2004-08-02 10:11:13 ivo> */
25 #include "dist_vars.h"
26 #include "fold_vars.h"
27 #include "part_func.h"
29 #include "profiledist.h"
30 #include "ProfileAln.h"
34 static char rcsid[] = "$Id: ProfileAln.c,v 1.5 2006/01/18 13:00:30 ivo Exp $";
36 #define EQUAL(x,y) (fabs((x)-(y)) <= fabs(x)*2*FLT_EPSILON)
38 PRIVATE int *alignment[2];
40 PRIVATE void sprint_aligned_bppm(const float *T1, const char *seq1,
41 const float *T2, const char *seq2);
42 PRIVATE double PrfEditScore(const float *p1, const float *p2,
44 PRIVATE double average(double x, double y);
46 PRIVATE double open=-1.5, ext=-0.666; /* defaults from clustalw */
47 PRIVATE double seqw=0.5;
48 PRIVATE int free_ends=1; /* whether to use free end gaps */
50 /*---------------------------------------------------------------------------*/
52 PRIVATE float **newmat(int l1, int l2) {
55 a = (float **) space((l1+1)*sizeof(float *));
56 for (i=0; i<=l1; i++) a[i] = (float *) space((l2+1)*sizeof(float));
60 PUBLIC float profile_aln(const float *T1, const char *seq1,
61 const float *T2, const char *seq2)
63 /* align the 2 probability profiles T1, T2 */
64 /* This is like a Needleman-Wunsch alignment, with affine gap-costs
65 ala Gotoh. The score looks at both seq and pair profile */
67 float **S, **E, **F, tot_score;
68 int i, j, length1, length2;
70 length1 = strlen(seq1);
71 length2 = strlen(seq2);
72 S = newmat(length1, length2);
73 E = newmat(length1, length2);
74 F = newmat(length1, length2);
76 E[0][0] = F[0][0] = open - ext;
78 for (i=1; i<=length1; i++) F[i][0] = -9999; /* impossible */
79 for (j=1; j<=length2; j++) E[0][j] = -9999; /* impossible */
81 for (i=1; i<=length1; i++) S[i][0] = E[i][0] = E[i-1][0] +ext;
82 for (j=1; j<=length2; j++) S[0][j] = F[0][j] = F[0][j-1] +ext;
85 for (i=1; i<=length1; i++) {
86 for (j=1; j<=length2; j++) {
88 E[i][j] = MAX2(E[i-1][j]+ext, S[i-1][j]+open);
89 F[i][j] = MAX2(F[i][j-1]+ext, S[i][j-1]+open);
90 M = S[i-1][j-1] + PrfEditScore(T1+3*i,T2+3*j, seq1[i-1], seq2[j-1]);
91 S[i][j] = MAX3(M, E[i][j], F[i][j]);
99 alignment[0] = (int *) space((length1+length2+1)*sizeof(int));
100 alignment[1] = (int *) space((length1+length2+1)*sizeof(int));
102 pos = length1+length2;
106 tot_score = S[length1][length2];
109 /* find starting point for backtracking,
110 search for highest entry in last row or column */
112 for (i=1; i<=length1; i++) {
113 if (S[i][length2]>score) {
118 for (j=1; j<=length2; j++) {
119 if (S[length1][j]>score) {
125 for (j=length2; j> -imax; j--) {
126 alignment[0][pos] = 0;
127 alignment[1][pos--] = j;
131 for (i=length1; i>imax; i--) {
132 alignment[0][pos] = i;
133 alignment[1][pos--] = 0;
144 alignment[0][pos] = i;
145 alignment[1][pos--] = 0;
146 if (EQUAL(score, S[i-1][j] + open)) state = 'S';
151 alignment[0][pos] = 0;
152 alignment[1][pos--] = j;
153 if (EQUAL(score, S[i][j-1] + open)) state = 'S';
158 if (EQUAL(score, E[i][j])) state = 'E';
159 else if (EQUAL(score, F[i][j])) state = 'F';
160 else if (EQUAL(score, S[i-1][j-1] +
161 PrfEditScore(T1+3*i,T2+3*j, seq1[i-1], seq2[j-1]))) {
162 alignment[0][pos] = i;
163 alignment[1][pos--] = j;
166 else nrerror("backtrack of alignment failed");
172 alignment[0][pos] = 0;
173 alignment[1][pos--] = j;
176 alignment[0][pos] = i;
177 alignment[1][pos--] = 0;
180 for(i=pos+1; i<=length1+length2; i++){
181 alignment[0][i-pos] = alignment[0][i];
182 alignment[1][i-pos] = alignment[1][i];
184 alignment[0][0] = length1+length2-pos; /* length of alignment */
186 sprint_aligned_bppm(T1,seq1, T2,seq2);
190 for (i=0; i<=length1; i++) {
191 free(S[i]); free(E[i]); free(F[i]);
193 free(S); free(E); free(F);
199 /*---------------------------------------------------------------------------*/
200 PRIVATE inline double average(double x, double y) {
202 As in Bonhoeffer et al (1993) 'RNA Multi Structure Landscapes',
203 Eur. Biophys. J. 22: 13-24 we have chosen the geometric mean.
205 return (float) sqrt(x*y);
208 PRIVATE double PrfEditScore(const float *p1, const float *p2, char c1, char c2)
213 for(score=0.,k=0; k<3; k++)
214 score += average(p1[k],p2[k]);
217 if (c1==c2) score += seqw;
218 else if (((c1=='A') && (c2=='G')) ||
219 ((c1=='G') && (c2=='A')) ||
220 ((c1=='C') && (c2=='U')) ||
221 ((c1=='U') && (c2=='C')))
223 else score -= 0.9*seqw;
227 /*---------------------------------------------------------------------------*/
229 PRIVATE void sprint_aligned_bppm(const float *T1, const char *seq1,
230 const float *T2, const char *seq2) {
232 length = alignment[0][0];
233 for (i=0; i<4; i++) {
234 if (aligned_line[i] != NULL) free(aligned_line[i]);
235 aligned_line[i] = (char *) space((length+1)*sizeof(char));
237 for(i=1; i<=length; i++){
238 if (alignment[0][i]==0)
239 aligned_line[0][i-1] = aligned_line[2][i-1] = '_';
241 aligned_line[0][i-1] = bppm_symbol(T1+alignment[0][i]*3);
242 aligned_line[2][i-1] = seq1[alignment[0][i]-1];
244 if (alignment[1][i]==0)
245 aligned_line[1][i-1] = aligned_line[3][i-1] = '_';
247 aligned_line[1][i-1] = bppm_symbol(T2+alignment[1][i]*3);
248 aligned_line[3][i-1] = seq2[alignment[1][i]-1];
253 PUBLIC int set_paln_params(double gap_open, double gap_ext,
254 double seq_weight, int freeends) {
255 open = (gap_open>0) ? -gap_open : gap_open;
256 ext = (gap_ext>0) ? -gap_ext : gap_ext;
257 if (open > ext) fprintf(stderr, "Gap extension penalty is smaller than "
258 "gap open. Do you realy want this?\n");
262 fprintf(stderr, "Sequence weight set to 0 (must be in [0..1])\n");
266 fprintf(stderr, "Sequence weight set to 1 (must be in [0..1])\n");
268 free_ends = (freeends) ? 1 : 0;
272 /*---------------------------------------------------------------------------*/