2 String alignment for RNA secondary structures
3 Peter F Stadler, Ivo Hofacker
6 /* Last changed Time-stamp: <2005-07-23 10:19:40 ivo> */
13 #include "edit_cost.h"
14 #include "dist_vars.h"
17 static char rcsid[] = "$Id: stringdist.c,v 1.3 2005/07/24 08:37:15 ivo Exp $";
20 #define PRIVATE static
22 #define INFINITY_DIST 10000
24 PUBLIC float string_edit_distance(swString *T1, swString *T2);
25 PUBLIC swString *Make_swString(char *string);
26 PUBLIC void print_swString(swString *x);
27 PUBLIC void print_alignment_list(void);
29 PRIVATE void sprint_aligned_swStrings(swString *T1, swString *T2);
30 PRIVATE float StrEditCost(int i, int j, swString *T1, swString *T2);
31 PRIVATE void DeCode(char *string, int k, int *tp, float *w);
32 PRIVATE int decode(char *id);
33 PRIVATE void encode(int type, char label[]);
34 PRIVATE int *alignment[2]; /* contains information from backtracking
35 alignment[0][n] is the node in tree2
36 matching node n in tree1 */
39 /*---------------------------------------------------------------------------*/
41 PUBLIC float string_edit_distance(swString *T1, swString *T2)
45 short **i_point, **j_point;
47 int i, j, i1, j1, pos, length1,length2;
48 float minus, plus, change, temp;
50 if (cost_matrix==0) EditCost = &UsualCost;
51 else EditCost = &ShapiroCost;
56 distance = (float **) space((length1 +1)*sizeof(float *));
58 i_point = (short **) space((length1 +1)*sizeof(short *));
59 j_point = (short **) space((length1 +1)*sizeof(short *));
61 for(i=0; i<= length1; i++){
62 distance[i] = (float *) space( (length2+1)*sizeof(float));
64 i_point[i] = (short *) space( (length2+1)*sizeof(short));
65 j_point[i] = (short *) space( (length2+1)*sizeof(short));
69 for(i = 1; i <= length1; i++) {
74 distance[i][0] = distance[i-1][0]+StrEditCost(i,0,T1,T2);
76 for(j = 1; j <= length2; j++) {
81 distance[0][j] = distance[0][j-1]+StrEditCost(0,j,T1,T2);
84 for (i = 1; i <= length1; i++) {
85 for (j = 1; j <= length2 ; j++) {
86 minus = distance[i-1][j] + StrEditCost(i,0,T1,T2);
87 plus = distance[i][j-1] + StrEditCost(0,j,T1,T2);
88 change = distance[i-1][j-1]+ StrEditCost(i,j,T1,T2);
90 distance[i][j] = MIN3(minus, plus, change);
91 /* printf("%g ", distance[i][j]); */
94 if(distance[i][j] == change) {
95 i_point[i][j]=i-1; j_point[i][j]=j-1; }
96 else if(distance[i][j] == plus) {
97 i_point[i][j]=i ; j_point[i][j]=j-1; }
99 i_point[i][j]=i-1; j_point[i][j]=j ; }
105 temp = distance[length1][length2];
106 for(i=0;i<=length1;i++)
111 if(alignment[0]!= NULL) free(alignment[0]);
112 if(alignment[1]!= NULL) free(alignment[1]);
113 alignment[0] = (int *) space((length1+length2+1)*sizeof(int));
114 alignment[1] = (int *) space((length1+length2+1)*sizeof(int));
116 pos = length1+length2;
119 while( (i>0)||(j>0) ) {
122 if( ((i-i1)==1)&&((j-j1)==1) ) { /* substitution */
123 alignment[0][pos] = i;
124 alignment[1][pos] = j;
126 if( ((i-i1)==1)&&(j==j1) ) { /* Deletion in [1] */
127 alignment[0][pos] = i;
128 alignment[1][pos] = 0;
130 if( (i==i1)&&((j-j1)==1) ) { /* Deletion in [0] */
131 alignment[0][pos] = 0;
132 alignment[1][pos] = j;
138 for(i=pos+1; i<=length1+length2; i++){
139 alignment[0][i-pos] = alignment[0][i];
140 alignment[1][i-pos] = alignment[1][i];
142 alignment[0][0] = length1+length2-pos; /* length of alignment */
144 for(i=0; i<=length1; i++){
145 free(i_point[i]); free(j_point[i]);
147 free(i_point); free(j_point);
148 sprint_aligned_swStrings(T1,T2);
156 /*---------------------------------------------------------------------------*/
158 PRIVATE float StrEditCost(int i, int j, swString *T1, swString *T2)
160 float c, diff, cd, min, a, b, dist;
163 cd = (float) (*EditCost)[0][T2[j].type];
169 cd = (float) (*EditCost)[T1[i].type][0];
174 if( ((T1[i].sign)*(T2[j].sign)) > 0) {
175 c = (float) (*EditCost)[T1[i].type][T2[j].type];
176 diff = (float) fabs((a=T1[i].weight) - (b=T2[j].weight));
178 if (min == a) cd = (float) (*EditCost)[0][T2[j].type];
179 else cd = (float) (*EditCost)[T1[i].type][0];
180 dist = c * min + cd * diff;
182 else dist = (float) INFINITY_DIST;
186 /*---------------------------------------------------------------------------*/
188 PUBLIC swString *Make_swString(char *string)
191 int tp, len, l, length;
195 length = strlen(string);
197 for(i=0; i<length; i++) {
198 if( (string[i]=='(') || (string[i]==')') ) j++;
199 if(string[i]=='.') j+=2;
204 S= (swString *) space(sizeof(swString)*(len+1));
205 S[0].sign = j; /* number of entries */
219 if(string[k] == '(' ) l++;
220 if(string[k] == ')' ) l--;
222 DeCode(string,k,&tp,&w);
230 DeCode(string,k,&tp,&w);
251 /*---------------------------------------------------------------------------*/
253 PRIVATE void DeCode(char *string, int k, int *tp, float *w)
254 /* retrieves type and weigth for a node closed by a bracket at position k */
257 char label[20], id[20] ;
262 if( (string[i]=='(')||(string[i]==')')||(string[i]=='.') ) break;
264 label[k-i-1] = string[i]; label[k-i] = '\0';
268 if (l==0) { /* Dot-Bracket notation */
273 for (i=0; i<l; i++) {
274 if (!isalpha(label[l-i-1])) break;
275 id[i] = label[l-i-1];
285 sscanf(label,"%d",&m);
288 fprintf(stderr, "Warning: Non-integer weight in DeCode ignored\n");
297 /*---------------------------------------------------------------------------*/
299 PRIVATE int decode(char *id)
302 char label[100], *code;
310 for (i = 0; code[i] != sep; i++) {
311 if (code[i] == '\0') {
318 if (strcmp(id, label) == 0) return (n);
323 fprintf(stderr,"Syntax error: node identifier \"%s\" not found "
324 "in coding string \"%s\"\n", id, coding);
325 fprintf(stderr,"Exiting...");
330 /*---------------------------------------------------------------------------*/
332 PRIVATE void encode( int type, char label[])
338 for (i = 0; i < type; i++) {
339 while (coding[l] != sep && coding[l]) l++;
343 for (i = 0; coding[l+i] != sep; i++) {
344 if (coding[l+i] == '\0') break;
345 label[i] = coding[l+i];
350 /*---------------------------------------------------------------------------*/
352 PRIVATE void sprint_aligned_swStrings(swString *T1, swString *T2)
354 int i, j, l0, l1, ltmp=0, weights;
355 char label[10], *a0, *a1, tmp0[20], tmp1[20];
358 for (i=1; i<=T1[0].sign; i++) weights = (weights||(T1[i].weight!=0.5));
359 for (i=1; i<=T2[0].sign; i++) weights = (weights||(T2[i].weight!=0.5));
361 a0 = (char *) space(alignment[0][0]*4+2);
362 a1 = (char *) space(alignment[0][0]*4+2);
363 for(i=1; i<= alignment[0][0]; i++){
364 tmp0[0] = '\0'; l0=0;
365 if(alignment[0][i] > 0) {
366 encode(T1[alignment[0][i]].type, label);
367 if(T1[alignment[0][i]].sign > 0) {
373 sprintf(tmp0+strlen(tmp0), "%d",
374 (int)(2*T1[alignment[0][i]].weight));
376 if(T1[alignment[0][i]].sign < 0) strcat(tmp0, ")");
380 if(alignment[1][i] > 0) {
381 encode(T2[alignment[1][i]].type, label);
382 if(T2[alignment[1][i]].sign > 0) {
388 sprintf(tmp1+strlen(tmp1), "%d",
389 (int)(2*T2[alignment[1][i]].weight));
391 if(T2[alignment[1][i]].sign < 0) strcat(tmp1, ")");
395 for (j=l0; j<ltmp; j++) tmp0[j] = '_';
396 for (j=l1; j<ltmp; j++) tmp1[j] = '_';
397 tmp0[ltmp] = '\0'; tmp1[ltmp] = '\0';
399 strcat(a0,tmp0); strcat(a1,tmp1);
402 if (aligned_line[0]!= NULL) { free(aligned_line[0]); aligned_line[0]= NULL;}
403 if (aligned_line[1]!= NULL) { free(aligned_line[1]); aligned_line[1]= NULL;}
404 aligned_line[0] = strdup(a0);
406 aligned_line[1] = strdup(a1);
410 /*---------------------------------------------------------------------------*/
413 PUBLIC void print_swString(swString *x)
416 for (i=0; i<=x[0].sign; i++)
417 printf("(%d,%d,%f\n) ", x[i].type, x[i].sign, x[i].weight );
421 /*---------------------------------------------------------------------------*/
423 PUBLIC void print_alignment_list(void)
427 for (i=1; i<= alignment[0][0]; i++)
428 printf("%3d ",alignment[0][i]);
430 for (i=1; i<= alignment[0][0]; i++)
431 printf("%3d ",alignment[1][i]);