2 Tree edit distances for RNA secondary structures
3 Walter Fontana, Ivo L Hofacker, Peter F Stadler
6 /* Last changed Time-stamp: <97/10/27 15:23:48 ivo> */
12 #include "edit_cost.h"
13 #include "dist_vars.h"
15 static char rcsid[] = "$Id: treedist.c,v 1.3 1997/11/03 10:39:43 ivo Rel $";
17 #define PRIVATE static
20 #define MNODES 4000 /* Maximal number of nodes for alignment */
22 PUBLIC Tree *make_tree(char *struc);
23 PUBLIC float tree_edit_distance(Tree *T1, Tree *T2);
24 PUBLIC void print_tree(Tree *t);
25 PUBLIC void free_tree(Tree *t);
28 PRIVATE void tree_dist(int i, int j);
29 PRIVATE int edit_cost(int i, int j);
30 PRIVATE int *make_keyroots(Postorder_list *pl);
31 PRIVATE void sort(int n, int *ra);
32 PRIVATE Postorder_list *make_postorder_list(char *struc);
33 PRIVATE int decode(char *id);
34 PRIVATE int number_of_nodes(char *struc);
35 PRIVATE void encode(int type, char *label);
36 PRIVATE void print_keyroots(int *keyroots);
37 PRIVATE void print_postorder_list(Postorder_list *pl);
38 PRIVATE void backtracking(void);
39 PRIVATE void sprint_aligned_trees(void);
42 PRIVATE Tree *tree1, *tree2;
43 PRIVATE int **tdist; /* contains distances between subtrees */
44 PRIVATE int **fdist; /* contains distances between forests */
45 PRIVATE int *alignment[2]; /* contains numeric information on the alignment:
46 alignment[0][p], aligment[1][p] are aligned postions.
48 alignment[0][0] contains the length of the alignment. */
50 /*---------------------------------------------------------------------------*/
52 PUBLIC float tree_edit_distance(Tree *T1, Tree *T2)
57 if (cost_matrix==0) EditCost = &UsualCost;
58 else EditCost = &ShapiroCost;
60 n1 = T1->postorder_list[0].sons;
61 n2 = T2->postorder_list[0].sons;
63 tdist = (int **) space(sizeof(int *) * (n1+1));
64 fdist = (int **) space(sizeof(int *) * (n1+1));
65 for (i=0; i<=n1; i++) {
66 tdist[i] = (int *) space(sizeof(int) * (n2+1));
67 fdist[i] = (int *) space(sizeof(int) * (n2+1));
70 tree1 = T1; tree2 = T2;
72 for (i1 = 1; i1 <= T1->keyroots[0]; i1++) {
74 for (j1 = 1; j1 <= T2->keyroots[0]; j1++) {
83 if ((n1>MNODES)||(n2>MNODES)) nrerror("tree too large for alignment");
85 alignment[0] = (int *) space((n1+1)*sizeof(int));
86 alignment[1] = (int *) space((n2+1)*sizeof(int));
89 sprint_aligned_trees();
94 for (i=0; i<=n1; i++) {
104 /*---------------------------------------------------------------------------*/
106 PRIVATE void tree_dist(int i, int j)
108 int li,lj,i1,j1,i1_1,j1_1,li1_1,lj1_1,f1,f2,f3,f;
109 int cost, lleaf_i1, lleaf_j1;
113 li = tree1->postorder_list[i].leftmostleaf;
114 lj = tree2->postorder_list[j].leftmostleaf;
116 for (i1 = li; i1 <= i; i1++) {
117 i1_1 = (li == i1 ? 0 : i1-1);
118 fdist[i1][0] = fdist[i1_1][0] + edit_cost(i1, 0);
121 for (j1 = lj; j1 <= j; j1++) {
122 j1_1 = (lj == j1 ? 0 : j1-1);
123 fdist[0][j1] = fdist[0][j1_1] + edit_cost(0, j1);
126 for (i1 = li; i1 <= i; i1++) {
128 lleaf_i1 = tree1->postorder_list[i1].leftmostleaf;
129 li1_1 = (li > lleaf_i1-1 ? 0 : lleaf_i1-1);
130 i1_1 = (i1 == li ? 0: i1-1);
131 cost = edit_cost(i1, 0);
133 for (j1 = lj; j1 <= j; j1++) {
135 lleaf_j1 = tree2->postorder_list[j1].leftmostleaf;
136 j1_1 = (j1 == lj ? 0: j1-1);
138 f1 = fdist[i1_1][j1] + cost;
139 f2 = fdist[i1][j1_1] + edit_cost(0, j1);
141 f = f1 < f2 ? f1 : f2;
143 if (lleaf_i1 == li && lleaf_j1 == lj) {
145 f3 = fdist[i1_1][j1_1] + edit_cost(i1, j1);
147 fdist[i1][j1] = f3 < f ? f3 : f;
149 tdist[i1][j1] = fdist[i1][j1]; /* store in array permanently */
152 lj1_1 = (lj > lleaf_j1-1 ? 0 : lleaf_j1-1);
154 f3 = fdist[li1_1][lj1_1] + tdist[i1][j1];
156 fdist[i1][j1] = f3 < f ? f3 : f;
162 /*---------------------------------------------------------------------------*/
164 PRIVATE int edit_cost(int i, int j)
166 int c, diff, cd, min, a, b;
168 c = (*EditCost)[tree1->postorder_list[i].type][tree2->postorder_list[j].type];
170 diff = abs((a=tree1->postorder_list[i].weight) - (b=tree2->postorder_list[j].weight));
172 min = (a < b ? a: b);
173 if (min == a) cd = (*EditCost)[0][tree2->postorder_list[j].type];
174 else cd = (*EditCost)[0][tree1->postorder_list[i].type];
176 return (c * min + cd * diff);
180 /*---------------------------------------------------------------------------*/
182 PUBLIC Tree *make_tree(char *struc)
186 tree = (Tree *) space(sizeof(Tree));
188 tree->postorder_list = make_postorder_list(struc);
189 tree->keyroots = make_keyroots(tree->postorder_list);
195 /*---------------------------------------------------------------------------*/
197 PRIVATE int *make_keyroots(Postorder_list *pl)
202 keyroots = (int *) space(sizeof(int)*(pl[0].sons+1));
205 for (i = 1; i <= pl[0].sons; i++) {
211 while (pl[k].leftmostleaf != i) k--;
212 keyroots[++keys] = k;
216 sort(keys, keyroots);
222 /*---------------------------------------------------------------------------*/
224 PRIVATE void sort(int n, int *ra) /* heap sort, indices are 1..n !!! */
247 if (j < ir && ra[j] < ra[j+1]) ++j;
258 /*---------------------------------------------------------------------------*/
260 PRIVATE Postorder_list *make_postorder_list(char *struc)
263 Convention for structure representation "struc":
264 Nodes are one pair of matching parentheses, with the type and possibly
265 a weight of the node immediately preceding the closing parentheses.
280 .((..(((...)))..((..)))). in usual notation becomes:
283 ((U)(((U)(U)((((U)(U)(U)P)P)P)(U)(U)(((U)(U)P)P)P)P)(U)R)
285 ((U1)((U2)((U3)P3)(U2)((U2)P2)P2)(U1)R)
287 (((((H)S)((H)S)M)S)R)
291 int paren, i, l, order, local_order, w, sons, count;
295 int match_pos[MNODES], match_order[MNODES];
298 n_nodes = number_of_nodes(struc);
299 if (n_nodes>MNODES) nrerror("structure too long in make_postorder_list");
300 pl = (Postorder_list *) space(sizeof(Postorder_list)*(n_nodes+1));
301 pl[0].sons = n_nodes;
304 match_pos[paren] = 0;
305 match_order[paren] = 0;
314 match_pos[++paren] = i;
315 match_order[paren] = order;
321 while (isalpha((int) id[l])) l++;
322 if (id[l]) sscanf(id+l, "%d", &w);
325 pl[order].type = decode(id);
326 pl[order].weight = w;
327 pl[order].leftmostleaf = match_order[paren]+1;
330 local_order = match_order[paren];
331 for (p = match_pos[paren]+1; p < i; p++) {
332 if (struc[p] == '(') count++;
333 else if (struc[p] == ')') {
337 pl[local_order].father = order;
343 pl[order].sons = sons;
357 /*---------------------------------------------------------------------------*/
359 PRIVATE int decode(char *id)
362 char label[100], *code;
370 for (i = 0; code[i] != sep; i++) {
371 if (code[i] == '\0') {
378 if (strcmp(id, label) == 0) return (n);
383 fprintf(stderr,"Syntax error: node identifier \"%s\" not found "
384 "in coding string \"%s\"\n", id, coding);
385 fprintf(stderr, "Exiting...");
389 /*---------------------------------------------------------------------------*/
391 PRIVATE void encode(int type, char label[])
396 for (i = 0; i < type; i++) {
397 while (coding[l] != sep && coding[l]) l++;
401 for (i = 0; coding[l+i] != sep; i++) {
402 if (coding[l+i] == '\0') break;
403 label[i] = coding[l+i];
408 /*---------------------------------------------------------------------------*/
410 PRIVATE int number_of_nodes(char *struc)
415 for (c = 0, i = 0; i < l; i++) if (struc[i] == ')') c++;
419 /*---------------------------------------------------------------------------*/
421 PRIVATE void print_keyroots(int *keyroots)
425 printf("---> key roots <---\n\n");
427 printf("entries: %d\n", keyroots[0]);
429 for (i = 1; i <= keyroots[0]; i++) printf(" %d", keyroots[i]);
433 /*---------------------------------------------------------------------------*/
435 PRIVATE void print_postorder_list(Postorder_list *pl)
440 printf("---> postorder list <---\n\n");
442 for (i = 1; i <= pl[0].sons; i++) {
443 printf(" postorder: %3d\n", i);
445 encode(pl[i].type, label);
446 printf(" type: %3d (%s)\n", pl[i].type, label);
447 printf(" weight: %3d\n", pl[i].weight);
448 printf(" father: %3d\n", pl[i].father);
449 printf(" sons: %3d\n", pl[i].sons);
450 printf("leftmost leaf: %3d\n", pl[i].leftmostleaf);
455 /*---------------------------------------------------------------------------*/
457 PUBLIC void print_tree(Tree *t)
459 print_postorder_list(t->postorder_list);
460 print_keyroots(t->keyroots);
464 /*---------------------------------------------------------------------------*/
466 PUBLIC void free_tree(Tree *t)
468 free(t->postorder_list);
473 /*---------------------------------------------------------------------------*/
476 PRIVATE void backtracking(void)
478 int li,lj,i1,j1,i1_1,j1_1,li1_1,lj1_1,f;
479 int cost, lleaf_i1, lleaf_j1, ss, i,j,k;
480 struct {int i,j;} sector[MNODES/2];
484 i=i1=tree1->postorder_list[0].sons;
485 j=j1=tree2->postorder_list[0].sons;
488 li = tree1->postorder_list[i].leftmostleaf;
489 lj = tree2->postorder_list[j].leftmostleaf;
492 while ((i1>=li)&&(j1>=lj)) {
494 lleaf_i1 = tree1->postorder_list[i1].leftmostleaf;
495 li1_1 = (li > lleaf_i1-1 ? 0 : lleaf_i1-1);
496 i1_1 = (i1 == li ? 0: i1-1);
497 lleaf_j1 = tree2->postorder_list[j1].leftmostleaf;
498 lj1_1 = (lj > lleaf_j1-1 ? 0 : lleaf_j1-1);
499 j1_1 = (j1 == lj ? 0: j1-1);
503 cost = edit_cost(i1, 0);
504 if (f == fdist[i1_1][j1] + cost) {
509 if (f == fdist[i1][j1_1] + edit_cost(0, j1)) {
513 else if (lleaf_i1 == li && lleaf_j1 == lj) {
514 alignment[0][i1] = j1;
515 alignment[1][j1] = i1;
527 i1 = (i1 == li ? 0: i1-1);
531 j1 = (j1 == lj ? 0: j1-1);
537 i = tree1->keyroots[k];
538 if (tree1->postorder_list[i].leftmostleaf ==
539 tree1->postorder_list[i1].leftmostleaf) break;
542 j = tree2->keyroots[k];
543 if (tree2->postorder_list[j].leftmostleaf ==
544 tree2->postorder_list[j1].leftmostleaf) break;
551 /*---------------------------------------------------------------------------*/
553 PRIVATE void sprint_aligned_trees(void)
555 int i,j,n1,n2,k,l,p, ni, nj, weights;
556 char t1[2*MNODES+1], t2[2*MNODES+1], a1[8*MNODES], a2[8*MNODES], ll[20], ll1[20];
559 n1=tree1->postorder_list[0].sons;
560 n2=tree2->postorder_list[0].sons;
561 for (i=1; i<=n1; i++) weights |= (tree1->postorder_list[i].weight!=1);
562 for (i=1; i<=n2; i++) weights |= (tree2->postorder_list[i].weight!=1);
564 for (i=n1, l=2*n1-1; i>0; i--) {
565 if (alignment[0][i]!=0) t1[l--]=']';
568 while(i==tree1->postorder_list[p].leftmostleaf) {
569 if (alignment[0][p]!=0) t1[l--]='[';
571 p=tree1->postorder_list[p].father;
575 for (j=n2, l=2*n2-1; j>0; j--) {
576 if (alignment[1][j]!=0) t2[l--]=']';
579 while(j==tree2->postorder_list[p].leftmostleaf) {
580 if (alignment[1][p]!=0) t2[l--]='[';
582 p=tree2->postorder_list[p].father;
588 while (t1[i]||t2[j]) {
589 while ((t1[i]=='(')||(t1[i]==')')) {
592 encode(tree1->postorder_list[ni].type, ll);
594 sprintf(ll+strlen(ll), "%d", tree1->postorder_list[ni].weight);
595 for (k=0; k< strlen(ll); k++) {
599 a1[l]=')'; a2[l++]='_';
607 while ((t2[j]=='(')||(t2[j]==')')) {
610 encode(tree2->postorder_list[nj].type, ll);
612 sprintf(ll+strlen(ll), "%d", tree2->postorder_list[nj].weight);
613 for (k=0; k< strlen(ll); k++) {
617 a2[l]=')'; a1[l++]='_';
627 encode(tree2->postorder_list[nj].type, ll);
629 sprintf(ll+strlen(ll), "%d", tree2->postorder_list[nj].weight);
630 encode(tree1->postorder_list[ni].type, ll1);
632 sprintf(ll1+strlen(ll1), "%d", tree1->postorder_list[ni].weight);
633 if (strlen(ll)>strlen(ll1))
634 for (k=0; k<strlen(ll)-strlen(ll1); k++) strcat(ll1,"_");
635 if (strlen(ll)<strlen(ll1))
636 for (k=0; k<strlen(ll1)-strlen(ll); k++) strcat(ll,"_");
637 for (k=0; k< strlen(ll); k++) a2[l+k]=ll[k];
638 for (k=0; k< strlen(ll); k++) a1[l+k]=ll1[k];
640 a1[l]=a2[l]=')'; l++;
642 } else if (t2[j]=='[') {
643 a1[l]=a2[l]='('; l++;
648 if (l>8*MNODES) nrerror("structure too long in sprint_aligned_trees");
649 if (aligned_line[0]!= NULL) free(aligned_line[0]);
650 if (aligned_line[1]!= NULL) free(aligned_line[1]);
651 aligned_line[0] = (char *) space((l+1)*sizeof(char));
652 aligned_line[1] = (char *) space((l+1)*sizeof(char));
653 strcpy(aligned_line[0], a1);
654 strcpy(aligned_line[1], a2);
657 /*---------------------------------------------------------------------------*/