8 #include "io_lib_header.h"
9 #include "util_lib_header.h"
10 #include "define_header.h"
11 #include "dp_lib_header.h"
12 #include "fastal_lib_header.h"
13 #include "fast_tree_header.h"
17 // #define CHUNKSIZE 100
21 //Fastal_param *param_set;
24 /*! \mainpage T-Coffee Index Page
26 * \section intro_sec Introduction
28 * This is the introduction.
30 * \section install_sec Installation
32 * \subsection step1 Step 1: Opening the box
35 * \section fastal_sec Fastal
37 * This program is a very fast aligner. It is capable of aligning huge sets of sequences because it keeps as much as necessary on hard disk.
45 * \brief Source code for the fastal algorithm
50 //************************** sparse dynamic aligning **********************************************************
54 fill_arguments_sparse(Sparse_dynamic_param* method_arguments_p)
56 method_arguments_p->diagonals = vcalloc(3,sizeof(Diagonal));
57 method_arguments_p->dig_length = vcalloc(1,sizeof(int));
58 *method_arguments_p->dig_length = 3;
59 method_arguments_p->list = NULL;
60 method_arguments_p->list_length = vcalloc(1,sizeof(int));
61 *method_arguments_p->list_length = 0;
62 method_arguments_p->file_name1 = vtmpnam(NULL);
63 method_arguments_p->file_name2 = vtmpnam(NULL);
67 free_sparse(Sparse_dynamic_param* method_arguments_p)
69 vfree(method_arguments_p->diagonals);
70 vfree(method_arguments_p->dig_length);
71 vfree(method_arguments_p->list_length);
76 * \brief One run of sparse dynamic programming.
78 * \param profiles The profiles.
79 * \param param_set The fastal parameters.
80 * \param method_arguments_p The method arguments.
81 * \param is_dna Sequences are DNA (\a is_dna = 1) or protein.
82 * \param edit_file The edit file.
83 * \param prof_file the profile file.
84 * \param number Number of the parent node.
85 * \return The length of the alignment.
88 sparse_dyn(Fastal_profile **profiles,
89 Fastal_param *param_set,
90 void *method_arguments_p,
96 // printf("WHAT THE HELL ARE YOU DOING HERE?\n");
97 Sparse_dynamic_param *arguments = (Sparse_dynamic_param*)method_arguments_p;
98 // static char *file_name1 = vtmpnam(NULL);
99 // static char *file_name2 = vtmpnam(NULL);
100 char *file_name1 = arguments->file_name1;
101 char *file_name2 = arguments->file_name2;
103 Fastal_profile *tmp1 = profiles[0];
104 Fastal_profile *tmp2 = profiles[1];
106 seq1 = profile2consensus(tmp1, param_set);
107 seq2 = profile2consensus(tmp2, param_set);
110 int **diagonals_p = &(arguments->diagonals);
111 int num_diagonals = -1;
112 if (!strcmp(param_set->diag_method, "blastz"))
114 FILE *cons_f = fopen(file_name1,"w");
115 fprintf(cons_f, ">%i\n", tmp1->prf_number);
116 fprintf(cons_f, "%s", seq1);
117 fprintf( cons_f, "\n");
119 cons_f = fopen(file_name2,"w");
120 fprintf(cons_f, ">%i\n", tmp2->prf_number);
121 fprintf(cons_f, "%s", seq2);
122 fprintf( cons_f, "\n");
124 num_diagonals = seq_pair2blastz_diagonal(file_name1, file_name2, diagonals_p, arguments->dig_length, strlen(seq1),strlen(seq2), is_dna);
126 else if (!strcmp(param_set->diag_method, "blast"))
128 FILE *cons_f = fopen(file_name1,"w");
129 fprintf(cons_f, ">%i\n", tmp1->prf_number);
130 fprintf(cons_f, "%s", seq1);
131 fprintf( cons_f, "\n");
133 cons_f = fopen(file_name2,"w");
134 fprintf(cons_f, ">%i\n", tmp2->prf_number);
135 fprintf(cons_f, "%s", seq2);
136 fprintf( cons_f, "\n");
138 int l1 = strlen(seq1);
139 int l2 = strlen(seq2);
140 num_diagonals = seq_pair2blast_diagonal(file_name1, file_name2, diagonals_p, arguments->dig_length, l1, l2, is_dna);
141 // int *num_p = &num_diagonals;
142 // Segment* seg = extend_diagonals(*diagonals_p, num_p, l1, l2);
143 // printf("A: %i\n", num_diagonals);
145 else if (!strcmp(param_set->diag_method, "ktup"))
147 num_diagonals = seq_pair2diagonal_own(seq1, seq2, diagonals_p, arguments->dig_length, strlen(seq1),strlen(seq2), is_dna, 3);
148 // num_diagonals = seq_pair2diagonal_swift(seq1, seq2, diagonals_p, arguments->dig_length, strlen(seq1),strlen(seq2), is_dna, 3);
152 // arguments->diagonals = diagonals_p[0];
153 // arguments->list = segments2int(seg, *num_p, seq1, seq2, profiles[0], profiles[1], arguments->list_length, param_set);
155 // t ** segments2int_gap(Segment *diagonals, int num_diagonals, char *seq1, char *seq2, Fastal_profile *profile1, Fastal_profile *profile2, int *num_points, Fastal_param *param_set);
157 // arguments->list = diagonals2int_dot(arguments->diagonals, num_diagonals, seq1, seq2, profiles[0], profiles[1], arguments->list_length, param_set);
158 // arguments->list = diagonals2int_euclidf(arguments->diagonals, num_diagonals, seq1, seq2, profiles[0], profiles[1], arguments->list_length, param_set);
159 arguments->list = diagonals2int_gap_test(arguments->diagonals, num_diagonals, seq1, seq2, profiles[0], profiles[1], arguments->list_length, param_set);
160 // arguments->list = diagonals2int(arguments->diagonals, num_diagonals, seq1, seq2, arguments->list_length, param_set);
161 int alignment_length = list2linked_pair_wise_fastal(profiles[0], profiles[1], param_set, arguments->list, *arguments->list_length, edit_file, prof_file, number);
164 for (x = 0; x < *arguments->list_length; ++x)
166 vfree(arguments->list[x]);
168 vfree(arguments->list);
169 arguments->list = NULL;
172 return alignment_length;
177 fastal_compare (const void * a, const void * b)
179 return (*(int*)a - *(int*)b);
184 * \brief Makes a sorted list out of diagonals.
186 * \param diagonals A list of diagonals to use during dynamic programming.
187 * \param num_diagonals Number of diagonals.
188 * \param seq1 Sequence 1.
189 * \param seq2 Sequence 2.
190 * \param num_points Number of points in the list
191 * \param param_set Fastal parameters.
192 * \return A 2-dim array which contains all points needed for the sparse dynamic programming algorithm.
195 diagonals2int(int *diagonals,
200 Fastal_param *param_set)
203 int l1 = strlen(seq1);
204 int l2 = strlen(seq2);
205 int gep = param_set->gep;
213 int current_size = num_diagonals*dig_length + l1 +l2;
215 int **list = vcalloc(current_size, sizeof(int*));
216 int *diags = vcalloc(num_diagonals, sizeof(int));
218 for (i = 0; i < num_diagonals; ++i)
220 diags[i] = l1 - diagonals[i*3] + diagonals[i*3+1];
223 qsort (diags, num_diagonals, sizeof(int), fastal_compare);
226 int *diagx = vcalloc(num_diagonals, sizeof(int));
227 int *diagy = vcalloc(num_diagonals, sizeof(int));
230 //+1 because diagonals start here at position 1, like in "real" dynamic programming
232 for (i = 0; i < num_diagonals; ++i)
236 diagx[i] = l1 - diags[i];
245 for (; i < num_diagonals; ++i)
248 diagy[i] = diags[i]-l1;
255 int **M = param_set->M;
256 int *last_y = vcalloc(l2+1, sizeof(int));
257 int *last_x = vcalloc(l1+1, sizeof(int));
261 list[0] = vcalloc(6, sizeof(int));
268 for (; list_pos < tmp_l2; ++list_pos)
270 list[list_pos] = vcalloc(6, sizeof(int));
271 list[list_pos][0] = 0;
272 list[list_pos][1] = list_pos;
273 last_y[list_pos] = list_pos;
274 list[list_pos][2] = list_pos*gep;
275 list[list_pos][4] = list_pos-1;
281 while (pos_x < tmp_l1)
283 if (list_pos + num_diagonals+2 > current_size)
285 current_size += num_diagonals*1000;
286 list = vrealloc(list, current_size * sizeof(int*));
289 list[list_pos] = vcalloc(6, sizeof(int));
290 list[list_pos][0] = ++pos_x;
291 list[list_pos][1] = 0;
292 list[list_pos][2] = pos_x * gep;
293 list[list_pos][3] = last_y[0];
294 tmpy_value = list_pos;
296 last_x[pos_x] = list_pos;
300 for (i = a; i <= b; ++i)
302 list[list_pos] = vcalloc(6, sizeof(int));
304 list[list_pos][0] = ++diagx[i];
306 list[list_pos][1] = ++diagy[i];
307 list[list_pos][3] = last_y[diagy[i]];
308 list[list_pos][4] = list_pos-1;
309 list[list_pos][5] = last_y[diagy[i]-1];
310 list[list_pos][2] = M[toupper(seq1[diagx[i]-1])-'A'][toupper(seq2[diagy[i]-1])-'A'];
311 last_y[tmpy_pos] = tmpy_value;
312 tmpy_value = list_pos;
317 last_y[tmpy_pos] = tmpy_value;
321 if (list[list_pos-1][1] != l2)
323 list[list_pos] = vcalloc(6, sizeof(int));
324 list[list_pos][0] = pos_x;
325 list[list_pos][1] = l2;
326 list[list_pos][3] = last_y[l2];
328 list[list_pos][2] = -1000;
329 list[list_pos][4] = list_pos-1;
331 list[list_pos][5] = last_x[pos_x-l2];
333 list[list_pos][5] = l2-pos_x;
334 last_y[l2] = list_pos;
340 if ((b >= 0) && (diagy[b] == l2))
343 if ((a >0) && (diagx[a-1] == pos_x))
349 if (list_pos + l2+2 > current_size)
351 current_size += list_pos + l2 + 2;
352 list = vrealloc(list, current_size * sizeof(int*));
357 list[list_pos] = vcalloc(6, sizeof(int));
358 list[list_pos][0] = l1;
359 list[list_pos][1] = 0;
360 list[list_pos][3] = last_x[l1-1];
361 list[list_pos][2] = -1000;
366 for (i = 1; i <= l2; ++i)
368 list[list_pos] = vcalloc(6, sizeof(int));
369 list[list_pos][0] = l1;
370 list[list_pos][1] = i;
371 list[list_pos][3] = last_y[i];
372 list[list_pos][4] = list_pos-1;
374 if ((list[y][0] == l1-1) && (list[y][1] == i-1))
376 list[list_pos][5] = y;
377 list[list_pos][2] = M[toupper(seq1[l1-1])-'A'][toupper(seq2[i-1])-'A'];
383 list[list_pos][5] = last_x[l1-i];
387 list[list_pos][5] = i-l1;
389 list[list_pos][2] = -1000;
394 list[list_pos - l2][2] = -1000;
396 *num_points = list_pos;
406 * \brief Makes a sorted list out of diagonals.
408 * \param diagonals A list of diagonals to use during dynamic programming.
409 * \param num_diagonals Number of diagonals.
410 * \param seq1 Sequence 1.
411 * \param seq2 Sequence 2.
412 * \param num_points Number of points in the list
413 * \param param_set Fastal parameters.
414 * \return A 2-dim array which contains all points needed for the sparse dynamic programming algorithm.
417 diagonals2int_gap_test(int *diagonals, int num_diagonals, char *seq1, char *seq2, Fastal_profile *profile1, Fastal_profile *profile2, int *num_points, Fastal_param *param_set)
419 int alphabet_size = param_set->alphabet_size;
420 int l1 = strlen(seq1);
421 int l2 = strlen(seq2);
422 int gep = param_set->gep;
424 int current_size = l2+l1;
426 int **list = vcalloc(current_size, sizeof(int*));
427 int *diags = vcalloc(num_diagonals, sizeof(int));
429 for (i = 0; i < num_diagonals; ++i)
431 diags[i] = l1 - diagonals[i*3] + diagonals[i*3+1];
433 qsort (diags, num_diagonals, sizeof(int), fastal_compare);
436 int *diagx = vcalloc(num_diagonals, sizeof(int));
437 int *diagy = vcalloc(num_diagonals, sizeof(int));
438 int *old_pos = vcalloc(num_diagonals, sizeof(int));
440 //+1 because diagonals start here at position 1, like in "real" dynamic programming
442 for (i = 0; i < num_diagonals; ++i)
447 diagx[i] = l1 - diags[i];
456 for (; i < num_diagonals; ++i)
459 diagy[i] = diags[i]-l1;
466 int **M = param_set->M;
467 int *last_y = vcalloc(l2+1, sizeof(int));
468 int *last_x = vcalloc(l1+1, sizeof(int));
472 list[0] = vcalloc(6, sizeof(int));
479 for (; list_pos < tmp_l2; ++list_pos)
481 list[list_pos] = vcalloc(6, sizeof(int));
482 list[list_pos][0] = 0;
483 list[list_pos][1] = list_pos;
484 last_y[list_pos] = list_pos;
485 list[list_pos][2] = list_pos*gep;
486 list[list_pos][4] = list_pos-1;
490 // int diags_old = l2;
495 while (pos_x < tmp_l1)
497 if (list_pos + num_diagonals+2 > current_size)
499 current_size += num_diagonals*1000;
500 list = vrealloc(list, current_size * sizeof(int*));
503 list[list_pos] = vcalloc(6, sizeof(int));
504 list[list_pos][0] = ++pos_x;
505 list[list_pos][1] = 0;
506 list[list_pos][2] = pos_x * gep;
507 list[list_pos][3] = last_y[0];
508 tmpy_value = list_pos;
510 last_x[pos_x] = list_pos;
514 for (i = a; i <= b; ++i)
516 list[list_pos] = vcalloc(6, sizeof(int));
518 list[list_pos][0] = ++diagx[i];
520 list[list_pos][1] = ++diagy[i];
521 list[list_pos][3] = last_y[diagy[i]];
522 list[list_pos][4] = list_pos-1;
523 list[list_pos][5] = last_y[diagy[i]-1];
529 int num_seq = profile1->number_of_sequences + profile2->number_of_sequences;
532 for (char_c = 0; char_c < alphabet_size; ++char_c)
535 gap_num += profile1->prf[char_c][diagx[i]-1] + profile2->prf[char_c][diagy[i]-1];
540 list[list_pos][2] = M[toupper(seq1[diagx[i]-1])-'A'][toupper(seq2[diagy[i]-1])-'A'] * gap_num;
543 // int num_seq = profile1->number_of_sequences + profile2->number_of_sequences;
544 // double gap_num = 0;
545 // int char_c, char_c2;
546 // for (char_c = 0; char_c < alphabet_size; ++char_c)
547 // for (char_c2 = 0; char_c2 < alphabet_size; ++char_c2)
549 // gap_num += (profile1->prf[char_c][diagx[i]-1]/profile1->number_of_sequences) * (profile2->prf[char_c][diagy[i]-1]/profile2->number_of_sequences) * M[param_set->pos2char[char_c]-'A'][param_set->pos2char[char_c2]-'A'];
551 // list[list_pos][2] = gap_num;
554 last_y[tmpy_pos] = tmpy_value;
555 tmpy_value = list_pos;
560 last_y[tmpy_pos] = tmpy_value;
564 if (list[list_pos-1][1] != l2)
566 list[list_pos] = vcalloc(6, sizeof(int));
567 list[list_pos][0] = pos_x;
568 list[list_pos][1] = l2;
569 list[list_pos][3] = last_y[l2];
571 list[list_pos][2] = -1000;
572 list[list_pos][4] = list_pos-1;
574 list[list_pos][5] = last_x[pos_x-l2];
576 list[list_pos][5] = l2-pos_x;
577 last_y[l2] = list_pos;
583 if ((b >= 0) && (diagy[b] == l2))
586 if ((a >0) && (diagx[a-1] == pos_x))
592 if (list_pos + l2+2 > current_size)
594 current_size += list_pos + l2 + 2;
595 list = vrealloc(list, current_size * sizeof(int*));
600 list[list_pos] = vcalloc(6, sizeof(int));
601 list[list_pos][0] = l1;
602 list[list_pos][1] = 0;
603 list[list_pos][3] = last_x[l1-1];
604 list[list_pos][2] = -1000;
609 for (i = 1; i <= l2; ++i)
611 list[list_pos] = vcalloc(6, sizeof(int));
612 list[list_pos][0] = l1;
613 list[list_pos][1] = i;
614 list[list_pos][3] = last_y[i];
615 list[list_pos][4] = list_pos-1;
617 if ((list[y][0] == l1-1) && (list[y][1] == i-1))
619 list[list_pos][5] = y;
620 int num_seq = profile1->number_of_sequences + profile2->number_of_sequences;
623 for (char_c = 0; char_c < alphabet_size; ++char_c)
625 gap_num += profile1->prf[char_c][l1-1] + profile2->prf[char_c][i-1];
630 list[list_pos][2] = M[toupper(seq1[l1-1])-'A'][toupper(seq2[i-1])-'A'] * gap_num;
636 list[list_pos][5] = last_x[l1-i];
640 list[list_pos][5] = i-l1;
642 list[list_pos][2] = -1000;
647 list[list_pos - l2][2] = -1000;
649 *num_points = list_pos;
659 diagonals2int_euclidf(int *diagonals, int num_diagonals, char *seq1, char *seq2, Fastal_profile *profile1, Fastal_profile *profile2, int *num_points, Fastal_param *param_set)
661 int alphabet_size = param_set->alphabet_size;
662 int l1 = strlen(seq1);
663 int l2 = strlen(seq2);
664 int gep = param_set->gep;
666 int current_size = l2+l1;
668 int **list = vcalloc(current_size, sizeof(int*));
669 int *diags = vcalloc(num_diagonals, sizeof(int));
671 for (i = 0; i < num_diagonals; ++i)
673 diags[i] = l1 - diagonals[i*3] + diagonals[i*3+1];
676 qsort (diags, num_diagonals, sizeof(int), fastal_compare);
679 int *diagx = vcalloc(num_diagonals, sizeof(int));
680 int *diagy = vcalloc(num_diagonals, sizeof(int));
681 int *old_pos = vcalloc(num_diagonals, sizeof(int));
683 //+1 because diagonals start here at position 1, like in "real" dynamic programming
685 for (i = 0; i < num_diagonals; ++i)
690 diagx[i] = l1 - diags[i];
699 for (; i < num_diagonals; ++i)
702 diagy[i] = diags[i]-l1;
709 // int **M = param_set->M;
710 int *last_y = vcalloc(l2+1, sizeof(int));
711 int *last_x = vcalloc(l1+1, sizeof(int));
715 list[0] = vcalloc(6, sizeof(int));
722 for (; list_pos < tmp_l2; ++list_pos)
724 list[list_pos] = vcalloc(6, sizeof(int));
725 list[list_pos][0] = 0;
726 list[list_pos][1] = list_pos;
727 last_y[list_pos] = list_pos;
728 list[list_pos][2] = list_pos*gep;
729 list[list_pos][4] = list_pos-1;
733 // int diags_old = l2;
738 while (pos_x < tmp_l1)
740 if (list_pos + num_diagonals+2 > current_size)
742 current_size += num_diagonals*1000;
743 list = vrealloc(list, current_size * sizeof(int*));
746 list[list_pos] = vcalloc(6, sizeof(int));
747 list[list_pos][0] = ++pos_x;
748 list[list_pos][1] = 0;
749 list[list_pos][2] = pos_x * gep;
750 list[list_pos][3] = last_y[0];
751 tmpy_value = list_pos;
753 last_x[pos_x] = list_pos;
757 for (i = a; i <= b; ++i)
759 list[list_pos] = vcalloc(6, sizeof(int));
761 list[list_pos][0] = ++diagx[i];
763 list[list_pos][1] = ++diagy[i];
764 list[list_pos][3] = last_y[diagy[i]];
765 list[list_pos][4] = list_pos-1;
766 list[list_pos][5] = last_y[diagy[i]-1];
768 double tmp_score = 0;
770 for (char_c = 0; char_c < alphabet_size; ++char_c)
772 freq1 = (double)profile1->prf[char_c][diagx[i]-1] / profile1->number_of_sequences;
774 freq2 = (double)profile2->prf[char_c][diagy[i]-1] / profile2->number_of_sequences;
776 tmp_score += ( freq1 - freq2) * (freq1 - freq2);
779 list[list_pos][2] = 10 - sqrt(tmp_score);
781 last_y[tmpy_pos] = tmpy_value;
782 tmpy_value = list_pos;
787 last_y[tmpy_pos] = tmpy_value;
791 if (list[list_pos-1][1] != l2)
793 list[list_pos] = vcalloc(6, sizeof(int));
794 list[list_pos][0] = pos_x;
795 list[list_pos][1] = l2;
796 list[list_pos][3] = last_y[l2];
798 list[list_pos][2] = -1000;
799 list[list_pos][4] = list_pos-1;
801 list[list_pos][5] = last_x[pos_x-l2];
803 list[list_pos][5] = l2-pos_x;
804 last_y[l2] = list_pos;
810 if ((b >= 0) && (diagy[b] == l2))
813 if ((a >0) && (diagx[a-1] == pos_x))
819 if (list_pos + l2+2 > current_size)
821 current_size += list_pos + l2 + 2;
822 list = vrealloc(list, current_size * sizeof(int*));
827 list[list_pos] = vcalloc(6, sizeof(int));
828 list[list_pos][0] = l1;
829 list[list_pos][1] = 0;
830 list[list_pos][3] = last_x[l1-1];
831 list[list_pos][2] = -1000;
836 for (i = 1; i <= l2; ++i)
838 list[list_pos] = vcalloc(6, sizeof(int));
839 list[list_pos][0] = l1;
840 list[list_pos][1] = i;
841 list[list_pos][3] = last_y[i];
842 list[list_pos][4] = list_pos-1;
844 if ((list[y][0] == l1-1) && (list[y][1] == i-1))
846 list[list_pos][5] = y;
850 for (char_c = 0; char_c < alphabet_size; ++char_c)
852 freq1 = profile1->prf[char_c][l1-1] / profile1->number_of_sequences;
853 freq2 = profile2->prf[char_c][i-1] / profile2->number_of_sequences;
854 tmp_score += ( freq1 - freq2) * (freq2 - freq1);
856 list[list_pos][2] = 10 - sqrt(tmp_score);
857 // list[list_pos][2] = M[toupper(seq1[l1-1])-'A'][toupper(seq2[i-1])-'A'];
863 list[list_pos][5] = last_x[l1-i];
867 list[list_pos][5] = i-l1;
869 list[list_pos][2] = -1000;
874 list[list_pos - l2][2] = -1000;
876 *num_points = list_pos;
885 diagonals2int_dot(int *diagonals, int num_diagonals, char *seq1, char *seq2, Fastal_profile *profile1, Fastal_profile *profile2, int *num_points, Fastal_param *param_set)
887 int alphabet_size = param_set->alphabet_size;
888 int l1 = strlen(seq1);
889 int l2 = strlen(seq2);
890 int gep = param_set->gep;
892 int current_size = l2+l1;
894 int **list = vcalloc(current_size, sizeof(int*));
895 int *diags = vcalloc(num_diagonals, sizeof(int));
897 for (i = 0; i < num_diagonals; ++i)
899 diags[i] = l1 - diagonals[i*3] + diagonals[i*3+1];
902 qsort (diags, num_diagonals, sizeof(int), fastal_compare);
905 int *diagx = vcalloc(num_diagonals, sizeof(int));
906 int *diagy = vcalloc(num_diagonals, sizeof(int));
907 int *old_pos = vcalloc(num_diagonals, sizeof(int));
909 //+1 because diagonals start here at position 1, like in "real" dynamic programming
911 for (i = 0; i < num_diagonals; ++i)
916 diagx[i] = l1 - diags[i];
925 for (; i < num_diagonals; ++i)
928 diagy[i] = diags[i]-l1;
935 // int **M = param_set->M;
936 int *last_y = vcalloc(l2+1, sizeof(int));
937 int *last_x = vcalloc(l1+1, sizeof(int));
941 list[0] = vcalloc(6, sizeof(int));
948 for (; list_pos < tmp_l2; ++list_pos)
950 list[list_pos] = vcalloc(6, sizeof(int));
951 list[list_pos][0] = 0;
952 list[list_pos][1] = list_pos;
953 last_y[list_pos] = list_pos;
954 list[list_pos][2] = list_pos*gep;
955 list[list_pos][4] = list_pos-1;
959 // int diags_old = l2;
964 while (pos_x < tmp_l1)
966 if (list_pos + num_diagonals+2 > current_size)
968 current_size += num_diagonals*1000;
969 list = vrealloc(list, current_size * sizeof(int*));
972 list[list_pos] = vcalloc(6, sizeof(int));
973 list[list_pos][0] = ++pos_x;
974 list[list_pos][1] = 0;
975 list[list_pos][2] = pos_x * gep;
976 list[list_pos][3] = last_y[0];
977 tmpy_value = list_pos;
979 last_x[pos_x] = list_pos;
983 for (i = a; i <= b; ++i)
985 list[list_pos] = vcalloc(6, sizeof(int));
987 list[list_pos][0] = ++diagx[i];
989 list[list_pos][1] = ++diagy[i];
990 list[list_pos][3] = last_y[diagy[i]];
991 list[list_pos][4] = list_pos-1;
992 list[list_pos][5] = last_y[diagy[i]-1];
994 double tmp_score = 0;
996 for (char_c = 0; char_c < alphabet_size; ++char_c)
998 freq1 = (double)profile1->prf[char_c][diagx[i]-1] / profile1->number_of_sequences;
1000 freq2 = (double)profile2->prf[char_c][diagy[i]-1] / profile2->number_of_sequences;
1002 tmp_score += freq1 * freq2;
1005 list[list_pos][2] = tmp_score * 10;
1007 last_y[tmpy_pos] = tmpy_value;
1008 tmpy_value = list_pos;
1009 tmpy_pos = diagy[i];
1013 last_y[tmpy_pos] = tmpy_value;
1017 if (list[list_pos-1][1] != l2)
1019 list[list_pos] = vcalloc(6, sizeof(int));
1020 list[list_pos][0] = pos_x;
1021 list[list_pos][1] = l2;
1022 list[list_pos][3] = last_y[l2];
1024 list[list_pos][2] = -1000;
1025 list[list_pos][4] = list_pos-1;
1027 list[list_pos][5] = last_x[pos_x-l2];
1029 list[list_pos][5] = l2-pos_x;
1030 last_y[l2] = list_pos;
1036 if ((b >= 0) && (diagy[b] == l2))
1039 if ((a >0) && (diagx[a-1] == pos_x))
1045 if (list_pos + l2+2 > current_size)
1047 current_size += list_pos + l2 + 2;
1048 list = vrealloc(list, current_size * sizeof(int*));
1053 list[list_pos] = vcalloc(6, sizeof(int));
1054 list[list_pos][0] = l1;
1055 list[list_pos][1] = 0;
1056 list[list_pos][3] = last_x[l1-1];
1057 list[list_pos][2] = -1000;
1062 for (i = 1; i <= l2; ++i)
1064 list[list_pos] = vcalloc(6, sizeof(int));
1065 list[list_pos][0] = l1;
1066 list[list_pos][1] = i;
1067 list[list_pos][3] = last_y[i];
1068 list[list_pos][4] = list_pos-1;
1070 if ((list[y][0] == l1-1) && (list[y][1] == i-1))
1072 list[list_pos][5] = y;
1075 double freq1, freq2;
1076 for (char_c = 0; char_c < alphabet_size; ++char_c)
1078 freq1 = profile1->prf[char_c][l1-1] / profile1->number_of_sequences;
1079 freq2 = profile2->prf[char_c][i-1] / profile2->number_of_sequences;
1080 tmp_score += freq2 * freq1;
1082 list[list_pos][2] = tmp_score * 10;
1083 // list[list_pos][2] = M[toupper(seq1[l1-1])-'A'][toupper(seq2[i-1])-'A'];
1089 list[list_pos][5] = last_x[l1-i];
1093 list[list_pos][5] = i-l1;
1095 list[list_pos][2] = -1000;
1100 list[list_pos - l2][2] = -1000;
1102 *num_points = list_pos;
1112 combine_profiles2file(int **prf1,
1116 Fastal_param *param_set,
1120 int alphabet_size = param_set->alphabet_size;
1121 char *pos2aa = &(param_set->pos2char[0]);
1126 for (i = 0; i < alphabet_size; ++i)
1127 if (prf1[i][pos1] + prf2[i][pos2] > 0)
1130 fprintf(prof_f," %c%i", pos2aa[i],prf1[i][pos1]+prf2[i][pos2]);
1132 fprintf(prof_f,"%c%i", pos2aa[i],prf1[i][pos1]+prf2[i][pos2]);
1135 fprintf(prof_f,"\n");
1137 else if (state == 'D')
1139 for (i = 0; i < alphabet_size; ++i)
1140 if (prf2[i][pos2] > 0)
1143 fprintf(prof_f," %c%i", pos2aa[i],prf2[i][pos2]);
1145 fprintf(prof_f,"%c%i", pos2aa[i],prf2[i][pos2]);
1148 fprintf(prof_f,"\n");
1152 for (i = 0; i < alphabet_size; ++i)
1153 if (prf1[i][pos1] > 0)
1156 fprintf(prof_f," %c%i", pos2aa[i],prf1[i][pos1]);
1158 fprintf(prof_f,"%c%i", pos2aa[i],prf1[i][pos1]);
1161 fprintf(prof_f,"\n");
1167 #define LIN(a,b,c) a[b*5+c]
1169 * Calculates a fast and sparse dynamic programming matrix
1171 * \param prf1 Profile of first sequence.
1172 * \param prf2 Profile of second sequence.
1173 * \param param_set The parameter for the alignment.
1174 * \param list The list of diagonals.
1175 * \param n number of dots.
1176 * \param edit_f File to save the edit information.
1177 * \param prof_f File to save the profile.
1178 * \param node_number Number of the new profile.
1181 list2linked_pair_wise_fastal(Fastal_profile *prf1,
1182 Fastal_profile *prf2,
1183 Fastal_param *param_set,
1190 int a,b, i, j, LEN=0, start_trace = -1;
1191 int pi, pj,ij, delta_i, delta_j, prev_i, prev_j;
1192 // static int **slist;
1193 static long *MI, *MJ, *MM,*MT2;
1194 // static int *sortseq;
1195 static int max_size;
1196 int gop, gep, igop, igep;
1201 int nomatch = param_set->nomatch;
1206 al=declare_char (2,l1+l2+1);
1210 igop=param_set->gop;
1211 gep=igep=param_set->gep;
1216 vfree (MI);vfree (MJ); vfree (MM);
1218 MI=vcalloc (5*n, sizeof (long));
1219 MJ=vcalloc (5*n, sizeof (long));
1220 MM=vcalloc (5*n, sizeof (long));
1227 LIN(MI,a,b)=LIN(MJ,a,b)=LIN(MJ,a,b)=-1000000;
1236 if (i==l1 || j==l2)gop=0;
1239 if (i==l1 && j==l2)start_trace=a;
1240 else if ( i==0 || j==0)
1242 LIN(MM,a,0)=-1000000;
1245 LIN(MJ,a,0)=-10000000;
1250 LIN(MI,a,0)=-10000000;
1254 LIN(MI,a,1)=LIN(MJ,a,1)=-1;
1255 LIN(MI,a,2)=LIN(MJ,a,2)=i;
1256 LIN(MI,a,3)=LIN(MJ,a,3)=j;
1267 delta_i=list[a][0]-list[pi][0];
1268 delta_j=list[a][1]-list[pj][1];
1271 LIN(MI,a,0)=MAX(LIN(MI,pi,0),(LIN(MM,pi,0)+gop))+delta_i*gep;
1273 LIN(MI,a,2)=delta_i;
1275 LIN(MI,a,4)=(LIN(MI,pi,0) >=(LIN(MM,pi,0)+gop))?'i':'m';
1277 LIN(MJ,a,0)=MAX(LIN(MJ,pj,0),(LIN(MM,pj,0)+gop))+delta_j*gep;
1280 LIN(MJ,a,3)=delta_j;
1282 LIN(MJ,a,4)=(LIN(MJ,pj,0) >=LIN(MM,pj,0)+gop)?'j':'m';
1284 if (a>1 && (ls=list[a][0]-list[ij][0])==(list[a][1]-list[ij][1]))
1286 LIN(MM,a,0)=MAX3(LIN(MM,ij,0),LIN(MI,ij,0),LIN(MJ,ij,0))+list[a][2]-(ls*nomatch);
1291 if ( LIN(MM,ij,0) >=LIN(MI,ij,0) && LIN(MM,ij,0)>=LIN(MJ,ij,0))LIN(MM,a,4)='m';
1292 else if ( LIN(MI,ij,0) >= LIN(MJ,ij,0))LIN(MM,a,4)='i';
1293 else LIN(MM,a,4)='j';
1298 LIN(MM,a,0)=UNDEFINED;
1304 if (LIN(MM,a,0)>=LIN(MI,a,0) && LIN(MM,a,0) >=LIN(MJ,a,0))MT2=MM;
1305 else if ( LIN(MI,a,0)>=LIN(MJ,a,0))MT2=MI;
1308 score=MAX3(LIN(MM,a,0), LIN(MI,a,0), LIN(MJ,a,0));
1313 while (!(i==0 &&j==0))
1316 l=MAX(LIN(MT2,a,2),LIN(MT2,a,3));
1317 // HERE ("%c from %c %d %d SCORE=%d [%d %d] [%2d %2d]", T2[a][5],T2[a][4], T2[a][2], T2[a][3], T2[a][0], gop, gep, i, j);
1337 // else if (l==0) {HERE ("L=0 i=%d j=%d",l, i, j);exit (0);}
1340 for (b=0; b<l; b++, LEN++)
1342 if (LIN(MT2,a,2)){al[0][LEN]=1;i--;ni++;}
1345 if (LIN(MT2,a,3)){al[1][LEN]=1;j--;nj++;}
1349 next_a=LIN(MT2,a,1);
1350 if (LIN(MT2,a,4)=='m')MT2=MM;
1351 else if (LIN(MT2,a,4)=='i')MT2=MI;
1352 else if (LIN(MT2,a,4)=='j')MT2=MJ;
1357 invert_list_char ( al[0], LEN);
1358 invert_list_char ( al[1], LEN);
1360 fprintf(edit_f, "%i\n%i\n%i\n%i\n",prf1->prf_number, prf2->prf_number, prf1->is_leaf, prf2->is_leaf);
1361 fprintf(prof_f, "%i\n0\n%i\n1\n%i\n", node_number,LEN, prf1->number_of_sequences+prf2->number_of_sequences);
1363 char statec[] = {'M','D','I'};
1369 for ( b=0; b< LEN; b++)
1371 if ((al[0][b]==1) && (al[1][b]==1))
1374 combine_profiles2file(prf1->prf, prf2->prf, i, j, param_set, prof_f, 'M');
1379 fprintf(edit_f, "%c%i\n",statec[state], num);
1386 else if (al[0][b]==1)
1388 // prf1->prf[param_set->alphabet_size-1] += prf2->num_sequences;
1389 combine_profiles2file(prf1->prf, prf2->prf, i, j, param_set, prof_f, 'I');
1393 fprintf(edit_f, "%c%i\n",statec[state], num);
1400 else if (al[1][b]==1)
1402 // prf2->prf[param_set->alphabet_size-1] += prf1->num_sequences;
1403 combine_profiles2file(prf1->prf, prf2->prf, i, j, param_set, prof_f, 'D');
1407 fprintf(edit_f, "%c%i\n",statec[state], num);
1417 fprintf(edit_f, "%c%i\n",statec[state], num);
1422 fprintf(edit_f,"*\n");
1423 fprintf(prof_f,"*\n");
1435 * \brief Turns a profile into a consensus sequence.
1437 * The character with the highest number of occurences is used as consensus. Gaps are not included. For example: 10 '-' and one 'A' would give 'A' as consensus.
1438 * \param profile The profile.
1439 * \param file_name Name of the file to save the consensus sequence in.
1440 * \param param_set The parameter of the fastal algorithm.
1441 * \return the sequence
1444 profile2consensus(Fastal_profile *profile, Fastal_param *param_set)
1447 // FILE *cons_f = fopen(file_name,"w");
1448 // fprintf(cons_f, ">%i\n", profile->prf_number);
1449 char* seq = vcalloc(profile->length+1, sizeof(char));
1451 int most_pos = -1, most;
1452 int alphabet_size = param_set->alphabet_size;
1453 int **prf = profile->prf;
1454 char *pos2char = param_set->pos2char;
1455 for (i = 0; i < profile->length; ++i)
1458 for (j = 0; j < alphabet_size; ++j)
1460 if (prf[j][i] > most)
1466 seq[i] = pos2char[most_pos];
1467 // fprintf(cons_f, "%c",pos2char[most_pos]);
1474 diag_compare (const void * a, const void * b)
1476 return (((Diagonal_counter*)b)->count - ((Diagonal_counter*)a)->count);
1480 * \brief Calculates the diagonals between two sequences.
1482 * Uses to calculate the diagonals.
1483 * \param seq_file1 File with sequence 1.
1484 * \param seq_file2 File with sequence 2.
1485 * \param diagonals An array where the diagonal points will be stored.
1486 * \param dig_length length of \a diagonals .
1487 * \param num_points Number of points in all diagonals.
1488 * \return number of diagonals;
1491 seq_pair2diagonal_own(char *seq1,
1505 word_number = (int)pow(5, word_length);
1510 word_number = (int)pow(24, word_length);
1513 int **word_index = vcalloc(word_number, sizeof(int*));
1514 for (i = 0 ; i < word_number; ++i)
1516 word_index[i] = vcalloc(20, sizeof(int));
1517 word_index[i][0] = 2;
1518 word_index[i][1] = 20;
1522 //making of k-tup index of seq1
1524 int *prod=vcalloc (word_length, sizeof(int));
1525 for ( i=0; i<word_length; i++)
1527 prod[word_length-i-1]=(int)pow(ng,i);
1567 for (i = 0; i < word_length; ++i)
1569 index += aa[(short)seq1[i]] *prod[i];
1571 word_index[index][2] = 0;
1572 word_index[index][0] = 3;
1577 index -= aa[(short)seq1[++z]] * prod[0];
1579 index += aa[(short)seq1[i]];
1580 tmp = word_index[index];
1581 if (tmp[0] == tmp[1])
1584 tmp = vrealloc(tmp, word_index[index][1] *sizeof(int));
1585 word_index[index] = tmp;
1592 //counting diagonals
1593 const int window_length = 14;
1595 Diagonal_counter *diag_index = vcalloc(l1+l2, sizeof(Diagonal_counter));
1597 for (i = 0; i < num; ++i)
1599 diag_index[i].diagonal = i;
1600 diag_index[i].count = 0;
1605 for (i = 0; i < word_length; ++i)
1607 index += aa[(short)seq2[i]] *prod[i];
1608 for (j = 2; j < word_index[index][0]; ++j)
1610 ++(diag_index[i - word_index[index][j] + l1].count);
1616 int second_index = index;
1617 for (; i < window_length; ++i)
1619 index -= aa[(short)seq2[++z]] * prod[0];
1621 index += aa[(short)seq2[i]];
1622 tmp = word_index[index];
1623 for (j = 2; j < tmp[0]; ++j)
1625 ++(diag_index[i - tmp[j] + l1].count);
1631 index -= aa[(short)seq2[++z]] * prod[0];
1633 index += aa[(short)seq2[i]];
1634 second_index -= aa[(short)seq2[++z2]] * prod[0];
1636 second_index += aa[(short)seq2[++i2]];
1638 tmp = word_index[index];
1639 for (j = 2; j < tmp[0]; ++j)
1641 ++(diag_index[i - tmp[j] + l1].count);
1645 tmp = word_index[second_index];
1646 for (j = 2; j < tmp[0]; ++j)
1648 if (diag_index[i2 - tmp[j] + l1].count > window_length-3)
1649 diag_index[i2 - tmp[j] + l1].count = window_length+100;
1651 --diag_index[i2 - tmp[j] + l1].count;
1657 int *diags = diagonals[0];
1658 int current_pos = 0;
1661 qsort (diag_index, num, sizeof(Diagonal_counter*), diag_compare);
1665 while (diag_index[i].count > window_length+10)
1667 if (current_pos > (*dig_length)-3)
1669 (*dig_length) += 30;
1670 diags = vrealloc(diags, sizeof(int)*(*dig_length));
1674 y = diag_index[i].diagonal - l1;
1684 diags[current_pos++] = x;
1685 diags[current_pos++] = y;
1686 diags[current_pos++] = 200;
1691 for (i = 0; i < word_number; ++i)
1692 vfree(word_index[i]);
1694 diagonals[0] = diags;
1695 return current_pos/3;
1701 seq_pair2diagonal_swift(char *seq1,
1714 word_number = (int)pow(5, word_length);
1719 word_number = (int)pow(24, word_length);
1722 int **word_index = vcalloc(word_number, sizeof(int*));
1723 for (i = 0 ; i < word_number; ++i)
1725 word_index[i] = vcalloc(20, sizeof(int));
1726 word_index[i][0] = 2;
1727 word_index[i][1] = 20;
1731 //making of k-tup index of seq1
1733 int *prod=vcalloc (word_length, sizeof(int));
1734 for ( i=0; i<word_length; i++)
1736 prod[word_length-i-1]=(int)pow(ng,i);
1746 for (i = 0; i < word_length; ++i)
1748 index += aa[(short)seq1[i]] *prod[i];
1750 word_index[index][2] = 0;
1751 word_index[index][0] = 3;
1756 index -= aa[(short)seq1[++z]] * prod[0];
1758 index += aa[(short)seq1[i]];
1759 tmp = word_index[index];
1760 if (tmp[0] == tmp[1])
1763 tmp = vrealloc(tmp, word_index[index][1] *sizeof(int));
1764 word_index[index] = tmp;
1770 //counting diagonals
1771 const int window_length = 14;
1772 const int threshold = 12;
1774 Swift_diagonal *diag_index = vcalloc(l1+l2, sizeof(Swift_diagonal));
1776 for (i = 0; i < num; ++i)
1778 diag_index[i].diagonal = i;
1779 diag_index[i].count = 0;
1780 diag_index[i].start = -99999;
1781 diag_index[i].end = -99999;
1787 for (i = 0; i < word_length; ++i)
1789 index += aa[(short)seq2[i]] *prod[i];
1790 for (j = 2; j < word_index[index][0]; ++j)
1792 ++(diag_index[i - word_index[index][j] + l1].count);
1800 index -= aa[(short)seq2[++z]] * prod[0];
1802 index += aa[(short)seq2[i]];
1803 tmp = word_index[index];
1804 for (j = 2; j < tmp[0]; ++j)
1806 tmp_index = i - tmp[j] + l1;
1807 if (i - diag_index[tmp_index].end > window_length)
1809 if (diag_index[tmp_index].count < threshold)
1811 diag_index[tmp_index].count = 0;
1812 diag_index[tmp_index].start = i;
1813 diag_index[tmp_index].end = i + word_length;
1819 ++(diag_index[tmp_index].count);
1828 int *diags = diagonals[0];
1829 int current_pos = 0;
1831 for (i = 0; i < num; ++i)
1833 if (diag_index[i].count > threshold)
1835 if (current_pos > (*dig_length)-3)
1837 (*dig_length) += 30;
1838 diags = vrealloc(diags, sizeof(int)*(*dig_length));
1840 y = diag_index[i].diagonal - l1;
1850 diags[current_pos++] = x;
1851 diags[current_pos++] = y;
1852 diags[current_pos++] = 200;
1857 for (i = 0; i < word_number; ++i)
1858 vfree(word_index[i]);
1860 diagonals[0] = diags;
1862 return current_pos/3;
1868 * \brief Calculates the diagonals between two sequences.
1870 * Uses a k-tup index to choose diagonals.
1871 * \param seq_file1 File with sequence 1.
1872 * \param seq_file2 File with sequence 2.
1873 * \param diagonals An array where the diagonal points will be stored.
1874 * \param dig_length length of \a diagonals .
1875 * \param num_points Number of points in all diagonals.
1876 * \return number of diagonals;
1879 seq_pair2blast_diagonal(char *seq_file_name1,
1880 char *seq_file_name2,
1887 // static int blast_measure[12]={0,0,0,0,0,0,0,0,0,0,0,0};
1888 int *diag = vcalloc(l1 + l2, sizeof(int));
1889 char *out_file = vtmpnam(NULL);
1890 char blast_command[200];
1891 // char blast_command2[200];
1895 // printf("BLAST-Types:\n");
1896 // for (i = 0; i < 11; ++i)
1898 // printf("Type %i: %i\n", i, blast_measure[i]);
1902 // char blast_command2[600];
1903 // sprintf(blast_command2, "less %s", out_file);
1907 sprintf(blast_command, "bl2seq -p blastn -i %s -j %s -D 1 -g F -o %s -S 1 -F F", seq_file_name1, seq_file_name2, out_file);
1911 sprintf(blast_command, "bl2seq -p blastp -i %s -j %s -D 1 -g F -o %s -F F -S 1", seq_file_name1, seq_file_name2, out_file);
1913 system(blast_command);
1915 int *diags = diagonals[0];
1916 FILE *diag_f = fopen(out_file,"r");
1918 fgets(line, 300, diag_f);
1919 fgets(line, 300, diag_f);
1920 fgets(line, 300, diag_f);
1923 char delims[] = "\t";
1924 int length, pos_q, pos_d;
1925 int current_pos = 0;
1926 while (fgets(line, 300, diag_f) != NULL)
1928 strtok(line, delims);
1929 strtok(NULL, delims);
1930 strtok(NULL, delims);
1931 length = atoi(strtok(NULL, delims));
1932 strtok(NULL, delims);
1933 strtok(NULL, delims);
1934 pos_q = atoi(strtok(NULL, delims))-1;
1935 strtok(NULL, delims);
1936 pos_d = atoi(strtok(NULL, delims))-1;
1938 if (current_pos >= *dig_length-20)
1940 (*dig_length) += 90;
1941 diags = vrealloc(diags, sizeof(int)*(*dig_length));
1943 if (diag[l1-(pos_q)+pos_d] == 0)
1945 diag[l1-(pos_q)+pos_d] =1;
1946 diags[current_pos++] = pos_q;
1947 diags[current_pos++] = pos_d;
1948 diags[current_pos++] = length;
1954 int e_threshold = 10;
1955 while ((current_pos == 0) && (round < 10))
1959 sprintf(blast_command, "bl2seq -p blastn -i %s -j %s -D 1 -g F -o %s -S 1 -F F -W 6 -e %i", seq_file_name1, seq_file_name2, out_file, e_threshold);
1963 sprintf(blast_command, "bl2seq -p blastp -i %s -j %s -D 1 -g F -o %s -F F -S 1 -e %i", seq_file_name1, seq_file_name2, out_file, e_threshold);
1965 system(blast_command);
1967 FILE *diag_f = fopen(out_file,"r");
1969 fgets(line, 300, diag_f);
1970 fgets(line, 300, diag_f);
1971 fgets(line, 300, diag_f);
1974 char delims[] = "\t";
1975 while (fgets(line, 300, diag_f) != NULL)
1977 strtok(line, delims);
1978 strtok(NULL, delims);
1979 strtok(NULL, delims);
1980 length = atoi(strtok(NULL, delims));
1981 strtok(NULL, delims);
1982 strtok(NULL, delims);
1983 pos_q = atoi(strtok(NULL, delims))-1;
1984 strtok(NULL, delims);
1985 pos_d = atoi(strtok(NULL, delims))-1;
1987 if (current_pos >= *dig_length-20)
1989 (*dig_length) += 90;
1990 diags = vrealloc(diags, sizeof(int)*(*dig_length));
1992 if (diag[l1-(pos_q)+pos_d] == 0)
1994 diag[l1-(pos_q)+pos_d] =1;
1995 diags[current_pos++] = pos_q;
1996 diags[current_pos++] = pos_d;
1997 diags[current_pos++] = length;
2002 if (current_pos < 27)
2005 // ++blast_measure[round];
2007 if (current_pos == 0)
2009 printf("BLAST NOT SUCCESFULL\n");
2013 int diff = l2 - l1 + 10;
2014 for (i = diff; i > 0; --i)
2016 if (current_pos >= *dig_length-20)
2018 (*dig_length) += 90;
2019 diags = vrealloc(diags, sizeof(int)*(*dig_length));
2021 diags[current_pos++] = 0;
2022 diags[current_pos++] = i;
2023 diags[current_pos++] = 100;
2026 // printf("A: %i\n", diff);
2027 for (i = 0; i < diff; ++i)
2029 if (current_pos >= *dig_length-20)
2031 (*dig_length) += 90;
2032 diags = vrealloc(diags, sizeof(int)*(*dig_length));
2034 diags[current_pos++] = i;
2035 diags[current_pos++] = 0;
2036 diags[current_pos++] = 100;
2043 // printf("A: %i\n", diff);
2044 for (i = diff; i > 0; --i)
2046 if (current_pos >= *dig_length-20)
2048 (*dig_length) += 90;
2049 diags = vrealloc(diags, sizeof(int)*(*dig_length));
2051 diags[current_pos++] = 0;
2052 diags[current_pos++] = i;
2053 diags[current_pos++] = 100;
2055 diff = l1 - l2 + 10;
2056 for (i = 0; i < diff; ++i)
2058 if (current_pos >= *dig_length-20)
2060 (*dig_length) += 90;
2061 diags = vrealloc(diags, sizeof(int)*(*dig_length));
2063 diags[current_pos++] = i;
2064 diags[current_pos++] = 0;
2065 diags[current_pos++] = 100;
2074 diagonals[0] = diags;
2075 return current_pos/3;
2081 seq_pair2blat_diagonal(char *seq_file_name1,
2082 char *seq_file_name2,
2089 int *diag = vcalloc(l1 + l2, sizeof(int));
2090 char *out_file = vtmpnam(NULL);
2091 char blast_command[200];
2092 // char blast_command2[200];
2093 // char blast_command2[600];
2094 // sprintf(blast_command2, "less %s", out_file);
2098 sprintf(blast_command, "blat %s %s %s -out=blast8 -q=dna -t=dna -maxGap=0 >/dev/null 2>/dev/null", seq_file_name2, seq_file_name1, out_file);
2102 sprintf(blast_command, "blat %s %s %s -out=blast8 -prot -maxGap=0 >/dev/null 2>/dev/null", seq_file_name2, seq_file_name1, out_file);
2104 system(blast_command);
2106 int *diags = diagonals[0];
2107 FILE *diag_f = fopen(out_file,"r");
2109 // fgets(line, 300, diag_f);
2110 // fgets(line, 300, diag_f);
2111 // fgets(line, 300, diag_f);
2114 char delims[] = "\t";
2115 int length, pos_q, pos_d;
2116 int current_pos = 0;
2117 while (fgets(line, 300, diag_f) != NULL)
2119 strtok(line, delims);
2120 strtok(NULL, delims);
2121 strtok(NULL, delims);
2122 length = atoi(strtok(NULL, delims));
2123 strtok(NULL, delims);
2124 strtok(NULL, delims);
2125 pos_q = atoi(strtok(NULL, delims))-1;
2126 strtok(NULL, delims);
2127 pos_d = atoi(strtok(NULL, delims))-1;
2129 if (current_pos >= *dig_length-20)
2131 (*dig_length) += 90;
2132 diags = vrealloc(diags, sizeof(int)*(*dig_length));
2134 if (diag[l1-(pos_q)+pos_d] == 0)
2136 diag[l1-(pos_q)+pos_d] =1;
2137 diags[current_pos++] = pos_q;
2138 diags[current_pos++] = pos_d;
2139 diags[current_pos++] = length;
2142 if (current_pos == 0)
2144 printf("BLAT NOT SUCCESFULL\n");
2148 int diff = l2 - l1 + 10;
2149 for (i = diff; i > 0; --i)
2151 if (current_pos >= *dig_length-20)
2153 (*dig_length) += 90;
2154 diags = vrealloc(diags, sizeof(int)*(*dig_length));
2156 diags[current_pos++] = 0;
2157 diags[current_pos++] = i;
2158 diags[current_pos++] = 100;
2161 // printf("A: %i\n", diff);
2162 for (i = 0; i < diff; ++i)
2164 if (current_pos >= *dig_length-20)
2166 (*dig_length) += 90;
2167 diags = vrealloc(diags, sizeof(int)*(*dig_length));
2169 diags[current_pos++] = i;
2170 diags[current_pos++] = 0;
2171 diags[current_pos++] = 100;
2178 // printf("A: %i\n", diff);
2179 for (i = diff; i > 0; --i)
2181 if (current_pos >= *dig_length-20)
2183 (*dig_length) += 90;
2184 diags = vrealloc(diags, sizeof(int)*(*dig_length));
2186 diags[current_pos++] = 0;
2187 diags[current_pos++] = i;
2188 diags[current_pos++] = 100;
2190 diff = l1 - l2 + 10;
2191 for (i = 0; i < diff; ++i)
2193 if (current_pos >= *dig_length-20)
2195 (*dig_length) += 90;
2196 diags = vrealloc(diags, sizeof(int)*(*dig_length));
2198 diags[current_pos++] = i;
2199 diags[current_pos++] = 0;
2200 diags[current_pos++] = 100;
2209 diagonals[0] = diags;
2210 return current_pos/3;
2216 * \brief Calculates the diagonals between two sequences.
2218 * Uses blastz to calculate the diagonals.
2219 * \param seq_file1 File with sequence 1.
2220 * \param seq_file2 File with sequence 2.
2221 * \param diagonals An array where the diagonal points will be stored.
2222 * \param dig_length length of \a diagonals .
2223 * \param num_points Number of points in all diagonals.
2224 * \return number of diagonals;
2227 seq_pair2blastz_diagonal(char *seq_file_name1,
2228 char *seq_file_name2,
2235 int *diag = vcalloc(l1 + l2, sizeof(int));
2236 char *out_file = vtmpnam(NULL);
2237 char blast_command[200];
2238 // char blast_command2[200];
2239 // char blast_command2[600];
2240 // sprintf(blast_command2, "less %s", out_file);
2244 sprintf(blast_command, "~/Download/blastz-source/blastz %s %s B=0 K=10000> %s", seq_file_name1, seq_file_name2, out_file);
2248 printf("SORRY - no BLASTZ with amino accid\n");
2251 system(blast_command);
2253 int *diags = diagonals[0];
2254 FILE *diag_f = fopen(out_file,"r");
2256 char delims[] = " ";
2257 // char *result = NULL;
2258 int length, pos_q, pos_d;
2259 int current_pos = 0;
2260 while (fgets(line, 300, diag_f) != NULL)
2265 while (fgets(line, 300, diag_f) != NULL)
2272 line_tmp = &line[4];
2273 if (current_pos >= *dig_length-20)
2275 (*dig_length) += 90;
2276 diags = vrealloc(diags, sizeof(int)*(*dig_length));
2278 pos_q = atoi(strtok(line_tmp, delims));
2279 pos_d = atoi(strtok(NULL, delims));
2280 length = atoi(strtok(NULL, delims) - pos_q);
2281 if (diag[l1-(pos_q)+pos_d] == 0)
2283 diag[l1-(pos_q)+pos_d] =1;
2284 diags[current_pos++] = pos_q;
2285 diags[current_pos++] = pos_d;
2286 diags[current_pos++] = length;
2294 if (current_pos == 0)
2296 printf("BLASTZ NOT SUCCESFULL\n");
2300 int diff = l2 - l1 + 10;
2302 for (i = diff; i > 0; --i)
2304 if (current_pos >= *dig_length-20)
2306 (*dig_length) += 90;
2307 diags = vrealloc(diags, sizeof(int)*(*dig_length));
2309 diags[current_pos++] = 0;
2310 diags[current_pos++] = i;
2311 diags[current_pos++] = 100;
2315 for (i = 0; i < diff; ++i)
2317 if (current_pos >= *dig_length-20)
2319 (*dig_length) += 90;
2320 diags = vrealloc(diags, sizeof(int)*(*dig_length));
2322 diags[current_pos++] = i;
2323 diags[current_pos++] = 0;
2324 diags[current_pos++] = 100;
2332 for (i = diff; i > 0; --i)
2334 if (current_pos >= *dig_length-20)
2336 (*dig_length) += 90;
2337 diags = vrealloc(diags, sizeof(int)*(*dig_length));
2339 diags[current_pos++] = 0;
2340 diags[current_pos++] = i;
2341 diags[current_pos++] = 100;
2343 diff = l1 - l2 + 10;
2344 for (i = 0; i < diff; ++i)
2346 if (current_pos >= *dig_length-20)
2348 (*dig_length) += 90;
2349 diags = vrealloc(diags, sizeof(int)*(*dig_length));
2351 diags[current_pos++] = i;
2352 diags[current_pos++] = 0;
2353 diags[current_pos++] = 100;
2361 diagonals[0] = diags;
2362 return current_pos/3;
2367 //************************** needleman-wunsch aligning **********************************************************
2371 fill_arguments_nw(Nw_param* method_arguments_p, int alphabet_size)
2373 method_arguments_p-> dyn_matrix = vcalloc(1,sizeof(double*));
2374 method_arguments_p->dyn_matrix[0] = vcalloc(1,sizeof(double));
2375 method_arguments_p->length1 = vcalloc(1,sizeof(int));
2376 method_arguments_p->length2 = vcalloc(1,sizeof(int));
2377 *method_arguments_p->length1 = 1;
2378 *method_arguments_p->length2 = 1;
2379 method_arguments_p->sumup_prf = vcalloc(alphabet_size+1,sizeof(int*));
2381 for (i = 0; i < alphabet_size+1; ++i)
2382 method_arguments_p->sumup_prf[i] = vcalloc(1,sizeof(int));
2383 method_arguments_p->sumup_length = vcalloc(1,sizeof(int));
2384 *method_arguments_p->sumup_length = 1;
2389 free_nw(Nw_param* method_arguments_p, int alphabet_size)
2391 free_dyn_matrix(*method_arguments_p->length1,method_arguments_p->dyn_matrix);
2393 for (i = 0; i <= alphabet_size; ++i)
2395 vfree(method_arguments_p->sumup_prf[i]);
2397 vfree(method_arguments_p->sumup_prf);
2398 vfree(method_arguments_p->length1);
2399 vfree(method_arguments_p->length2);
2400 vfree(method_arguments_p->sumup_length);
2405 * \brief One run of needleman-wunsch dynamic programming.
2407 * \param profiles The profiles.
2408 * \param param_set The fastal parameters.
2409 * \param method_arguments_p The method arguments.
2410 * \param is_dna Sequences are DNA (\a is_dna = 1) or protein.
2411 * \param edit_file The edit file.
2412 * \param prof_file the profile file.
2413 * \param number Number of the parent node.
2414 * \return The length of the alignment.
2417 nw_dyn(Fastal_profile **profiles, Fastal_param *param_set, void *method_arguments_p, int is_dna, FILE *edit_file, FILE *prof_file, int number)
2419 Nw_param *arguments = (Nw_param*)method_arguments_p;
2420 // int old_length1 = *arguments->length1;
2421 // int old_length2 = *arguments->length2;
2422 arguments->dyn_matrix = resize_dyn_matrix(arguments->dyn_matrix, *arguments->length1, *arguments->length2, profiles[0]->length+1, profiles[1]->length+1);
2423 *arguments->length1 = profiles[0]->length+1;
2424 *arguments->length2 = profiles[1]->length+1;
2425 int alignment_length = prf_nw(profiles[0], profiles[1], arguments->dyn_matrix, edit_file, arguments->sumup_prf, arguments->sumup_length, param_set);
2426 write2file(arguments->sumup_prf, alignment_length, prof_file, number,profiles[0]->number_of_sequences + profiles[1]->number_of_sequences, param_set);
2427 return alignment_length;
2432 * \brief This method takes a profile and turns it into a sumed up version.
2434 * Required for NW-algorithm.
2435 * \param profile The profile to sum up.
2436 * \param sumup A field where the result will be stored.
2437 * \param param_set Parameters for the fastal algorithm.
2438 * \return The new \a sumup.
2441 sumup_profile(Fastal_profile *profile,
2443 Fastal_param *param_set)
2446 char *pos2aa = &(param_set->pos2char[0]);
2447 int alphabet_size = param_set->alphabet_size;
2448 int **M = param_set->M;
2449 int prof_length = profile->length;
2453 for (i = 0; i < prof_length; ++i)
2455 sumup[alphabet_size][i] = 0;
2456 for (k = 0; k < alphabet_size; ++k)
2459 sumup[alphabet_size][i] += profile->prf[k][i];
2460 for (j = 0; j < alphabet_size; ++j)
2462 sumup[k][i] += profile->weight * profile->prf[j][i] * M[pos2aa[j]-'A'][pos2aa[k]-'A'];
2472 * \brief Turns the dynamic programming matrix into a editfile and calculates the new profile.
2474 * Required for NW-algorithm.
2475 * \param prog_matrix The dynamic programming matrix.
2476 * \param prf1 The first profile.
2477 * \param prf2 The second profile.
2478 * \param edit_f A File object (already opened) to write the edit sequence into.
2479 * \param prf_field A 2-dim array to save the new profile into.
2480 * \param field_length Length of the new profile.
2481 * \param param_set Parameters of the Fastal-Algorithm
2484 nw_matrix2edit_file(double **prog_matrix, //dynamic programming matrix
2485 Fastal_profile *prf1, //profile of dim1
2486 Fastal_profile *prf2, //profile of dim2
2487 FILE *edit_f, //file to safe the edit in
2488 int **prf_field, //space to safe the new profile
2490 Fastal_param *param_set) //length of prf_field
2492 // int **M = param_set->M;
2493 int alphabet_size = param_set->alphabet_size;
2494 double gap_cost = param_set -> gop;
2495 fprintf(edit_f, "%i\n%i\n%i\n%i\n",prf1->prf_number, prf2->prf_number, prf1->is_leaf, prf2->is_leaf);
2496 int sum[] = {0,0,0};
2497 char sumc[] = {'M','I','D'};
2503 int prf1_length = prf1->length;
2504 int prf2_length = prf2->length;
2505 while ((n < prf1_length) && (m < prf2_length))
2507 //if necesarry allocate more memory for result
2508 if ((*field_length)-alphabet_size < field_pos)
2510 (*field_length) += ENLARGEMENT_PER_STEP;
2512 for (i = 0; i <alphabet_size+1; ++i)
2514 prf_field[i] = vrealloc(prf_field[i], (*field_length)*sizeof(int));
2518 if (prog_matrix[n][m] == (prog_matrix[n+1][m] +gap_cost))
2520 for (i = 0; i<alphabet_size; ++i)
2522 prf_field[i][field_pos] = prf1->prf[i][n];
2529 fprintf(edit_f,"%c%i\n",sumc[last],sum[last]);
2535 else if (prog_matrix[n][m] == (prog_matrix[n][m+1] +gap_cost))
2538 for (i = 0; i<alphabet_size; ++i)
2540 prf_field[i][field_pos] = prf2->prf[i][m];
2546 fprintf(edit_f,"%c%i\n",sumc[last],sum[last]);
2554 for (i = 0; i<alphabet_size; ++i)
2556 prf_field[i][field_pos] = prf1->prf[i][n] + prf2->prf[i][m];
2563 fprintf(edit_f,"%c%i\n",sumc[last],sum[last]);
2570 fprintf(edit_f,"%c%i\n",sumc[last],sum[last]);
2574 while (n < prf1_length)
2576 for (i = 0; i<alphabet_size; ++i)
2578 prf_field[i][field_pos] = prf1->prf[i][n];
2585 fprintf(edit_f,"I%i\n",last);
2589 while (m < prf2_length)
2591 for (i = 0; i<alphabet_size; ++i)
2593 prf_field[i][field_pos] = prf2->prf[i][m];
2600 fprintf(edit_f,"D%i\n",last);
2601 fprintf(edit_f,"*\n");
2608 * \brief Pairwise alignments of profile is done here.
2610 * \param profile1 Profile of sequence 1
2611 * \param profile2 Profile of sequence 2
2612 * \param prog_matrix Matrix for dynamic programming
2613 * \param edit_file_name The edit_file_name
2614 * \param sumup_prf The sumup version of profile 1, which later contains the aligned profile.
2615 * \param sumup_length Contains the length of the aligned profile.
2616 * \return length of the aligned profile
2619 prf_nw(Fastal_profile *profile1,
2620 Fastal_profile *profile2,
2621 double **prog_matrix,
2622 FILE *edit_file_name,
2625 Fastal_param *param_set)
2627 int alphabet_size = param_set->alphabet_size;
2628 double gap_cost = param_set->gop;
2631 if (*sumup_length < profile1->length)
2633 for (i = 0; i < alphabet_size+1; ++i)
2635 sumup_prf[i] = vrealloc(sumup_prf[i], profile1->length*sizeof(int));
2637 *sumup_length = profile1->length;
2639 sumup_prf = sumup_profile(profile1, sumup_prf, param_set);
2644 int prof1_length = profile1->length;
2645 int prof2_length = profile2->length;
2647 // int** M = param_set->M;
2649 // int amino_counter;
2650 int residue_pairs = 0;
2652 for (i = prof2_length; i > 0; --i)
2654 prog_matrix[prof1_length][i] = gap_cost * (prof2_length-i);
2658 prog_matrix[prof1_length][prof2_length] = 0.0;
2663 prog_matrix[i][prof2_length] = gap_cost*(prof1_length-i);
2668 for (k = 0; k < alphabet_size; ++k)
2670 residue_pairs += profile2->prf[k][j];
2671 match_score += (profile2->prf[k][j] * sumup_prf[k][i]);
2673 match_score /= (residue_pairs * sumup_prf[alphabet_size][i]);
2674 prog_matrix[i][j] = MAX3(prog_matrix[i+1][j+1]+match_score, prog_matrix[i+1][j]+gap_cost, prog_matrix[i][j+1]+gap_cost);
2680 return nw_matrix2edit_file(prog_matrix, profile1, profile2, edit_file_name, sumup_prf, sumup_length, param_set);
2684 /************** GOTOH ***********************/
2688 fill_arguments_gotoh(Gotoh_param* method_arguments_p, int alphabet_size)
2690 method_arguments_p->m_matrix = vcalloc(1,sizeof(double*));
2691 method_arguments_p->m_matrix[0] = vcalloc(1,sizeof(double));
2692 method_arguments_p->d_matrix = vcalloc(1,sizeof(double*));
2693 method_arguments_p->d_matrix[0] = vcalloc(1,sizeof(double));
2694 method_arguments_p->i_matrix = vcalloc(1,sizeof(double*));
2695 method_arguments_p->i_matrix[0] = vcalloc(1,sizeof(double));
2696 method_arguments_p->length1 = vcalloc(1,sizeof(int));
2697 method_arguments_p->length2 = vcalloc(1,sizeof(int));
2698 method_arguments_p->log_saver = vcalloc(alphabet_size+1, sizeof(double*));
2699 *method_arguments_p->length1 = 1;
2700 *method_arguments_p->length2 = 1;
2701 method_arguments_p->sumup_prf = vcalloc(alphabet_size+1,sizeof(int*));
2703 for (i = 0; i < alphabet_size+1; ++i)
2705 method_arguments_p->sumup_prf[i] = vcalloc(1,sizeof(int));
2706 method_arguments_p->log_saver[i] = vcalloc(1, sizeof(double));
2708 method_arguments_p->sumup_length = vcalloc(1,sizeof(int));
2709 *method_arguments_p->sumup_length = 1;
2714 free_gotoh(Gotoh_param* method_arguments_p, int alphabet_size)
2716 free_dyn_matrix(*method_arguments_p->length1,method_arguments_p->m_matrix);
2717 free_dyn_matrix(*method_arguments_p->length1,method_arguments_p->i_matrix);
2718 free_dyn_matrix(*method_arguments_p->length1,method_arguments_p->d_matrix);
2721 for (i = 0; i <= alphabet_size; ++i)
2723 vfree(method_arguments_p->sumup_prf[i]);
2725 vfree(method_arguments_p->sumup_prf);
2726 vfree(method_arguments_p->length1);
2727 vfree(method_arguments_p->length2);
2728 vfree(method_arguments_p->sumup_length);
2733 gotoh_dyn(Fastal_profile **profiles, Fastal_param *param_set, void *method_arguments_p, int is_dna, FILE *edit_file, FILE *prof_file, int number)
2735 Gotoh_param *arguments = (Gotoh_param*)method_arguments_p;
2736 arguments->m_matrix = resize_dyn_matrix(arguments->m_matrix, *arguments->length1, *arguments->length2, profiles[0]->length+1, profiles[1]->length+1);
2737 arguments->i_matrix = resize_dyn_matrix(arguments->i_matrix, *arguments->length1, *arguments->length2, profiles[0]->length+1, profiles[1]->length+1);
2738 arguments->d_matrix = resize_dyn_matrix(arguments->d_matrix, *arguments->length1, *arguments->length2, profiles[0]->length+1, profiles[1]->length+1);
2740 if (profiles[1]->length > *arguments->length2-1)
2742 for (i = 0; i < param_set->alphabet_size; ++i)
2744 arguments->log_saver[i] = vrealloc(arguments->log_saver[i], (profiles[1]->length)*sizeof(double*));
2747 *arguments->length1 = profiles[0]->length+1;
2748 *arguments->length2 = profiles[1]->length+1;
2749 int alignment_length = prf_gotoh(profiles[0], profiles[1], edit_file, arguments, param_set);
2750 write2file(arguments->sumup_prf, alignment_length, prof_file, number, profiles[0]->number_of_sequences + profiles[1]->number_of_sequences, param_set);
2751 return alignment_length;
2756 gotoh_matrix2edit_file(double **m_matrix, //dynamic programming matrix
2757 double **v_matrix, //dynamic programming matrix
2758 double **h_matrix, //dynamic programming matrix
2759 Fastal_profile *prf1, //profile of dim1
2760 Fastal_profile *prf2, //profile of dim2
2761 FILE *edit_f, //file to safe the edit in
2762 int **prf_field, //space to safe the new profile
2764 Fastal_param *param_set) //length of prf_field
2766 double comp_num = log((double)prf1->number_of_sequences) + log((double)prf2->number_of_sequences);
2767 int** M = param_set->M;
2768 int alphabet_size = param_set->alphabet_size;
2769 double gep = param_set -> gep;
2770 fprintf(edit_f, "%i\n%i\n%i\n%i\n",prf1->prf_number, prf2->prf_number, prf1->is_leaf, prf2->is_leaf);
2771 int sum[] = {0,0,0};
2772 char sumc[] = {'M','I','D'};
2778 int prf1_length = prf1->length;
2779 int prf2_length = prf2->length;
2780 int current_mode = 0;
2781 //determine start mode
2782 char *pos2char = param_set->pos2char;
2783 if (h_matrix[n][m] == m_matrix[n][m])
2790 if (v_matrix[n][m] == m_matrix[n][m])
2799 // printf("%f %f %f - %i\n",h_matrix[n][m],v_matrix[n][m],m_matrix[n][m], current_mode);
2800 while ((n < prf1_length) && (m < prf2_length))
2802 //if necesarry allocate more memory for result
2803 if ((*field_length)-alphabet_size < field_pos)
2805 (*field_length) += ENLARGEMENT_PER_STEP;
2807 for (i = 0; i <alphabet_size+1; ++i)
2809 prf_field[i] = vrealloc(prf_field[i], (*field_length)*sizeof(int));
2814 if (current_mode == 2)
2816 for (i = 0; i<alphabet_size; ++i)
2818 prf_field[i][field_pos] = prf2->prf[i][m];
2820 if (h_matrix[n][m] != (h_matrix[n][m+1]+gep))
2828 fprintf(edit_f,"%c%i\n",sumc[last],sum[last]);
2836 if (current_mode== 1)
2838 for (i = 0; i<alphabet_size; ++i)
2840 prf_field[i][field_pos] = prf1->prf[i][n];
2842 if (v_matrix[n][m] != (v_matrix[n+1][m]+gep))
2851 fprintf(edit_f,"%c%i\n",sumc[last],sum[last]);
2859 double match_score = 0.0;
2860 int char_c, char_c2;
2861 for (char_c = 0; char_c < alphabet_size; ++char_c)
2863 for (char_c2 = 0; char_c2 < alphabet_size; ++char_c2)
2866 if ((log(prf1->prf[char_c][n]) != -1) && ( log(prf2->prf[char_c2][m]) != -1))
2868 match_score += exp(log((double)prf1->prf[char_c][n]) + log((double)prf2->prf[char_c2][m])-comp_num) * M[pos2char[char_c]-'A'][pos2char[char_c2]-'A'];
2872 if (m_matrix[n+1][m+1] + match_score != m_matrix[n][m])
2874 if (m_matrix[n][m] == v_matrix[n][m])
2879 if (m_matrix[n][m] == h_matrix[n][m])
2885 for (i = 0; i<alphabet_size; ++i)
2887 prf_field[i][field_pos] = prf1->prf[i][n] + prf2->prf[i][m];
2894 fprintf(edit_f,"%c%i\n",sumc[last],sum[last]);
2903 fprintf(edit_f,"%c%i\n",sumc[last],sum[last]);
2905 int needed = MAX(prf1_length -n, prf2_length -m);
2907 if ((*field_length) - needed -10 < field_pos)
2909 (*field_length) += needed +10;
2911 for (i = 0; i <alphabet_size+1; ++i)
2913 prf_field[i] = vrealloc(prf_field[i], (*field_length)*sizeof(int));
2918 while (n < prf1_length)
2920 for (i = 0; i<alphabet_size; ++i)
2922 prf_field[i][field_pos] = prf1->prf[i][n];
2929 fprintf(edit_f,"I%i\n",last);
2933 while (m < prf2_length)
2935 for (i = 0; i<alphabet_size; ++i)
2937 prf_field[i][field_pos] = prf2->prf[i][m];
2944 fprintf(edit_f,"D%i\n",last);
2946 fprintf(edit_f,"*\n");
2954 * \brief The gotoh dynamic programming algorithm.
2956 * \param profile1 The first profile.
2959 prf_gotoh(Fastal_profile *profile1,
2960 Fastal_profile *profile2,
2961 FILE *edit_file_name,
2962 Gotoh_param *arguments,
2963 Fastal_param *param_set)
2966 // printf("I AM HERE - again\n");
2967 int **sumup_prf = arguments->sumup_prf;
2968 int *sumup_length = arguments->sumup_length;
2969 int alphabet_size = param_set->alphabet_size;
2970 double gop = param_set->gop;
2971 double gep = param_set->gep;
2973 const int INF = -999999;
2976 double **m_matrix = arguments->m_matrix;
2977 double **h_matrix = arguments->i_matrix;
2978 double **v_matrix = arguments->d_matrix;
2981 int prof1_length = profile1->length;
2982 int prof2_length = profile2->length;
2984 int** M = param_set->M;
2986 for (i = prof2_length; i >= 0; --i)
2988 m_matrix[prof1_length][i] = gop + gep * (prof2_length-i);
2989 v_matrix[prof1_length][i] = INF;
2990 h_matrix[prof1_length][i] = m_matrix[prof1_length][i];
2993 m_matrix[prof1_length][prof2_length] = 0.0;
2994 h_matrix[prof1_length][prof2_length] = INF;
2995 v_matrix[prof1_length][prof2_length] = INF;
2997 double comp_num = log((double)profile1->number_of_sequences) + log((double)profile2->number_of_sequences);
2998 static double *log_test = NULL;
3000 log_test = vcalloc(alphabet_size, sizeof(double));
3002 int **prf1 = profile1->prf;
3003 int **prf2 = profile2->prf;
3004 double **log_test2 = arguments->log_saver;
3005 for (l = 0; l < alphabet_size; ++l)
3007 for (i = 0; i < profile2->length; ++i)
3011 log_test2[l][i] = log(prf2[l][i]);
3014 log_test2[l][i] = -1;
3018 char *pos2char = param_set->pos2char;
3024 for (l = 0; l < alphabet_size; ++l)
3027 log_test[l] = log((double)prf1[l][i]);
3031 m_matrix[i][prof2_length] = gop + gep *(prof1_length-i);
3032 v_matrix[i][prof2_length] = m_matrix[i][prof2_length];
3033 h_matrix[i][prof2_length] = INF;
3038 v_matrix[i][j] = (MAX(v_matrix[i+1][j], m_matrix[i+1][j] + gop) + gep);
3039 h_matrix[i][j] = (MAX(h_matrix[i][j+1], m_matrix[i][j+1] + gop) + gep);
3041 int char_c, char_c2;
3043 for (char_c = 0; char_c < alphabet_size; ++char_c)
3045 for (char_c2 = 0; char_c2 < alphabet_size; ++char_c2)
3048 if ((log_test[char_c] != -1) && (log_test2[char_c2][j] != -1))
3050 match_score += exp(log_test[char_c] + log_test2[char_c2][j]-comp_num) * M[pos2char[char_c]-'A'][pos2char[char_c2]-'A'];
3055 m_matrix[i][j] = m_matrix[i+1][j+1]+match_score;
3057 if (m_matrix[i][j] < v_matrix[i][j])
3059 m_matrix[i][j] = v_matrix[i][j];
3061 if (m_matrix[i][j] < h_matrix[i][j])
3063 m_matrix[i][j] = h_matrix[i][j];
3070 return gotoh_matrix2edit_file(m_matrix, v_matrix, h_matrix, profile1, profile2, edit_file_name, sumup_prf, sumup_length, param_set);
3074 /************* OTHER STUFF ******************/
3077 * \brief Writes the alignment into the profile file and the edit file.
3079 * \param profiles The two profiles to combine.
3080 * \param alignment The alinment information.
3081 * \param alignment The length of the alignment.
3082 * \param edit_f The edit file.
3083 * \param prof_f The profile file.
3084 * \param node_number the new node number.
3087 alignment2files(Fastal_profile **profiles,
3088 Fastal_param *param_set,
3090 int alignment_length,
3095 fprintf(edit_f, "%i\n%i\n%i\n%i\n",profiles[0]->prf_number, profiles[1]->prf_number, profiles[0]->is_leaf, profiles[1]->is_leaf);
3096 fprintf(prof_f, "%i\n0\n%i\n1\n", node_number, alignment_length);
3098 int **prf1 = profiles[0]->prf;
3099 int **prf2 = profiles[1]->prf;
3104 char statec[] = {'M','D','I'};
3108 while (i < alignment_length)
3111 pos1 = alignment[0][pos];
3112 pos2 = alignment[1][pos];
3114 if ((pos1 != -1) && (pos2 != -1))
3117 combine_profiles2file(prf1, prf2, pos1, pos2, param_set, prof_f, 'M');
3120 fprintf(edit_f, "%c%i\n",statec[state], num);
3128 // insertion in seq 1
3129 else if (pos1 != -1)
3131 combine_profiles2file(prf1, prf2, pos1, pos2, param_set, prof_f, 'I');
3134 fprintf(edit_f, "%c%i\n",statec[state], num);
3142 // deletion in seq 1
3143 else if (pos2 != -1)
3145 combine_profiles2file(prf1, prf2, pos1, pos2, param_set, prof_f, 'D');
3148 fprintf(edit_f, "%c%i\n",statec[state], num);
3158 fprintf(edit_f, "%c%i\n",statec[state], num);
3160 fprintf(edit_f,"*\n");
3161 fprintf(prof_f,"*\n");
3166 //******************************* OTHER STUFF ***********************
3169 * \brief Reads the sequence from a given position in a fasta file and turns it into a profile.
3171 * \param seq_file The file where the sequence is stored.
3172 * \param off_set The off_set from the beginning of the file to the position of the sequence name.
3173 * \param profile The profile where the sequence will be stored into.
3174 * \param prf_number The number of this profile.
3177 file_pos2profile(FILE *seq_file, //File with sequences
3178 long off_set, //offset of sequence from the beginning of file point to the sequence name, not to the sequence itself
3179 Fastal_profile *profile, //profile to save into
3180 int prf_number, //number of the profile
3181 Fastal_param *param_set)
3183 int alphabet_size = param_set->alphabet_size;
3184 profile->is_leaf = 1;
3185 profile->number_of_sequences = 1;
3186 int *aa2pos = &(param_set->char2pos[0]);
3187 const int LINE_LENGTH = 500;
3188 char line[LINE_LENGTH];
3189 profile->num_sequences = 1;
3190 profile->prf_number = prf_number;
3191 fseek (seq_file , off_set , SEEK_SET );
3193 fgets (line, LINE_LENGTH , seq_file);
3197 while(fgets(line, LINE_LENGTH, seq_file)!=NULL)
3201 line[LINE_LENGTH-1] = '\n';
3202 if (seq_length + LINE_LENGTH >= profile->allocated_memory)
3204 for (i = 0; i < alphabet_size; ++i)
3206 profile->prf[i] = vrealloc(profile->prf[i], (profile->allocated_memory+PROFILE_ENLARGEMENT)*sizeof(int));
3208 profile->allocated_memory += PROFILE_ENLARGEMENT;
3213 while ((line[i] != '\n') && (line[i] != '\0'))
3217 for(j = 0; j<alphabet_size; ++j )
3218 profile->prf[j][seq_length+x] = 0;
3219 profile->prf[aa2pos[toupper((short)line[i])]][seq_length+x] = 1;
3230 profile->length = seq_length;
3237 * \brief Constructs index of fasta_file.
3239 * The index is of length n (n= number of sequences in the given multi fasta file.). In the order of appearance in the file the position of each sequence in the file is stored.
3240 * \param file_name The file with the sequences.
3241 * \param file_positions Array to save the positions in.
3242 * \return The number of sequences in \a file_name.
3245 make_index_of_file(char *file_name, //file with sequences
3246 long **file_positions) //array to save the positions
3248 const int LINE_LENGTH = 150;
3249 (*file_positions) = vcalloc(ENLARGEMENT_PER_STEP, sizeof(long));
3251 FILE *file = fopen(file_name,"r");
3253 char line[LINE_LENGTH];
3255 int num_of_sequences = 0;
3256 int mem_for_pos = ENLARGEMENT_PER_STEP;
3261 printf("FILE NOT FOUND\n");
3266 (*file_positions)[num_of_sequences] = ftell(file);
3267 while(fgets(line, LINE_LENGTH , file)!=NULL)
3269 // int length = strlen(line);
3274 if (num_of_sequences == mem_for_pos)
3276 (*file_positions) = vrealloc((*file_positions),(ENLARGEMENT_PER_STEP+mem_for_pos) * sizeof(long));
3277 mem_for_pos += ENLARGEMENT_PER_STEP;
3280 (*file_positions)[num_of_sequences] = ftell(file);
3285 return num_of_sequences;
3291 * \brief Reads a profile from a profile file.
3293 * \param prof A Fastal_profile object to save the profile in.
3294 * \param profile_f file where the profile is stored.
3295 * \param position Position of the profile in \a profile_f.
3296 * \param param_set The parameter set for Fastal
3300 profile_file2profile(Fastal_profile *prof, //structure to save the profile in
3301 FILE *profile_f, //file where the profile is stored
3302 long position, //position in profile_f where the profile is stored
3303 Fastal_param *param_set)
3306 int alphabet_size = param_set->alphabet_size;
3308 int *aa2pos = &(param_set->char2pos[0]);
3311 fseek(profile_f,position,SEEK_SET);
3312 const int LINE_LENGTH = 500;
3315 fgets(line, LINE_LENGTH, profile_f);
3317 prof->prf_number = atoi(line);
3318 fgets(line, LINE_LENGTH, profile_f);
3319 prof->is_leaf = atoi(line);
3321 fgets(line, LINE_LENGTH, profile_f);
3322 prof->length = atoi(line);
3323 fgets(line, LINE_LENGTH, profile_f);
3324 prof->weight = atoi(line);
3325 fgets(line, LINE_LENGTH, profile_f);
3326 prof->number_of_sequences = atoi(line);
3328 if (prof->length > prof->allocated_memory)
3329 for (i = 0;i < alphabet_size; ++i)
3331 prof->prf[i] = vrealloc(prof->prf[i],prof->length*sizeof(int));
3333 prof->allocated_memory = prof->length;
3334 char delims[] = " ";
3335 char *result = NULL;
3336 char *result_num = NULL;
3338 int length = prof->length;
3340 for (i = 0; i < length; ++i)
3342 for(j = 0; j<alphabet_size; ++j )
3343 prof->prf[j][i] = 0;
3344 fgets(line, LINE_LENGTH , profile_f);
3345 result = strtok( line, delims );
3347 while( result != NULL)
3349 result_num = &result[1];
3350 prof->prf[aa2pos[(short)result[0]]][i] = atoi(result_num);
3351 result = strtok( NULL, delims );
3359 * \brief Writes a profile into a file
3361 * \param profile Pointer to the profile which has to be saved.
3362 * \param file A File object (already opened) to write the profile to.
3363 * \param param_set The parameters for the fastal algorithm.
3366 profile2file(Fastal_profile *profile, //the profile to save
3367 FILE* file, //file to save in
3368 Fastal_param *param_set)
3370 int alphabet_size = param_set->alphabet_size;
3372 char *pos2aa = &(param_set->pos2char[0]);
3374 fseek(file,0,SEEK_SET);
3376 fprintf(file,"%i\n", profile->prf_number);
3379 fprintf(file,"%i\n", profile->is_leaf);
3380 fprintf(file,"%i\n", profile->length);
3381 fprintf(file,"%i\n", profile->weight);
3383 int max = profile->length;
3388 for (j = 0; j < alphabet_size; ++j)
3389 if (profile->prf[j][i] > 0)
3392 fprintf(file," %c%i", pos2aa[j],profile->prf[j][i]);
3394 fprintf(file,"%c%i", pos2aa[j],profile->prf[j][i]);
3397 if (profile->prf[j][i] > 0)
3400 fprintf(file," %c%i", pos2aa[j],profile->prf[j][i]);
3402 fprintf(file,"%c%i", pos2aa[j],profile->prf[j][i]);
3409 fprintf(file,"*\n");
3415 * Reads the profile out of an alignment (NOT IN USE)
3418 // file2profile(FILE* profile_f, //file to read the profile of
3419 // Fastal_profile *prof, //profile saved in here
3420 // int prf_number, //number of the profile
3421 // Fastal_param *param_set)
3423 // int alphabet_size = param_set->alphabet_size;
3425 // int *aa2pos = &(param_set->char2pos[0]);
3428 // fseek(profile_f,0,SEEK_SET);
3429 // const int LINE_LENGTH = 500;
3432 // fgets(line, LINE_LENGTH, profile_f);
3433 // prof->prf_number = atoi(line);
3434 // fgets(line, LINE_LENGTH, profile_f);
3435 // prof->is_leaf = atoi(line);
3437 // fgets(line, LINE_LENGTH, profile_f);
3438 // prof->length = atoi(line);
3440 // fgets(line, LINE_LENGTH, profile_f);
3441 // prof->weight = atoi(line);
3443 // if (prof->length > prof->allocated_memory)
3444 // for (i = 0;i < alphabet_size; ++i)
3446 // prof->prf[i] = vrealloc(prof->prf[i],prof->length*sizeof(int));
3449 // char delims[] = " ";
3450 // char *result = NULL;
3451 // char *result_num = NULL;
3453 // int length = prof->length;
3455 // for (i = 0; i < length; ++i)
3457 // for(j = 0; j<alphabet_size; ++j )
3458 // prof->prf[j][i] = 0;
3459 // fgets(line, LINE_LENGTH , profile_f);
3460 // result = strtok( line, delims );
3462 // while( result != NULL)
3464 // result_num = &result[1];
3465 // prof->prf[aa2pos[(short)result[0]]][i] = atoi(result_num);
3466 // result = strtok( NULL, delims );
3477 * \brief Writes the sequence into the alignment_file.
3479 * \param aligned_sequence Pattern of aligned sequence.
3480 * \param sequence_file File with sequences.
3481 * \param sequence_position Positions of sequences in \a sequence_file.
3482 * \param alignment_file The file to write the sequence into.
3486 edit_seq2aligned_seq(char *aligned_sequence, //pattern for aligned sequence
3487 FILE *sequence_file, //file with all the sequences
3488 long sequence_position, //position in sequence file with the correct sequence
3489 FILE *alignment_file) //file to write the alignment into
3491 fseek(sequence_file, sequence_position, SEEK_SET);
3492 const int LINE_LENGTH = 300;
3493 char line[LINE_LENGTH];
3494 fgets (line, LINE_LENGTH , sequence_file);
3495 fprintf(alignment_file,"%s", line); //writing of sequence name
3498 while(fgets(line, LINE_LENGTH, sequence_file)!=NULL)
3503 line[LINE_LENGTH-1] = '\n';
3505 while ((line[i] != '\n') && (line[i] != '\0'))
3507 while (aligned_sequence[pos] == '-')
3510 fprintf(alignment_file,"-");
3516 fprintf(alignment_file,"%c",line[i]);
3526 while (aligned_sequence[pos] != '\n')
3528 fprintf(alignment_file,"-");
3531 fprintf(alignment_file,"\n");
3537 * \brief Recursive function to turn the edit_file into the alignment.
3539 * \param sequence_file File with all sequences.
3540 * \param sequence_position The array of sequence positions in \a sequence_file
3541 * \param edit_file File to safe the edit profiles in.
3542 * \param edit_positions Array saving the coorespondence between edit profile and position in \a edit_file
3543 * \param node_number The current node.
3544 * \param number_of_sequences The number of sequences.
3545 * \param aligned_sequence The sequence that is edited.
3546 * \param alignment_length The length of the alignment.
3547 * \param edit_seq_file File that saves the edited_sequences of the internal nodes.
3548 * \param offset Saves the size of the edited_sequences.
3549 * \param alignment_file File where the alignment is saved.
3552 edit2alignment(FILE *sequence_file, //sequence file
3553 long *seq_positions, //sequence positions
3554 FILE *edit_file, //file saving the edit profiles
3555 long *edit_positions, //array saving the correspondence between edit profile and position in edit_file
3556 int node_number, //the current node
3557 int number_of_sequences, //number of sequences
3558 char *aligned_sequence, //the sequence that is edited
3559 int alignment_length, //length of the alignment - and thus of aligned_sequence
3560 FILE *edit_seq_file, //file saving the edited_sequences of the internal nodes
3561 int offset, //saves the size of the edited_sequence
3562 FILE* alignment_file) //file saving the alignments
3564 fseek(edit_file, edit_positions[node_number-number_of_sequences], SEEK_SET);
3565 const int LINE_LENGTH = 50;
3566 char line[LINE_LENGTH];
3567 fgets(line, LINE_LENGTH , edit_file);
3568 int child1 = atoi(line);
3569 fgets(line, LINE_LENGTH , edit_file);
3570 int child2 = atoi(line);
3571 fgets(line, LINE_LENGTH , edit_file);
3572 int is_leaf1 = atoi(line);
3573 fgets(line, LINE_LENGTH , edit_file);
3574 int is_leaf2 = atoi(line);
3576 // static char seq_line[10];
3577 // printf("SO EINE VERDAMMTE SCHEISE ABER AUCH\n");
3583 while(fgets(line, LINE_LENGTH , edit_file)!=NULL)
3589 number = atoi(&line[1]);
3594 if (aligned_sequence[pos] == 'X')
3603 if (aligned_sequence[pos] == 'X')
3612 if (aligned_sequence[pos] == 'X')
3614 aligned_sequence[pos] = '-';
3624 // printf("LEAF\n");
3625 edit_seq2aligned_seq(aligned_sequence, sequence_file, seq_positions[child1], alignment_file);
3629 fprintf(edit_seq_file, "%s", aligned_sequence);
3630 edit2alignment(sequence_file, seq_positions, edit_file, edit_positions, child1, number_of_sequences, aligned_sequence, alignment_length, edit_seq_file, offset, alignment_file);
3634 fseek(edit_seq_file, offset, SEEK_CUR);
3635 fgets(aligned_sequence, alignment_length+3, edit_seq_file);
3636 fseek(edit_seq_file, offset, SEEK_CUR);
3639 fseek(edit_file, edit_positions[node_number-number_of_sequences], SEEK_SET);
3640 while(fgets(line, LINE_LENGTH , edit_file)!=NULL)
3645 number = atoi(&line[1]);
3650 if (aligned_sequence[pos] == 'X')
3659 if (aligned_sequence[pos] == 'X')
3661 aligned_sequence[pos] = '-';
3671 if (aligned_sequence[pos] == 'X')
3680 edit_seq2aligned_seq(aligned_sequence, sequence_file, seq_positions[child2], alignment_file);
3684 fprintf(edit_seq_file, "%s", aligned_sequence);
3685 edit2alignment(sequence_file, seq_positions, edit_file, edit_positions, child2, number_of_sequences, aligned_sequence, alignment_length, edit_seq_file, offset, alignment_file);
3692 // * The file has the follwing format (# and text behind are only comments and not included into the file):
3693 // * 1 # Number of profile.
3695 // * 5 # Number of columns in the profile.
3696 // * 4A 1C # In the first column are 4 'A' and 1 'C'
3697 // * 3G # In the second column are 3 'G'
3698 // * 5A # In the third column are 5 'A'
3699 // * 2A 3C # In the fourth column are 2 'A' and 3 'C'
3700 // * 5C # In the fifth column are 5 'C'
3701 // * * # Marks the end of this profile
3706 * \brief Writes a profile to a file.
3708 * \param sumup_prf The profile array, not a real profile.
3709 * \param length The length of the profile. The format can be seen in ./test.txt
3710 * \param file The FILE object to write the the profile into.
3711 * \param is_dna The type of sequence.
3712 * \param number The number of the profile.
3715 write2file(int **sumup_prf,
3720 Fastal_param *param_set)
3722 char *pos2aa = &(param_set->pos2char[0]);
3723 fprintf(file,"%i\n0\n%i\n1\n%i\n",number, length, num_sequences );
3725 int alphabet_size = param_set->alphabet_size;
3731 for (j = 0; j < alphabet_size; ++j)
3732 if (sumup_prf[j][i] > 0)
3735 fprintf(file," %c%i", pos2aa[j],sumup_prf[j][i]);
3737 fprintf(file,"%c%i", pos2aa[j],sumup_prf[j][i]);
3744 fprintf(file,"*\n");
3749 //************************************* main function of the fasta algorithm ***********************************************
3752 * \brief main of the fastal algorithm
3755 fastal_main(int argc, //number of arguments
3756 char **argv) //arguments first = fastal, second = tree
3760 //pointer to arguments
3761 void * method_arguments_p;
3762 int (*alignment_function)(Fastal_profile **profiles, Fastal_param *param_set, void *method_arguments_p, int is_dna, FILE *edit_file, FILE *prof_file, int number);
3764 struct Fastal_arguments arguments;
3766 arg_parse (argc, argv, &arguments);
3768 Fastal_param *param_set = vcalloc(1,sizeof(Fastal_param));
3770 fill_parameters(arguments.is_dna, param_set, arguments.method, arguments.diag_method, arguments.mat);
3771 param_set->gep = arguments.gep;
3772 param_set->gop = arguments.gop;
3774 // printf("%s",arguments.mat);
3775 if (arguments.evaluate)
3777 printf("Calculate Sum of pairs Score\n");
3778 printf("Score: %f\n", calculate_sum_of_pairs_score_affine(arguments.sequence_file, param_set->M, param_set->gop, param_set->gep));
3784 if (arguments.agreement_score)
3786 complete_agreement_score(arguments.aln2test, arguments.aln_ref);
3791 if (arguments.num_ref_aln)
3793 compute_ref_alignments(arguments.sequence_file, arguments.aln_ref, arguments.num_ref_aln, arguments.num_seq_in_ref);
3799 int alphabet_size = param_set->alphabet_size;
3802 //sequence file management
3804 long *file_positions = NULL;
3805 long **tmp = &file_positions;
3806 int number_of_sequences = make_index_of_file(arguments.sequence_file, tmp);
3810 //edit file management
3812 // long current_edit_pos;
3813 long *edit_positions = vcalloc(number_of_sequences,sizeof(long));
3816 //profile management
3817 Fastal_profile **profiles = vcalloc(3,sizeof(Fastal_profile*));
3818 initiate_profiles(profiles, param_set);
3819 FILE * prof_file = fopen(vtmpnam(NULL),"w+");
3820 long* profile_positions = vcalloc(4,sizeof(long*));
3825 printf("METHOD: %s\n",param_set->method);
3826 if (strcmp(param_set->method, "fast") == 0)
3828 method_arguments_p = vcalloc(1,sizeof(Sparse_dynamic_param));
3829 fill_arguments_sparse((Sparse_dynamic_param*)method_arguments_p);
3830 alignment_function = sparse_dyn;
3832 else if (strcmp(param_set->method, "nw") == 0)
3834 method_arguments_p = vcalloc(1,sizeof(Nw_param));
3835 fill_arguments_nw((Nw_param*)method_arguments_p, alphabet_size);
3836 alignment_function = nw_dyn;
3838 else if (strcmp(param_set->method, "gotoh") == 0)
3840 method_arguments_p = vcalloc(1,sizeof(Gotoh_param));
3841 fill_arguments_gotoh((Gotoh_param*)method_arguments_p, alphabet_size);
3842 alignment_function = gotoh_dyn;
3844 else if (strcmp(param_set->method, "udisc") == 0)
3846 method_arguments_p = vcalloc(1,sizeof(Udisc_param));
3847 fill_arguments_gotoh((Gotoh_param*)method_arguments_p, alphabet_size);
3848 alignment_function = gotoh_dyn;
3853 printf("ERROR - METHOD");
3858 if (arguments.gap_iterate)
3860 iterate(param_set, method_arguments_p, arguments.sequence_file, arguments.output_file, arguments.gap_iterate);
3864 if (arguments.tree_file == NULL)
3866 arguments.tree_file = vtmpnam(NULL);
3867 printf("CONSTRUCT TREE\n");
3868 if (strcmp(arguments.tree_method, "parttree")==0)
3870 make_partTree(arguments.sequence_file, arguments.tree_file, arguments.tree_param1, arguments.tree_param2, arguments.is_dna, 0);
3872 else if (strcmp(arguments.tree_method, "oligotree") == 0)
3874 compute_oligomer_distance_tree(arguments.sequence_file, param_set->char2pos, arguments.tree_file, arguments.tree_param2, arguments.tree_param1, param_set->alphabet_size);
3877 if (arguments.tree_only == 1)
3882 if (arguments.tree_out == 1)
3884 char tree_out_file_name[500];
3885 sprintf(tree_out_file_name, "%s.tree",arguments.output_file);
3886 char const LINE_LENGTH = 50;
3887 char line[LINE_LENGTH];
3889 FILE* in = fopen(arguments.tree_file, "r");
3890 FILE* out = fopen(tree_out_file_name, "w");
3891 while( (fgets(line, LINE_LENGTH, in)) != NULL)
3892 fprintf(out, "%s", line);
3901 FILE *seq_file = fopen(arguments.sequence_file,"r");
3902 // FILE *edit_file = fopen(vtmpnam(NULL),"w+");
3903 FILE *edit_file = fopen("aha","w+");
3905 printf("CONSTRUCT ALIGNMENT\n");
3906 FILE *tree_file = fopen(arguments.tree_file,"r");
3907 const int LINE_LENGTH = 100;
3908 char line[LINE_LENGTH];
3909 char delims[] = " ";
3911 int alignment_length = -1;
3915 //bottom-up traversal
3916 while(fgets(line, LINE_LENGTH, tree_file)!=NULL)
3919 node[0] = atoi(strtok(line,delims));
3920 node[1] = atoi(strtok(NULL,delims));
3921 node[2] = atoi(strtok(NULL,delims));
3923 //getting profile of second child
3924 if (node[1] < number_of_sequences)
3926 file_pos2profile(seq_file, file_positions[node[1]], profiles[1], node[1], param_set); //profile to save into
3930 profile_file2profile(profiles[1], prof_file, profile_positions[--saved_prof], param_set);
3931 fseek (prof_file , profile_positions[saved_prof] , SEEK_SET);
3934 //getting profile of first child
3935 if (node[0] < number_of_sequences)
3937 file_pos2profile(seq_file, file_positions[node[0]], profiles[0], node[0], param_set); //profile to save into
3941 profile_file2profile(profiles[0], prof_file, profile_positions[--saved_prof], param_set);
3942 fseek (prof_file , profile_positions[saved_prof] , SEEK_SET);
3946 if (saved_prof == max_prof)
3949 profile_positions = vrealloc(profile_positions, max_prof*sizeof(long));
3952 edit_positions[node[2]-number_of_sequences] = ftell(edit_file);
3953 profile_positions[saved_prof] = ftell(prof_file);
3956 //aligning the sequences
3957 alignment_length = alignment_function(profiles, param_set, method_arguments_p, arguments.is_dna, edit_file, prof_file, node[2]);
3961 // bottom-down traversal (edit_files --> alignment)
3962 // tmp_out_file_name = vtmpnam(NULL);
3964 // FILE *alignment_file = fopen(tmp_out_file_name, "w");
3965 FILE *alignment_file = fopen(arguments.output_file, "w");
3966 FILE *edit_seq_file = fopen(vtmpnam(NULL),"w+");
3968 char *aligned_sequence = vcalloc(alignment_length+3, sizeof(char));
3971 long offset = ftell(edit_seq_file);
3972 for (i = 0; i < alignment_length; ++i)
3974 fprintf(edit_seq_file, "X");
3975 aligned_sequence[i] = 'X';
3977 aligned_sequence[i]= '\n';
3978 aligned_sequence[i+1]= '\0';
3979 fprintf(edit_seq_file, "\n");
3980 offset = (ftell(edit_seq_file) - offset)*-1;
3983 edit2alignment(seq_file, file_positions, edit_file, edit_positions, node[2], number_of_sequences, aligned_sequence, alignment_length, edit_seq_file, offset, alignment_file);
3984 fclose(alignment_file);
3988 fclose(edit_seq_file);
3990 //set stuff for the next cycle
3991 // arguments.sequence_file = tmp_out_file_name;
3995 // // char copy_command[500];
3996 // // sprintf(copy_command, "cp %s %s_%i", tmp_out_file_name, arguments.output_file, cycle);
3997 // // system(copy_command);
4001 // printf("HERE_COPY\n");
4002 // char copy_command[2000];
4003 // sprintf(copy_command, "mv %s %s", tmp_out_file_name, arguments.output_file);
4004 // printf("%s\n", copy_command);
4005 // int error = system(copy_command);
4006 // printf("ERROR %i\n", error);
4009 //free_memory & close files
4011 free_fastal_profile(profiles[0], alphabet_size);
4012 free_fastal_profile(profiles[1], alphabet_size);
4014 vfree(profile_positions);
4018 // number_of_sequences
4021 if (arguments.score)
4023 printf("Calculate Score\n");
4024 double aln_score = calculate_sum_of_pairs_score_affine(arguments.output_file, param_set->M, param_set->gop, param_set->gep);
4025 printf("SCORE: %f\n", aln_score);
4032 if (!strcmp(param_set->method, "fast"))
4034 free_sparse((Sparse_dynamic_param*)method_arguments_p);
4036 else if (!strcmp(param_set->method, "nw"))
4038 free_nw((Nw_param*)method_arguments_p, alphabet_size);
4040 else if (!strcmp(param_set->method, "gotoh"))
4042 free_gotoh((Gotoh_param*)method_arguments_p, alphabet_size);
4047 //free_memory & close files
4049 vfree(edit_positions);
4058 //****************** toolbox ***************************
4062 * \brief Enlargement of the dynamic programming matrix in case it is to small.
4064 * \param dyn_matrix The dynamic programming matrix.
4065 * \param old_length1 Current size of dimension 1.
4066 * \param old_length2 Current size of dimension 2.
4067 * \param length1 New size of dimension 1.
4068 * \param length2 New size of dimension 2.
4069 * \return Pointer to the new array.
4072 resize_dyn_matrix(double **dyn_matrix, //the dynamic programming matrix
4073 int old_length1, //old length of dimension 1
4074 int old_length2, //old length of dimension 2
4075 int length1, //new minimum length of dimension 1
4076 int length2) //new maximum length of dimension 2
4079 if (old_length1 < length1)
4081 dyn_matrix = vrealloc(dyn_matrix,length1*sizeof(double*));
4082 for (i = old_length1; i < length1; ++i)
4083 dyn_matrix[i] = vcalloc(old_length2,sizeof(double));
4084 old_length1 = length1;
4087 if (old_length2 < length2)
4089 for (i = 0;i<old_length1; ++i)
4090 dyn_matrix[i] = vrealloc(dyn_matrix[i], length2*sizeof(double));
4091 old_length2 = length2;
4099 * \brief Frees the memory of a dynamic programming matrix.
4101 * \param length1 Size of the first dimension of the matrix.
4102 * \param dyn_matrix The dynamic programming matrix.
4105 free_dyn_matrix(int length1, //length of first dimension
4106 double **dyn_matrix) //dynamic matrix
4109 for (; i<length1; ++i)
4110 vfree(dyn_matrix[i]);
4117 * \brief Initialises the profiles with basic values.
4119 * \param profiles Array of 3 profiles.
4120 * \param param_set The fastal parameters
4123 initiate_profiles(Fastal_profile **profiles, //profiles pointer
4124 Fastal_param *param_set)
4126 int alphabet_size = param_set->alphabet_size;
4128 for (i =0; i < 3; ++i)
4130 profiles[i] = vcalloc(1,sizeof(Fastal_profile));
4131 profiles[i]->weight = 1;
4132 profiles[i]->is_leaf = 1;
4133 profiles[i]->prf = vcalloc(alphabet_size, sizeof(int*));
4134 for (j = 0; j < alphabet_size; ++j)
4136 profiles[i]->prf[j] = vcalloc(PROFILE_ENLARGEMENT, sizeof(int));
4138 profiles[i]->allocated_memory = PROFILE_ENLARGEMENT;
4146 * \brief frees all memory occupied by the profile
4148 * \param profile The profile to free.
4149 * \param alphabet_size The alphabet_size.
4152 free_fastal_profile(Fastal_profile* profile, int alphabet_size)
4155 for (;alphabet_size >= 0; --alphabet_size)
4156 vfree(profile->prf[alphabet_size]);
4157 vfree(profile->prf);
4162 * \brief Initialize the Fastal parameter set.
4164 * \param is_dna 1 when sequences are dna, 0 when not
4165 * \param param_set The fastal parameter set.
4166 * \param method The method to use in Fastal.
4169 fill_parameters(int is_dna, Fastal_param *param_set, char *method, char *diag_method, char *mat)
4171 sprintf(param_set->method,"%s",method);
4172 sprintf(param_set->diag_method,"%s",diag_method);
4175 param_set->M = read_matrice(mat);
4178 param_set->alphabet_size = 4;
4179 char tmp1[] = {'A','C','G','T'};
4181 // int tmp2[] = { 0, 1, 1, 0, -1, -1, 2, 0, -1, -1, 3, -1, 1, 0, -1, -1, -1, 0, 1, 3, 4, -1, 3, -1, 1, -1};
4182 for (i = 0; i<param_set->alphabet_size; ++i)
4183 param_set->pos2char[i] = tmp1[i];
4184 // for (i = 0; i<26; ++i)
4185 // param_set->char2pos[i] = tmp2[i];
4186 param_set->char2pos['A'] = 0;
4187 param_set->char2pos['B'] = 1;
4188 param_set->char2pos['C'] = 1;
4189 param_set->char2pos['D'] = 0;
4190 param_set->char2pos['G'] = 2;
4191 param_set->char2pos['H'] = 0;
4192 param_set->char2pos['K'] = 3;
4193 param_set->char2pos['M'] = 1;
4194 param_set->char2pos['N'] = 0;
4195 param_set->char2pos['R'] = 0;
4196 param_set->char2pos['S'] = 1;
4197 param_set->char2pos['T'] = 3;
4198 param_set->char2pos['U'] = 3;
4199 param_set->char2pos['W'] = 3;
4200 param_set->char2pos['Y'] = 1;
4201 // param_set->M[0][3] = param_set->M[3][0] = -10;
4202 // param_set->M[1][2] = param_set->M[2][1] = -10;
4203 // param_set->M[0][1] = param_set->M[0][2] = param_set->M[1][0] = param_set->M[2][0] = -10;
4204 // param_set->M[3][1] = param_set->M[3][2] = param_set->M[1][3] = param_set->M[2][3] = -10;
4208 param_set->alphabet_size = 21;
4209 char tmp1[] = {'A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y','X'};
4210 // int tmp2[] = { 0, 20, 1, 5, 16, 4, 2, 6, 7, 21, 8, 9, 10, 11, -1, 12, 13, 14, 15, 3, -1, 17, 18, 22, 19, 23};
4211 for (i = 0; i<param_set->alphabet_size; ++i)
4212 param_set->pos2char[i] = tmp1[i];
4213 // for (i = 0; i<26; ++i)
4214 // param_set->char2pos[i] = tmp2[i];
4215 param_set->char2pos['A'] = 0;
4216 param_set->char2pos['B'] = 20;
4217 param_set->char2pos['C'] = 1;
4218 param_set->char2pos['D'] = 2;
4219 param_set->char2pos['E'] = 3;
4220 param_set->char2pos['F'] = 4;
4221 param_set->char2pos['G'] = 5;
4222 param_set->char2pos['H'] = 6;
4223 param_set->char2pos['I'] = 7;
4224 param_set->char2pos['J'] = 20;
4225 param_set->char2pos['K'] = 8;
4226 param_set->char2pos['L'] = 9;
4227 param_set->char2pos['M'] = 10;
4228 param_set->char2pos['N'] = 11;
4229 param_set->char2pos['P'] = 12;
4230 param_set->char2pos['Q'] = 13;
4231 param_set->char2pos['R'] = 14;
4232 param_set->char2pos['S'] = 15;
4233 param_set->char2pos['T'] = 16;
4234 param_set->char2pos['V'] = 17;
4235 param_set->char2pos['W'] = 18;
4236 param_set->char2pos['X'] = 20;
4237 param_set->char2pos['Y'] = 19;
4238 param_set->char2pos['X'] = 20;
4244 /******************************COPYRIGHT NOTICE*******************************/
4245 /*© Centro de Regulacio Genomica */
4247 /*Cedric Notredame */
4248 /*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
4249 /*All rights reserved.*/
4250 /*This file is part of T-COFFEE.*/
4252 /* T-COFFEE is free software; you can redistribute it and/or modify*/
4253 /* it under the terms of the GNU General Public License as published by*/
4254 /* the Free Software Foundation; either version 2 of the License, or*/
4255 /* (at your option) any later version.*/
4257 /* T-COFFEE is distributed in the hope that it will be useful,*/
4258 /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
4259 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
4260 /* GNU General Public License for more details.*/
4262 /* You should have received a copy of the GNU General Public License*/
4263 /* along with Foobar; if not, write to the Free Software*/
4264 /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
4265 /*............................................... |*/
4266 /* If you need some more information*/
4267 /* cedric.notredame@europe.com*/
4268 /*............................................... |*/
4272 /******************************COPYRIGHT NOTICE*******************************/