#include #include #include #include #include // #include #include "io_lib_header.h" #include "util_lib_header.h" #include "define_header.h" #include "dp_lib_header.h" #include "fastal_lib_header.h" #include "fast_tree_header.h" // #include // #define CHUNKSIZE 100 // #define N 1000 //Fastal_param *param_set; /*! \mainpage T-Coffee Index Page * * \section intro_sec Introduction * * This is the introduction. * * \section install_sec Installation * * \subsection step1 Step 1: Opening the box * * etc... * \section fastal_sec Fastal * * This program is a very fast aligner. It is capable of aligning huge sets of sequences because it keeps as much as necessary on hard disk. */ /*! * \file fastal.c * \brief Source code for the fastal algorithm */ //************************** sparse dynamic aligning ********************************************************** void fill_arguments_sparse(Sparse_dynamic_param* method_arguments_p) { method_arguments_p->diagonals = vcalloc(3,sizeof(Diagonal)); method_arguments_p->dig_length = vcalloc(1,sizeof(int)); *method_arguments_p->dig_length = 3; method_arguments_p->list = NULL; method_arguments_p->list_length = vcalloc(1,sizeof(int)); *method_arguments_p->list_length = 0; method_arguments_p->file_name1 = vtmpnam(NULL); method_arguments_p->file_name2 = vtmpnam(NULL); } void free_sparse(Sparse_dynamic_param* method_arguments_p) { vfree(method_arguments_p->diagonals); vfree(method_arguments_p->dig_length); vfree(method_arguments_p->list_length); } /** * \brief One run of sparse dynamic programming. * * \param profiles The profiles. * \param param_set The fastal parameters. * \param method_arguments_p The method arguments. * \param is_dna Sequences are DNA (\a is_dna = 1) or protein. * \param edit_file The edit file. * \param prof_file the profile file. * \param number Number of the parent node. * \return The length of the alignment. */ int sparse_dyn(Fastal_profile **profiles, Fastal_param *param_set, void *method_arguments_p, int is_dna, FILE *edit_file, FILE *prof_file, int number) { // printf("WHAT THE HELL ARE YOU DOING HERE?\n"); Sparse_dynamic_param *arguments = (Sparse_dynamic_param*)method_arguments_p; // static char *file_name1 = vtmpnam(NULL); // static char *file_name2 = vtmpnam(NULL); char *file_name1 = arguments->file_name1; char *file_name2 = arguments->file_name2; char *seq1, *seq2; Fastal_profile *tmp1 = profiles[0]; Fastal_profile *tmp2 = profiles[1]; seq1 = profile2consensus(tmp1, param_set); seq2 = profile2consensus(tmp2, param_set); int **diagonals_p = &(arguments->diagonals); int num_diagonals = -1; if (!strcmp(param_set->diag_method, "blastz")) { FILE *cons_f = fopen(file_name1,"w"); fprintf(cons_f, ">%i\n", tmp1->prf_number); fprintf(cons_f, "%s", seq1); fprintf( cons_f, "\n"); fclose(cons_f); cons_f = fopen(file_name2,"w"); fprintf(cons_f, ">%i\n", tmp2->prf_number); fprintf(cons_f, "%s", seq2); fprintf( cons_f, "\n"); fclose(cons_f); num_diagonals = seq_pair2blastz_diagonal(file_name1, file_name2, diagonals_p, arguments->dig_length, strlen(seq1),strlen(seq2), is_dna); } else if (!strcmp(param_set->diag_method, "blast")) { FILE *cons_f = fopen(file_name1,"w"); fprintf(cons_f, ">%i\n", tmp1->prf_number); fprintf(cons_f, "%s", seq1); fprintf( cons_f, "\n"); fclose(cons_f); cons_f = fopen(file_name2,"w"); fprintf(cons_f, ">%i\n", tmp2->prf_number); fprintf(cons_f, "%s", seq2); fprintf( cons_f, "\n"); fclose(cons_f); int l1 = strlen(seq1); int l2 = strlen(seq2); num_diagonals = seq_pair2blast_diagonal(file_name1, file_name2, diagonals_p, arguments->dig_length, l1, l2, is_dna); // int *num_p = &num_diagonals; // Segment* seg = extend_diagonals(*diagonals_p, num_p, l1, l2); // printf("A: %i\n", num_diagonals); } else if (!strcmp(param_set->diag_method, "ktup")) { num_diagonals = seq_pair2diagonal_own(seq1, seq2, diagonals_p, arguments->dig_length, strlen(seq1),strlen(seq2), is_dna, 3); // num_diagonals = seq_pair2diagonal_swift(seq1, seq2, diagonals_p, arguments->dig_length, strlen(seq1),strlen(seq2), is_dna, 3); } // arguments->diagonals = diagonals_p[0]; // arguments->list = segments2int(seg, *num_p, seq1, seq2, profiles[0], profiles[1], arguments->list_length, param_set); // t ** segments2int_gap(Segment *diagonals, int num_diagonals, char *seq1, char *seq2, Fastal_profile *profile1, Fastal_profile *profile2, int *num_points, Fastal_param *param_set); // arguments->list = diagonals2int_dot(arguments->diagonals, num_diagonals, seq1, seq2, profiles[0], profiles[1], arguments->list_length, param_set); // arguments->list = diagonals2int_euclidf(arguments->diagonals, num_diagonals, seq1, seq2, profiles[0], profiles[1], arguments->list_length, param_set); arguments->list = diagonals2int_gap_test(arguments->diagonals, num_diagonals, seq1, seq2, profiles[0], profiles[1], arguments->list_length, param_set); // arguments->list = diagonals2int(arguments->diagonals, num_diagonals, seq1, seq2, arguments->list_length, param_set); int alignment_length = list2linked_pair_wise_fastal(profiles[0], profiles[1], param_set, arguments->list, *arguments->list_length, edit_file, prof_file, number); int x; for (x = 0; x < *arguments->list_length; ++x) { vfree(arguments->list[x]); } vfree(arguments->list); arguments->list = NULL; vfree(seq1); vfree(seq2); return alignment_length; } int fastal_compare (const void * a, const void * b) { return (*(int*)a - *(int*)b); } /** * \brief Makes a sorted list out of diagonals. * * \param diagonals A list of diagonals to use during dynamic programming. * \param num_diagonals Number of diagonals. * \param seq1 Sequence 1. * \param seq2 Sequence 2. * \param num_points Number of points in the list * \param param_set Fastal parameters. * \return A 2-dim array which contains all points needed for the sparse dynamic programming algorithm. */ int ** diagonals2int(int *diagonals, int num_diagonals, char *seq1, char *seq2, int *num_points, Fastal_param *param_set) { int l1 = strlen(seq1); int l2 = strlen(seq2); int gep = param_set->gep; int dig_length; if (seq1 > seq2) dig_length = l1; else dig_length = l2; int current_size = num_diagonals*dig_length + l1 +l2; int **list = vcalloc(current_size, sizeof(int*)); int *diags = vcalloc(num_diagonals, sizeof(int)); int i; for (i = 0; i < num_diagonals; ++i) { diags[i] = l1 - diagonals[i*3] + diagonals[i*3+1]; } qsort (diags, num_diagonals, sizeof(int), fastal_compare); int *diagx = vcalloc(num_diagonals, sizeof(int)); int *diagy = vcalloc(num_diagonals, sizeof(int)); //+1 because diagonals start here at position 1, like in "real" dynamic programming int a = -1, b = -1; for (i = 0; i < num_diagonals; ++i) { if (diags[i] < l1) { diagx[i] = l1 - diags[i]; diagy[i] = 0; a= i; } else break; } ++a; b=a-1; for (; i < num_diagonals; ++i) { diagx[i] = 0; diagy[i] = diags[i]-l1; b = i; } vfree(diags); int tmpy_pos; int tmpy_value; int **M = param_set->M; int *last_y = vcalloc(l2+1, sizeof(int)); int *last_x = vcalloc(l1+1, sizeof(int)); last_y[0] = 0; last_x[0] = 0; list[0] = vcalloc(6, sizeof(int)); int list_pos = 1; int dig_num = l1; int tmp_l2 = l2 + 1; //left border for (; list_pos < tmp_l2; ++list_pos) { list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = 0; list[list_pos][1] = list_pos; last_y[list_pos] = list_pos; list[list_pos][2] = list_pos*gep; list[list_pos][4] = list_pos-1; } int pos_x = 0; int y; int tmp_l1 = l1-1; while (pos_x < tmp_l1) { if (list_pos + num_diagonals+2 > current_size) { current_size += num_diagonals*1000; list = vrealloc(list, current_size * sizeof(int*)); } //upper border list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = ++pos_x; list[list_pos][1] = 0; list[list_pos][2] = pos_x * gep; list[list_pos][3] = last_y[0]; tmpy_value = list_pos; tmpy_pos = 0; last_x[pos_x] = list_pos; ++list_pos; //diagonals for (i = a; i <= b; ++i) { list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = ++diagx[i]; list[list_pos][1] = ++diagy[i]; list[list_pos][3] = last_y[diagy[i]]; list[list_pos][4] = list_pos-1; list[list_pos][5] = last_y[diagy[i]-1]; list[list_pos][2] = M[toupper(seq1[diagx[i]-1])-'A'][toupper(seq2[diagy[i]-1])-'A']; last_y[tmpy_pos] = tmpy_value; tmpy_value = list_pos; tmpy_pos = diagy[i]; ++list_pos; } last_y[tmpy_pos] = tmpy_value; //lower border if (list[list_pos-1][1] != l2) { list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = pos_x; list[list_pos][1] = l2; list[list_pos][3] = last_y[l2]; list[list_pos][2] = -1000; list[list_pos][4] = list_pos-1; if (pos_x > l2) list[list_pos][5] = last_x[pos_x-l2]; else list[list_pos][5] = l2-pos_x; last_y[l2] = list_pos; ++list_pos; } if ((b >= 0) && (diagy[b] == l2)) --b; if ((a >0) && (diagx[a-1] == pos_x)) --a; } dig_num = -1; if (list_pos + l2+2 > current_size) { current_size += list_pos + l2 + 2; list = vrealloc(list, current_size * sizeof(int*)); } // right border list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = l1; list[list_pos][1] = 0; list[list_pos][3] = last_x[l1-1]; list[list_pos][2] = -1000; ++list_pos; for (i = 1; i <= l2; ++i) { list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = l1; list[list_pos][1] = i; list[list_pos][3] = last_y[i]; list[list_pos][4] = list_pos-1; y = last_y[i-1]; if ((list[y][0] == l1-1) && (list[y][1] == i-1)) { list[list_pos][5] = y; list[list_pos][2] = M[toupper(seq1[l1-1])-'A'][toupper(seq2[i-1])-'A']; } else { if (i <= l1) { list[list_pos][5] = last_x[l1-i]; } else { list[list_pos][5] = i-l1; } list[list_pos][2] = -1000; } ++list_pos; } list[list_pos - l2][2] = -1000; *num_points = list_pos; vfree(diagx); vfree(diagy); return list; } /** * \brief Makes a sorted list out of diagonals. * * \param diagonals A list of diagonals to use during dynamic programming. * \param num_diagonals Number of diagonals. * \param seq1 Sequence 1. * \param seq2 Sequence 2. * \param num_points Number of points in the list * \param param_set Fastal parameters. * \return A 2-dim array which contains all points needed for the sparse dynamic programming algorithm. */ int ** diagonals2int_gap_test(int *diagonals, int num_diagonals, char *seq1, char *seq2, Fastal_profile *profile1, Fastal_profile *profile2, int *num_points, Fastal_param *param_set) { int alphabet_size = param_set->alphabet_size; int l1 = strlen(seq1); int l2 = strlen(seq2); int gep = param_set->gep; int current_size = l2+l1; int **list = vcalloc(current_size, sizeof(int*)); int *diags = vcalloc(num_diagonals, sizeof(int)); int i; for (i = 0; i < num_diagonals; ++i) { diags[i] = l1 - diagonals[i*3] + diagonals[i*3+1]; } qsort (diags, num_diagonals, sizeof(int), fastal_compare); int *diagx = vcalloc(num_diagonals, sizeof(int)); int *diagy = vcalloc(num_diagonals, sizeof(int)); int *old_pos = vcalloc(num_diagonals, sizeof(int)); //+1 because diagonals start here at position 1, like in "real" dynamic programming int a = -1, b = -1; for (i = 0; i < num_diagonals; ++i) { if (diags[i] < l1) { diagx[i] = l1 - diags[i]; diagy[i] = 0; a= i; } else break; } ++a; b=a-1; for (; i < num_diagonals; ++i) { diagx[i] = 0; diagy[i] = diags[i]-l1; b = i; } vfree(diags); int tmpy_pos; int tmpy_value; int **M = param_set->M; int *last_y = vcalloc(l2+1, sizeof(int)); int *last_x = vcalloc(l1+1, sizeof(int)); last_y[0] = 0; last_x[0] = 0; list[0] = vcalloc(6, sizeof(int)); int list_pos = 1; int dig_num = l1; int tmp_l2 = l2 + 1; //left border for (; list_pos < tmp_l2; ++list_pos) { list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = 0; list[list_pos][1] = list_pos; last_y[list_pos] = list_pos; list[list_pos][2] = list_pos*gep; list[list_pos][4] = list_pos-1; } int pos_x = 0; // int diags_old = l2; // int tmp = l1; int y; int tmp_l1 = l1-1; while (pos_x < tmp_l1) { if (list_pos + num_diagonals+2 > current_size) { current_size += num_diagonals*1000; list = vrealloc(list, current_size * sizeof(int*)); } //upper border list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = ++pos_x; list[list_pos][1] = 0; list[list_pos][2] = pos_x * gep; list[list_pos][3] = last_y[0]; tmpy_value = list_pos; tmpy_pos = 0; last_x[pos_x] = list_pos; ++list_pos; //diagonals for (i = a; i <= b; ++i) { list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = ++diagx[i]; list[list_pos][1] = ++diagy[i]; list[list_pos][3] = last_y[diagy[i]]; list[list_pos][4] = list_pos-1; list[list_pos][5] = last_y[diagy[i]-1]; //SIMPLEGAP int num_seq = profile1->number_of_sequences + profile2->number_of_sequences; double gap_num = 0; int char_c; for (char_c = 0; char_c < alphabet_size; ++char_c) { gap_num += profile1->prf[char_c][diagx[i]-1] + profile2->prf[char_c][diagy[i]-1]; } gap_num /= num_seq; list[list_pos][2] = M[toupper(seq1[diagx[i]-1])-'A'][toupper(seq2[diagy[i]-1])-'A'] * gap_num; // CLUSTAL // int num_seq = profile1->number_of_sequences + profile2->number_of_sequences; // double gap_num = 0; // int char_c, char_c2; // for (char_c = 0; char_c < alphabet_size; ++char_c) // for (char_c2 = 0; char_c2 < alphabet_size; ++char_c2) // { // gap_num += (profile1->prf[char_c][diagx[i]-1]/profile1->number_of_sequences) * (profile2->prf[char_c][diagy[i]-1]/profile2->number_of_sequences) * M[param_set->pos2char[char_c]-'A'][param_set->pos2char[char_c2]-'A']; // } // list[list_pos][2] = gap_num; last_y[tmpy_pos] = tmpy_value; tmpy_value = list_pos; tmpy_pos = diagy[i]; ++list_pos; } last_y[tmpy_pos] = tmpy_value; //lower border if (list[list_pos-1][1] != l2) { list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = pos_x; list[list_pos][1] = l2; list[list_pos][3] = last_y[l2]; list[list_pos][2] = -1000; list[list_pos][4] = list_pos-1; if (pos_x > l2) list[list_pos][5] = last_x[pos_x-l2]; else list[list_pos][5] = l2-pos_x; last_y[l2] = list_pos; ++list_pos; } if ((b >= 0) && (diagy[b] == l2)) --b; if ((a >0) && (diagx[a-1] == pos_x)) --a; } dig_num = -1; if (list_pos + l2+2 > current_size) { current_size += list_pos + l2 + 2; list = vrealloc(list, current_size * sizeof(int*)); } // right border list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = l1; list[list_pos][1] = 0; list[list_pos][3] = last_x[l1-1]; list[list_pos][2] = -1000; ++list_pos; for (i = 1; i <= l2; ++i) { list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = l1; list[list_pos][1] = i; list[list_pos][3] = last_y[i]; list[list_pos][4] = list_pos-1; y = last_y[i-1]; if ((list[y][0] == l1-1) && (list[y][1] == i-1)) { list[list_pos][5] = y; int num_seq = profile1->number_of_sequences + profile2->number_of_sequences; double gap_num = 0; int char_c; for (char_c = 0; char_c < alphabet_size; ++char_c) { gap_num += profile1->prf[char_c][l1-1] + profile2->prf[char_c][i-1]; } gap_num /= num_seq; list[list_pos][2] = M[toupper(seq1[l1-1])-'A'][toupper(seq2[i-1])-'A'] * gap_num; } else { if (i <= l1) { list[list_pos][5] = last_x[l1-i]; } else { list[list_pos][5] = i-l1; } list[list_pos][2] = -1000; } ++list_pos; } list[list_pos - l2][2] = -1000; *num_points = list_pos; vfree(diagx); vfree(diagy); vfree(old_pos); return list; } int ** diagonals2int_euclidf(int *diagonals, int num_diagonals, char *seq1, char *seq2, Fastal_profile *profile1, Fastal_profile *profile2, int *num_points, Fastal_param *param_set) { int alphabet_size = param_set->alphabet_size; int l1 = strlen(seq1); int l2 = strlen(seq2); int gep = param_set->gep; int current_size = l2+l1; int **list = vcalloc(current_size, sizeof(int*)); int *diags = vcalloc(num_diagonals, sizeof(int)); int i; for (i = 0; i < num_diagonals; ++i) { diags[i] = l1 - diagonals[i*3] + diagonals[i*3+1]; } qsort (diags, num_diagonals, sizeof(int), fastal_compare); int *diagx = vcalloc(num_diagonals, sizeof(int)); int *diagy = vcalloc(num_diagonals, sizeof(int)); int *old_pos = vcalloc(num_diagonals, sizeof(int)); //+1 because diagonals start here at position 1, like in "real" dynamic programming int a = -1, b = -1; for (i = 0; i < num_diagonals; ++i) { if (diags[i] < l1) { diagx[i] = l1 - diags[i]; diagy[i] = 0; a= i; } else break; } ++a; b=a-1; for (; i < num_diagonals; ++i) { diagx[i] = 0; diagy[i] = diags[i]-l1; b = i; } vfree(diags); int tmpy_pos; int tmpy_value; // int **M = param_set->M; int *last_y = vcalloc(l2+1, sizeof(int)); int *last_x = vcalloc(l1+1, sizeof(int)); last_y[0] = 0; last_x[0] = 0; list[0] = vcalloc(6, sizeof(int)); int list_pos = 1; int dig_num = l1; int tmp_l2 = l2 + 1; //left border for (; list_pos < tmp_l2; ++list_pos) { list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = 0; list[list_pos][1] = list_pos; last_y[list_pos] = list_pos; list[list_pos][2] = list_pos*gep; list[list_pos][4] = list_pos-1; } int pos_x = 0; // int diags_old = l2; // int tmp = l1; int y; int tmp_l1 = l1-1; while (pos_x < tmp_l1) { if (list_pos + num_diagonals+2 > current_size) { current_size += num_diagonals*1000; list = vrealloc(list, current_size * sizeof(int*)); } //upper border list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = ++pos_x; list[list_pos][1] = 0; list[list_pos][2] = pos_x * gep; list[list_pos][3] = last_y[0]; tmpy_value = list_pos; tmpy_pos = 0; last_x[pos_x] = list_pos; ++list_pos; //diagonals for (i = a; i <= b; ++i) { list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = ++diagx[i]; list[list_pos][1] = ++diagy[i]; list[list_pos][3] = last_y[diagy[i]]; list[list_pos][4] = list_pos-1; list[list_pos][5] = last_y[diagy[i]-1]; int char_c; double tmp_score = 0; double freq1, freq2; for (char_c = 0; char_c < alphabet_size; ++char_c) { freq1 = (double)profile1->prf[char_c][diagx[i]-1] / profile1->number_of_sequences; freq2 = (double)profile2->prf[char_c][diagy[i]-1] / profile2->number_of_sequences; tmp_score += ( freq1 - freq2) * (freq1 - freq2); } list[list_pos][2] = 10 - sqrt(tmp_score); last_y[tmpy_pos] = tmpy_value; tmpy_value = list_pos; tmpy_pos = diagy[i]; ++list_pos; } last_y[tmpy_pos] = tmpy_value; //lower border if (list[list_pos-1][1] != l2) { list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = pos_x; list[list_pos][1] = l2; list[list_pos][3] = last_y[l2]; list[list_pos][2] = -1000; list[list_pos][4] = list_pos-1; if (pos_x > l2) list[list_pos][5] = last_x[pos_x-l2]; else list[list_pos][5] = l2-pos_x; last_y[l2] = list_pos; ++list_pos; } if ((b >= 0) && (diagy[b] == l2)) --b; if ((a >0) && (diagx[a-1] == pos_x)) --a; } dig_num = -1; if (list_pos + l2+2 > current_size) { current_size += list_pos + l2 + 2; list = vrealloc(list, current_size * sizeof(int*)); } // right border list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = l1; list[list_pos][1] = 0; list[list_pos][3] = last_x[l1-1]; list[list_pos][2] = -1000; ++list_pos; for (i = 1; i <= l2; ++i) { list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = l1; list[list_pos][1] = i; list[list_pos][3] = last_y[i]; list[list_pos][4] = list_pos-1; y = last_y[i-1]; if ((list[y][0] == l1-1) && (list[y][1] == i-1)) { list[list_pos][5] = y; int char_c; int tmp_score = 0; double freq1, freq2; for (char_c = 0; char_c < alphabet_size; ++char_c) { freq1 = profile1->prf[char_c][l1-1] / profile1->number_of_sequences; freq2 = profile2->prf[char_c][i-1] / profile2->number_of_sequences; tmp_score += ( freq1 - freq2) * (freq2 - freq1); } list[list_pos][2] = 10 - sqrt(tmp_score); // list[list_pos][2] = M[toupper(seq1[l1-1])-'A'][toupper(seq2[i-1])-'A']; } else { if (i <= l1) { list[list_pos][5] = last_x[l1-i]; } else { list[list_pos][5] = i-l1; } list[list_pos][2] = -1000; } ++list_pos; } list[list_pos - l2][2] = -1000; *num_points = list_pos; vfree(diagx); vfree(diagy); vfree(old_pos); return list; } int ** diagonals2int_dot(int *diagonals, int num_diagonals, char *seq1, char *seq2, Fastal_profile *profile1, Fastal_profile *profile2, int *num_points, Fastal_param *param_set) { int alphabet_size = param_set->alphabet_size; int l1 = strlen(seq1); int l2 = strlen(seq2); int gep = param_set->gep; int current_size = l2+l1; int **list = vcalloc(current_size, sizeof(int*)); int *diags = vcalloc(num_diagonals, sizeof(int)); int i; for (i = 0; i < num_diagonals; ++i) { diags[i] = l1 - diagonals[i*3] + diagonals[i*3+1]; } qsort (diags, num_diagonals, sizeof(int), fastal_compare); int *diagx = vcalloc(num_diagonals, sizeof(int)); int *diagy = vcalloc(num_diagonals, sizeof(int)); int *old_pos = vcalloc(num_diagonals, sizeof(int)); //+1 because diagonals start here at position 1, like in "real" dynamic programming int a = -1, b = -1; for (i = 0; i < num_diagonals; ++i) { if (diags[i] < l1) { diagx[i] = l1 - diags[i]; diagy[i] = 0; a= i; } else break; } ++a; b=a-1; for (; i < num_diagonals; ++i) { diagx[i] = 0; diagy[i] = diags[i]-l1; b = i; } vfree(diags); int tmpy_pos; int tmpy_value; // int **M = param_set->M; int *last_y = vcalloc(l2+1, sizeof(int)); int *last_x = vcalloc(l1+1, sizeof(int)); last_y[0] = 0; last_x[0] = 0; list[0] = vcalloc(6, sizeof(int)); int list_pos = 1; int dig_num = l1; int tmp_l2 = l2 + 1; //left border for (; list_pos < tmp_l2; ++list_pos) { list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = 0; list[list_pos][1] = list_pos; last_y[list_pos] = list_pos; list[list_pos][2] = list_pos*gep; list[list_pos][4] = list_pos-1; } int pos_x = 0; // int diags_old = l2; // int tmp = l1; int y; int tmp_l1 = l1-1; while (pos_x < tmp_l1) { if (list_pos + num_diagonals+2 > current_size) { current_size += num_diagonals*1000; list = vrealloc(list, current_size * sizeof(int*)); } //upper border list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = ++pos_x; list[list_pos][1] = 0; list[list_pos][2] = pos_x * gep; list[list_pos][3] = last_y[0]; tmpy_value = list_pos; tmpy_pos = 0; last_x[pos_x] = list_pos; ++list_pos; //diagonals for (i = a; i <= b; ++i) { list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = ++diagx[i]; list[list_pos][1] = ++diagy[i]; list[list_pos][3] = last_y[diagy[i]]; list[list_pos][4] = list_pos-1; list[list_pos][5] = last_y[diagy[i]-1]; int char_c; double tmp_score = 0; double freq1, freq2; for (char_c = 0; char_c < alphabet_size; ++char_c) { freq1 = (double)profile1->prf[char_c][diagx[i]-1] / profile1->number_of_sequences; freq2 = (double)profile2->prf[char_c][diagy[i]-1] / profile2->number_of_sequences; tmp_score += freq1 * freq2; } list[list_pos][2] = tmp_score * 10; last_y[tmpy_pos] = tmpy_value; tmpy_value = list_pos; tmpy_pos = diagy[i]; ++list_pos; } last_y[tmpy_pos] = tmpy_value; //lower border if (list[list_pos-1][1] != l2) { list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = pos_x; list[list_pos][1] = l2; list[list_pos][3] = last_y[l2]; list[list_pos][2] = -1000; list[list_pos][4] = list_pos-1; if (pos_x > l2) list[list_pos][5] = last_x[pos_x-l2]; else list[list_pos][5] = l2-pos_x; last_y[l2] = list_pos; ++list_pos; } if ((b >= 0) && (diagy[b] == l2)) --b; if ((a >0) && (diagx[a-1] == pos_x)) --a; } dig_num = -1; if (list_pos + l2+2 > current_size) { current_size += list_pos + l2 + 2; list = vrealloc(list, current_size * sizeof(int*)); } // right border list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = l1; list[list_pos][1] = 0; list[list_pos][3] = last_x[l1-1]; list[list_pos][2] = -1000; ++list_pos; for (i = 1; i <= l2; ++i) { list[list_pos] = vcalloc(6, sizeof(int)); list[list_pos][0] = l1; list[list_pos][1] = i; list[list_pos][3] = last_y[i]; list[list_pos][4] = list_pos-1; y = last_y[i-1]; if ((list[y][0] == l1-1) && (list[y][1] == i-1)) { list[list_pos][5] = y; int char_c; int tmp_score = 0; double freq1, freq2; for (char_c = 0; char_c < alphabet_size; ++char_c) { freq1 = profile1->prf[char_c][l1-1] / profile1->number_of_sequences; freq2 = profile2->prf[char_c][i-1] / profile2->number_of_sequences; tmp_score += freq2 * freq1; } list[list_pos][2] = tmp_score * 10; // list[list_pos][2] = M[toupper(seq1[l1-1])-'A'][toupper(seq2[i-1])-'A']; } else { if (i <= l1) { list[list_pos][5] = last_x[l1-i]; } else { list[list_pos][5] = i-l1; } list[list_pos][2] = -1000; } ++list_pos; } list[list_pos - l2][2] = -1000; *num_points = list_pos; vfree(diagx); vfree(diagy); vfree(old_pos); return list; } void combine_profiles2file(int **prf1, int **prf2, int pos1, int pos2, Fastal_param *param_set, FILE *prof_f, char state) { int alphabet_size = param_set->alphabet_size; char *pos2aa = &(param_set->pos2char[0]); int i; int x = 0; if (state == 'M') { for (i = 0; i < alphabet_size; ++i) if (prf1[i][pos1] + prf2[i][pos2] > 0) { if (x) fprintf(prof_f," %c%i", pos2aa[i],prf1[i][pos1]+prf2[i][pos2]); else fprintf(prof_f,"%c%i", pos2aa[i],prf1[i][pos1]+prf2[i][pos2]); x = 1; } fprintf(prof_f,"\n"); } else if (state == 'D') { for (i = 0; i < alphabet_size; ++i) if (prf2[i][pos2] > 0) { if (x) fprintf(prof_f," %c%i", pos2aa[i],prf2[i][pos2]); else fprintf(prof_f,"%c%i", pos2aa[i],prf2[i][pos2]); x = 1; } fprintf(prof_f,"\n"); } else { for (i = 0; i < alphabet_size; ++i) if (prf1[i][pos1] > 0) { if (x) fprintf(prof_f," %c%i", pos2aa[i],prf1[i][pos1]); else fprintf(prof_f,"%c%i", pos2aa[i],prf1[i][pos1]); x = 1; } fprintf(prof_f,"\n"); } } #define LIN(a,b,c) a[b*5+c] /** * Calculates a fast and sparse dynamic programming matrix * * \param prf1 Profile of first sequence. * \param prf2 Profile of second sequence. * \param param_set The parameter for the alignment. * \param list The list of diagonals. * \param n number of dots. * \param edit_f File to save the edit information. * \param prof_f File to save the profile. * \param node_number Number of the new profile. */ int list2linked_pair_wise_fastal(Fastal_profile *prf1, Fastal_profile *prf2, Fastal_param *param_set, int **list, int n, FILE *edit_f, FILE *prof_f, int node_number) { int a,b, i, j, LEN=0, start_trace = -1; int pi, pj,ij, delta_i, delta_j, prev_i, prev_j; // static int **slist; static long *MI, *MJ, *MM,*MT2; // static int *sortseq; static int max_size; int gop, gep, igop, igep; int l1, l2, l, ls; char **al; int ni=0, nj=0; long score; int nomatch = param_set->nomatch; l1=prf1->length; l2=prf2->length; al=declare_char (2,l1+l2+1); igop=param_set->gop; gep=igep=param_set->gep; if (n>max_size) { max_size=n; vfree (MI);vfree (MJ); vfree (MM); MI=vcalloc (5*n, sizeof (long)); MJ=vcalloc (5*n, sizeof (long)); MM=vcalloc (5*n, sizeof (long)); } else { for (a=0; a=(LIN(MM,pi,0)+gop))?'i':'m'; LIN(MJ,a,0)=MAX(LIN(MJ,pj,0),(LIN(MM,pj,0)+gop))+delta_j*gep; LIN(MJ,a,1)=pj; LIN(MJ,a,2)=0; LIN(MJ,a,3)=delta_j; LIN(MJ,a,4)=(LIN(MJ,pj,0) >=LIN(MM,pj,0)+gop)?'j':'m'; if (a>1 && (ls=list[a][0]-list[ij][0])==(list[a][1]-list[ij][1])) { LIN(MM,a,0)=MAX3(LIN(MM,ij,0),LIN(MI,ij,0),LIN(MJ,ij,0))+list[a][2]-(ls*nomatch); LIN(MM,a,1)=ij; LIN(MM,a,2)=ls; LIN(MM,a,3)=ls; if ( LIN(MM,ij,0) >=LIN(MI,ij,0) && LIN(MM,ij,0)>=LIN(MJ,ij,0))LIN(MM,a,4)='m'; else if ( LIN(MI,ij,0) >= LIN(MJ,ij,0))LIN(MM,a,4)='i'; else LIN(MM,a,4)='j'; } else { LIN(MM,a,0)=UNDEFINED; LIN(MM,a,1)=-1; } } a=start_trace; if (LIN(MM,a,0)>=LIN(MI,a,0) && LIN(MM,a,0) >=LIN(MJ,a,0))MT2=MM; else if ( LIN(MI,a,0)>=LIN(MJ,a,0))MT2=MI; else MT2=MJ; score=MAX3(LIN(MM,a,0), LIN(MI,a,0), LIN(MJ,a,0)); i=l1; j=l2; while (!(i==0 &&j==0)) { int next_a; l=MAX(LIN(MT2,a,2),LIN(MT2,a,3)); // HERE ("%c from %c %d %d SCORE=%d [%d %d] [%2d %2d]", T2[a][5],T2[a][4], T2[a][2], T2[a][3], T2[a][0], gop, gep, i, j); if (i==0) { while ( j>0) { al[0][LEN]=0; al[1][LEN]=1; j--; LEN++; } } else if (j==0) { while ( i>0) { al[0][LEN]=1; al[1][LEN]=0; i--; LEN++; } } // else if (l==0) {HERE ("L=0 i=%d j=%d",l, i, j);exit (0);} else { for (b=0; bprf_number, prf2->prf_number, prf1->is_leaf, prf2->is_leaf); fprintf(prof_f, "%i\n0\n%i\n1\n%i\n", node_number,LEN, prf1->number_of_sequences+prf2->number_of_sequences); char statec[] = {'M','D','I'}; int num = 0; int state = 0; i = 0; j = 0; for ( b=0; b< LEN; b++) { if ((al[0][b]==1) && (al[1][b]==1)) { combine_profiles2file(prf1->prf, prf2->prf, i, j, param_set, prof_f, 'M'); ++i; ++j; if (state != 0) { fprintf(edit_f, "%c%i\n",statec[state], num); num =1; state = 0; } else ++num; } else if (al[0][b]==1) { // prf1->prf[param_set->alphabet_size-1] += prf2->num_sequences; combine_profiles2file(prf1->prf, prf2->prf, i, j, param_set, prof_f, 'I'); ++i; if (state != 2) { fprintf(edit_f, "%c%i\n",statec[state], num); num =1; state = 2; } else ++num; } else if (al[1][b]==1) { // prf2->prf[param_set->alphabet_size-1] += prf1->num_sequences; combine_profiles2file(prf1->prf, prf2->prf, i, j, param_set, prof_f, 'D'); ++j; if (state != 1) { fprintf(edit_f, "%c%i\n",statec[state], num); num =1; state = 1; } else ++num; } } fprintf(edit_f, "%c%i\n",statec[state], num); num =1; state = 1; fprintf(edit_f,"*\n"); fprintf(prof_f,"*\n"); free_char (al, -1); // exit(0); return LEN; } /** * \brief Turns a profile into a consensus sequence. * * The character with the highest number of occurences is used as consensus. Gaps are not included. For example: 10 '-' and one 'A' would give 'A' as consensus. * \param profile The profile. * \param file_name Name of the file to save the consensus sequence in. * \param param_set The parameter of the fastal algorithm. * \return the sequence */ char* profile2consensus(Fastal_profile *profile, Fastal_param *param_set) { // FILE *cons_f = fopen(file_name,"w"); // fprintf(cons_f, ">%i\n", profile->prf_number); char* seq = vcalloc(profile->length+1, sizeof(char)); int i, j; int most_pos = -1, most; int alphabet_size = param_set->alphabet_size; int **prf = profile->prf; char *pos2char = param_set->pos2char; for (i = 0; i < profile->length; ++i) { most = -1; for (j = 0; j < alphabet_size; ++j) { if (prf[j][i] > most) { most = prf[j][i]; most_pos = j; } } seq[i] = pos2char[most_pos]; // fprintf(cons_f, "%c",pos2char[most_pos]); } return seq; } int diag_compare (const void * a, const void * b) { return (((Diagonal_counter*)b)->count - ((Diagonal_counter*)a)->count); } /** * \brief Calculates the diagonals between two sequences. * * Uses to calculate the diagonals. * \param seq_file1 File with sequence 1. * \param seq_file2 File with sequence 2. * \param diagonals An array where the diagonal points will be stored. * \param dig_length length of \a diagonals . * \param num_points Number of points in all diagonals. * \return number of diagonals; */ int seq_pair2diagonal_own(char *seq1, char *seq2, int **diagonals, int *dig_length, int l1, int l2, int is_dna, int word_length) { // word_length = 7; int word_number, i; int ng; if (is_dna) { word_number = (int)pow(5, word_length); ng = 4; } else { word_number = (int)pow(24, word_length); ng = 24; } int **word_index = vcalloc(word_number, sizeof(int*)); for (i = 0 ; i < word_number; ++i) { word_index[i] = vcalloc(20, sizeof(int)); word_index[i][0] = 2; word_index[i][1] = 20; } //making of k-tup index of seq1 int *prod=vcalloc (word_length, sizeof(int)); for ( i=0; i window_length-3) diag_index[i2 - tmp[j] + l1].count = window_length+100; else --diag_index[i2 - tmp[j] + l1].count; } } //choose diagonals int *diags = diagonals[0]; int current_pos = 0; qsort (diag_index, num, sizeof(Diagonal_counter*), diag_compare); i = 0; int y, x; while (diag_index[i].count > window_length+10) { if (current_pos > (*dig_length)-3) { (*dig_length) += 30; diags = vrealloc(diags, sizeof(int)*(*dig_length)); } y = diag_index[i].diagonal - l1; if (y < 0) { x = y * (-1); y = 0; } else { x = 0; } diags[current_pos++] = x; diags[current_pos++] = y; diags[current_pos++] = 200; ++i; } vfree(diag_index); for (i = 0; i < word_number; ++i) vfree(word_index[i]); vfree(word_index); diagonals[0] = diags; return current_pos/3; } int seq_pair2diagonal_swift(char *seq1, char *seq2, int **diagonals, int *dig_length, int l1, int l2, int is_dna, int word_length) { int word_number, i; int ng; if (is_dna) { word_number = (int)pow(5, word_length); ng = 5; } else { word_number = (int)pow(24, word_length); ng = 24; } int **word_index = vcalloc(word_number, sizeof(int*)); for (i = 0 ; i < word_number; ++i) { word_index[i] = vcalloc(20, sizeof(int)); word_index[i][0] = 2; word_index[i][1] = 20; } //making of k-tup index of seq1 int *prod=vcalloc (word_length, sizeof(int)); for ( i=0; i window_length) { if (diag_index[tmp_index].count < threshold) { diag_index[tmp_index].count = 0; diag_index[tmp_index].start = i; diag_index[tmp_index].end = i + word_length; } } else { ++(diag_index[tmp_index].count); } } } // choose diagonals int *diags = diagonals[0]; int current_pos = 0; int x, y; for (i = 0; i < num; ++i) { if (diag_index[i].count > threshold) { if (current_pos > (*dig_length)-3) { (*dig_length) += 30; diags = vrealloc(diags, sizeof(int)*(*dig_length)); } y = diag_index[i].diagonal - l1; if (y < 0) { x = y * (-1); y = 0; } else { x = 0; } diags[current_pos++] = x; diags[current_pos++] = y; diags[current_pos++] = 200; } } vfree(diag_index); for (i = 0; i < word_number; ++i) vfree(word_index[i]); vfree(word_index); diagonals[0] = diags; return current_pos/3; } /** * \brief Calculates the diagonals between two sequences. * * Uses a k-tup index to choose diagonals. * \param seq_file1 File with sequence 1. * \param seq_file2 File with sequence 2. * \param diagonals An array where the diagonal points will be stored. * \param dig_length length of \a diagonals . * \param num_points Number of points in all diagonals. * \return number of diagonals; */ int seq_pair2blast_diagonal(char *seq_file_name1, char *seq_file_name2, int **diagonals, int *dig_length, int l1, int l2, int is_dna) { // static int blast_measure[12]={0,0,0,0,0,0,0,0,0,0,0,0}; int *diag = vcalloc(l1 + l2, sizeof(int)); char *out_file = vtmpnam(NULL); char blast_command[200]; // char blast_command2[200]; // if (x) // { // int i; // printf("BLAST-Types:\n"); // for (i = 0; i < 11; ++i) // { // printf("Type %i: %i\n", i, blast_measure[i]); // } // return 0; // } // char blast_command2[600]; // sprintf(blast_command2, "less %s", out_file); if (is_dna) { sprintf(blast_command, "bl2seq -p blastn -i %s -j %s -D 1 -g F -o %s -S 1 -F F", seq_file_name1, seq_file_name2, out_file); } else { sprintf(blast_command, "bl2seq -p blastp -i %s -j %s -D 1 -g F -o %s -F F -S 1", seq_file_name1, seq_file_name2, out_file); } system(blast_command); int *diags = diagonals[0]; FILE *diag_f = fopen(out_file,"r"); char line[300]; fgets(line, 300, diag_f); fgets(line, 300, diag_f); fgets(line, 300, diag_f); char delims[] = "\t"; int length, pos_q, pos_d; int current_pos = 0; while (fgets(line, 300, diag_f) != NULL) { strtok(line, delims); strtok(NULL, delims); strtok(NULL, delims); length = atoi(strtok(NULL, delims)); strtok(NULL, delims); strtok(NULL, delims); pos_q = atoi(strtok(NULL, delims))-1; strtok(NULL, delims); pos_d = atoi(strtok(NULL, delims))-1; if (current_pos >= *dig_length-20) { (*dig_length) += 90; diags = vrealloc(diags, sizeof(int)*(*dig_length)); } if (diag[l1-(pos_q)+pos_d] == 0) { diag[l1-(pos_q)+pos_d] =1; diags[current_pos++] = pos_q; diags[current_pos++] = pos_d; diags[current_pos++] = length; } } fclose(diag_f); int round = 0; int e_threshold = 10; while ((current_pos == 0) && (round < 10)) { if (is_dna) { sprintf(blast_command, "bl2seq -p blastn -i %s -j %s -D 1 -g F -o %s -S 1 -F F -W 6 -e %i", seq_file_name1, seq_file_name2, out_file, e_threshold); } else { sprintf(blast_command, "bl2seq -p blastp -i %s -j %s -D 1 -g F -o %s -F F -S 1 -e %i", seq_file_name1, seq_file_name2, out_file, e_threshold); } system(blast_command); e_threshold *= 10; FILE *diag_f = fopen(out_file,"r"); char line[300]; fgets(line, 300, diag_f); fgets(line, 300, diag_f); fgets(line, 300, diag_f); char delims[] = "\t"; while (fgets(line, 300, diag_f) != NULL) { strtok(line, delims); strtok(NULL, delims); strtok(NULL, delims); length = atoi(strtok(NULL, delims)); strtok(NULL, delims); strtok(NULL, delims); pos_q = atoi(strtok(NULL, delims))-1; strtok(NULL, delims); pos_d = atoi(strtok(NULL, delims))-1; if (current_pos >= *dig_length-20) { (*dig_length) += 90; diags = vrealloc(diags, sizeof(int)*(*dig_length)); } if (diag[l1-(pos_q)+pos_d] == 0) { diag[l1-(pos_q)+pos_d] =1; diags[current_pos++] = pos_q; diags[current_pos++] = pos_d; diags[current_pos++] = length; } } fclose(diag_f); ++round; if (current_pos < 27) current_pos = 0; } // ++blast_measure[round]; if (current_pos == 0) { printf("BLAST NOT SUCCESFULL\n"); if (l1 < l2) { int i; int diff = l2 - l1 + 10; for (i = diff; i > 0; --i) { if (current_pos >= *dig_length-20) { (*dig_length) += 90; diags = vrealloc(diags, sizeof(int)*(*dig_length)); } diags[current_pos++] = 0; diags[current_pos++] = i; diags[current_pos++] = 100; } diff = 10; // printf("A: %i\n", diff); for (i = 0; i < diff; ++i) { if (current_pos >= *dig_length-20) { (*dig_length) += 90; diags = vrealloc(diags, sizeof(int)*(*dig_length)); } diags[current_pos++] = i; diags[current_pos++] = 0; diags[current_pos++] = 100; } } else { int i; int diff = 10; // printf("A: %i\n", diff); for (i = diff; i > 0; --i) { if (current_pos >= *dig_length-20) { (*dig_length) += 90; diags = vrealloc(diags, sizeof(int)*(*dig_length)); } diags[current_pos++] = 0; diags[current_pos++] = i; diags[current_pos++] = 100; } diff = l1 - l2 + 10; for (i = 0; i < diff; ++i) { if (current_pos >= *dig_length-20) { (*dig_length) += 90; diags = vrealloc(diags, sizeof(int)*(*dig_length)); } diags[current_pos++] = i; diags[current_pos++] = 0; diags[current_pos++] = 100; } } } vfree(diag); diagonals[0] = diags; return current_pos/3; } int seq_pair2blat_diagonal(char *seq_file_name1, char *seq_file_name2, int **diagonals, int *dig_length, int l1, int l2, int is_dna) { int *diag = vcalloc(l1 + l2, sizeof(int)); char *out_file = vtmpnam(NULL); char blast_command[200]; // char blast_command2[200]; // char blast_command2[600]; // sprintf(blast_command2, "less %s", out_file); if (is_dna) { sprintf(blast_command, "blat %s %s %s -out=blast8 -q=dna -t=dna -maxGap=0 >/dev/null 2>/dev/null", seq_file_name2, seq_file_name1, out_file); } else { sprintf(blast_command, "blat %s %s %s -out=blast8 -prot -maxGap=0 >/dev/null 2>/dev/null", seq_file_name2, seq_file_name1, out_file); } system(blast_command); int *diags = diagonals[0]; FILE *diag_f = fopen(out_file,"r"); char line[300]; // fgets(line, 300, diag_f); // fgets(line, 300, diag_f); // fgets(line, 300, diag_f); char delims[] = "\t"; int length, pos_q, pos_d; int current_pos = 0; while (fgets(line, 300, diag_f) != NULL) { strtok(line, delims); strtok(NULL, delims); strtok(NULL, delims); length = atoi(strtok(NULL, delims)); strtok(NULL, delims); strtok(NULL, delims); pos_q = atoi(strtok(NULL, delims))-1; strtok(NULL, delims); pos_d = atoi(strtok(NULL, delims))-1; if (current_pos >= *dig_length-20) { (*dig_length) += 90; diags = vrealloc(diags, sizeof(int)*(*dig_length)); } if (diag[l1-(pos_q)+pos_d] == 0) { diag[l1-(pos_q)+pos_d] =1; diags[current_pos++] = pos_q; diags[current_pos++] = pos_d; diags[current_pos++] = length; } } if (current_pos == 0) { printf("BLAT NOT SUCCESFULL\n"); if (l1 < l2) { int i; int diff = l2 - l1 + 10; for (i = diff; i > 0; --i) { if (current_pos >= *dig_length-20) { (*dig_length) += 90; diags = vrealloc(diags, sizeof(int)*(*dig_length)); } diags[current_pos++] = 0; diags[current_pos++] = i; diags[current_pos++] = 100; } diff = 10; // printf("A: %i\n", diff); for (i = 0; i < diff; ++i) { if (current_pos >= *dig_length-20) { (*dig_length) += 90; diags = vrealloc(diags, sizeof(int)*(*dig_length)); } diags[current_pos++] = i; diags[current_pos++] = 0; diags[current_pos++] = 100; } } else { int i; int diff = 10; // printf("A: %i\n", diff); for (i = diff; i > 0; --i) { if (current_pos >= *dig_length-20) { (*dig_length) += 90; diags = vrealloc(diags, sizeof(int)*(*dig_length)); } diags[current_pos++] = 0; diags[current_pos++] = i; diags[current_pos++] = 100; } diff = l1 - l2 + 10; for (i = 0; i < diff; ++i) { if (current_pos >= *dig_length-20) { (*dig_length) += 90; diags = vrealloc(diags, sizeof(int)*(*dig_length)); } diags[current_pos++] = i; diags[current_pos++] = 0; diags[current_pos++] = 100; } } } // printf("END\n"); vfree(diag); fclose(diag_f); diagonals[0] = diags; return current_pos/3; } /** * \brief Calculates the diagonals between two sequences. * * Uses blastz to calculate the diagonals. * \param seq_file1 File with sequence 1. * \param seq_file2 File with sequence 2. * \param diagonals An array where the diagonal points will be stored. * \param dig_length length of \a diagonals . * \param num_points Number of points in all diagonals. * \return number of diagonals; */ int seq_pair2blastz_diagonal(char *seq_file_name1, char *seq_file_name2, int **diagonals, int *dig_length, int l1, int l2, int is_dna) { int *diag = vcalloc(l1 + l2, sizeof(int)); char *out_file = vtmpnam(NULL); char blast_command[200]; // char blast_command2[200]; // char blast_command2[600]; // sprintf(blast_command2, "less %s", out_file); if (is_dna) { sprintf(blast_command, "~/Download/blastz-source/blastz %s %s B=0 K=10000> %s", seq_file_name1, seq_file_name2, out_file); } else { printf("SORRY - no BLASTZ with amino accid\n"); exit(0); } system(blast_command); int *diags = diagonals[0]; FILE *diag_f = fopen(out_file,"r"); char line[300]; char delims[] = " "; // char *result = NULL; int length, pos_q, pos_d; int current_pos = 0; while (fgets(line, 300, diag_f) != NULL) { if (line[0] == 'a') { char *line_tmp; while (fgets(line, 300, diag_f) != NULL) { if (line[0] == '}') break; if (line[2] == 'l') { line_tmp = &line[4]; if (current_pos >= *dig_length-20) { (*dig_length) += 90; diags = vrealloc(diags, sizeof(int)*(*dig_length)); } pos_q = atoi(strtok(line_tmp, delims)); pos_d = atoi(strtok(NULL, delims)); length = atoi(strtok(NULL, delims) - pos_q); if (diag[l1-(pos_q)+pos_d] == 0) { diag[l1-(pos_q)+pos_d] =1; diags[current_pos++] = pos_q; diags[current_pos++] = pos_d; diags[current_pos++] = length; } } } } } if (current_pos == 0) { printf("BLASTZ NOT SUCCESFULL\n"); if (l1 < l2) { int i; int diff = l2 - l1 + 10; for (i = diff; i > 0; --i) { if (current_pos >= *dig_length-20) { (*dig_length) += 90; diags = vrealloc(diags, sizeof(int)*(*dig_length)); } diags[current_pos++] = 0; diags[current_pos++] = i; diags[current_pos++] = 100; } diff = 10; for (i = 0; i < diff; ++i) { if (current_pos >= *dig_length-20) { (*dig_length) += 90; diags = vrealloc(diags, sizeof(int)*(*dig_length)); } diags[current_pos++] = i; diags[current_pos++] = 0; diags[current_pos++] = 100; } } else { int i; int diff = 10; for (i = diff; i > 0; --i) { if (current_pos >= *dig_length-20) { (*dig_length) += 90; diags = vrealloc(diags, sizeof(int)*(*dig_length)); } diags[current_pos++] = 0; diags[current_pos++] = i; diags[current_pos++] = 100; } diff = l1 - l2 + 10; for (i = 0; i < diff; ++i) { if (current_pos >= *dig_length-20) { (*dig_length) += 90; diags = vrealloc(diags, sizeof(int)*(*dig_length)); } diags[current_pos++] = i; diags[current_pos++] = 0; diags[current_pos++] = 100; } } } vfree(diag); fclose(diag_f); diagonals[0] = diags; return current_pos/3; } //************************** needleman-wunsch aligning ********************************************************** void fill_arguments_nw(Nw_param* method_arguments_p, int alphabet_size) { method_arguments_p-> dyn_matrix = vcalloc(1,sizeof(double*)); method_arguments_p->dyn_matrix[0] = vcalloc(1,sizeof(double)); method_arguments_p->length1 = vcalloc(1,sizeof(int)); method_arguments_p->length2 = vcalloc(1,sizeof(int)); *method_arguments_p->length1 = 1; *method_arguments_p->length2 = 1; method_arguments_p->sumup_prf = vcalloc(alphabet_size+1,sizeof(int*)); int i; for (i = 0; i < alphabet_size+1; ++i) method_arguments_p->sumup_prf[i] = vcalloc(1,sizeof(int)); method_arguments_p->sumup_length = vcalloc(1,sizeof(int)); *method_arguments_p->sumup_length = 1; } void free_nw(Nw_param* method_arguments_p, int alphabet_size) { free_dyn_matrix(*method_arguments_p->length1,method_arguments_p->dyn_matrix); int i; for (i = 0; i <= alphabet_size; ++i) { vfree(method_arguments_p->sumup_prf[i]); } vfree(method_arguments_p->sumup_prf); vfree(method_arguments_p->length1); vfree(method_arguments_p->length2); vfree(method_arguments_p->sumup_length); } /** * \brief One run of needleman-wunsch dynamic programming. * * \param profiles The profiles. * \param param_set The fastal parameters. * \param method_arguments_p The method arguments. * \param is_dna Sequences are DNA (\a is_dna = 1) or protein. * \param edit_file The edit file. * \param prof_file the profile file. * \param number Number of the parent node. * \return The length of the alignment. */ int nw_dyn(Fastal_profile **profiles, Fastal_param *param_set, void *method_arguments_p, int is_dna, FILE *edit_file, FILE *prof_file, int number) { Nw_param *arguments = (Nw_param*)method_arguments_p; // int old_length1 = *arguments->length1; // int old_length2 = *arguments->length2; arguments->dyn_matrix = resize_dyn_matrix(arguments->dyn_matrix, *arguments->length1, *arguments->length2, profiles[0]->length+1, profiles[1]->length+1); *arguments->length1 = profiles[0]->length+1; *arguments->length2 = profiles[1]->length+1; int alignment_length = prf_nw(profiles[0], profiles[1], arguments->dyn_matrix, edit_file, arguments->sumup_prf, arguments->sumup_length, param_set); write2file(arguments->sumup_prf, alignment_length, prof_file, number,profiles[0]->number_of_sequences + profiles[1]->number_of_sequences, param_set); return alignment_length; } /** * \brief This method takes a profile and turns it into a sumed up version. * * Required for NW-algorithm. * \param profile The profile to sum up. * \param sumup A field where the result will be stored. * \param param_set Parameters for the fastal algorithm. * \return The new \a sumup. */ int** sumup_profile(Fastal_profile *profile, int **sumup, Fastal_param *param_set) { char *pos2aa = &(param_set->pos2char[0]); int alphabet_size = param_set->alphabet_size; int **M = param_set->M; int prof_length = profile->length; int i,j,k; for (i = 0; i < prof_length; ++i) { sumup[alphabet_size][i] = 0; for (k = 0; k < alphabet_size; ++k) { sumup[k][i] = 0; sumup[alphabet_size][i] += profile->prf[k][i]; for (j = 0; j < alphabet_size; ++j) { sumup[k][i] += profile->weight * profile->prf[j][i] * M[pos2aa[j]-'A'][pos2aa[k]-'A']; } } } return sumup; } /** * \brief Turns the dynamic programming matrix into a editfile and calculates the new profile. * * Required for NW-algorithm. * \param prog_matrix The dynamic programming matrix. * \param prf1 The first profile. * \param prf2 The second profile. * \param edit_f A File object (already opened) to write the edit sequence into. * \param prf_field A 2-dim array to save the new profile into. * \param field_length Length of the new profile. * \param param_set Parameters of the Fastal-Algorithm */ int nw_matrix2edit_file(double **prog_matrix, //dynamic programming matrix Fastal_profile *prf1, //profile of dim1 Fastal_profile *prf2, //profile of dim2 FILE *edit_f, //file to safe the edit in int **prf_field, //space to safe the new profile int *field_length, Fastal_param *param_set) //length of prf_field { // int **M = param_set->M; int alphabet_size = param_set->alphabet_size; double gap_cost = param_set -> gop; fprintf(edit_f, "%i\n%i\n%i\n%i\n",prf1->prf_number, prf2->prf_number, prf1->is_leaf, prf2->is_leaf); int sum[] = {0,0,0}; char sumc[] = {'M','I','D'}; int last = 0; int n = 0; int m = 0; int field_pos = 0; int i; int prf1_length = prf1->length; int prf2_length = prf2->length; while ((n < prf1_length) && (m < prf2_length)) { //if necesarry allocate more memory for result if ((*field_length)-alphabet_size < field_pos) { (*field_length) += ENLARGEMENT_PER_STEP; for (i = 0; i prf[i][n]; } ++n; ++ field_pos; if (last != 1) { fprintf(edit_f,"%c%i\n",sumc[last],sum[last]); sum[last] = 0; } last = 1; ++sum[last]; } else if (prog_matrix[n][m] == (prog_matrix[n][m+1] +gap_cost)) { for (i = 0; iprf[i][m]; } ++m; ++ field_pos; if (last != 2) { fprintf(edit_f,"%c%i\n",sumc[last],sum[last]); sum[last] = 0; } last = 2; ++sum[last]; } else { for (i = 0; iprf[i][n] + prf2->prf[i][m]; } ++n; ++m; ++ field_pos; if (last != 0) { fprintf(edit_f,"%c%i\n",sumc[last],sum[last]); sum[last] = 0; } last = 0; ++sum[last]; } } fprintf(edit_f,"%c%i\n",sumc[last],sum[last]); //gaps in prf2 last = 0; while (n < prf1_length) { for (i = 0; iprf[i][n]; } ++n; ++ field_pos; ++last; } if (last > 0) fprintf(edit_f,"I%i\n",last); //gaps in prf1 last = 0; while (m < prf2_length) { for (i = 0; iprf[i][m]; } ++m; ++ field_pos; ++last; } if (last > 0) fprintf(edit_f,"D%i\n",last); fprintf(edit_f,"*\n"); return field_pos; } /** * \brief Pairwise alignments of profile is done here. * * \param profile1 Profile of sequence 1 * \param profile2 Profile of sequence 2 * \param prog_matrix Matrix for dynamic programming * \param edit_file_name The edit_file_name * \param sumup_prf The sumup version of profile 1, which later contains the aligned profile. * \param sumup_length Contains the length of the aligned profile. * \return length of the aligned profile */ int prf_nw(Fastal_profile *profile1, Fastal_profile *profile2, double **prog_matrix, FILE *edit_file_name, int **sumup_prf, int *sumup_length, Fastal_param *param_set) { int alphabet_size = param_set->alphabet_size; double gap_cost = param_set->gop; int i; if (*sumup_length < profile1->length) { for (i = 0; i < alphabet_size+1; ++i) { sumup_prf[i] = vrealloc(sumup_prf[i], profile1->length*sizeof(int)); } *sumup_length = profile1->length; } sumup_prf = sumup_profile(profile1, sumup_prf, param_set); int j,k; int prof1_length = profile1->length; int prof2_length = profile2->length; // int** M = param_set->M; double match_score; // int amino_counter; int residue_pairs = 0; for (i = prof2_length; i > 0; --i) { prog_matrix[prof1_length][i] = gap_cost * (prof2_length-i); } i = prof1_length-1; prog_matrix[prof1_length][prof2_length] = 0.0; while (i >=0) { j = prof2_length-1; prog_matrix[i][prof2_length] = gap_cost*(prof1_length-i); while (j >=0) { match_score = 0.0; residue_pairs = 0; for (k = 0; k < alphabet_size; ++k) { residue_pairs += profile2->prf[k][j]; match_score += (profile2->prf[k][j] * sumup_prf[k][i]); } match_score /= (residue_pairs * sumup_prf[alphabet_size][i]); prog_matrix[i][j] = MAX3(prog_matrix[i+1][j+1]+match_score, prog_matrix[i+1][j]+gap_cost, prog_matrix[i][j+1]+gap_cost); --j; } --i; } return nw_matrix2edit_file(prog_matrix, profile1, profile2, edit_file_name, sumup_prf, sumup_length, param_set); } /************** GOTOH ***********************/ void fill_arguments_gotoh(Gotoh_param* method_arguments_p, int alphabet_size) { method_arguments_p->m_matrix = vcalloc(1,sizeof(double*)); method_arguments_p->m_matrix[0] = vcalloc(1,sizeof(double)); method_arguments_p->d_matrix = vcalloc(1,sizeof(double*)); method_arguments_p->d_matrix[0] = vcalloc(1,sizeof(double)); method_arguments_p->i_matrix = vcalloc(1,sizeof(double*)); method_arguments_p->i_matrix[0] = vcalloc(1,sizeof(double)); method_arguments_p->length1 = vcalloc(1,sizeof(int)); method_arguments_p->length2 = vcalloc(1,sizeof(int)); method_arguments_p->log_saver = vcalloc(alphabet_size+1, sizeof(double*)); *method_arguments_p->length1 = 1; *method_arguments_p->length2 = 1; method_arguments_p->sumup_prf = vcalloc(alphabet_size+1,sizeof(int*)); int i; for (i = 0; i < alphabet_size+1; ++i) { method_arguments_p->sumup_prf[i] = vcalloc(1,sizeof(int)); method_arguments_p->log_saver[i] = vcalloc(1, sizeof(double)); } method_arguments_p->sumup_length = vcalloc(1,sizeof(int)); *method_arguments_p->sumup_length = 1; } void free_gotoh(Gotoh_param* method_arguments_p, int alphabet_size) { free_dyn_matrix(*method_arguments_p->length1,method_arguments_p->m_matrix); free_dyn_matrix(*method_arguments_p->length1,method_arguments_p->i_matrix); free_dyn_matrix(*method_arguments_p->length1,method_arguments_p->d_matrix); int i; for (i = 0; i <= alphabet_size; ++i) { vfree(method_arguments_p->sumup_prf[i]); } vfree(method_arguments_p->sumup_prf); vfree(method_arguments_p->length1); vfree(method_arguments_p->length2); vfree(method_arguments_p->sumup_length); } int gotoh_dyn(Fastal_profile **profiles, Fastal_param *param_set, void *method_arguments_p, int is_dna, FILE *edit_file, FILE *prof_file, int number) { Gotoh_param *arguments = (Gotoh_param*)method_arguments_p; arguments->m_matrix = resize_dyn_matrix(arguments->m_matrix, *arguments->length1, *arguments->length2, profiles[0]->length+1, profiles[1]->length+1); arguments->i_matrix = resize_dyn_matrix(arguments->i_matrix, *arguments->length1, *arguments->length2, profiles[0]->length+1, profiles[1]->length+1); arguments->d_matrix = resize_dyn_matrix(arguments->d_matrix, *arguments->length1, *arguments->length2, profiles[0]->length+1, profiles[1]->length+1); int i; if (profiles[1]->length > *arguments->length2-1) { for (i = 0; i < param_set->alphabet_size; ++i) { arguments->log_saver[i] = vrealloc(arguments->log_saver[i], (profiles[1]->length)*sizeof(double*)); } } *arguments->length1 = profiles[0]->length+1; *arguments->length2 = profiles[1]->length+1; int alignment_length = prf_gotoh(profiles[0], profiles[1], edit_file, arguments, param_set); write2file(arguments->sumup_prf, alignment_length, prof_file, number, profiles[0]->number_of_sequences + profiles[1]->number_of_sequences, param_set); return alignment_length; } int gotoh_matrix2edit_file(double **m_matrix, //dynamic programming matrix double **v_matrix, //dynamic programming matrix double **h_matrix, //dynamic programming matrix Fastal_profile *prf1, //profile of dim1 Fastal_profile *prf2, //profile of dim2 FILE *edit_f, //file to safe the edit in int **prf_field, //space to safe the new profile int *field_length, Fastal_param *param_set) //length of prf_field { double comp_num = log((double)prf1->number_of_sequences) + log((double)prf2->number_of_sequences); int** M = param_set->M; int alphabet_size = param_set->alphabet_size; double gep = param_set -> gep; fprintf(edit_f, "%i\n%i\n%i\n%i\n",prf1->prf_number, prf2->prf_number, prf1->is_leaf, prf2->is_leaf); int sum[] = {0,0,0}; char sumc[] = {'M','I','D'}; int last = 0; int n = 0; int m = 0; int field_pos = 0; int i; int prf1_length = prf1->length; int prf2_length = prf2->length; int current_mode = 0; //determine start mode char *pos2char = param_set->pos2char; if (h_matrix[n][m] == m_matrix[n][m]) { current_mode = 2; } else { if (v_matrix[n][m] == m_matrix[n][m]) { current_mode = 1; } else { current_mode = 0; } } // printf("%f %f %f - %i\n",h_matrix[n][m],v_matrix[n][m],m_matrix[n][m], current_mode); while ((n < prf1_length) && (m < prf2_length)) { //if necesarry allocate more memory for result if ((*field_length)-alphabet_size < field_pos) { (*field_length) += ENLARGEMENT_PER_STEP; for (i = 0; i prf[i][m]; } if (h_matrix[n][m] != (h_matrix[n][m+1]+gep)) { current_mode = 0; } ++m; ++ field_pos; if (last != 2) { fprintf(edit_f,"%c%i\n",sumc[last],sum[last]); sum[last] = 0; } last = 2; ++sum[last]; } else { if (current_mode== 1) { for (i = 0; iprf[i][n]; } if (v_matrix[n][m] != (v_matrix[n+1][m]+gep)) { current_mode = 0; } ++n; ++ field_pos; if (last != 1) { fprintf(edit_f,"%c%i\n",sumc[last],sum[last]); sum[last] = 0; } last = 1; ++sum[last]; } else { double match_score = 0.0; int char_c, char_c2; for (char_c = 0; char_c < alphabet_size; ++char_c) { for (char_c2 = 0; char_c2 < alphabet_size; ++char_c2) { if ((log(prf1->prf[char_c][n]) != -1) && ( log(prf2->prf[char_c2][m]) != -1)) { match_score += exp(log((double)prf1->prf[char_c][n]) + log((double)prf2->prf[char_c2][m])-comp_num) * M[pos2char[char_c]-'A'][pos2char[char_c2]-'A']; } } } if (m_matrix[n+1][m+1] + match_score != m_matrix[n][m]) { if (m_matrix[n][m] == v_matrix[n][m]) { current_mode = 1; continue; } if (m_matrix[n][m] == h_matrix[n][m]) { current_mode = 2; continue; } } for (i = 0; iprf[i][n] + prf2->prf[i][m]; } ++n; ++m; ++ field_pos; if (last != 0) { fprintf(edit_f,"%c%i\n",sumc[last],sum[last]); sum[last] = 0; } last = 0; ++sum[last]; } } } fprintf(edit_f,"%c%i\n",sumc[last],sum[last]); int needed = MAX(prf1_length -n, prf2_length -m); if ((*field_length) - needed -10 < field_pos) { (*field_length) += needed +10; for (i = 0; i prf[i][n]; } ++n; ++ field_pos; ++last; } if (last > 0) fprintf(edit_f,"I%i\n",last); //gaps in prf1 last = 0; while (m < prf2_length) { for (i = 0; iprf[i][m]; } ++m; ++ field_pos; ++last; } if (last > 0) fprintf(edit_f,"D%i\n",last); fprintf(edit_f,"*\n"); return field_pos; } /** * \brief The gotoh dynamic programming algorithm. * * \param profile1 The first profile. */ int prf_gotoh(Fastal_profile *profile1, Fastal_profile *profile2, FILE *edit_file_name, Gotoh_param *arguments, Fastal_param *param_set) { // printf("I AM HERE - again\n"); int **sumup_prf = arguments->sumup_prf; int *sumup_length = arguments->sumup_length; int alphabet_size = param_set->alphabet_size; double gop = param_set->gop; double gep = param_set->gep; const int INF = -999999; int i; double **m_matrix = arguments->m_matrix; double **h_matrix = arguments->i_matrix; double **v_matrix = arguments->d_matrix; int j; int prof1_length = profile1->length; int prof2_length = profile2->length; int** M = param_set->M; double match_score; for (i = prof2_length; i >= 0; --i) { m_matrix[prof1_length][i] = gop + gep * (prof2_length-i); v_matrix[prof1_length][i] = INF; h_matrix[prof1_length][i] = m_matrix[prof1_length][i]; } m_matrix[prof1_length][prof2_length] = 0.0; h_matrix[prof1_length][prof2_length] = INF; v_matrix[prof1_length][prof2_length] = INF; int l; double comp_num = log((double)profile1->number_of_sequences) + log((double)profile2->number_of_sequences); static double *log_test = NULL; if (!log_test) log_test = vcalloc(alphabet_size, sizeof(double)); // int k; int **prf1 = profile1->prf; int **prf2 = profile2->prf; double **log_test2 = arguments->log_saver; for (l = 0; l < alphabet_size; ++l) { for (i = 0; i < profile2->length; ++i) { if (prf2[l][i] > 0) { log_test2[l][i] = log(prf2[l][i]); } else log_test2[l][i] = -1; } } char *pos2char = param_set->pos2char; i = prof1_length-1; while (i >=0) { j = prof2_length-1; for (l = 0; l < alphabet_size; ++l) { if (prf1[l][i] > 0) log_test[l] = log((double)prf1[l][i]); else log_test[l] = -1; } m_matrix[i][prof2_length] = gop + gep *(prof1_length-i); v_matrix[i][prof2_length] = m_matrix[i][prof2_length]; h_matrix[i][prof2_length] = INF; while (j >=0) { match_score = 0.0; v_matrix[i][j] = (MAX(v_matrix[i+1][j], m_matrix[i+1][j] + gop) + gep); h_matrix[i][j] = (MAX(h_matrix[i][j+1], m_matrix[i][j+1] + gop) + gep); int char_c, char_c2; int num = 0; for (char_c = 0; char_c < alphabet_size; ++char_c) { for (char_c2 = 0; char_c2 < alphabet_size; ++char_c2) { if ((log_test[char_c] != -1) && (log_test2[char_c2][j] != -1)) { match_score += exp(log_test[char_c] + log_test2[char_c2][j]-comp_num) * M[pos2char[char_c]-'A'][pos2char[char_c2]-'A']; } } } m_matrix[i][j] = m_matrix[i+1][j+1]+match_score; if (m_matrix[i][j] < v_matrix[i][j]) { m_matrix[i][j] = v_matrix[i][j]; } if (m_matrix[i][j] < h_matrix[i][j]) { m_matrix[i][j] = h_matrix[i][j]; } --j; } --i; } return gotoh_matrix2edit_file(m_matrix, v_matrix, h_matrix, profile1, profile2, edit_file_name, sumup_prf, sumup_length, param_set); } /************* OTHER STUFF ******************/ /** * \brief Writes the alignment into the profile file and the edit file. * * \param profiles The two profiles to combine. * \param alignment The alinment information. * \param alignment The length of the alignment. * \param edit_f The edit file. * \param prof_f The profile file. * \param node_number the new node number. */ void alignment2files(Fastal_profile **profiles, Fastal_param *param_set, int **alignment, int alignment_length, FILE *edit_f, FILE *prof_f, int node_number) { fprintf(edit_f, "%i\n%i\n%i\n%i\n",profiles[0]->prf_number, profiles[1]->prf_number, profiles[0]->is_leaf, profiles[1]->is_leaf); fprintf(prof_f, "%i\n0\n%i\n1\n", node_number, alignment_length); int **prf1 = profiles[0]->prf; int **prf2 = profiles[1]->prf; int i = 0; int pos = 0; int pos1, pos2; char statec[] = {'M','D','I'}; int num = 0; int state = 0; while (i < alignment_length) { pos1 = alignment[0][pos]; pos2 = alignment[1][pos]; // match if ((pos1 != -1) && (pos2 != -1)) { combine_profiles2file(prf1, prf2, pos1, pos2, param_set, prof_f, 'M'); if (state != 0) { fprintf(edit_f, "%c%i\n",statec[state], num); num =1; state = 0; } else ++num; ++i; } // insertion in seq 1 else if (pos1 != -1) { combine_profiles2file(prf1, prf2, pos1, pos2, param_set, prof_f, 'I'); if (state != 2) { fprintf(edit_f, "%c%i\n",statec[state], num); num =1; state = 2; } else ++num; ++i; } // deletion in seq 1 else if (pos2 != -1) { combine_profiles2file(prf1, prf2, pos1, pos2, param_set, prof_f, 'D'); if (state != 1) { fprintf(edit_f, "%c%i\n",statec[state], num); num =1; state = 1; } else ++num; ++i; } ++pos; } fprintf(edit_f, "%c%i\n",statec[state], num); fprintf(edit_f,"*\n"); fprintf(prof_f,"*\n"); } //******************************* OTHER STUFF *********************** /** * \brief Reads the sequence from a given position in a fasta file and turns it into a profile. * * \param seq_file The file where the sequence is stored. * \param off_set The off_set from the beginning of the file to the position of the sequence name. * \param profile The profile where the sequence will be stored into. * \param prf_number The number of this profile. */ void file_pos2profile(FILE *seq_file, //File with sequences long off_set, //offset of sequence from the beginning of file point to the sequence name, not to the sequence itself Fastal_profile *profile, //profile to save into int prf_number, //number of the profile Fastal_param *param_set) { int alphabet_size = param_set->alphabet_size; profile->is_leaf = 1; profile->number_of_sequences = 1; int *aa2pos = &(param_set->char2pos[0]); const int LINE_LENGTH = 500; char line[LINE_LENGTH]; profile->num_sequences = 1; profile->prf_number = prf_number; fseek (seq_file , off_set , SEEK_SET ); fgets (line, LINE_LENGTH , seq_file); int seq_length = 0; int i, j, x; while(fgets(line, LINE_LENGTH, seq_file)!=NULL) { if (line[0] != '>') { line[LINE_LENGTH-1] = '\n'; if (seq_length + LINE_LENGTH >= profile->allocated_memory) { for (i = 0; i < alphabet_size; ++i) { profile->prf[i] = vrealloc(profile->prf[i], (profile->allocated_memory+PROFILE_ENLARGEMENT)*sizeof(int)); } profile->allocated_memory += PROFILE_ENLARGEMENT; } i = 0; x = 0; while ((line[i] != '\n') && (line[i] != '\0')) { if (line[i] != '-') { for(j = 0; jprf[j][seq_length+x] = 0; profile->prf[aa2pos[toupper((short)line[i])]][seq_length+x] = 1; ++x; } ++i; } seq_length += x; } else break; } profile->length = seq_length; } /** * \brief Constructs index of fasta_file. * * The index is of length n (n= number of sequences in the given multi fasta file.). In the order of appearance in the file the position of each sequence in the file is stored. * \param file_name The file with the sequences. * \param file_positions Array to save the positions in. * \return The number of sequences in \a file_name. */ int make_index_of_file(char *file_name, //file with sequences long **file_positions) //array to save the positions { const int LINE_LENGTH = 150; (*file_positions) = vcalloc(ENLARGEMENT_PER_STEP, sizeof(long)); FILE *file = fopen(file_name,"r"); char line[LINE_LENGTH]; int num_of_sequences = 0; int mem_for_pos = ENLARGEMENT_PER_STEP; if (file == NULL) { printf("FILE NOT FOUND\n"); exit(1); } else { (*file_positions)[num_of_sequences] = ftell(file); while(fgets(line, LINE_LENGTH , file)!=NULL) { // int length = strlen(line); if (line[0] == '>') { ++num_of_sequences; if (num_of_sequences == mem_for_pos) { (*file_positions) = vrealloc((*file_positions),(ENLARGEMENT_PER_STEP+mem_for_pos) * sizeof(long)); mem_for_pos += ENLARGEMENT_PER_STEP; } } (*file_positions)[num_of_sequences] = ftell(file); } } fclose(file); return num_of_sequences; } /** * \brief Reads a profile from a profile file. * * \param prof A Fastal_profile object to save the profile in. * \param profile_f file where the profile is stored. * \param position Position of the profile in \a profile_f. * \param param_set The parameter set for Fastal */ void profile_file2profile(Fastal_profile *prof, //structure to save the profile in FILE *profile_f, //file where the profile is stored long position, //position in profile_f where the profile is stored Fastal_param *param_set) { int alphabet_size = param_set->alphabet_size; int *aa2pos = &(param_set->char2pos[0]); fseek(profile_f,position,SEEK_SET); const int LINE_LENGTH = 500; char line[500]; fgets(line, LINE_LENGTH, profile_f); prof->prf_number = atoi(line); fgets(line, LINE_LENGTH, profile_f); prof->is_leaf = atoi(line); fgets(line, LINE_LENGTH, profile_f); prof->length = atoi(line); fgets(line, LINE_LENGTH, profile_f); prof->weight = atoi(line); fgets(line, LINE_LENGTH, profile_f); prof->number_of_sequences = atoi(line); int i,j; if (prof->length > prof->allocated_memory) for (i = 0;i < alphabet_size; ++i) { prof->prf[i] = vrealloc(prof->prf[i],prof->length*sizeof(int)); } prof->allocated_memory = prof->length; char delims[] = " "; char *result = NULL; char *result_num = NULL; int length = prof->length; for (i = 0; i < length; ++i) { for(j = 0; jprf[j][i] = 0; fgets(line, LINE_LENGTH , profile_f); result = strtok( line, delims ); while( result != NULL) { result_num = &result[1]; prof->prf[aa2pos[(short)result[0]]][i] = atoi(result_num); result = strtok( NULL, delims ); } } } /** * \brief Writes a profile into a file * * \param profile Pointer to the profile which has to be saved. * \param file A File object (already opened) to write the profile to. * \param param_set The parameters for the fastal algorithm. */ void profile2file(Fastal_profile *profile, //the profile to save FILE* file, //file to save in Fastal_param *param_set) { int alphabet_size = param_set->alphabet_size; char *pos2aa = &(param_set->pos2char[0]); fseek(file,0,SEEK_SET); fprintf(file,"%i\n", profile->prf_number); fprintf(file,"%i\n", profile->is_leaf); fprintf(file,"%i\n", profile->length); fprintf(file,"%i\n", profile->weight); int i = 0, j = 0; int max = profile->length; int x= 0; --alphabet_size; while (i < max) { for (j = 0; j < alphabet_size; ++j) if (profile->prf[j][i] > 0) { if (x) fprintf(file," %c%i", pos2aa[j],profile->prf[j][i]); else fprintf(file,"%c%i", pos2aa[j],profile->prf[j][i]); x = 1; } if (profile->prf[j][i] > 0) { if (x) fprintf(file," %c%i", pos2aa[j],profile->prf[j][i]); else fprintf(file,"%c%i", pos2aa[j],profile->prf[j][i]); x = 1; } x = 0; fprintf(file,"\n"); ++i; } fprintf(file,"*\n"); } /** * Reads the profile out of an alignment (NOT IN USE) */ // void // file2profile(FILE* profile_f, //file to read the profile of // Fastal_profile *prof, //profile saved in here // int prf_number, //number of the profile // Fastal_param *param_set) // { // int alphabet_size = param_set->alphabet_size; // // int *aa2pos = &(param_set->char2pos[0]); // // // fseek(profile_f,0,SEEK_SET); // const int LINE_LENGTH = 500; // char line[500]; // // fgets(line, LINE_LENGTH, profile_f); // prof->prf_number = atoi(line); // fgets(line, LINE_LENGTH, profile_f); // prof->is_leaf = atoi(line); // // fgets(line, LINE_LENGTH, profile_f); // prof->length = atoi(line); // // fgets(line, LINE_LENGTH, profile_f); // prof->weight = atoi(line); // int i,j; // if (prof->length > prof->allocated_memory) // for (i = 0;i < alphabet_size; ++i) // { // prof->prf[i] = vrealloc(prof->prf[i],prof->length*sizeof(int)); // } // // char delims[] = " "; // char *result = NULL; // char *result_num = NULL; // // int length = prof->length; // // for (i = 0; i < length; ++i) // { // for(j = 0; jprf[j][i] = 0; // fgets(line, LINE_LENGTH , profile_f); // result = strtok( line, delims ); // // while( result != NULL) // { // result_num = &result[1]; // prof->prf[aa2pos[(short)result[0]]][i] = atoi(result_num); // result = strtok( NULL, delims ); // } // } // } /** * \brief Writes the sequence into the alignment_file. * * \param aligned_sequence Pattern of aligned sequence. * \param sequence_file File with sequences. * \param sequence_position Positions of sequences in \a sequence_file. * \param alignment_file The file to write the sequence into. * */ void edit_seq2aligned_seq(char *aligned_sequence, //pattern for aligned sequence FILE *sequence_file, //file with all the sequences long sequence_position, //position in sequence file with the correct sequence FILE *alignment_file) //file to write the alignment into { fseek(sequence_file, sequence_position, SEEK_SET); const int LINE_LENGTH = 300; char line[LINE_LENGTH]; fgets (line, LINE_LENGTH , sequence_file); fprintf(alignment_file,"%s", line); //writing of sequence name int pos = 0; int i = 0; while(fgets(line, LINE_LENGTH, sequence_file)!=NULL) { if (line[0] != '>') { line[LINE_LENGTH-1] = '\n'; i = 0; while ((line[i] != '\n') && (line[i] != '\0')) { while (aligned_sequence[pos] == '-') { fprintf(alignment_file,"-"); ++pos; } if (line[i] != '-') { fprintf(alignment_file,"%c",line[i]); ++pos; } ++i; // ++pos; } } else break; } while (aligned_sequence[pos] != '\n') { fprintf(alignment_file,"-"); ++pos; } fprintf(alignment_file,"\n"); } /** * \brief Recursive function to turn the edit_file into the alignment. * * \param sequence_file File with all sequences. * \param sequence_position The array of sequence positions in \a sequence_file * \param edit_file File to safe the edit profiles in. * \param edit_positions Array saving the coorespondence between edit profile and position in \a edit_file * \param node_number The current node. * \param number_of_sequences The number of sequences. * \param aligned_sequence The sequence that is edited. * \param alignment_length The length of the alignment. * \param edit_seq_file File that saves the edited_sequences of the internal nodes. * \param offset Saves the size of the edited_sequences. * \param alignment_file File where the alignment is saved. */ void edit2alignment(FILE *sequence_file, //sequence file long *seq_positions, //sequence positions FILE *edit_file, //file saving the edit profiles long *edit_positions, //array saving the correspondence between edit profile and position in edit_file int node_number, //the current node int number_of_sequences, //number of sequences char *aligned_sequence, //the sequence that is edited int alignment_length, //length of the alignment - and thus of aligned_sequence FILE *edit_seq_file, //file saving the edited_sequences of the internal nodes int offset, //saves the size of the edited_sequence FILE* alignment_file) //file saving the alignments { fseek(edit_file, edit_positions[node_number-number_of_sequences], SEEK_SET); const int LINE_LENGTH = 50; char line[LINE_LENGTH]; fgets(line, LINE_LENGTH , edit_file); int child1 = atoi(line); fgets(line, LINE_LENGTH , edit_file); int child2 = atoi(line); fgets(line, LINE_LENGTH , edit_file); int is_leaf1 = atoi(line); fgets(line, LINE_LENGTH , edit_file); int is_leaf2 = atoi(line); // static char seq_line[10]; // printf("SO EINE VERDAMMTE SCHEISE ABER AUCH\n"); char x; int number; int pos = 0; //first child while(fgets(line, LINE_LENGTH , edit_file)!=NULL) { x = line[0]; if (x == '*') break; number = atoi(&line[1]); if (x == 'M') { while (number > 0) { if (aligned_sequence[pos] == 'X') --number; ++pos; } } else if (x == 'I') { while (number > 0) { if (aligned_sequence[pos] == 'X') --number; ++pos; } } else if (x == 'D') { while (number > 0) { if (aligned_sequence[pos] == 'X') { aligned_sequence[pos] = '-'; --number; } ++pos; } } } if (is_leaf1) { // printf("LEAF\n"); edit_seq2aligned_seq(aligned_sequence, sequence_file, seq_positions[child1], alignment_file); } else { fprintf(edit_seq_file, "%s", aligned_sequence); edit2alignment(sequence_file, seq_positions, edit_file, edit_positions, child1, number_of_sequences, aligned_sequence, alignment_length, edit_seq_file, offset, alignment_file); } //second child fseek(edit_seq_file, offset, SEEK_CUR); fgets(aligned_sequence, alignment_length+3, edit_seq_file); fseek(edit_seq_file, offset, SEEK_CUR); pos = 0; fseek(edit_file, edit_positions[node_number-number_of_sequences], SEEK_SET); while(fgets(line, LINE_LENGTH , edit_file)!=NULL) { x = line[0]; if (x == '*') break; number = atoi(&line[1]); if (x == 'M') { while (number > 0) { if (aligned_sequence[pos] == 'X') --number; ++pos; } } else if (x == 'I') { while (number > 0) { if (aligned_sequence[pos] == 'X') { aligned_sequence[pos] = '-'; --number; } ++pos; } } else if (x == 'D') { while (number > 0) { if (aligned_sequence[pos] == 'X') --number; ++pos; } } } if (is_leaf2) { edit_seq2aligned_seq(aligned_sequence, sequence_file, seq_positions[child2], alignment_file); } else { fprintf(edit_seq_file, "%s", aligned_sequence); edit2alignment(sequence_file, seq_positions, edit_file, edit_positions, child2, number_of_sequences, aligned_sequence, alignment_length, edit_seq_file, offset, alignment_file); } } // * The file has the follwing format (# and text behind are only comments and not included into the file): // * 1 # Number of profile. // * 1 # is leaf. // * 5 # Number of columns in the profile. // * 4A 1C # In the first column are 4 'A' and 1 'C' // * 3G # In the second column are 3 'G' // * 5A # In the third column are 5 'A' // * 2A 3C # In the fourth column are 2 'A' and 3 'C' // * 5C # In the fifth column are 5 'C' // * * # Marks the end of this profile /** * \brief Writes a profile to a file. * * \param sumup_prf The profile array, not a real profile. * \param length The length of the profile. The format can be seen in ./test.txt * \param file The FILE object to write the the profile into. * \param is_dna The type of sequence. * \param number The number of the profile. */ void write2file(int **sumup_prf, int length, FILE *file, int number, int num_sequences, Fastal_param *param_set) { char *pos2aa = &(param_set->pos2char[0]); fprintf(file,"%i\n0\n%i\n1\n%i\n",number, length, num_sequences ); int i, j; int alphabet_size = param_set->alphabet_size; i = 0; int x = 0; while (i < length) { for (j = 0; j < alphabet_size; ++j) if (sumup_prf[j][i] > 0) { if (x) fprintf(file," %c%i", pos2aa[j],sumup_prf[j][i]); else fprintf(file,"%c%i", pos2aa[j],sumup_prf[j][i]); x = 1; } x = 0; fprintf(file,"\n"); ++i; } fprintf(file,"*\n"); } //************************************* main function of the fasta algorithm *********************************************** /** * \brief main of the fastal algorithm */ int fastal_main(int argc, //number of arguments char **argv) //arguments first = fastal, second = tree { int i; //pointer to arguments void * method_arguments_p; int (*alignment_function)(Fastal_profile **profiles, Fastal_param *param_set, void *method_arguments_p, int is_dna, FILE *edit_file, FILE *prof_file, int number); struct Fastal_arguments arguments; arg_parse (argc, argv, &arguments); Fastal_param *param_set = vcalloc(1,sizeof(Fastal_param)); fill_parameters(arguments.is_dna, param_set, arguments.method, arguments.diag_method, arguments.mat); param_set->gep = arguments.gep; param_set->gop = arguments.gop; // printf("%s",arguments.mat); if (arguments.evaluate) { printf("Calculate Sum of pairs Score\n"); printf("Score: %f\n", calculate_sum_of_pairs_score_affine(arguments.sequence_file, param_set->M, param_set->gop, param_set->gep)); vfree(param_set); exit(0); } if (arguments.agreement_score) { complete_agreement_score(arguments.aln2test, arguments.aln_ref); return 0; } if (arguments.num_ref_aln) { compute_ref_alignments(arguments.sequence_file, arguments.aln_ref, arguments.num_ref_aln, arguments.num_seq_in_ref); return 0; } int alphabet_size = param_set->alphabet_size; //sequence file management // char **seq_name; long *file_positions = NULL; long **tmp = &file_positions; int number_of_sequences = make_index_of_file(arguments.sequence_file, tmp); //edit file management // long current_edit_pos; long *edit_positions = vcalloc(number_of_sequences,sizeof(long)); //profile management Fastal_profile **profiles = vcalloc(3,sizeof(Fastal_profile*)); initiate_profiles(profiles, param_set); FILE * prof_file = fopen(vtmpnam(NULL),"w+"); long* profile_positions = vcalloc(4,sizeof(long*)); int max_prof = 4; int saved_prof = 0; printf("METHOD: %s\n",param_set->method); if (strcmp(param_set->method, "fast") == 0) { method_arguments_p = vcalloc(1,sizeof(Sparse_dynamic_param)); fill_arguments_sparse((Sparse_dynamic_param*)method_arguments_p); alignment_function = sparse_dyn; } else if (strcmp(param_set->method, "nw") == 0) { method_arguments_p = vcalloc(1,sizeof(Nw_param)); fill_arguments_nw((Nw_param*)method_arguments_p, alphabet_size); alignment_function = nw_dyn; } else if (strcmp(param_set->method, "gotoh") == 0) { method_arguments_p = vcalloc(1,sizeof(Gotoh_param)); fill_arguments_gotoh((Gotoh_param*)method_arguments_p, alphabet_size); alignment_function = gotoh_dyn; } else if (strcmp(param_set->method, "udisc") == 0) { method_arguments_p = vcalloc(1,sizeof(Udisc_param)); fill_arguments_gotoh((Gotoh_param*)method_arguments_p, alphabet_size); alignment_function = gotoh_dyn; } else { printf("ERROR - METHOD"); exit(1); } if (arguments.gap_iterate) { iterate(param_set, method_arguments_p, arguments.sequence_file, arguments.output_file, arguments.gap_iterate); return 0; } if (arguments.tree_file == NULL) { arguments.tree_file = vtmpnam(NULL); printf("CONSTRUCT TREE\n"); if (strcmp(arguments.tree_method, "parttree")==0) { make_partTree(arguments.sequence_file, arguments.tree_file, arguments.tree_param1, arguments.tree_param2, arguments.is_dna, 0); } else if (strcmp(arguments.tree_method, "oligotree") == 0) { compute_oligomer_distance_tree(arguments.sequence_file, param_set->char2pos, arguments.tree_file, arguments.tree_param2, arguments.tree_param1, param_set->alphabet_size); } if (arguments.tree_only == 1) return 0; } if (arguments.tree_out == 1) { char tree_out_file_name[500]; sprintf(tree_out_file_name, "%s.tree",arguments.output_file); char const LINE_LENGTH = 50; char line[LINE_LENGTH]; FILE* in = fopen(arguments.tree_file, "r"); FILE* out = fopen(tree_out_file_name, "w"); while( (fgets(line, LINE_LENGTH, in)) != NULL) fprintf(out, "%s", line); fclose(in); fclose(out); } FILE *seq_file = fopen(arguments.sequence_file,"r"); // FILE *edit_file = fopen(vtmpnam(NULL),"w+"); FILE *edit_file = fopen("aha","w+"); printf("CONSTRUCT ALIGNMENT\n"); FILE *tree_file = fopen(arguments.tree_file,"r"); const int LINE_LENGTH = 100; char line[LINE_LENGTH]; char delims[] = " "; int node[3]; int alignment_length = -1; node[2] = -1; //bottom-up traversal while(fgets(line, LINE_LENGTH, tree_file)!=NULL) { //read profiles node[0] = atoi(strtok(line,delims)); node[1] = atoi(strtok(NULL,delims)); node[2] = atoi(strtok(NULL,delims)); //getting profile of second child if (node[1] < number_of_sequences) { file_pos2profile(seq_file, file_positions[node[1]], profiles[1], node[1], param_set); //profile to save into } else { profile_file2profile(profiles[1], prof_file, profile_positions[--saved_prof], param_set); fseek (prof_file , profile_positions[saved_prof] , SEEK_SET); } //getting profile of first child if (node[0] < number_of_sequences) { file_pos2profile(seq_file, file_positions[node[0]], profiles[0], node[0], param_set); //profile to save into } else { profile_file2profile(profiles[0], prof_file, profile_positions[--saved_prof], param_set); fseek (prof_file , profile_positions[saved_prof] , SEEK_SET); } if (saved_prof == max_prof) { max_prof += 5; profile_positions = vrealloc(profile_positions, max_prof*sizeof(long)); } edit_positions[node[2]-number_of_sequences] = ftell(edit_file); profile_positions[saved_prof] = ftell(prof_file); ++saved_prof; //aligning the sequences alignment_length = alignment_function(profiles, param_set, method_arguments_p, arguments.is_dna, edit_file, prof_file, node[2]); } // bottom-down traversal (edit_files --> alignment) // tmp_out_file_name = vtmpnam(NULL); // FILE *alignment_file = fopen(tmp_out_file_name, "w"); FILE *alignment_file = fopen(arguments.output_file, "w"); FILE *edit_seq_file = fopen(vtmpnam(NULL),"w+"); char *aligned_sequence = vcalloc(alignment_length+3, sizeof(char)); long offset = ftell(edit_seq_file); for (i = 0; i < alignment_length; ++i) { fprintf(edit_seq_file, "X"); aligned_sequence[i] = 'X'; } aligned_sequence[i]= '\n'; aligned_sequence[i+1]= '\0'; fprintf(edit_seq_file, "\n"); offset = (ftell(edit_seq_file) - offset)*-1; edit2alignment(seq_file, file_positions, edit_file, edit_positions, node[2], number_of_sequences, aligned_sequence, alignment_length, edit_seq_file, offset, alignment_file); fclose(alignment_file); fclose(tree_file); fclose(edit_file); fclose(seq_file); fclose(edit_seq_file); //set stuff for the next cycle // arguments.sequence_file = tmp_out_file_name; // // //DEBUG // // char copy_command[500]; // // sprintf(copy_command, "cp %s %s_%i", tmp_out_file_name, arguments.output_file, cycle); // // system(copy_command); // ++cycle; // } // printf("HERE_COPY\n"); // char copy_command[2000]; // sprintf(copy_command, "mv %s %s", tmp_out_file_name, arguments.output_file); // printf("%s\n", copy_command); // int error = system(copy_command); // printf("ERROR %i\n", error); //free_memory & close files fclose(prof_file); free_fastal_profile(profiles[0], alphabet_size); free_fastal_profile(profiles[1], alphabet_size); vfree(profiles); vfree(profile_positions); // number_of_sequences if (arguments.score) { printf("Calculate Score\n"); double aln_score = calculate_sum_of_pairs_score_affine(arguments.output_file, param_set->M, param_set->gop, param_set->gep); printf("SCORE: %f\n", aln_score); } if (!strcmp(param_set->method, "fast")) { free_sparse((Sparse_dynamic_param*)method_arguments_p); } else if (!strcmp(param_set->method, "nw")) { free_nw((Nw_param*)method_arguments_p, alphabet_size); } else if (!strcmp(param_set->method, "gotoh")) { free_gotoh((Gotoh_param*)method_arguments_p, alphabet_size); } vfree(param_set); //free_memory & close files vfree(edit_positions); return 0; } //****************** toolbox *************************** /** * \brief Enlargement of the dynamic programming matrix in case it is to small. * * \param dyn_matrix The dynamic programming matrix. * \param old_length1 Current size of dimension 1. * \param old_length2 Current size of dimension 2. * \param length1 New size of dimension 1. * \param length2 New size of dimension 2. * \return Pointer to the new array. */ double** resize_dyn_matrix(double **dyn_matrix, //the dynamic programming matrix int old_length1, //old length of dimension 1 int old_length2, //old length of dimension 2 int length1, //new minimum length of dimension 1 int length2) //new maximum length of dimension 2 { int i; if (old_length1 < length1) { dyn_matrix = vrealloc(dyn_matrix,length1*sizeof(double*)); for (i = old_length1; i < length1; ++i) dyn_matrix[i] = vcalloc(old_length2,sizeof(double)); old_length1 = length1; } if (old_length2 < length2) { for (i = 0;ialphabet_size; int i,j; for (i =0; i < 3; ++i) { profiles[i] = vcalloc(1,sizeof(Fastal_profile)); profiles[i]->weight = 1; profiles[i]->is_leaf = 1; profiles[i]->prf = vcalloc(alphabet_size, sizeof(int*)); for (j = 0; j < alphabet_size; ++j) { profiles[i]->prf[j] = vcalloc(PROFILE_ENLARGEMENT, sizeof(int)); } profiles[i]->allocated_memory = PROFILE_ENLARGEMENT; } } /** * \brief frees all memory occupied by the profile * * \param profile The profile to free. * \param alphabet_size The alphabet_size. */ void free_fastal_profile(Fastal_profile* profile, int alphabet_size) { --alphabet_size; for (;alphabet_size >= 0; --alphabet_size) vfree(profile->prf[alphabet_size]); vfree(profile->prf); } /** * \brief Initialize the Fastal parameter set. * * \param is_dna 1 when sequences are dna, 0 when not * \param param_set The fastal parameter set. * \param method The method to use in Fastal. */ void fill_parameters(int is_dna, Fastal_param *param_set, char *method, char *diag_method, char *mat) { sprintf(param_set->method,"%s",method); sprintf(param_set->diag_method,"%s",diag_method); int i; printf("%s",mat); param_set->M = read_matrice(mat); if (is_dna) { param_set->alphabet_size = 4; char tmp1[] = {'A','C','G','T'}; // int tmp2[] = { 0, 1, 1, 0, -1, -1, 2, 0, -1, -1, 3, -1, 1, 0, -1, -1, -1, 0, 1, 3, 4, -1, 3, -1, 1, -1}; for (i = 0; ialphabet_size; ++i) param_set->pos2char[i] = tmp1[i]; // for (i = 0; i<26; ++i) // param_set->char2pos[i] = tmp2[i]; param_set->char2pos['A'] = 0; param_set->char2pos['B'] = 1; param_set->char2pos['C'] = 1; param_set->char2pos['D'] = 0; param_set->char2pos['G'] = 2; param_set->char2pos['H'] = 0; param_set->char2pos['K'] = 3; param_set->char2pos['M'] = 1; param_set->char2pos['N'] = 0; param_set->char2pos['R'] = 0; param_set->char2pos['S'] = 1; param_set->char2pos['T'] = 3; param_set->char2pos['U'] = 3; param_set->char2pos['W'] = 3; param_set->char2pos['Y'] = 1; // param_set->M[0][3] = param_set->M[3][0] = -10; // param_set->M[1][2] = param_set->M[2][1] = -10; // param_set->M[0][1] = param_set->M[0][2] = param_set->M[1][0] = param_set->M[2][0] = -10; // param_set->M[3][1] = param_set->M[3][2] = param_set->M[1][3] = param_set->M[2][3] = -10; } else { param_set->alphabet_size = 21; char tmp1[] = {'A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y','X'}; // int tmp2[] = { 0, 20, 1, 5, 16, 4, 2, 6, 7, 21, 8, 9, 10, 11, -1, 12, 13, 14, 15, 3, -1, 17, 18, 22, 19, 23}; for (i = 0; ialphabet_size; ++i) param_set->pos2char[i] = tmp1[i]; // for (i = 0; i<26; ++i) // param_set->char2pos[i] = tmp2[i]; param_set->char2pos['A'] = 0; param_set->char2pos['B'] = 20; param_set->char2pos['C'] = 1; param_set->char2pos['D'] = 2; param_set->char2pos['E'] = 3; param_set->char2pos['F'] = 4; param_set->char2pos['G'] = 5; param_set->char2pos['H'] = 6; param_set->char2pos['I'] = 7; param_set->char2pos['J'] = 20; param_set->char2pos['K'] = 8; param_set->char2pos['L'] = 9; param_set->char2pos['M'] = 10; param_set->char2pos['N'] = 11; param_set->char2pos['P'] = 12; param_set->char2pos['Q'] = 13; param_set->char2pos['R'] = 14; param_set->char2pos['S'] = 15; param_set->char2pos['T'] = 16; param_set->char2pos['V'] = 17; param_set->char2pos['W'] = 18; param_set->char2pos['X'] = 20; param_set->char2pos['Y'] = 19; param_set->char2pos['X'] = 20; } } /******************************COPYRIGHT NOTICE*******************************/ /*© Centro de Regulacio Genomica */ /*and */ /*Cedric Notredame */ /*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */ /*All rights reserved.*/ /*This file is part of T-COFFEE.*/ /**/ /* T-COFFEE is free software; you can redistribute it and/or modify*/ /* it under the terms of the GNU General Public License as published by*/ /* the Free Software Foundation; either version 2 of the License, or*/ /* (at your option) any later version.*/ /**/ /* T-COFFEE is distributed in the hope that it will be useful,*/ /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/ /* GNU General Public License for more details.*/ /**/ /* You should have received a copy of the GNU General Public License*/ /* along with Foobar; if not, write to the Free Software*/ /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/ /*............................................... |*/ /* If you need some more information*/ /* cedric.notredame@europe.com*/ /*............................................... |*/ /**/ /**/ /* */ /******************************COPYRIGHT NOTICE*******************************/