+++ /dev/null
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <ctype.h>
-#include <string.h>
-// #include <unistd.h>
-
-#include "io_lib_header.h"
-#include "util_lib_header.h"
-#include "define_header.h"
-#include "dp_lib_header.h"
-#include "fastal_lib_header.h"
-#include "fast_tree_header.h"
-
-
-// #include <omp.h>
-// #define CHUNKSIZE 100
-// #define N 1000
-
-
-//Fastal_param *param_set;
-
-
-/*! \mainpage T-Coffee Index Page
- *
- * \section intro_sec Introduction
- *
- * This is the introduction.
- *
- * \section install_sec Installation
- *
- * \subsection step1 Step 1: Opening the box
- *
- * etc...
- * \section fastal_sec Fastal
- *
- * This program is a very fast aligner. It is capable of aligning huge sets of sequences because it keeps as much as necessary on hard disk.
- */
-
-
-
-
-/*!
- * \file fastal.c
- * \brief Source code for the fastal algorithm
- */
-
-
-
-//************************** sparse dynamic aligning **********************************************************
-
-
-void
-fill_arguments_sparse(Sparse_dynamic_param* method_arguments_p)
-{
- method_arguments_p->diagonals = vcalloc(3,sizeof(Diagonal));
- method_arguments_p->dig_length = vcalloc(1,sizeof(int));
- *method_arguments_p->dig_length = 3;
- method_arguments_p->list = NULL;
- method_arguments_p->list_length = vcalloc(1,sizeof(int));
- *method_arguments_p->list_length = 0;
- method_arguments_p->file_name1 = vtmpnam(NULL);
- method_arguments_p->file_name2 = vtmpnam(NULL);
-}
-
-void
-free_sparse(Sparse_dynamic_param* method_arguments_p)
-{
- vfree(method_arguments_p->diagonals);
- vfree(method_arguments_p->dig_length);
- vfree(method_arguments_p->list_length);
-}
-
-
-/**
- * \brief One run of sparse dynamic programming.
- *
- * \param profiles The profiles.
- * \param param_set The fastal parameters.
- * \param method_arguments_p The method arguments.
- * \param is_dna Sequences are DNA (\a is_dna = 1) or protein.
- * \param edit_file The edit file.
- * \param prof_file the profile file.
- * \param number Number of the parent node.
- * \return The length of the alignment.
- */
-int
-sparse_dyn(Fastal_profile **profiles,
- Fastal_param *param_set,
- void *method_arguments_p,
- int is_dna,
- FILE *edit_file,
- FILE *prof_file,
- int number)
-{
-// printf("WHAT THE HELL ARE YOU DOING HERE?\n");
- Sparse_dynamic_param *arguments = (Sparse_dynamic_param*)method_arguments_p;
-// static char *file_name1 = vtmpnam(NULL);
-// static char *file_name2 = vtmpnam(NULL);
- char *file_name1 = arguments->file_name1;
- char *file_name2 = arguments->file_name2;
- char *seq1, *seq2;
- Fastal_profile *tmp1 = profiles[0];
- Fastal_profile *tmp2 = profiles[1];
-
- seq1 = profile2consensus(tmp1, param_set);
- seq2 = profile2consensus(tmp2, param_set);
-
-
- int **diagonals_p = &(arguments->diagonals);
- int num_diagonals = -1;
- if (!strcmp(param_set->diag_method, "blastz"))
- {
- FILE *cons_f = fopen(file_name1,"w");
- fprintf(cons_f, ">%i\n", tmp1->prf_number);
- fprintf(cons_f, "%s", seq1);
- fprintf( cons_f, "\n");
- fclose(cons_f);
- cons_f = fopen(file_name2,"w");
- fprintf(cons_f, ">%i\n", tmp2->prf_number);
- fprintf(cons_f, "%s", seq2);
- fprintf( cons_f, "\n");
- fclose(cons_f);
- num_diagonals = seq_pair2blastz_diagonal(file_name1, file_name2, diagonals_p, arguments->dig_length, strlen(seq1),strlen(seq2), is_dna);
- }
- else if (!strcmp(param_set->diag_method, "blast"))
- {
- FILE *cons_f = fopen(file_name1,"w");
- fprintf(cons_f, ">%i\n", tmp1->prf_number);
- fprintf(cons_f, "%s", seq1);
- fprintf( cons_f, "\n");
- fclose(cons_f);
- cons_f = fopen(file_name2,"w");
- fprintf(cons_f, ">%i\n", tmp2->prf_number);
- fprintf(cons_f, "%s", seq2);
- fprintf( cons_f, "\n");
- fclose(cons_f);
- int l1 = strlen(seq1);
- int l2 = strlen(seq2);
- num_diagonals = seq_pair2blast_diagonal(file_name1, file_name2, diagonals_p, arguments->dig_length, l1, l2, is_dna);
-// int *num_p = &num_diagonals;
-// Segment* seg = extend_diagonals(*diagonals_p, num_p, l1, l2);
-// printf("A: %i\n", num_diagonals);
- }
- else if (!strcmp(param_set->diag_method, "ktup"))
- {
- num_diagonals = seq_pair2diagonal_own(seq1, seq2, diagonals_p, arguments->dig_length, strlen(seq1),strlen(seq2), is_dna, 3);
-// num_diagonals = seq_pair2diagonal_swift(seq1, seq2, diagonals_p, arguments->dig_length, strlen(seq1),strlen(seq2), is_dna, 3);
- }
-
-
-// arguments->diagonals = diagonals_p[0];
-// arguments->list = segments2int(seg, *num_p, seq1, seq2, profiles[0], profiles[1], arguments->list_length, param_set);
-
-// t ** segments2int_gap(Segment *diagonals, int num_diagonals, char *seq1, char *seq2, Fastal_profile *profile1, Fastal_profile *profile2, int *num_points, Fastal_param *param_set);
-
-// arguments->list = diagonals2int_dot(arguments->diagonals, num_diagonals, seq1, seq2, profiles[0], profiles[1], arguments->list_length, param_set);
-// arguments->list = diagonals2int_euclidf(arguments->diagonals, num_diagonals, seq1, seq2, profiles[0], profiles[1], arguments->list_length, param_set);
- arguments->list = diagonals2int_gap_test(arguments->diagonals, num_diagonals, seq1, seq2, profiles[0], profiles[1], arguments->list_length, param_set);
-// arguments->list = diagonals2int(arguments->diagonals, num_diagonals, seq1, seq2, arguments->list_length, param_set);
- int alignment_length = list2linked_pair_wise_fastal(profiles[0], profiles[1], param_set, arguments->list, *arguments->list_length, edit_file, prof_file, number);
- int x;
-
- for (x = 0; x < *arguments->list_length; ++x)
- {
- vfree(arguments->list[x]);
- }
- vfree(arguments->list);
- arguments->list = NULL;
- vfree(seq1);
- vfree(seq2);
- return alignment_length;
-}
-
-
-int
-fastal_compare (const void * a, const void * b)
-{
- return (*(int*)a - *(int*)b);
-}
-
-
-/**
- * \brief Makes a sorted list out of diagonals.
- *
- * \param diagonals A list of diagonals to use during dynamic programming.
- * \param num_diagonals Number of diagonals.
- * \param seq1 Sequence 1.
- * \param seq2 Sequence 2.
- * \param num_points Number of points in the list
- * \param param_set Fastal parameters.
- * \return A 2-dim array which contains all points needed for the sparse dynamic programming algorithm.
- */
-int **
-diagonals2int(int *diagonals,
- int num_diagonals,
- char *seq1,
- char *seq2,
- int *num_points,
- Fastal_param *param_set)
-{
-
- int l1 = strlen(seq1);
- int l2 = strlen(seq2);
- int gep = param_set->gep;
-
- int dig_length;
- if (seq1 > seq2)
- dig_length = l1;
- else
- dig_length = l2;
-
- int current_size = num_diagonals*dig_length + l1 +l2;
-
- int **list = vcalloc(current_size, sizeof(int*));
- int *diags = vcalloc(num_diagonals, sizeof(int));
- int i;
- for (i = 0; i < num_diagonals; ++i)
- {
- diags[i] = l1 - diagonals[i*3] + diagonals[i*3+1];
- }
-
- qsort (diags, num_diagonals, sizeof(int), fastal_compare);
-
-
- int *diagx = vcalloc(num_diagonals, sizeof(int));
- int *diagy = vcalloc(num_diagonals, sizeof(int));
-
-
- //+1 because diagonals start here at position 1, like in "real" dynamic programming
- int a = -1, b = -1;
- for (i = 0; i < num_diagonals; ++i)
- {
- if (diags[i] < l1)
- {
- diagx[i] = l1 - diags[i];
- diagy[i] = 0;
- a= i;
- }
- else
- break;
- }
- ++a;
- b=a-1;
- for (; i < num_diagonals; ++i)
- {
- diagx[i] = 0;
- diagy[i] = diags[i]-l1;
- b = i;
- }
-
- vfree(diags);
- int tmpy_pos;
- int tmpy_value;
- int **M = param_set->M;
- int *last_y = vcalloc(l2+1, sizeof(int));
- int *last_x = vcalloc(l1+1, sizeof(int));
- last_y[0] = 0;
-
- last_x[0] = 0;
- list[0] = vcalloc(6, sizeof(int));
-
- int list_pos = 1;
- int dig_num = l1;
- int tmp_l2 = l2 + 1;
-
- //left border
- for (; list_pos < tmp_l2; ++list_pos)
- {
- list[list_pos] = vcalloc(6, sizeof(int));
- list[list_pos][0] = 0;
- list[list_pos][1] = list_pos;
- last_y[list_pos] = list_pos;
- list[list_pos][2] = list_pos*gep;
- list[list_pos][4] = list_pos-1;
- }
-
- int pos_x = 0;
- int y;
- int tmp_l1 = l1-1;
- while (pos_x < tmp_l1)
- {
- if (list_pos + num_diagonals+2 > current_size)
- {
- current_size += num_diagonals*1000;
- list = vrealloc(list, current_size * sizeof(int*));
- }
- //upper border
- list[list_pos] = vcalloc(6, sizeof(int));
- list[list_pos][0] = ++pos_x;
- list[list_pos][1] = 0;
- list[list_pos][2] = pos_x * gep;
- list[list_pos][3] = last_y[0];
- tmpy_value = list_pos;
- tmpy_pos = 0;
- last_x[pos_x] = list_pos;
- ++list_pos;
-
- //diagonals
- for (i = a; i <= b; ++i)
- {
- list[list_pos] = vcalloc(6, sizeof(int));
-
- list[list_pos][0] = ++diagx[i];
-
- list[list_pos][1] = ++diagy[i];
- list[list_pos][3] = last_y[diagy[i]];
- list[list_pos][4] = list_pos-1;
- list[list_pos][5] = last_y[diagy[i]-1];
- list[list_pos][2] = M[toupper(seq1[diagx[i]-1])-'A'][toupper(seq2[diagy[i]-1])-'A'];
- last_y[tmpy_pos] = tmpy_value;
- tmpy_value = list_pos;
- tmpy_pos = diagy[i];
-
- ++list_pos;
- }
- last_y[tmpy_pos] = tmpy_value;
-
-
- //lower border
- if (list[list_pos-1][1] != l2)
- {
- list[list_pos] = vcalloc(6, sizeof(int));
- list[list_pos][0] = pos_x;
- list[list_pos][1] = l2;
- list[list_pos][3] = last_y[l2];
-
- list[list_pos][2] = -1000;
- list[list_pos][4] = list_pos-1;
- if (pos_x > l2)
- list[list_pos][5] = last_x[pos_x-l2];
- else
- list[list_pos][5] = l2-pos_x;
- last_y[l2] = list_pos;
- ++list_pos;
-
- }
-
-
- if ((b >= 0) && (diagy[b] == l2))
- --b;
-
- if ((a >0) && (diagx[a-1] == pos_x))
- --a;
- }
-
-
- dig_num = -1;
- if (list_pos + l2+2 > current_size)
- {
- current_size += list_pos + l2 + 2;
- list = vrealloc(list, current_size * sizeof(int*));
- }
-
-
-// right border
- list[list_pos] = vcalloc(6, sizeof(int));
- list[list_pos][0] = l1;
- list[list_pos][1] = 0;
- list[list_pos][3] = last_x[l1-1];
- list[list_pos][2] = -1000;
- ++list_pos;
-
-
-
- for (i = 1; i <= l2; ++i)
- {
- list[list_pos] = vcalloc(6, sizeof(int));
- list[list_pos][0] = l1;
- list[list_pos][1] = i;
- list[list_pos][3] = last_y[i];
- list[list_pos][4] = list_pos-1;
- y = last_y[i-1];
- if ((list[y][0] == l1-1) && (list[y][1] == i-1))
- {
- list[list_pos][5] = y;
- list[list_pos][2] = M[toupper(seq1[l1-1])-'A'][toupper(seq2[i-1])-'A'];
- }
- else
- {
- if (i <= l1)
- {
- list[list_pos][5] = last_x[l1-i];
- }
- else
- {
- list[list_pos][5] = i-l1;
- }
- list[list_pos][2] = -1000;
- }
- ++list_pos;
- }
-
- list[list_pos - l2][2] = -1000;
-
- *num_points = list_pos;
- vfree(diagx);
- vfree(diagy);
-
-
- return list;
-}
-
-
-/**
- * \brief Makes a sorted list out of diagonals.
- *
- * \param diagonals A list of diagonals to use during dynamic programming.
- * \param num_diagonals Number of diagonals.
- * \param seq1 Sequence 1.
- * \param seq2 Sequence 2.
- * \param num_points Number of points in the list
- * \param param_set Fastal parameters.
- * \return A 2-dim array which contains all points needed for the sparse dynamic programming algorithm.
- */
-int **
-diagonals2int_gap_test(int *diagonals, int num_diagonals, char *seq1, char *seq2, Fastal_profile *profile1, Fastal_profile *profile2, int *num_points, Fastal_param *param_set)
-{
- int alphabet_size = param_set->alphabet_size;
- int l1 = strlen(seq1);
- int l2 = strlen(seq2);
- int gep = param_set->gep;
-
- int current_size = l2+l1;
-
- int **list = vcalloc(current_size, sizeof(int*));
- int *diags = vcalloc(num_diagonals, sizeof(int));
- int i;
- for (i = 0; i < num_diagonals; ++i)
- {
- diags[i] = l1 - diagonals[i*3] + diagonals[i*3+1];
- }
- qsort (diags, num_diagonals, sizeof(int), fastal_compare);
-
-
- int *diagx = vcalloc(num_diagonals, sizeof(int));
- int *diagy = vcalloc(num_diagonals, sizeof(int));
- int *old_pos = vcalloc(num_diagonals, sizeof(int));
-
- //+1 because diagonals start here at position 1, like in "real" dynamic programming
- int a = -1, b = -1;
- for (i = 0; i < num_diagonals; ++i)
- {
-
- if (diags[i] < l1)
- {
- diagx[i] = l1 - diags[i];
- diagy[i] = 0;
- a= i;
- }
- else
- break;
- }
- ++a;
- b=a-1;
- for (; i < num_diagonals; ++i)
- {
- diagx[i] = 0;
- diagy[i] = diags[i]-l1;
- b = i;
- }
-
- vfree(diags);
- int tmpy_pos;
- int tmpy_value;
- int **M = param_set->M;
- int *last_y = vcalloc(l2+1, sizeof(int));
- int *last_x = vcalloc(l1+1, sizeof(int));
- last_y[0] = 0;
-
- last_x[0] = 0;
- list[0] = vcalloc(6, sizeof(int));
-
- int list_pos = 1;
- int dig_num = l1;
- int tmp_l2 = l2 + 1;
-
- //left border
- for (; list_pos < tmp_l2; ++list_pos)
- {
- list[list_pos] = vcalloc(6, sizeof(int));
- list[list_pos][0] = 0;
- list[list_pos][1] = list_pos;
- last_y[list_pos] = list_pos;
- list[list_pos][2] = list_pos*gep;
- list[list_pos][4] = list_pos-1;
- }
-
- int pos_x = 0;
-// int diags_old = l2;
-
-// int tmp = l1;
- int y;
- int tmp_l1 = l1-1;
- while (pos_x < tmp_l1)
- {
- if (list_pos + num_diagonals+2 > current_size)
- {
- current_size += num_diagonals*1000;
- list = vrealloc(list, current_size * sizeof(int*));
- }
- //upper border
- list[list_pos] = vcalloc(6, sizeof(int));
- list[list_pos][0] = ++pos_x;
- list[list_pos][1] = 0;
- list[list_pos][2] = pos_x * gep;
- list[list_pos][3] = last_y[0];
- tmpy_value = list_pos;
- tmpy_pos = 0;
- last_x[pos_x] = list_pos;
- ++list_pos;
-
- //diagonals
- for (i = a; i <= b; ++i)
- {
- list[list_pos] = vcalloc(6, sizeof(int));
-
- list[list_pos][0] = ++diagx[i];
-
- list[list_pos][1] = ++diagy[i];
- list[list_pos][3] = last_y[diagy[i]];
- list[list_pos][4] = list_pos-1;
- list[list_pos][5] = last_y[diagy[i]-1];
-
-
-
-
-//SIMPLEGAP
- int num_seq = profile1->number_of_sequences + profile2->number_of_sequences;
- double gap_num = 0;
- int char_c;
- for (char_c = 0; char_c < alphabet_size; ++char_c)
- {
-
- gap_num += profile1->prf[char_c][diagx[i]-1] + profile2->prf[char_c][diagy[i]-1];
- }
-
- gap_num /= num_seq;
-
- list[list_pos][2] = M[toupper(seq1[diagx[i]-1])-'A'][toupper(seq2[diagy[i]-1])-'A'] * gap_num;
-
-// CLUSTAL
-// int num_seq = profile1->number_of_sequences + profile2->number_of_sequences;
-// double gap_num = 0;
-// int char_c, char_c2;
-// for (char_c = 0; char_c < alphabet_size; ++char_c)
-// for (char_c2 = 0; char_c2 < alphabet_size; ++char_c2)
-// {
-// gap_num += (profile1->prf[char_c][diagx[i]-1]/profile1->number_of_sequences) * (profile2->prf[char_c][diagy[i]-1]/profile2->number_of_sequences) * M[param_set->pos2char[char_c]-'A'][param_set->pos2char[char_c2]-'A'];
-// }
-// list[list_pos][2] = gap_num;
-
-
- last_y[tmpy_pos] = tmpy_value;
- tmpy_value = list_pos;
- tmpy_pos = diagy[i];
-
- ++list_pos;
- }
- last_y[tmpy_pos] = tmpy_value;
-
-
- //lower border
- if (list[list_pos-1][1] != l2)
- {
- list[list_pos] = vcalloc(6, sizeof(int));
- list[list_pos][0] = pos_x;
- list[list_pos][1] = l2;
- list[list_pos][3] = last_y[l2];
-
- list[list_pos][2] = -1000;
- list[list_pos][4] = list_pos-1;
- if (pos_x > l2)
- list[list_pos][5] = last_x[pos_x-l2];
- else
- list[list_pos][5] = l2-pos_x;
- last_y[l2] = list_pos;
- ++list_pos;
-
- }
-
-
- if ((b >= 0) && (diagy[b] == l2))
- --b;
-
- if ((a >0) && (diagx[a-1] == pos_x))
- --a;
- }
-
-
- dig_num = -1;
- if (list_pos + l2+2 > current_size)
- {
- current_size += list_pos + l2 + 2;
- list = vrealloc(list, current_size * sizeof(int*));
- }
-
-
-// right border
- list[list_pos] = vcalloc(6, sizeof(int));
- list[list_pos][0] = l1;
- list[list_pos][1] = 0;
- list[list_pos][3] = last_x[l1-1];
- list[list_pos][2] = -1000;
- ++list_pos;
-
-
-
- for (i = 1; i <= l2; ++i)
- {
- list[list_pos] = vcalloc(6, sizeof(int));
- list[list_pos][0] = l1;
- list[list_pos][1] = i;
- list[list_pos][3] = last_y[i];
- list[list_pos][4] = list_pos-1;
- y = last_y[i-1];
- if ((list[y][0] == l1-1) && (list[y][1] == i-1))
- {
- list[list_pos][5] = y;
- int num_seq = profile1->number_of_sequences + profile2->number_of_sequences;
- double gap_num = 0;
- int char_c;
- for (char_c = 0; char_c < alphabet_size; ++char_c)
- {
- gap_num += profile1->prf[char_c][l1-1] + profile2->prf[char_c][i-1];
- }
-
- gap_num /= num_seq;
-
- list[list_pos][2] = M[toupper(seq1[l1-1])-'A'][toupper(seq2[i-1])-'A'] * gap_num;
- }
- else
- {
- if (i <= l1)
- {
- list[list_pos][5] = last_x[l1-i];
- }
- else
- {
- list[list_pos][5] = i-l1;
- }
- list[list_pos][2] = -1000;
- }
- ++list_pos;
- }
-
- list[list_pos - l2][2] = -1000;
-
- *num_points = list_pos;
- vfree(diagx);
- vfree(diagy);
- vfree(old_pos);
-
- return list;
-}
-
-
-int **
-diagonals2int_euclidf(int *diagonals, int num_diagonals, char *seq1, char *seq2, Fastal_profile *profile1, Fastal_profile *profile2, int *num_points, Fastal_param *param_set)
-{
- int alphabet_size = param_set->alphabet_size;
- int l1 = strlen(seq1);
- int l2 = strlen(seq2);
- int gep = param_set->gep;
-
- int current_size = l2+l1;
-
- int **list = vcalloc(current_size, sizeof(int*));
- int *diags = vcalloc(num_diagonals, sizeof(int));
- int i;
- for (i = 0; i < num_diagonals; ++i)
- {
- diags[i] = l1 - diagonals[i*3] + diagonals[i*3+1];
- }
-
- qsort (diags, num_diagonals, sizeof(int), fastal_compare);
-
-
- int *diagx = vcalloc(num_diagonals, sizeof(int));
- int *diagy = vcalloc(num_diagonals, sizeof(int));
- int *old_pos = vcalloc(num_diagonals, sizeof(int));
-
- //+1 because diagonals start here at position 1, like in "real" dynamic programming
- int a = -1, b = -1;
- for (i = 0; i < num_diagonals; ++i)
- {
-
- if (diags[i] < l1)
- {
- diagx[i] = l1 - diags[i];
- diagy[i] = 0;
- a= i;
- }
- else
- break;
- }
- ++a;
- b=a-1;
- for (; i < num_diagonals; ++i)
- {
- diagx[i] = 0;
- diagy[i] = diags[i]-l1;
- b = i;
- }
-
- vfree(diags);
- int tmpy_pos;
- int tmpy_value;
-// int **M = param_set->M;
- int *last_y = vcalloc(l2+1, sizeof(int));
- int *last_x = vcalloc(l1+1, sizeof(int));
- last_y[0] = 0;
-
- last_x[0] = 0;
- list[0] = vcalloc(6, sizeof(int));
-
- int list_pos = 1;
- int dig_num = l1;
- int tmp_l2 = l2 + 1;
-
- //left border
- for (; list_pos < tmp_l2; ++list_pos)
- {
- list[list_pos] = vcalloc(6, sizeof(int));
- list[list_pos][0] = 0;
- list[list_pos][1] = list_pos;
- last_y[list_pos] = list_pos;
- list[list_pos][2] = list_pos*gep;
- list[list_pos][4] = list_pos-1;
- }
-
- int pos_x = 0;
-// int diags_old = l2;
-
-// int tmp = l1;
- int y;
- int tmp_l1 = l1-1;
- while (pos_x < tmp_l1)
- {
- if (list_pos + num_diagonals+2 > current_size)
- {
- current_size += num_diagonals*1000;
- list = vrealloc(list, current_size * sizeof(int*));
- }
- //upper border
- list[list_pos] = vcalloc(6, sizeof(int));
- list[list_pos][0] = ++pos_x;
- list[list_pos][1] = 0;
- list[list_pos][2] = pos_x * gep;
- list[list_pos][3] = last_y[0];
- tmpy_value = list_pos;
- tmpy_pos = 0;
- last_x[pos_x] = list_pos;
- ++list_pos;
-
- //diagonals
- for (i = a; i <= b; ++i)
- {
- list[list_pos] = vcalloc(6, sizeof(int));
-
- list[list_pos][0] = ++diagx[i];
-
- list[list_pos][1] = ++diagy[i];
- list[list_pos][3] = last_y[diagy[i]];
- list[list_pos][4] = list_pos-1;
- list[list_pos][5] = last_y[diagy[i]-1];
- int char_c;
- double tmp_score = 0;
- double freq1, freq2;
- for (char_c = 0; char_c < alphabet_size; ++char_c)
- {
- freq1 = (double)profile1->prf[char_c][diagx[i]-1] / profile1->number_of_sequences;
-
- freq2 = (double)profile2->prf[char_c][diagy[i]-1] / profile2->number_of_sequences;
-
- tmp_score += ( freq1 - freq2) * (freq1 - freq2);
- }
-
- list[list_pos][2] = 10 - sqrt(tmp_score);
-
- last_y[tmpy_pos] = tmpy_value;
- tmpy_value = list_pos;
- tmpy_pos = diagy[i];
-
- ++list_pos;
- }
- last_y[tmpy_pos] = tmpy_value;
-
-
- //lower border
- if (list[list_pos-1][1] != l2)
- {
- list[list_pos] = vcalloc(6, sizeof(int));
- list[list_pos][0] = pos_x;
- list[list_pos][1] = l2;
- list[list_pos][3] = last_y[l2];
-
- list[list_pos][2] = -1000;
- list[list_pos][4] = list_pos-1;
- if (pos_x > l2)
- list[list_pos][5] = last_x[pos_x-l2];
- else
- list[list_pos][5] = l2-pos_x;
- last_y[l2] = list_pos;
- ++list_pos;
-
- }
-
-
- if ((b >= 0) && (diagy[b] == l2))
- --b;
-
- if ((a >0) && (diagx[a-1] == pos_x))
- --a;
- }
-
-
- dig_num = -1;
- if (list_pos + l2+2 > current_size)
- {
- current_size += list_pos + l2 + 2;
- list = vrealloc(list, current_size * sizeof(int*));
- }
-
-
-// right border
- list[list_pos] = vcalloc(6, sizeof(int));
- list[list_pos][0] = l1;
- list[list_pos][1] = 0;
- list[list_pos][3] = last_x[l1-1];
- list[list_pos][2] = -1000;
- ++list_pos;
-
-
-
- for (i = 1; i <= l2; ++i)
- {
- list[list_pos] = vcalloc(6, sizeof(int));
- list[list_pos][0] = l1;
- list[list_pos][1] = i;
- list[list_pos][3] = last_y[i];
- list[list_pos][4] = list_pos-1;
- y = last_y[i-1];
- if ((list[y][0] == l1-1) && (list[y][1] == i-1))
- {
- list[list_pos][5] = y;
- int char_c;
- int tmp_score = 0;
- double freq1, freq2;
- for (char_c = 0; char_c < alphabet_size; ++char_c)
- {
- freq1 = profile1->prf[char_c][l1-1] / profile1->number_of_sequences;
- freq2 = profile2->prf[char_c][i-1] / profile2->number_of_sequences;
- tmp_score += ( freq1 - freq2) * (freq2 - freq1);
- }
- list[list_pos][2] = 10 - sqrt(tmp_score);
-// list[list_pos][2] = M[toupper(seq1[l1-1])-'A'][toupper(seq2[i-1])-'A'];
- }
- else
- {
- if (i <= l1)
- {
- list[list_pos][5] = last_x[l1-i];
- }
- else
- {
- list[list_pos][5] = i-l1;
- }
- list[list_pos][2] = -1000;
- }
- ++list_pos;
- }
-
- list[list_pos - l2][2] = -1000;
-
- *num_points = list_pos;
- vfree(diagx);
- vfree(diagy);
- vfree(old_pos);
-
- return list;
-}
-
-int **
-diagonals2int_dot(int *diagonals, int num_diagonals, char *seq1, char *seq2, Fastal_profile *profile1, Fastal_profile *profile2, int *num_points, Fastal_param *param_set)
-{
- int alphabet_size = param_set->alphabet_size;
- int l1 = strlen(seq1);
- int l2 = strlen(seq2);
- int gep = param_set->gep;
-
- int current_size = l2+l1;
-
- int **list = vcalloc(current_size, sizeof(int*));
- int *diags = vcalloc(num_diagonals, sizeof(int));
- int i;
- for (i = 0; i < num_diagonals; ++i)
- {
- diags[i] = l1 - diagonals[i*3] + diagonals[i*3+1];
- }
-
- qsort (diags, num_diagonals, sizeof(int), fastal_compare);
-
-
- int *diagx = vcalloc(num_diagonals, sizeof(int));
- int *diagy = vcalloc(num_diagonals, sizeof(int));
- int *old_pos = vcalloc(num_diagonals, sizeof(int));
-
- //+1 because diagonals start here at position 1, like in "real" dynamic programming
- int a = -1, b = -1;
- for (i = 0; i < num_diagonals; ++i)
- {
-
- if (diags[i] < l1)
- {
- diagx[i] = l1 - diags[i];
- diagy[i] = 0;
- a= i;
- }
- else
- break;
- }
- ++a;
- b=a-1;
- for (; i < num_diagonals; ++i)
- {
- diagx[i] = 0;
- diagy[i] = diags[i]-l1;
- b = i;
- }
-
- vfree(diags);
- int tmpy_pos;
- int tmpy_value;
-// int **M = param_set->M;
- int *last_y = vcalloc(l2+1, sizeof(int));
- int *last_x = vcalloc(l1+1, sizeof(int));
- last_y[0] = 0;
-
- last_x[0] = 0;
- list[0] = vcalloc(6, sizeof(int));
-
- int list_pos = 1;
- int dig_num = l1;
- int tmp_l2 = l2 + 1;
-
- //left border
- for (; list_pos < tmp_l2; ++list_pos)
- {
- list[list_pos] = vcalloc(6, sizeof(int));
- list[list_pos][0] = 0;
- list[list_pos][1] = list_pos;
- last_y[list_pos] = list_pos;
- list[list_pos][2] = list_pos*gep;
- list[list_pos][4] = list_pos-1;
- }
-
- int pos_x = 0;
-// int diags_old = l2;
-
-// int tmp = l1;
- int y;
- int tmp_l1 = l1-1;
- while (pos_x < tmp_l1)
- {
- if (list_pos + num_diagonals+2 > current_size)
- {
- current_size += num_diagonals*1000;
- list = vrealloc(list, current_size * sizeof(int*));
- }
- //upper border
- list[list_pos] = vcalloc(6, sizeof(int));
- list[list_pos][0] = ++pos_x;
- list[list_pos][1] = 0;
- list[list_pos][2] = pos_x * gep;
- list[list_pos][3] = last_y[0];
- tmpy_value = list_pos;
- tmpy_pos = 0;
- last_x[pos_x] = list_pos;
- ++list_pos;
-
- //diagonals
- for (i = a; i <= b; ++i)
- {
- list[list_pos] = vcalloc(6, sizeof(int));
-
- list[list_pos][0] = ++diagx[i];
-
- list[list_pos][1] = ++diagy[i];
- list[list_pos][3] = last_y[diagy[i]];
- list[list_pos][4] = list_pos-1;
- list[list_pos][5] = last_y[diagy[i]-1];
- int char_c;
- double tmp_score = 0;
- double freq1, freq2;
- for (char_c = 0; char_c < alphabet_size; ++char_c)
- {
- freq1 = (double)profile1->prf[char_c][diagx[i]-1] / profile1->number_of_sequences;
-
- freq2 = (double)profile2->prf[char_c][diagy[i]-1] / profile2->number_of_sequences;
-
- tmp_score += freq1 * freq2;
- }
-
- list[list_pos][2] = tmp_score * 10;
-
- last_y[tmpy_pos] = tmpy_value;
- tmpy_value = list_pos;
- tmpy_pos = diagy[i];
-
- ++list_pos;
- }
- last_y[tmpy_pos] = tmpy_value;
-
-
- //lower border
- if (list[list_pos-1][1] != l2)
- {
- list[list_pos] = vcalloc(6, sizeof(int));
- list[list_pos][0] = pos_x;
- list[list_pos][1] = l2;
- list[list_pos][3] = last_y[l2];
-
- list[list_pos][2] = -1000;
- list[list_pos][4] = list_pos-1;
- if (pos_x > l2)
- list[list_pos][5] = last_x[pos_x-l2];
- else
- list[list_pos][5] = l2-pos_x;
- last_y[l2] = list_pos;
- ++list_pos;
-
- }
-
-
- if ((b >= 0) && (diagy[b] == l2))
- --b;
-
- if ((a >0) && (diagx[a-1] == pos_x))
- --a;
- }
-
-
- dig_num = -1;
- if (list_pos + l2+2 > current_size)
- {
- current_size += list_pos + l2 + 2;
- list = vrealloc(list, current_size * sizeof(int*));
- }
-
-
-// right border
- list[list_pos] = vcalloc(6, sizeof(int));
- list[list_pos][0] = l1;
- list[list_pos][1] = 0;
- list[list_pos][3] = last_x[l1-1];
- list[list_pos][2] = -1000;
- ++list_pos;
-
-
-
- for (i = 1; i <= l2; ++i)
- {
- list[list_pos] = vcalloc(6, sizeof(int));
- list[list_pos][0] = l1;
- list[list_pos][1] = i;
- list[list_pos][3] = last_y[i];
- list[list_pos][4] = list_pos-1;
- y = last_y[i-1];
- if ((list[y][0] == l1-1) && (list[y][1] == i-1))
- {
- list[list_pos][5] = y;
- int char_c;
- int tmp_score = 0;
- double freq1, freq2;
- for (char_c = 0; char_c < alphabet_size; ++char_c)
- {
- freq1 = profile1->prf[char_c][l1-1] / profile1->number_of_sequences;
- freq2 = profile2->prf[char_c][i-1] / profile2->number_of_sequences;
- tmp_score += freq2 * freq1;
- }
- list[list_pos][2] = tmp_score * 10;
-// list[list_pos][2] = M[toupper(seq1[l1-1])-'A'][toupper(seq2[i-1])-'A'];
- }
- else
- {
- if (i <= l1)
- {
- list[list_pos][5] = last_x[l1-i];
- }
- else
- {
- list[list_pos][5] = i-l1;
- }
- list[list_pos][2] = -1000;
- }
- ++list_pos;
- }
-
- list[list_pos - l2][2] = -1000;
-
- *num_points = list_pos;
- vfree(diagx);
- vfree(diagy);
- vfree(old_pos);
-
- return list;
-}
-
-
-void
-combine_profiles2file(int **prf1,
- int **prf2,
- int pos1,
- int pos2,
- Fastal_param *param_set,
- FILE *prof_f,
- char state)
-{
- int alphabet_size = param_set->alphabet_size;
- char *pos2aa = &(param_set->pos2char[0]);
- int i;
- int x = 0;
- if (state == 'M')
- {
- for (i = 0; i < alphabet_size; ++i)
- if (prf1[i][pos1] + prf2[i][pos2] > 0)
- {
- if (x)
- fprintf(prof_f," %c%i", pos2aa[i],prf1[i][pos1]+prf2[i][pos2]);
- else
- fprintf(prof_f,"%c%i", pos2aa[i],prf1[i][pos1]+prf2[i][pos2]);
- x = 1;
- }
- fprintf(prof_f,"\n");
- }
- else if (state == 'D')
- {
- for (i = 0; i < alphabet_size; ++i)
- if (prf2[i][pos2] > 0)
- {
- if (x)
- fprintf(prof_f," %c%i", pos2aa[i],prf2[i][pos2]);
- else
- fprintf(prof_f,"%c%i", pos2aa[i],prf2[i][pos2]);
- x = 1;
- }
- fprintf(prof_f,"\n");
- }
- else
- {
- for (i = 0; i < alphabet_size; ++i)
- if (prf1[i][pos1] > 0)
- {
- if (x)
- fprintf(prof_f," %c%i", pos2aa[i],prf1[i][pos1]);
- else
- fprintf(prof_f,"%c%i", pos2aa[i],prf1[i][pos1]);
- x = 1;
- }
- fprintf(prof_f,"\n");
- }
-}
-
-
-
-#define LIN(a,b,c) a[b*5+c]
-/**
- * Calculates a fast and sparse dynamic programming matrix
- *
- * \param prf1 Profile of first sequence.
- * \param prf2 Profile of second sequence.
- * \param param_set The parameter for the alignment.
- * \param list The list of diagonals.
- * \param n number of dots.
- * \param edit_f File to save the edit information.
- * \param prof_f File to save the profile.
- * \param node_number Number of the new profile.
- */
-int
-list2linked_pair_wise_fastal(Fastal_profile *prf1,
- Fastal_profile *prf2,
- Fastal_param *param_set,
- int **list,
- int n,
- FILE *edit_f,
- FILE *prof_f,
- int node_number)
-{
- int a,b, i, j, LEN=0, start_trace = -1;
- int pi, pj,ij, delta_i, delta_j, prev_i, prev_j;
-// static int **slist;
- static long *MI, *MJ, *MM,*MT2;
-// static int *sortseq;
- static int max_size;
- int gop, gep, igop, igep;
- int l1, l2, l, ls;
- char **al;
- int ni=0, nj=0;
- long score;
- int nomatch = param_set->nomatch;
-
- l1=prf1->length;
- l2=prf2->length;
-
- al=declare_char (2,l1+l2+1);
-
-
-
- igop=param_set->gop;
- gep=igep=param_set->gep;
- if (n>max_size)
- {
- max_size=n;
-
- vfree (MI);vfree (MJ); vfree (MM);
-
- MI=vcalloc (5*n, sizeof (long));
- MJ=vcalloc (5*n, sizeof (long));
- MM=vcalloc (5*n, sizeof (long));
-
- }
- else
- {
- for (a=0; a<n; a++)
- for (b=0; b<5; b++)
- LIN(MI,a,b)=LIN(MJ,a,b)=LIN(MJ,a,b)=-1000000;
- }
-
- for (a=0; a<n; a++)
- {
- i=list[a][0];
- j=list[a][1];
-
-
- if (i==l1 || j==l2)gop=0;
- else gop=igop;
-
- if (i==l1 && j==l2)start_trace=a;
- else if ( i==0 || j==0)
- {
- LIN(MM,a,0)=-1000000;
- if (j==0)
- {
- LIN(MJ,a,0)=-10000000;
- LIN(MI,a,0)=gep*i;
- }
- else if (i==0)
- {
- LIN(MI,a,0)=-10000000;
- LIN(MJ,a,0)=gep*j;
- }
-
- LIN(MI,a,1)=LIN(MJ,a,1)=-1;
- LIN(MI,a,2)=LIN(MJ,a,2)=i;
- LIN(MI,a,3)=LIN(MJ,a,3)=j;
- continue;
- }
-
- pi = list[a][3];
- pj = list[a][4];
- ij = list[a][5];
-
- prev_i=list[pi][0];
- prev_j=list[pj][1];
-
- delta_i=list[a][0]-list[pi][0];
- delta_j=list[a][1]-list[pj][1];
-
- /*Linear Notation*/
- LIN(MI,a,0)=MAX(LIN(MI,pi,0),(LIN(MM,pi,0)+gop))+delta_i*gep;
- LIN(MI,a,1)=pi;
- LIN(MI,a,2)=delta_i;
- LIN(MI,a,3)=0;
- LIN(MI,a,4)=(LIN(MI,pi,0) >=(LIN(MM,pi,0)+gop))?'i':'m';
-
- LIN(MJ,a,0)=MAX(LIN(MJ,pj,0),(LIN(MM,pj,0)+gop))+delta_j*gep;
- LIN(MJ,a,1)=pj;
- LIN(MJ,a,2)=0;
- LIN(MJ,a,3)=delta_j;
-
- LIN(MJ,a,4)=(LIN(MJ,pj,0) >=LIN(MM,pj,0)+gop)?'j':'m';
-
- if (a>1 && (ls=list[a][0]-list[ij][0])==(list[a][1]-list[ij][1]))
- {
- LIN(MM,a,0)=MAX3(LIN(MM,ij,0),LIN(MI,ij,0),LIN(MJ,ij,0))+list[a][2]-(ls*nomatch);
-
- LIN(MM,a,1)=ij;
- LIN(MM,a,2)=ls;
- LIN(MM,a,3)=ls;
- if ( LIN(MM,ij,0) >=LIN(MI,ij,0) && LIN(MM,ij,0)>=LIN(MJ,ij,0))LIN(MM,a,4)='m';
- else if ( LIN(MI,ij,0) >= LIN(MJ,ij,0))LIN(MM,a,4)='i';
- else LIN(MM,a,4)='j';
-
- }
- else
- {
- LIN(MM,a,0)=UNDEFINED;
- LIN(MM,a,1)=-1;
- }
- }
-
- a=start_trace;
- if (LIN(MM,a,0)>=LIN(MI,a,0) && LIN(MM,a,0) >=LIN(MJ,a,0))MT2=MM;
- else if ( LIN(MI,a,0)>=LIN(MJ,a,0))MT2=MI;
- else MT2=MJ;
-
- score=MAX3(LIN(MM,a,0), LIN(MI,a,0), LIN(MJ,a,0));
-
- i=l1;
- j=l2;
-
- while (!(i==0 &&j==0))
- {
- int next_a;
- l=MAX(LIN(MT2,a,2),LIN(MT2,a,3));
- // HERE ("%c from %c %d %d SCORE=%d [%d %d] [%2d %2d]", T2[a][5],T2[a][4], T2[a][2], T2[a][3], T2[a][0], gop, gep, i, j);
- if (i==0)
- {
- while ( j>0)
- {
- al[0][LEN]=0;
- al[1][LEN]=1;
- j--; LEN++;
- }
- }
- else if (j==0)
- {
- while ( i>0)
- {
- al[0][LEN]=1;
- al[1][LEN]=0;
- i--; LEN++;
- }
- }
-
-// else if (l==0) {HERE ("L=0 i=%d j=%d",l, i, j);exit (0);}
- else
- {
- for (b=0; b<l; b++, LEN++)
- {
- if (LIN(MT2,a,2)){al[0][LEN]=1;i--;ni++;}
- else al[0][LEN]=0;
-
- if (LIN(MT2,a,3)){al[1][LEN]=1;j--;nj++;}
- else al[1][LEN]=0;
- }
-
- next_a=LIN(MT2,a,1);
- if (LIN(MT2,a,4)=='m')MT2=MM;
- else if (LIN(MT2,a,4)=='i')MT2=MI;
- else if (LIN(MT2,a,4)=='j')MT2=MJ;
- a=next_a;
- }
- }
-
- invert_list_char ( al[0], LEN);
- invert_list_char ( al[1], LEN);
-
- fprintf(edit_f, "%i\n%i\n%i\n%i\n",prf1->prf_number, prf2->prf_number, prf1->is_leaf, prf2->is_leaf);
- fprintf(prof_f, "%i\n0\n%i\n1\n%i\n", node_number,LEN, prf1->number_of_sequences+prf2->number_of_sequences);
-
- char statec[] = {'M','D','I'};
- int num = 0;
- int state = 0;
- i = 0;
- j = 0;
-
- for ( b=0; b< LEN; b++)
- {
- if ((al[0][b]==1) && (al[1][b]==1))
- {
-
- combine_profiles2file(prf1->prf, prf2->prf, i, j, param_set, prof_f, 'M');
- ++i;
- ++j;
- if (state != 0)
- {
- fprintf(edit_f, "%c%i\n",statec[state], num);
- num =1;
- state = 0;
- }
- else
- ++num;
- }
- else if (al[0][b]==1)
- {
-// prf1->prf[param_set->alphabet_size-1] += prf2->num_sequences;
- combine_profiles2file(prf1->prf, prf2->prf, i, j, param_set, prof_f, 'I');
- ++i;
- if (state != 2)
- {
- fprintf(edit_f, "%c%i\n",statec[state], num);
- num =1;
- state = 2;
- }
- else
- ++num;
- }
- else if (al[1][b]==1)
- {
-// prf2->prf[param_set->alphabet_size-1] += prf1->num_sequences;
- combine_profiles2file(prf1->prf, prf2->prf, i, j, param_set, prof_f, 'D');
- ++j;
- if (state != 1)
- {
- fprintf(edit_f, "%c%i\n",statec[state], num);
- num =1;
- state = 1;
- }
- else
- ++num;
- }
- }
-
-
- fprintf(edit_f, "%c%i\n",statec[state], num);
- num =1;
- state = 1;
-
-
- fprintf(edit_f,"*\n");
- fprintf(prof_f,"*\n");
- free_char (al, -1);
-// exit(0);
- return LEN;
-}
-
-
-
-
-
-
-/**
- * \brief Turns a profile into a consensus sequence.
- *
- * The character with the highest number of occurences is used as consensus. Gaps are not included. For example: 10 '-' and one 'A' would give 'A' as consensus.
- * \param profile The profile.
- * \param file_name Name of the file to save the consensus sequence in.
- * \param param_set The parameter of the fastal algorithm.
- * \return the sequence
- */
-char*
-profile2consensus(Fastal_profile *profile, Fastal_param *param_set)
-{
-
-// FILE *cons_f = fopen(file_name,"w");
-// fprintf(cons_f, ">%i\n", profile->prf_number);
- char* seq = vcalloc(profile->length+1, sizeof(char));
- int i, j;
- int most_pos = -1, most;
- int alphabet_size = param_set->alphabet_size;
- int **prf = profile->prf;
- char *pos2char = param_set->pos2char;
- for (i = 0; i < profile->length; ++i)
- {
- most = -1;
- for (j = 0; j < alphabet_size; ++j)
- {
- if (prf[j][i] > most)
- {
- most = prf[j][i];
- most_pos = j;
- }
- }
- seq[i] = pos2char[most_pos];
-// fprintf(cons_f, "%c",pos2char[most_pos]);
- }
- return seq;
-}
-
-
-int
-diag_compare (const void * a, const void * b)
-{
- return (((Diagonal_counter*)b)->count - ((Diagonal_counter*)a)->count);
-}
-
-/**
- * \brief Calculates the diagonals between two sequences.
- *
- * Uses to calculate the diagonals.
- * \param seq_file1 File with sequence 1.
- * \param seq_file2 File with sequence 2.
- * \param diagonals An array where the diagonal points will be stored.
- * \param dig_length length of \a diagonals .
- * \param num_points Number of points in all diagonals.
- * \return number of diagonals;
- */
-int
-seq_pair2diagonal_own(char *seq1,
- char *seq2,
- int **diagonals,
- int *dig_length,
- int l1,
- int l2,
- int is_dna,
- int word_length)
-{
-// word_length = 7;
- int word_number, i;
- int ng;
- if (is_dna)
- {
- word_number = (int)pow(5, word_length);
- ng = 4;
- }
- else
- {
- word_number = (int)pow(24, word_length);
- ng = 24;
- }
- int **word_index = vcalloc(word_number, sizeof(int*));
- for (i = 0 ; i < word_number; ++i)
- {
- word_index[i] = vcalloc(20, sizeof(int));
- word_index[i][0] = 2;
- word_index[i][1] = 20;
- }
-
-
- //making of k-tup index of seq1
-
- int *prod=vcalloc (word_length, sizeof(int));
- for ( i=0; i<word_length; i++)
- {
- prod[word_length-i-1]=(int)pow(ng,i);
- }
-
- int aa[256];
- if (is_dna)
- {
- aa['A'] = 0;
- aa['C'] = 1;
- aa['G'] = 2;
- aa['T'] = 3;
- aa['U'] = 3;
- }
- else
- {
- aa['A'] = 0;
- aa['B'] = 20;
- aa['C'] = 1;
- aa['D'] = 2;
- aa['E'] = 3;
- aa['F'] = 4;
- aa['G'] = 5;
- aa['H'] = 6;
- aa['I'] = 7;
- aa['J'] = 20;
- aa['K'] = 8;
- aa['L'] = 9;
- aa['M'] = 10;
- aa['N'] = 11;
- aa['P'] = 12;
- aa['Q'] = 13;
- aa['R'] = 14;
- aa['S'] = 15;
- aa['T'] = 16;
- aa['V'] = 17;
- aa['W'] = 18;
- aa['X'] = 20;
- aa['Y'] = 19;
- aa['X'] = 20;
- }
- int index = 0;
- for (i = 0; i < word_length; ++i)
- {
- index += aa[(short)seq1[i]] *prod[i];
- }
- word_index[index][2] = 0;
- word_index[index][0] = 3;
- int z = -1;
- int *tmp;
- for (; i < l1; ++i)
- {
- index -= aa[(short)seq1[++z]] * prod[0];
- index *= ng;
- index += aa[(short)seq1[i]];
- tmp = word_index[index];
- if (tmp[0] == tmp[1])
- {
- tmp[1] += 25;
- tmp = vrealloc(tmp, word_index[index][1] *sizeof(int));
- word_index[index] = tmp;
- }
- tmp[tmp[0]++] = i;
- }
-
-
-
- //counting diagonals
- const int window_length = 14;
-
- Diagonal_counter *diag_index = vcalloc(l1+l2, sizeof(Diagonal_counter));
- int num = l1+l2;
- for (i = 0; i < num; ++i)
- {
- diag_index[i].diagonal = i;
- diag_index[i].count = 0;
- }
- index = 0;
-
- int j;
- for (i = 0; i < word_length; ++i)
- {
- index += aa[(short)seq2[i]] *prod[i];
- for (j = 2; j < word_index[index][0]; ++j)
- {
- ++(diag_index[i - word_index[index][j] + l1].count);
- }
- }
-
- z = -1;
- int i2 = i-1;
- int second_index = index;
- for (; i < window_length; ++i)
- {
- index -= aa[(short)seq2[++z]] * prod[0];
- index *= ng;
- index += aa[(short)seq2[i]];
- tmp = word_index[index];
- for (j = 2; j < tmp[0]; ++j)
- {
- ++(diag_index[i - tmp[j] + l1].count);
- }
- }
- int z2 = -1;
- for (; i < l2; ++i)
- {
- index -= aa[(short)seq2[++z]] * prod[0];
- index *= ng;
- index += aa[(short)seq2[i]];
- second_index -= aa[(short)seq2[++z2]] * prod[0];
- second_index *= ng;
- second_index += aa[(short)seq2[++i2]];
-
- tmp = word_index[index];
- for (j = 2; j < tmp[0]; ++j)
- {
- ++(diag_index[i - tmp[j] + l1].count);
- }
-
-
- tmp = word_index[second_index];
- for (j = 2; j < tmp[0]; ++j)
- {
- if (diag_index[i2 - tmp[j] + l1].count > window_length-3)
- diag_index[i2 - tmp[j] + l1].count = window_length+100;
- else
- --diag_index[i2 - tmp[j] + l1].count;
- }
- }
-
-
- //choose diagonals
- int *diags = diagonals[0];
- int current_pos = 0;
-
-
- qsort (diag_index, num, sizeof(Diagonal_counter*), diag_compare);
-
- i = 0;
- int y, x;
- while (diag_index[i].count > window_length+10)
- {
- if (current_pos > (*dig_length)-3)
- {
- (*dig_length) += 30;
- diags = vrealloc(diags, sizeof(int)*(*dig_length));
- }
-
-
- y = diag_index[i].diagonal - l1;
- if (y < 0)
- {
- x = y * (-1);
- y = 0;
- }
- else
- {
- x = 0;
- }
- diags[current_pos++] = x;
- diags[current_pos++] = y;
- diags[current_pos++] = 200;
- ++i;
- }
-
- vfree(diag_index);
- for (i = 0; i < word_number; ++i)
- vfree(word_index[i]);
- vfree(word_index);
- diagonals[0] = diags;
- return current_pos/3;
-}
-
-
-
-int
-seq_pair2diagonal_swift(char *seq1,
- char *seq2,
- int **diagonals,
- int *dig_length,
- int l1,
- int l2,
- int is_dna,
- int word_length)
-{
- int word_number, i;
- int ng;
- if (is_dna)
- {
- word_number = (int)pow(5, word_length);
- ng = 5;
- }
- else
- {
- word_number = (int)pow(24, word_length);
- ng = 24;
- }
- int **word_index = vcalloc(word_number, sizeof(int*));
- for (i = 0 ; i < word_number; ++i)
- {
- word_index[i] = vcalloc(20, sizeof(int));
- word_index[i][0] = 2;
- word_index[i][1] = 20;
- }
-
-
- //making of k-tup index of seq1
-
- int *prod=vcalloc (word_length, sizeof(int));
- for ( i=0; i<word_length; i++)
- {
- prod[word_length-i-1]=(int)pow(ng,i);
- }
-
- int aa[256];
- aa['A'] = 0;
- aa['C'] = 1;
- aa['G'] = 2;
- aa['T'] = 3;
- aa['U'] = 4;
- int index = 0;
- for (i = 0; i < word_length; ++i)
- {
- index += aa[(short)seq1[i]] *prod[i];
- }
- word_index[index][2] = 0;
- word_index[index][0] = 3;
- int z = -1;
- int *tmp;
- for (; i < l1; ++i)
- {
- index -= aa[(short)seq1[++z]] * prod[0];
- index *= ng;
- index += aa[(short)seq1[i]];
- tmp = word_index[index];
- if (tmp[0] == tmp[1])
- {
- tmp[1] += 25;
- tmp = vrealloc(tmp, word_index[index][1] *sizeof(int));
- word_index[index] = tmp;
- }
- tmp[tmp[0]++] = i;
- }
-
-
- //counting diagonals
- const int window_length = 14;
- const int threshold = 12;
-
- Swift_diagonal *diag_index = vcalloc(l1+l2, sizeof(Swift_diagonal));
- int num = l1+l2;
- for (i = 0; i < num; ++i)
- {
- diag_index[i].diagonal = i;
- diag_index[i].count = 0;
- diag_index[i].start = -99999;
- diag_index[i].end = -99999;
- }
-
- index = 0;
-
- int j;
- for (i = 0; i < word_length; ++i)
- {
- index += aa[(short)seq2[i]] *prod[i];
- for (j = 2; j < word_index[index][0]; ++j)
- {
- ++(diag_index[i - word_index[index][j] + l1].count);
- }
- }
-
- z = -1;
- int tmp_index;
- for (; i < l2; ++i)
- {
- index -= aa[(short)seq2[++z]] * prod[0];
- index *= ng;
- index += aa[(short)seq2[i]];
- tmp = word_index[index];
- for (j = 2; j < tmp[0]; ++j)
- {
- tmp_index = i - tmp[j] + l1;
- if (i - diag_index[tmp_index].end > window_length)
- {
- if (diag_index[tmp_index].count < threshold)
- {
- diag_index[tmp_index].count = 0;
- diag_index[tmp_index].start = i;
- diag_index[tmp_index].end = i + word_length;
- }
-
- }
- else
- {
- ++(diag_index[tmp_index].count);
- }
- }
-
- }
-
-
-
- // choose diagonals
- int *diags = diagonals[0];
- int current_pos = 0;
- int x, y;
- for (i = 0; i < num; ++i)
- {
- if (diag_index[i].count > threshold)
- {
- if (current_pos > (*dig_length)-3)
- {
- (*dig_length) += 30;
- diags = vrealloc(diags, sizeof(int)*(*dig_length));
- }
- y = diag_index[i].diagonal - l1;
- if (y < 0)
- {
- x = y * (-1);
- y = 0;
- }
- else
- {
- x = 0;
- }
- diags[current_pos++] = x;
- diags[current_pos++] = y;
- diags[current_pos++] = 200;
- }
- }
-
- vfree(diag_index);
- for (i = 0; i < word_number; ++i)
- vfree(word_index[i]);
- vfree(word_index);
- diagonals[0] = diags;
-
- return current_pos/3;
-}
-
-
-
-/**
- * \brief Calculates the diagonals between two sequences.
- *
- * Uses a k-tup index to choose diagonals.
- * \param seq_file1 File with sequence 1.
- * \param seq_file2 File with sequence 2.
- * \param diagonals An array where the diagonal points will be stored.
- * \param dig_length length of \a diagonals .
- * \param num_points Number of points in all diagonals.
- * \return number of diagonals;
- */
-int
-seq_pair2blast_diagonal(char *seq_file_name1,
- char *seq_file_name2,
- int **diagonals,
- int *dig_length,
- int l1,
- int l2,
- int is_dna)
-{
-// static int blast_measure[12]={0,0,0,0,0,0,0,0,0,0,0,0};
- int *diag = vcalloc(l1 + l2, sizeof(int));
- char *out_file = vtmpnam(NULL);
- char blast_command[200];
-// char blast_command2[200];
-// if (x)
-// {
-// int i;
-// printf("BLAST-Types:\n");
-// for (i = 0; i < 11; ++i)
-// {
-// printf("Type %i: %i\n", i, blast_measure[i]);
-// }
-// return 0;
-// }
-// char blast_command2[600];
-// sprintf(blast_command2, "less %s", out_file);
-
- if (is_dna)
- {
- sprintf(blast_command, "bl2seq -p blastn -i %s -j %s -D 1 -g F -o %s -S 1 -F F", seq_file_name1, seq_file_name2, out_file);
- }
- else
- {
- sprintf(blast_command, "bl2seq -p blastp -i %s -j %s -D 1 -g F -o %s -F F -S 1", seq_file_name1, seq_file_name2, out_file);
- }
- system(blast_command);
-
- int *diags = diagonals[0];
- FILE *diag_f = fopen(out_file,"r");
- char line[300];
- fgets(line, 300, diag_f);
- fgets(line, 300, diag_f);
- fgets(line, 300, diag_f);
-
-
- char delims[] = "\t";
- int length, pos_q, pos_d;
- int current_pos = 0;
- while (fgets(line, 300, diag_f) != NULL)
- {
- strtok(line, delims);
- strtok(NULL, delims);
- strtok(NULL, delims);
- length = atoi(strtok(NULL, delims));
- strtok(NULL, delims);
- strtok(NULL, delims);
- pos_q = atoi(strtok(NULL, delims))-1;
- strtok(NULL, delims);
- pos_d = atoi(strtok(NULL, delims))-1;
-
- if (current_pos >= *dig_length-20)
- {
- (*dig_length) += 90;
- diags = vrealloc(diags, sizeof(int)*(*dig_length));
- }
- if (diag[l1-(pos_q)+pos_d] == 0)
- {
- diag[l1-(pos_q)+pos_d] =1;
- diags[current_pos++] = pos_q;
- diags[current_pos++] = pos_d;
- diags[current_pos++] = length;
- }
-
- }
- fclose(diag_f);
- int round = 0;
- int e_threshold = 10;
- while ((current_pos == 0) && (round < 10))
- {
- if (is_dna)
- {
- sprintf(blast_command, "bl2seq -p blastn -i %s -j %s -D 1 -g F -o %s -S 1 -F F -W 6 -e %i", seq_file_name1, seq_file_name2, out_file, e_threshold);
- }
- else
- {
- sprintf(blast_command, "bl2seq -p blastp -i %s -j %s -D 1 -g F -o %s -F F -S 1 -e %i", seq_file_name1, seq_file_name2, out_file, e_threshold);
- }
- system(blast_command);
- e_threshold *= 10;
- FILE *diag_f = fopen(out_file,"r");
- char line[300];
- fgets(line, 300, diag_f);
- fgets(line, 300, diag_f);
- fgets(line, 300, diag_f);
-
-
- char delims[] = "\t";
- while (fgets(line, 300, diag_f) != NULL)
- {
- strtok(line, delims);
- strtok(NULL, delims);
- strtok(NULL, delims);
- length = atoi(strtok(NULL, delims));
- strtok(NULL, delims);
- strtok(NULL, delims);
- pos_q = atoi(strtok(NULL, delims))-1;
- strtok(NULL, delims);
- pos_d = atoi(strtok(NULL, delims))-1;
-
- if (current_pos >= *dig_length-20)
- {
- (*dig_length) += 90;
- diags = vrealloc(diags, sizeof(int)*(*dig_length));
- }
- if (diag[l1-(pos_q)+pos_d] == 0)
- {
- diag[l1-(pos_q)+pos_d] =1;
- diags[current_pos++] = pos_q;
- diags[current_pos++] = pos_d;
- diags[current_pos++] = length;
- }
- }
- fclose(diag_f);
- ++round;
- if (current_pos < 27)
- current_pos = 0;
- }
-// ++blast_measure[round];
-
- if (current_pos == 0)
- {
- printf("BLAST NOT SUCCESFULL\n");
- if (l1 < l2)
- {
- int i;
- int diff = l2 - l1 + 10;
- for (i = diff; i > 0; --i)
- {
- if (current_pos >= *dig_length-20)
- {
- (*dig_length) += 90;
- diags = vrealloc(diags, sizeof(int)*(*dig_length));
- }
- diags[current_pos++] = 0;
- diags[current_pos++] = i;
- diags[current_pos++] = 100;
- }
- diff = 10;
-// printf("A: %i\n", diff);
- for (i = 0; i < diff; ++i)
- {
- if (current_pos >= *dig_length-20)
- {
- (*dig_length) += 90;
- diags = vrealloc(diags, sizeof(int)*(*dig_length));
- }
- diags[current_pos++] = i;
- diags[current_pos++] = 0;
- diags[current_pos++] = 100;
- }
- }
- else
- {
- int i;
- int diff = 10;
-// printf("A: %i\n", diff);
- for (i = diff; i > 0; --i)
- {
- if (current_pos >= *dig_length-20)
- {
- (*dig_length) += 90;
- diags = vrealloc(diags, sizeof(int)*(*dig_length));
- }
- diags[current_pos++] = 0;
- diags[current_pos++] = i;
- diags[current_pos++] = 100;
- }
- diff = l1 - l2 + 10;
- for (i = 0; i < diff; ++i)
- {
- if (current_pos >= *dig_length-20)
- {
- (*dig_length) += 90;
- diags = vrealloc(diags, sizeof(int)*(*dig_length));
- }
- diags[current_pos++] = i;
- diags[current_pos++] = 0;
- diags[current_pos++] = 100;
- }
-
- }
- }
-
-
- vfree(diag);
-
- diagonals[0] = diags;
- return current_pos/3;
-}
-
-
-
-int
-seq_pair2blat_diagonal(char *seq_file_name1,
- char *seq_file_name2,
- int **diagonals,
- int *dig_length,
- int l1,
- int l2,
- int is_dna)
-{
- int *diag = vcalloc(l1 + l2, sizeof(int));
- char *out_file = vtmpnam(NULL);
- char blast_command[200];
-// char blast_command2[200];
-// char blast_command2[600];
-// sprintf(blast_command2, "less %s", out_file);
-
- if (is_dna)
- {
- sprintf(blast_command, "blat %s %s %s -out=blast8 -q=dna -t=dna -maxGap=0 >/dev/null 2>/dev/null", seq_file_name2, seq_file_name1, out_file);
- }
- else
- {
- sprintf(blast_command, "blat %s %s %s -out=blast8 -prot -maxGap=0 >/dev/null 2>/dev/null", seq_file_name2, seq_file_name1, out_file);
- }
- system(blast_command);
-
- int *diags = diagonals[0];
- FILE *diag_f = fopen(out_file,"r");
- char line[300];
-// fgets(line, 300, diag_f);
-// fgets(line, 300, diag_f);
-// fgets(line, 300, diag_f);
-
-
- char delims[] = "\t";
- int length, pos_q, pos_d;
- int current_pos = 0;
- while (fgets(line, 300, diag_f) != NULL)
- {
- strtok(line, delims);
- strtok(NULL, delims);
- strtok(NULL, delims);
- length = atoi(strtok(NULL, delims));
- strtok(NULL, delims);
- strtok(NULL, delims);
- pos_q = atoi(strtok(NULL, delims))-1;
- strtok(NULL, delims);
- pos_d = atoi(strtok(NULL, delims))-1;
-
- if (current_pos >= *dig_length-20)
- {
- (*dig_length) += 90;
- diags = vrealloc(diags, sizeof(int)*(*dig_length));
- }
- if (diag[l1-(pos_q)+pos_d] == 0)
- {
- diag[l1-(pos_q)+pos_d] =1;
- diags[current_pos++] = pos_q;
- diags[current_pos++] = pos_d;
- diags[current_pos++] = length;
- }
- }
- if (current_pos == 0)
- {
- printf("BLAT NOT SUCCESFULL\n");
- if (l1 < l2)
- {
- int i;
- int diff = l2 - l1 + 10;
- for (i = diff; i > 0; --i)
- {
- if (current_pos >= *dig_length-20)
- {
- (*dig_length) += 90;
- diags = vrealloc(diags, sizeof(int)*(*dig_length));
- }
- diags[current_pos++] = 0;
- diags[current_pos++] = i;
- diags[current_pos++] = 100;
- }
- diff = 10;
-// printf("A: %i\n", diff);
- for (i = 0; i < diff; ++i)
- {
- if (current_pos >= *dig_length-20)
- {
- (*dig_length) += 90;
- diags = vrealloc(diags, sizeof(int)*(*dig_length));
- }
- diags[current_pos++] = i;
- diags[current_pos++] = 0;
- diags[current_pos++] = 100;
- }
- }
- else
- {
- int i;
- int diff = 10;
-// printf("A: %i\n", diff);
- for (i = diff; i > 0; --i)
- {
- if (current_pos >= *dig_length-20)
- {
- (*dig_length) += 90;
- diags = vrealloc(diags, sizeof(int)*(*dig_length));
- }
- diags[current_pos++] = 0;
- diags[current_pos++] = i;
- diags[current_pos++] = 100;
- }
- diff = l1 - l2 + 10;
- for (i = 0; i < diff; ++i)
- {
- if (current_pos >= *dig_length-20)
- {
- (*dig_length) += 90;
- diags = vrealloc(diags, sizeof(int)*(*dig_length));
- }
- diags[current_pos++] = i;
- diags[current_pos++] = 0;
- diags[current_pos++] = 100;
- }
-
- }
- }
-
-// printf("END\n");
- vfree(diag);
- fclose(diag_f);
- diagonals[0] = diags;
- return current_pos/3;
-}
-
-
-
-/**
- * \brief Calculates the diagonals between two sequences.
- *
- * Uses blastz to calculate the diagonals.
- * \param seq_file1 File with sequence 1.
- * \param seq_file2 File with sequence 2.
- * \param diagonals An array where the diagonal points will be stored.
- * \param dig_length length of \a diagonals .
- * \param num_points Number of points in all diagonals.
- * \return number of diagonals;
- */
-int
-seq_pair2blastz_diagonal(char *seq_file_name1,
- char *seq_file_name2,
- int **diagonals,
- int *dig_length,
- int l1,
- int l2,
- int is_dna)
-{
- int *diag = vcalloc(l1 + l2, sizeof(int));
- char *out_file = vtmpnam(NULL);
- char blast_command[200];
-// char blast_command2[200];
-// char blast_command2[600];
-// sprintf(blast_command2, "less %s", out_file);
-
- if (is_dna)
- {
- sprintf(blast_command, "~/Download/blastz-source/blastz %s %s B=0 K=10000> %s", seq_file_name1, seq_file_name2, out_file);
- }
- else
- {
- printf("SORRY - no BLASTZ with amino accid\n");
- exit(0);
- }
- system(blast_command);
-
- int *diags = diagonals[0];
- FILE *diag_f = fopen(out_file,"r");
- char line[300];
- char delims[] = " ";
-// char *result = NULL;
- int length, pos_q, pos_d;
- int current_pos = 0;
- while (fgets(line, 300, diag_f) != NULL)
- {
- if (line[0] == 'a')
- {
- char *line_tmp;
- while (fgets(line, 300, diag_f) != NULL)
- {
- if (line[0] == '}')
- break;
-
- if (line[2] == 'l')
- {
- line_tmp = &line[4];
- if (current_pos >= *dig_length-20)
- {
- (*dig_length) += 90;
- diags = vrealloc(diags, sizeof(int)*(*dig_length));
- }
- pos_q = atoi(strtok(line_tmp, delims));
- pos_d = atoi(strtok(NULL, delims));
- length = atoi(strtok(NULL, delims) - pos_q);
- if (diag[l1-(pos_q)+pos_d] == 0)
- {
- diag[l1-(pos_q)+pos_d] =1;
- diags[current_pos++] = pos_q;
- diags[current_pos++] = pos_d;
- diags[current_pos++] = length;
- }
- }
- }
- }
- }
-
-
- if (current_pos == 0)
- {
- printf("BLASTZ NOT SUCCESFULL\n");
- if (l1 < l2)
- {
- int i;
- int diff = l2 - l1 + 10;
-
- for (i = diff; i > 0; --i)
- {
- if (current_pos >= *dig_length-20)
- {
- (*dig_length) += 90;
- diags = vrealloc(diags, sizeof(int)*(*dig_length));
- }
- diags[current_pos++] = 0;
- diags[current_pos++] = i;
- diags[current_pos++] = 100;
- }
- diff = 10;
-
- for (i = 0; i < diff; ++i)
- {
- if (current_pos >= *dig_length-20)
- {
- (*dig_length) += 90;
- diags = vrealloc(diags, sizeof(int)*(*dig_length));
- }
- diags[current_pos++] = i;
- diags[current_pos++] = 0;
- diags[current_pos++] = 100;
- }
- }
- else
- {
- int i;
- int diff = 10;
-
- for (i = diff; i > 0; --i)
- {
- if (current_pos >= *dig_length-20)
- {
- (*dig_length) += 90;
- diags = vrealloc(diags, sizeof(int)*(*dig_length));
- }
- diags[current_pos++] = 0;
- diags[current_pos++] = i;
- diags[current_pos++] = 100;
- }
- diff = l1 - l2 + 10;
- for (i = 0; i < diff; ++i)
- {
- if (current_pos >= *dig_length-20)
- {
- (*dig_length) += 90;
- diags = vrealloc(diags, sizeof(int)*(*dig_length));
- }
- diags[current_pos++] = i;
- diags[current_pos++] = 0;
- diags[current_pos++] = 100;
- }
-
- }
- }
-
- vfree(diag);
- fclose(diag_f);
- diagonals[0] = diags;
- return current_pos/3;
-}
-
-
-
-//************************** needleman-wunsch aligning **********************************************************
-
-
-void
-fill_arguments_nw(Nw_param* method_arguments_p, int alphabet_size)
-{
- method_arguments_p-> dyn_matrix = vcalloc(1,sizeof(double*));
- method_arguments_p->dyn_matrix[0] = vcalloc(1,sizeof(double));
- method_arguments_p->length1 = vcalloc(1,sizeof(int));
- method_arguments_p->length2 = vcalloc(1,sizeof(int));
- *method_arguments_p->length1 = 1;
- *method_arguments_p->length2 = 1;
- method_arguments_p->sumup_prf = vcalloc(alphabet_size+1,sizeof(int*));
- int i;
- for (i = 0; i < alphabet_size+1; ++i)
- method_arguments_p->sumup_prf[i] = vcalloc(1,sizeof(int));
- method_arguments_p->sumup_length = vcalloc(1,sizeof(int));
- *method_arguments_p->sumup_length = 1;
-}
-
-
-void
-free_nw(Nw_param* method_arguments_p, int alphabet_size)
-{
- free_dyn_matrix(*method_arguments_p->length1,method_arguments_p->dyn_matrix);
- int i;
- for (i = 0; i <= alphabet_size; ++i)
- {
- vfree(method_arguments_p->sumup_prf[i]);
- }
- vfree(method_arguments_p->sumup_prf);
- vfree(method_arguments_p->length1);
- vfree(method_arguments_p->length2);
- vfree(method_arguments_p->sumup_length);
-}
-
-
-/**
- * \brief One run of needleman-wunsch dynamic programming.
- *
- * \param profiles The profiles.
- * \param param_set The fastal parameters.
- * \param method_arguments_p The method arguments.
- * \param is_dna Sequences are DNA (\a is_dna = 1) or protein.
- * \param edit_file The edit file.
- * \param prof_file the profile file.
- * \param number Number of the parent node.
- * \return The length of the alignment.
- */
-int
-nw_dyn(Fastal_profile **profiles, Fastal_param *param_set, void *method_arguments_p, int is_dna, FILE *edit_file, FILE *prof_file, int number)
-{
- Nw_param *arguments = (Nw_param*)method_arguments_p;
-// int old_length1 = *arguments->length1;
-// int old_length2 = *arguments->length2;
- arguments->dyn_matrix = resize_dyn_matrix(arguments->dyn_matrix, *arguments->length1, *arguments->length2, profiles[0]->length+1, profiles[1]->length+1);
- *arguments->length1 = profiles[0]->length+1;
- *arguments->length2 = profiles[1]->length+1;
- int alignment_length = prf_nw(profiles[0], profiles[1], arguments->dyn_matrix, edit_file, arguments->sumup_prf, arguments->sumup_length, param_set);
- write2file(arguments->sumup_prf, alignment_length, prof_file, number,profiles[0]->number_of_sequences + profiles[1]->number_of_sequences, param_set);
- return alignment_length;
-}
-
-
-/**
- * \brief This method takes a profile and turns it into a sumed up version.
- *
- * Required for NW-algorithm.
- * \param profile The profile to sum up.
- * \param sumup A field where the result will be stored.
- * \param param_set Parameters for the fastal algorithm.
- * \return The new \a sumup.
- */
-int**
-sumup_profile(Fastal_profile *profile,
- int **sumup,
- Fastal_param *param_set)
-{
-
- char *pos2aa = &(param_set->pos2char[0]);
- int alphabet_size = param_set->alphabet_size;
- int **M = param_set->M;
- int prof_length = profile->length;
-
- int i,j,k;
-
- for (i = 0; i < prof_length; ++i)
- {
- sumup[alphabet_size][i] = 0;
- for (k = 0; k < alphabet_size; ++k)
- {
- sumup[k][i] = 0;
- sumup[alphabet_size][i] += profile->prf[k][i];
- for (j = 0; j < alphabet_size; ++j)
- {
- sumup[k][i] += profile->weight * profile->prf[j][i] * M[pos2aa[j]-'A'][pos2aa[k]-'A'];
- }
- }
- }
-
- return sumup;
-}
-
-
-/**
- * \brief Turns the dynamic programming matrix into a editfile and calculates the new profile.
- *
- * Required for NW-algorithm.
- * \param prog_matrix The dynamic programming matrix.
- * \param prf1 The first profile.
- * \param prf2 The second profile.
- * \param edit_f A File object (already opened) to write the edit sequence into.
- * \param prf_field A 2-dim array to save the new profile into.
- * \param field_length Length of the new profile.
- * \param param_set Parameters of the Fastal-Algorithm
- */
-int
-nw_matrix2edit_file(double **prog_matrix, //dynamic programming matrix
- Fastal_profile *prf1, //profile of dim1
- Fastal_profile *prf2, //profile of dim2
- FILE *edit_f, //file to safe the edit in
- int **prf_field, //space to safe the new profile
- int *field_length,
- Fastal_param *param_set) //length of prf_field
-{
-// int **M = param_set->M;
- int alphabet_size = param_set->alphabet_size;
- double gap_cost = param_set -> gop;
- fprintf(edit_f, "%i\n%i\n%i\n%i\n",prf1->prf_number, prf2->prf_number, prf1->is_leaf, prf2->is_leaf);
- int sum[] = {0,0,0};
- char sumc[] = {'M','I','D'};
- int last = 0;
- int n = 0;
- int m = 0;
- int field_pos = 0;
- int i;
- int prf1_length = prf1->length;
- int prf2_length = prf2->length;
- while ((n < prf1_length) && (m < prf2_length))
- {
- //if necesarry allocate more memory for result
- if ((*field_length)-alphabet_size < field_pos)
- {
- (*field_length) += ENLARGEMENT_PER_STEP;
-
- for (i = 0; i <alphabet_size+1; ++i)
- {
- prf_field[i] = vrealloc(prf_field[i], (*field_length)*sizeof(int));
- }
- }
-
- if (prog_matrix[n][m] == (prog_matrix[n+1][m] +gap_cost))
- {
- for (i = 0; i<alphabet_size; ++i)
- {
- prf_field[i][field_pos] = prf1->prf[i][n];
- }
- ++n;
- ++ field_pos;
-
- if (last != 1)
- {
- fprintf(edit_f,"%c%i\n",sumc[last],sum[last]);
- sum[last] = 0;
- }
- last = 1;
- ++sum[last];
- }
- else if (prog_matrix[n][m] == (prog_matrix[n][m+1] +gap_cost))
- {
-
- for (i = 0; i<alphabet_size; ++i)
- {
- prf_field[i][field_pos] = prf2->prf[i][m];
- }
- ++m;
- ++ field_pos;
- if (last != 2)
- {
- fprintf(edit_f,"%c%i\n",sumc[last],sum[last]);
- sum[last] = 0;
- }
- last = 2;
- ++sum[last];
- }
- else
- {
- for (i = 0; i<alphabet_size; ++i)
- {
- prf_field[i][field_pos] = prf1->prf[i][n] + prf2->prf[i][m];
- }
- ++n;
- ++m;
- ++ field_pos;
- if (last != 0)
- {
- fprintf(edit_f,"%c%i\n",sumc[last],sum[last]);
- sum[last] = 0;
- }
- last = 0;
- ++sum[last];
- }
- }
- fprintf(edit_f,"%c%i\n",sumc[last],sum[last]);
-
- //gaps in prf2
- last = 0;
- while (n < prf1_length)
- {
- for (i = 0; i<alphabet_size; ++i)
- {
- prf_field[i][field_pos] = prf1->prf[i][n];
- }
- ++n;
- ++ field_pos;
- ++last;
- }
- if (last > 0)
- fprintf(edit_f,"I%i\n",last);
-
- //gaps in prf1
- last = 0;
- while (m < prf2_length)
- {
- for (i = 0; i<alphabet_size; ++i)
- {
- prf_field[i][field_pos] = prf2->prf[i][m];
- }
- ++m;
- ++ field_pos;
- ++last;
- }
- if (last > 0)
- fprintf(edit_f,"D%i\n",last);
- fprintf(edit_f,"*\n");
- return field_pos;
-}
-
-
-
-/**
- * \brief Pairwise alignments of profile is done here.
- *
- * \param profile1 Profile of sequence 1
- * \param profile2 Profile of sequence 2
- * \param prog_matrix Matrix for dynamic programming
- * \param edit_file_name The edit_file_name
- * \param sumup_prf The sumup version of profile 1, which later contains the aligned profile.
- * \param sumup_length Contains the length of the aligned profile.
- * \return length of the aligned profile
- */
-int
-prf_nw(Fastal_profile *profile1,
- Fastal_profile *profile2,
- double **prog_matrix,
- FILE *edit_file_name,
- int **sumup_prf,
- int *sumup_length,
- Fastal_param *param_set)
-{
- int alphabet_size = param_set->alphabet_size;
- double gap_cost = param_set->gop;
-
- int i;
- if (*sumup_length < profile1->length)
- {
- for (i = 0; i < alphabet_size+1; ++i)
- {
- sumup_prf[i] = vrealloc(sumup_prf[i], profile1->length*sizeof(int));
- }
- *sumup_length = profile1->length;
- }
- sumup_prf = sumup_profile(profile1, sumup_prf, param_set);
-
-
-
- int j,k;
- int prof1_length = profile1->length;
- int prof2_length = profile2->length;
-
-// int** M = param_set->M;
- double match_score;
-// int amino_counter;
- int residue_pairs = 0;
-
- for (i = prof2_length; i > 0; --i)
- {
- prog_matrix[prof1_length][i] = gap_cost * (prof2_length-i);
- }
-
- i = prof1_length-1;
- prog_matrix[prof1_length][prof2_length] = 0.0;
- while (i >=0)
- {
- j = prof2_length-1;
-
- prog_matrix[i][prof2_length] = gap_cost*(prof1_length-i);
- while (j >=0)
- {
- match_score = 0.0;
- residue_pairs = 0;
- for (k = 0; k < alphabet_size; ++k)
- {
- residue_pairs += profile2->prf[k][j];
- match_score += (profile2->prf[k][j] * sumup_prf[k][i]);
- }
- match_score /= (residue_pairs * sumup_prf[alphabet_size][i]);
- prog_matrix[i][j] = MAX3(prog_matrix[i+1][j+1]+match_score, prog_matrix[i+1][j]+gap_cost, prog_matrix[i][j+1]+gap_cost);
-
- --j;
- }
- --i;
- }
- return nw_matrix2edit_file(prog_matrix, profile1, profile2, edit_file_name, sumup_prf, sumup_length, param_set);
-}
-
-
-/************** GOTOH ***********************/
-
-
-void
-fill_arguments_gotoh(Gotoh_param* method_arguments_p, int alphabet_size)
-{
- method_arguments_p->m_matrix = vcalloc(1,sizeof(double*));
- method_arguments_p->m_matrix[0] = vcalloc(1,sizeof(double));
- method_arguments_p->d_matrix = vcalloc(1,sizeof(double*));
- method_arguments_p->d_matrix[0] = vcalloc(1,sizeof(double));
- method_arguments_p->i_matrix = vcalloc(1,sizeof(double*));
- method_arguments_p->i_matrix[0] = vcalloc(1,sizeof(double));
- method_arguments_p->length1 = vcalloc(1,sizeof(int));
- method_arguments_p->length2 = vcalloc(1,sizeof(int));
- method_arguments_p->log_saver = vcalloc(alphabet_size+1, sizeof(double*));
- *method_arguments_p->length1 = 1;
- *method_arguments_p->length2 = 1;
- method_arguments_p->sumup_prf = vcalloc(alphabet_size+1,sizeof(int*));
- int i;
- for (i = 0; i < alphabet_size+1; ++i)
- {
- method_arguments_p->sumup_prf[i] = vcalloc(1,sizeof(int));
- method_arguments_p->log_saver[i] = vcalloc(1, sizeof(double));
- }
- method_arguments_p->sumup_length = vcalloc(1,sizeof(int));
- *method_arguments_p->sumup_length = 1;
-}
-
-
-void
-free_gotoh(Gotoh_param* method_arguments_p, int alphabet_size)
-{
- free_dyn_matrix(*method_arguments_p->length1,method_arguments_p->m_matrix);
- free_dyn_matrix(*method_arguments_p->length1,method_arguments_p->i_matrix);
- free_dyn_matrix(*method_arguments_p->length1,method_arguments_p->d_matrix);
-
- int i;
- for (i = 0; i <= alphabet_size; ++i)
- {
- vfree(method_arguments_p->sumup_prf[i]);
- }
- vfree(method_arguments_p->sumup_prf);
- vfree(method_arguments_p->length1);
- vfree(method_arguments_p->length2);
- vfree(method_arguments_p->sumup_length);
-}
-
-
-int
-gotoh_dyn(Fastal_profile **profiles, Fastal_param *param_set, void *method_arguments_p, int is_dna, FILE *edit_file, FILE *prof_file, int number)
-{
- Gotoh_param *arguments = (Gotoh_param*)method_arguments_p;
- arguments->m_matrix = resize_dyn_matrix(arguments->m_matrix, *arguments->length1, *arguments->length2, profiles[0]->length+1, profiles[1]->length+1);
- arguments->i_matrix = resize_dyn_matrix(arguments->i_matrix, *arguments->length1, *arguments->length2, profiles[0]->length+1, profiles[1]->length+1);
- arguments->d_matrix = resize_dyn_matrix(arguments->d_matrix, *arguments->length1, *arguments->length2, profiles[0]->length+1, profiles[1]->length+1);
- int i;
- if (profiles[1]->length > *arguments->length2-1)
- {
- for (i = 0; i < param_set->alphabet_size; ++i)
- {
- arguments->log_saver[i] = vrealloc(arguments->log_saver[i], (profiles[1]->length)*sizeof(double*));
- }
- }
- *arguments->length1 = profiles[0]->length+1;
- *arguments->length2 = profiles[1]->length+1;
- int alignment_length = prf_gotoh(profiles[0], profiles[1], edit_file, arguments, param_set);
- write2file(arguments->sumup_prf, alignment_length, prof_file, number, profiles[0]->number_of_sequences + profiles[1]->number_of_sequences, param_set);
- return alignment_length;
-}
-
-
-int
-gotoh_matrix2edit_file(double **m_matrix, //dynamic programming matrix
- double **v_matrix, //dynamic programming matrix
- double **h_matrix, //dynamic programming matrix
- Fastal_profile *prf1, //profile of dim1
- Fastal_profile *prf2, //profile of dim2
- FILE *edit_f, //file to safe the edit in
- int **prf_field, //space to safe the new profile
- int *field_length,
- Fastal_param *param_set) //length of prf_field
-{
- double comp_num = log((double)prf1->number_of_sequences) + log((double)prf2->number_of_sequences);
- int** M = param_set->M;
- int alphabet_size = param_set->alphabet_size;
- double gep = param_set -> gep;
- fprintf(edit_f, "%i\n%i\n%i\n%i\n",prf1->prf_number, prf2->prf_number, prf1->is_leaf, prf2->is_leaf);
- int sum[] = {0,0,0};
- char sumc[] = {'M','I','D'};
- int last = 0;
- int n = 0;
- int m = 0;
- int field_pos = 0;
- int i;
- int prf1_length = prf1->length;
- int prf2_length = prf2->length;
- int current_mode = 0;
- //determine start mode
- char *pos2char = param_set->pos2char;
- if (h_matrix[n][m] == m_matrix[n][m])
- {
- current_mode = 2;
- }
- else
- {
-
- if (v_matrix[n][m] == m_matrix[n][m])
- {
- current_mode = 1;
- }
- else
- {
- current_mode = 0;
- }
- }
-// printf("%f %f %f - %i\n",h_matrix[n][m],v_matrix[n][m],m_matrix[n][m], current_mode);
- while ((n < prf1_length) && (m < prf2_length))
- {
- //if necesarry allocate more memory for result
- if ((*field_length)-alphabet_size < field_pos)
- {
- (*field_length) += ENLARGEMENT_PER_STEP;
-
- for (i = 0; i <alphabet_size+1; ++i)
- {
- prf_field[i] = vrealloc(prf_field[i], (*field_length)*sizeof(int));
- }
- }
-
-
- if (current_mode == 2)
- {
- for (i = 0; i<alphabet_size; ++i)
- {
- prf_field[i][field_pos] = prf2->prf[i][m];
- }
- if (h_matrix[n][m] != (h_matrix[n][m+1]+gep))
- {
- current_mode = 0;
- }
- ++m;
- ++ field_pos;
- if (last != 2)
- {
- fprintf(edit_f,"%c%i\n",sumc[last],sum[last]);
- sum[last] = 0;
- }
- last = 2;
- ++sum[last];
- }
- else
- {
- if (current_mode== 1)
- {
- for (i = 0; i<alphabet_size; ++i)
- {
- prf_field[i][field_pos] = prf1->prf[i][n];
- }
- if (v_matrix[n][m] != (v_matrix[n+1][m]+gep))
- {
- current_mode = 0;
- }
- ++n;
- ++ field_pos;
-
- if (last != 1)
- {
- fprintf(edit_f,"%c%i\n",sumc[last],sum[last]);
- sum[last] = 0;
- }
- last = 1;
- ++sum[last];
- }
- else
- {
- double match_score = 0.0;
- int char_c, char_c2;
- for (char_c = 0; char_c < alphabet_size; ++char_c)
- {
- for (char_c2 = 0; char_c2 < alphabet_size; ++char_c2)
- {
-
- if ((log(prf1->prf[char_c][n]) != -1) && ( log(prf2->prf[char_c2][m]) != -1))
- {
- match_score += exp(log((double)prf1->prf[char_c][n]) + log((double)prf2->prf[char_c2][m])-comp_num) * M[pos2char[char_c]-'A'][pos2char[char_c2]-'A'];
- }
- }
- }
- if (m_matrix[n+1][m+1] + match_score != m_matrix[n][m])
- {
- if (m_matrix[n][m] == v_matrix[n][m])
- {
- current_mode = 1;
- continue;
- }
- if (m_matrix[n][m] == h_matrix[n][m])
- {
- current_mode = 2;
- continue;
- }
- }
- for (i = 0; i<alphabet_size; ++i)
- {
- prf_field[i][field_pos] = prf1->prf[i][n] + prf2->prf[i][m];
- }
- ++n;
- ++m;
- ++ field_pos;
- if (last != 0)
- {
- fprintf(edit_f,"%c%i\n",sumc[last],sum[last]);
- sum[last] = 0;
- }
- last = 0;
- ++sum[last];
- }
- }
-
- }
- fprintf(edit_f,"%c%i\n",sumc[last],sum[last]);
-
- int needed = MAX(prf1_length -n, prf2_length -m);
-
- if ((*field_length) - needed -10 < field_pos)
- {
- (*field_length) += needed +10;
-
- for (i = 0; i <alphabet_size+1; ++i)
- {
- prf_field[i] = vrealloc(prf_field[i], (*field_length)*sizeof(int));
- }
- }
- //gaps in prf2
- last = 0;
- while (n < prf1_length)
- {
- for (i = 0; i<alphabet_size; ++i)
- {
- prf_field[i][field_pos] = prf1->prf[i][n];
- }
- ++n;
- ++ field_pos;
- ++last;
- }
- if (last > 0)
- fprintf(edit_f,"I%i\n",last);
-
- //gaps in prf1
- last = 0;
- while (m < prf2_length)
- {
- for (i = 0; i<alphabet_size; ++i)
- {
- prf_field[i][field_pos] = prf2->prf[i][m];
- }
- ++m;
- ++ field_pos;
- ++last;
- }
- if (last > 0)
- fprintf(edit_f,"D%i\n",last);
-
- fprintf(edit_f,"*\n");
- return field_pos;
-}
-
-
-
-
-/**
- * \brief The gotoh dynamic programming algorithm.
- *
- * \param profile1 The first profile.
- */
-int
-prf_gotoh(Fastal_profile *profile1,
- Fastal_profile *profile2,
- FILE *edit_file_name,
- Gotoh_param *arguments,
- Fastal_param *param_set)
-{
-
-// printf("I AM HERE - again\n");
- int **sumup_prf = arguments->sumup_prf;
- int *sumup_length = arguments->sumup_length;
- int alphabet_size = param_set->alphabet_size;
- double gop = param_set->gop;
- double gep = param_set->gep;
-
- const int INF = -999999;
- int i;
-
- double **m_matrix = arguments->m_matrix;
- double **h_matrix = arguments->i_matrix;
- double **v_matrix = arguments->d_matrix;
-
- int j;
- int prof1_length = profile1->length;
- int prof2_length = profile2->length;
-
- int** M = param_set->M;
- double match_score;
- for (i = prof2_length; i >= 0; --i)
- {
- m_matrix[prof1_length][i] = gop + gep * (prof2_length-i);
- v_matrix[prof1_length][i] = INF;
- h_matrix[prof1_length][i] = m_matrix[prof1_length][i];
- }
-
- m_matrix[prof1_length][prof2_length] = 0.0;
- h_matrix[prof1_length][prof2_length] = INF;
- v_matrix[prof1_length][prof2_length] = INF;
- int l;
- double comp_num = log((double)profile1->number_of_sequences) + log((double)profile2->number_of_sequences);
- static double *log_test = NULL;
- if (!log_test)
- log_test = vcalloc(alphabet_size, sizeof(double));
-// int k;
- int **prf1 = profile1->prf;
- int **prf2 = profile2->prf;
- double **log_test2 = arguments->log_saver;
- for (l = 0; l < alphabet_size; ++l)
- {
- for (i = 0; i < profile2->length; ++i)
- {
- if (prf2[l][i] > 0)
- {
- log_test2[l][i] = log(prf2[l][i]);
- }
- else
- log_test2[l][i] = -1;
- }
- }
-
- char *pos2char = param_set->pos2char;
- i = prof1_length-1;
- while (i >=0)
- {
- j = prof2_length-1;
-
- for (l = 0; l < alphabet_size; ++l)
- {
- if (prf1[l][i] > 0)
- log_test[l] = log((double)prf1[l][i]);
- else
- log_test[l] = -1;
- }
- m_matrix[i][prof2_length] = gop + gep *(prof1_length-i);
- v_matrix[i][prof2_length] = m_matrix[i][prof2_length];
- h_matrix[i][prof2_length] = INF;
- while (j >=0)
- {
-
- match_score = 0.0;
- v_matrix[i][j] = (MAX(v_matrix[i+1][j], m_matrix[i+1][j] + gop) + gep);
- h_matrix[i][j] = (MAX(h_matrix[i][j+1], m_matrix[i][j+1] + gop) + gep);
-
- int char_c, char_c2;
- int num = 0;
- for (char_c = 0; char_c < alphabet_size; ++char_c)
- {
- for (char_c2 = 0; char_c2 < alphabet_size; ++char_c2)
- {
-
- if ((log_test[char_c] != -1) && (log_test2[char_c2][j] != -1))
- {
- match_score += exp(log_test[char_c] + log_test2[char_c2][j]-comp_num) * M[pos2char[char_c]-'A'][pos2char[char_c2]-'A'];
- }
- }
- }
-
- m_matrix[i][j] = m_matrix[i+1][j+1]+match_score;
-
- if (m_matrix[i][j] < v_matrix[i][j])
- {
- m_matrix[i][j] = v_matrix[i][j];
- }
- if (m_matrix[i][j] < h_matrix[i][j])
- {
- m_matrix[i][j] = h_matrix[i][j];
- }
-
- --j;
- }
- --i;
- }
- return gotoh_matrix2edit_file(m_matrix, v_matrix, h_matrix, profile1, profile2, edit_file_name, sumup_prf, sumup_length, param_set);
-}
-
-
-/************* OTHER STUFF ******************/
-
-/**
- * \brief Writes the alignment into the profile file and the edit file.
- *
- * \param profiles The two profiles to combine.
- * \param alignment The alinment information.
- * \param alignment The length of the alignment.
- * \param edit_f The edit file.
- * \param prof_f The profile file.
- * \param node_number the new node number.
- */
-void
-alignment2files(Fastal_profile **profiles,
- Fastal_param *param_set,
- int **alignment,
- int alignment_length,
- FILE *edit_f,
- FILE *prof_f,
- int node_number)
-{
- fprintf(edit_f, "%i\n%i\n%i\n%i\n",profiles[0]->prf_number, profiles[1]->prf_number, profiles[0]->is_leaf, profiles[1]->is_leaf);
- fprintf(prof_f, "%i\n0\n%i\n1\n", node_number, alignment_length);
-
- int **prf1 = profiles[0]->prf;
- int **prf2 = profiles[1]->prf;
- int i = 0;
- int pos = 0;
- int pos1, pos2;
-
- char statec[] = {'M','D','I'};
- int num = 0;
- int state = 0;
-
- while (i < alignment_length)
- {
-
- pos1 = alignment[0][pos];
- pos2 = alignment[1][pos];
- // match
- if ((pos1 != -1) && (pos2 != -1))
- {
-
- combine_profiles2file(prf1, prf2, pos1, pos2, param_set, prof_f, 'M');
- if (state != 0)
- {
- fprintf(edit_f, "%c%i\n",statec[state], num);
- num =1;
- state = 0;
- }
- else
- ++num;
- ++i;
- }
- // insertion in seq 1
- else if (pos1 != -1)
- {
- combine_profiles2file(prf1, prf2, pos1, pos2, param_set, prof_f, 'I');
- if (state != 2)
- {
- fprintf(edit_f, "%c%i\n",statec[state], num);
- num =1;
- state = 2;
- }
- else
- ++num;
- ++i;
- }
- // deletion in seq 1
- else if (pos2 != -1)
- {
- combine_profiles2file(prf1, prf2, pos1, pos2, param_set, prof_f, 'D');
- if (state != 1)
- {
- fprintf(edit_f, "%c%i\n",statec[state], num);
- num =1;
- state = 1;
- }
- else
- ++num;
- ++i;
- }
- ++pos;
- }
- fprintf(edit_f, "%c%i\n",statec[state], num);
-
- fprintf(edit_f,"*\n");
- fprintf(prof_f,"*\n");
-
-}
-
-
-//******************************* OTHER STUFF ***********************
-
-/**
- * \brief Reads the sequence from a given position in a fasta file and turns it into a profile.
- *
- * \param seq_file The file where the sequence is stored.
- * \param off_set The off_set from the beginning of the file to the position of the sequence name.
- * \param profile The profile where the sequence will be stored into.
- * \param prf_number The number of this profile.
- */
-void
-file_pos2profile(FILE *seq_file, //File with sequences
- long off_set, //offset of sequence from the beginning of file point to the sequence name, not to the sequence itself
- Fastal_profile *profile, //profile to save into
- int prf_number, //number of the profile
- Fastal_param *param_set)
-{
- int alphabet_size = param_set->alphabet_size;
- profile->is_leaf = 1;
- profile->number_of_sequences = 1;
- int *aa2pos = &(param_set->char2pos[0]);
- const int LINE_LENGTH = 500;
- char line[LINE_LENGTH];
- profile->num_sequences = 1;
- profile->prf_number = prf_number;
- fseek (seq_file , off_set , SEEK_SET );
-
- fgets (line, LINE_LENGTH , seq_file);
- int seq_length = 0;
- int i, j, x;
-
- while(fgets(line, LINE_LENGTH, seq_file)!=NULL)
- {
- if (line[0] != '>')
- {
- line[LINE_LENGTH-1] = '\n';
- if (seq_length + LINE_LENGTH >= profile->allocated_memory)
- {
- for (i = 0; i < alphabet_size; ++i)
- {
- profile->prf[i] = vrealloc(profile->prf[i], (profile->allocated_memory+PROFILE_ENLARGEMENT)*sizeof(int));
- }
- profile->allocated_memory += PROFILE_ENLARGEMENT;
- }
-
- i = 0;
- x = 0;
- while ((line[i] != '\n') && (line[i] != '\0'))
- {
- if (line[i] != '-')
- {
- for(j = 0; j<alphabet_size; ++j )
- profile->prf[j][seq_length+x] = 0;
- profile->prf[aa2pos[toupper((short)line[i])]][seq_length+x] = 1;
- ++x;
- }
- ++i;
- }
- seq_length += x;
-
- }
- else
- break;
- }
- profile->length = seq_length;
-
-}
-
-
-
-/**
- * \brief Constructs index of fasta_file.
- *
- * The index is of length n (n= number of sequences in the given multi fasta file.). In the order of appearance in the file the position of each sequence in the file is stored.
- * \param file_name The file with the sequences.
- * \param file_positions Array to save the positions in.
- * \return The number of sequences in \a file_name.
- */
-int
-make_index_of_file(char *file_name, //file with sequences
- long **file_positions) //array to save the positions
-{
- const int LINE_LENGTH = 150;
- (*file_positions) = vcalloc(ENLARGEMENT_PER_STEP, sizeof(long));
-
- FILE *file = fopen(file_name,"r");
-
- char line[LINE_LENGTH];
-
- int num_of_sequences = 0;
- int mem_for_pos = ENLARGEMENT_PER_STEP;
-
-
- if (file == NULL)
- {
- printf("FILE NOT FOUND\n");
- exit(1);
- }
- else
- {
- (*file_positions)[num_of_sequences] = ftell(file);
- while(fgets(line, LINE_LENGTH , file)!=NULL)
- {
-// int length = strlen(line);
- if (line[0] == '>')
- {
- ++num_of_sequences;
-
- if (num_of_sequences == mem_for_pos)
- {
- (*file_positions) = vrealloc((*file_positions),(ENLARGEMENT_PER_STEP+mem_for_pos) * sizeof(long));
- mem_for_pos += ENLARGEMENT_PER_STEP;
- }
- }
- (*file_positions)[num_of_sequences] = ftell(file);
- }
- }
-
- fclose(file);
- return num_of_sequences;
-}
-
-
-
-/**
- * \brief Reads a profile from a profile file.
- *
- * \param prof A Fastal_profile object to save the profile in.
- * \param profile_f file where the profile is stored.
- * \param position Position of the profile in \a profile_f.
- * \param param_set The parameter set for Fastal
- */
-
-void
-profile_file2profile(Fastal_profile *prof, //structure to save the profile in
- FILE *profile_f, //file where the profile is stored
- long position, //position in profile_f where the profile is stored
- Fastal_param *param_set)
-{
-
- int alphabet_size = param_set->alphabet_size;
-
- int *aa2pos = &(param_set->char2pos[0]);
-
-
- fseek(profile_f,position,SEEK_SET);
- const int LINE_LENGTH = 500;
- char line[500];
-
- fgets(line, LINE_LENGTH, profile_f);
-
- prof->prf_number = atoi(line);
- fgets(line, LINE_LENGTH, profile_f);
- prof->is_leaf = atoi(line);
-
- fgets(line, LINE_LENGTH, profile_f);
- prof->length = atoi(line);
- fgets(line, LINE_LENGTH, profile_f);
- prof->weight = atoi(line);
- fgets(line, LINE_LENGTH, profile_f);
- prof->number_of_sequences = atoi(line);
- int i,j;
- if (prof->length > prof->allocated_memory)
- for (i = 0;i < alphabet_size; ++i)
- {
- prof->prf[i] = vrealloc(prof->prf[i],prof->length*sizeof(int));
- }
- prof->allocated_memory = prof->length;
- char delims[] = " ";
- char *result = NULL;
- char *result_num = NULL;
-
- int length = prof->length;
-
- for (i = 0; i < length; ++i)
- {
- for(j = 0; j<alphabet_size; ++j )
- prof->prf[j][i] = 0;
- fgets(line, LINE_LENGTH , profile_f);
- result = strtok( line, delims );
-
- while( result != NULL)
- {
- result_num = &result[1];
- prof->prf[aa2pos[(short)result[0]]][i] = atoi(result_num);
- result = strtok( NULL, delims );
- }
- }
-}
-
-
-
-/**
- * \brief Writes a profile into a file
- *
- * \param profile Pointer to the profile which has to be saved.
- * \param file A File object (already opened) to write the profile to.
- * \param param_set The parameters for the fastal algorithm.
- */
-void
-profile2file(Fastal_profile *profile, //the profile to save
- FILE* file, //file to save in
- Fastal_param *param_set)
-{
- int alphabet_size = param_set->alphabet_size;
-
- char *pos2aa = &(param_set->pos2char[0]);
-
- fseek(file,0,SEEK_SET);
-
- fprintf(file,"%i\n", profile->prf_number);
-
-
- fprintf(file,"%i\n", profile->is_leaf);
- fprintf(file,"%i\n", profile->length);
- fprintf(file,"%i\n", profile->weight);
- int i = 0, j = 0;
- int max = profile->length;
- int x= 0;
- --alphabet_size;
- while (i < max)
- {
- for (j = 0; j < alphabet_size; ++j)
- if (profile->prf[j][i] > 0)
- {
- if (x)
- fprintf(file," %c%i", pos2aa[j],profile->prf[j][i]);
- else
- fprintf(file,"%c%i", pos2aa[j],profile->prf[j][i]);
- x = 1;
- }
- if (profile->prf[j][i] > 0)
- {
- if (x)
- fprintf(file," %c%i", pos2aa[j],profile->prf[j][i]);
- else
- fprintf(file,"%c%i", pos2aa[j],profile->prf[j][i]);
- x = 1;
- }
- x = 0;
- fprintf(file,"\n");
- ++i;
- }
- fprintf(file,"*\n");
-}
-
-
-
-/**
-* Reads the profile out of an alignment (NOT IN USE)
-*/
-// void
-// file2profile(FILE* profile_f, //file to read the profile of
-// Fastal_profile *prof, //profile saved in here
-// int prf_number, //number of the profile
-// Fastal_param *param_set)
-// {
-// int alphabet_size = param_set->alphabet_size;
-//
-// int *aa2pos = &(param_set->char2pos[0]);
-//
-//
-// fseek(profile_f,0,SEEK_SET);
-// const int LINE_LENGTH = 500;
-// char line[500];
-//
-// fgets(line, LINE_LENGTH, profile_f);
-// prof->prf_number = atoi(line);
-// fgets(line, LINE_LENGTH, profile_f);
-// prof->is_leaf = atoi(line);
-//
-// fgets(line, LINE_LENGTH, profile_f);
-// prof->length = atoi(line);
-//
-// fgets(line, LINE_LENGTH, profile_f);
-// prof->weight = atoi(line);
-// int i,j;
-// if (prof->length > prof->allocated_memory)
-// for (i = 0;i < alphabet_size; ++i)
-// {
-// prof->prf[i] = vrealloc(prof->prf[i],prof->length*sizeof(int));
-// }
-//
-// char delims[] = " ";
-// char *result = NULL;
-// char *result_num = NULL;
-//
-// int length = prof->length;
-//
-// for (i = 0; i < length; ++i)
-// {
-// for(j = 0; j<alphabet_size; ++j )
-// prof->prf[j][i] = 0;
-// fgets(line, LINE_LENGTH , profile_f);
-// result = strtok( line, delims );
-//
-// while( result != NULL)
-// {
-// result_num = &result[1];
-// prof->prf[aa2pos[(short)result[0]]][i] = atoi(result_num);
-// result = strtok( NULL, delims );
-// }
-// }
-// }
-
-
-
-
-
-
-/**
- * \brief Writes the sequence into the alignment_file.
- *
- * \param aligned_sequence Pattern of aligned sequence.
- * \param sequence_file File with sequences.
- * \param sequence_position Positions of sequences in \a sequence_file.
- * \param alignment_file The file to write the sequence into.
- *
-*/
-void
-edit_seq2aligned_seq(char *aligned_sequence, //pattern for aligned sequence
- FILE *sequence_file, //file with all the sequences
- long sequence_position, //position in sequence file with the correct sequence
- FILE *alignment_file) //file to write the alignment into
-{
- fseek(sequence_file, sequence_position, SEEK_SET);
- const int LINE_LENGTH = 300;
- char line[LINE_LENGTH];
- fgets (line, LINE_LENGTH , sequence_file);
- fprintf(alignment_file,"%s", line); //writing of sequence name
- int pos = 0;
- int i = 0;
- while(fgets(line, LINE_LENGTH, sequence_file)!=NULL)
- {
- if (line[0] != '>')
- {
-
- line[LINE_LENGTH-1] = '\n';
- i = 0;
- while ((line[i] != '\n') && (line[i] != '\0'))
- {
- while (aligned_sequence[pos] == '-')
- {
-
- fprintf(alignment_file,"-");
- ++pos;
- }
- if (line[i] != '-')
- {
-
- fprintf(alignment_file,"%c",line[i]);
- ++pos;
- }
- ++i;
-// ++pos;
- }
- }
- else
- break;
- }
- while (aligned_sequence[pos] != '\n')
- {
- fprintf(alignment_file,"-");
- ++pos;
- }
- fprintf(alignment_file,"\n");
-}
-
-
-
-/**
- * \brief Recursive function to turn the edit_file into the alignment.
- *
- * \param sequence_file File with all sequences.
- * \param sequence_position The array of sequence positions in \a sequence_file
- * \param edit_file File to safe the edit profiles in.
- * \param edit_positions Array saving the coorespondence between edit profile and position in \a edit_file
- * \param node_number The current node.
- * \param number_of_sequences The number of sequences.
- * \param aligned_sequence The sequence that is edited.
- * \param alignment_length The length of the alignment.
- * \param edit_seq_file File that saves the edited_sequences of the internal nodes.
- * \param offset Saves the size of the edited_sequences.
- * \param alignment_file File where the alignment is saved.
- */
-void
-edit2alignment(FILE *sequence_file, //sequence file
- long *seq_positions, //sequence positions
- FILE *edit_file, //file saving the edit profiles
- long *edit_positions, //array saving the correspondence between edit profile and position in edit_file
- int node_number, //the current node
- int number_of_sequences, //number of sequences
- char *aligned_sequence, //the sequence that is edited
- int alignment_length, //length of the alignment - and thus of aligned_sequence
- FILE *edit_seq_file, //file saving the edited_sequences of the internal nodes
- int offset, //saves the size of the edited_sequence
- FILE* alignment_file) //file saving the alignments
-{
- fseek(edit_file, edit_positions[node_number-number_of_sequences], SEEK_SET);
- const int LINE_LENGTH = 50;
- char line[LINE_LENGTH];
- fgets(line, LINE_LENGTH , edit_file);
- int child1 = atoi(line);
- fgets(line, LINE_LENGTH , edit_file);
- int child2 = atoi(line);
- fgets(line, LINE_LENGTH , edit_file);
- int is_leaf1 = atoi(line);
- fgets(line, LINE_LENGTH , edit_file);
- int is_leaf2 = atoi(line);
-
-// static char seq_line[10];
-// printf("SO EINE VERDAMMTE SCHEISE ABER AUCH\n");
- char x;
- int number;
- int pos = 0;
-
- //first child
- while(fgets(line, LINE_LENGTH , edit_file)!=NULL)
- {
-
- x = line[0];
- if (x == '*')
- break;
- number = atoi(&line[1]);
- if (x == 'M')
- {
- while (number > 0)
- {
- if (aligned_sequence[pos] == 'X')
- --number;
- ++pos;
- }
- }
- else if (x == 'I')
- {
- while (number > 0)
- {
- if (aligned_sequence[pos] == 'X')
- --number;
- ++pos;
- }
- }
- else if (x == 'D')
- {
- while (number > 0)
- {
- if (aligned_sequence[pos] == 'X')
- {
- aligned_sequence[pos] = '-';
- --number;
- }
- ++pos;
- }
- }
- }
-
- if (is_leaf1)
- {
-// printf("LEAF\n");
- edit_seq2aligned_seq(aligned_sequence, sequence_file, seq_positions[child1], alignment_file);
- }
- else
- {
- fprintf(edit_seq_file, "%s", aligned_sequence);
- edit2alignment(sequence_file, seq_positions, edit_file, edit_positions, child1, number_of_sequences, aligned_sequence, alignment_length, edit_seq_file, offset, alignment_file);
- }
-
- //second child
- fseek(edit_seq_file, offset, SEEK_CUR);
- fgets(aligned_sequence, alignment_length+3, edit_seq_file);
- fseek(edit_seq_file, offset, SEEK_CUR);
-
- pos = 0;
- fseek(edit_file, edit_positions[node_number-number_of_sequences], SEEK_SET);
- while(fgets(line, LINE_LENGTH , edit_file)!=NULL)
- {
- x = line[0];
- if (x == '*')
- break;
- number = atoi(&line[1]);
- if (x == 'M')
- {
- while (number > 0)
- {
- if (aligned_sequence[pos] == 'X')
- --number;
- ++pos;
- }
- }
- else if (x == 'I')
- {
- while (number > 0)
- {
- if (aligned_sequence[pos] == 'X')
- {
- aligned_sequence[pos] = '-';
- --number;
- }
- ++pos;
- }
- }
- else if (x == 'D')
- {
- while (number > 0)
- {
- if (aligned_sequence[pos] == 'X')
- --number;
- ++pos;
- }
- }
- }
-
- if (is_leaf2)
- {
- edit_seq2aligned_seq(aligned_sequence, sequence_file, seq_positions[child2], alignment_file);
- }
- else
- {
- fprintf(edit_seq_file, "%s", aligned_sequence);
- edit2alignment(sequence_file, seq_positions, edit_file, edit_positions, child2, number_of_sequences, aligned_sequence, alignment_length, edit_seq_file, offset, alignment_file);
- }
-}
-
-
-
-
-// * The file has the follwing format (# and text behind are only comments and not included into the file):
-// * 1 # Number of profile.
-// * 1 # is leaf.
-// * 5 # Number of columns in the profile.
-// * 4A 1C # In the first column are 4 'A' and 1 'C'
-// * 3G # In the second column are 3 'G'
-// * 5A # In the third column are 5 'A'
-// * 2A 3C # In the fourth column are 2 'A' and 3 'C'
-// * 5C # In the fifth column are 5 'C'
-// * * # Marks the end of this profile
-
-
-
-/**
- * \brief Writes a profile to a file.
- *
- * \param sumup_prf The profile array, not a real profile.
- * \param length The length of the profile. The format can be seen in ./test.txt
- * \param file The FILE object to write the the profile into.
- * \param is_dna The type of sequence.
- * \param number The number of the profile.
- */
-void
-write2file(int **sumup_prf,
- int length,
- FILE *file,
- int number,
- int num_sequences,
- Fastal_param *param_set)
-{
- char *pos2aa = &(param_set->pos2char[0]);
- fprintf(file,"%i\n0\n%i\n1\n%i\n",number, length, num_sequences );
- int i, j;
- int alphabet_size = param_set->alphabet_size;
-
- i = 0;
- int x = 0;
- while (i < length)
- {
- for (j = 0; j < alphabet_size; ++j)
- if (sumup_prf[j][i] > 0)
- {
- if (x)
- fprintf(file," %c%i", pos2aa[j],sumup_prf[j][i]);
- else
- fprintf(file,"%c%i", pos2aa[j],sumup_prf[j][i]);
- x = 1;
- }
- x = 0;
- fprintf(file,"\n");
- ++i;
- }
- fprintf(file,"*\n");
-}
-
-
-
-//************************************* main function of the fasta algorithm ***********************************************
-
-/**
-* \brief main of the fastal algorithm
-*/
-int
-fastal_main(int argc, //number of arguments
- char **argv) //arguments first = fastal, second = tree
-{
-
- int i;
- //pointer to arguments
- void * method_arguments_p;
- int (*alignment_function)(Fastal_profile **profiles, Fastal_param *param_set, void *method_arguments_p, int is_dna, FILE *edit_file, FILE *prof_file, int number);
-
- struct Fastal_arguments arguments;
-
- arg_parse (argc, argv, &arguments);
-
- Fastal_param *param_set = vcalloc(1,sizeof(Fastal_param));
-
- fill_parameters(arguments.is_dna, param_set, arguments.method, arguments.diag_method, arguments.mat);
- param_set->gep = arguments.gep;
- param_set->gop = arguments.gop;
-
-// printf("%s",arguments.mat);
- if (arguments.evaluate)
- {
- printf("Calculate Sum of pairs Score\n");
- printf("Score: %f\n", calculate_sum_of_pairs_score_affine(arguments.sequence_file, param_set->M, param_set->gop, param_set->gep));
- vfree(param_set);
- exit(0);
- }
-
-
- if (arguments.agreement_score)
- {
- complete_agreement_score(arguments.aln2test, arguments.aln_ref);
- return 0;
- }
-
-
- if (arguments.num_ref_aln)
- {
- compute_ref_alignments(arguments.sequence_file, arguments.aln_ref, arguments.num_ref_aln, arguments.num_seq_in_ref);
- return 0;
- }
-
-
-
- int alphabet_size = param_set->alphabet_size;
-
-
- //sequence file management
-// char **seq_name;
- long *file_positions = NULL;
- long **tmp = &file_positions;
- int number_of_sequences = make_index_of_file(arguments.sequence_file, tmp);
-
-
-
- //edit file management
-
-// long current_edit_pos;
- long *edit_positions = vcalloc(number_of_sequences,sizeof(long));
-
-
- //profile management
- Fastal_profile **profiles = vcalloc(3,sizeof(Fastal_profile*));
- initiate_profiles(profiles, param_set);
- FILE * prof_file = fopen(vtmpnam(NULL),"w+");
- long* profile_positions = vcalloc(4,sizeof(long*));
- int max_prof = 4;
- int saved_prof = 0;
-
-
- printf("METHOD: %s\n",param_set->method);
- if (strcmp(param_set->method, "fast") == 0)
- {
- method_arguments_p = vcalloc(1,sizeof(Sparse_dynamic_param));
- fill_arguments_sparse((Sparse_dynamic_param*)method_arguments_p);
- alignment_function = sparse_dyn;
- }
- else if (strcmp(param_set->method, "nw") == 0)
- {
- method_arguments_p = vcalloc(1,sizeof(Nw_param));
- fill_arguments_nw((Nw_param*)method_arguments_p, alphabet_size);
- alignment_function = nw_dyn;
- }
- else if (strcmp(param_set->method, "gotoh") == 0)
- {
- method_arguments_p = vcalloc(1,sizeof(Gotoh_param));
- fill_arguments_gotoh((Gotoh_param*)method_arguments_p, alphabet_size);
- alignment_function = gotoh_dyn;
- }
- else if (strcmp(param_set->method, "udisc") == 0)
- {
- method_arguments_p = vcalloc(1,sizeof(Udisc_param));
- fill_arguments_gotoh((Gotoh_param*)method_arguments_p, alphabet_size);
- alignment_function = gotoh_dyn;
-
- }
- else
- {
- printf("ERROR - METHOD");
- exit(1);
- }
-
-
- if (arguments.gap_iterate)
- {
- iterate(param_set, method_arguments_p, arguments.sequence_file, arguments.output_file, arguments.gap_iterate);
- return 0;
- }
-
- if (arguments.tree_file == NULL)
- {
- arguments.tree_file = vtmpnam(NULL);
- printf("CONSTRUCT TREE\n");
- if (strcmp(arguments.tree_method, "parttree")==0)
- {
- make_partTree(arguments.sequence_file, arguments.tree_file, arguments.tree_param1, arguments.tree_param2, arguments.is_dna, 0);
- }
- else if (strcmp(arguments.tree_method, "oligotree") == 0)
- {
- compute_oligomer_distance_tree(arguments.sequence_file, param_set->char2pos, arguments.tree_file, arguments.tree_param2, arguments.tree_param1, param_set->alphabet_size);
- }
-
- if (arguments.tree_only == 1)
- return 0;
- }
-
-
- if (arguments.tree_out == 1)
- {
- char tree_out_file_name[500];
- sprintf(tree_out_file_name, "%s.tree",arguments.output_file);
- char const LINE_LENGTH = 50;
- char line[LINE_LENGTH];
-
- FILE* in = fopen(arguments.tree_file, "r");
- FILE* out = fopen(tree_out_file_name, "w");
- while( (fgets(line, LINE_LENGTH, in)) != NULL)
- fprintf(out, "%s", line);
- fclose(in);
- fclose(out);
- }
-
-
-
-
-
- FILE *seq_file = fopen(arguments.sequence_file,"r");
-// FILE *edit_file = fopen(vtmpnam(NULL),"w+");
- FILE *edit_file = fopen("aha","w+");
-
- printf("CONSTRUCT ALIGNMENT\n");
- FILE *tree_file = fopen(arguments.tree_file,"r");
- const int LINE_LENGTH = 100;
- char line[LINE_LENGTH];
- char delims[] = " ";
- int node[3];
- int alignment_length = -1;
- node[2] = -1;
-
-
- //bottom-up traversal
- while(fgets(line, LINE_LENGTH, tree_file)!=NULL)
- {
- //read profiles
- node[0] = atoi(strtok(line,delims));
- node[1] = atoi(strtok(NULL,delims));
- node[2] = atoi(strtok(NULL,delims));
-
- //getting profile of second child
- if (node[1] < number_of_sequences)
- {
- file_pos2profile(seq_file, file_positions[node[1]], profiles[1], node[1], param_set); //profile to save into
- }
- else
- {
- profile_file2profile(profiles[1], prof_file, profile_positions[--saved_prof], param_set);
- fseek (prof_file , profile_positions[saved_prof] , SEEK_SET);
- }
-
- //getting profile of first child
- if (node[0] < number_of_sequences)
- {
- file_pos2profile(seq_file, file_positions[node[0]], profiles[0], node[0], param_set); //profile to save into
- }
- else
- {
- profile_file2profile(profiles[0], prof_file, profile_positions[--saved_prof], param_set);
- fseek (prof_file , profile_positions[saved_prof] , SEEK_SET);
- }
-
-
- if (saved_prof == max_prof)
- {
- max_prof += 5;
- profile_positions = vrealloc(profile_positions, max_prof*sizeof(long));
- }
-
- edit_positions[node[2]-number_of_sequences] = ftell(edit_file);
- profile_positions[saved_prof] = ftell(prof_file);
- ++saved_prof;
-
- //aligning the sequences
- alignment_length = alignment_function(profiles, param_set, method_arguments_p, arguments.is_dna, edit_file, prof_file, node[2]);
- }
-
-
-// bottom-down traversal (edit_files --> alignment)
-// tmp_out_file_name = vtmpnam(NULL);
-
-// FILE *alignment_file = fopen(tmp_out_file_name, "w");
- FILE *alignment_file = fopen(arguments.output_file, "w");
- FILE *edit_seq_file = fopen(vtmpnam(NULL),"w+");
-
- char *aligned_sequence = vcalloc(alignment_length+3, sizeof(char));
-
-
- long offset = ftell(edit_seq_file);
- for (i = 0; i < alignment_length; ++i)
- {
- fprintf(edit_seq_file, "X");
- aligned_sequence[i] = 'X';
- }
- aligned_sequence[i]= '\n';
- aligned_sequence[i+1]= '\0';
- fprintf(edit_seq_file, "\n");
- offset = (ftell(edit_seq_file) - offset)*-1;
-
-
- edit2alignment(seq_file, file_positions, edit_file, edit_positions, node[2], number_of_sequences, aligned_sequence, alignment_length, edit_seq_file, offset, alignment_file);
- fclose(alignment_file);
- fclose(tree_file);
- fclose(edit_file);
- fclose(seq_file);
- fclose(edit_seq_file);
-
- //set stuff for the next cycle
-// arguments.sequence_file = tmp_out_file_name;
-
-
-// // //DEBUG
-// // char copy_command[500];
-// // sprintf(copy_command, "cp %s %s_%i", tmp_out_file_name, arguments.output_file, cycle);
-// // system(copy_command);
-// ++cycle;
-// }
-
-// printf("HERE_COPY\n");
-// char copy_command[2000];
-// sprintf(copy_command, "mv %s %s", tmp_out_file_name, arguments.output_file);
-// printf("%s\n", copy_command);
-// int error = system(copy_command);
-// printf("ERROR %i\n", error);
-
-
- //free_memory & close files
- fclose(prof_file);
- free_fastal_profile(profiles[0], alphabet_size);
- free_fastal_profile(profiles[1], alphabet_size);
- vfree(profiles);
- vfree(profile_positions);
-
-
-
-// number_of_sequences
-
-
- if (arguments.score)
- {
- printf("Calculate Score\n");
- double aln_score = calculate_sum_of_pairs_score_affine(arguments.output_file, param_set->M, param_set->gop, param_set->gep);
- printf("SCORE: %f\n", aln_score);
- }
-
-
-
-
-
- if (!strcmp(param_set->method, "fast"))
- {
- free_sparse((Sparse_dynamic_param*)method_arguments_p);
- }
- else if (!strcmp(param_set->method, "nw"))
- {
- free_nw((Nw_param*)method_arguments_p, alphabet_size);
- }
- else if (!strcmp(param_set->method, "gotoh"))
- {
- free_gotoh((Gotoh_param*)method_arguments_p, alphabet_size);
- }
-
- vfree(param_set);
-
- //free_memory & close files
-
- vfree(edit_positions);
-
-
- return 0;
-}
-
-
-
-
-//****************** toolbox ***************************
-
-
-/**
- * \brief Enlargement of the dynamic programming matrix in case it is to small.
- *
- * \param dyn_matrix The dynamic programming matrix.
- * \param old_length1 Current size of dimension 1.
- * \param old_length2 Current size of dimension 2.
- * \param length1 New size of dimension 1.
- * \param length2 New size of dimension 2.
- * \return Pointer to the new array.
- */
-double**
-resize_dyn_matrix(double **dyn_matrix, //the dynamic programming matrix
- int old_length1, //old length of dimension 1
- int old_length2, //old length of dimension 2
- int length1, //new minimum length of dimension 1
- int length2) //new maximum length of dimension 2
-{
- int i;
- if (old_length1 < length1)
- {
- dyn_matrix = vrealloc(dyn_matrix,length1*sizeof(double*));
- for (i = old_length1; i < length1; ++i)
- dyn_matrix[i] = vcalloc(old_length2,sizeof(double));
- old_length1 = length1;
- }
-
- if (old_length2 < length2)
- {
- for (i = 0;i<old_length1; ++i)
- dyn_matrix[i] = vrealloc(dyn_matrix[i], length2*sizeof(double));
- old_length2 = length2;
- }
- return dyn_matrix;
-}
-
-
-
-/**
- * \brief Frees the memory of a dynamic programming matrix.
- *
- * \param length1 Size of the first dimension of the matrix.
- * \param dyn_matrix The dynamic programming matrix.
- */
-void
-free_dyn_matrix(int length1, //length of first dimension
- double **dyn_matrix) //dynamic matrix
-{
- int i = 0;
- for (; i<length1; ++i)
- vfree(dyn_matrix[i]);
- vfree(dyn_matrix);
-}
-
-
-
-/**
- * \brief Initialises the profiles with basic values.
- *
- * \param profiles Array of 3 profiles.
- * \param param_set The fastal parameters
- */
-void
-initiate_profiles(Fastal_profile **profiles, //profiles pointer
- Fastal_param *param_set)
-{
- int alphabet_size = param_set->alphabet_size;
- int i,j;
- for (i =0; i < 3; ++i)
- {
- profiles[i] = vcalloc(1,sizeof(Fastal_profile));
- profiles[i]->weight = 1;
- profiles[i]->is_leaf = 1;
- profiles[i]->prf = vcalloc(alphabet_size, sizeof(int*));
- for (j = 0; j < alphabet_size; ++j)
- {
- profiles[i]->prf[j] = vcalloc(PROFILE_ENLARGEMENT, sizeof(int));
- }
- profiles[i]->allocated_memory = PROFILE_ENLARGEMENT;
- }
-}
-
-
-
-
-/**
- * \brief frees all memory occupied by the profile
- *
- * \param profile The profile to free.
- * \param alphabet_size The alphabet_size.
- */
-void
-free_fastal_profile(Fastal_profile* profile, int alphabet_size)
-{
- --alphabet_size;
- for (;alphabet_size >= 0; --alphabet_size)
- vfree(profile->prf[alphabet_size]);
- vfree(profile->prf);
-}
-
-
-/**
- * \brief Initialize the Fastal parameter set.
- *
- * \param is_dna 1 when sequences are dna, 0 when not
- * \param param_set The fastal parameter set.
- * \param method The method to use in Fastal.
-*/
-void
-fill_parameters(int is_dna, Fastal_param *param_set, char *method, char *diag_method, char *mat)
-{
- sprintf(param_set->method,"%s",method);
- sprintf(param_set->diag_method,"%s",diag_method);
- int i;
- printf("%s",mat);
- param_set->M = read_matrice(mat);
- if (is_dna)
- {
- param_set->alphabet_size = 4;
- char tmp1[] = {'A','C','G','T'};
-
-// int tmp2[] = { 0, 1, 1, 0, -1, -1, 2, 0, -1, -1, 3, -1, 1, 0, -1, -1, -1, 0, 1, 3, 4, -1, 3, -1, 1, -1};
- for (i = 0; i<param_set->alphabet_size; ++i)
- param_set->pos2char[i] = tmp1[i];
-// for (i = 0; i<26; ++i)
-// param_set->char2pos[i] = tmp2[i];
- param_set->char2pos['A'] = 0;
- param_set->char2pos['B'] = 1;
- param_set->char2pos['C'] = 1;
- param_set->char2pos['D'] = 0;
- param_set->char2pos['G'] = 2;
- param_set->char2pos['H'] = 0;
- param_set->char2pos['K'] = 3;
- param_set->char2pos['M'] = 1;
- param_set->char2pos['N'] = 0;
- param_set->char2pos['R'] = 0;
- param_set->char2pos['S'] = 1;
- param_set->char2pos['T'] = 3;
- param_set->char2pos['U'] = 3;
- param_set->char2pos['W'] = 3;
- param_set->char2pos['Y'] = 1;
-// param_set->M[0][3] = param_set->M[3][0] = -10;
-// param_set->M[1][2] = param_set->M[2][1] = -10;
-// param_set->M[0][1] = param_set->M[0][2] = param_set->M[1][0] = param_set->M[2][0] = -10;
-// param_set->M[3][1] = param_set->M[3][2] = param_set->M[1][3] = param_set->M[2][3] = -10;
- }
- else
- {
- param_set->alphabet_size = 21;
- char tmp1[] = {'A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y','X'};
-// int tmp2[] = { 0, 20, 1, 5, 16, 4, 2, 6, 7, 21, 8, 9, 10, 11, -1, 12, 13, 14, 15, 3, -1, 17, 18, 22, 19, 23};
- for (i = 0; i<param_set->alphabet_size; ++i)
- param_set->pos2char[i] = tmp1[i];
-// for (i = 0; i<26; ++i)
-// param_set->char2pos[i] = tmp2[i];
- param_set->char2pos['A'] = 0;
- param_set->char2pos['B'] = 20;
- param_set->char2pos['C'] = 1;
- param_set->char2pos['D'] = 2;
- param_set->char2pos['E'] = 3;
- param_set->char2pos['F'] = 4;
- param_set->char2pos['G'] = 5;
- param_set->char2pos['H'] = 6;
- param_set->char2pos['I'] = 7;
- param_set->char2pos['J'] = 20;
- param_set->char2pos['K'] = 8;
- param_set->char2pos['L'] = 9;
- param_set->char2pos['M'] = 10;
- param_set->char2pos['N'] = 11;
- param_set->char2pos['P'] = 12;
- param_set->char2pos['Q'] = 13;
- param_set->char2pos['R'] = 14;
- param_set->char2pos['S'] = 15;
- param_set->char2pos['T'] = 16;
- param_set->char2pos['V'] = 17;
- param_set->char2pos['W'] = 18;
- param_set->char2pos['X'] = 20;
- param_set->char2pos['Y'] = 19;
- param_set->char2pos['X'] = 20;
- }
-}
-
-
-
-/******************************COPYRIGHT NOTICE*******************************/
-/*© Centro de Regulacio Genomica */
-/*and */
-/*Cedric Notredame */
-/*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
-/*All rights reserved.*/
-/*This file is part of T-COFFEE.*/
-/**/
-/* T-COFFEE is free software; you can redistribute it and/or modify*/
-/* it under the terms of the GNU General Public License as published by*/
-/* the Free Software Foundation; either version 2 of the License, or*/
-/* (at your option) any later version.*/
-/**/
-/* T-COFFEE is distributed in the hope that it will be useful,*/
-/* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
-/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
-/* GNU General Public License for more details.*/
-/**/
-/* You should have received a copy of the GNU General Public License*/
-/* along with Foobar; if not, write to the Free Software*/
-/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
-/*............................................... |*/
-/* If you need some more information*/
-/* cedric.notredame@europe.com*/
-/*............................................... |*/
-/**/
-/**/
-/* */
-/******************************COPYRIGHT NOTICE*******************************/