8 #include "io_lib_header.h"
9 #include "util_lib_header.h"
10 #include "define_header.h"
11 #include "dp_lib_header.h"
12 #include "fastal_lib_header.h"
13 #include "fast_tree_header.h"
16 //TODO: seq_pair2diagonal delete num points from parameters
20 //Fastal_param *param_set;
23 /*! \mainpage T-Coffee Index Page
25 * \section intro_sec Introduction
27 * This is the introduction.
29 * \section install_sec Installation
31 * \subsection step1 Step 1: Opening the box
34 * \section fastal_sec Fastal
36 * This program is a very fast aligner. It is capable of aligning huge sets of sequences because it keeps as much as necessary on hard disk.
47 * \brief Source code for the fastal algorithm
52 * \brief Calculates scores for diagonal segments.
54 * \param seq1 Sequence 1
55 * \param seq2 Sequence 2
56 * \param *diagonals The diagonals. Three consecutive entries belong togehter. 1. pos in \a seq1 , 2. pos in \a seq2 and 3. length of diagonal
57 * \param num_diagonals Number of diagonals
58 * \param s1_length Length of \a seq1
59 * \param list length of list.
60 * \param list An 2-dim array to save the scores in.
64 diag2pair_list(char* seq1,
70 int *current_num_points,
71 int additional_needed,
72 Fastal_param *param_set)
74 int **mat = param_set->M;
75 int i, j, diag_length, pos1, pos2;
76 int **list = list_in[0];
78 // printf("NUM: %i\n",num_diagonals);
80 int l1 = strlen(seq1), l2 = strlen(seq2);
81 int x = *current_num_points;
84 int s1_length = strlen(seq1);
86 for (i = 0; i < num_diagonals; ++i)
88 pos1 = diagonals[i*3];
89 pos2 = diagonals[i*3+1];
90 diag_length = diagonals[i*3+2];
91 mini = MIN(pos1, pos2);
94 while ((pos1 < l1) && (pos2 < l2))
96 if (x==*current_length)
98 *current_length+=1000;
99 list=vrealloc (list,(*current_length)*sizeof(int*));
102 list[x]=vcalloc (7, sizeof (int));
105 list[x][2] = mat[toupper(seq1[pos1])-'A'][toupper(seq2[pos2])-'A'];
112 *current_num_points = x;
117 guessalignment(Fastal_profile prf)
123 fastal_compare (const void * a, const void * b)
125 return (*(int*)a - *(int*)b);
129 diagonals2int(int *diagonals,
134 Fastal_param *param_set)
136 int l1 = strlen(seq1);
137 int l2 = strlen(seq2);
138 int gep = param_set->gep;
140 int current_size = l2+l1;
142 int **list = vcalloc(current_size, sizeof(int*));
143 int *diags = vcalloc(num_diagonals, sizeof(int));
145 // printf("SEQ: %s\nSEQ:%s\n",seq1, seq2);
146 // printf("X: %i\n",num_diagonals);
147 for (i = 0; i < num_diagonals; ++i)
149 diags[i] = l1 - diagonals[i*3] + diagonals[i*3+1];
152 qsort (diags, num_diagonals, sizeof(int), fastal_compare);
154 int *diagx = vcalloc(num_diagonals, sizeof(int));
155 int *diagy = vcalloc(num_diagonals, sizeof(int));
156 int *old_pos = vcalloc(num_diagonals, sizeof(int));
158 //+1 because diagonals start here at position 1, like in "real" dynamic programming
160 for (i = 0; i < num_diagonals; ++i)
165 diagx[i] = l1 - diags[i];
174 for (; i < num_diagonals; ++i)
177 diagy[i] = diags[i]-l1;
183 int **M = param_set->M;
184 int *last_y = vcalloc(l2+1, sizeof(int));
185 int *last_x = vcalloc(l1+1, sizeof(int));
189 list[0] = vcalloc(6, sizeof(int));
196 for (; list_pos < tmp_l2; ++list_pos)
198 list[list_pos] = vcalloc(6, sizeof(int));
199 list[list_pos][0] = 0;
200 list[list_pos][1] = list_pos;
201 last_y[list_pos] = list_pos;
202 list[list_pos][2] = list_pos*gep;
203 list[list_pos][4] = list_pos-1;
212 while (pos_x < tmp_l1)
214 if (list_pos + num_diagonals+2 > current_size)
216 current_size += num_diagonals*1000;
217 list = vrealloc(list, current_size * sizeof(int*));
220 list[list_pos] = vcalloc(6, sizeof(int));
221 list[list_pos][0] = ++pos_x;
222 list[list_pos][1] = 0;
223 list[list_pos][2] = pos_x * gep;
224 list[list_pos][3] = last_y[0];
225 tmpy_value = list_pos;
227 last_x[pos_x] = list_pos;
231 for (i = a; i <= b; ++i)
233 list[list_pos] = vcalloc(6, sizeof(int));
235 list[list_pos][0] = ++diagx[i];
237 list[list_pos][1] = ++diagy[i];
238 list[list_pos][3] = last_y[diagy[i]];
239 list[list_pos][4] = list_pos-1;
240 list[list_pos][5] = last_y[diagy[i]-1];
241 list[list_pos][2] = M[toupper(seq1[diagx[i]-1])-'A'][toupper(seq2[diagy[i]-1])-'A'];
242 last_y[tmpy_pos] = tmpy_value;
243 tmpy_value = list_pos;
248 last_y[tmpy_pos] = tmpy_value;
252 if (list[list_pos-1][1] != l2)
254 list[list_pos] = vcalloc(6, sizeof(int));
255 list[list_pos][0] = pos_x;
256 list[list_pos][1] = l2;
257 list[list_pos][3] = last_y[l2];
259 list[list_pos][2] = -1000;
260 list[list_pos][4] = list_pos-1;
262 list[list_pos][5] = last_x[pos_x-l2];
264 list[list_pos][5] = l2-pos_x;
265 last_y[l2] = list_pos;
271 if ((b >= 0) && (diagy[b] == l2))
274 if ((a >0) && (diagx[a-1] == pos_x))
280 if (list_pos + l2+2 > current_size)
282 current_size += list_pos + l2 + 2;
283 list = vrealloc(list, current_size * sizeof(int*));
288 list[list_pos] = vcalloc(6, sizeof(int));
289 list[list_pos][0] = l1;
290 list[list_pos][1] = 0;
291 list[list_pos][3] = last_x[l1-1];
292 list[list_pos][2] = -1000;
297 for (i = 1; i <= l2; ++i)
299 list[list_pos] = vcalloc(6, sizeof(int));
300 list[list_pos][0] = l1;
301 list[list_pos][1] = i;
302 list[list_pos][3] = last_y[i];
303 list[list_pos][4] = list_pos-1;
305 if ((list[y][0] == l1-1) && (list[y][1] == i-1))
307 list[list_pos][5] = y;
308 list[list_pos][2] = M[toupper(seq1[l1-1])-'A'][toupper(seq2[i-1])-'A'];
314 list[list_pos][5] = last_x[l1-i];
318 list[list_pos][5] = i-l1;
320 list[list_pos][2] = -1000;
325 list[list_pos - l2][2] = -1000;
327 *num_points = list_pos;
331 // for (blb = 0; blb <list_pos; ++blb)
333 // printf("LIST_A: %i %i %i %i %i %i %i %i\n",blb, list[blb][0],list[blb][1],list[blb][3],list[blb][2], list[blb][4], list[blb][5], list[blb][6]);
339 * \brief Makes a sorted list out of diagonals.
341 * \param diagonals A list of diagonals to use during dynamic programming.
342 * \param num_diagonals Number of diagonals.
343 * \param seq1 Sequence 1.
344 * \param seq2 Sequence 2.
345 * \param gep cost for gap extension.
346 * \param num_points Number of points in the list
347 * \return A 2-dim array which contains all points needed for the sparse dynamic programming algorithm.
350 diagonals2int5(int *diagonals,
355 Fastal_profile *prf1,
356 Fastal_profile *prf2,
358 Fastal_param *param_set)
361 int l1 = strlen(seq1);
362 int l2 = strlen(seq2);
364 int gep = param_set->gep;
366 int current_size = l2+l1;
367 int **list = vcalloc(current_size, sizeof(int*));
368 int *diags = vcalloc(num_diagonals, sizeof(int));
370 for (i = 0; i < num_diagonals; ++i)
372 diags[i] = l1 - diagonals[i*3] + diagonals[i*3+1];
376 qsort (diags, num_diagonals, sizeof(int), fastal_compare);
378 int *diagx = vcalloc(num_diagonals, sizeof(int));
379 int *diagy = vcalloc(num_diagonals, sizeof(int));
380 int *old_pos = vcalloc(num_diagonals, sizeof(int));
382 //+1 because diagonals start here at position 1, like in "real" dynamic programming
384 for (i = 0; i < num_diagonals; ++i)
390 diagx[i] = l1 - diags[i];
400 for (; i < num_diagonals; ++i)
403 diagy[i] = diags[i]-l1;
410 int **M = param_set->M;
412 int *last_y = vcalloc(l2+1, sizeof(int));
413 int *last_x = vcalloc(l1+1, sizeof(int));
417 list[0] = vcalloc(6, sizeof(int));
424 for (; list_pos < tmp_l2; ++list_pos)
426 list[list_pos] = vcalloc(6, sizeof(int));
427 list[list_pos][0] = 0;
428 list[list_pos][1] = list_pos;
429 last_y[list_pos] = list_pos;
430 list[list_pos][2] = list_pos*gep;
431 list[list_pos][3] = ++dig_num;
432 list[list_pos][5] = list_pos-1;
439 int bla2, bla3, tmp_x;
444 while (pos_x < tmp_l1)
446 if (list_pos + num_diagonals+2 > current_size)
448 current_size += num_diagonals*50;
449 list = vrealloc(list, current_size * sizeof(int*));
452 list[list_pos] = vcalloc(6, sizeof(int));
453 list[list_pos][0] = ++pos_x;
454 list[list_pos][1] = 0;
455 list[list_pos][2] = pos_x * gep;
456 list[list_pos][3] = --tmp;
457 list[list_pos][4] = last_y[0];
458 tmpy_value = list_pos;
460 last_x[pos_x] = list_pos;
464 for (i = a; i <= b; ++i)
466 list[list_pos] = vcalloc(6, sizeof(int));
467 list[list_pos][0] = ++diagx[i];
468 list[list_pos][1] = ++diagy[i];
469 list[list_pos][3] = diags[i];
471 list[list_pos][4] = last_y[diagy[i]];
472 list[list_pos][5] = list_pos-1;
473 list[list_pos][6] = last_y[diagy[i]-1];
475 list[list_pos][2] = 0;
480 for (bla = 0; bla<10; ++bla)
483 for (bla2 = 0; bla2<10; ++bla2)
485 bla3 += prf2->prf[bla2][diagy[i]-1] * prf1->prf[bla][diagx[i]-1];
486 tmp_x += prf2->prf[bla2][diagy[i]-1] * prf1->prf[bla][diagx[i]-1] * M[pos2char[bla]-'A'][pos2char[bla2] -'A'];
490 list[list_pos][2] = (int)tmp_x / bla3;
492 // for (bla = 0; bla<10; ++bla)
493 // bla2 += prf2->prf[bla][diagy[i]-1];
494 // bla2 = bla2/prf2->num_sequences;
496 // for (bla = 0; bla<10; ++bla)
497 // bla3 += prf1->prf[bla][diagy[i]-1];
499 // bla3 = bla3/prf1->num_sequences;
502 // if ((bla2 > 0.7) && (bla3 > 0.7))
503 // list[list_pos][2] = M[toupper(seq1[diagx[i]-1])-'A'][toupper(seq2[diagy[i]-1])-'A'];
504 // else if ((bla< 0.7) && (bla3 < 0.7))
505 // list[list_pos][2] = M[toupper(seq1[diagx[i]-1])-'A'][toupper(seq2[diagy[i]-1])-'A'] = 3;
507 // list[list_pos][2] = M[toupper(seq1[diagx[i]-1])-'A'][toupper(seq2[diagy[i]-1])-'A'] * ((bla< 0.7) && (bla3 < 0.7));
508 // list[list_pos][2] = M[toupper(seq1[diagx[i]-1])-'A'][toupper(seq2[diagy[i]-1])-'A'];//* ((bla2+bla3)/2);
509 last_y[tmpy_pos] = tmpy_value;
510 tmpy_value = list_pos;
515 last_y[tmpy_pos] = tmpy_value;
519 if (list[list_pos-1][1] != l2)
521 list[list_pos] = vcalloc(6, sizeof(int));
522 list[list_pos][0] = pos_x;
523 list[list_pos][1] = l2;
524 list[list_pos][4] = last_y[l2];
526 list[list_pos][2] = -1000;
527 list[list_pos][3] = l1 - pos_x + l2;
528 list[list_pos][5] = list_pos-1;
530 list[list_pos][6] = last_x[pos_x-l2];
532 list[list_pos][6] = l2-pos_x;
533 last_y[l2] = list_pos;
538 if ((b >= 0) && (diagy[b] == l2))
541 if ((a >0) && (diagx[a-1] == pos_x))
547 if (list_pos + l2+2 > current_size)
549 current_size += list_pos + l2 + 2;
550 list = vrealloc(list, current_size * sizeof(int*));
555 list[list_pos] = vcalloc(6, sizeof(int));
556 list[list_pos][0] = l1;
557 list[list_pos][1] = 0;
558 list[list_pos][3] = ++dig_num;
559 list[list_pos][4] = last_x[l1-1];
560 list[list_pos][2] = -1000;
563 for (i = 1; i <= l2; ++i)
565 list[list_pos] = vcalloc(6, sizeof(int));
566 list[list_pos][0] = l1;
567 list[list_pos][1] = i;
568 list[list_pos][3] = ++dig_num;
569 list[list_pos][4] = last_y[i];
570 list[list_pos][5] = list_pos-1;
572 if ((list[y][0] == l1-1) && (list[y][1] == i-1))
574 list[list_pos][6] = y;
575 list[list_pos][2] = M[toupper(seq1[l1-1])-'A'][toupper(seq2[i-1])-'A'];
581 list[list_pos][6] = last_x[l1-i];
585 list[list_pos][6] = i-l1;
587 list[list_pos][2] = -1000;
592 list[list_pos - l2][2] = -1000;
594 *num_points = list_pos;
602 //************************** sparse dynamic aligning **********************************************************
606 combine_profiles2file(int **prf1,
610 Fastal_param *param_set,
614 int alphabet_size = param_set->alphabet_size;
615 char *pos2aa = &(param_set->pos2char[0]);
620 for (i = 0; i < alphabet_size; ++i)
621 if (prf1[i][pos1] + prf2[i][pos2] > 0)
624 fprintf(prof_f," %c%i", pos2aa[i],prf1[i][pos1]+prf2[i][pos2]);
626 fprintf(prof_f,"%c%i", pos2aa[i],prf1[i][pos1]+prf2[i][pos2]);
629 fprintf(prof_f,"\n");
631 else if (state == 'D')
633 for (i = 0; i < alphabet_size; ++i)
634 if (prf2[i][pos2] > 0)
637 fprintf(prof_f," %c%i", pos2aa[i],prf2[i][pos2]);
639 fprintf(prof_f,"%c%i", pos2aa[i],prf2[i][pos2]);
642 fprintf(prof_f,"\n");
646 for (i = 0; i < alphabet_size; ++i)
647 if (prf1[i][pos1] > 0)
650 fprintf(prof_f," %c%i", pos2aa[i],prf1[i][pos1]);
652 fprintf(prof_f,"%c%i", pos2aa[i],prf1[i][pos1]);
655 fprintf(prof_f,"\n");
661 #define LIN(a,b,c) a[b*5+c]
663 * Calculates a fast and sparse dynamic programming matrix
665 * \param prf1 Profile of first sequence.
666 * \param prf2 Profile of second sequence.
667 * \param param_set The parameter for the alignment.
668 * \param list The list of diagonals.
669 * \param n number of dots.
670 * \param edit_f File to save the edit information.
671 * \param prof_f File to save the profile.
672 * \param node_number Number of the new profile.
675 list2linked_pair_wise_fastal(Fastal_profile *prf1,
676 Fastal_profile *prf2,
677 Fastal_param *param_set,
684 int a,b,c, i, j, LEN=0, start_trace;
685 int pi, pj,ij, delta_i, delta_j, prev_i, prev_j;
687 static long *MI, *MJ, *MM,*MT2;
690 int gop, gep, igop, igep;
693 char **aln,*char_buf;
696 int nomatch = param_set->nomatch;
701 al=declare_char (2,l1+l2+1);
706 gep=igep=param_set->gep;
711 vfree (MI);vfree (MJ); vfree (MM);
712 free_int (slist, -1);
714 slist=declare_int (n,3);
716 MI=vcalloc (5*n, sizeof (long));
717 MJ=vcalloc (5*n, sizeof (long));
718 MM=vcalloc (5*n, sizeof (long));
724 for (b=0; b<5; b++)LIN(MI,a,b)=LIN(MJ,a,b)=LIN(MJ,a,b)=-1000000;
733 if (i==l1 || j==l2)gop=0;
736 if (i==l1 && j==l2)start_trace=a;
737 else if ( i==0 || j==0)
739 LIN(MM,a,0)=-1000000;
742 LIN(MJ,a,0)=-10000000;
747 LIN(MI,a,0)=-10000000;
751 LIN(MI,a,1)=LIN(MJ,a,1)=-1;
752 LIN(MI,a,2)=LIN(MJ,a,2)=i;
753 LIN(MI,a,3)=LIN(MJ,a,3)=j;
764 delta_i=list[a][0]-list[pi][0];
765 delta_j=list[a][1]-list[pj][1];
768 LIN(MI,a,0)=MAX(LIN(MI,pi,0),(LIN(MM,pi,0)+gop))+delta_i*gep;
772 LIN(MI,a,4)=(LIN(MI,pi,0) =(LIN(MM,pi,0)+gop))?'i':'m';
774 LIN(MJ,a,0)=MAX(LIN(MJ,pj,0),(LIN(MM,pj,0)+gop))+delta_j*gep;
779 LIN(MJ,a,4)=(LIN(MJ,pj,0) =LIN(MM,pj,0)+gop)?'j':'m';
781 if (a>1 && (ls=list[a][0]-list[ij][0])==(list[a][1]-list[ij][1]))
783 LIN(MM,a,0)=MAX3(LIN(MM,ij,0),LIN(MI,ij,0),LIN(MJ,ij,0))+list[a][2]-(ls*nomatch);
788 if ( LIN(MM,ij,0)>=LIN(MI,ij,0) && LIN(MM,ij,0)>=LIN(MJ,ij,0))LIN(MM,a,4)='m';
789 else if ( LIN(MI,ij,0) >= LIN(MJ,ij,0))LIN(MM,a,4)='i';
790 else LIN(MM,a,4)='j';
795 LIN(MM,a,0)=UNDEFINED;
801 if (LIN(MM,a,0)>=LIN(MI,a,0) && LIN(MM,a,0) >=LIN(MJ,a,0))MT2=MM;
802 else if ( LIN(MI,a,0)>=LIN(MJ,a,0))MT2=MI;
805 score=MAX3(LIN(MM,a,0), LIN(MI,a,0), LIN(MJ,a,0));
811 while (!(i==0 &&j==0))
814 l=MAX(LIN(MT2,a,2),LIN(MT2,a,3));
815 // HERE ("%c from %c %d %d SCORE=%d [%d %d] [%2d %2d]", T2[a][5],T2[a][4], T2[a][2], T2[a][3], T2[a][0], gop, gep, i, j);
835 else if (l==0) {HERE ("L=0 i=%d j=%d",l, i, j);exit (0);}
838 for (b=0; b<l; b++, LEN++)
840 if (LIN(MT2,a,2)){al[0][LEN]=1;i--;ni++;}
843 if (LIN(MT2,a,3)){al[1][LEN]=1;j--;nj++;}
848 if (LIN(MT2,a,4)=='m')MT2=MM;
849 else if (LIN(MT2,a,4)=='i')MT2=MI;
850 else if (LIN(MT2,a,4)=='j')MT2=MJ;
855 invert_list_char ( al[0], LEN);
856 invert_list_char ( al[1], LEN);
858 fprintf(edit_f, "%i\n%i\n%i\n%i\n",prf1->prf_number, prf2->prf_number, prf1->is_leaf, prf2->is_leaf);
859 fprintf(prof_f, "%i\n0\n%i\n1\n", node_number,LEN);
861 char statec[] = {'M','D','I'};
867 for ( b=0; b< LEN; b++)
869 if ((al[0][b]==1) && (al[1][b]==1))
872 combine_profiles2file(prf1->prf, prf2->prf, i, j, param_set, prof_f, 'M');
877 fprintf(edit_f, "%c%i\n",statec[state], num);
884 else if (al[0][b]==1)
886 // prf1->prf[param_set->alphabet_size-1] += prf2->num_sequences;
887 combine_profiles2file(prf1->prf, prf2->prf, i, j, param_set, prof_f, 'I');
891 fprintf(edit_f, "%c%i\n",statec[state], num);
898 else if (al[1][b]==1)
900 // prf2->prf[param_set->alphabet_size-1] += prf1->num_sequences;
901 combine_profiles2file(prf1->prf, prf2->prf, i, j, param_set, prof_f, 'D');
905 fprintf(edit_f, "%c%i\n",statec[state], num);
915 fprintf(edit_f, "%c%i\n",statec[state], num);
920 fprintf(edit_f,"*\n");
921 fprintf(prof_f,"*\n");
933 * \brief Tuns a profile into a consensus sequence.
935 * The character with the highest number of occurences is used as consensus. Gaps are not included. For example: 10 '-' and one 'A' would give 'A' as consensus.
936 * \param profile The profile.
937 * \param file_name Name of the file to save the consensus sequence in.
938 * \param param_set The parameter of the fastal algorithm.
939 * \return the sequence
942 profile2consensus(Fastal_profile *profile, char *file_name, Fastal_param *param_set)
944 FILE *cons_f = fopen(file_name,"w");
945 fprintf(cons_f, ">%i\n", profile->prf_number);
946 char* seq = vcalloc(profile->length+1, sizeof(char));
949 int alphabet_size = param_set->alphabet_size;
950 int **prf = profile->prf;
951 char *pos2char = param_set->pos2char;
952 for (i = 0; i < profile->length; ++i)
955 for (j = 0; j < alphabet_size; ++j)
957 if (prf[j][i] > most)
963 seq[i] = pos2char[most_pos];
964 fprintf(cons_f, "%c",pos2char[most_pos]);
967 fprintf( cons_f, "\n");
976 * \brief Calculates the diagonals between two sequences.
978 * Uses bl2seq to calculate the diagonals.
979 * \param seq_file1 File with sequence 1.
980 * \param seq_file2 File with sequence 2.
981 * \param diagonals An array where the diagonal points will be stored.
982 * \param dig_length length of \a diagonals .
983 * \param num_points Number of points in all diagonals.
984 * \return number of diagonals;
987 seq_pair2blast_diagonal(char *seq_file_name1,
988 char *seq_file_name2,
995 int *diag = vcalloc(l1 + l2, sizeof(int));
996 char *out_file = vtmpnam(NULL);
997 char blast_command[600];
1000 sprintf(blast_command, "bl2seq -p blastn -i %s -j %s -D 1 -g F -o %s", seq_file_name1, seq_file_name2, out_file);
1002 sprintf(blast_command, "bl2seq -p blastp -i %s -j %s -D 1 -g F -o %s", seq_file_name1, seq_file_name2, out_file);
1003 system(blast_command);
1005 int *diags = diagonals[0];
1006 FILE *diag_f = fopen(out_file,"r");
1008 fgets(line, 300, diag_f);
1009 fgets(line, 300, diag_f);
1010 fgets(line, 300, diag_f);
1013 char delims[] = "\t";
1014 char *result = NULL;
1015 int length, pos_q, pos_d, i;
1016 int current_pos = 0;
1017 while (fgets(line, 300, diag_f) != NULL)
1019 strtok(line, delims);
1020 strtok(NULL, delims);
1021 strtok(NULL, delims);
1022 length = atoi(strtok(NULL, delims));
1023 strtok(NULL, delims);
1024 strtok(NULL, delims);
1025 pos_q = atoi(strtok(NULL, delims))-1;
1026 strtok(NULL, delims);
1027 pos_d = atoi(strtok(NULL, delims))-1;
1029 if (current_pos >= *dig_length)
1031 (*dig_length) += 90;
1032 diags = vrealloc(diags, sizeof(int)*(*dig_length));
1034 if (diag[l1-pos_q+pos_d] == 0)
1036 diag[l1-pos_q+pos_d] =1;
1037 diags[current_pos++] = pos_q;
1038 diags[current_pos++] = pos_d;
1039 diags[current_pos++] = length;
1044 diagonals[0] = diags;
1045 return current_pos/3;
1051 //******************************* OTHER STUFF ***********************
1054 * \brief Reads the sequence from a given position in a fasta file and turns it into a profile.
1056 * \param seq_file The file where the sequence is stored.
1057 * \param off_set The off_set from the beginning of the file to the position of the sequence name.
1058 * \param profile The profile where the sequence will be stored into.
1059 * \param prf_number The number of this profile.
1062 file_pos2profile(FILE *seq_file, //File with sequences
1063 long off_set, //offset of sequence from the beginning of file point to the sequence name, not to the sequence itself
1064 Fastal_profile *profile, //profile to save into
1065 int prf_number, //number of the profile
1066 Fastal_param *param_set)
1068 int alphabet_size = param_set->alphabet_size;
1069 profile->is_leaf = 1;
1070 int *aa2pos = &(param_set->char2pos[0]);
1071 const int LINE_LENGTH = 500;
1072 char line[LINE_LENGTH];
1073 profile->num_sequences = 1;
1074 profile->prf_number = prf_number;
1075 fseek (seq_file , off_set , SEEK_SET );
1077 fgets (line, LINE_LENGTH , seq_file);
1081 while(fgets(line, LINE_LENGTH, seq_file)!=NULL)
1086 line[LINE_LENGTH-1] = '\n';
1087 if (seq_length + LINE_LENGTH >= profile->allocated_memory)
1089 for (i = 0; i < alphabet_size; ++i)
1091 profile->prf[i] = vrealloc(profile->prf[i], (profile->allocated_memory+PROFILE_ENLARGEMENT)*sizeof(int));
1093 profile->allocated_memory += PROFILE_ENLARGEMENT;
1097 while (line[i] != '\n')
1099 for(j = 0; j<alphabet_size; ++j )
1100 profile->prf[j][seq_length+i] = 0;
1101 profile->prf[aa2pos[toupper(line[i])-'A']][seq_length+i] = 1;
1110 profile->length = seq_length;
1116 * constructs index of fasta_file
1119 make_index_of_file(char *file_name, //file with sequences
1120 long **file_positions) //array to save the positions
1122 const int LINE_LENGTH = 150;
1123 (*file_positions) = vcalloc(ENLARGEMENT_PER_STEP, sizeof(long));
1125 int current_size = ENLARGEMENT_PER_STEP;
1126 int current_pos = 0;
1128 FILE *file = fopen(file_name,"r");
1130 char *sequence = vcalloc(3*LINE_LENGTH,sizeof(char));
1132 int allocated_length=3*LINE_LENGTH;
1133 char line[LINE_LENGTH];
1135 int num_of_sequences = 0;
1136 int mem_for_pos = ENLARGEMENT_PER_STEP;
1140 printf("FILE NOT FOUND\n");
1145 (*file_positions)[num_of_sequences] = ftell(file);
1146 while(fgets(line, LINE_LENGTH , file)!=NULL)
1148 int length = strlen(line);
1153 if (num_of_sequences == mem_for_pos)
1155 (*file_positions) = vrealloc((*file_positions),(ENLARGEMENT_PER_STEP+mem_for_pos) * sizeof(long));
1156 mem_for_pos += ENLARGEMENT_PER_STEP;
1159 (*file_positions)[num_of_sequences] = ftell(file);
1163 return num_of_sequences;
1168 * reads a profile from a profile_file
1170 profile_file2profile(Fastal_profile *prof, //structure to save the profile in
1171 FILE *profile_f, //file where the profile is stored
1172 long position, //position in profile_f where the profile is stored
1173 Fastal_param *param_set)
1176 int alphabet_size = param_set->alphabet_size;
1178 int *aa2pos = &(param_set->char2pos[0]);
1181 fseek(profile_f,position,SEEK_SET);
1182 const int LINE_LENGTH = 500;
1185 fgets(line, LINE_LENGTH, profile_f);
1187 prof->prf_number = atoi(line);
1188 // fgets(line, LINE_LENGTH, profile_f);
1189 // prof->num_sequences = atoi(line);
1190 // fgets(line, LINE_LENGTH, profile_f); //is-dna is already known
1191 fgets(line, LINE_LENGTH, profile_f);
1192 prof->is_leaf = atoi(line);
1194 fgets(line, LINE_LENGTH, profile_f);
1195 prof->length = atoi(line);
1196 fgets(line, LINE_LENGTH, profile_f);
1197 prof->weight = atoi(line);
1199 if (prof->length > prof->allocated_memory)
1200 for (i = 0;i < alphabet_size; ++i)
1202 prof->prf[i] = vrealloc(prof->prf[i],prof->length*sizeof(int));
1205 char delims[] = " ";
1206 char *result = NULL;
1207 char *result_num = NULL;
1209 int length = prof->length;
1211 for (i = 0; i < length; ++i)
1213 for(j = 0; j<alphabet_size; ++j )
1214 prof->prf[j][i] = 0;
1215 fgets(line, LINE_LENGTH , profile_f);
1216 result = strtok( line, delims );
1218 while( result != NULL)
1220 result_num = &result[1];
1221 prof->prf[aa2pos[result[0]-'A']][i] = atoi(result_num);
1222 result = strtok( NULL, delims );
1230 * writes a profile into a file
1233 profile2file(Fastal_profile *profile, //the profile to save
1234 FILE* file, //file to save in
1235 Fastal_param *param_set)
1237 int alphabet_size = param_set->alphabet_size;
1239 char *pos2aa = &(param_set->pos2char[0]);
1241 fseek(file,0,SEEK_SET);
1243 fprintf(file,"%i\n", profile->prf_number);
1244 // fprintf(file,"%i\n", profile->num_sequences);
1246 fprintf(file,"%i\n", profile->is_leaf);
1247 fprintf(file,"%i\n", profile->length);
1248 fprintf(file,"%i\n", profile->weight);
1250 int max = profile->length;
1255 for (j = 0; j < alphabet_size; ++j)
1256 if (profile->prf[j][i] > 0)
1259 fprintf(file," %c%i", pos2aa[j],profile->prf[j][i]);
1261 fprintf(file,"%c%i", pos2aa[j],profile->prf[j][i]);
1264 if (profile->prf[j][i] > 0)
1266 fprintf(file," %c%i", pos2aa[j],profile->prf[j][i]);
1268 fprintf(file,"%c%i", pos2aa[j],profile->prf[j][i]);
1274 fprintf(file,"*\n");
1280 * Reads the profile out of an alignment
1283 file2profile(FILE* profile_f, //file to read the profile of
1284 Fastal_profile *prof, //profile saved in here
1285 int prf_number, //number of the profile
1286 Fastal_param *param_set)
1288 int alphabet_size = param_set->alphabet_size;
1290 int *aa2pos = &(param_set->char2pos[0]);
1293 fseek(profile_f,0,SEEK_SET);
1294 const int LINE_LENGTH = 500;
1297 fgets(line, LINE_LENGTH, profile_f);
1298 prof->prf_number = atoi(line);
1299 // fgets(line, LINE_LENGTH, profile_f); //is-dna is already known
1300 fgets(line, LINE_LENGTH, profile_f);
1301 prof->is_leaf = atoi(line);
1303 fgets(line, LINE_LENGTH, profile_f);
1304 prof->length = atoi(line);
1306 fgets(line, LINE_LENGTH, profile_f);
1307 prof->weight = atoi(line);
1309 if (prof->length > prof->allocated_memory)
1310 for (i = 0;i < alphabet_size; ++i)
1312 prof->prf[i] = vrealloc(prof->prf[i],prof->length*sizeof(int));
1315 char delims[] = " ";
1316 char *result = NULL;
1317 char *result_num = NULL;
1319 int length = prof->length;
1321 for (i = 0; i < length; ++i)
1323 for(j = 0; j<alphabet_size; ++j )
1324 prof->prf[j][i] = 0;
1325 fgets(line, LINE_LENGTH , profile_f);
1326 result = strtok( line, delims );
1328 while( result != NULL)
1330 result_num = &result[1];
1331 prof->prf[aa2pos[result[0]-'A']][i] = atoi(result_num);
1332 result = strtok( NULL, delims );
1340 * This method takes a profile and turns it into a sumed up version of same size.
1343 sumup_profile(Fastal_profile *profile, //profile to sum-up
1345 Fastal_param *param_set) //summed_up_profile
1348 char *pos2aa = &(param_set->pos2char[0]);
1349 int alphabet_size = param_set->alphabet_size;
1350 int **M = param_set->M;
1351 int prof_length = profile->length;
1355 for (i = 0; i < prof_length; ++i)
1357 sumup[alphabet_size][i] = 0;
1358 for (k = 0; k < alphabet_size; ++k)
1361 sumup[alphabet_size][i] += profile->prf[k][i];
1362 for (j = 0; j < alphabet_size; ++j)
1364 sumup[k][i] += profile->weight * profile->prf[j][i] * M[pos2aa[j]-'A'][pos2aa[k]-'A'];
1375 * Turns the dynamic programming matrix into a editfile and calculates the new profile
1378 nw_matrix2edit_file(double **prog_matrix, //dynamic programming matrix
1379 Fastal_profile *prf1, //profile of dim1
1380 Fastal_profile *prf2, //profile of dim2
1381 FILE *edit_f, //file to safe the edit in
1382 int **prf_field, //space to safe the new profile
1384 Fastal_param *param_set) //length of prf_field
1386 int **M = param_set->M;
1387 int alphabet_size = param_set->alphabet_size;
1388 double gap_cost = param_set -> gop;
1389 fprintf(edit_f, "%i\n%i\n%i\n%i\n",prf1->prf_number, prf2->prf_number, prf1->is_leaf, prf2->is_leaf);
1390 int sum[] = {0,0,0};
1391 char sumc[] = {'M','I','D'};
1397 int prf1_length = prf1->length;
1398 int prf2_length = prf2->length;
1399 while ((n < prf1_length) && (m < prf2_length))
1401 //if necesarry allocate more memory for result
1402 if ((*field_length)-alphabet_size < field_pos)
1404 (*field_length) += ENLARGEMENT_PER_STEP;
1406 for (i = 0; i <alphabet_size+1; ++i)
1408 prf_field[i] = vrealloc(prf_field[i], (*field_length)*sizeof(int));
1412 if (prog_matrix[n][m] == (prog_matrix[n+1][m] +gap_cost))
1414 for (i = 0; i<alphabet_size; ++i)
1416 prf_field[i][field_pos] = prf1->prf[i][n];
1423 fprintf(edit_f,"%c%i\n",sumc[last],sum[last]);
1429 else if (prog_matrix[n][m] == (prog_matrix[n][m+1] +gap_cost))
1432 for (i = 0; i<alphabet_size; ++i)
1434 prf_field[i][field_pos] = prf2->prf[i][m];
1440 fprintf(edit_f,"%c%i\n",sumc[last],sum[last]);
1448 for (i = 0; i<alphabet_size; ++i)
1450 prf_field[i][field_pos] = prf1->prf[i][n] + prf2->prf[i][m];
1457 fprintf(edit_f,"%c%i\n",sumc[last],sum[last]);
1464 fprintf(edit_f,"%c%i\n",sumc[last],sum[last]);
1468 while (n < prf1_length)
1470 for (i = 0; i<alphabet_size; ++i)
1472 prf_field[i][field_pos] = prf1->prf[i][n];
1479 fprintf(edit_f,"I%i\n",last);
1483 while (m < prf2_length)
1485 for (i = 0; i<alphabet_size; ++i)
1487 prf_field[i][field_pos] = prf2->prf[i][m];
1494 fprintf(edit_f,"D%i\n",last);
1495 fprintf(edit_f,"*\n");
1503 * \brief Pairwise alignments of profile is done here.
1505 * \param profile1 Profile of sequence 1
1506 * \param profile2 Profile of sequence 2
1507 * \param prog_matrix Matrix for dynamic programming
1508 * \param edit_file_name The edit_file_name
1509 * \param sumup_prf The sumup version of profile 1, which later contains the aligned profile.
1510 * \param sumup_length Contains the length of the aligned profile.
1511 * \return length of the aligned profile
1514 prf_nw(Fastal_profile *profile1, //profile of sequence 1
1515 Fastal_profile *profile2, //profile of sequence 2
1516 double **prog_matrix, //matrix for dynamic programming (at least as long as necessary for alignment)
1517 FILE *edit_file_name, //name of edit file
1518 int **sumup_prf, //sum_up
1520 Fastal_param *param_set) //sum_up length
1522 int alphabet_size = param_set->alphabet_size;
1523 double gap_cost = param_set->gop;
1526 if (*sumup_length < profile1->length)
1528 for (i = 0; i < alphabet_size+1; ++i)
1530 sumup_prf[i] = vrealloc(sumup_prf[i], profile1->length*sizeof(int));
1532 *sumup_length = profile1->length;
1534 sumup_prf = sumup_profile(profile1, sumup_prf, param_set);
1539 int prof1_length = profile1->length;
1540 int prof2_length = profile2->length;
1542 int** M = param_set->M;
1545 int residue_pairs = 0;
1547 for (i = prof2_length; i > 0; --i)
1549 prog_matrix[prof1_length][i] = gap_cost * (prof2_length-i);
1553 prog_matrix[prof1_length][prof2_length] = 0.0;
1558 prog_matrix[i][prof2_length] = gap_cost*(prof1_length-i);
1563 for (k = 0; k < alphabet_size; ++k)
1565 residue_pairs += profile2->prf[k][j];
1566 match_score += (profile2->prf[k][j] * sumup_prf[k][i]);
1568 match_score /= (residue_pairs * sumup_prf[alphabet_size][i]);
1569 prog_matrix[i][j] = MAX3(prog_matrix[i+1][j+1]+match_score, prog_matrix[i+1][j]+gap_cost, prog_matrix[i][j+1]+gap_cost);
1575 return nw_matrix2edit_file(prog_matrix, profile1, profile2, edit_file_name, sumup_prf, sumup_length, param_set);
1581 * \brief Writes the sequence into the alignment_file.
1583 * \param aligned_sequence Pattern of aligned sequence.
1584 * \param sequence_file File with sequences.
1585 * \param sequence_position Positions of sequences in \a sequence_file.
1586 * \param alignment_file The file to write the sequence into.
1590 edit_seq2aligned_seq(char *aligned_sequence, //pattern for aligned sequence
1591 FILE *sequence_file, //file with all the sequences
1592 long sequence_position, //position in sequence file with the correct sequence
1593 FILE *alignment_file) //file to write the alignment into
1595 fseek(sequence_file, sequence_position, SEEK_SET);
1596 const int LINE_LENGTH = 300;
1597 char line[LINE_LENGTH];
1598 fgets (line, LINE_LENGTH , sequence_file);
1599 fprintf(alignment_file,"%s", line); //writing of sequence name
1602 while(fgets(line, LINE_LENGTH, sequence_file)!=NULL)
1606 line[LINE_LENGTH-1] = '\n';
1608 while (line[i] != '\n')
1610 while (aligned_sequence[pos] == '-')
1612 fprintf(alignment_file,"-");
1615 fprintf(alignment_file,"%c",line[i]);
1623 while (aligned_sequence[pos] != '\n')
1625 fprintf(alignment_file,"-");
1628 fprintf(alignment_file,"\n");
1634 * \brief Recursive function to turn the edit_file into the alignment.
1636 * \param sequence_file File with all sequences.
1637 * \param sequence_position The array of sequence positions in \a sequence_file
1638 * \param edit_file File to safe the edit profiles in.
1639 * \param edit_positions Array saving the coorespondence between edit profile and position in \a edit_file
1640 * \param node_number The current node.
1641 * \param number_of_sequences The number of sequences.
1642 * \param aligned_sequence The sequence that is edited.
1643 * \param alignment_length The length of the alignment.
1644 * \param edit_seq_file File that saves the edited_sequences of the internal nodes.
1645 * \param offset Saves the size of the edited_sequences.
1646 * \param alignment_file File where the alignment is saved.
1650 edit2alignment(FILE *sequence_file, //sequence file
1651 long *seq_positions, //sequence positions
1652 FILE *edit_file, //file saving the edit profiles
1653 long *edit_positions, //array saving the correspondence between edit profile and position in edit_file
1654 int node_number, //the current node
1655 int number_of_sequences, //number of sequences
1656 char *aligned_sequence, //the sequence that is edited
1657 int alignment_length, //length of the alignment - and thus of aligned_sequence
1658 FILE *edit_seq_file, //file saving the edited_sequences of the internal nodes
1659 int offset, //saves the size of the edited_sequence
1660 FILE* alignment_file) //file saving the alignments
1662 fseek(edit_file, edit_positions[node_number-number_of_sequences], SEEK_SET);
1663 const LINE_LENGTH = 50;
1664 char line[LINE_LENGTH];
1665 fgets(line, LINE_LENGTH , edit_file);
1666 int child1 = atoi(line);
1667 fgets(line, LINE_LENGTH , edit_file);
1668 int child2 = atoi(line);
1669 fgets(line, LINE_LENGTH , edit_file);
1670 int is_leaf1 = atoi(line);
1671 fgets(line, LINE_LENGTH , edit_file);
1672 int is_leaf2 = atoi(line);
1674 static char seq_line[10];
1681 while(fgets(line, LINE_LENGTH , edit_file)!=NULL)
1686 number = atoi(&line[1]);
1691 if (aligned_sequence[pos] == 'X')
1700 if (aligned_sequence[pos] == 'X')
1709 if (aligned_sequence[pos] == 'X')
1711 aligned_sequence[pos] = '-';
1721 edit_seq2aligned_seq(aligned_sequence, sequence_file, seq_positions[child1], alignment_file);
1725 fprintf(edit_seq_file, "%s", aligned_sequence);
1726 edit2alignment(sequence_file, seq_positions, edit_file, edit_positions, child1, number_of_sequences, aligned_sequence, alignment_length, edit_seq_file, offset, alignment_file);
1730 fseek(edit_seq_file, offset, SEEK_CUR);
1731 fgets(aligned_sequence, alignment_length+3, edit_seq_file);
1732 fseek(edit_seq_file, offset, SEEK_CUR);
1735 fseek(edit_file, edit_positions[node_number-number_of_sequences], SEEK_SET);
1736 while(fgets(line, LINE_LENGTH , edit_file)!=NULL)
1741 number = atoi(&line[1]);
1746 if (aligned_sequence[pos] == 'X')
1755 if (aligned_sequence[pos] == 'X')
1757 aligned_sequence[pos] = '-';
1767 if (aligned_sequence[pos] == 'X')
1776 edit_seq2aligned_seq(aligned_sequence, sequence_file, seq_positions[child2], alignment_file);
1780 fprintf(edit_seq_file, "%s", aligned_sequence);
1781 edit2alignment(sequence_file, seq_positions, edit_file, edit_positions, child2, number_of_sequences, aligned_sequence, alignment_length, edit_seq_file, offset, alignment_file);
1788 // * The file has the follwing format (# and text behind are only comments and not included into the file):
1789 // * 1 # Number of profile.
1790 // * 1 # is DNA or not.
1791 // * 5 # Number of columns in the profile.
1792 // * 4A 1C # In this column are 4 'A' and 1 'C'
1793 // * 3G # In this column are 3 'G'
1794 // * 5A # In this column are 5 'A'
1795 // * 2A 3C # In this column are 2 'A' and 3 'C'
1796 // * 5C # In this column are 5 'C'
1797 // * * # Marks the end of this profile
1802 * \brief Writes a profile to a file.
1804 * \param sumup_prf The profile array, not a real profile.
1805 * \param length The length of the profile.
1806 * \param file The FILE object to write the the profile into.
1807 * \param is_dna The type of sequence.
1808 * \param number The number of the profile.
1811 write2file(int **sumup_prf,
1815 Fastal_param *param_set)
1817 char *pos2aa = &(param_set->pos2char[0]);
1818 fprintf(file,"%i\n0\n%i\n1\n",number, length );
1820 int alphabet_size = param_set->alphabet_size;
1826 for (j = 0; j < alphabet_size; ++j)
1827 if (sumup_prf[j][i] > 0)
1830 fprintf(file," %c%i", pos2aa[j],sumup_prf[j][i]);
1832 fprintf(file,"%c%i", pos2aa[j],sumup_prf[j][i]);
1840 fprintf(file,"*\n");
1852 * main of the fastal algorithm
1855 fastal(int argc, //number of arguments
1856 char **argv) //arguments first = fastal, second = tree
1860 for (test = 0; test < argc; ++test)
1862 printf("%s\n",argv[test]);
1865 struct fastal_arguments arguments;
1867 arguments.output_file = "out.aln";
1868 arguments.tree_file = NULL;
1870 arguments.gop = -10;
1871 arguments.method = "fast";
1873 // argp_parse (&argp, argc, argv, 0, 0, &arguments);
1875 Fastal_param *param_set = vcalloc(1,sizeof(Fastal_param));
1876 fill_parameters(arguments.is_dna, param_set, arguments.method);
1877 param_set->gep = arguments.gep;
1878 param_set->gop = arguments.gop;
1881 int alphabet_size = param_set->alphabet_size;
1883 //sequence file management
1885 long *file_positions = NULL;
1886 long **tmp = &file_positions;
1887 int number_of_sequences = make_index_of_file(arguments.sequence_file, tmp);
1888 FILE *seq_file = fopen(arguments.sequence_file,"r");
1891 //edit file management
1892 FILE *edit_file = fopen("edit_tmp","w+");
1893 long current_edit_pos;
1894 long *edit_positions = vcalloc(number_of_sequences,sizeof(long));
1897 //profile management
1898 Fastal_profile **profiles = vcalloc(3,sizeof(Fastal_profile*));
1899 initiate_profiles(profiles, param_set);
1900 FILE * prof_file = fopen("prf_tmp","w+");
1901 long* profile_positions = vcalloc(4,sizeof(long*));
1906 //dynamic programming matrix
1907 double ** dyn_matrix = vcalloc(1,sizeof(double*));
1908 dyn_matrix[0] = vcalloc(1,sizeof(double));
1909 int *length1 = vcalloc(1,sizeof(int));
1910 int *length2 = vcalloc(1,sizeof(int));
1914 int **sumup_prf = vcalloc(alphabet_size+1,sizeof(int*));
1915 for (i = 0; i < alphabet_size+1; ++i)
1916 sumup_prf[i] = vcalloc(1,sizeof(int));
1917 int *sumup_length = vcalloc(1,sizeof(int));
1922 if (arguments.tree_file == NULL)
1924 arguments.tree_file = "HUMAN.tree";
1925 printf("CONSTRUCT TREE\n");
1926 make_partTree(arguments.sequence_file, arguments.tree_file, 4, 20);
1930 printf("CONSTRUCT ALIGNMENT\n");
1931 //tree file management
1932 FILE *tree_file = fopen(arguments.tree_file,"r");
1933 const int LINE_LENGTH = 100;
1934 char line[LINE_LENGTH];
1935 char delims[] = " ";
1937 char *result = NULL;
1939 int alignment_length;
1942 //memory for sparse dynamic
1943 int *diagonals = vcalloc(3,sizeof(int));
1944 int *dig_length = vcalloc(1,sizeof(int));
1946 int **list = NULL;//vcalloc(1,sizeof(int*));
1947 // list[0] = vcalloc(7,sizeof(int));
1948 int *list_length = vcalloc(1,sizeof(int));
1951 int ***list_p = vcalloc(1,sizeof(int**));
1955 //bottom-up traversal
1956 while(fgets(line, LINE_LENGTH, tree_file)!=NULL)
1959 node[0] = atoi(strtok(line,delims));
1960 node[1] = atoi(strtok(NULL,delims));
1961 node[2] = atoi(strtok(NULL,delims));
1962 //getting profile of second child
1963 if (node[1] < number_of_sequences)
1965 file_pos2profile(seq_file, file_positions[node[1]], profiles[1], node[1], param_set); //profile to save into
1969 profile_file2profile(profiles[1], prof_file, profile_positions[--saved_prof], param_set);
1970 fseek (prof_file , profile_positions[saved_prof] , SEEK_SET);
1973 //getting profile of first child
1974 if (node[0] < number_of_sequences)
1976 file_pos2profile(seq_file, file_positions[node[0]], profiles[0], node[0], param_set); //profile to save into
1980 profile_file2profile(profiles[0], prof_file, profile_positions[--saved_prof], param_set);
1981 fseek (prof_file , profile_positions[saved_prof] , SEEK_SET);
1983 if (saved_prof == max_prof)
1986 profile_positions = vrealloc(profile_positions, max_prof*sizeof(long));
1988 edit_positions[node[2]-number_of_sequences] = ftell(edit_file);
1989 profile_positions[saved_prof] = ftell(prof_file);
1991 if (!strcmp(param_set->method,"nw"))
1993 dyn_matrix = resize_dyn_matrix(dyn_matrix, length1, length2, profiles[0]->length+1, profiles[1]->length+1);
1994 alignment_length = prf_nw(profiles[0], profiles[1], dyn_matrix, edit_file, sumup_prf, sumup_length, param_set);
1995 write2file(sumup_prf, alignment_length, prof_file, node[2], param_set);
1997 else if (!strcmp(param_set->method, "fast"))
1999 char *file_name1 = vtmpnam(NULL);
2000 char *file_name2 = vtmpnam(NULL);
2001 char *seq1 = profile2consensus(profiles[0], file_name1, param_set);
2002 char *seq2 = profile2consensus(profiles[1], file_name2, param_set);
2003 int **diagonals_p = &diagonals;
2004 int num_diagonals = seq_pair2blast_diagonal(file_name1, file_name2, diagonals_p, dig_length, strlen(seq1),strlen(seq2), arguments.is_dna);
2005 diagonals = diagonals_p[0];
2006 char *p = ¶m_set->pos2char[0];
2007 list = diagonals2int(diagonals, num_diagonals, seq1, seq2, list_length, param_set);//, profiles[0], profiles[1], p);
2008 alignment_length = list2linked_pair_wise_fastal(profiles[0], profiles[1], param_set, list, *list_length, edit_file, prof_file, node[2]);
2011 for (x = 0; x < *list_length; ++x)
2022 //free_memory & close files
2026 free_fastal_profile(profiles[0], alphabet_size);
2027 free_fastal_profile(profiles[1], alphabet_size);
2029 vfree(profile_positions);
2030 free_dyn_matrix(*length1,dyn_matrix);
2031 for (i = 0; i <= alphabet_size; ++i)
2033 vfree(sumup_prf[i]);
2038 //bottom-down traversal (edit_files --> alignment)
2039 char file_name[FILENAMELEN];
2040 sprintf(file_name,arguments.output_file);
2042 FILE *alignment_file = fopen(file_name, "w");
2043 FILE *edit_seq_file = fopen("edit_seq.tmp","w+");
2045 char *aligned_sequence = vcalloc(alignment_length+3, sizeof(char));
2048 long offset = ftell(edit_seq_file);
2049 for (i = 0; i < alignment_length; ++i)
2051 fprintf(edit_seq_file, "X");
2052 aligned_sequence[i] = 'X';
2054 aligned_sequence[i]= '\n';
2055 aligned_sequence[i+1]= '\0';
2056 fprintf(edit_seq_file, "\n");
2057 offset = (ftell(edit_seq_file) - offset)*-1;
2060 edit2alignment(seq_file, file_positions, edit_file, edit_positions, node[2], number_of_sequences, aligned_sequence, alignment_length, edit_seq_file, offset, alignment_file);
2063 //free_memory & close files
2065 vfree(edit_positions);
2075 //****************** toolbox ***************************
2079 * enlargement of the dynamic programming matrix in case it is to small.
2082 resize_dyn_matrix(double **dyn_matrix, //the dynamic programming matrix
2083 int *old_length1, //old length of dimension 1
2084 int *old_length2, //old length of dimension 2
2085 int length1, //new minimum length of dimension 1
2086 int length2) //new maximum length of dimension 2
2089 if (*old_length1 < length1)
2091 dyn_matrix = vrealloc(dyn_matrix,length1*sizeof(double*));
2092 for (i = *old_length1; i < length1; ++i)
2093 dyn_matrix[i] = vcalloc(*old_length2,sizeof(double));
2094 *old_length1 = length1;
2096 if (*old_length2 < length2)
2098 for (i = 0;i<*old_length1; ++i)
2099 dyn_matrix[i] = vrealloc(dyn_matrix[i], length2*sizeof(double));
2100 *old_length2 = length2;
2108 * frees the memory of a dynamic programming matrix
2111 free_dyn_matrix(int length1, //length of first dimension
2112 double **dyn_matrix) //dynamic matrix
2115 for (; i<length1; ++i)
2116 vfree(dyn_matrix[i]);
2123 * initialises the profiles.
2126 initiate_profiles(Fastal_profile **profiles, //profiles pointer
2127 Fastal_param *param_set)
2129 int alphabet_size = param_set->alphabet_size;
2131 for (i =0; i < 3; ++i)
2133 profiles[i] = vcalloc(1,sizeof(Fastal_profile));
2134 profiles[i]->weight = 1;
2135 profiles[i]->is_leaf = 1;
2136 profiles[i]->prf = vcalloc(alphabet_size, sizeof(int*));
2137 for (j = 0; j < alphabet_size; ++j)
2139 profiles[i]->prf[j] = vcalloc(PROFILE_ENLARGEMENT, sizeof(int));
2141 profiles[i]->allocated_memory = PROFILE_ENLARGEMENT;
2147 * initalises the files where the profiles are temporarly stored.
2150 initiate_profile_files(FILE **profile_files)
2156 sprintf(names,"tmp_prf_%i",i);
2157 profile_files[i] = fopen(names,"w+");
2164 * frees all memory occupied by the profile
2167 free_fastal_profile(Fastal_profile* profile, int alphabet_size)
2170 for (;alphabet_size >= 0; --alphabet_size)
2171 vfree(profile->prf[alphabet_size]);
2172 vfree(profile->prf);
2177 * initialize the parameters
2180 fill_parameters(int is_dna, Fastal_param *param_set, char *method)
2182 sprintf(param_set->method,"%s",method);
2186 param_set->alphabet_size = 10;
2187 char tmp1[] = {'A','C','G','T','N','R','Y','D','M','W'};
2188 int tmp2[] = { 0, -1, 1, 7, -1, -1, 2, -1, -1, -1, -1, -1, 8, 4, -1, -1, -1, 5, -1, 3, -1, -1, 9, -1, 6, -1};
2189 for (i = 0; i<param_set->alphabet_size; ++i)
2190 param_set->pos2char[i] = tmp1[i];
2191 for (i = 0; i<26; ++i)
2192 param_set->char2pos[i] = tmp2[i];
2193 param_set->M = read_matrice("dna_idmat");
2197 param_set->alphabet_size = 24;
2198 char tmp1[] = {'A','C','G','T','F','D','H','I','K','L','M','N','P','Q','R','S','E','V','W','Y','B','J','X','Z'};
2199 int tmp2[] = { 0, 20, 1, 5, 16, 4, 2, 6, 7, 21, 8, 9, 10, 11, -1, 12, 13, 14, 15, 3, -1, 17, 18,22, 19,23};
2200 for (i = 0; i<param_set->alphabet_size; ++i)
2201 param_set->pos2char[i] = tmp1[i];
2202 for (i = 0; i<26; ++i)
2203 param_set->char2pos[i] = tmp2[i];
2204 param_set->M = read_matrice("blosum62mt");
2207 /*********************************COPYRIGHT NOTICE**********************************/
2208 /*© Centro de Regulacio Genomica */
2210 /*Cedric Notredame */
2211 /*Tue Oct 27 10:12:26 WEST 2009. */
2212 /*All rights reserved.*/
2213 /*This file is part of T-COFFEE.*/
2215 /* T-COFFEE is free software; you can redistribute it and/or modify*/
2216 /* it under the terms of the GNU General Public License as published by*/
2217 /* the Free Software Foundation; either version 2 of the License, or*/
2218 /* (at your option) any later version.*/
2220 /* T-COFFEE is distributed in the hope that it will be useful,*/
2221 /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
2222 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
2223 /* GNU General Public License for more details.*/
2225 /* You should have received a copy of the GNU General Public License*/
2226 /* along with Foobar; if not, write to the Free Software*/
2227 /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
2228 /*............................................... |*/
2229 /* If you need some more information*/
2230 /* cedric.notredame@europe.com*/
2231 /*............................................... |*/
2235 /*********************************COPYRIGHT NOTICE**********************************/