11 #include "io_lib_header.h"
12 #include "util_lib_header.h"
13 #include "fastal_lib_header.h"
19 * \brief Function to calculate the sum of pairs score with quasi affine gap costs.
21 * \param alignment_file File with the aligned sequences in fasta_format.
22 * \param file_index Positions of the sequences in \a alignment_file.
23 * \param number_of_sequences The number of sequences in \a alignment_file.
24 * \param score_matrix Matrix containing the scores.
25 * \param gop The gap opening costs.
26 * \param gep The gep extension costs.
27 * \return The score of the alignment.
30 calculate_sum_of_pairs_score_affine(char *alignment_file_name,
36 const int LINE_LENGTH = 1000;
37 int number_of_sequences = 0;
38 FILE *alignment_file = fopen(alignment_file_name,"r");
39 if (alignment_file == NULL)
41 printf("FILE COULD NOT BE OPENED!\n");
44 int alignment_length = 0;
45 char line[LINE_LENGTH+1];
48 while (line[0] != '>')
50 fgets(line, LINE_LENGTH, alignment_file);
52 fgets(line, LINE_LENGTH, alignment_file);
53 while (line[0] != '>')
55 alignment_length += strlen(line);
56 fgets(line, LINE_LENGTH, alignment_file);
59 int alphabet_size = 28;
60 int **counting = vcalloc(alphabet_size, sizeof(int*));
63 for (i = 0; i < alphabet_size; ++i)
65 counting[i] = vcalloc(alignment_length, sizeof(int));
71 int pos_line, pos_alignment = -1;
72 fseek (alignment_file, 0, SEEK_SET);
76 int current_size = 100;
80 int **gaps = vcalloc(100, sizeof(int*));
82 while (fgets(line, LINE_LENGTH, alignment_file) != NULL)
87 while (((c = line[++pos_line]) != '\n') && (c != '\0'))
91 ++counting[toupper(c)-'A'][++pos_alignment];
94 gaps[gap_line][gap_pos++] = pos_alignment-1;
100 ++counting[alphabet_size][++pos_alignment];
103 ++counting[alphabet_size+1][pos_alignment];
105 if (gap_pos >= gap_size-4)
108 gaps[gap_line] = vrealloc(gaps[gap_line], gap_size * sizeof(int));
109 gaps[gap_line][gap_pos++] = pos_alignment;
117 if (number_of_sequences != 0)
119 if ((gap_pos % 2) != 0)
121 gaps[gap_line][gap_pos++] = pos_alignment-1;
123 gaps[gap_line][gap_pos++] = alignment_length+2;
124 gaps[gap_line][gap_pos] = alignment_length+2;
127 if (current_size == gap_line)
130 gaps = vrealloc(gaps, current_size * sizeof(int*));
133 gaps[gap_line] = vcalloc(gap_size, sizeof(int));
134 ++number_of_sequences;
142 gaps[gap_line][gap_pos++] = alignment_length+2;
143 gaps[gap_line][gap_pos] = alignment_length+2;
146 long double score = 0;
149 for (i = 0; i < alignment_length; ++i)
151 for (j = 0; j < alphabet_size; ++j)
153 if ((tmp2 = counting[j][i]) >1)
157 score += score_matrix[j][j] * (--tmp2);
160 for (k = j+1; k < alphabet_size; ++k)
162 score += score_matrix[j][k] * counting[j][i] * counting[k][i];
168 non_gap = number_of_sequences - counting[alphabet_size][i];
169 score += counting[alphabet_size][i] * non_gap * gep;
174 alignment_length += 2;
175 unsigned long gap_open = 0;
179 // #pragma omp parallel shared(gaps, chunk, alignment_length, gap_open) private(i, j)
181 int alignment_length2 = alignment_length;
182 // #pragma omp for schedule(dynamic,chunk) reduction(+:gap_open) nowait
183 for (i = 0; i < gap_line; ++i)
185 int *gaps1 = gaps[i];
186 for (j = i+1; j < gap_line; ++j)
188 int *gaps2 = gaps[j];
191 while ((gaps1[k] != alignment_length2) && (gaps2[l] != alignment_length2))
193 if (gaps1[k+1] < gaps2[l])
199 if (gaps2[l+1] < gaps1[k])
205 if (gaps1[k] < gaps2[l])
207 if (gaps1[k+1] < gaps2[l+1])
219 if (gaps1[k] > gaps2[l])
221 if (gaps1[k+1] > gaps2[l+1])
233 if (gaps1[k] == gaps2[l])
235 if (gaps1[k+1] == gaps2[l+1])
243 if (gaps1[k+1] <gaps2[l+1])
255 while (gaps1[k] != alignment_length2)
260 while (gaps2[l] != alignment_length2)
269 score += gop * gap_open;
272 return score / number_of_sequences;
277 compare (const void * a, const void * b)
279 return (*(int*)a - *(int*)b);
285 get_random_value(int a, int b)
287 return rand() % (b - a + 1) + a;
288 // j = 1 + (int)( 10.0 * rand() / ( RAND_MAX + 1.0 ) );
296 compute_ref_alignments(char *seq_file_name, char* ref_directory, int num_alignments, int num_seq_in_aln)
299 printf("%s %s %i %i\n", seq_file_name, ref_directory, num_alignments, num_seq_in_aln);
300 //check if target directory exists, if not make it
302 if (ref_directory[0]== '~')
304 sprintf(directory,"%s%s",getenv("HOME"),ref_directory+1);
308 sprintf(directory,"%s",ref_directory);
311 if(stat(directory, &st) != 0)
313 mkdir(directory,0700);
318 // get sequences & produce alignments
320 long *file_positions = NULL;
321 long **tmp_pos = &file_positions;
322 int number_of_sequences = make_index_of_file(seq_file_name, tmp_pos);
323 char *tmp_file_name = vtmpnam(NULL);
324 FILE *seq_file = fopen(seq_file_name, "r");
325 int *already_taken = vcalloc(num_seq_in_aln, sizeof(int));
326 int i, j, k, tmp, already, num_already;
327 for (i = 0; i < num_alignments; ++i)
329 //choose randomly the sequences
332 while (num_already < num_seq_in_aln)
335 tmp = get_random_value(0, number_of_sequences-1);
336 for (k = 0; k < num_already; ++k)
338 if (already_taken[k] == tmp)
346 already_taken[num_already] = tmp;
350 //write sequences into temporary file
351 const int LINE_LENGTH = 200;
352 char line[LINE_LENGTH];
355 FILE* tmp_file = fopen(tmp_file_name, "w");
356 for (k = 0; k < num_already; ++k)
358 fseek(seq_file, file_positions[already_taken[k]], SEEK_SET);
359 fgets(line, LINE_LENGTH, seq_file);
360 fprintf(tmp_file, "%s", line);
361 while( (fgets(line, LINE_LENGTH, seq_file)) != NULL)
365 fprintf(tmp_file, "%s", line);
370 //calculate alignment
371 char aln_command[1000];
372 sprintf(aln_command,"t_coffee -in %s -outfile %s/%i.fa -output fasta_aln 2>/dev/null >/dev/null", tmp_file_name, directory, i);
376 vfree(already_taken);
382 // compute agreement_score(char *alignment_file_name, char *alignment_directory)
391 * This algorithm choses according to a tree a number of sequenes of which it computes a reference algnment.
392 * \param seq_file_name The sequence file.
393 * \param tree_file_name The given tree.
394 * \param ref_aln_name The file where the reference alignment should be written to.
395 * \param num_seq_in_ref The number of sequences to choose.
398 make_ref_alignment(char *seq_file_name, char *tree_file_name, char *ref_aln_name, int num_seq_in_ref)
400 const int LINE_LENGTH = 200;
401 char line[LINE_LENGTH];
403 FILE *tree_f = fopen(tree_file_name,"r");
406 long *file_positions = NULL;
407 long **tmp = &file_positions;
408 int num_sequences = make_index_of_file(seq_file_name, tmp);
411 int every_x = num_sequences/(num_seq_in_ref);
413 int *seq_ids = vcalloc(num_seq_in_ref,sizeof(int));
417 fseek (tree_f, 0, SEEK_SET);
420 while(fgets(line, LINE_LENGTH, tree_f)!=NULL)
422 node = atoi(strtok(line,delims));
423 if (node < num_sequences)
428 seq_ids[++pos] = node;
437 node = atoi(strtok(NULL,delims));
438 if (node < num_sequences)
442 seq_ids[++pos] = node;
455 qsort(seq_ids, num_seq_in_ref, sizeof(int), compare);
457 char *seq_ref_name = vtmpnam(NULL);
458 FILE *seq_ref_f = fopen(seq_ref_name, "w");;
459 FILE *seq_f = fopen(seq_file_name, "r");
461 for (i = 0; i < num_seq_in_ref; ++i)
463 fseek (seq_f, file_positions[seq_ids[i]], SEEK_SET);
464 fgets(line, LINE_LENGTH, seq_f);
465 fprintf(seq_ref_f, "%s", line);
466 while(fgets(line, LINE_LENGTH, seq_f)!=NULL)
470 fprintf(seq_ref_f, "%s", line);
477 sprintf(command, "mafft --quiet %s > %s", seq_ref_name, ref_aln_name);
483 * This function calculates the agreement between two alignments in a specified number of sequences.
484 * \param ref_file_name The reference alignment.
485 * \param aln_file_name The test alignment.
488 agreement_score(char *ref_file_name, char *aln_file_name)
490 const int LINE_LENGTH = 200;
491 char line[LINE_LENGTH];
493 int number_of_sequences = 0;
494 FILE *ref_f = fopen(ref_file_name,"r");
495 while(fgets(line, LINE_LENGTH, ref_f)!=NULL)
499 ++number_of_sequences;
502 fseek(ref_f, 0, SEEK_SET);
503 char **seq_names = vcalloc(number_of_sequences, sizeof(char*));
505 while(fgets(line, LINE_LENGTH, ref_f)!=NULL)
509 seq_names[i] = vcalloc(LINE_LENGTH, sizeof(char));
510 sprintf(seq_names[i], "%s",line);
517 FILE *aln_f = fopen(aln_file_name,"r");
518 char *tmp_name = vtmpnam(NULL);
519 FILE *tmp_f = fopen(tmp_name,"w");
522 while(fgets(line, LINE_LENGTH, aln_f)!=NULL)
526 for (i = 0; i < number_of_sequences; ++i)
528 if (!strcmp(line, seq_names[i]))
530 fprintf(tmp_f, "%s", line);
531 last_pos = ftell(aln_f);
532 while(fgets(line, LINE_LENGTH, aln_f)!=NULL)
536 fprintf(tmp_f, "%s", line);
538 last_pos = ftell(aln_f);
541 fseek(aln_f, last_pos, SEEK_SET);
546 if (x == number_of_sequences)
552 // sprintf(command, "cp %s /users/cn/ckemena/Desktop/1.fa", tmp_name);
554 // fp = popen("ls -l", "r");
555 sprintf(command, "t_coffee -other_pg aln_compare -al1 %s -al2 %s ", ref_file_name, tmp_name);
559 result = popen(command, "r");
560 fgets(line, LINE_LENGTH, result);
561 fgets(line, LINE_LENGTH, result);
562 fgets(line, LINE_LENGTH, result);
564 // sprintf(command, "cp %s ~/Destkop/1.fa", tmp_name);
569 strtok(line, delims);
570 for (i = 0; i < 3; ++i)
572 while((tmp_str = strtok(NULL, delims))==NULL);
574 return(atof(tmp_str));
578 complete_agreement_score(char *aln_file_name, const char *ref_directory)
581 char *name = strrchr(aln_file_name,'/')+1;
582 // printf("%s ",name);
583 // enter existing path to directory below
585 if (ref_directory[0]== '~')
587 sprintf(directory,"%s%s",getenv("HOME"),ref_directory+1);
591 sprintf(directory,"%s",ref_directory);
593 DIR *dir = opendir(directory);
595 char ref_file_name[200];
596 while ((dp=readdir(dir)) != NULL)
598 if ((strcmp(dp->d_name,".")) && (strcmp(dp->d_name,"..")))
601 sprintf(ref_file_name, "%s/%s",directory, dp->d_name);
602 printf("%s %f\n",name, agreement_score(ref_file_name, aln_file_name));
603 // printf("%f ", agreement_score(ref_file_name, aln_file_name));
614 // char delims[] = " ";
616 // int alignment_length = -1;
620 //bottom-up traversal
621 // while(fgets(line, LINE_LENGTH, tree_file)!=NULL)
624 // node[0] = atoi(strtok(line,delims));
625 // node[1] = atoi(strtok(NULL,delims));
626 // node[2] = atoi(strtok(NULL,delims));
627 /******************************COPYRIGHT NOTICE*******************************/
628 /*© Centro de Regulacio Genomica */
630 /*Cedric Notredame */
631 /*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
632 /*All rights reserved.*/
633 /*This file is part of T-COFFEE.*/
635 /* T-COFFEE is free software; you can redistribute it and/or modify*/
636 /* it under the terms of the GNU General Public License as published by*/
637 /* the Free Software Foundation; either version 2 of the License, or*/
638 /* (at your option) any later version.*/
640 /* T-COFFEE is distributed in the hope that it will be useful,*/
641 /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
642 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
643 /* GNU General Public License for more details.*/
645 /* You should have received a copy of the GNU General Public License*/
646 /* along with Foobar; if not, write to the Free Software*/
647 /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
648 /*............................................... |*/
649 /* If you need some more information*/
650 /* cedric.notredame@europe.com*/
651 /*............................................... |*/
655 /******************************COPYRIGHT NOTICE*******************************/