Mac binaries
[jabaws.git] / website / archive / binaries / mac / src / tcoffee / t_coffee_source / scoring.c
1
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <math.h>
5 #include <ctype.h>
6 #include <string.h>
7 #include <sys/stat.h>
8 #include <dirent.h>
9
10
11 #include "io_lib_header.h"
12 #include "util_lib_header.h"
13 #include "fastal_lib_header.h"
14
15
16
17
18 /**
19  * \brief Function to calculate the sum of pairs score with quasi affine gap costs.
20  *
21  * \param alignment_file File with the aligned sequences in fasta_format.
22  * \param file_index Positions of the sequences in \a alignment_file.
23  * \param number_of_sequences The number of sequences in \a alignment_file.
24  * \param score_matrix Matrix containing the scores.
25  * \param gop The gap opening costs.
26  * \param gep The gep extension costs.
27  * \return The score of the alignment.
28  */
29 double
30 calculate_sum_of_pairs_score_affine(char *alignment_file_name,
31                                                                         int **score_matrix,
32                                                                         double gop,
33                                                                         double gep)
34 {
35
36         const int LINE_LENGTH = 1000;
37         int number_of_sequences = 0;
38         FILE *alignment_file = fopen(alignment_file_name,"r");
39         if (alignment_file == NULL)
40         {
41                 printf("FILE COULD NOT BE OPENED!\n");
42                 exit(1);
43         }
44         int alignment_length = 0;
45         char line[LINE_LENGTH+1];
46         line[0] = '0';
47
48         while (line[0] != '>')
49         {
50                 fgets(line, LINE_LENGTH, alignment_file);
51         }
52         fgets(line, LINE_LENGTH, alignment_file);
53         while (line[0] != '>')
54         {
55                 alignment_length += strlen(line);
56                 fgets(line, LINE_LENGTH, alignment_file);
57         }
58
59         int alphabet_size = 28;
60         int **counting = vcalloc(alphabet_size, sizeof(int*));
61
62         int i;
63         for (i = 0; i < alphabet_size; ++i)
64         {
65                 counting[i] = vcalloc(alignment_length, sizeof(int));
66         }
67
68         alphabet_size -= 2;
69         char c;
70         int was_gap = 0;
71         int pos_line, pos_alignment = -1;
72         fseek (alignment_file, 0, SEEK_SET);
73         pos_line = 0;
74
75         int gap_line = 0;
76         int current_size = 100;
77         int gap_size = 0;
78         int gap_pos = 0;
79
80         int **gaps = vcalloc(100, sizeof(int*));
81
82         while (fgets(line, LINE_LENGTH, alignment_file) != NULL)
83         {
84                 if (line[0] != '>')
85                 {
86                         pos_line = -1;
87                         while (((c = line[++pos_line]) != '\n') && (c != '\0'))
88                         {
89                                 if (isalpha(c))
90                                 {
91                                         ++counting[toupper(c)-'A'][++pos_alignment];
92                                         if (was_gap)
93                                         {
94                                                 gaps[gap_line][gap_pos++] = pos_alignment-1;
95                                         }
96                                         was_gap = 0;
97                                 }
98                                 else
99                                 {
100                                         ++counting[alphabet_size][++pos_alignment];
101                                         if (!was_gap)
102                                         {
103                                                 ++counting[alphabet_size+1][pos_alignment];
104                                                 was_gap = 1;
105                                                 if (gap_pos >= gap_size-4)
106                                                 {
107                                                         gap_size += 50;
108                                                         gaps[gap_line] = vrealloc(gaps[gap_line], gap_size * sizeof(int));
109                                                         gaps[gap_line][gap_pos++] = pos_alignment;
110                                                 }
111                                         }
112                                 }
113                         }
114                 }
115                 else
116                 {
117                         if (number_of_sequences != 0)
118                         {
119                                 if ((gap_pos % 2) != 0)
120                                 {
121                                         gaps[gap_line][gap_pos++] = pos_alignment-1;
122                                 }
123                                 gaps[gap_line][gap_pos++] = alignment_length+2;
124                                 gaps[gap_line][gap_pos] = alignment_length+2;
125                                 ++gap_line;
126                         }
127                         if (current_size == gap_line)
128                         {
129                                 current_size += 50;
130                                 gaps = vrealloc(gaps, current_size * sizeof(int*));
131                         }
132                         gap_size = 2;
133                         gaps[gap_line] = vcalloc(gap_size, sizeof(int));
134                         ++number_of_sequences;
135                         pos_alignment = -1;
136                         was_gap = 0;
137
138                         gap_pos = 0;
139                 }
140         }
141
142         gaps[gap_line][gap_pos++] = alignment_length+2;
143         gaps[gap_line][gap_pos] = alignment_length+2;
144
145
146         long double score = 0;
147         int j, k, tmp2;
148         int non_gap;
149         for (i = 0; i < alignment_length; ++i)
150         {
151                 for (j = 0; j < alphabet_size; ++j)
152                 {
153                         if ((tmp2 = counting[j][i]) >1)
154                         {
155                                 while (tmp2 > 1)
156                                 {
157                                         score += score_matrix[j][j] * (--tmp2);
158                                 }
159                         }
160                         for (k = j+1; k < alphabet_size; ++k)
161                         {
162                                 score += score_matrix[j][k] * counting[j][i] * counting[k][i];
163                         }
164
165                 }
166
167
168                 non_gap = number_of_sequences - counting[alphabet_size][i];
169                 score += counting[alphabet_size][i] * non_gap  * gep;
170
171         }
172
173         ++gap_line;
174         alignment_length += 2;
175         unsigned long gap_open = 0;
176
177         int chunk = 500;
178
179 //      #pragma omp parallel shared(gaps, chunk, alignment_length, gap_open) private(i, j)
180         {
181                 int alignment_length2 = alignment_length;
182 //              #pragma omp for schedule(dynamic,chunk) reduction(+:gap_open) nowait
183                 for (i = 0; i < gap_line; ++i)
184                 {
185                         int *gaps1 = gaps[i];
186                         for (j = i+1; j < gap_line; ++j)
187                         {
188                                 int *gaps2 = gaps[j];
189                                 int k = 0;
190                                 int l = 0;
191                                 while ((gaps1[k] != alignment_length2) && (gaps2[l] != alignment_length2))
192                                 {
193                                         if (gaps1[k+1] < gaps2[l])
194                                         {
195                                                 ++gap_open;
196                                                 k+=2;
197                                                 continue;
198                                         }
199                                         if (gaps2[l+1] < gaps1[k])
200                                         {
201                                                 ++gap_open;
202                                                 l+=2;
203                                                 continue;
204                                         }
205                                         if (gaps1[k] < gaps2[l])
206                                         {
207                                                 if (gaps1[k+1] < gaps2[l+1])
208                                                 {
209                                                         ++gap_open;
210                                                         k+=2;
211                                                         continue;
212                                                 }
213                                                 else
214                                                 {
215                                                         l+=2;
216                                                         continue;
217                                                 }
218                                         }
219                                         if (gaps1[k] > gaps2[l])
220                                         {
221                                                 if (gaps1[k+1] > gaps2[l+1])
222                                                 {
223                                                         ++gap_open;
224                                                         l+=2;
225                                                         continue;
226                                                 }
227                                                 else
228                                                 {
229                                                         k+=2;
230                                                         continue;
231                                                 }
232                                         }
233                                         if (gaps1[k] == gaps2[l])
234                                         {
235                                                 if (gaps1[k+1] == gaps2[l+1])
236                                                 {
237                                                         k+=2;
238                                                         l+=2;
239                                                         continue;
240                                                 }
241                                                 else
242                                                 {
243                                                         if (gaps1[k+1] <gaps2[l+1])
244                                                         {
245                                                                 k+=2;
246                                                         }
247                                                         else
248                                                         {
249                                                                 l+=2;
250                                                         }
251                                                 }
252                                         }
253
254                                 }
255                                 while (gaps1[k] != alignment_length2)
256                                 {
257                                         ++gap_open;
258                                         k+=2;
259                                 }
260                                 while (gaps2[l] != alignment_length2)
261                                 {
262                                         ++gap_open;
263                                         l+=2;
264                                 }
265                         }
266                 }
267         }
268
269         score += gop * gap_open;
270
271
272         return score / number_of_sequences;
273 }
274
275
276 int
277 compare (const void * a, const void * b)
278 {
279         return (*(int*)a - *(int*)b);
280 }
281
282
283
284 int
285 get_random_value(int a, int b)
286 {
287         return rand() % (b - a + 1) + a;
288 //      j = 1 + (int)( 10.0 * rand() / ( RAND_MAX + 1.0 ) );
289 }
290
291
292
293
294
295 void
296 compute_ref_alignments(char *seq_file_name, char* ref_directory, int num_alignments, int num_seq_in_aln)
297 {
298
299         printf("%s %s %i %i\n", seq_file_name, ref_directory, num_alignments, num_seq_in_aln);
300         //check if target directory exists, if not make it
301         char directory[500];
302         if (ref_directory[0]== '~')
303         {
304                 sprintf(directory,"%s%s",getenv("HOME"),ref_directory+1);
305         }
306         else
307         {
308                 sprintf(directory,"%s",ref_directory);
309         }
310         struct stat st;
311         if(stat(directory, &st) != 0)
312         {
313                 mkdir(directory,0700);
314         }
315
316         printf("made\n");
317
318         // get sequences & produce alignments
319         //make index
320         long *file_positions = NULL;
321         long **tmp_pos = &file_positions;
322         int number_of_sequences = make_index_of_file(seq_file_name, tmp_pos);
323         char *tmp_file_name = vtmpnam(NULL);
324         FILE *seq_file = fopen(seq_file_name, "r");
325         int *already_taken = vcalloc(num_seq_in_aln, sizeof(int));
326         int i, j, k, tmp, already, num_already;
327         for (i = 0; i < num_alignments; ++i)
328         {
329                 //choose randomly the sequences
330
331                 num_already = 0;
332                 while (num_already < num_seq_in_aln)
333                 {
334                         already = 0;
335                         tmp = get_random_value(0, number_of_sequences-1);
336                         for (k = 0; k < num_already; ++k)
337                         {
338                                 if (already_taken[k] == tmp)
339                                 {
340                                         already = 1;
341                                         break;
342                                 }
343                         }
344                         if (!already)
345                         {
346                                 already_taken[num_already] = tmp;
347                                 ++num_already;
348                         }
349                 }
350                 //write sequences into temporary file
351                 const int LINE_LENGTH = 200;
352                 char line[LINE_LENGTH];
353
354
355                 FILE* tmp_file = fopen(tmp_file_name, "w");
356                 for (k = 0; k < num_already; ++k)
357                 {
358                         fseek(seq_file, file_positions[already_taken[k]], SEEK_SET);
359                         fgets(line, LINE_LENGTH, seq_file);
360                         fprintf(tmp_file, "%s", line);
361                         while( (fgets(line, LINE_LENGTH, seq_file)) != NULL)
362                         {
363                                 if (line[0] == '>')
364                                         break;
365                                 fprintf(tmp_file, "%s", line);
366                         }
367                 }
368                 fclose(tmp_file);
369
370                 //calculate alignment
371                 char aln_command[1000];
372                 sprintf(aln_command,"t_coffee -in %s -outfile %s/%i.fa -output fasta_aln 2>/dev/null >/dev/null", tmp_file_name, directory, i);
373                 system(aln_command);
374         }
375
376         vfree(already_taken);
377 }
378
379
380
381 // void
382 // compute agreement_score(char *alignment_file_name, char *alignment_directory)
383 // {
384 //
385 // }
386
387
388
389
390 /**
391  * This algorithm choses according to a tree a number of sequenes of which it computes a reference algnment.
392  * \param seq_file_name The sequence file.
393  * \param tree_file_name The given tree.
394  * \param ref_aln_name The file where the reference alignment should be written to.
395  * \param num_seq_in_ref The number of sequences to choose.
396 **/
397 void
398 make_ref_alignment(char *seq_file_name, char *tree_file_name, char *ref_aln_name, int num_seq_in_ref)
399 {
400         const int LINE_LENGTH = 200;
401         char line[LINE_LENGTH];
402
403         FILE *tree_f = fopen(tree_file_name,"r");
404
405
406         long *file_positions = NULL;
407         long **tmp = &file_positions;
408         int num_sequences = make_index_of_file(seq_file_name, tmp);
409
410
411         int every_x = num_sequences/(num_seq_in_ref);
412
413         int *seq_ids = vcalloc(num_seq_in_ref,sizeof(int));
414         char delims[] = " ";
415
416         int pos = -1;
417         fseek (tree_f, 0, SEEK_SET);
418         int node;
419         int dist = every_x;
420         while(fgets(line, LINE_LENGTH, tree_f)!=NULL)
421         {
422                 node = atoi(strtok(line,delims));
423                 if (node < num_sequences)
424                 {
425                         if (dist == every_x)
426                         {
427
428                                 seq_ids[++pos] = node;
429                                 dist = 0;
430                         }
431                         else
432                         {
433                                 ++dist;
434                         }
435                 }
436
437                 node = atoi(strtok(NULL,delims));
438                 if (node < num_sequences)
439                 {
440                         if (dist == every_x)
441                         {
442                                 seq_ids[++pos] = node;
443                                 dist = 0;
444                         }
445                         else
446                         {
447                                 ++dist;
448                         }
449                 }
450         }
451         fclose(tree_f);
452
453
454         int i = 0;
455         qsort(seq_ids, num_seq_in_ref, sizeof(int), compare);
456
457         char *seq_ref_name = vtmpnam(NULL);
458         FILE *seq_ref_f = fopen(seq_ref_name, "w");;
459         FILE *seq_f = fopen(seq_file_name, "r");
460
461         for (i = 0; i < num_seq_in_ref; ++i)
462         {
463                 fseek (seq_f, file_positions[seq_ids[i]], SEEK_SET);
464                 fgets(line, LINE_LENGTH, seq_f);
465                 fprintf(seq_ref_f, "%s", line);
466                 while(fgets(line, LINE_LENGTH, seq_f)!=NULL)
467                 {
468                         if (line[0] == '>')
469                                 break;
470                         fprintf(seq_ref_f, "%s", line);
471                 }
472         }
473         fclose(seq_ref_f);
474         fclose(seq_f);
475
476         char command[500];
477         sprintf(command, "mafft --quiet %s > %s", seq_ref_name, ref_aln_name);
478         system(command);
479 }
480
481
482 /**
483  * This function calculates the agreement between two alignments in a specified number of sequences.
484  * \param ref_file_name The reference alignment.
485  * \param aln_file_name The test alignment.
486  **/
487 double
488 agreement_score(char *ref_file_name, char *aln_file_name)
489 {
490         const int LINE_LENGTH = 200;
491         char line[LINE_LENGTH];
492
493         int number_of_sequences = 0;
494         FILE *ref_f = fopen(ref_file_name,"r");
495         while(fgets(line, LINE_LENGTH, ref_f)!=NULL)
496         {
497                 if (line[0] == '>')
498                 {
499                         ++number_of_sequences;
500                 }
501         }
502         fseek(ref_f, 0, SEEK_SET);
503         char **seq_names = vcalloc(number_of_sequences, sizeof(char*));
504         int i = 0;
505         while(fgets(line, LINE_LENGTH, ref_f)!=NULL)
506         {
507                 if (line[0] == '>')
508                 {
509                         seq_names[i] = vcalloc(LINE_LENGTH, sizeof(char));
510                         sprintf(seq_names[i], "%s",line);
511                         ++i;
512                 }
513         }
514
515
516         fclose(ref_f);
517         FILE *aln_f = fopen(aln_file_name,"r");
518         char *tmp_name = vtmpnam(NULL);
519         FILE *tmp_f = fopen(tmp_name,"w");
520         int x = 0;
521         long last_pos = -1;
522         while(fgets(line, LINE_LENGTH, aln_f)!=NULL)
523         {
524                 if (line[0] == '>')
525                 {
526                         for (i = 0; i < number_of_sequences; ++i)
527                         {
528                                 if (!strcmp(line, seq_names[i]))
529                                 {
530                                         fprintf(tmp_f, "%s", line);
531                                         last_pos = ftell(aln_f);
532                                         while(fgets(line, LINE_LENGTH, aln_f)!=NULL)
533                                         {
534                                                 if (line[0] == '>')
535                                                         break;
536                                                 fprintf(tmp_f, "%s", line);
537
538                                                 last_pos = ftell(aln_f);
539                                         }
540                                         ++x;
541                                         fseek(aln_f, last_pos, SEEK_SET);
542                                         break;
543                                 }
544                         }
545                 }
546                 if (x == number_of_sequences)
547                         break;
548         }
549         fclose(aln_f);
550         fclose(tmp_f);
551         char command[500];
552 //      sprintf(command, "cp %s /users/cn/ckemena/Desktop/1.fa", tmp_name);
553 //      system(command);
554 //      fp = popen("ls -l", "r");
555         sprintf(command, "t_coffee -other_pg aln_compare -al1 %s -al2 %s ", ref_file_name, tmp_name);
556
557
558         FILE *result;
559         result = popen(command, "r");
560         fgets(line, LINE_LENGTH, result);
561         fgets(line, LINE_LENGTH, result);
562         fgets(line, LINE_LENGTH, result);
563
564 //      sprintf(command, "cp %s ~/Destkop/1.fa", tmp_name);
565 //      system(command);
566
567         char delims[] = " ";
568         char *tmp_str;
569         strtok(line, delims);
570         for (i = 0; i < 3; ++i)
571         {
572                 while((tmp_str = strtok(NULL, delims))==NULL);
573         }
574         return(atof(tmp_str));
575 }
576
577
578 complete_agreement_score(char *aln_file_name, const char *ref_directory)
579 {
580         struct dirent *dp;
581         char *name = strrchr(aln_file_name,'/')+1;
582 //      printf("%s ",name);
583      // enter existing path to directory below
584         char directory[500];
585         if (ref_directory[0]== '~')
586         {
587                 sprintf(directory,"%s%s",getenv("HOME"),ref_directory+1);
588         }
589         else
590         {
591                 sprintf(directory,"%s",ref_directory);
592         }
593         DIR *dir = opendir(directory);
594
595         char ref_file_name[200];
596         while ((dp=readdir(dir)) != NULL)
597         {
598                 if ((strcmp(dp->d_name,".")) && (strcmp(dp->d_name,"..")))
599                 {
600
601                         sprintf(ref_file_name, "%s/%s",directory, dp->d_name);
602                         printf("%s %f\n",name, agreement_score(ref_file_name, aln_file_name));
603 //                      printf("%f ", agreement_score(ref_file_name, aln_file_name));
604
605                 }
606         }
607         printf("\n");
608         closedir(dir);
609         return 0;
610
611 }
612
613
614 //      char delims[] = " ";
615 //      int node[3];
616 //      int alignment_length = -1;
617 //      node[2] = -1;
618
619
620         //bottom-up traversal
621 //      while(fgets(line, LINE_LENGTH, tree_file)!=NULL)
622 // {
623 //              //read profiles
624 //      node[0] = atoi(strtok(line,delims));
625 //      node[1] = atoi(strtok(NULL,delims));
626 //      node[2] = atoi(strtok(NULL,delims));
627 /******************************COPYRIGHT NOTICE*******************************/
628 /*© Centro de Regulacio Genomica */
629 /*and */
630 /*Cedric Notredame */
631 /*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
632 /*All rights reserved.*/
633 /*This file is part of T-COFFEE.*/
634 /**/
635 /*    T-COFFEE is free software; you can redistribute it and/or modify*/
636 /*    it under the terms of the GNU General Public License as published by*/
637 /*    the Free Software Foundation; either version 2 of the License, or*/
638 /*    (at your option) any later version.*/
639 /**/
640 /*    T-COFFEE is distributed in the hope that it will be useful,*/
641 /*    but WITHOUT ANY WARRANTY; without even the implied warranty of*/
642 /*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the*/
643 /*    GNU General Public License for more details.*/
644 /**/
645 /*    You should have received a copy of the GNU General Public License*/
646 /*    along with Foobar; if not, write to the Free Software*/
647 /*    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA*/
648 /*...............................................                                                                                      |*/
649 /*  If you need some more information*/
650 /*  cedric.notredame@europe.com*/
651 /*...............................................                                                                                                                                     |*/
652 /**/
653 /**/
654 /*      */
655 /******************************COPYRIGHT NOTICE*******************************/