Mac binaries
[jabaws.git] / website / archive / binaries / mac / src / tcoffee / t_coffee_source / scoring.c
diff --git a/website/archive/binaries/mac/src/tcoffee/t_coffee_source/scoring.c b/website/archive/binaries/mac/src/tcoffee/t_coffee_source/scoring.c
new file mode 100644 (file)
index 0000000..04a89b9
--- /dev/null
@@ -0,0 +1,655 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <ctype.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <dirent.h>
+
+
+#include "io_lib_header.h"
+#include "util_lib_header.h"
+#include "fastal_lib_header.h"
+
+
+
+
+/**
+ * \brief Function to calculate the sum of pairs score with quasi affine gap costs.
+ *
+ * \param alignment_file File with the aligned sequences in fasta_format.
+ * \param file_index Positions of the sequences in \a alignment_file.
+ * \param number_of_sequences The number of sequences in \a alignment_file.
+ * \param score_matrix Matrix containing the scores.
+ * \param gop The gap opening costs.
+ * \param gep The gep extension costs.
+ * \return The score of the alignment.
+ */
+double
+calculate_sum_of_pairs_score_affine(char *alignment_file_name,
+                                                                       int **score_matrix,
+                                                                       double gop,
+                                                                       double gep)
+{
+
+       const int LINE_LENGTH = 1000;
+       int number_of_sequences = 0;
+       FILE *alignment_file = fopen(alignment_file_name,"r");
+       if (alignment_file == NULL)
+       {
+               printf("FILE COULD NOT BE OPENED!\n");
+               exit(1);
+       }
+       int alignment_length = 0;
+       char line[LINE_LENGTH+1];
+       line[0] = '0';
+
+       while (line[0] != '>')
+       {
+               fgets(line, LINE_LENGTH, alignment_file);
+       }
+       fgets(line, LINE_LENGTH, alignment_file);
+       while (line[0] != '>')
+       {
+               alignment_length += strlen(line);
+               fgets(line, LINE_LENGTH, alignment_file);
+       }
+
+       int alphabet_size = 28;
+       int **counting = vcalloc(alphabet_size, sizeof(int*));
+
+       int i;
+       for (i = 0; i < alphabet_size; ++i)
+       {
+               counting[i] = vcalloc(alignment_length, sizeof(int));
+       }
+
+       alphabet_size -= 2;
+       char c;
+       int was_gap = 0;
+       int pos_line, pos_alignment = -1;
+       fseek (alignment_file, 0, SEEK_SET);
+       pos_line = 0;
+
+       int gap_line = 0;
+       int current_size = 100;
+       int gap_size = 0;
+       int gap_pos = 0;
+
+       int **gaps = vcalloc(100, sizeof(int*));
+
+       while (fgets(line, LINE_LENGTH, alignment_file) != NULL)
+       {
+               if (line[0] != '>')
+               {
+                       pos_line = -1;
+                       while (((c = line[++pos_line]) != '\n') && (c != '\0'))
+                       {
+                               if (isalpha(c))
+                               {
+                                       ++counting[toupper(c)-'A'][++pos_alignment];
+                                       if (was_gap)
+                                       {
+                                               gaps[gap_line][gap_pos++] = pos_alignment-1;
+                                       }
+                                       was_gap = 0;
+                               }
+                               else
+                               {
+                                       ++counting[alphabet_size][++pos_alignment];
+                                       if (!was_gap)
+                                       {
+                                               ++counting[alphabet_size+1][pos_alignment];
+                                               was_gap = 1;
+                                               if (gap_pos >= gap_size-4)
+                                               {
+                                                       gap_size += 50;
+                                                       gaps[gap_line] = vrealloc(gaps[gap_line], gap_size * sizeof(int));
+                                                       gaps[gap_line][gap_pos++] = pos_alignment;
+                                               }
+                                       }
+                               }
+                       }
+               }
+               else
+               {
+                       if (number_of_sequences != 0)
+                       {
+                               if ((gap_pos % 2) != 0)
+                               {
+                                       gaps[gap_line][gap_pos++] = pos_alignment-1;
+                               }
+                               gaps[gap_line][gap_pos++] = alignment_length+2;
+                               gaps[gap_line][gap_pos] = alignment_length+2;
+                               ++gap_line;
+                       }
+                       if (current_size == gap_line)
+                       {
+                               current_size += 50;
+                               gaps = vrealloc(gaps, current_size * sizeof(int*));
+                       }
+                       gap_size = 2;
+                       gaps[gap_line] = vcalloc(gap_size, sizeof(int));
+                       ++number_of_sequences;
+                       pos_alignment = -1;
+                       was_gap = 0;
+
+                       gap_pos = 0;
+               }
+       }
+
+       gaps[gap_line][gap_pos++] = alignment_length+2;
+       gaps[gap_line][gap_pos] = alignment_length+2;
+
+
+       long double score = 0;
+       int j, k, tmp2;
+       int non_gap;
+       for (i = 0; i < alignment_length; ++i)
+       {
+               for (j = 0; j < alphabet_size; ++j)
+               {
+                       if ((tmp2 = counting[j][i]) >1)
+                       {
+                               while (tmp2 > 1)
+                               {
+                                       score += score_matrix[j][j] * (--tmp2);
+                               }
+                       }
+                       for (k = j+1; k < alphabet_size; ++k)
+                       {
+                               score += score_matrix[j][k] * counting[j][i] * counting[k][i];
+                       }
+
+               }
+
+
+               non_gap = number_of_sequences - counting[alphabet_size][i];
+               score += counting[alphabet_size][i] * non_gap  * gep;
+
+       }
+
+       ++gap_line;
+       alignment_length += 2;
+       unsigned long gap_open = 0;
+
+       int chunk = 500;
+
+//     #pragma omp parallel shared(gaps, chunk, alignment_length, gap_open) private(i, j)
+       {
+               int alignment_length2 = alignment_length;
+//             #pragma omp for schedule(dynamic,chunk) reduction(+:gap_open) nowait
+               for (i = 0; i < gap_line; ++i)
+               {
+                       int *gaps1 = gaps[i];
+                       for (j = i+1; j < gap_line; ++j)
+                       {
+                               int *gaps2 = gaps[j];
+                               int k = 0;
+                               int l = 0;
+                               while ((gaps1[k] != alignment_length2) && (gaps2[l] != alignment_length2))
+                               {
+                                       if (gaps1[k+1] < gaps2[l])
+                                       {
+                                               ++gap_open;
+                                               k+=2;
+                                               continue;
+                                       }
+                                       if (gaps2[l+1] < gaps1[k])
+                                       {
+                                               ++gap_open;
+                                               l+=2;
+                                               continue;
+                                       }
+                                       if (gaps1[k] < gaps2[l])
+                                       {
+                                               if (gaps1[k+1] < gaps2[l+1])
+                                               {
+                                                       ++gap_open;
+                                                       k+=2;
+                                                       continue;
+                                               }
+                                               else
+                                               {
+                                                       l+=2;
+                                                       continue;
+                                               }
+                                       }
+                                       if (gaps1[k] > gaps2[l])
+                                       {
+                                               if (gaps1[k+1] > gaps2[l+1])
+                                               {
+                                                       ++gap_open;
+                                                       l+=2;
+                                                       continue;
+                                               }
+                                               else
+                                               {
+                                                       k+=2;
+                                                       continue;
+                                               }
+                                       }
+                                       if (gaps1[k] == gaps2[l])
+                                       {
+                                               if (gaps1[k+1] == gaps2[l+1])
+                                               {
+                                                       k+=2;
+                                                       l+=2;
+                                                       continue;
+                                               }
+                                               else
+                                               {
+                                                       if (gaps1[k+1] <gaps2[l+1])
+                                                       {
+                                                               k+=2;
+                                                       }
+                                                       else
+                                                       {
+                                                               l+=2;
+                                                       }
+                                               }
+                                       }
+
+                               }
+                               while (gaps1[k] != alignment_length2)
+                               {
+                                       ++gap_open;
+                                       k+=2;
+                               }
+                               while (gaps2[l] != alignment_length2)
+                               {
+                                       ++gap_open;
+                                       l+=2;
+                               }
+                       }
+               }
+       }
+
+       score += gop * gap_open;
+
+
+       return score / number_of_sequences;
+}
+
+
+int
+compare (const void * a, const void * b)
+{
+       return (*(int*)a - *(int*)b);
+}
+
+
+
+int
+get_random_value(int a, int b)
+{
+       return rand() % (b - a + 1) + a;
+//     j = 1 + (int)( 10.0 * rand() / ( RAND_MAX + 1.0 ) );
+}
+
+
+
+
+
+void
+compute_ref_alignments(char *seq_file_name, char* ref_directory, int num_alignments, int num_seq_in_aln)
+{
+
+       printf("%s %s %i %i\n", seq_file_name, ref_directory, num_alignments, num_seq_in_aln);
+       //check if target directory exists, if not make it
+       char directory[500];
+       if (ref_directory[0]== '~')
+       {
+               sprintf(directory,"%s%s",getenv("HOME"),ref_directory+1);
+       }
+       else
+       {
+               sprintf(directory,"%s",ref_directory);
+       }
+       struct stat st;
+       if(stat(directory, &st) != 0)
+       {
+               mkdir(directory,0700);
+       }
+
+       printf("made\n");
+
+       // get sequences & produce alignments
+       //make index
+       long *file_positions = NULL;
+       long **tmp_pos = &file_positions;
+       int number_of_sequences = make_index_of_file(seq_file_name, tmp_pos);
+       char *tmp_file_name = vtmpnam(NULL);
+       FILE *seq_file = fopen(seq_file_name, "r");
+       int *already_taken = vcalloc(num_seq_in_aln, sizeof(int));
+       int i, j, k, tmp, already, num_already;
+       for (i = 0; i < num_alignments; ++i)
+       {
+               //choose randomly the sequences
+
+               num_already = 0;
+               while (num_already < num_seq_in_aln)
+               {
+                       already = 0;
+                       tmp = get_random_value(0, number_of_sequences-1);
+                       for (k = 0; k < num_already; ++k)
+                       {
+                               if (already_taken[k] == tmp)
+                               {
+                                       already = 1;
+                                       break;
+                               }
+                       }
+                       if (!already)
+                       {
+                               already_taken[num_already] = tmp;
+                               ++num_already;
+                       }
+               }
+               //write sequences into temporary file
+               const int LINE_LENGTH = 200;
+               char line[LINE_LENGTH];
+
+
+               FILE* tmp_file = fopen(tmp_file_name, "w");
+               for (k = 0; k < num_already; ++k)
+               {
+                       fseek(seq_file, file_positions[already_taken[k]], SEEK_SET);
+                       fgets(line, LINE_LENGTH, seq_file);
+                       fprintf(tmp_file, "%s", line);
+                       while( (fgets(line, LINE_LENGTH, seq_file)) != NULL)
+                       {
+                               if (line[0] == '>')
+                                       break;
+                               fprintf(tmp_file, "%s", line);
+                       }
+               }
+               fclose(tmp_file);
+
+               //calculate alignment
+               char aln_command[1000];
+               sprintf(aln_command,"t_coffee -in %s -outfile %s/%i.fa -output fasta_aln 2>/dev/null >/dev/null", tmp_file_name, directory, i);
+               system(aln_command);
+       }
+
+       vfree(already_taken);
+}
+
+
+
+// void
+// compute agreement_score(char *alignment_file_name, char *alignment_directory)
+// {
+//
+// }
+
+
+
+
+/**
+ * This algorithm choses according to a tree a number of sequenes of which it computes a reference algnment.
+ * \param seq_file_name The sequence file.
+ * \param tree_file_name The given tree.
+ * \param ref_aln_name The file where the reference alignment should be written to.
+ * \param num_seq_in_ref The number of sequences to choose.
+**/
+void
+make_ref_alignment(char *seq_file_name, char *tree_file_name, char *ref_aln_name, int num_seq_in_ref)
+{
+       const int LINE_LENGTH = 200;
+       char line[LINE_LENGTH];
+
+       FILE *tree_f = fopen(tree_file_name,"r");
+
+
+       long *file_positions = NULL;
+       long **tmp = &file_positions;
+       int num_sequences = make_index_of_file(seq_file_name, tmp);
+
+
+       int every_x = num_sequences/(num_seq_in_ref);
+
+       int *seq_ids = vcalloc(num_seq_in_ref,sizeof(int));
+       char delims[] = " ";
+
+       int pos = -1;
+       fseek (tree_f, 0, SEEK_SET);
+       int node;
+       int dist = every_x;
+       while(fgets(line, LINE_LENGTH, tree_f)!=NULL)
+       {
+               node = atoi(strtok(line,delims));
+               if (node < num_sequences)
+               {
+                       if (dist == every_x)
+                       {
+
+                               seq_ids[++pos] = node;
+                               dist = 0;
+                       }
+                       else
+                       {
+                               ++dist;
+                       }
+               }
+
+               node = atoi(strtok(NULL,delims));
+               if (node < num_sequences)
+               {
+                       if (dist == every_x)
+                       {
+                               seq_ids[++pos] = node;
+                               dist = 0;
+                       }
+                       else
+                       {
+                               ++dist;
+                       }
+               }
+       }
+       fclose(tree_f);
+
+
+       int i = 0;
+       qsort(seq_ids, num_seq_in_ref, sizeof(int), compare);
+
+       char *seq_ref_name = vtmpnam(NULL);
+       FILE *seq_ref_f = fopen(seq_ref_name, "w");;
+       FILE *seq_f = fopen(seq_file_name, "r");
+
+       for (i = 0; i < num_seq_in_ref; ++i)
+       {
+               fseek (seq_f, file_positions[seq_ids[i]], SEEK_SET);
+               fgets(line, LINE_LENGTH, seq_f);
+               fprintf(seq_ref_f, "%s", line);
+               while(fgets(line, LINE_LENGTH, seq_f)!=NULL)
+               {
+                       if (line[0] == '>')
+                               break;
+                       fprintf(seq_ref_f, "%s", line);
+               }
+       }
+       fclose(seq_ref_f);
+       fclose(seq_f);
+
+       char command[500];
+       sprintf(command, "mafft --quiet %s > %s", seq_ref_name, ref_aln_name);
+       system(command);
+}
+
+
+/**
+ * This function calculates the agreement between two alignments in a specified number of sequences.
+ * \param ref_file_name The reference alignment.
+ * \param aln_file_name The test alignment.
+ **/
+double
+agreement_score(char *ref_file_name, char *aln_file_name)
+{
+       const int LINE_LENGTH = 200;
+       char line[LINE_LENGTH];
+
+       int number_of_sequences = 0;
+       FILE *ref_f = fopen(ref_file_name,"r");
+       while(fgets(line, LINE_LENGTH, ref_f)!=NULL)
+       {
+               if (line[0] == '>')
+               {
+                       ++number_of_sequences;
+               }
+       }
+       fseek(ref_f, 0, SEEK_SET);
+       char **seq_names = vcalloc(number_of_sequences, sizeof(char*));
+       int i = 0;
+       while(fgets(line, LINE_LENGTH, ref_f)!=NULL)
+       {
+               if (line[0] == '>')
+               {
+                       seq_names[i] = vcalloc(LINE_LENGTH, sizeof(char));
+                       sprintf(seq_names[i], "%s",line);
+                       ++i;
+               }
+       }
+
+
+       fclose(ref_f);
+       FILE *aln_f = fopen(aln_file_name,"r");
+       char *tmp_name = vtmpnam(NULL);
+       FILE *tmp_f = fopen(tmp_name,"w");
+       int x = 0;
+       long last_pos = -1;
+       while(fgets(line, LINE_LENGTH, aln_f)!=NULL)
+       {
+               if (line[0] == '>')
+               {
+                       for (i = 0; i < number_of_sequences; ++i)
+                       {
+                               if (!strcmp(line, seq_names[i]))
+                               {
+                                       fprintf(tmp_f, "%s", line);
+                                       last_pos = ftell(aln_f);
+                                       while(fgets(line, LINE_LENGTH, aln_f)!=NULL)
+                                       {
+                                               if (line[0] == '>')
+                                                       break;
+                                               fprintf(tmp_f, "%s", line);
+
+                                               last_pos = ftell(aln_f);
+                                       }
+                                       ++x;
+                                       fseek(aln_f, last_pos, SEEK_SET);
+                                       break;
+                               }
+                       }
+               }
+               if (x == number_of_sequences)
+                       break;
+       }
+       fclose(aln_f);
+       fclose(tmp_f);
+       char command[500];
+//     sprintf(command, "cp %s /users/cn/ckemena/Desktop/1.fa", tmp_name);
+//     system(command);
+//     fp = popen("ls -l", "r");
+       sprintf(command, "t_coffee -other_pg aln_compare -al1 %s -al2 %s ", ref_file_name, tmp_name);
+
+
+       FILE *result;
+       result = popen(command, "r");
+       fgets(line, LINE_LENGTH, result);
+       fgets(line, LINE_LENGTH, result);
+       fgets(line, LINE_LENGTH, result);
+
+//     sprintf(command, "cp %s ~/Destkop/1.fa", tmp_name);
+//     system(command);
+
+       char delims[] = " ";
+       char *tmp_str;
+       strtok(line, delims);
+       for (i = 0; i < 3; ++i)
+       {
+               while((tmp_str = strtok(NULL, delims))==NULL);
+       }
+       return(atof(tmp_str));
+}
+
+
+complete_agreement_score(char *aln_file_name, const char *ref_directory)
+{
+       struct dirent *dp;
+       char *name = strrchr(aln_file_name,'/')+1;
+//     printf("%s ",name);
+     // enter existing path to directory below
+       char directory[500];
+       if (ref_directory[0]== '~')
+       {
+               sprintf(directory,"%s%s",getenv("HOME"),ref_directory+1);
+       }
+       else
+       {
+               sprintf(directory,"%s",ref_directory);
+       }
+       DIR *dir = opendir(directory);
+
+       char ref_file_name[200];
+       while ((dp=readdir(dir)) != NULL)
+       {
+               if ((strcmp(dp->d_name,".")) && (strcmp(dp->d_name,"..")))
+               {
+
+                       sprintf(ref_file_name, "%s/%s",directory, dp->d_name);
+                       printf("%s %f\n",name, agreement_score(ref_file_name, aln_file_name));
+//                     printf("%f ", agreement_score(ref_file_name, aln_file_name));
+
+               }
+       }
+       printf("\n");
+       closedir(dir);
+       return 0;
+
+}
+
+
+//     char delims[] = " ";
+//     int node[3];
+//     int alignment_length = -1;
+//     node[2] = -1;
+
+
+       //bottom-up traversal
+//     while(fgets(line, LINE_LENGTH, tree_file)!=NULL)
+// {
+//             //read profiles
+//     node[0] = atoi(strtok(line,delims));
+//     node[1] = atoi(strtok(NULL,delims));
+//     node[2] = atoi(strtok(NULL,delims));
+/******************************COPYRIGHT NOTICE*******************************/
+/*© Centro de Regulacio Genomica */
+/*and */
+/*Cedric Notredame */
+/*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
+/*All rights reserved.*/
+/*This file is part of T-COFFEE.*/
+/**/
+/*    T-COFFEE is free software; you can redistribute it and/or modify*/
+/*    it under the terms of the GNU General Public License as published by*/
+/*    the Free Software Foundation; either version 2 of the License, or*/
+/*    (at your option) any later version.*/
+/**/
+/*    T-COFFEE is distributed in the hope that it will be useful,*/
+/*    but WITHOUT ANY WARRANTY; without even the implied warranty of*/
+/*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the*/
+/*    GNU General Public License for more details.*/
+/**/
+/*    You should have received a copy of the GNU General Public License*/
+/*    along with Foobar; if not, write to the Free Software*/
+/*    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA*/
+/*...............................................                                                                                      |*/
+/*  If you need some more information*/
+/*  cedric.notredame@europe.com*/
+/*...............................................                                                                                                                                     |*/
+/**/
+/**/
+/*     */
+/******************************COPYRIGHT NOTICE*******************************/