--- /dev/null
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <ctype.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <dirent.h>
+
+
+#include "io_lib_header.h"
+#include "util_lib_header.h"
+#include "fastal_lib_header.h"
+
+
+
+
+/**
+ * \brief Function to calculate the sum of pairs score with quasi affine gap costs.
+ *
+ * \param alignment_file File with the aligned sequences in fasta_format.
+ * \param file_index Positions of the sequences in \a alignment_file.
+ * \param number_of_sequences The number of sequences in \a alignment_file.
+ * \param score_matrix Matrix containing the scores.
+ * \param gop The gap opening costs.
+ * \param gep The gep extension costs.
+ * \return The score of the alignment.
+ */
+double
+calculate_sum_of_pairs_score_affine(char *alignment_file_name,
+ int **score_matrix,
+ double gop,
+ double gep)
+{
+
+ const int LINE_LENGTH = 1000;
+ int number_of_sequences = 0;
+ FILE *alignment_file = fopen(alignment_file_name,"r");
+ if (alignment_file == NULL)
+ {
+ printf("FILE COULD NOT BE OPENED!\n");
+ exit(1);
+ }
+ int alignment_length = 0;
+ char line[LINE_LENGTH+1];
+ line[0] = '0';
+
+ while (line[0] != '>')
+ {
+ fgets(line, LINE_LENGTH, alignment_file);
+ }
+ fgets(line, LINE_LENGTH, alignment_file);
+ while (line[0] != '>')
+ {
+ alignment_length += strlen(line);
+ fgets(line, LINE_LENGTH, alignment_file);
+ }
+
+ int alphabet_size = 28;
+ int **counting = vcalloc(alphabet_size, sizeof(int*));
+
+ int i;
+ for (i = 0; i < alphabet_size; ++i)
+ {
+ counting[i] = vcalloc(alignment_length, sizeof(int));
+ }
+
+ alphabet_size -= 2;
+ char c;
+ int was_gap = 0;
+ int pos_line, pos_alignment = -1;
+ fseek (alignment_file, 0, SEEK_SET);
+ pos_line = 0;
+
+ int gap_line = 0;
+ int current_size = 100;
+ int gap_size = 0;
+ int gap_pos = 0;
+
+ int **gaps = vcalloc(100, sizeof(int*));
+
+ while (fgets(line, LINE_LENGTH, alignment_file) != NULL)
+ {
+ if (line[0] != '>')
+ {
+ pos_line = -1;
+ while (((c = line[++pos_line]) != '\n') && (c != '\0'))
+ {
+ if (isalpha(c))
+ {
+ ++counting[toupper(c)-'A'][++pos_alignment];
+ if (was_gap)
+ {
+ gaps[gap_line][gap_pos++] = pos_alignment-1;
+ }
+ was_gap = 0;
+ }
+ else
+ {
+ ++counting[alphabet_size][++pos_alignment];
+ if (!was_gap)
+ {
+ ++counting[alphabet_size+1][pos_alignment];
+ was_gap = 1;
+ if (gap_pos >= gap_size-4)
+ {
+ gap_size += 50;
+ gaps[gap_line] = vrealloc(gaps[gap_line], gap_size * sizeof(int));
+ gaps[gap_line][gap_pos++] = pos_alignment;
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ if (number_of_sequences != 0)
+ {
+ if ((gap_pos % 2) != 0)
+ {
+ gaps[gap_line][gap_pos++] = pos_alignment-1;
+ }
+ gaps[gap_line][gap_pos++] = alignment_length+2;
+ gaps[gap_line][gap_pos] = alignment_length+2;
+ ++gap_line;
+ }
+ if (current_size == gap_line)
+ {
+ current_size += 50;
+ gaps = vrealloc(gaps, current_size * sizeof(int*));
+ }
+ gap_size = 2;
+ gaps[gap_line] = vcalloc(gap_size, sizeof(int));
+ ++number_of_sequences;
+ pos_alignment = -1;
+ was_gap = 0;
+
+ gap_pos = 0;
+ }
+ }
+
+ gaps[gap_line][gap_pos++] = alignment_length+2;
+ gaps[gap_line][gap_pos] = alignment_length+2;
+
+
+ long double score = 0;
+ int j, k, tmp2;
+ int non_gap;
+ for (i = 0; i < alignment_length; ++i)
+ {
+ for (j = 0; j < alphabet_size; ++j)
+ {
+ if ((tmp2 = counting[j][i]) >1)
+ {
+ while (tmp2 > 1)
+ {
+ score += score_matrix[j][j] * (--tmp2);
+ }
+ }
+ for (k = j+1; k < alphabet_size; ++k)
+ {
+ score += score_matrix[j][k] * counting[j][i] * counting[k][i];
+ }
+
+ }
+
+
+ non_gap = number_of_sequences - counting[alphabet_size][i];
+ score += counting[alphabet_size][i] * non_gap * gep;
+
+ }
+
+ ++gap_line;
+ alignment_length += 2;
+ unsigned long gap_open = 0;
+
+ int chunk = 500;
+
+// #pragma omp parallel shared(gaps, chunk, alignment_length, gap_open) private(i, j)
+ {
+ int alignment_length2 = alignment_length;
+// #pragma omp for schedule(dynamic,chunk) reduction(+:gap_open) nowait
+ for (i = 0; i < gap_line; ++i)
+ {
+ int *gaps1 = gaps[i];
+ for (j = i+1; j < gap_line; ++j)
+ {
+ int *gaps2 = gaps[j];
+ int k = 0;
+ int l = 0;
+ while ((gaps1[k] != alignment_length2) && (gaps2[l] != alignment_length2))
+ {
+ if (gaps1[k+1] < gaps2[l])
+ {
+ ++gap_open;
+ k+=2;
+ continue;
+ }
+ if (gaps2[l+1] < gaps1[k])
+ {
+ ++gap_open;
+ l+=2;
+ continue;
+ }
+ if (gaps1[k] < gaps2[l])
+ {
+ if (gaps1[k+1] < gaps2[l+1])
+ {
+ ++gap_open;
+ k+=2;
+ continue;
+ }
+ else
+ {
+ l+=2;
+ continue;
+ }
+ }
+ if (gaps1[k] > gaps2[l])
+ {
+ if (gaps1[k+1] > gaps2[l+1])
+ {
+ ++gap_open;
+ l+=2;
+ continue;
+ }
+ else
+ {
+ k+=2;
+ continue;
+ }
+ }
+ if (gaps1[k] == gaps2[l])
+ {
+ if (gaps1[k+1] == gaps2[l+1])
+ {
+ k+=2;
+ l+=2;
+ continue;
+ }
+ else
+ {
+ if (gaps1[k+1] <gaps2[l+1])
+ {
+ k+=2;
+ }
+ else
+ {
+ l+=2;
+ }
+ }
+ }
+
+ }
+ while (gaps1[k] != alignment_length2)
+ {
+ ++gap_open;
+ k+=2;
+ }
+ while (gaps2[l] != alignment_length2)
+ {
+ ++gap_open;
+ l+=2;
+ }
+ }
+ }
+ }
+
+ score += gop * gap_open;
+
+
+ return score / number_of_sequences;
+}
+
+
+int
+compare (const void * a, const void * b)
+{
+ return (*(int*)a - *(int*)b);
+}
+
+
+
+int
+get_random_value(int a, int b)
+{
+ return rand() % (b - a + 1) + a;
+// j = 1 + (int)( 10.0 * rand() / ( RAND_MAX + 1.0 ) );
+}
+
+
+
+
+
+void
+compute_ref_alignments(char *seq_file_name, char* ref_directory, int num_alignments, int num_seq_in_aln)
+{
+
+ printf("%s %s %i %i\n", seq_file_name, ref_directory, num_alignments, num_seq_in_aln);
+ //check if target directory exists, if not make it
+ char directory[500];
+ if (ref_directory[0]== '~')
+ {
+ sprintf(directory,"%s%s",getenv("HOME"),ref_directory+1);
+ }
+ else
+ {
+ sprintf(directory,"%s",ref_directory);
+ }
+ struct stat st;
+ if(stat(directory, &st) != 0)
+ {
+ mkdir(directory,0700);
+ }
+
+ printf("made\n");
+
+ // get sequences & produce alignments
+ //make index
+ long *file_positions = NULL;
+ long **tmp_pos = &file_positions;
+ int number_of_sequences = make_index_of_file(seq_file_name, tmp_pos);
+ char *tmp_file_name = vtmpnam(NULL);
+ FILE *seq_file = fopen(seq_file_name, "r");
+ int *already_taken = vcalloc(num_seq_in_aln, sizeof(int));
+ int i, j, k, tmp, already, num_already;
+ for (i = 0; i < num_alignments; ++i)
+ {
+ //choose randomly the sequences
+
+ num_already = 0;
+ while (num_already < num_seq_in_aln)
+ {
+ already = 0;
+ tmp = get_random_value(0, number_of_sequences-1);
+ for (k = 0; k < num_already; ++k)
+ {
+ if (already_taken[k] == tmp)
+ {
+ already = 1;
+ break;
+ }
+ }
+ if (!already)
+ {
+ already_taken[num_already] = tmp;
+ ++num_already;
+ }
+ }
+ //write sequences into temporary file
+ const int LINE_LENGTH = 200;
+ char line[LINE_LENGTH];
+
+
+ FILE* tmp_file = fopen(tmp_file_name, "w");
+ for (k = 0; k < num_already; ++k)
+ {
+ fseek(seq_file, file_positions[already_taken[k]], SEEK_SET);
+ fgets(line, LINE_LENGTH, seq_file);
+ fprintf(tmp_file, "%s", line);
+ while( (fgets(line, LINE_LENGTH, seq_file)) != NULL)
+ {
+ if (line[0] == '>')
+ break;
+ fprintf(tmp_file, "%s", line);
+ }
+ }
+ fclose(tmp_file);
+
+ //calculate alignment
+ char aln_command[1000];
+ sprintf(aln_command,"t_coffee -in %s -outfile %s/%i.fa -output fasta_aln 2>/dev/null >/dev/null", tmp_file_name, directory, i);
+ system(aln_command);
+ }
+
+ vfree(already_taken);
+}
+
+
+
+// void
+// compute agreement_score(char *alignment_file_name, char *alignment_directory)
+// {
+//
+// }
+
+
+
+
+/**
+ * This algorithm choses according to a tree a number of sequenes of which it computes a reference algnment.
+ * \param seq_file_name The sequence file.
+ * \param tree_file_name The given tree.
+ * \param ref_aln_name The file where the reference alignment should be written to.
+ * \param num_seq_in_ref The number of sequences to choose.
+**/
+void
+make_ref_alignment(char *seq_file_name, char *tree_file_name, char *ref_aln_name, int num_seq_in_ref)
+{
+ const int LINE_LENGTH = 200;
+ char line[LINE_LENGTH];
+
+ FILE *tree_f = fopen(tree_file_name,"r");
+
+
+ long *file_positions = NULL;
+ long **tmp = &file_positions;
+ int num_sequences = make_index_of_file(seq_file_name, tmp);
+
+
+ int every_x = num_sequences/(num_seq_in_ref);
+
+ int *seq_ids = vcalloc(num_seq_in_ref,sizeof(int));
+ char delims[] = " ";
+
+ int pos = -1;
+ fseek (tree_f, 0, SEEK_SET);
+ int node;
+ int dist = every_x;
+ while(fgets(line, LINE_LENGTH, tree_f)!=NULL)
+ {
+ node = atoi(strtok(line,delims));
+ if (node < num_sequences)
+ {
+ if (dist == every_x)
+ {
+
+ seq_ids[++pos] = node;
+ dist = 0;
+ }
+ else
+ {
+ ++dist;
+ }
+ }
+
+ node = atoi(strtok(NULL,delims));
+ if (node < num_sequences)
+ {
+ if (dist == every_x)
+ {
+ seq_ids[++pos] = node;
+ dist = 0;
+ }
+ else
+ {
+ ++dist;
+ }
+ }
+ }
+ fclose(tree_f);
+
+
+ int i = 0;
+ qsort(seq_ids, num_seq_in_ref, sizeof(int), compare);
+
+ char *seq_ref_name = vtmpnam(NULL);
+ FILE *seq_ref_f = fopen(seq_ref_name, "w");;
+ FILE *seq_f = fopen(seq_file_name, "r");
+
+ for (i = 0; i < num_seq_in_ref; ++i)
+ {
+ fseek (seq_f, file_positions[seq_ids[i]], SEEK_SET);
+ fgets(line, LINE_LENGTH, seq_f);
+ fprintf(seq_ref_f, "%s", line);
+ while(fgets(line, LINE_LENGTH, seq_f)!=NULL)
+ {
+ if (line[0] == '>')
+ break;
+ fprintf(seq_ref_f, "%s", line);
+ }
+ }
+ fclose(seq_ref_f);
+ fclose(seq_f);
+
+ char command[500];
+ sprintf(command, "mafft --quiet %s > %s", seq_ref_name, ref_aln_name);
+ system(command);
+}
+
+
+/**
+ * This function calculates the agreement between two alignments in a specified number of sequences.
+ * \param ref_file_name The reference alignment.
+ * \param aln_file_name The test alignment.
+ **/
+double
+agreement_score(char *ref_file_name, char *aln_file_name)
+{
+ const int LINE_LENGTH = 200;
+ char line[LINE_LENGTH];
+
+ int number_of_sequences = 0;
+ FILE *ref_f = fopen(ref_file_name,"r");
+ while(fgets(line, LINE_LENGTH, ref_f)!=NULL)
+ {
+ if (line[0] == '>')
+ {
+ ++number_of_sequences;
+ }
+ }
+ fseek(ref_f, 0, SEEK_SET);
+ char **seq_names = vcalloc(number_of_sequences, sizeof(char*));
+ int i = 0;
+ while(fgets(line, LINE_LENGTH, ref_f)!=NULL)
+ {
+ if (line[0] == '>')
+ {
+ seq_names[i] = vcalloc(LINE_LENGTH, sizeof(char));
+ sprintf(seq_names[i], "%s",line);
+ ++i;
+ }
+ }
+
+
+ fclose(ref_f);
+ FILE *aln_f = fopen(aln_file_name,"r");
+ char *tmp_name = vtmpnam(NULL);
+ FILE *tmp_f = fopen(tmp_name,"w");
+ int x = 0;
+ long last_pos = -1;
+ while(fgets(line, LINE_LENGTH, aln_f)!=NULL)
+ {
+ if (line[0] == '>')
+ {
+ for (i = 0; i < number_of_sequences; ++i)
+ {
+ if (!strcmp(line, seq_names[i]))
+ {
+ fprintf(tmp_f, "%s", line);
+ last_pos = ftell(aln_f);
+ while(fgets(line, LINE_LENGTH, aln_f)!=NULL)
+ {
+ if (line[0] == '>')
+ break;
+ fprintf(tmp_f, "%s", line);
+
+ last_pos = ftell(aln_f);
+ }
+ ++x;
+ fseek(aln_f, last_pos, SEEK_SET);
+ break;
+ }
+ }
+ }
+ if (x == number_of_sequences)
+ break;
+ }
+ fclose(aln_f);
+ fclose(tmp_f);
+ char command[500];
+// sprintf(command, "cp %s /users/cn/ckemena/Desktop/1.fa", tmp_name);
+// system(command);
+// fp = popen("ls -l", "r");
+ sprintf(command, "t_coffee -other_pg aln_compare -al1 %s -al2 %s ", ref_file_name, tmp_name);
+
+
+ FILE *result;
+ result = popen(command, "r");
+ fgets(line, LINE_LENGTH, result);
+ fgets(line, LINE_LENGTH, result);
+ fgets(line, LINE_LENGTH, result);
+
+// sprintf(command, "cp %s ~/Destkop/1.fa", tmp_name);
+// system(command);
+
+ char delims[] = " ";
+ char *tmp_str;
+ strtok(line, delims);
+ for (i = 0; i < 3; ++i)
+ {
+ while((tmp_str = strtok(NULL, delims))==NULL);
+ }
+ return(atof(tmp_str));
+}
+
+
+complete_agreement_score(char *aln_file_name, const char *ref_directory)
+{
+ struct dirent *dp;
+ char *name = strrchr(aln_file_name,'/')+1;
+// printf("%s ",name);
+ // enter existing path to directory below
+ char directory[500];
+ if (ref_directory[0]== '~')
+ {
+ sprintf(directory,"%s%s",getenv("HOME"),ref_directory+1);
+ }
+ else
+ {
+ sprintf(directory,"%s",ref_directory);
+ }
+ DIR *dir = opendir(directory);
+
+ char ref_file_name[200];
+ while ((dp=readdir(dir)) != NULL)
+ {
+ if ((strcmp(dp->d_name,".")) && (strcmp(dp->d_name,"..")))
+ {
+
+ sprintf(ref_file_name, "%s/%s",directory, dp->d_name);
+ printf("%s %f\n",name, agreement_score(ref_file_name, aln_file_name));
+// printf("%f ", agreement_score(ref_file_name, aln_file_name));
+
+ }
+ }
+ printf("\n");
+ closedir(dir);
+ return 0;
+
+}
+
+
+// char delims[] = " ";
+// int node[3];
+// int alignment_length = -1;
+// node[2] = -1;
+
+
+ //bottom-up traversal
+// while(fgets(line, LINE_LENGTH, tree_file)!=NULL)
+// {
+// //read profiles
+// node[0] = atoi(strtok(line,delims));
+// node[1] = atoi(strtok(NULL,delims));
+// node[2] = atoi(strtok(NULL,delims));
+/******************************COPYRIGHT NOTICE*******************************/
+/*© Centro de Regulacio Genomica */
+/*and */
+/*Cedric Notredame */
+/*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
+/*All rights reserved.*/
+/*This file is part of T-COFFEE.*/
+/**/
+/* T-COFFEE is free software; you can redistribute it and/or modify*/
+/* it under the terms of the GNU General Public License as published by*/
+/* the Free Software Foundation; either version 2 of the License, or*/
+/* (at your option) any later version.*/
+/**/
+/* T-COFFEE is distributed in the hope that it will be useful,*/
+/* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
+/* GNU General Public License for more details.*/
+/**/
+/* You should have received a copy of the GNU General Public License*/
+/* along with Foobar; if not, write to the Free Software*/
+/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
+/*............................................... |*/
+/* If you need some more information*/
+/* cedric.notredame@europe.com*/
+/*............................................... |*/
+/**/
+/**/
+/* */
+/******************************COPYRIGHT NOTICE*******************************/