new version of tcoffee 8.99 not yet compiled for ia32 linux (currently compiled for...
[jabaws.git] / binaries / src / tcoffee / t_coffee_source / iteration.c
diff --git a/binaries/src/tcoffee/t_coffee_source/iteration.c b/binaries/src/tcoffee/t_coffee_source/iteration.c
new file mode 100644 (file)
index 0000000..b0d5090
--- /dev/null
@@ -0,0 +1,610 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+// #include <ctype.h>
+// #include <string.h>
+
+#include "io_lib_header.h"
+#include "util_lib_header.h"
+#include "fastal_lib_header.h"
+
+
+/**
+ * Adds a single sequence to a profile.
+ * \param seq The sequence.
+ * \param prf The profile.
+ */
+void
+seq2profile2(char *seq, Fastal_profile *prf, int *char2pos)
+{
+//     printf("SEDQ: %s\n", seq);
+       int **prf_in = prf->prf;
+       int i = -1;
+       --seq;
+       while (*(++seq) !='\0')
+       {       
+               if (*seq != '-')
+                       ++(prf_in[char2pos[*seq]][++i]);
+               else
+                       ++i;
+//             ++seq;
+       }
+}
+
+
+
+
+// typedef struct
+// {
+//     /// Number of sequences in this profile
+//     int num_sequences;
+//     /// number of the profile
+//     int prf_number;
+//     ///0 = combination of two profiles, 1 = profile of a single sequence -> name1 = seq_name
+//     int is_leaf;
+//     ///length of the profile
+//     int length;
+//     ///weight of the sequence
+//     int weight;
+//     ///saves the amount of allocated memory
+//     int allocated_memory;   
+//     ///the profile itself [alphabet_size][profile_length]
+//     int **prf;
+//     ///number_of_sequences
+//     int number_of_sequences;
+// }
+// Fastal_profile;
+
+
+
+
+
+/**
+ * Deletes all gap columns from the profile.
+ * 
+ * \param prf The profile.
+ * \param alphabet_size The size of the alphabet.
+ * \param gap_list The gap_list.
+ * \param gap_list_length The length of the gap list.
+ * \param num_gaps The number of gaps.
+ * \return The new gap list.
+ */
+int *
+del_gap_from_profile(Fastal_profile *prf, int alphabet_size, int *gap_list, int *gap_list_length, int *num_gaps)
+{
+       int i;
+       int pos = -1;
+       int not_gap = 0;
+       int gap_pos = 0;
+       int write_pos = 0;
+//     int gap_counter = 0;
+       int **prf_in = prf->prf;
+       int prf_length = prf->length;
+       
+       while (gap_pos < prf_length)
+       {
+//             printf("PRF1: %i %i %i %i\n", prf_in[0][gap_pos], prf_in[1][gap_pos], prf_in[2][gap_pos], prf_in[3][gap_pos]);
+
+               not_gap = 0;
+               for (i = 0; i < alphabet_size; ++i)
+               {
+                       if (prf_in[i][gap_pos])
+                       {
+                               not_gap = 1;
+//                             printf("ARG: %i\n", i);
+                               break;
+                       }
+               }
+               if (not_gap)
+               {
+                       
+                       if (gap_pos != write_pos)
+                       {
+                               for (i = 0; i < alphabet_size; ++i)
+                               {
+                                       prf_in[i][write_pos] = prf_in[i][gap_pos];
+                               }
+                       }
+                       ++write_pos;
+               }
+               else
+               {
+                       
+                       if (++pos >= *gap_list_length)
+                       {
+                               *gap_list_length += 10;
+                               gap_list = vrealloc(gap_list, (*gap_list_length)*sizeof(int));
+                       }
+                       gap_list[pos] = gap_pos;
+//                     ++gap_counter;
+               }
+               ++gap_pos;
+       }
+//     printf("DEEEEEEEEEEEEEEEEEL %i %i\n", gap_counter, pos+1);
+       
+       *num_gaps = pos+1;
+       prf->length -= *num_gaps;
+       return gap_list;
+}
+
+
+/**
+ * Splits a given alignment according to a given position.
+ * \param alignment_file The alignment file.
+ * \param gap_prf Saves the profile with the gap sequences.
+ * \param no_gap_prf Saves the profile of the non gap sequences.
+ * \param seq A temporary place to save the sequence.
+ * \param index The index where to look for the gaps.
+ * \param char2pos The character to position translation array.
+ */
+void
+split_set(FILE *alignment_file, Fastal_profile *gap_prf, Fastal_profile *no_gap_prf, char *seq, int index, int *char2pos, char* split_file_name)
+{
+       FILE *split_set_file1 = fopen("GAP", "w");
+       FILE *split_set_file2 = fopen("NOGAP", "w");
+//     FILE *shorted = fopen("SHORTT.fa", "w");
+       printf("INDEX: %i\n",index);
+       fseek (alignment_file , 0 , SEEK_SET);
+       FILE *split_file = fopen(split_file_name, "w");
+       int i,j;
+       const int LINE_LENGTH = 200;
+       char line[LINE_LENGTH];
+       int pos = -1;
+       int overall_pos = -1;
+       fgets(line, LINE_LENGTH , alignment_file);
+//     fprintf(shorted, "%s", line);
+       while(fgets(line, LINE_LENGTH , alignment_file)!=NULL)
+       {
+               pos = -1;
+               if (line[0] == '>')
+               {
+//                     fprintf(shorted, "%s", line);
+                       if (seq[index] == '-')
+                       {
+                               
+                               seq[++overall_pos] = '\0';
+                               fprintf(split_file, "g\n");
+                               fprintf(split_set_file1,"%s\n",seq);
+                               seq2profile2(seq, gap_prf, char2pos);
+                               ++(gap_prf->number_of_sequences);
+                       }
+                       else
+                       {
+                               seq[++overall_pos] = '\0';
+                               fprintf(split_file, "n\n");
+                               fprintf(split_set_file2,"%s\n",seq);
+                               seq2profile2(seq, no_gap_prf, char2pos);
+                               ++(no_gap_prf->number_of_sequences);
+                       }
+                       overall_pos = -1;
+                       
+               }
+               else
+               {
+//                     for (j = 35; j < 96; ++j)
+//                     {
+//                             fprintf(shorted, "%c", line[j]);
+//                     }
+//                     fprintf(shorted, "\n");
+                       while ((line[++pos] != '\n') && (line[pos] != '\0'))
+                       {
+                               seq[++overall_pos] = line[pos];
+                       }
+               }
+       }
+
+       if (seq[index] == '-')
+       {
+               seq[++overall_pos] = '\0';
+               fprintf(split_file, "g\n");
+               seq2profile2(seq, gap_prf, char2pos);
+               ++(gap_prf->number_of_sequences);
+       }
+       else
+       {
+               seq[++overall_pos] = '\0';
+               fprintf(split_file, "n\n");
+               seq2profile2(seq, no_gap_prf, char2pos);
+               ++(no_gap_prf->number_of_sequences);
+       }
+       fclose(split_file);
+}
+
+
+Fastal_profile *
+enlarge_prof(Fastal_profile *prof, int new_length, int alphabet_size)
+{
+       if (new_length > prof->allocated_memory)
+       {
+               int i;
+               for (i = 0;i < alphabet_size; ++i)
+               {
+                       prof->prf[i] = vrealloc(prof->prf[i],new_length*sizeof(int));
+               }
+               prof->allocated_memory = new_length;
+       }
+       return prof;
+}
+
+
+
+
+void
+iterate(Fastal_param *param, void *method_arguments_p, char *aln_file_name, char *out_file_name_end, int iteration_number)
+{
+//     calculate alignment length
+       
+//count gap
+       int it_coutner_2 = 0;
+       const int LINE_LENGTH = 200;
+       char line[LINE_LENGTH];
+       char *seq1 = vcalloc(1,sizeof(char));
+       char *seq2 = vcalloc(1,sizeof(char));
+       Fastal_profile **profiles = vcalloc(3,sizeof(Fastal_profile*));
+       initiate_profiles(profiles, param);
+       Fastal_profile *gap_prf = profiles[0];
+       Fastal_profile *no_gap_prf = profiles[1];
+       int alphabet_size = param->alphabet_size;
+
+       int *gap_list_1 = vcalloc(1, sizeof(int));
+       int *gap_list_1_length = vcalloc(1, sizeof(int));
+       *gap_list_1_length = 1;
+       int num_gaps_1 = 0;
+       int *gap_list_2 = vcalloc(1, sizeof(int));
+       int *gap_list_2_length = vcalloc(1, sizeof(int));
+       *gap_list_2_length = 1;
+       int num_gaps_2 = 0;
+
+       int alignment_length = 1;
+       //from here repeat!
+       int it_counter = 0;
+       char *out_file_name = aln_file_name;
+       int *gap_profile = vcalloc(alignment_length, sizeof(int));
+       
+//     while (it_counter < alignment_length)
+//     {
+               
+               
+               FILE *alignment_file = fopen(aln_file_name,"r");
+               if (alignment_file == NULL)
+               {
+                       printf("Could not open alignment file %s\n", aln_file_name);
+                       exit(1);
+               }
+       
+       
+               
+               alignment_length = 0;
+               int tmp_len = -1;
+               fgets(line, LINE_LENGTH , alignment_file);
+               while(fgets(line, LINE_LENGTH , alignment_file)!=NULL)
+               {
+                       tmp_len = -1;
+                       if (line[0] == '>')
+                       {
+                               break;
+                       }
+                       while ((line[++tmp_len] != '\n') && (line[tmp_len] != '\0'));
+                       alignment_length += tmp_len;
+               }
+       //      printf("ALN_LENGTH %i\n", alignment_length);
+               seq1 =vrealloc(seq1, (1+alignment_length)*sizeof(char));
+               
+               gap_profile = vrealloc(gap_profile, alignment_length * sizeof(int));
+               int i;
+               for (i = 0; i < alignment_length; ++i)
+               {
+                       gap_profile[i] = 0;
+               }
+       
+               int number_of_sequences = 0;
+               int pos = -1;
+               fseek (alignment_file , 0 , SEEK_SET);
+               while(fgets(line, LINE_LENGTH , alignment_file)!=NULL)
+               {
+                       if (line[0] == '>')
+                       {
+                               ++number_of_sequences;
+                               pos = -1;
+                       }
+                       else
+                       {
+                               i = -1;
+                               while ((line[++i] != '\n') && (line[i] != '\0'))
+                               {
+                                       ++pos;
+                                       if (line[i] == '-')
+                                       {
+                                               ++gap_profile[pos];
+                                       }
+                               }
+                       }
+               }
+       
+               double max_entrop = 0;
+               int column_index = 0;
+               double entrop = 0;
+               double last = 0;
+               double p;
+
+               
+//             for (i = it_counter; i<=it_counter; ++i)
+//             {
+//                     p = gap_profile[i]/(double)number_of_sequences;
+//                     if (!p)
+//                     {
+//                             entrop = 0;
+//                     }
+//                     else
+//                     {
+//                             entrop = (-1)*(p*log(p) + (1-p)*log(1-p) ) ;
+//                     }
+//                     if (entrop > max_entrop)
+//                     {
+//                             column_index = i;
+//                             max_entrop = entrop;
+//                     }
+//                     last = entrop;
+//             }
+//             ++it_counter;
+//             if (max_entrop < 0.6)
+//             {
+//                     printf("CONTINUE %f\n",entrop);
+//                     continue;
+//             }
+               
+//             column_index = 18;//it_counter-1;
+//             if (column_index == 19)
+                       column_index = 58;
+               out_file_name = vtmpnam(NULL);
+               
+               char *edit_file_name = "EDIT";//vtmpnam(NULL);
+               FILE *edit_file = fopen(edit_file_name,"w");
+               char *profile_file_name = vtmpnam(NULL);
+               FILE *profile_file = fopen(profile_file_name,"w");
+               char *split_file_name = "AHA";//vtmpnam(NULL);
+               ++it_coutner_2;
+
+               gap_prf = enlarge_prof(gap_prf, alignment_length, alphabet_size);
+               no_gap_prf = enlarge_prof(no_gap_prf, alignment_length, alphabet_size );
+               no_gap_prf->number_of_sequences = 0;
+               gap_prf->number_of_sequences = 0;
+       
+               split_set(alignment_file, gap_prf, no_gap_prf, seq1, column_index, param->char2pos, split_file_name);
+               gap_prf -> length = alignment_length;
+               no_gap_prf -> length = alignment_length;
+
+
+               gap_list_1 = del_gap_from_profile(gap_prf, alphabet_size, gap_list_1, gap_list_1_length, &num_gaps_1 );
+               gap_list_2 = del_gap_from_profile(no_gap_prf, alphabet_size, gap_list_2, gap_list_2_length, &num_gaps_2 );
+
+
+               fclose(alignment_file);
+               profiles[0] = gap_prf;
+               profiles[1] = no_gap_prf;
+
+               alignment_length = gotoh_dyn(profiles, param, method_arguments_p, 0, edit_file, profile_file, 0);
+               seq1 =vrealloc(seq1, (1+alignment_length)*sizeof(char));
+               seq2 =vrealloc(seq2, (1+alignment_length)*sizeof(char));
+               
+               fclose(edit_file);
+               edit_file = fopen(edit_file_name,"r");
+               edit2seq_pattern(edit_file, seq1, seq2);
+               fclose(edit_file);
+               fclose(profile_file);
+               
+               write_iterated_aln(aln_file_name, out_file_name, split_file_name, seq1, gap_list_1, num_gaps_1, seq2, gap_list_2, num_gaps_2);
+               aln_file_name = out_file_name;
+
+//             if (it_coutner_2 == 2)
+//             break;
+//     }
+       char command[1000];
+       sprintf(command, "mv %s %s",out_file_name, out_file_name_end);
+       system(command);
+       
+       vfree(seq1);
+       vfree(seq2);
+       vfree(gap_list_1);
+       vfree(gap_list_2);
+       if (!strcmp(param->method, "fast"))
+       {
+               free_sparse((Sparse_dynamic_param*)method_arguments_p);
+       }
+       else if (!strcmp(param->method, "nw"))
+       {
+               free_nw((Nw_param*)method_arguments_p, alphabet_size);
+       }
+       else if (!strcmp(param->method, "gotoh"))
+       {
+               free_gotoh((Gotoh_param*)method_arguments_p, alphabet_size);
+       }
+}
+
+
+
+
+void
+write_iterated_aln(char* old_aln_file_name, char* new_aln_file_name, char *gap_file_name, char *seq1, int *gap_list1, int num_gap1, char *seq2, int *gap_list2, int num_gap2)
+{
+       int i;
+//     for (i = 0; i < num_gap1; ++i)
+//     {
+//             printf("%i ", gap_list1[i]);
+//     }
+//     printf("\n-----\n");
+//     for (i = 0; i < num_gap2; ++i)
+//     {
+//             printf("%i ", gap_list2[i]);
+//     }
+//     printf("\n");
+//     
+//     printf("%s\n%s\n",seq1, seq2);
+//     printf("EX: %i\n", gap_list1[0]);
+       FILE *gap_file = fopen(gap_file_name, "r");
+       FILE *new_aln_file = fopen(new_aln_file_name, "w");
+//     if (new_aln_file == NULL)
+//             printf("AHA\n");
+       FILE *old_aln_file = fopen(old_aln_file_name, "r");
+       
+       const int GAPLINE_LENGTH = 5;
+       char gapline[GAPLINE_LENGTH];
+       char *seq;
+       int *gap_list;
+       int num_gap;
+       
+       const int ALNLINE_LENGTH = 200;
+       char alnline[ALNLINE_LENGTH];
+       
+       int alnline_pos  = -1;
+       int overall_aln_pos = -1;
+       int pattern_pos = -1;
+       int gap_pos = 0;
+       
+       
+//     fprintf(new_aln_file, "%s", alnline);
+       while(fgets(gapline, GAPLINE_LENGTH , gap_file)!=NULL)
+       {
+               
+               //this is the sequence name
+               fgets(alnline, ALNLINE_LENGTH , old_aln_file);
+               fprintf(new_aln_file,"%s",alnline);
+               //getting (part if) the sequence
+               fgets(alnline, ALNLINE_LENGTH , old_aln_file);
+               if (gapline[0] == 'g')
+               {
+                       seq = seq1;
+                       gap_list = gap_list1;
+                       num_gap = num_gap1;
+               }
+               else
+               {
+                       seq = seq2;
+                       gap_list = gap_list2;
+                       num_gap = num_gap2;
+               }
+               overall_aln_pos = 0;
+               pattern_pos = 0;
+               gap_pos = 0;
+               alnline_pos = -1;
+               
+               //go along the pattern
+               while (seq[pattern_pos] != '\0')
+               {
+                       if (seq[pattern_pos] == '-')
+                       {
+                               ++pattern_pos;
+                               fprintf(new_aln_file,"-");
+                               continue;
+                       }
+                       //get next part of the sequence if you are at the end
+                       if ((alnline[++alnline_pos] == '\n') || (alnline[alnline_pos] == '\0'))
+                       {
+                               alnline_pos = 0;
+                               fgets(alnline, ALNLINE_LENGTH, old_aln_file);
+                       }
+                       if ((gap_pos < num_gap) && (overall_aln_pos == gap_list[gap_pos]))
+                       {
+                               ++gap_pos;
+                               ++overall_aln_pos;
+                               continue;
+                       }
+                       fprintf(new_aln_file,"%c",alnline[alnline_pos]);
+                       ++overall_aln_pos;
+                       ++pattern_pos;
+               }
+               fprintf(new_aln_file,"\n");
+       }
+       
+       fclose(new_aln_file);
+       fclose(old_aln_file);
+       fclose(gap_file);
+}
+
+
+void
+edit2seq_pattern(FILE *edit_file, char *seq1, char *seq2)
+{
+       
+       fseek(edit_file, 0, SEEK_SET);
+       const int LINE_LENGTH = 50;
+       char line[LINE_LENGTH];
+       fgets(line, LINE_LENGTH , edit_file);
+//     int child1 = atoi(line);
+       fgets(line, LINE_LENGTH , edit_file);
+//     int child2 = atoi(line);
+       fgets(line, LINE_LENGTH , edit_file);
+//     int is_leaf1 = atoi(line);
+       fgets(line, LINE_LENGTH , edit_file);
+//     int is_leaf2 = atoi(line);
+
+       char x;
+       int number;
+       int pos = -1;
+       while(fgets(line, LINE_LENGTH , edit_file)!=NULL)
+       {
+//             printf("%s\n",line);
+               x = line[0];
+               if (x == '*')
+                       break;
+               number = atoi(&line[1]);
+               if (x == 'M')
+               {
+                       while (--number >= 0)
+                       {
+                               seq1[++pos] = 'X';
+                               seq2[pos] = 'X';
+                       }
+               }
+               else if (x == 'I')
+               {
+                       while (--number >= 0)
+                       {
+                               seq1[++pos] = 'X';
+                               seq2[pos] = '-';
+                       }
+               }
+               else if (x == 'D')
+               {
+                       while (--number >= 0)
+                       {
+//                             printf("POS: %i\n", pos);
+                               seq1[++pos] = '-';
+                               seq2[pos] = 'X';
+                       }
+               }
+       }
+       seq1[++pos] = '\0';
+       seq2[pos] = '\0';
+//     printf("%s\n%s\n",seq1, seq2);
+}
+/******************************COPYRIGHT NOTICE*******************************/
+/*© Centro de Regulacio Genomica */
+/*and */
+/*Cedric Notredame */
+/*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
+/*All rights reserved.*/
+/*This file is part of T-COFFEE.*/
+/**/
+/*    T-COFFEE is free software; you can redistribute it and/or modify*/
+/*    it under the terms of the GNU General Public License as published by*/
+/*    the Free Software Foundation; either version 2 of the License, or*/
+/*    (at your option) any later version.*/
+/**/
+/*    T-COFFEE is distributed in the hope that it will be useful,*/
+/*    but WITHOUT ANY WARRANTY; without even the implied warranty of*/
+/*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the*/
+/*    GNU General Public License for more details.*/
+/**/
+/*    You should have received a copy of the GNU General Public License*/
+/*    along with Foobar; if not, write to the Free Software*/
+/*    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA*/
+/*...............................................                                                                                      |*/
+/*  If you need some more information*/
+/*  cedric.notredame@europe.com*/
+/*...............................................                                                                                                                                     |*/
+/**/
+/**/
+/*     */
+/******************************COPYRIGHT NOTICE*******************************/