IA32 and x64 tcoffee binaries are added
[jabaws.git] / binaries / src / tcoffee / t_coffee_source / iteration.c
1
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <math.h>
5 // #include <ctype.h>
6 // #include <string.h>
7
8 #include "io_lib_header.h"
9 #include "util_lib_header.h"
10 #include "fastal_lib_header.h"
11
12
13 /**
14  * Adds a single sequence to a profile.
15  * \param seq The sequence.
16  * \param prf The profile.
17  */
18 void
19 seq2profile2(char *seq, Fastal_profile *prf, int *char2pos)
20 {
21 //      printf("SEDQ: %s\n", seq);
22         int **prf_in = prf->prf;
23         int i = -1;
24         --seq;
25         while (*(++seq) !='\0')
26         {       
27                 if (*seq != '-')
28                         ++(prf_in[char2pos[*seq]][++i]);
29                 else
30                         ++i;
31 //              ++seq;
32         }
33 }
34
35
36
37
38 // typedef struct
39 // {
40 //      /// Number of sequences in this profile
41 //      int num_sequences;
42 //      /// number of the profile
43 //      int prf_number;
44 //      ///0 = combination of two profiles, 1 = profile of a single sequence -> name1 = seq_name
45 //      int is_leaf;
46 //      ///length of the profile
47 //      int length;
48 //      ///weight of the sequence
49 //      int weight;
50 //      ///saves the amount of allocated memory
51 //      int allocated_memory;   
52 //      ///the profile itself [alphabet_size][profile_length]
53 //      int **prf;
54 //      ///number_of_sequences
55 //      int number_of_sequences;
56 // }
57 // Fastal_profile;
58
59
60
61
62
63 /**
64  * Deletes all gap columns from the profile.
65  * 
66  * \param prf The profile.
67  * \param alphabet_size The size of the alphabet.
68  * \param gap_list The gap_list.
69  * \param gap_list_length The length of the gap list.
70  * \param num_gaps The number of gaps.
71  * \return The new gap list.
72  */
73 int *
74 del_gap_from_profile(Fastal_profile *prf, int alphabet_size, int *gap_list, int *gap_list_length, int *num_gaps)
75 {
76         int i;
77         int pos = -1;
78         int not_gap = 0;
79         int gap_pos = 0;
80         int write_pos = 0;
81 //      int gap_counter = 0;
82         int **prf_in = prf->prf;
83         int prf_length = prf->length;
84         
85         while (gap_pos < prf_length)
86         {
87 //              printf("PRF1: %i %i %i %i\n", prf_in[0][gap_pos], prf_in[1][gap_pos], prf_in[2][gap_pos], prf_in[3][gap_pos]);
88
89                 not_gap = 0;
90                 for (i = 0; i < alphabet_size; ++i)
91                 {
92                         if (prf_in[i][gap_pos])
93                         {
94                                 not_gap = 1;
95 //                              printf("ARG: %i\n", i);
96                                 break;
97                         }
98                 }
99                 if (not_gap)
100                 {
101                         
102                         if (gap_pos != write_pos)
103                         {
104                                 for (i = 0; i < alphabet_size; ++i)
105                                 {
106                                         prf_in[i][write_pos] = prf_in[i][gap_pos];
107                                 }
108                         }
109                         ++write_pos;
110                 }
111                 else
112                 {
113                         
114                         if (++pos >= *gap_list_length)
115                         {
116                                 *gap_list_length += 10;
117                                 gap_list = vrealloc(gap_list, (*gap_list_length)*sizeof(int));
118                         }
119                         gap_list[pos] = gap_pos;
120 //                      ++gap_counter;
121                 }
122                 ++gap_pos;
123         }
124 //      printf("DEEEEEEEEEEEEEEEEEL %i %i\n", gap_counter, pos+1);
125         
126         *num_gaps = pos+1;
127         prf->length -= *num_gaps;
128         return gap_list;
129 }
130
131
132 /**
133  * Splits a given alignment according to a given position.
134  * \param alignment_file The alignment file.
135  * \param gap_prf Saves the profile with the gap sequences.
136  * \param no_gap_prf Saves the profile of the non gap sequences.
137  * \param seq A temporary place to save the sequence.
138  * \param index The index where to look for the gaps.
139  * \param char2pos The character to position translation array.
140  */
141 void
142 split_set(FILE *alignment_file, Fastal_profile *gap_prf, Fastal_profile *no_gap_prf, char *seq, int index, int *char2pos, char* split_file_name)
143 {
144         FILE *split_set_file1 = fopen("GAP", "w");
145         FILE *split_set_file2 = fopen("NOGAP", "w");
146 //      FILE *shorted = fopen("SHORTT.fa", "w");
147         printf("INDEX: %i\n",index);
148         fseek (alignment_file , 0 , SEEK_SET);
149         FILE *split_file = fopen(split_file_name, "w");
150         int i,j;
151         const int LINE_LENGTH = 200;
152         char line[LINE_LENGTH];
153         int pos = -1;
154         int overall_pos = -1;
155         fgets(line, LINE_LENGTH , alignment_file);
156 //      fprintf(shorted, "%s", line);
157         while(fgets(line, LINE_LENGTH , alignment_file)!=NULL)
158         {
159                 pos = -1;
160                 if (line[0] == '>')
161                 {
162 //                      fprintf(shorted, "%s", line);
163                         if (seq[index] == '-')
164                         {
165                                 
166                                 seq[++overall_pos] = '\0';
167                                 fprintf(split_file, "g\n");
168                                 fprintf(split_set_file1,"%s\n",seq);
169                                 seq2profile2(seq, gap_prf, char2pos);
170                                 ++(gap_prf->number_of_sequences);
171                         }
172                         else
173                         {
174                                 seq[++overall_pos] = '\0';
175                                 fprintf(split_file, "n\n");
176                                 fprintf(split_set_file2,"%s\n",seq);
177                                 seq2profile2(seq, no_gap_prf, char2pos);
178                                 ++(no_gap_prf->number_of_sequences);
179                         }
180                         overall_pos = -1;
181                         
182                 }
183                 else
184                 {
185 //                      for (j = 35; j < 96; ++j)
186 //                      {
187 //                              fprintf(shorted, "%c", line[j]);
188 //                      }
189 //                      fprintf(shorted, "\n");
190                         while ((line[++pos] != '\n') && (line[pos] != '\0'))
191                         {
192                                 seq[++overall_pos] = line[pos];
193                         }
194                 }
195         }
196
197         if (seq[index] == '-')
198         {
199                 seq[++overall_pos] = '\0';
200                 fprintf(split_file, "g\n");
201                 seq2profile2(seq, gap_prf, char2pos);
202                 ++(gap_prf->number_of_sequences);
203         }
204         else
205         {
206                 seq[++overall_pos] = '\0';
207                 fprintf(split_file, "n\n");
208                 seq2profile2(seq, no_gap_prf, char2pos);
209                 ++(no_gap_prf->number_of_sequences);
210         }
211         fclose(split_file);
212 }
213
214
215 Fastal_profile *
216 enlarge_prof(Fastal_profile *prof, int new_length, int alphabet_size)
217 {
218         if (new_length > prof->allocated_memory)
219         {
220                 int i;
221                 for (i = 0;i < alphabet_size; ++i)
222                 {
223                         prof->prf[i] = vrealloc(prof->prf[i],new_length*sizeof(int));
224                 }
225                 prof->allocated_memory = new_length;
226         }
227         return prof;
228 }
229
230
231
232
233 void
234 iterate(Fastal_param *param, void *method_arguments_p, char *aln_file_name, char *out_file_name_end, int iteration_number)
235 {
236 //      calculate alignment length
237         
238 //count gap
239         int it_coutner_2 = 0;
240         const int LINE_LENGTH = 200;
241         char line[LINE_LENGTH];
242         char *seq1 = vcalloc(1,sizeof(char));
243         char *seq2 = vcalloc(1,sizeof(char));
244         Fastal_profile **profiles = vcalloc(3,sizeof(Fastal_profile*));
245         initiate_profiles(profiles, param);
246         Fastal_profile *gap_prf = profiles[0];
247         Fastal_profile *no_gap_prf = profiles[1];
248         int alphabet_size = param->alphabet_size;
249
250         int *gap_list_1 = vcalloc(1, sizeof(int));
251         int *gap_list_1_length = vcalloc(1, sizeof(int));
252         *gap_list_1_length = 1;
253         int num_gaps_1 = 0;
254         int *gap_list_2 = vcalloc(1, sizeof(int));
255         int *gap_list_2_length = vcalloc(1, sizeof(int));
256         *gap_list_2_length = 1;
257         int num_gaps_2 = 0;
258
259         int alignment_length = 1;
260         //from here repeat!
261         int it_counter = 0;
262         char *out_file_name = aln_file_name;
263         int *gap_profile = vcalloc(alignment_length, sizeof(int));
264         
265 //      while (it_counter < alignment_length)
266 //      {
267                 
268                 
269                 FILE *alignment_file = fopen(aln_file_name,"r");
270                 if (alignment_file == NULL)
271                 {
272                         printf("Could not open alignment file %s\n", aln_file_name);
273                         exit(1);
274                 }
275         
276         
277                 
278                 alignment_length = 0;
279                 int tmp_len = -1;
280                 fgets(line, LINE_LENGTH , alignment_file);
281                 while(fgets(line, LINE_LENGTH , alignment_file)!=NULL)
282                 {
283                         tmp_len = -1;
284                         if (line[0] == '>')
285                         {
286                                 break;
287                         }
288                         while ((line[++tmp_len] != '\n') && (line[tmp_len] != '\0'));
289                         alignment_length += tmp_len;
290                 }
291         //      printf("ALN_LENGTH %i\n", alignment_length);
292                 seq1 =vrealloc(seq1, (1+alignment_length)*sizeof(char));
293                 
294                 gap_profile = vrealloc(gap_profile, alignment_length * sizeof(int));
295                 int i;
296                 for (i = 0; i < alignment_length; ++i)
297                 {
298                         gap_profile[i] = 0;
299                 }
300         
301                 int number_of_sequences = 0;
302                 int pos = -1;
303                 fseek (alignment_file , 0 , SEEK_SET);
304                 while(fgets(line, LINE_LENGTH , alignment_file)!=NULL)
305                 {
306                         if (line[0] == '>')
307                         {
308                                 ++number_of_sequences;
309                                 pos = -1;
310                         }
311                         else
312                         {
313                                 i = -1;
314                                 while ((line[++i] != '\n') && (line[i] != '\0'))
315                                 {
316                                         ++pos;
317                                         if (line[i] == '-')
318                                         {
319                                                 ++gap_profile[pos];
320                                         }
321                                 }
322                         }
323                 }
324         
325                 double max_entrop = 0;
326                 int column_index = 0;
327                 double entrop = 0;
328                 double last = 0;
329                 double p;
330
331                 
332 //              for (i = it_counter; i<=it_counter; ++i)
333 //              {
334 //                      p = gap_profile[i]/(double)number_of_sequences;
335 //                      if (!p)
336 //                      {
337 //                              entrop = 0;
338 //                      }
339 //                      else
340 //                      {
341 //                              entrop = (-1)*(p*log(p) + (1-p)*log(1-p) ) ;
342 //                      }
343 //                      if (entrop > max_entrop)
344 //                      {
345 //                              column_index = i;
346 //                              max_entrop = entrop;
347 //                      }
348 //                      last = entrop;
349 //              }
350 //              ++it_counter;
351 //              if (max_entrop < 0.6)
352 //              {
353 //                      printf("CONTINUE %f\n",entrop);
354 //                      continue;
355 //              }
356                 
357 //              column_index = 18;//it_counter-1;
358 //              if (column_index == 19)
359                         column_index = 58;
360                 out_file_name = vtmpnam(NULL);
361                 
362                 char *edit_file_name = "EDIT";//vtmpnam(NULL);
363                 FILE *edit_file = fopen(edit_file_name,"w");
364                 char *profile_file_name = vtmpnam(NULL);
365                 FILE *profile_file = fopen(profile_file_name,"w");
366                 char *split_file_name = "AHA";//vtmpnam(NULL);
367                 ++it_coutner_2;
368
369                 gap_prf = enlarge_prof(gap_prf, alignment_length, alphabet_size);
370                 no_gap_prf = enlarge_prof(no_gap_prf, alignment_length, alphabet_size );
371                 no_gap_prf->number_of_sequences = 0;
372                 gap_prf->number_of_sequences = 0;
373         
374                 split_set(alignment_file, gap_prf, no_gap_prf, seq1, column_index, param->char2pos, split_file_name);
375                 gap_prf -> length = alignment_length;
376                 no_gap_prf -> length = alignment_length;
377
378
379                 gap_list_1 = del_gap_from_profile(gap_prf, alphabet_size, gap_list_1, gap_list_1_length, &num_gaps_1 );
380                 gap_list_2 = del_gap_from_profile(no_gap_prf, alphabet_size, gap_list_2, gap_list_2_length, &num_gaps_2 );
381
382
383                 fclose(alignment_file);
384                 profiles[0] = gap_prf;
385                 profiles[1] = no_gap_prf;
386
387                 alignment_length = gotoh_dyn(profiles, param, method_arguments_p, 0, edit_file, profile_file, 0);
388                 seq1 =vrealloc(seq1, (1+alignment_length)*sizeof(char));
389                 seq2 =vrealloc(seq2, (1+alignment_length)*sizeof(char));
390                 
391                 fclose(edit_file);
392                 edit_file = fopen(edit_file_name,"r");
393                 edit2seq_pattern(edit_file, seq1, seq2);
394                 fclose(edit_file);
395                 fclose(profile_file);
396                 
397                 write_iterated_aln(aln_file_name, out_file_name, split_file_name, seq1, gap_list_1, num_gaps_1, seq2, gap_list_2, num_gaps_2);
398                 aln_file_name = out_file_name;
399
400 //              if (it_coutner_2 == 2)
401 //              break;
402 //      }
403         char command[1000];
404         sprintf(command, "mv %s %s",out_file_name, out_file_name_end);
405         system(command);
406         
407         vfree(seq1);
408         vfree(seq2);
409         vfree(gap_list_1);
410         vfree(gap_list_2);
411         if (!strcmp(param->method, "fast"))
412         {
413                 free_sparse((Sparse_dynamic_param*)method_arguments_p);
414         }
415         else if (!strcmp(param->method, "nw"))
416         {
417                 free_nw((Nw_param*)method_arguments_p, alphabet_size);
418         }
419         else if (!strcmp(param->method, "gotoh"))
420         {
421                 free_gotoh((Gotoh_param*)method_arguments_p, alphabet_size);
422         }
423 }
424
425
426
427
428 void
429 write_iterated_aln(char* old_aln_file_name, char* new_aln_file_name, char *gap_file_name, char *seq1, int *gap_list1, int num_gap1, char *seq2, int *gap_list2, int num_gap2)
430 {
431         int i;
432 //      for (i = 0; i < num_gap1; ++i)
433 //      {
434 //              printf("%i ", gap_list1[i]);
435 //      }
436 //      printf("\n-----\n");
437 //      for (i = 0; i < num_gap2; ++i)
438 //      {
439 //              printf("%i ", gap_list2[i]);
440 //      }
441 //      printf("\n");
442 //      
443 //      printf("%s\n%s\n",seq1, seq2);
444 //      printf("EX: %i\n", gap_list1[0]);
445         FILE *gap_file = fopen(gap_file_name, "r");
446         FILE *new_aln_file = fopen(new_aln_file_name, "w");
447 //      if (new_aln_file == NULL)
448 //              printf("AHA\n");
449         FILE *old_aln_file = fopen(old_aln_file_name, "r");
450         
451         const int GAPLINE_LENGTH = 5;
452         char gapline[GAPLINE_LENGTH];
453         char *seq;
454         int *gap_list;
455         int num_gap;
456         
457         const int ALNLINE_LENGTH = 200;
458         char alnline[ALNLINE_LENGTH];
459         
460         int alnline_pos  = -1;
461         int overall_aln_pos = -1;
462         int pattern_pos = -1;
463         int gap_pos = 0;
464         
465         
466 //      fprintf(new_aln_file, "%s", alnline);
467         while(fgets(gapline, GAPLINE_LENGTH , gap_file)!=NULL)
468         {
469                 
470                 //this is the sequence name
471                 fgets(alnline, ALNLINE_LENGTH , old_aln_file);
472                 fprintf(new_aln_file,"%s",alnline);
473                 //getting (part if) the sequence
474                 fgets(alnline, ALNLINE_LENGTH , old_aln_file);
475                 if (gapline[0] == 'g')
476                 {
477                         seq = seq1;
478                         gap_list = gap_list1;
479                         num_gap = num_gap1;
480                 }
481                 else
482                 {
483                         seq = seq2;
484                         gap_list = gap_list2;
485                         num_gap = num_gap2;
486                 }
487                 overall_aln_pos = 0;
488                 pattern_pos = 0;
489                 gap_pos = 0;
490                 alnline_pos = -1;
491                 
492                 //go along the pattern
493                 while (seq[pattern_pos] != '\0')
494                 {
495                         if (seq[pattern_pos] == '-')
496                         {
497                                 ++pattern_pos;
498                                 fprintf(new_aln_file,"-");
499                                 continue;
500                         }
501                         //get next part of the sequence if you are at the end
502                         if ((alnline[++alnline_pos] == '\n') || (alnline[alnline_pos] == '\0'))
503                         {
504                                 alnline_pos = 0;
505                                 fgets(alnline, ALNLINE_LENGTH, old_aln_file);
506                         }
507                         if ((gap_pos < num_gap) && (overall_aln_pos == gap_list[gap_pos]))
508                         {
509                                 ++gap_pos;
510                                 ++overall_aln_pos;
511                                 continue;
512                         }
513                         fprintf(new_aln_file,"%c",alnline[alnline_pos]);
514                         ++overall_aln_pos;
515                         ++pattern_pos;
516                 }
517                 fprintf(new_aln_file,"\n");
518         }
519         
520         fclose(new_aln_file);
521         fclose(old_aln_file);
522         fclose(gap_file);
523 }
524
525
526 void
527 edit2seq_pattern(FILE *edit_file, char *seq1, char *seq2)
528 {
529         
530         fseek(edit_file, 0, SEEK_SET);
531         const int LINE_LENGTH = 50;
532         char line[LINE_LENGTH];
533         fgets(line, LINE_LENGTH , edit_file);
534 //      int child1 = atoi(line);
535         fgets(line, LINE_LENGTH , edit_file);
536 //      int child2 = atoi(line);
537         fgets(line, LINE_LENGTH , edit_file);
538 //      int is_leaf1 = atoi(line);
539         fgets(line, LINE_LENGTH , edit_file);
540 //      int is_leaf2 = atoi(line);
541
542         char x;
543         int number;
544         int pos = -1;
545         while(fgets(line, LINE_LENGTH , edit_file)!=NULL)
546         {
547 //              printf("%s\n",line);
548                 x = line[0];
549                 if (x == '*')
550                         break;
551                 number = atoi(&line[1]);
552                 if (x == 'M')
553                 {
554                         while (--number >= 0)
555                         {
556                                 seq1[++pos] = 'X';
557                                 seq2[pos] = 'X';
558                         }
559                 }
560                 else if (x == 'I')
561                 {
562                         while (--number >= 0)
563                         {
564                                 seq1[++pos] = 'X';
565                                 seq2[pos] = '-';
566                         }
567                 }
568                 else if (x == 'D')
569                 {
570                         while (--number >= 0)
571                         {
572 //                              printf("POS: %i\n", pos);
573                                 seq1[++pos] = '-';
574                                 seq2[pos] = 'X';
575                         }
576                 }
577         }
578         seq1[++pos] = '\0';
579         seq2[pos] = '\0';
580 //      printf("%s\n%s\n",seq1, seq2);
581 }
582 /******************************COPYRIGHT NOTICE*******************************/
583 /*© Centro de Regulacio Genomica */
584 /*and */
585 /*Cedric Notredame */
586 /*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
587 /*All rights reserved.*/
588 /*This file is part of T-COFFEE.*/
589 /**/
590 /*    T-COFFEE is free software; you can redistribute it and/or modify*/
591 /*    it under the terms of the GNU General Public License as published by*/
592 /*    the Free Software Foundation; either version 2 of the License, or*/
593 /*    (at your option) any later version.*/
594 /**/
595 /*    T-COFFEE is distributed in the hope that it will be useful,*/
596 /*    but WITHOUT ANY WARRANTY; without even the implied warranty of*/
597 /*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the*/
598 /*    GNU General Public License for more details.*/
599 /**/
600 /*    You should have received a copy of the GNU General Public License*/
601 /*    along with Foobar; if not, write to the Free Software*/
602 /*    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA*/
603 /*...............................................                                                                                      |*/
604 /*  If you need some more information*/
605 /*  cedric.notredame@europe.com*/
606 /*...............................................                                                                                                                                     |*/
607 /**/
608 /**/
609 /*      */
610 /******************************COPYRIGHT NOTICE*******************************/