8 #include "io_lib_header.h"
9 #include "util_lib_header.h"
10 #include "fastal_lib_header.h"
14 * Adds a single sequence to a profile.
15 * \param seq The sequence.
16 * \param prf The profile.
19 seq2profile2(char *seq, Fastal_profile *prf, int *char2pos)
21 // printf("SEDQ: %s\n", seq);
22 int **prf_in = prf->prf;
25 while (*(++seq) !='\0')
28 ++(prf_in[char2pos[*seq]][++i]);
40 // /// Number of sequences in this profile
42 // /// number of the profile
44 // ///0 = combination of two profiles, 1 = profile of a single sequence -> name1 = seq_name
46 // ///length of the profile
48 // ///weight of the sequence
50 // ///saves the amount of allocated memory
51 // int allocated_memory;
52 // ///the profile itself [alphabet_size][profile_length]
54 // ///number_of_sequences
55 // int number_of_sequences;
64 * Deletes all gap columns from the profile.
66 * \param prf The profile.
67 * \param alphabet_size The size of the alphabet.
68 * \param gap_list The gap_list.
69 * \param gap_list_length The length of the gap list.
70 * \param num_gaps The number of gaps.
71 * \return The new gap list.
74 del_gap_from_profile(Fastal_profile *prf, int alphabet_size, int *gap_list, int *gap_list_length, int *num_gaps)
81 // int gap_counter = 0;
82 int **prf_in = prf->prf;
83 int prf_length = prf->length;
85 while (gap_pos < prf_length)
87 // printf("PRF1: %i %i %i %i\n", prf_in[0][gap_pos], prf_in[1][gap_pos], prf_in[2][gap_pos], prf_in[3][gap_pos]);
90 for (i = 0; i < alphabet_size; ++i)
92 if (prf_in[i][gap_pos])
95 // printf("ARG: %i\n", i);
102 if (gap_pos != write_pos)
104 for (i = 0; i < alphabet_size; ++i)
106 prf_in[i][write_pos] = prf_in[i][gap_pos];
114 if (++pos >= *gap_list_length)
116 *gap_list_length += 10;
117 gap_list = vrealloc(gap_list, (*gap_list_length)*sizeof(int));
119 gap_list[pos] = gap_pos;
124 // printf("DEEEEEEEEEEEEEEEEEL %i %i\n", gap_counter, pos+1);
127 prf->length -= *num_gaps;
133 * Splits a given alignment according to a given position.
134 * \param alignment_file The alignment file.
135 * \param gap_prf Saves the profile with the gap sequences.
136 * \param no_gap_prf Saves the profile of the non gap sequences.
137 * \param seq A temporary place to save the sequence.
138 * \param index The index where to look for the gaps.
139 * \param char2pos The character to position translation array.
142 split_set(FILE *alignment_file, Fastal_profile *gap_prf, Fastal_profile *no_gap_prf, char *seq, int index, int *char2pos, char* split_file_name)
144 FILE *split_set_file1 = fopen("GAP", "w");
145 FILE *split_set_file2 = fopen("NOGAP", "w");
146 // FILE *shorted = fopen("SHORTT.fa", "w");
147 printf("INDEX: %i\n",index);
148 fseek (alignment_file , 0 , SEEK_SET);
149 FILE *split_file = fopen(split_file_name, "w");
151 const int LINE_LENGTH = 200;
152 char line[LINE_LENGTH];
154 int overall_pos = -1;
155 fgets(line, LINE_LENGTH , alignment_file);
156 // fprintf(shorted, "%s", line);
157 while(fgets(line, LINE_LENGTH , alignment_file)!=NULL)
162 // fprintf(shorted, "%s", line);
163 if (seq[index] == '-')
166 seq[++overall_pos] = '\0';
167 fprintf(split_file, "g\n");
168 fprintf(split_set_file1,"%s\n",seq);
169 seq2profile2(seq, gap_prf, char2pos);
170 ++(gap_prf->number_of_sequences);
174 seq[++overall_pos] = '\0';
175 fprintf(split_file, "n\n");
176 fprintf(split_set_file2,"%s\n",seq);
177 seq2profile2(seq, no_gap_prf, char2pos);
178 ++(no_gap_prf->number_of_sequences);
185 // for (j = 35; j < 96; ++j)
187 // fprintf(shorted, "%c", line[j]);
189 // fprintf(shorted, "\n");
190 while ((line[++pos] != '\n') && (line[pos] != '\0'))
192 seq[++overall_pos] = line[pos];
197 if (seq[index] == '-')
199 seq[++overall_pos] = '\0';
200 fprintf(split_file, "g\n");
201 seq2profile2(seq, gap_prf, char2pos);
202 ++(gap_prf->number_of_sequences);
206 seq[++overall_pos] = '\0';
207 fprintf(split_file, "n\n");
208 seq2profile2(seq, no_gap_prf, char2pos);
209 ++(no_gap_prf->number_of_sequences);
216 enlarge_prof(Fastal_profile *prof, int new_length, int alphabet_size)
218 if (new_length > prof->allocated_memory)
221 for (i = 0;i < alphabet_size; ++i)
223 prof->prf[i] = vrealloc(prof->prf[i],new_length*sizeof(int));
225 prof->allocated_memory = new_length;
234 iterate(Fastal_param *param, void *method_arguments_p, char *aln_file_name, char *out_file_name_end, int iteration_number)
236 // calculate alignment length
239 int it_coutner_2 = 0;
240 const int LINE_LENGTH = 200;
241 char line[LINE_LENGTH];
242 char *seq1 = vcalloc(1,sizeof(char));
243 char *seq2 = vcalloc(1,sizeof(char));
244 Fastal_profile **profiles = vcalloc(3,sizeof(Fastal_profile*));
245 initiate_profiles(profiles, param);
246 Fastal_profile *gap_prf = profiles[0];
247 Fastal_profile *no_gap_prf = profiles[1];
248 int alphabet_size = param->alphabet_size;
250 int *gap_list_1 = vcalloc(1, sizeof(int));
251 int *gap_list_1_length = vcalloc(1, sizeof(int));
252 *gap_list_1_length = 1;
254 int *gap_list_2 = vcalloc(1, sizeof(int));
255 int *gap_list_2_length = vcalloc(1, sizeof(int));
256 *gap_list_2_length = 1;
259 int alignment_length = 1;
262 char *out_file_name = aln_file_name;
263 int *gap_profile = vcalloc(alignment_length, sizeof(int));
265 // while (it_counter < alignment_length)
269 FILE *alignment_file = fopen(aln_file_name,"r");
270 if (alignment_file == NULL)
272 printf("Could not open alignment file %s\n", aln_file_name);
278 alignment_length = 0;
280 fgets(line, LINE_LENGTH , alignment_file);
281 while(fgets(line, LINE_LENGTH , alignment_file)!=NULL)
288 while ((line[++tmp_len] != '\n') && (line[tmp_len] != '\0'));
289 alignment_length += tmp_len;
291 // printf("ALN_LENGTH %i\n", alignment_length);
292 seq1 =vrealloc(seq1, (1+alignment_length)*sizeof(char));
294 gap_profile = vrealloc(gap_profile, alignment_length * sizeof(int));
296 for (i = 0; i < alignment_length; ++i)
301 int number_of_sequences = 0;
303 fseek (alignment_file , 0 , SEEK_SET);
304 while(fgets(line, LINE_LENGTH , alignment_file)!=NULL)
308 ++number_of_sequences;
314 while ((line[++i] != '\n') && (line[i] != '\0'))
325 double max_entrop = 0;
326 int column_index = 0;
332 // for (i = it_counter; i<=it_counter; ++i)
334 // p = gap_profile[i]/(double)number_of_sequences;
341 // entrop = (-1)*(p*log(p) + (1-p)*log(1-p) ) ;
343 // if (entrop > max_entrop)
346 // max_entrop = entrop;
351 // if (max_entrop < 0.6)
353 // printf("CONTINUE %f\n",entrop);
357 // column_index = 18;//it_counter-1;
358 // if (column_index == 19)
360 out_file_name = vtmpnam(NULL);
362 char *edit_file_name = "EDIT";//vtmpnam(NULL);
363 FILE *edit_file = fopen(edit_file_name,"w");
364 char *profile_file_name = vtmpnam(NULL);
365 FILE *profile_file = fopen(profile_file_name,"w");
366 char *split_file_name = "AHA";//vtmpnam(NULL);
369 gap_prf = enlarge_prof(gap_prf, alignment_length, alphabet_size);
370 no_gap_prf = enlarge_prof(no_gap_prf, alignment_length, alphabet_size );
371 no_gap_prf->number_of_sequences = 0;
372 gap_prf->number_of_sequences = 0;
374 split_set(alignment_file, gap_prf, no_gap_prf, seq1, column_index, param->char2pos, split_file_name);
375 gap_prf -> length = alignment_length;
376 no_gap_prf -> length = alignment_length;
379 gap_list_1 = del_gap_from_profile(gap_prf, alphabet_size, gap_list_1, gap_list_1_length, &num_gaps_1 );
380 gap_list_2 = del_gap_from_profile(no_gap_prf, alphabet_size, gap_list_2, gap_list_2_length, &num_gaps_2 );
383 fclose(alignment_file);
384 profiles[0] = gap_prf;
385 profiles[1] = no_gap_prf;
387 alignment_length = gotoh_dyn(profiles, param, method_arguments_p, 0, edit_file, profile_file, 0);
388 seq1 =vrealloc(seq1, (1+alignment_length)*sizeof(char));
389 seq2 =vrealloc(seq2, (1+alignment_length)*sizeof(char));
392 edit_file = fopen(edit_file_name,"r");
393 edit2seq_pattern(edit_file, seq1, seq2);
395 fclose(profile_file);
397 write_iterated_aln(aln_file_name, out_file_name, split_file_name, seq1, gap_list_1, num_gaps_1, seq2, gap_list_2, num_gaps_2);
398 aln_file_name = out_file_name;
400 // if (it_coutner_2 == 2)
404 sprintf(command, "mv %s %s",out_file_name, out_file_name_end);
411 if (!strcmp(param->method, "fast"))
413 free_sparse((Sparse_dynamic_param*)method_arguments_p);
415 else if (!strcmp(param->method, "nw"))
417 free_nw((Nw_param*)method_arguments_p, alphabet_size);
419 else if (!strcmp(param->method, "gotoh"))
421 free_gotoh((Gotoh_param*)method_arguments_p, alphabet_size);
429 write_iterated_aln(char* old_aln_file_name, char* new_aln_file_name, char *gap_file_name, char *seq1, int *gap_list1, int num_gap1, char *seq2, int *gap_list2, int num_gap2)
432 // for (i = 0; i < num_gap1; ++i)
434 // printf("%i ", gap_list1[i]);
436 // printf("\n-----\n");
437 // for (i = 0; i < num_gap2; ++i)
439 // printf("%i ", gap_list2[i]);
443 // printf("%s\n%s\n",seq1, seq2);
444 // printf("EX: %i\n", gap_list1[0]);
445 FILE *gap_file = fopen(gap_file_name, "r");
446 FILE *new_aln_file = fopen(new_aln_file_name, "w");
447 // if (new_aln_file == NULL)
449 FILE *old_aln_file = fopen(old_aln_file_name, "r");
451 const int GAPLINE_LENGTH = 5;
452 char gapline[GAPLINE_LENGTH];
457 const int ALNLINE_LENGTH = 200;
458 char alnline[ALNLINE_LENGTH];
460 int alnline_pos = -1;
461 int overall_aln_pos = -1;
462 int pattern_pos = -1;
466 // fprintf(new_aln_file, "%s", alnline);
467 while(fgets(gapline, GAPLINE_LENGTH , gap_file)!=NULL)
470 //this is the sequence name
471 fgets(alnline, ALNLINE_LENGTH , old_aln_file);
472 fprintf(new_aln_file,"%s",alnline);
473 //getting (part if) the sequence
474 fgets(alnline, ALNLINE_LENGTH , old_aln_file);
475 if (gapline[0] == 'g')
478 gap_list = gap_list1;
484 gap_list = gap_list2;
492 //go along the pattern
493 while (seq[pattern_pos] != '\0')
495 if (seq[pattern_pos] == '-')
498 fprintf(new_aln_file,"-");
501 //get next part of the sequence if you are at the end
502 if ((alnline[++alnline_pos] == '\n') || (alnline[alnline_pos] == '\0'))
505 fgets(alnline, ALNLINE_LENGTH, old_aln_file);
507 if ((gap_pos < num_gap) && (overall_aln_pos == gap_list[gap_pos]))
513 fprintf(new_aln_file,"%c",alnline[alnline_pos]);
517 fprintf(new_aln_file,"\n");
520 fclose(new_aln_file);
521 fclose(old_aln_file);
527 edit2seq_pattern(FILE *edit_file, char *seq1, char *seq2)
530 fseek(edit_file, 0, SEEK_SET);
531 const int LINE_LENGTH = 50;
532 char line[LINE_LENGTH];
533 fgets(line, LINE_LENGTH , edit_file);
534 // int child1 = atoi(line);
535 fgets(line, LINE_LENGTH , edit_file);
536 // int child2 = atoi(line);
537 fgets(line, LINE_LENGTH , edit_file);
538 // int is_leaf1 = atoi(line);
539 fgets(line, LINE_LENGTH , edit_file);
540 // int is_leaf2 = atoi(line);
545 while(fgets(line, LINE_LENGTH , edit_file)!=NULL)
547 // printf("%s\n",line);
551 number = atoi(&line[1]);
554 while (--number >= 0)
562 while (--number >= 0)
570 while (--number >= 0)
572 // printf("POS: %i\n", pos);
580 // printf("%s\n%s\n",seq1, seq2);
582 /******************************COPYRIGHT NOTICE*******************************/
583 /*© Centro de Regulacio Genomica */
585 /*Cedric Notredame */
586 /*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
587 /*All rights reserved.*/
588 /*This file is part of T-COFFEE.*/
590 /* T-COFFEE is free software; you can redistribute it and/or modify*/
591 /* it under the terms of the GNU General Public License as published by*/
592 /* the Free Software Foundation; either version 2 of the License, or*/
593 /* (at your option) any later version.*/
595 /* T-COFFEE is distributed in the hope that it will be useful,*/
596 /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
597 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
598 /* GNU General Public License for more details.*/
600 /* You should have received a copy of the GNU General Public License*/
601 /* along with Foobar; if not, write to the Free Software*/
602 /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
603 /*............................................... |*/
604 /* If you need some more information*/
605 /* cedric.notredame@europe.com*/
606 /*............................................... |*/
610 /******************************COPYRIGHT NOTICE*******************************/