X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=binaries%2Fsrc%2Ftcoffee%2Ft_coffee_source%2Fiteration.c;fp=binaries%2Fsrc%2Ftcoffee%2Ft_coffee_source%2Fiteration.c;h=b0d50907fe2b6141efad025700be0fd178d63212;hb=d8e1d04583007816f5dc77595d322ba669075ae3;hp=0000000000000000000000000000000000000000;hpb=8140043fefd2101326b3e651e3b7e2d4ac806b99;p=jabaws.git diff --git a/binaries/src/tcoffee/t_coffee_source/iteration.c b/binaries/src/tcoffee/t_coffee_source/iteration.c new file mode 100644 index 0000000..b0d5090 --- /dev/null +++ b/binaries/src/tcoffee/t_coffee_source/iteration.c @@ -0,0 +1,610 @@ + +#include +#include +#include +// #include +// #include + +#include "io_lib_header.h" +#include "util_lib_header.h" +#include "fastal_lib_header.h" + + +/** + * Adds a single sequence to a profile. + * \param seq The sequence. + * \param prf The profile. + */ +void +seq2profile2(char *seq, Fastal_profile *prf, int *char2pos) +{ +// printf("SEDQ: %s\n", seq); + int **prf_in = prf->prf; + int i = -1; + --seq; + while (*(++seq) !='\0') + { + if (*seq != '-') + ++(prf_in[char2pos[*seq]][++i]); + else + ++i; +// ++seq; + } +} + + + + +// typedef struct +// { +// /// Number of sequences in this profile +// int num_sequences; +// /// number of the profile +// int prf_number; +// ///0 = combination of two profiles, 1 = profile of a single sequence -> name1 = seq_name +// int is_leaf; +// ///length of the profile +// int length; +// ///weight of the sequence +// int weight; +// ///saves the amount of allocated memory +// int allocated_memory; +// ///the profile itself [alphabet_size][profile_length] +// int **prf; +// ///number_of_sequences +// int number_of_sequences; +// } +// Fastal_profile; + + + + + +/** + * Deletes all gap columns from the profile. + * + * \param prf The profile. + * \param alphabet_size The size of the alphabet. + * \param gap_list The gap_list. + * \param gap_list_length The length of the gap list. + * \param num_gaps The number of gaps. + * \return The new gap list. + */ +int * +del_gap_from_profile(Fastal_profile *prf, int alphabet_size, int *gap_list, int *gap_list_length, int *num_gaps) +{ + int i; + int pos = -1; + int not_gap = 0; + int gap_pos = 0; + int write_pos = 0; +// int gap_counter = 0; + int **prf_in = prf->prf; + int prf_length = prf->length; + + while (gap_pos < prf_length) + { +// printf("PRF1: %i %i %i %i\n", prf_in[0][gap_pos], prf_in[1][gap_pos], prf_in[2][gap_pos], prf_in[3][gap_pos]); + + not_gap = 0; + for (i = 0; i < alphabet_size; ++i) + { + if (prf_in[i][gap_pos]) + { + not_gap = 1; +// printf("ARG: %i\n", i); + break; + } + } + if (not_gap) + { + + if (gap_pos != write_pos) + { + for (i = 0; i < alphabet_size; ++i) + { + prf_in[i][write_pos] = prf_in[i][gap_pos]; + } + } + ++write_pos; + } + else + { + + if (++pos >= *gap_list_length) + { + *gap_list_length += 10; + gap_list = vrealloc(gap_list, (*gap_list_length)*sizeof(int)); + } + gap_list[pos] = gap_pos; +// ++gap_counter; + } + ++gap_pos; + } +// printf("DEEEEEEEEEEEEEEEEEL %i %i\n", gap_counter, pos+1); + + *num_gaps = pos+1; + prf->length -= *num_gaps; + return gap_list; +} + + +/** + * Splits a given alignment according to a given position. + * \param alignment_file The alignment file. + * \param gap_prf Saves the profile with the gap sequences. + * \param no_gap_prf Saves the profile of the non gap sequences. + * \param seq A temporary place to save the sequence. + * \param index The index where to look for the gaps. + * \param char2pos The character to position translation array. + */ +void +split_set(FILE *alignment_file, Fastal_profile *gap_prf, Fastal_profile *no_gap_prf, char *seq, int index, int *char2pos, char* split_file_name) +{ + FILE *split_set_file1 = fopen("GAP", "w"); + FILE *split_set_file2 = fopen("NOGAP", "w"); +// FILE *shorted = fopen("SHORTT.fa", "w"); + printf("INDEX: %i\n",index); + fseek (alignment_file , 0 , SEEK_SET); + FILE *split_file = fopen(split_file_name, "w"); + int i,j; + const int LINE_LENGTH = 200; + char line[LINE_LENGTH]; + int pos = -1; + int overall_pos = -1; + fgets(line, LINE_LENGTH , alignment_file); +// fprintf(shorted, "%s", line); + while(fgets(line, LINE_LENGTH , alignment_file)!=NULL) + { + pos = -1; + if (line[0] == '>') + { +// fprintf(shorted, "%s", line); + if (seq[index] == '-') + { + + seq[++overall_pos] = '\0'; + fprintf(split_file, "g\n"); + fprintf(split_set_file1,"%s\n",seq); + seq2profile2(seq, gap_prf, char2pos); + ++(gap_prf->number_of_sequences); + } + else + { + seq[++overall_pos] = '\0'; + fprintf(split_file, "n\n"); + fprintf(split_set_file2,"%s\n",seq); + seq2profile2(seq, no_gap_prf, char2pos); + ++(no_gap_prf->number_of_sequences); + } + overall_pos = -1; + + } + else + { +// for (j = 35; j < 96; ++j) +// { +// fprintf(shorted, "%c", line[j]); +// } +// fprintf(shorted, "\n"); + while ((line[++pos] != '\n') && (line[pos] != '\0')) + { + seq[++overall_pos] = line[pos]; + } + } + } + + if (seq[index] == '-') + { + seq[++overall_pos] = '\0'; + fprintf(split_file, "g\n"); + seq2profile2(seq, gap_prf, char2pos); + ++(gap_prf->number_of_sequences); + } + else + { + seq[++overall_pos] = '\0'; + fprintf(split_file, "n\n"); + seq2profile2(seq, no_gap_prf, char2pos); + ++(no_gap_prf->number_of_sequences); + } + fclose(split_file); +} + + +Fastal_profile * +enlarge_prof(Fastal_profile *prof, int new_length, int alphabet_size) +{ + if (new_length > prof->allocated_memory) + { + int i; + for (i = 0;i < alphabet_size; ++i) + { + prof->prf[i] = vrealloc(prof->prf[i],new_length*sizeof(int)); + } + prof->allocated_memory = new_length; + } + return prof; +} + + + + +void +iterate(Fastal_param *param, void *method_arguments_p, char *aln_file_name, char *out_file_name_end, int iteration_number) +{ +// calculate alignment length + +//count gap + int it_coutner_2 = 0; + const int LINE_LENGTH = 200; + char line[LINE_LENGTH]; + char *seq1 = vcalloc(1,sizeof(char)); + char *seq2 = vcalloc(1,sizeof(char)); + Fastal_profile **profiles = vcalloc(3,sizeof(Fastal_profile*)); + initiate_profiles(profiles, param); + Fastal_profile *gap_prf = profiles[0]; + Fastal_profile *no_gap_prf = profiles[1]; + int alphabet_size = param->alphabet_size; + + int *gap_list_1 = vcalloc(1, sizeof(int)); + int *gap_list_1_length = vcalloc(1, sizeof(int)); + *gap_list_1_length = 1; + int num_gaps_1 = 0; + int *gap_list_2 = vcalloc(1, sizeof(int)); + int *gap_list_2_length = vcalloc(1, sizeof(int)); + *gap_list_2_length = 1; + int num_gaps_2 = 0; + + int alignment_length = 1; + //from here repeat! + int it_counter = 0; + char *out_file_name = aln_file_name; + int *gap_profile = vcalloc(alignment_length, sizeof(int)); + +// while (it_counter < alignment_length) +// { + + + FILE *alignment_file = fopen(aln_file_name,"r"); + if (alignment_file == NULL) + { + printf("Could not open alignment file %s\n", aln_file_name); + exit(1); + } + + + + alignment_length = 0; + int tmp_len = -1; + fgets(line, LINE_LENGTH , alignment_file); + while(fgets(line, LINE_LENGTH , alignment_file)!=NULL) + { + tmp_len = -1; + if (line[0] == '>') + { + break; + } + while ((line[++tmp_len] != '\n') && (line[tmp_len] != '\0')); + alignment_length += tmp_len; + } + // printf("ALN_LENGTH %i\n", alignment_length); + seq1 =vrealloc(seq1, (1+alignment_length)*sizeof(char)); + + gap_profile = vrealloc(gap_profile, alignment_length * sizeof(int)); + int i; + for (i = 0; i < alignment_length; ++i) + { + gap_profile[i] = 0; + } + + int number_of_sequences = 0; + int pos = -1; + fseek (alignment_file , 0 , SEEK_SET); + while(fgets(line, LINE_LENGTH , alignment_file)!=NULL) + { + if (line[0] == '>') + { + ++number_of_sequences; + pos = -1; + } + else + { + i = -1; + while ((line[++i] != '\n') && (line[i] != '\0')) + { + ++pos; + if (line[i] == '-') + { + ++gap_profile[pos]; + } + } + } + } + + double max_entrop = 0; + int column_index = 0; + double entrop = 0; + double last = 0; + double p; + + +// for (i = it_counter; i<=it_counter; ++i) +// { +// p = gap_profile[i]/(double)number_of_sequences; +// if (!p) +// { +// entrop = 0; +// } +// else +// { +// entrop = (-1)*(p*log(p) + (1-p)*log(1-p) ) ; +// } +// if (entrop > max_entrop) +// { +// column_index = i; +// max_entrop = entrop; +// } +// last = entrop; +// } +// ++it_counter; +// if (max_entrop < 0.6) +// { +// printf("CONTINUE %f\n",entrop); +// continue; +// } + +// column_index = 18;//it_counter-1; +// if (column_index == 19) + column_index = 58; + out_file_name = vtmpnam(NULL); + + char *edit_file_name = "EDIT";//vtmpnam(NULL); + FILE *edit_file = fopen(edit_file_name,"w"); + char *profile_file_name = vtmpnam(NULL); + FILE *profile_file = fopen(profile_file_name,"w"); + char *split_file_name = "AHA";//vtmpnam(NULL); + ++it_coutner_2; + + gap_prf = enlarge_prof(gap_prf, alignment_length, alphabet_size); + no_gap_prf = enlarge_prof(no_gap_prf, alignment_length, alphabet_size ); + no_gap_prf->number_of_sequences = 0; + gap_prf->number_of_sequences = 0; + + split_set(alignment_file, gap_prf, no_gap_prf, seq1, column_index, param->char2pos, split_file_name); + gap_prf -> length = alignment_length; + no_gap_prf -> length = alignment_length; + + + gap_list_1 = del_gap_from_profile(gap_prf, alphabet_size, gap_list_1, gap_list_1_length, &num_gaps_1 ); + gap_list_2 = del_gap_from_profile(no_gap_prf, alphabet_size, gap_list_2, gap_list_2_length, &num_gaps_2 ); + + + fclose(alignment_file); + profiles[0] = gap_prf; + profiles[1] = no_gap_prf; + + alignment_length = gotoh_dyn(profiles, param, method_arguments_p, 0, edit_file, profile_file, 0); + seq1 =vrealloc(seq1, (1+alignment_length)*sizeof(char)); + seq2 =vrealloc(seq2, (1+alignment_length)*sizeof(char)); + + fclose(edit_file); + edit_file = fopen(edit_file_name,"r"); + edit2seq_pattern(edit_file, seq1, seq2); + fclose(edit_file); + fclose(profile_file); + + write_iterated_aln(aln_file_name, out_file_name, split_file_name, seq1, gap_list_1, num_gaps_1, seq2, gap_list_2, num_gaps_2); + aln_file_name = out_file_name; + +// if (it_coutner_2 == 2) +// break; +// } + char command[1000]; + sprintf(command, "mv %s %s",out_file_name, out_file_name_end); + system(command); + + vfree(seq1); + vfree(seq2); + vfree(gap_list_1); + vfree(gap_list_2); + if (!strcmp(param->method, "fast")) + { + free_sparse((Sparse_dynamic_param*)method_arguments_p); + } + else if (!strcmp(param->method, "nw")) + { + free_nw((Nw_param*)method_arguments_p, alphabet_size); + } + else if (!strcmp(param->method, "gotoh")) + { + free_gotoh((Gotoh_param*)method_arguments_p, alphabet_size); + } +} + + + + +void +write_iterated_aln(char* old_aln_file_name, char* new_aln_file_name, char *gap_file_name, char *seq1, int *gap_list1, int num_gap1, char *seq2, int *gap_list2, int num_gap2) +{ + int i; +// for (i = 0; i < num_gap1; ++i) +// { +// printf("%i ", gap_list1[i]); +// } +// printf("\n-----\n"); +// for (i = 0; i < num_gap2; ++i) +// { +// printf("%i ", gap_list2[i]); +// } +// printf("\n"); +// +// printf("%s\n%s\n",seq1, seq2); +// printf("EX: %i\n", gap_list1[0]); + FILE *gap_file = fopen(gap_file_name, "r"); + FILE *new_aln_file = fopen(new_aln_file_name, "w"); +// if (new_aln_file == NULL) +// printf("AHA\n"); + FILE *old_aln_file = fopen(old_aln_file_name, "r"); + + const int GAPLINE_LENGTH = 5; + char gapline[GAPLINE_LENGTH]; + char *seq; + int *gap_list; + int num_gap; + + const int ALNLINE_LENGTH = 200; + char alnline[ALNLINE_LENGTH]; + + int alnline_pos = -1; + int overall_aln_pos = -1; + int pattern_pos = -1; + int gap_pos = 0; + + +// fprintf(new_aln_file, "%s", alnline); + while(fgets(gapline, GAPLINE_LENGTH , gap_file)!=NULL) + { + + //this is the sequence name + fgets(alnline, ALNLINE_LENGTH , old_aln_file); + fprintf(new_aln_file,"%s",alnline); + //getting (part if) the sequence + fgets(alnline, ALNLINE_LENGTH , old_aln_file); + if (gapline[0] == 'g') + { + seq = seq1; + gap_list = gap_list1; + num_gap = num_gap1; + } + else + { + seq = seq2; + gap_list = gap_list2; + num_gap = num_gap2; + } + overall_aln_pos = 0; + pattern_pos = 0; + gap_pos = 0; + alnline_pos = -1; + + //go along the pattern + while (seq[pattern_pos] != '\0') + { + if (seq[pattern_pos] == '-') + { + ++pattern_pos; + fprintf(new_aln_file,"-"); + continue; + } + //get next part of the sequence if you are at the end + if ((alnline[++alnline_pos] == '\n') || (alnline[alnline_pos] == '\0')) + { + alnline_pos = 0; + fgets(alnline, ALNLINE_LENGTH, old_aln_file); + } + if ((gap_pos < num_gap) && (overall_aln_pos == gap_list[gap_pos])) + { + ++gap_pos; + ++overall_aln_pos; + continue; + } + fprintf(new_aln_file,"%c",alnline[alnline_pos]); + ++overall_aln_pos; + ++pattern_pos; + } + fprintf(new_aln_file,"\n"); + } + + fclose(new_aln_file); + fclose(old_aln_file); + fclose(gap_file); +} + + +void +edit2seq_pattern(FILE *edit_file, char *seq1, char *seq2) +{ + + fseek(edit_file, 0, SEEK_SET); + const int LINE_LENGTH = 50; + char line[LINE_LENGTH]; + fgets(line, LINE_LENGTH , edit_file); +// int child1 = atoi(line); + fgets(line, LINE_LENGTH , edit_file); +// int child2 = atoi(line); + fgets(line, LINE_LENGTH , edit_file); +// int is_leaf1 = atoi(line); + fgets(line, LINE_LENGTH , edit_file); +// int is_leaf2 = atoi(line); + + char x; + int number; + int pos = -1; + while(fgets(line, LINE_LENGTH , edit_file)!=NULL) + { +// printf("%s\n",line); + x = line[0]; + if (x == '*') + break; + number = atoi(&line[1]); + if (x == 'M') + { + while (--number >= 0) + { + seq1[++pos] = 'X'; + seq2[pos] = 'X'; + } + } + else if (x == 'I') + { + while (--number >= 0) + { + seq1[++pos] = 'X'; + seq2[pos] = '-'; + } + } + else if (x == 'D') + { + while (--number >= 0) + { +// printf("POS: %i\n", pos); + seq1[++pos] = '-'; + seq2[pos] = 'X'; + } + } + } + seq1[++pos] = '\0'; + seq2[pos] = '\0'; +// printf("%s\n%s\n",seq1, seq2); +} +/******************************COPYRIGHT NOTICE*******************************/ +/*© Centro de Regulacio Genomica */ +/*and */ +/*Cedric Notredame */ +/*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */ +/*All rights reserved.*/ +/*This file is part of T-COFFEE.*/ +/**/ +/* T-COFFEE is free software; you can redistribute it and/or modify*/ +/* it under the terms of the GNU General Public License as published by*/ +/* the Free Software Foundation; either version 2 of the License, or*/ +/* (at your option) any later version.*/ +/**/ +/* T-COFFEE is distributed in the hope that it will be useful,*/ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of*/ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/ +/* GNU General Public License for more details.*/ +/**/ +/* You should have received a copy of the GNU General Public License*/ +/* along with Foobar; if not, write to the Free Software*/ +/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/ +/*............................................... |*/ +/* If you need some more information*/ +/* cedric.notredame@europe.com*/ +/*............................................... |*/ +/**/ +/**/ +/* */ +/******************************COPYRIGHT NOTICE*******************************/