X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=binaries%2Fsrc%2Ftcoffee%2Ft_coffee_source%2Fparttree.c;fp=binaries%2Fsrc%2Ftcoffee%2Ft_coffee_source%2Fparttree.c;h=0000000000000000000000000000000000000000;hb=8140043fefd2101326b3e651e3b7e2d4ac806b99;hp=8a6a4134130cd3284e2bdf2b594fc376edf49a0c;hpb=2eb05b9319960f2346045e5e2119d112c5909490;p=jabaws.git diff --git a/binaries/src/tcoffee/t_coffee_source/parttree.c b/binaries/src/tcoffee/t_coffee_source/parttree.c deleted file mode 100644 index 8a6a413..0000000 --- a/binaries/src/tcoffee/t_coffee_source/parttree.c +++ /dev/null @@ -1,1040 +0,0 @@ -#include -#include -#include -#include -#include -// #include - -#include "fast_tree_header.h" -#include "io_lib_header.h" -#include "util_lib_header.h" -#include "define_header.h" -#include "dp_lib_header.h" -//TODO: -change kick-out value -// -pass arrays in partTree_r - -/*! - * \file parttree.c - * \brief Source code for PartTree algorithm. - */ - - - -#define ENLARGEMENT_PER_STEP 50 - - - -void -print_fastal_tree(Tree_fastal *tree, - int pos, - FILE *tree_file, - int num_seq) -{ - if (tree[pos].left >= num_seq) - print_fastal_tree(tree, tree[pos].left-num_seq, tree_file, num_seq); - if (tree[pos].right >= num_seq) - print_fastal_tree(tree, tree[pos].right-num_seq, tree_file, num_seq); - - fprintf(tree_file, "%i %i %i\n", tree[pos].left, tree[pos].right, tree[pos].name); -} - - - - - -PartTree_param *param_set; - - -//********************** UPGMA ***************************** - -/** - * Function to write tree to file in fastal_format. Leafs in \a tree are leafs in the complete tree as well. - * - * \param tree The root node of the (sub)tree. - * \param param_set Parameter of PartTree. - * \param start_write_here Current node to write into. - * \return position in tree - * \see tree_process - */ -int -tree_process_simple(NT_node tree, - PartTree_param *param_set, - int start_write_here) -{ - if (tree->isseq) - { -// printf("T: %s\n", tree->name); - return atoi(tree->name); - } - else - { - Tree_fastal *tree_flat = ¶m_set->tree[start_write_here]; - tree_flat->name = start_write_here +param_set->num_sequences; - if (start_write_here == param_set->pos_tree) - { - ++param_set->pos_tree; - } - start_write_here = param_set->pos_tree; - int left = tree_process_simple(tree->left, param_set, start_write_here); - start_write_here = param_set->pos_tree; - int right = tree_process_simple(tree->right, param_set, start_write_here); - tree_flat->index = NULL; - tree_flat->right = right; - tree_flat->left = left; - return tree_flat->name; - } -} - - - - -/** -* Function to write tree to file in fastal_format. Leafs in \a tree do not need to be leafs in the complete tree. -* -* \param tree The root node of the (sub)tree. -* \param param_set Parameter of PartTree. -* \param clusters Number of sequences in each cluster. -* \param subgroup The sequences for each cluster. -* \param start_write_here Current node to write into. -* \return position in tree -* \see tree_process_simple -*/ -int -tree_process(NT_node tree, - PartTree_param *param_set, - int *clusters, - int *subgroup, - int start_write_here) -{ - if (tree->isseq) - { - int node_num = atoi(tree->name); - int num_in_sub = clusters[node_num+1] - clusters[node_num]; -// printf("NUM: %i %i %i %i\n",node_num, num_in_sub, clusters[node_num+1], clusters[node_num]); - if (num_in_sub > 1) - { - Tree_fastal *tree_flat = ¶m_set->tree[start_write_here]; - tree_flat->name = start_write_here +param_set->num_sequences; - if (start_write_here == param_set->pos_tree) - { - ++param_set->pos_tree; - } - tree_flat->left = -1; - tree_flat->right = -1; - tree_flat->index = &subgroup[clusters[node_num]]; - - tree_flat->num_leafs = num_in_sub; - return tree_flat->name; - } - else - { - return(subgroup[clusters[node_num]]); - } - } - else - { -// printf("TREEPOS: %i\n",param_set->pos_tree); - Tree_fastal *tree_flat = ¶m_set->tree[start_write_here]; - tree_flat->name = start_write_here +param_set->num_sequences; - if (start_write_here == param_set->pos_tree) - { - ++param_set->pos_tree; - } - start_write_here = param_set->pos_tree; - int left = tree_process(tree->left, param_set, clusters, subgroup, start_write_here); - start_write_here = param_set->pos_tree; - int right = tree_process(tree->right, param_set, clusters, subgroup, start_write_here); - tree_flat->index = NULL; - tree_flat->right = right; - tree_flat->left = left; - return tree_flat->name; - } -} - - -/** -* \brief Calculates tree out of distance matrix. -* -* Calculates the upgma tree using a given distance matrix. -* \param mat The distance matrix. -* \param nseq Number of sequences. -* \param fname Filename for temporary storage. -* \param seqnames Names of the sequences. -* \return The calculated UPGMA Tree. -*/ -NT_node ** int_dist2upgma_tree_fastal (int **mat, int nseq, char *fname, char **seqnames) -{ - NT_node *NL, T; - int a, n, *used; - int tot_node; - if (upgma_node_heap (NULL)) - { - printf_exit ( EXIT_FAILURE,stderr, "\nERROR: non empty heap in upgma [FATAL]"); - } - NL=vcalloc (nseq, sizeof (NT_node)); - - for (a=0; aname, "%s", seqnames[a]); - NL[a]->isseq=1; - NL[a]->leaf=1; - } - used=vcalloc ( nseq, sizeof (int)); - n=nseq; - while (n>1) - { - T=upgma_merge(mat, NL,used, &n, nseq); - } - vfree (used); - vfclose (print_tree (T, "newick", vfopen (fname, "w"))); - upgma_node_heap (NULL); - vfree (NL); - - return read_tree (fname,&tot_node, nseq, seqnames); -} - - - - -//Part_Tree - -/*! -* \brief Constructs a guide tree for multiple sequence alignment. -* -* This algorithm is an implementation of the partTree algorithm (PartTree: an algorithm to build an approximate tree from a large number of unaligned -* sequences. Katoh et al. 2007). -* \param sequence_f Filename of file with sequences. -* \param tree_f Filename of file where the tree will be stored. -* \param ktup Size of the ktups. -* \param subgroup Parameter for subgroupsize. -*/ -void -make_partTree(char *sequence_f, - char *tree_f, - int ktup, - int subgroup) -{ - param_set = vcalloc(1,sizeof(PartTree_param)); - param_set->ktup = ktup; - param_set->subgroup = subgroup; - - long *file_positions = NULL; - long **tmp1 = &file_positions; - int *seq_lengths = NULL; - int **tmp2 = &seq_lengths; - - //make index - int number_of_sequences = make_pos_len_index_of_file(sequence_f, "KTUP_table", tmp1, tmp2, ktup, "DNA"); - param_set->num_sequences = number_of_sequences; - param_set->ktup_positions = file_positions; - param_set->seq_lengths = seq_lengths; - param_set->threshold = 0.01; - param_set->ktup_table_f = fopen("KTUP_table","r"); - - Tree_fastal *tree = vcalloc(number_of_sequences-1,sizeof(Tree_fastal)); - param_set->tree = tree; - - int i; - partTree_r(param_set); -// for (i = 0; i < number_of_sequences-1; ++i) -// { -// printf("%i %i %i\n", tree[i].left, tree[i].right, tree[i].name); -// } - FILE * tree_file = fopen(tree_f,"w"); - print_fastal_tree(tree, 0, tree_file, number_of_sequences); - fclose(tree_file); - vfree(tree); -} - - -/** -* Filters seed set. -* -* \param sequence_group Sequences to filter. -* \param dist_mat The distance matrix. -* \param seed_set_cleaned ordered_seed_set. -* \param param_set Parameters for PartTree algorithm. -* \return number in the filtered set. -*/ -int -filter(int *sequence_group, - int **dist_mat, - int *seed_set_cleaned, - PartTree_param *param_set) -{ - int i, j; - int num_in_subgroup = param_set->subgroup; - int *seq_lengths = param_set->seq_lengths; - int num_in_clean = 0; - double threshold = param_set->threshold; -// printf("threshold: %f\n", threshold); - double min; - for (i = 0; i < num_in_subgroup; ++i) - { - if (!seed_set_cleaned[i]) - continue; - for (j = i+1; j < num_in_subgroup; ++j) - { - min = MIN(seq_lengths[sequence_group[i]], seq_lengths[sequence_group[j]]); -// printf("MINIMUM: %i\n",min); - min = (threshold * min); -// printf("MINIMUM: %f\n",min); - if (seed_set_cleaned[j] &&(dist_mat[i][j] < min)) - { - if (seq_lengths[sequence_group[i]] < seq_lengths[sequence_group[j]]) - { - seed_set_cleaned[i] = 0; - break; - } - else - seed_set_cleaned[j] = 0; - } - } - } - - for (i = 0; i < num_in_subgroup; ++i) - { - num_in_clean += seed_set_cleaned[i]; - } - int max = num_in_subgroup -1; - i = 0; - int tmp; -// printf("CLEAN: %i\n", num_in_clean); - while (i < num_in_clean) - { - if (seed_set_cleaned[i]) - { - ++i; - } - else - { - seed_set_cleaned[i] = seed_set_cleaned[max]; - seed_set_cleaned[max] = 0; - tmp = sequence_group[i]; - sequence_group[i] = sequence_group[max]; - sequence_group[max] = tmp; - --max; - } - } - return num_in_clean; -} - - - - - -/*! -* \brief Function to create a tree using the PartTree algorithm. -* -* \param param_set A \a PartTree_param object containing all necessary parameters and the data. -* \return The node_number. -*/ -void -partTree_r(PartTree_param *param_set) -{ - - int num_of_tree_nodes = param_set->num_sequences-1; - int loop_tree_node; - - Tree_fastal *tree = param_set->tree; -// int this_node = param_set->pos_tree; - - int i; - int tsize = param_set->tsize; - - - //get some memory - short *table1 = vcalloc(tsize, sizeof(short)); - short *table2 = vcalloc(tsize, sizeof(short)); - int *seed_set = vcalloc(param_set->subgroup, sizeof(int)); - char **names = declare_char(param_set->subgroup, 8); - int **dist_mat = declare_int(param_set->subgroup, param_set->subgroup); - int **dist_mat2 = declare_int(param_set->subgroup, param_set->subgroup); - char * file_name_tmp = vtmpnam(NULL); - int *seed_set_cleaned = vcalloc(param_set->subgroup, sizeof(int)); - FILE *table_f = param_set->ktup_table_f; - long *file_positions = param_set->ktup_positions; - int max_n_group = param_set->subgroup; - int num_in_subgroup = param_set->subgroup; - int *seq_lengths = param_set->seq_lengths; - int *clusters = vcalloc(param_set->subgroup+1, sizeof(int)); - int *min_dist = vcalloc(param_set->num_sequences, sizeof(int)); - int *belongs_to = vcalloc(param_set->num_sequences, sizeof(int)); - - - - - - - //Prepare first node - - tree[0].index = vcalloc(param_set->num_sequences,sizeof(int)); - int *index = tree[0].index; - for (i = 0; i< param_set->num_sequences; ++i) - index[i] = i; - tree[0].name = param_set->pos_tree +param_set->num_sequences; - - tree[0].num_leafs = param_set->num_sequences; - int *sequence_group2 = vcalloc(param_set->num_sequences,sizeof(int)); - - Tree_fastal *current_node; - for (loop_tree_node = 0; loop_tree_node < num_of_tree_nodes; ++loop_tree_node) - { -// printf("ROUND: %i\n", loop_tree_node); - current_node = &tree[loop_tree_node]; - index= current_node->index; - if (current_node->index == NULL) - { - continue; - } - int num_sequences = current_node->num_leafs; - - //if number of sequences in this group smaller than number subgoup size: make tree, finisch - if (num_sequences <= max_n_group) - { - int j; - dist_mat = make_distance_matrix(table_f, file_positions, index, num_sequences, dist_mat); - for (i = 0; i < num_sequences; ++i) - { - sprintf(names[i],"%i", current_node->index[i]); - } - NT_node **tree= (int_dist2upgma_tree_fastal (dist_mat, num_sequences, file_name_tmp , names)); - tree_process_simple(tree[0][0], param_set,loop_tree_node); - continue; - } - - - for (i = 0; i < num_in_subgroup; ++i) - { - seed_set_cleaned[i] = 1; - } - - //finde longest sequence and put into the first field - - int index_longest = 0; - int length_of_longest = 0; - - for(i = 0; i < num_sequences; ++i) - { - if (seq_lengths[index[i]] > length_of_longest) - { - index_longest = i; - length_of_longest = seq_lengths[index[i]]; - } - } - int tmp = index[index_longest]; - index[index_longest] = index[0]; - index[0] = tmp; - - //distance of longest to rest - int seq_index = 1; - int min= euclidean_dist(table_f, file_positions[index[0]], file_positions[index[1]], table1, table2, param_set->tsize); - for (i = 2; i < num_sequences; ++i) - { - tmp = euclidean_dist_half(table_f, file_positions[index[i]], table1, table2, param_set->tsize); - if (tmp < min) - { - min = tmp; - seq_index = i; - } - } - - //get the new seed_set in the first n spaces - tmp = index[1]; - index[1] = index[seq_index]; - index[seq_index] = tmp; - int r,j; - num_in_subgroup = param_set->subgroup; - - - for (i = 2; i < num_in_subgroup; ++i) - { - r = i + rand() / ( RAND_MAX / ( num_sequences-i) + 1 ); -// printf("RANDOM: %i\n",r); - tmp = index[r]; - index[r] = index[i]; - index[i] = tmp; - } - - //Calculate matrix - dist_mat = make_distance_matrix(table_f, file_positions, index, param_set->subgroup, dist_mat); - - //Filter out sequences that are to similar & reorder - - NT_node **upgma_tree; - - - int num_in_clean = filter(index, dist_mat, seed_set_cleaned, param_set); -// if (num_in_clean == 1) -// { -// int j; -// // dist_mat = make_distance_matrix(table_f, file_positions, index, upgma_tree, dist_mat); -// for (i = 0; i < param_set->subgroup; ++i) -// { -// sprintf(names[i],"%i", current_node->index[i]); -// } -// upgma_tree= (int_dist2upgma_tree_fastal (dist_mat, param_set->subgroup, file_name_tmp , names)); -// tree_process_simple(upgma_tree[0][0], param_set,loop_tree_node); -// continue; -// } -// else -// { - - if (num_in_clean ==1) - { - num_in_clean = 2; - seed_set_cleaned[1] = 1; - } - //make_tree - int col = 0; - int row = 0; - for (i = 0; i < num_in_subgroup; ++i) - { - if (seed_set_cleaned[i]) - { - row = col+1; - for (j = i+1; j < num_in_subgroup; ++j) - { - if (seed_set_cleaned[j]) - { - dist_mat2[row][col] = dist_mat2[col][row] = dist_mat[i][j]; - ++row; - } - } - ++col; - } - } - for (i = 0; i < num_in_clean; ++i) - { - sprintf(names[i],"%i",i); - } - upgma_tree= (int_dist2upgma_tree_fastal (dist_mat2, num_in_clean, file_name_tmp , names)); -// } - -// int *pos_tree_p = ¶m_set->pos_tree; - - - int leaf = 0; - - //cluster - //calculate distances from n' to N - get_table(table1, table_f, file_positions[index[0]]); - for (j = num_in_clean; j < num_sequences; ++j) - { - min_dist[j] = euclidean_dist_half(table_f, file_positions[index[j]], table1, table2, param_set->tsize); - belongs_to[j] = 0; - } - for(i = 1; i < num_in_clean; ++i) - { - get_table(table1, table_f, file_positions[index[i]]); - belongs_to[i] = i; - for (j = num_in_clean; j < num_sequences; ++j) - { - tmp = euclidean_dist_half(table_f, file_positions[index[j]], table1, table2, param_set->tsize); - if (tmp < min_dist[j]) - { - min_dist[j] = tmp; - belongs_to[j] = i; - } - } - } - - //how_many sequences has each cluster - for (j = 0; j <= num_in_subgroup; ++j) - { - clusters[j] = 0; - } - for (j = 0; j < num_sequences; ++j) - { - ++clusters[belongs_to[j]]; - } -// for (j = 0; j <= num_in_subgroup; ++j) -// { -// printf("CL: %i ",clusters[j]); -// } -// printf("\n"); - for(i = 1; i < num_in_clean; ++i) - { - clusters[i] += clusters[i-1]; - } - clusters[num_in_clean] = clusters[num_in_clean-1]; - - for (i = 0; i < num_sequences; ++i) - { - sequence_group2[--clusters[belongs_to[i]]] = index[i]; - } - - for (i = 0; i < num_sequences; ++i) - { - index[i] = sequence_group2[i]; - } - - - for (i = 0; i < num_in_clean; ++i) - { - sprintf(names[i],"%i",i); - } - tree_process(upgma_tree[0][0], param_set, clusters, index, loop_tree_node); - NT_node tmp_tree = upgma_tree[3][0]; - vfree(upgma_tree[0]); - vfree(upgma_tree[1]); - vfree(upgma_tree[2]); - vfree(upgma_tree[3]); - vfree(upgma_tree); - free_tree(tmp_tree); - } - vfree(min_dist); - vfree(belongs_to); - vfree(clusters); -} - - - -/*! - * \brief Makes the distance matrix between all sequences. - * - * \param table_file File with the ktup tables - * \param file_positions Index of positions where the tabels are stored in \a table_file - * \param sequence_group the group of sequences - * \param number number of sequences - * \param dist_mat distance matrix - * \return the distance matrix. (same as \a dist_mat ) -*/ -int ** -make_distance_matrix(FILE *table_f, - long *file_positions, - int *sequence_group, - int number, - int **dist_mat) -{ - static short *table1 = NULL; - static short *table2; - int tsize = param_set->tsize; - if (table1 == NULL) - { - table1 = vcalloc(tsize, sizeof(short)); - table2 = vcalloc(tsize, sizeof(short)); - } - int i, j, num = number-1; - for (i = 0; i < num; ++i) - { - j = i+1; - dist_mat[i][j] = dist_mat[j][i]= euclidean_dist(table_f, file_positions[sequence_group[i]], file_positions[sequence_group[j]], table1, table2, tsize); - ++j; - for (; j < number; ++j) - { - dist_mat[i][j] = dist_mat[j][i] = euclidean_dist_half(table_f, file_positions[sequence_group[j]], table1, table2, tsize); - } - } - return dist_mat; -} - - - - -/** -* Replaces the coded sequence with coded tuples -* -* \param coded_seq The coded sequence which will be replaced by the tuple number -* \param ktup Size of the ktup -* \param ng Coded alphabet size -* \param length Lengths of coded sequence -*/ -void -makepointtable_fast(int *coded_seq, //sequence - int ktup, //ktup size - int ng, //hmm... - int length) //length of coded_seq -{ - int point, a; - register int *p; - static int *prod; - - if (!prod) - { - prod=vcalloc ( ktup, sizeof (int)); - for ( a=0; a 0) - fprintf(tables_f, "%i %i\n", point, table[point]); - } - fprintf(tables_f, "*\n"); -} - - -/** JUST FOR TEST */ -void -make_fast_tree(char *file_name, - int n, - int ktup) -{ - - make_partTree(file_name, "TREE_OUT", ktup, n); - -} - - - -/** -* \brief Reads ktup_table from file -* -* \param table Table to save the file content in. -* \param tables_f File in which the tables are stored. -* \param index Position of the table in \a tables_f -*/ -void -get_table(short *table, //Table to save the readings in - FILE* tables_f, //File with tables - long index) //index positin of ktup-tables -{ - fseek(tables_f, index, SEEK_SET); - const int LINE_LENGTH = 101; - char line[LINE_LENGTH]; - fgets(line, LINE_LENGTH, tables_f); - - char delims[] = " "; - char *result = NULL; - int code; - - while (line[0] != '*') - { - result = strtok( line, delims ); - code = atoi(result); - table[code] = atoi(strtok( NULL, delims)); - fgets(line, LINE_LENGTH, tables_f); - } -} - - - -/** -* \brief calculates the euclidean ktub distance between two sequences -* -* @param ktup_f, ktup_file -* @param pos1 position of sequence 1 in \a ktup_f -* @param pos2 position of sequence 2 in \a ktup_f -* @param table1 Saves the number of occurences for each ktup in sequence 1 -* @param table2 Saves the number of occurences for each ktup in sequence 2 -*/ -int -euclidean_dist(FILE* ktup_f, //ktup_file - long pos1, //position of table1 - long pos2, //position of table2 - short *table1, //table to save ktups in - short *table2, //table to save ktups in - int length) -{ - const int LINE_LENGTH = 101; - char line[LINE_LENGTH]; - - - char delims[] = " "; - char *result = NULL; - int code; - - fseek(ktup_f, pos1, SEEK_SET); - fgets(line, LINE_LENGTH, ktup_f); - int i; - for (i = 0; i < length; ++i) - { - table1[i] = 0; - table2[i] = 0; - } - while (line[0] != '*') - { - result = strtok( line, delims ); - code = atoi(result); - table1[code] = atoi(strtok( NULL, delims)); - fgets(line, LINE_LENGTH, ktup_f); - } - fseek(ktup_f, pos2, SEEK_SET); - fgets(line, LINE_LENGTH, ktup_f); - while (line[0] != '*') - { - result = strtok( line, delims ); - code = atoi(result); - table2[code] = atoi(strtok( NULL, delims)); - fgets(line, LINE_LENGTH, ktup_f); - } - - int dist = 0; - for (i = 0; i < length; ++i) - { - dist += (table1[i]-table2[i])*(table1[i]-table2[i]); - } - return dist; -} - - - -/** - * \brief calculates the euclidean ktub distance between two sequences. - * - * The difference to \a euclidean_dist is, that this uses the ktups stored in \a table1 - * @param ktup_f, ktup_file - * @param pos2 position of sequence 2 in \a ktup_f - * @param table1 Saves the number of occurences for each ktup in sequence 1 - * @param table2 Saves the number of occurences for each ktup in sequence 2 - * \see euclidean_dist - */ -int -euclidean_dist_half(FILE* ktup_f, //ktup_file - long pos2, //position of table1 - short *table1, //table to save ktups in - short *table2, //table to save ktups in - int length) -{ - const int LINE_LENGTH = 101; - char line[LINE_LENGTH]; - - - char delims[] = " "; - char *result = NULL; - int code; - - fseek(ktup_f, pos2, SEEK_SET); - fgets(line, LINE_LENGTH, ktup_f); - int i; - for (i = 0; i < length; ++i) - { - table2[i] = 0; - } - while (line[0] != '*') - { - result = strtok( line, delims ); - code = atoi(result); - table2[code] = atoi(strtok( NULL, delims)); - fgets(line, LINE_LENGTH, ktup_f); - } - - int dist = 0; - for (i = 0; i < length; ++i) - { - dist += (table1[i]-table2[i])*(table1[i]-table2[i]); - } - return dist; -} - - - - -/** -* Makes an index of a file -*/ -int -make_pos_len_index_of_file(char *file_name, //file with sequences - char *ktable_f, //file with the ktup-tables - long **file_positions, //array to save the positions - int **seq_lengths, //array to save the sequence length - int ktup, //length of ktup - char *type) //type of the seuqence -{ - //preparations for recoding sequence - int *aa; - int a, b, l; - - int ng = 0; - char **gl; - if ( strm (type, "DNA") || strm (type, "RNA")) - { - gl=declare_char (5,13); - sprintf ( gl[ng++], "Aa"); - sprintf ( gl[ng++], "Gg"); - sprintf ( gl[ng++], "TtUu"); - sprintf ( gl[ng++], "Cc"); - sprintf ( gl[ng++], "NnRrYyDdMmWw"); - } - else - { - gl=make_group_aa ( &ng, "mafft"); - } - aa=vcalloc ( 256, sizeof (int)); - for ( a=0; atsize = tsize; - param_set->ng = ng; - - int *table=vcalloc ( tsize,sizeof (int)); - - - //Reading and recoding squences - const int LINE_LENGTH = 501; - int *coded_seq = vcalloc(2*LINE_LENGTH, sizeof(int)); - int allocated_mem = 2*LINE_LENGTH; - - (*file_positions) = vcalloc(ENLARGEMENT_PER_STEP, sizeof(long)); - (*seq_lengths) = vcalloc(ENLARGEMENT_PER_STEP, sizeof(int)); - int current_size = ENLARGEMENT_PER_STEP; - int current_pos = 0; - - FILE *file = fopen(file_name,"r"); - - - int seq_length=0; - char line[LINE_LENGTH]; - - int num_of_sequences = 0; - int str_len = 0; - int mem_for_pos = ENLARGEMENT_PER_STEP; - int tmp; - int real_len; - int *c_seq; - - FILE *tables_f = fopen(ktable_f, "w"); - - - if (file == NULL) - { - printf("FILE NOT FOUND\n"); - exit(1); - } - else - { - - while(fgets(line, LINE_LENGTH , file)!=NULL) - { - if ( str_len >= allocated_mem - LINE_LENGTH) - { - allocated_mem += LINE_LENGTH; - coded_seq = vrealloc(coded_seq, allocated_mem*sizeof(int)); - } - - int i; - int length = strlen(line); - if (line[0] == '>') - { - if (num_of_sequences >0) - { - (*seq_lengths)[num_of_sequences-1] = str_len; -// printf("len: %i\n", str_len); - c_seq = coded_seq; - makepointtable_fast(coded_seq,ktup,ng, str_len); - - (*file_positions)[num_of_sequences-1] = ftell(tables_f ); - for (i=0; i < tsize; ++i) - table[i] = 0; - makecompositiontable_fastal(tables_f, table, coded_seq,tsize ); - - - } - str_len = 0; - ++num_of_sequences; - - if (num_of_sequences == mem_for_pos) - { - mem_for_pos += ENLARGEMENT_PER_STEP; - (*file_positions) = vrealloc((*file_positions), mem_for_pos * sizeof(long)); - (*seq_lengths) = vrealloc((*seq_lengths), mem_for_pos * sizeof(int)); - } - } - else - { - int i; - real_len = strlen(line); - if (line[real_len-1] == '\n') - --real_len; - for (i = 0; i < real_len; ++i) - { - coded_seq[str_len++] = aa[line[i]]; - } - } - } - } - - (*seq_lengths)[num_of_sequences-1] = str_len; - c_seq = coded_seq; - makepointtable_fast(coded_seq,ktup,ng, str_len); - (*file_positions)[num_of_sequences] = ftell(tables_f ); - makecompositiontable_fastal(tables_f, table, coded_seq,tsize ); - fclose(file); - fclose(tables_f); - return num_of_sequences; -} -/*********************************COPYRIGHT NOTICE**********************************/ -/*© Centro de Regulacio Genomica */ -/*and */ -/*Cedric Notredame */ -/*Tue Oct 27 10:12:26 WEST 2009. */ -/*All rights reserved.*/ -/*This file is part of T-COFFEE.*/ -/**/ -/* T-COFFEE is free software; you can redistribute it and/or modify*/ -/* it under the terms of the GNU General Public License as published by*/ -/* the Free Software Foundation; either version 2 of the License, or*/ -/* (at your option) any later version.*/ -/**/ -/* T-COFFEE is distributed in the hope that it will be useful,*/ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of*/ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/ -/* GNU General Public License for more details.*/ -/**/ -/* You should have received a copy of the GNU General Public License*/ -/* along with Foobar; if not, write to the Free Software*/ -/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/ -/*............................................... |*/ -/* If you need some more information*/ -/* cedric.notredame@europe.com*/ -/*............................................... |*/ -/**/ -/**/ -/* */ -/*********************************COPYRIGHT NOTICE**********************************/