#include #include #include #include #include #include // #include #include "fast_tree_header.h" #include "io_lib_header.h" #include "util_lib_header.h" #include "define_header.h" #include "dp_lib_header.h" //TODO: -change kick-out value // -pass arrays in partTree_r /*! * \file parttree.c * \brief Source code for PartTree algorithm. */ #define ENLARGEMENT_PER_STEP 50 void print_fastal_tree(Tree_fastal *tree, int pos, FILE *tree_file, int num_seq) { if (tree[pos].left >= num_seq) print_fastal_tree(tree, tree[pos].left-num_seq, tree_file, num_seq); if (tree[pos].right >= num_seq) print_fastal_tree(tree, tree[pos].right-num_seq, tree_file, num_seq); fprintf(tree_file, "%i %i %i\n", tree[pos].left, tree[pos].right, tree[pos].name); } PartTree_param *param_set; //********************** UPGMA ***************************** /** * Function to write tree to file in fastal_format. Leafs in \a tree are leafs in the complete tree as well. * * \param tree The root node of the (sub)tree. * \param param_set Parameter of PartTree. * \param start_write_here Current node to write into. * \return position in tree * \see tree_process */ int tree_process_simple(NT_node tree, PartTree_param *param_set, int start_write_here) { if (tree->isseq) { // printf("T: %s\n", tree->name); return atoi(tree->name); } else { Tree_fastal *tree_flat = ¶m_set->tree[start_write_here]; tree_flat->name = start_write_here +param_set->num_sequences; if (start_write_here == param_set->pos_tree) { ++param_set->pos_tree; } start_write_here = param_set->pos_tree; int left = tree_process_simple(tree->left, param_set, start_write_here); start_write_here = param_set->pos_tree; int right = tree_process_simple(tree->right, param_set, start_write_here); tree_flat->index = NULL; tree_flat->right = right; tree_flat->left = left; return tree_flat->name; } } void read_sequence_from_position(FILE *file, long position1, char *seq) { fseek(file, position1, SEEK_SET); char line[500]; int pos = -1; while ((fgets(line, 500, file) != NULL) && (line[0] != '>')) { int l = -1; while ((line[++l] != '\n') && (line[l] != '\0')) seq[++pos] = line[l]; } seq[++pos] = '\0'; } // char* // read_sequence_from_position(FILE *file, long position1, long position2) // { // // } /** * Function to write tree to file in fastal_format. Leafs in \a tree do not need to be leafs in the complete tree. * * \param tree The root node of the (sub)tree. * \param param_set Parameter of PartTree. * \param clusters Number of sequences in each cluster. * \param subgroup The sequences for each cluster. * \param start_write_here Current node to write into. * \return position in tree * \see tree_process_simple */ int tree_process(NT_node tree, PartTree_param *param_set, int *clusters, int *subgroup, int start_write_here) { if (tree->isseq) { int node_num = atoi(tree->name); int num_in_sub = clusters[node_num+1] - clusters[node_num]; // printf("NUM: %i %i %i %i\n",node_num, num_in_sub, clusters[node_num+1], clusters[node_num]); if (num_in_sub > 1) { Tree_fastal *tree_flat = ¶m_set->tree[start_write_here]; tree_flat->name = start_write_here +param_set->num_sequences; if (start_write_here == param_set->pos_tree) { ++param_set->pos_tree; } tree_flat->left = -1; tree_flat->right = -1; tree_flat->index = &subgroup[clusters[node_num]]; tree_flat->num_leafs = num_in_sub; return tree_flat->name; } else { return(subgroup[clusters[node_num]]); } } else { // printf("TREEPOS: %i\n",param_set->pos_tree); Tree_fastal *tree_flat = ¶m_set->tree[start_write_here]; tree_flat->name = start_write_here +param_set->num_sequences; if (start_write_here == param_set->pos_tree) { ++param_set->pos_tree; } start_write_here = param_set->pos_tree; int left = tree_process(tree->left, param_set, clusters, subgroup, start_write_here); start_write_here = param_set->pos_tree; int right = tree_process(tree->right, param_set, clusters, subgroup, start_write_here); tree_flat->index = NULL; tree_flat->right = right; tree_flat->left = left; return tree_flat->name; } } /** * \brief Calculates tree out of distance matrix. * * Calculates the upgma tree using a given distance matrix. * \param mat The distance matrix. * \param nseq Number of sequences. * \param fname Filename for temporary storage. * \param seqnames Names of the sequences. * \return The calculated UPGMA Tree. */ NT_node ** int_dist2upgma_tree_fastal (int **mat, int nseq, char *fname, char **seqnames) { NT_node *NL, T; int a, n, *used; int tot_node; if (upgma_node_heap (NULL)) { printf_exit ( EXIT_FAILURE,stderr, "\nERROR: non empty heap in upgma [FATAL]"); } NL=vcalloc (nseq, sizeof (NT_node)); for (a=0; aname, "%s", seqnames[a]); NL[a]->isseq=1; NL[a]->leaf=1; } used=vcalloc ( nseq, sizeof (int)); n=nseq; while (n>1) { T=upgma_merge(mat, NL,used, &n, nseq); } vfree (used); vfclose (print_tree (T, "newick", vfopen (fname, "w"))); upgma_node_heap (NULL); vfree (NL); return read_tree (fname,&tot_node, nseq, seqnames); } //Part_Tree /*! * \brief Constructs a guide tree for multiple sequence alignment. * * This algorithm is an implementation of the partTree algorithm (PartTree: an algorithm to build an approximate tree from a large number of unaligned * sequences. Katoh et al. 2007). * \param sequence_f Filename of file with sequences. * \param tree_f Filename of file where the tree will be stored. * \param ktup Size of the ktups. * \param subgroup Parameter for subgroupsize. */ void make_partTree(char *sequence_f, char *tree_f, int ktup, int subgroup, int is_dna, int retree) { long *file_positions = NULL; long **tmp1 = &file_positions; int *seq_lengths = NULL; int **tmp2 = &seq_lengths; int number_of_sequences; param_set = vcalloc(1,sizeof(PartTree_param)); param_set->ktup = ktup; param_set->subgroup = subgroup; Tree_fastal *tree; if (!retree) { //make index char *ktup_table = vtmpnam(NULL); param_set->num_sequences = number_of_sequences = make_pos_len_index_of_file(sequence_f, ktup_table, tmp1, tmp2, ktup, is_dna); tree = vcalloc(number_of_sequences-1,sizeof(Tree_fastal)); param_set->tree = tree; param_set->ktup_positions = file_positions; param_set->seq_lengths = seq_lengths; param_set->threshold = 0.01; param_set->ktup_table_f = fopen(ktup_table,"r"); partTree_r(param_set); } else { //make index param_set->num_sequences = number_of_sequences = make_pos_len_index_of_file_retree(sequence_f, tmp1, tmp2); tree = vcalloc(number_of_sequences-1,sizeof(Tree_fastal)); param_set->tree = tree; param_set->ktup_positions = file_positions; param_set->seq_lengths = seq_lengths; param_set->threshold = 0.01; param_set->ktup_table_f = fopen(sequence_f,"r"); partTree_retree(param_set); } FILE * tree_file = fopen(tree_f,"w"); print_fastal_tree(tree, 0, tree_file, number_of_sequences); fclose(tree_file); vfree(tree); // exit(1); } /** * Filters seed set. * * \param sequence_group Sequences to filter. * \param dist_mat The distance matrix. * \param seed_set_cleaned ordered_seed_set. * \param param_set Parameters for PartTree algorithm. * \return number in the filtered set. */ int filter(int *sequence_group, int **dist_mat, int *seed_set_cleaned, PartTree_param *param_set) { int i, j; int num_in_subgroup = param_set->subgroup; int *seq_lengths = param_set->seq_lengths; int num_in_clean = 0; double threshold = param_set->threshold; // printf("threshold: %f\n", threshold); double min; for (i = 0; i < num_in_subgroup; ++i) { if (!seed_set_cleaned[i]) continue; for (j = i+1; j < num_in_subgroup; ++j) { min = MIN(seq_lengths[sequence_group[i]], seq_lengths[sequence_group[j]]); // printf("MINIMUM: %i\n",min); min = (threshold * min); // printf("MINIMUM: %f\n",min); if (seed_set_cleaned[j] &&(dist_mat[i][j] < min)) { if (seq_lengths[sequence_group[i]] < seq_lengths[sequence_group[j]]) { seed_set_cleaned[i] = 0; break; } else seed_set_cleaned[j] = 0; } } } for (i = 0; i < num_in_subgroup; ++i) { num_in_clean += seed_set_cleaned[i]; } int max = num_in_subgroup -1; i = 0; int tmp; // printf("CLEAN: %i\n", num_in_clean); while (i < num_in_clean) { if (seed_set_cleaned[i]) { ++i; } else { seed_set_cleaned[i] = seed_set_cleaned[max]; seed_set_cleaned[max] = 0; tmp = sequence_group[i]; sequence_group[i] = sequence_group[max]; sequence_group[max] = tmp; --max; } } return num_in_clean; } /*! * \brief Function to create a tree using the PartTree algorithm. * * \param param_set A \a PartTree_param object containing all necessary parameters and the data. * \return The node_number. */ void partTree_r(PartTree_param *param_set) { int num_of_tree_nodes = param_set->num_sequences-1; int loop_tree_node; Tree_fastal *tree = param_set->tree; // int this_node = param_set->pos_tree; int i; int tsize = param_set->tsize; //get some memory short *table1 = vcalloc(tsize, sizeof(short)); short *table2 = vcalloc(tsize, sizeof(short)); char **names = declare_char(param_set->subgroup, 8); int **dist_mat = declare_int(param_set->subgroup, param_set->subgroup); int **dist_mat2 = declare_int(param_set->subgroup, param_set->subgroup); char * file_name_tmp = vtmpnam(NULL); int *seed_set_cleaned = vcalloc(param_set->subgroup, sizeof(int)); FILE *table_f = param_set->ktup_table_f; long *file_positions = param_set->ktup_positions; int max_n_group = param_set->subgroup; int num_in_subgroup = param_set->subgroup; int *seq_lengths = param_set->seq_lengths; int *clusters = vcalloc(param_set->subgroup+1, sizeof(int)); int *min_dist = vcalloc(param_set->num_sequences, sizeof(int)); int *belongs_to = vcalloc(param_set->num_sequences, sizeof(int)); //Prepare first node tree[0].index = vcalloc(param_set->num_sequences,sizeof(int)); int *index = tree[0].index; for (i = 0; i< param_set->num_sequences; ++i) index[i] = i; tree[0].name = param_set->pos_tree +param_set->num_sequences; tree[0].num_leafs = param_set->num_sequences; int *sequence_group2 = vcalloc(param_set->num_sequences,sizeof(int)); Tree_fastal *current_node; for (loop_tree_node = 0; loop_tree_node < num_of_tree_nodes; ++loop_tree_node) { // printf("ROUND: %i\n", loop_tree_node); current_node = &tree[loop_tree_node]; index= current_node->index; if (current_node->index == NULL) { continue; } int num_sequences = current_node->num_leafs; //if number of sequences in this group smaller than number subgoup size: make tree, finisch if (num_sequences <= max_n_group) { dist_mat = make_distance_matrix(table_f, file_positions, index, num_sequences, dist_mat); for (i = 0; i < num_sequences; ++i) { sprintf(names[i],"%i", current_node->index[i]); } NT_node **tree= (int_dist2upgma_tree_fastal (dist_mat, num_sequences, file_name_tmp , names)); tree_process_simple(tree[0][0], param_set,loop_tree_node); continue; } for (i = 0; i < num_in_subgroup; ++i) { seed_set_cleaned[i] = 1; } //finde longest sequence and put into the first field int index_longest = 0; int length_of_longest = 0; for(i = 0; i < num_sequences; ++i) { if (seq_lengths[index[i]] > length_of_longest) { index_longest = i; length_of_longest = seq_lengths[index[i]]; } } int tmp = index[index_longest]; index[index_longest] = index[0]; index[0] = tmp; //distance of longest to rest int seq_index = 1; int min= euclidean_dist(table_f, file_positions[index[0]], file_positions[index[1]], table1, table2, param_set->tsize); for (i = 2; i < num_sequences; ++i) { tmp = euclidean_dist_half(table_f, file_positions[index[i]], table1, table2, param_set->tsize); if (tmp < min) { min = tmp; seq_index = i; } } //get the new seed_set in the first n spaces tmp = index[1]; index[1] = index[seq_index]; index[seq_index] = tmp; int r,j; num_in_subgroup = param_set->subgroup; for (i = 2; i < num_in_subgroup; ++i) { r = i + rand() / ( RAND_MAX / ( num_sequences-i) + 1 ); // printf("RANDOM: %i\n",r); tmp = index[r]; index[r] = index[i]; index[i] = tmp; } //Calculate matrix dist_mat = make_distance_matrix(table_f, file_positions, index, param_set->subgroup, dist_mat); //Filter out sequences that are to similar & reorder NT_node **upgma_tree; int num_in_clean = filter(index, dist_mat, seed_set_cleaned, param_set); if (num_in_clean ==1) { num_in_clean = 2; seed_set_cleaned[1] = 1; } //make_tree int col = 0; int row = 0; for (i = 0; i < num_in_subgroup; ++i) { if (seed_set_cleaned[i]) { row = col+1; for (j = i+1; j < num_in_subgroup; ++j) { if (seed_set_cleaned[j]) { dist_mat2[row][col] = dist_mat2[col][row] = dist_mat[i][j]; ++row; } } ++col; } } for (i = 0; i < num_in_clean; ++i) { sprintf(names[i],"%i",i); } upgma_tree= (int_dist2upgma_tree_fastal (dist_mat2, num_in_clean, file_name_tmp , names)); //cluster //calculate distances from n' to N get_table(table1, table_f, file_positions[index[0]]); for (j = num_in_clean; j < num_sequences; ++j) { min_dist[j] = euclidean_dist_half(table_f, file_positions[index[j]], table1, table2, param_set->tsize); belongs_to[j] = 0; } for(i = 1; i < num_in_clean; ++i) { get_table(table1, table_f, file_positions[index[i]]); belongs_to[i] = i; for (j = num_in_clean; j < num_sequences; ++j) { tmp = euclidean_dist_half(table_f, file_positions[index[j]], table1, table2, param_set->tsize); if (tmp < min_dist[j]) { min_dist[j] = tmp; belongs_to[j] = i; } } } //how_many sequences has each cluster for (j = 0; j <= num_in_subgroup; ++j) { clusters[j] = 0; } for (j = 0; j < num_sequences; ++j) { ++clusters[belongs_to[j]]; } // for (j = 0; j <= num_in_subgroup; ++j) // { // printf("CL: %i ",clusters[j]); // } // printf("\n"); for(i = 1; i < num_in_clean; ++i) { clusters[i] += clusters[i-1]; } clusters[num_in_clean] = clusters[num_in_clean-1]; for (i = 0; i < num_sequences; ++i) { sequence_group2[--clusters[belongs_to[i]]] = index[i]; } for (i = 0; i < num_sequences; ++i) { index[i] = sequence_group2[i]; } for (i = 0; i < num_in_clean; ++i) { sprintf(names[i],"%i",i); } tree_process(upgma_tree[0][0], param_set, clusters, index, loop_tree_node); NT_node tmp_tree = upgma_tree[3][0]; vfree(upgma_tree[0]); vfree(upgma_tree[1]); vfree(upgma_tree[2]); vfree(upgma_tree[3]); vfree(upgma_tree); free_tree(tmp_tree); } vfree(min_dist); vfree(belongs_to); vfree(clusters); } /*! * \brief Makes the distance matrix between all sequences. * * \param table_file File with the ktup tables * \param file_positions Index of positions where the tabels are stored in \a table_file * \param sequence_group the group of sequences * \param number number of sequences * \param dist_mat distance matrix * \return the distance matrix. (same as \a dist_mat ) */ int ** make_distance_matrix(FILE *table_f, long *file_positions, int *sequence_group, int number, int **dist_mat) { static short *table1 = NULL; static short *table2; int tsize = param_set->tsize; if (table1 == NULL) { table1 = vcalloc(tsize, sizeof(short)); table2 = vcalloc(tsize, sizeof(short)); } int i, j, num = number-1; for (i = 0; i < num; ++i) { j = i+1; dist_mat[i][j] = dist_mat[j][i]= euclidean_dist(table_f, file_positions[sequence_group[i]], file_positions[sequence_group[j]], table1, table2, tsize); ++j; for (; j < number; ++j) { dist_mat[i][j] = dist_mat[j][i] = euclidean_dist_half(table_f, file_positions[sequence_group[j]], table1, table2, tsize); } } return dist_mat; } int ** make_distance_matrix_sim(FILE *aln_f, long *file_positions, int *sequence_group, int number, int **dist_mat) { static char *seq1 = NULL; static char *seq2; char line[500]; if (seq1 == NULL) { int length = 0; int i; fseek(aln_f, file_positions[0], SEEK_SET); fgets(line, 500, aln_f); while (line[0] != '>') { i = -1; while ((line[++i] != '\n') && (line[i] != '\0')); length += i; fgets(line, 500, aln_f); } seq1 = vcalloc(length+1, sizeof(short)); seq2 = vcalloc(length+1, sizeof(short)); } int i, j, num = number-1, pos; for (i = 0; i < num; ++i) { fseek(aln_f, file_positions[sequence_group[i]], SEEK_SET); pos = -1; while ((fgets(line, 500, aln_f) != NULL) && (line[0] != '>')) { int l = -1; while ((line[++l] != '\n') && (line[l] != '\0')) seq1[++pos] = line[l]; } seq1[++pos] = '\0'; for (j = i+1; j < number; ++j) { pos = -1; fseek(aln_f, file_positions[sequence_group[j]], SEEK_SET); while ((fgets(line, 500, aln_f) != NULL) && (line[0] != '>')) { int l = -1; while ((line[++l] != '\n') && (line[l] != '\0')) seq2[++pos] = line[l]; } seq2[++pos] = '\0'; dist_mat[i][j] = dist_mat[j][i] = 100 - fast_aln2sim_mat2(seq1, seq2); } } return dist_mat; } /** * Replaces the coded sequence with coded tuples * * \param coded_seq The coded sequence which will be replaced by the tuple number * \param ktup Size of the ktup * \param ng Coded alphabet size * \param length Lengths of coded sequence */ void makepointtable_fast(int *coded_seq, //sequence int ktup, //ktup size int ng, //hmm... int length) //length of coded_seq { int point, a; register int *p; static int *prod; if (!prod) { prod=vcalloc ( ktup, sizeof (int)); for ( a=0; a 0) fprintf(tables_f, "%i %i\n", point, table[point]); } fprintf(tables_f, "*\n"); } /** JUST FOR TEST */ void make_fast_tree(char *file_name, int n, int ktup) { make_partTree(file_name, "TREE_OUT", ktup, n, 1, 0); } /** * \brief Reads ktup_table from file * * \param table Table to save the file content in. * \param tables_f File in which the tables are stored. * \param index Position of the table in \a tables_f */ void get_table(short *table, //Table to save the readings in FILE* tables_f, //File with tables long index) //index positin of ktup-tables { fseek(tables_f, index, SEEK_SET); const int LINE_LENGTH = 101; char line[LINE_LENGTH]; fgets(line, LINE_LENGTH, tables_f); char delims[] = " "; char *result = NULL; int code; while (line[0] != '*') { result = strtok( line, delims ); code = atoi(result); table[code] = atoi(strtok( NULL, delims)); fgets(line, LINE_LENGTH, tables_f); } } /** * \brief calculates the euclidean ktub distance between two sequences * * @param ktup_f, ktup_file * @param pos1 position of sequence 1 in \a ktup_f * @param pos2 position of sequence 2 in \a ktup_f * @param table1 Saves the number of occurences for each ktup in sequence 1 * @param table2 Saves the number of occurences for each ktup in sequence 2 */ int euclidean_dist(FILE* ktup_f, //ktup_file long pos1, //position of table1 long pos2, //position of table2 short *table1, //table to save ktups in short *table2, //table to save ktups in int length) { const int LINE_LENGTH = 101; char line[LINE_LENGTH]; char delims[] = " "; char *result = NULL; int code; fseek(ktup_f, pos1, SEEK_SET); fgets(line, LINE_LENGTH, ktup_f); int i; for (i = 0; i < length; ++i) { table1[i] = 0; table2[i] = 0; } while (line[0] != '*') { result = strtok( line, delims ); code = atoi(result); table1[code] = atoi(strtok( NULL, delims)); fgets(line, LINE_LENGTH, ktup_f); } fseek(ktup_f, pos2, SEEK_SET); fgets(line, LINE_LENGTH, ktup_f); while (line[0] != '*') { result = strtok( line, delims ); code = atoi(result); table2[code] = atoi(strtok( NULL, delims)); fgets(line, LINE_LENGTH, ktup_f); } int dist = 0; for (i = 0; i < length; ++i) { dist += (table1[i]-table2[i])*(table1[i]-table2[i]); } return dist; } /** * \brief calculates the euclidean ktub distance between two sequences. * * The difference to \a euclidean_dist is, that this uses the ktups stored in \a table1 * @param ktup_f, ktup_file * @param pos2 position of sequence 2 in \a ktup_f * @param table1 Saves the number of occurences for each ktup in sequence 1 * @param table2 Saves the number of occurences for each ktup in sequence 2 * \see euclidean_dist */ int euclidean_dist_half(FILE* ktup_f, //ktup_file long pos2, //position of table1 short *table1, //table to save ktups in short *table2, //table to save ktups in int length) { const int LINE_LENGTH = 101; char line[LINE_LENGTH]; char delims[] = " "; char *result = NULL; int code; fseek(ktup_f, pos2, SEEK_SET); fgets(line, LINE_LENGTH, ktup_f); int i; for (i = 0; i < length; ++i) { table2[i] = 0; } while (line[0] != '*') { result = strtok( line, delims ); code = atoi(result); table2[code] = atoi(strtok( NULL, delims)); fgets(line, LINE_LENGTH, ktup_f); } int dist = 0; for (i = 0; i < length; ++i) { dist += (table1[i]-table2[i])*(table1[i]-table2[i]); } return dist; } /** * Makes an index of a file */ int make_pos_len_index_of_file(char *file_name, //file with sequences char *ktable_f, //file with the ktup-tables long **file_positions, //array to save the positions int **seq_lengths, //array to save the sequence length int ktup, //length of ktup int is_dna) //type of the seuqence { //preparations for recoding sequence int *aa; int a, b; int ng = 0; char **gl; if (is_dna) { gl=declare_char (5,13); sprintf ( gl[ng++], "Aa"); sprintf ( gl[ng++], "Gg"); sprintf ( gl[ng++], "TtUu"); sprintf ( gl[ng++], "Cc"); sprintf ( gl[ng++], "NnRrYyDdMmWw"); } else { gl=make_group_aa ( &ng, "mafft"); } aa=vcalloc ( 256, sizeof (int)); for ( a=0; atsize = tsize; param_set->ng = ng; int *table=vcalloc ( tsize,sizeof (int)); //Reading and recoding squences const int LINE_LENGTH = 501; int *coded_seq = vcalloc(2*LINE_LENGTH, sizeof(int)); int allocated_mem = 2*LINE_LENGTH; (*file_positions) = vcalloc(ENLARGEMENT_PER_STEP, sizeof(long)); (*seq_lengths) = vcalloc(ENLARGEMENT_PER_STEP, sizeof(int)); FILE *file = fopen(file_name,"r"); char line[LINE_LENGTH]; int num_of_sequences = 0; int str_len = 0; int mem_for_pos = ENLARGEMENT_PER_STEP; int real_len; int *c_seq; FILE *tables_f = fopen(ktable_f, "w"); if (file == NULL) { printf("FILE NOT FOUND\n"); exit(1); } else { while(fgets(line, LINE_LENGTH , file)!=NULL) { if ( str_len >= allocated_mem - LINE_LENGTH) { allocated_mem += LINE_LENGTH; coded_seq = vrealloc(coded_seq, allocated_mem*sizeof(int)); } int i; if (line[0] == '>') { if (num_of_sequences >0) { (*seq_lengths)[num_of_sequences-1] = str_len; // printf("len: %i\n", str_len); c_seq = coded_seq; makepointtable_fast(coded_seq,ktup,ng, str_len); (*file_positions)[num_of_sequences-1] = ftell(tables_f ); for (i=0; i < tsize; ++i) table[i] = 0; makecompositiontable_fastal(tables_f, table, coded_seq,tsize ); } str_len = 0; ++num_of_sequences; if (num_of_sequences == mem_for_pos) { mem_for_pos += ENLARGEMENT_PER_STEP; (*file_positions) = vrealloc((*file_positions), mem_for_pos * sizeof(long)); (*seq_lengths) = vrealloc((*seq_lengths), mem_for_pos * sizeof(int)); } } else { int i; real_len = strlen(line); if (line[real_len-1] == '\n') --real_len; for (i = 0; i < real_len; ++i) { coded_seq[str_len++] = aa[(short)line[i]]; } } } } (*seq_lengths)[num_of_sequences-1] = str_len; c_seq = coded_seq; makepointtable_fast(coded_seq,ktup,ng, str_len); (*file_positions)[num_of_sequences] = ftell(tables_f ); makecompositiontable_fastal(tables_f, table, coded_seq,tsize ); fclose(file); fclose(tables_f); return num_of_sequences; } /** * Makes an index of a file */ int make_pos_len_index_of_file_retree(char *file_name, //file with sequences long **file_positions, //array to save the positions int **seq_lengths) //array to save the sequence length { // printf("HALLO\n"); //Reading sequences const int LINE_LENGTH = 501; (*file_positions) = vcalloc(ENLARGEMENT_PER_STEP, sizeof(long)); (*seq_lengths) = vcalloc(ENLARGEMENT_PER_STEP, sizeof(int)); FILE *file = fopen(file_name,"r"); char line[LINE_LENGTH]; int num_of_sequences = 0; int mem_for_pos = ENLARGEMENT_PER_STEP; // fgets(line, LINE_LENGTH , file) int seq_len = 0; if (file == NULL) { printf("FILE NOT FOUND\n"); exit(1); } else { int i; while(fgets(line, LINE_LENGTH , file)!=NULL) { // line[LINE_LENGTH -2] = '\n'; if (line[0] == '>') { (*file_positions)[num_of_sequences] = ftell(file); if (num_of_sequences >0) { (*seq_lengths)[num_of_sequences-1] = seq_len; seq_len = 0; } ++num_of_sequences; if (num_of_sequences == mem_for_pos) { mem_for_pos += ENLARGEMENT_PER_STEP; (*file_positions) = vrealloc((*file_positions), mem_for_pos * sizeof(long)); (*seq_lengths) = vrealloc((*seq_lengths), mem_for_pos * sizeof(int)); } } else { i = -1; while ((line[++i] != '\n') && (line[i] != '\0')) { if (line[i] != '-') { // printf("A: %c\n", line[i]); ++seq_len; } } } } } (*seq_lengths)[num_of_sequences-1] = seq_len; // printf("%i %li\n", (*seq_lengths)[0], (*file_positions)[0]); // printf("%i %li\n", (*seq_lengths)[1], (*file_positions)[1]); fclose(file); return num_of_sequences; } int logid_score2 ( int sim, int len) { float score; if ( len==0)return (int)(0.33*(float)MAXID); score=(float)sim/(float)len; if (score>0.9) score=1.0; else score=-log10 (1.0-score); score=(score*MAXID); return score; } int fast_aln2sim_mat2 (char *seq1, char *seq2) { int r1, r2; int len = 0; int sim = 0; int c = -1; int simm=100; // printf("SEQ: %s %s\n", seq1, seq2); while (seq1[++c] != '\0') { r1=tolower (seq1[c]); r2=tolower (seq2[c]); if ((r1 == '-') && (r2 == '-')) continue; if (r1==r2) sim++; len++; } simm = logid_score2 ( sim, len); return simm; } /*! * \brief Function to create a tree using the PartTree algorithm. * * \param param_set A \a PartTree_param object containing all necessary parameters and the data. * \return The node_number. */ void partTree_retree(PartTree_param *param_set) { int num_of_tree_nodes = param_set->num_sequences-1; int loop_tree_node; Tree_fastal *tree = param_set->tree; // int this_node = param_set->pos_tree; int i; // int tsize = param_set->tsize; //get some memory // short *table1 = vcalloc(tsize, sizeof(short)); // short *table2 = vcalloc(tsize, sizeof(short)); char **names = declare_char(param_set->subgroup, 8); int **dist_mat = declare_int(param_set->subgroup, param_set->subgroup); int **dist_mat2 = declare_int(param_set->subgroup, param_set->subgroup); char * file_name_tmp = vtmpnam(NULL); int *seed_set_cleaned = vcalloc(param_set->subgroup, sizeof(int)); FILE *aln_f = param_set->ktup_table_f; long *file_positions = param_set->ktup_positions; int max_n_group = param_set->subgroup; int num_in_subgroup = param_set->subgroup; int *seq_lengths = param_set->seq_lengths; int *clusters = vcalloc(param_set->subgroup+1, sizeof(int)); int *min_dist = vcalloc(param_set->num_sequences, sizeof(int)); int *belongs_to = vcalloc(param_set->num_sequences, sizeof(int)); int aln_length = 1; fseek(aln_f, file_positions[0], SEEK_SET); char line[500]; while ((fgets(line, 500, aln_f) != NULL) && (line[0] != '>')) { i = -1; while ((line[++i] != '\n') && (line[i] != '\0')); aln_length += i; } char *seq1 = vcalloc(aln_length, sizeof(char)); char *seq2 = vcalloc(aln_length, sizeof(char)); //Prepare first node tree[0].index = vcalloc(param_set->num_sequences,sizeof(int)); int *index = tree[0].index; for (i = 0; i< param_set->num_sequences; ++i) index[i] = i; tree[0].name = param_set->pos_tree +param_set->num_sequences; tree[0].num_leafs = param_set->num_sequences; int *sequence_group2 = vcalloc(param_set->num_sequences,sizeof(int)); // Tree_fastal *current_node; for (loop_tree_node = 0; loop_tree_node < num_of_tree_nodes; ++loop_tree_node) { // printf("ROUND: %i\n", loop_tree_node); current_node = &tree[loop_tree_node]; index= current_node->index; if (current_node->index == NULL) { continue; } int num_sequences = current_node->num_leafs; //if number of sequences in this group smaller than number subgoup size: make tree, finisch if (num_sequences <= max_n_group) { dist_mat = make_distance_matrix_sim(aln_f, file_positions, index, num_sequences, dist_mat); for (i = 0; i < num_sequences; ++i) { sprintf(names[i],"%i", current_node->index[i]); } NT_node **tree= (int_dist2upgma_tree_fastal (dist_mat, num_sequences, file_name_tmp , names)); tree_process_simple(tree[0][0], param_set,loop_tree_node); continue; } for (i = 0; i < num_in_subgroup; ++i) { seed_set_cleaned[i] = 1; } //finde longest sequence and put into the first field int index_longest = 0; int length_of_longest = 0; for(i = 0; i < num_sequences; ++i) { if (seq_lengths[index[i]] > length_of_longest) { index_longest = i; length_of_longest = seq_lengths[index[i]]; } } int tmp = index[index_longest]; index[index_longest] = index[0]; index[0] = tmp; //distance of longest to rest int seq_index = 1; read_sequence_from_position(aln_f, file_positions[index[0]], seq1); int min = -1; for (i = 1; i < num_sequences; ++i) { read_sequence_from_position(aln_f, file_positions[index[1]], seq2); tmp = 100 - fast_aln2sim_mat2(seq1, seq2); if (tmp < min) { min = tmp; seq_index = i; } } //get the new seed_set in the first n spaces tmp = index[1]; index[1] = index[seq_index]; index[seq_index] = tmp; int r,j; num_in_subgroup = param_set->subgroup; for (i = 2; i < num_in_subgroup; ++i) { r = i + rand() / ( RAND_MAX / ( num_sequences-i) + 1 ); tmp = index[r]; index[r] = index[i]; index[i] = tmp; } //Calculate matrix dist_mat = make_distance_matrix_sim(aln_f, file_positions, index, param_set->subgroup, dist_mat); // // //Filter out sequences that are to similar & reorder // NT_node **upgma_tree; int num_in_clean = filter(index, dist_mat, seed_set_cleaned, param_set); if (num_in_clean ==1) { num_in_clean = 2; seed_set_cleaned[1] = 1; } //make_tree int col = 0; int row = 0; for (i = 0; i < num_in_subgroup; ++i) { if (seed_set_cleaned[i]) { row = col+1; for (j = i+1; j < num_in_subgroup; ++j) { if (seed_set_cleaned[j]) { dist_mat2[row][col] = dist_mat2[col][row] = dist_mat[i][j]; ++row; } } ++col; } } for (i = 0; i < num_in_clean; ++i) { sprintf(names[i],"%i",i); } upgma_tree= (int_dist2upgma_tree_fastal (dist_mat2, num_in_clean, file_name_tmp , names)); // //cluster // //calculate distances from n' to N read_sequence_from_position(aln_f, file_positions[index[0]], seq1); // get_table(table1, table_f, file_positions[index[0]]); for (j = num_in_clean; j < num_sequences; ++j) { read_sequence_from_position(aln_f, file_positions[index[j]], seq2); min_dist[j] = 100 - fast_aln2sim_mat2(seq1, seq2); // min_dist[j] = euclidean_dist_half(table_f, file_positions[index[j]], table1, table2, param_set->tsize); belongs_to[j] = 0; } for(i = 1; i < num_in_clean; ++i) { read_sequence_from_position(aln_f, file_positions[index[0]], seq1); // get_table(table1, table_f, file_positions[index[i]]); belongs_to[i] = i; for (j = num_in_clean; j < num_sequences; ++j) { read_sequence_from_position(aln_f, file_positions[index[j]], seq2); tmp = 100 - fast_aln2sim_mat2(seq1, seq2); // tmp = euclidean_dist_half(table_f, file_positions[index[j]], table1, table2, param_set->tsize); if (tmp < min_dist[j]) { min_dist[j] = tmp; belongs_to[j] = i; } } } // //how_many sequences has each cluster for (j = 0; j <= num_in_subgroup; ++j) { clusters[j] = 0; } for (j = 0; j < num_sequences; ++j) { ++clusters[belongs_to[j]]; } // for (j = 0; j <= num_in_subgroup; ++j) // { // printf("CL: %i ",clusters[j]); // } // printf("\n"); for(i = 1; i < num_in_clean; ++i) { clusters[i] += clusters[i-1]; } clusters[num_in_clean] = clusters[num_in_clean-1]; for (i = 0; i < num_sequences; ++i) { sequence_group2[--clusters[belongs_to[i]]] = index[i]; } for (i = 0; i < num_sequences; ++i) { index[i] = sequence_group2[i]; } for (i = 0; i < num_in_clean; ++i) { sprintf(names[i],"%i",i); } tree_process(upgma_tree[0][0], param_set, clusters, index, loop_tree_node); NT_node tmp_tree = upgma_tree[3][0]; vfree(upgma_tree[0]); vfree(upgma_tree[1]); vfree(upgma_tree[2]); vfree(upgma_tree[3]); vfree(upgma_tree); free_tree(tmp_tree); } // exit(1); vfree(min_dist); vfree(belongs_to); vfree(clusters); } /******************************COPYRIGHT NOTICE*******************************/ /*© Centro de Regulacio Genomica */ /*and */ /*Cedric Notredame */ /*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */ /*All rights reserved.*/ /*This file is part of T-COFFEE.*/ /**/ /* T-COFFEE is free software; you can redistribute it and/or modify*/ /* it under the terms of the GNU General Public License as published by*/ /* the Free Software Foundation; either version 2 of the License, or*/ /* (at your option) any later version.*/ /**/ /* T-COFFEE is distributed in the hope that it will be useful,*/ /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/ /* GNU General Public License for more details.*/ /**/ /* You should have received a copy of the GNU General Public License*/ /* along with Foobar; if not, write to the Free Software*/ /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/ /*............................................... |*/ /* If you need some more information*/ /* cedric.notredame@europe.com*/ /*............................................... |*/ /**/ /**/ /* */ /******************************COPYRIGHT NOTICE*******************************/