7 #include "io_lib_header.h"
8 #include "util_lib_header.h"
9 // #include "define_header.h"
10 // #include "dp_lib_header.h"
11 // #include "fastal_lib_header.h"
15 #include "fast_tree_header.h"
19 * \brief Source code for the fastal tree algorithm
26 main(int argc, char** argv)
28 int alphabet_size = 5;
30 if (alphabet_size == 5)
75 compute_oligomer_distance_tree(argv[1], &aa[0], argv[2], 5, 2, alphabet_size);
81 compute_oligomer_distance_tree(char *seq_file_name, int* char2value, char *tree_file_name, int max_dist, int word_length, int alphabet_size)
84 int *compare = vcalloc(1,sizeof(int));
85 int *num_seq = vcalloc(1,sizeof(int));
87 int num_features = (int)pow(alphabet_size,word_length);
89 num_features *= num_features * (max_dist+1);
92 Cluster_info *matrix = feature_extract(seq_file_name, max_dist, word_length, char2value, alphabet_size, compare, num_seq, num_features);
94 int *node = vcalloc(1, sizeof(int));
97 FILE *tree_f = fopen(tree_file_name,"w");
98 double *mean = vcalloc(num_features, sizeof(double));
99 double *variance = vcalloc(num_features, sizeof(double));
100 cluster_recursive(0, *num_seq-1, matrix, num_features, mean, variance, compare, tree_f, node);
102 for (i = 0; i < *num_seq; ++i)
104 // printf("ERG: %i\n", i);
105 vfree(matrix[i].features);
119 * Recodes a given sequence into a sequence of it's k-mers.
121 * \param seq The sequence to encode.
122 * \param seq_length The length of \a seq.
123 * \param char2value Array giving each character in seq a unique value.
124 * \param alphabet_size The size of the alphabet.
125 * \param word_length The length of the k-mers to use.
127 * \return The recodes sequence.
130 recode_sequence(char * seq, int seq_length, int *char2value, int alphabet_size, int word_length)
133 int *recoded = vcalloc(seq_length - word_length + 1, sizeof(int));
136 if (word_length == 1)
138 for (i = 0; i < seq_length; ++i)
140 recoded[i] = char2value[(short)seq[i]];
145 int *prod=vcalloc (word_length, sizeof(int));
146 for ( i=0; i<word_length; ++i)
148 prod[word_length-i-1]=(int)pow(alphabet_size,i);
152 for (i = 0; i < word_length; ++i)
154 index += char2value[(short)seq[i]] * prod[i];
158 for (; i < seq_length; ++i)
160 index -= char2value[(short)seq[z]] * prod[0];
161 index *= alphabet_size;
162 index += char2value[(short)seq[i]];
163 recoded[++z] = index;
172 * Extracts the features from a given sequence.
174 * \param recoded_seq The recoded sequence, consisting of k-mer codes.
175 * \param seq_length The length of \a recoded_seq.
176 * \param k_tup Length of the k-mers.
177 * \param max_coding The maximal number used for coding the alphabet.
178 * \param max_dist The maximal distance to use between to k-mers.
180 * \return The feature vector of the sequence.
183 get_features(int *recoded_seq, int seq_length, int k_tup, int max_coding, int max_dist, int num_features)
187 int vector_length = num_features;//max_coding * max_coding*(max_dist+1);
189 double *feature_vector = vcalloc(vector_length, sizeof(double));
191 for (i = 0; i < vector_length; ++i)
193 feature_vector[i] = 0;
195 int j, max_length, indJ, indK, indDist;
196 for (i = 0; i < seq_length; ++i)
198 if (seq_length <= i + max_dist)
199 max_length = seq_length-1;
201 max_length = i + max_dist;
202 for (j = i; j <= max_length; ++j)
205 indJ = max_coding * (max_dist+1) * recoded_seq[i];
206 indK = (max_dist+1) * recoded_seq[j];
208 ++feature_vector[indJ + indK + indDist];
212 double normalize = 0;
213 for (i = 0; i < vector_length; ++i)
215 normalize += (feature_vector[i] * feature_vector[i]);
217 normalize = sqrt(normalize);
218 for (i = 0; i < vector_length; ++i)
220 feature_vector[i] /= normalize;
223 return feature_vector;
228 * Calculates the feature values for all sequences in a file.
230 * The File has to have fasta format.
231 * \param seq_file_name The name of the file (in fasta format).
232 * \param max_dist The maximal distance to be considered.
233 * \param k_tup Size of the k_tup.
234 * \param alphabet_size The size of the alphabet.
236 * \return The feature values for every sequence in a Cluster_info object.
239 feature_extract(char *seq_file_name, int max_dist, int k_tup, int *char2value, int alphabet_size, int * elem_2_compare, int *num_seq_p, int num_features)
242 FILE *tree_f = fopen(seq_file_name,"r");
243 // fgets(line, 500, tree_f);
246 int matrix_size = 1000;
247 const int STEP = 1000;
249 char *seq = vcalloc(size, sizeof(char));
250 Cluster_info *matrix = vcalloc(matrix_size, sizeof(Cluster_info));
252 int max_coding = pow(alphabet_size,k_tup);
253 while (fgets(line, 500, tree_f) != NULL)
259 seq[++seq_pos] = '\0';
260 if (num_seq > matrix_size -2)
263 matrix = vrealloc(matrix, matrix_size * sizeof(Cluster_info));
265 int *recoded_seq = recode_sequence(seq, seq_pos, char2value, alphabet_size, k_tup);
266 matrix[num_seq].seq_number = num_seq;
267 matrix[num_seq].elem_2_compare = elem_2_compare;
268 matrix[num_seq].features=get_features(recoded_seq, seq_pos- k_tup +1, k_tup, max_coding, max_dist, num_features);
276 if (size - seq_pos < 500)
279 seq = vrealloc(seq, size*sizeof(char));
282 while ((*c_p != '\n') && (*c_p != '\0'))
284 seq[++seq_pos] = toupper(*(c_p));
291 seq[++seq_pos] = '\0';
292 int *recoded_seq = recode_sequence(seq, seq_pos, char2value, alphabet_size, k_tup);
293 matrix[num_seq].seq_number = num_seq;
294 matrix[num_seq].elem_2_compare = elem_2_compare;
295 matrix[num_seq].features=get_features(recoded_seq, seq_pos- k_tup +1, k_tup, max_coding, max_dist, num_features);
296 *num_seq_p = ++num_seq;
305 * Compare function for qsort given an array of Cluster_info.
307 * The field used for sorting is done according to the element determined in the Cluster_info object.
308 * \param a void pointer to an object of Cluster_info.
309 * \param b void pointer to an object of Cluster_info.
311 * \return Value showing wheather \a a is bigger, smaller or equal to \a b.
314 cluster_compare (const void * a, const void * b)
316 Cluster_info *tmp1 = (Cluster_info*)a;
317 Cluster_info *tmp2 = (Cluster_info*)b;
318 int elem = *(tmp1->elem_2_compare);
319 if ((tmp1->features[elem]) > (tmp2->features[elem]))
321 else if ((tmp1->features[elem]) < (tmp2->features[elem]))
330 * Recursive function to build a tree according to a given feature matrix.
332 * \param start The begin of this set in \a matrix.
333 * \param end The end of this set in \a matrix.
334 * \param matrix The feature matrix.
335 * \param dim The number of features.
336 * \param mean Array of size \a dim.
337 * \param variance Array of size \a dim.
338 * \param elem_2_compare Pointer to a single in. This will save the feature according to which shall be sorted (biggest variance).
339 * \param tree_f The file in which the tree will be written.
340 * \param node The current node.
342 * \return The number of the current node.
345 cluster_recursive(int start,
347 Cluster_info* matrix,
359 return matrix[start].seq_number;
362 //reset mean/variance
364 int num_seq = end - start +1;
365 for (i = 0; i < dim; ++i)
372 Cluster_info *start_p = &matrix[start]-1;
373 Cluster_info *end_p = &matrix[end];
375 while (start_p < end_p)
377 tmp = &(++start_p)->features[0];
378 for (i = 0; i < dim; ++i)
383 for (i = 0; i < dim; ++i)
388 start_p = &matrix[start] -1;
390 while (start_p < end_p)
392 tmp = &(++start_p)->features[0];
393 for (i = 0; i < dim; ++i)
395 variance[i] += ((*tmp) - mean[i]) * ((*tmp) - mean[i]);
400 //get maximal variance
401 double max = variance[0];
403 for (i = 1; i < dim; ++i)
405 if (variance[i] > max)
411 *elem_2_compare = index;
414 //devide into to different sets and start recursion on these child sets
415 qsort (&matrix[start], num_seq, sizeof(Cluster_info), cluster_compare);
416 int split = start + (end-start)/2;
417 int split_value = matrix[split].features[index];
418 while ((matrix[split+1].features[index] == split_value) && (split < end-1))
421 int left_child = cluster_recursive(start, split, matrix, dim, mean, variance, elem_2_compare, tree_f, node);
422 int right_child = cluster_recursive(split+1, end, matrix, dim, mean, variance, elem_2_compare, tree_f, node);
425 //print node into file
426 fprintf(tree_f, "%i %i %i\n", left_child, right_child, node2);
431 /******************************COPYRIGHT NOTICE*******************************/
432 /*© Centro de Regulacio Genomica */
434 /*Cedric Notredame */
435 /*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
436 /*All rights reserved.*/
437 /*This file is part of T-COFFEE.*/
439 /* T-COFFEE is free software; you can redistribute it and/or modify*/
440 /* it under the terms of the GNU General Public License as published by*/
441 /* the Free Software Foundation; either version 2 of the License, or*/
442 /* (at your option) any later version.*/
444 /* T-COFFEE is distributed in the hope that it will be useful,*/
445 /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
446 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
447 /* GNU General Public License for more details.*/
449 /* You should have received a copy of the GNU General Public License*/
450 /* along with Foobar; if not, write to the Free Software*/
451 /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
452 /*............................................... |*/
453 /* If you need some more information*/
454 /* cedric.notredame@europe.com*/
455 /*............................................... |*/
459 /******************************COPYRIGHT NOTICE*******************************/