1 /******************************COPYRIGHT NOTICE*******************************/
2 /* (c) Centro de Regulacio Genomica */
5 /* 12 Aug 2014 - 22:07. */
6 /*All rights reserved. */
7 /*This file is part of T-COFFEE. */
9 /* T-COFFEE is free software; you can redistribute it and/or modify */
10 /* it under the terms of the GNU General Public License as published by */
11 /* the Free Software Foundation; either version 2 of the License, or */
12 /* (at your option) any later version. */
14 /* T-COFFEE is distributed in the hope that it will be useful, */
15 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
16 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
17 /* GNU General Public License for more details. */
19 /* You should have received a copy of the GNU General Public License */
20 /* along with Foobar; if not, write to the Free Software */
21 /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
22 /*............................................... */
23 /* If you need some more information */
24 /* cedric.notredame@europe.com */
25 /*............................................... */
26 /******************************COPYRIGHT NOTICE*******************************/
33 #include "io_lib_header.h"
34 #include "util_lib_header.h"
35 // #include "define_header.h"
36 // #include "dp_lib_header.h"
37 // #include "fastal_lib_header.h"
41 #include "fastal_lib_header.h"
45 * \brief Source code for the fastal tree algorithm
52 main(int argc, char** argv)
54 int alphabet_size = 5;
56 if (alphabet_size == 5)
101 compute_oligomer_distance_tree(argv[1], &aa[0], argv[2], 5, 2, alphabet_size);
107 compute_oligomer_distance_tree(char *seq_file_name, int* char2value, char *tree_file_name, int max_dist, int word_length, int alphabet_size)
110 int *compare =(int*) vcalloc(1,sizeof(int));
111 int *num_seq =(int*) vcalloc(1,sizeof(int));
113 int num_features = (int)pow(alphabet_size,word_length);
115 num_features *= num_features * (max_dist+1);
118 Cluster_info *matrix = feature_extract(seq_file_name, max_dist, word_length, char2value, alphabet_size, compare, num_seq, num_features);
120 int *node =(int*) vcalloc(1, sizeof(int));
123 FILE *tree_f = fopen(tree_file_name,"w");
124 double *mean = (double*)vcalloc(num_features, sizeof(double));
125 double *variance = (double*)vcalloc(num_features, sizeof(double));
126 cluster_recursive(0, *num_seq-1, matrix, num_features, mean, variance, compare, tree_f, node);
128 for (i = 0; i < *num_seq; ++i)
130 // printf("ERG: %i\n", i);
131 vfree(matrix[i].features);
145 * Recodes a given sequence into a sequence of it's k-mers.
147 * \param seq The sequence to encode.
148 * \param seq_length The length of \a seq.
149 * \param char2value Array giving each character in seq a unique value.
150 * \param alphabet_size The size of the alphabet.
151 * \param word_length The length of the k-mers to use.
153 * \return The recodes sequence.
156 recode_sequence(char * seq, int seq_length, int *char2value, int alphabet_size, int word_length)
159 int *recoded =(int*) vcalloc(seq_length - word_length + 1, sizeof(int));
162 if (word_length == 1)
164 for (i = 0; i < seq_length; ++i)
166 recoded[i] = char2value[(short)seq[i]];
171 int *prod=(int*)vcalloc (word_length, sizeof(int));
172 for ( i=0; i<word_length; ++i)
174 prod[word_length-i-1]=(int)pow(alphabet_size,i);
178 for (i = 0; i < word_length; ++i)
180 index += char2value[(short)seq[i]] * prod[i];
184 for (; i < seq_length; ++i)
186 index -= char2value[(short)seq[z]] * prod[0];
187 index *= alphabet_size;
188 index += char2value[(short)seq[i]];
189 recoded[++z] = index;
198 * Extracts the features from a given sequence.
200 * \param recoded_seq The recoded sequence, consisting of k-mer codes.
201 * \param seq_length The length of \a recoded_seq.
202 * \param k_tup Length of the k-mers.
203 * \param max_coding The maximal number used for coding the alphabet.
204 * \param max_dist The maximal distance to use between to k-mers.
206 * \return The feature vector of the sequence.
209 get_features(int *recoded_seq, int seq_length, int k_tup, int max_coding, int max_dist, int num_features)
213 int vector_length = num_features;//max_coding * max_coding*(max_dist+1);
215 double *feature_vector =(double*) vcalloc(vector_length, sizeof(double));
217 for (i = 0; i < vector_length; ++i)
219 feature_vector[i] = 0;
221 int j, max_length, indJ, indK, indDist;
222 for (i = 0; i < seq_length; ++i)
224 if (seq_length <= i + max_dist)
225 max_length = seq_length-1;
227 max_length = i + max_dist;
228 for (j = i; j <= max_length; ++j)
231 indJ = max_coding * (max_dist+1) * recoded_seq[i];
232 indK = (max_dist+1) * recoded_seq[j];
234 ++feature_vector[indJ + indK + indDist];
238 double normalize = 0;
239 for (i = 0; i < vector_length; ++i)
241 normalize += (feature_vector[i] * feature_vector[i]);
243 normalize = sqrt(normalize);
244 for (i = 0; i < vector_length; ++i)
246 feature_vector[i] /= normalize;
249 return feature_vector;
254 * Calculates the feature values for all sequences in a file.
256 * The File has to have fasta format.
257 * \param seq_file_name The name of the file (in fasta format).
258 * \param max_dist The maximal distance to be considered.
259 * \param k_tup Size of the k_tup.
260 * \param alphabet_size The size of the alphabet.
262 * \return The feature values for every sequence in a Cluster_info object.
265 feature_extract(char *seq_file_name, int max_dist, int k_tup, int *char2value, int alphabet_size, int * elem_2_compare, int *num_seq_p, int num_features)
268 FILE *tree_f = fopen(seq_file_name,"r");
269 // fgets(line, 500, tree_f);
272 int matrix_size = 1000;
273 const int STEP = 1000;
275 char *seq = (char*)vcalloc(size, sizeof(char));
276 Cluster_info *matrix =(Cluster_info*) vcalloc(matrix_size, sizeof(Cluster_info));
278 int max_coding = pow(alphabet_size,k_tup);
279 while (fgets(line, 500, tree_f) != NULL)
285 seq[++seq_pos] = '\0';
286 if (num_seq > matrix_size -2)
289 matrix = (Cluster_info*)vrealloc(matrix, matrix_size * sizeof(Cluster_info));
291 int *recoded_seq = recode_sequence(seq, seq_pos, char2value, alphabet_size, k_tup);
292 matrix[num_seq].seq_number = num_seq;
293 matrix[num_seq].elem_2_compare = elem_2_compare;
294 matrix[num_seq].features=get_features(recoded_seq, seq_pos- k_tup +1, k_tup, max_coding, max_dist, num_features);
302 if (size - seq_pos < 500)
305 seq = (char*)vrealloc(seq, size*sizeof(char));
308 while ((*c_p != '\n') && (*c_p != '\0'))
310 seq[++seq_pos] = toupper(*(c_p));
317 seq[++seq_pos] = '\0';
318 int *recoded_seq = recode_sequence(seq, seq_pos, char2value, alphabet_size, k_tup);
319 matrix[num_seq].seq_number = num_seq;
320 matrix[num_seq].elem_2_compare = elem_2_compare;
321 matrix[num_seq].features=get_features(recoded_seq, seq_pos- k_tup +1, k_tup, max_coding, max_dist, num_features);
322 *num_seq_p = ++num_seq;
331 * Compare function for qsort given an array of Cluster_info.
333 * The field used for sorting is done according to the element determined in the Cluster_info object.
334 * \param a void pointer to an object of Cluster_info.
335 * \param b void pointer to an object of Cluster_info.
337 * \return Value showing wheather \a a is bigger, smaller or equal to \a b.
340 cluster_compare (const void * a, const void * b)
342 Cluster_info *tmp1 = (Cluster_info*)a;
343 Cluster_info *tmp2 = (Cluster_info*)b;
344 int elem = *(tmp1->elem_2_compare);
345 if ((tmp1->features[elem]) > (tmp2->features[elem]))
347 else if ((tmp1->features[elem]) < (tmp2->features[elem]))
356 * Recursive function to build a tree according to a given feature matrix.
358 * \param start The begin of this set in \a matrix.
359 * \param end The end of this set in \a matrix.
360 * \param matrix The feature matrix.
361 * \param dim The number of features.
362 * \param mean Array of size \a dim.
363 * \param variance Array of size \a dim.
364 * \param elem_2_compare Pointer to a single in. This will save the feature according to which shall be sorted (biggest variance).
365 * \param tree_f The file in which the tree will be written.
366 * \param node The current node.
368 * \return The number of the current node.
371 cluster_recursive(int start,
373 Cluster_info* matrix,
385 return matrix[start].seq_number;
388 //reset mean/variance
390 int num_seq = end - start +1;
391 for (i = 0; i < dim; ++i)
398 Cluster_info *start_p = &matrix[start]-1;
399 Cluster_info *end_p = &matrix[end];
401 while (start_p < end_p)
403 tmp = &(++start_p)->features[0];
404 for (i = 0; i < dim; ++i)
409 for (i = 0; i < dim; ++i)
414 start_p = &matrix[start] -1;
416 while (start_p < end_p)
418 tmp = &(++start_p)->features[0];
419 for (i = 0; i < dim; ++i)
421 variance[i] += ((*tmp) - mean[i]) * ((*tmp) - mean[i]);
426 //get maximal variance
427 double max = variance[0];
429 for (i = 1; i < dim; ++i)
431 if (variance[i] > max)
437 *elem_2_compare = index;
440 //devide into to different sets and start recursion on these child sets
441 qsort (&matrix[start], num_seq, sizeof(Cluster_info), cluster_compare);
442 int split = start + (end-start)/2;
443 int split_value = matrix[split].features[index];
444 while ((matrix[split+1].features[index] == split_value) && (split < end-1))
447 int left_child = cluster_recursive(start, split, matrix, dim, mean, variance, elem_2_compare, tree_f, node);
448 int right_child = cluster_recursive(split+1, end, matrix, dim, mean, variance, elem_2_compare, tree_f, node);
451 //print node into file
452 fprintf(tree_f, "%i %i %i\n", left_child, right_child, node2);