Mac binaries
[jabaws.git] / website / archive / binaries / mac / src / tcoffee / t_coffee_source / tree.c
1 #include "stdio.h"
2 #include "stdlib.h"
3 #include "math.h"
4 #include "ctype.h"
5
6
7 #include "io_lib_header.h"
8 #include "util_lib_header.h"
9 // #include "define_header.h"
10 // #include "dp_lib_header.h"
11 // #include "fastal_lib_header.h"
12
13
14
15 #include "fast_tree_header.h"
16
17 /*!
18  *      \file tree.c
19  *      \brief Source code for the fastal tree algorithm
20  */
21
22 /*
23
24
25 int
26 main(int argc, char** argv)
27 {
28         int alphabet_size = 5;
29         int aa[256];
30         if (alphabet_size == 5)
31         {
32                 aa['A'] = 0;
33                 aa['B'] = 1;
34                 aa['C'] = 1;
35                 aa['D'] = 0;
36                 aa['G'] = 2;
37                 aa['H'] = 0;
38                 aa['K'] = 3;
39                 aa['M'] = 1;
40                 aa['N'] = 0;
41                 aa['R'] = 0;
42                 aa['S'] = 1;
43                 aa['T'] = 3;
44                 aa['U'] = 4;
45                 aa['W'] = 3;
46                 aa['Y'] = 1;
47         }
48         else
49         {
50                 aa['A'] = 0;
51                 aa['B'] = 1;
52                 aa['C'] = 2;
53                 aa['D'] = 3;
54                 aa['E'] = 4;
55                 aa['F'] = 5;
56                 aa['G'] = 6;
57                 aa['H'] = 7;
58                 aa['I'] = 8;
59                 aa['J'] = 9;
60                 aa['K'] = 10;
61                 aa['L'] = 11;
62                 aa['M'] = 12;
63                 aa['N'] = 13;
64                 aa['P'] = 14;
65                 aa['Q'] = 15;
66                 aa['R'] = 16;
67                 aa['S'] = 17;
68                 aa['T'] = 18;
69                 aa['V'] = 19;
70                 aa['W'] = 20;
71                 aa['X'] = 21;
72                 aa['Y'] = 22;
73                 aa['Z'] = 23;
74         }
75         compute_oligomer_distance_tree(argv[1], &aa[0], argv[2], 5, 2, alphabet_size);
76         return 0;
77 }*/
78
79
80 void
81 compute_oligomer_distance_tree(char *seq_file_name, int* char2value, char *tree_file_name, int max_dist, int word_length, int alphabet_size)
82 {
83         int i;
84         int *compare = vcalloc(1,sizeof(int));
85         int *num_seq = vcalloc(1,sizeof(int));
86
87         int num_features = (int)pow(alphabet_size,word_length);
88         
89         num_features *= num_features * (max_dist+1);
90
91
92         Cluster_info *matrix  = feature_extract(seq_file_name, max_dist, word_length, char2value, alphabet_size, compare, num_seq, num_features);
93         
94         int *node = vcalloc(1, sizeof(int));
95         *node = *num_seq;
96         
97         FILE *tree_f = fopen(tree_file_name,"w");
98         double *mean = vcalloc(num_features, sizeof(double));
99         double *variance = vcalloc(num_features, sizeof(double));
100         cluster_recursive(0, *num_seq-1, matrix, num_features, mean, variance, compare, tree_f, node);
101         
102         for (i = 0; i < *num_seq; ++i)
103         {
104 //              printf("ERG: %i\n", i);
105                 vfree(matrix[i].features);
106         }
107         vfree(matrix);
108         fclose (tree_f);
109         vfree(mean);
110         vfree(variance);
111         vfree(compare);
112         vfree(node);
113         vfree(num_seq);
114 }
115
116
117
118 /**
119  * Recodes a given sequence into a sequence of it's k-mers.
120  *
121  * \param seq The sequence to encode.
122  * \param seq_length The length of \a seq.
123  * \param char2value Array giving each character in seq a unique value.
124  * \param alphabet_size The size of the alphabet.
125  * \param word_length The length of the k-mers to use.
126  * 
127  * \return The recodes sequence.
128  */
129 int*
130 recode_sequence(char * seq, int seq_length, int *char2value, int alphabet_size, int word_length)
131 {
132
133         int *recoded = vcalloc(seq_length - word_length + 1, sizeof(int));
134         int i;
135         
136         if (word_length == 1)
137         {
138                 for (i = 0; i < seq_length; ++i)
139                 {
140                         recoded[i] = char2value[(short)seq[i]];
141                 }
142         }
143         else
144         {
145                 int *prod=vcalloc (word_length, sizeof(int));
146                 for ( i=0; i<word_length; ++i)
147                 {
148                         prod[word_length-i-1]=(int)pow(alphabet_size,i);
149                 }
150
151                 int index = 0;
152                 for (i = 0; i < word_length; ++i)
153                 {
154                         index += char2value[(short)seq[i]] * prod[i];
155                 }
156                 recoded[0] = index;
157                 int z = 0;
158                 for (; i < seq_length; ++i)
159                 {
160                         index -= char2value[(short)seq[z]] * prod[0];
161                         index *= alphabet_size;
162                         index += char2value[(short)seq[i]];
163                         recoded[++z] = index;
164                 }
165         }
166
167         return recoded;
168 }
169
170
171 /**
172  * Extracts the features from a given sequence.
173  *
174  * \param recoded_seq The recoded sequence, consisting of k-mer codes.
175  * \param seq_length The length of \a recoded_seq.
176  * \param k_tup Length of the k-mers.
177  * \param max_coding The maximal number used for coding the alphabet.
178  * \param max_dist The maximal distance to use between to k-mers.
179  * 
180  * \return The feature vector of the sequence.
181  */
182 double *
183 get_features(int *recoded_seq, int seq_length, int k_tup, int max_coding, int max_dist, int num_features)
184 {
185
186         
187         int vector_length = num_features;//max_coding * max_coding*(max_dist+1);
188         
189         double *feature_vector = vcalloc(vector_length, sizeof(double));
190         int i;
191         for (i = 0; i < vector_length; ++i)
192         {
193                 feature_vector[i] = 0;
194         }
195         int j, max_length, indJ, indK, indDist;
196         for (i = 0; i < seq_length; ++i)
197         {
198                 if (seq_length <= i + max_dist)
199                         max_length = seq_length-1;
200                 else
201                         max_length = i + max_dist;
202                 for (j = i; j <= max_length; ++j)
203                 {
204                         
205                         indJ = max_coding * (max_dist+1) * recoded_seq[i];
206                         indK = (max_dist+1) * recoded_seq[j];
207                         indDist = j - i;
208                         ++feature_vector[indJ + indK + indDist];
209                 }
210         }
211
212         double normalize = 0;
213         for (i = 0; i < vector_length; ++i)
214         {
215                 normalize += (feature_vector[i] * feature_vector[i]);
216         }
217         normalize = sqrt(normalize);
218         for (i = 0; i < vector_length; ++i)
219         {
220                 feature_vector[i] /= normalize; 
221         }
222
223         return feature_vector;
224 }
225
226
227 /**
228  * Calculates the feature values for all sequences in a file.
229  *
230  * The File has to have fasta format.
231  * \param seq_file_name The name of the file (in fasta format).
232  * \param max_dist The maximal distance to be considered.
233  * \param k_tup Size of the k_tup.
234  * \param alphabet_size The size of the alphabet.
235  * 
236  * \return The feature values for every sequence in a Cluster_info object.
237  */
238 Cluster_info *
239 feature_extract(char *seq_file_name, int max_dist, int k_tup, int *char2value, int alphabet_size, int * elem_2_compare, int *num_seq_p, int num_features)
240 {
241         char line[500];
242         FILE *tree_f = fopen(seq_file_name,"r");
243 //      fgets(line, 500, tree_f);
244         int num_seq = -1;
245         int size = 1000;
246         int matrix_size = 1000;
247         const int STEP = 1000;
248         int seq_pos = -1;
249         char *seq = vcalloc(size, sizeof(char));
250         Cluster_info *matrix = vcalloc(matrix_size, sizeof(Cluster_info));
251         char *c_p;
252         int max_coding = pow(alphabet_size,k_tup);
253         while (fgets(line, 500, tree_f) != NULL)
254         {
255                 if (line[0] == '>')
256                 {
257                         if (num_seq >= 0)
258                         {
259                                 seq[++seq_pos] = '\0';
260                                 if (num_seq > matrix_size -2)
261                                 {
262                                         matrix_size += STEP;
263                                         matrix = vrealloc(matrix, matrix_size * sizeof(Cluster_info));
264                                 }
265                                 int *recoded_seq = recode_sequence(seq, seq_pos, char2value, alphabet_size, k_tup);
266                                 matrix[num_seq].seq_number = num_seq;
267                                 matrix[num_seq].elem_2_compare = elem_2_compare;
268                                 matrix[num_seq].features=get_features(recoded_seq, seq_pos- k_tup +1, k_tup, max_coding, max_dist, num_features);
269                                 vfree(recoded_seq);
270                                 seq_pos = -1;
271                         }
272                         ++num_seq;
273                 }
274                 else
275                 {
276                         if (size - seq_pos < 500)
277                         {
278                                 size += STEP;
279                                 seq = vrealloc(seq, size*sizeof(char));
280                         }
281                         c_p = &line[0];
282                         while ((*c_p != '\n') && (*c_p != '\0'))
283                         {
284                                 seq[++seq_pos] = toupper(*(c_p));
285                                 ++c_p;
286                         }
287                 }
288         }
289         
290         fclose(tree_f);
291         seq[++seq_pos] = '\0';
292         int *recoded_seq = recode_sequence(seq, seq_pos, char2value, alphabet_size, k_tup);
293         matrix[num_seq].seq_number = num_seq;
294         matrix[num_seq].elem_2_compare = elem_2_compare;
295         matrix[num_seq].features=get_features(recoded_seq, seq_pos- k_tup +1, k_tup, max_coding, max_dist, num_features);
296         *num_seq_p = ++num_seq;
297         vfree(seq);
298         vfree(recoded_seq);
299         return matrix;
300 }
301
302
303
304 /**
305  * Compare function for qsort given an array of Cluster_info.
306  *
307  * The field used for sorting is done according to the element determined in the Cluster_info object.
308  * \param a void pointer to an object of Cluster_info.
309  * \param b void pointer to an object of Cluster_info.
310  *
311  * \return Value showing wheather \a a is bigger, smaller or equal to \a b.
312  */
313 int 
314 cluster_compare (const void * a, const void * b)
315 {
316         Cluster_info *tmp1 = (Cluster_info*)a;
317         Cluster_info *tmp2 = (Cluster_info*)b;
318         int elem = *(tmp1->elem_2_compare);
319         if ((tmp1->features[elem]) > (tmp2->features[elem]))
320                 return 1;
321         else if ((tmp1->features[elem]) < (tmp2->features[elem]))
322                 return -1;
323         else
324                 return 0;
325 }
326
327
328
329 /**
330  * Recursive function to build a tree according to a given feature matrix.
331  *
332  * \param start The begin of this set in \a matrix.
333  * \param end The end of this set in \a matrix.
334  * \param matrix The feature matrix.
335  * \param dim The number of features.
336  * \param mean Array of size \a dim.
337  * \param variance Array of size \a dim.
338  * \param elem_2_compare Pointer to a single in. This will save the feature according to which shall be sorted (biggest variance).
339  * \param tree_f The file in which the tree will be written.
340  * \param node The current node.
341  * 
342  * \return The number of the current node.
343  */
344 int
345 cluster_recursive(int start,
346                                   int end,
347                                   Cluster_info* matrix,
348                                   int dim,
349                                   double *mean,
350                                   double *variance,
351                                   int *elem_2_compare,
352                                   FILE* tree_f,
353                                   int *node)
354 {
355
356         //stop recursion
357         if (end-start < 1)
358         {
359                 return matrix[start].seq_number;
360         }
361
362         //reset mean/variance
363         int i;
364         int num_seq = end - start +1;
365         for (i = 0; i < dim; ++i)
366         {
367                 mean[i] = 0;
368                 variance[i] = 0;
369         }
370
371         //calculate mean
372         Cluster_info *start_p = &matrix[start]-1;
373         Cluster_info   *end_p = &matrix[end];
374         double *tmp;
375         while (start_p < end_p)
376         {
377                 tmp = &(++start_p)->features[0];
378                 for (i = 0; i < dim; ++i)
379                 {
380                         mean[i] += *(tmp++);
381                 }
382         }
383         for (i = 0; i < dim; ++i)
384                         mean[i] /= num_seq;
385
386
387         //calculate variance
388         start_p = &matrix[start] -1;
389
390         while (start_p < end_p)
391         {
392                 tmp = &(++start_p)->features[0];
393                 for (i = 0; i < dim; ++i)
394                 {
395                         variance[i] +=  ((*tmp) - mean[i]) * ((*tmp) - mean[i]);
396                         ++tmp;
397                 }
398         }
399
400         //get maximal variance
401         double max  = variance[0];
402         int index  = 0;
403         for (i = 1; i < dim; ++i)
404         {
405                 if (variance[i] > max)
406                 {
407                         max = variance[i];
408                         index = i;
409                 }
410         }
411         *elem_2_compare = index;
412
413
414         //devide into to different sets and start recursion on these child sets
415         qsort (&matrix[start], num_seq, sizeof(Cluster_info), cluster_compare);
416         int split = start + (end-start)/2;
417         int split_value = matrix[split].features[index];
418         while ((matrix[split+1].features[index] == split_value) && (split < end-1))
419                 ++split;
420
421         int left_child = cluster_recursive(start, split, matrix, dim, mean, variance, elem_2_compare, tree_f, node);
422         int right_child = cluster_recursive(split+1, end, matrix, dim, mean, variance, elem_2_compare, tree_f, node);
423         int node2 = *node;
424         ++(*node);
425         //print node into file
426         fprintf(tree_f, "%i %i %i\n", left_child, right_child, node2);
427
428         return node2;
429 }
430
431 /******************************COPYRIGHT NOTICE*******************************/
432 /*© Centro de Regulacio Genomica */
433 /*and */
434 /*Cedric Notredame */
435 /*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
436 /*All rights reserved.*/
437 /*This file is part of T-COFFEE.*/
438 /**/
439 /*    T-COFFEE is free software; you can redistribute it and/or modify*/
440 /*    it under the terms of the GNU General Public License as published by*/
441 /*    the Free Software Foundation; either version 2 of the License, or*/
442 /*    (at your option) any later version.*/
443 /**/
444 /*    T-COFFEE is distributed in the hope that it will be useful,*/
445 /*    but WITHOUT ANY WARRANTY; without even the implied warranty of*/
446 /*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the*/
447 /*    GNU General Public License for more details.*/
448 /**/
449 /*    You should have received a copy of the GNU General Public License*/
450 /*    along with Foobar; if not, write to the Free Software*/
451 /*    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA*/
452 /*...............................................                                                                                      |*/
453 /*  If you need some more information*/
454 /*  cedric.notredame@europe.com*/
455 /*...............................................                                                                                                                                     |*/
456 /**/
457 /**/
458 /*      */
459 /******************************COPYRIGHT NOTICE*******************************/