JWS-112 Bumping version of T-Coffee to version 11.00.8cbe486.
[jabaws.git] / binaries / src / tcoffee / t_coffee_source / fastal / tree.c
1 /******************************COPYRIGHT NOTICE*******************************/
2 /*  (c) Centro de Regulacio Genomica                                                        */
3 /*  and                                                                                     */
4 /*  Cedric Notredame                                                                        */
5 /*  12 Aug 2014 - 22:07.                                                                    */
6 /*All rights reserved.                                                                      */
7 /*This file is part of T-COFFEE.                                                            */
8 /*                                                                                          */
9 /*    T-COFFEE is free software; you can redistribute it and/or modify                      */
10 /*    it under the terms of the GNU General Public License as published by                  */
11 /*    the Free Software Foundation; either version 2 of the License, or                     */
12 /*    (at your option) any later version.                                                   */
13 /*                                                                                          */
14 /*    T-COFFEE is distributed in the hope that it will be useful,                           */
15 /*    but WITHOUT ANY WARRANTY; without even the implied warranty of                        */
16 /*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                         */
17 /*    GNU General Public License for more details.                                          */
18 /*                                                                                          */
19 /*    You should have received a copy of the GNU General Public License                     */
20 /*    along with Foobar; if not, write to the Free Software                                 */
21 /*    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA             */
22 /*...............................................                                           */
23 /*  If you need some more information                                                       */
24 /*  cedric.notredame@europe.com                                                             */
25 /*...............................................                                           */
26 /******************************COPYRIGHT NOTICE*******************************/
27 #include "stdio.h"
28 #include "stdlib.h"
29 #include "math.h"
30 #include "ctype.h"
31
32
33 #include "io_lib_header.h"
34 #include "util_lib_header.h"
35 // #include "define_header.h"
36 // #include "dp_lib_header.h"
37 // #include "fastal_lib_header.h"
38
39
40
41 #include "fastal_lib_header.h"
42
43 /*!
44  *      \file tree.c
45  *      \brief Source code for the fastal tree algorithm
46  */
47
48 /*
49
50
51 int
52 main(int argc, char** argv)
53 {
54         int alphabet_size = 5;
55         int aa[256];
56         if (alphabet_size == 5)
57         {
58                 aa['A'] = 0;
59                 aa['B'] = 1;
60                 aa['C'] = 1;
61                 aa['D'] = 0;
62                 aa['G'] = 2;
63                 aa['H'] = 0;
64                 aa['K'] = 3;
65                 aa['M'] = 1;
66                 aa['N'] = 0;
67                 aa['R'] = 0;
68                 aa['S'] = 1;
69                 aa['T'] = 3;
70                 aa['U'] = 4;
71                 aa['W'] = 3;
72                 aa['Y'] = 1;
73         }
74         else
75         {
76                 aa['A'] = 0;
77                 aa['B'] = 1;
78                 aa['C'] = 2;
79                 aa['D'] = 3;
80                 aa['E'] = 4;
81                 aa['F'] = 5;
82                 aa['G'] = 6;
83                 aa['H'] = 7;
84                 aa['I'] = 8;
85                 aa['J'] = 9;
86                 aa['K'] = 10;
87                 aa['L'] = 11;
88                 aa['M'] = 12;
89                 aa['N'] = 13;
90                 aa['P'] = 14;
91                 aa['Q'] = 15;
92                 aa['R'] = 16;
93                 aa['S'] = 17;
94                 aa['T'] = 18;
95                 aa['V'] = 19;
96                 aa['W'] = 20;
97                 aa['X'] = 21;
98                 aa['Y'] = 22;
99                 aa['Z'] = 23;
100         }
101         compute_oligomer_distance_tree(argv[1], &aa[0], argv[2], 5, 2, alphabet_size);
102         return 0;
103 }*/
104
105
106 void
107 compute_oligomer_distance_tree(char *seq_file_name, int* char2value, char *tree_file_name, int max_dist, int word_length, int alphabet_size)
108 {
109         int i;
110         int *compare =(int*) vcalloc(1,sizeof(int));
111         int *num_seq =(int*) vcalloc(1,sizeof(int));
112
113         int num_features = (int)pow(alphabet_size,word_length);
114         
115         num_features *= num_features * (max_dist+1);
116
117
118         Cluster_info *matrix  = feature_extract(seq_file_name, max_dist, word_length, char2value, alphabet_size, compare, num_seq, num_features);
119         
120         int *node =(int*) vcalloc(1, sizeof(int));
121         *node = *num_seq;
122         
123         FILE *tree_f = fopen(tree_file_name,"w");
124         double *mean = (double*)vcalloc(num_features, sizeof(double));
125         double *variance =  (double*)vcalloc(num_features, sizeof(double));
126         cluster_recursive(0, *num_seq-1, matrix, num_features, mean, variance, compare, tree_f, node);
127         
128         for (i = 0; i < *num_seq; ++i)
129         {
130 //              printf("ERG: %i\n", i);
131                 vfree(matrix[i].features);
132         }
133         vfree(matrix);
134         fclose (tree_f);
135         vfree(mean);
136         vfree(variance);
137         vfree(compare);
138         vfree(node);
139         vfree(num_seq);
140 }
141
142
143
144 /**
145  * Recodes a given sequence into a sequence of it's k-mers.
146  *
147  * \param seq The sequence to encode.
148  * \param seq_length The length of \a seq.
149  * \param char2value Array giving each character in seq a unique value.
150  * \param alphabet_size The size of the alphabet.
151  * \param word_length The length of the k-mers to use.
152  * 
153  * \return The recodes sequence.
154  */
155 int*
156 recode_sequence(char * seq, int seq_length, int *char2value, int alphabet_size, int word_length)
157 {
158
159         int *recoded =(int*) vcalloc(seq_length - word_length + 1, sizeof(int));
160         int i;
161         
162         if (word_length == 1)
163         {
164                 for (i = 0; i < seq_length; ++i)
165                 {
166                         recoded[i] = char2value[(short)seq[i]];
167                 }
168         }
169         else
170         {
171                 int *prod=(int*)vcalloc (word_length, sizeof(int));
172                 for ( i=0; i<word_length; ++i)
173                 {
174                         prod[word_length-i-1]=(int)pow(alphabet_size,i);
175                 }
176
177                 int index = 0;
178                 for (i = 0; i < word_length; ++i)
179                 {
180                         index += char2value[(short)seq[i]] * prod[i];
181                 }
182                 recoded[0] = index;
183                 int z = 0;
184                 for (; i < seq_length; ++i)
185                 {
186                         index -= char2value[(short)seq[z]] * prod[0];
187                         index *= alphabet_size;
188                         index += char2value[(short)seq[i]];
189                         recoded[++z] = index;
190                 }
191         }
192
193         return recoded;
194 }
195
196
197 /**
198  * Extracts the features from a given sequence.
199  *
200  * \param recoded_seq The recoded sequence, consisting of k-mer codes.
201  * \param seq_length The length of \a recoded_seq.
202  * \param k_tup Length of the k-mers.
203  * \param max_coding The maximal number used for coding the alphabet.
204  * \param max_dist The maximal distance to use between to k-mers.
205  * 
206  * \return The feature vector of the sequence.
207  */
208 double *
209 get_features(int *recoded_seq, int seq_length, int k_tup, int max_coding, int max_dist, int num_features)
210 {
211
212         
213         int vector_length = num_features;//max_coding * max_coding*(max_dist+1);
214         
215         double *feature_vector =(double*) vcalloc(vector_length, sizeof(double));
216         int i;
217         for (i = 0; i < vector_length; ++i)
218         {
219                 feature_vector[i] = 0;
220         }
221         int j, max_length, indJ, indK, indDist;
222         for (i = 0; i < seq_length; ++i)
223         {
224                 if (seq_length <= i + max_dist)
225                         max_length = seq_length-1;
226                 else
227                         max_length = i + max_dist;
228                 for (j = i; j <= max_length; ++j)
229                 {
230                         
231                         indJ = max_coding * (max_dist+1) * recoded_seq[i];
232                         indK = (max_dist+1) * recoded_seq[j];
233                         indDist = j - i;
234                         ++feature_vector[indJ + indK + indDist];
235                 }
236         }
237
238         double normalize = 0;
239         for (i = 0; i < vector_length; ++i)
240         {
241                 normalize += (feature_vector[i] * feature_vector[i]);
242         }
243         normalize = sqrt(normalize);
244         for (i = 0; i < vector_length; ++i)
245         {
246                 feature_vector[i] /= normalize; 
247         }
248
249         return feature_vector;
250 }
251
252
253 /**
254  * Calculates the feature values for all sequences in a file.
255  *
256  * The File has to have fasta format.
257  * \param seq_file_name The name of the file (in fasta format).
258  * \param max_dist The maximal distance to be considered.
259  * \param k_tup Size of the k_tup.
260  * \param alphabet_size The size of the alphabet.
261  * 
262  * \return The feature values for every sequence in a Cluster_info object.
263  */
264 Cluster_info *
265 feature_extract(char *seq_file_name, int max_dist, int k_tup, int *char2value, int alphabet_size, int * elem_2_compare, int *num_seq_p, int num_features)
266 {
267         char line[500];
268         FILE *tree_f = fopen(seq_file_name,"r");
269 //      fgets(line, 500, tree_f);
270         int num_seq = -1;
271         int size = 1000;
272         int matrix_size = 1000;
273         const int STEP = 1000;
274         int seq_pos = -1;
275         char *seq = (char*)vcalloc(size, sizeof(char));
276         Cluster_info *matrix =(Cluster_info*) vcalloc(matrix_size, sizeof(Cluster_info));
277         char *c_p;
278         int max_coding = pow(alphabet_size,k_tup);
279         while (fgets(line, 500, tree_f) != NULL)
280         {
281                 if (line[0] == '>')
282                 {
283                         if (num_seq >= 0)
284                         {
285                                 seq[++seq_pos] = '\0';
286                                 if (num_seq > matrix_size -2)
287                                 {
288                                         matrix_size += STEP;
289                                         matrix = (Cluster_info*)vrealloc(matrix, matrix_size * sizeof(Cluster_info));
290                                 }
291                                 int *recoded_seq = recode_sequence(seq, seq_pos, char2value, alphabet_size, k_tup);
292                                 matrix[num_seq].seq_number = num_seq;
293                                 matrix[num_seq].elem_2_compare = elem_2_compare;
294                                 matrix[num_seq].features=get_features(recoded_seq, seq_pos- k_tup +1, k_tup, max_coding, max_dist, num_features);
295                                 vfree(recoded_seq);
296                                 seq_pos = -1;
297                         }
298                         ++num_seq;
299                 }
300                 else
301                 {
302                         if (size - seq_pos < 500)
303                         {
304                                 size += STEP;
305                                 seq = (char*)vrealloc(seq, size*sizeof(char));
306                         }
307                         c_p = &line[0];
308                         while ((*c_p != '\n') && (*c_p != '\0'))
309                         {
310                                 seq[++seq_pos] = toupper(*(c_p));
311                                 ++c_p;
312                         }
313                 }
314         }
315         
316         fclose(tree_f);
317         seq[++seq_pos] = '\0';
318         int *recoded_seq = recode_sequence(seq, seq_pos, char2value, alphabet_size, k_tup);
319         matrix[num_seq].seq_number = num_seq;
320         matrix[num_seq].elem_2_compare = elem_2_compare;
321         matrix[num_seq].features=get_features(recoded_seq, seq_pos- k_tup +1, k_tup, max_coding, max_dist, num_features);
322         *num_seq_p = ++num_seq;
323         vfree(seq);
324         vfree(recoded_seq);
325         return matrix;
326 }
327
328
329
330 /**
331  * Compare function for qsort given an array of Cluster_info.
332  *
333  * The field used for sorting is done according to the element determined in the Cluster_info object.
334  * \param a void pointer to an object of Cluster_info.
335  * \param b void pointer to an object of Cluster_info.
336  *
337  * \return Value showing wheather \a a is bigger, smaller or equal to \a b.
338  */
339 int 
340 cluster_compare (const void * a, const void * b)
341 {
342         Cluster_info *tmp1 = (Cluster_info*)a;
343         Cluster_info *tmp2 = (Cluster_info*)b;
344         int elem = *(tmp1->elem_2_compare);
345         if ((tmp1->features[elem]) > (tmp2->features[elem]))
346                 return 1;
347         else if ((tmp1->features[elem]) < (tmp2->features[elem]))
348                 return -1;
349         else
350                 return 0;
351 }
352
353
354
355 /**
356  * Recursive function to build a tree according to a given feature matrix.
357  *
358  * \param start The begin of this set in \a matrix.
359  * \param end The end of this set in \a matrix.
360  * \param matrix The feature matrix.
361  * \param dim The number of features.
362  * \param mean Array of size \a dim.
363  * \param variance Array of size \a dim.
364  * \param elem_2_compare Pointer to a single in. This will save the feature according to which shall be sorted (biggest variance).
365  * \param tree_f The file in which the tree will be written.
366  * \param node The current node.
367  * 
368  * \return The number of the current node.
369  */
370 int
371 cluster_recursive(int start,
372                                   int end,
373                                   Cluster_info* matrix,
374                                   int dim,
375                                   double *mean,
376                                   double *variance,
377                                   int *elem_2_compare,
378                                   FILE* tree_f,
379                                   int *node)
380 {
381
382         //stop recursion
383         if (end-start < 1)
384         {
385                 return matrix[start].seq_number;
386         }
387
388         //reset mean/variance
389         int i;
390         int num_seq = end - start +1;
391         for (i = 0; i < dim; ++i)
392         {
393                 mean[i] = 0;
394                 variance[i] = 0;
395         }
396
397         //calculate mean
398         Cluster_info *start_p = &matrix[start]-1;
399         Cluster_info   *end_p = &matrix[end];
400         double *tmp;
401         while (start_p < end_p)
402         {
403                 tmp = &(++start_p)->features[0];
404                 for (i = 0; i < dim; ++i)
405                 {
406                         mean[i] += *(tmp++);
407                 }
408         }
409         for (i = 0; i < dim; ++i)
410                         mean[i] /= num_seq;
411
412
413         //calculate variance
414         start_p = &matrix[start] -1;
415
416         while (start_p < end_p)
417         {
418                 tmp = &(++start_p)->features[0];
419                 for (i = 0; i < dim; ++i)
420                 {
421                         variance[i] +=  ((*tmp) - mean[i]) * ((*tmp) - mean[i]);
422                         ++tmp;
423                 }
424         }
425
426         //get maximal variance
427         double max  = variance[0];
428         int index  = 0;
429         for (i = 1; i < dim; ++i)
430         {
431                 if (variance[i] > max)
432                 {
433                         max = variance[i];
434                         index = i;
435                 }
436         }
437         *elem_2_compare = index;
438
439
440         //devide into to different sets and start recursion on these child sets
441         qsort (&matrix[start], num_seq, sizeof(Cluster_info), cluster_compare);
442         int split = start + (end-start)/2;
443         int split_value = matrix[split].features[index];
444         while ((matrix[split+1].features[index] == split_value) && (split < end-1))
445                 ++split;
446
447         int left_child = cluster_recursive(start, split, matrix, dim, mean, variance, elem_2_compare, tree_f, node);
448         int right_child = cluster_recursive(split+1, end, matrix, dim, mean, variance, elem_2_compare, tree_f, node);
449         int node2 = *node;
450         ++(*node);
451         //print node into file
452         fprintf(tree_f, "%i %i %i\n", left_child, right_child, node2);
453
454         return node2;
455 }
456