Next version of JABA
[jabaws.git] / binaries / src / tcoffee / t_coffee_source / parttree.c
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <math.h>
4 #include <stdarg.h>
5 #include <string.h>
6 // #include <argp.h>
7
8 #include "fast_tree_header.h"
9 #include "io_lib_header.h"
10 #include "util_lib_header.h"
11 #include "define_header.h"
12 #include "dp_lib_header.h"
13 //TODO: -change kick-out value
14 //              -pass arrays in partTree_r
15
16 /*!
17  *      \file parttree.c
18  *      \brief Source code for PartTree algorithm.
19  */
20
21
22
23 #define ENLARGEMENT_PER_STEP 50
24
25
26
27 void
28 print_fastal_tree(Tree_fastal *tree,
29                                   int pos,
30                                   FILE *tree_file,
31                                   int num_seq)
32 {
33         if (tree[pos].left >= num_seq)
34                 print_fastal_tree(tree, tree[pos].left-num_seq, tree_file, num_seq);
35         if (tree[pos].right >= num_seq)
36                 print_fastal_tree(tree, tree[pos].right-num_seq, tree_file, num_seq);
37         
38         fprintf(tree_file, "%i %i %i\n", tree[pos].left, tree[pos].right, tree[pos].name);
39 }
40
41
42
43
44
45 PartTree_param *param_set;
46
47
48 //**********************   UPGMA *****************************
49
50 /**
51  * Function to write tree to file in fastal_format. Leafs in \a tree are leafs in the complete tree as well.
52  *
53  * \param tree The root node of the (sub)tree.
54  * \param param_set Parameter of PartTree.
55  * \param start_write_here Current node to write into.
56  * \return position in tree
57  * \see tree_process
58  */
59 int
60 tree_process_simple(NT_node tree,
61                                     PartTree_param *param_set,
62                                         int start_write_here)
63 {
64         if (tree->isseq)
65         {
66 //              printf("T: %s\n", tree->name);
67                 return atoi(tree->name);
68         }
69         else
70         {
71                 Tree_fastal *tree_flat = &param_set->tree[start_write_here];
72                 tree_flat->name = start_write_here +param_set->num_sequences;
73                 if (start_write_here == param_set->pos_tree)
74                 {
75                         ++param_set->pos_tree;
76                 }
77                 start_write_here = param_set->pos_tree;
78                 int left = tree_process_simple(tree->left, param_set, start_write_here);
79                 start_write_here = param_set->pos_tree;
80                 int right = tree_process_simple(tree->right, param_set, start_write_here);
81                 tree_flat->index = NULL;
82                 tree_flat->right = right;
83                 tree_flat->left = left;
84                 return tree_flat->name;
85         }
86 }
87
88
89
90
91 /**
92 * Function to write tree to file in fastal_format. Leafs in \a tree do not need to be leafs in the complete tree.
93 *
94 * \param tree The root node of the (sub)tree.
95 * \param param_set Parameter of PartTree.
96 * \param clusters Number of sequences in each cluster.
97 * \param subgroup The sequences for each cluster.
98 * \param start_write_here Current node to write into.
99 * \return position in tree
100 * \see tree_process_simple
101 */
102 int
103 tree_process(NT_node tree,
104                          PartTree_param *param_set,
105                          int *clusters,
106                      int *subgroup,
107                          int start_write_here)
108 {
109         if (tree->isseq)
110         {
111                 int node_num = atoi(tree->name);
112                 int num_in_sub = clusters[node_num+1] - clusters[node_num];
113 //              printf("NUM: %i %i %i %i\n",node_num, num_in_sub, clusters[node_num+1], clusters[node_num]);
114                 if (num_in_sub > 1)
115                 {
116                         Tree_fastal *tree_flat = &param_set->tree[start_write_here];
117                         tree_flat->name = start_write_here +param_set->num_sequences;
118                         if (start_write_here == param_set->pos_tree)
119                         {
120                                 ++param_set->pos_tree;
121                         }
122                         tree_flat->left  = -1;
123                         tree_flat->right = -1;
124                         tree_flat->index = &subgroup[clusters[node_num]];
125
126                         tree_flat->num_leafs = num_in_sub;
127                         return tree_flat->name;
128                 }
129                 else
130                 {
131                         return(subgroup[clusters[node_num]]);
132                 }
133         }
134         else
135         {
136 //              printf("TREEPOS: %i\n",param_set->pos_tree);
137                 Tree_fastal *tree_flat = &param_set->tree[start_write_here];
138                 tree_flat->name = start_write_here +param_set->num_sequences;
139                 if (start_write_here == param_set->pos_tree)
140                 {
141                         ++param_set->pos_tree;
142                 }
143                 start_write_here = param_set->pos_tree;
144                 int left = tree_process(tree->left, param_set, clusters, subgroup, start_write_here);
145                 start_write_here = param_set->pos_tree;
146                 int right = tree_process(tree->right, param_set, clusters, subgroup, start_write_here);
147                 tree_flat->index = NULL;
148                 tree_flat->right = right;
149                 tree_flat->left = left;
150                 return tree_flat->name;
151         }
152 }
153
154
155 /**
156 * \brief Calculates tree out of distance matrix.
157
158 * Calculates the upgma tree using a given distance matrix.
159 * \param mat The distance matrix.
160 * \param nseq Number of sequences.
161 * \param fname Filename for temporary storage.
162 * \param seqnames Names of the sequences.
163 * \return The calculated UPGMA Tree.
164 */
165 NT_node ** int_dist2upgma_tree_fastal (int **mat, int nseq, char *fname, char **seqnames)
166 {
167         NT_node *NL, T;
168         int a, n, *used;
169         int tot_node;
170         if (upgma_node_heap (NULL))
171         {
172                 printf_exit ( EXIT_FAILURE,stderr, "\nERROR: non empty heap in upgma [FATAL]");
173         }
174         NL=vcalloc (nseq, sizeof (NT_node));
175
176         for (a=0; a<nseq; a++)
177         {
178                 NL[a]=new_declare_tree_node ();
179                 upgma_node_heap (NL[a]);
180                 sprintf (NL[a]->name, "%s", seqnames[a]);
181                 NL[a]->isseq=1;
182                 NL[a]->leaf=1;
183         }
184         used=vcalloc ( nseq, sizeof (int));
185         n=nseq;
186         while (n>1)
187         {
188                 T=upgma_merge(mat, NL,used, &n, nseq);
189         }
190         vfree (used);
191         vfclose (print_tree (T, "newick", vfopen (fname, "w")));
192         upgma_node_heap (NULL);
193         vfree (NL);
194
195         return read_tree (fname,&tot_node, nseq, seqnames);
196 }
197
198
199
200
201 //Part_Tree
202
203 /*!
204 * \brief Constructs a guide tree for multiple sequence alignment.
205 *
206 * This algorithm is an implementation of the partTree algorithm (PartTree: an algorithm to build an approximate tree from a large number of unaligned
207 * sequences. Katoh et al. 2007).
208 * \param sequence_f Filename of file with sequences.
209 * \param tree_f Filename of file where the tree will be stored.
210 * \param ktup Size of the ktups.
211 * \param subgroup Parameter for subgroupsize.
212 */
213 void
214 make_partTree(char *sequence_f,
215                           char *tree_f,
216                           int ktup,
217                           int subgroup)
218 {
219         param_set = vcalloc(1,sizeof(PartTree_param));
220         param_set->ktup = ktup;
221         param_set->subgroup = subgroup;
222         
223         long *file_positions = NULL;
224         long **tmp1 = &file_positions;
225         int *seq_lengths = NULL;
226         int **tmp2 = &seq_lengths;
227
228         //make index
229         int number_of_sequences =  make_pos_len_index_of_file(sequence_f, "KTUP_table", tmp1, tmp2, ktup, "DNA"); 
230         param_set->num_sequences = number_of_sequences;
231         param_set->ktup_positions = file_positions;
232         param_set->seq_lengths = seq_lengths;
233         param_set->threshold = 0.01;
234         param_set->ktup_table_f = fopen("KTUP_table","r");
235         
236         Tree_fastal *tree = vcalloc(number_of_sequences-1,sizeof(Tree_fastal));
237         param_set->tree = tree;
238
239         int i;
240         partTree_r(param_set);
241 //      for (i = 0; i < number_of_sequences-1; ++i)
242 //      {
243 //              printf("%i %i %i\n", tree[i].left, tree[i].right, tree[i].name);
244 //      }
245         FILE * tree_file = fopen(tree_f,"w");
246         print_fastal_tree(tree, 0, tree_file, number_of_sequences);
247         fclose(tree_file);
248         vfree(tree);
249 }
250
251
252 /**
253 * Filters seed set.
254 *
255 * \param sequence_group Sequences to filter.
256 * \param dist_mat The distance matrix.
257 * \param seed_set_cleaned ordered_seed_set.
258 * \param param_set Parameters for PartTree algorithm.
259 * \return number in the filtered set.
260 */
261 int
262 filter(int *sequence_group,
263            int **dist_mat,
264            int *seed_set_cleaned,
265            PartTree_param *param_set)
266 {
267         int i, j;
268         int num_in_subgroup = param_set->subgroup;
269         int *seq_lengths = param_set->seq_lengths;
270         int num_in_clean = 0;
271         double threshold = param_set->threshold;
272 //      printf("threshold: %f\n", threshold);
273         double min;
274         for (i = 0; i < num_in_subgroup; ++i)
275         {
276                 if (!seed_set_cleaned[i])
277                         continue;
278                 for (j = i+1; j < num_in_subgroup; ++j)
279                 {
280                         min = MIN(seq_lengths[sequence_group[i]], seq_lengths[sequence_group[j]]);
281 //                      printf("MINIMUM: %i\n",min);
282                         min = (threshold * min);
283 //                      printf("MINIMUM: %f\n",min);
284                         if (seed_set_cleaned[j] &&(dist_mat[i][j] < min))
285                         {
286                                 if (seq_lengths[sequence_group[i]] < seq_lengths[sequence_group[j]])
287                                 {
288                                         seed_set_cleaned[i] = 0;
289                                         break;
290                                 }
291                                 else
292                                         seed_set_cleaned[j] = 0;
293                         }
294                 }
295         }
296
297         for (i = 0; i < num_in_subgroup; ++i)
298         {
299                 num_in_clean += seed_set_cleaned[i];
300         }
301         int max = num_in_subgroup -1;
302         i = 0;
303         int tmp;
304 //      printf("CLEAN: %i\n", num_in_clean);
305         while (i < num_in_clean)
306         {
307                 if (seed_set_cleaned[i])
308                 {
309                         ++i;
310                 }
311                 else
312                 {
313                         seed_set_cleaned[i] = seed_set_cleaned[max];
314                         seed_set_cleaned[max] = 0;
315                         tmp = sequence_group[i];
316                         sequence_group[i] = sequence_group[max];
317                         sequence_group[max] = tmp;
318                         --max;
319                 }
320         }
321         return num_in_clean;
322 }
323
324
325
326
327
328 /*!
329 *       \brief Function to create a tree using the PartTree algorithm.
330 *       
331 *       \param param_set A \a PartTree_param object containing all necessary parameters and the data.
332 *       \return The node_number.
333 */
334 void
335 partTree_r(PartTree_param *param_set)
336 {
337         
338         int num_of_tree_nodes = param_set->num_sequences-1;
339         int loop_tree_node;
340
341         Tree_fastal *tree = param_set->tree;
342 //      int this_node = param_set->pos_tree;
343         
344         int i;
345         int tsize = param_set->tsize;
346         
347         
348         //get some memory
349         short *table1 = vcalloc(tsize, sizeof(short));
350         short *table2 = vcalloc(tsize, sizeof(short));
351         int *seed_set = vcalloc(param_set->subgroup, sizeof(int));
352         char **names = declare_char(param_set->subgroup, 8);
353         int **dist_mat = declare_int(param_set->subgroup, param_set->subgroup);
354         int **dist_mat2 = declare_int(param_set->subgroup, param_set->subgroup);
355         char * file_name_tmp = vtmpnam(NULL);
356         int *seed_set_cleaned = vcalloc(param_set->subgroup, sizeof(int));
357         FILE *table_f = param_set->ktup_table_f;
358         long *file_positions = param_set->ktup_positions;
359         int max_n_group = param_set->subgroup;
360         int num_in_subgroup = param_set->subgroup;
361         int *seq_lengths = param_set->seq_lengths;
362         int *clusters = vcalloc(param_set->subgroup+1, sizeof(int));
363         int *min_dist = vcalloc(param_set->num_sequences, sizeof(int));
364         int *belongs_to = vcalloc(param_set->num_sequences, sizeof(int));
365         
366         
367         
368         
369         
370         
371         //Prepare first node
372         
373         tree[0].index = vcalloc(param_set->num_sequences,sizeof(int));
374         int *index = tree[0].index;
375         for (i = 0; i< param_set->num_sequences; ++i)
376                 index[i] = i;
377         tree[0].name = param_set->pos_tree +param_set->num_sequences;
378         
379         tree[0].num_leafs = param_set->num_sequences;
380         int *sequence_group2 = vcalloc(param_set->num_sequences,sizeof(int));
381         
382         Tree_fastal *current_node;
383         for (loop_tree_node = 0; loop_tree_node < num_of_tree_nodes; ++loop_tree_node)
384         {
385 //              printf("ROUND: %i\n", loop_tree_node);
386                 current_node = &tree[loop_tree_node];
387                 index= current_node->index;
388                 if (current_node->index == NULL)
389                 {
390                         continue;
391                 }
392                 int num_sequences = current_node->num_leafs;
393
394                 //if number of sequences in this group smaller than number subgoup size: make tree, finisch
395                 if (num_sequences <= max_n_group)
396                 {
397                         int j;
398                         dist_mat = make_distance_matrix(table_f, file_positions, index, num_sequences, dist_mat);
399                         for (i = 0; i < num_sequences; ++i)
400                         {
401                                 sprintf(names[i],"%i", current_node->index[i]);
402                         }
403                         NT_node **tree= (int_dist2upgma_tree_fastal (dist_mat, num_sequences, file_name_tmp , names));
404                         tree_process_simple(tree[0][0], param_set,loop_tree_node);
405                         continue;
406                 }
407
408                 
409                 for (i = 0; i < num_in_subgroup; ++i)
410                 {
411                         seed_set_cleaned[i] = 1;
412                 }
413                 
414                 //finde longest sequence and put into the first field
415                 
416                 int index_longest = 0;
417                 int length_of_longest = 0;
418                 
419                 for(i = 0; i < num_sequences; ++i)
420                 {
421                         if (seq_lengths[index[i]] > length_of_longest)
422                         {
423                                 index_longest = i;
424                                 length_of_longest = seq_lengths[index[i]];
425                         }
426                 }
427                 int tmp = index[index_longest];
428                 index[index_longest] = index[0];
429                 index[0] = tmp;
430
431                 //distance of longest to rest
432                 int seq_index = 1;
433                 int min= euclidean_dist(table_f, file_positions[index[0]], file_positions[index[1]], table1, table2, param_set->tsize);
434                 for (i = 2; i < num_sequences; ++i)
435                 {
436                         tmp = euclidean_dist_half(table_f, file_positions[index[i]], table1, table2, param_set->tsize);
437                         if (tmp < min)
438                         {
439                                 min = tmp;
440                                 seq_index = i;
441                         }
442                 }
443
444                 //get the new seed_set in the first n spaces
445                 tmp = index[1];
446                 index[1] = index[seq_index];
447                 index[seq_index] = tmp;
448                 int r,j;
449                 num_in_subgroup = param_set->subgroup;
450
451                 
452                 for (i = 2; i < num_in_subgroup; ++i)
453                 {
454                         r = i + rand() / ( RAND_MAX / ( num_sequences-i) + 1 );
455 //                      printf("RANDOM: %i\n",r);
456                         tmp = index[r];
457                         index[r] = index[i];
458                         index[i] = tmp;
459                 }
460
461                 //Calculate matrix
462                 dist_mat = make_distance_matrix(table_f, file_positions, index, param_set->subgroup, dist_mat);
463         
464                 //Filter out sequences that are to similar & reorder
465                 
466                 NT_node **upgma_tree;
467                 
468                 
469                 int num_in_clean = filter(index, dist_mat, seed_set_cleaned, param_set);
470 //              if (num_in_clean == 1)
471 //              {
472 //                      int j;
473 // //                   dist_mat = make_distance_matrix(table_f, file_positions, index, upgma_tree, dist_mat);
474 //                      for (i = 0; i < param_set->subgroup; ++i)
475 //                      {
476 //                              sprintf(names[i],"%i", current_node->index[i]);
477 //                      }
478 //                      upgma_tree= (int_dist2upgma_tree_fastal (dist_mat, param_set->subgroup, file_name_tmp , names));
479 //                      tree_process_simple(upgma_tree[0][0], param_set,loop_tree_node);
480 //                      continue;
481 //              }
482 //              else
483 //              {
484                 
485                 if (num_in_clean ==1)
486                 {
487                         num_in_clean = 2;
488                         seed_set_cleaned[1] = 1;
489                 }
490                         //make_tree
491                 int col = 0;
492                 int row = 0;
493                 for (i = 0; i <  num_in_subgroup; ++i)
494                 {
495                         if (seed_set_cleaned[i])
496                         {
497                                 row = col+1;
498                                 for (j = i+1; j <  num_in_subgroup; ++j)
499                                 {
500                                         if (seed_set_cleaned[j])
501                                         {
502                                                 dist_mat2[row][col] = dist_mat2[col][row] = dist_mat[i][j];
503                                                 ++row;
504                                         }
505                                 }
506                                 ++col;
507                         }
508                 }
509                 for (i = 0; i < num_in_clean; ++i)
510                 {
511                         sprintf(names[i],"%i",i);
512                 }
513                 upgma_tree= (int_dist2upgma_tree_fastal (dist_mat2, num_in_clean, file_name_tmp , names));
514 //              }
515                 
516 //              int *pos_tree_p = &param_set->pos_tree;
517
518                 
519                 int leaf = 0;
520
521                 //cluster
522                 //calculate distances from n' to N
523                 get_table(table1, table_f, file_positions[index[0]]);
524                 for (j = num_in_clean; j < num_sequences; ++j)
525                 {
526                         min_dist[j] = euclidean_dist_half(table_f, file_positions[index[j]], table1, table2, param_set->tsize);
527                         belongs_to[j] = 0;
528                 }
529                 for(i = 1; i < num_in_clean; ++i)
530                 {
531                         get_table(table1, table_f, file_positions[index[i]]);
532                         belongs_to[i] = i;
533                         for (j = num_in_clean; j < num_sequences; ++j)
534                         {
535                                 tmp = euclidean_dist_half(table_f, file_positions[index[j]], table1, table2, param_set->tsize);
536                                 if (tmp < min_dist[j])
537                                 {
538                                         min_dist[j] = tmp;
539                                         belongs_to[j] = i;
540                                 }
541                         }
542                 }
543
544                 //how_many sequences has each cluster
545                 for (j = 0; j <= num_in_subgroup; ++j)
546                 {
547                         clusters[j] = 0;
548                 }
549                 for (j = 0; j < num_sequences; ++j)
550                 {
551                         ++clusters[belongs_to[j]];
552                 }
553 //              for (j = 0; j <= num_in_subgroup; ++j)
554 //              {
555 //                      printf("CL: %i ",clusters[j]);
556 //              }
557 //              printf("\n");
558                 for(i = 1; i < num_in_clean; ++i)
559                 {
560                         clusters[i] += clusters[i-1];
561                 }
562                 clusters[num_in_clean] = clusters[num_in_clean-1];
563
564                 for (i = 0; i < num_sequences; ++i)
565                 {
566                         sequence_group2[--clusters[belongs_to[i]]] = index[i];
567                 }
568
569                 for (i = 0; i < num_sequences; ++i)
570                 {
571                         index[i] = sequence_group2[i];
572                 }
573
574                 
575                 for (i = 0; i < num_in_clean; ++i)
576                 {
577                         sprintf(names[i],"%i",i);
578                 }
579                 tree_process(upgma_tree[0][0], param_set, clusters, index, loop_tree_node);
580                 NT_node tmp_tree = upgma_tree[3][0];
581                 vfree(upgma_tree[0]);
582                 vfree(upgma_tree[1]);
583                 vfree(upgma_tree[2]);
584                 vfree(upgma_tree[3]);
585                 vfree(upgma_tree);
586                 free_tree(tmp_tree);
587         }
588         vfree(min_dist);
589         vfree(belongs_to);
590         vfree(clusters);
591 }
592
593
594
595 /*!
596  *      \brief Makes the distance matrix between all sequences.
597  *
598  *      \param table_file File with the ktup tables
599  *      \param file_positions Index of positions where the tabels are stored in \a table_file
600  *      \param sequence_group the group of sequences
601  *      \param number number of sequences
602  *      \param dist_mat distance matrix
603  *      \return the distance matrix. (same as \a dist_mat )
604 */
605 int **
606 make_distance_matrix(FILE *table_f,
607                                          long *file_positions,
608                                          int *sequence_group,
609                                          int number,
610                                          int **dist_mat)
611 {
612         static short *table1 = NULL;
613         static short *table2;
614         int tsize = param_set->tsize;
615         if (table1 == NULL)
616         {
617                 table1 = vcalloc(tsize, sizeof(short));
618                 table2 = vcalloc(tsize, sizeof(short));
619         }
620         int i, j, num = number-1;
621         for (i = 0; i < num; ++i)
622         {
623                 j = i+1;
624                 dist_mat[i][j] = dist_mat[j][i]= euclidean_dist(table_f, file_positions[sequence_group[i]], file_positions[sequence_group[j]], table1, table2, tsize);
625                 ++j;
626                 for (; j < number; ++j)
627                 {
628                         dist_mat[i][j] = dist_mat[j][i] = euclidean_dist_half(table_f, file_positions[sequence_group[j]], table1, table2, tsize);
629                 }
630         }
631         return dist_mat;
632 }
633
634
635
636
637 /**
638 * Replaces the coded sequence with coded tuples
639 *
640 * \param coded_seq The coded sequence which will be replaced by the tuple number
641 * \param ktup Size of the ktup
642 * \param ng Coded alphabet size
643 * \param length Lengths of coded sequence
644 */
645 void
646 makepointtable_fast(int *coded_seq,     //sequence
647                                         int ktup,               //ktup size
648                                         int ng,                 //hmm...
649                                         int length)             //length of coded_seq
650 {
651         int point, a;
652         register int *p;
653         static int *prod;
654
655         if (!prod)
656         {
657                 prod=vcalloc ( ktup, sizeof (int));
658                 for ( a=0; a<ktup; a++)
659                 {
660                         prod[ktup-a-1]=(int)pow(ng,a);
661                 }
662         }
663         p = coded_seq;
664
665   
666         for (point=0,a=0; a<ktup; a++)
667         {
668                 point+= *coded_seq++ *prod[a];
669         }
670
671         int i = ktup;
672         while (i < length)
673         {
674                 point -= *p * prod[0];
675                 point *= ng;
676                 point += *coded_seq;
677                 *p = point;
678                 ++p;
679                 ++coded_seq;
680                 ++i;
681         }
682         *p = END_ARRAY;
683 }
684
685
686 /**
687 * \brief Calculates the number of occurences for each ktup.
688 *
689 * \param tables_f File to save the tables in.
690 * \param table Table to save the result in.
691 * \param pointt Array with all ktups listed one after another. Has to end with END_ARRAY.
692 * \param length length of \a table
693 */
694 int
695 makecompositiontable_fastal(FILE* tables_f,     //File to save the tables in
696                                                         int *table,             //table to calculate in
697                                                         int *pointt,    //ktups array
698                                                         int length)             //length of the table
699 {
700         int point;
701         while(*pointt != END_ARRAY )
702         {
703                 ++table[*pointt];
704                 ++pointt;
705         }
706         for (point = 0; point < length; ++point)
707         {
708                 if (table[point] > 0)
709                         fprintf(tables_f, "%i %i\n", point, table[point]);
710         }
711         fprintf(tables_f, "*\n");
712 }
713
714
715 /** JUST FOR TEST */
716 void 
717 make_fast_tree(char *file_name,
718                            int n,
719                            int ktup)
720 {
721         
722         make_partTree(file_name, "TREE_OUT", ktup, n);
723
724 }
725
726
727
728 /**
729 * \brief Reads ktup_table from file
730 *
731 * \param table  Table to save the file content in.
732 * \param tables_f File in which the tables are stored.
733 * \param index Position of the table in \a tables_f
734 */
735 void
736 get_table(short *table,         //Table to save the readings in
737                   FILE* tables_f,       //File with tables
738                   long index)           //index positin of ktup-tables
739 {
740         fseek(tables_f, index, SEEK_SET);
741         const int LINE_LENGTH = 101;
742         char line[LINE_LENGTH];
743         fgets(line, LINE_LENGTH, tables_f);
744         
745         char delims[] = " ";
746         char *result = NULL;
747         int code;
748
749         while (line[0] != '*')
750         {
751                 result = strtok( line, delims );
752                 code = atoi(result);
753                 table[code] = atoi(strtok( NULL, delims));
754                 fgets(line, LINE_LENGTH, tables_f);
755         }
756 }
757
758
759
760 /**
761 * \brief calculates the euclidean ktub distance between two sequences
762 *
763 * @param ktup_f, ktup_file
764 * @param pos1 position of sequence 1 in \a ktup_f
765 * @param pos2 position of sequence 2 in \a ktup_f
766 * @param table1 Saves the number of occurences for each ktup in sequence 1
767 * @param table2 Saves the number of occurences for each ktup in sequence 2
768 */
769 int 
770 euclidean_dist(FILE* ktup_f,    //ktup_file
771                                  long pos1,             //position of table1
772                                  long pos2,             //position of table2
773                                  short *table1, //table to save ktups in
774                                  short *table2, //table to save ktups in
775                                  int length)    
776 {
777         const int LINE_LENGTH = 101;
778         char line[LINE_LENGTH];
779         
780         
781         char delims[] = " ";
782         char *result = NULL;
783         int code;
784
785         fseek(ktup_f, pos1, SEEK_SET);
786         fgets(line, LINE_LENGTH, ktup_f);
787         int i;
788         for (i = 0; i < length; ++i)
789         {
790                 table1[i] = 0;
791                 table2[i] = 0;
792         }
793         while (line[0] != '*')
794         {
795                 result = strtok( line, delims );
796                 code = atoi(result);
797                 table1[code] = atoi(strtok( NULL, delims));
798                 fgets(line, LINE_LENGTH, ktup_f);
799         }
800         fseek(ktup_f, pos2, SEEK_SET);
801         fgets(line, LINE_LENGTH, ktup_f);
802         while (line[0] != '*')
803         {
804                 result = strtok( line, delims );
805                 code = atoi(result);
806                 table2[code] = atoi(strtok( NULL, delims));
807                 fgets(line, LINE_LENGTH, ktup_f);
808         }
809
810         int dist = 0;
811         for (i = 0; i < length; ++i)
812         {
813                 dist += (table1[i]-table2[i])*(table1[i]-table2[i]);
814         }
815         return dist;
816 }
817
818
819
820 /**
821  * \brief calculates the euclidean ktub distance between two sequences.
822  *
823  * The difference to \a euclidean_dist is, that this uses the ktups stored in \a table1
824  * @param ktup_f, ktup_file
825  * @param pos2 position of sequence 2 in \a ktup_f
826  * @param table1 Saves the number of occurences for each ktup in sequence 1
827  * @param table2 Saves the number of occurences for each ktup in sequence 2
828  * \see euclidean_dist
829  */
830 int 
831 euclidean_dist_half(FILE* ktup_f,       //ktup_file
832                                         long pos2,              //position of table1
833                                         short *table1,  //table to save ktups in
834                                         short *table2,  //table to save ktups in
835                                         int length)
836 {
837         const int LINE_LENGTH = 101;
838         char line[LINE_LENGTH];
839         
840         
841         char delims[] = " ";
842         char *result = NULL;
843         int code;
844
845         fseek(ktup_f, pos2, SEEK_SET);
846         fgets(line, LINE_LENGTH, ktup_f);
847         int i;
848         for (i = 0; i < length; ++i)
849         {
850                 table2[i] = 0;
851         }
852         while (line[0] != '*')
853         {
854                 result = strtok( line, delims );
855                 code = atoi(result);
856                 table2[code] = atoi(strtok( NULL, delims));
857                 fgets(line, LINE_LENGTH, ktup_f);
858         }
859
860         int dist = 0;
861         for (i = 0; i < length; ++i)
862         {
863                 dist += (table1[i]-table2[i])*(table1[i]-table2[i]);
864         }
865         return dist;
866 }
867
868
869
870
871 /**
872 *       Makes an index of a file
873 */
874 int
875 make_pos_len_index_of_file(char *file_name,                     //file with sequences
876                                                    char *ktable_f,                      //file with the ktup-tables
877                                                    long **file_positions,       //array to save the positions
878                                                    int **seq_lengths,           //array to save the sequence length
879                                                    int ktup,                            //length of ktup
880                                                    char *type)                          //type of the seuqence  
881 {
882         //preparations for recoding sequence
883         int *aa;
884         int a, b, l;
885         
886         int ng = 0;
887         char **gl;
888         if ( strm (type, "DNA") || strm (type, "RNA"))
889         {
890                 gl=declare_char (5,13);
891                 sprintf ( gl[ng++], "Aa");
892                 sprintf ( gl[ng++], "Gg");
893                 sprintf ( gl[ng++], "TtUu");
894                 sprintf ( gl[ng++], "Cc");
895                 sprintf ( gl[ng++], "NnRrYyDdMmWw");
896         }
897         else
898         {
899                 gl=make_group_aa ( &ng, "mafft");
900         }
901         aa=vcalloc ( 256, sizeof (int));
902         for ( a=0; a<ng; a++)
903         {
904                 for ( b=0; b< strlen (gl[a]); b++) 
905                 {
906                         aa[(int)gl[a][b]]=a;
907                 }
908         }
909         free_char (gl, -1);
910
911         
912         int tsize=(int)pow(ng, ktup);
913         param_set->tsize = tsize;
914         param_set->ng = ng;
915         
916         int *table=vcalloc ( tsize,sizeof (int));
917
918
919         //Reading and recoding squences
920         const int LINE_LENGTH = 501;
921         int *coded_seq = vcalloc(2*LINE_LENGTH, sizeof(int));
922         int allocated_mem = 2*LINE_LENGTH;
923                         
924         (*file_positions) = vcalloc(ENLARGEMENT_PER_STEP,  sizeof(long));
925         (*seq_lengths) = vcalloc(ENLARGEMENT_PER_STEP,  sizeof(int));
926         int current_size = ENLARGEMENT_PER_STEP;
927         int current_pos = 0;
928
929         FILE *file = fopen(file_name,"r");
930
931
932         int seq_length=0;
933         char line[LINE_LENGTH];
934
935         int num_of_sequences = 0;
936         int str_len = 0;
937         int mem_for_pos = ENLARGEMENT_PER_STEP;
938         int tmp;
939         int real_len;
940         int *c_seq;
941
942         FILE *tables_f = fopen(ktable_f, "w");
943
944
945         if (file == NULL)
946         {
947                 printf("FILE NOT FOUND\n");
948                 exit(1);
949         }
950         else
951         {
952
953                 while(fgets(line, LINE_LENGTH , file)!=NULL)
954                 {
955                         if ( str_len >= allocated_mem - LINE_LENGTH)
956                         {
957                                 allocated_mem += LINE_LENGTH;
958                                 coded_seq = vrealloc(coded_seq, allocated_mem*sizeof(int));
959                         }
960                         
961                         int i;
962                         int length = strlen(line);
963                         if (line[0] == '>')
964                         {
965                                 if (num_of_sequences >0)
966                                 {
967                                         (*seq_lengths)[num_of_sequences-1] = str_len;
968 //                                      printf("len: %i\n", str_len);
969                                         c_seq = coded_seq;
970                                         makepointtable_fast(coded_seq,ktup,ng, str_len);
971                                         
972                                         (*file_positions)[num_of_sequences-1] = ftell(tables_f );
973                                         for (i=0; i < tsize; ++i)
974                                                 table[i] = 0;
975                                         makecompositiontable_fastal(tables_f, table, coded_seq,tsize );
976
977
978                                 }
979                                 str_len = 0;
980                                 ++num_of_sequences;
981
982                                 if (num_of_sequences == mem_for_pos)
983                                 {
984                                         mem_for_pos += ENLARGEMENT_PER_STEP;
985                                         (*file_positions) = vrealloc((*file_positions), mem_for_pos * sizeof(long));
986                                         (*seq_lengths) = vrealloc((*seq_lengths), mem_for_pos * sizeof(int));
987                                 }
988                         }
989                         else
990                         {
991                                 int i;
992                                 real_len = strlen(line);
993                                 if (line[real_len-1] == '\n')
994                                         --real_len;
995                                 for (i = 0; i < real_len; ++i)
996                                 {
997                                         coded_seq[str_len++] = aa[line[i]];
998                                 }
999                         }
1000                 }
1001         }
1002
1003         (*seq_lengths)[num_of_sequences-1] = str_len;
1004         c_seq = coded_seq;
1005         makepointtable_fast(coded_seq,ktup,ng, str_len);
1006         (*file_positions)[num_of_sequences] = ftell(tables_f );
1007         makecompositiontable_fastal(tables_f, table, coded_seq,tsize );
1008         fclose(file);
1009         fclose(tables_f);
1010         return num_of_sequences;
1011 }
1012 /*********************************COPYRIGHT NOTICE**********************************/
1013 /*© Centro de Regulacio Genomica */
1014 /*and */
1015 /*Cedric Notredame */
1016 /*Tue Oct 27 10:12:26 WEST 2009. */
1017 /*All rights reserved.*/
1018 /*This file is part of T-COFFEE.*/
1019 /**/
1020 /*    T-COFFEE is free software; you can redistribute it and/or modify*/
1021 /*    it under the terms of the GNU General Public License as published by*/
1022 /*    the Free Software Foundation; either version 2 of the License, or*/
1023 /*    (at your option) any later version.*/
1024 /**/
1025 /*    T-COFFEE is distributed in the hope that it will be useful,*/
1026 /*    but WITHOUT ANY WARRANTY; without even the implied warranty of*/
1027 /*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the*/
1028 /*    GNU General Public License for more details.*/
1029 /**/
1030 /*    You should have received a copy of the GNU General Public License*/
1031 /*    along with Foobar; if not, write to the Free Software*/
1032 /*    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA*/
1033 /*...............................................                                                                                      |*/
1034 /*  If you need some more information*/
1035 /*  cedric.notredame@europe.com*/
1036 /*...............................................                                                                                                                                     |*/
1037 /**/
1038 /**/
1039 /*      */
1040 /*********************************COPYRIGHT NOTICE**********************************/