9 #include "fast_tree_header.h"
10 #include "io_lib_header.h"
11 #include "util_lib_header.h"
12 #include "define_header.h"
13 #include "dp_lib_header.h"
14 //TODO: -change kick-out value
15 // -pass arrays in partTree_r
19 * \brief Source code for PartTree algorithm.
24 #define ENLARGEMENT_PER_STEP 50
29 print_fastal_tree(Tree_fastal *tree,
34 if (tree[pos].left >= num_seq)
35 print_fastal_tree(tree, tree[pos].left-num_seq, tree_file, num_seq);
36 if (tree[pos].right >= num_seq)
37 print_fastal_tree(tree, tree[pos].right-num_seq, tree_file, num_seq);
39 fprintf(tree_file, "%i %i %i\n", tree[pos].left, tree[pos].right, tree[pos].name);
46 PartTree_param *param_set;
49 //********************** UPGMA *****************************
52 * Function to write tree to file in fastal_format. Leafs in \a tree are leafs in the complete tree as well.
54 * \param tree The root node of the (sub)tree.
55 * \param param_set Parameter of PartTree.
56 * \param start_write_here Current node to write into.
57 * \return position in tree
61 tree_process_simple(NT_node tree,
62 PartTree_param *param_set,
67 // printf("T: %s\n", tree->name);
68 return atoi(tree->name);
72 Tree_fastal *tree_flat = ¶m_set->tree[start_write_here];
73 tree_flat->name = start_write_here +param_set->num_sequences;
74 if (start_write_here == param_set->pos_tree)
76 ++param_set->pos_tree;
78 start_write_here = param_set->pos_tree;
79 int left = tree_process_simple(tree->left, param_set, start_write_here);
80 start_write_here = param_set->pos_tree;
81 int right = tree_process_simple(tree->right, param_set, start_write_here);
82 tree_flat->index = NULL;
83 tree_flat->right = right;
84 tree_flat->left = left;
85 return tree_flat->name;
91 read_sequence_from_position(FILE *file, long position1, char *seq)
93 fseek(file, position1, SEEK_SET);
96 while ((fgets(line, 500, file) != NULL) && (line[0] != '>'))
99 while ((line[++l] != '\n') && (line[l] != '\0'))
100 seq[++pos] = line[l];
106 // read_sequence_from_position(FILE *file, long position1, long position2)
112 * Function to write tree to file in fastal_format. Leafs in \a tree do not need to be leafs in the complete tree.
114 * \param tree The root node of the (sub)tree.
115 * \param param_set Parameter of PartTree.
116 * \param clusters Number of sequences in each cluster.
117 * \param subgroup The sequences for each cluster.
118 * \param start_write_here Current node to write into.
119 * \return position in tree
120 * \see tree_process_simple
123 tree_process(NT_node tree,
124 PartTree_param *param_set,
127 int start_write_here)
131 int node_num = atoi(tree->name);
132 int num_in_sub = clusters[node_num+1] - clusters[node_num];
133 // printf("NUM: %i %i %i %i\n",node_num, num_in_sub, clusters[node_num+1], clusters[node_num]);
136 Tree_fastal *tree_flat = ¶m_set->tree[start_write_here];
137 tree_flat->name = start_write_here +param_set->num_sequences;
138 if (start_write_here == param_set->pos_tree)
140 ++param_set->pos_tree;
142 tree_flat->left = -1;
143 tree_flat->right = -1;
144 tree_flat->index = &subgroup[clusters[node_num]];
146 tree_flat->num_leafs = num_in_sub;
147 return tree_flat->name;
151 return(subgroup[clusters[node_num]]);
156 // printf("TREEPOS: %i\n",param_set->pos_tree);
157 Tree_fastal *tree_flat = ¶m_set->tree[start_write_here];
158 tree_flat->name = start_write_here +param_set->num_sequences;
159 if (start_write_here == param_set->pos_tree)
161 ++param_set->pos_tree;
163 start_write_here = param_set->pos_tree;
164 int left = tree_process(tree->left, param_set, clusters, subgroup, start_write_here);
165 start_write_here = param_set->pos_tree;
166 int right = tree_process(tree->right, param_set, clusters, subgroup, start_write_here);
167 tree_flat->index = NULL;
168 tree_flat->right = right;
169 tree_flat->left = left;
170 return tree_flat->name;
176 * \brief Calculates tree out of distance matrix.
178 * Calculates the upgma tree using a given distance matrix.
179 * \param mat The distance matrix.
180 * \param nseq Number of sequences.
181 * \param fname Filename for temporary storage.
182 * \param seqnames Names of the sequences.
183 * \return The calculated UPGMA Tree.
185 NT_node ** int_dist2upgma_tree_fastal (int **mat, int nseq, char *fname, char **seqnames)
190 if (upgma_node_heap (NULL))
192 printf_exit ( EXIT_FAILURE,stderr, "\nERROR: non empty heap in upgma [FATAL]");
194 NL=vcalloc (nseq, sizeof (NT_node));
196 for (a=0; a<nseq; a++)
198 NL[a]=new_declare_tree_node ();
199 upgma_node_heap (NL[a]);
200 sprintf (NL[a]->name, "%s", seqnames[a]);
204 used=vcalloc ( nseq, sizeof (int));
208 T=upgma_merge(mat, NL,used, &n, nseq);
211 vfclose (print_tree (T, "newick", vfopen (fname, "w")));
212 upgma_node_heap (NULL);
215 return read_tree (fname,&tot_node, nseq, seqnames);
224 * \brief Constructs a guide tree for multiple sequence alignment.
226 * This algorithm is an implementation of the partTree algorithm (PartTree: an algorithm to build an approximate tree from a large number of unaligned
227 * sequences. Katoh et al. 2007).
228 * \param sequence_f Filename of file with sequences.
229 * \param tree_f Filename of file where the tree will be stored.
230 * \param ktup Size of the ktups.
231 * \param subgroup Parameter for subgroupsize.
234 make_partTree(char *sequence_f,
241 long *file_positions = NULL;
242 long **tmp1 = &file_positions;
243 int *seq_lengths = NULL;
244 int **tmp2 = &seq_lengths;
245 int number_of_sequences;
246 param_set = vcalloc(1,sizeof(PartTree_param));
247 param_set->ktup = ktup;
248 param_set->subgroup = subgroup;
255 char *ktup_table = vtmpnam(NULL);
256 param_set->num_sequences = number_of_sequences = make_pos_len_index_of_file(sequence_f, ktup_table, tmp1, tmp2, ktup, is_dna);
257 tree = vcalloc(number_of_sequences-1,sizeof(Tree_fastal));
258 param_set->tree = tree;
259 param_set->ktup_positions = file_positions;
260 param_set->seq_lengths = seq_lengths;
261 param_set->threshold = 0.01;
262 param_set->ktup_table_f = fopen(ktup_table,"r");
265 partTree_r(param_set);
271 param_set->num_sequences = number_of_sequences = make_pos_len_index_of_file_retree(sequence_f, tmp1, tmp2);
272 tree = vcalloc(number_of_sequences-1,sizeof(Tree_fastal));
273 param_set->tree = tree;
274 param_set->ktup_positions = file_positions;
275 param_set->seq_lengths = seq_lengths;
276 param_set->threshold = 0.01;
277 param_set->ktup_table_f = fopen(sequence_f,"r");
279 partTree_retree(param_set);
283 FILE * tree_file = fopen(tree_f,"w");
284 print_fastal_tree(tree, 0, tree_file, number_of_sequences);
294 * \param sequence_group Sequences to filter.
295 * \param dist_mat The distance matrix.
296 * \param seed_set_cleaned ordered_seed_set.
297 * \param param_set Parameters for PartTree algorithm.
298 * \return number in the filtered set.
301 filter(int *sequence_group,
303 int *seed_set_cleaned,
304 PartTree_param *param_set)
307 int num_in_subgroup = param_set->subgroup;
308 int *seq_lengths = param_set->seq_lengths;
309 int num_in_clean = 0;
310 double threshold = param_set->threshold;
311 // printf("threshold: %f\n", threshold);
313 for (i = 0; i < num_in_subgroup; ++i)
315 if (!seed_set_cleaned[i])
317 for (j = i+1; j < num_in_subgroup; ++j)
319 min = MIN(seq_lengths[sequence_group[i]], seq_lengths[sequence_group[j]]);
320 // printf("MINIMUM: %i\n",min);
321 min = (threshold * min);
322 // printf("MINIMUM: %f\n",min);
323 if (seed_set_cleaned[j] &&(dist_mat[i][j] < min))
325 if (seq_lengths[sequence_group[i]] < seq_lengths[sequence_group[j]])
327 seed_set_cleaned[i] = 0;
331 seed_set_cleaned[j] = 0;
336 for (i = 0; i < num_in_subgroup; ++i)
338 num_in_clean += seed_set_cleaned[i];
340 int max = num_in_subgroup -1;
343 // printf("CLEAN: %i\n", num_in_clean);
344 while (i < num_in_clean)
346 if (seed_set_cleaned[i])
352 seed_set_cleaned[i] = seed_set_cleaned[max];
353 seed_set_cleaned[max] = 0;
354 tmp = sequence_group[i];
355 sequence_group[i] = sequence_group[max];
356 sequence_group[max] = tmp;
368 * \brief Function to create a tree using the PartTree algorithm.
370 * \param param_set A \a PartTree_param object containing all necessary parameters and the data.
371 * \return The node_number.
374 partTree_r(PartTree_param *param_set)
377 int num_of_tree_nodes = param_set->num_sequences-1;
380 Tree_fastal *tree = param_set->tree;
381 // int this_node = param_set->pos_tree;
384 int tsize = param_set->tsize;
388 short *table1 = vcalloc(tsize, sizeof(short));
389 short *table2 = vcalloc(tsize, sizeof(short));
390 char **names = declare_char(param_set->subgroup, 8);
391 int **dist_mat = declare_int(param_set->subgroup, param_set->subgroup);
392 int **dist_mat2 = declare_int(param_set->subgroup, param_set->subgroup);
393 char * file_name_tmp = vtmpnam(NULL);
394 int *seed_set_cleaned = vcalloc(param_set->subgroup, sizeof(int));
395 FILE *table_f = param_set->ktup_table_f;
396 long *file_positions = param_set->ktup_positions;
397 int max_n_group = param_set->subgroup;
398 int num_in_subgroup = param_set->subgroup;
399 int *seq_lengths = param_set->seq_lengths;
400 int *clusters = vcalloc(param_set->subgroup+1, sizeof(int));
401 int *min_dist = vcalloc(param_set->num_sequences, sizeof(int));
402 int *belongs_to = vcalloc(param_set->num_sequences, sizeof(int));
411 tree[0].index = vcalloc(param_set->num_sequences,sizeof(int));
412 int *index = tree[0].index;
413 for (i = 0; i< param_set->num_sequences; ++i)
415 tree[0].name = param_set->pos_tree +param_set->num_sequences;
417 tree[0].num_leafs = param_set->num_sequences;
418 int *sequence_group2 = vcalloc(param_set->num_sequences,sizeof(int));
420 Tree_fastal *current_node;
421 for (loop_tree_node = 0; loop_tree_node < num_of_tree_nodes; ++loop_tree_node)
423 // printf("ROUND: %i\n", loop_tree_node);
424 current_node = &tree[loop_tree_node];
425 index= current_node->index;
426 if (current_node->index == NULL)
430 int num_sequences = current_node->num_leafs;
432 //if number of sequences in this group smaller than number subgoup size: make tree, finisch
433 if (num_sequences <= max_n_group)
435 dist_mat = make_distance_matrix(table_f, file_positions, index, num_sequences, dist_mat);
436 for (i = 0; i < num_sequences; ++i)
438 sprintf(names[i],"%i", current_node->index[i]);
440 NT_node **tree= (int_dist2upgma_tree_fastal (dist_mat, num_sequences, file_name_tmp , names));
441 tree_process_simple(tree[0][0], param_set,loop_tree_node);
446 for (i = 0; i < num_in_subgroup; ++i)
448 seed_set_cleaned[i] = 1;
451 //finde longest sequence and put into the first field
453 int index_longest = 0;
454 int length_of_longest = 0;
456 for(i = 0; i < num_sequences; ++i)
458 if (seq_lengths[index[i]] > length_of_longest)
461 length_of_longest = seq_lengths[index[i]];
464 int tmp = index[index_longest];
465 index[index_longest] = index[0];
468 //distance of longest to rest
470 int min= euclidean_dist(table_f, file_positions[index[0]], file_positions[index[1]], table1, table2, param_set->tsize);
471 for (i = 2; i < num_sequences; ++i)
473 tmp = euclidean_dist_half(table_f, file_positions[index[i]], table1, table2, param_set->tsize);
481 //get the new seed_set in the first n spaces
483 index[1] = index[seq_index];
484 index[seq_index] = tmp;
486 num_in_subgroup = param_set->subgroup;
489 for (i = 2; i < num_in_subgroup; ++i)
491 r = i + rand() / ( RAND_MAX / ( num_sequences-i) + 1 );
492 // printf("RANDOM: %i\n",r);
499 dist_mat = make_distance_matrix(table_f, file_positions, index, param_set->subgroup, dist_mat);
501 //Filter out sequences that are to similar & reorder
503 NT_node **upgma_tree;
506 int num_in_clean = filter(index, dist_mat, seed_set_cleaned, param_set);
509 if (num_in_clean ==1)
512 seed_set_cleaned[1] = 1;
517 for (i = 0; i < num_in_subgroup; ++i)
519 if (seed_set_cleaned[i])
522 for (j = i+1; j < num_in_subgroup; ++j)
524 if (seed_set_cleaned[j])
526 dist_mat2[row][col] = dist_mat2[col][row] = dist_mat[i][j];
533 for (i = 0; i < num_in_clean; ++i)
535 sprintf(names[i],"%i",i);
537 upgma_tree= (int_dist2upgma_tree_fastal (dist_mat2, num_in_clean, file_name_tmp , names));
541 //calculate distances from n' to N
542 get_table(table1, table_f, file_positions[index[0]]);
543 for (j = num_in_clean; j < num_sequences; ++j)
545 min_dist[j] = euclidean_dist_half(table_f, file_positions[index[j]], table1, table2, param_set->tsize);
548 for(i = 1; i < num_in_clean; ++i)
550 get_table(table1, table_f, file_positions[index[i]]);
552 for (j = num_in_clean; j < num_sequences; ++j)
554 tmp = euclidean_dist_half(table_f, file_positions[index[j]], table1, table2, param_set->tsize);
555 if (tmp < min_dist[j])
563 //how_many sequences has each cluster
564 for (j = 0; j <= num_in_subgroup; ++j)
568 for (j = 0; j < num_sequences; ++j)
570 ++clusters[belongs_to[j]];
572 // for (j = 0; j <= num_in_subgroup; ++j)
574 // printf("CL: %i ",clusters[j]);
577 for(i = 1; i < num_in_clean; ++i)
579 clusters[i] += clusters[i-1];
581 clusters[num_in_clean] = clusters[num_in_clean-1];
583 for (i = 0; i < num_sequences; ++i)
585 sequence_group2[--clusters[belongs_to[i]]] = index[i];
588 for (i = 0; i < num_sequences; ++i)
590 index[i] = sequence_group2[i];
594 for (i = 0; i < num_in_clean; ++i)
596 sprintf(names[i],"%i",i);
598 tree_process(upgma_tree[0][0], param_set, clusters, index, loop_tree_node);
599 NT_node tmp_tree = upgma_tree[3][0];
600 vfree(upgma_tree[0]);
601 vfree(upgma_tree[1]);
602 vfree(upgma_tree[2]);
603 vfree(upgma_tree[3]);
615 * \brief Makes the distance matrix between all sequences.
617 * \param table_file File with the ktup tables
618 * \param file_positions Index of positions where the tabels are stored in \a table_file
619 * \param sequence_group the group of sequences
620 * \param number number of sequences
621 * \param dist_mat distance matrix
622 * \return the distance matrix. (same as \a dist_mat )
625 make_distance_matrix(FILE *table_f,
626 long *file_positions,
631 static short *table1 = NULL;
632 static short *table2;
633 int tsize = param_set->tsize;
636 table1 = vcalloc(tsize, sizeof(short));
637 table2 = vcalloc(tsize, sizeof(short));
639 int i, j, num = number-1;
640 for (i = 0; i < num; ++i)
643 dist_mat[i][j] = dist_mat[j][i]= euclidean_dist(table_f, file_positions[sequence_group[i]], file_positions[sequence_group[j]], table1, table2, tsize);
645 for (; j < number; ++j)
647 dist_mat[i][j] = dist_mat[j][i] = euclidean_dist_half(table_f, file_positions[sequence_group[j]], table1, table2, tsize);
655 make_distance_matrix_sim(FILE *aln_f,
656 long *file_positions,
661 static char *seq1 = NULL;
668 fseek(aln_f, file_positions[0], SEEK_SET);
669 fgets(line, 500, aln_f);
670 while (line[0] != '>')
673 while ((line[++i] != '\n') && (line[i] != '\0'));
675 fgets(line, 500, aln_f);
677 seq1 = vcalloc(length+1, sizeof(short));
678 seq2 = vcalloc(length+1, sizeof(short));
682 int i, j, num = number-1, pos;
683 for (i = 0; i < num; ++i)
686 fseek(aln_f, file_positions[sequence_group[i]], SEEK_SET);
690 while ((fgets(line, 500, aln_f) != NULL) && (line[0] != '>'))
693 while ((line[++l] != '\n') && (line[l] != '\0'))
694 seq1[++pos] = line[l];
698 for (j = i+1; j < number; ++j)
701 fseek(aln_f, file_positions[sequence_group[j]], SEEK_SET);
702 while ((fgets(line, 500, aln_f) != NULL) && (line[0] != '>'))
705 while ((line[++l] != '\n') && (line[l] != '\0'))
706 seq2[++pos] = line[l];
709 dist_mat[i][j] = dist_mat[j][i] = 100 - fast_aln2sim_mat2(seq1, seq2);
717 * Replaces the coded sequence with coded tuples
719 * \param coded_seq The coded sequence which will be replaced by the tuple number
720 * \param ktup Size of the ktup
721 * \param ng Coded alphabet size
722 * \param length Lengths of coded sequence
725 makepointtable_fast(int *coded_seq, //sequence
726 int ktup, //ktup size
728 int length) //length of coded_seq
736 prod=vcalloc ( ktup, sizeof (int));
737 for ( a=0; a<ktup; a++)
739 prod[ktup-a-1]=(int)pow(ng,a);
745 for (point=0,a=0; a<ktup; a++)
747 point+= *coded_seq++ *prod[a];
753 point -= *p * prod[0];
766 * \brief Calculates the number of occurences for each ktup.
768 * \param tables_f File to save the tables in.
769 * \param table Table to save the result in.
770 * \param pointt Array with all ktups listed one after another. Has to end with END_ARRAY.
771 * \param length length of \a table
774 makecompositiontable_fastal(FILE* tables_f, //File to save the tables in
775 int *table, //table to calculate in
776 int *pointt, //ktups array
777 int length) //length of the table
780 while(*pointt != END_ARRAY )
785 for (point = 0; point < length; ++point)
787 if (table[point] > 0)
788 fprintf(tables_f, "%i %i\n", point, table[point]);
790 fprintf(tables_f, "*\n");
796 make_fast_tree(char *file_name,
801 make_partTree(file_name, "TREE_OUT", ktup, n, 1, 0);
808 * \brief Reads ktup_table from file
810 * \param table Table to save the file content in.
811 * \param tables_f File in which the tables are stored.
812 * \param index Position of the table in \a tables_f
815 get_table(short *table, //Table to save the readings in
816 FILE* tables_f, //File with tables
817 long index) //index positin of ktup-tables
819 fseek(tables_f, index, SEEK_SET);
820 const int LINE_LENGTH = 101;
821 char line[LINE_LENGTH];
822 fgets(line, LINE_LENGTH, tables_f);
828 while (line[0] != '*')
830 result = strtok( line, delims );
832 table[code] = atoi(strtok( NULL, delims));
833 fgets(line, LINE_LENGTH, tables_f);
840 * \brief calculates the euclidean ktub distance between two sequences
842 * @param ktup_f, ktup_file
843 * @param pos1 position of sequence 1 in \a ktup_f
844 * @param pos2 position of sequence 2 in \a ktup_f
845 * @param table1 Saves the number of occurences for each ktup in sequence 1
846 * @param table2 Saves the number of occurences for each ktup in sequence 2
849 euclidean_dist(FILE* ktup_f, //ktup_file
850 long pos1, //position of table1
851 long pos2, //position of table2
852 short *table1, //table to save ktups in
853 short *table2, //table to save ktups in
856 const int LINE_LENGTH = 101;
857 char line[LINE_LENGTH];
864 fseek(ktup_f, pos1, SEEK_SET);
865 fgets(line, LINE_LENGTH, ktup_f);
867 for (i = 0; i < length; ++i)
872 while (line[0] != '*')
874 result = strtok( line, delims );
876 table1[code] = atoi(strtok( NULL, delims));
877 fgets(line, LINE_LENGTH, ktup_f);
879 fseek(ktup_f, pos2, SEEK_SET);
880 fgets(line, LINE_LENGTH, ktup_f);
881 while (line[0] != '*')
883 result = strtok( line, delims );
885 table2[code] = atoi(strtok( NULL, delims));
886 fgets(line, LINE_LENGTH, ktup_f);
890 for (i = 0; i < length; ++i)
892 dist += (table1[i]-table2[i])*(table1[i]-table2[i]);
900 * \brief calculates the euclidean ktub distance between two sequences.
902 * The difference to \a euclidean_dist is, that this uses the ktups stored in \a table1
903 * @param ktup_f, ktup_file
904 * @param pos2 position of sequence 2 in \a ktup_f
905 * @param table1 Saves the number of occurences for each ktup in sequence 1
906 * @param table2 Saves the number of occurences for each ktup in sequence 2
907 * \see euclidean_dist
910 euclidean_dist_half(FILE* ktup_f, //ktup_file
911 long pos2, //position of table1
912 short *table1, //table to save ktups in
913 short *table2, //table to save ktups in
916 const int LINE_LENGTH = 101;
917 char line[LINE_LENGTH];
924 fseek(ktup_f, pos2, SEEK_SET);
925 fgets(line, LINE_LENGTH, ktup_f);
927 for (i = 0; i < length; ++i)
931 while (line[0] != '*')
933 result = strtok( line, delims );
935 table2[code] = atoi(strtok( NULL, delims));
936 fgets(line, LINE_LENGTH, ktup_f);
940 for (i = 0; i < length; ++i)
942 dist += (table1[i]-table2[i])*(table1[i]-table2[i]);
951 * Makes an index of a file
954 make_pos_len_index_of_file(char *file_name, //file with sequences
955 char *ktable_f, //file with the ktup-tables
956 long **file_positions, //array to save the positions
957 int **seq_lengths, //array to save the sequence length
958 int ktup, //length of ktup
959 int is_dna) //type of the seuqence
961 //preparations for recoding sequence
969 gl=declare_char (5,13);
970 sprintf ( gl[ng++], "Aa");
971 sprintf ( gl[ng++], "Gg");
972 sprintf ( gl[ng++], "TtUu");
973 sprintf ( gl[ng++], "Cc");
974 sprintf ( gl[ng++], "NnRrYyDdMmWw");
978 gl=make_group_aa ( &ng, "mafft");
980 aa=vcalloc ( 256, sizeof (int));
981 for ( a=0; a<ng; a++)
983 for ( b=0; b< strlen (gl[a]); b++)
991 int tsize=(int)pow(ng, ktup);
992 param_set->tsize = tsize;
995 int *table=vcalloc ( tsize,sizeof (int));
998 //Reading and recoding squences
999 const int LINE_LENGTH = 501;
1000 int *coded_seq = vcalloc(2*LINE_LENGTH, sizeof(int));
1001 int allocated_mem = 2*LINE_LENGTH;
1003 (*file_positions) = vcalloc(ENLARGEMENT_PER_STEP, sizeof(long));
1004 (*seq_lengths) = vcalloc(ENLARGEMENT_PER_STEP, sizeof(int));
1007 FILE *file = fopen(file_name,"r");
1009 char line[LINE_LENGTH];
1011 int num_of_sequences = 0;
1013 int mem_for_pos = ENLARGEMENT_PER_STEP;
1018 FILE *tables_f = fopen(ktable_f, "w");
1023 printf("FILE NOT FOUND\n");
1029 while(fgets(line, LINE_LENGTH , file)!=NULL)
1031 if ( str_len >= allocated_mem - LINE_LENGTH)
1033 allocated_mem += LINE_LENGTH;
1034 coded_seq = vrealloc(coded_seq, allocated_mem*sizeof(int));
1041 if (num_of_sequences >0)
1043 (*seq_lengths)[num_of_sequences-1] = str_len;
1044 // printf("len: %i\n", str_len);
1046 makepointtable_fast(coded_seq,ktup,ng, str_len);
1048 (*file_positions)[num_of_sequences-1] = ftell(tables_f );
1049 for (i=0; i < tsize; ++i)
1051 makecompositiontable_fastal(tables_f, table, coded_seq,tsize );
1058 if (num_of_sequences == mem_for_pos)
1060 mem_for_pos += ENLARGEMENT_PER_STEP;
1061 (*file_positions) = vrealloc((*file_positions), mem_for_pos * sizeof(long));
1062 (*seq_lengths) = vrealloc((*seq_lengths), mem_for_pos * sizeof(int));
1068 real_len = strlen(line);
1069 if (line[real_len-1] == '\n')
1071 for (i = 0; i < real_len; ++i)
1073 coded_seq[str_len++] = aa[(short)line[i]];
1079 (*seq_lengths)[num_of_sequences-1] = str_len;
1081 makepointtable_fast(coded_seq,ktup,ng, str_len);
1082 (*file_positions)[num_of_sequences] = ftell(tables_f );
1083 makecompositiontable_fastal(tables_f, table, coded_seq,tsize );
1086 return num_of_sequences;
1091 * Makes an index of a file
1094 make_pos_len_index_of_file_retree(char *file_name, //file with sequences
1095 long **file_positions, //array to save the positions
1096 int **seq_lengths) //array to save the sequence length
1099 // printf("HALLO\n");
1101 const int LINE_LENGTH = 501;
1102 (*file_positions) = vcalloc(ENLARGEMENT_PER_STEP, sizeof(long));
1103 (*seq_lengths) = vcalloc(ENLARGEMENT_PER_STEP, sizeof(int));
1106 FILE *file = fopen(file_name,"r");
1108 char line[LINE_LENGTH];
1110 int num_of_sequences = 0;
1111 int mem_for_pos = ENLARGEMENT_PER_STEP;
1112 // fgets(line, LINE_LENGTH , file)
1118 printf("FILE NOT FOUND\n");
1124 while(fgets(line, LINE_LENGTH , file)!=NULL)
1126 // line[LINE_LENGTH -2] = '\n';
1129 (*file_positions)[num_of_sequences] = ftell(file);
1130 if (num_of_sequences >0)
1132 (*seq_lengths)[num_of_sequences-1] = seq_len;
1137 if (num_of_sequences == mem_for_pos)
1139 mem_for_pos += ENLARGEMENT_PER_STEP;
1140 (*file_positions) = vrealloc((*file_positions), mem_for_pos * sizeof(long));
1141 (*seq_lengths) = vrealloc((*seq_lengths), mem_for_pos * sizeof(int));
1147 while ((line[++i] != '\n') && (line[i] != '\0'))
1151 // printf("A: %c\n", line[i]);
1158 (*seq_lengths)[num_of_sequences-1] = seq_len;
1159 // printf("%i %li\n", (*seq_lengths)[0], (*file_positions)[0]);
1160 // printf("%i %li\n", (*seq_lengths)[1], (*file_positions)[1]);
1162 return num_of_sequences;
1167 int logid_score2 ( int sim, int len)
1171 if ( len==0)return (int)(0.33*(float)MAXID);
1173 score=(float)sim/(float)len;
1174 if (score>0.9) score=1.0;
1175 else score=-log10 (1.0-score);
1177 score=(score*MAXID);
1182 int fast_aln2sim_mat2 (char *seq1, char *seq2)
1189 // printf("SEQ: %s %s\n", seq1, seq2);
1190 while (seq1[++c] != '\0')
1192 r1=tolower (seq1[c]);
1193 r2=tolower (seq2[c]);
1194 if ((r1 == '-') && (r2 == '-')) continue;
1200 simm = logid_score2 ( sim, len);
1207 * \brief Function to create a tree using the PartTree algorithm.
1209 * \param param_set A \a PartTree_param object containing all necessary parameters and the data.
1210 * \return The node_number.
1213 partTree_retree(PartTree_param *param_set)
1215 int num_of_tree_nodes = param_set->num_sequences-1;
1218 Tree_fastal *tree = param_set->tree;
1219 // int this_node = param_set->pos_tree;
1222 // int tsize = param_set->tsize;
1226 // short *table1 = vcalloc(tsize, sizeof(short));
1227 // short *table2 = vcalloc(tsize, sizeof(short));
1228 char **names = declare_char(param_set->subgroup, 8);
1229 int **dist_mat = declare_int(param_set->subgroup, param_set->subgroup);
1230 int **dist_mat2 = declare_int(param_set->subgroup, param_set->subgroup);
1231 char * file_name_tmp = vtmpnam(NULL);
1232 int *seed_set_cleaned = vcalloc(param_set->subgroup, sizeof(int));
1233 FILE *aln_f = param_set->ktup_table_f;
1234 long *file_positions = param_set->ktup_positions;
1235 int max_n_group = param_set->subgroup;
1236 int num_in_subgroup = param_set->subgroup;
1237 int *seq_lengths = param_set->seq_lengths;
1238 int *clusters = vcalloc(param_set->subgroup+1, sizeof(int));
1239 int *min_dist = vcalloc(param_set->num_sequences, sizeof(int));
1240 int *belongs_to = vcalloc(param_set->num_sequences, sizeof(int));
1243 fseek(aln_f, file_positions[0], SEEK_SET);
1245 while ((fgets(line, 500, aln_f) != NULL) && (line[0] != '>'))
1248 while ((line[++i] != '\n') && (line[i] != '\0'));
1251 char *seq1 = vcalloc(aln_length, sizeof(char));
1252 char *seq2 = vcalloc(aln_length, sizeof(char));
1254 //Prepare first node
1256 tree[0].index = vcalloc(param_set->num_sequences,sizeof(int));
1257 int *index = tree[0].index;
1258 for (i = 0; i< param_set->num_sequences; ++i)
1260 tree[0].name = param_set->pos_tree +param_set->num_sequences;
1262 tree[0].num_leafs = param_set->num_sequences;
1263 int *sequence_group2 = vcalloc(param_set->num_sequences,sizeof(int));
1265 Tree_fastal *current_node;
1266 for (loop_tree_node = 0; loop_tree_node < num_of_tree_nodes; ++loop_tree_node)
1268 // printf("ROUND: %i\n", loop_tree_node);
1269 current_node = &tree[loop_tree_node];
1270 index= current_node->index;
1271 if (current_node->index == NULL)
1275 int num_sequences = current_node->num_leafs;
1277 //if number of sequences in this group smaller than number subgoup size: make tree, finisch
1278 if (num_sequences <= max_n_group)
1280 dist_mat = make_distance_matrix_sim(aln_f, file_positions, index, num_sequences, dist_mat);
1281 for (i = 0; i < num_sequences; ++i)
1283 sprintf(names[i],"%i", current_node->index[i]);
1285 NT_node **tree= (int_dist2upgma_tree_fastal (dist_mat, num_sequences, file_name_tmp , names));
1286 tree_process_simple(tree[0][0], param_set,loop_tree_node);
1291 for (i = 0; i < num_in_subgroup; ++i)
1293 seed_set_cleaned[i] = 1;
1296 //finde longest sequence and put into the first field
1298 int index_longest = 0;
1299 int length_of_longest = 0;
1301 for(i = 0; i < num_sequences; ++i)
1303 if (seq_lengths[index[i]] > length_of_longest)
1306 length_of_longest = seq_lengths[index[i]];
1309 int tmp = index[index_longest];
1310 index[index_longest] = index[0];
1313 //distance of longest to rest
1315 read_sequence_from_position(aln_f, file_positions[index[0]], seq1);
1319 for (i = 1; i < num_sequences; ++i)
1321 read_sequence_from_position(aln_f, file_positions[index[1]], seq2);
1322 tmp = 100 - fast_aln2sim_mat2(seq1, seq2);
1330 //get the new seed_set in the first n spaces
1332 index[1] = index[seq_index];
1333 index[seq_index] = tmp;
1335 num_in_subgroup = param_set->subgroup;
1338 for (i = 2; i < num_in_subgroup; ++i)
1340 r = i + rand() / ( RAND_MAX / ( num_sequences-i) + 1 );
1342 index[r] = index[i];
1347 dist_mat = make_distance_matrix_sim(aln_f, file_positions, index, param_set->subgroup, dist_mat);
1349 // //Filter out sequences that are to similar & reorder
1351 NT_node **upgma_tree;
1354 int num_in_clean = filter(index, dist_mat, seed_set_cleaned, param_set);
1357 if (num_in_clean ==1)
1360 seed_set_cleaned[1] = 1;
1365 for (i = 0; i < num_in_subgroup; ++i)
1367 if (seed_set_cleaned[i])
1370 for (j = i+1; j < num_in_subgroup; ++j)
1372 if (seed_set_cleaned[j])
1374 dist_mat2[row][col] = dist_mat2[col][row] = dist_mat[i][j];
1381 for (i = 0; i < num_in_clean; ++i)
1383 sprintf(names[i],"%i",i);
1385 upgma_tree= (int_dist2upgma_tree_fastal (dist_mat2, num_in_clean, file_name_tmp , names));
1389 // //calculate distances from n' to N
1390 read_sequence_from_position(aln_f, file_positions[index[0]], seq1);
1391 // get_table(table1, table_f, file_positions[index[0]]);
1392 for (j = num_in_clean; j < num_sequences; ++j)
1394 read_sequence_from_position(aln_f, file_positions[index[j]], seq2);
1395 min_dist[j] = 100 - fast_aln2sim_mat2(seq1, seq2);
1396 // min_dist[j] = euclidean_dist_half(table_f, file_positions[index[j]], table1, table2, param_set->tsize);
1399 for(i = 1; i < num_in_clean; ++i)
1401 read_sequence_from_position(aln_f, file_positions[index[0]], seq1);
1402 // get_table(table1, table_f, file_positions[index[i]]);
1404 for (j = num_in_clean; j < num_sequences; ++j)
1406 read_sequence_from_position(aln_f, file_positions[index[j]], seq2);
1407 tmp = 100 - fast_aln2sim_mat2(seq1, seq2);
1408 // tmp = euclidean_dist_half(table_f, file_positions[index[j]], table1, table2, param_set->tsize);
1409 if (tmp < min_dist[j])
1417 //how_many sequences has each cluster
1418 for (j = 0; j <= num_in_subgroup; ++j)
1422 for (j = 0; j < num_sequences; ++j)
1424 ++clusters[belongs_to[j]];
1426 // for (j = 0; j <= num_in_subgroup; ++j)
1428 // printf("CL: %i ",clusters[j]);
1431 for(i = 1; i < num_in_clean; ++i)
1433 clusters[i] += clusters[i-1];
1435 clusters[num_in_clean] = clusters[num_in_clean-1];
1437 for (i = 0; i < num_sequences; ++i)
1439 sequence_group2[--clusters[belongs_to[i]]] = index[i];
1442 for (i = 0; i < num_sequences; ++i)
1444 index[i] = sequence_group2[i];
1448 for (i = 0; i < num_in_clean; ++i)
1450 sprintf(names[i],"%i",i);
1452 tree_process(upgma_tree[0][0], param_set, clusters, index, loop_tree_node);
1453 NT_node tmp_tree = upgma_tree[3][0];
1454 vfree(upgma_tree[0]);
1455 vfree(upgma_tree[1]);
1456 vfree(upgma_tree[2]);
1457 vfree(upgma_tree[3]);
1459 free_tree(tmp_tree);
1467 /******************************COPYRIGHT NOTICE*******************************/
1468 /*© Centro de Regulacio Genomica */
1470 /*Cedric Notredame */
1471 /*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
1472 /*All rights reserved.*/
1473 /*This file is part of T-COFFEE.*/
1475 /* T-COFFEE is free software; you can redistribute it and/or modify*/
1476 /* it under the terms of the GNU General Public License as published by*/
1477 /* the Free Software Foundation; either version 2 of the License, or*/
1478 /* (at your option) any later version.*/
1480 /* T-COFFEE is distributed in the hope that it will be useful,*/
1481 /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
1482 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
1483 /* GNU General Public License for more details.*/
1485 /* You should have received a copy of the GNU General Public License*/
1486 /* along with Foobar; if not, write to the Free Software*/
1487 /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
1488 /*............................................... |*/
1489 /* If you need some more information*/
1490 /* cedric.notredame@europe.com*/
1491 /*............................................... |*/
1495 /******************************COPYRIGHT NOTICE*******************************/