+++ /dev/null
-#include "stdio.h"
-#include "stdlib.h"
-#include "math.h"
-#include "ctype.h"
-
-
-#include "io_lib_header.h"
-#include "util_lib_header.h"
-// #include "define_header.h"
-// #include "dp_lib_header.h"
-// #include "fastal_lib_header.h"
-
-
-
-#include "fast_tree_header.h"
-
-/*!
- * \file tree.c
- * \brief Source code for the fastal tree algorithm
- */
-
-/*
-
-
-int
-main(int argc, char** argv)
-{
- int alphabet_size = 5;
- int aa[256];
- if (alphabet_size == 5)
- {
- aa['A'] = 0;
- aa['B'] = 1;
- aa['C'] = 1;
- aa['D'] = 0;
- aa['G'] = 2;
- aa['H'] = 0;
- aa['K'] = 3;
- aa['M'] = 1;
- aa['N'] = 0;
- aa['R'] = 0;
- aa['S'] = 1;
- aa['T'] = 3;
- aa['U'] = 4;
- aa['W'] = 3;
- aa['Y'] = 1;
- }
- else
- {
- aa['A'] = 0;
- aa['B'] = 1;
- aa['C'] = 2;
- aa['D'] = 3;
- aa['E'] = 4;
- aa['F'] = 5;
- aa['G'] = 6;
- aa['H'] = 7;
- aa['I'] = 8;
- aa['J'] = 9;
- aa['K'] = 10;
- aa['L'] = 11;
- aa['M'] = 12;
- aa['N'] = 13;
- aa['P'] = 14;
- aa['Q'] = 15;
- aa['R'] = 16;
- aa['S'] = 17;
- aa['T'] = 18;
- aa['V'] = 19;
- aa['W'] = 20;
- aa['X'] = 21;
- aa['Y'] = 22;
- aa['Z'] = 23;
- }
- compute_oligomer_distance_tree(argv[1], &aa[0], argv[2], 5, 2, alphabet_size);
- return 0;
-}*/
-
-
-void
-compute_oligomer_distance_tree(char *seq_file_name, int* char2value, char *tree_file_name, int max_dist, int word_length, int alphabet_size)
-{
- int i;
- int *compare = vcalloc(1,sizeof(int));
- int *num_seq = vcalloc(1,sizeof(int));
-
- int num_features = (int)pow(alphabet_size,word_length);
-
- num_features *= num_features * (max_dist+1);
-
-
- Cluster_info *matrix = feature_extract(seq_file_name, max_dist, word_length, char2value, alphabet_size, compare, num_seq, num_features);
-
- int *node = vcalloc(1, sizeof(int));
- *node = *num_seq;
-
- FILE *tree_f = fopen(tree_file_name,"w");
- double *mean = vcalloc(num_features, sizeof(double));
- double *variance = vcalloc(num_features, sizeof(double));
- cluster_recursive(0, *num_seq-1, matrix, num_features, mean, variance, compare, tree_f, node);
-
- for (i = 0; i < *num_seq; ++i)
- {
-// printf("ERG: %i\n", i);
- vfree(matrix[i].features);
- }
- vfree(matrix);
- fclose (tree_f);
- vfree(mean);
- vfree(variance);
- vfree(compare);
- vfree(node);
- vfree(num_seq);
-}
-
-
-
-/**
- * Recodes a given sequence into a sequence of it's k-mers.
- *
- * \param seq The sequence to encode.
- * \param seq_length The length of \a seq.
- * \param char2value Array giving each character in seq a unique value.
- * \param alphabet_size The size of the alphabet.
- * \param word_length The length of the k-mers to use.
- *
- * \return The recodes sequence.
- */
-int*
-recode_sequence(char * seq, int seq_length, int *char2value, int alphabet_size, int word_length)
-{
-
- int *recoded = vcalloc(seq_length - word_length + 1, sizeof(int));
- int i;
-
- if (word_length == 1)
- {
- for (i = 0; i < seq_length; ++i)
- {
- recoded[i] = char2value[(short)seq[i]];
- }
- }
- else
- {
- int *prod=vcalloc (word_length, sizeof(int));
- for ( i=0; i<word_length; ++i)
- {
- prod[word_length-i-1]=(int)pow(alphabet_size,i);
- }
-
- int index = 0;
- for (i = 0; i < word_length; ++i)
- {
- index += char2value[(short)seq[i]] * prod[i];
- }
- recoded[0] = index;
- int z = 0;
- for (; i < seq_length; ++i)
- {
- index -= char2value[(short)seq[z]] * prod[0];
- index *= alphabet_size;
- index += char2value[(short)seq[i]];
- recoded[++z] = index;
- }
- }
-
- return recoded;
-}
-
-
-/**
- * Extracts the features from a given sequence.
- *
- * \param recoded_seq The recoded sequence, consisting of k-mer codes.
- * \param seq_length The length of \a recoded_seq.
- * \param k_tup Length of the k-mers.
- * \param max_coding The maximal number used for coding the alphabet.
- * \param max_dist The maximal distance to use between to k-mers.
- *
- * \return The feature vector of the sequence.
- */
-double *
-get_features(int *recoded_seq, int seq_length, int k_tup, int max_coding, int max_dist, int num_features)
-{
-
-
- int vector_length = num_features;//max_coding * max_coding*(max_dist+1);
-
- double *feature_vector = vcalloc(vector_length, sizeof(double));
- int i;
- for (i = 0; i < vector_length; ++i)
- {
- feature_vector[i] = 0;
- }
- int j, max_length, indJ, indK, indDist;
- for (i = 0; i < seq_length; ++i)
- {
- if (seq_length <= i + max_dist)
- max_length = seq_length-1;
- else
- max_length = i + max_dist;
- for (j = i; j <= max_length; ++j)
- {
-
- indJ = max_coding * (max_dist+1) * recoded_seq[i];
- indK = (max_dist+1) * recoded_seq[j];
- indDist = j - i;
- ++feature_vector[indJ + indK + indDist];
- }
- }
-
- double normalize = 0;
- for (i = 0; i < vector_length; ++i)
- {
- normalize += (feature_vector[i] * feature_vector[i]);
- }
- normalize = sqrt(normalize);
- for (i = 0; i < vector_length; ++i)
- {
- feature_vector[i] /= normalize;
- }
-
- return feature_vector;
-}
-
-
-/**
- * Calculates the feature values for all sequences in a file.
- *
- * The File has to have fasta format.
- * \param seq_file_name The name of the file (in fasta format).
- * \param max_dist The maximal distance to be considered.
- * \param k_tup Size of the k_tup.
- * \param alphabet_size The size of the alphabet.
- *
- * \return The feature values for every sequence in a Cluster_info object.
- */
-Cluster_info *
-feature_extract(char *seq_file_name, int max_dist, int k_tup, int *char2value, int alphabet_size, int * elem_2_compare, int *num_seq_p, int num_features)
-{
- char line[500];
- FILE *tree_f = fopen(seq_file_name,"r");
-// fgets(line, 500, tree_f);
- int num_seq = -1;
- int size = 1000;
- int matrix_size = 1000;
- const int STEP = 1000;
- int seq_pos = -1;
- char *seq = vcalloc(size, sizeof(char));
- Cluster_info *matrix = vcalloc(matrix_size, sizeof(Cluster_info));
- char *c_p;
- int max_coding = pow(alphabet_size,k_tup);
- while (fgets(line, 500, tree_f) != NULL)
- {
- if (line[0] == '>')
- {
- if (num_seq >= 0)
- {
- seq[++seq_pos] = '\0';
- if (num_seq > matrix_size -2)
- {
- matrix_size += STEP;
- matrix = vrealloc(matrix, matrix_size * sizeof(Cluster_info));
- }
- int *recoded_seq = recode_sequence(seq, seq_pos, char2value, alphabet_size, k_tup);
- matrix[num_seq].seq_number = num_seq;
- matrix[num_seq].elem_2_compare = elem_2_compare;
- matrix[num_seq].features=get_features(recoded_seq, seq_pos- k_tup +1, k_tup, max_coding, max_dist, num_features);
- vfree(recoded_seq);
- seq_pos = -1;
- }
- ++num_seq;
- }
- else
- {
- if (size - seq_pos < 500)
- {
- size += STEP;
- seq = vrealloc(seq, size*sizeof(char));
- }
- c_p = &line[0];
- while ((*c_p != '\n') && (*c_p != '\0'))
- {
- seq[++seq_pos] = toupper(*(c_p));
- ++c_p;
- }
- }
- }
-
- fclose(tree_f);
- seq[++seq_pos] = '\0';
- int *recoded_seq = recode_sequence(seq, seq_pos, char2value, alphabet_size, k_tup);
- matrix[num_seq].seq_number = num_seq;
- matrix[num_seq].elem_2_compare = elem_2_compare;
- matrix[num_seq].features=get_features(recoded_seq, seq_pos- k_tup +1, k_tup, max_coding, max_dist, num_features);
- *num_seq_p = ++num_seq;
- vfree(seq);
- vfree(recoded_seq);
- return matrix;
-}
-
-
-
-/**
- * Compare function for qsort given an array of Cluster_info.
- *
- * The field used for sorting is done according to the element determined in the Cluster_info object.
- * \param a void pointer to an object of Cluster_info.
- * \param b void pointer to an object of Cluster_info.
- *
- * \return Value showing wheather \a a is bigger, smaller or equal to \a b.
- */
-int
-cluster_compare (const void * a, const void * b)
-{
- Cluster_info *tmp1 = (Cluster_info*)a;
- Cluster_info *tmp2 = (Cluster_info*)b;
- int elem = *(tmp1->elem_2_compare);
- if ((tmp1->features[elem]) > (tmp2->features[elem]))
- return 1;
- else if ((tmp1->features[elem]) < (tmp2->features[elem]))
- return -1;
- else
- return 0;
-}
-
-
-
-/**
- * Recursive function to build a tree according to a given feature matrix.
- *
- * \param start The begin of this set in \a matrix.
- * \param end The end of this set in \a matrix.
- * \param matrix The feature matrix.
- * \param dim The number of features.
- * \param mean Array of size \a dim.
- * \param variance Array of size \a dim.
- * \param elem_2_compare Pointer to a single in. This will save the feature according to which shall be sorted (biggest variance).
- * \param tree_f The file in which the tree will be written.
- * \param node The current node.
- *
- * \return The number of the current node.
- */
-int
-cluster_recursive(int start,
- int end,
- Cluster_info* matrix,
- int dim,
- double *mean,
- double *variance,
- int *elem_2_compare,
- FILE* tree_f,
- int *node)
-{
-
- //stop recursion
- if (end-start < 1)
- {
- return matrix[start].seq_number;
- }
-
- //reset mean/variance
- int i;
- int num_seq = end - start +1;
- for (i = 0; i < dim; ++i)
- {
- mean[i] = 0;
- variance[i] = 0;
- }
-
- //calculate mean
- Cluster_info *start_p = &matrix[start]-1;
- Cluster_info *end_p = &matrix[end];
- double *tmp;
- while (start_p < end_p)
- {
- tmp = &(++start_p)->features[0];
- for (i = 0; i < dim; ++i)
- {
- mean[i] += *(tmp++);
- }
- }
- for (i = 0; i < dim; ++i)
- mean[i] /= num_seq;
-
-
- //calculate variance
- start_p = &matrix[start] -1;
-
- while (start_p < end_p)
- {
- tmp = &(++start_p)->features[0];
- for (i = 0; i < dim; ++i)
- {
- variance[i] += ((*tmp) - mean[i]) * ((*tmp) - mean[i]);
- ++tmp;
- }
- }
-
- //get maximal variance
- double max = variance[0];
- int index = 0;
- for (i = 1; i < dim; ++i)
- {
- if (variance[i] > max)
- {
- max = variance[i];
- index = i;
- }
- }
- *elem_2_compare = index;
-
-
- //devide into to different sets and start recursion on these child sets
- qsort (&matrix[start], num_seq, sizeof(Cluster_info), cluster_compare);
- int split = start + (end-start)/2;
- int split_value = matrix[split].features[index];
- while ((matrix[split+1].features[index] == split_value) && (split < end-1))
- ++split;
-
- int left_child = cluster_recursive(start, split, matrix, dim, mean, variance, elem_2_compare, tree_f, node);
- int right_child = cluster_recursive(split+1, end, matrix, dim, mean, variance, elem_2_compare, tree_f, node);
- int node2 = *node;
- ++(*node);
- //print node into file
- fprintf(tree_f, "%i %i %i\n", left_child, right_child, node2);
-
- return node2;
-}
-
-/******************************COPYRIGHT NOTICE*******************************/
-/*© Centro de Regulacio Genomica */
-/*and */
-/*Cedric Notredame */
-/*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
-/*All rights reserved.*/
-/*This file is part of T-COFFEE.*/
-/**/
-/* T-COFFEE is free software; you can redistribute it and/or modify*/
-/* it under the terms of the GNU General Public License as published by*/
-/* the Free Software Foundation; either version 2 of the License, or*/
-/* (at your option) any later version.*/
-/**/
-/* T-COFFEE is distributed in the hope that it will be useful,*/
-/* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
-/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
-/* GNU General Public License for more details.*/
-/**/
-/* You should have received a copy of the GNU General Public License*/
-/* along with Foobar; if not, write to the Free Software*/
-/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
-/*............................................... |*/
-/* If you need some more information*/
-/* cedric.notredame@europe.com*/
-/*............................................... |*/
-/**/
-/**/
-/* */
-/******************************COPYRIGHT NOTICE*******************************/