Delete unneeded directory
[jabaws.git] / website / archive / binaries / mac / src / tcoffee / t_coffee_source / tree.c
diff --git a/website/archive/binaries/mac/src/tcoffee/t_coffee_source/tree.c b/website/archive/binaries/mac/src/tcoffee/t_coffee_source/tree.c
deleted file mode 100644 (file)
index 0928bb2..0000000
+++ /dev/null
@@ -1,459 +0,0 @@
-#include "stdio.h"
-#include "stdlib.h"
-#include "math.h"
-#include "ctype.h"
-
-
-#include "io_lib_header.h"
-#include "util_lib_header.h"
-// #include "define_header.h"
-// #include "dp_lib_header.h"
-// #include "fastal_lib_header.h"
-
-
-
-#include "fast_tree_header.h"
-
-/*!
- *     \file tree.c
- *     \brief Source code for the fastal tree algorithm
- */
-
-/*
-
-
-int
-main(int argc, char** argv)
-{
-       int alphabet_size = 5;
-       int aa[256];
-       if (alphabet_size == 5)
-       {
-               aa['A'] = 0;
-               aa['B'] = 1;
-               aa['C'] = 1;
-               aa['D'] = 0;
-               aa['G'] = 2;
-               aa['H'] = 0;
-               aa['K'] = 3;
-               aa['M'] = 1;
-               aa['N'] = 0;
-               aa['R'] = 0;
-               aa['S'] = 1;
-               aa['T'] = 3;
-               aa['U'] = 4;
-               aa['W'] = 3;
-               aa['Y'] = 1;
-       }
-       else
-       {
-               aa['A'] = 0;
-               aa['B'] = 1;
-               aa['C'] = 2;
-               aa['D'] = 3;
-               aa['E'] = 4;
-               aa['F'] = 5;
-               aa['G'] = 6;
-               aa['H'] = 7;
-               aa['I'] = 8;
-               aa['J'] = 9;
-               aa['K'] = 10;
-               aa['L'] = 11;
-               aa['M'] = 12;
-               aa['N'] = 13;
-               aa['P'] = 14;
-               aa['Q'] = 15;
-               aa['R'] = 16;
-               aa['S'] = 17;
-               aa['T'] = 18;
-               aa['V'] = 19;
-               aa['W'] = 20;
-               aa['X'] = 21;
-               aa['Y'] = 22;
-               aa['Z'] = 23;
-       }
-       compute_oligomer_distance_tree(argv[1], &aa[0], argv[2], 5, 2, alphabet_size);
-       return 0;
-}*/
-
-
-void
-compute_oligomer_distance_tree(char *seq_file_name, int* char2value, char *tree_file_name, int max_dist, int word_length, int alphabet_size)
-{
-       int i;
-       int *compare = vcalloc(1,sizeof(int));
-       int *num_seq = vcalloc(1,sizeof(int));
-
-       int num_features = (int)pow(alphabet_size,word_length);
-       
-       num_features *= num_features * (max_dist+1);
-
-
-       Cluster_info *matrix  = feature_extract(seq_file_name, max_dist, word_length, char2value, alphabet_size, compare, num_seq, num_features);
-       
-       int *node = vcalloc(1, sizeof(int));
-       *node = *num_seq;
-       
-       FILE *tree_f = fopen(tree_file_name,"w");
-       double *mean = vcalloc(num_features, sizeof(double));
-       double *variance = vcalloc(num_features, sizeof(double));
-       cluster_recursive(0, *num_seq-1, matrix, num_features, mean, variance, compare, tree_f, node);
-       
-       for (i = 0; i < *num_seq; ++i)
-       {
-//             printf("ERG: %i\n", i);
-               vfree(matrix[i].features);
-       }
-       vfree(matrix);
-       fclose (tree_f);
-       vfree(mean);
-       vfree(variance);
-       vfree(compare);
-       vfree(node);
-       vfree(num_seq);
-}
-
-
-
-/**
- * Recodes a given sequence into a sequence of it's k-mers.
- *
- * \param seq The sequence to encode.
- * \param seq_length The length of \a seq.
- * \param char2value Array giving each character in seq a unique value.
- * \param alphabet_size The size of the alphabet.
- * \param word_length The length of the k-mers to use.
- * 
- * \return The recodes sequence.
- */
-int*
-recode_sequence(char * seq, int seq_length, int *char2value, int alphabet_size, int word_length)
-{
-
-       int *recoded = vcalloc(seq_length - word_length + 1, sizeof(int));
-       int i;
-       
-       if (word_length == 1)
-       {
-               for (i = 0; i < seq_length; ++i)
-               {
-                       recoded[i] = char2value[(short)seq[i]];
-               }
-       }
-       else
-       {
-               int *prod=vcalloc (word_length, sizeof(int));
-               for ( i=0; i<word_length; ++i)
-               {
-                       prod[word_length-i-1]=(int)pow(alphabet_size,i);
-               }
-
-               int index = 0;
-               for (i = 0; i < word_length; ++i)
-               {
-                       index += char2value[(short)seq[i]] * prod[i];
-               }
-               recoded[0] = index;
-               int z = 0;
-               for (; i < seq_length; ++i)
-               {
-                       index -= char2value[(short)seq[z]] * prod[0];
-                       index *= alphabet_size;
-                       index += char2value[(short)seq[i]];
-                       recoded[++z] = index;
-               }
-       }
-
-       return recoded;
-}
-
-
-/**
- * Extracts the features from a given sequence.
- *
- * \param recoded_seq The recoded sequence, consisting of k-mer codes.
- * \param seq_length The length of \a recoded_seq.
- * \param k_tup Length of the k-mers.
- * \param max_coding The maximal number used for coding the alphabet.
- * \param max_dist The maximal distance to use between to k-mers.
- * 
- * \return The feature vector of the sequence.
- */
-double *
-get_features(int *recoded_seq, int seq_length, int k_tup, int max_coding, int max_dist, int num_features)
-{
-
-       
-       int vector_length = num_features;//max_coding * max_coding*(max_dist+1);
-       
-       double *feature_vector = vcalloc(vector_length, sizeof(double));
-       int i;
-       for (i = 0; i < vector_length; ++i)
-       {
-               feature_vector[i] = 0;
-       }
-       int j, max_length, indJ, indK, indDist;
-       for (i = 0; i < seq_length; ++i)
-       {
-               if (seq_length <= i + max_dist)
-                       max_length = seq_length-1;
-               else
-                       max_length = i + max_dist;
-               for (j = i; j <= max_length; ++j)
-               {
-                       
-                       indJ = max_coding * (max_dist+1) * recoded_seq[i];
-                       indK = (max_dist+1) * recoded_seq[j];
-                       indDist = j - i;
-                       ++feature_vector[indJ + indK + indDist];
-               }
-       }
-
-       double normalize = 0;
-       for (i = 0; i < vector_length; ++i)
-       {
-               normalize += (feature_vector[i] * feature_vector[i]);
-       }
-       normalize = sqrt(normalize);
-       for (i = 0; i < vector_length; ++i)
-       {
-               feature_vector[i] /= normalize; 
-       }
-
-       return feature_vector;
-}
-
-
-/**
- * Calculates the feature values for all sequences in a file.
- *
- * The File has to have fasta format.
- * \param seq_file_name The name of the file (in fasta format).
- * \param max_dist The maximal distance to be considered.
- * \param k_tup Size of the k_tup.
- * \param alphabet_size The size of the alphabet.
- * 
- * \return The feature values for every sequence in a Cluster_info object.
- */
-Cluster_info *
-feature_extract(char *seq_file_name, int max_dist, int k_tup, int *char2value, int alphabet_size, int * elem_2_compare, int *num_seq_p, int num_features)
-{
-       char line[500];
-       FILE *tree_f = fopen(seq_file_name,"r");
-//     fgets(line, 500, tree_f);
-       int num_seq = -1;
-       int size = 1000;
-       int matrix_size = 1000;
-       const int STEP = 1000;
-       int seq_pos = -1;
-       char *seq = vcalloc(size, sizeof(char));
-       Cluster_info *matrix = vcalloc(matrix_size, sizeof(Cluster_info));
-       char *c_p;
-       int max_coding = pow(alphabet_size,k_tup);
-       while (fgets(line, 500, tree_f) != NULL)
-       {
-               if (line[0] == '>')
-               {
-                       if (num_seq >= 0)
-                       {
-                               seq[++seq_pos] = '\0';
-                               if (num_seq > matrix_size -2)
-                               {
-                                       matrix_size += STEP;
-                                       matrix = vrealloc(matrix, matrix_size * sizeof(Cluster_info));
-                               }
-                               int *recoded_seq = recode_sequence(seq, seq_pos, char2value, alphabet_size, k_tup);
-                               matrix[num_seq].seq_number = num_seq;
-                               matrix[num_seq].elem_2_compare = elem_2_compare;
-                               matrix[num_seq].features=get_features(recoded_seq, seq_pos- k_tup +1, k_tup, max_coding, max_dist, num_features);
-                               vfree(recoded_seq);
-                               seq_pos = -1;
-                       }
-                       ++num_seq;
-               }
-               else
-               {
-                       if (size - seq_pos < 500)
-                       {
-                               size += STEP;
-                               seq = vrealloc(seq, size*sizeof(char));
-                       }
-                       c_p = &line[0];
-                       while ((*c_p != '\n') && (*c_p != '\0'))
-                       {
-                               seq[++seq_pos] = toupper(*(c_p));
-                               ++c_p;
-                       }
-               }
-       }
-       
-       fclose(tree_f);
-       seq[++seq_pos] = '\0';
-       int *recoded_seq = recode_sequence(seq, seq_pos, char2value, alphabet_size, k_tup);
-       matrix[num_seq].seq_number = num_seq;
-       matrix[num_seq].elem_2_compare = elem_2_compare;
-       matrix[num_seq].features=get_features(recoded_seq, seq_pos- k_tup +1, k_tup, max_coding, max_dist, num_features);
-       *num_seq_p = ++num_seq;
-       vfree(seq);
-       vfree(recoded_seq);
-       return matrix;
-}
-
-
-
-/**
- * Compare function for qsort given an array of Cluster_info.
- *
- * The field used for sorting is done according to the element determined in the Cluster_info object.
- * \param a void pointer to an object of Cluster_info.
- * \param b void pointer to an object of Cluster_info.
- *
- * \return Value showing wheather \a a is bigger, smaller or equal to \a b.
- */
-int 
-cluster_compare (const void * a, const void * b)
-{
-       Cluster_info *tmp1 = (Cluster_info*)a;
-       Cluster_info *tmp2 = (Cluster_info*)b;
-       int elem = *(tmp1->elem_2_compare);
-       if ((tmp1->features[elem]) > (tmp2->features[elem]))
-               return 1;
-       else if ((tmp1->features[elem]) < (tmp2->features[elem]))
-               return -1;
-       else
-               return 0;
-}
-
-
-
-/**
- * Recursive function to build a tree according to a given feature matrix.
- *
- * \param start The begin of this set in \a matrix.
- * \param end The end of this set in \a matrix.
- * \param matrix The feature matrix.
- * \param dim The number of features.
- * \param mean Array of size \a dim.
- * \param variance Array of size \a dim.
- * \param elem_2_compare Pointer to a single in. This will save the feature according to which shall be sorted (biggest variance).
- * \param tree_f The file in which the tree will be written.
- * \param node The current node.
- * 
- * \return The number of the current node.
- */
-int
-cluster_recursive(int start,
-                                 int end,
-                                 Cluster_info* matrix,
-                                 int dim,
-                                 double *mean,
-                                 double *variance,
-                                 int *elem_2_compare,
-                                 FILE* tree_f,
-                                 int *node)
-{
-
-       //stop recursion
-       if (end-start < 1)
-       {
-               return matrix[start].seq_number;
-       }
-
-       //reset mean/variance
-       int i;
-       int num_seq = end - start +1;
-       for (i = 0; i < dim; ++i)
-       {
-               mean[i] = 0;
-               variance[i] = 0;
-       }
-
-       //calculate mean
-       Cluster_info *start_p = &matrix[start]-1;
-       Cluster_info   *end_p = &matrix[end];
-       double *tmp;
-       while (start_p < end_p)
-       {
-               tmp = &(++start_p)->features[0];
-               for (i = 0; i < dim; ++i)
-               {
-                       mean[i] += *(tmp++);
-               }
-       }
-       for (i = 0; i < dim; ++i)
-                       mean[i] /= num_seq;
-
-
-       //calculate variance
-       start_p = &matrix[start] -1;
-
-       while (start_p < end_p)
-       {
-               tmp = &(++start_p)->features[0];
-               for (i = 0; i < dim; ++i)
-               {
-                       variance[i] +=  ((*tmp) - mean[i]) * ((*tmp) - mean[i]);
-                       ++tmp;
-               }
-       }
-
-       //get maximal variance
-       double max  = variance[0];
-       int index  = 0;
-       for (i = 1; i < dim; ++i)
-       {
-               if (variance[i] > max)
-               {
-                       max = variance[i];
-                       index = i;
-               }
-       }
-       *elem_2_compare = index;
-
-
-       //devide into to different sets and start recursion on these child sets
-       qsort (&matrix[start], num_seq, sizeof(Cluster_info), cluster_compare);
-       int split = start + (end-start)/2;
-       int split_value = matrix[split].features[index];
-       while ((matrix[split+1].features[index] == split_value) && (split < end-1))
-               ++split;
-
-       int left_child = cluster_recursive(start, split, matrix, dim, mean, variance, elem_2_compare, tree_f, node);
-       int right_child = cluster_recursive(split+1, end, matrix, dim, mean, variance, elem_2_compare, tree_f, node);
-       int node2 = *node;
-       ++(*node);
-       //print node into file
-       fprintf(tree_f, "%i %i %i\n", left_child, right_child, node2);
-
-       return node2;
-}
-
-/******************************COPYRIGHT NOTICE*******************************/
-/*© Centro de Regulacio Genomica */
-/*and */
-/*Cedric Notredame */
-/*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
-/*All rights reserved.*/
-/*This file is part of T-COFFEE.*/
-/**/
-/*    T-COFFEE is free software; you can redistribute it and/or modify*/
-/*    it under the terms of the GNU General Public License as published by*/
-/*    the Free Software Foundation; either version 2 of the License, or*/
-/*    (at your option) any later version.*/
-/**/
-/*    T-COFFEE is distributed in the hope that it will be useful,*/
-/*    but WITHOUT ANY WARRANTY; without even the implied warranty of*/
-/*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the*/
-/*    GNU General Public License for more details.*/
-/**/
-/*    You should have received a copy of the GNU General Public License*/
-/*    along with Foobar; if not, write to the Free Software*/
-/*    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA*/
-/*...............................................                                                                                      |*/
-/*  If you need some more information*/
-/*  cedric.notredame@europe.com*/
-/*...............................................                                                                                                                                     |*/
-/**/
-/**/
-/*     */
-/******************************COPYRIGHT NOTICE*******************************/