JWS-112 Bumping version of T-Coffee to version 11.00.8cbe486.
[jabaws.git] / binaries / src / tcoffee / t_coffee_source / km_coffee / Vector.c
diff --git a/binaries/src/tcoffee/t_coffee_source/km_coffee/Vector.c b/binaries/src/tcoffee/t_coffee_source/km_coffee/Vector.c
new file mode 100644 (file)
index 0000000..dc18248
--- /dev/null
@@ -0,0 +1,895 @@
+/******************************COPYRIGHT NOTICE*******************************/
+/*  (c) Centro de Regulacio Genomica                                                        */
+/*  and                                                                                     */
+/*  Cedric Notredame                                                                        */
+/*  12 Aug 2014 - 22:07.                                                                    */
+/*All rights reserved.                                                                      */
+/*This file is part of T-COFFEE.                                                            */
+/*                                                                                          */
+/*    T-COFFEE is free software; you can redistribute it and/or modify                      */
+/*    it under the terms of the GNU General Public License as published by                  */
+/*    the Free Software Foundation; either version 2 of the License, or                     */
+/*    (at your option) any later version.                                                   */
+/*                                                                                          */
+/*    T-COFFEE is distributed in the hope that it will be useful,                           */
+/*    but WITHOUT ANY WARRANTY; without even the implied warranty of                        */
+/*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                         */
+/*    GNU General Public License for more details.                                          */
+/*                                                                                          */
+/*    You should have received a copy of the GNU General Public License                     */
+/*    along with Foobar; if not, write to the Free Software                                 */
+/*    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA             */
+/*...............................................                                           */
+/*  If you need some more information                                                       */
+/*  cedric.notredame@europe.com                                                             */
+/*...............................................                                           */
+/******************************COPYRIGHT NOTICE*******************************/
+// #include "Vector.h"
+#include<iostream>
+#include<cstring>
+#include<string>
+#include "km_coffee_header.h"
+
+
+
+
+//char *groups[]={"LlVvIiMmCc","AaGgSsTtPp","FfYyWw","EeDdNnQqKkRrHh"};
+//size_t n_groups = 4;
+// char *groups[]={"FfLlVvIiMmCcAaGgSsTtPpYyWw","DdEeNnQqKkRrHh"};
+// size_t n_groups = 2;
+
+// double hydrophobic[]={ 0.159, 0, 0.778, -1.289, -1.076, 1.437, -0.131, -0553, 1.388, 0, -1.504, 1.236, 1.048, -0.866, 0, -0.104, -0.836, -1.432, -0.549, -0.292, 0, 1.064, 1.064, 0, 0.476, 0 };
+//A-Ala, B-0, C-Cys, D-Asp, E-Glu, F-Phe, G-Gly, H-His, I-Ile, J-0, K-Lys, L-Leu, M-Met, N-Asn, O-0, P-Pro, Q-Gln, R-Arg, S-Ser, T-Thr, U-0, V-Val, W-Trp, X, Y-Tyr, Z
+
+
+void calc_H(const char *seq, double *hydrophobic, double *H)
+{
+       size_t ProtLength = strlen(seq), i;
+       double sumH=0;
+       int strangeCh=0, a;
+
+       for (i=0; i<ProtLength; ++i)
+       {
+               sumH += hydrophobic[ toupper(seq[i])-65 ];
+               if( hydrophobic[ toupper(seq[i])-65 ] == 0 ){ strangeCh++; }
+       }
+
+       (*H)=sumH/(ProtLength-strangeCh);
+}
+
+
+
+void calc_IEP(char *seq, double *pI )
+{
+       size_t ProtLength = strlen(seq);
+
+       char Asp = 'D';
+       char Glu = 'E';
+       char Cys = 'C';
+       char Tyr = 'Y';
+       char His = 'H';
+       char Lys = 'K';
+       char Arg = 'R';
+
+       int AspNumber = 0;
+       int GluNumber = 0;
+       int CysNumber = 0;
+       int TyrNumber = 0;
+       int HisNumber = 0;
+       int LysNumber = 0;
+       int ArgNumber = 0;
+
+       int i = 0;
+
+       for ( i=0; i<=ProtLength-1; ++i)              // looking for charged amino acids
+       {
+               if (toupper(seq[i]) == Asp)
+                       ++AspNumber;
+
+               if (toupper(seq[i]) == Glu)
+                       ++GluNumber;
+
+               if (toupper(seq[i]) == Cys)
+                       ++CysNumber;
+
+               if (toupper(seq[i]) == Tyr)
+                       ++TyrNumber;
+
+               if (toupper(seq[i]) == His)
+                       ++HisNumber;
+
+               if (toupper(seq[i]) == Lys)
+                       ++LysNumber;
+
+               if (toupper(seq[i]) == Arg)
+                       ++ArgNumber;
+       }
+
+       double NQ = 0.0; //net charge in given pH
+
+       double QN1=0;  //C-terminal charge
+       double QN2=0;  //D charge
+       double QN3=0;  //E charge
+       double QN4=0;  //C charge
+       double QN5=0;  //Y charge
+       double QP1=0;  //H charge
+       double QP2=0;  //NH2 charge
+       double QP3=0;  //K charge
+       double QP4=0;  //R charge
+
+       double pH = 6.5;             //starting point pI = 6.5 - theoretically it should be 7, but
+       //average protein pI is 6.5 so we increase the probability of finding the solution
+       double pHprev = 0.0;
+       double pHnext = 14.0;        //0-14 is possible pH range
+       double E = 0.01;             //epsilon means precision [pI = pH ± E]
+       double temp = 0.0;
+
+       //%%%%%%%%%%%%%%%%%%%%%%%%%   CALCUTE IEP   %%%%%%%%%%%%%%%%%%%%%%%%
+
+       for(;;){                //the infinite loop --- pK values from Wikipedia
+
+               QN1=-1/(1+pow(10,(3.65-pH)));
+               QN2=-AspNumber/(1+pow(10,(3.9-pH)));
+               QN3=-GluNumber/(1+pow(10,(4.07-pH)));
+               QN4=-CysNumber/(1+pow(10,(8.18-pH)));
+               QN5=-TyrNumber/(1+pow(10,(10.46-pH)));
+               QP1=HisNumber/(1+pow(10,(pH-6.04)));
+               QP2=1/(1+pow(10,(pH-8.2)));
+               QP3=LysNumber/(1+pow(10,(pH-10.54)));
+               QP4=ArgNumber/(1+pow(10,(pH-12.48)));
+
+               NQ=QN1+QN2+QN3+QN4+QN5+QP1+QP2+QP3+QP4;
+
+
+               if(pH>=14.0){
+                       printf("ERROR: Something is wrong - pH is higher than 14.\n");
+                       exit(1);
+               }
+
+               //%%%%%%%%%%%   BISECTION   %%%%%%%%%%%%%
+
+               if(NQ<0){              //out of range -- the new pH value must be smaller
+                       temp = pH;
+                       pH = pH-((pH-pHprev)/2);
+                       pHnext = temp;
+               }
+               else{                 //too small pH value, so we have to increase it --- swap
+                       temp = pH;
+                       pH = pH + ((pHnext-pH)/2);
+                       pHprev = temp;
+               }
+
+               if ((pH-pHprev<E)&&(pHnext-pH<E)) //finding isoelectric point
+                       break;
+       }
+
+       (*pI)=pH;       //printf("ProtLength: %d  pI: %lf\n",ProtLength, pH );
+}
+
+
+
+Vector * seq2vec_kmer(const Seq *seq, short k, unsigned int *factor, size_t vec_len, size_t vec_num, short *alphabet, int *used)
+{
+       Vector *t = (Vector*) my_malloc(sizeof(Vector));
+       t->data = (double*) my_calloc(vec_len, sizeof(double));
+       t->id = vec_num;
+       size_t i;
+       unsigned int value = 0;
+       for (i = 0; i<k; ++i)
+               value += factor[i] * alphabet[(int)seq->seq[i]];
+       ++t->data[used[value]];
+       size_t j=0;
+       unsigned char c;
+       size_t seq_len = seq->size;
+       t->seq_len = seq_len-k+1;
+       for (i = k; i<seq_len; ++i)
+       {
+               c = alphabet[(int)seq->seq[j]];
+               value -= factor[0]*c;
+               value *= factor[k-2];
+               value += alphabet[(int)seq->seq[i]];
+               ++j;
+               ++t->data[used[value]];
+       }
+
+       return t;
+}
+
+
+double l2norm(Vector *vec, size_t size)
+{
+       int i;
+       double norm=0;
+       double *data = vec->data;
+       for (i=0; i<size; ++i)
+               norm += data[i]*data[i];
+       return sqrt(norm);
+}
+
+void normalize(VectorSet *vec_set)
+{
+       size_t vec_len=vec_set->dim;
+       size_t n_vecs = vec_set->n_vecs;
+
+       //calculate mean
+       Vector *mean = (Vector*)my_malloc(sizeof(Vector));
+       mean->data = (double*) my_calloc(vec_len, sizeof(double));
+       double *md=mean->data;
+       double *vec;
+       size_t i,j;
+       for (i=0;i<n_vecs;++i)
+       {
+               vec=vec_set->vecs[i]->data;
+               for (j=0;j<vec_len;++j)
+                       md[j]+=vec[j];
+       }
+       for (j=0;j<vec_len;++j)
+               md[j]/=n_vecs;
+
+       //calculate standard deviation
+       Vector *sd = (Vector*)my_malloc(sizeof(Vector));
+       sd->data = (double*)my_calloc(vec_len, sizeof(double));
+       double *sdd = sd->data;
+       for (i=0;i<n_vecs;++i)
+       {
+               vec=vec_set->vecs[i]->data;
+               for (j=0;j<vec_len;++j)
+                       sdd[j]+=(vec[j]-md[j])*(vec[j]-md[j]);
+       }
+       for (j=0;j<vec_len;++j)
+               sdd[j] = sqrt(sdd[j]/n_vecs);
+
+       for (i=0;i<n_vecs;++i)
+       {
+               vec=vec_set->vecs[i]->data;
+               for (j=0;j<vec_len;++j)
+                       vec[j]= (vec[j]-md[j])/sdd[j];
+       }
+}
+
+
+
+Vector * seq2vec_dists(const Seq *seq, char * groups[], size_t n_groups, size_t vec_num)
+{
+       size_t s_len = seq->size;
+       char *tmp_seq=seq->seq;
+       size_t i,j;
+       Vector *t = (Vector*)my_malloc(sizeof(Vector));
+       t->id = vec_num;
+       t->data =(double*) my_calloc(n_groups*2, sizeof(double));
+       double *data = t->data;
+       int *last_pos =(int*) my_malloc(n_groups*sizeof(int));
+       for (i=0; i<n_groups; ++i)
+               last_pos[i]=-1;
+       for (i=0; i<s_len; ++i)
+       {
+               for (j=0; j<n_groups; ++j)
+               {
+                       if (strchr(groups[j],tmp_seq[i]))
+                       {
+                               ++data[j*2];
+                               if (last_pos[j] >-1)
+                                       data[j*2+1]+=(i-last_pos[j]);
+                               last_pos[j]=i;
+                               break;
+                       }
+               }
+       }
+
+       for (j=0; j<n_groups; ++j)
+               if (data[j*2] > 0)
+                       data[j*2+1]/=data[j*2];
+
+       free(last_pos);
+       return t;
+}
+
+
+VectorSet* seqset2vecs_dist(SeqSet *seq_set, char *groups[], size_t n_groups)
+{
+       VectorSet *vec_set =(VectorSet*) my_malloc(sizeof(VectorSet));
+       vec_set->dim = seq_set->seqs[0]->size;
+       size_t n_seqs = seq_set->n_seqs;
+
+       Vector **vecs= (Vector**)malloc(n_seqs*sizeof(Vector*));
+       vec_set->n_vecs=n_seqs;
+       vec_set->vecs=vecs;
+
+       size_t i;
+       for (i = 0; i<n_seqs; ++i)
+               vecs[i] = seq2vec_dists(seq_set->seqs[i], groups, n_groups, i);
+       vec_set->dim=n_groups*2;
+//     print_vecs(vec_set, "strange");
+       return vec_set;
+}
+
+
+
+VectorSet* seqset2vecs_whatever(SeqSet *seq_set, char *groups[], size_t n_groups)
+{
+
+       double hydrophobic[]={ 0.159, 0, 0.778, -1.289, -1.076, 1.437, -0.131, -0553, 1.388, 0, -1.504, 1.236, 1.048, -0.866, 0, -0.104, -0.836, -1.432, -0.549, -0.292, 0, 1.064, 1.064, 0, 0.476, 0 };
+
+       VectorSet *vec_set = (VectorSet*)my_malloc(sizeof(VectorSet));
+       //vec_set->dim = seq_set->seqs[0]->size;
+       size_t n_seqs = seq_set->n_seqs;
+
+       Vector **vecs= (Vector**)malloc(n_seqs*sizeof(Vector*));
+       vec_set->n_vecs=n_seqs;
+       vec_set->vecs=vecs;
+
+       size_t i;
+       vec_set->dim=n_groups*2+1;
+
+       for (i = 0; i<n_seqs; ++i)
+       {
+               vecs[i] = seq2vec_dists(seq_set->seqs[i], groups, n_groups, i);
+               vecs[i] = (Vector*)realloc(vecs[i], vec_set->dim*sizeof(double));
+//             calc_H(seq_set->seqs[i]->seq, hydrophobic, &(vecs[i]->data[vec_set->dim-2]));
+               calc_IEP(seq_set->seqs[i]->seq,  &(vecs[i]->data[vec_set->dim-1]));
+       }
+//     print_vecs(vec_set, "strange");
+
+//     void calc_H(const char *seq, double *hydrophobic, double *H)
+
+       return vec_set;
+}
+
+//A-Ala, B-0, C-Cys, D-Asp, E-Glu, F-Phe, G-Gly, H-His, I-Ile, J-0, K-Lys, L-Leu, M-Met, N-Asn, O-0, P-Pro, Q-Gln, R-Arg, S-Ser, T-Thr, U-0, V-Val, W-Trp, X, Y-Tyr, Z
+
+
+
+
+
+
+
+int* identify_fields(const SeqSet *seq_set, short k, unsigned int *factor, size_t *vec_len, short *alphabet )
+{
+
+       size_t vec_length = *vec_len;
+       int *used = (int*) my_calloc(vec_length, sizeof(int));
+       int *value_arg = (int*)my_calloc(vec_length, sizeof(int));
+       int *value_test = (int*)my_malloc(vec_length * sizeof(int));
+
+       unsigned int value;
+       Seq *seq;
+       size_t seq_len, j, m;
+       size_t vec_num = seq_set->n_seqs;
+
+       short l;
+/*
+       seq = seq_set->seqs[0];
+       value = 0;
+       for (l = 0; l<k; ++l)
+       {
+               printf("%c\n", (int)seq->seq[l]);
+               value += factor[l] * alphabet[(int)seq->seq[l]];
+       }
+
+       ++value_arg[value];
+
+
+       j=0;
+       seq_len = seq->size;
+       for (m = k; m<seq_len; ++m)
+       {
+               value -= factor[0]*alphabet[(int)seq->seq[j]];
+               value *= factor[k-2];
+               value += alphabet[(int)seq->seq[m]];
+               ++j;
+               ++value_arg[value];
+       }*/
+
+
+       size_t i;
+       size_t x;
+       for (i = 0; i<vec_num; ++i)
+       {
+               for (x = 0; x < vec_length; ++x)
+               {
+                       value_test[x]=0;
+               }
+               seq = seq_set->seqs[i];
+               value = 0;
+               for (l = 0; l<k; ++l)
+                       value += factor[l] * alphabet[(int)seq->seq[l]];
+               value_test[value]=1;
+
+
+               j=0;
+               seq_len = seq->size;
+               for (m = k; m<seq_len; ++m)
+               {
+                       value -= factor[0]*alphabet[(int)seq->seq[j]];
+                       value *= factor[k-2];
+                       value += alphabet[(int)seq->seq[m]];
+                       ++j;
+                       ++value_test[value];
+                       //if (value >124)
+                       //printf("%li %c%c%c %i %i %i\n", value,(int)seq->seq[j],(int)seq->seq[j-1],(int)seq->seq[j-2], alphabet[(int)seq->seq[j]], alphabet[(int)seq->seq[j-1]], alphabet[(int)seq->seq[j-2]]);
+               }
+
+
+               for (x = 0; x < vec_length; ++x)
+               {
+                       if (value_test[x] != value_arg[x])
+                       {
+                               used[x]=1;
+                       }
+               }
+
+       }
+       free(value_test);
+       free(value_arg);
+
+       j=0;
+       for (i=0; i<vec_length; ++i)
+       {
+               if (used[i] != 0)
+                       used[i] = j++;
+       }
+       *vec_len=j;
+       //printf("DIM=%li\n", *vec_len);
+       return used;
+}
+
+int
+my_variance_sort (const void *a, const void *b)
+{
+       double i = *(double*)a;
+       double j = *(double*)b;
+       if (i<j)
+               return 1;
+       if (i>j)
+               return -1;
+       else
+               return 0;
+}
+
+typedef struct
+{
+       size_t id;
+       double dist;
+} dist_pair;
+
+
+int dist_pair_compare (const void * a, const void * b)
+{
+       return ( ((dist_pair*)b)->dist - ((dist_pair*)a)->dist );
+}
+
+int extract_compare (const void * a, const void * b)
+{
+       return ( *(int*)a - *(int*)b );
+}
+
+
+SeqSet *
+find_distant(SeqSet *seq_set, VectorSet *set)
+{
+       size_t n_vecs = set->n_vecs;
+       size_t dim = set->dim;
+       size_t i,j;
+
+       dist_pair *avg_dist = (dist_pair*)malloc(n_vecs*sizeof(dist_pair));
+       dist_pair *nearest_dist = (dist_pair*)malloc(n_vecs*sizeof(dist_pair));
+       for (i =0; i<n_vecs; ++i)
+       {
+               avg_dist[i].id=set->vecs[i]->id;
+               avg_dist[i].dist=0;
+               nearest_dist[i].id=set->vecs[i]->id;
+               nearest_dist[i].dist = INT_MAX;
+       }
+
+       double tmp_dist;
+       for (i=0; i<n_vecs; ++i)
+       {
+               for (j=i+1; j<n_vecs; ++j)
+               {
+                       tmp_dist= km_sq_dist(set->vecs[i], set->vecs[j], dim);
+                       avg_dist[i].dist += tmp_dist;
+                       avg_dist[j].dist += tmp_dist;
+                       if (tmp_dist < nearest_dist[i].dist)
+                               nearest_dist[i].dist = tmp_dist;
+                       if (tmp_dist < nearest_dist[j].dist)
+                               nearest_dist[j].dist = tmp_dist;
+               }
+       }
+
+
+
+       qsort (nearest_dist, n_vecs, sizeof(dist_pair), dist_pair_compare);
+       qsort (avg_dist, n_vecs, sizeof(dist_pair), dist_pair_compare);
+       //printf("Most dist to nearest:\n");
+
+       int to_extract = 10;
+       int *extract_ids = (int*)malloc(to_extract*sizeof(int));
+       for (i=0; i <to_extract; ++i)
+       {
+               extract_ids[i] = nearest_dist[i].id;
+               //printf("%s %f\n", seq_set->seqs[nearest_dist[i].id]->name, nearest_dist[i].dist);
+       }
+       qsort(extract_ids, to_extract, sizeof(int), extract_compare);
+
+       //SeqSet *extractedSet= malloc(sizeof(SeqSet));
+       //extractedSet->reserved=to_extract;
+       //extractedSet->n_seqs = to_extract;
+       //extractedSet->seqs = malloc(to_extract*sizeof(Sequence*));
+       int pos=0;
+       j=0;
+       for (i=0; i<n_vecs; ++i)
+       {
+               if (extract_ids[pos] == i)
+                       ++pos;
+               else
+                       set->vecs[j++] = set->vecs[i];
+       }
+       set->n_vecs -=10;
+
+
+//     printf("\nMost avg dist:\n");
+//     for (i=0; i <10; ++i)
+//     {
+//             printf("%s %f\n", seq_set->seqs[avg_dist[i].id]->name, avg_dist[i].dist);
+//     }
+
+       free(avg_dist);
+       free(nearest_dist);
+       return NULL;//extractedSet;
+}
+
+
+int*
+identify_fields_variance(const SeqSet *seq_set, short k, unsigned int *factor, size_t *vec_len, short *alphabet)
+{
+       size_t vec_length = *vec_len;
+       int *used = (int*)my_calloc(vec_length, sizeof(int));
+       double *mean =(double*) my_calloc(vec_length, sizeof(double));
+       double *variance = (double*)my_calloc(vec_length, sizeof(double));
+       double *value_test = (double*)my_malloc(vec_length * sizeof(double));
+
+       size_t i;
+
+       unsigned int value;
+       Seq *seq;
+       size_t seq_len, j, m;
+       size_t n_vecs = seq_set->n_seqs;
+
+//     calc mean
+       short l;
+       for (i = 1; i<n_vecs; ++i)
+       {
+               seq = seq_set->seqs[i];
+               value = 0;
+               for (l = 0; l<k; ++l)
+                       value += factor[l] * alphabet[(int)seq->seq[l]];
+               ++mean[value];
+
+               j=0;
+               seq_len = seq->size;
+               for (m = k; m<seq_len; ++m)
+               {
+                       value -= factor[0]*alphabet[(int)seq->seq[j]];
+                       value *= factor[k-2];
+                       value += alphabet[(int)seq->seq[m]];
+                       ++j;
+                       ++mean[value];
+               }
+       }
+
+       for (i = 0; i<vec_length; ++i)
+               mean[i] /= n_vecs;
+
+
+       //calc variance
+       size_t x;
+       for (i = 1; i<n_vecs; ++i)
+       {
+               for (x = 0; x < vec_length; ++x)
+                       value_test[x]=0;
+
+               seq = seq_set->seqs[i];
+               value = 0;
+               for (l = 0; l<k; ++l)
+                       value += factor[l] * alphabet[(int)seq->seq[l]];
+
+
+               j=0;
+               seq_len = seq->size;
+               for (m = k; m<seq_len; ++m)
+               {
+                       value -= factor[0]*alphabet[(int)seq->seq[j]];
+                       value *= factor[k-2];
+                       value += alphabet[(int)seq->seq[m]];
+                       ++j;
+                       ++value_test[value];
+               }
+
+               for (j = 0; j<vec_length; ++j)
+                       variance[j] += (value_test[j]-mean[j])*(value_test[j]-mean[j]);
+       }
+       free(value_test);
+       free(mean);
+
+       for (i = 0; i<vec_length; ++i)
+               variance[i] /= n_vecs;
+
+       qsort (variance, vec_length, sizeof(double), my_variance_sort );
+//     for (i = 0; i<vec_length; ++i)
+//             printf("%f ", variance[i]);
+//     printf("\n");
+       double threshold=variance[199];
+
+       j=0;
+       for (i=0; i<vec_length; ++i)
+       {
+               if (variance[i] >= threshold)
+                       used[i] = j++;
+       }
+       free(variance);
+       *vec_len=j;
+//     printf("DIM=%li\n", *vec_len);
+       return used;
+}
+
+
+VectorSet*
+seqset2vecs_kmer(SeqSet *seq_set, short k, short alphabet_size, short *alphabet)
+{
+       VectorSet *vec_set = (VectorSet*)my_malloc(sizeof(VectorSet));
+       vec_set->dim = seq_set->seqs[0]->size;
+       size_t n_seqs = seq_set->n_seqs;
+       unsigned int *factor = (unsigned int*)my_malloc(k*sizeof(unsigned int));
+       factor[k-1] = 1;
+
+       short j;
+       for (j=k-2; j>=0; --j)
+               factor[j] = factor[j+1] *alphabet_size;
+       size_t vec_len = factor[0] *alphabet_size;
+
+       Vector **vecs= (Vector**)malloc(n_seqs*sizeof(Vector*));
+       vec_set->n_vecs=n_seqs;
+       vec_set->vecs=vecs;
+       int *used = identify_fields(seq_set, k, factor, &vec_len, alphabet);
+       size_t i;
+       for (i = 0; i<n_seqs; ++i)
+               vecs[i] = seq2vec_kmer(seq_set->seqs[i], k, factor, vec_len, i, alphabet, used);
+       vec_set->dim=vec_len;
+//     size_t l;
+//     double max_val=0;
+//     double tmp_val;
+//     double all=0;
+//     for(i=0; i<n_seqs; ++i)
+//     {
+//             max_val=0;
+//             for (l=0; l<n_seqs; ++l)
+//             {
+//                     if (i!=l)
+//                     {
+//                     tmp_val=km_sq_dist(vecs[i], vecs[l], vec_len);
+//                     if (tmp_val>max_val)
+//                             max_val=tmp_val;
+//                     }
+//             }
+//             all+=max_val;
+//             printf("max_val:%f\n", max_val);
+//     }
+
+//     printf("ALL: %f\n", all/n_seqs);
+//             km_
+//     exit(1);
+
+/*
+       //TEST see if taking distances to x vectors make a better vector and calculate distances to them is better
+       double *distances = my_calloc(n_seqs, sizeof(double));
+       double tmp_score;
+
+
+//     calc nearest point to everybody.
+       for (i = 0; i<n_seqs; ++i)
+       {
+               for (j = i+1; j<n_seqs; ++j)
+               {
+                       tmp_score=km_muscle_dist(vecs[i], vecs[j], vec_len, k);
+                       distances[i] += tmp_score;
+                       distances[j] += tmp_score;
+               }
+       }
+
+       unsigned int id =-1;
+       tmp_score=DBL_MAX;
+       for (i=0; i<n_seqs; ++i)
+       {
+               if (distances[i] < tmp_score)
+               {
+                       tmp_score = distances[i];
+                       id=i;
+               }
+       }
+
+
+//     Take the next farthest points to any one existing in the set
+       int n_dist_id=10;
+       Vector **dist_points = my_malloc(n_dist_id*sizeof(Vector));
+       dist_points[0]=new_vec(vecs[id], vec_len);
+       for (i = 0; i<n_seqs; ++i)
+               distances[i] = DBL_MAX;
+
+       int l;
+       double max_score=0;
+       int max_id=-1;
+       for (k=1; k<n_dist_id; ++k)
+       {
+               max_score=0;
+               for (i=0; i< n_seqs; ++i)
+               {
+                       tmp_score = km_muscle_dist(vecs[i], dist_points[k-1], vec_len, k);
+
+                       if (tmp_score <distances[i])
+                               distances[i] = tmp_score;
+                       if (distances[i] > max_score)
+                       {
+                               max_score = distances[i];
+                               max_id = i;
+                       }
+               }
+//             printf("tmp_score: %i\n", max_id);
+               dist_points[k]=new_vec(vecs[max_id], vec_len);
+       }
+
+       Vector *old_data;
+       for (i=0; i< n_seqs; ++i)
+       {
+               old_data=vecs[i];
+               vecs[i]=new_vec_nodata(old_data, n_dist_id);
+               for(j=0; j<n_dist_id; ++j)
+                       vecs[i]->data[j] = km_muscle_dist(dist_points[j], old_data, vec_len, k);
+               free(old_data->data);
+               free(old_data);
+       }
+
+
+       vec_len=n_dist_id;
+
+       //TEST test_end;
+
+//*/
+
+//     find_distant(seq_set, vec_set);
+//     exit(1);
+
+       free(used);
+       free(factor);
+       return vec_set;
+}
+
+
+Vector *
+new_vec(Vector *vec, int vec_len)
+{
+       Vector *new_vec=(Vector*)my_malloc(sizeof(Vector));
+       new_vec->data=(double*)my_malloc(vec_len*sizeof(double));
+       memcpy(new_vec->data, vec->data, vec_len*sizeof(double));
+       new_vec->id = vec->id;
+       new_vec->seq_len = vec->seq_len;
+       new_vec->assignment = vec->assignment;
+       return new_vec;
+}
+
+Vector *
+new_vec_nodata(Vector *vec, int vec_len)
+{
+       Vector *new_vec=(Vector*)my_malloc(sizeof(Vector));
+       new_vec->data=(double*)my_malloc(vec_len*sizeof(double));
+       new_vec->id = vec->id;
+       new_vec->seq_len = vec->seq_len;
+       new_vec->assignment = vec->assignment;
+       return new_vec;
+}
+
+void
+delVecSet(VectorSet* set)
+{
+       size_t n_vecs = set->n_vecs;
+       int i;
+       Vector** vecs = set->vecs;
+       for (i=0; i < n_vecs; ++i)
+       {
+               free(vecs[i]->data);
+               free(vecs[i]);
+       }
+       free(vecs);
+       free(set);
+}
+
+
+double
+km_sq_dist(const Vector *vec1, const Vector *vec2, size_t dim)
+{
+       double dist = 0;
+       double tmp;
+       size_t i;
+       double *d1=vec1->data;
+       double *d2=vec2->data;
+       for (i=0; i<dim; ++i)
+       {
+               tmp = d1[i]-d2[i];
+               dist += (tmp * tmp);
+       }
+       return dist;
+}
+
+
+double
+km_angle_dist(const Vector *vec1, const Vector *vec2, size_t dim)
+{
+       double dist = 0, len1=0, len2=0;
+       size_t i;
+       double *d1=vec1->data;
+       double *d2=vec2->data;
+       for (i=0; i<dim; ++i)
+       {
+               dist += d1[i]*d2[i];
+               len1 += d1[i]*d1[i];
+               len2 += d2[i]*d2[i];
+       }
+       double val =dist/(sqrt(len1)*sqrt(len2));
+       if (val > 1.0)
+               val = 1.0;
+       return acos(val);
+}
+
+
+double
+km_muscle_dist(const Vector *vec1, const Vector *vec2, size_t dim, int k)
+{
+       double dist = 0;
+       size_t i;
+       double *d1=vec1->data;
+       double *d2=vec2->data;
+       for (i=0; i<dim; ++i)
+               dist += (d1[i] < d2[i])?d1[i]:d2[i];
+       int min_len= (vec1->seq_len<vec2->seq_len)?vec1->seq_len:vec2->seq_len;
+       min_len = min_len-k+1;
+       return 1-dist/min_len;
+}
+
+
+void
+print_vecs(VectorSet *set, char *out_f)
+{
+       size_t n_vecs = set->n_vecs;
+       size_t dim = set->dim;
+       size_t i,j;
+       FILE *out_F = fopen(out_f, "w");
+       double *data;
+       for (i=0; i<n_vecs; ++i)
+       {
+               data=set->vecs[i]->data;
+               fprintf(out_F, "%i ", i);
+               for (j=0; j<dim; ++j)
+                       fprintf(out_F, "%f ", data[j]);
+               fprintf(out_F, "\n");
+       }
+       fclose(out_F);
+}
+
+void
+read_vecs(VectorSet *set, char *in_f)
+{
+       FILE *in_F = fopen(in_f, "r");
+       const int LINE_LENGTH=1001;
+       char line[LINE_LENGTH];
+       size_t i=0, j=0;
+       double *data;
+       char *value;
+       while (fgets(line, LINE_LENGTH, in_F)!=NULL)
+       {
+               data = set->vecs[i++]->data;
+               data[0] = atof(strtok(line, " \t\n"));
+               while ((value=strtok(NULL, " \t\n"))!=NULL)
+                       data[++j] = atof(value);
+       }
+       set->dim=j+1;
+}
+