8 #include "io_lib_header.h"
9 #include "util_lib_header.h"
10 #include "define_header.h"
11 #include "dp_lib_header.h"
14 int commonsextet( int *table, int *pointt );
15 void makecompositiontable( int *table, int *pointt );
16 int *code_seq (char *seq, char *type);
17 int * makepointtable( int *pointt, int *n, int ktup );
22 * calculates the number of common tuples
24 int commonsextet( int *table, int *pointt )
29 static int *memo = NULL;
30 static int *ct = NULL;
35 memo = vcalloc( tsize+1, sizeof( int ) );
36 ct = vcalloc( tsize+1, sizeof( int ) );
40 while( ( point = *pointt++ ) != END_ARRAY )
43 if( tmp < table[point] )
53 while( *cp != END_ARRAY )
60 * calculates how many of each tuple exist
62 void makecompositiontable( int *table, int *pointt )
66 while( ( point = *pointt++ ) != END_ARRAY )
72 int *code_seq (char *seq, char *type)
82 if ( strm (type, "DNA") || strm (type, "RNA"))
84 gl=declare_char (4,5);
85 sprintf ( gl[ng++], "Aa");
86 sprintf ( gl[ng++], "Gg");
87 sprintf ( gl[ng++], "TtUu");
88 sprintf ( gl[ng++], "Cc");
93 gl=make_group_aa ( &ng, "mafft");
95 aa=vcalloc ( 256, sizeof (int));
98 for ( b=0; b< strlen (gl[a]); b++)
111 if ( !code || read_array_size (code, sizeof (int))<(l+2))
114 code=vcalloc (l+2, sizeof (int));
120 code[a]=aa[(int)seq[a]];
128 int * makepointtable( int *pointt, int *n, int ktup )
138 prod=vcalloc ( ktup, sizeof (int));
139 for ( a=0; a<ktup; a++)
141 prod[ktup-a-1]=(int)pow(n[-1],a);
146 for (point=0,a=0; a<ktup; a++)
148 point+= *n++ *prod[a];
153 while( *n != END_ARRAY )
155 point -= *p++ * prod[0];
165 int ** ktup_dist_mat ( char **seq, int nseq, int ktup, char *type)
167 //Adapted from MAFFT 5: fast ktup
168 int **pointt,*code=NULL, **pscore;
170 double **mtx, score0;
173 if (!seq || nseq==0)return NULL;
174 for (minl=strlen(seq[0]),l=0,i=0;i<nseq; i++)
181 ktup=MIN(minl, ktup);
182 pointt=declare_int (nseq, l+1);
183 mtx=declare_double (nseq, nseq);
184 pscore=declare_int ( nseq, nseq);
186 for( i=0; i<nseq; i++ )
188 makepointtable( pointt[i], code=code_seq (seq[i], type),ktup);
190 tsize=(int)pow(code[-1], ktup);
192 for ( i=0; i<nseq; i++)
195 table1=vcalloc ( tsize,sizeof (int));
196 makecompositiontable( table1, pointt[i]);
197 for (j=i; j<nseq; j++)
199 mtx[i][j] = commonsextet( table1, pointt[j] );
203 for( i=0; i<nseq; i++ )
206 for( j=0; j<nseq; j++ )
207 pscore[i][j] = (int)( ( score0 - mtx[MIN(i,j)][MAX(i,j)] ) / score0 * 3 * 10.0 + 0.5 );
209 for( i=0; i<nseq-1; i++ )
210 for( j=i+1; j<nseq; j++ )
212 pscore[i][j] = pscore[j][i]=100-MIN( pscore[i][j], pscore[j][i] );
218 int ** evaluate_diagonals_with_ktup_1 ( Alignment *A, int *ns, int **l_s, Constraint_list *CL,int maximise,int n_groups, char **group_list, int ktup);
219 int ** evaluate_diagonals_with_ktup_2 ( Alignment *A, int *ns, int **l_s, Constraint_list *CL,int maximise,int n_groups, char **group_list, int ktup);
222 int ** evaluate_diagonals_for_two_sequences ( char *seq1, char *seq2,int maximise,Constraint_list *CL,int ktup)
227 static int *ns, **l_s;
237 CL=vcalloc ( 1, sizeof (Constraint_list));
239 sprintf ( CL->matrix_for_aa_group, "vasiliky");
240 CL->M=read_matrice ("blosum62mt");
241 CL->evaluate_residue_pair=evaluate_cdna_matrix_score;
242 CL->get_dp_cost=slow_get_dp_cost;
243 type=get_string_type(seq1);
245 if ( strm (type, "CDNA"))
246 CL->evaluate_residue_pair= evaluate_matrix_score;
247 else if ( strm(type, "PROTEIN"))
248 CL->evaluate_residue_pair=evaluate_matrix_score;
249 else if ( strm (type, "DNA") || strm (type, "RNA"))
250 CL->evaluate_residue_pair= evaluate_matrix_score;
263 gl=make_group_aa (&ng, CL->matrix_for_aa_group);
264 ns=vcalloc (2, sizeof (int));
266 l_s=declare_int (2, 2);
272 A=strings2aln (2, "A",seq1,"B", seq2);
278 diag=evaluate_diagonals ( A,ns, l_s, CL,maximise, ng, gl, ktup);
279 free_sequence (A->S, (A->S)->nseq);
283 free_int (CL->M, -1);
292 int ** evaluate_diagonals ( Alignment *A, int *ns, int **l_s, Constraint_list *CL,int maximise,int n_groups, char **group_list, int ktup)
298 if ( CL->residue_index)
300 tot_diag=evaluate_diagonals_with_clist ( A, ns, l_s, CL, maximise,n_groups,group_list, ktup);
302 else if ( CL->use_fragments)
305 tot_diag=evaluate_segments_with_ktup ( A, ns, l_s, CL, maximise,n_groups,group_list, ktup);
310 tot_diag=evaluate_diagonals_with_ktup ( A, ns, l_s, CL, maximise,n_groups,group_list, ktup);
315 int ** evaluate_segments_with_ktup ( Alignment *A, int *ns, int **l_s, Constraint_list *CL,int maximise,int n_groups, char **group_list, int ktup)
318 Reads in an alignmnet A, with two groups of sequences marked.
319 1-Turn each group into a conscensus, using the group list identifier.
320 -if the group list is left empty original symbols are used
321 2-hash groupc the two sequences
322 3-score each diagonal, sort the list and return it (diag_list)
325 char *seq1, *seq2, *alphabet=NULL;
326 int a,b,l1, l2, n_ktup,pos_ktup1, pos_ktup2, **pos;
327 int *hasched_seq1, *hasched_seq2,*lu_seq1,*lu_seq2;
328 int n_diag, **diag, current_diag, **dot_list, n_dots, cost;
329 int l,delta_diag, delta_res;
332 pos=aln2pos_simple ( A,-1, ns, l_s);
333 seq1=aln2cons_seq (A, ns[0], l_s[0], n_groups, group_list);
334 seq2=aln2cons_seq (A, ns[1], l_s[1], n_groups, group_list);
338 alphabet=get_alphabet (seq1,alphabet);
339 alphabet=get_alphabet (seq2,alphabet);
347 diag=declare_int ( n_diag+2, 3);
348 n_ktup=(int)pow ( (double)alphabet[0]+1, (double)ktup);
350 hasch_seq(seq1, &hasched_seq1, &lu_seq1,ktup, alphabet);
351 hasch_seq(seq2, &hasched_seq2, &lu_seq2,ktup, alphabet);
355 /*EVALUATE THE DIAGONALS*/
356 for ( a=0; a<= n_diag; a++)diag[a][0]=a;
357 for ( n_dots=0,a=1; a<= n_ktup; a++)
359 pos_ktup1=lu_seq1[a];
362 if (!pos_ktup1)break;
363 pos_ktup2=lu_seq2[a];
367 pos_ktup2=hasched_seq2[pos_ktup2];
369 pos_ktup1=hasched_seq1[pos_ktup1];
378 vfree (hasched_seq1);
379 vfree (hasched_seq2);
383 return evaluate_segments_with_ktup (A,ns,l_s,CL,maximise,n_groups, group_list,ktup-1);
386 dot_list=declare_int ( n_dots,3);
388 for ( n_dots=0,a=1; a<= n_ktup; a++)
390 pos_ktup1=lu_seq1[a];
393 if (!pos_ktup1)break;
394 pos_ktup2=lu_seq2[a];
397 current_diag=(pos_ktup2-pos_ktup1+l1);
398 dot_list[n_dots][0]=current_diag;
399 dot_list[n_dots][1]=pos_ktup1;
400 dot_list[n_dots][2]=pos_ktup2;
401 pos_ktup2=hasched_seq2[pos_ktup2];
404 pos_ktup1=hasched_seq1[pos_ktup1];
410 hsort_list_array ((void **)dot_list, n_dots, sizeof (int), 3, 0, 3);
411 current_diag= (int)dot_list[0][0];
413 for ( b=0; b< ktup; b++)diag[current_diag][2]+=(CL->get_dp_cost) ( A, pos, ns[0], l_s[0], dot_list[0][1]+b-1, pos,ns[1], l_s[1], dot_list[0][2]+b-1, CL);
416 for ( l=0,a=1; a< n_dots; a++)
419 delta_diag=dot_list[a][0]-dot_list[a-1][0];
420 delta_res =dot_list[a][1]-dot_list[a-1][1];
422 for ( cost=0, b=0; b< ktup; b++)cost++;
424 /*=(CL->get_dp_cost) ( A, pos, ns[0], l_s[0], dot_list[a][1]+b-1, pos,ns[1], l_s[1], dot_list[a][2]+b-1, CL);*/
428 if (delta_diag!=0 || FABS(delta_res)>5)
432 diag[current_diag][1]=best_of_a_b(diag[current_diag][2], diag[current_diag][1], 1);
433 if ( diag[current_diag][2]<0);
434 else diag[current_diag][1]= MAX(diag[current_diag][1],diag[current_diag][2]);
435 diag[current_diag][2]=0;
436 current_diag=dot_list[a][0];
439 diag[current_diag][2]+=cost;
442 diag[current_diag][1]=best_of_a_b(diag[current_diag][2], diag[current_diag][1], 1);
443 sort_int (diag+1, 3, 1,0, n_diag-1);
449 vfree (hasched_seq1);
450 vfree (hasched_seq2);
454 free_int (dot_list, -1);
462 int ** evaluate_diagonals_with_clist ( Alignment *A, int *ns, int **l_s, Constraint_list *CL,int maximise,int n_groups, char **group_list, int ktup)
466 Reads in an alignmnent A, with two groups of sequences marked.
467 Weight the diagonals with the values read in the constraint list
470 int l1, l2,n_diag, s1, s2, r1=0, r2=0;
478 if ( !entry)entry=vcalloc ( CL->entry_len+1, CL->el_size);
479 l1=strlen (A->seq_al[l_s[0][0]]);
480 l2=strlen (A->seq_al[l_s[1][0]]);
483 diag=declare_int ( n_diag+2, 3);
484 for ( a=0; a<= n_diag; a++)diag[a][0]=a;
487 code=seq2aln_pos (A, ns, l_s);
488 pos =aln2pos_simple ( A,-1, ns, l_s);
491 for (a=0; a<ns[0]; a++)
494 s1=A->order[l_s[0][a]][0];
495 for (b=0; b<ns[1]; b++)
497 s2=A->order[l_s[1][b]][0];
498 for (r1=1; r1<=(A->S)->len[s1]; r1++)
501 for (e=1; e<CL->residue_index[s1][r1][0]; e+=ICHUNK)
503 if (CL->residue_index[s1][r1][e+SEQ2]==s2)
505 r2=CL->residue_index[s1][r1][e+R2];
506 diag[(r2-r1+l1)][1]+=(CL->get_dp_cost) ( A, pos, ns[0], l_s[0],r1-1, pos,ns[1], l_s[1], r2-1, CL);
513 sort_int (diag+1, 2, 1,0, n_diag-1);
520 int * flag_diagonals (int l1, int l2, int **sorted_diag, float T, int window)
522 int a, b, up, low,current_diag,n_diag;
531 mean=return_mean_int ( sorted_diag, n_diag+1, 1);
533 sd =return_sd_int ( sorted_diag, n_diag+1, 1, (int)mean);
538 T=(((double)sorted_diag[n_diag][1]-mean)/sd)/25;
542 diag_list=vcalloc (l1+l2+1, sizeof (int));
543 slopes=vcalloc ( n_diag+1, sizeof (int));
545 for ( a=n_diag; a>0; a--)
547 current_diag=sorted_diag[a][0];
550 if ( !use_z_score && sorted_diag[a][1]>T)
552 up=MAX(1,current_diag-window);
553 low=MIN(n_diag, current_diag+window);
554 for ( b=up; b<=low; b++)slopes[b]=1;
556 else if (use_z_score && ((double)sorted_diag[a][1]-mean)/sd>T)
558 up=MAX(1,current_diag-window);
559 low=MIN(n_diag, current_diag+window);
560 for ( b=up; b<=low; b++)slopes[b]=1;
565 for ( a=1, b=0; a<=n_diag; a++)
573 for (a=0; a<= (l1+l2-1); a++)
574 if ( slopes[a]){diag_list[++diag_list[0]]=a;}
580 int * extract_N_diag (int l1, int l2, int **sorted_diag, int n_chosen_diag, int window)
582 int a, b, up, low,current_diag,n_diag;
589 diag_list=vcalloc (l1+l2+1, sizeof (int));
590 slopes=vcalloc ( n_diag+1, sizeof (int));
593 for ( a=n_diag; a>0 && a>(n_diag-n_chosen_diag); a--)
595 current_diag=sorted_diag[a][0];
596 up=MAX(1,current_diag-window);
597 low=MIN(n_diag, current_diag+window);
599 for ( b=up; b<=low; b++)slopes[b]=1;
602 /*flag bottom right*/
603 up=MAX(1,1-window);low=MIN(n_diag,1+window);
604 for ( a=up; a<=low; a++) slopes[a]=1;
607 up=MAX(1,(l1+l2-1)-window);low=MIN(n_diag,(l1+l2-1)+window);
608 for ( a=up; a<=low; a++) slopes[a]=1;
611 /*flag MAIN DIAG SEQ1*/
612 up=MAX(1,l1-window);low=MIN(n_diag,l1+window);
613 for ( a=up; a<=low; a++) slopes[a]=1;
615 /*flag MAIN DIAG SEQ2*/
616 up=MAX(1,l2-window);low=MIN(n_diag,l2+window);
617 for ( a=up; a<=low; a++) slopes[a]=1;
620 for (a=0; a<= (l1+l2-1); a++)
621 if ( slopes[a]){diag_list[++diag_list[0]]=a;}
630 int cfasta_gotoh_pair_wise (Alignment *A,int*ns, int **l_s,Constraint_list *CL)
632 /*TREATMENT OF THE TERMINAL GAP PENALTIES*/
633 /*TG_MODE=0---> gop and gep*/
634 /*TG_MODE=1---> --- gep*/
635 /*TG_MODE=2---> --- ---*/
640 /*VARIABLES FOR THE MULTIPLE SEQUENCE ALIGNMENT*/
646 static char **group_list;
647 int score, new_score;
648 int n_chosen_diag=20;
650 int max_n_chosen_diag;
652 /********Prepare Penalties******/
655 maximise=CL->maximise;
658 /********************************/
666 group_list=make_group_aa (&n_groups, CL->matrix_for_aa_group);
669 l1=strlen (A->seq_al[l_s[0][0]]);
670 l2=strlen (A->seq_al[l_s[1][0]]);
672 if ( !CL->fasta_step)
675 step=(int) log ((double)MAX(step, 1));
684 tot_diag=evaluate_diagonals ( A, ns, l_s, CL, maximise,n_groups,group_list, ktup);
687 max_n_chosen_diag=strlen (A->seq_al[l_s[0][0]])+strlen (A->seq_al[l_s[1][0]])-1;
688 /*max_n_chosen_diag=(int)log10((double)(l1+l2))*10;*/
691 n_chosen_diag=MIN(n_chosen_diag, max_n_chosen_diag);
694 diag=extract_N_diag (strlen (A->seq_al[l_s[0][0]]),strlen (A->seq_al[l_s[1][0]]), tot_diag, n_chosen_diag, 0);
697 score =make_fasta_gotoh_pair_wise ( A, ns, l_s, CL, diag);
703 while (new_score!=score && n_chosen_diag< max_n_chosen_diag )
709 ungap_sub_aln ( A, ns[0], l_s[0]);
710 ungap_sub_aln ( A, ns[1], l_s[1]);
714 n_chosen_diag=MIN(n_chosen_diag, max_n_chosen_diag);
717 diag =extract_N_diag (strlen (A->seq_al[l_s[0][0]]),strlen (A->seq_al[l_s[1][0]]), tot_diag, n_chosen_diag, 0);
718 new_score=make_fasta_gotoh_pair_wise ( A, ns, l_s, CL, diag);
725 free_int (tot_diag, -1);
730 int fasta_gotoh_pair_wise (Alignment *A,int*ns, int **l_s,Constraint_list *CL)
732 /*TREATMENT OF THE TERMINAL GAP PENALTIES*/
733 /*TG_MODE=0---> gop and gep*/
734 /*TG_MODE=1---> --- gep*/
735 /*TG_MODE=2---> --- ---*/
740 /*VARIABLES FOR THE MULTIPLE SEQUENCE ALIGNMENT*/
744 float diagonal_threshold;
746 static char **group_list;
748 /********Prepare Penalties******/
751 maximise=CL->maximise;
753 diagonal_threshold=CL->diagonal_threshold;
754 /********************************/
760 group_list=make_group_aa (&n_groups, CL->matrix_for_aa_group);
764 tot_diag=evaluate_diagonals ( A, ns, l_s, CL, maximise,n_groups,group_list, ktup);
766 if ( !CL->fasta_step)
768 diag=flag_diagonals (strlen(A->seq_al[l_s[0][0]]),strlen(A->seq_al[l_s[1][0]]), tot_diag,diagonal_threshold,0);
774 diag=extract_N_diag (strlen (A->seq_al[l_s[0][0]]),strlen (A->seq_al[l_s[1][0]]), tot_diag,CL->fasta_step,0);
777 score=make_fasta_gotoh_pair_wise ( A, ns, l_s, CL, diag);
779 free_int (tot_diag, -1);
783 int very_fast_gotoh_pair_wise (Alignment *A,int*ns, int **l_s,Constraint_list *CL)
785 /*TREATMENT OF THE TERMINAL GAP PENALTIES*/
786 /*TG_MODE=0---> gop and gep*/
787 /*TG_MODE=1---> --- gep*/
788 /*TG_MODE=2---> --- ---*/
792 /*VARIABLES FOR THE MULTIPLE SEQUENCE ALIGNMENT*/
797 static char **group_list;
799 /********Prepare Penalties******/
802 maximise=CL->maximise;
804 /********************************/
810 group_list=make_group_aa (&n_groups, CL->matrix_for_aa_group);
814 tot_diag=evaluate_diagonals ( A, ns, l_s, CL, maximise,n_groups,group_list, ktup);
816 /*Note: 20 diagonals. 5 shadows on each side: tunned on Hom39, 2/2/04 */
817 diag=extract_N_diag (strlen (A->seq_al[l_s[0][0]]),strlen (A->seq_al[l_s[1][0]]), tot_diag,20,5);
818 score=make_fasta_gotoh_pair_wise ( A, ns, l_s, CL, diag);
819 free_int (tot_diag, -1);
823 int make_fasta_gotoh_pair_wise (Alignment *A,int*ns, int **l_s,Constraint_list *CL, int *diag)
825 /*TREATMENT OF THE TERMINAL GAP PENALTIES*/
826 /*TG_MODE=0---> gop and gep*/
827 /*TG_MODE=1---> --- gep*/
828 /*TG_MODE=2---> --- ---*/
831 int TG_MODE, gop, l_gop, gep,l_gep, maximise;
833 /*VARIABLES FOR THE MULTIPLE SEQUENCE ALIGNMENT*/
835 int l1, l2,eg, ch, sub,score=0, last_i=0, last_j=0, i, delta_i, j, pos_j, ala, alb, LEN, n_diag, match1, match2;
838 int **C, **D, **I, **trace, **pos0, **LD;
840 char *buffer, *char_buf;
843 /********Prepare Penalties******/
847 maximise=CL->maximise;
850 /********************************/
857 l1=lenal[0]=strlen (A->seq_al[l_s[0][0]]);
858 l2=lenal[1]=strlen (A->seq_al[l_s[1][0]]);
860 if ( getenv ("DEBUG_TCOFFEE"))fprintf ( stderr, "\n\tNdiag=%d%% ", (diag[0]*100)/(l1+l2));
863 diag[1..n_diag]--> flaged diagonal in order;
864 diag[0]=0--> first diagonal;
865 diag[n_diag+1]=l1+l2-1;
868 /*numeration of the diagonals strats from the bottom right [1...l1+l2-1]*/
869 /*sequence s1 is vertical and seq s2 is horizontal*/
870 /*D contains the best Deletion in S2==>comes from diagonal N+1*/
871 /*I contains the best insertion in S2=> comes from diagonal N-1*/
877 C=declare_int (lenal[0]+lenal[1]+1, n_diag+2);
878 D=declare_int (lenal[0]+lenal[1]+1, n_diag+2);
879 LD=declare_int (lenal[0]+lenal[1]+1, n_diag+2);
880 I=declare_int (lenal[0]+lenal[1]+1, n_diag+2);
881 trace=declare_int (lenal[0]+lenal[1]+1, n_diag+2);
884 al=declare_char (2,lenal[0]+lenal[1]+lenal[1]+1);
886 len= MAX(lenal[0],lenal[1])+1;
887 buffer=vcalloc ( 2*len, sizeof (char));
888 char_buf= vcalloc (2*len, sizeof (char));
890 pos0=aln2pos_simple ( A,-1, ns, l_s);
893 t=(TG_MODE==0)?gop:0;
894 for ( j=1; j<= n_diag; j++)
896 l_gop=(TG_MODE==0)?gop:0;
897 l_gep=(TG_MODE==2)?0:gep;
901 if ( (diag[j]-lenal[0])<0 )
903 trace[0][j]=UNDEFINED;
906 C[0][j]=(diag[j]-lenal[0])*l_gep +l_gop;
907 D[0][j]=(diag[j]-lenal[0])*l_gep +l_gop+gop;
909 D[0][j]=D[0][j-1]+gep;
912 t=(TG_MODE==0)?gop:0;
913 for ( i=1; i<=lenal[0]; i++)
915 l_gop=(TG_MODE==0)?gop:0;
916 l_gep=(TG_MODE==2)?0:gep;
918 C[i][0]=C[i][n_diag+1]=t=t+l_gep;
919 I[i][0]=D[i][n_diag+1]=t+ gop;
921 for ( j=1; j<=n_diag; j++)
924 D[i][j]=I[i][j]=I[i][0];
927 for (eg=0, j=1; j<=n_diag; j++)
930 pos_j=diag[j]-lenal[0]+i;
931 if (pos_j<=0 || pos_j>l2 )
933 trace[i][j]=UNDEFINED;
936 sub=(CL->get_dp_cost) ( A, pos0, ns[0], l_s[0], i-1, pos0, ns[1], l_s[1],pos_j-1, CL );
938 /*1 identify the best insertion in S2:*/
939 l_gop=(i==lenal[0])?((TG_MODE==0)?gop:0):gop;
940 l_gep=(i==lenal[0])?((TG_MODE==2)?0:gep):gep;
941 len=(j==1)?0:(diag[j]-diag[j-1]);
942 if ( a_better_than_b(I[i][j-1], C[i][j-1]+l_gop, maximise))eg++;
944 I[i][j]=best_of_a_b (I[i][j-1], C[i][j-1]+l_gop, maximise)+len*l_gep;
946 /*2 Identify the best deletion in S2*/
947 l_gop=(pos_j==lenal[1])?((TG_MODE==0)?gop:0):gop;
948 l_gep=(pos_j==lenal[1])?((TG_MODE==2)?0:gep):gep;
950 len=(j==n_diag)?0:(diag[j+1]-diag[j]);
951 delta_i=((i-len)>0)?(i-len):0;
953 if ( a_better_than_b(D[delta_i][j+1],C[delta_i][j+1]+l_gop, maximise)){LD[i][j]=LD[delta_i][j+1]+1;}
955 D[i][j]=best_of_a_b (D[delta_i][j+1],C[delta_i][j+1]+l_gop, maximise)+len*l_gep;
958 /*Identify the best way*/
960 score=C[i][j]=best_int ( 3, maximise, &fop, I[i][j], C[i-1][j]+sub, D[i][j]);
962 if ( fop<0)trace[i][j]=fop*eg;
963 else if ( fop>0 ) {trace[i][j]=fop*LD[i][j];}
964 else if ( fop==0) trace[i][j]=0;
971 /*HERE ("%d %d %d", su, in, de);*/
972 if (su>=in && su>=de)
1016 while (!(match1==l1 && match2==l2))
1023 for ( a=0; a< len; a++)
1035 else if ( match2==l2)
1038 for ( a=0; a< len; a++)
1056 if ( match2==l2 || match1==l1);
1070 len=diag[j+k]-diag[j];
1071 for ( a=0; a<len; a++)
1073 if ( match1==l1)break;
1084 len=diag[j]-diag[j-k];
1085 for ( a=0; a<len; a++)
1087 if ( match2==l2)break;
1100 invert_list_char ( al[0], LEN);
1101 invert_list_char ( al[1], LEN);
1102 if ( A->declared_len<=LEN)A=realloc_aln2 ( A,A->max_n_seq, 2*LEN);
1105 for ( c=0; c< 2; c++)
1107 for ( a=0; a< ns[c]; a++)
1110 for ( b=0; b< LEN; b++)
1113 char_buf[b]=aln[l_s[c][a]][ch++];
1118 sprintf (aln[l_s[c][a]],"%s", char_buf);
1124 A->nseq=ns[0]+ns[1];
1126 free_int (pos0, -1);
1130 free_int (trace, -1);
1132 free_char ( al, -1);
1140 int hasch_seq(char *seq, int **hs, int **lu,int ktup,char *alp)
1144 int i,j,l,limit,code,flag;
1155 for ( i=0; i< alp_size; i++)
1157 alp_lu[(int)alp[i]]=i;
1163 limit = (int) pow((double)(alp_size+1),(double)ktup);
1164 hs[0]=vcalloc ( l+1,sizeof (int));
1165 lu[0]=vcalloc ( limit+1, sizeof(int));
1168 if ( l==0)myexit(EXIT_FAILURE);
1170 for (i=1;i<=ktup;i++)
1171 a[i] = (int) pow((double)(alp_size+1),(double)(i-1));
1174 for(i=1;i<=(l-ktup+1);++i)
1178 for(j=1;j<=ktup;++j)
1180 if (is_gap(seq[i+j-2])){flag=TRUE;break;}
1181 else residue=alp_lu[(int)seq[i+j-2]];
1188 if (lu[0][code])hs[0][i]=lu[0][code];
1196 /*********************************************************************/
1201 /*********************************************************************/
1203 /**************Hasch DAta Handling*******************************************************/
1205 struct Hasch_data * free_ktup_hasch_data (struct Hasch_data *d);
1206 struct Hasch_data * declare_ktup_hasch_data (struct Hasch_entry *e);
1207 struct Hasch_data * allocate_ktup_hasch_data (struct Hasch_data *e, int action);
1213 typedef struct Hasch_data Hasch_data;
1214 struct Hasch_data * free_ktup_hasch_data (struct Hasch_data *d)
1216 return allocate_ktup_hasch_data (d, FREE);
1218 struct Hasch_data * declare_ktup_hasch_data (struct Hasch_entry *e)
1220 e->data=allocate_ktup_hasch_data (NULL,DECLARE);
1224 struct Hasch_data * allocate_ktup_hasch_data (struct Hasch_data *e, int action)
1226 static struct Hasch_data **heap;
1227 static int heap_size, free_heap, a;
1231 fprintf ( stderr, "\nHeap size: %d, Free Heap: %d", heap_size, free_heap);
1234 else if ( action==DECLARE)
1239 heap_size+=free_heap;
1240 heap=vrealloc (heap,heap_size*sizeof (struct Hasch_entry *));
1241 for ( a=0; a<free_heap; a++)
1243 (heap[a])=vcalloc ( 1, sizeof ( struct Hasch_entry *));
1244 (heap[a])->list=vcalloc ( 10, sizeof (int));
1245 (heap[a])->list[0]=10;
1248 return heap[--free_heap];
1250 else if ( action==FREE)
1252 heap[free_heap++]=e;
1260 /**************Hasch DAta Handling*******************************************************/
1262 int precomputed_pair_wise (Alignment *A,int*ns, int **l_s,Constraint_list *CL)
1264 int l1, l2, a, b, c;
1265 int nid=0, npos=0, id;
1268 l1=strlen(A->seq_al[l_s[0][0]]);
1269 l2=strlen(A->seq_al[l_s[1][0]]);
1272 fprintf ( stderr, "\nERROR: improper use of the function precomputed pairwise:[FATAL:%s]", PROGRAM);
1277 A->score_aln=A->score=0;
1281 for (npos=0, nid=0, a=0; a< ns[0]; a++)
1285 for (b=0; b< ns[1]; b++)
1288 for ( c=0; c<l1; c++)
1290 r1=A->seq_al[s1][c];
1291 r2=A->seq_al[s2][c];
1292 if ( is_gap(r1) || is_gap(r2));
1301 id=(npos==0)?0:((nid*100)/npos);
1302 A->score=A->score_aln=id;
1305 int ktup_comparison_str ( char *seq1, char *seq2, const int ktup);
1306 int ktup_comparison_hasch ( char *i_seq1, char *i_seq2, const int ktup);
1307 int ktup_pair_wise (Alignment *A,int*ns, int **l_s,Constraint_list *CL)
1319 gl=make_group_aa (&ng, "vasiliky");
1322 if ( ns[0]>1)seq1=sub_aln2cons_seq_mat (A, ns[0], l_s[0],"blosum62mt");
1325 seq1=vcalloc ( strlen (A->seq_al[l_s[0][0]])+1, sizeof (char));
1326 sprintf ( seq1, "%s",A->seq_al[l_s[0][0]]);
1328 if ( ns[1]>1)seq2=sub_aln2cons_seq_mat (A, ns[1], l_s[1],"blosum62mt");
1331 seq2=vcalloc ( strlen (A->seq_al[l_s[1][0]])+1, sizeof (char));
1332 sprintf ( seq2, "%s",A->seq_al[l_s[1][0]]);
1335 if ( strlen (seq1)<min_len || strlen (seq2)<min_len)
1339 ungap(seq1); ungap(seq2);
1340 B=align_two_sequences ( seq1, seq2, "blosum62mt",-10, -1, "myers_miller_pair_wise");
1341 A->score=A->score_aln=aln2sim(B, "idmat");
1348 string_convert (seq1, ng, gl);
1349 string_convert (seq2, ng, gl);
1350 A->score=A->score_aln=ktup_comparison (seq1,seq2, CL->ktup);
1353 vfree (seq1); vfree (seq2);
1356 int ktup_comparison( char *seq2, char *seq1, const int ktup)
1358 return ktup_comparison_hasch ( seq2, seq1, ktup);
1360 int ktup_comparison_str ( char *seq2, char *seq1, const int ktup)
1362 int a,l1, l2,c1, c2, end, start;
1367 if ( max_dist==-1)max_dist=MAX((strlen (seq1)),(strlen (seq2)));
1368 l1=strlen (seq1)-ktup;
1372 for ( a=0; a< l1; a++)
1374 c1=seq1[a+ktup];seq1[a+ktup]='\0';
1377 start=((a-max_dist)<0)?0:a-max_dist;
1378 end=((a+max_dist)>=l2)?l2:a+max_dist;
1380 c2=seq2[end];seq2[end]='\0';
1383 score+=(strstr(s2, s1)!=NULL)?1:0;
1388 score/=(l1==0)?1:l1;
1389 score=((log(0.1+score)-log(0.1))/(log(1.1)-log(0.1)));
1394 int ktup_comparison_hasch ( char *i_seq1, char *i_seq2, const int ktup)
1396 /*Ktup comparison adapted from Rob Edgar, NAR, vol32, No1, 381, 2004*/
1397 /*1: hasch sequence 1
1398 2: Count the number of seq2 ktup found in seq1
1409 int p, a, max_dist=-1;
1414 if (!strm (i_seq1, pseq))
1418 hdestroy (H1, declare_ktup_hasch_data, free_ktup_hasch_data);
1419 string2key (NULL, NULL);
1421 H1=hasch_sequence ( i_seq1, ktup);
1422 vfree (pseq);pseq=vcalloc ( strlen (i_seq1)+1, sizeof (char));
1423 sprintf ( pseq, "%s", i_seq1);
1426 ls=l=strlen (i_seq2);
1431 c=s[ktup];s[ktup]='\0';
1432 key=string2key (s, NULL);
1433 e=hsearch (H1,key,FIND, declare_ktup_hasch_data, free_ktup_hasch_data);
1436 else if ( max_dist==-1)score++;
1439 for ( a=1; a<=(e->data)->list[1]; a++)
1440 if (FABS((p-(e->data)->list[a]))<=max_dist)
1443 s[ktup]=c;s++;p++;ls--;
1446 score=(log(0.1+score)-log(0.1))/(log(1.1)-log(0.1));
1448 if ( score>100) score=100;
1449 return (int)(score*100);
1452 HaschT* hasch_sequence ( char *seq1, int ktup)
1455 int key, offset=0, ls;
1459 H=hcreate ( strlen (seq1), declare_ktup_hasch_data, free_ktup_hasch_data);
1463 c=seq1[ktup];seq1[ktup]='\0';
1464 key=string2key (seq1, NULL);
1465 e=hsearch (H,key,FIND, declare_ktup_hasch_data, free_ktup_hasch_data);
1469 e=hsearch (H,key,ADD,declare_ktup_hasch_data,free_ktup_hasch_data);
1470 (e->data)->list[++(e->data)->list[1]+1]=offset;
1474 if ((e->data)->list[0]==((e->data)->list[1]+2)){(e->data)->list[0]+=10;(e->data)->list=vrealloc ((e->data)->list,(e->data)->list[0]*sizeof (int));}
1475 (e->data)->list[++(e->data)->list[1]+1]=offset;
1477 seq1[ktup]=c;seq1++;ls--;
1485 char *dayhoff_translate (char *seq1)
1489 for ( a=0; a< l; a++)
1492 if ( strchr ("agpst", c))seq1[a]='a';
1493 else if (strchr ("denq", c))seq1[a]='d';
1494 else if (strchr ("fwy", c))seq1[a]='f';
1495 else if (strchr ("hkr", c))seq1[a]='h';
1496 else if (strchr ("ilmv", c))seq1[a]='i';
1501 int ** evaluate_diagonals_with_ktup ( Alignment *A, int *ns, int **l_s, Constraint_list *CL,int maximise,int n_groups, char **group_list, int ktup)
1503 /*Ktup comparison as in Rob Edgar, NAR, vol32, No1, 381, 2004*/
1507 Hasch_entry *e1, *e2;
1508 char *s, *sb, *seq1, *seq2;
1511 int **diag,n_diag, ktup1, ktup2,a,b,c,d, **pos;
1514 pos=aln2pos_simple ( A,-1, ns, l_s);
1516 seq1=aln2cons_maj (A, ns[0], l_s[0], n_groups, group_list);
1517 seq2=aln2cons_maj (A, ns[1], l_s[1], n_groups, group_list);
1523 diag=declare_int (n_diag+2, 3);
1524 for ( a=0; a<n_diag+2; a++)diag[a][0]=a;
1526 H1=hasch_sequence ( seq1, ktup);
1527 H2=hasch_sequence ( seq2, ktup);
1528 s=sb=vcalloc (strlen (seq1)+strlen (seq2)+1, sizeof (char));
1529 sprintf (s, "%s%s", seq1, seq2);
1534 character=s[ktup];s[ktup]='\0';
1535 key=string2key (s, NULL);
1536 e1=hsearch (H1,key,FIND,declare_ktup_hasch_data, free_ktup_hasch_data);
1537 e2=hsearch (H2,key,FIND,declare_ktup_hasch_data, free_ktup_hasch_data);
1542 for (b=2; b<(e1->data)->list[1]+2; b++)
1543 for (c=2; c<(e2->data)->list[1]+2; c++)
1546 ktup1=(e1->data)->list[b];
1547 ktup2=(e2->data)->list[c];
1548 diag[(ktup2-ktup1)+l1][2]++;
1549 for (score=0, d=0; d<ktup; d++)
1550 score+=(CL->get_dp_cost) ( A, pos, ns[0], l_s[0], ktup1+d, pos,ns[1], l_s[1], ktup2+d, CL);
1551 diag[(ktup2-ktup1)+l1][1]+=score;
1554 (e1->data)->list[1]=(e2->data)->list[1]=0;
1556 s[ktup]=character;s++;ls--;
1559 sort_int (diag+1, 2, 1,0,n_diag-1);
1561 hdestroy (H1,declare_ktup_hasch_data, free_ktup_hasch_data); hdestroy (H2,declare_ktup_hasch_data, free_ktup_hasch_data);
1562 vfree (seq1); vfree (seq2);vfree (sb);free_int (pos, -1);
1565 /*********************************************************************/
1570 /*********************************************************************/
1571 int ** evaluate_diagonals_with_ktup_1 ( Alignment *A, int *ns, int **l_s, Constraint_list *CL,int maximise,int n_groups, char **group_list, int ktup)
1574 Reads in an alignmnent A, with two groups of sequences marked.
1575 1-Turn each group into a conscensus, using the group list identifier.
1576 -if the group list is left empty original symbols are used
1577 2-hasch the two sequences
1578 3-score each diagonal, sort the list and return it (diag_list)
1584 char *seq1, *seq2, *alphabet=NULL;
1585 int a,b,l1, l2, n_ktup,pos_ktup1, pos_ktup2, **pos;
1586 int *hasched_seq1, *hasched_seq2,*lu_seq1,*lu_seq2;
1587 int n_diag, **diag, current_diag, n_dots;
1589 pos=aln2pos_simple ( A,-1, ns, l_s);
1592 seq1=aln2cons_seq (A, ns[0], l_s[0], n_groups, group_list);
1593 seq2=aln2cons_seq (A, ns[1], l_s[1], n_groups, group_list);
1598 alphabet=get_alphabet (seq1,alphabet);
1599 alphabet=get_alphabet (seq2,alphabet);
1605 diag=declare_int ( n_diag+2, 3);
1606 n_ktup=(int)pow ( (double)alphabet[0]+1, (double)ktup);
1609 hasch_seq(seq1, &hasched_seq1, &lu_seq1,ktup, alphabet);
1610 hasch_seq(seq2, &hasched_seq2, &lu_seq2,ktup, alphabet);
1615 /*EVALUATE THE DIAGONALS*/
1616 for ( a=0; a<= n_diag; a++)diag[a][0]=a;
1617 for ( n_dots=0,a=1; a<= n_ktup; a++)
1619 pos_ktup1=lu_seq1[a];
1622 if (!pos_ktup1)break;
1623 pos_ktup2=lu_seq2[a];
1626 current_diag=(pos_ktup2-pos_ktup1+l1);
1627 for ( b=0; b< ktup; b++)
1629 diag[current_diag][1]+=(CL->get_dp_cost) ( A, pos, ns[0], l_s[0], pos_ktup1+b-1, pos,ns[1], l_s[1], pos_ktup2+b-1, CL);
1633 diag[current_diag][2]++;
1634 pos_ktup2=hasched_seq2[pos_ktup2];
1636 pos_ktup1=hasched_seq1[pos_ktup1];
1644 buf=vcalloc ( 30, sizeof (30));
1645 sprintf ( buf, "abcdefghijklmnopqrstuvwxyz");
1647 vfree ( hasched_seq1);
1648 vfree ( hasched_seq2);
1651 return evaluate_diagonals_with_ktup ( A,ns,l_s, CL,maximise,1,&buf,1);
1655 sort_int (diag+1, 2, 1,0, n_diag-1);
1659 vfree ( hasched_seq1);
1660 vfree ( hasched_seq2);
1666 /////////////////////////////////////////////////////////////////
1668 Constraint_list * hasch2constraint_list (Sequence*S, Constraint_list *CL)
1676 entry=vcalloc ( CL->entry_len+1, sizeof (int));
1678 for (a=0; a<S->nseq; a++)
1680 H=seq2hasch (a, S->seq[a],ktup,H);
1688 for (a=0; a<h->n-2; a+=2)
1690 for (b=a+2; b<h->n; b+=2)
1693 if (h->l[a]==h->l[b])continue;
1696 for (c=0; c<ktup; c++)
1698 entry[SEQ1]=h->l[a];
1699 entry[SEQ2]=h->l[b];
1700 entry[R1]=h->l[a+1]+c;
1701 entry[R2]=h->l[b+1]+c;
1703 add_entry2list (entry,CL);
1713 SeqHasch *cleanhasch (SeqHasch *H)
1717 N=vcalloc (2, sizeof (SeqHasch));
1730 int hasch2sim (SeqHasch *H, int nseq)
1739 for (ps=-1,ns=0,a=0; a<(H[n])->n; a+=2)
1741 //HERE ("%d--[%d %d]",n, (H[n])->l[a], (H[n])->l[a+1]);
1751 return (id*MAXID)/tot;
1753 SeqHasch * seq2hasch (int i,char *seq, int ktup, SeqHasch *H)
1761 H=vcalloc (2, sizeof (SeqHasch));
1762 H[0]=vcalloc (1, sizeof (hseq));
1772 for (a=0; a<l-ktup; a++)
1775 for (b=a; b<a+ktup; b++)
1779 if (!h->hl[r]) h->hl[r]=vcalloc (1, sizeof (hseq));
1786 h->l=vcalloc (2, sizeof (int));
1787 H=vrealloc (H,(n+2)*sizeof (SeqHasch));
1794 h->l=vrealloc (h->l, (h->n)*sizeof (int));
1803 /******************************COPYRIGHT NOTICE*******************************/
1804 /*© Centro de Regulacio Genomica */
1806 /*Cedric Notredame */
1807 /*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
1808 /*All rights reserved.*/
1809 /*This file is part of T-COFFEE.*/
1811 /* T-COFFEE is free software; you can redistribute it and/or modify*/
1812 /* it under the terms of the GNU General Public License as published by*/
1813 /* the Free Software Foundation; either version 2 of the License, or*/
1814 /* (at your option) any later version.*/
1816 /* T-COFFEE is distributed in the hope that it will be useful,*/
1817 /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
1818 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
1819 /* GNU General Public License for more details.*/
1821 /* You should have received a copy of the GNU General Public License*/
1822 /* along with Foobar; if not, write to the Free Software*/
1823 /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
1824 /*............................................... |*/
1825 /* If you need some more information*/
1826 /* cedric.notredame@europe.com*/
1827 /*............................................... |*/
1831 /******************************COPYRIGHT NOTICE*******************************/