8 #include "io_lib_header.h"
9 #include "util_lib_header.h"
10 #include "define_header.h"
11 #include "dp_lib_header.h"
14 int commonsextet( short *table, int *pointt );
15 void makecompositiontable( short *table, int *pointt );
16 int *code_seq (char *seq, char *type);
17 int * makepointtable( int *pointt, int *n, int ktup );
22 * calculates the number of common tuples
24 int commonsextet( short *table, int *pointt )
29 static short *memo = NULL;
30 static int *ct = NULL;
35 memo = vcalloc( tsize+1, sizeof( short ) );
36 ct = vcalloc( tsize+1, sizeof( int ) );
40 while( ( point = *pointt++ ) != END_ARRAY )
43 if( tmp < table[point] )
53 while( *cp != END_ARRAY )
60 * calculates how many of each tupble exist
62 void makecompositiontable( short *table, int *pointt )
66 while( ( point = *pointt++ ) != END_ARRAY )
72 int *code_seq (char *seq, char *type)
82 if ( strm (type, "DNA") || strm (type, "RNA"))
84 gl=declare_char (4,5);
85 sprintf ( gl[ng++], "Aa");
86 sprintf ( gl[ng++], "Gg");
87 sprintf ( gl[ng++], "TtUu");
88 sprintf ( gl[ng++], "Cc");
93 gl=make_group_aa ( &ng, "mafft");
95 aa=vcalloc ( 256, sizeof (int));
98 for ( b=0; b< strlen (gl[a]); b++)
111 if ( !code || read_array_size (code, sizeof (int))<(l+2))
114 code=vcalloc (l+2, sizeof (int));
120 code[a]=aa[(int)seq[a]];
128 int * makepointtable( int *pointt, int *n, int ktup )
138 prod=vcalloc ( ktup, sizeof (int));
139 for ( a=0; a<ktup; a++)
141 prod[ktup-a-1]=(int)pow(n[-1],a);
146 for (point=0,a=0; a<ktup; a++)
148 point+= *n++ *prod[a];
153 while( *n != END_ARRAY )
155 point -= *p++ * prod[0];
165 int ** ktup_dist_mat ( char **seq, int nseq, int ktup, char *type)
167 //Adapted from MAFFT 5: fast ktup
168 int **pointt,*code=NULL, **pscore;
170 double **mtx, score0;
173 if (!seq || nseq==0)return NULL;
174 for (minl=strlen(seq[0]),l=0,i=0;i<nseq; i++)
181 ktup=MIN(minl, ktup);
182 pointt=declare_int (nseq, l+1);
183 mtx=declare_double (nseq, nseq);
184 pscore=declare_int ( nseq, nseq);
186 for( i=0; i<nseq; i++ )
188 makepointtable( pointt[i], code=code_seq (seq[i], type),ktup);
190 tsize=(int)pow(code[-1], ktup);
192 for ( i=0; i<nseq; i++)
195 table1=vcalloc ( tsize,sizeof (short));
196 makecompositiontable( table1, pointt[i]);
197 for (j=i; j<nseq; j++)
199 mtx[i][j] = commonsextet( table1, pointt[j] );
203 for( i=0; i<nseq; i++ )
206 for( j=0; j<nseq; j++ )
207 pscore[i][j] = (int)( ( score0 - mtx[MIN(i,j)][MAX(i,j)] ) / score0 * 3 * 10.0 + 0.5 );
209 for( i=0; i<nseq-1; i++ )
210 for( j=i+1; j<nseq; j++ )
212 pscore[i][j] = pscore[j][i]=100-MIN( pscore[i][j], pscore[j][i] );
218 int ** evaluate_diagonals_with_ktup_1 ( Alignment *A, int *ns, int **l_s, Constraint_list *CL,int maximise,int n_groups, char **group_list, int ktup);
219 int ** evaluate_diagonals_with_ktup_2 ( Alignment *A, int *ns, int **l_s, Constraint_list *CL,int maximise,int n_groups, char **group_list, int ktup);
222 int ** evaluate_diagonals_for_two_sequences ( char *seq1, char *seq2,int maximise,Constraint_list *CL,int ktup)
227 static int *ns, **l_s;
237 CL=vcalloc ( 1, sizeof (Constraint_list));
239 sprintf ( CL->matrix_for_aa_group, "vasiliky");
240 CL->M=read_matrice ("blosum62mt");
241 CL->evaluate_residue_pair=evaluate_cdna_matrix_score;
242 CL->get_dp_cost=slow_get_dp_cost;
243 type=get_string_type(seq1);
245 if ( strm (type, "CDNA"))
246 CL->evaluate_residue_pair= evaluate_matrix_score;
247 else if ( strm(type, "PROTEIN"))
248 CL->evaluate_residue_pair=evaluate_matrix_score;
249 else if ( strm (type, "DNA") || strm (type, "RNA"))
250 CL->evaluate_residue_pair= evaluate_matrix_score;
263 gl=make_group_aa (&ng, CL->matrix_for_aa_group);
264 ns=vcalloc (2, sizeof (int));
266 l_s=declare_int (2, 2);
272 A=strings2aln (2, "A",seq1,"B", seq2);
278 diag=evaluate_diagonals ( A,ns, l_s, CL,maximise, ng, gl, ktup);
279 free_sequence (A->S, (A->S)->nseq);
283 free_int (CL->M, -1);
292 int ** evaluate_diagonals ( Alignment *A, int *ns, int **l_s, Constraint_list *CL,int maximise,int n_groups, char **group_list, int ktup)
300 tot_diag=evaluate_diagonals_with_clist ( A, ns, l_s, CL, maximise,n_groups,group_list, ktup);
302 else if ( CL->use_fragments)
305 tot_diag=evaluate_segments_with_ktup ( A, ns, l_s, CL, maximise,n_groups,group_list, ktup);
310 tot_diag=evaluate_diagonals_with_ktup ( A, ns, l_s, CL, maximise,n_groups,group_list, ktup);
315 int ** evaluate_segments_with_ktup ( Alignment *A, int *ns, int **l_s, Constraint_list *CL,int maximise,int n_groups, char **group_list, int ktup)
318 Reads in an alignmnet A, with two groups of sequences marked.
319 1-Turn each group into a conscensus, using the group list identifier.
320 -if the group list is left empty original symbols are used
321 2-hash groupc the two sequences
322 3-score each diagonal, sort the list and return it (diag_list)
325 char *seq1, *seq2, *alphabet=NULL;
326 int a,b,l1, l2, n_ktup,pos_ktup1, pos_ktup2, **pos;
327 int *hasched_seq1, *hasched_seq2,*lu_seq1,*lu_seq2;
328 int n_diag, **diag, current_diag, **dot_list, n_dots, cost;
329 int l,delta_diag, delta_res;
332 pos=aln2pos_simple ( A,-1, ns, l_s);
333 seq1=aln2cons_seq (A, ns[0], l_s[0], n_groups, group_list);
334 seq2=aln2cons_seq (A, ns[1], l_s[1], n_groups, group_list);
338 alphabet=get_alphabet (seq1,alphabet);
339 alphabet=get_alphabet (seq2,alphabet);
347 diag=declare_int ( n_diag+2, 3);
348 n_ktup=(int)pow ( (double)alphabet[0]+1, (double)ktup);
350 hasch_seq(seq1, &hasched_seq1, &lu_seq1,ktup, alphabet);
351 hasch_seq(seq2, &hasched_seq2, &lu_seq2,ktup, alphabet);
355 /*EVALUATE THE DIAGONALS*/
356 for ( a=0; a<= n_diag; a++)diag[a][0]=a;
357 for ( n_dots=0,a=1; a<= n_ktup; a++)
359 pos_ktup1=lu_seq1[a];
362 if (!pos_ktup1)break;
363 pos_ktup2=lu_seq2[a];
367 pos_ktup2=hasched_seq2[pos_ktup2];
369 pos_ktup1=hasched_seq1[pos_ktup1];
378 vfree (hasched_seq1);
379 vfree (hasched_seq2);
383 return evaluate_segments_with_ktup (A,ns,l_s,CL,maximise,n_groups, group_list,ktup-1);
386 dot_list=declare_int ( n_dots,3);
388 for ( n_dots=0,a=1; a<= n_ktup; a++)
390 pos_ktup1=lu_seq1[a];
393 if (!pos_ktup1)break;
394 pos_ktup2=lu_seq2[a];
397 current_diag=(pos_ktup2-pos_ktup1+l1);
398 dot_list[n_dots][0]=current_diag;
399 dot_list[n_dots][1]=pos_ktup1;
400 dot_list[n_dots][2]=pos_ktup2;
401 pos_ktup2=hasched_seq2[pos_ktup2];
404 pos_ktup1=hasched_seq1[pos_ktup1];
410 hsort_list_array ((void **)dot_list, n_dots, sizeof (int), 3, 0, 3);
411 current_diag= (int)dot_list[0][0];
413 for ( b=0; b< ktup; b++)diag[current_diag][2]+=(CL->get_dp_cost) ( A, pos, ns[0], l_s[0], dot_list[0][1]+b-1, pos,ns[1], l_s[1], dot_list[0][2]+b-1, CL);
416 for ( l=0,a=1; a< n_dots; a++)
419 delta_diag=dot_list[a][0]-dot_list[a-1][0];
420 delta_res =dot_list[a][1]-dot_list[a-1][1];
422 for ( cost=0, b=0; b< ktup; b++)cost++;
424 /*=(CL->get_dp_cost) ( A, pos, ns[0], l_s[0], dot_list[a][1]+b-1, pos,ns[1], l_s[1], dot_list[a][2]+b-1, CL);*/
428 if (delta_diag!=0 || FABS(delta_res)>5)
432 diag[current_diag][1]=best_of_a_b(diag[current_diag][2], diag[current_diag][1], 1);
433 if ( diag[current_diag][2]<0);
434 else diag[current_diag][1]= MAX(diag[current_diag][1],diag[current_diag][2]);
435 diag[current_diag][2]=0;
436 current_diag=dot_list[a][0];
439 diag[current_diag][2]+=cost;
442 diag[current_diag][1]=best_of_a_b(diag[current_diag][2], diag[current_diag][1], 1);
443 sort_int (diag+1, 3, 1,0, n_diag-1);
449 vfree (hasched_seq1);
450 vfree (hasched_seq2);
454 free_int (dot_list, -1);
462 int ** evaluate_diagonals_with_clist ( Alignment *A, int *ns, int **l_s, Constraint_list *CL,int maximise,int n_groups, char **group_list, int ktup)
466 Reads in an alignmnent A, with two groups of sequences marked.
467 Weight the diagonals with the values read in the constraint list
470 int l1, l2,n_diag, s1, s2, r1=0, r2=0;
478 if ( !entry)entry=vcalloc ( CL->entry_len, CL->el_size);
479 CL=index_constraint_list (CL);
481 l1=strlen (A->seq_al[l_s[0][0]]);
482 l2=strlen (A->seq_al[l_s[1][0]]);
485 diag=declare_int ( n_diag+2, 3);
486 for ( a=0; a<= n_diag; a++)diag[a][0]=a;
489 code=seq2aln_pos (A, ns, l_s);
490 pos =aln2pos_simple ( A,-1, ns, l_s);
493 for (a=0; a<ns[0]; a++)
496 s1=A->order[l_s[0][a]][0];
497 for (b=0; b<ns[1]; b++)
499 s2=A->order[l_s[1][b]][0];
500 for (c=CL->start_index[s1][s2], d=0; c<CL->end_index[s1][s2];c++, d++)
502 entry=extract_entry ( entry,c, CL);
505 r1=code [s1][entry[R1]];
506 r2=code [s2][entry[R2]];
508 else if ( s2==entry[SEQ1])
510 r2=code [s2][entry[R1]];
511 r1=code [s1][entry[R2]];
515 diag[(r2-r1+l1)][1]+=(CL->get_dp_cost) ( A, pos, ns[0], l_s[0],r1-1, pos,ns[1], l_s[1], r2-1, CL);
522 sort_int (diag+1, 2, 1,0, n_diag-1);
529 int * flag_diagonals (int l1, int l2, int **sorted_diag, float T, int window)
531 int a, b, up, low,current_diag,n_diag;
540 mean=return_mean_int ( sorted_diag, n_diag+1, 1);
542 sd =return_sd_int ( sorted_diag, n_diag+1, 1, (int)mean);
547 T=(((double)sorted_diag[n_diag][1]-mean)/sd)/25;
551 diag_list=vcalloc (l1+l2+1, sizeof (int));
552 slopes=vcalloc ( n_diag+1, sizeof (int));
554 for ( a=n_diag; a>0; a--)
556 current_diag=sorted_diag[a][0];
559 if ( !use_z_score && sorted_diag[a][1]>T)
561 up=MAX(1,current_diag-window);
562 low=MIN(n_diag, current_diag+window);
563 for ( b=up; b<=low; b++)slopes[b]=1;
565 else if (use_z_score && ((double)sorted_diag[a][1]-mean)/sd>T)
567 up=MAX(1,current_diag-window);
568 low=MIN(n_diag, current_diag+window);
569 for ( b=up; b<=low; b++)slopes[b]=1;
574 for ( a=1, b=0; a<=n_diag; a++)
582 for (a=0; a<= (l1+l2-1); a++)
583 if ( slopes[a]){diag_list[++diag_list[0]]=a;}
589 int * extract_N_diag (int l1, int l2, int **sorted_diag, int n_chosen_diag, int window)
591 int a, b, up, low,current_diag,n_diag;
598 diag_list=vcalloc (l1+l2+1, sizeof (int));
599 slopes=vcalloc ( n_diag+1, sizeof (int));
602 for ( a=n_diag; a>0 && a>(n_diag-n_chosen_diag); a--)
604 current_diag=sorted_diag[a][0];
605 up=MAX(1,current_diag-window);
606 low=MIN(n_diag, current_diag+window);
608 for ( b=up; b<=low; b++)slopes[b]=1;
611 /*flag bottom right*/
612 up=MAX(1,1-window);low=MIN(n_diag,1+window);
613 for ( a=up; a<=low; a++) slopes[a]=1;
616 up=MAX(1,(l1+l2-1)-window);low=MIN(n_diag,(l1+l2-1)+window);
617 for ( a=up; a<=low; a++) slopes[a]=1;
620 /*flag MAIN DIAG SEQ1*/
621 up=MAX(1,l1-window);low=MIN(n_diag,l1+window);
622 for ( a=up; a<=low; a++) slopes[a]=1;
624 /*flag MAIN DIAG SEQ2*/
625 up=MAX(1,l2-window);low=MIN(n_diag,l2+window);
626 for ( a=up; a<=low; a++) slopes[a]=1;
629 for (a=0; a<= (l1+l2-1); a++)
630 if ( slopes[a]){diag_list[++diag_list[0]]=a;}
639 int cfasta_gotoh_pair_wise (Alignment *A,int*ns, int **l_s,Constraint_list *CL)
641 /*TREATMENT OF THE TERMINAL GAP PENALTIES*/
642 /*TG_MODE=0---> gop and gep*/
643 /*TG_MODE=1---> --- gep*/
644 /*TG_MODE=2---> --- ---*/
649 /*VARIABLES FOR THE MULTIPLE SEQUENCE ALIGNMENT*/
655 static char **group_list;
656 int score, new_score;
657 int n_chosen_diag=20;
659 int max_n_chosen_diag;
661 /********Prepare Penalties******/
664 maximise=CL->maximise;
667 /********************************/
675 group_list=make_group_aa (&n_groups, CL->matrix_for_aa_group);
678 l1=strlen (A->seq_al[l_s[0][0]]);
679 l2=strlen (A->seq_al[l_s[1][0]]);
681 if ( !CL->fasta_step)
684 step=(int) log ((double)MAX(step, 1));
693 tot_diag=evaluate_diagonals ( A, ns, l_s, CL, maximise,n_groups,group_list, ktup);
696 max_n_chosen_diag=strlen (A->seq_al[l_s[0][0]])+strlen (A->seq_al[l_s[1][0]])-1;
697 /*max_n_chosen_diag=(int)log10((double)(l1+l2))*10;*/
700 n_chosen_diag=MIN(n_chosen_diag, max_n_chosen_diag);
703 diag=extract_N_diag (strlen (A->seq_al[l_s[0][0]]),strlen (A->seq_al[l_s[1][0]]), tot_diag, n_chosen_diag, 0);
706 score =make_fasta_gotoh_pair_wise ( A, ns, l_s, CL, diag);
712 while (new_score!=score && n_chosen_diag< max_n_chosen_diag )
718 ungap_sub_aln ( A, ns[0], l_s[0]);
719 ungap_sub_aln ( A, ns[1], l_s[1]);
723 n_chosen_diag=MIN(n_chosen_diag, max_n_chosen_diag);
726 diag =extract_N_diag (strlen (A->seq_al[l_s[0][0]]),strlen (A->seq_al[l_s[1][0]]), tot_diag, n_chosen_diag, 0);
727 new_score=make_fasta_gotoh_pair_wise ( A, ns, l_s, CL, diag);
734 free_int (tot_diag, -1);
739 int fasta_gotoh_pair_wise (Alignment *A,int*ns, int **l_s,Constraint_list *CL)
741 /*TREATMENT OF THE TERMINAL GAP PENALTIES*/
742 /*TG_MODE=0---> gop and gep*/
743 /*TG_MODE=1---> --- gep*/
744 /*TG_MODE=2---> --- ---*/
749 /*VARIABLES FOR THE MULTIPLE SEQUENCE ALIGNMENT*/
753 float diagonal_threshold;
755 static char **group_list;
757 /********Prepare Penalties******/
760 maximise=CL->maximise;
762 diagonal_threshold=CL->diagonal_threshold;
763 /********************************/
769 group_list=make_group_aa (&n_groups, CL->matrix_for_aa_group);
773 tot_diag=evaluate_diagonals ( A, ns, l_s, CL, maximise,n_groups,group_list, ktup);
775 if ( !CL->fasta_step)
777 diag=flag_diagonals (strlen(A->seq_al[l_s[0][0]]),strlen(A->seq_al[l_s[1][0]]), tot_diag,diagonal_threshold,0);
783 diag=extract_N_diag (strlen (A->seq_al[l_s[0][0]]),strlen (A->seq_al[l_s[1][0]]), tot_diag,CL->fasta_step,0);
786 score=make_fasta_gotoh_pair_wise ( A, ns, l_s, CL, diag);
788 free_int (tot_diag, -1);
792 int very_fast_gotoh_pair_wise (Alignment *A,int*ns, int **l_s,Constraint_list *CL)
794 /*TREATMENT OF THE TERMINAL GAP PENALTIES*/
795 /*TG_MODE=0---> gop and gep*/
796 /*TG_MODE=1---> --- gep*/
797 /*TG_MODE=2---> --- ---*/
801 /*VARIABLES FOR THE MULTIPLE SEQUENCE ALIGNMENT*/
806 static char **group_list;
808 /********Prepare Penalties******/
811 maximise=CL->maximise;
813 /********************************/
819 group_list=make_group_aa (&n_groups, CL->matrix_for_aa_group);
823 tot_diag=evaluate_diagonals ( A, ns, l_s, CL, maximise,n_groups,group_list, ktup);
825 /*Note: 20 diagonals. 5 shadows on each side: tunned on Hom39, 2/2/04 */
826 diag=extract_N_diag (strlen (A->seq_al[l_s[0][0]]),strlen (A->seq_al[l_s[1][0]]), tot_diag,20,5);
827 score=make_fasta_gotoh_pair_wise ( A, ns, l_s, CL, diag);
828 free_int (tot_diag, -1);
832 int make_fasta_gotoh_pair_wise (Alignment *A,int*ns, int **l_s,Constraint_list *CL, int *diag)
834 /*TREATMENT OF THE TERMINAL GAP PENALTIES*/
835 /*TG_MODE=0---> gop and gep*/
836 /*TG_MODE=1---> --- gep*/
837 /*TG_MODE=2---> --- ---*/
840 int TG_MODE, gop, l_gop, gep,l_gep, maximise;
842 /*VARIABLES FOR THE MULTIPLE SEQUENCE ALIGNMENT*/
844 int l1, l2,eg, ch, sub,score=0, last_i=0, last_j=0, i, delta_i, j, pos_j, ala, alb, LEN, n_diag, match1, match2;
847 int **C, **D, **I, **trace, **pos0, **LD;
849 char *buffer, *char_buf;
852 /********Prepare Penalties******/
856 maximise=CL->maximise;
859 /********************************/
866 l1=lenal[0]=strlen (A->seq_al[l_s[0][0]]);
867 l2=lenal[1]=strlen (A->seq_al[l_s[1][0]]);
869 if ( getenv ("DEBUG_TCOFFEE"))fprintf ( stderr, "\n\tNdiag=%d%% ", (diag[0]*100)/(l1+l2));
872 diag[1..n_diag]--> flaged diagonal in order;
873 diag[0]=0--> first diagonal;
874 diag[n_diag+1]=l1+l2-1;
877 /*numeration of the diagonals strats from the bottom right [1...l1+l2-1]*/
878 /*sequence s1 is vertical and seq s2 is horizontal*/
879 /*D contains the best Deletion in S2==>comes from diagonal N+1*/
880 /*I contains the best insertion in S2=> comes from diagonal N-1*/
886 C=declare_int (lenal[0]+lenal[1]+1, n_diag+2);
887 D=declare_int (lenal[0]+lenal[1]+1, n_diag+2);
888 LD=declare_int (lenal[0]+lenal[1]+1, n_diag+2);
889 I=declare_int (lenal[0]+lenal[1]+1, n_diag+2);
890 trace=declare_int (lenal[0]+lenal[1]+1, n_diag+2);
893 al=declare_char (2,lenal[0]+lenal[1]+lenal[1]+1);
895 len= MAX(lenal[0],lenal[1])+1;
896 buffer=vcalloc ( 2*len, sizeof (char));
897 char_buf= vcalloc (2*len, sizeof (char));
899 pos0=aln2pos_simple ( A,-1, ns, l_s);
902 t=(TG_MODE==0)?gop:0;
903 for ( j=1; j<= n_diag; j++)
905 l_gop=(TG_MODE==0)?gop:0;
906 l_gep=(TG_MODE==2)?0:gep;
910 if ( (diag[j]-lenal[0])<0 )
912 trace[0][j]=UNDEFINED;
915 C[0][j]=(diag[j]-lenal[0])*l_gep +l_gop;
916 D[0][j]=(diag[j]-lenal[0])*l_gep +l_gop+gop;
918 D[0][j]=D[0][j-1]+gep;
921 t=(TG_MODE==0)?gop:0;
922 for ( i=1; i<=lenal[0]; i++)
924 l_gop=(TG_MODE==0)?gop:0;
925 l_gep=(TG_MODE==2)?0:gep;
927 C[i][0]=C[i][n_diag+1]=t=t+l_gep;
928 I[i][0]=D[i][n_diag+1]=t+ gop;
930 for ( j=1; j<=n_diag; j++)
933 D[i][j]=I[i][j]=I[i][0];
936 for (eg=0, j=1; j<=n_diag; j++)
939 pos_j=diag[j]-lenal[0]+i;
940 if (pos_j<=0 || pos_j>l2 )
942 trace[i][j]=UNDEFINED;
945 sub=(CL->get_dp_cost) ( A, pos0, ns[0], l_s[0], i-1, pos0, ns[1], l_s[1],pos_j-1, CL );
947 /*1 identify the best insertion in S2:*/
948 l_gop=(i==lenal[0])?((TG_MODE==0)?gop:0):gop;
949 l_gep=(i==lenal[0])?((TG_MODE==2)?0:gep):gep;
950 len=(j==1)?0:(diag[j]-diag[j-1]);
951 if ( a_better_than_b(I[i][j-1], C[i][j-1]+l_gop, maximise))eg++;
953 I[i][j]=best_of_a_b (I[i][j-1], C[i][j-1]+l_gop, maximise)+len*l_gep;
955 /*2 Identify the best deletion in S2*/
956 l_gop=(pos_j==lenal[1])?((TG_MODE==0)?gop:0):gop;
957 l_gep=(pos_j==lenal[1])?((TG_MODE==2)?0:gep):gep;
959 len=(j==n_diag)?0:(diag[j+1]-diag[j]);
960 delta_i=((i-len)>0)?(i-len):0;
962 if ( a_better_than_b(D[delta_i][j+1],C[delta_i][j+1]+l_gop, maximise)){LD[i][j]=LD[delta_i][j+1]+1;}
964 D[i][j]=best_of_a_b (D[delta_i][j+1],C[delta_i][j+1]+l_gop, maximise)+len*l_gep;
967 /*Identify the best way*/
969 score=C[i][j]=best_int ( 3, maximise, &fop, I[i][j], C[i-1][j]+sub, D[i][j]);
971 if ( fop<0)trace[i][j]=fop*eg;
972 else if ( fop>0 ) {trace[i][j]=fop*LD[i][j];}
973 else if ( fop==0) trace[i][j]=0;
980 /*HERE ("%d %d %d", su, in, de);*/
981 if (su>=in && su>=de)
1025 while (!(match1==l1 && match2==l2))
1032 for ( a=0; a< len; a++)
1044 else if ( match2==l2)
1047 for ( a=0; a< len; a++)
1065 if ( match2==l2 || match1==l1);
1079 len=diag[j+k]-diag[j];
1080 for ( a=0; a<len; a++)
1082 if ( match1==l1)break;
1093 len=diag[j]-diag[j-k];
1094 for ( a=0; a<len; a++)
1096 if ( match2==l2)break;
1109 invert_list_char ( al[0], LEN);
1110 invert_list_char ( al[1], LEN);
1111 if ( A->declared_len<=LEN)A=realloc_aln2 ( A,A->max_n_seq, 2*LEN);
1114 for ( c=0; c< 2; c++)
1116 for ( a=0; a< ns[c]; a++)
1119 for ( b=0; b< LEN; b++)
1122 char_buf[b]=aln[l_s[c][a]][ch++];
1127 sprintf (aln[l_s[c][a]],"%s", char_buf);
1133 A->nseq=ns[0]+ns[1];
1135 free_int (pos0, -1);
1139 free_int (trace, -1);
1141 free_char ( al, -1);
1149 int hasch_seq(char *seq, int **hs, int **lu,int ktup,char *alp)
1153 int i,j,l,limit,code,flag;
1164 for ( i=0; i< alp_size; i++)
1166 alp_lu[(int)alp[i]]=i;
1172 limit = (int) pow((double)(alp_size+1),(double)ktup);
1173 hs[0]=vcalloc ( l+1,sizeof (int));
1174 lu[0]=vcalloc ( limit+1, sizeof(int));
1177 if ( l==0)myexit(EXIT_FAILURE);
1179 for (i=1;i<=ktup;i++)
1180 a[i] = (int) pow((double)(alp_size+1),(double)(i-1));
1183 for(i=1;i<=(l-ktup+1);++i)
1187 for(j=1;j<=ktup;++j)
1189 if (is_gap(seq[i+j-2])){flag=TRUE;break;}
1190 else residue=alp_lu[(int)seq[i+j-2]];
1197 if (lu[0][code])hs[0][i]=lu[0][code];
1205 /*********************************************************************/
1210 /*********************************************************************/
1212 /**************Hasch DAta Handling*******************************************************/
1214 struct Hasch_data * free_ktup_hasch_data (struct Hasch_data *d);
1215 struct Hasch_data * declare_ktup_hasch_data (struct Hasch_entry *e);
1216 struct Hasch_data * allocate_ktup_hasch_data (struct Hasch_data *e, int action);
1222 typedef struct Hasch_data Hasch_data;
1223 struct Hasch_data * free_ktup_hasch_data (struct Hasch_data *d)
1225 return allocate_ktup_hasch_data (d, FREE);
1227 struct Hasch_data * declare_ktup_hasch_data (struct Hasch_entry *e)
1229 e->data=allocate_ktup_hasch_data (NULL,DECLARE);
1233 struct Hasch_data * allocate_ktup_hasch_data (struct Hasch_data *e, int action)
1235 static struct Hasch_data **heap;
1236 static int heap_size, free_heap, a;
1240 fprintf ( stderr, "\nHeap size: %d, Free Heap: %d", heap_size, free_heap);
1243 else if ( action==DECLARE)
1248 heap_size+=free_heap;
1249 heap=vrealloc (heap,heap_size*sizeof (struct Hasch_entry *));
1250 for ( a=0; a<free_heap; a++)
1252 (heap[a])=vcalloc ( 1, sizeof ( struct Hasch_entry *));
1253 (heap[a])->list=vcalloc ( 10, sizeof (int));
1254 (heap[a])->list[0]=10;
1257 return heap[--free_heap];
1259 else if ( action==FREE)
1261 heap[free_heap++]=e;
1269 /**************Hasch DAta Handling*******************************************************/
1271 int precomputed_pair_wise (Alignment *A,int*ns, int **l_s,Constraint_list *CL)
1273 int l1, l2, a, b, c;
1274 int nid=0, npos=0, id;
1277 l1=strlen(A->seq_al[l_s[0][0]]);
1278 l2=strlen(A->seq_al[l_s[1][0]]);
1281 fprintf ( stderr, "\nERROR: improper use of the function precomputed pairwise:[FATAL:%s]", PROGRAM);
1286 A->score_aln=A->score=0;
1290 for (npos=0, nid=0, a=0; a< ns[0]; a++)
1294 for (b=0; b< ns[1]; b++)
1297 for ( c=0; c<l1; c++)
1299 r1=A->seq_al[s1][c];
1300 r2=A->seq_al[s2][c];
1301 if ( is_gap(r1) || is_gap(r2));
1310 id=(npos==0)?0:((nid*100)/npos);
1311 A->score=A->score_aln=id;
1314 int ktup_comparison_str ( char *seq1, char *seq2, const int ktup);
1315 int ktup_comparison_hasch ( char *i_seq1, char *i_seq2, const int ktup);
1316 int ktup_pair_wise (Alignment *A,int*ns, int **l_s,Constraint_list *CL)
1328 gl=make_group_aa (&ng, "vasiliky");
1331 if ( ns[0]>1)seq1=sub_aln2cons_seq_mat (A, ns[0], l_s[0],"blosum62mt");
1334 seq1=vcalloc ( strlen (A->seq_al[l_s[0][0]])+1, sizeof (char));
1335 sprintf ( seq1, "%s",A->seq_al[l_s[0][0]]);
1337 if ( ns[1]>1)seq2=sub_aln2cons_seq_mat (A, ns[1], l_s[1],"blosum62mt");
1340 seq2=vcalloc ( strlen (A->seq_al[l_s[1][0]])+1, sizeof (char));
1341 sprintf ( seq2, "%s",A->seq_al[l_s[1][0]]);
1344 if ( strlen (seq1)<min_len || strlen (seq2)<min_len)
1348 ungap(seq1); ungap(seq2);
1349 B=align_two_sequences ( seq1, seq2, "blosum62mt",-10, -1, "myers_miller_pair_wise");
1350 A->score=A->score_aln=aln2sim(B, "idmat");
1357 string_convert (seq1, ng, gl);
1358 string_convert (seq2, ng, gl);
1359 A->score=A->score_aln=ktup_comparison (seq1,seq2, CL->ktup);
1362 vfree (seq1); vfree (seq2);
1365 int ktup_comparison( char *seq2, char *seq1, const int ktup)
1367 return ktup_comparison_hasch ( seq2, seq1, ktup);
1369 int ktup_comparison_str ( char *seq2, char *seq1, const int ktup)
1371 int a,l1, l2,c1, c2, end, start;
1376 if ( max_dist==-1)max_dist=MAX((strlen (seq1)),(strlen (seq2)));
1377 l1=strlen (seq1)-ktup;
1381 for ( a=0; a< l1; a++)
1383 c1=seq1[a+ktup];seq1[a+ktup]='\0';
1386 start=((a-max_dist)<0)?0:a-max_dist;
1387 end=((a+max_dist)>=l2)?l2:a+max_dist;
1389 c2=seq2[end];seq2[end]='\0';
1392 score+=(strstr(s2, s1)!=NULL)?1:0;
1397 score/=(l1==0)?1:l1;
1398 score=((log(0.1+score)-log(0.1))/(log(1.1)-log(0.1)));
1403 int ktup_comparison_hasch ( char *i_seq1, char *i_seq2, const int ktup)
1405 /*Ktup comparison adapted from Rob Edgar, NAR, vol32, No1, 381, 2004*/
1406 /*1: hasch sequence 1
1407 2: Count the number of seq2 ktup found in seq1
1418 int p, a, max_dist=-1;
1423 if (!strm (i_seq1, pseq))
1427 hdestroy (H1, declare_ktup_hasch_data, free_ktup_hasch_data);
1428 string2key (NULL, NULL);
1430 H1=hasch_sequence ( i_seq1, ktup);
1431 vfree (pseq);pseq=vcalloc ( strlen (i_seq1)+1, sizeof (char));
1432 sprintf ( pseq, "%s", i_seq1);
1435 ls=l=strlen (i_seq2);
1440 c=s[ktup];s[ktup]='\0';
1441 key=string2key (s, NULL);
1442 e=hsearch (H1,key,FIND, declare_ktup_hasch_data, free_ktup_hasch_data);
1445 else if ( max_dist==-1)score++;
1448 for ( a=1; a<=(e->data)->list[1]; a++)
1449 if (FABS((p-(e->data)->list[a]))<=max_dist)
1452 s[ktup]=c;s++;p++;ls--;
1455 score=(log(0.1+score)-log(0.1))/(log(1.1)-log(0.1));
1457 if ( score>100) score=100;
1458 return (int)(score*100);
1461 HaschT* hasch_sequence ( char *seq1, int ktup)
1464 int key, offset=0, ls;
1468 H=hcreate ( strlen (seq1), declare_ktup_hasch_data, free_ktup_hasch_data);
1472 c=seq1[ktup];seq1[ktup]='\0';
1473 key=string2key (seq1, NULL);
1474 e=hsearch (H,key,FIND, declare_ktup_hasch_data, free_ktup_hasch_data);
1478 e=hsearch (H,key,ADD,declare_ktup_hasch_data,free_ktup_hasch_data);
1479 (e->data)->list[++(e->data)->list[1]+1]=offset;
1483 if ((e->data)->list[0]==((e->data)->list[1]+2)){(e->data)->list[0]+=10;(e->data)->list=vrealloc ((e->data)->list,(e->data)->list[0]*sizeof (int));}
1484 (e->data)->list[++(e->data)->list[1]+1]=offset;
1486 seq1[ktup]=c;seq1++;ls--;
1494 char *dayhoff_translate (char *seq1)
1498 for ( a=0; a< l; a++)
1501 if ( strchr ("agpst", c))seq1[a]='a';
1502 else if (strchr ("denq", c))seq1[a]='d';
1503 else if (strchr ("fwy", c))seq1[a]='f';
1504 else if (strchr ("hkr", c))seq1[a]='h';
1505 else if (strchr ("ilmv", c))seq1[a]='i';
1510 int ** evaluate_diagonals_with_ktup ( Alignment *A, int *ns, int **l_s, Constraint_list *CL,int maximise,int n_groups, char **group_list, int ktup)
1512 /*Ktup comparison as in Rob Edgar, NAR, vol32, No1, 381, 2004*/
1516 Hasch_entry *e1, *e2;
1517 char *s, *sb, *seq1, *seq2;
1520 int **diag,n_diag, ktup1, ktup2,a,b,c,d, **pos;
1523 pos=aln2pos_simple ( A,-1, ns, l_s);
1525 seq1=aln2cons_maj (A, ns[0], l_s[0], n_groups, group_list);
1526 seq2=aln2cons_maj (A, ns[1], l_s[1], n_groups, group_list);
1532 diag=declare_int (n_diag+2, 3);
1533 for ( a=0; a<n_diag+2; a++)diag[a][0]=a;
1535 H1=hasch_sequence ( seq1, ktup);
1536 H2=hasch_sequence ( seq2, ktup);
1537 s=sb=vcalloc (strlen (seq1)+strlen (seq2)+1, sizeof (char));
1538 sprintf (s, "%s%s", seq1, seq2);
1543 character=s[ktup];s[ktup]='\0';
1544 key=string2key (s, NULL);
1545 e1=hsearch (H1,key,FIND,declare_ktup_hasch_data, free_ktup_hasch_data);
1546 e2=hsearch (H2,key,FIND,declare_ktup_hasch_data, free_ktup_hasch_data);
1551 for (b=2; b<(e1->data)->list[1]+2; b++)
1552 for (c=2; c<(e2->data)->list[1]+2; c++)
1555 ktup1=(e1->data)->list[b];
1556 ktup2=(e2->data)->list[c];
1557 diag[(ktup2-ktup1)+l1][2]++;
1558 for (score=0, d=0; d<ktup; d++)
1559 score+=(CL->get_dp_cost) ( A, pos, ns[0], l_s[0], ktup1+d, pos,ns[1], l_s[1], ktup2+d, CL);
1560 diag[(ktup2-ktup1)+l1][1]+=score;
1563 (e1->data)->list[1]=(e2->data)->list[1]=0;
1565 s[ktup]=character;s++;ls--;
1568 sort_int (diag+1, 2, 1,0,n_diag-1);
1570 hdestroy (H1,declare_ktup_hasch_data, free_ktup_hasch_data); hdestroy (H2,declare_ktup_hasch_data, free_ktup_hasch_data);
1571 vfree (seq1); vfree (seq2);vfree (sb);free_int (pos, -1);
1574 /*********************************************************************/
1579 /*********************************************************************/
1580 int ** evaluate_diagonals_with_ktup_1 ( Alignment *A, int *ns, int **l_s, Constraint_list *CL,int maximise,int n_groups, char **group_list, int ktup)
1583 Reads in an alignmnent A, with two groups of sequences marked.
1584 1-Turn each group into a conscensus, using the group list identifier.
1585 -if the group list is left empty original symbols are used
1586 2-hasch the two sequences
1587 3-score each diagonal, sort the list and return it (diag_list)
1593 char *seq1, *seq2, *alphabet=NULL;
1594 int a,b,l1, l2, n_ktup,pos_ktup1, pos_ktup2, **pos;
1595 int *hasched_seq1, *hasched_seq2,*lu_seq1,*lu_seq2;
1596 int n_diag, **diag, current_diag, n_dots;
1598 pos=aln2pos_simple ( A,-1, ns, l_s);
1601 seq1=aln2cons_seq (A, ns[0], l_s[0], n_groups, group_list);
1602 seq2=aln2cons_seq (A, ns[1], l_s[1], n_groups, group_list);
1607 alphabet=get_alphabet (seq1,alphabet);
1608 alphabet=get_alphabet (seq2,alphabet);
1614 diag=declare_int ( n_diag+2, 3);
1615 n_ktup=(int)pow ( (double)alphabet[0]+1, (double)ktup);
1618 hasch_seq(seq1, &hasched_seq1, &lu_seq1,ktup, alphabet);
1619 hasch_seq(seq2, &hasched_seq2, &lu_seq2,ktup, alphabet);
1624 /*EVALUATE THE DIAGONALS*/
1625 for ( a=0; a<= n_diag; a++)diag[a][0]=a;
1626 for ( n_dots=0,a=1; a<= n_ktup; a++)
1628 pos_ktup1=lu_seq1[a];
1631 if (!pos_ktup1)break;
1632 pos_ktup2=lu_seq2[a];
1635 current_diag=(pos_ktup2-pos_ktup1+l1);
1636 for ( b=0; b< ktup; b++)
1638 diag[current_diag][1]+=(CL->get_dp_cost) ( A, pos, ns[0], l_s[0], pos_ktup1+b-1, pos,ns[1], l_s[1], pos_ktup2+b-1, CL);
1642 diag[current_diag][2]++;
1643 pos_ktup2=hasched_seq2[pos_ktup2];
1645 pos_ktup1=hasched_seq1[pos_ktup1];
1653 buf=vcalloc ( 30, sizeof (30));
1654 sprintf ( buf, "abcdefghijklmnopqrstuvwxyz");
1656 vfree ( hasched_seq1);
1657 vfree ( hasched_seq2);
1660 return evaluate_diagonals_with_ktup ( A,ns,l_s, CL,maximise,1,&buf,1);
1664 sort_int (diag+1, 2, 1,0, n_diag-1);
1668 vfree ( hasched_seq1);
1669 vfree ( hasched_seq2);
1675 /////////////////////////////////////////////////////////////////
1677 Constraint_list * hasch2constraint_list (Sequence*S, Constraint_list *CL)
1685 entry=vcalloc ( CL->entry_len, sizeof (int));
1687 for (a=0; a<S->nseq; a++)
1689 H=seq2hasch (a, S->seq[a],ktup,H);
1697 for (a=0; a<h->n-2; a+=2)
1699 for (b=a+2; b<h->n; b+=2)
1702 if (h->l[a]==h->l[b])continue;
1705 for (c=0; c<ktup; c++)
1707 entry[SEQ1]=h->l[a];
1708 entry[SEQ2]=h->l[b];
1709 entry[R1]=h->l[a+1]+c;
1710 entry[R2]=h->l[b+1]+c;
1712 add_entry2list (entry,CL);
1722 SeqHasch *cleanhasch (SeqHasch *H)
1726 N=vcalloc (2, sizeof (SeqHasch));
1739 int hasch2sim (SeqHasch *H, int nseq)
1748 for (ps=-1,ns=0,a=0; a<(H[n])->n; a+=2)
1750 //HERE ("%d--[%d %d]",n, (H[n])->l[a], (H[n])->l[a+1]);
1760 return (id*MAXID)/tot;
1762 SeqHasch * seq2hasch (int i,char *seq, int ktup, SeqHasch *H)
1770 H=vcalloc (2, sizeof (SeqHasch));
1771 H[0]=vcalloc (1, sizeof (hseq));
1781 for (a=0; a<l-ktup; a++)
1784 for (b=a; b<a+ktup; b++)
1788 if (!h->hl[r]) h->hl[r]=vcalloc (1, sizeof (hseq));
1795 h->l=vcalloc (2, sizeof (int));
1796 H=vrealloc (H,(n+2)*sizeof (SeqHasch));
1803 h->l=vrealloc (h->l, (h->n)*sizeof (int));
1812 /*********************************COPYRIGHT NOTICE**********************************/
1813 /*© Centro de Regulacio Genomica */
1815 /*Cedric Notredame */
1816 /*Tue Oct 27 10:12:26 WEST 2009. */
1817 /*All rights reserved.*/
1818 /*This file is part of T-COFFEE.*/
1820 /* T-COFFEE is free software; you can redistribute it and/or modify*/
1821 /* it under the terms of the GNU General Public License as published by*/
1822 /* the Free Software Foundation; either version 2 of the License, or*/
1823 /* (at your option) any later version.*/
1825 /* T-COFFEE is distributed in the hope that it will be useful,*/
1826 /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
1827 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
1828 /* GNU General Public License for more details.*/
1830 /* You should have received a copy of the GNU General Public License*/
1831 /* along with Foobar; if not, write to the Free Software*/
1832 /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
1833 /*............................................... |*/
1834 /* If you need some more information*/
1835 /* cedric.notredame@europe.com*/
1836 /*............................................... |*/
1840 /*********************************COPYRIGHT NOTICE**********************************/