7 #include "io_lib_header.h"
8 #include "util_lib_header.h"
9 #include "dp_lib_header.h"
10 #include "define_header.h"
11 /************************************************************************************/
12 /* NEW ANALYZE 2 : SAR */
13 /************************************************************************************/
14 float display_prediction_old (int **prediction, int n, Alignment *A, Alignment *S, int field);
16 float display_prediction (int ***count, Alignment *S, int c, int n);
17 Alignment * filter_aln4sar0 ( Alignment *A, Alignment *S, int c, int leave, char *mode);
18 Alignment * filter_aln4sar1 ( Alignment *A, Alignment *S, int c, int leave, char *mode);
19 Alignment * filter_aln4sar2 ( Alignment *A, Alignment *S, int c, int leave, char *mode);
20 Alignment * filter_aln4sar3 ( Alignment *A, Alignment *S, int c, int leave, char *mode);
21 Alignment * filter_aln4sar4 ( Alignment *A, Alignment *S, int c, int leave, char *mode);
22 Alignment * filter_aln4sar5 ( Alignment *A, Alignment *S, int c, int leave, char *mode);
24 int **sar2profile ( Alignment *A, Alignment *S, int c, int leave);
25 int **sar2profile_sim ( Alignment *A, Alignment *S, int **sim, int comp, int leave);
26 int sar_profile2score ( char *seq, int **profile);
27 double sar_vs_iseq1( char *sar, int *seq, float gl, int **sim, char *best_aa);
28 double sar_vs_seq1 ( char *sar, char *seq, float gl, int **sim, char *best_aa);
29 double sar_vs_seq2 ( char *sar, char *seq, float ng, int **mat, char *a);
30 double sar_vs_seq3 ( char *sar, char *seq, float ng, int **mat, char *a);
32 double sar_vs_iseq4 ( char *sar, int *seq, float ng, int **mat, char *a);//supports an extended alphabet
33 double sar_vs_seq4 ( char *sar, char *seq, float ng, int **mat, char *a);
35 double sar_vs_seq5 ( char *sar, char *seq, float ng, int **mat, char *a);
36 int make_sim_pred ( Alignment *A,Alignment *S, int comp, int seq);
38 int **sar2profile_sim ( Alignment *A, Alignment *S, int **sim, int comp, int leave)
41 int a, b, r, c, c1, c2, r1, r2, s, p;
42 int ***cache, **profile;
45 profile=declare_int (A->len_aln, 26);
46 cache=declare_arrayN (3,sizeof (int),2,A->len_aln, 26);
48 for ( a=0; a< A->len_aln; a++)
49 for ( b=0; b< A->nseq; b++)
51 r=tolower(A->seq_al[b][a]);
52 c=( S->seq_al[comp][b]=='I')?1:0;
53 if (b==leave || is_gap(r)) continue;
54 cache [c][a][r-'a']++;
56 for (a=0; a< A->nseq; a++)
58 if ( a==leave) continue;
59 for ( b=0; b< A->nseq; b++)
61 c1=(S->seq_al[comp][a]=='I')?1:0;
62 c2=(S->seq_al[comp][b]=='I')?1:0;
63 if ( b==leave || b==a || c1!=1 || c1==c2) continue;
66 for (p=0; p<A->len_aln; p++)
68 r1=tolower(A->seq_al[a][p]);
69 r2=tolower(A->seq_al[b][p]);
70 if ( is_gap(r1) || is_gap(r2) || r1==r2)continue;
72 if (cache[1][p][r2])continue;
79 free_arrayN((void***)cache,3);
83 int **sar2profile ( Alignment *A, Alignment *S, int comp, int leave)
86 int a, b,c,r, n, v, npos=0;
87 int ***cache, **profile;
90 profile=declare_int (A->len_aln, 26);
91 cache=declare_arrayN (3,sizeof (int),2,A->len_aln, 26);
95 for ( n=0, a=0; a< A->nseq; a++)
97 if ( a==leave) continue;
98 else n+=(S->seq_al[comp][a]=='I')?1:0;
101 for ( a=0; a< A->len_aln; a++)
102 for ( b=0; b< A->nseq; b++)
104 r=tolower(A->seq_al[b][a]);
105 c=( S->seq_al[comp][b]=='I')?1:0;
106 if (b==leave) continue;
107 else if (is_gap(r))continue;
112 ncat=15; /*ncat: limit the analysis to columns containing less than ncat categories of aa*/
114 for (a=0; a< A->len_aln; a++)
116 for (n_gap=0,b=0; b< A->nseq; b++)
117 n_gap+=(is_gap(A->seq_al[b][a]));
118 n_gap/=(float)A->nseq;
120 if ( n_gap> max_gap)continue;
122 for (v=0,r=0; r< 26; r++)
124 if (cache [0][a][r] || cache[1][a][r])v++;
127 for (n=0,r=0; r< 26 && v<ncat; r++)
129 if (cache [0][a][r] && !cache[1][a][r])
132 profile[a][r]=-cache[0][a][r];
138 free_arrayN((void***)cache,3);
142 Alignment * filter_aln4sar0 ( Alignment *A, Alignment *S, int comp, int leave, char *mode)
144 return copy_aln (A,NULL);
146 Alignment * filter_aln4sar1 ( Alignment *inA, Alignment *S, int comp, int leave, char *mode)
149 int a, b,c, i,r, n0, n1,g, score;
150 int ***cache, **list1, **list2;
155 /*Keep only the positions where there are residues ONLY associated with 0 sequences*/
157 list1=declare_int ( inA->nseq, 2);
158 list2=declare_int ( inA->len_aln, 2);
160 cache=declare_arrayN (3,sizeof (int),inA->len_aln,2, 26);
161 F=copy_aln (inA, NULL);
163 A=copy_aln (inA, NULL);
164 A->nseq=strlen (S->seq_al[comp]);
166 strget_param (mode, "_T1_", "5", "%d", &T1);
167 for ( a=0; a< A->len_aln; a++)
170 for (b=0; b< A->nseq; b++)
172 if ( b==leave) continue;
173 i=(S->seq_al[comp][b]=='I')?1:0;
174 r=tolower(A->seq_al[b][a]);
175 if ( r=='-')continue;
176 cache[a][i][r-'a']++;
180 for (a=0; a< A->nseq; a++)
181 for ( score=0,b=0; b<A->len_aln; b++)
183 r=tolower (A->seq_al[a][b]);
184 if ( is_gap(r))continue;
185 else if ( cache[b][0][r-'a'] && !cache[b][1][r-'a'])list1[a][0]++;
188 for (a=0; a< A->len_aln; a++)
190 for ( score=0,b=0; b< A->nseq; b++)
192 r=tolower (A->seq_al[b][a]);
193 if ( r=='-')continue;
195 if ( cache[a][0][r] && !cache[a][1][r])score ++;
200 sort_int (list2, 2, 1, 0, F->len_aln-1);
202 Delta=A->len_aln/(100/T1);
203 for ( a=0; a< F->len_aln-Delta; a++)
206 for ( c=0; c<F->nseq; c++)
214 free_arrayN ( (void ***)cache, 3);
215 free_arrayN ((void**)list1, 2);
216 free_arrayN ((void**)list2, 2);
220 Alignment * filter_aln4sar2 ( Alignment *inA, Alignment *S, int comp, int leave, char *mode)
227 /*Keep Low entropy columns that contain less than ncat categories of different amino acids*/
228 /*REmove columns containing 10% or more gaps*/
230 cache=vcalloc ( 500, sizeof (char));
231 F=copy_aln (inA, NULL);
232 A=copy_aln (inA, NULL);
233 A->nseq=strlen (S->seq_al[comp]);
234 for ( a=0; a< A->len_aln; a++)
236 for (ncat=0,b=0; b< A->nseq; b++)
238 if ( b==leave) continue;
240 r=tolower(A->seq_al[b][a]);
241 if ( !cache[r])ncat++;
245 if ( ncat <max_ncat && ((cache['-']*100)/A->nseq)<10)
251 for (b=0; b<F->nseq; b++)
253 r=tolower(F->seq_al[b][a]);
258 for (b=0; b<A->nseq; b++)
260 r=tolower(A->seq_al[b][a]);
271 Alignment * filter_aln4sar3 ( Alignment *inA, Alignment *S, int comp, int leave, char *mode)
273 Alignment *F, *rA, *A;
280 /*Keep the 10% positions most correlated with the 0/1 pattern*/
282 A=copy_aln (inA, NULL);
283 A->nseq=strlen (S->seq_al[comp]);
284 F=copy_aln (inA, NULL);
285 rA=rotate_aln (A, NULL);
287 strget_param (mode, "_T3_", "10", "%d", &T3);
290 list1=declare_int ( inA->len_aln, 2);
291 bufA=vcalloc ( A->nseq+1, sizeof (char));
292 bufS=vcalloc ( A->nseq+1, sizeof (char));
294 sprintf ( bufS, "%s", S->seq_al[comp]);
295 splice_out_seg(bufS,leave, 1);
298 for (a=0; a< A->len_aln; a++)
302 sprintf (bufA, "%s", rA->seq_al[a]);
303 splice_out_seg (bufA,leave,1);
304 list1[a][1]=(int)sar_vs_seq3 ( bufS, bufA,0,NULL, &aa);
307 sort_int (list1, 2, 1, 0, F->len_aln-1);
308 Delta=F->len_aln/(100/T3);
309 for ( a=0; a< F->len_aln-Delta; a++)
313 for ( c=0; c<F->nseq; c++)
319 F->score_aln=list1[F->len_aln-1][1];
324 free_arrayN ((void**)list1, 2);
325 vfree (bufS);vfree (bufA);
328 Alignment * filter_aln4sar4 ( Alignment *inA, Alignment *S, int comp, int leave, char *mode)
331 int a, b,c, i,r, n0, n1,g,score;
332 int ***cache, **list1, **list2;
334 /*Keep only the positions where there are residues ONLY associated with 0 sequences*/
336 list1=declare_int ( inA->nseq, 2);
337 list2=declare_int ( inA->len_aln, 2);
339 cache=declare_arrayN (3,sizeof (int),inA->len_aln,2, 26);
340 F=copy_aln (inA, NULL);
341 A=copy_aln (inA, NULL);
342 A->nseq=strlen (S->seq_al[comp]);
344 for ( a=0; a< A->len_aln; a++)
347 for (b=0; b< A->nseq; b++)
349 if ( b==leave) continue;
350 i=(S->seq_al[comp][b]=='I')?1:0;
351 r=tolower(A->seq_al[b][a]);
352 if ( r=='-')continue;
353 cache[a][i][r-'a']++;
359 for (a=0; a< A->len_aln; a++)
361 for ( score=0,b=0; b< A->nseq; b++)
363 r=tolower (F->seq_al[b][a]);
364 if ( r=='-')continue;
366 if (cache[a][1][r]>=n1/2)score=1;
373 for ( a=0; a< F->len_aln; a++)
375 if ( list2[a][1]==1);
379 for ( c=0; c<F->nseq; c++)
387 free_arrayN ( (void ***)cache, 3);
388 free_arrayN ((void**)list1, 2);
389 free_arrayN ((void**)list2, 2);
394 Alignment * filter_aln4sar5 ( Alignment *inA, Alignment *S, int comp, int leave, char *mode)
396 Alignment *F, *rA, *A;
401 /*Look for the positions that show the best correlation between the sequence variation and the SAR*/
403 A=copy_aln (inA, NULL);
404 A->nseq=strlen (S->seq_al[comp]);
406 rA=rotate_aln (inA, NULL);
407 F=copy_aln (inA, NULL);
409 list1=declare_int ( A->len_aln, 2);
410 bufA=vcalloc ( A->nseq+1, sizeof (char));
411 bufS=vcalloc ( A->nseq+1, sizeof (char));
415 sprintf ( bufS, "%s", S->seq_al[comp]);
416 splice_out_seg(bufS,leave, 1);
419 for (a=0; a< A->len_aln; a++)
423 sprintf (bufA, "%s", rA->seq_al[a]);
424 splice_out_seg (bufA,leave,1);
425 list1[a][1]=(int)sar_vs_seq4 ( bufS, bufA,0,NULL, &aa);
428 sort_int (list1, 2, 1, 0, F->len_aln-1);
429 max=F->score=list1[F->len_aln-1][1];
433 for ( a=0; a< F->len_aln-10; a++)
438 for ( c=0; c<F->nseq; c++)
448 free_arrayN ((void**)list1, 2);
449 vfree (bufS);vfree (bufA);
453 int sar_profile2score ( char *seq, int **P)
458 for ( score=0,a=0; a< l; a++)
461 if ( is_gap(r))continue;
462 score+=P[a][tolower(r)-'a'];
466 int make_sim_pred ( Alignment *A,Alignment *S, int comp, int seq)
469 static float **cscore;
470 static float **tscore;
474 cscore=declare_float (2, 2);
475 tscore=declare_float (2, 2);
478 for (a=0; a< 2; a++)for (b=0; b<2; b++)cscore[a][b]=tscore[a][b]=0;
480 for ( a=0; a<A->len_aln; a++)
482 r1=A->seq_al[seq][a];
483 if ( r1=='-') continue;
486 for ( b=0; b< A->nseq; b++)
488 if (b==seq) continue;
492 if (r2=='-')continue;
496 i=(S->seq_al[comp][b]=='I')?1:0;
497 cscore[i][0]+=(r1==r2)?1:0;
505 cscore[i][0]/=(cscore[i][1]==0)?1:cscore[i][1];
506 tscore[i][0]+=cscore[i][0];tscore[i][1]++;
507 cscore[i][0]=cscore[i][1]=0;
512 fprintf ( stdout, "\nn\t 1: %.2f 0: %.2f", tscore[1][0],tscore[0][0]);
513 return ( tscore[1][0]>=tscore[0][0])?1:0;
517 Alignment * sar_analyze (Alignment *inA, Alignment *inS, char *mode)
519 int ***sim,***glob_results, ***comp_results;
523 Alignment *A=NULL,*S=NULL,*F, *SUBSET;
524 char *subset, *target;
528 char *prediction, *reliability;
529 int pred_start=0, pred_end, ref_start=0, ref_end;
530 int display, CSV=1, NONCSV=0;
533 strget_param (mode, "_METHOD_", "1111", "%s_", method);
534 ff=vcalloc (6,sizeof (filter_func));
535 if (method[0]=='1')ff[n_methods++]=filter_aln4sar0;
536 if (method[1]=='1')ff[n_methods++]=filter_aln4sar1;
537 if (method[2]=='1')ff[n_methods++]=filter_aln4sar2;
538 if (method[3]=='1')ff[n_methods++]=filter_aln4sar3;
540 ff[n_methods++]=filter_aln4sar4;
541 ff[n_methods++]=filter_aln4sar5;
543 sim=vcalloc (n_methods, sizeof (int**));
546 tot2=vcalloc ( 10, sizeof (float));
547 subset=vcalloc ( 100, sizeof (char));
548 target=vcalloc ( 100, sizeof (char));
550 strget_param (mode, "_TARGET_", "no", "%s_", target);
551 strget_param (mode, "_SUBSET_", "no", "%s_", subset);
552 strget_param (mode, "_JACK_", "0", "%d", &jack);
553 strget_param (mode, "_T_", "0", "%d", &T);
554 strget_param (mode, "_FILTER_", "11", "%d", &filter);
555 strget_param (mode, "_DISPLAY_", "0", "%d", &display);
559 if ( !strm (target, "no"))
562 T=main_read_aln(target, NULL);
563 if ( T->len_aln !=inA->len_aln )
565 printf_exit ( EXIT_FAILURE,stderr, "Error: %s is incompatible with the reference alignment [FATAL:%s]",target,PROGRAM);
568 inA=stack_aln (inA, T);
572 if ( !strm(subset, "no"))
574 SUBSET=main_read_aln (subset, NULL);
575 sarset2subsarset ( inA, inS, &A, &S, SUBSET);
584 prediction=vcalloc ( n_methods+1, sizeof (char));
585 reliability=vcalloc ( n_methods+1, sizeof (char));
587 glob_results=declare_arrayN(3, sizeof (int), n_methods*2, 2, 2);
589 count=vcalloc (S->nseq, sizeof (int));
590 for (a=0; a<S->nseq; a++)
593 l=strlen (S->seq_al[a]);
595 count[a]+=(S->seq_al[a][b]=='I')?1:0;
598 {fprintf ( stdout, "\nCompound %s ; Ntargets %d", S->name[a],count[a]);
599 pred_start=(strlen (S->seq_al[0])==A->nseq)?0:strlen (S->seq_al[0]);
601 for (a=pred_start; a< pred_end; a++)
602 fprintf ( stdout, ";%s", A->name[a]);
603 fprintf ( stdout, ";npred;");
607 for (a=0; a<S->nseq; a++)
610 comp_results=declare_arrayN(3, sizeof (int), n_methods*2, 2, 2);
612 pred_start=(strlen (S->seq_al[a])==A->nseq)?0:strlen (S->seq_al[a]);
614 if ( display==CSV)fprintf ( stdout, "\n%s;%d", S->name[a],count[a]);
616 for (n_pred=0,b=pred_start; b<pred_end;b++)
618 int t, score=0,pred, real;
620 if ( display==NONCSV)fprintf ( stdout, "\n>%-15s %10s %c ", S->name[a], A->name[b], (pred_start==0)?S->seq_al[a][b]:'?');
621 if (jack || b==pred_start)
623 for (m=0; m<n_methods; m++)
625 free_int (sim[m], -1);
626 F=(ff[m]) (A,S,a,(jack==0)?-1:b, mode);
627 sim[m]=aln2sim_mat(F, "idmat");
632 for (m=0; m<n_methods; m++)
634 int Nbsim=0,Ybsim=0,bsim=0;
636 ref_end=strlen (S->seq_al[m]);
638 for (c=ref_start;c<ref_end; c++)
641 else if ( S->seq_al[a][c]=='O')
643 Nbsim=MAX(Nbsim,sim[m][b][c]);
647 Ybsim=MAX(Ybsim,sim[m][b][c]);
651 bsim=(Ybsim>Nbsim)?Ybsim:-Nbsim;
653 real=(S->seq_al[a][b]=='O')?0:1;
654 comp_results[m][pred][real]++;
655 glob_results[m][pred][real]++;
657 prediction[m]=pred+'0';
658 reliability[m]=(FABS((Ybsim-Nbsim))-1)/10+'0';
661 if ( score>0)n_pred++;
662 prediction[m]=reliability[m]='\0';
663 if (display==NONCSV)fprintf ( stdout, "Compound_Count:%d primary_predictions: %s Total: %d", count[a],prediction, score);
664 else if ( display==CSV)fprintf ( stdout, ";%d", score);
665 for (t=0; t<n_methods; t++)
669 comp_results[t+n_methods][1][real]++;
670 glob_results[t+n_methods][1][real]++;
674 comp_results[t+n_methods][0][real]++;
675 glob_results[t+n_methods][0][real]++;
679 if ( display==NONCSV)
680 {if ( pred_start==0)display_prediction (comp_results, S,a, n_methods*2);}
681 else fprintf (stdout, ";%d;",n_pred);
683 if ( display==NONCSV)if (pred_start==0)display_prediction (glob_results, S,-1, n_methods*2);
688 float display_prediction (int ***count, Alignment *S, int c, int n)
690 float tp,tn,fn,fp,sp,sn,sn2;
705 if ( a<nm)fprintf ( stdout, "\n>#Method %d Compound %15s sp=%.2f sn=%.2f sn2=%.2f",a, (c==-1)?"TOTAL":S->name[c],sp, sn, sn2 );
706 else fprintf ( stdout, "\n>#Combined: T=%d Compound %15s sp=%.2f sn=%.2f sn2=%.2f",a-nm, (c==-1)?"TOTAL":S->name[c],sp, sn, sn2 );
708 fprintf ( stdout, "\n");
712 float display_prediction_2 (int **prediction, int n,Alignment *A, Alignment *S, int field)
715 float max_sn, max_sp;
717 if ( field==17 || field ==18)
719 printf_exit ( EXIT_FAILURE, stderr, "\nERROR: Do not use filed %d in display_prediction", field);
722 sort_int_inv ( prediction, 10,field, 0, n-1);
723 for (t=0,a=0; a<n; a++)
729 for (t=0,a=n-1; a>=0; a--)
738 float tp, fn, fp, sp, sn;
740 tp=prediction[a][17];
741 fn=prediction[a][18];
744 sp=((tp+fp)==0)?0:tp/(tp+fp);
745 sn=((tp+fn)==0)?0:tp/(tp+fn);
754 T=prediction[a][field];
759 fprintf (stdout, "\n T =%d SN=%.2f SP= %.2f",T,max_sn,max_sp);
761 fprintf (stdout, "\n T =%d SN=%.2f SP= %.2f",T,max_sn,max_sp);
767 /************************************************************************************/
768 /* NEW ANALYZE : SAR */
769 /************************************************************************************/
770 float** cache2pred1 (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode);
771 float** cache2pred2 (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode);
772 float** cache2pred3 (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode);
773 float** cache2pred4 (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode);
774 float** cache2pred5 (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode);
775 float** cache2pred_new (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode);
777 int **sar2cache_adriana ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode);
778 int **sar2cache_proba_old ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode);
779 int **sar2cache_count1 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode);
780 int **sar2cache_count2 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode);
781 int **sar2cache_count3 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode);
783 int **sar2cache_proba_new ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode);
784 int **sar2cache_proba2 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode);
785 int **analyze_sar_compound1 ( char *name, char *seq, Alignment *A, char *mode);
786 int **analyze_sar_compound2 ( char *name, char *seq, Alignment *A, char *mode);
788 int aln2n_comp_col ( Alignment *A, Alignment *S, int ci);
793 int ***simple_sar_analyze_vot ( Alignment *inA, Alignment *SAR, char *mode);
794 int ***simple_sar_analyze_col ( Alignment *inA, Alignment *SAR, char *mode);
798 int sarset2subsarset ( Alignment *A, Alignment *S, Alignment **subA, Alignment **subS, Alignment *SUB);
799 int benchmark_sar (int v);
800 int aln2jack_group1 (Alignment *A, int seq, int **l1, int *nl1, int **l2, int *nl2);
801 int aln2jack_group2 (Alignment *A, int seq, int **l1, int *nl1, int **l2, int *nl2);
802 int aln2jack_group3 (Alignment *A, char *sar_seq, int **l1, int *nl1, int **l2, int *nl2);
803 float** jacknife5 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode);
804 float** jacknife6 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode);
806 int process_cache ( Alignment *A,Alignment *S, int ***Cache, char *mode);
807 Alignment *analyze_compounds (Alignment *A, Alignment *S, char *mode);
809 Alignment *analyze_compounds (Alignment *A, Alignment *S, char *mode)
815 sim=aln2sim_mat (A, "idmat");
816 for (a=0; a< S->nseq; a++)
818 for (n=0, tot=0, b=0; b< A->nseq-1; b++)
820 sar1=(S->seq_al[a][b]=='I')?1:0;
821 for ( c=b+1; c<A->nseq; c++)
823 sar2=(S->seq_al[a][c]=='I')?1:0;
832 fprintf ( stdout, ">%-10s CMPSIM: %.2f\n", S->name[a],(float)tot/(float)n);
838 int print_seq_pos ( int pos, Alignment *A, char *seq);
839 int abl1_evaluation (int p);
840 int print_seq_pos ( int pos, Alignment *A, char *seq)
844 s=name_is_in_list (seq, A->name, A->nseq, MAXNAMES);
845 fprintf ( stdout, "S=%d", s);
847 for (b=0,a=0; a<pos; a++)
849 if (!is_gap (A->seq_al[s][a]))b++;
851 fprintf ( stdout, "Pos %d SEQ %s: %d ", pos+1, seq, b+246);
852 if ( strm ( seq, "ABL1")) fprintf ( stdout , "PT: %d", abl1_evaluation (b+246));
856 int process_cache ( Alignment *A,Alignment *S, int ***Cache, char *mode)
864 strget_param ( mode, "_WEIGHT_", "1", "%d", &weight_mode);
865 pos=declare_int(A->len_aln+1,2);
866 pos2=declare_int (A->len_aln+1,S->nseq);
867 for (a=0; a<S->nseq; a++)
870 for (b=0; b< A->len_aln; b++)
882 ab1=name_is_in_list ("ABL1", A->name, A->nseq,100);
883 ab1_pos=vcalloc (A->len_aln+1, sizeof (int));
885 for ( b=0,a=0; a< A->len_aln; a++)
887 if ( A->seq_al[ab1][a]=='-')ab1_pos[a]=-1;
891 for ( a=0; a< A->len_aln; a++)
893 fprintf ( stdout, "\n%4d %5d %5d %5d [%c] [%2d] ALN", a+1, pos[a][0], pos[a][1], ab1_pos[a]+246,A->seq_al[ab1][a],abl1_evaluation (ab1_pos[a]+246));
894 for ( b=0; b< S->nseq; b++)fprintf ( stdout, "%d", pos2[a][b]);
898 int abl1_evaluation (int p)
900 if ( p==248) return 10;
901 if ( p==250) return 10;
902 if ( p==253) return 10;
903 if ( p==254) return 10;
904 if ( p==255) return 9;
905 if ( p==256) return 10;
906 if ( p==257) return 5;
907 if ( p==258) return 8;
908 if ( p==269) return 8;
909 if ( p==291) return 4;
910 if ( p==294) return 8;
911 if ( p==299) return 10;
912 if ( p==306) return 0;
913 if ( p==314) return 9;
914 if ( p==315) return 10;
915 if ( p==318) return 10;
917 if ( p==319) return 10;
918 if ( p==321) return 10;
919 if ( p==323) return 0;
920 if ( p==324) return 0;
921 if ( p==339) return 0;
922 if ( p==340) return 0;
923 if ( p==355) return 5;
924 if ( p==364) return 10;
926 if ( p==366) return 0;
927 if ( p==368) return 10;
928 if ( p==370) return 10;
929 if ( p==372) return 0;
930 if ( p==378) return 8;
931 if ( p==382) return 10;
933 if ( p==384) return 10;
934 if ( p==387) return 10;
935 if ( p==395) return 8;
937 if ( p==398) return 8;
938 if ( p==399) return 8;
939 if ( p==400) return 8;
940 if ( p==403) return 0;
941 if ( p==416) return 8;
942 if ( p==419) return 5;
943 if ( p>400) return 0;
946 float** cache2pred1 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode)
948 int s1, s2, seq1, seq2, r1, r2,col, pred, real, ci;
949 double score, max, id, m;
953 int used_col, used_res,is_used_col, n_res=0;
955 /*Predict on ns[1] what was trained on ns[0]*/
957 strget_param ( mode, "_THR_", "0.09", "%f", &T);
958 strget_param ( mode, "_WEIGHT_", "0", "%d", &weight_mode);
960 R=declare_float (2, 2);
961 ci=name_is_in_list ( compound, S->name, S->nseq, -1);
965 for (s1=0; s1<ns[1]; s1++)
970 for (max=0,score=0, col=0; col<A->len_aln; col++)
973 r1=tolower (A->seq_al[seq1][col]);
974 for (max1=0,id=0, m=0,s2=0; s2<ns[0]; s2++)
977 if ( S->seq_al[ci][seq2]=='O')continue;
978 if ( cache[seq2][col]==0 && !is_gap( A->seq_al[seq2][col]))continue;
980 r2=tolower ( A->seq_al[seq2][col]);
981 if ( is_gap(r2))continue;
983 v=(cache[seq2][col]>0 && weight_mode==1)?cache[seq2][col]:1;
994 pred=(( score/max) >T)?1:0;
995 real=(S->seq_al[ci][seq1]=='I')?1:0;
998 fprintf ( stdout, "\n>%s %d%d SCORE %.2f C %s [SEQ]\n", A->name[seq1],real, pred, (float)score/(float)max, compound);
1001 for (used_col=0,used_res=0,col=0; col<A->len_aln; col++)
1003 for (is_used_col=0,s2=0; s2<ns[0]; s2++)
1006 if ( cache[seq2][col]==0 && !is_gap(A->seq_al[seq2][col]))n_res++;
1007 else if (is_gap(A->seq_al[seq2][col]));
1014 used_col+=is_used_col;
1016 fprintf ( stdout, "\n>%s USED_POSITIONS: COL: %.2f RES: %.2f COMP\n", S->name[ci], (float)used_col/(float)A->len_aln, (float)used_res/(float) n_res);
1021 float** cache2pred2 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode)
1023 int s1, s2, seq1, seq2, r1, r2,col, pred, real, ci;
1028 int used_col, used_res,is_used_col, n_res=0;
1029 /*Predict on ns[1] what was trained on ns[0]*/
1031 strget_param ( mode, "_THR_", "0.5", "%f", &T);
1034 R=declare_float (2, 2);
1035 ci=name_is_in_list ( compound, S->name, S->nseq, -1);
1037 for (s1=0; s1<ns[1]; s1++)
1041 fprintf ( stdout, "\n");
1042 for (max=0,score=0, col=0; col<A->len_aln; col++)
1046 r1=tolower (A->seq_al[seq1][col]);
1047 for (used=0,s2=0; s2<ns[0]; s2++)
1051 if ( S->seq_al[ci][seq2]=='O')continue;
1052 if ( cache[seq2][col]==0 && !is_gap( A->seq_al[seq2][col]))continue;
1055 r2=tolower ( A->seq_al[seq2][col]);
1056 if ( is_gap(r2))continue;
1059 if ( r2==r1){score+=v;}
1063 if (used) fprintf ( stdout, "%c", r1);
1066 pred=(( score/max) >T)?1:0;
1067 real=(S->seq_al[ci][seq1]=='I')?1:0;
1070 fprintf ( stdout, "PSEQ: %-10s SC: %4d MAX: %4d S: %.2f R: %4d", A->name[seq1],(int)score, (int)max, (float)score/max,real);
1074 for (used_col=0,used_res=0,col=0; col<A->len_aln; col++)
1076 for (is_used_col=0,s2=0; s2<ns[0]; s2++)
1079 if ( cache[seq2][col]==0 && !is_gap(A->seq_al[seq2][col]))n_res++;
1080 else if (is_gap(A->seq_al[seq2][col]));
1087 used_col+=is_used_col;
1089 fprintf ( stdout, "\n>%s USED_POSITIONS: COL: %.2f RES: %.2f COMP\n", S->name[ci], (float)used_col/(float)A->len_aln, (float)used_res/(float) n_res);
1094 float** cache2pred3 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode)
1096 int s1, s2, seq1, seq2, r1, r2,col, pred, real, ci, a, n;
1103 int best_tp, best_fp;
1104 int delta, best_delta;
1107 /*Predict on ns[1] what was trained on ns[0]*/
1109 strget_param ( mode, "_THR_", "0.5", "%f", &T);
1112 R=declare_float (2, 2);
1113 ci=name_is_in_list ( compound, S->name, S->nseq, -1);
1114 list=declare_int ( ns[1],3);
1116 for (s1=0; s1<ns[1]; s1++)
1121 for (max=0,score=0, col=0; col<A->len_aln; col++)
1125 r1=tolower (A->seq_al[seq1][col]);
1126 for (used=0,s2=0; s2<ns[0]; s2++)
1130 if ( S->seq_al[ci][seq2]=='O')continue;
1131 if ( cache[seq2][col]==0 && !is_gap( A->seq_al[seq2][col]))continue;
1134 r2=tolower ( A->seq_al[seq2][col]);
1135 if ( is_gap(r2))continue;
1138 if ( r2==r1){score+=v;}
1146 pred=(( score/max) >T)?1:0;
1147 real=(S->seq_al[ci][seq1]=='I')?1:0;
1150 list[s1][1]=(int)((score/max)*(float)1000);
1156 sort_int_inv (list, 3, 1, 0, ns[1]-1);
1158 for ( a=0; a<ns[1]; a++)
1161 fprintf ( stdout, "PSEQ: %-10s SC: %5d R: %4d\n", A->name[seq1],list[a][0], list[a][1]);
1164 for (n=0, a=0; a<ns[1]; a++)n+=list[a][0];
1165 for (best_delta=100000,best_tp=0,tp=0,fp=0,best_fp=0,a=0; a<ns[1]; a++)
1170 if (FABS(delta)<best_delta)
1181 tn=ns[1]-(tp+fp+fn);
1186 free_int (list, -1);
1189 float** cache2pred4 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode)
1191 int s1, s2, seq1, seq2, ci, a,b, c, n;
1197 int best_tp, best_fp;
1198 int delta, best_delta;
1204 /*Predict on ns[1] what was trained on ns[0]*/
1205 /*Identify interesting coloumns*/
1206 ul=vcalloc ( A->len_aln, sizeof (int));
1207 for (a=0; a< A->len_aln; a++)
1208 for ( b=0; b< A->nseq; b++)
1209 if ( cache[b][a])ul[nused++]=a;
1211 /*compute the similarity on the used columns*/
1213 R=declare_float (2, 2);
1214 sim=declare_int (A->nseq, A->nseq);
1215 for (a=0; a< A->nseq; a++)
1216 for ( b=0; b< A->nseq; b++)
1218 for (c=0; c< nused; c++)
1220 if ( A->seq_al[a][ul[c]]==A->seq_al[b][ul[c]])sim[a][b]++;
1222 sim[a][b]=(sim[a][b]*100)/nused;
1229 ci=name_is_in_list ( compound, S->name, S->nseq, -1);
1230 list=declare_int ( ns[1],2);
1232 for (s1=0; s1<ns[1]; s1++)
1237 for (score=0,s2=0; s2<ns[0]; s2++)
1241 if ( seq1==seq2)continue;
1242 if (S->seq_al[ci][seq2]=='I')score=MAX(score, sim[seq1][seq2]);
1244 list[s1][0]=(S->seq_al[ci][seq1]=='I')?1:0;
1245 list[s1][1]=(int)score;
1248 sort_int_inv (list, 2, 1, 0, ns[1]-1);
1250 for (n=0, a=0; a<ns[1]; a++)n+=list[a][0];
1251 for (best_delta=100000,best_tp=0,tp=0,fp=0,best_fp=0,a=0; a<ns[1]; a++)
1256 if (FABS(delta)<best_delta)
1267 tn=ns[1]-(tp+fp+fn);
1272 free_int (list, -1);
1277 float** cache2pred5 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode)
1279 int s1, s2, seq1, seq2, ci, a, n;
1286 int best_tp, best_fp;
1287 int delta, best_delta;
1291 /*Predict on ns[1] what was trained on ns[0]*/
1293 R=declare_float (2, 2);
1296 sim=aln2sim_mat (A, "idmat");
1300 ci=name_is_in_list ( compound, S->name, S->nseq, -1);
1301 list=declare_int ( ns[1],2);
1303 for (s1=0; s1<ns[1]; s1++)
1308 for (score=0,s2=0; s2<ns[0]; s2++)
1312 if ( seq1==seq2)continue;
1313 if (S->seq_al[ci][seq2]=='I')score=MAX(score, sim[seq1][seq2]);
1315 list[s1][0]=(S->seq_al[ci][seq1]=='I')?1:0;
1316 list[s1][1]=(int)score;
1319 sort_int_inv (list, 2, 1, 0, ns[1]-1);
1321 for (n=0, a=0; a<ns[1]; a++)n+=list[a][0];
1322 for (best_delta=100000,best_tp=0,tp=0,fp=0,best_fp=0,a=0; a<ns[1]; a++)
1327 if (FABS(delta)<best_delta)
1338 tn=ns[1]-(tp+fp+fn);
1343 free_int (list, -1);
1347 float** jacknife5 (Alignment*A,int **cacheIN, int *ns, int **ls, Alignment *S, char *compound, char *mode)
1349 int seq1, ci, a,b, c, n;
1350 double score, max_score;
1355 int best_tp, best_fp;
1356 int delta, best_delta;
1360 ci=name_is_in_list ( compound, S->name, S->nseq, -1);
1361 list=declare_int (A->nseq,2);
1362 R=declare_float (2, 2);
1365 for ( a=0; a<A->nseq; a++)
1371 for (c=0,b=0; b<A->nseq; b++)
1372 if (a!=b)ls[0][c++]=b;
1376 cache=sar2cache_count1 (A, ns, ls,S, compound, mode);
1377 for (b=0; b<=26; b++)
1378 for ( c=0; c< A->len_aln; c++)
1379 cacheIN[b][c]+=cache[b][c];
1382 real=(S->seq_al[ci][seq1]=='I')?1:0;
1383 fprintf ( stdout, ">%-10s %d ", A->name[seq1], real);
1387 for (max_score=0,b=0; b<A->len_aln; b++)
1388 max_score+=cache[26][b];
1390 for (score=0,b=0; b<A->len_aln; b++)
1392 res=tolower (A->seq_al[seq1][b]);
1393 if ( cache[26][b]==0) continue;
1396 score+=cache[res-'a'][b];
1398 /*fprintf ( stdout, "%c[%3d]", res,b);*/
1400 fprintf ( stdout, " SCORE: %5d SPRED %d RATIO: %.2f \n", (int)score, a, (score*100)/max_score);
1403 if ( strstr (mode, "SIMTEST"))list[a][1]=(score*100)/max_score;
1404 else list[a][1]=(score*100)/max_score;
1405 free_int (cache, -1);
1409 sort_int_inv (list, 2, 1, 0, A->nseq-1);
1410 for (n=0, a=0; a<A->nseq; a++)
1415 for (best_delta=100000,best_tp=0,tp=0,fp=0,best_fp=0,a=0; a<A->nseq; a++)
1421 if (FABS(delta)<best_delta)
1433 tn=A->nseq-(tp+fp+fn);
1438 free_int (list, -1);
1442 float** jacknife6 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode)
1444 int seq1, ci, a,b, c,d,e,f, n;
1450 int best_tp, best_fp;
1451 int delta, best_delta;
1454 ci=name_is_in_list ( compound, S->name, S->nseq, -1);
1455 list=declare_int (A->len_aln,2);
1456 R=declare_float (2, 2);
1459 for ( a=0; a<A->nseq; a++)
1466 for (c=0,b=0; b<A->nseq; b++)
1467 if (a!=b)ls[0][c++]=b;
1470 cache=sar2cache_proba_new (A, ns, ls,S, compound, mode);
1473 new_cache=declare_int (27,A->len_aln);
1475 for (d=0; d< A->len_aln; d++)
1478 if ( cache[26][d]==0)continue;
1479 analyze=declare_int (26, 2);
1481 for ( e=0; e< ns[0]; e++)
1484 sar=(S->seq_al[ci][f]=='I')?1:0;
1485 res=tolower (A->seq_al[f][d]);
1487 if ( res=='-') continue;
1488 analyze[res-'a'][sar]++;
1492 if ( analyze[e][1]){new_cache[26][d]=1;new_cache[e][d]+=cache[e][d];}
1494 if ( analyze[e][0] && analyze[e][1]){new_cache[26][d]=1;new_cache[e][d]+=analyze[e][1];}
1495 else if ( analyze[e][0]){new_cache[26][d]=1;new_cache[e][d]-=analyze[e][0]*10;}
1496 else if ( analyze[e][1]){new_cache[26][d]=1;new_cache[e][d]+=analyze[e][1];}
1497 else if ( !analyze[e][0] &&!analyze[e][1]);
1500 free_int (analyze, -1);
1504 sar=(S->seq_al[ci][seq1]=='I')?1:0;
1505 fprintf ( stdout, ">%-10s %d ", A->name[seq1], sar);
1507 for (score=0,b=0; b<A->len_aln; b++)
1509 res=tolower (A->seq_al[seq1][b]);
1510 if ( cache[26][b]==0) continue;
1513 score+=new_cache[res-'a'][b];
1516 fprintf ( stdout, " SCORE: %5d SPRED\n", (int)score);
1518 list[seq1][1]=(int)score;
1520 free_int (new_cache, -1);
1521 free_int (cache, -1);
1523 sort_int_inv (list, 2, 1, 0, A->nseq-1);
1524 for (n=0, a=0; a<A->nseq; a++)n+=list[a][0];
1525 for (best_delta=100000,best_tp=0,tp=0,fp=0,best_fp=0,a=0; a<A->nseq; a++)
1530 if (FABS(delta)<best_delta)
1538 fprintf ( stderr, "\n%d %d", best_tp, best_fp);
1543 tn=A->nseq-(tp+fp+fn);
1548 free_int (list, -1);
1553 float** cache2pred_new (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode)
1555 int s1, seq1, ci, a,b, n;
1561 int best_tp, best_fp;
1562 int delta, best_delta;
1565 ci=name_is_in_list ( compound, S->name, S->nseq, -1);
1566 list=declare_int ( ns[1],2);
1567 R=declare_float (2, 2);
1569 for (s1=0; s1<ns[1]; s1++)
1574 real=(S->seq_al[ci][seq1]=='I')?1:0;
1575 fprintf ( stdout, ">%-10s %d ", A->name[seq1], real);
1576 for (score=0,b=0; b<A->len_aln; b++)
1578 res=tolower (A->seq_al[seq1][b]);
1579 if ( cache[26][b]==0) continue;
1582 score+=cache[res-'a'][b];
1584 fprintf ( stdout, "%c", res);
1586 fprintf ( stdout, " SCORE: %5d SPRED\n", (int)score);
1588 list[s1][1]=(int)score;
1591 sort_int_inv (list, 2, 1, 0, ns[1]-1);
1593 for (n=0, a=0; a<ns[1]; a++)n+=list[a][0];
1594 for (best_delta=100000,best_tp=0,tp=0,fp=0,best_fp=0,a=0; a<ns[1]; a++)
1599 if (FABS(delta)<best_delta)
1613 tn=ns[1]-(tp+fp+fn);
1618 free_int (list, -1);
1623 float** cache2pred_forbiden_res (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode)
1625 int s1,seq1, ci, a,b, c, n;
1631 int best_tp, best_fp;
1632 int delta, best_delta;
1637 mat=read_matrice ( "blosum62mt");
1639 ci=name_is_in_list ( compound, S->name, S->nseq, -1);
1640 list=declare_int ( ns[1],2);
1641 R=declare_float (2, 2);
1643 for (s1=0; s1<ns[1]; s1++)
1648 real=(S->seq_al[ci][seq1]=='I')?1:0;
1649 fprintf ( stdout, ">%-10s %d ", A->name[seq1], real);
1650 for (score=0,b=0; b<A->len_aln; b++)
1652 res=tolower (A->seq_al[seq1][b]);
1653 if ( cache[26][b]==0) continue;
1656 score+=cache[res-'a'][b];
1658 fprintf ( stdout, "%c", res);
1660 fprintf ( stdout, " SCORE: %5d SPRED\n", (int)score);
1662 list[s1][1]=(int)score;
1664 new_cache=declare_int (27,A->len_aln);
1665 for (a=0; a< A->len_aln; a++)
1667 int **analyze, real, res, d;
1671 keep=vcalloc ( 26, sizeof (int));
1672 res_type=vcalloc ( 26, sizeof (int));
1673 sub=declare_int (256, 2);
1675 if ( cache[26][a]==0)continue;
1676 analyze=declare_int (26, 2);
1677 for ( b=0; b< ns[0]; b++)
1680 real=(S->seq_al[ci][seq1]=='I')?1:0;
1681 res=tolower (A->seq_al[seq1][a]);
1683 if ( res=='-') continue;
1684 analyze[res-'a'][real]++;
1686 fprintf ( stdout, "RSPRED: ");
1687 for (c=0;c<26; c++)fprintf ( stdout, "%c", c+'a');
1688 fprintf ( stdout, "\nRSPRED: ");
1691 if ( analyze[c][0] && analyze[c][1]){fprintf ( stdout, "1");res_type[c]='1';}
1692 else if ( analyze[c][0]){new_cache[26][a]=1;new_cache[c][a]-=analyze[c][0];fprintf ( stdout, "0");res_type[c]='0';}
1693 else if ( analyze[c][1]){new_cache[26][a]=1;new_cache[c][a]+=analyze[c][1];fprintf ( stdout, "1");res_type[c]='1';}
1694 else if ( !analyze[c][0] &&!analyze[c][1]){fprintf ( stdout, "-");res_type[c]='-';}
1698 for ( c=0; c<26; c++)
1700 for ( d=0; d<26; d++)
1703 if ( res_type[c]==res_type[d])
1705 sub[res_type[c]][0]+=mat[c][d];
1706 sub[res_type[c]][1]++;
1708 if ( res_type[c]!='-' && res_type[d]!='-')
1710 sub['m'][0]+=mat[c][d];
1715 for ( c=0; c< 256; c++)
1717 if ( sub[c][1])fprintf ( stdout, " %c: %5.2f ", c, (float)sub[c][0]/(float)sub[c][1]);
1719 fprintf ( stdout, " SC: %d\nRSPRED ", cache[26][a]);
1721 for ( c=0; c<26; c++)
1722 if ( res_type[c]=='1')
1724 for (d=0; d<26; d++)
1725 if (mat[c][d]>0)keep[d]++;
1729 for (c=0; c<26; c++)
1731 if ( keep[c]>10)fprintf ( stdout, "9");
1732 else fprintf ( stdout, "%d", keep[c]);
1734 for ( c=0; c<26; c++)
1736 if ( keep[c]>8)new_cache[c][a]=10;
1737 else new_cache[c][a]=-10;
1739 fprintf ( stdout, "\n");
1740 free_int (analyze, -1);
1746 for ( a=0; a<25; a++)
1747 for (b=a+1; b<26; b++)
1751 if ( strchr("bjoxz", r1))continue;
1752 if ( strchr("bjoxz",r2))continue;
1754 if ( mat[a][b]>0 && a!=b)fprintf ( stdout, "\nMATANALYZE %c %c %d", a+'a', b+'a', mat[a][b]);
1757 for (s1=0; s1<ns[1]; s1++)
1762 real=(S->seq_al[ci][seq1]=='I')?1:0;
1763 fprintf ( stdout, ">%-10s %d ", A->name[seq1], real);
1764 for (score=0,b=0; b<A->len_aln; b++)
1766 res=tolower (A->seq_al[seq1][b]);
1767 if ( cache[26][b]==0) continue;
1770 score+=new_cache[res-'a'][b];
1772 fprintf ( stdout, "%c", res);
1774 fprintf ( stdout, " SCORE: %5d SPRED\n", (int)score);
1776 list[s1][1]=(int)score;
1778 free_int (new_cache, -1);
1779 sort_int_inv (list, 2, 1, 0, ns[1]-1);
1782 for (n=0, a=0; a<ns[1]; a++)n+=list[a][0];
1783 for (best_delta=100000,best_tp=0,tp=0,fp=0,best_fp=0,a=0; a<ns[1]; a++)
1788 if (FABS(delta)<best_delta)
1802 tn=ns[1]-(tp+fp+fn);
1807 free_int (list, -1);
1813 int **sar2cache_proba_old ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode)
1815 int col, s, seq,ms,mseq, res, mres, res1, n,maxn1, maxn2,maxn3, t, ci, a;
1819 int N1msa,N1sar, N, N11, N10, N01,N00, SCORE, COL_INDEX, RES;
1822 float T1, T2, T3, T4;
1826 int sim_weight, w, sw_thr;
1831 RES=nfield++;COL_INDEX=nfield++;N1msa=nfield++;N1sar=nfield++;N=nfield++;N11=nfield++;N10=nfield++;N01=nfield++;N00=nfield++;SCORE=nfield++;
1832 ci=name_is_in_list ( compound, S->name, S->nseq, -1);
1833 cache=declare_int (A->nseq, A->len_aln);
1835 strget_param ( mode, "_FILTER1_", "0" , "%f", &T1);
1836 strget_param ( mode, "_FILTER2_", "1000000", "%f", &T2);
1837 strget_param ( mode, "_FILTER3_", "0" , "%f", &T3);
1838 strget_param ( mode, "_FILTER4_", "1000000", "%f", &T4);
1839 strget_param ( mode, "_SIMWEIGHT_", "1", "%d", &sim_weight);
1840 strget_param ( mode, "_SWTHR_", "30", "%d", &sw_thr);
1841 strget_param (mode, "_TRAIN_","1", "%d", &train_mode);
1842 strget_param (mode, "_ZSCORE_","0", "%f", &zscore);
1848 if (sim_weight==1 && !sim) sim=aln2sim_mat(A, "idmat");
1849 for ( ms=0; ms<ns[0]; ms++)
1852 if ( S->seq_al[ci][mseq]!='I')continue;
1854 list=declare_int (A->len_aln+1, nfield);
1855 for (t=0,n=0, col=0; col< A->len_aln; col++)
1859 mres=tolower(A->seq_al[mseq][col]);
1860 list[col][RES]=mres;
1861 list[col][COL_INDEX]=col;
1863 if ( is_gap(mres))continue;
1864 for ( s=0; s<ns[0]; s++)
1867 res=tolower(A->seq_al[seq][col]);
1868 if (is_gap(res))continue;
1873 w=sim[seq][mseq];w=(mres==res)?100-w:w;
1881 if ( S->seq_al[ci][seq]=='I')same_res=1;
1882 else same_res=(res==mres)?1:0;
1885 same_res=(res==mres)?1:0;
1889 if (S->seq_al[ci][seq]=='I' && same_res)list[col][N11]+=w;
1890 else if (S->seq_al[ci][seq]=='I' && same_res)list[col][N10]+=w;
1891 else if (S->seq_al[ci][seq]=='O' && same_res)list[col][N01]+=w;
1892 else if (S->seq_al[ci][seq]=='O' && same_res)list[col][N00]+=w;
1894 if ( S->seq_al[ci][seq]=='I')list[col][N1sar]+=w;
1895 if ( same_res)list[col][N1msa]+=w;
1899 list[col][SCORE]=(int)evaluate_sar_score1 (list[col][N], list[col][N11], list[col][N1msa], list[col][N1sar]);
1903 strget_param ( mode, "_MAXN1_", "5", "%d", &maxn1);
1904 strget_param ( mode, "_WEIGHT_", "1", "%d", &weight_mode);
1905 strget_param ( mode, "_QUANT_", "0.0", "%f", &quant);
1907 sort_int_inv (list,nfield,SCORE,0,A->len_aln-1);
1911 n=quantile_rank ( list,SCORE, A->len_aln,quant);
1912 sort_int (list,nfield,N1msa, 0, n-1);
1916 for (a=0; a<maxn1; a++)
1918 col=list[a][COL_INDEX];
1920 value=list[a][SCORE];
1921 if ( value>T1 && value<T2){cache[mseq][col]= value;}
1923 free_int (list, -1);
1927 list=declare_int (A->len_aln+1, nfield);
1928 for ( col=0; col< A->len_aln; col++)
1930 list[col][COL_INDEX]=col;
1931 for ( s=0; s<ns[0]; s++)
1934 list[col][SCORE]+=cache[seq][col];
1938 /*Filter Columns with a score not between T2 and T3*/
1940 for (col=0; col< A->len_aln; col++)
1941 if (list[col][SCORE]<T3 || list[col][SCORE]>T4)
1944 for (s=0; s< A->nseq; s++)
1945 if (!is_gap(A->seq_al[s][col]))cache[s][col]=0;
1948 /*Keep The N Best Columns*/
1951 double sum=0, sum2=0, z;
1953 for (a=0; a< A->len_aln; a++)
1955 if ( list[a][SCORE]>0)
1957 sum+=list[a][SCORE];
1958 sum2+=list[a][SCORE]*list[a][SCORE];
1962 for (a=0; a<A->len_aln; a++)
1964 if ( list[a][SCORE]>0)
1966 z=return_z_score (list[a][SCORE], sum, sum2,n);
1967 if ((float)z<zscore)
1969 col=list[a][COL_INDEX];
1970 for (s=0; s<A->nseq; s++)
1975 fprintf ( stdout, "\nZSCORE: KEEP COL %d SCORE: %f SCORE: %d\n", list[a][COL_INDEX], (float)z, list[a][SCORE]);
1982 sort_int_inv (list,nfield,SCORE,0,A->len_aln-1);
1983 strget_param ( mode, "_MAXN2_", "100000", "%d", &maxn2);
1985 for (a=maxn2;a<A->len_aln; a++)
1987 col=list[a][COL_INDEX];
1988 for (s=0; s<A->nseq; s++)
1993 /*Get Rid of the N best Columns*/;
1994 strget_param ( mode, "_MAXN3_", "0", "%d", &maxn3);
1996 for (a=0; a<maxn3;a++)
1998 col=list[a][COL_INDEX];
1999 for (s=0; s<A->nseq; s++)
2005 int aln2n_comp_col ( Alignment *A, Alignment *S, int ci)
2007 int res, seq,sar, col, r;
2012 analyze=declare_int (27, 2);
2013 for ( col=0; col< A->len_aln; col++)
2018 for ( n1=0, n0=0,seq=0; seq<A->nseq; seq++)
2020 res=tolower(A->seq_al[seq][col]);
2021 sar=(S->seq_al[ci][seq]=='I')?1:0;
2024 if ( res=='-')continue;
2026 analyze[res][sar]++;
2029 for (r=0; r<26; r++)
2036 if ( a1==n1 && a0<n0)
2041 for ( r=0; r<26; r++)analyze[r][0]=analyze[r][1]=0;
2044 free_int (analyze, -1);
2047 int **sar2cache_count1 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode)
2049 int maxn2, res, seq,sar, ci, col,s, r;
2050 int **analyze, **list, **cache;
2054 if (!mat) mat=read_matrice ("blosum62mt");
2057 list=declare_int ( A->len_aln, 2);
2058 cache=declare_int ( 27, A->len_aln);
2059 analyze=declare_int (27, 2);
2061 ci=name_is_in_list ( compound, S->name, S->nseq, -1);
2063 for ( col=0; col< A->len_aln; col++)
2068 for ( n1=0, n0=0,s=0; s<ns[0]; s++)
2071 res=tolower(A->seq_al[seq][col]);
2072 sar=(S->seq_al[ci][seq]=='I')?1:0;
2075 if ( res=='-')continue;
2078 analyze[res][sar]++;
2081 for (r=0; r<26; r++)
2087 if ( strstr (mode, "SIMTEST"))
2098 cache[26][col]=MAX(w, cache[26][col]);
2101 for ( r=0; r<26; r++)analyze[r][0]=analyze[r][1]=0;
2103 list[col][1]=cache[26][col];
2106 free_int (analyze, -1);
2108 sort_int_inv (list, 2, 1, 0, A->len_aln-1);
2110 strget_param ( mode, "_MAXN2_", "100000", "%d", &maxn2);
2112 for ( col=maxn2; col<A->len_aln; col++)
2113 for ( r=0; r<=26; r++)cache[r][list[col][0]]=0;
2115 free_int (list, -1);
2120 int **sar2cache_count2 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode)
2122 int maxn2, res, seq,sar, ci, col,s, r;
2123 int **analyze, **list, **cache, **conseq;
2126 if (!mat) mat=read_matrice ("blosum62mt");
2129 list=declare_int ( A->len_aln, 2);
2130 cache=declare_int ( 27, A->len_aln);
2131 conseq=declare_int ( A->len_aln,3);
2133 analyze=declare_int (27, 2);
2135 ci=name_is_in_list ( compound, S->name, S->nseq, -1);
2136 for ( col=0; col< A->len_aln; col++)
2140 for ( n1=0, n0=0,s=0; s<ns[0]; s++)
2143 res=tolower(A->seq_al[seq][col]);
2144 sar=(S->seq_al[ci][seq]=='I')?1:0;
2147 if ( res=='-')continue;
2149 analyze[res][sar]++;
2151 for (r=0; r<26; r++)
2156 if ( a1==n1 && a0<n0)
2164 for ( r=0; r<26; r++)analyze[r][0]=analyze[r][1]=0;
2166 free_int (analyze, -1);
2168 for (s=0; s<ns[0]; s++)
2172 for (w1=0,w2=0,col=0; col<A->len_aln; col++)
2175 res=tolower(A->seq_al[seq][col]);
2176 if ( is_gap(res))continue;
2179 if ( conseq[col][1] && res!=conseq[col][0])w1++;
2180 if ( conseq[col][1])w2++;
2182 for (col=0; col<A->len_aln; col++)
2184 res=tolower(A->seq_al[seq][col]);
2185 if ( is_gap(res))continue;
2188 if ( conseq[col][1] && res!=conseq[col][0])conseq[col][2]+=(w2-w1);
2192 for (col=0; col<A->len_aln; col++)
2198 cache[r][col]=cache[26][col]=list[col][1]=w;
2201 sort_int_inv (list, 2, 1, 0, A->len_aln-1);
2202 strget_param ( mode, "_MAXN2_", "100000", "%d", &maxn2);
2204 for ( col=maxn2; col<A->len_aln; col++)
2205 for ( r=0; r<=26; r++)cache[r][list[col][0]]=0;
2208 free_int (list, -1);
2212 int **sar2cache_count3 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode)
2214 int maxn2, res, seq,sar, ci, col,s, r, a1, a0, n1, n0;
2215 int **analyze, **list, **cache;
2218 if (!mat) mat=read_matrice ("blosum62mt");
2221 list=declare_int ( A->len_aln, 2);
2222 cache=declare_int ( 27, A->len_aln);
2223 analyze=declare_int (27, 2);
2225 ci=name_is_in_list ( compound, S->name, S->nseq, -1);
2227 for ( col=0; col< A->len_aln; col++)
2230 for ( n1=0, n0=0,s=0; s<ns[0]; s++)
2233 res=tolower(A->seq_al[seq][col]);
2234 sar=(S->seq_al[ci][seq]=='I')?1:0;
2237 if ( res=='-')continue;
2240 analyze[res][sar]++;
2244 for (g=0,r=0; r<A->nseq; r++)
2245 g+=is_gap(A->seq_al[r][col]);
2249 for (e=0, r=0; r<26; r++)
2256 e+= t/(double)A->nseq*log(t/(double)A->nseq);
2264 if ( strstr ( mode, "SIMTEST"))
2266 for (r=0; r<26; r++)
2275 cache[26][col]=MAX(cache[26][col],a1);
2284 for (r=0; r<26; r++)
2293 cache[26][col]=MAX(cache[26][col],a0);
2298 for ( r=0; r<26; r++)analyze[r][0]=analyze[r][1]=0;
2300 list[col][1]=cache[26][col];
2303 free_int (analyze, -1);
2305 sort_int_inv (list, 2, 1, 0, A->len_aln-1);
2307 strget_param ( mode, "_MAXN2_", "100000", "%d", &maxn2);
2309 for ( col=maxn2; col<A->len_aln; col++)
2310 for ( r=0; r<=26; r++)cache[r][list[col][0]]=0;
2312 free_int (list, -1);
2317 int **sar2cache_proba_new ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode)
2319 int col, s, seq,ms,mseq, res, mres, res1, n,maxn1, maxn2,maxn3, t, ci, a,w;
2323 int N1msa,N1sar, N, N11, N10, N01,N00, SCORE, COL_INDEX, RES;
2333 RES=nfield++;COL_INDEX=nfield++;N1msa=nfield++;N1sar=nfield++;N=nfield++;N11=nfield++;N10=nfield++;N01=nfield++;N00=nfield++;SCORE=nfield++;
2334 ci=name_is_in_list ( compound, S->name, S->nseq, -1);
2335 cache=declare_int (27, A->len_aln);
2337 strget_param ( mode, "_SWTHR_", "30", "%d", &sw_thr);
2338 strget_param (mode, "_ZSCORE_","0", "%f", &zscore);
2341 if (!sim)sim=aln2sim_mat(A, "idmat");
2342 for ( ms=0; ms<ns[0]; ms++)
2345 if ( S->seq_al[ci][mseq]!='I')continue;
2347 list=declare_int (A->len_aln+1, nfield);
2348 for (t=0,n=0, col=0; col< A->len_aln; col++)
2352 mres=tolower(A->seq_al[mseq][col]);
2353 if ( is_gap(mres))continue;
2355 list[col][RES]=mres;
2356 list[col][COL_INDEX]=col;
2358 for ( s=0; s<ns[0]; s++)
2361 res=tolower(A->seq_al[seq][col]);
2362 if (is_gap(res))continue;
2363 w=sim[seq][mseq];w=(mres==res)?100-w:w;
2365 same_res=(res==mres)?1:0;
2369 if (S->seq_al[ci][seq]=='I' && same_res)list[col][N11]+=w;
2370 else if (S->seq_al[ci][seq]=='I' && same_res)list[col][N10]+=w;
2371 else if (S->seq_al[ci][seq]=='O' && same_res)list[col][N01]+=w;
2372 else if (S->seq_al[ci][seq]=='O' && same_res)list[col][N00]+=w;
2374 if ( S->seq_al[ci][seq]=='I')list[col][N1sar]+=w;
2375 if ( same_res)list[col][N1msa]+=w;
2379 list[col][SCORE]=(int)evaluate_sar_score1 (list[col][N], list[col][N11], list[col][N1msa], list[col][N1sar]);
2382 strget_param ( mode, "_MAXN1_", "5", "%d", &maxn1);
2383 sort_int_inv (list,nfield,SCORE,0,A->len_aln-1);
2384 for (a=0; a<maxn1; a++)
2386 col=list[a][COL_INDEX];
2388 value=list[a][SCORE];
2392 cache[res1-'a'][col]+= value;
2393 cache[26][col]+=value;
2396 free_int (list, -1);
2400 list=declare_int (A->len_aln+1, nfield);
2401 for ( col=0; col< A->len_aln; col++)
2403 list[col][COL_INDEX]=col;
2404 list[col][SCORE]=cache[26][col];
2406 /*Keep The N Best Columns*/
2409 double sum=0, sum2=0, z;
2411 for (a=0; a< A->len_aln; a++)
2413 if ( list[a][SCORE]>0)
2415 sum+=list[a][SCORE];
2416 sum2+=list[a][SCORE]*list[a][SCORE];
2420 for (a=0; a<A->len_aln; a++)
2422 if ( list[a][SCORE]>0)
2424 z=return_z_score (list[a][SCORE], sum, sum2,n);
2425 if ((float)z<zscore)
2427 col=list[a][COL_INDEX];
2428 for (s=0; s<27; s++)
2433 fprintf ( stdout, "\nZSCORE: KEEP COL %d SCORE: %f SCORE: %d\n", list[a][COL_INDEX], (float)z, list[a][SCORE]);
2440 sort_int_inv (list,nfield,SCORE,0,A->len_aln-1);
2441 strget_param ( mode, "_MAXN2_", "100000", "%d", &maxn2);
2443 for (a=maxn2;a<A->len_aln; a++)
2445 col=list[a][COL_INDEX];
2446 for (s=0; s<27; s++)
2451 /*Get Rid of the N best Columns*/;
2452 strget_param ( mode, "_MAXN3_", "0", "%d", &maxn3);
2454 for (a=0; a<maxn3;a++)
2456 col=list[a][COL_INDEX];
2457 for (s=0; s<27; s++)
2462 int **sar2cache_adriana ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode)
2465 int col,maxn1, s, seq,ms,mseq, res, mres,res1, n, t, ci, a;
2471 ci=name_is_in_list ( compound, S->name, S->nseq, -1);
2472 cache=declare_int (A->nseq, A->len_aln);
2475 for ( ms=0; ms<ns[0]; ms++)
2478 if ( S->seq_al[ci][mseq]!='I')continue;
2480 list=declare_int (A->len_aln+1, 5);
2481 for (t=0,n=0, col=0; col< A->len_aln; col++)
2483 mres=tolower(A->seq_al[mseq][col]);
2487 if ( is_gap(mres))continue;
2488 for ( s=0; s<ns[0]; s++)
2491 res=tolower(A->seq_al[seq][col]);
2492 if (is_gap(res))continue;
2494 if (S->seq_al[ci][seq]=='I' && res==mres)list[col][3]++;
2495 if (res==mres)list[col][2]++;
2499 sort_int_inv (list,5,3,0,A->len_aln-1);
2501 strget_param ( mode, "_MAXN1_", "5", "%d", &maxn1);
2502 strget_param ( mode, "_QUANT_", "0.95", "%f", &quant);
2504 n=quantile_rank ( list, 3, A->len_aln,quant);
2505 sort_int (list, 5, 2, 0, n-1);
2507 for (a=0; a<maxn1; a++)
2512 cache[mseq][col]=list[a][3];
2514 free_int (list, -1);
2519 int **sar2cache_proba2 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode)
2521 int col, s, seq,ms,mseq, res, mres,n,maxn1, t, ci, a,b;
2527 float T1, T2, T3, T4;
2530 cache=declare_int ( A->nseq, A->len_aln);
2531 ci=name_is_in_list ( compound, S->name, S->nseq, -1);
2533 strget_param ( mode, "_FILTER1_", "0" , "%f", &T1);
2534 strget_param ( mode, "_FILTER2_", "1000000", "%f", &T2);
2535 strget_param ( mode, "_FILTER3_", "0" , "%f", &T3);
2536 strget_param ( mode, "_FILTER4_", "1000000", "%f", &T4);
2538 list=declare_int (A->len_aln+1,A->nseq+2);
2542 for ( ms=0; ms<ns[0]; ms++)
2545 if ( S->seq_al[ci][mseq]!='I')continue;
2547 for (t=0,n=0, col=0; col< A->len_aln; col++)
2549 int N11=0,N10=0,N01=0,N00=0,N1sar=0,N1msa=0,N=0;
2551 mres=tolower(A->seq_al[mseq][col]);
2552 if ( is_gap(mres))continue;
2553 for ( s=0; s<ns[0]; s++)
2556 res=tolower(A->seq_al[seq][col]);
2557 if (is_gap(res))continue;
2560 if (S->seq_al[ci][seq]=='I' && res==mres)N11++;
2561 else if (S->seq_al[ci][seq]=='I' && res!=mres)N10++;
2562 else if (S->seq_al[ci][seq]=='O' && res==mres)N01++;
2563 else if (S->seq_al[ci][seq]=='O' && res!=mres)N00++;
2565 if ( S->seq_al[ci][seq]=='I')N1sar++;
2566 if ( res==mres)N1msa++;
2568 list[col][mseq]=(int)evaluate_sar_score1 (N,N11,N1msa,N1sar);
2569 list[col][SCORE]+=list[col][mseq];
2574 strget_param ( mode, "_MAXN1_", "5", "%d", &maxn1);
2575 strget_param ( mode, "_QUANT_", "0.95", "%f", &quant);
2576 sort_int_inv (list,A->nseq+2,SCORE, 0, A->len_aln-1);
2577 n=quantile_rank ( list,A->nseq, A->len_aln,quant);
2586 for ( b=0; b<A->nseq; b++)
2589 if ( value>T1 && value<T2){cache[b][col]= value;}
2593 free_int (list, -1);
2600 /************************************************************************************/
2601 /* ALIGNMENT ANALYZE : SAR */
2602 /************************************************************************************/
2603 int aln2jack_group3 (Alignment *A,char *comp, int **l1, int *nl1, int **l2, int *nl2)
2605 int **seq_list, **sar_list, nsar=0, nseq=0;
2609 sar_list=declare_int (A->nseq, 2);
2610 seq_list=declare_int (A->nseq, 2);
2611 for (a=0; a< A->nseq; a++)
2615 sar_list[nsar][0]=a;
2616 sar_list[nsar][1]=rand()%100000;
2621 seq_list[nseq][0]=a;
2622 seq_list[nseq][1]=rand()%100000;
2628 l1[0]=vcalloc (A->nseq, sizeof (int));
2629 l2[0]=vcalloc (A->nseq, sizeof (int));
2632 sort_int (seq_list, 2, 1, 0,nseq-1);
2633 sort_int (sar_list, 2, 1, 0,nsar-1);
2635 for (a=0; a<mid; a++)
2637 l1[0][nl1[0]++]=sar_list[a][0];
2639 for (a=0,b=mid; b<nsar; b++, a++)
2641 l2[0][nl2[0]++]=sar_list[b][0];
2645 for (a=0; a<mid; a++)
2647 l1[0][nl1[0]++]=seq_list[a][0];
2649 for (a=0,b=mid; b<nseq; b++, a++)
2651 l2[0][nl2[0]++]=seq_list[b][0];
2655 free_int (seq_list, -1);
2656 free_int (sar_list, -1);
2660 int aln2jack_group2 (Alignment *A, int seq, int **l1, int *nl1, int **l2, int *nl2)
2666 list=declare_int (A->nseq, 2);
2667 l1[0]=vcalloc (A->nseq, sizeof (int));
2668 l2[0]=vcalloc (A->nseq, sizeof (int));
2672 for ( a=0; a< A->nseq; a++)
2675 list[a][1]=rand()%100000;
2677 sort_int (list, 2, 1, 0,A->nseq-1);
2679 for (a=0; a<mid; a++)
2681 l1[0][nl1[0]++]=list[a][0];
2683 for (a=0,b=mid; b<A->nseq; b++, a++)
2685 l2[0][nl2[0]++]=list[b][0];
2688 free_int (list, -1);
2691 int aln2jack_group1 (Alignment *A, int seq, int **l1, int *nl1, int **l2, int *nl2)
2697 list=declare_int ( A->nseq, 3);
2698 l1[0]=vcalloc (A->nseq, sizeof (int));
2699 l2[0]=vcalloc (A->nseq, sizeof (int));
2702 sim=aln2sim_mat (A, "idmat");
2703 for ( a=0; a< A->nseq; a++)
2707 list[a][2]=(a==seq)?100:sim[seq][a];
2709 sort_int_inv (list, 3, 2, 0, A->nseq-1);
2710 fprintf ( stderr, "\nJacknife fromsequence %s [%d]\n", A->name[seq], seq);
2712 for (a=0; a< mid; a++)
2713 l1[0][nl1[0]++]=list[a][1];
2714 for (a=mid; a<A->nseq; a++)
2715 l2[0][nl2[0]++]=list[a][1];
2720 int sarset2subsarset ( Alignment *A, Alignment *S, Alignment **subA, Alignment **subS, Alignment *SUB)
2722 Alignment *rotS, *intS;
2725 list=vcalloc ( SUB->nseq, sizeof (int));
2726 for (nl=0,a=0; a<SUB->nseq; a++)
2728 b=name_is_in_list(SUB->name[a], A->name, A->nseq, 100);
2729 if ( b!=-1)list[nl++]=b;
2732 subA[0]=extract_sub_aln (A, nl, list);
2733 rotS=rotate_aln (S, NULL);
2734 intS=extract_sub_aln (rotS, nl, list);
2736 subS[0]=rotate_aln (intS, NULL);
2738 for ( a=0; a<S->nseq; a++) sprintf ( (subS[0])->name[a], "%s", S->name[a]);
2744 int ***simple_sar_analyze_vot ( Alignment *A, Alignment *SAR, char *mode)
2747 int res1, res2, sar1, sar2;
2750 static float ***result;
2751 static int ***iresult;
2754 result=declare_arrayN (3,sizeof (float),SAR->nseq, A->len_aln,3);
2755 iresult=declare_arrayN (3,sizeof (int),SAR->nseq, A->len_aln,3);
2758 sim=aln2sim_mat (A, "idmat");
2761 for (a=0; a<SAR->nseq; a++)
2762 for (b=0; b<A->len_aln; b++)
2765 for ( a=0; a< SAR->nseq; a++)
2766 for ( b=0; b<A->nseq-1; b++)
2767 for ( c=b+1; c< A->nseq; c++)
2768 for ( d=0; d<A->len_aln; d++)
2770 res1=A->seq_al[b][d];
2771 res2=A->seq_al[c][d];
2773 sar1=(SAR->seq_al[a][b]=='I')?1:0;
2774 sar2=(SAR->seq_al[a][c]=='I')?1:0;
2781 if ( sar1!=sar2 && res1!=res2)
2782 result[a][d][0]*=(1/(100-s));
2784 else if ( sar1==sar2 && sar1==1 && res1==res2)
2785 result[a][d][0]*=1/s;
2791 else if ( sar1==sar2 && res1==res2)result[a][d][0]+=(100-s)*(100-s);
2792 else if ( sar1==sar2 && res1!=res2)result[a][d][0]-=s*s;
2793 else if ( sar1!=sar2 && res1==res2)result[a][d][0]-=(100-s)*(100-s);
2796 result[a][d][1]='a';
2798 for ( a=0; a<SAR->nseq; a++)
2799 for ( b=0; b<A->len_aln; b++)
2801 fprintf ( stderr, "\n%f", result[a][b][0]);
2802 iresult[a][b][0]=100*log(1-result[a][b][0]);
2806 int display_simple_sar_analyze_pair_col (Alignment *A, Alignment *SAR, char *mode)
2816 strget_param (mode, "_TM_", "0", "%d", &do_tm);
2817 r=simple_sar_analyze_pair_col (A, SAR,mode);
2818 rA=rotate_aln (A, NULL);
2821 nI=vcalloc ( SAR->nseq, sizeof (int));
2822 for (a=0; a< SAR->nseq; a++)
2823 for (b=0; b<SAR->len_aln; b++) nI[a]+=(SAR->seq_al[a][b]=='I')?1:0;
2827 while ( r[n][0]!=-1)
2830 {fprintf ( stdout, "COMP S: %3d %3d %s %20s %2d #\n", r[n][3],0,SAR->seq_al[r[n][0]], SAR->name[r[n][0]], nI[r[n][0]]);
2831 fprintf ( stdout, "SEQ1 S: %3d %3d %s %20s %2d #\n", r[n][3],r[n][1],rA->seq_al[r[n][1]], SAR->name[r[n][0]],nI[r[n][0]]);
2832 fprintf ( stdout, "SEQ2 S: %3d %3d %s %20s %2d #\n\n", r[n][3],r[n][2],rA->seq_al[r[n][2]], SAR->name[r[n][0]],nI[r[n][0]]);
2838 int display_simple_sar_analyze_col (Alignment *A, Alignment *SAR, char *mode)
2840 int ***result, **r2, **r3, **r4, **aa;
2845 strget_param (mode, "_TM_", "0", "%d", &do_tm);
2846 result=simple_sar_analyze_col (A, SAR,mode);
2847 r2=declare_int (A->len_aln*SAR->nseq, 5);
2848 r3=declare_int (A->len_aln+1, 5);
2849 r4=declare_int (A->len_aln+1, SAR->nseq+1);
2850 aa=declare_int (2, 256);
2851 cons=vcalloc (A->len_aln+1, sizeof (char));
2852 for (a=0; a<A->len_aln; a++){r3[a][0]=a;cons[a]='A';}
2856 for (n=0,a=0; a< SAR->nseq; a++)
2859 for (sum=0, sum2=0,b=0; b<A->len_aln; b++)
2861 sum+=result[a][b][0];
2862 sum2+=result[a][b][0]*result[a][b][0];
2865 for (b=0; b<A->len_aln; b++, n++)
2867 r2[n][0]=a;//compound
2869 r2[n][2]=result[a][b][1]; //AA
2870 r2[n][3]=result[a][b][0]; //Score
2871 r2[n][4]=result[a][b][2]; //(int)10*return_z_score ((double)result[a][b][0], sum, sum2, A->len_aln); //ZScore
2874 sort_int (r2,5, 3, 0, n-1);//sort on Score (3rd field)
2875 for ( a=0; a< n; a++)
2881 fprintf ( stdout, "SEQ %5d %5d %5d %s ",r2[a][1]+1,r2[a][3], r2[a][4], (do_tm)?alnpos2hmmtop_pred (A, NULL, r2[a][1], SHORT):"x");
2882 for (c=0; c<A->nseq; c++)fprintf (stdout, "%c", A->seq_al[c][r2[a][1]]);
2887 for (c=0; c< A->nseq; c++)
2891 activity=SAR->seq_al[comp][c];
2892 res=A->seq_al[c][pos];
2894 if (activity=='O')aa[0][res]++;
2895 if (activity=='I')aa[1][res]++;
2898 for (c=0; c< A->nseq; c++)
2901 activity=SAR->seq_al[comp][c];
2902 res=A->seq_al[c][pos];
2903 bad+=(aa[0][res] && aa[1][res])?1:0;
2904 aa[0][res]=aa[1][res]=0;
2906 fprintf ( stdout, " %20s %d |\nCOM %5d %5d %5d %s %s %20s %d |\n\n", SAR->name[r2[a][0]],bad,r2[a][1]+1,r2[a][3],r2[a][4], (do_tm)?alnpos2hmmtop_pred (A, NULL, r2[a][1], SHORT):"x",SAR->seq_al[r2[a][0]], SAR->name[r2[a][0]], bad);
2909 if (r2[a][4]>threshold)
2913 r3[r2[a][1]][2]+=r2[a][3];
2914 r4[r2[a][1]][r2[a][0]]=1;
2917 sort_int (r3, 3,1,0, A->len_aln-1);
2919 for (a=0; a<A->len_aln; a++)
2923 fprintf ( stdout, "\nPOS %4d %4d %4d %c ", r3[a][0]+1, r3[a][1], r3[a][2], cons[r3[a][0]]);
2924 for (b=0; b<SAR->nseq; b++)fprintf ( stdout, "%d", r4[r3[a][0]][b]);
2925 if (do_tm)fprintf ( stdout, " %s",alnpos2hmmtop_pred (A, NULL, r3[a][0], VERBOSE));
2928 for (a=0; a< A->nseq; a++)fprintf ( stdout, "\n#MSA >%s\n#MSA %s",A->name[a], A->seq_al[a]);
2929 fprintf ( stdout, "\n#MSA >cons\n#MSA %s", cons);
2933 int *** simple_sar_predict (Alignment *A, Alignment *SAR, char *mode)
2935 //This function estimates the z score of every poition with every compound
2936 //The best Z-score position is then used for the prediction
2938 int a, b, c, nts, pos, Rscore,Zscore;
2944 aa=declare_int (2,256);
2945 pred=declare_arrayN (3, sizeof (int),SAR->nseq, A->nseq, 5);
2948 r=simple_sar_analyze_col (A, SAR, mode);
2949 nts=SAR->len_aln; //number_trainning_sequences;
2951 for (a=0; a<SAR->nseq; a++)
2953 sort_int (r[a],4, 2, 0, A->len_aln-1);
2955 pos=r[a][A->len_aln-1][3]; //Best Position
2956 Zscore=r[a][A->len_aln-1][2]; //Best Z-Score
2957 Rscore=r[a][A->len_aln-1][0]; //Best Z-Score
2960 for (c=0; c<nts; c++)
2963 if (SAR->seq_al[a][c]=='I')aa[1][(int)A->seq_al[c][pos]]++;//Build Positive Alphabet for Compound a
2964 if (SAR->seq_al[a][c]=='O')aa[0][(int)A->seq_al[c][pos]]++;//Build Positive Alphabet for Compound a
2966 for (c=nts; c<A->nseq; c++)
2969 pred[a][c][1]=Zscore;
2970 pred[a][c][2]=Rscore;
2971 if (aa[1][(int)A->seq_al[c][pos]]>0)
2974 pred[a][c][3]=aa[1][(int)A->seq_al[c][pos]];
2975 pred[a][c][4]=aa[0][(int)A->seq_al[c][pos]];
2978 for (c=0; c<nts; c++)aa[0][(int)A->seq_al[c][pos]]=aa[1][(int)A->seq_al[c][pos]]=0;
2981 for ( a=nts; a< A->nseq; a++)
2983 for ( b=0; b<SAR->nseq; b++)
2985 fprintf ( stdout, ">%-25s %-25s Pos %3d ZScore %3d Rscore %3d Activity +: %d -: %d ", A->name [a], SAR->name[b], pred[b][a][0],pred[b][a][1], pred[b][a][2], pred[b][a][3], pred[b][a][4]);
2986 if (pred[b][a][4]==0)for (c=0; c<pred[b][a][3]; c++)fprintf ( stdout, "*");
2987 fprintf ( stdout, "\n" );
2988 for (c=0; c<A->nseq; c++)fprintf ( stdout, "%c", A->seq_al[c][ pred[b][a][0]]);
2989 fprintf ( stdout, " %s\n", SAR->name[b]);
2990 for (c=0; c<A->nseq-1; c++)fprintf ( stdout, "%c", SAR->seq_al[b][c]);
2991 fprintf ( stdout, " %s\n", SAR->name[b]); fprintf ( stdout, "\n");
2997 int *pair_seq2seq (int *iseq, char *seq1, char *seq2);
2998 int **simple_sar_analyze_pair_col ( Alignment *inA, Alignment *SAR, char *mode)
3003 static int **result, **fresult;
3007 double sum, sum2, score;
3013 result=declare_int (inA->len_aln*inA->len_aln,5);
3015 fresult=declare_int (inA->len_aln*nresults*SAR->nseq, 5);
3019 A=rotate_aln (inA, NULL);
3022 for (n2=0,a=0; a<SAR->nseq; a++)
3025 for (n=0, sum=0, sum2=0,b=0; b<A->nseq-1; b++)
3027 for ( c=b+1; c<A->nseq; c++, n++)
3030 iseq=pair_seq2seq (iseq,A->seq_al[b], A->seq_al[c]);
3032 score=sar_vs_iseq1(SAR->seq_al[a],iseq,maxgapratio,NULL,&aa);
3033 else if (sar_mode==4)
3034 score=sar_vs_iseq4(SAR->seq_al[a],iseq,maxgapratio,NULL,&aa);
3035 //HERE ("%d", (int)score);
3036 result[n][0]=a;//compound;
3037 result[n][1]=b; //pos1
3038 result[n][2]=c; //pos2
3039 result[n][3]=(int)score;
3046 result[c][4]=(int)10*return_z_score ((double)result[c][3], sum, sum2,n); //ZScore
3047 sort_int (result,5,3,0,n-1);
3048 for ( b=n-nresults; b<n; b++, n2++)
3051 for ( c=0; c<5; c++)fresult[n2][c]=result[b][c];
3054 sort_int (fresult,5,3,0,n2-1);
3055 fresult[n2][0]=-1; //Terminator;
3060 int *pair_seq2seq (int *iseq, char *seq1, char *seq2)
3067 lu=declare_int (256,256);
3068 for ( n=a=0; a<256; a++)
3069 for (b=0; b<256; b++)
3073 if (!iseq)iseq=vcalloc ( strlen (seq1)+1, sizeof (int));
3074 for ( a=0; a< strlen (seq1); a++)
3076 if (is_gap(seq1[a]) || is_gap(seq2[a]))iseq[a]=-1;
3077 else iseq[a]=lu[(int)seq1[a]][(int)seq2[a]];
3082 int ***simple_sar_analyze_col ( Alignment *inA, Alignment *SAR, char *mode)
3085 double score=0, best_score=0;
3089 static int ***result;
3097 strget_param (mode, "_METHOD_", "3", "%d", &sar_mode);
3099 result=declare_arrayN (3,sizeof (int),SAR->nseq, inA->len_aln,4);
3102 sim=aln2sim_mat (inA, "idmat");
3103 A=rotate_aln (inA, NULL);
3106 for ( a=0; a<SAR->nseq; a++)
3108 best_pos=best_score=0;
3109 for ( sum=0, sum2=0,b=0; b<A->nseq; b++)
3113 score=sar_vs_seq1(SAR->seq_al[a], A->seq_al[b],maxgapratio, sim, &aa);
3114 else if ( sar_mode==2)
3115 score=sar_vs_seq2(SAR->seq_al[a], A->seq_al[b],maxgapratio, sim, &aa);
3116 else if (sar_mode ==3)
3117 score=sar_vs_seq3(SAR->seq_al[a], A->seq_al[b],maxgapratio, sim, &aa);
3118 else if (sar_mode ==4)
3119 score=sar_vs_seq4(SAR->seq_al[a], A->seq_al[b],maxgapratio, sim, &aa);
3120 else if (sar_mode ==5)
3121 score=sar_vs_seq5(SAR->seq_al[a], A->seq_al[b],maxgapratio, sim, &aa);
3124 result[a][b][0]+=score*10;
3127 sum+=result[a][b][0];
3128 sum2+=result[a][b][0]*result[a][b][0];
3131 for ( b=0; b< A->nseq; b++)result[a][b][2]=10*return_z_score ((double)result[a][b][0], sum, sum2, A->nseq); //Score
3137 int *seq2iseq ( char *seq);
3138 double sar_vs_seq4 ( char *sar, char *seq, float gl, int **sim, char *best_aa)
3141 return sar_vs_iseq4 (sar, seq2iseq(seq), gl, sim, best_aa);
3143 double sar_vs_seq1 ( char *sar, char *seq, float gl, int **sim, char *best_aa)
3146 return sar_vs_iseq1 (sar, seq2iseq(seq), gl, sim, best_aa);
3149 int *seq2iseq ( char *seq)
3151 static int *iseq, clen;
3154 if (!iseq || clen<strlen (seq))
3158 iseq=vcalloc (clen+1, sizeof (int));
3160 for (a=0; a<strlen (seq); a++)
3167 double sar_vs_iseq1 ( char *sar, int *seq, float gl, int **sim, char *best_aa)
3169 double score=0, return_score=0;
3170 int RN,N11, Nmsa, Nsar, N, N10, N01, N00;
3171 int a, b, r, s, res, res1;
3173 static int *aa, *aal, naa;
3175 /*measure the E-Value for every amino acid. Returns the best one*/
3181 Ng+=(is_gap(seq[a])|| seq[a]==-1);
3184 if (Ng>gl) return 0;
3188 aa=vcalloc (256*256, sizeof(int));
3189 aal=vcalloc (N, sizeof (int));
3192 for ( a=0; a<N; a++)
3202 for (a=0; a<naa; a++)
3207 if (res==-1 || !aa[res]);
3210 RN=Nmsa=Nsar=N11=N10=N01=N00=0;
3216 if (res1=='-' || res1==-1)r=0;
3217 else r=(res1==res)?1:0;
3220 s=(sar[b]=='I')?1:0;
3226 N00+=(!r && !s)?1:0;
3232 score=evaluate_sar_score1 ( RN, N11, Nmsa, Nsar);
3240 if ( score>return_score)
3247 for ( a=0; a<N; a++)aa[seq[a]]=0;
3249 return return_score;
3251 double sar_vs_seq5 ( char *sar, char *seq, float gl, int **sim, char *best_aa)
3254 int N11, Nmsa, Nsar, N, N10, N01, N00;
3259 //measure the E-Value if all the 1AA are considered like alphabet 1*/
3260 //This function is testing ONE Compound (sar) against one column of MSA (seq) containing N sequences
3261 //Returns 0 if the active and inactive alphabet overlap
3263 //N is the nmber of sequences
3269 if (Ng>gl) return 0;
3271 //Identify all the AA associated with a I (Positive alphabet)
3272 aa=vcalloc ( 256, sizeof (int));
3275 s=(sar[b]=='I')?1:0;
3276 if (s)aa[(int)seq[b]]=1;
3278 N11=N10=N01=N00=Nmsa=Nsar=0;
3285 s=(sar[b]=='I')?1:0;
3291 N00+=(!r && !s)?1:0;
3293 if (N10>=1 || N01>=1) return 0;
3296 score=evaluate_sar_score1 ( N, N11, Nmsa, Nsar);
3307 double sar_vs_iseq4 ( char *sar, int *seq, float gl, int **sim, char *best_aa)
3314 /*Correlation between AA conservation and Activity*/
3318 Ng+=(is_gap(seq[a]) || seq[a]==-1)?1:0;
3322 if (Ng>gl) return 0;
3325 if (!aa)aa=declare_int(2,257*257);
3326 for (No=Ni=b=0; b<N; b++)
3329 s=(sar[b]=='I')?1:0;
3330 if (s){aa[1][seq[b]]++;Ni++;}
3331 else {aa[0][seq[b]]++;No++;}
3333 for (r=0,b=0; b<N; b++)
3335 if (aa[1][(int) seq[b]]==Ni)
3337 r=(No-aa[0][(int)seq [b]])*100/No;
3342 for (c=0; c<N; c++)aa[0][seq[c]]=aa[1][seq[c]]=0;
3347 double sar_vs_seq3 ( char *sar, char *seq, float gl, int **sim, char *best_aa)
3350 int N11, Nmsa, Nsar, N, N10, N01, N00;
3355 //measure the E-Value if all the 1AA are considered like alphabet 1*/
3356 //This function is testing ONE Compound (sar) against one column of MSA (seq) containing N sequences
3359 //N is the nmber of sequences
3365 if (Ng>gl) return 0;
3367 //Identify all the AA associated with a I (Positive alphabet)
3368 aa=vcalloc ( 256, sizeof (int));
3371 s=(sar[b]=='I')?1:0;
3372 if (s)aa[(int)seq[b]]=1;
3374 N11=N10=N01=N00=Nmsa=Nsar=0;
3381 s=(sar[b]=='I')?1:0;
3387 N00+=(!r && !s)?1:0;
3392 score=evaluate_sar_score1 ( N, N11, Nmsa, Nsar);
3401 double sar_vs_seq2 ( char *sar, char *seq, float gl, int **sim_mat, char *best_aa)
3403 double score=0, return_score=0;
3404 int L,N11, Nmsa, Nsar,N10, N01, N;
3405 int a, b,c,d, r1, s1,r2, s2, res;
3410 /*Weighted E-Value Similarity*/
3416 if (Ng>gl) return 0;
3417 for (a=0; a<26; a++)
3420 N=Nmsa=Nsar=N11=N10=N01=0;
3422 for (d=0,b=0; b<L; b++)d+=((tolower(seq[b]))==res)?1:0;
3423 if ( d==0) continue;
3427 r1=(tolower(seq[b])==res)?1:0;
3428 s1=(sar[b]=='I')?1:0;
3429 for ( c=0; c<L; c++)
3431 r2=(tolower(seq[c])==res)?1:0;
3432 s2=(sar[c]=='I')?1:0;
3434 sprintf ( string, "%d%d%d%d", r1,s1, r2, s2);
3435 sim= sim_mat[b][c]/10;
3438 if (strm (string, "0000")) {w=diff;N+=2*w;}
3439 else if ( strm (string, "0011")){w=sim ;N+=2*w ; N11+=w ;N10+=0 ;N01+=w ;Nmsa+=w ;Nsar+=w;}
3440 else if ( strm (string, "1010")){w=diff;N+=2*w ; N11+=0 ;N10+=2*w ;N01+=0 ;Nmsa+=2*w ;Nsar+=0;}
3441 else if ( strm (string, "0101")){w=diff;N+=2*w; N11+=0 ;N10+=0 ;N01+=2*w ;Nmsa+=0 ;Nsar+=2*w;}
3442 else if ( strm (string, "1111")){w=diff;N+=2*w; N11+=2*w;N10+=0 ;N01+=0 ;Nmsa+=2*w ;Nsar+=2*w;}
3443 else if ( strm (string, "1001")){w=sim; N+=2*w; N11+=0 ;N10+=w ;N01+=w ;Nmsa+=w;Nsar+=w;}
3444 else if ( strm (string, "0110")){w=sim; N+=2*w; N11+=0 ;N10+=w ;N01+=w ;Nmsa+=w;Nsar+=w;}
3450 score=evaluate_sar_score1 ( N, N11, Nmsa, Nsar);
3452 return_score=MAX(return_score, score);
3454 if ( return_score <0)fprintf ( stderr, "\n%.2f", return_score);
3455 return return_score;
3458 float get_sar_sim (char *seq1, char *seq2)
3461 int n11=0, n10=0, n01=0, n00=0;
3465 for ( a=0; a<l; a++)
3467 s=(seq1[a]=='O')?0:1;
3468 r=(seq2[a]=='O')?0:1;
3470 n00+=(!s && !r)?1:0;
3475 if ( n11==0) return 0;
3476 else return ((float)(n11)*100)/(float)(n11+n10+n01);
3478 float get_sar_sim2 (char *seq1, char *seq2)
3481 int n11=0, n10=0, n01=0, n00=0, Ns=0, Nr=0;
3485 for ( a=0; a<l; a++)
3487 s=(seq1[a]=='O')?0:1;
3488 r=(seq2[a]=='O')?0:1;
3493 n00+=(!s && !r)?1:0;
3498 if ( n11==0) return 0;
3503 score1=evaluate_sar_score1 (l, n11, Ns, Nr);
3507 float sar_aln2cor (Alignment *A);
3508 int sar_aln2ev (Alignment *A);
3510 int sarseq2ev ( char *s1, char *s2, int MODE);
3511 char* sarseq2anti_sarseq (char *seq_in, char *seq_out);
3512 Sequence * compare_sar_sequence( Sequence *S1, Sequence *S2, int depth)
3514 int a, b, c, d, e, f;
3515 char *s1, *s2, *s3,*s4,*s5, *n1, *n2, *n3, *n4, *n5;
3519 char *a1, *a0, *a2, *a3, *a4;
3522 if ( depth>max_depth)
3524 printf_exit (EXIT_FAILURE, stderr,"maximum depth: %d", max_depth);
3526 if ( depth==0) depth=2;
3527 A=declare_aln2 (strlen (S1->seq[0]),depth);
3530 A->len_aln=strlen (S1->seq[0]);
3532 for (a=0; a< S1->nseq; a++)
3533 for ( b=0; b<S2->nseq; b++)
3536 sprintf (a0, "%s", S1->seq[a]);
3537 sprintf (a1, "%s", S2->seq[b]);
3539 if ( strlen (a0)!=strlen (a1))
3541 add_warning (stderr, "WARNING %s (%d) and %s (%d) do not have the same length", S1->name[a], strlen (S1->seq[a]), S2->name[b], strlen (S2->seq[b]));
3545 fprintf ( stdout, ">2 %15s %15s CORR: %.3f EVAL: %5d\n",S1->name[a], S2->name[b], sar_aln2cor (A), sar_aln2ev (A));
3546 sarseq2anti_sarseq (S1->seq[a],a0);
3547 fprintf ( stdout, ">2 %15s %15s ANTI: %.3f EVAL: %5d\n", S1->name[a], S2->name[b], sar_aln2cor (A), sar_aln2ev (A));
3552 for (c=b+1; c<S2->nseq; c++)
3554 sprintf (a0, "%s", S1->seq[a]);
3555 sprintf (a1, "%s", S2->seq[b]);
3556 sprintf (a2, "%s", S2->seq[c]);
3557 fprintf ( stdout, ">2 %15s %15s %15s CORR: %.3f EVAL: %5d\n",S1->name[a], S2->name[b],S2->name[c], sar_aln2cor (A), sar_aln2ev (A));
3558 sarseq2anti_sarseq (S1->seq[a],a0);
3559 fprintf ( stdout, ">2 %15s %15s ANTI: %.3f EVAL: %5d\n", S1->name[a], S2->name[b], S2->name[c],sar_aln2cor (A), sar_aln2ev (A));
3565 for (d=c+1; d<S2->nseq; d++)
3567 sprintf (a0, "%s", S1->seq[a]);
3568 sprintf (a1, "%s", S2->seq[b]);
3569 sprintf (a2, "%s", S2->seq[c]);
3570 sprintf (a3, "%s", S2->seq[d]);
3572 fprintf ( stdout, ">2 %15s %15s %15s %15s CORR: %.3f EVAL: %5d\n",S1->name[a], S2->name[b],S2->name[c],S2->name[d], sar_aln2cor (A), sar_aln2ev (A));
3573 sarseq2anti_sarseq (S1->seq[a],a0);
3574 fprintf ( stdout, ">2 %15s %15s %15s %15s ANTI: %.3f EVAL: %5d\n", S1->name[a], S2->name[b], S2->name[c],S2->name[d],sar_aln2cor (A), sar_aln2ev (A));
3580 for (e=d+1; e<S2->nseq; e++)
3582 sprintf (a0, "%s", S1->seq[a]);
3583 sprintf (a1, "%s", S2->seq[b]);
3584 sprintf (a2, "%s", S2->seq[c]);
3585 sprintf (a3, "%s", S2->seq[d]);
3586 sprintf (a4, "%s", S2->seq[d]);
3595 char* sarseq2anti_sarseq (char *seq_in, char *seq_out)
3598 if (!seq_out)seq_out=vcalloc (strlen (seq_in)+1, sizeof (char));
3599 for (a=0; a<strlen (seq_in); a++)seq_out[a]=(seq_in[a]=='I')?'O':'I';
3602 float sar_aln2cor (Alignment *A)
3604 float n1, n11, tot_cor;
3608 for (n=0,a=0; a<A->nseq-1; a++)
3609 for (b=a+1; b<A->nseq; b++)
3611 for (n11=n1=0,c=0; c<A->len_aln; c++)
3613 n11+=(A->seq_al[a][c]=='I' && A->seq_al[b][c]=='I');
3614 n1+= (A->seq_al[a][c]=='I' || A->seq_al[b][c]=='I');
3616 tot_cor+=(n1==0)?0:n11/n1;
3622 int sarseq_pair2ev ( char *s1, char *s2,int mode);
3623 int sar_aln2ev (Alignment *A)
3626 int a, b, c, tot=0, n=0;
3629 for (a=0; a<A->nseq-1; a++)
3630 for (b=a+1; b<A->nseq; b++)
3632 tot+=sarseq_pair2ev (A->seq_al[a], A->seq_al[b], 1);
3637 int sarseq_pair2ev ( char *s1, char *s2,int mode)
3639 int l, t1, t2, t11,a, n1, n2, s;
3640 if ( (l=strlen (s1))!=strlen (s2))
3652 for (t1=t2=t11=0,a=0; a<l; a++)
3654 if ( mode==1)//CORRELATED
3656 n1=(s1[a]=='I')?1:0;
3657 n2=(s2[a]=='I')?1:0;
3659 else if ( mode==0)//ANTICORRELATED
3661 n1=(s1[a]=='I')?1:0;
3662 n2=(s2[a]=='O')?1:0;
3672 s=(int)(100*evaluate_sar_score1 (l, t11, t1, t2));
3676 double evaluate_sar_score1 ( int N, int n11, int n1msa, int n1sar)
3684 if ( n11==0)return 0;
3685 // if ( (n10)>n11 || n01>n11)return 0;
3687 p1= M_chooses_Nlog (n1msa, N) + M_chooses_Nlog (n1sar-n11, N-n1msa) + M_chooses_Nlog (n11, n1msa);
3688 p2=(M_chooses_Nlog (n1msa, N)+ M_chooses_Nlog (n1sar, N));
3694 double evaluate_sar_score2 ( int N, int n11, int n1msa, int n1sar)
3698 return n11-((n1msa-n11)+(n1sar-n11));
3700 if ( n11<n1msa) return 0;
3701 else if ( n11<n1sar) return 0;
3702 else if ( n11==N)return 0;
3707 int benchmark_sar( int value)
3714 for (a=0; a< 1000; a++)v[a]=0;
3821 Alignment *weight2sar (Alignment *A, Alignment *SAR, char *weight_file, int limit)
3828 weight=vcalloc (SAR->nseq, sizeof (int**));
3831 list=file2list (weight_file, " ");
3834 for (a=0; a< SAR->nseq; a++)
3839 if ( strm (list[b][1], SAR->name[a]) && atoi (list[b][3])>0)c++;
3843 weight[a]=declare_int (c+1, 3);
3844 fprintf ( stderr, "\n%s %d", SAR->name[a], c);
3848 if ( strm (list[b][1], SAR->name[a]) && atoi (list[b][3])>0)
3850 weight[a][c][0]=atoi(list[b][2])-1;
3851 weight[a][c][1]=list[b][5][0];
3852 weight[a][c][2]=atoi (list[b][3]);
3860 for (a=0; a<A->nseq; a++)
3862 fprintf ( stdout, ">%s\n", A->name[a]);
3863 for ( b=0; b< SAR->nseq; b++)
3865 score=seq2weighted_sar_score(A->seq_al[a], weight[b]);
3866 fprintf ( stdout, "%c", (score>limit)?'I':'O');
3868 fprintf (stdout, "\n");
3870 myexit (EXIT_SUCCESS);
3874 Alignment *display_sar ( Alignment *A, Alignment *SAR, char *compound)
3879 c=name_is_in_list ( compound, SAR->name, SAR->nseq, 100);
3880 if ( c==-1)return A;
3882 for ( a=0; a< A->nseq; a++)
3884 sprintf (name, "%s", A->name[a]);
3885 sprintf ( A->name[a], "%c_%s_%s", SAR->seq_al[c][a], name,compound);
3889 Alignment *aln2weighted_sar_score ( Alignment *A,Alignment *SAR, char *weight_file, char *compound)
3902 c=name_is_in_list (compound, SAR->name, SAR->nseq, 100);
3905 list=file2list (weight_file, " ");
3909 if (strm (list[a][1], compound))b++;
3912 weight=declare_int ( b+1, 3);
3918 if ( !strm (list[a][1], compound) || strm ("TOTPOS", list[a][1]));
3921 weight[b][0]=atoi(list[a][2])-1;
3922 weight[b][1]=list[a][5][0];
3923 weight[b][2]=atoi(list[a][3]);
3929 for ( a=0; a< A->nseq; a++)
3931 score=seq2weighted_sar_score (A->seq_al[a], weight);
3932 reactivity=(!SAR || c==-1)?'U':SAR->seq_al[c][a];
3934 sprintf (A->seq_comment[a], "Compound %-15s Reactivity %c SAR_SCORE %5d", compound,reactivity, (int) score);
3940 float seq2weighted_sar_score ( char *seq, int **weight)
3946 while (weight[a][0]!=-1)
3952 if ( is_gap(seq[p]));
3953 else if ( tolower(seq[p])==r)score+=w;
3959 Alignment * sar2simpred (Alignment *A, Alignment *SAR, char *posfile, char *compound, int L1,int L2 )
3961 int a, b, c, c1, c2;
3962 int **sim, **sim_ref, npred=0;
3963 float n11, n10, n01, n00;
3967 int N11=1, N01=2, N10=3, NXX=4, SIM=5;
3973 tot=declare_arrayN(3,sizeof (float), 10, 6, 2);
3975 sim_ref=aln2sim_mat (A, "idmat");
3976 if (strm (posfile, "all"))
3981 B=copy_aln ( A,NULL);
3982 B=extract_aln3(B,posfile);
3984 /*if (B->len_aln==0)L1=100;
3986 L1=((B->len_aln-1)*100)/B->len_aln;
3990 sim=aln2sim_mat (B, "idmat");
3993 for (a=0; a< A->nseq-1; a++)
3995 for ( b=a+1; b< A->nseq; b++)
3997 for ( c=0; c<SAR->nseq; c++)
3999 if ( (strm (compound, SAR->name[c]) || strm ( compound, "all")))
4001 /*if ( sim_ref[a][b]<30 || sim_ref[a][b]>60)continue;*/
4002 i1=0; /*sim_ref[a][b]/10;if (i1==10)i1--;*/
4007 c1=(SAR->seq_al[c][a]=='I')?1:0;
4008 c2=(SAR->seq_al[c][b]=='I')?1:0;
4011 n01=(!c1 && c2)?1:0;
4012 n10=(c1 && !c2)?1:0;
4013 n00=(!c1 && !c2)?1:0;
4015 tot[i1][N11][0]+=n11;
4016 tot[i1][N01][0]+=n01;
4017 tot[i1][N10][0]+=n10;
4018 /*tot[i1][N00][0]+=n00;*/
4020 tot[i1][SIM][0]+=sim_ref[a][b];
4024 tot[i1][N11][1]+=n11;
4025 tot[i1][N01][1]+=n01;
4026 tot[i1][N10][1]+=n10;
4027 /*tot[i1][N00][1]+=n00;*/
4029 tot[i1][SIM][1]+=sim_ref[a][b];
4038 sp=(tot[a][N11][0])/(tot[a][N11][0]+tot[a][N10][0]);
4039 fprintf ( stdout, "\n%15s N11 %5d SP %.2f ",compound, (int)tot[a][N11][0],sp);
4040 sp=((tot[a][N11][1]+tot[a][N10][1])==0)?1:(tot[a][N11][1])/(tot[a][N11][1]+tot[a][N10][1]);
4041 sn=(tot[a][N11][0]==0)?1:(tot[a][N11][1]/tot[a][N11][0]);
4042 fprintf ( stdout, " N11 %5d SP %.2f SN %.2f SIM %.2f", (int)tot[a][N11][1], sp,sn, (tot[a][SIM][1]/tot[a][NXX][1]));
4046 sp=((n11+n01)==0)?1:n11/(n11+n01);
4047 sn=((n11+n01)==0)?1:n11/(n11+n10);
4049 fprintf ( stdout, "\nLimit: %d NPRED %d AVGSIM %d SN %.2f SP %.2f TP %d FP %d FN %d",L1, npred, tot_sim, sn, sp, (int)n11, (int)n01, (int)n10);
4050 myexit (EXIT_SUCCESS);
4054 Alignment * sar2simpred2 (Alignment *A, Alignment *SAR, char *seqlist, char *posfile, char *compound, int L )
4056 int a,b, c,c1, c2, p, s;
4057 float n11, n10, n01, n00, n, sn2, prediction,sp, n1, n0, t, entropy, Delta;
4058 int *rlist, *tlist, *pred, *npred, tsim, psim;
4059 int **sim, **sim_ref;
4067 out=vcalloc (A->nseq+1, sizeof (char));
4068 rlist=vcalloc ( A->nseq, sizeof (int));
4069 tlist=vcalloc ( A->nseq, sizeof (int));
4070 pred=vcalloc(2, sizeof (int));
4071 npred=vcalloc(2, sizeof (int));
4074 if ( strm (seqlist, "first"))
4076 for ( a=0; a<SAR->nseq; a++)
4078 if ( strm ( compound, SAR->name[a]))
4080 for ( b=0; b<A->nseq; b++)
4082 if ( SAR->seq_al[a][b]=='I')
4084 fprintf ( stderr, "COMP: %s REF SEQ: %s\n", A->name[b], compound);
4086 tlist[rlist[nrs]]=1;
4094 else if (strm (seqlist, "all"))
4096 for ( a=0; a< A->nseq; a++)
4103 else if ((a=name_is_in_list ( seqlist, A->name, A->nseq, 100))!=-1)
4106 tlist[rlist[nrs]]=1;
4112 R=main_read_aln (seqlist, NULL);
4113 for (a=0; a<R->nseq; a++)
4115 rlist[a]=name_is_in_list( R->name[a], A->name, A->nseq, 100);
4121 c=name_is_in_list ( compound, SAR->name, SAR->nseq, 100);
4123 sim_ref=aln2sim_mat (A, "idmat");
4124 if (strm (posfile, "all"))
4131 B=copy_aln ( A,NULL);
4132 B=extract_aln3(B,posfile);
4133 sim=aln2sim_mat (B, "idmat");
4136 n11=n10=n01=n00=n=n1=n0=0;
4138 for (a=0; a<A->nseq; a++)
4140 if ( tlist[a] && !strm (seqlist, "all"))
4141 out[a]=(SAR->seq_al[c][a]=='I')?'Z':'z';/*SAR->seq_al[c][a];*/
4146 npred[0]=npred[1]=1;
4147 c1=(SAR->seq_al[c][a]=='I')?1:0;
4148 for (nr=0,tsim=0,psim=0,b=0; b<nrs; b++)
4150 if ( SAR->seq_al[c][rlist[b]]=='o');
4153 c2=(SAR->seq_al[c][rlist[b]]=='I')?1:0;
4156 tsim+=sim_ref[a][rlist[b]];
4157 psim+=sim[a][rlist[b]];
4171 Delta=pred[1]-pred[0];
4173 if (Delta<-delta_max){p=0;out[a]= (c1==0)?'O':'o';}
4174 else if (Delta>delta_max){p=1;out[a]=(c1==1)?'I':'i';}
4175 else {p=-1; out[a]=(c1==1)?'U':'u';}
4178 else if ( p && c1)n11++;
4179 else if ( p && !c1)n10++;
4180 else if ( !p && !c1)n00++;
4181 else if ( !p && c1)n01++;
4184 if (printall)fprintf ( stdout, ">%-15s %d %c OVERALL_SIM:%d POSITION_SIM %d\n%s\n", B->name[a], c1, out[a],tsim/nrs,psim/nrs,B->seq_al[a]);
4187 sp=((n11+n10)==0)?1:n11/(n11+n10);
4188 sn2=((n1)==0)?1:n11/n1;
4189 prediction=(n11+n00)/(n1+n0);
4190 entropy=(float)(M_chooses_Nlog (nr, nrs)/M_chooses_Nlog(nrs/2, nrs));
4192 fprintf ( stdout, ">%-15s Sp %.2f Sn %.2f Pred %.2f E %.2f\n", compound,sp, sn2,prediction,entropy );
4193 fprintf ( stdout, "%s\n", out);
4195 myexit (EXIT_SUCCESS);
4198 /************************************************************************************/
4199 /* ALIGNMENT ANALYZE : SAR FOR OR */
4200 /************************************************************************************/
4202 void display_or_help();
4203 void display_or_help()
4205 fprintf ( stdout, "\nor_sar options:");
4206 fprintf ( stdout, "\n_ORCL_: Command_line in a file");
4208 fprintf ( stdout, "\n_ROTATE_ : rotate the sar matrix (if each entry is a compound rather than a sequence)");
4209 fprintf ( stdout, "\n_JNIT_ : number cycles of Jacknife");
4210 fprintf ( stdout, "\n_JNSEQ_ : Number of sequences picked up in alignment [0 to keep them all, 10 by default]");
4211 fprintf ( stdout, "\n_JSAR_ : Number of compounds picked up in the SAR matrix [0 to keep them all => default");
4212 fprintf ( stdout, "\n_JRSAR_ : Randomization of the SAR file between each JNIT iteration: S, C, R, SC, SR... (S: seq, C, column, R: residue");
4213 fprintf ( stdout, "\n_JRALN_ : Randomization of the ALN file between each JNIT iteration: S, C, R, SC, SR... (S: seq, C, column, R: residue");
4216 fprintf ( stdout, "\n_NPOS_ : Number of positions used to make the prediction [4 by default]");
4217 fprintf ( stdout, "\n_DEPTH_ : Depth of the motif degenerated alphabet [Default=2]");
4218 fprintf ( stdout, "\n_POSFILE_ : Predefined list of positions in a file<P: pos score");
4219 fprintf ( stdout, "\n_RSAR_ : Randomization of the SAR file: S, C, R, SC, SR... (S: seq, C, column, R: residue");
4220 fprintf ( stdout, "\n_RALN_ : Randomization of the SAR file: S, C, R, SC, SR... (S: seq, C, column, R: residue");
4222 fprintf ( stdout, "\n_SARTHR_ : E-value Threshold for the filtered displays in comploo mode (def: 3)");
4226 fprintf ( stdout, "\n_MODE_ : jack, loo (leave one out), pos, sim, predict, self");
4227 fprintf ( stdout, "\n_COMPLIST_: Compound_list provided in a FASTA file. Each compound MUST correspond to the corresponding column in SAR");
4228 fprintf ( stdout, "\n_OUTALN_ : print the alignment corresponding to the list of positions");
4229 fprintf ( stdout, "\n_OUTTREE : Make the tree (default nj) corrwesponding to _OUTALN_");
4230 fprintf ( stdout, "\n\n");
4231 exit (EXIT_SUCCESS);
4233 Alignment * simple_sar_or(Alignment *inA, Alignment *inS, char *param);
4234 void sar2jack (Alignment *A, Alignment *S, int nseq, int sarlen);
4237 void sar2jack (Alignment *A, Alignment *S, int nseq, int sarlen)
4239 A=aln2jacknife (A, nseq, 0);
4240 S=reorder_aln (S, A->name, A->nseq);
4241 S=aln2jacknife (S, 0, sarlen);
4243 Alignment *or_scan (Alignment *A,Alignment *S, char *pmode)
4247 int start, offset,w;
4251 fprintf ( stdout, "\nPARAMETERS: %s\n", pmode);
4252 fprintf ( stderr, "\nPARAMETERS: %s\n", pmode);
4254 strget_param (pmode, "_SW_", "15", "%d",&w);
4255 strget_param (pmode, "_SMINW_", "5", "%d",&start);
4256 strget_param (pmode, "_SSTART_", "5", "%d",&offset);
4257 strget_param (pmode, "_SMODE_", "single", "%s",&mode);
4260 l=intlen (A->len_aln);
4261 poslist=vcalloc ( A->len_aln, sizeof (int));
4264 for ( a=0; a< A->len_aln; a++)poslist[nl++]=a+1;
4266 if ( strm (mode, "single"))
4268 for ( a=0; a<A->len_aln-2; a++)
4271 for (gap=0,c=0; c<A->nseq; c++)gap+=(A->seq_al[c][a]=='-');
4275 B=extract_aln (A, a+1, a+2);
4276 B=or_sar (B, S, pmode, NO_PRINT);
4284 fprintf ( stdout, "P: %*d S: %4d\n",l,a+1, score);
4285 //fprintf ( stderr, "P: %*d S: %4d\n",l,a+1, score);
4289 else if strm (mode, "scan")
4292 for ( ax=0; ax<nl; ax++)
4295 int best_pos=0, best_w=0, best_start=0, best_end=0;
4297 for (b=start; b<=w; b++)
4304 if (pstart<1 || pstart>A->len_aln)continue;
4305 if (pend<1 || pend>A->len_aln)continue;
4306 B=extract_aln (A, a-w, a+w);
4308 B=or_sar (B, S,pmode, NO_PRINT);
4311 if (B->score_aln>=best_score){best_score=B->score_aln; best_pos=a;best_w=b;best_start=pstart; best_end=pend;}
4314 fprintf ( stdout, "P: %*d I: %*d %*d Score: %5d L: %2d\n", l,best_pos, l,best_start+offset, l,best_end, best_score,(best_w*2)+1 );
4317 else if ( strm ( mode, "scan_comp"))
4328 float best_sc, best_eval;
4334 index=aln2pos_simple (A, A->nseq);
4335 strget_param (pmode, "_NPOS_", "1", "%d", &len);
4336 strget_param (pmode, "_STH_", "0", "%d", &sth);
4337 strget_param (pmode, "_SEV_", "0", "%d", &sev);
4339 if (sev==0)for (a=0, sev=1; a<len; a++)sev*=A->len_aln;
4341 tresults=vcalloc ( A->len_aln, sizeof (int));
4342 poscache=vcalloc ( A->len_aln, sizeof (int));
4343 poslist=generate_array_int_list (len, 0, A->len_aln-1,1, &n, NULL);
4344 for (s=0; s<S->len_aln; s++)
4348 NS=aln2block (S, s+1, s+2, NS);
4349 fprintf ( stderr, "\nProcess: %s ...\n", get_compound_name(s, mode));
4351 for (best_sc=0,best_pos=0,best_word=NULL,a=0; a<n; a++)
4353 for (b=0; b<len; b++)poscache[poslist[a][b]]=1;
4354 word=or_id_evaluate2 (A,NS, pmode,poscache, NO_PRINT, &sc);
4355 //if (word)HERE ("%s", word);
4357 for (b=0; b<len; b++)poscache[poslist[a][b]]=0;
4358 if (word && sc>best_sc)
4366 else if ( word && sc>=best_sc)
4374 for (a=0; a<=A->nseq; a++)
4379 fprintf (stdout, "\n>%s PPPP: S: %s SC: %d EV: %d NBest: %d W: %s", get_compound_name(s, mode),(a==A->nseq)?"cons":A->name[a],(int)best_sc, (int) best_eval, nbest,best_word);
4380 for ( b=0; b<len; b++)
4384 p1=poslist[best_pos][b];
4385 if (best_sc>sth && nbest<sev)tresults[p1]++;
4390 p1=index[a][poslist[best_pos][b]];
4393 fprintf ( stdout, " %d ", p1);
4399 for ( p1=0; p1<A->len_aln;p1++)
4400 fprintf ( stdout, "\n>TOT_P: %d %d", p1+1, tresults[p1]);
4403 else if ( strm ( mode, "scan_comp_old"))
4405 Alignment *NS=NULL, *BLOCK=NULL;
4406 int **results, *tresults;
4411 results=declare_int (A->len_aln*A->len_aln, 3);
4412 tresults=vcalloc ( A->len_aln, sizeof (int));
4413 poscache=vcalloc ( A->len_aln, sizeof (int));
4414 for (s=0; s<S->len_aln; s++)
4417 NS=aln2block (S, s+1, s+2, NS);
4418 fprintf ( stderr, "\nProcess: %s ...", get_compound_name(s, mode));
4419 for (n=0,p1=0; p1<A->len_aln-w; p1++, count ++)
4421 if ( count == 50){fprintf ( stderr, "*");count=0;}
4422 for ( p2=p1; p2<A->len_aln-w; p2++, n++)
4427 BLOCK=alnpos2block (A,poscache,BLOCK);
4429 if ( aln2ngap(BLOCK)<=0)BLOCK=or_sar (BLOCK,NS, pmode, NO_PRINT);
4430 else BLOCK->score_aln=0;
4432 //if ( BLOCK->score_aln>0)HERE ("P: %d %d %d", p1, p2,BLOCK->score_aln);
4435 results[n][2]=BLOCK->score_aln;
4436 poscache[p1]=poscache[p2]=0;
4440 sort_int_inv (results, 3, 2, 0, n-1);
4441 for (p1=0; p1<npos; p1++)
4443 fprintf ( stdout, "\n>%s PPPP: %d %d SC: %d", get_compound_name(s, mode), results[p1][0]+1,results[p1][1]+1, results[p1][2]);
4444 fprintf ( stderr, "\n>%s PPPP: %d %d SC: %d", get_compound_name(s, mode), results[p1][0]+1,results[p1][1]+1, results[p1][2]);
4445 tresults[results[p1][0]]++;
4446 tresults[results[p1][1]]++;
4449 for ( p1=0; p1<A->len_aln;p1++)
4450 if ( tresults[p1])fprintf ( stdout, "\n>TOT_P: %d %d", p1+1, tresults[p1]);
4453 exit (EXIT_SUCCESS);
4457 ORP * or_sar_compound(Alignment *A, Alignment *S, char *mode, int print);
4458 ORP* set_orp_name ( ORP* P, char *name);
4459 ORP* set_orp_offset ( ORP* P,int offset);
4460 Alignment * or_sar(Alignment *inA, Alignment *inS, char *mode, int print)
4462 char rsar[4],raln[4], sarmode[100];
4463 int rotate, a, b, start, end;
4467 if ( mode && (strstr (mode, "help") || strstr (mode, "HELP")))
4470 strget_param (mode, "_SARMODE_", "single", "%s", &sarmode);//single | all
4471 strget_param (mode, "_RSAR_", "NO", "%s", rsar);
4472 strget_param (mode, "_RALN_", "NO", "%s", raln);
4473 strget_param (mode, "_ROTATE_", "0", "%d", &rotate);
4475 strget_param (mode, "_START_", "1", "%d", &start);start--;
4476 strget_param (mode, "_END_", "0", "%d", &end);
4478 A=copy_aln (inA, NULL);
4479 S=copy_aln (inS, NULL);
4481 if ( end==0)end=A->len_aln;
4482 if ( start!=0 || end!=A->len_aln)
4485 if ( start>=A->len_aln || end<=start || end >A->len_aln)
4486 printf_exit (EXIT_FAILURE, stderr, "ERROR: _START_%d and _END_%d are incompatible with the aln [len=%d][FATAL:%s]", start, end, A->len_aln, PROGRAM);
4487 for (b=0; b<A->nseq; b++)
4488 for (c=0,a=start; a<end; a++, c++)
4489 A->seq_al[b][c]=A->seq_al[b][a];
4491 for (b=0; b<A->nseq; b++)A->seq_al[b][c]='\0';
4495 S=aln2random_aln (S, rsar);
4496 A=aln2random_aln (A, raln);
4504 if (S->len_aln!=A->nseq)
4505 printf_exit ( EXIT_FAILURE,stderr, "ERROR: Alignment and SAR matrix are incompatible [FATAL:%s]", PROGRAM);
4507 rS=rotate_aln (S, NULL);
4509 for (a=0; a< A->nseq; a++)sprintf (rS->name[a], "%s", A->name[a]);
4514 R=vcalloc ( S->len_aln+2, sizeof (ORP*));
4515 if (strm (sarmode, "all"))R[0]=or_sar_compound (A, S, mode, print);
4516 else if ( strm (sarmode, "single"))
4518 for ( a=0; a<S->len_aln; a++)
4522 NS=aln2block (S, a+1, a+2, NS);
4524 R[a]=or_sar_compound (A, NS, mode, NO_PRINT);
4525 set_orp_name ((R[a]), get_compound_name (a, mode));
4526 set_orp_offset(R[a], start);
4527 display_or_summary (R[a], mode, stdout, print);
4534 ORS=combine_n_predictions (R, A, S);
4536 display_or_summary (ORS, mode, stdout, print);
4537 inA->score_aln=(int)(ORS->best*(float)1000);
4544 ORP* set_orp_offset ( ORP* P,int offset)
4546 if (!P) return NULL;
4550 return set_orp_offset(P->PR, offset);
4554 ORP* set_orp_name ( ORP* P, char *name)
4556 if (!P) return NULL;
4559 sprintf ( P->name, "%s", name);
4560 return set_orp_name(P->PR, name);
4564 ORP * combine_n_predictions (ORP**R, Alignment *A, Alignment *S)
4570 N=combine_2_predictions (R[a++], N, A, S);
4572 sprintf ( N->name, "ALL");
4573 sprintf ( N->mode, "COMBINED");
4578 ORP *combine_2_predictions ( ORP*IN, ORP *TO,Alignment *A, Alignment *S)
4584 TO=declare_or_prediction (IN->ncomp, IN->nseq, IN->len);
4587 TO->P=copy_aln(S, NULL);
4588 TO->offset=IN->offset;
4592 for (a=0; a< IN->len; a++)
4594 TO->pos[a]+=IN->pos[a];
4601 rates2sensitivity (TO->tp, TO->tn, TO->fp, TO->fn, &(TO->sp), &(TO->sn), &(TO->sen2), &(TO->best));
4604 for (a=0; a<(TO->A)->nseq; a++)
4606 (TO->P)->seq_al[a][TO->ncomp]=(IN->P)->seq_al[a][0];
4607 //(TO->S)->seq_al[a][TO->ncomp]=(IN->S)->seq_al[a][0];
4610 (TO->P)->len_aln=TO->ncomp;
4611 (TO->A)->score_aln=TO->best;
4618 ORP * display_or_summary (ORP *CP, char *mode, FILE *fp, int print)
4624 Alignment *A, *P, *S;
4634 pred=vcalloc ( P->nseq*P->len_aln*2, sizeof (char));
4635 exp=vcalloc ( P->nseq*P->len_aln*2, sizeof (char));
4636 motif=vcalloc (CP->len+1, sizeof (char));
4642 for ( a=0; a<P->nseq; a++)
4644 strcat (pred,P->seq_al[a]);
4645 strcat (exp, S->seq_al[a]);
4647 CP->evalue=profile2evalue(pred, exp);
4650 while ( CP->motif && CP->motif[a] && CP->motif[0][a][0])strcat (motif, CP->motif[0][a++]);
4654 fprintf (fp, "\n>%-10s Mode: %s Accuracy: %6.2f E-value: %6.2f Motif: ",CP->name,CP->mode, CP->best, CP->evalue);
4658 fprintf (fp, " %s",motif);
4660 for ( a=0; a<CP->len; a++)
4662 if ( CP->pos[a]) fprintf ( fp, "\n>%-10s Mode: %s P: cons %3d SC: %4d", CP->name, CP->mode, a+1+CP->offset, CP->pos[a]);
4664 fprintf ( fp, "\n");
4666 vfree (pred); vfree(motif); vfree(exp);
4667 if (CP->PR)display_or_summary (CP->PR, mode, fp, print);
4672 ORP * or_sar_compound(Alignment *A, Alignment *S, char *mode, int print)
4678 strget_param (mode, "_MODE_", "predict", "%s", rmode);
4682 if (strm (rmode, "jack"))P=or_jack (A, S, mode);
4683 else if (strm (rmode, "loo")) PR=or_loo (A, S, mode,NULL, print);
4684 else if (strm (rmode, "comploo")) P=or_comp_loo (A, S, mode,NULL, print);
4685 else if ( strm (rmode, "comppos")){or_comp_pos ( A, S, mode,print);exit(0); return NULL;}
4686 else if ( strm (rmode, "pos"))P=or_aln2pos_aln ( A, S, mode);
4687 else if ( strm (rmode, "predict"))P=or_predict ( A, S, mode);
4688 else if ( strm (rmode, "self"))PR=or_self_predict ( A, S, mode, NULL, print);
4689 else if ( strm (rmode, "sim"))P=or_sim ( A, S, mode);
4691 else if ( strm (rmode, "test"))P=or_test ( A, S, mode);
4694 printf_exit (EXIT_FAILURE, stderr, "ERROR: %s is an unknown mode of or_sar [FATAL:%s]\n", rmode,PROGRAM);
4700 PR=vcalloc (1, sizeof (ORP));
4705 Alignment * or_test ( Alignment *inA, Alignment *inS, char *mode)
4712 float or_id_evaluate ( Alignment *A, Alignment *S, char *mode, int *pos, int print)
4717 w=or_id_evaluate2(A,S, mode, pos,print, &score);
4722 char* or_id_evaluate2 ( Alignment *A, Alignment *S, char *mode, int *pos, int print, float *rscore)
4724 static char **words;
4728 int res, p,nl, w, c, s, exp, pred;
4730 float sn, sp, sen2, best, best_score;
4733 {free_char (words, -1);
4739 plist=pos2list (pos, A->len_aln, &nl);
4740 words=declare_char (A->nseq, nl+1);
4741 bword=vcalloc (nl+1, sizeof (char));
4742 for (p=0; p<nl; p++)
4744 for (s=0; s< A->nseq; s++)
4746 res=A->seq_al[s][plist[p]];
4747 if (res=='-'){or_id_evaluate2 (NULL, NULL, NULL, NULL, 0, NULL);vfree (bword);return 0;}
4752 for (best_score=0,w=0; w<A->nseq; w++)
4756 for (c=0; c<S->len_aln; c++)
4758 for (s=0; s<A->nseq; s++)
4760 exp=(S->seq_al[s][c]=='I')?1:0;
4761 pred=strm (words[w], words[s]);
4762 if ( exp && pred)tp++;
4763 else if ( exp && !pred)fn++;
4764 else if (!exp && !pred)tn++;
4765 else if (!exp && pred)fp++;
4768 rates2sensitivity (tp, tn, fp,fn,&sp, &sn, &sen2, &best);
4769 if ( best>best_score)
4772 sprintf (bword, "%s", words[w]);
4775 rscore[0]=(float)1000*best_score;
4776 or_id_evaluate2 (NULL, NULL, NULL, NULL, 0, NULL);
4780 float or_loo_evaluate2 ( Alignment *A, Alignment *S, char *mode, int *pos, int print)
4782 int c, s, p, res, sar;
4786 float sn, sp, sen2, best;
4787 char **words, **positive, **negative;
4790 plist=pos2list (pos, A->len_aln, &nl);
4792 words=declare_char (A->nseq, nl+1);
4794 for (p=0; p<nl; p++)
4796 for (s=0; s< A->nseq; s++)
4798 res=A->seq_al[s][plist[p]];
4799 if (res=='-'){vfree (plist);free_char (words, -1); return 0;}
4803 positive=vcalloc ( A->nseq, sizeof (char*));
4804 negative=vcalloc ( A->nseq, sizeof (char*));
4805 for (c=0; c<S->len_aln; c++)
4807 //Fill the match matrix
4808 for (p=0; p<nl; p++)
4810 for (s=0; s< A->nseq; s++)
4812 sar=S->seq_al[s][c];
4813 if (sar=='I')positive[s]=words[s];
4814 else if ( sar=='O')negative[s]=words[s];
4818 //Evaluate the scores
4819 for (s=0; s< A->nseq; s++)
4821 int pos=0, neg=0, pred;
4822 sar=S->seq_al[s][c];
4823 positive[s]=negative[s]=NULL;
4825 if ( name_is_in_list (words[s], positive, A->nseq, nl+1)!=-1)
4827 if ( name_is_in_list (words[s], negative, A->nseq, nl+1)!=-1)
4830 if (pos & !neg) pred=1;
4833 if ( pred && sar=='I')tp++;
4834 else if (!pred && sar=='I')fn++;
4835 else if (!pred && sar=='O')tn++;
4836 else if ( pred && sar=='O')fp++;
4838 if ( sar=='I')positive[s]=words [s];
4839 else negative[s]=words[s];
4843 vfree (negative); vfree (positive);
4844 vfree (plist); free_char (words, -1);
4845 rates2sensitivity (tp, tn, fp,fn,&sp, &sn, &sen2, &best);
4847 return (float)1000*best;
4849 float or_loo_evaluate ( Alignment *A, Alignment *S, char *mode, int *pos, int print)
4851 int c, s, p, res, sar;
4855 float sn, sp, sen2, best;
4858 plist=pos2list (pos, A->len_aln, &nl);
4859 matP=declare_int (nl, 256);
4860 matN=declare_int (nl, 256);
4862 for (c=0; c<S->len_aln; c++)
4864 //Fill the match matrix
4865 for (p=0; p<nl; p++)
4867 for (s=0; s< A->nseq; s++)
4869 res=A->seq_al[s][plist[p]];
4870 sar=S->seq_al[s][c];
4871 if (res=='-'){vfree (plist); free_int (matP, -1);free_int (matN, -1); return 0;}
4872 if (sar=='I')matP[p][res]++;
4873 if (sar=='O')matN[p][res]++;
4877 //Evaluate the scores
4878 for (s=0; s< A->nseq; s++)
4881 int pred, valP, valN;
4883 sar=S->seq_al[s][c];
4884 for (scoreN=0,scoreP=0,p=0; p<nl; p++)
4886 res=A->seq_al[s][plist[p]];
4888 valP=matP[p][res]-(sar=='I')?1:0;
4889 scoreP+=(valP>0)?1:0;
4891 valN=matN[p][res]-(sar=='O')?1:0;
4892 scoreN+=(valN>0)?1:0;
4895 if ( scoreP==nl && scoreN<nl)pred=1;
4899 if ( pred && sar=='I')tp++;
4900 else if (!pred && sar=='I')fn++;
4901 else if (!pred && sar=='O')tn++;
4902 else if ( pred && sar=='O')fp++;
4906 for (p=0; p<nl; p++)
4908 for (s=0; s< A->nseq; s++)
4910 res=A->seq_al[s][plist[p]];
4911 sar=S->seq_al[s][c];
4912 if (sar=='I')matP[p][res]=0;
4913 else matN[p][res]=0;
4918 vfree (plist); free_int (matP, -1);free_int (matN, -1);
4919 rates2sensitivity (tp, tn, fp,fn,&sp, &sn, &sen2, &best);
4921 return (float)1000*best;
4923 int* or_comp_pos ( Alignment *inA, Alignment *inS, char *mode,int print)
4925 Alignment *A=NULL, *S=NULL, *inS2=NULL;
4927 int *main_pos, *pos=NULL;
4929 set_sar (inA, inS, mode);
4930 main_pos=vcalloc ( inA->len_aln, sizeof (int));
4932 inS2=copy_aln (inS, NULL);
4937 //Run every SAR, one at a time
4938 for ( c=0; c< inS->len_aln; c++)
4942 fprintf ( stdout, ">%d\n", c);
4943 for (a=0; a< inS->nseq; a++)
4945 inS2->seq_al[a][0]=inS->seq_al[a][c];
4946 inS2->seq_al[a][1]='\0';
4953 pos=vcalloc (inA->len_aln, sizeof (int));
4954 A=copy_aln (inA, NULL);
4955 S=copy_aln (inS2, NULL);
4956 set_sar (A,S, mode);
4957 pos=aln2predictive_positions (A, S, mode,PRINT);
4959 for (max=0,b=0; b<A->len_aln; b++)
4961 main_pos[b]+=pos[b];
4962 if (main_pos[b]>max)
4970 for (a=0; a<A->nseq; a++)
4972 fprintf ( stdout, "\t");
4973 for ( b=0; b<A->len_aln; b++)
4974 if ( pos[b]) fprintf ( stdout, "%c", A->seq_al[a][b]);
4975 fprintf ( stdout, " %c\n", inS2->seq_al[a][0]);
4977 fprintf ( stdout, "\n\tBest: %d %d\n", p, max);
4983 for ( a=0; a<inA->len_aln; a++)fprintf ( stdout, "\nP2: cons %4d %4d [FINAL]", a+1, main_pos[a]);
4987 Alignment * or_comp_loo ( Alignment *inA, Alignment *inS, char *mode, int *pos,int print)
4990 char **keep, **remove;
4991 Alignment *A, *S, *P, *P1, *SEQ, *inS2;
4992 int **main_pos, *compound_pos, **comp_list;
4994 char *comp_pred, *comp_exp;
4997 strget_param (mode, "_SARTHRES_", "3", "%d", &sar_threshold);
4999 if (pos)pos_exists=1;
5000 set_sar (inA, inS, mode);
5001 P=copy_aln (inS, NULL);
5002 keep=declare_char (inA->nseq, MAXNAMES);
5003 remove=declare_char (inA->nseq, MAXNAMES);
5005 main_pos=declare_int ( inA->len_aln,4);
5006 comp_list=declare_int (inA->len_aln, sizeof (int*));
5007 inS2=copy_aln (inS, NULL);
5011 comp_pred=vcalloc ( inA->nseq+1, sizeof (int));
5012 comp_exp=vcalloc ( inA->nseq+1, sizeof (int));
5014 //Run every SAR, one at a time
5015 for ( c=0; c< inS->len_aln; c++)
5017 for (a=0; a< inS->nseq; a++)
5019 inS2->seq_al[a][0]=inS->seq_al[a][c];
5020 inS2->seq_al[a][1]='\0';
5022 vfree (compound_pos);
5023 compound_pos=vcalloc (inA->len_aln, sizeof (int));
5024 for (a=0; a<inA->nseq; a++)
5028 A=copy_aln (inA, NULL);
5029 S=copy_aln (inS2, NULL);
5031 for (n=0,b=0; b<A->nseq; b++)
5033 if (b!=a)sprintf (keep[n++ ], "%s", A->name [b]);
5035 sprintf ( remove[0], "%s", A->name[a]);
5036 reorder_aln (A,keep, A->nseq-1);
5038 set_sar (A,S, mode);
5041 pos=aln2predictive_positions (A, S, mode,NO_PRINT);
5043 for (b=0; b<A->len_aln; b++)
5045 compound_pos[b]+=pos[b];
5048 motifs=compounds2motifs (A, S, pos,0, mode, NO_PRINT);
5050 SEQ=copy_aln (inA, NULL);
5051 SEQ=reorder_aln (SEQ, remove, 1);
5053 P1=aln2prediction (SEQ, motifs, pos);
5054 comp_pred[a]=P1->seq_al[0][0];
5055 comp_exp[a]=inS2->seq_al[a][0];
5056 P->seq_al[a][c]=P1->seq_al[0][0];
5063 free_arrayN( (void *)motifs, 3);
5064 if (!pos_exists)vfree (pos);
5067 fprintf ( stdout, ">%-15s SC: %.2f E; %.2f\n%s\n%s\n", get_compound_name(c, mode),profile2sensitivity (comp_pred, comp_exp, NULL, NULL, NULL, NULL),profile2evalue(comp_pred, comp_exp),comp_pred, comp_exp);
5068 for (b=0; b<A->len_aln; b++)
5070 main_pos[b][2]+=compound_pos[b];
5071 if (compound_pos[b])main_pos[b][3]++;
5072 if (profile2evalue(comp_pred, comp_exp)>sar_threshold)
5074 main_pos[b][0]+=compound_pos[b];
5075 if (compound_pos[b])
5079 comp_list[b]=vrealloc (comp_list[b], sizeof (int)*(comp_list[b][0]+1));
5080 comp_list[b][comp_list[b][0]]=c;
5087 P->score_aln=(int)((float)1000*evaluate_prediction (P, inS, mode,print));
5091 for ( a=0; a<inA->len_aln; a++)fprintf ( stdout, "\nP: cons %4d RS: %4d RC: %5d FC: %4d %4d[FINAL]", a+1, main_pos[a][2], main_pos[a][3], main_pos[a][0], main_pos[a][1]);
5093 for ( a=0; a<inA->len_aln; a++)
5095 fprintf ( stdout, "\nP: cons %4d RS: %4d RC: %5d FC: %4d %4d CLIST: ", a+1, main_pos[a][2], main_pos[a][3], main_pos[a][0], main_pos[a][1]);
5096 for ( c=1; c<=comp_list[a][0]; c++)
5098 fprintf ( stdout, "%s ", get_compound_name(comp_list[a][c], mode));
5100 fprintf ( stdout, " [COMP_LIST]");
5103 free_int (main_pos, -1);
5104 free_int (comp_list, -1);
5108 ORP* or_loo ( Alignment *inA, Alignment *inS, char *mode, int *pos,int print)
5111 char **keep, **remove;
5112 Alignment *A, *S, *P, *P1, *SEQ;
5119 if (pos)pos_exists=1;
5120 set_sar (inA, inS, mode);
5121 PR=declare_or_prediction (inS->nseq, inA->nseq, inA->len_aln);
5122 sprintf (PR->mode, "loo ");
5123 P=copy_aln (inS, NULL);
5126 keep=declare_char (inA->nseq, MAXNAMES);
5127 remove=declare_char (inA->nseq, MAXNAMES);
5134 for (a=0; a<inA->nseq; a++)
5138 A=copy_aln (inA, NULL);
5139 S=copy_aln (inS, NULL);
5141 for (n=0,b=0; b<A->nseq; b++)
5143 if (b!=a)sprintf (keep[n++ ], "%s", A->name [b]);
5145 sprintf ( remove[0], "%s", A->name[a]);
5146 reorder_aln (A,keep, A->nseq-1);
5148 set_sar (A,S, mode);
5152 pos=aln2predictive_positions (A, S, mode,print);
5156 for (b=0; b<A->len_aln; b++)
5161 motifs=compounds2motifs (A, S, pos,0, mode, print);
5163 SEQ=copy_aln (inA, NULL);
5164 SEQ=reorder_aln (SEQ, remove, 1);
5167 P1=aln2prediction (SEQ, motifs, pos);
5172 fprintf ( stdout, "\n%s\nPred: %s\nReal: %s\n", P1->name[0], P1->seq_al[0], inS->seq_al[a]);
5174 sprintf ( P->seq_al[a], "%s", P1->seq_al[0]);
5182 free_arrayN( (void *)motifs, 3);
5183 if (!pos_exists)vfree (pos);
5186 free_char (keep, -1);
5187 free_char (remove, -1);
5190 PR=new_evaluate_prediction (PR, mode,print);
5193 PR->PR=or_self_predict(inA, inS, mode,NULL, print);
5195 if (print==PRINT)for ( a=0; a<inA->len_aln; a++)fprintf ( stdout, "\nP: cons %d %d [FINAL]", a+1, PR->pos[a]);
5204 Alignment * or_jack(Alignment *inA, Alignment *inS, char *mode)
5212 char jrsar[10], jraln[10];
5214 strget_param (mode, "_JNIT_", "100", "%d", &n_cycles);
5215 strget_param (mode, "_JNSEQ_", "10", "%d", &subnseq);
5216 strget_param (mode, "_JNAR_", "0", "%d", &subsar);
5218 strget_param (mode, "_JRSAR_", "NO", "%s", jrsar);
5219 strget_param (mode, "_JRALN_", "NO", "%s", jraln);
5223 main_pos=vcalloc ( inA->len_aln, sizeof (int));
5224 for (a=0; a<n_cycles; a++)
5227 A=copy_aln (inA, NULL);
5228 S=copy_aln (inS, NULL);
5230 S=aln2random_aln (S, jrsar);
5231 A=aln2random_aln (A, jraln);
5233 set_sar (A,S, mode);
5234 sar2jack (A, S,subnseq,subsar);
5238 pos=aln2predictive_positions (A, S,mode, PRINT);
5240 for (b=0; b<A->len_aln; b++)main_pos[b]+=pos[b];
5244 display_pos (A, S, main_pos, mode);
5250 Alignment * display_pos (Alignment *A, Alignment *S, int *pos,char *mode)
5258 intl=intlen (A->len_aln);
5259 index=aln2pos_simple (A, A->nseq);
5260 B=copy_aln (A,NULL);
5262 for ( a=0; a<A->len_aln; a++)
5263 fprintf ( stdout, "\nP: cons %*d %*d S: %4d [DISPLAY_FULL_POS]", intl,a+1,intl, a+2, pos[a]);
5264 fprintf ( stdout, "\n\n");
5265 for (a=0; a<A->len_aln; a++)
5269 for ( b=0; b<A->nseq; b++)
5271 B->seq_al[b][B->len_aln]=A->seq_al[b][a];
5272 if (index[b][a]>0)fprintf ( stdout, "\nP: %s %d %d S: %d [DISPLAY_POS]",A->name[b], index[b][a], index[b][a]+1, pos[a]);
5275 fprintf ( stdout, "\nP: cons %d %d S: %d [DISPLAY_POS]", a+1, a+2, pos[a]);
5278 fprintf ( stdout, "\n");
5279 for (a=0; a<B->nseq; a++)B->seq_al[a][B->len_aln]='\0';
5282 Alignment * or_aln2pos_aln (Alignment *A, Alignment *S, char *mode)
5287 char outaln[100], outtree[100];
5290 strget_param (mode, "_OUTALN_", "NO", "%s", outaln);
5291 strget_param (mode, "_OUTTREE_", "NO", "%s", outtree);
5293 set_sar (A, S, mode);
5294 pos=aln2predictive_positions (A, S,mode, PRINT);
5296 B=display_pos (A, S, pos, mode);
5299 if (!strm(outaln, "NO")) vfclose (output_aln (B, vfopen (outaln, "w")));
5300 if (!strm(outtree, "NO"))vfclose (print_tree (aln2tree(B), "newick", vfopen (outtree, "w")));
5304 Alignment * or_sim(Alignment *A, Alignment *S, char *mode)
5306 //Predict all the sequences that are not both in inS and inA
5309 set_sar (A, S, mode);
5310 pos=aln2predictive_positions (A, S,mode, PRINT);
5311 fprintf ( stdout, "R: %.3f", pos2sim (A,S, pos));
5313 exit (EXIT_SUCCESS);
5316 ORP* or_self_predict(Alignment *A, Alignment *S, char *mode,int *pos, int print)
5318 //Predict all the sequences that are not both in inS and inA
5329 set_sar (A, S, mode);
5330 PR=declare_or_prediction (S->nseq, A->nseq, A->len_aln);
5331 sprintf (PR->mode, "self");
5337 pos=aln2predictive_positions (A, S,mode,print);
5343 for (a=0; a< A->len_aln; a++)
5347 PR->motif=motifs=compounds2motifs (A, S, pos,0, mode, print);
5348 P=PR->P=aln2prediction (A, motifs, pos);
5350 if (!pre_set_pos)vfree (pos);
5352 PR=new_evaluate_prediction (PR, mode,print);
5357 Alignment * or_predict(Alignment *inA, Alignment *inS, char *mode)
5359 //Predict all the sequences that are not both in inS and inA
5360 Alignment *P, *A, *S, *T;
5371 A=copy_aln (inA, NULL);
5372 S=copy_aln (inS, NULL);
5373 set_sar (A, S, mode);
5375 pos=aln2predictive_positions (A, S,mode,PRINT);
5376 motifs=compounds2motifs (A, S, pos,0, mode, PRINT);
5377 T=get_prediction_target (inA, inS, mode);
5380 P=aln2prediction (T, motifs, pos);
5381 //recall=evaluate_prediction (S, P, mode);
5382 for ( a=0; a<P->len_aln; a++)
5384 for (b=0; b<P->nseq; b++)
5386 if (tolower(P->seq_al[b][a])=='i')fprintf (stdout, "\n>%20s %20s %c", T->name [0],get_compound_name (a, mode), P->seq_al[b][a]);
5389 fprintf ( stdout, "\n");
5393 Alignment *get_prediction_target (Alignment *A, Alignment *S, char *param)
5399 T=copy_aln (A, NULL);
5400 name=declare_char (A->nseq, 100);
5401 for (n=0,a=0; a< A->nseq; a++)
5403 if ( name_is_in_list (A->name[a], S->name, S->nseq, 100)==-1)
5405 sprintf (name[n++], "%s", A->name[a]);
5408 T=reorder_aln (T,name, n);
5412 Alignment *set_sar (Alignment *A, Alignment *S, char *param)
5417 name=declare_char (A->nseq, 100);
5418 for (n=0,a=0; a< A->nseq; a++)
5420 if ( name_is_in_list (A->name[a], S->name, S->nseq, 100)!=-1)
5422 sprintf (name[n++], "%s", A->name[a]);
5425 A=reorder_aln (A,name, n);
5426 S=reorder_aln (S,name, n);
5427 free_char (name, -1);
5431 ORP* new_evaluate_prediction (ORP *PR, char *mode, int print)
5435 float sn, sp, sen2, best;
5436 float tot_best_seq=0;
5437 float tot_best_comp=0;
5447 recall=vcalloc (P->len_aln, sizeof (float));
5448 if (P->len_aln!=R->len_aln)
5450 HERE ("Mismatch between number of compounds in prediction and reference");
5453 if (print==PRINT)fprintf ( stdout, "\n");
5455 for (a=0; a<P->nseq; a++)
5458 if ((i=name_is_in_list (P->name[a], R->name, R->nseq, 100))!=-1)
5461 for (b=0;b<P->len_aln; b++)
5466 if ( p=='I' && r=='I')tp++;
5467 else if ( p=='I' && r=='O')fp++;
5468 else if ( p=='O' && r=='I')fn++;
5469 else if ( p=='O' && r=='O')tn++;
5471 rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best);
5472 if (print==PRINT)fprintf (stdout, ">%-s sp: %.2f sn: %.2f sn2: %.2f best: %.2f [SEQ]\n",P->name[a], sp, sn, sen2, best);
5484 if (print==PRINT)fprintf ( stdout, ">TotSeq sp: %.2f N: %d[SEQ]\n",tot_best_seq, ns);
5487 for (ns=0,b=0; b<P->len_aln; b++)
5490 for (a=0; a<P->nseq;a++)
5492 if ((i=name_is_in_list (P->name[a], R->name, R->nseq, 100))!=-1)
5497 if ( p=='I' && r=='I'){PR->tp++;tp++;}
5498 else if ( p=='I' && r=='O'){PR->fp++;fp++;}
5499 else if ( p=='O' && r=='I'){PR->fn++;fn++;}
5500 else if ( p=='O' && r=='O'){PR->tn++;tn++;}
5503 rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best);
5505 if (print==PRINT) fprintf (stdout, ">%-25s sp: %.2f sn: %.2f sen2: %.2f best: %.2f [COMP]\n",get_compound_name (b, mode), PR->sp, PR->sn, PR->sen2,PR->best);
5509 tot_best_comp+=best;
5517 rates2sensitivity (PR->tp, PR->tn, PR->fp,PR->fn,&(PR->sp), &(PR->sn), &(PR->sen2), &(PR->best));
5518 if (print==PRINT)fprintf ( stdout, ">FullTot sp: %.2f sn: %.2f sen2: %.2f best: %.2f N: %d[COMP]\n", PR->sp, PR->sn, PR->sen2,PR->best, ns);
5519 P->score_aln=(int)((float)1000*(PR->best));
5522 float evaluate_prediction (Alignment *R, Alignment *P, char *mode, int print)
5526 int tot_tp, tot_tn, tot_fp, tot_fn;
5527 float sn, sp, sen2, best;
5531 float tot_best_seq=0;
5532 float tot_best_comp=0;
5541 recall=vcalloc (P->len_aln, sizeof (float));
5542 if (P->len_aln!=R->len_aln)
5544 HERE ("Mismatch between number of compounds in prediction and reference");
5547 if (print==PRINT)fprintf ( stdout, "\n");
5548 for (a=0; a<P->nseq; a++)
5551 if ((i=name_is_in_list (P->name[a], R->name, R->nseq, 100))!=-1)
5554 for (b=0;b<P->len_aln; b++)
5559 if ( p=='I' && r=='I')tp++;
5560 else if ( p=='I' && r=='O')fp++;
5561 else if ( p=='O' && r=='I')fn++;
5562 else if ( p=='O' && r=='O')tn++;
5564 rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best);
5565 if (print==PRINT)fprintf (stdout, ">%-s sp: %.2f sn: %.2f sn2: %.2f best: %.2f [SEQ]\n",P->name[a], sp, sn, sen2, best);
5583 if (print==PRINT)fprintf ( stdout, ">Tot sp: %.2f sn: %.2f sen2: %.2f best: %.2f N: %d[SEQ]\n", tot_sp, tot_sn, tot_sen2,tot_best_seq, ns);
5585 tot_fp=tot_fn=tot_tp=tot_tn=0;
5586 tot_sp=tot_sn=tot_sen2=tot_best_comp=0;
5587 for (ns=0,b=0; b<P->len_aln; b++)
5590 for (a=0; a<P->nseq;a++)
5592 if ((i=name_is_in_list (P->name[a], R->name, R->nseq, 100))!=-1)
5597 if ( p=='I' && r=='I'){tot_tp++;tp++;}
5598 else if ( p=='I' && r=='O'){tot_fp++;fp++;}
5599 else if ( p=='O' && r=='I'){tot_fn++;fn++;}
5600 else if ( p=='O' && r=='O'){tot_tn++;tn++;}
5603 rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best);
5605 if (print==PRINT) fprintf (stdout, ">%-25s sp: %.2f sn: %.2f sen2: %.2f best: %.2f [COMP]\n",get_compound_name (b, mode), sp, sn, sen2,best);
5610 tot_best_comp+=best;
5624 rates2sensitivity (tot_tp, tot_tn, tot_fp,tot_fn,&tot_sp, &tot_sn, &tot_sen2, &tot_best);
5625 if (print==PRINT)fprintf ( stdout, ">FullTot sp: %.2f sn: %.2f sen2: %.2f best: %.2f N: %d[COMP]\n", tot_sp, tot_sn, tot_sen2,tot_best, ns);
5631 Alignment * aln2prediction (Alignment *A,char ***motif, int *pos)
5635 char **array, **sar;
5638 nc=read_array_size ((void *)motif, sizeof (char***));
5641 list=pos2list (pos, A->len_aln, &nl);
5644 array=declare_char (A->nseq, nl+1);
5645 sar=declare_char(A->nseq, nc+1);
5646 for (a=0; a<A->nseq; a++)
5648 for (b=0; b<nl; b++)
5649 array[a][b]=A->seq_al[a][list[b]];
5652 for (a=0; a<nc; a++)
5654 for (b=0; b<A->nseq; b++)
5657 sar[b][a]=(match_motif (array[b], motif[a]))?'I':'O';
5662 S=fill_sequence_struc (A->nseq,sar,A->name);
5663 R=seq2aln (S, NULL, KEEP_GAP);
5664 free_sequence (S, S->nseq);
5665 free_char (sar, -1);
5667 free_char (array, -1);
5671 int * file2pos_list (Alignment *A, char *posfile)
5678 //pos_file:<P: seqname pos score\n>
5682 if ( !check_file_exists (posfile))
5684 printf_exit ( EXIT_FAILURE, stderr, "ERROR: Could not read posfile %s\n", posfile);
5687 file=file2list (posfile, " ");
5689 index=aln2inv_pos (A);
5690 pos=vcalloc ( A->len_aln, sizeof (int));
5696 if ( !strm (file[n][1], "P:"));
5699 if ( (strm (file[n][2], "cons")))
5700 p=atoi(file[n][3])-1;
5703 i=name_is_in_list ( file[n][2], A->name, A->nseq, MAXNAMES+1);
5705 p=index[i][atoi(file[n][3])]-1;
5708 if (p!=-1)pos[p]+=atoi(file[n][4]);
5714 free_int (index, -1);
5715 free_arrayN ( (char **)file, 3);
5718 int * aln2predictive_positions (Alignment *A, Alignment *B, char *mode, int print)
5722 if (!mode) return NULL;
5724 strget_param (mode, "_POSMODE_", "scan", "%s", posmode);
5725 if ( strm (posmode, "mat"))return aln2predictive_positions_mat (A, B, mode, print);
5726 else if ( strm (posmode, "scan")) return aln2predictive_positions_scan (A, B, mode, print);
5729 printf_exit (EXIT_FAILURE,stderr, "ERROR: %s is an unknown _POSMODE_ mode",posmode);
5734 int * aln2predictive_positions_mat (Alignment *A, Alignment *B, char *mode, int print)
5736 int a, b, c,gap, res1, res2, sar1, sar2, npos, s, idscore;
5737 float id1,id2,id3,nid1,nid2,nid3;
5739 pos=declare_int (A->len_aln,2);
5740 fpos=vcalloc ( A->len_aln, sizeof (int));
5742 strget_param (mode, "_NPOS_", "2", "%d", &npos);
5743 for ( a=0; a< A->len_aln; a++)
5746 id1=id2=id3=nid1=nid2=nid3=0;
5747 for ( gap=0,b=0; b<A->nseq; b++)gap+=(A->seq_al[b][a]=='-');
5748 if ( gap>0){pos[a][1]=0;continue;}
5750 for (s=0; s<B->len_aln; s++)
5752 for ( gap=0,b=0; b<A->nseq-1; b++)
5754 sar1=B->seq_al[b][s];
5755 res1=A->seq_al[b][a];
5757 for ( c=b+1; c<A->nseq; c++)
5759 sar2=B->seq_al[c][s];
5760 res2=A->seq_al[c][a];
5762 idscore=(res1==res2)?1:0;
5763 if ( sar1 == 'I' && sar2=='I'){id1+=idscore;nid1++;}
5764 else if ( sar1 =='0' && sar2=='0'){id2+=idscore;nid2++;}
5765 else {id3+=idscore; nid3++;}
5769 id1=(nid1==0)?1:id1/nid1;
5770 id2=(nid1==0)?1:id2/nid2;
5771 id3=(nid3==0)?1:id3/nid3;
5772 pos[a][1]=(int)((float)1000*id1*(1-id3));
5777 sort_int (pos, 2,1, 0, A->len_aln-1);
5778 for ( a=MAX(0,(A->len_aln-npos));a<A->len_aln; a++)
5786 int * aln2predictive_positions_scan (Alignment *A, Alignment *B, char *mode, int print)
5788 int a, b, c, best_pos,nl, nplist=0, max, posw;
5789 float best_score, score;
5790 static int *list, *tpos,**plist,*array;
5796 char target_posfile[100];
5805 free_int (plist, -1);
5810 strget_param (mode, "_PREDMODE_", "ID", "%s", predmode);
5811 strget_param (mode, "_POSW_", "1", "%d", &posw);
5812 strget_param (mode, "_NPOS_", "2", "%d", &max);
5813 strget_param (mode, "_POSFILE_", "NO", "%s", posfile);
5814 strget_param (mode, "_TPOSFILE_", "NO", "%s", target_posfile);
5816 if ( !strm(posfile, "NO"))return file2pos_list (A,posfile);
5817 if ( !strm(target_posfile, "NO"))tpos=file2pos_list (A,target_posfile);
5820 tpos=vcalloc (A->len_aln, sizeof (int));
5821 for (a=0; a<A->len_aln; a++)tpos[a]=1;
5824 //Declare the positions that are going to be scanned
5829 plist=declare_int (A->len_aln, 2);
5831 for (a=0; a<A->len_aln; a++)
5844 plist=declare_int (A->len_aln*A->len_aln, 3);
5845 for (a=0; a<A->len_aln; a++)
5846 for (b=0; b<A->len_aln; b++)
5857 plist=declare_int (A->len_aln*A->len_aln*A->len_aln, 3);
5858 for (a=0; a<A->len_aln; a++)
5859 for (b=0; b<A->len_aln; b++)
5872 pos=vcalloc ( A->len_aln, sizeof (int));
5873 if (max==0)max=A->len_aln;
5876 for (a=0; a<A->len_aln; a++)if (tpos[a]){pos[a]=1;}
5877 aln2predictive_positions_scan (NULL, NULL, NULL, 0);
5883 pos=vcalloc ( A->len_aln, sizeof (int));
5884 list=vcalloc (A->len_aln, sizeof (int));
5889 for (a=0; a< max; a++)
5891 int previous_best_pos=-1;
5892 for (best_score=-9999,best_pos=0,b=0; b<nplist; b++)
5894 for (c=0; c<nl; c++)pos[list[c]]=1;
5895 for (c=1; c<=plist[b][0]; c++)pos[plist[b][c]]=1;
5897 if (strm(predmode, "R"))score=sar_aln2r(A,B,pos,0);
5898 else if ( strm (predmode, "ID"))
5900 score =or_id_evaluate (A, B, mode, pos,NO_PRINT);
5902 else if ( strm (predmode, "BP2"))score =or_loo_evaluate2 (A, B, mode, pos,NO_PRINT);
5905 HERE ("Unknown mode: %s", predmode);
5908 if ( score>best_score)
5913 for (c=1; c<=plist[b][0]; c++)pos[plist[b][c]]=0;
5916 if (best_pos==previous_best_pos)break;
5917 else previous_best_pos=best_pos;
5919 //update the best_pos_list
5920 for (b=1; b<=plist[best_pos][0]; b++)
5921 list[nl++]=plist[best_pos][b];
5926 for (b=0; b<nl; b++) pos[list[b]]=1;
5927 fprintf ( stdout, "\nP_current: ");
5928 for ( c=1; c<=plist[best_pos][0]; c++)fprintf ( stdout, "%d ",plist[best_pos][c]+1);
5929 fprintf ( stdout, " S: %.3f D: %d R:%d", best_score, (int)sar_aln2delta(A,B, pos,0), nl);
5931 for (b=0; b<nl; b++) pos[list[b]]=0;
5934 for (a=0; a<nl; a++)
5936 if (print==PRINT)fprintf ( stdout, "\nR_best: %.3f with %d pos" ,best_score, nl);
5938 aln2predictive_positions_scan (NULL, NULL, NULL, 0);
5943 char *** compounds2motifs (Alignment *A, Alignment *B, int *pos, int depth, char *mode, int print)
5948 motifs=vcalloc (B->len_aln, sizeof (char**));
5949 for (a=0; a<B->len_aln; a++)
5952 motifs[a]=compound2motif (A, B, pos, depth, a, mode, print);
5957 char ** compound2regexp_motif (Alignment *A, Alignment *B, int *pos, int depth, int c, char *mode, int print);
5958 char ** compound2word_motif (Alignment *A, Alignment *B, int *pos, int depth, int c, char *mode, int print);
5960 char ** compound2motif (Alignment *A, Alignment *B, int *pos, int depth, int c, char *mode, int print)
5964 strget_param (mode, "_MOTIFMODE_", "word", "%s", mmode); //words, regexp
5965 if ( strm (mmode, "regexp"))return compound2regexp_motif (A,B,pos, depth, c, mode, print);
5966 else if ( strm (mmode, "word"))return compound2word_motif (A,B,pos, depth, c, mode, print);
5968 char ** compound2word_motif (Alignment *A, Alignment *B, int *pos, int depth, int c, char *mode, int print)
5971 char *word, **motif;
5975 word=or_id_evaluate2 (A, B, mode, pos,print, &score);
5976 if ( !word) return NULL;
5979 motif=declare_char (l+1, 2);
5980 for (a=0; a<l; a++)motif[a][0]=word[a];
5984 fprintf ( stdout, "\nMotifCompound %25s best: %.2f motif: ", get_compound_name(c, mode),score);
5987 fprintf ( stdout, "[%2s]",motif[a]);
5996 char ** compound2regexp_motif (Alignment *A, Alignment *B, int *pos, int depth, int c, char *mode, int print)
5998 static Alignment *I;
5999 static Alignment *O;
6002 float tp,tn,fp,fn,best, sp, sn, sen2, best_sn, best_sp, best_sen2;
6022 I=copy_aln(A, NULL);
6023 O=copy_aln(A, NULL);
6026 strget_param (mode, "_DEPTH_", "2", "%d", &depth);
6028 I->nseq=O->nseq=I->len_aln=O->len_aln=0;
6029 for (a=0; a<A->len_aln; a++)
6033 for (i=o=0,b=0; b<A->nseq; b++)
6035 if ( is_gap(A->seq_al[b][a]))return 0;
6036 if (B->seq_al[b][c]=='I')I->seq_al[i++][I->len_aln]=A->seq_al[b][a];
6037 else O->seq_al[o++][O->len_aln]=A->seq_al[b][a];
6044 if (O->len_aln==0 || I->len_aln==0) return 0;
6047 for (a=0; a<o; a++)O->seq_al[a][O->len_aln]='\0';
6048 for (a=0; a<i; a++)I->seq_al[a][I->len_aln]='\0';
6050 if (!I->nseq) return NULL;
6054 best_pred=best_motif=best_sn=best_sp=best_sen2=0;
6056 motif_file=vtmpnam (NULL);
6061 alp=vcalloc ( sizeof (char**), I->len_aln);
6062 alp_size= vcalloc ( I->len_aln, sizeof (int));
6063 for (a=0; a<I->len_aln; a++)
6066 alp[a]=string2alphabet ( (col=aln_column2string (I,a)),depth, &alp_size[a]);
6069 generate_array_string_list (I->len_aln, alp, alp_size, &n, motif_file, OVERLAP);
6076 used=vcalloc (256, sizeof (int));
6077 fpp=vfopen (motif_file,"w");
6078 for (a=0;a<I->len_aln; a++)
6080 for (b=0; b<I->nseq; b++)
6083 if (!used[(int)r]){fprintf (fpp, "%c", r);used[(int)r]=1;}
6085 for (b=0; b<I->nseq; b++)
6092 fprintf (fpp, "\n");
6100 buf=vcalloc (2*(I->len_aln*depth)+1, sizeof (char));
6101 best_buf=vcalloc (2*(I->len_aln*depth)+1, sizeof (char));
6102 fpp=vfopen (motif_file, "r");
6107 buf=vfgets (buf, fpp);
6108 m2=string2list (buf);
6114 for (b=0; b<I->nseq; b++)
6116 if (match_motif (I->seq_al[b], m2))tp++;
6119 for (b=0; b<O->nseq; b++)
6121 if (match_motif (O->seq_al[b], m2))fp++;
6124 rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best);
6126 if (best>= best_pred)
6132 sprintf (best_buf, "%s", buf);
6138 if (print==PRINT)fprintf ( stdout, "\nMotifCompound %25s sp: %.2f sn: %.2f sen2: %.2f best: %.2f motif: ", get_compound_name(c, mode), best_sp, best_sn, best_sen2, best_pred);
6139 m2=string2list (best_buf);
6140 m=declare_char (I->len_aln+1, depth+1);
6142 for (a=0; a<I->len_aln; a++)
6144 sprintf (m[a], "%s", m2[a+1]);
6145 if (print==PRINT) fprintf ( stdout, "[%2s]",m[a]);
6147 if (print==PRINT)fprintf ( stdout, " N-motifs %d", n);
6150 if (alp)free_arrayN((void ***) alp, 3);
6151 if (alp_size)vfree (alp_size);
6152 vfree (buf); vfree(best_buf);
6157 double pos2sim (Alignment *A, Alignment *B, int *pos)
6159 return sar_aln2r (A, B,pos, PRINT);
6161 double sar_aln2r (Alignment *A, Alignment *B, int *pos, int print)
6163 int a, b, c, d,r1, r2, n, score, sim;
6165 static double **slist;
6171 if (!M)M=read_matrice ("blosum62mt");
6175 maxslist=A->nseq*A->nseq*10;
6176 slist=declare_double (maxslist, 2);
6183 pos=vcalloc ( A->len_aln+1, sizeof (int));
6184 for (a=0; a<A->len_aln; a++)pos[a]=1;
6189 for (n=0,a=0; a< A->nseq-1; a++)
6192 for (b=a+1; b<A->nseq; b++)
6196 for (sim=d=0,c=0; c<A->len_aln; c++)
6199 if (pos[c]==0)continue;
6203 if (is_gap(r1) || is_gap(r2))return 0;
6205 sim+=M[r1-'A'][r2-'A']*pos[c];
6206 d+=MAX((M[r1-'A'][r1-'A']),(M[r2-'A'][r2-'A']));
6208 sim=(d==0)?0:(100*sim)/d;
6209 score=(int)get_sar_sim(B->seq_al[a], B->seq_al[b]);
6210 slist[n][0]=(double)sim;
6211 slist[n][1]=(double)score;
6212 if (print==PRINT)fprintf ( stdout, "SIM: %d %d [%s %s]\n", sim, score, A->name[a], A->name[b]);
6217 r=return_r(slist, n);
6218 for (a=0; a<n; a++)slist[a][0]=slist[a][1]=0;
6221 if (declare) vfree (pos);
6227 double sar_aln2delta (Alignment *A, Alignment *B, int *pos, int print)
6229 static Alignment *I;
6230 static Alignment *O;
6235 I=copy_aln(A, NULL);
6236 O=copy_aln(A, NULL);
6241 for (c=0; c<B->len_aln; c++)
6244 I->nseq=O->nseq=I->len_aln=O->len_aln=0;
6245 for (a=0; a<A->len_aln; a++)
6249 for (i=o=0,b=0; b<B->nseq; b++)
6251 if ( is_gap(A->seq_al[b][a]))return 0;
6252 if (B->seq_al[b][c]=='I')I->seq_al[i++][I->len_aln]=A->seq_al[b][a];
6253 else O->seq_al[o++][O->len_aln]=A->seq_al[b][a];
6259 if (O->len_aln==0 || I->len_aln==0) return 0;
6262 for (a=0; a<o; a++)O->seq_al[a][O->len_aln]='\0';
6263 for (a=0; a<i; a++)I->seq_al[a][I->len_aln]='\0';
6265 delta+=aln2sim(I,"blosum62mt")-aln2sim(O, "blosum62mt");
6272 char * get_compound_name (int c, char *mode)
6275 static Alignment *S;
6282 lname=vcalloc (100, sizeof (char));
6287 strget_param (mode, "_COMPLIST_", "NO", "%s", comp_list=vcalloc (100, sizeof (char)));
6288 if (strm(comp_list, "NO"));
6291 S=main_read_aln (comp_list, NULL);
6296 if (!S || c>=S->nseq)sprintf (lname, "%d", c);
6299 sprintf (lname, "%s", S->name [c]);
6303 ORP * declare_or_prediction ( int ncomp, int nseq, int len)
6306 P=vcalloc ( 1, sizeof (ORP));
6312 P->pos=vcalloc (len+1, sizeof (int));
6317 void free_orp_list ( ORP**P)
6325 void free_orp ( ORP*P)
6332 free_arrayN((void **)P->motif, 3);
6333 if (P->PR)free_orp(P->PR);
6351 /*********************************COPYRIGHT NOTICE**********************************/
6352 /*© Centro de Regulacio Genomica */
6354 /*Cedric Notredame */
6355 /*Tue Oct 27 10:12:26 WEST 2009. */
6356 /*All rights reserved.*/
6357 /*This file is part of T-COFFEE.*/
6359 /* T-COFFEE is free software; you can redistribute it and/or modify*/
6360 /* it under the terms of the GNU General Public License as published by*/
6361 /* the Free Software Foundation; either version 2 of the License, or*/
6362 /* (at your option) any later version.*/
6364 /* T-COFFEE is distributed in the hope that it will be useful,*/
6365 /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
6366 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
6367 /* GNU General Public License for more details.*/
6369 /* You should have received a copy of the GNU General Public License*/
6370 /* along with Foobar; if not, write to the Free Software*/
6371 /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
6372 /*............................................... |*/
6373 /* If you need some more information*/
6374 /* cedric.notredame@europe.com*/
6375 /*............................................... |*/
6379 /*********************************COPYRIGHT NOTICE**********************************/