X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=binaries%2Fsrc%2Ftcoffee%2Ft_coffee_source%2Futil_aln_analyze.c;fp=binaries%2Fsrc%2Ftcoffee%2Ft_coffee_source%2Futil_aln_analyze.c;h=acdde3acda00effb912b4c1d4cacfe66ccc8098a;hb=d8e1d04583007816f5dc77595d322ba669075ae3;hp=0000000000000000000000000000000000000000;hpb=8140043fefd2101326b3e651e3b7e2d4ac806b99;p=jabaws.git diff --git a/binaries/src/tcoffee/t_coffee_source/util_aln_analyze.c b/binaries/src/tcoffee/t_coffee_source/util_aln_analyze.c new file mode 100644 index 0000000..acdde3a --- /dev/null +++ b/binaries/src/tcoffee/t_coffee_source/util_aln_analyze.c @@ -0,0 +1,6379 @@ +#include +#include +#include +#include +#include +#include +#include "io_lib_header.h" +#include "util_lib_header.h" +#include "dp_lib_header.h" +#include "define_header.h" +/************************************************************************************/ +/* NEW ANALYZE 2 : SAR */ +/************************************************************************************/ +float display_prediction_old (int **prediction, int n, Alignment *A, Alignment *S, int field); + +float display_prediction (int ***count, Alignment *S, int c, int n); +Alignment * filter_aln4sar0 ( Alignment *A, Alignment *S, int c, int leave, char *mode); +Alignment * filter_aln4sar1 ( Alignment *A, Alignment *S, int c, int leave, char *mode); +Alignment * filter_aln4sar2 ( Alignment *A, Alignment *S, int c, int leave, char *mode); +Alignment * filter_aln4sar3 ( Alignment *A, Alignment *S, int c, int leave, char *mode); +Alignment * filter_aln4sar4 ( Alignment *A, Alignment *S, int c, int leave, char *mode); +Alignment * filter_aln4sar5 ( Alignment *A, Alignment *S, int c, int leave, char *mode); + +int **sar2profile ( Alignment *A, Alignment *S, int c, int leave); +int **sar2profile_sim ( Alignment *A, Alignment *S, int **sim, int comp, int leave); +int sar_profile2score ( char *seq, int **profile); +double sar_vs_iseq1( char *sar, int *seq, float gl, int **sim, char *best_aa); +double sar_vs_seq1 ( char *sar, char *seq, float gl, int **sim, char *best_aa); +double sar_vs_seq2 ( char *sar, char *seq, float ng, int **mat, char *a); +double sar_vs_seq3 ( char *sar, char *seq, float ng, int **mat, char *a); + +double sar_vs_iseq4 ( char *sar, int *seq, float ng, int **mat, char *a);//supports an extended alphabet +double sar_vs_seq4 ( char *sar, char *seq, float ng, int **mat, char *a); + +double sar_vs_seq5 ( char *sar, char *seq, float ng, int **mat, char *a); +int make_sim_pred ( Alignment *A,Alignment *S, int comp, int seq); + +int **sar2profile_sim ( Alignment *A, Alignment *S, int **sim, int comp, int leave) +{ + + int a, b, r, c, c1, c2, r1, r2, s, p; + int ***cache, **profile; + + + profile=declare_int (A->len_aln, 26); + cache=declare_arrayN (3,sizeof (int),2,A->len_aln, 26); + + for ( a=0; a< A->len_aln; a++) + for ( b=0; b< A->nseq; b++) + { + r=tolower(A->seq_al[b][a]); + c=( S->seq_al[comp][b]=='I')?1:0; + if (b==leave || is_gap(r)) continue; + cache [c][a][r-'a']++; + } + for (a=0; a< A->nseq; a++) + { + if ( a==leave) continue; + for ( b=0; b< A->nseq; b++) + { + c1=(S->seq_al[comp][a]=='I')?1:0; + c2=(S->seq_al[comp][b]=='I')?1:0; + if ( b==leave || b==a || c1!=1 || c1==c2) continue; + s=sim[a][b]; + + for (p=0; plen_aln; p++) + { + r1=tolower(A->seq_al[a][p]); + r2=tolower(A->seq_al[b][p]); + if ( is_gap(r1) || is_gap(r2) || r1==r2)continue; + r1-='a';r2-='a'; + if (cache[1][p][r2])continue; + if ( s<50)continue; + profile[p][r2]-=s; + } + } + } + + free_arrayN((void***)cache,3); + return profile; + +} +int **sar2profile ( Alignment *A, Alignment *S, int comp, int leave) +{ + + int a, b,c,r, n, v, npos=0; + int ***cache, **profile; + int ncat; + float n_gap, max_gap; + profile=declare_int (A->len_aln, 26); + cache=declare_arrayN (3,sizeof (int),2,A->len_aln, 26); + + + + for ( n=0, a=0; a< A->nseq; a++) + { + if ( a==leave) continue; + else n+=(S->seq_al[comp][a]=='I')?1:0; + } + + for ( a=0; a< A->len_aln; a++) + for ( b=0; b< A->nseq; b++) + { + r=tolower(A->seq_al[b][a]); + c=( S->seq_al[comp][b]=='I')?1:0; + if (b==leave) continue; + else if (is_gap(r))continue; + r-='a'; + cache [c][a][r]++; + } + + ncat=15; /*ncat: limit the analysis to columns containing less than ncat categories of aa*/ + max_gap=0.05; + for (a=0; a< A->len_aln; a++) + { + for (n_gap=0,b=0; b< A->nseq; b++) + n_gap+=(is_gap(A->seq_al[b][a])); + n_gap/=(float)A->nseq; + + if ( n_gap> max_gap)continue; + + for (v=0,r=0; r< 26; r++) + { + if (cache [0][a][r] || cache[1][a][r])v++; + } + + for (n=0,r=0; r< 26 && vnseq, 2); + list2=declare_int ( inA->len_aln, 2); + + cache=declare_arrayN (3,sizeof (int),inA->len_aln,2, 26); + F=copy_aln (inA, NULL); + + A=copy_aln (inA, NULL); + A->nseq=strlen (S->seq_al[comp]); + + strget_param (mode, "_T1_", "5", "%d", &T1); + for ( a=0; a< A->len_aln; a++) + { + n1=n0=g=0; + for (b=0; b< A->nseq; b++) + { + if ( b==leave) continue; + i=(S->seq_al[comp][b]=='I')?1:0; + r=tolower(A->seq_al[b][a]); + if ( r=='-')continue; + cache[a][i][r-'a']++; + } + } + + for (a=0; a< A->nseq; a++) + for ( score=0,b=0; blen_aln; b++) + { + r=tolower (A->seq_al[a][b]); + if ( is_gap(r))continue; + else if ( cache[b][0][r-'a'] && !cache[b][1][r-'a'])list1[a][0]++; + } + + for (a=0; a< A->len_aln; a++) + { + for ( score=0,b=0; b< A->nseq; b++) + { + r=tolower (A->seq_al[b][a]); + if ( r=='-')continue; + else r-='a'; + if ( cache[a][0][r] && !cache[a][1][r])score ++; + } + list2[a][0]=a; + list2[a][1]=score; + } + sort_int (list2, 2, 1, 0, F->len_aln-1); + + Delta=A->len_aln/(100/T1); + for ( a=0; a< F->len_aln-Delta; a++) + { + b=list2[a][0]; + for ( c=0; cnseq; c++) + { + F->seq_al[c][b]='-'; + } + } + + ungap_aln (F); + free_aln (A); + free_arrayN ( (void ***)cache, 3); + free_arrayN ((void**)list1, 2); + free_arrayN ((void**)list2, 2); + + return F; +} +Alignment * filter_aln4sar2 ( Alignment *inA, Alignment *S, int comp, int leave, char *mode) +{ + Alignment *F, *A; + int a,b,r,ncat; + int *cache; + int max_ncat=10; + + /*Keep Low entropy columns that contain less than ncat categories of different amino acids*/ + /*REmove columns containing 10% or more gaps*/ + + cache=vcalloc ( 500, sizeof (char)); + F=copy_aln (inA, NULL); + A=copy_aln (inA, NULL); + A->nseq=strlen (S->seq_al[comp]); + for ( a=0; a< A->len_aln; a++) + { + for (ncat=0,b=0; b< A->nseq; b++) + { + if ( b==leave) continue; + + r=tolower(A->seq_al[b][a]); + if ( !cache[r])ncat++; + cache[r]++; + } + + if ( ncat nseq)<10) + { + ; + } + else + { + for (b=0; bnseq; b++) + { + r=tolower(F->seq_al[b][a]); + F->seq_al[b][a]='-'; + cache[r]=0; + } + } + for (b=0; bnseq; b++) + { + r=tolower(A->seq_al[b][a]); + cache[r]=0; + } + } + + free_aln (A); + ungap_aln (F); + vfree (cache); + return F; +} + +Alignment * filter_aln4sar3 ( Alignment *inA, Alignment *S, int comp, int leave, char *mode) +{ + Alignment *F, *rA, *A; + int a, b,c; + int **list1; + char *bufS, *bufA; + int Delta; + int T3; + + /*Keep the 10% positions most correlated with the 0/1 pattern*/ + + A=copy_aln (inA, NULL); + A->nseq=strlen (S->seq_al[comp]); + F=copy_aln (inA, NULL); + rA=rotate_aln (A, NULL); + + strget_param (mode, "_T3_", "10", "%d", &T3); + + + list1=declare_int ( inA->len_aln, 2); + bufA=vcalloc ( A->nseq+1, sizeof (char)); + bufS=vcalloc ( A->nseq+1, sizeof (char)); + + sprintf ( bufS, "%s", S->seq_al[comp]); + splice_out_seg(bufS,leave, 1); + + + for (a=0; a< A->len_aln; a++) + { + char aa; + list1[a][0]=a; + sprintf (bufA, "%s", rA->seq_al[a]); + splice_out_seg (bufA,leave,1); + list1[a][1]=(int)sar_vs_seq3 ( bufS, bufA,0,NULL, &aa); + } + + sort_int (list1, 2, 1, 0, F->len_aln-1); + Delta=F->len_aln/(100/T3); + for ( a=0; a< F->len_aln-Delta; a++) + { + b=list1[a][0]; + + for ( c=0; cnseq; c++) + { + F->seq_al[c][b]='-'; + } + + } + F->score_aln=list1[F->len_aln-1][1]; + ungap_aln (F); + + free_aln (rA); + free_aln(A); + free_arrayN ((void**)list1, 2); + vfree (bufS);vfree (bufA); + return F; +} +Alignment * filter_aln4sar4 ( Alignment *inA, Alignment *S, int comp, int leave, char *mode) +{ + Alignment *F, *A; + int a, b,c, i,r, n0, n1,g,score; + int ***cache, **list1, **list2; + + /*Keep only the positions where there are residues ONLY associated with 0 sequences*/ + + list1=declare_int ( inA->nseq, 2); + list2=declare_int ( inA->len_aln, 2); + + cache=declare_arrayN (3,sizeof (int),inA->len_aln,2, 26); + F=copy_aln (inA, NULL); + A=copy_aln (inA, NULL); + A->nseq=strlen (S->seq_al[comp]); + + for ( a=0; a< A->len_aln; a++) + { + n1=n0=g=0; + for (b=0; b< A->nseq; b++) + { + if ( b==leave) continue; + i=(S->seq_al[comp][b]=='I')?1:0; + r=tolower(A->seq_al[b][a]); + if ( r=='-')continue; + cache[a][i][r-'a']++; + n1+=i; + } + } + + + for (a=0; a< A->len_aln; a++) + { + for ( score=0,b=0; b< A->nseq; b++) + { + r=tolower (F->seq_al[b][a]); + if ( r=='-')continue; + else r-='a'; + if (cache[a][1][r]>=n1/2)score=1; + } + list2[a][0]=a; + list2[a][1]=score; + } + + + for ( a=0; a< F->len_aln; a++) + { + if ( list2[a][1]==1); + else + { + b=list2[a][0]; + for ( c=0; cnseq; c++) + { + F->seq_al[c][b]='-'; + } + } + } + ungap_aln (F); + free_aln (A); + free_arrayN ( (void ***)cache, 3); + free_arrayN ((void**)list1, 2); + free_arrayN ((void**)list2, 2); + + return F; +} + +Alignment * filter_aln4sar5 ( Alignment *inA, Alignment *S, int comp, int leave, char *mode) +{ + Alignment *F, *rA, *A; + int a, b,c; + int **list1; + char *bufS, *bufA; + int max; + /*Look for the positions that show the best correlation between the sequence variation and the SAR*/ + + A=copy_aln (inA, NULL); + A->nseq=strlen (S->seq_al[comp]); + + rA=rotate_aln (inA, NULL); + F=copy_aln (inA, NULL); + + list1=declare_int ( A->len_aln, 2); + bufA=vcalloc ( A->nseq+1, sizeof (char)); + bufS=vcalloc ( A->nseq+1, sizeof (char)); + + + + sprintf ( bufS, "%s", S->seq_al[comp]); + splice_out_seg(bufS,leave, 1); + + + for (a=0; a< A->len_aln; a++) + { + char aa; + list1[a][0]=a; + sprintf (bufA, "%s", rA->seq_al[a]); + splice_out_seg (bufA,leave,1); + list1[a][1]=(int)sar_vs_seq4 ( bufS, bufA,0,NULL, &aa); + } + + sort_int (list1, 2, 1, 0, F->len_aln-1); + max=F->score=list1[F->len_aln-1][1]; + max-=(max/10); + + + for ( a=0; a< F->len_aln-10; a++) + { + + b=list1[a][0]; + + for ( c=0; cnseq; c++) + { + F->seq_al[c][b]='-'; + } + + } + F->score_aln=10; + ungap_aln (F); + free_aln (inA); + free_aln (rA); + free_arrayN ((void**)list1, 2); + vfree (bufS);vfree (bufA); + return F; +} + +int sar_profile2score ( char *seq, int **P) +{ + int a,r, l, score; + + l=strlen (seq); + for ( score=0,a=0; a< l; a++) + { + r=seq[a]; + if ( is_gap(r))continue; + score+=P[a][tolower(r)-'a']; + } + return score; +} +int make_sim_pred ( Alignment *A,Alignment *S, int comp, int seq) +{ + int a, b, i, r1, r2; + static float **cscore; + static float **tscore; + + if ( !cscore) + { + cscore=declare_float (2, 2); + tscore=declare_float (2, 2); + } + + for (a=0; a< 2; a++)for (b=0; b<2; b++)cscore[a][b]=tscore[a][b]=0; + + for ( a=0; alen_aln; a++) + { + r1=A->seq_al[seq][a]; + if ( r1=='-') continue; + else + { + for ( b=0; b< A->nseq; b++) + { + if (b==seq) continue; + else + { + r2=A->seq_al[b][a]; + if (r2=='-')continue; + else + { + + i=(S->seq_al[comp][b]=='I')?1:0; + cscore[i][0]+=(r1==r2)?1:0; + cscore[i][1]++; + } + } + } + + for (i=0; i<2; i++) + { + cscore[i][0]/=(cscore[i][1]==0)?1:cscore[i][1]; + tscore[i][0]+=cscore[i][0];tscore[i][1]++; + cscore[i][0]=cscore[i][1]=0; + } + } + } + + fprintf ( stdout, "\nn\t 1: %.2f 0: %.2f", tscore[1][0],tscore[0][0]); + return ( tscore[1][0]>=tscore[0][0])?1:0; +} + + +Alignment * sar_analyze (Alignment *inA, Alignment *inS, char *mode) +{ + int ***sim,***glob_results, ***comp_results; + int *count; + int a,b,c,m; + float *tot2; + Alignment *A=NULL,*S=NULL,*F, *SUBSET; + char *subset, *target; + int jack, T, filter; + filter_func *ff; + int n_methods=0; + char *prediction, *reliability; + int pred_start=0, pred_end, ref_start=0, ref_end; + int display, CSV=1, NONCSV=0; + char method[5]; + + strget_param (mode, "_METHOD_", "1111", "%s_", method); + ff=vcalloc (6,sizeof (filter_func)); + if (method[0]=='1')ff[n_methods++]=filter_aln4sar0; + if (method[1]=='1')ff[n_methods++]=filter_aln4sar1; + if (method[2]=='1')ff[n_methods++]=filter_aln4sar2; + if (method[3]=='1')ff[n_methods++]=filter_aln4sar3; + /* + ff[n_methods++]=filter_aln4sar4; + ff[n_methods++]=filter_aln4sar5; + */ + sim=vcalloc (n_methods, sizeof (int**)); + + + tot2=vcalloc ( 10, sizeof (float)); + subset=vcalloc ( 100, sizeof (char)); + target=vcalloc ( 100, sizeof (char)); + + strget_param (mode, "_TARGET_", "no", "%s_", target); + strget_param (mode, "_SUBSET_", "no", "%s_", subset); + strget_param (mode, "_JACK_", "0", "%d", &jack); + strget_param (mode, "_T_", "0", "%d", &T); + strget_param (mode, "_FILTER_", "11", "%d", &filter); + strget_param (mode, "_DISPLAY_", "0", "%d", &display); + + + + if ( !strm (target, "no")) + { + Alignment *T; + T=main_read_aln(target, NULL); + if ( T->len_aln !=inA->len_aln ) + { + printf_exit ( EXIT_FAILURE,stderr, "Error: %s is incompatible with the reference alignment [FATAL:%s]",target,PROGRAM); + } + + inA=stack_aln (inA, T); + + } + + if ( !strm(subset, "no")) + { + SUBSET=main_read_aln (subset, NULL); + sarset2subsarset ( inA, inS, &A, &S, SUBSET); + } + else + { + A=inA; + S=inS; + } + + + prediction=vcalloc ( n_methods+1, sizeof (char)); + reliability=vcalloc ( n_methods+1, sizeof (char)); + + glob_results=declare_arrayN(3, sizeof (int), n_methods*2, 2, 2); + + count=vcalloc (S->nseq, sizeof (int)); + for (a=0; anseq; a++) + { + int l; + l=strlen (S->seq_al[a]); + for ( b=0; bseq_al[a][b]=='I')?1:0; + } + if ( display==CSV) + {fprintf ( stdout, "\nCompound %s ; Ntargets %d", S->name[a],count[a]); + pred_start=(strlen (S->seq_al[0])==A->nseq)?0:strlen (S->seq_al[0]); + pred_end=A->nseq; + for (a=pred_start; a< pred_end; a++) + fprintf ( stdout, ";%s", A->name[a]); + fprintf ( stdout, ";npred;"); + } + + + for (a=0; anseq; a++) + { + int n_pred; + comp_results=declare_arrayN(3, sizeof (int), n_methods*2, 2, 2); + + pred_start=(strlen (S->seq_al[a])==A->nseq)?0:strlen (S->seq_al[a]); + pred_end=A->nseq; + if ( display==CSV)fprintf ( stdout, "\n%s;%d", S->name[a],count[a]); + + for (n_pred=0,b=pred_start; b%-15s %10s %c ", S->name[a], A->name[b], (pred_start==0)?S->seq_al[a][b]:'?'); + if (jack || b==pred_start) + { + for (m=0; mseq_al[m]); + + for (c=ref_start;cseq_al[a][c]=='O') + { + Nbsim=MAX(Nbsim,sim[m][b][c]); + } + else + { + Ybsim=MAX(Ybsim,sim[m][b][c]); + } + } + + bsim=(Ybsim>Nbsim)?Ybsim:-Nbsim; + pred=(bsim>0)?1:0; + real=(S->seq_al[a][b]=='O')?0:1; + comp_results[m][pred][real]++; + glob_results[m][pred][real]++; + score+=pred; + prediction[m]=pred+'0'; + reliability[m]=(FABS((Ybsim-Nbsim))-1)/10+'0'; + } + + if ( score>0)n_pred++; + prediction[m]=reliability[m]='\0'; + if (display==NONCSV)fprintf ( stdout, "Compound_Count:%d primary_predictions: %s Total: %d", count[a],prediction, score); + else if ( display==CSV)fprintf ( stdout, ";%d", score); + for (t=0; tt) + { + comp_results[t+n_methods][1][real]++; + glob_results[t+n_methods][1][real]++; + } + else + { + comp_results[t+n_methods][0][real]++; + glob_results[t+n_methods][0][real]++; + } + } + } + if ( display==NONCSV) + {if ( pred_start==0)display_prediction (comp_results, S,a, n_methods*2);} + else fprintf (stdout, ";%d;",n_pred); + } + if ( display==NONCSV)if (pred_start==0)display_prediction (glob_results, S,-1, n_methods*2); + + + myexit (EXIT_SUCCESS); +} +float display_prediction (int ***count, Alignment *S, int c, int n) +{ + float tp,tn,fn,fp,sp,sn,sn2; + int a, nm; + + nm=n/2; + + for (a=0; a#Method %d Compound %15s sp=%.2f sn=%.2f sn2=%.2f",a, (c==-1)?"TOTAL":S->name[c],sp, sn, sn2 ); + else fprintf ( stdout, "\n>#Combined: T=%d Compound %15s sp=%.2f sn=%.2f sn2=%.2f",a-nm, (c==-1)?"TOTAL":S->name[c],sp, sn, sn2 ); + } + fprintf ( stdout, "\n"); + return 0; +} + +float display_prediction_2 (int **prediction, int n,Alignment *A, Alignment *S, int field) +{ + int a, t, T; + float max_sn, max_sp; + + if ( field==17 || field ==18) + { + printf_exit ( EXIT_FAILURE, stderr, "\nERROR: Do not use filed %d in display_prediction", field); + } + + sort_int_inv ( prediction, 10,field, 0, n-1); + for (t=0,a=0; a=0; a--) + { + prediction[a][18]=t; + t+=prediction[a][3]; + } + + max_sn=max_sp=T=0; + for (a=0; a0.8) + { + if (sn>max_sn) + { + max_sn=sn; + max_sp=sp; + + T=prediction[a][field]; + } + } + } + if (max_sn==0) + fprintf (stdout, "\n T =%d SN=%.2f SP= %.2f",T,max_sn,max_sp); + else + fprintf (stdout, "\n T =%d SN=%.2f SP= %.2f",T,max_sn,max_sp); + + return max_sn; +} + + +/************************************************************************************/ +/* NEW ANALYZE : SAR */ +/************************************************************************************/ +float** cache2pred1 (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode); +float** cache2pred2 (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode); +float** cache2pred3 (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode); +float** cache2pred4 (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode); +float** cache2pred5 (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode); +float** cache2pred_new (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode); + +int **sar2cache_adriana ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode); +int **sar2cache_proba_old ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode); +int **sar2cache_count1 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode); +int **sar2cache_count2 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode); +int **sar2cache_count3 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode); + +int **sar2cache_proba_new ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode); +int **sar2cache_proba2 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode); +int **analyze_sar_compound1 ( char *name, char *seq, Alignment *A, char *mode); +int **analyze_sar_compound2 ( char *name, char *seq, Alignment *A, char *mode); + +int aln2n_comp_col ( Alignment *A, Alignment *S, int ci); + + + + +int ***simple_sar_analyze_vot ( Alignment *inA, Alignment *SAR, char *mode); +int ***simple_sar_analyze_col ( Alignment *inA, Alignment *SAR, char *mode); + + + +int sarset2subsarset ( Alignment *A, Alignment *S, Alignment **subA, Alignment **subS, Alignment *SUB); +int benchmark_sar (int v); +int aln2jack_group1 (Alignment *A, int seq, int **l1, int *nl1, int **l2, int *nl2); +int aln2jack_group2 (Alignment *A, int seq, int **l1, int *nl1, int **l2, int *nl2); +int aln2jack_group3 (Alignment *A, char *sar_seq, int **l1, int *nl1, int **l2, int *nl2); +float** jacknife5 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode); +float** jacknife6 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode); + +int process_cache ( Alignment *A,Alignment *S, int ***Cache, char *mode); +Alignment *analyze_compounds (Alignment *A, Alignment *S, char *mode); + +Alignment *analyze_compounds (Alignment *A, Alignment *S, char *mode) +{ + int a, b, c, tot, n; + int **sim; + int sar1, sar2; + + sim=aln2sim_mat (A, "idmat"); + for (a=0; a< S->nseq; a++) + { + for (n=0, tot=0, b=0; b< A->nseq-1; b++) + { + sar1=(S->seq_al[a][b]=='I')?1:0; + for ( c=b+1; cnseq; c++) + { + sar2=(S->seq_al[a][c]=='I')?1:0; + + if (sar1 && sar2) + { + tot+=sim[b][c]; + n++; + } + } + } + fprintf ( stdout, ">%-10s CMPSIM: %.2f\n", S->name[a],(float)tot/(float)n); + } + free_int (sim, -1); + return A; +} + +int print_seq_pos ( int pos, Alignment *A, char *seq); +int abl1_evaluation (int p); +int print_seq_pos ( int pos, Alignment *A, char *seq) +{ + int a, b, s; + + s=name_is_in_list (seq, A->name, A->nseq, MAXNAMES); + fprintf ( stdout, "S=%d", s); + + for (b=0,a=0; aseq_al[s][a]))b++; + } + fprintf ( stdout, "Pos %d SEQ %s: %d ", pos+1, seq, b+246); + if ( strm ( seq, "ABL1")) fprintf ( stdout , "PT: %d", abl1_evaluation (b+246)); + return 0; +} + +int process_cache ( Alignment *A,Alignment *S, int ***Cache, char *mode) +{ + int a, b; + int **pos, **pos2; + int **C; + int ab1, *ab1_pos; + int weight_mode; + + strget_param ( mode, "_WEIGHT_", "1", "%d", &weight_mode); + pos=declare_int(A->len_aln+1,2); + pos2=declare_int (A->len_aln+1,S->nseq); + for (a=0; anseq; a++) + { + C=Cache[a]; + for (b=0; b< A->len_aln; b++) + { + pos[b][0]+=C[26][b]; + if ( C[26][b]>0) + { + pos[b][1]++; + pos2[b][a]=1; + } + } + } + + C=Cache[0]; + ab1=name_is_in_list ("ABL1", A->name, A->nseq,100); + ab1_pos=vcalloc (A->len_aln+1, sizeof (int)); + + for ( b=0,a=0; a< A->len_aln; a++) + { + if ( A->seq_al[ab1][a]=='-')ab1_pos[a]=-1; + else ab1_pos[a]=++b; + } + + for ( a=0; a< A->len_aln; a++) + { + fprintf ( stdout, "\n%4d %5d %5d %5d [%c] [%2d] ALN", a+1, pos[a][0], pos[a][1], ab1_pos[a]+246,A->seq_al[ab1][a],abl1_evaluation (ab1_pos[a]+246)); + for ( b=0; b< S->nseq; b++)fprintf ( stdout, "%d", pos2[a][b]); + } + return 1; +} +int abl1_evaluation (int p) +{ + if ( p==248) return 10; + if ( p==250) return 10; + if ( p==253) return 10; + if ( p==254) return 10; + if ( p==255) return 9; + if ( p==256) return 10; + if ( p==257) return 5; + if ( p==258) return 8; + if ( p==269) return 8; + if ( p==291) return 4; + if ( p==294) return 8; + if ( p==299) return 10; + if ( p==306) return 0; + if ( p==314) return 9; + if ( p==315) return 10; + if ( p==318) return 10; + + if ( p==319) return 10; + if ( p==321) return 10; + if ( p==323) return 0; + if ( p==324) return 0; + if ( p==339) return 0; + if ( p==340) return 0; + if ( p==355) return 5; + if ( p==364) return 10; + + if ( p==366) return 0; + if ( p==368) return 10; + if ( p==370) return 10; + if ( p==372) return 0; + if ( p==378) return 8; + if ( p==382) return 10; + + if ( p==384) return 10; + if ( p==387) return 10; + if ( p==395) return 8; + + if ( p==398) return 8; + if ( p==399) return 8; + if ( p==400) return 8; + if ( p==403) return 0; + if ( p==416) return 8; + if ( p==419) return 5; + if ( p>400) return 0; + return -1; +} +float** cache2pred1 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode) +{ + int s1, s2, seq1, seq2, r1, r2,col, pred, real, ci; + double score, max, id, m; + float **R, T; + + + int used_col, used_res,is_used_col, n_res=0; + int weight_mode; + /*Predict on ns[1] what was trained on ns[0]*/ + + strget_param ( mode, "_THR_", "0.09", "%f", &T); + strget_param ( mode, "_WEIGHT_", "0", "%d", &weight_mode); + + R=declare_float (2, 2); + ci=name_is_in_list ( compound, S->name, S->nseq, -1); + + + + for (s1=0; s1len_aln; col++) + { + int max1; + r1=tolower (A->seq_al[seq1][col]); + for (max1=0,id=0, m=0,s2=0; s2seq_al[ci][seq2]=='O')continue; + if ( cache[seq2][col]==0 && !is_gap( A->seq_al[seq2][col]))continue; + + r2=tolower ( A->seq_al[seq2][col]); + if ( is_gap(r2))continue; + + v=(cache[seq2][col]>0 && weight_mode==1)?cache[seq2][col]:1; + + max+=v; + if ( r2==r1) + { + score+=v; + } + + } + + } + pred=(( score/max) >T)?1:0; + real=(S->seq_al[ci][seq1]=='I')?1:0; + R[pred][real]++; + + fprintf ( stdout, "\n>%s %d%d SCORE %.2f C %s [SEQ]\n", A->name[seq1],real, pred, (float)score/(float)max, compound); + } + + for (used_col=0,used_res=0,col=0; collen_aln; col++) + { + for (is_used_col=0,s2=0; s2seq_al[seq2][col]))n_res++; + else if (is_gap(A->seq_al[seq2][col])); + else + { + is_used_col=1; + used_res++; + } + } + used_col+=is_used_col; + } + fprintf ( stdout, "\n>%s USED_POSITIONS: COL: %.2f RES: %.2f COMP\n", S->name[ci], (float)used_col/(float)A->len_aln, (float)used_res/(float) n_res); + + return R; +} + +float** cache2pred2 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode) +{ + int s1, s2, seq1, seq2, r1, r2,col, pred, real, ci; + double score, max; + float **R, T; + + + int used_col, used_res,is_used_col, n_res=0; + /*Predict on ns[1] what was trained on ns[0]*/ + + strget_param ( mode, "_THR_", "0.5", "%f", &T); + + + R=declare_float (2, 2); + ci=name_is_in_list ( compound, S->name, S->nseq, -1); + + for (s1=0; s1len_aln; col++) + { + int used; + + r1=tolower (A->seq_al[seq1][col]); + for (used=0,s2=0; s2seq_al[ci][seq2]=='O')continue; + if ( cache[seq2][col]==0 && !is_gap( A->seq_al[seq2][col]))continue; + + + r2=tolower ( A->seq_al[seq2][col]); + if ( is_gap(r2))continue; + + v=cache[seq2][col]; + if ( r2==r1){score+=v;} + used=1; + max+=v; + } + if (used) fprintf ( stdout, "%c", r1); + } + + pred=(( score/max) >T)?1:0; + real=(S->seq_al[ci][seq1]=='I')?1:0; + R[pred][real]++; + + fprintf ( stdout, "PSEQ: %-10s SC: %4d MAX: %4d S: %.2f R: %4d", A->name[seq1],(int)score, (int)max, (float)score/max,real); + + } + + for (used_col=0,used_res=0,col=0; collen_aln; col++) + { + for (is_used_col=0,s2=0; s2seq_al[seq2][col]))n_res++; + else if (is_gap(A->seq_al[seq2][col])); + else + { + is_used_col=1; + used_res++; + } + } + used_col+=is_used_col; + } + fprintf ( stdout, "\n>%s USED_POSITIONS: COL: %.2f RES: %.2f COMP\n", S->name[ci], (float)used_col/(float)A->len_aln, (float)used_res/(float) n_res); + + return R; +} + +float** cache2pred3 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode) +{ + int s1, s2, seq1, seq2, r1, r2,col, pred, real, ci, a, n; + double score, max; + float **R, T; + + + + int tp, tn, fn, fp; + int best_tp, best_fp; + int delta, best_delta; + int **list; + + /*Predict on ns[1] what was trained on ns[0]*/ + + strget_param ( mode, "_THR_", "0.5", "%f", &T); + + + R=declare_float (2, 2); + ci=name_is_in_list ( compound, S->name, S->nseq, -1); + list=declare_int ( ns[1],3); + + for (s1=0; s1len_aln; col++) + { + int used; + + r1=tolower (A->seq_al[seq1][col]); + for (used=0,s2=0; s2seq_al[ci][seq2]=='O')continue; + if ( cache[seq2][col]==0 && !is_gap( A->seq_al[seq2][col]))continue; + + + r2=tolower ( A->seq_al[seq2][col]); + if ( is_gap(r2))continue; + + v=cache[seq2][col]; + if ( r2==r1){score+=v;} + used=1; + max+=v; + } + } + + + + pred=(( score/max) >T)?1:0; + real=(S->seq_al[ci][seq1]=='I')?1:0; + + list[s1][0]=real; + list[s1][1]=(int)((score/max)*(float)1000); + list[s1][2]=seq1; + + + + } + sort_int_inv (list, 3, 1, 0, ns[1]-1); + + for ( a=0; aname[seq1],list[a][0], list[a][1]); + } + + for (n=0, a=0; alen_aln, sizeof (int)); + for (a=0; a< A->len_aln; a++) + for ( b=0; b< A->nseq; b++) + if ( cache[b][a])ul[nused++]=a; + + /*compute the similarity on the used columns*/ + + R=declare_float (2, 2); + sim=declare_int (A->nseq, A->nseq); + for (a=0; a< A->nseq; a++) + for ( b=0; b< A->nseq; b++) + { + for (c=0; c< nused; c++) + { + if ( A->seq_al[a][ul[c]]==A->seq_al[b][ul[c]])sim[a][b]++; + } + sim[a][b]=(sim[a][b]*100)/nused; + } + vfree (ul); + + + + + ci=name_is_in_list ( compound, S->name, S->nseq, -1); + list=declare_int ( ns[1],2); + + for (s1=0; s1seq_al[ci][seq2]=='I')score=MAX(score, sim[seq1][seq2]); + } + list[s1][0]=(S->seq_al[ci][seq1]=='I')?1:0; + list[s1][1]=(int)score; + + } + sort_int_inv (list, 2, 1, 0, ns[1]-1); + + for (n=0, a=0; aname, S->nseq, -1); + list=declare_int ( ns[1],2); + + for (s1=0; s1seq_al[ci][seq2]=='I')score=MAX(score, sim[seq1][seq2]); + } + list[s1][0]=(S->seq_al[ci][seq1]=='I')?1:0; + list[s1][1]=(int)score; + + } + sort_int_inv (list, 2, 1, 0, ns[1]-1); + + for (n=0, a=0; aname, S->nseq, -1); + list=declare_int (A->nseq,2); + R=declare_float (2, 2); + + + for ( a=0; anseq; a++) + { + int real, res; + + ns[0]=A->nseq-1; + ns[1]=1; + for (c=0,b=0; bnseq; b++) + if (a!=b)ls[0][c++]=b; + ls[1][0]=a; + + + cache=sar2cache_count1 (A, ns, ls,S, compound, mode); + for (b=0; b<=26; b++) + for ( c=0; c< A->len_aln; c++) + cacheIN[b][c]+=cache[b][c]; + + seq1=a; + real=(S->seq_al[ci][seq1]=='I')?1:0; + fprintf ( stdout, ">%-10s %d ", A->name[seq1], real); + + + + for (max_score=0,b=0; blen_aln; b++) + max_score+=cache[26][b]; + + for (score=0,b=0; blen_aln; b++) + { + res=tolower (A->seq_al[seq1][b]); + if ( cache[26][b]==0) continue; + if ( !is_gap(res)) + { + score+=cache[res-'a'][b]; + } + /*fprintf ( stdout, "%c[%3d]", res,b);*/ + } + fprintf ( stdout, " SCORE: %5d SPRED %d RATIO: %.2f \n", (int)score, a, (score*100)/max_score); + list[a][0]=real; + + if ( strstr (mode, "SIMTEST"))list[a][1]=(score*100)/max_score; + else list[a][1]=(score*100)/max_score; + free_int (cache, -1); + } + + + sort_int_inv (list, 2, 1, 0, A->nseq-1); + for (n=0, a=0; anseq; a++) + { + n+=list[a][0]; + } + + for (best_delta=100000,best_tp=0,tp=0,fp=0,best_fp=0,a=0; anseq; a++) + { + + tp+=list[a][0]; + fp+=1-list[a][0]; + delta=(n-(tp+fp)); + if (FABS(delta)nseq-(tp+fp+fn); + R[1][1]=tp; + R[1][0]=fp; + R[0][1]=fn; + R[0][0]=tn; + free_int (list, -1); + + return R; +} +float** jacknife6 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode) +{ + int seq1, ci, a,b, c,d,e,f, n; + double score; + float **R; + + + int tp, tn, fn, fp; + int best_tp, best_fp; + int delta, best_delta; + int **list; + + ci=name_is_in_list ( compound, S->name, S->nseq, -1); + list=declare_int (A->len_aln,2); + R=declare_float (2, 2); + + + for ( a=0; anseq; a++) + { + int sar, res; + int **new_cache; + + ns[0]=A->nseq-1; + ns[1]=1; + for (c=0,b=0; bnseq; b++) + if (a!=b)ls[0][c++]=b; + ls[1][0]=a; + + cache=sar2cache_proba_new (A, ns, ls,S, compound, mode); + + + new_cache=declare_int (27,A->len_aln); + + for (d=0; d< A->len_aln; d++) + { + int **analyze; + if ( cache[26][d]==0)continue; + analyze=declare_int (26, 2); + + for ( e=0; e< ns[0]; e++) + { + f=ls[0][e]; + sar=(S->seq_al[ci][f]=='I')?1:0; + res=tolower (A->seq_al[f][d]); + + if ( res=='-') continue; + analyze[res-'a'][sar]++; + } + for (e=0;e<26; e++) + { + if ( analyze[e][1]){new_cache[26][d]=1;new_cache[e][d]+=cache[e][d];} + /* + if ( analyze[e][0] && analyze[e][1]){new_cache[26][d]=1;new_cache[e][d]+=analyze[e][1];} + else if ( analyze[e][0]){new_cache[26][d]=1;new_cache[e][d]-=analyze[e][0]*10;} + else if ( analyze[e][1]){new_cache[26][d]=1;new_cache[e][d]+=analyze[e][1];} + else if ( !analyze[e][0] &&!analyze[e][1]); + */ + } + free_int (analyze, -1); + } + + seq1=a; + sar=(S->seq_al[ci][seq1]=='I')?1:0; + fprintf ( stdout, ">%-10s %d ", A->name[seq1], sar); + + for (score=0,b=0; blen_aln; b++) + { + res=tolower (A->seq_al[seq1][b]); + if ( cache[26][b]==0) continue; + if ( !is_gap(res)) + { + score+=new_cache[res-'a'][b]; + } + } + fprintf ( stdout, " SCORE: %5d SPRED\n", (int)score); + list[seq1][0]=sar; + list[seq1][1]=(int)score; + + free_int (new_cache, -1); + free_int (cache, -1); + } + sort_int_inv (list, 2, 1, 0, A->nseq-1); + for (n=0, a=0; anseq; a++)n+=list[a][0]; + for (best_delta=100000,best_tp=0,tp=0,fp=0,best_fp=0,a=0; anseq; a++) + { + tp+=list[a][0]; + fp+=1-list[a][0]; + delta=(n-(tp+fp)); + if (FABS(delta)nseq-(tp+fp+fn); + R[1][1]=tp; + R[1][0]=fp; + R[0][1]=fn; + R[0][0]=tn; + free_int (list, -1); + + + return R; +} +float** cache2pred_new (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode) +{ + int s1, seq1, ci, a,b, n; + double score; + float **R; + + + int tp, tn, fn, fp; + int best_tp, best_fp; + int delta, best_delta; + int **list; + + ci=name_is_in_list ( compound, S->name, S->nseq, -1); + list=declare_int ( ns[1],2); + R=declare_float (2, 2); + + for (s1=0; s1seq_al[ci][seq1]=='I')?1:0; + fprintf ( stdout, ">%-10s %d ", A->name[seq1], real); + for (score=0,b=0; blen_aln; b++) + { + res=tolower (A->seq_al[seq1][b]); + if ( cache[26][b]==0) continue; + if ( !is_gap(res)) + { + score+=cache[res-'a'][b]; + } + fprintf ( stdout, "%c", res); + } + fprintf ( stdout, " SCORE: %5d SPRED\n", (int)score); + list[s1][0]=real; + list[s1][1]=(int)score; + } + + sort_int_inv (list, 2, 1, 0, ns[1]-1); + + for (n=0, a=0; aname, S->nseq, -1); + list=declare_int ( ns[1],2); + R=declare_float (2, 2); + + for (s1=0; s1seq_al[ci][seq1]=='I')?1:0; + fprintf ( stdout, ">%-10s %d ", A->name[seq1], real); + for (score=0,b=0; blen_aln; b++) + { + res=tolower (A->seq_al[seq1][b]); + if ( cache[26][b]==0) continue; + if ( !is_gap(res)) + { + score+=cache[res-'a'][b]; + } + fprintf ( stdout, "%c", res); + } + fprintf ( stdout, " SCORE: %5d SPRED\n", (int)score); + list[s1][0]=real; + list[s1][1]=(int)score; + } + new_cache=declare_int (27,A->len_aln); + for (a=0; a< A->len_aln; a++) + { + int **analyze, real, res, d; + int *res_type; + int **sub; + int *keep; + keep=vcalloc ( 26, sizeof (int)); + res_type=vcalloc ( 26, sizeof (int)); + sub=declare_int (256, 2); + + if ( cache[26][a]==0)continue; + analyze=declare_int (26, 2); + for ( b=0; b< ns[0]; b++) + { + seq1=ls[0][b]; + real=(S->seq_al[ci][seq1]=='I')?1:0; + res=tolower (A->seq_al[seq1][a]); + + if ( res=='-') continue; + analyze[res-'a'][real]++; + } + fprintf ( stdout, "RSPRED: "); + for (c=0;c<26; c++)fprintf ( stdout, "%c", c+'a'); + fprintf ( stdout, "\nRSPRED: "); + for (c=0;c<26; c++) + { + if ( analyze[c][0] && analyze[c][1]){fprintf ( stdout, "1");res_type[c]='1';} + else if ( analyze[c][0]){new_cache[26][a]=1;new_cache[c][a]-=analyze[c][0];fprintf ( stdout, "0");res_type[c]='0';} + else if ( analyze[c][1]){new_cache[26][a]=1;new_cache[c][a]+=analyze[c][1];fprintf ( stdout, "1");res_type[c]='1';} + else if ( !analyze[c][0] &&!analyze[c][1]){fprintf ( stdout, "-");res_type[c]='-';} + } + + + for ( c=0; c<26; c++) + { + for ( d=0; d<26; d++) + { + + if ( res_type[c]==res_type[d]) + { + sub[res_type[c]][0]+=mat[c][d]; + sub[res_type[c]][1]++; + } + if ( res_type[c]!='-' && res_type[d]!='-') + { + sub['m'][0]+=mat[c][d]; + sub['m'][1]++; + } + } + } + for ( c=0; c< 256; c++) + { + if ( sub[c][1])fprintf ( stdout, " %c: %5.2f ", c, (float)sub[c][0]/(float)sub[c][1]); + } + fprintf ( stdout, " SC: %d\nRSPRED ", cache[26][a]); + + for ( c=0; c<26; c++) + if ( res_type[c]=='1') + { + for (d=0; d<26; d++) + if (mat[c][d]>0)keep[d]++; + keep[c]=9; + } + + for (c=0; c<26; c++) + { + if ( keep[c]>10)fprintf ( stdout, "9"); + else fprintf ( stdout, "%d", keep[c]); + } + for ( c=0; c<26; c++) + { + if ( keep[c]>8)new_cache[c][a]=10; + else new_cache[c][a]=-10; + } + fprintf ( stdout, "\n"); + free_int (analyze, -1); + free_int (sub, -1); + vfree (res_type); + vfree (keep); + + } + for ( a=0; a<25; a++) + for (b=a+1; b<26; b++) + { + int r1, r2; + r1=a+'a';r2=b+'a'; + if ( strchr("bjoxz", r1))continue; + if ( strchr("bjoxz",r2))continue; + + if ( mat[a][b]>0 && a!=b)fprintf ( stdout, "\nMATANALYZE %c %c %d", a+'a', b+'a', mat[a][b]); + } + + for (s1=0; s1seq_al[ci][seq1]=='I')?1:0; + fprintf ( stdout, ">%-10s %d ", A->name[seq1], real); + for (score=0,b=0; blen_aln; b++) + { + res=tolower (A->seq_al[seq1][b]); + if ( cache[26][b]==0) continue; + if ( !is_gap(res)) + { + score+=new_cache[res-'a'][b]; + } + fprintf ( stdout, "%c", res); + } + fprintf ( stdout, " SCORE: %5d SPRED\n", (int)score); + list[s1][0]=real; + list[s1][1]=(int)score; + } + free_int (new_cache, -1); + sort_int_inv (list, 2, 1, 0, ns[1]-1); + + + for (n=0, a=0; aname, S->nseq, -1); + cache=declare_int (A->nseq, A->len_aln); + + strget_param ( mode, "_FILTER1_", "0" , "%f", &T1); + strget_param ( mode, "_FILTER2_", "1000000", "%f", &T2); + strget_param ( mode, "_FILTER3_", "0" , "%f", &T3); + strget_param ( mode, "_FILTER4_", "1000000", "%f", &T4); + strget_param ( mode, "_SIMWEIGHT_", "1", "%d", &sim_weight); + strget_param ( mode, "_SWTHR_", "30", "%d", &sw_thr); + strget_param (mode, "_TRAIN_","1", "%d", &train_mode); + strget_param (mode, "_ZSCORE_","0", "%f", &zscore); + + + + + + if (sim_weight==1 && !sim) sim=aln2sim_mat(A, "idmat"); + for ( ms=0; msseq_al[ci][mseq]!='I')continue; + + list=declare_int (A->len_aln+1, nfield); + for (t=0,n=0, col=0; col< A->len_aln; col++) + { + int same_res; + + mres=tolower(A->seq_al[mseq][col]); + list[col][RES]=mres; + list[col][COL_INDEX]=col; + + if ( is_gap(mres))continue; + for ( s=0; sseq_al[seq][col]); + if (is_gap(res))continue; + + + if (sim_weight==1) + { + w=sim[seq][mseq];w=(mres==res)?100-w:w; + if (wseq_al[ci][seq]=='I')same_res=1; + else same_res=(res==mres)?1:0; + } + else + same_res=(res==mres)?1:0; + + list[col][N]+=w; + + if (S->seq_al[ci][seq]=='I' && same_res)list[col][N11]+=w; + else if (S->seq_al[ci][seq]=='I' && same_res)list[col][N10]+=w; + else if (S->seq_al[ci][seq]=='O' && same_res)list[col][N01]+=w; + else if (S->seq_al[ci][seq]=='O' && same_res)list[col][N00]+=w; + + if ( S->seq_al[ci][seq]=='I')list[col][N1sar]+=w; + if ( same_res)list[col][N1msa]+=w; + + } + + list[col][SCORE]=(int)evaluate_sar_score1 (list[col][N], list[col][N11], list[col][N1msa], list[col][N1sar]); + + } + + strget_param ( mode, "_MAXN1_", "5", "%d", &maxn1); + strget_param ( mode, "_WEIGHT_", "1", "%d", &weight_mode); + strget_param ( mode, "_QUANT_", "0.0", "%f", &quant); + + sort_int_inv (list,nfield,SCORE,0,A->len_aln-1); + if ( quant !=0) + { + + n=quantile_rank ( list,SCORE, A->len_aln,quant); + sort_int (list,nfield,N1msa, 0, n-1); + maxn1=MIN(n,maxn1); + } + + for (a=0; aT1 && valuelen_aln+1, nfield); + for ( col=0; col< A->len_aln; col++) + { + list[col][COL_INDEX]=col; + for ( s=0; slen_aln; col++) + if (list[col][SCORE]T4) + { + list[col][SCORE]=0; + for (s=0; s< A->nseq; s++) + if (!is_gap(A->seq_al[s][col]))cache[s][col]=0; + } + + /*Keep The N Best Columns*/ + if ( zscore!=0) + { + double sum=0, sum2=0, z; + int n=0; + for (a=0; a< A->len_aln; a++) + { + if ( list[a][SCORE]>0) + { + sum+=list[a][SCORE]; + sum2+=list[a][SCORE]*list[a][SCORE]; + n++; + } + } + for (a=0; alen_aln; a++) + { + if ( list[a][SCORE]>0) + { + z=return_z_score (list[a][SCORE], sum, sum2,n); + if ((float)znseq; s++) + cache [s][col]=0; + } + else + { + fprintf ( stdout, "\nZSCORE: KEEP COL %d SCORE: %f SCORE: %d\n", list[a][COL_INDEX], (float)z, list[a][SCORE]); + } + } + } + } + else + { + sort_int_inv (list,nfield,SCORE,0,A->len_aln-1); + strget_param ( mode, "_MAXN2_", "100000", "%d", &maxn2); + + for (a=maxn2;alen_aln; a++) + { + col=list[a][COL_INDEX]; + for (s=0; snseq; s++) + cache [s][col]=0; + } + } + + /*Get Rid of the N best Columns*/; + strget_param ( mode, "_MAXN3_", "0", "%d", &maxn3); + + for (a=0; anseq; s++) + cache [s][col]=0; + } + + return cache; +} +int aln2n_comp_col ( Alignment *A, Alignment *S, int ci) +{ + int res, seq,sar, col, r; + int **analyze; + + int tot=0; + + analyze=declare_int (27, 2); + for ( col=0; col< A->len_aln; col++) + { + int n1, n0; + + + for ( n1=0, n0=0,seq=0; seqnseq; seq++) + { + res=tolower(A->seq_al[seq][col]); + sar=(S->seq_al[ci][seq]=='I')?1:0; + n1+=(sar==1)?1:0; + n0+=(sar==0)?1:0; + if ( res=='-')continue; + res-='a'; + analyze[res][sar]++; + } + + for (r=0; r<26; r++) + { + int a0,a1; + a0=analyze[r][0]; + a1=analyze[r][1]; + + + if ( a1==n1 && a0len_aln, 2); + cache=declare_int ( 27, A->len_aln); + analyze=declare_int (27, 2); + + ci=name_is_in_list ( compound, S->name, S->nseq, -1); + + for ( col=0; col< A->len_aln; col++) + { + int n1, n0; + + + for ( n1=0, n0=0,s=0; sseq_al[seq][col]); + sar=(S->seq_al[ci][seq]=='I')?1:0; + n1+=(sar==1)?1:0; + n0+=(sar==0)?1:0; + if ( res=='-')continue; + res-='a'; + + analyze[res][sar]++; + } + + for (r=0; r<26; r++) + { + + a0=analyze[r][0]; + a1=analyze[r][1]; + + if ( strstr (mode, "SIMTEST")) + { + w=a1; + } + else if (a1 ) + { + w=n0-a0; + } + else w=0; + + cache[r][col]+=w; + cache[26][col]=MAX(w, cache[26][col]); + } + + for ( r=0; r<26; r++)analyze[r][0]=analyze[r][1]=0; + list[col][0]=col; + list[col][1]=cache[26][col]; + } + + free_int (analyze, -1); + + sort_int_inv (list, 2, 1, 0, A->len_aln-1); + + strget_param ( mode, "_MAXN2_", "100000", "%d", &maxn2); + + for ( col=maxn2; collen_aln; col++) + for ( r=0; r<=26; r++)cache[r][list[col][0]]=0; + + free_int (list, -1); + return cache; +} + + +int **sar2cache_count2 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode) +{ + int maxn2, res, seq,sar, ci, col,s, r; + int **analyze, **list, **cache, **conseq; + static int **mat; + int w=0; + if (!mat) mat=read_matrice ("blosum62mt"); + + + list=declare_int ( A->len_aln, 2); + cache=declare_int ( 27, A->len_aln); + conseq=declare_int ( A->len_aln,3); + + analyze=declare_int (27, 2); + + ci=name_is_in_list ( compound, S->name, S->nseq, -1); + for ( col=0; col< A->len_aln; col++) + { + int n1, n0; + + for ( n1=0, n0=0,s=0; sseq_al[seq][col]); + sar=(S->seq_al[ci][seq]=='I')?1:0; + n1+=(sar==1)?1:0; + n0+=(sar==0)?1:0; + if ( res=='-')continue; + res-='a'; + analyze[res][sar]++; + } + for (r=0; r<26; r++) + { + int a0,a1; + a0=analyze[r][0]; + a1=analyze[r][1]; + if ( a1==n1 && a0len_aln; col++) + { + + res=tolower(A->seq_al[seq][col]); + if ( is_gap(res))continue; + else res-='a'; + + if ( conseq[col][1] && res!=conseq[col][0])w1++; + if ( conseq[col][1])w2++; + } + for (col=0; collen_aln; col++) + { + res=tolower(A->seq_al[seq][col]); + if ( is_gap(res))continue; + else res-='a'; + + if ( conseq[col][1] && res!=conseq[col][0])conseq[col][2]+=(w2-w1); + } + } + + for (col=0; collen_aln; col++) + { + r=conseq[col][0]; + w=conseq[col][2]; + + + cache[r][col]=cache[26][col]=list[col][1]=w; + list[col][0]=col; + } + sort_int_inv (list, 2, 1, 0, A->len_aln-1); + strget_param ( mode, "_MAXN2_", "100000", "%d", &maxn2); + + for ( col=maxn2; collen_aln; col++) + for ( r=0; r<=26; r++)cache[r][list[col][0]]=0; + + + free_int (list, -1); + return cache; +} + +int **sar2cache_count3 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode) +{ + int maxn2, res, seq,sar, ci, col,s, r, a1, a0, n1, n0; + int **analyze, **list, **cache; + static int **mat; + + if (!mat) mat=read_matrice ("blosum62mt"); + + + list=declare_int ( A->len_aln, 2); + cache=declare_int ( 27, A->len_aln); + analyze=declare_int (27, 2); + + ci=name_is_in_list ( compound, S->name, S->nseq, -1); + + for ( col=0; col< A->len_aln; col++) + { + double e, g; + for ( n1=0, n0=0,s=0; sseq_al[seq][col]); + sar=(S->seq_al[ci][seq]=='I')?1:0; + n1+=(sar==1)?1:0; + n0+=(sar==0)?1:0; + if ( res=='-')continue; + res-='a'; + + analyze[res][sar]++; + } + + /*Gap*/ + for (g=0,r=0; rnseq; r++) + g+=is_gap(A->seq_al[r][col]); + g=(100*g)/A->nseq; + + /*enthropy + for (e=0, r=0; r<26; r++) + { + a0=analyze[r][0]; + a1=analyze[r][1]; + t=a0+a1; + + if (t>0) + e+= t/(double)A->nseq*log(t/(double)A->nseq); + } + e*=-1; + */ + e=0; + if (g>10) continue; + if (e>10) continue; + + if ( strstr ( mode, "SIMTEST")) + { + for (r=0; r<26; r++) + { + + a0=analyze[r][0]; + a1=analyze[r][1]; + + if (a1) + { + cache[r][col]=a1; + cache[26][col]=MAX(cache[26][col],a1); + } + } + } + else + { + + + + for (r=0; r<26; r++) + { + + a0=analyze[r][0]; + a1=analyze[r][1]; + + if (!a1 && a0) + { + cache[r][col]=a0; + cache[26][col]=MAX(cache[26][col],a0); + } + } + } + + for ( r=0; r<26; r++)analyze[r][0]=analyze[r][1]=0; + list[col][0]=col; + list[col][1]=cache[26][col]; + } + + free_int (analyze, -1); + + sort_int_inv (list, 2, 1, 0, A->len_aln-1); + + strget_param ( mode, "_MAXN2_", "100000", "%d", &maxn2); + + for ( col=maxn2; collen_aln; col++) + for ( r=0; r<=26; r++)cache[r][list[col][0]]=0; + + free_int (list, -1); + return cache; +} + + +int **sar2cache_proba_new ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode) +{ + int col, s, seq,ms,mseq, res, mres, res1, n,maxn1, maxn2,maxn3, t, ci, a,w; + + int **list; + + int N1msa,N1sar, N, N11, N10, N01,N00, SCORE, COL_INDEX, RES; + int nfield=0; + int value; + + + int **cache; + static int **sim; + int sw_thr; + float zscore; + + RES=nfield++;COL_INDEX=nfield++;N1msa=nfield++;N1sar=nfield++;N=nfield++;N11=nfield++;N10=nfield++;N01=nfield++;N00=nfield++;SCORE=nfield++; + ci=name_is_in_list ( compound, S->name, S->nseq, -1); + cache=declare_int (27, A->len_aln); + + strget_param ( mode, "_SWTHR_", "30", "%d", &sw_thr); + strget_param (mode, "_ZSCORE_","0", "%f", &zscore); + + + if (!sim)sim=aln2sim_mat(A, "idmat"); + for ( ms=0; msseq_al[ci][mseq]!='I')continue; + + list=declare_int (A->len_aln+1, nfield); + for (t=0,n=0, col=0; col< A->len_aln; col++) + { + int same_res; + + mres=tolower(A->seq_al[mseq][col]); + if ( is_gap(mres))continue; + + list[col][RES]=mres; + list[col][COL_INDEX]=col; + + for ( s=0; sseq_al[seq][col]); + if (is_gap(res))continue; + w=sim[seq][mseq];w=(mres==res)?100-w:w; + if (wseq_al[ci][seq]=='I' && same_res)list[col][N11]+=w; + else if (S->seq_al[ci][seq]=='I' && same_res)list[col][N10]+=w; + else if (S->seq_al[ci][seq]=='O' && same_res)list[col][N01]+=w; + else if (S->seq_al[ci][seq]=='O' && same_res)list[col][N00]+=w; + + if ( S->seq_al[ci][seq]=='I')list[col][N1sar]+=w; + if ( same_res)list[col][N1msa]+=w; + + } + + list[col][SCORE]=(int)evaluate_sar_score1 (list[col][N], list[col][N11], list[col][N1msa], list[col][N1sar]); + + } + strget_param ( mode, "_MAXN1_", "5", "%d", &maxn1); + sort_int_inv (list,nfield,SCORE,0,A->len_aln-1); + for (a=0; alen_aln+1, nfield); + for ( col=0; col< A->len_aln; col++) + { + list[col][COL_INDEX]=col; + list[col][SCORE]=cache[26][col]; + } + /*Keep The N Best Columns*/ + if ( zscore!=0) + { + double sum=0, sum2=0, z; + int n=0; + for (a=0; a< A->len_aln; a++) + { + if ( list[a][SCORE]>0) + { + sum+=list[a][SCORE]; + sum2+=list[a][SCORE]*list[a][SCORE]; + n++; + } + } + for (a=0; alen_aln; a++) + { + if ( list[a][SCORE]>0) + { + z=return_z_score (list[a][SCORE], sum, sum2,n); + if ((float)zlen_aln-1); + strget_param ( mode, "_MAXN2_", "100000", "%d", &maxn2); + + for (a=maxn2;alen_aln; a++) + { + col=list[a][COL_INDEX]; + for (s=0; s<27; s++) + cache [s][col]=0; + } + } + + /*Get Rid of the N best Columns*/; + strget_param ( mode, "_MAXN3_", "0", "%d", &maxn3); + + for (a=0; aname, S->nseq, -1); + cache=declare_int (A->nseq, A->len_aln); + + + for ( ms=0; msseq_al[ci][mseq]!='I')continue; + + list=declare_int (A->len_aln+1, 5); + for (t=0,n=0, col=0; col< A->len_aln; col++) + { + mres=tolower(A->seq_al[mseq][col]); + list[col][0]=mres; + list[col][1]=col; + + if ( is_gap(mres))continue; + for ( s=0; sseq_al[seq][col]); + if (is_gap(res))continue; + + if (S->seq_al[ci][seq]=='I' && res==mres)list[col][3]++; + if (res==mres)list[col][2]++; + } + } + + sort_int_inv (list,5,3,0,A->len_aln-1); + + strget_param ( mode, "_MAXN1_", "5", "%d", &maxn1); + strget_param ( mode, "_QUANT_", "0.95", "%f", &quant); + + n=quantile_rank ( list, 3, A->len_aln,quant); + sort_int (list, 5, 2, 0, n-1); + + for (a=0; anseq, A->len_aln); + ci=name_is_in_list ( compound, S->name, S->nseq, -1); + + strget_param ( mode, "_FILTER1_", "0" , "%f", &T1); + strget_param ( mode, "_FILTER2_", "1000000", "%f", &T2); + strget_param ( mode, "_FILTER3_", "0" , "%f", &T3); + strget_param ( mode, "_FILTER4_", "1000000", "%f", &T4); + + list=declare_int (A->len_aln+1,A->nseq+2); + SCORE=A->nseq; + COL=A->nseq+1; + + for ( ms=0; msseq_al[ci][mseq]!='I')continue; + + for (t=0,n=0, col=0; col< A->len_aln; col++) + { + int N11=0,N10=0,N01=0,N00=0,N1sar=0,N1msa=0,N=0; + + mres=tolower(A->seq_al[mseq][col]); + if ( is_gap(mres))continue; + for ( s=0; sseq_al[seq][col]); + if (is_gap(res))continue; + + N++; + if (S->seq_al[ci][seq]=='I' && res==mres)N11++; + else if (S->seq_al[ci][seq]=='I' && res!=mres)N10++; + else if (S->seq_al[ci][seq]=='O' && res==mres)N01++; + else if (S->seq_al[ci][seq]=='O' && res!=mres)N00++; + + if ( S->seq_al[ci][seq]=='I')N1sar++; + if ( res==mres)N1msa++; + } + list[col][mseq]=(int)evaluate_sar_score1 (N,N11,N1msa,N1sar); + list[col][SCORE]+=list[col][mseq]; + list[col][COL]=col; + } + } + + strget_param ( mode, "_MAXN1_", "5", "%d", &maxn1); + strget_param ( mode, "_QUANT_", "0.95", "%f", &quant); + sort_int_inv (list,A->nseq+2,SCORE, 0, A->len_aln-1); + n=quantile_rank ( list,A->nseq, A->len_aln,quant); + n=5; + + + for (a=0; anseq; b++) + { + value=list[col][b]; + if ( value>T1 && valuenseq, 2); + seq_list=declare_int (A->nseq, 2); + for (a=0; a< A->nseq; a++) + { + if (comp[a]=='I') + { + sar_list[nsar][0]=a; + sar_list[nsar][1]=rand()%100000; + nsar++; + } + else + { + seq_list[nseq][0]=a; + seq_list[nseq][1]=rand()%100000; + nseq++; + } + } + + + l1[0]=vcalloc (A->nseq, sizeof (int)); + l2[0]=vcalloc (A->nseq, sizeof (int)); + nl1[0]=nl2[0]=0; + + sort_int (seq_list, 2, 1, 0,nseq-1); + sort_int (sar_list, 2, 1, 0,nsar-1); + mid=nsar/2; + for (a=0; anseq, 2); + l1[0]=vcalloc (A->nseq, sizeof (int)); + l2[0]=vcalloc (A->nseq, sizeof (int)); + nl1[0]=nl2[0]; + + vsrand (0); + for ( a=0; a< A->nseq; a++) + { + list[a][0]=a; + list[a][1]=rand()%100000; + } + sort_int (list, 2, 1, 0,A->nseq-1); + mid=A->nseq/2; + for (a=0; anseq; b++, a++) + { + l2[0][nl2[0]++]=list[b][0]; + } + + free_int (list, -1); + return 1; +} +int aln2jack_group1 (Alignment *A, int seq, int **l1, int *nl1, int **l2, int *nl2) +{ + int **sim; + int **list; + int a, mid; + + list=declare_int ( A->nseq, 3); + l1[0]=vcalloc (A->nseq, sizeof (int)); + l2[0]=vcalloc (A->nseq, sizeof (int)); + nl1[0]=nl2[0]; + + sim=aln2sim_mat (A, "idmat"); + for ( a=0; a< A->nseq; a++) + { + list[a][0]=seq; + list[a][1]=a; + list[a][2]=(a==seq)?100:sim[seq][a]; + } + sort_int_inv (list, 3, 2, 0, A->nseq-1); + fprintf ( stderr, "\nJacknife fromsequence %s [%d]\n", A->name[seq], seq); + mid=A->nseq/2; + for (a=0; a< mid; a++) + l1[0][nl1[0]++]=list[a][1]; + for (a=mid; anseq; a++) + l2[0][nl2[0]++]=list[a][1]; + return 1; +} + + +int sarset2subsarset ( Alignment *A, Alignment *S, Alignment **subA, Alignment **subS, Alignment *SUB) +{ + Alignment *rotS, *intS; + int a,b, *list, nl; + + list=vcalloc ( SUB->nseq, sizeof (int)); + for (nl=0,a=0; anseq; a++) + { + b=name_is_in_list(SUB->name[a], A->name, A->nseq, 100); + if ( b!=-1)list[nl++]=b; + } + + subA[0]=extract_sub_aln (A, nl, list); + rotS=rotate_aln (S, NULL); + intS=extract_sub_aln (rotS, nl, list); + + subS[0]=rotate_aln (intS, NULL); + + for ( a=0; anseq; a++) sprintf ( (subS[0])->name[a], "%s", S->name[a]); + + + return 0; +} + +int ***simple_sar_analyze_vot ( Alignment *A, Alignment *SAR, char *mode) +{ + int a, b, c, d; + int res1, res2, sar1, sar2; + float s; + int **sim; + static float ***result; + static int ***iresult; + if (!result) + { + result=declare_arrayN (3,sizeof (float),SAR->nseq, A->len_aln,3); + iresult=declare_arrayN (3,sizeof (int),SAR->nseq, A->len_aln,3); + } + + sim=aln2sim_mat (A, "idmat"); + + + for (a=0; anseq; a++) + for (b=0; blen_aln; b++) + result[a][b][0]=1; + + for ( a=0; a< SAR->nseq; a++) + for ( b=0; bnseq-1; b++) + for ( c=b+1; c< A->nseq; c++) + for ( d=0; dlen_aln; d++) + { + res1=A->seq_al[b][d]; + res2=A->seq_al[c][d]; + + sar1=(SAR->seq_al[a][b]=='I')?1:0; + sar2=(SAR->seq_al[a][c]=='I')?1:0; + + s=sim[b][c]; + + + + + if ( sar1!=sar2 && res1!=res2) + result[a][d][0]*=(1/(100-s)); + + else if ( sar1==sar2 && sar1==1 && res1==res2) + result[a][d][0]*=1/s; + + + + + /* + else if ( sar1==sar2 && res1==res2)result[a][d][0]+=(100-s)*(100-s); + else if ( sar1==sar2 && res1!=res2)result[a][d][0]-=s*s; + else if ( sar1!=sar2 && res1==res2)result[a][d][0]-=(100-s)*(100-s); + */ + + result[a][d][1]='a'; + } + for ( a=0; anseq; a++) + for ( b=0; blen_aln; b++) + { + fprintf ( stderr, "\n%f", result[a][b][0]); + iresult[a][b][0]=100*log(1-result[a][b][0]); + } + return iresult; +} +int display_simple_sar_analyze_pair_col (Alignment *A, Alignment *SAR, char *mode) +{ + int **r; + int a, b, n, do_tm; + + + Alignment *rA; + int *nI; + + + strget_param (mode, "_TM_", "0", "%d", &do_tm); + r=simple_sar_analyze_pair_col (A, SAR,mode); + rA=rotate_aln (A, NULL); + n=0; + + nI=vcalloc ( SAR->nseq, sizeof (int)); + for (a=0; a< SAR->nseq; a++) + for (b=0; blen_aln; b++) nI[a]+=(SAR->seq_al[a][b]=='I')?1:0; + + + + while ( r[n][0]!=-1) + { + if (r[n][3]>0) + {fprintf ( stdout, "COMP S: %3d %3d %s %20s %2d #\n", r[n][3],0,SAR->seq_al[r[n][0]], SAR->name[r[n][0]], nI[r[n][0]]); + fprintf ( stdout, "SEQ1 S: %3d %3d %s %20s %2d #\n", r[n][3],r[n][1],rA->seq_al[r[n][1]], SAR->name[r[n][0]],nI[r[n][0]]); + fprintf ( stdout, "SEQ2 S: %3d %3d %s %20s %2d #\n\n", r[n][3],r[n][2],rA->seq_al[r[n][2]], SAR->name[r[n][0]],nI[r[n][0]]); + } + n++; + } + return 0; +} +int display_simple_sar_analyze_col (Alignment *A, Alignment *SAR, char *mode) +{ + int ***result, **r2, **r3, **r4, **aa; + int a, b, c, n; + char *cons; + int threshold=20; + int do_tm; + strget_param (mode, "_TM_", "0", "%d", &do_tm); + result=simple_sar_analyze_col (A, SAR,mode); + r2=declare_int (A->len_aln*SAR->nseq, 5); + r3=declare_int (A->len_aln+1, 5); + r4=declare_int (A->len_aln+1, SAR->nseq+1); + aa=declare_int (2, 256); + cons=vcalloc (A->len_aln+1, sizeof (char)); + for (a=0; alen_aln; a++){r3[a][0]=a;cons[a]='A';} + + + + for (n=0,a=0; a< SAR->nseq; a++) + { + double sum, sum2; + for (sum=0, sum2=0,b=0; blen_aln; b++) + { + sum+=result[a][b][0]; + sum2+=result[a][b][0]*result[a][b][0]; + } + + for (b=0; blen_aln; b++, n++) + { + r2[n][0]=a;//compound + r2[n][1]=b;//pos + r2[n][2]=result[a][b][1]; //AA + r2[n][3]=result[a][b][0]; //Score + r2[n][4]=result[a][b][2]; //(int)10*return_z_score ((double)result[a][b][0], sum, sum2, A->len_aln); //ZScore + } + } + sort_int (r2,5, 3, 0, n-1);//sort on Score (3rd field) + for ( a=0; a< n; a++) + { + int comp, pos, bad; + + comp=r2[a][0]; + pos=r2[a][1]; + fprintf ( stdout, "SEQ %5d %5d %5d %s ",r2[a][1]+1,r2[a][3], r2[a][4], (do_tm)?alnpos2hmmtop_pred (A, NULL, r2[a][1], SHORT):"x"); + for (c=0; cnseq; c++)fprintf (stdout, "%c", A->seq_al[c][r2[a][1]]); + + + + bad=0; + for (c=0; c< A->nseq; c++) + { + int activity, res; + + activity=SAR->seq_al[comp][c]; + res=A->seq_al[c][pos]; + + if (activity=='O')aa[0][res]++; + if (activity=='I')aa[1][res]++; + } + + for (c=0; c< A->nseq; c++) + { + int activity, res; + activity=SAR->seq_al[comp][c]; + res=A->seq_al[c][pos]; + bad+=(aa[0][res] && aa[1][res])?1:0; + aa[0][res]=aa[1][res]=0; + } + fprintf ( stdout, " %20s %d |\nCOM %5d %5d %5d %s %s %20s %d |\n\n", SAR->name[r2[a][0]],bad,r2[a][1]+1,r2[a][3],r2[a][4], (do_tm)?alnpos2hmmtop_pred (A, NULL, r2[a][1], SHORT):"x",SAR->seq_al[r2[a][0]], SAR->name[r2[a][0]], bad); + + + if (r2[a][4]>threshold) + { + cons[r2[a][1]]++; + r3[r2[a][1]][1]++; + r3[r2[a][1]][2]+=r2[a][3]; + r4[r2[a][1]][r2[a][0]]=1; + } + } + sort_int (r3, 3,1,0, A->len_aln-1); + + for (a=0; alen_aln; a++) + { + if (r3[a][1]>0) + { + fprintf ( stdout, "\nPOS %4d %4d %4d %c ", r3[a][0]+1, r3[a][1], r3[a][2], cons[r3[a][0]]); + for (b=0; bnseq; b++)fprintf ( stdout, "%d", r4[r3[a][0]][b]); + if (do_tm)fprintf ( stdout, " %s",alnpos2hmmtop_pred (A, NULL, r3[a][0], VERBOSE)); + } + } + for (a=0; a< A->nseq; a++)fprintf ( stdout, "\n#MSA >%s\n#MSA %s",A->name[a], A->seq_al[a]); + fprintf ( stdout, "\n#MSA >cons\n#MSA %s", cons); + + return 0; +} +int *** simple_sar_predict (Alignment *A, Alignment *SAR, char *mode) +{ + //This function estimates the z score of every poition with every compound + //The best Z-score position is then used for the prediction + + int a, b, c, nts, pos, Rscore,Zscore; + int ***r; + int ***pred; + int **aa; + + + aa=declare_int (2,256); + pred=declare_arrayN (3, sizeof (int),SAR->nseq, A->nseq, 5); + + + r=simple_sar_analyze_col (A, SAR, mode); + nts=SAR->len_aln; //number_trainning_sequences; + + for (a=0; anseq; a++) + { + sort_int (r[a],4, 2, 0, A->len_aln-1); + + pos=r[a][A->len_aln-1][3]; //Best Position + Zscore=r[a][A->len_aln-1][2]; //Best Z-Score + Rscore=r[a][A->len_aln-1][0]; //Best Z-Score + + + for (c=0; cseq_al[a][c]=='I')aa[1][(int)A->seq_al[c][pos]]++;//Build Positive Alphabet for Compound a + if (SAR->seq_al[a][c]=='O')aa[0][(int)A->seq_al[c][pos]]++;//Build Positive Alphabet for Compound a + } + for (c=nts; cnseq; c++) + { + pred[a][c][0]=pos; + pred[a][c][1]=Zscore; + pred[a][c][2]=Rscore; + if (aa[1][(int)A->seq_al[c][pos]]>0) + { + + pred[a][c][3]=aa[1][(int)A->seq_al[c][pos]]; + pred[a][c][4]=aa[0][(int)A->seq_al[c][pos]]; + } + } + for (c=0; cseq_al[c][pos]]=aa[1][(int)A->seq_al[c][pos]]=0; + } + + for ( a=nts; a< A->nseq; a++) + { + for ( b=0; bnseq; b++) + { + fprintf ( stdout, ">%-25s %-25s Pos %3d ZScore %3d Rscore %3d Activity +: %d -: %d ", A->name [a], SAR->name[b], pred[b][a][0],pred[b][a][1], pred[b][a][2], pred[b][a][3], pred[b][a][4]); + if (pred[b][a][4]==0)for (c=0; cnseq; c++)fprintf ( stdout, "%c", A->seq_al[c][ pred[b][a][0]]); + fprintf ( stdout, " %s\n", SAR->name[b]); + for (c=0; cnseq-1; c++)fprintf ( stdout, "%c", SAR->seq_al[b][c]); + fprintf ( stdout, " %s\n", SAR->name[b]); fprintf ( stdout, "\n"); + + } + } + return pred; +} +int *pair_seq2seq (int *iseq, char *seq1, char *seq2); +int **simple_sar_analyze_pair_col ( Alignment *inA, Alignment *SAR, char *mode) +{ + + int a, b, c, n, n2; + int *iseq=NULL; + static int **result, **fresult; + int sar_mode=1; + int maxgapratio=0; + int nresults=10; + double sum, sum2, score; + Alignment *A; + char aa; + + if (!result) + { + result=declare_int (inA->len_aln*inA->len_aln,5); + + fresult=declare_int (inA->len_aln*nresults*SAR->nseq, 5); + + } + + A=rotate_aln (inA, NULL); + + + for (n2=0,a=0; anseq; a++) + { + + for (n=0, sum=0, sum2=0,b=0; bnseq-1; b++) + { + for ( c=b+1; cnseq; c++, n++) + { + + iseq=pair_seq2seq (iseq,A->seq_al[b], A->seq_al[c]); + if ( sar_mode==1) + score=sar_vs_iseq1(SAR->seq_al[a],iseq,maxgapratio,NULL,&aa); + else if (sar_mode==4) + score=sar_vs_iseq4(SAR->seq_al[a],iseq,maxgapratio,NULL,&aa); + //HERE ("%d", (int)score); + result[n][0]=a;//compound; + result[n][1]=b; //pos1 + result[n][2]=c; //pos2 + result[n][3]=(int)score; + + sum+=score; + sum2+=score*score; + } + } + for (b=0; bnseq, inA->len_aln,4); + + + sim=aln2sim_mat (inA, "idmat"); + A=rotate_aln (inA, NULL); + + + for ( a=0; anseq; a++) + { + best_pos=best_score=0; + for ( sum=0, sum2=0,b=0; bnseq; b++) + { + + if ( sar_mode==1) + score=sar_vs_seq1(SAR->seq_al[a], A->seq_al[b],maxgapratio, sim, &aa); + else if ( sar_mode==2) + score=sar_vs_seq2(SAR->seq_al[a], A->seq_al[b],maxgapratio, sim, &aa); + else if (sar_mode ==3) + score=sar_vs_seq3(SAR->seq_al[a], A->seq_al[b],maxgapratio, sim, &aa); + else if (sar_mode ==4) + score=sar_vs_seq4(SAR->seq_al[a], A->seq_al[b],maxgapratio, sim, &aa); + else if (sar_mode ==5) + score=sar_vs_seq5(SAR->seq_al[a], A->seq_al[b],maxgapratio, sim, &aa); + + + result[a][b][0]+=score*10; + result[a][b][1]=aa; + result[a][b][3]=b; + sum+=result[a][b][0]; + sum2+=result[a][b][0]*result[a][b][0]; + + } + for ( b=0; b< A->nseq; b++)result[a][b][2]=10*return_z_score ((double)result[a][b][0], sum, sum2, A->nseq); //Score + } + + return result; + + } +int *seq2iseq ( char *seq); +double sar_vs_seq4 ( char *sar, char *seq, float gl, int **sim, char *best_aa) +{ + + return sar_vs_iseq4 (sar, seq2iseq(seq), gl, sim, best_aa); +} +double sar_vs_seq1 ( char *sar, char *seq, float gl, int **sim, char *best_aa) +{ + + return sar_vs_iseq1 (sar, seq2iseq(seq), gl, sim, best_aa); +} + +int *seq2iseq ( char *seq) +{ + static int *iseq, clen; + int a; + + if (!iseq || clengl) return 0; + + if (!aa) + { + aa=vcalloc (256*256, sizeof(int)); + aal=vcalloc (N, sizeof (int)); + } + naa=0; + for ( a=0; areturn_score) + { + best_aa[0]=res; + return_score=score; + } + } + } + for ( a=0; agl) return 0; + + //Identify all the AA associated with a I (Positive alphabet) + aa=vcalloc ( 256, sizeof (int)); + for (b=0; b=1 || N01>=1) return 0; + if (N11) + { + score=evaluate_sar_score1 ( N, N11, Nmsa, Nsar); + } + else score=0; + + vfree (aa); + return score; + +} + + + +double sar_vs_iseq4 ( char *sar, int *seq, float gl, int **sim, char *best_aa) +{ + int N, Ni, No; + int a, b,c, r, s; + double Ng=0; + static int **aa; + + /*Correlation between AA conservation and Activity*/ + + N=strlen (sar); + for (a=0; agl) return 0; + + + if (!aa)aa=declare_int(2,257*257); + for (No=Ni=b=0; bgl) return 0; + + //Identify all the AA associated with a I (Positive alphabet) + aa=vcalloc ( 256, sizeof (int)); + for (b=0; bgl) return 0; + for (a=0; a<26; a++) + { + + N=Nmsa=Nsar=N11=N10=N01=0; + res='a'+a; + for (d=0,b=0; bmax_depth) + { + printf_exit (EXIT_FAILURE, stderr,"maximum depth: %d", max_depth); + } + if ( depth==0) depth=2; + A=declare_aln2 (strlen (S1->seq[0]),depth); + a0=A->seq_al[0]; + a1=A->seq_al[1]; + A->len_aln=strlen (S1->seq[0]); + + for (a=0; a< S1->nseq; a++) + for ( b=0; bnseq; b++) + { + A->nseq=2; + sprintf (a0, "%s", S1->seq[a]); + sprintf (a1, "%s", S2->seq[b]); + + if ( strlen (a0)!=strlen (a1)) + { + add_warning (stderr, "WARNING %s (%d) and %s (%d) do not have the same length", S1->name[a], strlen (S1->seq[a]), S2->name[b], strlen (S2->seq[b])); + myexit (EXIT_FAILURE); + } + + fprintf ( stdout, ">2 %15s %15s CORR: %.3f EVAL: %5d\n",S1->name[a], S2->name[b], sar_aln2cor (A), sar_aln2ev (A)); + sarseq2anti_sarseq (S1->seq[a],a0); + fprintf ( stdout, ">2 %15s %15s ANTI: %.3f EVAL: %5d\n", S1->name[a], S2->name[b], sar_aln2cor (A), sar_aln2ev (A)); + if ( depth >=3) + { + A->nseq=3; + a2=A->seq_al[2]; + for (c=b+1; cnseq; c++) + { + sprintf (a0, "%s", S1->seq[a]); + sprintf (a1, "%s", S2->seq[b]); + sprintf (a2, "%s", S2->seq[c]); + fprintf ( stdout, ">2 %15s %15s %15s CORR: %.3f EVAL: %5d\n",S1->name[a], S2->name[b],S2->name[c], sar_aln2cor (A), sar_aln2ev (A)); + sarseq2anti_sarseq (S1->seq[a],a0); + fprintf ( stdout, ">2 %15s %15s %15s ANTI: %.3f EVAL: %5d\n", S1->name[a], S2->name[b], S2->name[c],sar_aln2cor (A), sar_aln2ev (A)); + } + if ( depth>=4) + { + A->nseq=4; + a3=A->seq_al[2]; + for (d=c+1; dnseq; d++) + { + sprintf (a0, "%s", S1->seq[a]); + sprintf (a1, "%s", S2->seq[b]); + sprintf (a2, "%s", S2->seq[c]); + sprintf (a3, "%s", S2->seq[d]); + + fprintf ( stdout, ">2 %15s %15s %15s %15s CORR: %.3f EVAL: %5d\n",S1->name[a], S2->name[b],S2->name[c],S2->name[d], sar_aln2cor (A), sar_aln2ev (A)); + sarseq2anti_sarseq (S1->seq[a],a0); + fprintf ( stdout, ">2 %15s %15s %15s %15s ANTI: %.3f EVAL: %5d\n", S1->name[a], S2->name[b], S2->name[c],S2->name[d],sar_aln2cor (A), sar_aln2ev (A)); + } + if (depth>=5) + { + A->nseq=5; + a4=A->seq_al[3]; + for (e=d+1; enseq; e++) + { + sprintf (a0, "%s", S1->seq[a]); + sprintf (a1, "%s", S2->seq[b]); + sprintf (a2, "%s", S2->seq[c]); + sprintf (a3, "%s", S2->seq[d]); + sprintf (a4, "%s", S2->seq[d]); + } + } + } + } + } + + return S1; +} +char* sarseq2anti_sarseq (char *seq_in, char *seq_out) +{ + int a; + if (!seq_out)seq_out=vcalloc (strlen (seq_in)+1, sizeof (char)); + for (a=0; anseq-1; a++) + for (b=a+1; bnseq; b++) + { + for (n11=n1=0,c=0; clen_aln; c++) + { + n11+=(A->seq_al[a][c]=='I' && A->seq_al[b][c]=='I'); + n1+= (A->seq_al[a][c]=='I' || A->seq_al[b][c]=='I'); + } + tot_cor+=(n1==0)?0:n11/n1; + n++; + } + tot_cor/=n; + return tot_cor; +} +int sarseq_pair2ev ( char *s1, char *s2,int mode); +int sar_aln2ev (Alignment *A) +{ + float n1, n11; + int a, b, c, tot=0, n=0; + + tot=0; + for (a=0; anseq-1; a++) + for (b=a+1; bnseq; b++) + { + tot+=sarseq_pair2ev (A->seq_al[a], A->seq_al[b], 1); + n++; + } + return tot; +} +int sarseq_pair2ev ( char *s1, char *s2,int mode) +{ + int l, t1, t2, t11,a, n1, n2, s; + if ( (l=strlen (s1))!=strlen (s2)) + { + return -1; + } + if (mode==2) + { + t1=l/2; + t2=l/2; + t11=l/2; + } + else + { + for (t1=t2=t11=0,a=0; an11 || n01>n11)return 0; + + p1= M_chooses_Nlog (n1msa, N) + M_chooses_Nlog (n1sar-n11, N-n1msa) + M_chooses_Nlog (n11, n1msa); + p2=(M_chooses_Nlog (n1msa, N)+ M_chooses_Nlog (n1sar, N)); + p=(p1-p2); + + return -p; + +} +double evaluate_sar_score2 ( int N, int n11, int n1msa, int n1sar) +{ + + + return n11-((n1msa-n11)+(n1sar-n11)); + + if ( n11nseq, sizeof (int**)); + + + list=file2list (weight_file, " "); + + a=b=0; + for (a=0; a< SAR->nseq; a++) + { + b=c=0; + while (list[b]) + { + if ( strm (list[b][1], SAR->name[a]) && atoi (list[b][3])>0)c++; + b++; + } + + weight[a]=declare_int (c+1, 3); + fprintf ( stderr, "\n%s %d", SAR->name[a], c); + b=c=0; + while (list[b]) + { + if ( strm (list[b][1], SAR->name[a]) && atoi (list[b][3])>0) + { + weight[a][c][0]=atoi(list[b][2])-1; + weight[a][c][1]=list[b][5][0]; + weight[a][c][2]=atoi (list[b][3]); + c++; + } + b++; + } + weight[a][c][0]=-1; + } + + for (a=0; anseq; a++) + { + fprintf ( stdout, ">%s\n", A->name[a]); + for ( b=0; b< SAR->nseq; b++) + { + score=seq2weighted_sar_score(A->seq_al[a], weight[b]); + fprintf ( stdout, "%c", (score>limit)?'I':'O'); + } + fprintf (stdout, "\n"); + } + myexit (EXIT_SUCCESS); + return A; +} + +Alignment *display_sar ( Alignment *A, Alignment *SAR, char *compound) +{ + int a,c; + char name[100]; + + c=name_is_in_list ( compound, SAR->name, SAR->nseq, 100); + if ( c==-1)return A; + + for ( a=0; a< A->nseq; a++) + { + sprintf (name, "%s", A->name[a]); + sprintf ( A->name[a], "%c_%s_%s", SAR->seq_al[c][a], name,compound); + } + return A; +} +Alignment *aln2weighted_sar_score ( Alignment *A,Alignment *SAR, char *weight_file, char *compound) +{ + + int a, b, c=0; + int **weight; + + int score; + char reactivity; + char ***list; + + + if ( SAR) + { + c=name_is_in_list (compound, SAR->name, SAR->nseq, 100); + } + + list=file2list (weight_file, " "); + a=b=0; + while (list[a]) + { + if (strm (list[a][1], compound))b++; + a++; + } + weight=declare_int ( b+1, 3); + + + a=b=0; + while (list[a]) + { + if ( !strm (list[a][1], compound) || strm ("TOTPOS", list[a][1])); + else + { + weight[b][0]=atoi(list[a][2])-1; + weight[b][1]=list[a][5][0]; + weight[b][2]=atoi(list[a][3]); + b++; + } + a++; + } + weight[b][0]=-1; + for ( a=0; a< A->nseq; a++) + { + score=seq2weighted_sar_score (A->seq_al[a], weight); + reactivity=(!SAR || c==-1)?'U':SAR->seq_al[c][a]; + + sprintf (A->seq_comment[a], "Compound %-15s Reactivity %c SAR_SCORE %5d", compound,reactivity, (int) score); + + } + return A; +} + +float seq2weighted_sar_score ( char *seq, int **weight) +{ + int a, p, r, w; + float score=0; + + a=0; + while (weight[a][0]!=-1) + { + p=weight[a][0]; + r=weight[a][1]; + w=weight[a][2]; + + if ( is_gap(seq[p])); + else if ( tolower(seq[p])==r)score+=w; + a++; + } + return score; + } + +Alignment * sar2simpred (Alignment *A, Alignment *SAR, char *posfile, char *compound, int L1,int L2 ) +{ + int a, b, c, c1, c2; + int **sim, **sim_ref, npred=0; + float n11, n10, n01, n00; + float sn, sp; + + int tot_sim=0; + int N11=1, N01=2, N10=3, NXX=4, SIM=5; + float ***tot; + int i1, i2; + + + n11=n10=n01=n00=0; + tot=declare_arrayN(3,sizeof (float), 10, 6, 2); + + sim_ref=aln2sim_mat (A, "idmat"); + if (strm (posfile, "all")) + sim=sim_ref; + else + { + Alignment *B; + B=copy_aln ( A,NULL); + B=extract_aln3(B,posfile); + + /*if (B->len_aln==0)L1=100; + else + L1=((B->len_aln-1)*100)/B->len_aln; + + if (L1<=0)L1=100; + */ + sim=aln2sim_mat (B, "idmat"); + } + + for (a=0; a< A->nseq-1; a++) + { + for ( b=a+1; b< A->nseq; b++) + { + for ( c=0; cnseq; c++) + { + if ( (strm (compound, SAR->name[c]) || strm ( compound, "all"))) + { + /*if ( sim_ref[a][b]<30 || sim_ref[a][b]>60)continue;*/ + i1=0; /*sim_ref[a][b]/10;if (i1==10)i1--;*/ + + i2=sim[a][b]; + + + c1=(SAR->seq_al[c][a]=='I')?1:0; + c2=(SAR->seq_al[c][b]=='I')?1:0; + + n11=(c1 && c2)?1:0; + n01=(!c1 && c2)?1:0; + n10=(c1 && !c2)?1:0; + n00=(!c1 && !c2)?1:0; + + tot[i1][N11][0]+=n11; + tot[i1][N01][0]+=n01; + tot[i1][N10][0]+=n10; + /*tot[i1][N00][0]+=n00;*/ + tot[i1][NXX][0]++; + tot[i1][SIM][0]+=sim_ref[a][b]; + + if ( i2>=L1) + { + tot[i1][N11][1]+=n11; + tot[i1][N01][1]+=n01; + tot[i1][N10][1]+=n10; + /*tot[i1][N00][1]+=n00;*/ + tot[i1][NXX][1]++; + tot[i1][SIM][1]+=sim_ref[a][b]; + } + } + } + } + } + + for (a=0; a<1; a++) + { + sp=(tot[a][N11][0])/(tot[a][N11][0]+tot[a][N10][0]); + fprintf ( stdout, "\n%15s N11 %5d SP %.2f ",compound, (int)tot[a][N11][0],sp); + sp=((tot[a][N11][1]+tot[a][N10][1])==0)?1:(tot[a][N11][1])/(tot[a][N11][1]+tot[a][N10][1]); + sn=(tot[a][N11][0]==0)?1:(tot[a][N11][1]/tot[a][N11][0]); + fprintf ( stdout, " N11 %5d SP %.2f SN %.2f SIM %.2f", (int)tot[a][N11][1], sp,sn, (tot[a][SIM][1]/tot[a][NXX][1])); + } + + myexit (EXIT_FAILURE); + sp=((n11+n01)==0)?1:n11/(n11+n01); + sn=((n11+n01)==0)?1:n11/(n11+n10); + + fprintf ( stdout, "\nLimit: %d NPRED %d AVGSIM %d SN %.2f SP %.2f TP %d FP %d FN %d",L1, npred, tot_sim, sn, sp, (int)n11, (int)n01, (int)n10); + myexit (EXIT_SUCCESS); + return A; +} + +Alignment * sar2simpred2 (Alignment *A, Alignment *SAR, char *seqlist, char *posfile, char *compound, int L ) +{ + int a,b, c,c1, c2, p, s; + float n11, n10, n01, n00, n, sn2, prediction,sp, n1, n0, t, entropy, Delta; + int *rlist, *tlist, *pred, *npred, tsim, psim; + int **sim, **sim_ref; + int nr=0; + int nrs; + char *out; + int delta_max; + Alignment *B; + int printall=1; + + out=vcalloc (A->nseq+1, sizeof (char)); + rlist=vcalloc ( A->nseq, sizeof (int)); + tlist=vcalloc ( A->nseq, sizeof (int)); + pred=vcalloc(2, sizeof (int)); + npred=vcalloc(2, sizeof (int)); + + nrs=0; + if ( strm (seqlist, "first")) + { + for ( a=0; anseq; a++) + { + if ( strm ( compound, SAR->name[a])) + { + for ( b=0; bnseq; b++) + { + if ( SAR->seq_al[a][b]=='I') + { + fprintf ( stderr, "COMP: %s REF SEQ: %s\n", A->name[b], compound); + rlist[nrs]=b; + tlist[rlist[nrs]]=1; + nrs++; + break; + } + } + } + } + } + else if (strm (seqlist, "all")) + { + for ( a=0; a< A->nseq; a++) + { + rlist[nrs]=a; + tlist[rlist[a]]=1; + nrs++; + } + } + else if ((a=name_is_in_list ( seqlist, A->name, A->nseq, 100))!=-1) + { + rlist[nrs]=a; + tlist[rlist[nrs]]=1; + nrs++; + } + else + { + Alignment *R; + R=main_read_aln (seqlist, NULL); + for (a=0; anseq; a++) + { + rlist[a]=name_is_in_list( R->name[a], A->name, A->nseq, 100); + tlist[rlist[a]]=1; + } + free_aln (R); + } + + c=name_is_in_list ( compound, SAR->name, SAR->nseq, 100); + + sim_ref=aln2sim_mat (A, "idmat"); + if (strm (posfile, "all")) + { + sim=sim_ref; + B=A; + } + else + { + B=copy_aln ( A,NULL); + B=extract_aln3(B,posfile); + sim=aln2sim_mat (B, "idmat"); + } + + n11=n10=n01=n00=n=n1=n0=0; + delta_max=0; + for (a=0; anseq; a++) + { + if ( tlist[a] && !strm (seqlist, "all")) + out[a]=(SAR->seq_al[c][a]=='I')?'Z':'z';/*SAR->seq_al[c][a];*/ + else + { + + pred[0]=pred[1]=0; + npred[0]=npred[1]=1; + c1=(SAR->seq_al[c][a]=='I')?1:0; + for (nr=0,tsim=0,psim=0,b=0; bseq_al[c][rlist[b]]=='o'); + else + { + c2=(SAR->seq_al[c][rlist[b]]=='I')?1:0; + nr+=c2; + s=sim[a][rlist[b]]; + tsim+=sim_ref[a][rlist[b]]; + psim+=sim[a][rlist[b]]; + if (s>=L) + { + pred[c2]+=s; + npred[c2]++; + } + } + } + + if (c1==0)n0++; + else n1++; + t++; + + + Delta=pred[1]-pred[0]; + + if (Delta<-delta_max){p=0;out[a]= (c1==0)?'O':'o';} + else if (Delta>delta_max){p=1;out[a]=(c1==1)?'I':'i';} + else {p=-1; out[a]=(c1==1)?'U':'u';} + + if ( p==-1); + else if ( p && c1)n11++; + else if ( p && !c1)n10++; + else if ( !p && !c1)n00++; + else if ( !p && c1)n01++; + + if (p!=-1)n++; + if (printall)fprintf ( stdout, ">%-15s %d %c OVERALL_SIM:%d POSITION_SIM %d\n%s\n", B->name[a], c1, out[a],tsim/nrs,psim/nrs,B->seq_al[a]); + } + } + sp=((n11+n10)==0)?1:n11/(n11+n10); + sn2=((n1)==0)?1:n11/n1; + prediction=(n11+n00)/(n1+n0); + entropy=(float)(M_chooses_Nlog (nr, nrs)/M_chooses_Nlog(nrs/2, nrs)); + + fprintf ( stdout, ">%-15s Sp %.2f Sn %.2f Pred %.2f E %.2f\n", compound,sp, sn2,prediction,entropy ); + fprintf ( stdout, "%s\n", out); + + myexit (EXIT_SUCCESS); + return A; +} +/************************************************************************************/ +/* ALIGNMENT ANALYZE : SAR FOR OR */ +/************************************************************************************/ + +void display_or_help(); +void display_or_help() +{ + fprintf ( stdout, "\nor_sar options:"); + fprintf ( stdout, "\n_ORCL_: Command_line in a file"); + + fprintf ( stdout, "\n_ROTATE_ : rotate the sar matrix (if each entry is a compound rather than a sequence)"); + fprintf ( stdout, "\n_JNIT_ : number cycles of Jacknife"); + fprintf ( stdout, "\n_JNSEQ_ : Number of sequences picked up in alignment [0 to keep them all, 10 by default]"); + fprintf ( stdout, "\n_JSAR_ : Number of compounds picked up in the SAR matrix [0 to keep them all => default"); + fprintf ( stdout, "\n_JRSAR_ : Randomization of the SAR file between each JNIT iteration: S, C, R, SC, SR... (S: seq, C, column, R: residue"); + fprintf ( stdout, "\n_JRALN_ : Randomization of the ALN file between each JNIT iteration: S, C, R, SC, SR... (S: seq, C, column, R: residue"); + + + fprintf ( stdout, "\n_NPOS_ : Number of positions used to make the prediction [4 by default]"); + fprintf ( stdout, "\n_DEPTH_ : Depth of the motif degenerated alphabet [Default=2]"); + fprintf ( stdout, "\n_POSFILE_ : Predefined list of positions in a filename, A->nseq); + S=aln2jacknife (S, 0, sarlen); +} +Alignment *or_scan (Alignment *A,Alignment *S, char *pmode) +{ + int l, a,ax,cx, b; + char mode[100]; + int start, offset,w; + int nl, *poslist; + + + fprintf ( stdout, "\nPARAMETERS: %s\n", pmode); + fprintf ( stderr, "\nPARAMETERS: %s\n", pmode); + + strget_param (pmode, "_SW_", "15", "%d",&w); + strget_param (pmode, "_SMINW_", "5", "%d",&start); + strget_param (pmode, "_SSTART_", "5", "%d",&offset); + strget_param (pmode, "_SMODE_", "single", "%s",&mode); + + + l=intlen (A->len_aln); + poslist=vcalloc ( A->len_aln, sizeof (int)); + nl=0; + + for ( a=0; a< A->len_aln; a++)poslist[nl++]=a+1; + + if ( strm (mode, "single")) + { + for ( a=0; alen_aln-2; a++) + { + int c, gap, score; + for (gap=0,c=0; cnseq; c++)gap+=(A->seq_al[c][a]=='-'); + if ( !gap) + { + Alignment *B; + B=extract_aln (A, a+1, a+2); + B=or_sar (B, S, pmode, NO_PRINT); + score=B->score_aln; + free_aln (B); + } + else + { + score=0; + } + fprintf ( stdout, "P: %*d S: %4d\n",l,a+1, score); + //fprintf ( stderr, "P: %*d S: %4d\n",l,a+1, score); + + } + } + else if strm (mode, "scan") + { + + for ( ax=0; axA->len_aln)continue; + if (pend<1 || pend>A->len_aln)continue; + B=extract_aln (A, a-w, a+w); + + B=or_sar (B, S,pmode, NO_PRINT); + + + if (B->score_aln>=best_score){best_score=B->score_aln; best_pos=a;best_w=b;best_start=pstart; best_end=pend;} + free_aln (B); + } + fprintf ( stdout, "P: %*d I: %*d %*d Score: %5d L: %2d\n", l,best_pos, l,best_start+offset, l,best_end, best_score,(best_w*2)+1 ); + } + } + else if ( strm ( mode, "scan_comp")) + { + Alignment *NS=NULL; + int *tresults; + int s, n=0, p1; + int nbest; + int *poscache; + float sc, eval; + int **index; + int **poslist; + int len=1; + float best_sc, best_eval; + int best_pos; + char *best_word; + int sth; + int sev; + + index=aln2pos_simple (A, A->nseq); + strget_param (pmode, "_NPOS_", "1", "%d", &len); + strget_param (pmode, "_STH_", "0", "%d", &sth); + strget_param (pmode, "_SEV_", "0", "%d", &sev); + + if (sev==0)for (a=0, sev=1; alen_aln; + + tresults=vcalloc ( A->len_aln, sizeof (int)); + poscache=vcalloc ( A->len_aln, sizeof (int)); + poslist=generate_array_int_list (len, 0, A->len_aln-1,1, &n, NULL); + for (s=0; slen_aln; s++) + { + + char *word; + NS=aln2block (S, s+1, s+2, NS); + fprintf ( stderr, "\nProcess: %s ...\n", get_compound_name(s, mode)); + + for (best_sc=0,best_pos=0,best_word=NULL,a=0; abest_sc) + { + best_sc=sc; + best_eval=eval; + best_word=word; + best_pos=a; + nbest=1; + } + else if ( word && sc>=best_sc) + { + nbest++; + } + else + vfree (word); + } + nbest/=len; + for (a=0; a<=A->nseq; a++) + { + + + + fprintf (stdout, "\n>%s PPPP: S: %s SC: %d EV: %d NBest: %d W: %s", get_compound_name(s, mode),(a==A->nseq)?"cons":A->name[a],(int)best_sc, (int) best_eval, nbest,best_word); + for ( b=0; bnseq) + { + p1=poslist[best_pos][b]; + if (best_sc>sth && nbest0)p1++; + } + fprintf ( stdout, " %d ", p1); + + } + } + + } + for ( p1=0; p1len_aln;p1++) + fprintf ( stdout, "\n>TOT_P: %d %d", p1+1, tresults[p1]); + } + + else if ( strm ( mode, "scan_comp_old")) + { + Alignment *NS=NULL, *BLOCK=NULL; + int **results, *tresults; + int s, n, p1, p2; + int npos=1; + int *poscache; + + results=declare_int (A->len_aln*A->len_aln, 3); + tresults=vcalloc ( A->len_aln, sizeof (int)); + poscache=vcalloc ( A->len_aln, sizeof (int)); + for (s=0; slen_aln; s++) + { + int count; + NS=aln2block (S, s+1, s+2, NS); + fprintf ( stderr, "\nProcess: %s ...", get_compound_name(s, mode)); + for (n=0,p1=0; p1len_aln-w; p1++, count ++) + { + if ( count == 50){fprintf ( stderr, "*");count=0;} + for ( p2=p1; p2len_aln-w; p2++, n++) + { + poscache[p1]=1; + poscache[p2]=1; + + BLOCK=alnpos2block (A,poscache,BLOCK); + + if ( aln2ngap(BLOCK)<=0)BLOCK=or_sar (BLOCK,NS, pmode, NO_PRINT); + else BLOCK->score_aln=0; + + //if ( BLOCK->score_aln>0)HERE ("P: %d %d %d", p1, p2,BLOCK->score_aln); + results[n][0]=p1; + results[n][1]=p2; + results[n][2]=BLOCK->score_aln; + poscache[p1]=poscache[p2]=0; + + } + } + sort_int_inv (results, 3, 2, 0, n-1); + for (p1=0; p1%s PPPP: %d %d SC: %d", get_compound_name(s, mode), results[p1][0]+1,results[p1][1]+1, results[p1][2]); + fprintf ( stderr, "\n>%s PPPP: %d %d SC: %d", get_compound_name(s, mode), results[p1][0]+1,results[p1][1]+1, results[p1][2]); + tresults[results[p1][0]]++; + tresults[results[p1][1]]++; + } + } + for ( p1=0; p1len_aln;p1++) + if ( tresults[p1])fprintf ( stdout, "\n>TOT_P: %d %d", p1+1, tresults[p1]); + } + + myexit (EXIT_SUCCESS); +} + + +ORP * or_sar_compound(Alignment *A, Alignment *S, char *mode, int print); +ORP* set_orp_name ( ORP* P, char *name); +ORP* set_orp_offset ( ORP* P,int offset); +Alignment * or_sar(Alignment *inA, Alignment *inS, char *mode, int print) +{ + char rsar[4],raln[4], sarmode[100]; + int rotate, a, b, start, end; + ORP **R, *ORS; + Alignment *A, *S; + + if ( mode && (strstr (mode, "help") || strstr (mode, "HELP"))) + display_or_help(); + + strget_param (mode, "_SARMODE_", "single", "%s", &sarmode);//single | all + strget_param (mode, "_RSAR_", "NO", "%s", rsar); + strget_param (mode, "_RALN_", "NO", "%s", raln); + strget_param (mode, "_ROTATE_", "0", "%d", &rotate); + + strget_param (mode, "_START_", "1", "%d", &start);start--; + strget_param (mode, "_END_", "0", "%d", &end); + + A=copy_aln (inA, NULL); + S=copy_aln (inS, NULL); + + if ( end==0)end=A->len_aln; + if ( start!=0 || end!=A->len_aln) + { + int c; + if ( start>=A->len_aln || end<=start || end >A->len_aln) + printf_exit (EXIT_FAILURE, stderr, "ERROR: _START_%d and _END_%d are incompatible with the aln [len=%d][FATAL:%s]", start, end, A->len_aln, PROGRAM); + for (b=0; bnseq; b++) + for (c=0,a=start; aseq_al[b][c]=A->seq_al[b][a]; + A->len_aln=c; + for (b=0; bnseq; b++)A->seq_al[b][c]='\0'; + } + + + S=aln2random_aln (S, rsar); + A=aln2random_aln (A, raln); + + + + if (rotate) + { + Alignment *rS; + + if (S->len_aln!=A->nseq) + printf_exit ( EXIT_FAILURE,stderr, "ERROR: Alignment and SAR matrix are incompatible [FATAL:%s]", PROGRAM); + + rS=rotate_aln (S, NULL); + + for (a=0; a< A->nseq; a++)sprintf (rS->name[a], "%s", A->name[a]); + free_aln (S); + S=rS; + } + + R=vcalloc ( S->len_aln+2, sizeof (ORP*)); + if (strm (sarmode, "all"))R[0]=or_sar_compound (A, S, mode, print); + else if ( strm (sarmode, "single")) + { + for ( a=0; alen_aln; a++) + { + Alignment *NS=NULL; + + NS=aln2block (S, a+1, a+2, NS); + + R[a]=or_sar_compound (A, NS, mode, NO_PRINT); + set_orp_name ((R[a]), get_compound_name (a, mode)); + set_orp_offset(R[a], start); + display_or_summary (R[a], mode, stdout, print); + + } + } + + + + ORS=combine_n_predictions (R, A, S); + + display_or_summary (ORS, mode, stdout, print); + inA->score_aln=(int)(ORS->best*(float)1000); + + free_orp_list (R); + free_orp(ORS); + + return inA; + } +ORP* set_orp_offset ( ORP* P,int offset) +{ + if (!P) return NULL; + else + { + P->offset=offset; + return set_orp_offset(P->PR, offset); + } +} + +ORP* set_orp_name ( ORP* P, char *name) +{ + if (!P) return NULL; + else + { + sprintf ( P->name, "%s", name); + return set_orp_name(P->PR, name); + } +} + +ORP * combine_n_predictions (ORP**R, Alignment *A, Alignment *S) +{ + int a=0; + ORP*N=NULL; + while (R[a]) + { + N=combine_2_predictions (R[a++], N, A, S); + } + sprintf ( N->name, "ALL"); + sprintf ( N->mode, "COMBINED"); + + return N; +} + +ORP *combine_2_predictions ( ORP*IN, ORP *TO,Alignment *A, Alignment *S) +{ + int a; + + if ( !TO) + { + TO=declare_or_prediction (IN->ncomp, IN->nseq, IN->len); + TO->A=A; + TO->S=S; + TO->P=copy_aln(S, NULL); + TO->offset=IN->offset; + TO->ncomp=0; + } + + for (a=0; a< IN->len; a++) + { + TO->pos[a]+=IN->pos[a]; + } + + TO->fp+=IN->fp; + TO->fn+=IN->fn; + TO->tp+=IN->tp; + TO->tn+=IN->tn; + rates2sensitivity (TO->tp, TO->tn, TO->fp, TO->fn, &(TO->sp), &(TO->sn), &(TO->sen2), &(TO->best)); + + + for (a=0; a<(TO->A)->nseq; a++) + { + (TO->P)->seq_al[a][TO->ncomp]=(IN->P)->seq_al[a][0]; + //(TO->S)->seq_al[a][TO->ncomp]=(IN->S)->seq_al[a][0]; + } + TO->ncomp++; + (TO->P)->len_aln=TO->ncomp; + (TO->A)->score_aln=TO->best; + + return TO; +} + + + +ORP * display_or_summary (ORP *CP, char *mode, FILE *fp, int print) +{ + int a; + char *pred; + char *exp; + char *motif; + Alignment *A, *P, *S; + + + + A=CP->A; + P=CP->P; + S=CP->S; + + + + pred=vcalloc ( P->nseq*P->len_aln*2, sizeof (char)); + exp=vcalloc ( P->nseq*P->len_aln*2, sizeof (char)); + motif=vcalloc (CP->len+1, sizeof (char)); + + + + if (P && S) + { + for ( a=0; anseq; a++) + { + strcat (pred,P->seq_al[a]); + strcat (exp, S->seq_al[a]); + } + CP->evalue=profile2evalue(pred, exp); + } + a=0; + while ( CP->motif && CP->motif[a] && CP->motif[0][a][0])strcat (motif, CP->motif[0][a++]); + + if ( print==PRINT) + { + fprintf (fp, "\n>%-10s Mode: %s Accuracy: %6.2f E-value: %6.2f Motif: ",CP->name,CP->mode, CP->best, CP->evalue); + + if (motif[0]) + { + fprintf (fp, " %s",motif); + } + for ( a=0; alen; a++) + { + if ( CP->pos[a]) fprintf ( fp, "\n>%-10s Mode: %s P: cons %3d SC: %4d", CP->name, CP->mode, a+1+CP->offset, CP->pos[a]); + } + fprintf ( fp, "\n"); + } + vfree (pred); vfree(motif); vfree(exp); + if (CP->PR)display_or_summary (CP->PR, mode, fp, print); + return CP; +} + + +ORP * or_sar_compound(Alignment *A, Alignment *S, char *mode, int print) +{ + char rmode[100]; + Alignment *P=NULL; + ORP *PR=NULL; + + strget_param (mode, "_MODE_", "predict", "%s", rmode); + + + + if (strm (rmode, "jack"))P=or_jack (A, S, mode); + else if (strm (rmode, "loo")) PR=or_loo (A, S, mode,NULL, print); + else if (strm (rmode, "comploo")) P=or_comp_loo (A, S, mode,NULL, print); + else if ( strm (rmode, "comppos")){or_comp_pos ( A, S, mode,print);myexit (EXIT_FAILURE); return NULL;} + else if ( strm (rmode, "pos"))P=or_aln2pos_aln ( A, S, mode); + else if ( strm (rmode, "predict"))P=or_predict ( A, S, mode); + else if ( strm (rmode, "self"))PR=or_self_predict ( A, S, mode, NULL, print); + else if ( strm (rmode, "sim"))P=or_sim ( A, S, mode); + + else if ( strm (rmode, "test"))P=or_test ( A, S, mode); + else + { + printf_exit (EXIT_FAILURE, stderr, "ERROR: %s is an unknown mode of or_sar [FATAL:%s]\n", rmode,PROGRAM); + return NULL; + } + + if (!PR) + { + PR=vcalloc (1, sizeof (ORP)); + PR->P=P; + } + return PR; +} +Alignment * or_test ( Alignment *inA, Alignment *inS, char *mode) +{ + return inA; +} + + + +float or_id_evaluate ( Alignment *A, Alignment *S, char *mode, int *pos, int print) +{ + char *w; + float score; + + w=or_id_evaluate2(A,S, mode, pos,print, &score); + + vfree (w); + return score; +} +char* or_id_evaluate2 ( Alignment *A, Alignment *S, char *mode, int *pos, int print, float *rscore) +{ + static char **words; + static int *plist; + char *bword; + + int res, p,nl, w, c, s, exp, pred; + int tp, tn, fp, fn; + float sn, sp, sen2, best, best_score; + + if (!A) + {free_char (words, -1); + vfree (plist); + return NULL; + } + rscore[0]=0; + + plist=pos2list (pos, A->len_aln, &nl); + words=declare_char (A->nseq, nl+1); + bword=vcalloc (nl+1, sizeof (char)); + for (p=0; pnseq; s++) + { + res=A->seq_al[s][plist[p]]; + if (res=='-'){or_id_evaluate2 (NULL, NULL, NULL, NULL, 0, NULL);vfree (bword);return 0;} + words[s][p]=res; + } + } + + for (best_score=0,w=0; wnseq; w++) + { + tp=fp=fn=tn=0; + + for (c=0; clen_aln; c++) + { + for (s=0; snseq; s++) + { + exp=(S->seq_al[s][c]=='I')?1:0; + pred=strm (words[w], words[s]); + if ( exp && pred)tp++; + else if ( exp && !pred)fn++; + else if (!exp && !pred)tn++; + else if (!exp && pred)fp++; + } + } + rates2sensitivity (tp, tn, fp,fn,&sp, &sn, &sen2, &best); + if ( best>best_score) + { + best_score=best; + sprintf (bword, "%s", words[w]); + } + } + rscore[0]=(float)1000*best_score; + or_id_evaluate2 (NULL, NULL, NULL, NULL, 0, NULL); + return bword; +} + +float or_loo_evaluate2 ( Alignment *A, Alignment *S, char *mode, int *pos, int print) +{ + int c, s, p, res, sar; + + int *plist, nl; + int tp, tn, fp, fn; + float sn, sp, sen2, best; + char **words, **positive, **negative; + + tp=tn=fp=fn=0; + plist=pos2list (pos, A->len_aln, &nl); + + words=declare_char (A->nseq, nl+1); + + for (p=0; pnseq; s++) + { + res=A->seq_al[s][plist[p]]; + if (res=='-'){vfree (plist);free_char (words, -1); return 0;} + words[s][p]=res; + } + } + positive=vcalloc ( A->nseq, sizeof (char*)); + negative=vcalloc ( A->nseq, sizeof (char*)); + for (c=0; clen_aln; c++) + { + //Fill the match matrix + for (p=0; pnseq; s++) + { + sar=S->seq_al[s][c]; + if (sar=='I')positive[s]=words[s]; + else if ( sar=='O')negative[s]=words[s]; + } + } + + //Evaluate the scores + for (s=0; s< A->nseq; s++) + { + int pos=0, neg=0, pred; + sar=S->seq_al[s][c]; + positive[s]=negative[s]=NULL; + + if ( name_is_in_list (words[s], positive, A->nseq, nl+1)!=-1) + pos=1; + if ( name_is_in_list (words[s], negative, A->nseq, nl+1)!=-1) + neg=1; + + if (pos & !neg) pred=1; + else pred=0; + + if ( pred && sar=='I')tp++; + else if (!pred && sar=='I')fn++; + else if (!pred && sar=='O')tn++; + else if ( pred && sar=='O')fp++; + + if ( sar=='I')positive[s]=words [s]; + else negative[s]=words[s]; + } + } + + vfree (negative); vfree (positive); + vfree (plist); free_char (words, -1); + rates2sensitivity (tp, tn, fp,fn,&sp, &sn, &sen2, &best); + + return (float)1000*best; +} +float or_loo_evaluate ( Alignment *A, Alignment *S, char *mode, int *pos, int print) +{ + int c, s, p, res, sar; + int **matP,**matN; + int *plist, nl; + int tp, tn, fp, fn; + float sn, sp, sen2, best; + + tp=tn=fp=fn=0; + plist=pos2list (pos, A->len_aln, &nl); + matP=declare_int (nl, 256); + matN=declare_int (nl, 256); + + for (c=0; clen_aln; c++) + { + //Fill the match matrix + for (p=0; pnseq; s++) + { + res=A->seq_al[s][plist[p]]; + sar=S->seq_al[s][c]; + if (res=='-'){vfree (plist); free_int (matP, -1);free_int (matN, -1); return 0;} + if (sar=='I')matP[p][res]++; + if (sar=='O')matN[p][res]++; + } + } + + //Evaluate the scores + for (s=0; s< A->nseq; s++) + { + int scoreP, scoreN; + int pred, valP, valN; + + sar=S->seq_al[s][c]; + for (scoreN=0,scoreP=0,p=0; pseq_al[s][plist[p]]; + + valP=matP[p][res]-(sar=='I')?1:0; + scoreP+=(valP>0)?1:0; + + valN=matN[p][res]-(sar=='O')?1:0; + scoreN+=(valN>0)?1:0; + } + + if ( scoreP==nl && scoreNnseq; s++) + { + res=A->seq_al[s][plist[p]]; + sar=S->seq_al[s][c]; + if (sar=='I')matP[p][res]=0; + else matN[p][res]=0; + } + } + } + + vfree (plist); free_int (matP, -1);free_int (matN, -1); + rates2sensitivity (tp, tn, fp,fn,&sp, &sn, &sen2, &best); + + return (float)1000*best; +} +int* or_comp_pos ( Alignment *inA, Alignment *inS, char *mode,int print) +{ + Alignment *A=NULL, *S=NULL, *inS2=NULL; + int a, b, c; + int *main_pos, *pos=NULL; + + set_sar (inA, inS, mode); + main_pos=vcalloc ( inA->len_aln, sizeof (int)); + + inS2=copy_aln (inS, NULL); + inS2->len_aln=1; + + + + //Run every SAR, one at a time + for ( c=0; c< inS->len_aln; c++) + { + int max, p; + + fprintf ( stdout, ">%d\n", c); + for (a=0; a< inS->nseq; a++) + { + inS2->seq_al[a][0]=inS->seq_al[a][c]; + inS2->seq_al[a][1]='\0'; + } + + vfree (pos); + free_aln (S); + free_aln (A); + + pos=vcalloc (inA->len_aln, sizeof (int)); + A=copy_aln (inA, NULL); + S=copy_aln (inS2, NULL); + set_sar (A,S, mode); + pos=aln2predictive_positions (A, S, mode,PRINT); + + for (max=0,b=0; blen_aln; b++) + { + main_pos[b]+=pos[b]; + if (main_pos[b]>max) + { + max=main_pos[b]; + p=b+1; + } + } + + + for (a=0; anseq; a++) + { + fprintf ( stdout, "\t"); + for ( b=0; blen_aln; b++) + if ( pos[b]) fprintf ( stdout, "%c", A->seq_al[a][b]); + fprintf ( stdout, " %c\n", inS2->seq_al[a][0]); + } + fprintf ( stdout, "\n\tBest: %d %d\n", p, max); + + } + + if (print==PRINT) + { + for ( a=0; alen_aln; a++)fprintf ( stdout, "\nP2: cons %4d %4d [FINAL]", a+1, main_pos[a]); + } + return main_pos; +} +Alignment * or_comp_loo ( Alignment *inA, Alignment *inS, char *mode, int *pos,int print) +{ + int a, b,c, n; + char **keep, **remove; + Alignment *A, *S, *P, *P1, *SEQ, *inS2; + int **main_pos, *compound_pos, **comp_list; + int pos_exists=0; + char *comp_pred, *comp_exp; + int sar_threshold; + + strget_param (mode, "_SARTHRES_", "3", "%d", &sar_threshold); + + if (pos)pos_exists=1; + set_sar (inA, inS, mode); + P=copy_aln (inS, NULL); + keep=declare_char (inA->nseq, MAXNAMES); + remove=declare_char (inA->nseq, MAXNAMES); + + main_pos=declare_int ( inA->len_aln,4); + comp_list=declare_int (inA->len_aln, sizeof (int*)); + inS2=copy_aln (inS, NULL); + inS2->len_aln=1; + + + comp_pred=vcalloc ( inA->nseq+1, sizeof (int)); + comp_exp=vcalloc ( inA->nseq+1, sizeof (int)); + compound_pos=NULL; + //Run every SAR, one at a time + for ( c=0; c< inS->len_aln; c++) + { + for (a=0; a< inS->nseq; a++) + { + inS2->seq_al[a][0]=inS->seq_al[a][c]; + inS2->seq_al[a][1]='\0'; + } + vfree (compound_pos); + compound_pos=vcalloc (inA->len_aln, sizeof (int)); + for (a=0; anseq; a++) + { + char ***motifs; + + A=copy_aln (inA, NULL); + S=copy_aln (inS2, NULL); + + for (n=0,b=0; bnseq; b++) + { + if (b!=a)sprintf (keep[n++ ], "%s", A->name [b]); + } + sprintf ( remove[0], "%s", A->name[a]); + reorder_aln (A,keep, A->nseq-1); + + set_sar (A,S, mode); + if (!pos_exists) + { + pos=aln2predictive_positions (A, S, mode,NO_PRINT); + } + for (b=0; blen_aln; b++) + { + compound_pos[b]+=pos[b]; + } + + motifs=compounds2motifs (A, S, pos,0, mode, NO_PRINT); + + SEQ=copy_aln (inA, NULL); + SEQ=reorder_aln (SEQ, remove, 1); + + P1=aln2prediction (SEQ, motifs, pos); + comp_pred[a]=P1->seq_al[0][0]; + comp_exp[a]=inS2->seq_al[a][0]; + P->seq_al[a][c]=P1->seq_al[0][0]; + + + free_aln (SEQ); + free_aln (S); + free_aln (A); + free_aln (P1); + free_arrayN( (void *)motifs, 3); + if (!pos_exists)vfree (pos); + } + if (print==PRINT) + fprintf ( stdout, ">%-15s SC: %.2f E; %.2f\n%s\n%s\n", get_compound_name(c, mode),profile2sensitivity (comp_pred, comp_exp, NULL, NULL, NULL, NULL),profile2evalue(comp_pred, comp_exp),comp_pred, comp_exp); + for (b=0; blen_aln; b++) + { + main_pos[b][2]+=compound_pos[b]; + if (compound_pos[b])main_pos[b][3]++; + if (profile2evalue(comp_pred, comp_exp)>sar_threshold) + { + main_pos[b][0]+=compound_pos[b]; + if (compound_pos[b]) + { + main_pos[b][1]++; + comp_list[b][0]++; + comp_list[b]=vrealloc (comp_list[b], sizeof (int)*(comp_list[b][0]+1)); + comp_list[b][comp_list[b][0]]=c; + } + } + } + + } + + P->score_aln=(int)((float)1000*evaluate_prediction (P, inS, mode,print)); + + if (print==PRINT) + { + for ( a=0; alen_aln; a++)fprintf ( stdout, "\nP: cons %4d RS: %4d RC: %5d FC: %4d %4d[FINAL]", a+1, main_pos[a][2], main_pos[a][3], main_pos[a][0], main_pos[a][1]); + + for ( a=0; alen_aln; a++) + { + fprintf ( stdout, "\nP: cons %4d RS: %4d RC: %5d FC: %4d %4d CLIST: ", a+1, main_pos[a][2], main_pos[a][3], main_pos[a][0], main_pos[a][1]); + for ( c=1; c<=comp_list[a][0]; c++) + { + fprintf ( stdout, "%s ", get_compound_name(comp_list[a][c], mode)); + } + fprintf ( stdout, " [COMP_LIST]"); + } + } + free_int (main_pos, -1); + free_int (comp_list, -1); + return P; +} + +ORP* or_loo ( Alignment *inA, Alignment *inS, char *mode, int *pos,int print) +{ + int a, b, n; + char **keep, **remove; + Alignment *A, *S, *P, *P1, *SEQ; + + int pos_exists=0; + ORP *PR; + + + + if (pos)pos_exists=1; + set_sar (inA, inS, mode); + PR=declare_or_prediction (inS->nseq, inA->nseq, inA->len_aln); + sprintf (PR->mode, "loo "); + P=copy_aln (inS, NULL); + + + keep=declare_char (inA->nseq, MAXNAMES); + remove=declare_char (inA->nseq, MAXNAMES); + + + PR->A=inA; + PR->P=P; + PR->S=inS; + + for (a=0; anseq; a++) + { + char ***motifs; + + A=copy_aln (inA, NULL); + S=copy_aln (inS, NULL); + + for (n=0,b=0; bnseq; b++) + { + if (b!=a)sprintf (keep[n++ ], "%s", A->name [b]); + } + sprintf ( remove[0], "%s", A->name[a]); + reorder_aln (A,keep, A->nseq-1); + + set_sar (A,S, mode); + + if (!pos_exists) + { + pos=aln2predictive_positions (A, S, mode,print); + + } + + for (b=0; blen_aln; b++) + { + PR->pos[b]+=pos[b]; + } + + motifs=compounds2motifs (A, S, pos,0, mode, print); + + SEQ=copy_aln (inA, NULL); + SEQ=reorder_aln (SEQ, remove, 1); + SEQ->nseq=1; + + P1=aln2prediction (SEQ, motifs, pos); + + + if (print==PRINT) + { + fprintf ( stdout, "\n%s\nPred: %s\nReal: %s\n", P1->name[0], P1->seq_al[0], inS->seq_al[a]); + } + sprintf ( P->seq_al[a], "%s", P1->seq_al[0]); + free_aln (P1); + + + free_aln (SEQ); + free_aln (S); + free_aln (A); + + free_arrayN( (void *)motifs, 3); + if (!pos_exists)vfree (pos); + + } + free_char (keep, -1); + free_char (remove, -1); + + + PR=new_evaluate_prediction (PR, mode,print); + + + PR->PR=or_self_predict(inA, inS, mode,NULL, print); + + if (print==PRINT)for ( a=0; alen_aln; a++)fprintf ( stdout, "\nP: cons %d %d [FINAL]", a+1, PR->pos[a]); + + + + return PR; +} + + + +Alignment * or_jack(Alignment *inA, Alignment *inS, char *mode) +{ + int a,b; + int n_cycles=100; + int subnseq=10; + int subsar=0; + Alignment *A, *S; + int *main_pos,*pos; + char jrsar[10], jraln[10]; + + strget_param (mode, "_JNIT_", "100", "%d", &n_cycles); + strget_param (mode, "_JNSEQ_", "10", "%d", &subnseq); + strget_param (mode, "_JNAR_", "0", "%d", &subsar); + + strget_param (mode, "_JRSAR_", "NO", "%s", jrsar); + strget_param (mode, "_JRALN_", "NO", "%s", jraln); + + + + main_pos=vcalloc ( inA->len_aln, sizeof (int)); + for (a=0; alen_aln; b++)main_pos[b]+=pos[b]; + vfree (pos); + + } + display_pos (A, S, main_pos, mode); + + + return inA; +} + +Alignment * display_pos (Alignment *A, Alignment *S, int *pos,char *mode) +{ + Alignment *B; + int a, b; + int **index; + + int intl; + + intl=intlen (A->len_aln); + index=aln2pos_simple (A, A->nseq); + B=copy_aln (A,NULL); + B->len_aln=0; + for ( a=0; alen_aln; a++) + fprintf ( stdout, "\nP: cons %*d %*d S: %4d [DISPLAY_FULL_POS]", intl,a+1,intl, a+2, pos[a]); + fprintf ( stdout, "\n\n"); + for (a=0; alen_aln; a++) + { + if (pos[a]) + { + for ( b=0; bnseq; b++) + { + B->seq_al[b][B->len_aln]=A->seq_al[b][a]; + if (index[b][a]>0)fprintf ( stdout, "\nP: %s %d %d S: %d [DISPLAY_POS]",A->name[b], index[b][a], index[b][a]+1, pos[a]); + } + B->len_aln++; + fprintf ( stdout, "\nP: cons %d %d S: %d [DISPLAY_POS]", a+1, a+2, pos[a]); + } + } + fprintf ( stdout, "\n"); + for (a=0; anseq; a++)B->seq_al[a][B->len_aln]='\0'; + return B; +} +Alignment * or_aln2pos_aln (Alignment *A, Alignment *S, char *mode) +{ + Alignment *B; + + int *pos; + char outaln[100], outtree[100]; + + + strget_param (mode, "_OUTALN_", "NO", "%s", outaln); + strget_param (mode, "_OUTTREE_", "NO", "%s", outtree); + + set_sar (A, S, mode); + pos=aln2predictive_positions (A, S,mode, PRINT); + + B=display_pos (A, S, pos, mode); + + + if (!strm(outaln, "NO")) vfclose (output_aln (B, vfopen (outaln, "w"))); + if (!strm(outtree, "NO"))vfclose (print_tree (aln2tree(B), "newick", vfopen (outtree, "w"))); + + return B; +} +Alignment * or_sim(Alignment *A, Alignment *S, char *mode) +{ + //Predict all the sequences that are not both in inS and inA + int *pos; + + set_sar (A, S, mode); + pos=aln2predictive_positions (A, S,mode, PRINT); + fprintf ( stdout, "R: %.3f", pos2sim (A,S, pos)); + + myexit (EXIT_SUCCESS); + return A; +} +ORP* or_self_predict(Alignment *A, Alignment *S, char *mode,int *pos, int print) +{ + //Predict all the sequences that are not both in inS and inA + Alignment *P; + char ***motifs; + + + int a; + + int pre_set_pos=0; + ORP *PR; + + + set_sar (A, S, mode); + PR=declare_or_prediction (S->nseq, A->nseq, A->len_aln); + sprintf (PR->mode, "self"); + PR->A=A; + PR->S=S; + + if (!pos) + { + pos=aln2predictive_positions (A, S,mode,print); + pre_set_pos=0; + } + else + pre_set_pos=1; + + for (a=0; a< A->len_aln; a++) + PR->pos[a]=pos[a]; + + + PR->motif=motifs=compounds2motifs (A, S, pos,0, mode, print); + P=PR->P=aln2prediction (A, motifs, pos); + + if (!pre_set_pos)vfree (pos); + + PR=new_evaluate_prediction (PR, mode,print); + return PR; +} + + +Alignment * or_predict(Alignment *inA, Alignment *inS, char *mode) +{ + //Predict all the sequences that are not both in inS and inA + Alignment *P, *A, *S, *T; + char ***motifs; + int *pos; + + int a, b; + + + + + + + A=copy_aln (inA, NULL); + S=copy_aln (inS, NULL); + set_sar (A, S, mode); + + pos=aln2predictive_positions (A, S,mode,PRINT); + motifs=compounds2motifs (A, S, pos,0, mode, PRINT); + T=get_prediction_target (inA, inS, mode); + + + P=aln2prediction (T, motifs, pos); + //recall=evaluate_prediction (S, P, mode); + for ( a=0; alen_aln; a++) + { + for (b=0; bnseq; b++) + { + if (tolower(P->seq_al[b][a])=='i')fprintf (stdout, "\n>%20s %20s %c", T->name [0],get_compound_name (a, mode), P->seq_al[b][a]); + } + } + fprintf ( stdout, "\n"); + return P; +} + +Alignment *get_prediction_target (Alignment *A, Alignment *S, char *param) +{ + char **name; + int n, a; + Alignment *T; + + T=copy_aln (A, NULL); + name=declare_char (A->nseq, 100); + for (n=0,a=0; a< A->nseq; a++) + { + if ( name_is_in_list (A->name[a], S->name, S->nseq, 100)==-1) + { + sprintf (name[n++], "%s", A->name[a]); + } + } + T=reorder_aln (T,name, n); + return T; +} + +Alignment *set_sar (Alignment *A, Alignment *S, char *param) +{ + char **name; + int n, a; + + name=declare_char (A->nseq, 100); + for (n=0,a=0; a< A->nseq; a++) + { + if ( name_is_in_list (A->name[a], S->name, S->nseq, 100)!=-1) + { + sprintf (name[n++], "%s", A->name[a]); + } + } + A=reorder_aln (A,name, n); + S=reorder_aln (S,name, n); + free_char (name, -1); + return S; +} + +ORP* new_evaluate_prediction (ORP *PR, char *mode, int print) +{ + int a,b, i, r, p; + int tp, tn, fp, fn; + float sn, sp, sen2, best; + float tot_best_seq=0; + float tot_best_comp=0; + Alignment *P, *R; + + int ns=0; + float *recall; + + + P=PR->P; + R=PR->S; + + recall=vcalloc (P->len_aln, sizeof (float)); + if (P->len_aln!=R->len_aln) + { + HERE ("Mismatch between number of compounds in prediction and reference"); + myexit (EXIT_FAILURE); + } + if (print==PRINT)fprintf ( stdout, "\n"); + + for (a=0; anseq; a++) + { + tp=tn=fp=fn=0; + if ((i=name_is_in_list (P->name[a], R->name, R->nseq, 100))!=-1) + { + + for (b=0;blen_aln; b++) + { + r=R->seq_al[i][b]; + p=P->seq_al[a][b]; + + if ( p=='I' && r=='I')tp++; + else if ( p=='I' && r=='O')fp++; + else if ( p=='O' && r=='I')fn++; + else if ( p=='O' && r=='O')tn++; + } + rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best); + if (print==PRINT)fprintf (stdout, ">%-s sp: %.2f sn: %.2f sn2: %.2f best: %.2f [SEQ]\n",P->name[a], sp, sn, sen2, best); + if ( best>0) + { + ns++; + tot_best_seq+=best; + } + } + } + if (ns) + { + tot_best_seq/=ns; + } + if (print==PRINT)fprintf ( stdout, ">TotSeq sp: %.2f N: %d[SEQ]\n",tot_best_seq, ns); + + tot_best_comp=0; + for (ns=0,b=0; blen_aln; b++) + { + tp=tn=fp=fn=0; + for (a=0; anseq;a++) + { + if ((i=name_is_in_list (P->name[a], R->name, R->nseq, 100))!=-1) + { + r=R->seq_al[i][b]; + p=P->seq_al[a][b]; + + if ( p=='I' && r=='I'){PR->tp++;tp++;} + else if ( p=='I' && r=='O'){PR->fp++;fp++;} + else if ( p=='O' && r=='I'){PR->fn++;fn++;} + else if ( p=='O' && r=='O'){PR->tn++;tn++;} + } + } + rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best); + + if (print==PRINT) fprintf (stdout, ">%-25s sp: %.2f sn: %.2f sen2: %.2f best: %.2f [COMP]\n",get_compound_name (b, mode), PR->sp, PR->sn, PR->sen2,PR->best); + if ( best>0) + { + ns++; + tot_best_comp+=best; + } + } + + if (ns) + { + tot_best_comp/=ns; + } + rates2sensitivity (PR->tp, PR->tn, PR->fp,PR->fn,&(PR->sp), &(PR->sn), &(PR->sen2), &(PR->best)); + if (print==PRINT)fprintf ( stdout, ">FullTot sp: %.2f sn: %.2f sen2: %.2f best: %.2f N: %d[COMP]\n", PR->sp, PR->sn, PR->sen2,PR->best, ns); + P->score_aln=(int)((float)1000*(PR->best)); + return PR; +} +float evaluate_prediction (Alignment *R, Alignment *P, char *mode, int print) +{ + int a,b, i, r, p; + int tp, tn, fp, fn; + int tot_tp, tot_tn, tot_fp, tot_fn; + float sn, sp, sen2, best; + float tot_sp=0; + float tot_sn=0; + float tot_sen2=0; + float tot_best_seq=0; + float tot_best_comp=0; + float tot_best=0; + + int ns=0; + float *recall; + + + + + recall=vcalloc (P->len_aln, sizeof (float)); + if (P->len_aln!=R->len_aln) + { + HERE ("Mismatch between number of compounds in prediction and reference"); + myexit (EXIT_FAILURE); + } + if (print==PRINT)fprintf ( stdout, "\n"); + for (a=0; anseq; a++) + { + tp=tn=fp=fn=0; + if ((i=name_is_in_list (P->name[a], R->name, R->nseq, 100))!=-1) + { + + for (b=0;blen_aln; b++) + { + r=R->seq_al[i][b]; + p=P->seq_al[a][b]; + + if ( p=='I' && r=='I')tp++; + else if ( p=='I' && r=='O')fp++; + else if ( p=='O' && r=='I')fn++; + else if ( p=='O' && r=='O')tn++; + } + rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best); + if (print==PRINT)fprintf (stdout, ">%-s sp: %.2f sn: %.2f sn2: %.2f best: %.2f [SEQ]\n",P->name[a], sp, sn, sen2, best); + if ( best>0) + { + ns++; + tot_best_seq+=best; + tot_sn+=sn; + tot_sp+=sp; + tot_sen2+=sen2; + } + } + } + if (ns) + { + tot_best_seq/=ns; + tot_sn/=ns; + tot_sp/=ns; + tot_sen2/=ns; + } + if (print==PRINT)fprintf ( stdout, ">Tot sp: %.2f sn: %.2f sen2: %.2f best: %.2f N: %d[SEQ]\n", tot_sp, tot_sn, tot_sen2,tot_best_seq, ns); + + tot_fp=tot_fn=tot_tp=tot_tn=0; + tot_sp=tot_sn=tot_sen2=tot_best_comp=0; + for (ns=0,b=0; blen_aln; b++) + { + tp=tn=fp=fn=0; + for (a=0; anseq;a++) + { + if ((i=name_is_in_list (P->name[a], R->name, R->nseq, 100))!=-1) + { + r=R->seq_al[i][b]; + p=P->seq_al[a][b]; + + if ( p=='I' && r=='I'){tot_tp++;tp++;} + else if ( p=='I' && r=='O'){tot_fp++;fp++;} + else if ( p=='O' && r=='I'){tot_fn++;fn++;} + else if ( p=='O' && r=='O'){tot_tn++;tn++;} + } + } + rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best); + + if (print==PRINT) fprintf (stdout, ">%-25s sp: %.2f sn: %.2f sen2: %.2f best: %.2f [COMP]\n",get_compound_name (b, mode), sp, sn, sen2,best); + recall[b]=sen2; + if ( best>0) + { + ns++; + tot_best_comp+=best; + tot_sn+=sn; + tot_sp+=sp; + tot_sen2+=sen2; + } + } + + if (ns) + { + tot_best_comp/=ns; + tot_sn/=ns; + tot_sp/=ns; + tot_sen2/=ns; + } + rates2sensitivity (tot_tp, tot_tn, tot_fp,tot_fn,&tot_sp, &tot_sn, &tot_sen2, &tot_best); + if (print==PRINT)fprintf ( stdout, ">FullTot sp: %.2f sn: %.2f sen2: %.2f best: %.2f N: %d[COMP]\n", tot_sp, tot_sn, tot_sen2,tot_best, ns); + return tot_best; +} + + + +Alignment * aln2prediction (Alignment *A,char ***motif, int *pos) +{ + int a, b,nc, nl; + int *list; + char **array, **sar; + Alignment *R; + Sequence *S; + nc=read_array_size ((void *)motif, sizeof (char***)); + + + list=pos2list (pos, A->len_aln, &nl); + + + array=declare_char (A->nseq, nl+1); + sar=declare_char(A->nseq, nc+1); + for (a=0; anseq; a++) + { + for (b=0; bseq_al[a][list[b]]; + } + + for (a=0; anseq; b++) + { + + sar[b][a]=(match_motif (array[b], motif[a]))?'I':'O'; + } + } + + + S=fill_sequence_struc (A->nseq,sar,A->name); + R=seq2aln (S, NULL, KEEP_GAP); + free_sequence (S, S->nseq); + free_char (sar, -1); + vfree (list); + free_char (array, -1); + return R; +} + +int * file2pos_list (Alignment *A, char *posfile) +{ + char ***file; + int **index; + int *pos; + int i, n, p; + + //pos_file: + // 1 2 3 4 + + + if ( !check_file_exists (posfile)) + { + printf_exit ( EXIT_FAILURE, stderr, "ERROR: Could not read posfile %s\n", posfile); + } + + file=file2list (posfile, " "); + + index=aln2inv_pos (A); + pos=vcalloc ( A->len_aln, sizeof (int)); + + n=0; + while (file[n]) + { + + if ( !strm (file[n][1], "P:")); + else + { + if ( (strm (file[n][2], "cons"))) + p=atoi(file[n][3])-1; + else + { + i=name_is_in_list ( file[n][2], A->name, A->nseq, MAXNAMES+1); + if (i!=-1) + p=index[i][atoi(file[n][3])]-1; + else p=-1; + } + if (p!=-1)pos[p]+=atoi(file[n][4]); + } + n++; + } + + + free_int (index, -1); + free_arrayN ( (char **)file, 3); + return pos; +} +int * aln2predictive_positions (Alignment *A, Alignment *B, char *mode, int print) +{ + char posmode[100]; + + if (!mode) return NULL; + HERE ("%s", mode); + strget_param (mode, "_POSMODE_", "scan", "%s", posmode); + if ( strm (posmode, "mat"))return aln2predictive_positions_mat (A, B, mode, print); + else if ( strm (posmode, "scan")) return aln2predictive_positions_scan (A, B, mode, print); + else + { + printf_exit (EXIT_FAILURE,stderr, "ERROR: %s is an unknown _POSMODE_ mode",posmode); + return NULL; + } +} + +int * aln2predictive_positions_mat (Alignment *A, Alignment *B, char *mode, int print) + { + int a, b, c,gap, res1, res2, sar1, sar2, npos, s, idscore; + float id1,id2,id3,nid1,nid2,nid3; + int **pos, *fpos; + pos=declare_int (A->len_aln,2); + fpos=vcalloc ( A->len_aln, sizeof (int)); + + strget_param (mode, "_NPOS_", "2", "%d", &npos); + for ( a=0; a< A->len_aln; a++) + { + pos[a][0]=a; + id1=id2=id3=nid1=nid2=nid3=0; + for ( gap=0,b=0; bnseq; b++)gap+=(A->seq_al[b][a]=='-'); + if ( gap>0){pos[a][1]=0;continue;} + + for (s=0; slen_aln; s++) + { + for ( gap=0,b=0; bnseq-1; b++) + { + sar1=B->seq_al[b][s]; + res1=A->seq_al[b][a]; + + for ( c=b+1; cnseq; c++) + { + sar2=B->seq_al[c][s]; + res2=A->seq_al[c][a]; + + idscore=(res1==res2)?1:0; + if ( sar1 == 'I' && sar2=='I'){id1+=idscore;nid1++;} + else if ( sar1 =='0' && sar2=='0'){id2+=idscore;nid2++;} + else {id3+=idscore; nid3++;} + + } + } + id1=(nid1==0)?1:id1/nid1; + id2=(nid1==0)?1:id2/nid2; + id3=(nid3==0)?1:id3/nid3; + pos[a][1]=(int)((float)1000*id1*(1-id3)); + + } + } + + sort_int (pos, 2,1, 0, A->len_aln-1); + for ( a=MAX(0,(A->len_aln-npos));alen_aln; a++) + { + fpos[pos[a][0]]=1; + } + + free_int (pos, -1); + return fpos; + } +int * aln2predictive_positions_scan (Alignment *A, Alignment *B, char *mode, int print) +{ + int a, b, c, best_pos,nl, nplist=0, max, posw; + float best_score, score; + static int *list, *tpos,**plist,*array; + int *pos; + + + char posfile[100]; + char predmode[100]; + char target_posfile[100]; + + + + if (!A) + { + vfree (list); + vfree (tpos); + + free_int (plist, -1); + vfree (array); + return NULL; + } + + strget_param (mode, "_PREDMODE_", "ID", "%s", predmode); + strget_param (mode, "_POSW_", "1", "%d", &posw); + strget_param (mode, "_NPOS_", "2", "%d", &max); + strget_param (mode, "_POSFILE_", "NO", "%s", posfile); + strget_param (mode, "_TPOSFILE_", "NO", "%s", target_posfile); + + if ( !strm(posfile, "NO"))return file2pos_list (A,posfile); + if ( !strm(target_posfile, "NO"))tpos=file2pos_list (A,target_posfile); + else + { + tpos=vcalloc (A->len_aln, sizeof (int)); + for (a=0; alen_aln; a++)tpos[a]=1; + } + + //Declare the positions that are going to be scanned + + + if (posw==1) + { + plist=declare_int (A->len_aln, 2); + nplist=0; + for (a=0; alen_aln; a++) + { + if(tpos[a]) + { + plist[nplist][0]=1; + plist[nplist][1]=a; + nplist++; + } + } + } + else if ( posw==2) + { + nplist=0; + plist=declare_int (A->len_aln*A->len_aln, 3); + for (a=0; alen_aln; a++) + for (b=0; blen_aln; b++) + { + plist[nplist][1]=a; + plist[nplist][2]=b; + plist[nplist][0]=2; + nplist++; + } + } + else if ( posw==3) + { + nplist=0; + plist=declare_int (A->len_aln*A->len_aln*A->len_aln, 3); + for (a=0; alen_aln; a++) + for (b=0; blen_aln; b++) + { + plist[nplist][1]=a; + plist[nplist][2]=b; + plist[nplist][3]=0; + + + plist[nplist][0]=3; + nplist++; + } + } + + + pos=vcalloc ( A->len_aln, sizeof (int)); + if (max==0)max=A->len_aln; + else if ( max==-1) + { + for (a=0; alen_aln; a++)if (tpos[a]){pos[a]=1;} + aln2predictive_positions_scan (NULL, NULL, NULL, 0); + return pos; + } + + + + pos=vcalloc ( A->len_aln, sizeof (int)); + list=vcalloc (A->len_aln, sizeof (int)); + nl=0; + + + + for (a=0; a< max; a++) + { + int previous_best_pos=-1; + for (best_score=-9999,best_pos=0,b=0; bbest_score) + { + best_score=score; + best_pos=b; + } + for (c=1; c<=plist[b][0]; c++)pos[plist[b][c]]=0; + + } + if (best_pos==previous_best_pos)break; + else previous_best_pos=best_pos; + + //update the best_pos_list + for (b=1; b<=plist[best_pos][0]; b++) + list[nl++]=plist[best_pos][b]; + + + if ( print==PRINT) + { + for (b=0; blen_aln, sizeof (char**)); + for (a=0; alen_aln; a++) + { + + motifs[a]=compound2motif (A, B, pos, depth, a, mode, print); + } + + return motifs; +} +char ** compound2regexp_motif (Alignment *A, Alignment *B, int *pos, int depth, int c, char *mode, int print); +char ** compound2word_motif (Alignment *A, Alignment *B, int *pos, int depth, int c, char *mode, int print); + +char ** compound2motif (Alignment *A, Alignment *B, int *pos, int depth, int c, char *mode, int print) +{ + char mmode[100]; + + strget_param (mode, "_MOTIFMODE_", "word", "%s", mmode); //words, regexp + if ( strm (mmode, "regexp"))return compound2regexp_motif (A,B,pos, depth, c, mode, print); + else if ( strm (mmode, "word"))return compound2word_motif (A,B,pos, depth, c, mode, print); + else return NULL;} +char ** compound2word_motif (Alignment *A, Alignment *B, int *pos, int depth, int c, char *mode, int print) +{ + int a,l; + char *word, **motif; + float score; + + + word=or_id_evaluate2 (A, B, mode, pos,print, &score); + if ( !word) return NULL; + l=strlen (word); + + motif=declare_char (l+1, 2); + for (a=0; anseq=O->nseq=I->len_aln=O->len_aln=0; + for (a=0; alen_aln; a++) + { + if (pos[a]) + { + for (i=o=0,b=0; bnseq; b++) + { + if ( is_gap(A->seq_al[b][a]))return 0; + if (B->seq_al[b][c]=='I')I->seq_al[i++][I->len_aln]=A->seq_al[b][a]; + else O->seq_al[o++][O->len_aln]=A->seq_al[b][a]; + } + I->len_aln++; + O->len_aln++; + } + } + + if (O->len_aln==0 || I->len_aln==0) return 0; + O->nseq=o; + I->nseq=i; + for (a=0; aseq_al[a][O->len_aln]='\0'; + for (a=0; aseq_al[a][I->len_aln]='\0'; + + if (!I->nseq) return NULL; + + + + best_pred=best_motif=best_sn=best_sp=best_sen2=0; + + motif_file=vtmpnam (NULL); + + n=0; + if (depth>0) + { + alp=vcalloc ( sizeof (char**), I->len_aln); + alp_size= vcalloc ( I->len_aln, sizeof (int)); + for (a=0; alen_aln; a++) + { + char *col; + alp[a]=string2alphabet ( (col=aln_column2string (I,a)),depth, &alp_size[a]); + vfree (col); + } + generate_array_string_list (I->len_aln, alp, alp_size, &n, motif_file, OVERLAP); + } + else + { + int *used; + char r; + + used=vcalloc (256, sizeof (int)); + fpp=vfopen (motif_file,"w"); + for (a=0;alen_aln; a++) + { + for (b=0; bnseq; b++) + { + r=I->seq_al[b][a]; + if (!used[(int)r]){fprintf (fpp, "%c", r);used[(int)r]=1;} + } + for (b=0; bnseq; b++) + { + r=I->seq_al[b][a]; + used[(int)r]=0; + } + fprintf (fpp, " "); + } + fprintf (fpp, "\n"); + vfree (used); + vfclose (fpp); + + n=1; + depth=I->nseq; + } + + buf=vcalloc (2*(I->len_aln*depth)+1, sizeof (char)); + best_buf=vcalloc (2*(I->len_aln*depth)+1, sizeof (char)); + fpp=vfopen (motif_file, "r"); + + for (a=0; anseq; b++) + { + if (match_motif (I->seq_al[b], m2))tp++; + else fn++; + } + for (b=0; bnseq; b++) + { + if (match_motif (O->seq_al[b], m2))fp++; + else tn++; + } + rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best); + + if (best>= best_pred) + { + best_pred=best; + best_sp=sp; + best_sen2=sen2; + best_sn=sn; + sprintf (best_buf, "%s", buf); + } + m2--; + free_char (m2, -1); + } + vfclose (fpp); + if (print==PRINT)fprintf ( stdout, "\nMotifCompound %25s sp: %.2f sn: %.2f sen2: %.2f best: %.2f motif: ", get_compound_name(c, mode), best_sp, best_sn, best_sen2, best_pred); + m2=string2list (best_buf); + m=declare_char (I->len_aln+1, depth+1); + + for (a=0; alen_aln; a++) + { + sprintf (m[a], "%s", m2[a+1]); + if (print==PRINT) fprintf ( stdout, "[%2s]",m[a]); + } + if (print==PRINT)fprintf ( stdout, " N-motifs %d", n); + free_char (m2, -1); + + if (alp)free_arrayN((void ***) alp, 3); + if (alp_size)vfree (alp_size); + vfree (buf); vfree(best_buf); + + return m; +} + +double pos2sim (Alignment *A, Alignment *B, int *pos) +{ + return sar_aln2r (A, B,pos, PRINT); +} +double sar_aln2r (Alignment *A, Alignment *B, int *pos, int print) +{ + int a, b, c, d,r1, r2, n, score, sim; + double *r, result; + static double **slist; + int declare=0; + static int **M; + + + + if (!M)M=read_matrice ("blosum62mt"); + if (!slist) + { + int maxslist; + maxslist=A->nseq*A->nseq*10; + slist=declare_double (maxslist, 2); + } + + if (pos==NULL) + { + + declare=1; + pos=vcalloc ( A->len_aln+1, sizeof (int)); + for (a=0; alen_aln; a++)pos[a]=1; + pos[a]=-1; + + } + + for (n=0,a=0; a< A->nseq-1; a++) + { + + for (b=a+1; bnseq; b++) + { + + + for (sim=d=0,c=0; clen_aln; c++) + { + + if (pos[c]==0)continue; + + r1=A->seq_al[a][c]; + r2=A->seq_al[b][c]; + if (is_gap(r1) || is_gap(r2))return 0; + + sim+=M[r1-'A'][r2-'A']*pos[c]; + d+=MAX((M[r1-'A'][r1-'A']),(M[r2-'A'][r2-'A'])); + } + sim=(d==0)?0:(100*sim)/d; + score=(int)get_sar_sim(B->seq_al[a], B->seq_al[b]); + slist[n][0]=(double)sim; + slist[n][1]=(double)score; + if (print==PRINT)fprintf ( stdout, "SIM: %d %d [%s %s]\n", sim, score, A->name[a], A->name[b]); + n++; + } + } + + r=return_r(slist, n); + for (a=0; alen_aln; c++) + { + + I->nseq=O->nseq=I->len_aln=O->len_aln=0; + for (a=0; alen_aln; a++) + { + if (pos[a]) + { + for (i=o=0,b=0; bnseq; b++) + { + if ( is_gap(A->seq_al[b][a]))return 0; + if (B->seq_al[b][c]=='I')I->seq_al[i++][I->len_aln]=A->seq_al[b][a]; + else O->seq_al[o++][O->len_aln]=A->seq_al[b][a]; + } + I->len_aln++; + O->len_aln++; + } + } + if (O->len_aln==0 || I->len_aln==0) return 0; + O->nseq=o; + I->nseq=i; + for (a=0; aseq_al[a][O->len_aln]='\0'; + for (a=0; aseq_al[a][I->len_aln]='\0'; + + delta+=aln2sim(I,"blosum62mt")-aln2sim(O, "blosum62mt"); + + } + + return delta; +} + +char * get_compound_name (int c, char *mode) +{ + static int isset; + static Alignment *S; + static char *lname; + + if (!isset) + { + char *comp_list; + isset=1; + lname=vcalloc (100, sizeof (char)); + + if (!mode); + else + { + strget_param (mode, "_COMPLIST_", "NO", "%s", comp_list=vcalloc (100, sizeof (char))); + if (strm(comp_list, "NO")); + else + { + S=main_read_aln (comp_list, NULL); + vfree (comp_list); + } + } + } + if (!S || c>=S->nseq)sprintf (lname, "%d", c); + else + { + sprintf (lname, "%s", S->name [c]); + } + return lname; +} +ORP * declare_or_prediction ( int ncomp, int nseq, int len) +{ + ORP *P; + P=vcalloc ( 1, sizeof (ORP)); + P->ncomp=ncomp; + P->nseq=nseq; + P->len=len; + P->PR=NULL; + + P->pos=vcalloc (len+1, sizeof (int)); + + return P; +} + +void free_orp_list ( ORP**P) +{ + int a=0; + while (P[a]) + { + free_orp(P[a++]); + } +} +void free_orp ( ORP*P) +{ + if (!P) return; + free_aln (P->A); + free_aln (P->S); + free_aln (P->P); + vfree (P->pos); + free_arrayN((void **)P->motif, 3); + if (P->PR)free_orp(P->PR); + vfree (P); +} + + + + + + + + + + + + + + + +/******************************COPYRIGHT NOTICE*******************************/ +/*© Centro de Regulacio Genomica */ +/*and */ +/*Cedric Notredame */ +/*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */ +/*All rights reserved.*/ +/*This file is part of T-COFFEE.*/ +/**/ +/* T-COFFEE is free software; you can redistribute it and/or modify*/ +/* it under the terms of the GNU General Public License as published by*/ +/* the Free Software Foundation; either version 2 of the License, or*/ +/* (at your option) any later version.*/ +/**/ +/* T-COFFEE is distributed in the hope that it will be useful,*/ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of*/ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/ +/* GNU General Public License for more details.*/ +/**/ +/* You should have received a copy of the GNU General Public License*/ +/* along with Foobar; if not, write to the Free Software*/ +/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/ +/*............................................... |*/ +/* If you need some more information*/ +/* cedric.notredame@europe.com*/ +/*............................................... |*/ +/**/ +/**/ +/* */ +/******************************COPYRIGHT NOTICE*******************************/