#include #include #include #include #include #include #include "io_lib_header.h" #include "util_lib_header.h" #include "dp_lib_header.h" #include "define_header.h" /************************************************************************************/ /* NEW ANALYZE 2 : SAR */ /************************************************************************************/ float display_prediction_old (int **prediction, int n, Alignment *A, Alignment *S, int field); float display_prediction (int ***count, Alignment *S, int c, int n); Alignment * filter_aln4sar0 ( Alignment *A, Alignment *S, int c, int leave, char *mode); Alignment * filter_aln4sar1 ( Alignment *A, Alignment *S, int c, int leave, char *mode); Alignment * filter_aln4sar2 ( Alignment *A, Alignment *S, int c, int leave, char *mode); Alignment * filter_aln4sar3 ( Alignment *A, Alignment *S, int c, int leave, char *mode); Alignment * filter_aln4sar4 ( Alignment *A, Alignment *S, int c, int leave, char *mode); Alignment * filter_aln4sar5 ( Alignment *A, Alignment *S, int c, int leave, char *mode); int **sar2profile ( Alignment *A, Alignment *S, int c, int leave); int **sar2profile_sim ( Alignment *A, Alignment *S, int **sim, int comp, int leave); int sar_profile2score ( char *seq, int **profile); double sar_vs_iseq1( char *sar, int *seq, float gl, int **sim, char *best_aa); double sar_vs_seq1 ( char *sar, char *seq, float gl, int **sim, char *best_aa); double sar_vs_seq2 ( char *sar, char *seq, float ng, int **mat, char *a); double sar_vs_seq3 ( char *sar, char *seq, float ng, int **mat, char *a); double sar_vs_iseq4 ( char *sar, int *seq, float ng, int **mat, char *a);//supports an extended alphabet double sar_vs_seq4 ( char *sar, char *seq, float ng, int **mat, char *a); double sar_vs_seq5 ( char *sar, char *seq, float ng, int **mat, char *a); int make_sim_pred ( Alignment *A,Alignment *S, int comp, int seq); int **sar2profile_sim ( Alignment *A, Alignment *S, int **sim, int comp, int leave) { int a, b, r, c, c1, c2, r1, r2, s, p; int ***cache, **profile; profile=declare_int (A->len_aln, 26); cache=declare_arrayN (3,sizeof (int),2,A->len_aln, 26); for ( a=0; a< A->len_aln; a++) for ( b=0; b< A->nseq; b++) { r=tolower(A->seq_al[b][a]); c=( S->seq_al[comp][b]=='I')?1:0; if (b==leave || is_gap(r)) continue; cache [c][a][r-'a']++; } for (a=0; a< A->nseq; a++) { if ( a==leave) continue; for ( b=0; b< A->nseq; b++) { c1=(S->seq_al[comp][a]=='I')?1:0; c2=(S->seq_al[comp][b]=='I')?1:0; if ( b==leave || b==a || c1!=1 || c1==c2) continue; s=sim[a][b]; for (p=0; plen_aln; p++) { r1=tolower(A->seq_al[a][p]); r2=tolower(A->seq_al[b][p]); if ( is_gap(r1) || is_gap(r2) || r1==r2)continue; r1-='a';r2-='a'; if (cache[1][p][r2])continue; if ( s<50)continue; profile[p][r2]-=s; } } } free_arrayN((void***)cache,3); return profile; } int **sar2profile ( Alignment *A, Alignment *S, int comp, int leave) { int a, b,c,r, n, v, npos=0; int ***cache, **profile; int ncat; float n_gap, max_gap; profile=declare_int (A->len_aln, 26); cache=declare_arrayN (3,sizeof (int),2,A->len_aln, 26); for ( n=0, a=0; a< A->nseq; a++) { if ( a==leave) continue; else n+=(S->seq_al[comp][a]=='I')?1:0; } for ( a=0; a< A->len_aln; a++) for ( b=0; b< A->nseq; b++) { r=tolower(A->seq_al[b][a]); c=( S->seq_al[comp][b]=='I')?1:0; if (b==leave) continue; else if (is_gap(r))continue; r-='a'; cache [c][a][r]++; } ncat=15; /*ncat: limit the analysis to columns containing less than ncat categories of aa*/ max_gap=0.05; for (a=0; a< A->len_aln; a++) { for (n_gap=0,b=0; b< A->nseq; b++) n_gap+=(is_gap(A->seq_al[b][a])); n_gap/=(float)A->nseq; if ( n_gap> max_gap)continue; for (v=0,r=0; r< 26; r++) { if (cache [0][a][r] || cache[1][a][r])v++; } for (n=0,r=0; r< 26 && vnseq, 2); list2=declare_int ( inA->len_aln, 2); cache=declare_arrayN (3,sizeof (int),inA->len_aln,2, 26); F=copy_aln (inA, NULL); A=copy_aln (inA, NULL); A->nseq=strlen (S->seq_al[comp]); strget_param (mode, "_T1_", "5", "%d", &T1); for ( a=0; a< A->len_aln; a++) { n1=n0=g=0; for (b=0; b< A->nseq; b++) { if ( b==leave) continue; i=(S->seq_al[comp][b]=='I')?1:0; r=tolower(A->seq_al[b][a]); if ( r=='-')continue; cache[a][i][r-'a']++; } } for (a=0; a< A->nseq; a++) for ( score=0,b=0; blen_aln; b++) { r=tolower (A->seq_al[a][b]); if ( is_gap(r))continue; else if ( cache[b][0][r-'a'] && !cache[b][1][r-'a'])list1[a][0]++; } for (a=0; a< A->len_aln; a++) { for ( score=0,b=0; b< A->nseq; b++) { r=tolower (A->seq_al[b][a]); if ( r=='-')continue; else r-='a'; if ( cache[a][0][r] && !cache[a][1][r])score ++; } list2[a][0]=a; list2[a][1]=score; } sort_int (list2, 2, 1, 0, F->len_aln-1); Delta=A->len_aln/(100/T1); for ( a=0; a< F->len_aln-Delta; a++) { b=list2[a][0]; for ( c=0; cnseq; c++) { F->seq_al[c][b]='-'; } } ungap_aln (F); free_aln (A); free_arrayN ( (void ***)cache, 3); free_arrayN ((void**)list1, 2); free_arrayN ((void**)list2, 2); return F; } Alignment * filter_aln4sar2 ( Alignment *inA, Alignment *S, int comp, int leave, char *mode) { Alignment *F, *A; int a,b,r,ncat; int *cache; int max_ncat=10; /*Keep Low entropy columns that contain less than ncat categories of different amino acids*/ /*REmove columns containing 10% or more gaps*/ cache=vcalloc ( 500, sizeof (char)); F=copy_aln (inA, NULL); A=copy_aln (inA, NULL); A->nseq=strlen (S->seq_al[comp]); for ( a=0; a< A->len_aln; a++) { for (ncat=0,b=0; b< A->nseq; b++) { if ( b==leave) continue; r=tolower(A->seq_al[b][a]); if ( !cache[r])ncat++; cache[r]++; } if ( ncat nseq)<10) { ; } else { for (b=0; bnseq; b++) { r=tolower(F->seq_al[b][a]); F->seq_al[b][a]='-'; cache[r]=0; } } for (b=0; bnseq; b++) { r=tolower(A->seq_al[b][a]); cache[r]=0; } } free_aln (A); ungap_aln (F); vfree (cache); return F; } Alignment * filter_aln4sar3 ( Alignment *inA, Alignment *S, int comp, int leave, char *mode) { Alignment *F, *rA, *A; int a, b,c; int **list1; char *bufS, *bufA; int Delta; int T3; /*Keep the 10% positions most correlated with the 0/1 pattern*/ A=copy_aln (inA, NULL); A->nseq=strlen (S->seq_al[comp]); F=copy_aln (inA, NULL); rA=rotate_aln (A, NULL); strget_param (mode, "_T3_", "10", "%d", &T3); list1=declare_int ( inA->len_aln, 2); bufA=vcalloc ( A->nseq+1, sizeof (char)); bufS=vcalloc ( A->nseq+1, sizeof (char)); sprintf ( bufS, "%s", S->seq_al[comp]); splice_out_seg(bufS,leave, 1); for (a=0; a< A->len_aln; a++) { char aa; list1[a][0]=a; sprintf (bufA, "%s", rA->seq_al[a]); splice_out_seg (bufA,leave,1); list1[a][1]=(int)sar_vs_seq3 ( bufS, bufA,0,NULL, &aa); } sort_int (list1, 2, 1, 0, F->len_aln-1); Delta=F->len_aln/(100/T3); for ( a=0; a< F->len_aln-Delta; a++) { b=list1[a][0]; for ( c=0; cnseq; c++) { F->seq_al[c][b]='-'; } } F->score_aln=list1[F->len_aln-1][1]; ungap_aln (F); free_aln (rA); free_aln(A); free_arrayN ((void**)list1, 2); vfree (bufS);vfree (bufA); return F; } Alignment * filter_aln4sar4 ( Alignment *inA, Alignment *S, int comp, int leave, char *mode) { Alignment *F, *A; int a, b,c, i,r, n0, n1,g,score; int ***cache, **list1, **list2; /*Keep only the positions where there are residues ONLY associated with 0 sequences*/ list1=declare_int ( inA->nseq, 2); list2=declare_int ( inA->len_aln, 2); cache=declare_arrayN (3,sizeof (int),inA->len_aln,2, 26); F=copy_aln (inA, NULL); A=copy_aln (inA, NULL); A->nseq=strlen (S->seq_al[comp]); for ( a=0; a< A->len_aln; a++) { n1=n0=g=0; for (b=0; b< A->nseq; b++) { if ( b==leave) continue; i=(S->seq_al[comp][b]=='I')?1:0; r=tolower(A->seq_al[b][a]); if ( r=='-')continue; cache[a][i][r-'a']++; n1+=i; } } for (a=0; a< A->len_aln; a++) { for ( score=0,b=0; b< A->nseq; b++) { r=tolower (F->seq_al[b][a]); if ( r=='-')continue; else r-='a'; if (cache[a][1][r]>=n1/2)score=1; } list2[a][0]=a; list2[a][1]=score; } for ( a=0; a< F->len_aln; a++) { if ( list2[a][1]==1); else { b=list2[a][0]; for ( c=0; cnseq; c++) { F->seq_al[c][b]='-'; } } } ungap_aln (F); free_aln (A); free_arrayN ( (void ***)cache, 3); free_arrayN ((void**)list1, 2); free_arrayN ((void**)list2, 2); return F; } Alignment * filter_aln4sar5 ( Alignment *inA, Alignment *S, int comp, int leave, char *mode) { Alignment *F, *rA, *A; int a, b,c; int **list1; char *bufS, *bufA; int max; /*Look for the positions that show the best correlation between the sequence variation and the SAR*/ A=copy_aln (inA, NULL); A->nseq=strlen (S->seq_al[comp]); rA=rotate_aln (inA, NULL); F=copy_aln (inA, NULL); list1=declare_int ( A->len_aln, 2); bufA=vcalloc ( A->nseq+1, sizeof (char)); bufS=vcalloc ( A->nseq+1, sizeof (char)); sprintf ( bufS, "%s", S->seq_al[comp]); splice_out_seg(bufS,leave, 1); for (a=0; a< A->len_aln; a++) { char aa; list1[a][0]=a; sprintf (bufA, "%s", rA->seq_al[a]); splice_out_seg (bufA,leave,1); list1[a][1]=(int)sar_vs_seq4 ( bufS, bufA,0,NULL, &aa); } sort_int (list1, 2, 1, 0, F->len_aln-1); max=F->score=list1[F->len_aln-1][1]; max-=(max/10); for ( a=0; a< F->len_aln-10; a++) { b=list1[a][0]; for ( c=0; cnseq; c++) { F->seq_al[c][b]='-'; } } F->score_aln=10; ungap_aln (F); free_aln (inA); free_aln (rA); free_arrayN ((void**)list1, 2); vfree (bufS);vfree (bufA); return F; } int sar_profile2score ( char *seq, int **P) { int a,r, l, score; l=strlen (seq); for ( score=0,a=0; a< l; a++) { r=seq[a]; if ( is_gap(r))continue; score+=P[a][tolower(r)-'a']; } return score; } int make_sim_pred ( Alignment *A,Alignment *S, int comp, int seq) { int a, b, i, r1, r2; static float **cscore; static float **tscore; if ( !cscore) { cscore=declare_float (2, 2); tscore=declare_float (2, 2); } for (a=0; a< 2; a++)for (b=0; b<2; b++)cscore[a][b]=tscore[a][b]=0; for ( a=0; alen_aln; a++) { r1=A->seq_al[seq][a]; if ( r1=='-') continue; else { for ( b=0; b< A->nseq; b++) { if (b==seq) continue; else { r2=A->seq_al[b][a]; if (r2=='-')continue; else { i=(S->seq_al[comp][b]=='I')?1:0; cscore[i][0]+=(r1==r2)?1:0; cscore[i][1]++; } } } for (i=0; i<2; i++) { cscore[i][0]/=(cscore[i][1]==0)?1:cscore[i][1]; tscore[i][0]+=cscore[i][0];tscore[i][1]++; cscore[i][0]=cscore[i][1]=0; } } } fprintf ( stdout, "\nn\t 1: %.2f 0: %.2f", tscore[1][0],tscore[0][0]); return ( tscore[1][0]>=tscore[0][0])?1:0; } Alignment * sar_analyze (Alignment *inA, Alignment *inS, char *mode) { int ***sim,***glob_results, ***comp_results; int *count; int a,b,c,m; float *tot2; Alignment *A=NULL,*S=NULL,*F, *SUBSET; char *subset, *target; int jack, T, filter; filter_func *ff; int n_methods=0; char *prediction, *reliability; int pred_start=0, pred_end, ref_start=0, ref_end; int display, CSV=1, NONCSV=0; char method[5]; strget_param (mode, "_METHOD_", "1111", "%s_", method); ff=vcalloc (6,sizeof (filter_func)); if (method[0]=='1')ff[n_methods++]=filter_aln4sar0; if (method[1]=='1')ff[n_methods++]=filter_aln4sar1; if (method[2]=='1')ff[n_methods++]=filter_aln4sar2; if (method[3]=='1')ff[n_methods++]=filter_aln4sar3; /* ff[n_methods++]=filter_aln4sar4; ff[n_methods++]=filter_aln4sar5; */ sim=vcalloc (n_methods, sizeof (int**)); tot2=vcalloc ( 10, sizeof (float)); subset=vcalloc ( 100, sizeof (char)); target=vcalloc ( 100, sizeof (char)); strget_param (mode, "_TARGET_", "no", "%s_", target); strget_param (mode, "_SUBSET_", "no", "%s_", subset); strget_param (mode, "_JACK_", "0", "%d", &jack); strget_param (mode, "_T_", "0", "%d", &T); strget_param (mode, "_FILTER_", "11", "%d", &filter); strget_param (mode, "_DISPLAY_", "0", "%d", &display); if ( !strm (target, "no")) { Alignment *T; T=main_read_aln(target, NULL); if ( T->len_aln !=inA->len_aln ) { printf_exit ( EXIT_FAILURE,stderr, "Error: %s is incompatible with the reference alignment [FATAL:%s]",target,PROGRAM); } inA=stack_aln (inA, T); } if ( !strm(subset, "no")) { SUBSET=main_read_aln (subset, NULL); sarset2subsarset ( inA, inS, &A, &S, SUBSET); } else { A=inA; S=inS; } prediction=vcalloc ( n_methods+1, sizeof (char)); reliability=vcalloc ( n_methods+1, sizeof (char)); glob_results=declare_arrayN(3, sizeof (int), n_methods*2, 2, 2); count=vcalloc (S->nseq, sizeof (int)); for (a=0; anseq; a++) { int l; l=strlen (S->seq_al[a]); for ( b=0; bseq_al[a][b]=='I')?1:0; } if ( display==CSV) {fprintf ( stdout, "\nCompound %s ; Ntargets %d", S->name[a],count[a]); pred_start=(strlen (S->seq_al[0])==A->nseq)?0:strlen (S->seq_al[0]); pred_end=A->nseq; for (a=pred_start; a< pred_end; a++) fprintf ( stdout, ";%s", A->name[a]); fprintf ( stdout, ";npred;"); } for (a=0; anseq; a++) { int n_pred; comp_results=declare_arrayN(3, sizeof (int), n_methods*2, 2, 2); pred_start=(strlen (S->seq_al[a])==A->nseq)?0:strlen (S->seq_al[a]); pred_end=A->nseq; if ( display==CSV)fprintf ( stdout, "\n%s;%d", S->name[a],count[a]); for (n_pred=0,b=pred_start; b%-15s %10s %c ", S->name[a], A->name[b], (pred_start==0)?S->seq_al[a][b]:'?'); if (jack || b==pred_start) { for (m=0; mseq_al[m]); for (c=ref_start;cseq_al[a][c]=='O') { Nbsim=MAX(Nbsim,sim[m][b][c]); } else { Ybsim=MAX(Ybsim,sim[m][b][c]); } } bsim=(Ybsim>Nbsim)?Ybsim:-Nbsim; pred=(bsim>0)?1:0; real=(S->seq_al[a][b]=='O')?0:1; comp_results[m][pred][real]++; glob_results[m][pred][real]++; score+=pred; prediction[m]=pred+'0'; reliability[m]=(FABS((Ybsim-Nbsim))-1)/10+'0'; } if ( score>0)n_pred++; prediction[m]=reliability[m]='\0'; if (display==NONCSV)fprintf ( stdout, "Compound_Count:%d primary_predictions: %s Total: %d", count[a],prediction, score); else if ( display==CSV)fprintf ( stdout, ";%d", score); for (t=0; tt) { comp_results[t+n_methods][1][real]++; glob_results[t+n_methods][1][real]++; } else { comp_results[t+n_methods][0][real]++; glob_results[t+n_methods][0][real]++; } } } if ( display==NONCSV) {if ( pred_start==0)display_prediction (comp_results, S,a, n_methods*2);} else fprintf (stdout, ";%d;",n_pred); } if ( display==NONCSV)if (pred_start==0)display_prediction (glob_results, S,-1, n_methods*2); exit (EXIT_SUCCESS); } float display_prediction (int ***count, Alignment *S, int c, int n) { float tp,tn,fn,fp,sp,sn,sn2; int a, nm; nm=n/2; for (a=0; a#Method %d Compound %15s sp=%.2f sn=%.2f sn2=%.2f",a, (c==-1)?"TOTAL":S->name[c],sp, sn, sn2 ); else fprintf ( stdout, "\n>#Combined: T=%d Compound %15s sp=%.2f sn=%.2f sn2=%.2f",a-nm, (c==-1)?"TOTAL":S->name[c],sp, sn, sn2 ); } fprintf ( stdout, "\n"); return 0; } float display_prediction_2 (int **prediction, int n,Alignment *A, Alignment *S, int field) { int a, t, T; float max_sn, max_sp; if ( field==17 || field ==18) { printf_exit ( EXIT_FAILURE, stderr, "\nERROR: Do not use filed %d in display_prediction", field); } sort_int_inv ( prediction, 10,field, 0, n-1); for (t=0,a=0; a=0; a--) { prediction[a][18]=t; t+=prediction[a][3]; } max_sn=max_sp=T=0; for (a=0; a0.8) { if (sn>max_sn) { max_sn=sn; max_sp=sp; T=prediction[a][field]; } } } if (max_sn==0) fprintf (stdout, "\n T =%d SN=%.2f SP= %.2f",T,max_sn,max_sp); else fprintf (stdout, "\n T =%d SN=%.2f SP= %.2f",T,max_sn,max_sp); return max_sn; } /************************************************************************************/ /* NEW ANALYZE : SAR */ /************************************************************************************/ float** cache2pred1 (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode); float** cache2pred2 (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode); float** cache2pred3 (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode); float** cache2pred4 (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode); float** cache2pred5 (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode); float** cache2pred_new (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode); int **sar2cache_adriana ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode); int **sar2cache_proba_old ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode); int **sar2cache_count1 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode); int **sar2cache_count2 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode); int **sar2cache_count3 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode); int **sar2cache_proba_new ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode); int **sar2cache_proba2 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode); int **analyze_sar_compound1 ( char *name, char *seq, Alignment *A, char *mode); int **analyze_sar_compound2 ( char *name, char *seq, Alignment *A, char *mode); int aln2n_comp_col ( Alignment *A, Alignment *S, int ci); int ***simple_sar_analyze_vot ( Alignment *inA, Alignment *SAR, char *mode); int ***simple_sar_analyze_col ( Alignment *inA, Alignment *SAR, char *mode); int sarset2subsarset ( Alignment *A, Alignment *S, Alignment **subA, Alignment **subS, Alignment *SUB); int benchmark_sar (int v); int aln2jack_group1 (Alignment *A, int seq, int **l1, int *nl1, int **l2, int *nl2); int aln2jack_group2 (Alignment *A, int seq, int **l1, int *nl1, int **l2, int *nl2); int aln2jack_group3 (Alignment *A, char *sar_seq, int **l1, int *nl1, int **l2, int *nl2); float** jacknife5 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode); float** jacknife6 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode); int process_cache ( Alignment *A,Alignment *S, int ***Cache, char *mode); Alignment *analyze_compounds (Alignment *A, Alignment *S, char *mode); Alignment *analyze_compounds (Alignment *A, Alignment *S, char *mode) { int a, b, c, tot, n; int **sim; int sar1, sar2; sim=aln2sim_mat (A, "idmat"); for (a=0; a< S->nseq; a++) { for (n=0, tot=0, b=0; b< A->nseq-1; b++) { sar1=(S->seq_al[a][b]=='I')?1:0; for ( c=b+1; cnseq; c++) { sar2=(S->seq_al[a][c]=='I')?1:0; if (sar1 && sar2) { tot+=sim[b][c]; n++; } } } fprintf ( stdout, ">%-10s CMPSIM: %.2f\n", S->name[a],(float)tot/(float)n); } free_int (sim, -1); return A; } int print_seq_pos ( int pos, Alignment *A, char *seq); int abl1_evaluation (int p); int print_seq_pos ( int pos, Alignment *A, char *seq) { int a, b, s; s=name_is_in_list (seq, A->name, A->nseq, MAXNAMES); fprintf ( stdout, "S=%d", s); for (b=0,a=0; aseq_al[s][a]))b++; } fprintf ( stdout, "Pos %d SEQ %s: %d ", pos+1, seq, b+246); if ( strm ( seq, "ABL1")) fprintf ( stdout , "PT: %d", abl1_evaluation (b+246)); return 0; } int process_cache ( Alignment *A,Alignment *S, int ***Cache, char *mode) { int a, b; int **pos, **pos2; int **C; int ab1, *ab1_pos; int weight_mode; strget_param ( mode, "_WEIGHT_", "1", "%d", &weight_mode); pos=declare_int(A->len_aln+1,2); pos2=declare_int (A->len_aln+1,S->nseq); for (a=0; anseq; a++) { C=Cache[a]; for (b=0; b< A->len_aln; b++) { pos[b][0]+=C[26][b]; if ( C[26][b]>0) { pos[b][1]++; pos2[b][a]=1; } } } C=Cache[0]; ab1=name_is_in_list ("ABL1", A->name, A->nseq,100); ab1_pos=vcalloc (A->len_aln+1, sizeof (int)); for ( b=0,a=0; a< A->len_aln; a++) { if ( A->seq_al[ab1][a]=='-')ab1_pos[a]=-1; else ab1_pos[a]=++b; } for ( a=0; a< A->len_aln; a++) { fprintf ( stdout, "\n%4d %5d %5d %5d [%c] [%2d] ALN", a+1, pos[a][0], pos[a][1], ab1_pos[a]+246,A->seq_al[ab1][a],abl1_evaluation (ab1_pos[a]+246)); for ( b=0; b< S->nseq; b++)fprintf ( stdout, "%d", pos2[a][b]); } return 1; } int abl1_evaluation (int p) { if ( p==248) return 10; if ( p==250) return 10; if ( p==253) return 10; if ( p==254) return 10; if ( p==255) return 9; if ( p==256) return 10; if ( p==257) return 5; if ( p==258) return 8; if ( p==269) return 8; if ( p==291) return 4; if ( p==294) return 8; if ( p==299) return 10; if ( p==306) return 0; if ( p==314) return 9; if ( p==315) return 10; if ( p==318) return 10; if ( p==319) return 10; if ( p==321) return 10; if ( p==323) return 0; if ( p==324) return 0; if ( p==339) return 0; if ( p==340) return 0; if ( p==355) return 5; if ( p==364) return 10; if ( p==366) return 0; if ( p==368) return 10; if ( p==370) return 10; if ( p==372) return 0; if ( p==378) return 8; if ( p==382) return 10; if ( p==384) return 10; if ( p==387) return 10; if ( p==395) return 8; if ( p==398) return 8; if ( p==399) return 8; if ( p==400) return 8; if ( p==403) return 0; if ( p==416) return 8; if ( p==419) return 5; if ( p>400) return 0; return -1; } float** cache2pred1 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode) { int s1, s2, seq1, seq2, r1, r2,col, pred, real, ci; double score, max, id, m; float **R, T; int used_col, used_res,is_used_col, n_res=0; int weight_mode; /*Predict on ns[1] what was trained on ns[0]*/ strget_param ( mode, "_THR_", "0.09", "%f", &T); strget_param ( mode, "_WEIGHT_", "0", "%d", &weight_mode); R=declare_float (2, 2); ci=name_is_in_list ( compound, S->name, S->nseq, -1); for (s1=0; s1len_aln; col++) { int max1; r1=tolower (A->seq_al[seq1][col]); for (max1=0,id=0, m=0,s2=0; s2seq_al[ci][seq2]=='O')continue; if ( cache[seq2][col]==0 && !is_gap( A->seq_al[seq2][col]))continue; r2=tolower ( A->seq_al[seq2][col]); if ( is_gap(r2))continue; v=(cache[seq2][col]>0 && weight_mode==1)?cache[seq2][col]:1; max+=v; if ( r2==r1) { score+=v; } } } pred=(( score/max) >T)?1:0; real=(S->seq_al[ci][seq1]=='I')?1:0; R[pred][real]++; fprintf ( stdout, "\n>%s %d%d SCORE %.2f C %s [SEQ]\n", A->name[seq1],real, pred, (float)score/(float)max, compound); } for (used_col=0,used_res=0,col=0; collen_aln; col++) { for (is_used_col=0,s2=0; s2seq_al[seq2][col]))n_res++; else if (is_gap(A->seq_al[seq2][col])); else { is_used_col=1; used_res++; } } used_col+=is_used_col; } fprintf ( stdout, "\n>%s USED_POSITIONS: COL: %.2f RES: %.2f COMP\n", S->name[ci], (float)used_col/(float)A->len_aln, (float)used_res/(float) n_res); return R; } float** cache2pred2 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode) { int s1, s2, seq1, seq2, r1, r2,col, pred, real, ci; double score, max; float **R, T; int used_col, used_res,is_used_col, n_res=0; /*Predict on ns[1] what was trained on ns[0]*/ strget_param ( mode, "_THR_", "0.5", "%f", &T); R=declare_float (2, 2); ci=name_is_in_list ( compound, S->name, S->nseq, -1); for (s1=0; s1len_aln; col++) { int used; r1=tolower (A->seq_al[seq1][col]); for (used=0,s2=0; s2seq_al[ci][seq2]=='O')continue; if ( cache[seq2][col]==0 && !is_gap( A->seq_al[seq2][col]))continue; r2=tolower ( A->seq_al[seq2][col]); if ( is_gap(r2))continue; v=cache[seq2][col]; if ( r2==r1){score+=v;} used=1; max+=v; } if (used) fprintf ( stdout, "%c", r1); } pred=(( score/max) >T)?1:0; real=(S->seq_al[ci][seq1]=='I')?1:0; R[pred][real]++; fprintf ( stdout, "PSEQ: %-10s SC: %4d MAX: %4d S: %.2f R: %4d", A->name[seq1],(int)score, (int)max, (float)score/max,real); } for (used_col=0,used_res=0,col=0; collen_aln; col++) { for (is_used_col=0,s2=0; s2seq_al[seq2][col]))n_res++; else if (is_gap(A->seq_al[seq2][col])); else { is_used_col=1; used_res++; } } used_col+=is_used_col; } fprintf ( stdout, "\n>%s USED_POSITIONS: COL: %.2f RES: %.2f COMP\n", S->name[ci], (float)used_col/(float)A->len_aln, (float)used_res/(float) n_res); return R; } float** cache2pred3 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode) { int s1, s2, seq1, seq2, r1, r2,col, pred, real, ci, a, n; double score, max; float **R, T; int tp, tn, fn, fp; int best_tp, best_fp; int delta, best_delta; int **list; /*Predict on ns[1] what was trained on ns[0]*/ strget_param ( mode, "_THR_", "0.5", "%f", &T); R=declare_float (2, 2); ci=name_is_in_list ( compound, S->name, S->nseq, -1); list=declare_int ( ns[1],3); for (s1=0; s1len_aln; col++) { int used; r1=tolower (A->seq_al[seq1][col]); for (used=0,s2=0; s2seq_al[ci][seq2]=='O')continue; if ( cache[seq2][col]==0 && !is_gap( A->seq_al[seq2][col]))continue; r2=tolower ( A->seq_al[seq2][col]); if ( is_gap(r2))continue; v=cache[seq2][col]; if ( r2==r1){score+=v;} used=1; max+=v; } } pred=(( score/max) >T)?1:0; real=(S->seq_al[ci][seq1]=='I')?1:0; list[s1][0]=real; list[s1][1]=(int)((score/max)*(float)1000); list[s1][2]=seq1; } sort_int_inv (list, 3, 1, 0, ns[1]-1); for ( a=0; aname[seq1],list[a][0], list[a][1]); } for (n=0, a=0; alen_aln, sizeof (int)); for (a=0; a< A->len_aln; a++) for ( b=0; b< A->nseq; b++) if ( cache[b][a])ul[nused++]=a; /*compute the similarity on the used columns*/ R=declare_float (2, 2); sim=declare_int (A->nseq, A->nseq); for (a=0; a< A->nseq; a++) for ( b=0; b< A->nseq; b++) { for (c=0; c< nused; c++) { if ( A->seq_al[a][ul[c]]==A->seq_al[b][ul[c]])sim[a][b]++; } sim[a][b]=(sim[a][b]*100)/nused; } vfree (ul); ci=name_is_in_list ( compound, S->name, S->nseq, -1); list=declare_int ( ns[1],2); for (s1=0; s1seq_al[ci][seq2]=='I')score=MAX(score, sim[seq1][seq2]); } list[s1][0]=(S->seq_al[ci][seq1]=='I')?1:0; list[s1][1]=(int)score; } sort_int_inv (list, 2, 1, 0, ns[1]-1); for (n=0, a=0; aname, S->nseq, -1); list=declare_int ( ns[1],2); for (s1=0; s1seq_al[ci][seq2]=='I')score=MAX(score, sim[seq1][seq2]); } list[s1][0]=(S->seq_al[ci][seq1]=='I')?1:0; list[s1][1]=(int)score; } sort_int_inv (list, 2, 1, 0, ns[1]-1); for (n=0, a=0; aname, S->nseq, -1); list=declare_int (A->nseq,2); R=declare_float (2, 2); for ( a=0; anseq; a++) { int real, res; ns[0]=A->nseq-1; ns[1]=1; for (c=0,b=0; bnseq; b++) if (a!=b)ls[0][c++]=b; ls[1][0]=a; cache=sar2cache_count1 (A, ns, ls,S, compound, mode); for (b=0; b<=26; b++) for ( c=0; c< A->len_aln; c++) cacheIN[b][c]+=cache[b][c]; seq1=a; real=(S->seq_al[ci][seq1]=='I')?1:0; fprintf ( stdout, ">%-10s %d ", A->name[seq1], real); for (max_score=0,b=0; blen_aln; b++) max_score+=cache[26][b]; for (score=0,b=0; blen_aln; b++) { res=tolower (A->seq_al[seq1][b]); if ( cache[26][b]==0) continue; if ( !is_gap(res)) { score+=cache[res-'a'][b]; } /*fprintf ( stdout, "%c[%3d]", res,b);*/ } fprintf ( stdout, " SCORE: %5d SPRED %d RATIO: %.2f \n", (int)score, a, (score*100)/max_score); list[a][0]=real; if ( strstr (mode, "SIMTEST"))list[a][1]=(score*100)/max_score; else list[a][1]=(score*100)/max_score; free_int (cache, -1); } sort_int_inv (list, 2, 1, 0, A->nseq-1); for (n=0, a=0; anseq; a++) { n+=list[a][0]; } for (best_delta=100000,best_tp=0,tp=0,fp=0,best_fp=0,a=0; anseq; a++) { tp+=list[a][0]; fp+=1-list[a][0]; delta=(n-(tp+fp)); if (FABS(delta)nseq-(tp+fp+fn); R[1][1]=tp; R[1][0]=fp; R[0][1]=fn; R[0][0]=tn; free_int (list, -1); return R; } float** jacknife6 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode) { int seq1, ci, a,b, c,d,e,f, n; double score; float **R; int tp, tn, fn, fp; int best_tp, best_fp; int delta, best_delta; int **list; ci=name_is_in_list ( compound, S->name, S->nseq, -1); list=declare_int (A->len_aln,2); R=declare_float (2, 2); for ( a=0; anseq; a++) { int sar, res; int **new_cache; ns[0]=A->nseq-1; ns[1]=1; for (c=0,b=0; bnseq; b++) if (a!=b)ls[0][c++]=b; ls[1][0]=a; cache=sar2cache_proba_new (A, ns, ls,S, compound, mode); new_cache=declare_int (27,A->len_aln); for (d=0; d< A->len_aln; d++) { int **analyze; if ( cache[26][d]==0)continue; analyze=declare_int (26, 2); for ( e=0; e< ns[0]; e++) { f=ls[0][e]; sar=(S->seq_al[ci][f]=='I')?1:0; res=tolower (A->seq_al[f][d]); if ( res=='-') continue; analyze[res-'a'][sar]++; } for (e=0;e<26; e++) { if ( analyze[e][1]){new_cache[26][d]=1;new_cache[e][d]+=cache[e][d];} /* if ( analyze[e][0] && analyze[e][1]){new_cache[26][d]=1;new_cache[e][d]+=analyze[e][1];} else if ( analyze[e][0]){new_cache[26][d]=1;new_cache[e][d]-=analyze[e][0]*10;} else if ( analyze[e][1]){new_cache[26][d]=1;new_cache[e][d]+=analyze[e][1];} else if ( !analyze[e][0] &&!analyze[e][1]); */ } free_int (analyze, -1); } seq1=a; sar=(S->seq_al[ci][seq1]=='I')?1:0; fprintf ( stdout, ">%-10s %d ", A->name[seq1], sar); for (score=0,b=0; blen_aln; b++) { res=tolower (A->seq_al[seq1][b]); if ( cache[26][b]==0) continue; if ( !is_gap(res)) { score+=new_cache[res-'a'][b]; } } fprintf ( stdout, " SCORE: %5d SPRED\n", (int)score); list[seq1][0]=sar; list[seq1][1]=(int)score; free_int (new_cache, -1); free_int (cache, -1); } sort_int_inv (list, 2, 1, 0, A->nseq-1); for (n=0, a=0; anseq; a++)n+=list[a][0]; for (best_delta=100000,best_tp=0,tp=0,fp=0,best_fp=0,a=0; anseq; a++) { tp+=list[a][0]; fp+=1-list[a][0]; delta=(n-(tp+fp)); if (FABS(delta)nseq-(tp+fp+fn); R[1][1]=tp; R[1][0]=fp; R[0][1]=fn; R[0][0]=tn; free_int (list, -1); return R; } float** cache2pred_new (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode) { int s1, seq1, ci, a,b, n; double score; float **R; int tp, tn, fn, fp; int best_tp, best_fp; int delta, best_delta; int **list; ci=name_is_in_list ( compound, S->name, S->nseq, -1); list=declare_int ( ns[1],2); R=declare_float (2, 2); for (s1=0; s1seq_al[ci][seq1]=='I')?1:0; fprintf ( stdout, ">%-10s %d ", A->name[seq1], real); for (score=0,b=0; blen_aln; b++) { res=tolower (A->seq_al[seq1][b]); if ( cache[26][b]==0) continue; if ( !is_gap(res)) { score+=cache[res-'a'][b]; } fprintf ( stdout, "%c", res); } fprintf ( stdout, " SCORE: %5d SPRED\n", (int)score); list[s1][0]=real; list[s1][1]=(int)score; } sort_int_inv (list, 2, 1, 0, ns[1]-1); for (n=0, a=0; aname, S->nseq, -1); list=declare_int ( ns[1],2); R=declare_float (2, 2); for (s1=0; s1seq_al[ci][seq1]=='I')?1:0; fprintf ( stdout, ">%-10s %d ", A->name[seq1], real); for (score=0,b=0; blen_aln; b++) { res=tolower (A->seq_al[seq1][b]); if ( cache[26][b]==0) continue; if ( !is_gap(res)) { score+=cache[res-'a'][b]; } fprintf ( stdout, "%c", res); } fprintf ( stdout, " SCORE: %5d SPRED\n", (int)score); list[s1][0]=real; list[s1][1]=(int)score; } new_cache=declare_int (27,A->len_aln); for (a=0; a< A->len_aln; a++) { int **analyze, real, res, d; int *res_type; int **sub; int *keep; keep=vcalloc ( 26, sizeof (int)); res_type=vcalloc ( 26, sizeof (int)); sub=declare_int (256, 2); if ( cache[26][a]==0)continue; analyze=declare_int (26, 2); for ( b=0; b< ns[0]; b++) { seq1=ls[0][b]; real=(S->seq_al[ci][seq1]=='I')?1:0; res=tolower (A->seq_al[seq1][a]); if ( res=='-') continue; analyze[res-'a'][real]++; } fprintf ( stdout, "RSPRED: "); for (c=0;c<26; c++)fprintf ( stdout, "%c", c+'a'); fprintf ( stdout, "\nRSPRED: "); for (c=0;c<26; c++) { if ( analyze[c][0] && analyze[c][1]){fprintf ( stdout, "1");res_type[c]='1';} else if ( analyze[c][0]){new_cache[26][a]=1;new_cache[c][a]-=analyze[c][0];fprintf ( stdout, "0");res_type[c]='0';} else if ( analyze[c][1]){new_cache[26][a]=1;new_cache[c][a]+=analyze[c][1];fprintf ( stdout, "1");res_type[c]='1';} else if ( !analyze[c][0] &&!analyze[c][1]){fprintf ( stdout, "-");res_type[c]='-';} } for ( c=0; c<26; c++) { for ( d=0; d<26; d++) { if ( res_type[c]==res_type[d]) { sub[res_type[c]][0]+=mat[c][d]; sub[res_type[c]][1]++; } if ( res_type[c]!='-' && res_type[d]!='-') { sub['m'][0]+=mat[c][d]; sub['m'][1]++; } } } for ( c=0; c< 256; c++) { if ( sub[c][1])fprintf ( stdout, " %c: %5.2f ", c, (float)sub[c][0]/(float)sub[c][1]); } fprintf ( stdout, " SC: %d\nRSPRED ", cache[26][a]); for ( c=0; c<26; c++) if ( res_type[c]=='1') { for (d=0; d<26; d++) if (mat[c][d]>0)keep[d]++; keep[c]=9; } for (c=0; c<26; c++) { if ( keep[c]>10)fprintf ( stdout, "9"); else fprintf ( stdout, "%d", keep[c]); } for ( c=0; c<26; c++) { if ( keep[c]>8)new_cache[c][a]=10; else new_cache[c][a]=-10; } fprintf ( stdout, "\n"); free_int (analyze, -1); free_int (sub, -1); vfree (res_type); vfree (keep); } for ( a=0; a<25; a++) for (b=a+1; b<26; b++) { int r1, r2; r1=a+'a';r2=b+'a'; if ( strchr("bjoxz", r1))continue; if ( strchr("bjoxz",r2))continue; if ( mat[a][b]>0 && a!=b)fprintf ( stdout, "\nMATANALYZE %c %c %d", a+'a', b+'a', mat[a][b]); } for (s1=0; s1seq_al[ci][seq1]=='I')?1:0; fprintf ( stdout, ">%-10s %d ", A->name[seq1], real); for (score=0,b=0; blen_aln; b++) { res=tolower (A->seq_al[seq1][b]); if ( cache[26][b]==0) continue; if ( !is_gap(res)) { score+=new_cache[res-'a'][b]; } fprintf ( stdout, "%c", res); } fprintf ( stdout, " SCORE: %5d SPRED\n", (int)score); list[s1][0]=real; list[s1][1]=(int)score; } free_int (new_cache, -1); sort_int_inv (list, 2, 1, 0, ns[1]-1); for (n=0, a=0; aname, S->nseq, -1); cache=declare_int (A->nseq, A->len_aln); strget_param ( mode, "_FILTER1_", "0" , "%f", &T1); strget_param ( mode, "_FILTER2_", "1000000", "%f", &T2); strget_param ( mode, "_FILTER3_", "0" , "%f", &T3); strget_param ( mode, "_FILTER4_", "1000000", "%f", &T4); strget_param ( mode, "_SIMWEIGHT_", "1", "%d", &sim_weight); strget_param ( mode, "_SWTHR_", "30", "%d", &sw_thr); strget_param (mode, "_TRAIN_","1", "%d", &train_mode); strget_param (mode, "_ZSCORE_","0", "%f", &zscore); if (sim_weight==1 && !sim) sim=aln2sim_mat(A, "idmat"); for ( ms=0; msseq_al[ci][mseq]!='I')continue; list=declare_int (A->len_aln+1, nfield); for (t=0,n=0, col=0; col< A->len_aln; col++) { int same_res; mres=tolower(A->seq_al[mseq][col]); list[col][RES]=mres; list[col][COL_INDEX]=col; if ( is_gap(mres))continue; for ( s=0; sseq_al[seq][col]); if (is_gap(res))continue; if (sim_weight==1) { w=sim[seq][mseq];w=(mres==res)?100-w:w; if (wseq_al[ci][seq]=='I')same_res=1; else same_res=(res==mres)?1:0; } else same_res=(res==mres)?1:0; list[col][N]+=w; if (S->seq_al[ci][seq]=='I' && same_res)list[col][N11]+=w; else if (S->seq_al[ci][seq]=='I' && same_res)list[col][N10]+=w; else if (S->seq_al[ci][seq]=='O' && same_res)list[col][N01]+=w; else if (S->seq_al[ci][seq]=='O' && same_res)list[col][N00]+=w; if ( S->seq_al[ci][seq]=='I')list[col][N1sar]+=w; if ( same_res)list[col][N1msa]+=w; } list[col][SCORE]=(int)evaluate_sar_score1 (list[col][N], list[col][N11], list[col][N1msa], list[col][N1sar]); } strget_param ( mode, "_MAXN1_", "5", "%d", &maxn1); strget_param ( mode, "_WEIGHT_", "1", "%d", &weight_mode); strget_param ( mode, "_QUANT_", "0.0", "%f", &quant); sort_int_inv (list,nfield,SCORE,0,A->len_aln-1); if ( quant !=0) { n=quantile_rank ( list,SCORE, A->len_aln,quant); sort_int (list,nfield,N1msa, 0, n-1); maxn1=MIN(n,maxn1); } for (a=0; aT1 && valuelen_aln+1, nfield); for ( col=0; col< A->len_aln; col++) { list[col][COL_INDEX]=col; for ( s=0; slen_aln; col++) if (list[col][SCORE]T4) { list[col][SCORE]=0; for (s=0; s< A->nseq; s++) if (!is_gap(A->seq_al[s][col]))cache[s][col]=0; } /*Keep The N Best Columns*/ if ( zscore!=0) { double sum=0, sum2=0, z; int n=0; for (a=0; a< A->len_aln; a++) { if ( list[a][SCORE]>0) { sum+=list[a][SCORE]; sum2+=list[a][SCORE]*list[a][SCORE]; n++; } } for (a=0; alen_aln; a++) { if ( list[a][SCORE]>0) { z=return_z_score (list[a][SCORE], sum, sum2,n); if ((float)znseq; s++) cache [s][col]=0; } else { fprintf ( stdout, "\nZSCORE: KEEP COL %d SCORE: %f SCORE: %d\n", list[a][COL_INDEX], (float)z, list[a][SCORE]); } } } } else { sort_int_inv (list,nfield,SCORE,0,A->len_aln-1); strget_param ( mode, "_MAXN2_", "100000", "%d", &maxn2); for (a=maxn2;alen_aln; a++) { col=list[a][COL_INDEX]; for (s=0; snseq; s++) cache [s][col]=0; } } /*Get Rid of the N best Columns*/; strget_param ( mode, "_MAXN3_", "0", "%d", &maxn3); for (a=0; anseq; s++) cache [s][col]=0; } return cache; } int aln2n_comp_col ( Alignment *A, Alignment *S, int ci) { int res, seq,sar, col, r; int **analyze; int tot=0; analyze=declare_int (27, 2); for ( col=0; col< A->len_aln; col++) { int n1, n0; for ( n1=0, n0=0,seq=0; seqnseq; seq++) { res=tolower(A->seq_al[seq][col]); sar=(S->seq_al[ci][seq]=='I')?1:0; n1+=(sar==1)?1:0; n0+=(sar==0)?1:0; if ( res=='-')continue; res-='a'; analyze[res][sar]++; } for (r=0; r<26; r++) { int a0,a1; a0=analyze[r][0]; a1=analyze[r][1]; if ( a1==n1 && a0len_aln, 2); cache=declare_int ( 27, A->len_aln); analyze=declare_int (27, 2); ci=name_is_in_list ( compound, S->name, S->nseq, -1); for ( col=0; col< A->len_aln; col++) { int n1, n0; for ( n1=0, n0=0,s=0; sseq_al[seq][col]); sar=(S->seq_al[ci][seq]=='I')?1:0; n1+=(sar==1)?1:0; n0+=(sar==0)?1:0; if ( res=='-')continue; res-='a'; analyze[res][sar]++; } for (r=0; r<26; r++) { a0=analyze[r][0]; a1=analyze[r][1]; if ( strstr (mode, "SIMTEST")) { w=a1; } else if (a1 ) { w=n0-a0; } else w=0; cache[r][col]+=w; cache[26][col]=MAX(w, cache[26][col]); } for ( r=0; r<26; r++)analyze[r][0]=analyze[r][1]=0; list[col][0]=col; list[col][1]=cache[26][col]; } free_int (analyze, -1); sort_int_inv (list, 2, 1, 0, A->len_aln-1); strget_param ( mode, "_MAXN2_", "100000", "%d", &maxn2); for ( col=maxn2; collen_aln; col++) for ( r=0; r<=26; r++)cache[r][list[col][0]]=0; free_int (list, -1); return cache; } int **sar2cache_count2 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode) { int maxn2, res, seq,sar, ci, col,s, r; int **analyze, **list, **cache, **conseq; static int **mat; int w=0; if (!mat) mat=read_matrice ("blosum62mt"); list=declare_int ( A->len_aln, 2); cache=declare_int ( 27, A->len_aln); conseq=declare_int ( A->len_aln,3); analyze=declare_int (27, 2); ci=name_is_in_list ( compound, S->name, S->nseq, -1); for ( col=0; col< A->len_aln; col++) { int n1, n0; for ( n1=0, n0=0,s=0; sseq_al[seq][col]); sar=(S->seq_al[ci][seq]=='I')?1:0; n1+=(sar==1)?1:0; n0+=(sar==0)?1:0; if ( res=='-')continue; res-='a'; analyze[res][sar]++; } for (r=0; r<26; r++) { int a0,a1; a0=analyze[r][0]; a1=analyze[r][1]; if ( a1==n1 && a0len_aln; col++) { res=tolower(A->seq_al[seq][col]); if ( is_gap(res))continue; else res-='a'; if ( conseq[col][1] && res!=conseq[col][0])w1++; if ( conseq[col][1])w2++; } for (col=0; collen_aln; col++) { res=tolower(A->seq_al[seq][col]); if ( is_gap(res))continue; else res-='a'; if ( conseq[col][1] && res!=conseq[col][0])conseq[col][2]+=(w2-w1); } } for (col=0; collen_aln; col++) { r=conseq[col][0]; w=conseq[col][2]; cache[r][col]=cache[26][col]=list[col][1]=w; list[col][0]=col; } sort_int_inv (list, 2, 1, 0, A->len_aln-1); strget_param ( mode, "_MAXN2_", "100000", "%d", &maxn2); for ( col=maxn2; collen_aln; col++) for ( r=0; r<=26; r++)cache[r][list[col][0]]=0; free_int (list, -1); return cache; } int **sar2cache_count3 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode) { int maxn2, res, seq,sar, ci, col,s, r, a1, a0, n1, n0; int **analyze, **list, **cache; static int **mat; if (!mat) mat=read_matrice ("blosum62mt"); list=declare_int ( A->len_aln, 2); cache=declare_int ( 27, A->len_aln); analyze=declare_int (27, 2); ci=name_is_in_list ( compound, S->name, S->nseq, -1); for ( col=0; col< A->len_aln; col++) { double e, g; for ( n1=0, n0=0,s=0; sseq_al[seq][col]); sar=(S->seq_al[ci][seq]=='I')?1:0; n1+=(sar==1)?1:0; n0+=(sar==0)?1:0; if ( res=='-')continue; res-='a'; analyze[res][sar]++; } /*Gap*/ for (g=0,r=0; rnseq; r++) g+=is_gap(A->seq_al[r][col]); g=(100*g)/A->nseq; /*enthropy for (e=0, r=0; r<26; r++) { a0=analyze[r][0]; a1=analyze[r][1]; t=a0+a1; if (t>0) e+= t/(double)A->nseq*log(t/(double)A->nseq); } e*=-1; */ e=0; if (g>10) continue; if (e>10) continue; if ( strstr ( mode, "SIMTEST")) { for (r=0; r<26; r++) { a0=analyze[r][0]; a1=analyze[r][1]; if (a1) { cache[r][col]=a1; cache[26][col]=MAX(cache[26][col],a1); } } } else { for (r=0; r<26; r++) { a0=analyze[r][0]; a1=analyze[r][1]; if (!a1 && a0) { cache[r][col]=a0; cache[26][col]=MAX(cache[26][col],a0); } } } for ( r=0; r<26; r++)analyze[r][0]=analyze[r][1]=0; list[col][0]=col; list[col][1]=cache[26][col]; } free_int (analyze, -1); sort_int_inv (list, 2, 1, 0, A->len_aln-1); strget_param ( mode, "_MAXN2_", "100000", "%d", &maxn2); for ( col=maxn2; collen_aln; col++) for ( r=0; r<=26; r++)cache[r][list[col][0]]=0; free_int (list, -1); return cache; } int **sar2cache_proba_new ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode) { int col, s, seq,ms,mseq, res, mres, res1, n,maxn1, maxn2,maxn3, t, ci, a,w; int **list; int N1msa,N1sar, N, N11, N10, N01,N00, SCORE, COL_INDEX, RES; int nfield=0; int value; int **cache; static int **sim; int sw_thr; float zscore; RES=nfield++;COL_INDEX=nfield++;N1msa=nfield++;N1sar=nfield++;N=nfield++;N11=nfield++;N10=nfield++;N01=nfield++;N00=nfield++;SCORE=nfield++; ci=name_is_in_list ( compound, S->name, S->nseq, -1); cache=declare_int (27, A->len_aln); strget_param ( mode, "_SWTHR_", "30", "%d", &sw_thr); strget_param (mode, "_ZSCORE_","0", "%f", &zscore); if (!sim)sim=aln2sim_mat(A, "idmat"); for ( ms=0; msseq_al[ci][mseq]!='I')continue; list=declare_int (A->len_aln+1, nfield); for (t=0,n=0, col=0; col< A->len_aln; col++) { int same_res; mres=tolower(A->seq_al[mseq][col]); if ( is_gap(mres))continue; list[col][RES]=mres; list[col][COL_INDEX]=col; for ( s=0; sseq_al[seq][col]); if (is_gap(res))continue; w=sim[seq][mseq];w=(mres==res)?100-w:w; if (wseq_al[ci][seq]=='I' && same_res)list[col][N11]+=w; else if (S->seq_al[ci][seq]=='I' && same_res)list[col][N10]+=w; else if (S->seq_al[ci][seq]=='O' && same_res)list[col][N01]+=w; else if (S->seq_al[ci][seq]=='O' && same_res)list[col][N00]+=w; if ( S->seq_al[ci][seq]=='I')list[col][N1sar]+=w; if ( same_res)list[col][N1msa]+=w; } list[col][SCORE]=(int)evaluate_sar_score1 (list[col][N], list[col][N11], list[col][N1msa], list[col][N1sar]); } strget_param ( mode, "_MAXN1_", "5", "%d", &maxn1); sort_int_inv (list,nfield,SCORE,0,A->len_aln-1); for (a=0; alen_aln+1, nfield); for ( col=0; col< A->len_aln; col++) { list[col][COL_INDEX]=col; list[col][SCORE]=cache[26][col]; } /*Keep The N Best Columns*/ if ( zscore!=0) { double sum=0, sum2=0, z; int n=0; for (a=0; a< A->len_aln; a++) { if ( list[a][SCORE]>0) { sum+=list[a][SCORE]; sum2+=list[a][SCORE]*list[a][SCORE]; n++; } } for (a=0; alen_aln; a++) { if ( list[a][SCORE]>0) { z=return_z_score (list[a][SCORE], sum, sum2,n); if ((float)zlen_aln-1); strget_param ( mode, "_MAXN2_", "100000", "%d", &maxn2); for (a=maxn2;alen_aln; a++) { col=list[a][COL_INDEX]; for (s=0; s<27; s++) cache [s][col]=0; } } /*Get Rid of the N best Columns*/; strget_param ( mode, "_MAXN3_", "0", "%d", &maxn3); for (a=0; aname, S->nseq, -1); cache=declare_int (A->nseq, A->len_aln); for ( ms=0; msseq_al[ci][mseq]!='I')continue; list=declare_int (A->len_aln+1, 5); for (t=0,n=0, col=0; col< A->len_aln; col++) { mres=tolower(A->seq_al[mseq][col]); list[col][0]=mres; list[col][1]=col; if ( is_gap(mres))continue; for ( s=0; sseq_al[seq][col]); if (is_gap(res))continue; if (S->seq_al[ci][seq]=='I' && res==mres)list[col][3]++; if (res==mres)list[col][2]++; } } sort_int_inv (list,5,3,0,A->len_aln-1); strget_param ( mode, "_MAXN1_", "5", "%d", &maxn1); strget_param ( mode, "_QUANT_", "0.95", "%f", &quant); n=quantile_rank ( list, 3, A->len_aln,quant); sort_int (list, 5, 2, 0, n-1); for (a=0; anseq, A->len_aln); ci=name_is_in_list ( compound, S->name, S->nseq, -1); strget_param ( mode, "_FILTER1_", "0" , "%f", &T1); strget_param ( mode, "_FILTER2_", "1000000", "%f", &T2); strget_param ( mode, "_FILTER3_", "0" , "%f", &T3); strget_param ( mode, "_FILTER4_", "1000000", "%f", &T4); list=declare_int (A->len_aln+1,A->nseq+2); SCORE=A->nseq; COL=A->nseq+1; for ( ms=0; msseq_al[ci][mseq]!='I')continue; for (t=0,n=0, col=0; col< A->len_aln; col++) { int N11=0,N10=0,N01=0,N00=0,N1sar=0,N1msa=0,N=0; mres=tolower(A->seq_al[mseq][col]); if ( is_gap(mres))continue; for ( s=0; sseq_al[seq][col]); if (is_gap(res))continue; N++; if (S->seq_al[ci][seq]=='I' && res==mres)N11++; else if (S->seq_al[ci][seq]=='I' && res!=mres)N10++; else if (S->seq_al[ci][seq]=='O' && res==mres)N01++; else if (S->seq_al[ci][seq]=='O' && res!=mres)N00++; if ( S->seq_al[ci][seq]=='I')N1sar++; if ( res==mres)N1msa++; } list[col][mseq]=(int)evaluate_sar_score1 (N,N11,N1msa,N1sar); list[col][SCORE]+=list[col][mseq]; list[col][COL]=col; } } strget_param ( mode, "_MAXN1_", "5", "%d", &maxn1); strget_param ( mode, "_QUANT_", "0.95", "%f", &quant); sort_int_inv (list,A->nseq+2,SCORE, 0, A->len_aln-1); n=quantile_rank ( list,A->nseq, A->len_aln,quant); n=5; for (a=0; anseq; b++) { value=list[col][b]; if ( value>T1 && valuenseq, 2); seq_list=declare_int (A->nseq, 2); for (a=0; a< A->nseq; a++) { if (comp[a]=='I') { sar_list[nsar][0]=a; sar_list[nsar][1]=rand()%100000; nsar++; } else { seq_list[nseq][0]=a; seq_list[nseq][1]=rand()%100000; nseq++; } } l1[0]=vcalloc (A->nseq, sizeof (int)); l2[0]=vcalloc (A->nseq, sizeof (int)); nl1[0]=nl2[0]=0; sort_int (seq_list, 2, 1, 0,nseq-1); sort_int (sar_list, 2, 1, 0,nsar-1); mid=nsar/2; for (a=0; anseq, 2); l1[0]=vcalloc (A->nseq, sizeof (int)); l2[0]=vcalloc (A->nseq, sizeof (int)); nl1[0]=nl2[0]; vsrand (0); for ( a=0; a< A->nseq; a++) { list[a][0]=a; list[a][1]=rand()%100000; } sort_int (list, 2, 1, 0,A->nseq-1); mid=A->nseq/2; for (a=0; anseq; b++, a++) { l2[0][nl2[0]++]=list[b][0]; } free_int (list, -1); return 1; } int aln2jack_group1 (Alignment *A, int seq, int **l1, int *nl1, int **l2, int *nl2) { int **sim; int **list; int a, mid; list=declare_int ( A->nseq, 3); l1[0]=vcalloc (A->nseq, sizeof (int)); l2[0]=vcalloc (A->nseq, sizeof (int)); nl1[0]=nl2[0]; sim=aln2sim_mat (A, "idmat"); for ( a=0; a< A->nseq; a++) { list[a][0]=seq; list[a][1]=a; list[a][2]=(a==seq)?100:sim[seq][a]; } sort_int_inv (list, 3, 2, 0, A->nseq-1); fprintf ( stderr, "\nJacknife fromsequence %s [%d]\n", A->name[seq], seq); mid=A->nseq/2; for (a=0; a< mid; a++) l1[0][nl1[0]++]=list[a][1]; for (a=mid; anseq; a++) l2[0][nl2[0]++]=list[a][1]; return 1; } int sarset2subsarset ( Alignment *A, Alignment *S, Alignment **subA, Alignment **subS, Alignment *SUB) { Alignment *rotS, *intS; int a,b, *list, nl; list=vcalloc ( SUB->nseq, sizeof (int)); for (nl=0,a=0; anseq; a++) { b=name_is_in_list(SUB->name[a], A->name, A->nseq, 100); if ( b!=-1)list[nl++]=b; } subA[0]=extract_sub_aln (A, nl, list); rotS=rotate_aln (S, NULL); intS=extract_sub_aln (rotS, nl, list); subS[0]=rotate_aln (intS, NULL); for ( a=0; anseq; a++) sprintf ( (subS[0])->name[a], "%s", S->name[a]); return 0; } int ***simple_sar_analyze_vot ( Alignment *A, Alignment *SAR, char *mode) { int a, b, c, d; int res1, res2, sar1, sar2; float s; int **sim; static float ***result; static int ***iresult; if (!result) { result=declare_arrayN (3,sizeof (float),SAR->nseq, A->len_aln,3); iresult=declare_arrayN (3,sizeof (int),SAR->nseq, A->len_aln,3); } sim=aln2sim_mat (A, "idmat"); for (a=0; anseq; a++) for (b=0; blen_aln; b++) result[a][b][0]=1; for ( a=0; a< SAR->nseq; a++) for ( b=0; bnseq-1; b++) for ( c=b+1; c< A->nseq; c++) for ( d=0; dlen_aln; d++) { res1=A->seq_al[b][d]; res2=A->seq_al[c][d]; sar1=(SAR->seq_al[a][b]=='I')?1:0; sar2=(SAR->seq_al[a][c]=='I')?1:0; s=sim[b][c]; if ( sar1!=sar2 && res1!=res2) result[a][d][0]*=(1/(100-s)); else if ( sar1==sar2 && sar1==1 && res1==res2) result[a][d][0]*=1/s; /* else if ( sar1==sar2 && res1==res2)result[a][d][0]+=(100-s)*(100-s); else if ( sar1==sar2 && res1!=res2)result[a][d][0]-=s*s; else if ( sar1!=sar2 && res1==res2)result[a][d][0]-=(100-s)*(100-s); */ result[a][d][1]='a'; } for ( a=0; anseq; a++) for ( b=0; blen_aln; b++) { fprintf ( stderr, "\n%f", result[a][b][0]); iresult[a][b][0]=100*log(1-result[a][b][0]); } return iresult; } int display_simple_sar_analyze_pair_col (Alignment *A, Alignment *SAR, char *mode) { int **r; int a, b, n, do_tm; Alignment *rA; int *nI; strget_param (mode, "_TM_", "0", "%d", &do_tm); r=simple_sar_analyze_pair_col (A, SAR,mode); rA=rotate_aln (A, NULL); n=0; nI=vcalloc ( SAR->nseq, sizeof (int)); for (a=0; a< SAR->nseq; a++) for (b=0; blen_aln; b++) nI[a]+=(SAR->seq_al[a][b]=='I')?1:0; while ( r[n][0]!=-1) { if (r[n][3]>0) {fprintf ( stdout, "COMP S: %3d %3d %s %20s %2d #\n", r[n][3],0,SAR->seq_al[r[n][0]], SAR->name[r[n][0]], nI[r[n][0]]); fprintf ( stdout, "SEQ1 S: %3d %3d %s %20s %2d #\n", r[n][3],r[n][1],rA->seq_al[r[n][1]], SAR->name[r[n][0]],nI[r[n][0]]); fprintf ( stdout, "SEQ2 S: %3d %3d %s %20s %2d #\n\n", r[n][3],r[n][2],rA->seq_al[r[n][2]], SAR->name[r[n][0]],nI[r[n][0]]); } n++; } return 0; } int display_simple_sar_analyze_col (Alignment *A, Alignment *SAR, char *mode) { int ***result, **r2, **r3, **r4, **aa; int a, b, c, n; char *cons; int threshold=20; int do_tm; strget_param (mode, "_TM_", "0", "%d", &do_tm); result=simple_sar_analyze_col (A, SAR,mode); r2=declare_int (A->len_aln*SAR->nseq, 5); r3=declare_int (A->len_aln+1, 5); r4=declare_int (A->len_aln+1, SAR->nseq+1); aa=declare_int (2, 256); cons=vcalloc (A->len_aln+1, sizeof (char)); for (a=0; alen_aln; a++){r3[a][0]=a;cons[a]='A';} for (n=0,a=0; a< SAR->nseq; a++) { double sum, sum2; for (sum=0, sum2=0,b=0; blen_aln; b++) { sum+=result[a][b][0]; sum2+=result[a][b][0]*result[a][b][0]; } for (b=0; blen_aln; b++, n++) { r2[n][0]=a;//compound r2[n][1]=b;//pos r2[n][2]=result[a][b][1]; //AA r2[n][3]=result[a][b][0]; //Score r2[n][4]=result[a][b][2]; //(int)10*return_z_score ((double)result[a][b][0], sum, sum2, A->len_aln); //ZScore } } sort_int (r2,5, 3, 0, n-1);//sort on Score (3rd field) for ( a=0; a< n; a++) { int comp, pos, bad; comp=r2[a][0]; pos=r2[a][1]; fprintf ( stdout, "SEQ %5d %5d %5d %s ",r2[a][1]+1,r2[a][3], r2[a][4], (do_tm)?alnpos2hmmtop_pred (A, NULL, r2[a][1], SHORT):"x"); for (c=0; cnseq; c++)fprintf (stdout, "%c", A->seq_al[c][r2[a][1]]); bad=0; for (c=0; c< A->nseq; c++) { int activity, res; activity=SAR->seq_al[comp][c]; res=A->seq_al[c][pos]; if (activity=='O')aa[0][res]++; if (activity=='I')aa[1][res]++; } for (c=0; c< A->nseq; c++) { int activity, res; activity=SAR->seq_al[comp][c]; res=A->seq_al[c][pos]; bad+=(aa[0][res] && aa[1][res])?1:0; aa[0][res]=aa[1][res]=0; } fprintf ( stdout, " %20s %d |\nCOM %5d %5d %5d %s %s %20s %d |\n\n", SAR->name[r2[a][0]],bad,r2[a][1]+1,r2[a][3],r2[a][4], (do_tm)?alnpos2hmmtop_pred (A, NULL, r2[a][1], SHORT):"x",SAR->seq_al[r2[a][0]], SAR->name[r2[a][0]], bad); if (r2[a][4]>threshold) { cons[r2[a][1]]++; r3[r2[a][1]][1]++; r3[r2[a][1]][2]+=r2[a][3]; r4[r2[a][1]][r2[a][0]]=1; } } sort_int (r3, 3,1,0, A->len_aln-1); for (a=0; alen_aln; a++) { if (r3[a][1]>0) { fprintf ( stdout, "\nPOS %4d %4d %4d %c ", r3[a][0]+1, r3[a][1], r3[a][2], cons[r3[a][0]]); for (b=0; bnseq; b++)fprintf ( stdout, "%d", r4[r3[a][0]][b]); if (do_tm)fprintf ( stdout, " %s",alnpos2hmmtop_pred (A, NULL, r3[a][0], VERBOSE)); } } for (a=0; a< A->nseq; a++)fprintf ( stdout, "\n#MSA >%s\n#MSA %s",A->name[a], A->seq_al[a]); fprintf ( stdout, "\n#MSA >cons\n#MSA %s", cons); return 0; } int *** simple_sar_predict (Alignment *A, Alignment *SAR, char *mode) { //This function estimates the z score of every poition with every compound //The best Z-score position is then used for the prediction int a, b, c, nts, pos, Rscore,Zscore; int ***r; int ***pred; int **aa; aa=declare_int (2,256); pred=declare_arrayN (3, sizeof (int),SAR->nseq, A->nseq, 5); r=simple_sar_analyze_col (A, SAR, mode); nts=SAR->len_aln; //number_trainning_sequences; for (a=0; anseq; a++) { sort_int (r[a],4, 2, 0, A->len_aln-1); pos=r[a][A->len_aln-1][3]; //Best Position Zscore=r[a][A->len_aln-1][2]; //Best Z-Score Rscore=r[a][A->len_aln-1][0]; //Best Z-Score for (c=0; cseq_al[a][c]=='I')aa[1][(int)A->seq_al[c][pos]]++;//Build Positive Alphabet for Compound a if (SAR->seq_al[a][c]=='O')aa[0][(int)A->seq_al[c][pos]]++;//Build Positive Alphabet for Compound a } for (c=nts; cnseq; c++) { pred[a][c][0]=pos; pred[a][c][1]=Zscore; pred[a][c][2]=Rscore; if (aa[1][(int)A->seq_al[c][pos]]>0) { pred[a][c][3]=aa[1][(int)A->seq_al[c][pos]]; pred[a][c][4]=aa[0][(int)A->seq_al[c][pos]]; } } for (c=0; cseq_al[c][pos]]=aa[1][(int)A->seq_al[c][pos]]=0; } for ( a=nts; a< A->nseq; a++) { for ( b=0; bnseq; b++) { fprintf ( stdout, ">%-25s %-25s Pos %3d ZScore %3d Rscore %3d Activity +: %d -: %d ", A->name [a], SAR->name[b], pred[b][a][0],pred[b][a][1], pred[b][a][2], pred[b][a][3], pred[b][a][4]); if (pred[b][a][4]==0)for (c=0; cnseq; c++)fprintf ( stdout, "%c", A->seq_al[c][ pred[b][a][0]]); fprintf ( stdout, " %s\n", SAR->name[b]); for (c=0; cnseq-1; c++)fprintf ( stdout, "%c", SAR->seq_al[b][c]); fprintf ( stdout, " %s\n", SAR->name[b]); fprintf ( stdout, "\n"); } } return pred; } int *pair_seq2seq (int *iseq, char *seq1, char *seq2); int **simple_sar_analyze_pair_col ( Alignment *inA, Alignment *SAR, char *mode) { int a, b, c, n, n2; int *iseq=NULL; static int **result, **fresult; int sar_mode=1; int maxgapratio=0; int nresults=10; double sum, sum2, score; Alignment *A; char aa; if (!result) { result=declare_int (inA->len_aln*inA->len_aln,5); fresult=declare_int (inA->len_aln*nresults*SAR->nseq, 5); } A=rotate_aln (inA, NULL); for (n2=0,a=0; anseq; a++) { for (n=0, sum=0, sum2=0,b=0; bnseq-1; b++) { for ( c=b+1; cnseq; c++, n++) { iseq=pair_seq2seq (iseq,A->seq_al[b], A->seq_al[c]); if ( sar_mode==1) score=sar_vs_iseq1(SAR->seq_al[a],iseq,maxgapratio,NULL,&aa); else if (sar_mode==4) score=sar_vs_iseq4(SAR->seq_al[a],iseq,maxgapratio,NULL,&aa); //HERE ("%d", (int)score); result[n][0]=a;//compound; result[n][1]=b; //pos1 result[n][2]=c; //pos2 result[n][3]=(int)score; sum+=score; sum2+=score*score; } } for (b=0; bnseq, inA->len_aln,4); sim=aln2sim_mat (inA, "idmat"); A=rotate_aln (inA, NULL); for ( a=0; anseq; a++) { best_pos=best_score=0; for ( sum=0, sum2=0,b=0; bnseq; b++) { if ( sar_mode==1) score=sar_vs_seq1(SAR->seq_al[a], A->seq_al[b],maxgapratio, sim, &aa); else if ( sar_mode==2) score=sar_vs_seq2(SAR->seq_al[a], A->seq_al[b],maxgapratio, sim, &aa); else if (sar_mode ==3) score=sar_vs_seq3(SAR->seq_al[a], A->seq_al[b],maxgapratio, sim, &aa); else if (sar_mode ==4) score=sar_vs_seq4(SAR->seq_al[a], A->seq_al[b],maxgapratio, sim, &aa); else if (sar_mode ==5) score=sar_vs_seq5(SAR->seq_al[a], A->seq_al[b],maxgapratio, sim, &aa); result[a][b][0]+=score*10; result[a][b][1]=aa; result[a][b][3]=b; sum+=result[a][b][0]; sum2+=result[a][b][0]*result[a][b][0]; } for ( b=0; b< A->nseq; b++)result[a][b][2]=10*return_z_score ((double)result[a][b][0], sum, sum2, A->nseq); //Score } return result; } int *seq2iseq ( char *seq); double sar_vs_seq4 ( char *sar, char *seq, float gl, int **sim, char *best_aa) { return sar_vs_iseq4 (sar, seq2iseq(seq), gl, sim, best_aa); } double sar_vs_seq1 ( char *sar, char *seq, float gl, int **sim, char *best_aa) { return sar_vs_iseq1 (sar, seq2iseq(seq), gl, sim, best_aa); } int *seq2iseq ( char *seq) { static int *iseq, clen; int a; if (!iseq || clengl) return 0; if (!aa) { aa=vcalloc (256*256, sizeof(int)); aal=vcalloc (N, sizeof (int)); } naa=0; for ( a=0; areturn_score) { best_aa[0]=res; return_score=score; } } } for ( a=0; agl) return 0; //Identify all the AA associated with a I (Positive alphabet) aa=vcalloc ( 256, sizeof (int)); for (b=0; b=1 || N01>=1) return 0; if (N11) { score=evaluate_sar_score1 ( N, N11, Nmsa, Nsar); } else score=0; vfree (aa); return score; } double sar_vs_iseq4 ( char *sar, int *seq, float gl, int **sim, char *best_aa) { int N, Ni, No; int a, b,c, r, s; double Ng=0; static int **aa; /*Correlation between AA conservation and Activity*/ N=strlen (sar); for (a=0; agl) return 0; if (!aa)aa=declare_int(2,257*257); for (No=Ni=b=0; bgl) return 0; //Identify all the AA associated with a I (Positive alphabet) aa=vcalloc ( 256, sizeof (int)); for (b=0; bgl) return 0; for (a=0; a<26; a++) { N=Nmsa=Nsar=N11=N10=N01=0; res='a'+a; for (d=0,b=0; bmax_depth) { printf_exit (EXIT_FAILURE, stderr,"maximum depth: %d", max_depth); } if ( depth==0) depth=2; A=declare_aln2 (strlen (S1->seq[0]),depth); a0=A->seq_al[0]; a1=A->seq_al[1]; A->len_aln=strlen (S1->seq[0]); for (a=0; a< S1->nseq; a++) for ( b=0; bnseq; b++) { A->nseq=2; sprintf (a0, "%s", S1->seq[a]); sprintf (a1, "%s", S2->seq[b]); if ( strlen (a0)!=strlen (a1)) { add_warning (stderr, "WARNING %s (%d) and %s (%d) do not have the same length", S1->name[a], strlen (S1->seq[a]), S2->name[b], strlen (S2->seq[b])); exit (0); } fprintf ( stdout, ">2 %15s %15s CORR: %.3f EVAL: %5d\n",S1->name[a], S2->name[b], sar_aln2cor (A), sar_aln2ev (A)); sarseq2anti_sarseq (S1->seq[a],a0); fprintf ( stdout, ">2 %15s %15s ANTI: %.3f EVAL: %5d\n", S1->name[a], S2->name[b], sar_aln2cor (A), sar_aln2ev (A)); if ( depth >=3) { A->nseq=3; a2=A->seq_al[2]; for (c=b+1; cnseq; c++) { sprintf (a0, "%s", S1->seq[a]); sprintf (a1, "%s", S2->seq[b]); sprintf (a2, "%s", S2->seq[c]); fprintf ( stdout, ">2 %15s %15s %15s CORR: %.3f EVAL: %5d\n",S1->name[a], S2->name[b],S2->name[c], sar_aln2cor (A), sar_aln2ev (A)); sarseq2anti_sarseq (S1->seq[a],a0); fprintf ( stdout, ">2 %15s %15s ANTI: %.3f EVAL: %5d\n", S1->name[a], S2->name[b], S2->name[c],sar_aln2cor (A), sar_aln2ev (A)); } if ( depth>=4) { A->nseq=4; a3=A->seq_al[2]; for (d=c+1; dnseq; d++) { sprintf (a0, "%s", S1->seq[a]); sprintf (a1, "%s", S2->seq[b]); sprintf (a2, "%s", S2->seq[c]); sprintf (a3, "%s", S2->seq[d]); fprintf ( stdout, ">2 %15s %15s %15s %15s CORR: %.3f EVAL: %5d\n",S1->name[a], S2->name[b],S2->name[c],S2->name[d], sar_aln2cor (A), sar_aln2ev (A)); sarseq2anti_sarseq (S1->seq[a],a0); fprintf ( stdout, ">2 %15s %15s %15s %15s ANTI: %.3f EVAL: %5d\n", S1->name[a], S2->name[b], S2->name[c],S2->name[d],sar_aln2cor (A), sar_aln2ev (A)); } if (depth>=5) { A->nseq=5; a4=A->seq_al[3]; for (e=d+1; enseq; e++) { sprintf (a0, "%s", S1->seq[a]); sprintf (a1, "%s", S2->seq[b]); sprintf (a2, "%s", S2->seq[c]); sprintf (a3, "%s", S2->seq[d]); sprintf (a4, "%s", S2->seq[d]); } } } } } return S1; } char* sarseq2anti_sarseq (char *seq_in, char *seq_out) { int a; if (!seq_out)seq_out=vcalloc (strlen (seq_in)+1, sizeof (char)); for (a=0; anseq-1; a++) for (b=a+1; bnseq; b++) { for (n11=n1=0,c=0; clen_aln; c++) { n11+=(A->seq_al[a][c]=='I' && A->seq_al[b][c]=='I'); n1+= (A->seq_al[a][c]=='I' || A->seq_al[b][c]=='I'); } tot_cor+=(n1==0)?0:n11/n1; n++; } tot_cor/=n; return tot_cor; } int sarseq_pair2ev ( char *s1, char *s2,int mode); int sar_aln2ev (Alignment *A) { float n1, n11; int a, b, c, tot=0, n=0; tot=0; for (a=0; anseq-1; a++) for (b=a+1; bnseq; b++) { tot+=sarseq_pair2ev (A->seq_al[a], A->seq_al[b], 1); n++; } return tot; } int sarseq_pair2ev ( char *s1, char *s2,int mode) { int l, t1, t2, t11,a, n1, n2, s; if ( (l=strlen (s1))!=strlen (s2)) { return -1; } if (mode==2) { t1=l/2; t2=l/2; t11=l/2; } else { for (t1=t2=t11=0,a=0; an11 || n01>n11)return 0; p1= M_chooses_Nlog (n1msa, N) + M_chooses_Nlog (n1sar-n11, N-n1msa) + M_chooses_Nlog (n11, n1msa); p2=(M_chooses_Nlog (n1msa, N)+ M_chooses_Nlog (n1sar, N)); p=(p1-p2); return -p; } double evaluate_sar_score2 ( int N, int n11, int n1msa, int n1sar) { return n11-((n1msa-n11)+(n1sar-n11)); if ( n11nseq, sizeof (int**)); list=file2list (weight_file, " "); a=b=0; for (a=0; a< SAR->nseq; a++) { b=c=0; while (list[b]) { if ( strm (list[b][1], SAR->name[a]) && atoi (list[b][3])>0)c++; b++; } weight[a]=declare_int (c+1, 3); fprintf ( stderr, "\n%s %d", SAR->name[a], c); b=c=0; while (list[b]) { if ( strm (list[b][1], SAR->name[a]) && atoi (list[b][3])>0) { weight[a][c][0]=atoi(list[b][2])-1; weight[a][c][1]=list[b][5][0]; weight[a][c][2]=atoi (list[b][3]); c++; } b++; } weight[a][c][0]=-1; } for (a=0; anseq; a++) { fprintf ( stdout, ">%s\n", A->name[a]); for ( b=0; b< SAR->nseq; b++) { score=seq2weighted_sar_score(A->seq_al[a], weight[b]); fprintf ( stdout, "%c", (score>limit)?'I':'O'); } fprintf (stdout, "\n"); } myexit (EXIT_SUCCESS); return A; } Alignment *display_sar ( Alignment *A, Alignment *SAR, char *compound) { int a,c; char name[100]; c=name_is_in_list ( compound, SAR->name, SAR->nseq, 100); if ( c==-1)return A; for ( a=0; a< A->nseq; a++) { sprintf (name, "%s", A->name[a]); sprintf ( A->name[a], "%c_%s_%s", SAR->seq_al[c][a], name,compound); } return A; } Alignment *aln2weighted_sar_score ( Alignment *A,Alignment *SAR, char *weight_file, char *compound) { int a, b, c=0; int **weight; int score; char reactivity; char ***list; if ( SAR) { c=name_is_in_list (compound, SAR->name, SAR->nseq, 100); } list=file2list (weight_file, " "); a=b=0; while (list[a]) { if (strm (list[a][1], compound))b++; a++; } weight=declare_int ( b+1, 3); a=b=0; while (list[a]) { if ( !strm (list[a][1], compound) || strm ("TOTPOS", list[a][1])); else { weight[b][0]=atoi(list[a][2])-1; weight[b][1]=list[a][5][0]; weight[b][2]=atoi(list[a][3]); b++; } a++; } weight[b][0]=-1; for ( a=0; a< A->nseq; a++) { score=seq2weighted_sar_score (A->seq_al[a], weight); reactivity=(!SAR || c==-1)?'U':SAR->seq_al[c][a]; sprintf (A->seq_comment[a], "Compound %-15s Reactivity %c SAR_SCORE %5d", compound,reactivity, (int) score); } return A; } float seq2weighted_sar_score ( char *seq, int **weight) { int a, p, r, w; float score=0; a=0; while (weight[a][0]!=-1) { p=weight[a][0]; r=weight[a][1]; w=weight[a][2]; if ( is_gap(seq[p])); else if ( tolower(seq[p])==r)score+=w; a++; } return score; } Alignment * sar2simpred (Alignment *A, Alignment *SAR, char *posfile, char *compound, int L1,int L2 ) { int a, b, c, c1, c2; int **sim, **sim_ref, npred=0; float n11, n10, n01, n00; float sn, sp; int tot_sim=0; int N11=1, N01=2, N10=3, NXX=4, SIM=5; float ***tot; int i1, i2; n11=n10=n01=n00=0; tot=declare_arrayN(3,sizeof (float), 10, 6, 2); sim_ref=aln2sim_mat (A, "idmat"); if (strm (posfile, "all")) sim=sim_ref; else { Alignment *B; B=copy_aln ( A,NULL); B=extract_aln3(B,posfile); /*if (B->len_aln==0)L1=100; else L1=((B->len_aln-1)*100)/B->len_aln; if (L1<=0)L1=100; */ sim=aln2sim_mat (B, "idmat"); } for (a=0; a< A->nseq-1; a++) { for ( b=a+1; b< A->nseq; b++) { for ( c=0; cnseq; c++) { if ( (strm (compound, SAR->name[c]) || strm ( compound, "all"))) { /*if ( sim_ref[a][b]<30 || sim_ref[a][b]>60)continue;*/ i1=0; /*sim_ref[a][b]/10;if (i1==10)i1--;*/ i2=sim[a][b]; c1=(SAR->seq_al[c][a]=='I')?1:0; c2=(SAR->seq_al[c][b]=='I')?1:0; n11=(c1 && c2)?1:0; n01=(!c1 && c2)?1:0; n10=(c1 && !c2)?1:0; n00=(!c1 && !c2)?1:0; tot[i1][N11][0]+=n11; tot[i1][N01][0]+=n01; tot[i1][N10][0]+=n10; /*tot[i1][N00][0]+=n00;*/ tot[i1][NXX][0]++; tot[i1][SIM][0]+=sim_ref[a][b]; if ( i2>=L1) { tot[i1][N11][1]+=n11; tot[i1][N01][1]+=n01; tot[i1][N10][1]+=n10; /*tot[i1][N00][1]+=n00;*/ tot[i1][NXX][1]++; tot[i1][SIM][1]+=sim_ref[a][b]; } } } } } for (a=0; a<1; a++) { sp=(tot[a][N11][0])/(tot[a][N11][0]+tot[a][N10][0]); fprintf ( stdout, "\n%15s N11 %5d SP %.2f ",compound, (int)tot[a][N11][0],sp); sp=((tot[a][N11][1]+tot[a][N10][1])==0)?1:(tot[a][N11][1])/(tot[a][N11][1]+tot[a][N10][1]); sn=(tot[a][N11][0]==0)?1:(tot[a][N11][1]/tot[a][N11][0]); fprintf ( stdout, " N11 %5d SP %.2f SN %.2f SIM %.2f", (int)tot[a][N11][1], sp,sn, (tot[a][SIM][1]/tot[a][NXX][1])); } myexit (0); sp=((n11+n01)==0)?1:n11/(n11+n01); sn=((n11+n01)==0)?1:n11/(n11+n10); fprintf ( stdout, "\nLimit: %d NPRED %d AVGSIM %d SN %.2f SP %.2f TP %d FP %d FN %d",L1, npred, tot_sim, sn, sp, (int)n11, (int)n01, (int)n10); myexit (EXIT_SUCCESS); return A; } Alignment * sar2simpred2 (Alignment *A, Alignment *SAR, char *seqlist, char *posfile, char *compound, int L ) { int a,b, c,c1, c2, p, s; float n11, n10, n01, n00, n, sn2, prediction,sp, n1, n0, t, entropy, Delta; int *rlist, *tlist, *pred, *npred, tsim, psim; int **sim, **sim_ref; int nr=0; int nrs; char *out; int delta_max; Alignment *B; int printall=1; out=vcalloc (A->nseq+1, sizeof (char)); rlist=vcalloc ( A->nseq, sizeof (int)); tlist=vcalloc ( A->nseq, sizeof (int)); pred=vcalloc(2, sizeof (int)); npred=vcalloc(2, sizeof (int)); nrs=0; if ( strm (seqlist, "first")) { for ( a=0; anseq; a++) { if ( strm ( compound, SAR->name[a])) { for ( b=0; bnseq; b++) { if ( SAR->seq_al[a][b]=='I') { fprintf ( stderr, "COMP: %s REF SEQ: %s\n", A->name[b], compound); rlist[nrs]=b; tlist[rlist[nrs]]=1; nrs++; break; } } } } } else if (strm (seqlist, "all")) { for ( a=0; a< A->nseq; a++) { rlist[nrs]=a; tlist[rlist[a]]=1; nrs++; } } else if ((a=name_is_in_list ( seqlist, A->name, A->nseq, 100))!=-1) { rlist[nrs]=a; tlist[rlist[nrs]]=1; nrs++; } else { Alignment *R; R=main_read_aln (seqlist, NULL); for (a=0; anseq; a++) { rlist[a]=name_is_in_list( R->name[a], A->name, A->nseq, 100); tlist[rlist[a]]=1; } free_aln (R); } c=name_is_in_list ( compound, SAR->name, SAR->nseq, 100); sim_ref=aln2sim_mat (A, "idmat"); if (strm (posfile, "all")) { sim=sim_ref; B=A; } else { B=copy_aln ( A,NULL); B=extract_aln3(B,posfile); sim=aln2sim_mat (B, "idmat"); } n11=n10=n01=n00=n=n1=n0=0; delta_max=0; for (a=0; anseq; a++) { if ( tlist[a] && !strm (seqlist, "all")) out[a]=(SAR->seq_al[c][a]=='I')?'Z':'z';/*SAR->seq_al[c][a];*/ else { pred[0]=pred[1]=0; npred[0]=npred[1]=1; c1=(SAR->seq_al[c][a]=='I')?1:0; for (nr=0,tsim=0,psim=0,b=0; bseq_al[c][rlist[b]]=='o'); else { c2=(SAR->seq_al[c][rlist[b]]=='I')?1:0; nr+=c2; s=sim[a][rlist[b]]; tsim+=sim_ref[a][rlist[b]]; psim+=sim[a][rlist[b]]; if (s>=L) { pred[c2]+=s; npred[c2]++; } } } if (c1==0)n0++; else n1++; t++; Delta=pred[1]-pred[0]; if (Delta<-delta_max){p=0;out[a]= (c1==0)?'O':'o';} else if (Delta>delta_max){p=1;out[a]=(c1==1)?'I':'i';} else {p=-1; out[a]=(c1==1)?'U':'u';} if ( p==-1); else if ( p && c1)n11++; else if ( p && !c1)n10++; else if ( !p && !c1)n00++; else if ( !p && c1)n01++; if (p!=-1)n++; if (printall)fprintf ( stdout, ">%-15s %d %c OVERALL_SIM:%d POSITION_SIM %d\n%s\n", B->name[a], c1, out[a],tsim/nrs,psim/nrs,B->seq_al[a]); } } sp=((n11+n10)==0)?1:n11/(n11+n10); sn2=((n1)==0)?1:n11/n1; prediction=(n11+n00)/(n1+n0); entropy=(float)(M_chooses_Nlog (nr, nrs)/M_chooses_Nlog(nrs/2, nrs)); fprintf ( stdout, ">%-15s Sp %.2f Sn %.2f Pred %.2f E %.2f\n", compound,sp, sn2,prediction,entropy ); fprintf ( stdout, "%s\n", out); myexit (EXIT_SUCCESS); return A; } /************************************************************************************/ /* ALIGNMENT ANALYZE : SAR FOR OR */ /************************************************************************************/ void display_or_help(); void display_or_help() { fprintf ( stdout, "\nor_sar options:"); fprintf ( stdout, "\n_ORCL_: Command_line in a file"); fprintf ( stdout, "\n_ROTATE_ : rotate the sar matrix (if each entry is a compound rather than a sequence)"); fprintf ( stdout, "\n_JNIT_ : number cycles of Jacknife"); fprintf ( stdout, "\n_JNSEQ_ : Number of sequences picked up in alignment [0 to keep them all, 10 by default]"); fprintf ( stdout, "\n_JSAR_ : Number of compounds picked up in the SAR matrix [0 to keep them all => default"); fprintf ( stdout, "\n_JRSAR_ : Randomization of the SAR file between each JNIT iteration: S, C, R, SC, SR... (S: seq, C, column, R: residue"); fprintf ( stdout, "\n_JRALN_ : Randomization of the ALN file between each JNIT iteration: S, C, R, SC, SR... (S: seq, C, column, R: residue"); fprintf ( stdout, "\n_NPOS_ : Number of positions used to make the prediction [4 by default]"); fprintf ( stdout, "\n_DEPTH_ : Depth of the motif degenerated alphabet [Default=2]"); fprintf ( stdout, "\n_POSFILE_ : Predefined list of positions in a filename, A->nseq); S=aln2jacknife (S, 0, sarlen); } Alignment *or_scan (Alignment *A,Alignment *S, char *pmode) { int l, a,ax,cx, b; char mode[100]; int start, offset,w; int nl, *poslist; fprintf ( stdout, "\nPARAMETERS: %s\n", pmode); fprintf ( stderr, "\nPARAMETERS: %s\n", pmode); strget_param (pmode, "_SW_", "15", "%d",&w); strget_param (pmode, "_SMINW_", "5", "%d",&start); strget_param (pmode, "_SSTART_", "5", "%d",&offset); strget_param (pmode, "_SMODE_", "single", "%s",&mode); l=intlen (A->len_aln); poslist=vcalloc ( A->len_aln, sizeof (int)); nl=0; for ( a=0; a< A->len_aln; a++)poslist[nl++]=a+1; if ( strm (mode, "single")) { for ( a=0; alen_aln-2; a++) { int c, gap, score; for (gap=0,c=0; cnseq; c++)gap+=(A->seq_al[c][a]=='-'); if ( !gap) { Alignment *B; B=extract_aln (A, a+1, a+2); B=or_sar (B, S, pmode, NO_PRINT); score=B->score_aln; free_aln (B); } else { score=0; } fprintf ( stdout, "P: %*d S: %4d\n",l,a+1, score); //fprintf ( stderr, "P: %*d S: %4d\n",l,a+1, score); } } else if strm (mode, "scan") { for ( ax=0; axA->len_aln)continue; if (pend<1 || pend>A->len_aln)continue; B=extract_aln (A, a-w, a+w); B=or_sar (B, S,pmode, NO_PRINT); if (B->score_aln>=best_score){best_score=B->score_aln; best_pos=a;best_w=b;best_start=pstart; best_end=pend;} free_aln (B); } fprintf ( stdout, "P: %*d I: %*d %*d Score: %5d L: %2d\n", l,best_pos, l,best_start+offset, l,best_end, best_score,(best_w*2)+1 ); } } else if ( strm ( mode, "scan_comp")) { Alignment *NS=NULL; int *tresults; int s, n=0, p1; int nbest; int *poscache; float sc, eval; int **index; int **poslist; int len=1; float best_sc, best_eval; int best_pos; char *best_word; int sth; int sev; index=aln2pos_simple (A, A->nseq); strget_param (pmode, "_NPOS_", "1", "%d", &len); strget_param (pmode, "_STH_", "0", "%d", &sth); strget_param (pmode, "_SEV_", "0", "%d", &sev); if (sev==0)for (a=0, sev=1; alen_aln; tresults=vcalloc ( A->len_aln, sizeof (int)); poscache=vcalloc ( A->len_aln, sizeof (int)); poslist=generate_array_int_list (len, 0, A->len_aln-1,1, &n, NULL); for (s=0; slen_aln; s++) { char *word; NS=aln2block (S, s+1, s+2, NS); fprintf ( stderr, "\nProcess: %s ...\n", get_compound_name(s, mode)); for (best_sc=0,best_pos=0,best_word=NULL,a=0; abest_sc) { best_sc=sc; best_eval=eval; best_word=word; best_pos=a; nbest=1; } else if ( word && sc>=best_sc) { nbest++; } else vfree (word); } nbest/=len; for (a=0; a<=A->nseq; a++) { fprintf (stdout, "\n>%s PPPP: S: %s SC: %d EV: %d NBest: %d W: %s", get_compound_name(s, mode),(a==A->nseq)?"cons":A->name[a],(int)best_sc, (int) best_eval, nbest,best_word); for ( b=0; bnseq) { p1=poslist[best_pos][b]; if (best_sc>sth && nbest0)p1++; } fprintf ( stdout, " %d ", p1); } } } for ( p1=0; p1len_aln;p1++) fprintf ( stdout, "\n>TOT_P: %d %d", p1+1, tresults[p1]); } else if ( strm ( mode, "scan_comp_old")) { Alignment *NS=NULL, *BLOCK=NULL; int **results, *tresults; int s, n, p1, p2; int npos=1; int *poscache; results=declare_int (A->len_aln*A->len_aln, 3); tresults=vcalloc ( A->len_aln, sizeof (int)); poscache=vcalloc ( A->len_aln, sizeof (int)); for (s=0; slen_aln; s++) { int count; NS=aln2block (S, s+1, s+2, NS); fprintf ( stderr, "\nProcess: %s ...", get_compound_name(s, mode)); for (n=0,p1=0; p1len_aln-w; p1++, count ++) { if ( count == 50){fprintf ( stderr, "*");count=0;} for ( p2=p1; p2len_aln-w; p2++, n++) { poscache[p1]=1; poscache[p2]=1; BLOCK=alnpos2block (A,poscache,BLOCK); if ( aln2ngap(BLOCK)<=0)BLOCK=or_sar (BLOCK,NS, pmode, NO_PRINT); else BLOCK->score_aln=0; //if ( BLOCK->score_aln>0)HERE ("P: %d %d %d", p1, p2,BLOCK->score_aln); results[n][0]=p1; results[n][1]=p2; results[n][2]=BLOCK->score_aln; poscache[p1]=poscache[p2]=0; } } sort_int_inv (results, 3, 2, 0, n-1); for (p1=0; p1%s PPPP: %d %d SC: %d", get_compound_name(s, mode), results[p1][0]+1,results[p1][1]+1, results[p1][2]); fprintf ( stderr, "\n>%s PPPP: %d %d SC: %d", get_compound_name(s, mode), results[p1][0]+1,results[p1][1]+1, results[p1][2]); tresults[results[p1][0]]++; tresults[results[p1][1]]++; } } for ( p1=0; p1len_aln;p1++) if ( tresults[p1])fprintf ( stdout, "\n>TOT_P: %d %d", p1+1, tresults[p1]); } exit (EXIT_SUCCESS); } ORP * or_sar_compound(Alignment *A, Alignment *S, char *mode, int print); ORP* set_orp_name ( ORP* P, char *name); ORP* set_orp_offset ( ORP* P,int offset); Alignment * or_sar(Alignment *inA, Alignment *inS, char *mode, int print) { char rsar[4],raln[4], sarmode[100]; int rotate, a, b, start, end; ORP **R, *ORS; Alignment *A, *S; if ( mode && (strstr (mode, "help") || strstr (mode, "HELP"))) display_or_help(); strget_param (mode, "_SARMODE_", "single", "%s", &sarmode);//single | all strget_param (mode, "_RSAR_", "NO", "%s", rsar); strget_param (mode, "_RALN_", "NO", "%s", raln); strget_param (mode, "_ROTATE_", "0", "%d", &rotate); strget_param (mode, "_START_", "1", "%d", &start);start--; strget_param (mode, "_END_", "0", "%d", &end); A=copy_aln (inA, NULL); S=copy_aln (inS, NULL); if ( end==0)end=A->len_aln; if ( start!=0 || end!=A->len_aln) { int c; if ( start>=A->len_aln || end<=start || end >A->len_aln) printf_exit (EXIT_FAILURE, stderr, "ERROR: _START_%d and _END_%d are incompatible with the aln [len=%d][FATAL:%s]", start, end, A->len_aln, PROGRAM); for (b=0; bnseq; b++) for (c=0,a=start; aseq_al[b][c]=A->seq_al[b][a]; A->len_aln=c; for (b=0; bnseq; b++)A->seq_al[b][c]='\0'; } S=aln2random_aln (S, rsar); A=aln2random_aln (A, raln); if (rotate) { Alignment *rS; if (S->len_aln!=A->nseq) printf_exit ( EXIT_FAILURE,stderr, "ERROR: Alignment and SAR matrix are incompatible [FATAL:%s]", PROGRAM); rS=rotate_aln (S, NULL); for (a=0; a< A->nseq; a++)sprintf (rS->name[a], "%s", A->name[a]); free_aln (S); S=rS; } R=vcalloc ( S->len_aln+2, sizeof (ORP*)); if (strm (sarmode, "all"))R[0]=or_sar_compound (A, S, mode, print); else if ( strm (sarmode, "single")) { for ( a=0; alen_aln; a++) { Alignment *NS=NULL; NS=aln2block (S, a+1, a+2, NS); R[a]=or_sar_compound (A, NS, mode, NO_PRINT); set_orp_name ((R[a]), get_compound_name (a, mode)); set_orp_offset(R[a], start); display_or_summary (R[a], mode, stdout, print); } } ORS=combine_n_predictions (R, A, S); display_or_summary (ORS, mode, stdout, print); inA->score_aln=(int)(ORS->best*(float)1000); free_orp_list (R); free_orp(ORS); return inA; } ORP* set_orp_offset ( ORP* P,int offset) { if (!P) return NULL; else { P->offset=offset; return set_orp_offset(P->PR, offset); } } ORP* set_orp_name ( ORP* P, char *name) { if (!P) return NULL; else { sprintf ( P->name, "%s", name); return set_orp_name(P->PR, name); } } ORP * combine_n_predictions (ORP**R, Alignment *A, Alignment *S) { int a=0; ORP*N=NULL; while (R[a]) { N=combine_2_predictions (R[a++], N, A, S); } sprintf ( N->name, "ALL"); sprintf ( N->mode, "COMBINED"); return N; } ORP *combine_2_predictions ( ORP*IN, ORP *TO,Alignment *A, Alignment *S) { int a; if ( !TO) { TO=declare_or_prediction (IN->ncomp, IN->nseq, IN->len); TO->A=A; TO->S=S; TO->P=copy_aln(S, NULL); TO->offset=IN->offset; TO->ncomp=0; } for (a=0; a< IN->len; a++) { TO->pos[a]+=IN->pos[a]; } TO->fp+=IN->fp; TO->fn+=IN->fn; TO->tp+=IN->tp; TO->tn+=IN->tn; rates2sensitivity (TO->tp, TO->tn, TO->fp, TO->fn, &(TO->sp), &(TO->sn), &(TO->sen2), &(TO->best)); for (a=0; a<(TO->A)->nseq; a++) { (TO->P)->seq_al[a][TO->ncomp]=(IN->P)->seq_al[a][0]; //(TO->S)->seq_al[a][TO->ncomp]=(IN->S)->seq_al[a][0]; } TO->ncomp++; (TO->P)->len_aln=TO->ncomp; (TO->A)->score_aln=TO->best; return TO; } ORP * display_or_summary (ORP *CP, char *mode, FILE *fp, int print) { int a; char *pred; char *exp; char *motif; Alignment *A, *P, *S; A=CP->A; P=CP->P; S=CP->S; pred=vcalloc ( P->nseq*P->len_aln*2, sizeof (char)); exp=vcalloc ( P->nseq*P->len_aln*2, sizeof (char)); motif=vcalloc (CP->len+1, sizeof (char)); if (P && S) { for ( a=0; anseq; a++) { strcat (pred,P->seq_al[a]); strcat (exp, S->seq_al[a]); } CP->evalue=profile2evalue(pred, exp); } a=0; while ( CP->motif && CP->motif[a] && CP->motif[0][a][0])strcat (motif, CP->motif[0][a++]); if ( print==PRINT) { fprintf (fp, "\n>%-10s Mode: %s Accuracy: %6.2f E-value: %6.2f Motif: ",CP->name,CP->mode, CP->best, CP->evalue); if (motif[0]) { fprintf (fp, " %s",motif); } for ( a=0; alen; a++) { if ( CP->pos[a]) fprintf ( fp, "\n>%-10s Mode: %s P: cons %3d SC: %4d", CP->name, CP->mode, a+1+CP->offset, CP->pos[a]); } fprintf ( fp, "\n"); } vfree (pred); vfree(motif); vfree(exp); if (CP->PR)display_or_summary (CP->PR, mode, fp, print); return CP; } ORP * or_sar_compound(Alignment *A, Alignment *S, char *mode, int print) { char rmode[100]; Alignment *P=NULL; ORP *PR=NULL; strget_param (mode, "_MODE_", "predict", "%s", rmode); if (strm (rmode, "jack"))P=or_jack (A, S, mode); else if (strm (rmode, "loo")) PR=or_loo (A, S, mode,NULL, print); else if (strm (rmode, "comploo")) P=or_comp_loo (A, S, mode,NULL, print); else if ( strm (rmode, "comppos")){or_comp_pos ( A, S, mode,print);exit(0); return NULL;} else if ( strm (rmode, "pos"))P=or_aln2pos_aln ( A, S, mode); else if ( strm (rmode, "predict"))P=or_predict ( A, S, mode); else if ( strm (rmode, "self"))PR=or_self_predict ( A, S, mode, NULL, print); else if ( strm (rmode, "sim"))P=or_sim ( A, S, mode); else if ( strm (rmode, "test"))P=or_test ( A, S, mode); else { printf_exit (EXIT_FAILURE, stderr, "ERROR: %s is an unknown mode of or_sar [FATAL:%s]\n", rmode,PROGRAM); return NULL; } if (!PR) { PR=vcalloc (1, sizeof (ORP)); PR->P=P; } return PR; } Alignment * or_test ( Alignment *inA, Alignment *inS, char *mode) { return inA; } float or_id_evaluate ( Alignment *A, Alignment *S, char *mode, int *pos, int print) { char *w; float score; w=or_id_evaluate2(A,S, mode, pos,print, &score); vfree (w); return score; } char* or_id_evaluate2 ( Alignment *A, Alignment *S, char *mode, int *pos, int print, float *rscore) { static char **words; static int *plist; char *bword; int res, p,nl, w, c, s, exp, pred; int tp, tn, fp, fn; float sn, sp, sen2, best, best_score; if (!A) {free_char (words, -1); vfree (plist); return NULL; } rscore[0]=0; plist=pos2list (pos, A->len_aln, &nl); words=declare_char (A->nseq, nl+1); bword=vcalloc (nl+1, sizeof (char)); for (p=0; pnseq; s++) { res=A->seq_al[s][plist[p]]; if (res=='-'){or_id_evaluate2 (NULL, NULL, NULL, NULL, 0, NULL);vfree (bword);return 0;} words[s][p]=res; } } for (best_score=0,w=0; wnseq; w++) { tp=fp=fn=tn=0; for (c=0; clen_aln; c++) { for (s=0; snseq; s++) { exp=(S->seq_al[s][c]=='I')?1:0; pred=strm (words[w], words[s]); if ( exp && pred)tp++; else if ( exp && !pred)fn++; else if (!exp && !pred)tn++; else if (!exp && pred)fp++; } } rates2sensitivity (tp, tn, fp,fn,&sp, &sn, &sen2, &best); if ( best>best_score) { best_score=best; sprintf (bword, "%s", words[w]); } } rscore[0]=(float)1000*best_score; or_id_evaluate2 (NULL, NULL, NULL, NULL, 0, NULL); return bword; } float or_loo_evaluate2 ( Alignment *A, Alignment *S, char *mode, int *pos, int print) { int c, s, p, res, sar; int *plist, nl; int tp, tn, fp, fn; float sn, sp, sen2, best; char **words, **positive, **negative; tp=tn=fp=fn=0; plist=pos2list (pos, A->len_aln, &nl); words=declare_char (A->nseq, nl+1); for (p=0; pnseq; s++) { res=A->seq_al[s][plist[p]]; if (res=='-'){vfree (plist);free_char (words, -1); return 0;} words[s][p]=res; } } positive=vcalloc ( A->nseq, sizeof (char*)); negative=vcalloc ( A->nseq, sizeof (char*)); for (c=0; clen_aln; c++) { //Fill the match matrix for (p=0; pnseq; s++) { sar=S->seq_al[s][c]; if (sar=='I')positive[s]=words[s]; else if ( sar=='O')negative[s]=words[s]; } } //Evaluate the scores for (s=0; s< A->nseq; s++) { int pos=0, neg=0, pred; sar=S->seq_al[s][c]; positive[s]=negative[s]=NULL; if ( name_is_in_list (words[s], positive, A->nseq, nl+1)!=-1) pos=1; if ( name_is_in_list (words[s], negative, A->nseq, nl+1)!=-1) neg=1; if (pos & !neg) pred=1; else pred=0; if ( pred && sar=='I')tp++; else if (!pred && sar=='I')fn++; else if (!pred && sar=='O')tn++; else if ( pred && sar=='O')fp++; if ( sar=='I')positive[s]=words [s]; else negative[s]=words[s]; } } vfree (negative); vfree (positive); vfree (plist); free_char (words, -1); rates2sensitivity (tp, tn, fp,fn,&sp, &sn, &sen2, &best); return (float)1000*best; } float or_loo_evaluate ( Alignment *A, Alignment *S, char *mode, int *pos, int print) { int c, s, p, res, sar; int **matP,**matN; int *plist, nl; int tp, tn, fp, fn; float sn, sp, sen2, best; tp=tn=fp=fn=0; plist=pos2list (pos, A->len_aln, &nl); matP=declare_int (nl, 256); matN=declare_int (nl, 256); for (c=0; clen_aln; c++) { //Fill the match matrix for (p=0; pnseq; s++) { res=A->seq_al[s][plist[p]]; sar=S->seq_al[s][c]; if (res=='-'){vfree (plist); free_int (matP, -1);free_int (matN, -1); return 0;} if (sar=='I')matP[p][res]++; if (sar=='O')matN[p][res]++; } } //Evaluate the scores for (s=0; s< A->nseq; s++) { int scoreP, scoreN; int pred, valP, valN; sar=S->seq_al[s][c]; for (scoreN=0,scoreP=0,p=0; pseq_al[s][plist[p]]; valP=matP[p][res]-(sar=='I')?1:0; scoreP+=(valP>0)?1:0; valN=matN[p][res]-(sar=='O')?1:0; scoreN+=(valN>0)?1:0; } if ( scoreP==nl && scoreNnseq; s++) { res=A->seq_al[s][plist[p]]; sar=S->seq_al[s][c]; if (sar=='I')matP[p][res]=0; else matN[p][res]=0; } } } vfree (plist); free_int (matP, -1);free_int (matN, -1); rates2sensitivity (tp, tn, fp,fn,&sp, &sn, &sen2, &best); return (float)1000*best; } int* or_comp_pos ( Alignment *inA, Alignment *inS, char *mode,int print) { Alignment *A=NULL, *S=NULL, *inS2=NULL; int a, b, c; int *main_pos, *pos=NULL; set_sar (inA, inS, mode); main_pos=vcalloc ( inA->len_aln, sizeof (int)); inS2=copy_aln (inS, NULL); inS2->len_aln=1; //Run every SAR, one at a time for ( c=0; c< inS->len_aln; c++) { int max, p; fprintf ( stdout, ">%d\n", c); for (a=0; a< inS->nseq; a++) { inS2->seq_al[a][0]=inS->seq_al[a][c]; inS2->seq_al[a][1]='\0'; } vfree (pos); free_aln (S); free_aln (A); pos=vcalloc (inA->len_aln, sizeof (int)); A=copy_aln (inA, NULL); S=copy_aln (inS2, NULL); set_sar (A,S, mode); pos=aln2predictive_positions (A, S, mode,PRINT); for (max=0,b=0; blen_aln; b++) { main_pos[b]+=pos[b]; if (main_pos[b]>max) { max=main_pos[b]; p=b+1; } } for (a=0; anseq; a++) { fprintf ( stdout, "\t"); for ( b=0; blen_aln; b++) if ( pos[b]) fprintf ( stdout, "%c", A->seq_al[a][b]); fprintf ( stdout, " %c\n", inS2->seq_al[a][0]); } fprintf ( stdout, "\n\tBest: %d %d\n", p, max); } if (print==PRINT) { for ( a=0; alen_aln; a++)fprintf ( stdout, "\nP2: cons %4d %4d [FINAL]", a+1, main_pos[a]); } return main_pos; } Alignment * or_comp_loo ( Alignment *inA, Alignment *inS, char *mode, int *pos,int print) { int a, b,c, n; char **keep, **remove; Alignment *A, *S, *P, *P1, *SEQ, *inS2; int **main_pos, *compound_pos, **comp_list; int pos_exists=0; char *comp_pred, *comp_exp; int sar_threshold; strget_param (mode, "_SARTHRES_", "3", "%d", &sar_threshold); if (pos)pos_exists=1; set_sar (inA, inS, mode); P=copy_aln (inS, NULL); keep=declare_char (inA->nseq, MAXNAMES); remove=declare_char (inA->nseq, MAXNAMES); main_pos=declare_int ( inA->len_aln,4); comp_list=declare_int (inA->len_aln, sizeof (int*)); inS2=copy_aln (inS, NULL); inS2->len_aln=1; comp_pred=vcalloc ( inA->nseq+1, sizeof (int)); comp_exp=vcalloc ( inA->nseq+1, sizeof (int)); compound_pos=NULL; //Run every SAR, one at a time for ( c=0; c< inS->len_aln; c++) { for (a=0; a< inS->nseq; a++) { inS2->seq_al[a][0]=inS->seq_al[a][c]; inS2->seq_al[a][1]='\0'; } vfree (compound_pos); compound_pos=vcalloc (inA->len_aln, sizeof (int)); for (a=0; anseq; a++) { char ***motifs; A=copy_aln (inA, NULL); S=copy_aln (inS2, NULL); for (n=0,b=0; bnseq; b++) { if (b!=a)sprintf (keep[n++ ], "%s", A->name [b]); } sprintf ( remove[0], "%s", A->name[a]); reorder_aln (A,keep, A->nseq-1); set_sar (A,S, mode); if (!pos_exists) { pos=aln2predictive_positions (A, S, mode,NO_PRINT); } for (b=0; blen_aln; b++) { compound_pos[b]+=pos[b]; } motifs=compounds2motifs (A, S, pos,0, mode, NO_PRINT); SEQ=copy_aln (inA, NULL); SEQ=reorder_aln (SEQ, remove, 1); P1=aln2prediction (SEQ, motifs, pos); comp_pred[a]=P1->seq_al[0][0]; comp_exp[a]=inS2->seq_al[a][0]; P->seq_al[a][c]=P1->seq_al[0][0]; free_aln (SEQ); free_aln (S); free_aln (A); free_aln (P1); free_arrayN( (void *)motifs, 3); if (!pos_exists)vfree (pos); } if (print==PRINT) fprintf ( stdout, ">%-15s SC: %.2f E; %.2f\n%s\n%s\n", get_compound_name(c, mode),profile2sensitivity (comp_pred, comp_exp, NULL, NULL, NULL, NULL),profile2evalue(comp_pred, comp_exp),comp_pred, comp_exp); for (b=0; blen_aln; b++) { main_pos[b][2]+=compound_pos[b]; if (compound_pos[b])main_pos[b][3]++; if (profile2evalue(comp_pred, comp_exp)>sar_threshold) { main_pos[b][0]+=compound_pos[b]; if (compound_pos[b]) { main_pos[b][1]++; comp_list[b][0]++; comp_list[b]=vrealloc (comp_list[b], sizeof (int)*(comp_list[b][0]+1)); comp_list[b][comp_list[b][0]]=c; } } } } P->score_aln=(int)((float)1000*evaluate_prediction (P, inS, mode,print)); if (print==PRINT) { for ( a=0; alen_aln; a++)fprintf ( stdout, "\nP: cons %4d RS: %4d RC: %5d FC: %4d %4d[FINAL]", a+1, main_pos[a][2], main_pos[a][3], main_pos[a][0], main_pos[a][1]); for ( a=0; alen_aln; a++) { fprintf ( stdout, "\nP: cons %4d RS: %4d RC: %5d FC: %4d %4d CLIST: ", a+1, main_pos[a][2], main_pos[a][3], main_pos[a][0], main_pos[a][1]); for ( c=1; c<=comp_list[a][0]; c++) { fprintf ( stdout, "%s ", get_compound_name(comp_list[a][c], mode)); } fprintf ( stdout, " [COMP_LIST]"); } } free_int (main_pos, -1); free_int (comp_list, -1); return P; } ORP* or_loo ( Alignment *inA, Alignment *inS, char *mode, int *pos,int print) { int a, b, n; char **keep, **remove; Alignment *A, *S, *P, *P1, *SEQ; int pos_exists=0; ORP *PR; if (pos)pos_exists=1; set_sar (inA, inS, mode); PR=declare_or_prediction (inS->nseq, inA->nseq, inA->len_aln); sprintf (PR->mode, "loo "); P=copy_aln (inS, NULL); keep=declare_char (inA->nseq, MAXNAMES); remove=declare_char (inA->nseq, MAXNAMES); PR->A=inA; PR->P=P; PR->S=inS; for (a=0; anseq; a++) { char ***motifs; A=copy_aln (inA, NULL); S=copy_aln (inS, NULL); for (n=0,b=0; bnseq; b++) { if (b!=a)sprintf (keep[n++ ], "%s", A->name [b]); } sprintf ( remove[0], "%s", A->name[a]); reorder_aln (A,keep, A->nseq-1); set_sar (A,S, mode); if (!pos_exists) { pos=aln2predictive_positions (A, S, mode,print); } for (b=0; blen_aln; b++) { PR->pos[b]+=pos[b]; } motifs=compounds2motifs (A, S, pos,0, mode, print); SEQ=copy_aln (inA, NULL); SEQ=reorder_aln (SEQ, remove, 1); SEQ->nseq=1; P1=aln2prediction (SEQ, motifs, pos); if (print==PRINT) { fprintf ( stdout, "\n%s\nPred: %s\nReal: %s\n", P1->name[0], P1->seq_al[0], inS->seq_al[a]); } sprintf ( P->seq_al[a], "%s", P1->seq_al[0]); free_aln (P1); free_aln (SEQ); free_aln (S); free_aln (A); free_arrayN( (void *)motifs, 3); if (!pos_exists)vfree (pos); } free_char (keep, -1); free_char (remove, -1); PR=new_evaluate_prediction (PR, mode,print); PR->PR=or_self_predict(inA, inS, mode,NULL, print); if (print==PRINT)for ( a=0; alen_aln; a++)fprintf ( stdout, "\nP: cons %d %d [FINAL]", a+1, PR->pos[a]); return PR; } Alignment * or_jack(Alignment *inA, Alignment *inS, char *mode) { int a,b; int n_cycles=100; int subnseq=10; int subsar=0; Alignment *A, *S; int *main_pos,*pos; char jrsar[10], jraln[10]; strget_param (mode, "_JNIT_", "100", "%d", &n_cycles); strget_param (mode, "_JNSEQ_", "10", "%d", &subnseq); strget_param (mode, "_JNAR_", "0", "%d", &subsar); strget_param (mode, "_JRSAR_", "NO", "%s", jrsar); strget_param (mode, "_JRALN_", "NO", "%s", jraln); main_pos=vcalloc ( inA->len_aln, sizeof (int)); for (a=0; alen_aln; b++)main_pos[b]+=pos[b]; vfree (pos); } display_pos (A, S, main_pos, mode); return inA; } Alignment * display_pos (Alignment *A, Alignment *S, int *pos,char *mode) { Alignment *B; int a, b; int **index; int intl; intl=intlen (A->len_aln); index=aln2pos_simple (A, A->nseq); B=copy_aln (A,NULL); B->len_aln=0; for ( a=0; alen_aln; a++) fprintf ( stdout, "\nP: cons %*d %*d S: %4d [DISPLAY_FULL_POS]", intl,a+1,intl, a+2, pos[a]); fprintf ( stdout, "\n\n"); for (a=0; alen_aln; a++) { if (pos[a]) { for ( b=0; bnseq; b++) { B->seq_al[b][B->len_aln]=A->seq_al[b][a]; if (index[b][a]>0)fprintf ( stdout, "\nP: %s %d %d S: %d [DISPLAY_POS]",A->name[b], index[b][a], index[b][a]+1, pos[a]); } B->len_aln++; fprintf ( stdout, "\nP: cons %d %d S: %d [DISPLAY_POS]", a+1, a+2, pos[a]); } } fprintf ( stdout, "\n"); for (a=0; anseq; a++)B->seq_al[a][B->len_aln]='\0'; return B; } Alignment * or_aln2pos_aln (Alignment *A, Alignment *S, char *mode) { Alignment *B; int *pos; char outaln[100], outtree[100]; strget_param (mode, "_OUTALN_", "NO", "%s", outaln); strget_param (mode, "_OUTTREE_", "NO", "%s", outtree); set_sar (A, S, mode); pos=aln2predictive_positions (A, S,mode, PRINT); B=display_pos (A, S, pos, mode); if (!strm(outaln, "NO")) vfclose (output_aln (B, vfopen (outaln, "w"))); if (!strm(outtree, "NO"))vfclose (print_tree (aln2tree(B), "newick", vfopen (outtree, "w"))); return B; } Alignment * or_sim(Alignment *A, Alignment *S, char *mode) { //Predict all the sequences that are not both in inS and inA int *pos; set_sar (A, S, mode); pos=aln2predictive_positions (A, S,mode, PRINT); fprintf ( stdout, "R: %.3f", pos2sim (A,S, pos)); exit (EXIT_SUCCESS); return A; } ORP* or_self_predict(Alignment *A, Alignment *S, char *mode,int *pos, int print) { //Predict all the sequences that are not both in inS and inA Alignment *P; char ***motifs; int a; int pre_set_pos=0; ORP *PR; set_sar (A, S, mode); PR=declare_or_prediction (S->nseq, A->nseq, A->len_aln); sprintf (PR->mode, "self"); PR->A=A; PR->S=S; if (!pos) { pos=aln2predictive_positions (A, S,mode,print); pre_set_pos=0; } else pre_set_pos=1; for (a=0; a< A->len_aln; a++) PR->pos[a]=pos[a]; PR->motif=motifs=compounds2motifs (A, S, pos,0, mode, print); P=PR->P=aln2prediction (A, motifs, pos); if (!pre_set_pos)vfree (pos); PR=new_evaluate_prediction (PR, mode,print); return PR; } Alignment * or_predict(Alignment *inA, Alignment *inS, char *mode) { //Predict all the sequences that are not both in inS and inA Alignment *P, *A, *S, *T; char ***motifs; int *pos; int a, b; A=copy_aln (inA, NULL); S=copy_aln (inS, NULL); set_sar (A, S, mode); pos=aln2predictive_positions (A, S,mode,PRINT); motifs=compounds2motifs (A, S, pos,0, mode, PRINT); T=get_prediction_target (inA, inS, mode); P=aln2prediction (T, motifs, pos); //recall=evaluate_prediction (S, P, mode); for ( a=0; alen_aln; a++) { for (b=0; bnseq; b++) { if (tolower(P->seq_al[b][a])=='i')fprintf (stdout, "\n>%20s %20s %c", T->name [0],get_compound_name (a, mode), P->seq_al[b][a]); } } fprintf ( stdout, "\n"); return P; } Alignment *get_prediction_target (Alignment *A, Alignment *S, char *param) { char **name; int n, a; Alignment *T; T=copy_aln (A, NULL); name=declare_char (A->nseq, 100); for (n=0,a=0; a< A->nseq; a++) { if ( name_is_in_list (A->name[a], S->name, S->nseq, 100)==-1) { sprintf (name[n++], "%s", A->name[a]); } } T=reorder_aln (T,name, n); return T; } Alignment *set_sar (Alignment *A, Alignment *S, char *param) { char **name; int n, a; name=declare_char (A->nseq, 100); for (n=0,a=0; a< A->nseq; a++) { if ( name_is_in_list (A->name[a], S->name, S->nseq, 100)!=-1) { sprintf (name[n++], "%s", A->name[a]); } } A=reorder_aln (A,name, n); S=reorder_aln (S,name, n); free_char (name, -1); return S; } ORP* new_evaluate_prediction (ORP *PR, char *mode, int print) { int a,b, i, r, p; int tp, tn, fp, fn; float sn, sp, sen2, best; float tot_best_seq=0; float tot_best_comp=0; Alignment *P, *R; int ns=0; float *recall; P=PR->P; R=PR->S; recall=vcalloc (P->len_aln, sizeof (float)); if (P->len_aln!=R->len_aln) { HERE ("Mismatch between number of compounds in prediction and reference"); exit (0); } if (print==PRINT)fprintf ( stdout, "\n"); for (a=0; anseq; a++) { tp=tn=fp=fn=0; if ((i=name_is_in_list (P->name[a], R->name, R->nseq, 100))!=-1) { for (b=0;blen_aln; b++) { r=R->seq_al[i][b]; p=P->seq_al[a][b]; if ( p=='I' && r=='I')tp++; else if ( p=='I' && r=='O')fp++; else if ( p=='O' && r=='I')fn++; else if ( p=='O' && r=='O')tn++; } rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best); if (print==PRINT)fprintf (stdout, ">%-s sp: %.2f sn: %.2f sn2: %.2f best: %.2f [SEQ]\n",P->name[a], sp, sn, sen2, best); if ( best>0) { ns++; tot_best_seq+=best; } } } if (ns) { tot_best_seq/=ns; } if (print==PRINT)fprintf ( stdout, ">TotSeq sp: %.2f N: %d[SEQ]\n",tot_best_seq, ns); tot_best_comp=0; for (ns=0,b=0; blen_aln; b++) { tp=tn=fp=fn=0; for (a=0; anseq;a++) { if ((i=name_is_in_list (P->name[a], R->name, R->nseq, 100))!=-1) { r=R->seq_al[i][b]; p=P->seq_al[a][b]; if ( p=='I' && r=='I'){PR->tp++;tp++;} else if ( p=='I' && r=='O'){PR->fp++;fp++;} else if ( p=='O' && r=='I'){PR->fn++;fn++;} else if ( p=='O' && r=='O'){PR->tn++;tn++;} } } rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best); if (print==PRINT) fprintf (stdout, ">%-25s sp: %.2f sn: %.2f sen2: %.2f best: %.2f [COMP]\n",get_compound_name (b, mode), PR->sp, PR->sn, PR->sen2,PR->best); if ( best>0) { ns++; tot_best_comp+=best; } } if (ns) { tot_best_comp/=ns; } rates2sensitivity (PR->tp, PR->tn, PR->fp,PR->fn,&(PR->sp), &(PR->sn), &(PR->sen2), &(PR->best)); if (print==PRINT)fprintf ( stdout, ">FullTot sp: %.2f sn: %.2f sen2: %.2f best: %.2f N: %d[COMP]\n", PR->sp, PR->sn, PR->sen2,PR->best, ns); P->score_aln=(int)((float)1000*(PR->best)); return PR; } float evaluate_prediction (Alignment *R, Alignment *P, char *mode, int print) { int a,b, i, r, p; int tp, tn, fp, fn; int tot_tp, tot_tn, tot_fp, tot_fn; float sn, sp, sen2, best; float tot_sp=0; float tot_sn=0; float tot_sen2=0; float tot_best_seq=0; float tot_best_comp=0; float tot_best=0; int ns=0; float *recall; recall=vcalloc (P->len_aln, sizeof (float)); if (P->len_aln!=R->len_aln) { HERE ("Mismatch between number of compounds in prediction and reference"); exit (0); } if (print==PRINT)fprintf ( stdout, "\n"); for (a=0; anseq; a++) { tp=tn=fp=fn=0; if ((i=name_is_in_list (P->name[a], R->name, R->nseq, 100))!=-1) { for (b=0;blen_aln; b++) { r=R->seq_al[i][b]; p=P->seq_al[a][b]; if ( p=='I' && r=='I')tp++; else if ( p=='I' && r=='O')fp++; else if ( p=='O' && r=='I')fn++; else if ( p=='O' && r=='O')tn++; } rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best); if (print==PRINT)fprintf (stdout, ">%-s sp: %.2f sn: %.2f sn2: %.2f best: %.2f [SEQ]\n",P->name[a], sp, sn, sen2, best); if ( best>0) { ns++; tot_best_seq+=best; tot_sn+=sn; tot_sp+=sp; tot_sen2+=sen2; } } } if (ns) { tot_best_seq/=ns; tot_sn/=ns; tot_sp/=ns; tot_sen2/=ns; } if (print==PRINT)fprintf ( stdout, ">Tot sp: %.2f sn: %.2f sen2: %.2f best: %.2f N: %d[SEQ]\n", tot_sp, tot_sn, tot_sen2,tot_best_seq, ns); tot_fp=tot_fn=tot_tp=tot_tn=0; tot_sp=tot_sn=tot_sen2=tot_best_comp=0; for (ns=0,b=0; blen_aln; b++) { tp=tn=fp=fn=0; for (a=0; anseq;a++) { if ((i=name_is_in_list (P->name[a], R->name, R->nseq, 100))!=-1) { r=R->seq_al[i][b]; p=P->seq_al[a][b]; if ( p=='I' && r=='I'){tot_tp++;tp++;} else if ( p=='I' && r=='O'){tot_fp++;fp++;} else if ( p=='O' && r=='I'){tot_fn++;fn++;} else if ( p=='O' && r=='O'){tot_tn++;tn++;} } } rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best); if (print==PRINT) fprintf (stdout, ">%-25s sp: %.2f sn: %.2f sen2: %.2f best: %.2f [COMP]\n",get_compound_name (b, mode), sp, sn, sen2,best); recall[b]=sen2; if ( best>0) { ns++; tot_best_comp+=best; tot_sn+=sn; tot_sp+=sp; tot_sen2+=sen2; } } if (ns) { tot_best_comp/=ns; tot_sn/=ns; tot_sp/=ns; tot_sen2/=ns; } rates2sensitivity (tot_tp, tot_tn, tot_fp,tot_fn,&tot_sp, &tot_sn, &tot_sen2, &tot_best); if (print==PRINT)fprintf ( stdout, ">FullTot sp: %.2f sn: %.2f sen2: %.2f best: %.2f N: %d[COMP]\n", tot_sp, tot_sn, tot_sen2,tot_best, ns); return tot_best; } Alignment * aln2prediction (Alignment *A,char ***motif, int *pos) { int a, b,nc, nl; int *list; char **array, **sar; Alignment *R; Sequence *S; nc=read_array_size ((void *)motif, sizeof (char***)); list=pos2list (pos, A->len_aln, &nl); array=declare_char (A->nseq, nl+1); sar=declare_char(A->nseq, nc+1); for (a=0; anseq; a++) { for (b=0; bseq_al[a][list[b]]; } for (a=0; anseq; b++) { sar[b][a]=(match_motif (array[b], motif[a]))?'I':'O'; } } S=fill_sequence_struc (A->nseq,sar,A->name); R=seq2aln (S, NULL, KEEP_GAP); free_sequence (S, S->nseq); free_char (sar, -1); vfree (list); free_char (array, -1); return R; } int * file2pos_list (Alignment *A, char *posfile) { char ***file; int **index; int *pos; int i, n, p; //pos_file: // 1 2 3 4 if ( !check_file_exists (posfile)) { printf_exit ( EXIT_FAILURE, stderr, "ERROR: Could not read posfile %s\n", posfile); } file=file2list (posfile, " "); index=aln2inv_pos (A); pos=vcalloc ( A->len_aln, sizeof (int)); n=0; while (file[n]) { if ( !strm (file[n][1], "P:")); else { if ( (strm (file[n][2], "cons"))) p=atoi(file[n][3])-1; else { i=name_is_in_list ( file[n][2], A->name, A->nseq, MAXNAMES+1); if (i!=-1) p=index[i][atoi(file[n][3])]-1; else p=-1; } if (p!=-1)pos[p]+=atoi(file[n][4]); } n++; } free_int (index, -1); free_arrayN ( (char **)file, 3); return pos; } int * aln2predictive_positions (Alignment *A, Alignment *B, char *mode, int print) { char posmode[100]; if (!mode) return NULL; HERE ("%s", mode); strget_param (mode, "_POSMODE_", "scan", "%s", posmode); if ( strm (posmode, "mat"))return aln2predictive_positions_mat (A, B, mode, print); else if ( strm (posmode, "scan")) return aln2predictive_positions_scan (A, B, mode, print); else { printf_exit (EXIT_FAILURE,stderr, "ERROR: %s is an unknown _POSMODE_ mode",posmode); return NULL; } } int * aln2predictive_positions_mat (Alignment *A, Alignment *B, char *mode, int print) { int a, b, c,gap, res1, res2, sar1, sar2, npos, s, idscore; float id1,id2,id3,nid1,nid2,nid3; int **pos, *fpos; pos=declare_int (A->len_aln,2); fpos=vcalloc ( A->len_aln, sizeof (int)); strget_param (mode, "_NPOS_", "2", "%d", &npos); for ( a=0; a< A->len_aln; a++) { pos[a][0]=a; id1=id2=id3=nid1=nid2=nid3=0; for ( gap=0,b=0; bnseq; b++)gap+=(A->seq_al[b][a]=='-'); if ( gap>0){pos[a][1]=0;continue;} for (s=0; slen_aln; s++) { for ( gap=0,b=0; bnseq-1; b++) { sar1=B->seq_al[b][s]; res1=A->seq_al[b][a]; for ( c=b+1; cnseq; c++) { sar2=B->seq_al[c][s]; res2=A->seq_al[c][a]; idscore=(res1==res2)?1:0; if ( sar1 == 'I' && sar2=='I'){id1+=idscore;nid1++;} else if ( sar1 =='0' && sar2=='0'){id2+=idscore;nid2++;} else {id3+=idscore; nid3++;} } } id1=(nid1==0)?1:id1/nid1; id2=(nid1==0)?1:id2/nid2; id3=(nid3==0)?1:id3/nid3; pos[a][1]=(int)((float)1000*id1*(1-id3)); } } sort_int (pos, 2,1, 0, A->len_aln-1); for ( a=MAX(0,(A->len_aln-npos));alen_aln; a++) { fpos[pos[a][0]]=1; } free_int (pos, -1); return fpos; } int * aln2predictive_positions_scan (Alignment *A, Alignment *B, char *mode, int print) { int a, b, c, best_pos,nl, nplist=0, max, posw; float best_score, score; static int *list, *tpos,**plist,*array; int *pos; char posfile[100]; char predmode[100]; char target_posfile[100]; if (!A) { vfree (list); vfree (tpos); free_int (plist, -1); vfree (array); return NULL; } strget_param (mode, "_PREDMODE_", "ID", "%s", predmode); strget_param (mode, "_POSW_", "1", "%d", &posw); strget_param (mode, "_NPOS_", "2", "%d", &max); strget_param (mode, "_POSFILE_", "NO", "%s", posfile); strget_param (mode, "_TPOSFILE_", "NO", "%s", target_posfile); if ( !strm(posfile, "NO"))return file2pos_list (A,posfile); if ( !strm(target_posfile, "NO"))tpos=file2pos_list (A,target_posfile); else { tpos=vcalloc (A->len_aln, sizeof (int)); for (a=0; alen_aln; a++)tpos[a]=1; } //Declare the positions that are going to be scanned if (posw==1) { plist=declare_int (A->len_aln, 2); nplist=0; for (a=0; alen_aln; a++) { if(tpos[a]) { plist[nplist][0]=1; plist[nplist][1]=a; nplist++; } } } else if ( posw==2) { nplist=0; plist=declare_int (A->len_aln*A->len_aln, 3); for (a=0; alen_aln; a++) for (b=0; blen_aln; b++) { plist[nplist][1]=a; plist[nplist][2]=b; plist[nplist][0]=2; nplist++; } } else if ( posw==3) { nplist=0; plist=declare_int (A->len_aln*A->len_aln*A->len_aln, 3); for (a=0; alen_aln; a++) for (b=0; blen_aln; b++) { plist[nplist][1]=a; plist[nplist][2]=b; plist[nplist][3]=c; plist[nplist][0]=3; nplist++; } } pos=vcalloc ( A->len_aln, sizeof (int)); if (max==0)max=A->len_aln; else if ( max==-1) { for (a=0; alen_aln; a++)if (tpos[a]){pos[a]=1;} aln2predictive_positions_scan (NULL, NULL, NULL, 0); return pos; } pos=vcalloc ( A->len_aln, sizeof (int)); list=vcalloc (A->len_aln, sizeof (int)); nl=0; for (a=0; a< max; a++) { int previous_best_pos=-1; for (best_score=-9999,best_pos=0,b=0; bbest_score) { best_score=score; best_pos=b; } for (c=1; c<=plist[b][0]; c++)pos[plist[b][c]]=0; } if (best_pos==previous_best_pos)break; else previous_best_pos=best_pos; //update the best_pos_list for (b=1; b<=plist[best_pos][0]; b++) list[nl++]=plist[best_pos][b]; if ( print==PRINT) { for (b=0; blen_aln, sizeof (char**)); for (a=0; alen_aln; a++) { motifs[a]=compound2motif (A, B, pos, depth, a, mode, print); } return motifs; } char ** compound2regexp_motif (Alignment *A, Alignment *B, int *pos, int depth, int c, char *mode, int print); char ** compound2word_motif (Alignment *A, Alignment *B, int *pos, int depth, int c, char *mode, int print); char ** compound2motif (Alignment *A, Alignment *B, int *pos, int depth, int c, char *mode, int print) { char mmode[100]; strget_param (mode, "_MOTIFMODE_", "word", "%s", mmode); //words, regexp if ( strm (mmode, "regexp"))return compound2regexp_motif (A,B,pos, depth, c, mode, print); else if ( strm (mmode, "word"))return compound2word_motif (A,B,pos, depth, c, mode, print); else return NULL;} char ** compound2word_motif (Alignment *A, Alignment *B, int *pos, int depth, int c, char *mode, int print) { int a,l; char *word, **motif; float score; word=or_id_evaluate2 (A, B, mode, pos,print, &score); if ( !word) return NULL; l=strlen (word); motif=declare_char (l+1, 2); for (a=0; anseq=O->nseq=I->len_aln=O->len_aln=0; for (a=0; alen_aln; a++) { if (pos[a]) { for (i=o=0,b=0; bnseq; b++) { if ( is_gap(A->seq_al[b][a]))return 0; if (B->seq_al[b][c]=='I')I->seq_al[i++][I->len_aln]=A->seq_al[b][a]; else O->seq_al[o++][O->len_aln]=A->seq_al[b][a]; } I->len_aln++; O->len_aln++; } } if (O->len_aln==0 || I->len_aln==0) return 0; O->nseq=o; I->nseq=i; for (a=0; aseq_al[a][O->len_aln]='\0'; for (a=0; aseq_al[a][I->len_aln]='\0'; if (!I->nseq) return NULL; best_pred=best_motif=best_sn=best_sp=best_sen2=0; motif_file=vtmpnam (NULL); n=0; if (depth>0) { alp=vcalloc ( sizeof (char**), I->len_aln); alp_size= vcalloc ( I->len_aln, sizeof (int)); for (a=0; alen_aln; a++) { char *col; alp[a]=string2alphabet ( (col=aln_column2string (I,a)),depth, &alp_size[a]); vfree (col); } generate_array_string_list (I->len_aln, alp, alp_size, &n, motif_file, OVERLAP); } else { int *used; char r; used=vcalloc (256, sizeof (int)); fpp=vfopen (motif_file,"w"); for (a=0;alen_aln; a++) { for (b=0; bnseq; b++) { r=I->seq_al[b][a]; if (!used[(int)r]){fprintf (fpp, "%c", r);used[(int)r]=1;} } for (b=0; bnseq; b++) { r=I->seq_al[b][a]; used[(int)r]=0; } fprintf (fpp, " "); } fprintf (fpp, "\n"); vfree (used); vfclose (fpp); n=1; depth=I->nseq; } buf=vcalloc (2*(I->len_aln*depth)+1, sizeof (char)); best_buf=vcalloc (2*(I->len_aln*depth)+1, sizeof (char)); fpp=vfopen (motif_file, "r"); for (a=0; anseq; b++) { if (match_motif (I->seq_al[b], m2))tp++; else fn++; } for (b=0; bnseq; b++) { if (match_motif (O->seq_al[b], m2))fp++; else tn++; } rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best); if (best>= best_pred) { best_pred=best; best_sp=sp; best_sen2=sen2; best_sn=sn; sprintf (best_buf, "%s", buf); } m2--; free_char (m2, -1); } vfclose (fpp); if (print==PRINT)fprintf ( stdout, "\nMotifCompound %25s sp: %.2f sn: %.2f sen2: %.2f best: %.2f motif: ", get_compound_name(c, mode), best_sp, best_sn, best_sen2, best_pred); m2=string2list (best_buf); m=declare_char (I->len_aln+1, depth+1); for (a=0; alen_aln; a++) { sprintf (m[a], "%s", m2[a+1]); if (print==PRINT) fprintf ( stdout, "[%2s]",m[a]); } if (print==PRINT)fprintf ( stdout, " N-motifs %d", n); free_char (m2, -1); if (alp)free_arrayN((void ***) alp, 3); if (alp_size)vfree (alp_size); vfree (buf); vfree(best_buf); return m; } double pos2sim (Alignment *A, Alignment *B, int *pos) { return sar_aln2r (A, B,pos, PRINT); } double sar_aln2r (Alignment *A, Alignment *B, int *pos, int print) { int a, b, c, d,r1, r2, n, score, sim; double *r, result; static double **slist; int declare=0; static int **M; if (!M)M=read_matrice ("blosum62mt"); if (!slist) { int maxslist; maxslist=A->nseq*A->nseq*10; slist=declare_double (maxslist, 2); } if (pos==NULL) { declare=1; pos=vcalloc ( A->len_aln+1, sizeof (int)); for (a=0; alen_aln; a++)pos[a]=1; pos[a]=-1; } for (n=0,a=0; a< A->nseq-1; a++) { for (b=a+1; bnseq; b++) { for (sim=d=0,c=0; clen_aln; c++) { if (pos[c]==0)continue; r1=A->seq_al[a][c]; r2=A->seq_al[b][c]; if (is_gap(r1) || is_gap(r2))return 0; sim+=M[r1-'A'][r2-'A']*pos[c]; d+=MAX((M[r1-'A'][r1-'A']),(M[r2-'A'][r2-'A'])); } sim=(d==0)?0:(100*sim)/d; score=(int)get_sar_sim(B->seq_al[a], B->seq_al[b]); slist[n][0]=(double)sim; slist[n][1]=(double)score; if (print==PRINT)fprintf ( stdout, "SIM: %d %d [%s %s]\n", sim, score, A->name[a], A->name[b]); n++; } } r=return_r(slist, n); for (a=0; alen_aln; c++) { I->nseq=O->nseq=I->len_aln=O->len_aln=0; for (a=0; alen_aln; a++) { if (pos[a]) { for (i=o=0,b=0; bnseq; b++) { if ( is_gap(A->seq_al[b][a]))return 0; if (B->seq_al[b][c]=='I')I->seq_al[i++][I->len_aln]=A->seq_al[b][a]; else O->seq_al[o++][O->len_aln]=A->seq_al[b][a]; } I->len_aln++; O->len_aln++; } } if (O->len_aln==0 || I->len_aln==0) return 0; O->nseq=o; I->nseq=i; for (a=0; aseq_al[a][O->len_aln]='\0'; for (a=0; aseq_al[a][I->len_aln]='\0'; delta+=aln2sim(I,"blosum62mt")-aln2sim(O, "blosum62mt"); } return delta; } char * get_compound_name (int c, char *mode) { static int isset; static Alignment *S; static char *lname; if (!isset) { char *comp_list; isset=1; lname=vcalloc (100, sizeof (char)); if (!mode); else { strget_param (mode, "_COMPLIST_", "NO", "%s", comp_list=vcalloc (100, sizeof (char))); if (strm(comp_list, "NO")); else { S=main_read_aln (comp_list, NULL); vfree (comp_list); } } } if (!S || c>=S->nseq)sprintf (lname, "%d", c); else { sprintf (lname, "%s", S->name [c]); } return lname; } ORP * declare_or_prediction ( int ncomp, int nseq, int len) { ORP *P; P=vcalloc ( 1, sizeof (ORP)); P->ncomp=ncomp; P->nseq=nseq; P->len=len; P->PR=NULL; P->pos=vcalloc (len+1, sizeof (int)); return P; } void free_orp_list ( ORP**P) { int a=0; while (P[a]) { free_orp(P[a++]); } } void free_orp ( ORP*P) { if (!P) return; free_aln (P->A); free_aln (P->S); free_aln (P->P); vfree (P->pos); free_arrayN((void **)P->motif, 3); if (P->PR)free_orp(P->PR); vfree (P); } /*********************************COPYRIGHT NOTICE**********************************/ /*© Centro de Regulacio Genomica */ /*and */ /*Cedric Notredame */ /*Tue Oct 27 10:12:26 WEST 2009. */ /*All rights reserved.*/ /*This file is part of T-COFFEE.*/ /**/ /* T-COFFEE is free software; you can redistribute it and/or modify*/ /* it under the terms of the GNU General Public License as published by*/ /* the Free Software Foundation; either version 2 of the License, or*/ /* (at your option) any later version.*/ /**/ /* T-COFFEE is distributed in the hope that it will be useful,*/ /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/ /* GNU General Public License for more details.*/ /**/ /* You should have received a copy of the GNU General Public License*/ /* along with Foobar; if not, write to the Free Software*/ /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/ /*............................................... |*/ /* If you need some more information*/ /* cedric.notredame@europe.com*/ /*............................................... |*/ /**/ /**/ /* */ /*********************************COPYRIGHT NOTICE**********************************/