--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdarg.h>
+#include <string.h>
+#include <ctype.h>
+#include "io_lib_header.h"
+#include "util_lib_header.h"
+#include "dp_lib_header.h"
+#include "define_header.h"
+/************************************************************************************/
+/* NEW ANALYZE 2 : SAR */
+/************************************************************************************/
+float display_prediction_old (int **prediction, int n, Alignment *A, Alignment *S, int field);
+
+float display_prediction (int ***count, Alignment *S, int c, int n);
+Alignment * filter_aln4sar0 ( Alignment *A, Alignment *S, int c, int leave, char *mode);
+Alignment * filter_aln4sar1 ( Alignment *A, Alignment *S, int c, int leave, char *mode);
+Alignment * filter_aln4sar2 ( Alignment *A, Alignment *S, int c, int leave, char *mode);
+Alignment * filter_aln4sar3 ( Alignment *A, Alignment *S, int c, int leave, char *mode);
+Alignment * filter_aln4sar4 ( Alignment *A, Alignment *S, int c, int leave, char *mode);
+Alignment * filter_aln4sar5 ( Alignment *A, Alignment *S, int c, int leave, char *mode);
+
+int **sar2profile ( Alignment *A, Alignment *S, int c, int leave);
+int **sar2profile_sim ( Alignment *A, Alignment *S, int **sim, int comp, int leave);
+int sar_profile2score ( char *seq, int **profile);
+double sar_vs_iseq1( char *sar, int *seq, float gl, int **sim, char *best_aa);
+double sar_vs_seq1 ( char *sar, char *seq, float gl, int **sim, char *best_aa);
+double sar_vs_seq2 ( char *sar, char *seq, float ng, int **mat, char *a);
+double sar_vs_seq3 ( char *sar, char *seq, float ng, int **mat, char *a);
+
+double sar_vs_iseq4 ( char *sar, int *seq, float ng, int **mat, char *a);//supports an extended alphabet
+double sar_vs_seq4 ( char *sar, char *seq, float ng, int **mat, char *a);
+
+double sar_vs_seq5 ( char *sar, char *seq, float ng, int **mat, char *a);
+int make_sim_pred ( Alignment *A,Alignment *S, int comp, int seq);
+
+int **sar2profile_sim ( Alignment *A, Alignment *S, int **sim, int comp, int leave)
+{
+
+ int a, b, r, c, c1, c2, r1, r2, s, p;
+ int ***cache, **profile;
+
+
+ profile=declare_int (A->len_aln, 26);
+ cache=declare_arrayN (3,sizeof (int),2,A->len_aln, 26);
+
+ for ( a=0; a< A->len_aln; a++)
+ for ( b=0; b< A->nseq; b++)
+ {
+ r=tolower(A->seq_al[b][a]);
+ c=( S->seq_al[comp][b]=='I')?1:0;
+ if (b==leave || is_gap(r)) continue;
+ cache [c][a][r-'a']++;
+ }
+ for (a=0; a< A->nseq; a++)
+ {
+ if ( a==leave) continue;
+ for ( b=0; b< A->nseq; b++)
+ {
+ c1=(S->seq_al[comp][a]=='I')?1:0;
+ c2=(S->seq_al[comp][b]=='I')?1:0;
+ if ( b==leave || b==a || c1!=1 || c1==c2) continue;
+ s=sim[a][b];
+
+ for (p=0; p<A->len_aln; p++)
+ {
+ r1=tolower(A->seq_al[a][p]);
+ r2=tolower(A->seq_al[b][p]);
+ if ( is_gap(r1) || is_gap(r2) || r1==r2)continue;
+ r1-='a';r2-='a';
+ if (cache[1][p][r2])continue;
+ if ( s<50)continue;
+ profile[p][r2]-=s;
+ }
+ }
+ }
+
+ free_arrayN((void***)cache,3);
+ return profile;
+
+}
+int **sar2profile ( Alignment *A, Alignment *S, int comp, int leave)
+{
+
+ int a, b,c,r, n, v, npos=0;
+ int ***cache, **profile;
+ int ncat;
+ float n_gap, max_gap;
+ profile=declare_int (A->len_aln, 26);
+ cache=declare_arrayN (3,sizeof (int),2,A->len_aln, 26);
+
+
+
+ for ( n=0, a=0; a< A->nseq; a++)
+ {
+ if ( a==leave) continue;
+ else n+=(S->seq_al[comp][a]=='I')?1:0;
+ }
+
+ for ( a=0; a< A->len_aln; a++)
+ for ( b=0; b< A->nseq; b++)
+ {
+ r=tolower(A->seq_al[b][a]);
+ c=( S->seq_al[comp][b]=='I')?1:0;
+ if (b==leave) continue;
+ else if (is_gap(r))continue;
+ r-='a';
+ cache [c][a][r]++;
+ }
+
+ ncat=15; /*ncat: limit the analysis to columns containing less than ncat categories of aa*/
+ max_gap=0.05;
+ for (a=0; a< A->len_aln; a++)
+ {
+ for (n_gap=0,b=0; b< A->nseq; b++)
+ n_gap+=(is_gap(A->seq_al[b][a]));
+ n_gap/=(float)A->nseq;
+
+ if ( n_gap> max_gap)continue;
+
+ for (v=0,r=0; r< 26; r++)
+ {
+ if (cache [0][a][r] || cache[1][a][r])v++;
+ }
+
+ for (n=0,r=0; r< 26 && v<ncat; r++)
+ {
+ if (cache [0][a][r] && !cache[1][a][r])
+ {
+ n++;
+ profile[a][r]=-cache[0][a][r];
+ }
+ }
+ if (n) npos++;
+ }
+
+ free_arrayN((void***)cache,3);
+ return profile;
+
+}
+Alignment * filter_aln4sar0 ( Alignment *A, Alignment *S, int comp, int leave, char *mode)
+{
+ return copy_aln (A,NULL);
+}
+Alignment * filter_aln4sar1 ( Alignment *inA, Alignment *S, int comp, int leave, char *mode)
+{
+ Alignment *F, *A;
+ int a, b,c, i,r, n0, n1,g, score;
+ int ***cache, **list1, **list2;
+ int Delta;
+
+ int T1;
+
+ /*Keep only the positions where there are residues ONLY associated with 0 sequences*/
+
+ list1=declare_int ( inA->nseq, 2);
+ list2=declare_int ( inA->len_aln, 2);
+
+ cache=declare_arrayN (3,sizeof (int),inA->len_aln,2, 26);
+ F=copy_aln (inA, NULL);
+
+ A=copy_aln (inA, NULL);
+ A->nseq=strlen (S->seq_al[comp]);
+
+ strget_param (mode, "_T1_", "5", "%d", &T1);
+ for ( a=0; a< A->len_aln; a++)
+ {
+ n1=n0=g=0;
+ for (b=0; b< A->nseq; b++)
+ {
+ if ( b==leave) continue;
+ i=(S->seq_al[comp][b]=='I')?1:0;
+ r=tolower(A->seq_al[b][a]);
+ if ( r=='-')continue;
+ cache[a][i][r-'a']++;
+ }
+ }
+
+ for (a=0; a< A->nseq; a++)
+ for ( score=0,b=0; b<A->len_aln; b++)
+ {
+ r=tolower (A->seq_al[a][b]);
+ if ( is_gap(r))continue;
+ else if ( cache[b][0][r-'a'] && !cache[b][1][r-'a'])list1[a][0]++;
+ }
+
+ for (a=0; a< A->len_aln; a++)
+ {
+ for ( score=0,b=0; b< A->nseq; b++)
+ {
+ r=tolower (A->seq_al[b][a]);
+ if ( r=='-')continue;
+ else r-='a';
+ if ( cache[a][0][r] && !cache[a][1][r])score ++;
+ }
+ list2[a][0]=a;
+ list2[a][1]=score;
+ }
+ sort_int (list2, 2, 1, 0, F->len_aln-1);
+
+ Delta=A->len_aln/(100/T1);
+ for ( a=0; a< F->len_aln-Delta; a++)
+ {
+ b=list2[a][0];
+ for ( c=0; c<F->nseq; c++)
+ {
+ F->seq_al[c][b]='-';
+ }
+ }
+
+ ungap_aln (F);
+ free_aln (A);
+ free_arrayN ( (void ***)cache, 3);
+ free_arrayN ((void**)list1, 2);
+ free_arrayN ((void**)list2, 2);
+
+ return F;
+}
+Alignment * filter_aln4sar2 ( Alignment *inA, Alignment *S, int comp, int leave, char *mode)
+{
+ Alignment *F, *A;
+ int a,b,r,ncat;
+ int *cache;
+ int max_ncat=10;
+
+ /*Keep Low entropy columns that contain less than ncat categories of different amino acids*/
+ /*REmove columns containing 10% or more gaps*/
+
+ cache=vcalloc ( 500, sizeof (char));
+ F=copy_aln (inA, NULL);
+ A=copy_aln (inA, NULL);
+ A->nseq=strlen (S->seq_al[comp]);
+ for ( a=0; a< A->len_aln; a++)
+ {
+ for (ncat=0,b=0; b< A->nseq; b++)
+ {
+ if ( b==leave) continue;
+
+ r=tolower(A->seq_al[b][a]);
+ if ( !cache[r])ncat++;
+ cache[r]++;
+ }
+
+ if ( ncat <max_ncat && ((cache['-']*100)/A->nseq)<10)
+ {
+ ;
+ }
+ else
+ {
+ for (b=0; b<F->nseq; b++)
+ {
+ r=tolower(F->seq_al[b][a]);
+ F->seq_al[b][a]='-';
+ cache[r]=0;
+ }
+ }
+ for (b=0; b<A->nseq; b++)
+ {
+ r=tolower(A->seq_al[b][a]);
+ cache[r]=0;
+ }
+ }
+
+ free_aln (A);
+ ungap_aln (F);
+ vfree (cache);
+ return F;
+}
+
+Alignment * filter_aln4sar3 ( Alignment *inA, Alignment *S, int comp, int leave, char *mode)
+{
+ Alignment *F, *rA, *A;
+ int a, b,c;
+ int **list1;
+ char *bufS, *bufA;
+ int Delta;
+ int T3;
+
+ /*Keep the 10% positions most correlated with the 0/1 pattern*/
+
+ A=copy_aln (inA, NULL);
+ A->nseq=strlen (S->seq_al[comp]);
+ F=copy_aln (inA, NULL);
+ rA=rotate_aln (A, NULL);
+
+ strget_param (mode, "_T3_", "10", "%d", &T3);
+
+
+ list1=declare_int ( inA->len_aln, 2);
+ bufA=vcalloc ( A->nseq+1, sizeof (char));
+ bufS=vcalloc ( A->nseq+1, sizeof (char));
+
+ sprintf ( bufS, "%s", S->seq_al[comp]);
+ splice_out_seg(bufS,leave, 1);
+
+
+ for (a=0; a< A->len_aln; a++)
+ {
+ char aa;
+ list1[a][0]=a;
+ sprintf (bufA, "%s", rA->seq_al[a]);
+ splice_out_seg (bufA,leave,1);
+ list1[a][1]=(int)sar_vs_seq3 ( bufS, bufA,0,NULL, &aa);
+ }
+
+ sort_int (list1, 2, 1, 0, F->len_aln-1);
+ Delta=F->len_aln/(100/T3);
+ for ( a=0; a< F->len_aln-Delta; a++)
+ {
+ b=list1[a][0];
+
+ for ( c=0; c<F->nseq; c++)
+ {
+ F->seq_al[c][b]='-';
+ }
+
+ }
+ F->score_aln=list1[F->len_aln-1][1];
+ ungap_aln (F);
+
+ free_aln (rA);
+ free_aln(A);
+ free_arrayN ((void**)list1, 2);
+ vfree (bufS);vfree (bufA);
+ return F;
+}
+Alignment * filter_aln4sar4 ( Alignment *inA, Alignment *S, int comp, int leave, char *mode)
+{
+ Alignment *F, *A;
+ int a, b,c, i,r, n0, n1,g,score;
+ int ***cache, **list1, **list2;
+
+ /*Keep only the positions where there are residues ONLY associated with 0 sequences*/
+
+ list1=declare_int ( inA->nseq, 2);
+ list2=declare_int ( inA->len_aln, 2);
+
+ cache=declare_arrayN (3,sizeof (int),inA->len_aln,2, 26);
+ F=copy_aln (inA, NULL);
+ A=copy_aln (inA, NULL);
+ A->nseq=strlen (S->seq_al[comp]);
+
+ for ( a=0; a< A->len_aln; a++)
+ {
+ n1=n0=g=0;
+ for (b=0; b< A->nseq; b++)
+ {
+ if ( b==leave) continue;
+ i=(S->seq_al[comp][b]=='I')?1:0;
+ r=tolower(A->seq_al[b][a]);
+ if ( r=='-')continue;
+ cache[a][i][r-'a']++;
+ n1+=i;
+ }
+ }
+
+
+ for (a=0; a< A->len_aln; a++)
+ {
+ for ( score=0,b=0; b< A->nseq; b++)
+ {
+ r=tolower (F->seq_al[b][a]);
+ if ( r=='-')continue;
+ else r-='a';
+ if (cache[a][1][r]>=n1/2)score=1;
+ }
+ list2[a][0]=a;
+ list2[a][1]=score;
+ }
+
+
+ for ( a=0; a< F->len_aln; a++)
+ {
+ if ( list2[a][1]==1);
+ else
+ {
+ b=list2[a][0];
+ for ( c=0; c<F->nseq; c++)
+ {
+ F->seq_al[c][b]='-';
+ }
+ }
+ }
+ ungap_aln (F);
+ free_aln (A);
+ free_arrayN ( (void ***)cache, 3);
+ free_arrayN ((void**)list1, 2);
+ free_arrayN ((void**)list2, 2);
+
+ return F;
+}
+
+Alignment * filter_aln4sar5 ( Alignment *inA, Alignment *S, int comp, int leave, char *mode)
+{
+ Alignment *F, *rA, *A;
+ int a, b,c;
+ int **list1;
+ char *bufS, *bufA;
+ int max;
+ /*Look for the positions that show the best correlation between the sequence variation and the SAR*/
+
+ A=copy_aln (inA, NULL);
+ A->nseq=strlen (S->seq_al[comp]);
+
+ rA=rotate_aln (inA, NULL);
+ F=copy_aln (inA, NULL);
+
+ list1=declare_int ( A->len_aln, 2);
+ bufA=vcalloc ( A->nseq+1, sizeof (char));
+ bufS=vcalloc ( A->nseq+1, sizeof (char));
+
+
+
+ sprintf ( bufS, "%s", S->seq_al[comp]);
+ splice_out_seg(bufS,leave, 1);
+
+
+ for (a=0; a< A->len_aln; a++)
+ {
+ char aa;
+ list1[a][0]=a;
+ sprintf (bufA, "%s", rA->seq_al[a]);
+ splice_out_seg (bufA,leave,1);
+ list1[a][1]=(int)sar_vs_seq4 ( bufS, bufA,0,NULL, &aa);
+ }
+
+ sort_int (list1, 2, 1, 0, F->len_aln-1);
+ max=F->score=list1[F->len_aln-1][1];
+ max-=(max/10);
+
+
+ for ( a=0; a< F->len_aln-10; a++)
+ {
+
+ b=list1[a][0];
+
+ for ( c=0; c<F->nseq; c++)
+ {
+ F->seq_al[c][b]='-';
+ }
+
+ }
+ F->score_aln=10;
+ ungap_aln (F);
+ free_aln (inA);
+ free_aln (rA);
+ free_arrayN ((void**)list1, 2);
+ vfree (bufS);vfree (bufA);
+ return F;
+}
+
+int sar_profile2score ( char *seq, int **P)
+{
+ int a,r, l, score;
+
+ l=strlen (seq);
+ for ( score=0,a=0; a< l; a++)
+ {
+ r=seq[a];
+ if ( is_gap(r))continue;
+ score+=P[a][tolower(r)-'a'];
+ }
+ return score;
+}
+int make_sim_pred ( Alignment *A,Alignment *S, int comp, int seq)
+{
+ int a, b, i, r1, r2;
+ static float **cscore;
+ static float **tscore;
+
+ if ( !cscore)
+ {
+ cscore=declare_float (2, 2);
+ tscore=declare_float (2, 2);
+ }
+
+ for (a=0; a< 2; a++)for (b=0; b<2; b++)cscore[a][b]=tscore[a][b]=0;
+
+ for ( a=0; a<A->len_aln; a++)
+ {
+ r1=A->seq_al[seq][a];
+ if ( r1=='-') continue;
+ else
+ {
+ for ( b=0; b< A->nseq; b++)
+ {
+ if (b==seq) continue;
+ else
+ {
+ r2=A->seq_al[b][a];
+ if (r2=='-')continue;
+ else
+ {
+
+ i=(S->seq_al[comp][b]=='I')?1:0;
+ cscore[i][0]+=(r1==r2)?1:0;
+ cscore[i][1]++;
+ }
+ }
+ }
+
+ for (i=0; i<2; i++)
+ {
+ cscore[i][0]/=(cscore[i][1]==0)?1:cscore[i][1];
+ tscore[i][0]+=cscore[i][0];tscore[i][1]++;
+ cscore[i][0]=cscore[i][1]=0;
+ }
+ }
+ }
+
+ fprintf ( stdout, "\nn\t 1: %.2f 0: %.2f", tscore[1][0],tscore[0][0]);
+ return ( tscore[1][0]>=tscore[0][0])?1:0;
+}
+
+
+Alignment * sar_analyze (Alignment *inA, Alignment *inS, char *mode)
+{
+ int ***sim,***glob_results, ***comp_results;
+ int *count;
+ int a,b,c,m;
+ float *tot2;
+ Alignment *A=NULL,*S=NULL,*F, *SUBSET;
+ char *subset, *target;
+ int jack, T, filter;
+ filter_func *ff;
+ int n_methods=0;
+ char *prediction, *reliability;
+ int pred_start=0, pred_end, ref_start=0, ref_end;
+ int display, CSV=1, NONCSV=0;
+ char method[5];
+
+ strget_param (mode, "_METHOD_", "1111", "%s_", method);
+ ff=vcalloc (6,sizeof (filter_func));
+ if (method[0]=='1')ff[n_methods++]=filter_aln4sar0;
+ if (method[1]=='1')ff[n_methods++]=filter_aln4sar1;
+ if (method[2]=='1')ff[n_methods++]=filter_aln4sar2;
+ if (method[3]=='1')ff[n_methods++]=filter_aln4sar3;
+ /*
+ ff[n_methods++]=filter_aln4sar4;
+ ff[n_methods++]=filter_aln4sar5;
+ */
+ sim=vcalloc (n_methods, sizeof (int**));
+
+
+ tot2=vcalloc ( 10, sizeof (float));
+ subset=vcalloc ( 100, sizeof (char));
+ target=vcalloc ( 100, sizeof (char));
+
+ strget_param (mode, "_TARGET_", "no", "%s_", target);
+ strget_param (mode, "_SUBSET_", "no", "%s_", subset);
+ strget_param (mode, "_JACK_", "0", "%d", &jack);
+ strget_param (mode, "_T_", "0", "%d", &T);
+ strget_param (mode, "_FILTER_", "11", "%d", &filter);
+ strget_param (mode, "_DISPLAY_", "0", "%d", &display);
+
+
+
+ if ( !strm (target, "no"))
+ {
+ Alignment *T;
+ T=main_read_aln(target, NULL);
+ if ( T->len_aln !=inA->len_aln )
+ {
+ printf_exit ( EXIT_FAILURE,stderr, "Error: %s is incompatible with the reference alignment [FATAL:%s]",target,PROGRAM);
+ }
+
+ inA=stack_aln (inA, T);
+
+ }
+
+ if ( !strm(subset, "no"))
+ {
+ SUBSET=main_read_aln (subset, NULL);
+ sarset2subsarset ( inA, inS, &A, &S, SUBSET);
+ }
+ else
+ {
+ A=inA;
+ S=inS;
+ }
+
+
+ prediction=vcalloc ( n_methods+1, sizeof (char));
+ reliability=vcalloc ( n_methods+1, sizeof (char));
+
+ glob_results=declare_arrayN(3, sizeof (int), n_methods*2, 2, 2);
+
+ count=vcalloc (S->nseq, sizeof (int));
+ for (a=0; a<S->nseq; a++)
+ {
+ int l;
+ l=strlen (S->seq_al[a]);
+ for ( b=0; b<l; b++)
+ count[a]+=(S->seq_al[a][b]=='I')?1:0;
+ }
+ if ( display==CSV)
+ {fprintf ( stdout, "\nCompound %s ; Ntargets %d", S->name[a],count[a]);
+ pred_start=(strlen (S->seq_al[0])==A->nseq)?0:strlen (S->seq_al[0]);
+ pred_end=A->nseq;
+ for (a=pred_start; a< pred_end; a++)
+ fprintf ( stdout, ";%s", A->name[a]);
+ fprintf ( stdout, ";npred;");
+ }
+
+
+ for (a=0; a<S->nseq; a++)
+ {
+ int n_pred;
+ comp_results=declare_arrayN(3, sizeof (int), n_methods*2, 2, 2);
+
+ pred_start=(strlen (S->seq_al[a])==A->nseq)?0:strlen (S->seq_al[a]);
+ pred_end=A->nseq;
+ if ( display==CSV)fprintf ( stdout, "\n%s;%d", S->name[a],count[a]);
+
+ for (n_pred=0,b=pred_start; b<pred_end;b++)
+ {
+ int t, score=0,pred, real;
+
+ if ( display==NONCSV)fprintf ( stdout, "\n>%-15s %10s %c ", S->name[a], A->name[b], (pred_start==0)?S->seq_al[a][b]:'?');
+ if (jack || b==pred_start)
+ {
+ for (m=0; m<n_methods; m++)
+ {
+ free_int (sim[m], -1);
+ F=(ff[m]) (A,S,a,(jack==0)?-1:b, mode);
+ sim[m]=aln2sim_mat(F, "idmat");
+ free_aln (F);
+ }
+ }
+
+ for (m=0; m<n_methods; m++)
+ {
+ int Nbsim=0,Ybsim=0,bsim=0;
+ ref_start=0;
+ ref_end=strlen (S->seq_al[m]);
+
+ for (c=ref_start;c<ref_end; c++)
+ {
+ if ( b==c) continue;
+ else if ( S->seq_al[a][c]=='O')
+ {
+ Nbsim=MAX(Nbsim,sim[m][b][c]);
+ }
+ else
+ {
+ Ybsim=MAX(Ybsim,sim[m][b][c]);
+ }
+ }
+
+ bsim=(Ybsim>Nbsim)?Ybsim:-Nbsim;
+ pred=(bsim>0)?1:0;
+ real=(S->seq_al[a][b]=='O')?0:1;
+ comp_results[m][pred][real]++;
+ glob_results[m][pred][real]++;
+ score+=pred;
+ prediction[m]=pred+'0';
+ reliability[m]=(FABS((Ybsim-Nbsim))-1)/10+'0';
+ }
+
+ if ( score>0)n_pred++;
+ prediction[m]=reliability[m]='\0';
+ if (display==NONCSV)fprintf ( stdout, "Compound_Count:%d primary_predictions: %s Total: %d", count[a],prediction, score);
+ else if ( display==CSV)fprintf ( stdout, ";%d", score);
+ for (t=0; t<n_methods; t++)
+ {
+ if (score>t)
+ {
+ comp_results[t+n_methods][1][real]++;
+ glob_results[t+n_methods][1][real]++;
+ }
+ else
+ {
+ comp_results[t+n_methods][0][real]++;
+ glob_results[t+n_methods][0][real]++;
+ }
+ }
+ }
+ if ( display==NONCSV)
+ {if ( pred_start==0)display_prediction (comp_results, S,a, n_methods*2);}
+ else fprintf (stdout, ";%d;",n_pred);
+ }
+ if ( display==NONCSV)if (pred_start==0)display_prediction (glob_results, S,-1, n_methods*2);
+
+
+ myexit (EXIT_SUCCESS);
+}
+float display_prediction (int ***count, Alignment *S, int c, int n)
+{
+ float tp,tn,fn,fp,sp,sn,sn2;
+ int a, nm;
+
+ nm=n/2;
+
+ for (a=0; a<n; a++)
+ {
+ tp=count[a][1][1];
+ tn=count[a][0][0];
+ fp=count[a][1][0];
+ fn=count[a][0][1];
+
+ sn2=tp/(tp+fp);
+ sn=tp/(tp+fn);
+ sp=tn/(tn+fp);
+ if ( a<nm)fprintf ( stdout, "\n>#Method %d Compound %15s sp=%.2f sn=%.2f sn2=%.2f",a, (c==-1)?"TOTAL":S->name[c],sp, sn, sn2 );
+ else fprintf ( stdout, "\n>#Combined: T=%d Compound %15s sp=%.2f sn=%.2f sn2=%.2f",a-nm, (c==-1)?"TOTAL":S->name[c],sp, sn, sn2 );
+ }
+ fprintf ( stdout, "\n");
+ return 0;
+}
+
+float display_prediction_2 (int **prediction, int n,Alignment *A, Alignment *S, int field)
+{
+ int a, t, T;
+ float max_sn, max_sp;
+
+ if ( field==17 || field ==18)
+ {
+ printf_exit ( EXIT_FAILURE, stderr, "\nERROR: Do not use filed %d in display_prediction", field);
+ }
+
+ sort_int_inv ( prediction, 10,field, 0, n-1);
+ for (t=0,a=0; a<n; a++)
+ {
+ t+=prediction[a][3];
+ prediction[a][17]=t;
+ }
+
+ for (t=0,a=n-1; a>=0; a--)
+ {
+ prediction[a][18]=t;
+ t+=prediction[a][3];
+ }
+
+ max_sn=max_sp=T=0;
+ for (a=0; a<n; a++)
+ {
+ float tp, fn, fp, sp, sn;
+
+ tp=prediction[a][17];
+ fn=prediction[a][18];
+ fp=(a+1)-tp;
+
+ sp=((tp+fp)==0)?0:tp/(tp+fp);
+ sn=((tp+fn)==0)?0:tp/(tp+fn);
+
+ if (sp>0.8)
+ {
+ if (sn>max_sn)
+ {
+ max_sn=sn;
+ max_sp=sp;
+
+ T=prediction[a][field];
+ }
+ }
+ }
+ if (max_sn==0)
+ fprintf (stdout, "\n T =%d SN=%.2f SP= %.2f",T,max_sn,max_sp);
+ else
+ fprintf (stdout, "\n T =%d SN=%.2f SP= %.2f",T,max_sn,max_sp);
+
+ return max_sn;
+}
+
+
+/************************************************************************************/
+/* NEW ANALYZE : SAR */
+/************************************************************************************/
+float** cache2pred1 (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode);
+float** cache2pred2 (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode);
+float** cache2pred3 (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode);
+float** cache2pred4 (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode);
+float** cache2pred5 (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode);
+float** cache2pred_new (Alignment *A,int**cache, int *ns, int **ls, Alignment *S, char *compound, char *mode);
+
+int **sar2cache_adriana ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode);
+int **sar2cache_proba_old ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode);
+int **sar2cache_count1 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode);
+int **sar2cache_count2 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode);
+int **sar2cache_count3 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode);
+
+int **sar2cache_proba_new ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode);
+int **sar2cache_proba2 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode);
+int **analyze_sar_compound1 ( char *name, char *seq, Alignment *A, char *mode);
+int **analyze_sar_compound2 ( char *name, char *seq, Alignment *A, char *mode);
+
+int aln2n_comp_col ( Alignment *A, Alignment *S, int ci);
+
+
+
+
+int ***simple_sar_analyze_vot ( Alignment *inA, Alignment *SAR, char *mode);
+int ***simple_sar_analyze_col ( Alignment *inA, Alignment *SAR, char *mode);
+
+
+
+int sarset2subsarset ( Alignment *A, Alignment *S, Alignment **subA, Alignment **subS, Alignment *SUB);
+int benchmark_sar (int v);
+int aln2jack_group1 (Alignment *A, int seq, int **l1, int *nl1, int **l2, int *nl2);
+int aln2jack_group2 (Alignment *A, int seq, int **l1, int *nl1, int **l2, int *nl2);
+int aln2jack_group3 (Alignment *A, char *sar_seq, int **l1, int *nl1, int **l2, int *nl2);
+float** jacknife5 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode);
+float** jacknife6 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode);
+
+int process_cache ( Alignment *A,Alignment *S, int ***Cache, char *mode);
+Alignment *analyze_compounds (Alignment *A, Alignment *S, char *mode);
+
+Alignment *analyze_compounds (Alignment *A, Alignment *S, char *mode)
+{
+ int a, b, c, tot, n;
+ int **sim;
+ int sar1, sar2;
+
+ sim=aln2sim_mat (A, "idmat");
+ for (a=0; a< S->nseq; a++)
+ {
+ for (n=0, tot=0, b=0; b< A->nseq-1; b++)
+ {
+ sar1=(S->seq_al[a][b]=='I')?1:0;
+ for ( c=b+1; c<A->nseq; c++)
+ {
+ sar2=(S->seq_al[a][c]=='I')?1:0;
+
+ if (sar1 && sar2)
+ {
+ tot+=sim[b][c];
+ n++;
+ }
+ }
+ }
+ fprintf ( stdout, ">%-10s CMPSIM: %.2f\n", S->name[a],(float)tot/(float)n);
+ }
+ free_int (sim, -1);
+ return A;
+}
+
+int print_seq_pos ( int pos, Alignment *A, char *seq);
+int abl1_evaluation (int p);
+int print_seq_pos ( int pos, Alignment *A, char *seq)
+{
+ int a, b, s;
+
+ s=name_is_in_list (seq, A->name, A->nseq, MAXNAMES);
+ fprintf ( stdout, "S=%d", s);
+
+ for (b=0,a=0; a<pos; a++)
+ {
+ if (!is_gap (A->seq_al[s][a]))b++;
+ }
+ fprintf ( stdout, "Pos %d SEQ %s: %d ", pos+1, seq, b+246);
+ if ( strm ( seq, "ABL1")) fprintf ( stdout , "PT: %d", abl1_evaluation (b+246));
+ return 0;
+}
+
+int process_cache ( Alignment *A,Alignment *S, int ***Cache, char *mode)
+{
+ int a, b;
+ int **pos, **pos2;
+ int **C;
+ int ab1, *ab1_pos;
+ int weight_mode;
+
+ strget_param ( mode, "_WEIGHT_", "1", "%d", &weight_mode);
+ pos=declare_int(A->len_aln+1,2);
+ pos2=declare_int (A->len_aln+1,S->nseq);
+ for (a=0; a<S->nseq; a++)
+ {
+ C=Cache[a];
+ for (b=0; b< A->len_aln; b++)
+ {
+ pos[b][0]+=C[26][b];
+ if ( C[26][b]>0)
+ {
+ pos[b][1]++;
+ pos2[b][a]=1;
+ }
+ }
+ }
+
+ C=Cache[0];
+ ab1=name_is_in_list ("ABL1", A->name, A->nseq,100);
+ ab1_pos=vcalloc (A->len_aln+1, sizeof (int));
+
+ for ( b=0,a=0; a< A->len_aln; a++)
+ {
+ if ( A->seq_al[ab1][a]=='-')ab1_pos[a]=-1;
+ else ab1_pos[a]=++b;
+ }
+
+ for ( a=0; a< A->len_aln; a++)
+ {
+ fprintf ( stdout, "\n%4d %5d %5d %5d [%c] [%2d] ALN", a+1, pos[a][0], pos[a][1], ab1_pos[a]+246,A->seq_al[ab1][a],abl1_evaluation (ab1_pos[a]+246));
+ for ( b=0; b< S->nseq; b++)fprintf ( stdout, "%d", pos2[a][b]);
+ }
+ return 1;
+}
+int abl1_evaluation (int p)
+{
+ if ( p==248) return 10;
+ if ( p==250) return 10;
+ if ( p==253) return 10;
+ if ( p==254) return 10;
+ if ( p==255) return 9;
+ if ( p==256) return 10;
+ if ( p==257) return 5;
+ if ( p==258) return 8;
+ if ( p==269) return 8;
+ if ( p==291) return 4;
+ if ( p==294) return 8;
+ if ( p==299) return 10;
+ if ( p==306) return 0;
+ if ( p==314) return 9;
+ if ( p==315) return 10;
+ if ( p==318) return 10;
+
+ if ( p==319) return 10;
+ if ( p==321) return 10;
+ if ( p==323) return 0;
+ if ( p==324) return 0;
+ if ( p==339) return 0;
+ if ( p==340) return 0;
+ if ( p==355) return 5;
+ if ( p==364) return 10;
+
+ if ( p==366) return 0;
+ if ( p==368) return 10;
+ if ( p==370) return 10;
+ if ( p==372) return 0;
+ if ( p==378) return 8;
+ if ( p==382) return 10;
+
+ if ( p==384) return 10;
+ if ( p==387) return 10;
+ if ( p==395) return 8;
+
+ if ( p==398) return 8;
+ if ( p==399) return 8;
+ if ( p==400) return 8;
+ if ( p==403) return 0;
+ if ( p==416) return 8;
+ if ( p==419) return 5;
+ if ( p>400) return 0;
+ return -1;
+}
+float** cache2pred1 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode)
+{
+ int s1, s2, seq1, seq2, r1, r2,col, pred, real, ci;
+ double score, max, id, m;
+ float **R, T;
+
+
+ int used_col, used_res,is_used_col, n_res=0;
+ int weight_mode;
+ /*Predict on ns[1] what was trained on ns[0]*/
+
+ strget_param ( mode, "_THR_", "0.09", "%f", &T);
+ strget_param ( mode, "_WEIGHT_", "0", "%d", &weight_mode);
+
+ R=declare_float (2, 2);
+ ci=name_is_in_list ( compound, S->name, S->nseq, -1);
+
+
+
+ for (s1=0; s1<ns[1]; s1++)
+ {
+ int v;
+ seq1=ls[1][s1];
+
+ for (max=0,score=0, col=0; col<A->len_aln; col++)
+ {
+ int max1;
+ r1=tolower (A->seq_al[seq1][col]);
+ for (max1=0,id=0, m=0,s2=0; s2<ns[0]; s2++)
+ {
+ seq2=ls[0][s2];
+ if ( S->seq_al[ci][seq2]=='O')continue;
+ if ( cache[seq2][col]==0 && !is_gap( A->seq_al[seq2][col]))continue;
+
+ r2=tolower ( A->seq_al[seq2][col]);
+ if ( is_gap(r2))continue;
+
+ v=(cache[seq2][col]>0 && weight_mode==1)?cache[seq2][col]:1;
+
+ max+=v;
+ if ( r2==r1)
+ {
+ score+=v;
+ }
+
+ }
+
+ }
+ pred=(( score/max) >T)?1:0;
+ real=(S->seq_al[ci][seq1]=='I')?1:0;
+ R[pred][real]++;
+
+ fprintf ( stdout, "\n>%s %d%d SCORE %.2f C %s [SEQ]\n", A->name[seq1],real, pred, (float)score/(float)max, compound);
+ }
+
+ for (used_col=0,used_res=0,col=0; col<A->len_aln; col++)
+ {
+ for (is_used_col=0,s2=0; s2<ns[0]; s2++)
+ {
+ seq2=ls[0][s2];
+ if ( cache[seq2][col]==0 && !is_gap(A->seq_al[seq2][col]))n_res++;
+ else if (is_gap(A->seq_al[seq2][col]));
+ else
+ {
+ is_used_col=1;
+ used_res++;
+ }
+ }
+ used_col+=is_used_col;
+ }
+ fprintf ( stdout, "\n>%s USED_POSITIONS: COL: %.2f RES: %.2f COMP\n", S->name[ci], (float)used_col/(float)A->len_aln, (float)used_res/(float) n_res);
+
+ return R;
+}
+
+float** cache2pred2 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode)
+{
+ int s1, s2, seq1, seq2, r1, r2,col, pred, real, ci;
+ double score, max;
+ float **R, T;
+
+
+ int used_col, used_res,is_used_col, n_res=0;
+ /*Predict on ns[1] what was trained on ns[0]*/
+
+ strget_param ( mode, "_THR_", "0.5", "%f", &T);
+
+
+ R=declare_float (2, 2);
+ ci=name_is_in_list ( compound, S->name, S->nseq, -1);
+
+ for (s1=0; s1<ns[1]; s1++)
+ {
+ int v;
+ seq1=ls[1][s1];
+ fprintf ( stdout, "\n");
+ for (max=0,score=0, col=0; col<A->len_aln; col++)
+ {
+ int used;
+
+ r1=tolower (A->seq_al[seq1][col]);
+ for (used=0,s2=0; s2<ns[0]; s2++)
+ {
+ seq2=ls[0][s2];
+
+ if ( S->seq_al[ci][seq2]=='O')continue;
+ if ( cache[seq2][col]==0 && !is_gap( A->seq_al[seq2][col]))continue;
+
+
+ r2=tolower ( A->seq_al[seq2][col]);
+ if ( is_gap(r2))continue;
+
+ v=cache[seq2][col];
+ if ( r2==r1){score+=v;}
+ used=1;
+ max+=v;
+ }
+ if (used) fprintf ( stdout, "%c", r1);
+ }
+
+ pred=(( score/max) >T)?1:0;
+ real=(S->seq_al[ci][seq1]=='I')?1:0;
+ R[pred][real]++;
+
+ fprintf ( stdout, "PSEQ: %-10s SC: %4d MAX: %4d S: %.2f R: %4d", A->name[seq1],(int)score, (int)max, (float)score/max,real);
+
+ }
+
+ for (used_col=0,used_res=0,col=0; col<A->len_aln; col++)
+ {
+ for (is_used_col=0,s2=0; s2<ns[0]; s2++)
+ {
+ seq2=ls[0][s2];
+ if ( cache[seq2][col]==0 && !is_gap(A->seq_al[seq2][col]))n_res++;
+ else if (is_gap(A->seq_al[seq2][col]));
+ else
+ {
+ is_used_col=1;
+ used_res++;
+ }
+ }
+ used_col+=is_used_col;
+ }
+ fprintf ( stdout, "\n>%s USED_POSITIONS: COL: %.2f RES: %.2f COMP\n", S->name[ci], (float)used_col/(float)A->len_aln, (float)used_res/(float) n_res);
+
+ return R;
+}
+
+float** cache2pred3 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode)
+{
+ int s1, s2, seq1, seq2, r1, r2,col, pred, real, ci, a, n;
+ double score, max;
+ float **R, T;
+
+
+
+ int tp, tn, fn, fp;
+ int best_tp, best_fp;
+ int delta, best_delta;
+ int **list;
+
+ /*Predict on ns[1] what was trained on ns[0]*/
+
+ strget_param ( mode, "_THR_", "0.5", "%f", &T);
+
+
+ R=declare_float (2, 2);
+ ci=name_is_in_list ( compound, S->name, S->nseq, -1);
+ list=declare_int ( ns[1],3);
+
+ for (s1=0; s1<ns[1]; s1++)
+ {
+ int v;
+ seq1=ls[1][s1];
+
+ for (max=0,score=0, col=0; col<A->len_aln; col++)
+ {
+ int used;
+
+ r1=tolower (A->seq_al[seq1][col]);
+ for (used=0,s2=0; s2<ns[0]; s2++)
+ {
+ seq2=ls[0][s2];
+
+ if ( S->seq_al[ci][seq2]=='O')continue;
+ if ( cache[seq2][col]==0 && !is_gap( A->seq_al[seq2][col]))continue;
+
+
+ r2=tolower ( A->seq_al[seq2][col]);
+ if ( is_gap(r2))continue;
+
+ v=cache[seq2][col];
+ if ( r2==r1){score+=v;}
+ used=1;
+ max+=v;
+ }
+ }
+
+
+
+ pred=(( score/max) >T)?1:0;
+ real=(S->seq_al[ci][seq1]=='I')?1:0;
+
+ list[s1][0]=real;
+ list[s1][1]=(int)((score/max)*(float)1000);
+ list[s1][2]=seq1;
+
+
+
+ }
+ sort_int_inv (list, 3, 1, 0, ns[1]-1);
+
+ for ( a=0; a<ns[1]; a++)
+ {
+ seq1=list[a][2];
+ fprintf ( stdout, "PSEQ: %-10s SC: %5d R: %4d\n", A->name[seq1],list[a][0], list[a][1]);
+ }
+
+ for (n=0, a=0; a<ns[1]; a++)n+=list[a][0];
+ for (best_delta=100000,best_tp=0,tp=0,fp=0,best_fp=0,a=0; a<ns[1]; a++)
+ {
+ tp+=list[a][0];
+ fp+=1-list[a][0];
+ delta=(n-(tp+fp));
+ if (FABS(delta)<best_delta)
+ {
+ best_delta=delta;
+ best_tp=tp;
+ best_fp=fp;
+ }
+ }
+ /*R[pred][real]*/
+ tp=best_tp;
+ fp=best_fp;
+ fn=n-tp;
+ tn=ns[1]-(tp+fp+fn);
+ R[1][1]=tp;
+ R[1][0]=fp;
+ R[0][1]=fn;
+ R[0][0]=tn;
+ free_int (list, -1);
+ return R;
+}
+float** cache2pred4 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode)
+{
+ int s1, s2, seq1, seq2, ci, a,b, c, n;
+ double score;
+ float **R;
+
+
+ int tp, tn, fn, fp;
+ int best_tp, best_fp;
+ int delta, best_delta;
+ int **list;
+ int **sim;
+ int *ul;
+ int nused=0;
+
+ /*Predict on ns[1] what was trained on ns[0]*/
+ /*Identify interesting coloumns*/
+ ul=vcalloc ( A->len_aln, sizeof (int));
+ for (a=0; a< A->len_aln; a++)
+ for ( b=0; b< A->nseq; b++)
+ if ( cache[b][a])ul[nused++]=a;
+
+ /*compute the similarity on the used columns*/
+
+ R=declare_float (2, 2);
+ sim=declare_int (A->nseq, A->nseq);
+ for (a=0; a< A->nseq; a++)
+ for ( b=0; b< A->nseq; b++)
+ {
+ for (c=0; c< nused; c++)
+ {
+ if ( A->seq_al[a][ul[c]]==A->seq_al[b][ul[c]])sim[a][b]++;
+ }
+ sim[a][b]=(sim[a][b]*100)/nused;
+ }
+ vfree (ul);
+
+
+
+
+ ci=name_is_in_list ( compound, S->name, S->nseq, -1);
+ list=declare_int ( ns[1],2);
+
+ for (s1=0; s1<ns[1]; s1++)
+ {
+
+ seq1=ls[1][s1];
+
+ for (score=0,s2=0; s2<ns[0]; s2++)
+ {
+ seq2=ls[0][s2];
+
+ if ( seq1==seq2)continue;
+ if (S->seq_al[ci][seq2]=='I')score=MAX(score, sim[seq1][seq2]);
+ }
+ list[s1][0]=(S->seq_al[ci][seq1]=='I')?1:0;
+ list[s1][1]=(int)score;
+
+ }
+ sort_int_inv (list, 2, 1, 0, ns[1]-1);
+
+ for (n=0, a=0; a<ns[1]; a++)n+=list[a][0];
+ for (best_delta=100000,best_tp=0,tp=0,fp=0,best_fp=0,a=0; a<ns[1]; a++)
+ {
+ tp+=list[a][0];
+ fp+=1-list[a][0];
+ delta=(n-(tp+fp));
+ if (FABS(delta)<best_delta)
+ {
+ best_delta=delta;
+ best_tp=tp;
+ best_fp=fp;
+ }
+ }
+ /*R[pred][real]*/
+ tp=best_tp;
+ fp=best_fp;
+ fn=n-tp;
+ tn=ns[1]-(tp+fp+fn);
+ R[1][1]=tp;
+ R[1][0]=fp;
+ R[0][1]=fn;
+ R[0][0]=tn;
+ free_int (list, -1);
+ free_int (sim, -1);
+ return R;
+}
+
+float** cache2pred5 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode)
+{
+ int s1, s2, seq1, seq2, ci, a, n;
+ double score;
+ float **R;
+
+
+
+ int tp, tn, fn, fp;
+ int best_tp, best_fp;
+ int delta, best_delta;
+ int **list;
+ static int **sim;
+
+ /*Predict on ns[1] what was trained on ns[0]*/
+
+ R=declare_float (2, 2);
+
+ if ( sim==NULL)
+ sim=aln2sim_mat (A, "idmat");
+
+
+
+ ci=name_is_in_list ( compound, S->name, S->nseq, -1);
+ list=declare_int ( ns[1],2);
+
+ for (s1=0; s1<ns[1]; s1++)
+ {
+
+ seq1=ls[1][s1];
+
+ for (score=0,s2=0; s2<ns[0]; s2++)
+ {
+ seq2=ls[0][s2];
+
+ if ( seq1==seq2)continue;
+ if (S->seq_al[ci][seq2]=='I')score=MAX(score, sim[seq1][seq2]);
+ }
+ list[s1][0]=(S->seq_al[ci][seq1]=='I')?1:0;
+ list[s1][1]=(int)score;
+
+ }
+ sort_int_inv (list, 2, 1, 0, ns[1]-1);
+
+ for (n=0, a=0; a<ns[1]; a++)n+=list[a][0];
+ for (best_delta=100000,best_tp=0,tp=0,fp=0,best_fp=0,a=0; a<ns[1]; a++)
+ {
+ tp+=list[a][0];
+ fp+=1-list[a][0];
+ delta=(n-(tp+fp));
+ if (FABS(delta)<best_delta)
+ {
+ best_delta=delta;
+ best_tp=tp;
+ best_fp=fp;
+ }
+ }
+ /*R[pred][real]*/
+ tp=best_tp;
+ fp=best_fp;
+ fn=n-tp;
+ tn=ns[1]-(tp+fp+fn);
+ R[1][1]=tp;
+ R[1][0]=fp;
+ R[0][1]=fn;
+ R[0][0]=tn;
+ free_int (list, -1);
+ return R;
+}
+
+float** jacknife5 (Alignment*A,int **cacheIN, int *ns, int **ls, Alignment *S, char *compound, char *mode)
+{
+ int seq1, ci, a,b, c, n;
+ double score, max_score;
+ float **R;
+
+
+ int tp, tn, fn, fp;
+ int best_tp, best_fp;
+ int delta, best_delta;
+ int **list;
+ int **cache;
+
+ ci=name_is_in_list ( compound, S->name, S->nseq, -1);
+ list=declare_int (A->nseq,2);
+ R=declare_float (2, 2);
+
+
+ for ( a=0; a<A->nseq; a++)
+ {
+ int real, res;
+
+ ns[0]=A->nseq-1;
+ ns[1]=1;
+ for (c=0,b=0; b<A->nseq; b++)
+ if (a!=b)ls[0][c++]=b;
+ ls[1][0]=a;
+
+
+ cache=sar2cache_count1 (A, ns, ls,S, compound, mode);
+ for (b=0; b<=26; b++)
+ for ( c=0; c< A->len_aln; c++)
+ cacheIN[b][c]+=cache[b][c];
+
+ seq1=a;
+ real=(S->seq_al[ci][seq1]=='I')?1:0;
+ fprintf ( stdout, ">%-10s %d ", A->name[seq1], real);
+
+
+
+ for (max_score=0,b=0; b<A->len_aln; b++)
+ max_score+=cache[26][b];
+
+ for (score=0,b=0; b<A->len_aln; b++)
+ {
+ res=tolower (A->seq_al[seq1][b]);
+ if ( cache[26][b]==0) continue;
+ if ( !is_gap(res))
+ {
+ score+=cache[res-'a'][b];
+ }
+ /*fprintf ( stdout, "%c[%3d]", res,b);*/
+ }
+ fprintf ( stdout, " SCORE: %5d SPRED %d RATIO: %.2f \n", (int)score, a, (score*100)/max_score);
+ list[a][0]=real;
+
+ if ( strstr (mode, "SIMTEST"))list[a][1]=(score*100)/max_score;
+ else list[a][1]=(score*100)/max_score;
+ free_int (cache, -1);
+ }
+
+
+ sort_int_inv (list, 2, 1, 0, A->nseq-1);
+ for (n=0, a=0; a<A->nseq; a++)
+ {
+ n+=list[a][0];
+ }
+
+ for (best_delta=100000,best_tp=0,tp=0,fp=0,best_fp=0,a=0; a<A->nseq; a++)
+ {
+
+ tp+=list[a][0];
+ fp+=1-list[a][0];
+ delta=(n-(tp+fp));
+ if (FABS(delta)<best_delta)
+ {
+ best_delta=delta;
+ best_tp=tp;
+ best_fp=fp;
+ }
+ }
+
+ /*R[pred][real]*/
+ tp=best_tp;
+ fp=best_fp;
+ fn=n-tp;
+ tn=A->nseq-(tp+fp+fn);
+ R[1][1]=tp;
+ R[1][0]=fp;
+ R[0][1]=fn;
+ R[0][0]=tn;
+ free_int (list, -1);
+
+ return R;
+}
+float** jacknife6 (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode)
+{
+ int seq1, ci, a,b, c,d,e,f, n;
+ double score;
+ float **R;
+
+
+ int tp, tn, fn, fp;
+ int best_tp, best_fp;
+ int delta, best_delta;
+ int **list;
+
+ ci=name_is_in_list ( compound, S->name, S->nseq, -1);
+ list=declare_int (A->len_aln,2);
+ R=declare_float (2, 2);
+
+
+ for ( a=0; a<A->nseq; a++)
+ {
+ int sar, res;
+ int **new_cache;
+
+ ns[0]=A->nseq-1;
+ ns[1]=1;
+ for (c=0,b=0; b<A->nseq; b++)
+ if (a!=b)ls[0][c++]=b;
+ ls[1][0]=a;
+
+ cache=sar2cache_proba_new (A, ns, ls,S, compound, mode);
+
+
+ new_cache=declare_int (27,A->len_aln);
+
+ for (d=0; d< A->len_aln; d++)
+ {
+ int **analyze;
+ if ( cache[26][d]==0)continue;
+ analyze=declare_int (26, 2);
+
+ for ( e=0; e< ns[0]; e++)
+ {
+ f=ls[0][e];
+ sar=(S->seq_al[ci][f]=='I')?1:0;
+ res=tolower (A->seq_al[f][d]);
+
+ if ( res=='-') continue;
+ analyze[res-'a'][sar]++;
+ }
+ for (e=0;e<26; e++)
+ {
+ if ( analyze[e][1]){new_cache[26][d]=1;new_cache[e][d]+=cache[e][d];}
+ /*
+ if ( analyze[e][0] && analyze[e][1]){new_cache[26][d]=1;new_cache[e][d]+=analyze[e][1];}
+ else if ( analyze[e][0]){new_cache[26][d]=1;new_cache[e][d]-=analyze[e][0]*10;}
+ else if ( analyze[e][1]){new_cache[26][d]=1;new_cache[e][d]+=analyze[e][1];}
+ else if ( !analyze[e][0] &&!analyze[e][1]);
+ */
+ }
+ free_int (analyze, -1);
+ }
+
+ seq1=a;
+ sar=(S->seq_al[ci][seq1]=='I')?1:0;
+ fprintf ( stdout, ">%-10s %d ", A->name[seq1], sar);
+
+ for (score=0,b=0; b<A->len_aln; b++)
+ {
+ res=tolower (A->seq_al[seq1][b]);
+ if ( cache[26][b]==0) continue;
+ if ( !is_gap(res))
+ {
+ score+=new_cache[res-'a'][b];
+ }
+ }
+ fprintf ( stdout, " SCORE: %5d SPRED\n", (int)score);
+ list[seq1][0]=sar;
+ list[seq1][1]=(int)score;
+
+ free_int (new_cache, -1);
+ free_int (cache, -1);
+ }
+ sort_int_inv (list, 2, 1, 0, A->nseq-1);
+ for (n=0, a=0; a<A->nseq; a++)n+=list[a][0];
+ for (best_delta=100000,best_tp=0,tp=0,fp=0,best_fp=0,a=0; a<A->nseq; a++)
+ {
+ tp+=list[a][0];
+ fp+=1-list[a][0];
+ delta=(n-(tp+fp));
+ if (FABS(delta)<best_delta)
+ {
+ best_delta=delta;
+ best_tp=tp;
+ best_fp=fp;
+ }
+ }
+
+ fprintf ( stderr, "\n%d %d", best_tp, best_fp);
+ /*R[pred][real]*/
+ tp=best_tp;
+ fp=best_fp;
+ fn=n-tp;
+ tn=A->nseq-(tp+fp+fn);
+ R[1][1]=tp;
+ R[1][0]=fp;
+ R[0][1]=fn;
+ R[0][0]=tn;
+ free_int (list, -1);
+
+
+ return R;
+}
+float** cache2pred_new (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode)
+{
+ int s1, seq1, ci, a,b, n;
+ double score;
+ float **R;
+
+
+ int tp, tn, fn, fp;
+ int best_tp, best_fp;
+ int delta, best_delta;
+ int **list;
+
+ ci=name_is_in_list ( compound, S->name, S->nseq, -1);
+ list=declare_int ( ns[1],2);
+ R=declare_float (2, 2);
+
+ for (s1=0; s1<ns[1]; s1++)
+ {
+ int res, real;
+
+ seq1=ls[1][s1];
+ real=(S->seq_al[ci][seq1]=='I')?1:0;
+ fprintf ( stdout, ">%-10s %d ", A->name[seq1], real);
+ for (score=0,b=0; b<A->len_aln; b++)
+ {
+ res=tolower (A->seq_al[seq1][b]);
+ if ( cache[26][b]==0) continue;
+ if ( !is_gap(res))
+ {
+ score+=cache[res-'a'][b];
+ }
+ fprintf ( stdout, "%c", res);
+ }
+ fprintf ( stdout, " SCORE: %5d SPRED\n", (int)score);
+ list[s1][0]=real;
+ list[s1][1]=(int)score;
+ }
+
+ sort_int_inv (list, 2, 1, 0, ns[1]-1);
+
+ for (n=0, a=0; a<ns[1]; a++)n+=list[a][0];
+ for (best_delta=100000,best_tp=0,tp=0,fp=0,best_fp=0,a=0; a<ns[1]; a++)
+ {
+ tp+=list[a][0];
+ fp+=1-list[a][0];
+ delta=(n-(tp+fp));
+ if (FABS(delta)<best_delta)
+ {
+ best_delta=delta;
+ best_tp=tp;
+ best_fp=fp;
+ }
+ }
+
+
+
+ /*R[pred][real]*/
+ tp=best_tp;
+ fp=best_fp;
+ fn=n-tp;
+ tn=ns[1]-(tp+fp+fn);
+ R[1][1]=tp;
+ R[1][0]=fp;
+ R[0][1]=fn;
+ R[0][0]=tn;
+ free_int (list, -1);
+
+
+ return R;
+}
+float** cache2pred_forbiden_res (Alignment*A,int **cache, int *ns, int **ls, Alignment *S, char *compound, char *mode)
+{
+ int s1,seq1, ci, a,b, c, n;
+ double score;
+ float **R;
+
+
+ int tp, tn, fn, fp;
+ int best_tp, best_fp;
+ int delta, best_delta;
+ int **list;
+ int **new_cache;
+ int **mat;
+
+ mat=read_matrice ( "blosum62mt");
+
+ ci=name_is_in_list ( compound, S->name, S->nseq, -1);
+ list=declare_int ( ns[1],2);
+ R=declare_float (2, 2);
+
+ for (s1=0; s1<ns[1]; s1++)
+ {
+ int res, real;
+
+ seq1=ls[1][s1];
+ real=(S->seq_al[ci][seq1]=='I')?1:0;
+ fprintf ( stdout, ">%-10s %d ", A->name[seq1], real);
+ for (score=0,b=0; b<A->len_aln; b++)
+ {
+ res=tolower (A->seq_al[seq1][b]);
+ if ( cache[26][b]==0) continue;
+ if ( !is_gap(res))
+ {
+ score+=cache[res-'a'][b];
+ }
+ fprintf ( stdout, "%c", res);
+ }
+ fprintf ( stdout, " SCORE: %5d SPRED\n", (int)score);
+ list[s1][0]=real;
+ list[s1][1]=(int)score;
+ }
+ new_cache=declare_int (27,A->len_aln);
+ for (a=0; a< A->len_aln; a++)
+ {
+ int **analyze, real, res, d;
+ int *res_type;
+ int **sub;
+ int *keep;
+ keep=vcalloc ( 26, sizeof (int));
+ res_type=vcalloc ( 26, sizeof (int));
+ sub=declare_int (256, 2);
+
+ if ( cache[26][a]==0)continue;
+ analyze=declare_int (26, 2);
+ for ( b=0; b< ns[0]; b++)
+ {
+ seq1=ls[0][b];
+ real=(S->seq_al[ci][seq1]=='I')?1:0;
+ res=tolower (A->seq_al[seq1][a]);
+
+ if ( res=='-') continue;
+ analyze[res-'a'][real]++;
+ }
+ fprintf ( stdout, "RSPRED: ");
+ for (c=0;c<26; c++)fprintf ( stdout, "%c", c+'a');
+ fprintf ( stdout, "\nRSPRED: ");
+ for (c=0;c<26; c++)
+ {
+ if ( analyze[c][0] && analyze[c][1]){fprintf ( stdout, "1");res_type[c]='1';}
+ else if ( analyze[c][0]){new_cache[26][a]=1;new_cache[c][a]-=analyze[c][0];fprintf ( stdout, "0");res_type[c]='0';}
+ else if ( analyze[c][1]){new_cache[26][a]=1;new_cache[c][a]+=analyze[c][1];fprintf ( stdout, "1");res_type[c]='1';}
+ else if ( !analyze[c][0] &&!analyze[c][1]){fprintf ( stdout, "-");res_type[c]='-';}
+ }
+
+
+ for ( c=0; c<26; c++)
+ {
+ for ( d=0; d<26; d++)
+ {
+
+ if ( res_type[c]==res_type[d])
+ {
+ sub[res_type[c]][0]+=mat[c][d];
+ sub[res_type[c]][1]++;
+ }
+ if ( res_type[c]!='-' && res_type[d]!='-')
+ {
+ sub['m'][0]+=mat[c][d];
+ sub['m'][1]++;
+ }
+ }
+ }
+ for ( c=0; c< 256; c++)
+ {
+ if ( sub[c][1])fprintf ( stdout, " %c: %5.2f ", c, (float)sub[c][0]/(float)sub[c][1]);
+ }
+ fprintf ( stdout, " SC: %d\nRSPRED ", cache[26][a]);
+
+ for ( c=0; c<26; c++)
+ if ( res_type[c]=='1')
+ {
+ for (d=0; d<26; d++)
+ if (mat[c][d]>0)keep[d]++;
+ keep[c]=9;
+ }
+
+ for (c=0; c<26; c++)
+ {
+ if ( keep[c]>10)fprintf ( stdout, "9");
+ else fprintf ( stdout, "%d", keep[c]);
+ }
+ for ( c=0; c<26; c++)
+ {
+ if ( keep[c]>8)new_cache[c][a]=10;
+ else new_cache[c][a]=-10;
+ }
+ fprintf ( stdout, "\n");
+ free_int (analyze, -1);
+ free_int (sub, -1);
+ vfree (res_type);
+ vfree (keep);
+
+ }
+ for ( a=0; a<25; a++)
+ for (b=a+1; b<26; b++)
+ {
+ int r1, r2;
+ r1=a+'a';r2=b+'a';
+ if ( strchr("bjoxz", r1))continue;
+ if ( strchr("bjoxz",r2))continue;
+
+ if ( mat[a][b]>0 && a!=b)fprintf ( stdout, "\nMATANALYZE %c %c %d", a+'a', b+'a', mat[a][b]);
+ }
+
+ for (s1=0; s1<ns[1]; s1++)
+ {
+ int res, real;
+
+ seq1=ls[1][s1];
+ real=(S->seq_al[ci][seq1]=='I')?1:0;
+ fprintf ( stdout, ">%-10s %d ", A->name[seq1], real);
+ for (score=0,b=0; b<A->len_aln; b++)
+ {
+ res=tolower (A->seq_al[seq1][b]);
+ if ( cache[26][b]==0) continue;
+ if ( !is_gap(res))
+ {
+ score+=new_cache[res-'a'][b];
+ }
+ fprintf ( stdout, "%c", res);
+ }
+ fprintf ( stdout, " SCORE: %5d SPRED\n", (int)score);
+ list[s1][0]=real;
+ list[s1][1]=(int)score;
+ }
+ free_int (new_cache, -1);
+ sort_int_inv (list, 2, 1, 0, ns[1]-1);
+
+
+ for (n=0, a=0; a<ns[1]; a++)n+=list[a][0];
+ for (best_delta=100000,best_tp=0,tp=0,fp=0,best_fp=0,a=0; a<ns[1]; a++)
+ {
+ tp+=list[a][0];
+ fp+=1-list[a][0];
+ delta=(n-(tp+fp));
+ if (FABS(delta)<best_delta)
+ {
+ best_delta=delta;
+ best_tp=tp;
+ best_fp=fp;
+ }
+ }
+
+
+
+ /*R[pred][real]*/
+ tp=best_tp;
+ fp=best_fp;
+ fn=n-tp;
+ tn=ns[1]-(tp+fp+fn);
+ R[1][1]=tp;
+ R[1][0]=fp;
+ R[0][1]=fn;
+ R[0][0]=tn;
+ free_int (list, -1);
+
+
+ return R;
+}
+
+int **sar2cache_proba_old ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode)
+{
+ int col, s, seq,ms,mseq, res, mres, res1, n,maxn1, maxn2,maxn3, t, ci, a;
+ float quant=0;
+ int **list;
+
+ int N1msa,N1sar, N, N11, N10, N01,N00, SCORE, COL_INDEX, RES;
+ int nfield=0;
+ int value;
+ float T1, T2, T3, T4;
+ int weight_mode;
+ int **cache;
+ static int **sim;
+ int sim_weight, w, sw_thr;
+ int train_mode;
+
+ float zscore;
+
+ RES=nfield++;COL_INDEX=nfield++;N1msa=nfield++;N1sar=nfield++;N=nfield++;N11=nfield++;N10=nfield++;N01=nfield++;N00=nfield++;SCORE=nfield++;
+ ci=name_is_in_list ( compound, S->name, S->nseq, -1);
+ cache=declare_int (A->nseq, A->len_aln);
+
+ strget_param ( mode, "_FILTER1_", "0" , "%f", &T1);
+ strget_param ( mode, "_FILTER2_", "1000000", "%f", &T2);
+ strget_param ( mode, "_FILTER3_", "0" , "%f", &T3);
+ strget_param ( mode, "_FILTER4_", "1000000", "%f", &T4);
+ strget_param ( mode, "_SIMWEIGHT_", "1", "%d", &sim_weight);
+ strget_param ( mode, "_SWTHR_", "30", "%d", &sw_thr);
+ strget_param (mode, "_TRAIN_","1", "%d", &train_mode);
+ strget_param (mode, "_ZSCORE_","0", "%f", &zscore);
+
+
+
+
+
+ if (sim_weight==1 && !sim) sim=aln2sim_mat(A, "idmat");
+ for ( ms=0; ms<ns[0]; ms++)
+ {
+ mseq=ls[0][ms];
+ if ( S->seq_al[ci][mseq]!='I')continue;
+
+ list=declare_int (A->len_aln+1, nfield);
+ for (t=0,n=0, col=0; col< A->len_aln; col++)
+ {
+ int same_res;
+
+ mres=tolower(A->seq_al[mseq][col]);
+ list[col][RES]=mres;
+ list[col][COL_INDEX]=col;
+
+ if ( is_gap(mres))continue;
+ for ( s=0; s<ns[0]; s++)
+ {
+ seq=ls[0][s];
+ res=tolower(A->seq_al[seq][col]);
+ if (is_gap(res))continue;
+
+
+ if (sim_weight==1)
+ {
+ w=sim[seq][mseq];w=(mres==res)?100-w:w;
+ if (w<sw_thr)w=0;
+ }
+ else
+ w=1;
+
+ if ( train_mode==4)
+ {
+ if ( S->seq_al[ci][seq]=='I')same_res=1;
+ else same_res=(res==mres)?1:0;
+ }
+ else
+ same_res=(res==mres)?1:0;
+
+ list[col][N]+=w;
+
+ if (S->seq_al[ci][seq]=='I' && same_res)list[col][N11]+=w;
+ else if (S->seq_al[ci][seq]=='I' && same_res)list[col][N10]+=w;
+ else if (S->seq_al[ci][seq]=='O' && same_res)list[col][N01]+=w;
+ else if (S->seq_al[ci][seq]=='O' && same_res)list[col][N00]+=w;
+
+ if ( S->seq_al[ci][seq]=='I')list[col][N1sar]+=w;
+ if ( same_res)list[col][N1msa]+=w;
+
+ }
+
+ list[col][SCORE]=(int)evaluate_sar_score1 (list[col][N], list[col][N11], list[col][N1msa], list[col][N1sar]);
+
+ }
+
+ strget_param ( mode, "_MAXN1_", "5", "%d", &maxn1);
+ strget_param ( mode, "_WEIGHT_", "1", "%d", &weight_mode);
+ strget_param ( mode, "_QUANT_", "0.0", "%f", &quant);
+
+ sort_int_inv (list,nfield,SCORE,0,A->len_aln-1);
+ if ( quant !=0)
+ {
+
+ n=quantile_rank ( list,SCORE, A->len_aln,quant);
+ sort_int (list,nfield,N1msa, 0, n-1);
+ maxn1=MIN(n,maxn1);
+ }
+
+ for (a=0; a<maxn1; a++)
+ {
+ col=list[a][COL_INDEX];
+ res1=list[a][RES];
+ value=list[a][SCORE];
+ if ( value>T1 && value<T2){cache[mseq][col]= value;}
+ }
+ free_int (list, -1);
+ }
+
+ /*Filter Columns*/
+ list=declare_int (A->len_aln+1, nfield);
+ for ( col=0; col< A->len_aln; col++)
+ {
+ list[col][COL_INDEX]=col;
+ for ( s=0; s<ns[0]; s++)
+ {
+ seq=ls[0][s];
+ list[col][SCORE]+=cache[seq][col];
+ }
+ }
+
+ /*Filter Columns with a score not between T2 and T3*/
+
+ for (col=0; col< A->len_aln; col++)
+ if (list[col][SCORE]<T3 || list[col][SCORE]>T4)
+ {
+ list[col][SCORE]=0;
+ for (s=0; s< A->nseq; s++)
+ if (!is_gap(A->seq_al[s][col]))cache[s][col]=0;
+ }
+
+ /*Keep The N Best Columns*/
+ if ( zscore!=0)
+ {
+ double sum=0, sum2=0, z;
+ int n=0;
+ for (a=0; a< A->len_aln; a++)
+ {
+ if ( list[a][SCORE]>0)
+ {
+ sum+=list[a][SCORE];
+ sum2+=list[a][SCORE]*list[a][SCORE];
+ n++;
+ }
+ }
+ for (a=0; a<A->len_aln; a++)
+ {
+ if ( list[a][SCORE]>0)
+ {
+ z=return_z_score (list[a][SCORE], sum, sum2,n);
+ if ((float)z<zscore)
+ {
+ col=list[a][COL_INDEX];
+ for (s=0; s<A->nseq; s++)
+ cache [s][col]=0;
+ }
+ else
+ {
+ fprintf ( stdout, "\nZSCORE: KEEP COL %d SCORE: %f SCORE: %d\n", list[a][COL_INDEX], (float)z, list[a][SCORE]);
+ }
+ }
+ }
+ }
+ else
+ {
+ sort_int_inv (list,nfield,SCORE,0,A->len_aln-1);
+ strget_param ( mode, "_MAXN2_", "100000", "%d", &maxn2);
+
+ for (a=maxn2;a<A->len_aln; a++)
+ {
+ col=list[a][COL_INDEX];
+ for (s=0; s<A->nseq; s++)
+ cache [s][col]=0;
+ }
+ }
+
+ /*Get Rid of the N best Columns*/;
+ strget_param ( mode, "_MAXN3_", "0", "%d", &maxn3);
+
+ for (a=0; a<maxn3;a++)
+ {
+ col=list[a][COL_INDEX];
+ for (s=0; s<A->nseq; s++)
+ cache [s][col]=0;
+ }
+
+ return cache;
+}
+int aln2n_comp_col ( Alignment *A, Alignment *S, int ci)
+{
+ int res, seq,sar, col, r;
+ int **analyze;
+
+ int tot=0;
+
+ analyze=declare_int (27, 2);
+ for ( col=0; col< A->len_aln; col++)
+ {
+ int n1, n0;
+
+
+ for ( n1=0, n0=0,seq=0; seq<A->nseq; seq++)
+ {
+ res=tolower(A->seq_al[seq][col]);
+ sar=(S->seq_al[ci][seq]=='I')?1:0;
+ n1+=(sar==1)?1:0;
+ n0+=(sar==0)?1:0;
+ if ( res=='-')continue;
+ res-='a';
+ analyze[res][sar]++;
+ }
+
+ for (r=0; r<26; r++)
+ {
+ int a0,a1;
+ a0=analyze[r][0];
+ a1=analyze[r][1];
+
+
+ if ( a1==n1 && a0<n0)
+ {
+ tot++;
+ }
+ }
+ for ( r=0; r<26; r++)analyze[r][0]=analyze[r][1]=0;
+ }
+
+ free_int (analyze, -1);
+ return tot;
+}
+int **sar2cache_count1 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode)
+{
+ int maxn2, res, seq,sar, ci, col,s, r;
+ int **analyze, **list, **cache;
+ static int **mat;
+
+ int a0,a1, w;
+ if (!mat) mat=read_matrice ("blosum62mt");
+
+
+ list=declare_int ( A->len_aln, 2);
+ cache=declare_int ( 27, A->len_aln);
+ analyze=declare_int (27, 2);
+
+ ci=name_is_in_list ( compound, S->name, S->nseq, -1);
+
+ for ( col=0; col< A->len_aln; col++)
+ {
+ int n1, n0;
+
+
+ for ( n1=0, n0=0,s=0; s<ns[0]; s++)
+ {
+ seq=ls[0][s];
+ res=tolower(A->seq_al[seq][col]);
+ sar=(S->seq_al[ci][seq]=='I')?1:0;
+ n1+=(sar==1)?1:0;
+ n0+=(sar==0)?1:0;
+ if ( res=='-')continue;
+ res-='a';
+
+ analyze[res][sar]++;
+ }
+
+ for (r=0; r<26; r++)
+ {
+
+ a0=analyze[r][0];
+ a1=analyze[r][1];
+
+ if ( strstr (mode, "SIMTEST"))
+ {
+ w=a1;
+ }
+ else if (a1 )
+ {
+ w=n0-a0;
+ }
+ else w=0;
+
+ cache[r][col]+=w;
+ cache[26][col]=MAX(w, cache[26][col]);
+ }
+
+ for ( r=0; r<26; r++)analyze[r][0]=analyze[r][1]=0;
+ list[col][0]=col;
+ list[col][1]=cache[26][col];
+ }
+
+ free_int (analyze, -1);
+
+ sort_int_inv (list, 2, 1, 0, A->len_aln-1);
+
+ strget_param ( mode, "_MAXN2_", "100000", "%d", &maxn2);
+
+ for ( col=maxn2; col<A->len_aln; col++)
+ for ( r=0; r<=26; r++)cache[r][list[col][0]]=0;
+
+ free_int (list, -1);
+ return cache;
+}
+
+
+int **sar2cache_count2 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode)
+{
+ int maxn2, res, seq,sar, ci, col,s, r;
+ int **analyze, **list, **cache, **conseq;
+ static int **mat;
+ int w=0;
+ if (!mat) mat=read_matrice ("blosum62mt");
+
+
+ list=declare_int ( A->len_aln, 2);
+ cache=declare_int ( 27, A->len_aln);
+ conseq=declare_int ( A->len_aln,3);
+
+ analyze=declare_int (27, 2);
+
+ ci=name_is_in_list ( compound, S->name, S->nseq, -1);
+ for ( col=0; col< A->len_aln; col++)
+ {
+ int n1, n0;
+
+ for ( n1=0, n0=0,s=0; s<ns[0]; s++)
+ {
+ seq=ls[0][s];
+ res=tolower(A->seq_al[seq][col]);
+ sar=(S->seq_al[ci][seq]=='I')?1:0;
+ n1+=(sar==1)?1:0;
+ n0+=(sar==0)?1:0;
+ if ( res=='-')continue;
+ res-='a';
+ analyze[res][sar]++;
+ }
+ for (r=0; r<26; r++)
+ {
+ int a0,a1;
+ a0=analyze[r][0];
+ a1=analyze[r][1];
+ if ( a1==n1 && a0<n0)
+ {
+
+ w=n0-a0;
+ conseq[col][0]=r;
+ conseq[col][1]=w;
+ }
+ }
+ for ( r=0; r<26; r++)analyze[r][0]=analyze[r][1]=0;
+ }
+ free_int (analyze, -1);
+
+ for (s=0; s<ns[0]; s++)
+ {
+ int w1, w2;
+ seq=ls[0][s];
+ for (w1=0,w2=0,col=0; col<A->len_aln; col++)
+ {
+
+ res=tolower(A->seq_al[seq][col]);
+ if ( is_gap(res))continue;
+ else res-='a';
+
+ if ( conseq[col][1] && res!=conseq[col][0])w1++;
+ if ( conseq[col][1])w2++;
+ }
+ for (col=0; col<A->len_aln; col++)
+ {
+ res=tolower(A->seq_al[seq][col]);
+ if ( is_gap(res))continue;
+ else res-='a';
+
+ if ( conseq[col][1] && res!=conseq[col][0])conseq[col][2]+=(w2-w1);
+ }
+ }
+
+ for (col=0; col<A->len_aln; col++)
+ {
+ r=conseq[col][0];
+ w=conseq[col][2];
+
+
+ cache[r][col]=cache[26][col]=list[col][1]=w;
+ list[col][0]=col;
+ }
+ sort_int_inv (list, 2, 1, 0, A->len_aln-1);
+ strget_param ( mode, "_MAXN2_", "100000", "%d", &maxn2);
+
+ for ( col=maxn2; col<A->len_aln; col++)
+ for ( r=0; r<=26; r++)cache[r][list[col][0]]=0;
+
+
+ free_int (list, -1);
+ return cache;
+}
+
+int **sar2cache_count3 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode)
+{
+ int maxn2, res, seq,sar, ci, col,s, r, a1, a0, n1, n0;
+ int **analyze, **list, **cache;
+ static int **mat;
+
+ if (!mat) mat=read_matrice ("blosum62mt");
+
+
+ list=declare_int ( A->len_aln, 2);
+ cache=declare_int ( 27, A->len_aln);
+ analyze=declare_int (27, 2);
+
+ ci=name_is_in_list ( compound, S->name, S->nseq, -1);
+
+ for ( col=0; col< A->len_aln; col++)
+ {
+ double e, g;
+ for ( n1=0, n0=0,s=0; s<ns[0]; s++)
+ {
+ seq=ls[0][s];
+ res=tolower(A->seq_al[seq][col]);
+ sar=(S->seq_al[ci][seq]=='I')?1:0;
+ n1+=(sar==1)?1:0;
+ n0+=(sar==0)?1:0;
+ if ( res=='-')continue;
+ res-='a';
+
+ analyze[res][sar]++;
+ }
+
+ /*Gap*/
+ for (g=0,r=0; r<A->nseq; r++)
+ g+=is_gap(A->seq_al[r][col]);
+ g=(100*g)/A->nseq;
+
+ /*enthropy
+ for (e=0, r=0; r<26; r++)
+ {
+ a0=analyze[r][0];
+ a1=analyze[r][1];
+ t=a0+a1;
+
+ if (t>0)
+ e+= t/(double)A->nseq*log(t/(double)A->nseq);
+ }
+ e*=-1;
+ */
+ e=0;
+ if (g>10) continue;
+ if (e>10) continue;
+
+ if ( strstr ( mode, "SIMTEST"))
+ {
+ for (r=0; r<26; r++)
+ {
+
+ a0=analyze[r][0];
+ a1=analyze[r][1];
+
+ if (a1)
+ {
+ cache[r][col]=a1;
+ cache[26][col]=MAX(cache[26][col],a1);
+ }
+ }
+ }
+ else
+ {
+
+
+
+ for (r=0; r<26; r++)
+ {
+
+ a0=analyze[r][0];
+ a1=analyze[r][1];
+
+ if (!a1 && a0)
+ {
+ cache[r][col]=a0;
+ cache[26][col]=MAX(cache[26][col],a0);
+ }
+ }
+ }
+
+ for ( r=0; r<26; r++)analyze[r][0]=analyze[r][1]=0;
+ list[col][0]=col;
+ list[col][1]=cache[26][col];
+ }
+
+ free_int (analyze, -1);
+
+ sort_int_inv (list, 2, 1, 0, A->len_aln-1);
+
+ strget_param ( mode, "_MAXN2_", "100000", "%d", &maxn2);
+
+ for ( col=maxn2; col<A->len_aln; col++)
+ for ( r=0; r<=26; r++)cache[r][list[col][0]]=0;
+
+ free_int (list, -1);
+ return cache;
+}
+
+
+int **sar2cache_proba_new ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode)
+{
+ int col, s, seq,ms,mseq, res, mres, res1, n,maxn1, maxn2,maxn3, t, ci, a,w;
+
+ int **list;
+
+ int N1msa,N1sar, N, N11, N10, N01,N00, SCORE, COL_INDEX, RES;
+ int nfield=0;
+ int value;
+
+
+ int **cache;
+ static int **sim;
+ int sw_thr;
+ float zscore;
+
+ RES=nfield++;COL_INDEX=nfield++;N1msa=nfield++;N1sar=nfield++;N=nfield++;N11=nfield++;N10=nfield++;N01=nfield++;N00=nfield++;SCORE=nfield++;
+ ci=name_is_in_list ( compound, S->name, S->nseq, -1);
+ cache=declare_int (27, A->len_aln);
+
+ strget_param ( mode, "_SWTHR_", "30", "%d", &sw_thr);
+ strget_param (mode, "_ZSCORE_","0", "%f", &zscore);
+
+
+ if (!sim)sim=aln2sim_mat(A, "idmat");
+ for ( ms=0; ms<ns[0]; ms++)
+ {
+ mseq=ls[0][ms];
+ if ( S->seq_al[ci][mseq]!='I')continue;
+
+ list=declare_int (A->len_aln+1, nfield);
+ for (t=0,n=0, col=0; col< A->len_aln; col++)
+ {
+ int same_res;
+
+ mres=tolower(A->seq_al[mseq][col]);
+ if ( is_gap(mres))continue;
+
+ list[col][RES]=mres;
+ list[col][COL_INDEX]=col;
+
+ for ( s=0; s<ns[0]; s++)
+ {
+ seq=ls[0][s];
+ res=tolower(A->seq_al[seq][col]);
+ if (is_gap(res))continue;
+ w=sim[seq][mseq];w=(mres==res)?100-w:w;
+ if (w<sw_thr)w=0;
+ same_res=(res==mres)?1:0;
+
+ list[col][N]+=w;
+
+ if (S->seq_al[ci][seq]=='I' && same_res)list[col][N11]+=w;
+ else if (S->seq_al[ci][seq]=='I' && same_res)list[col][N10]+=w;
+ else if (S->seq_al[ci][seq]=='O' && same_res)list[col][N01]+=w;
+ else if (S->seq_al[ci][seq]=='O' && same_res)list[col][N00]+=w;
+
+ if ( S->seq_al[ci][seq]=='I')list[col][N1sar]+=w;
+ if ( same_res)list[col][N1msa]+=w;
+
+ }
+
+ list[col][SCORE]=(int)evaluate_sar_score1 (list[col][N], list[col][N11], list[col][N1msa], list[col][N1sar]);
+
+ }
+ strget_param ( mode, "_MAXN1_", "5", "%d", &maxn1);
+ sort_int_inv (list,nfield,SCORE,0,A->len_aln-1);
+ for (a=0; a<maxn1; a++)
+ {
+ col=list[a][COL_INDEX];
+ res1=list[a][RES];
+ value=list[a][SCORE];
+
+ if ( res1!=0)
+ {
+ cache[res1-'a'][col]+= value;
+ cache[26][col]+=value;
+ }
+ }
+ free_int (list, -1);
+ }
+
+ /*Filter Columns*/
+ list=declare_int (A->len_aln+1, nfield);
+ for ( col=0; col< A->len_aln; col++)
+ {
+ list[col][COL_INDEX]=col;
+ list[col][SCORE]=cache[26][col];
+ }
+ /*Keep The N Best Columns*/
+ if ( zscore!=0)
+ {
+ double sum=0, sum2=0, z;
+ int n=0;
+ for (a=0; a< A->len_aln; a++)
+ {
+ if ( list[a][SCORE]>0)
+ {
+ sum+=list[a][SCORE];
+ sum2+=list[a][SCORE]*list[a][SCORE];
+ n++;
+ }
+ }
+ for (a=0; a<A->len_aln; a++)
+ {
+ if ( list[a][SCORE]>0)
+ {
+ z=return_z_score (list[a][SCORE], sum, sum2,n);
+ if ((float)z<zscore)
+ {
+ col=list[a][COL_INDEX];
+ for (s=0; s<27; s++)
+ cache [s][col]=0;
+ }
+ else
+ {
+ fprintf ( stdout, "\nZSCORE: KEEP COL %d SCORE: %f SCORE: %d\n", list[a][COL_INDEX], (float)z, list[a][SCORE]);
+ }
+ }
+ }
+ }
+ else
+ {
+ sort_int_inv (list,nfield,SCORE,0,A->len_aln-1);
+ strget_param ( mode, "_MAXN2_", "100000", "%d", &maxn2);
+
+ for (a=maxn2;a<A->len_aln; a++)
+ {
+ col=list[a][COL_INDEX];
+ for (s=0; s<27; s++)
+ cache [s][col]=0;
+ }
+ }
+
+ /*Get Rid of the N best Columns*/;
+ strget_param ( mode, "_MAXN3_", "0", "%d", &maxn3);
+
+ for (a=0; a<maxn3;a++)
+ {
+ col=list[a][COL_INDEX];
+ for (s=0; s<27; s++)
+ cache [s][col]=0;
+ }
+ return cache;
+}
+int **sar2cache_adriana ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode)
+{
+
+ int col,maxn1, s, seq,ms,mseq, res, mres,res1, n, t, ci, a;
+ float quant=0;
+ int **list;
+
+
+ int **cache;
+ ci=name_is_in_list ( compound, S->name, S->nseq, -1);
+ cache=declare_int (A->nseq, A->len_aln);
+
+
+ for ( ms=0; ms<ns[0]; ms++)
+ {
+ mseq=ls[0][ms];
+ if ( S->seq_al[ci][mseq]!='I')continue;
+
+ list=declare_int (A->len_aln+1, 5);
+ for (t=0,n=0, col=0; col< A->len_aln; col++)
+ {
+ mres=tolower(A->seq_al[mseq][col]);
+ list[col][0]=mres;
+ list[col][1]=col;
+
+ if ( is_gap(mres))continue;
+ for ( s=0; s<ns[0]; s++)
+ {
+ seq=ls[0][s];
+ res=tolower(A->seq_al[seq][col]);
+ if (is_gap(res))continue;
+
+ if (S->seq_al[ci][seq]=='I' && res==mres)list[col][3]++;
+ if (res==mres)list[col][2]++;
+ }
+ }
+
+ sort_int_inv (list,5,3,0,A->len_aln-1);
+
+ strget_param ( mode, "_MAXN1_", "5", "%d", &maxn1);
+ strget_param ( mode, "_QUANT_", "0.95", "%f", &quant);
+
+ n=quantile_rank ( list, 3, A->len_aln,quant);
+ sort_int (list, 5, 2, 0, n-1);
+
+ for (a=0; a<maxn1; a++)
+ {
+
+ col=list[a][1];
+ res1=list[a][0];
+ cache[mseq][col]=list[a][3];
+ }
+ free_int (list, -1);
+
+ }
+ return cache;
+}
+int **sar2cache_proba2 ( Alignment *A, int *ns,int **ls, Alignment *S, char *compound, char *mode)
+{
+ int col, s, seq,ms,mseq, res, mres,n,maxn1, t, ci, a,b;
+ int COL, SCORE;
+
+ float quant=0;
+ int **list;
+
+ float T1, T2, T3, T4;
+
+ int **cache;
+ cache=declare_int ( A->nseq, A->len_aln);
+ ci=name_is_in_list ( compound, S->name, S->nseq, -1);
+
+ strget_param ( mode, "_FILTER1_", "0" , "%f", &T1);
+ strget_param ( mode, "_FILTER2_", "1000000", "%f", &T2);
+ strget_param ( mode, "_FILTER3_", "0" , "%f", &T3);
+ strget_param ( mode, "_FILTER4_", "1000000", "%f", &T4);
+
+ list=declare_int (A->len_aln+1,A->nseq+2);
+ SCORE=A->nseq;
+ COL=A->nseq+1;
+
+ for ( ms=0; ms<ns[0]; ms++)
+ {
+ mseq=ls[0][ms];
+ if ( S->seq_al[ci][mseq]!='I')continue;
+
+ for (t=0,n=0, col=0; col< A->len_aln; col++)
+ {
+ int N11=0,N10=0,N01=0,N00=0,N1sar=0,N1msa=0,N=0;
+
+ mres=tolower(A->seq_al[mseq][col]);
+ if ( is_gap(mres))continue;
+ for ( s=0; s<ns[0]; s++)
+ {
+ seq=ls[0][s];
+ res=tolower(A->seq_al[seq][col]);
+ if (is_gap(res))continue;
+
+ N++;
+ if (S->seq_al[ci][seq]=='I' && res==mres)N11++;
+ else if (S->seq_al[ci][seq]=='I' && res!=mres)N10++;
+ else if (S->seq_al[ci][seq]=='O' && res==mres)N01++;
+ else if (S->seq_al[ci][seq]=='O' && res!=mres)N00++;
+
+ if ( S->seq_al[ci][seq]=='I')N1sar++;
+ if ( res==mres)N1msa++;
+ }
+ list[col][mseq]=(int)evaluate_sar_score1 (N,N11,N1msa,N1sar);
+ list[col][SCORE]+=list[col][mseq];
+ list[col][COL]=col;
+ }
+ }
+
+ strget_param ( mode, "_MAXN1_", "5", "%d", &maxn1);
+ strget_param ( mode, "_QUANT_", "0.95", "%f", &quant);
+ sort_int_inv (list,A->nseq+2,SCORE, 0, A->len_aln-1);
+ n=quantile_rank ( list,A->nseq, A->len_aln,quant);
+ n=5;
+
+
+ for (a=0; a<n; a++)
+ {
+ int value;
+
+ col=list[a][COL];
+ for ( b=0; b<A->nseq; b++)
+ {
+ value=list[col][b];
+ if ( value>T1 && value<T2){cache[b][col]= value;}
+ }
+ }
+
+ free_int (list, -1);
+ return cache;
+}
+
+
+
+
+/************************************************************************************/
+/* ALIGNMENT ANALYZE : SAR */
+/************************************************************************************/
+int aln2jack_group3 (Alignment *A,char *comp, int **l1, int *nl1, int **l2, int *nl2)
+{
+ int **seq_list, **sar_list, nsar=0, nseq=0;
+ int a, b, mid;
+
+ vsrand (0);
+ sar_list=declare_int (A->nseq, 2);
+ seq_list=declare_int (A->nseq, 2);
+ for (a=0; a< A->nseq; a++)
+ {
+ if (comp[a]=='I')
+ {
+ sar_list[nsar][0]=a;
+ sar_list[nsar][1]=rand()%100000;
+ nsar++;
+ }
+ else
+ {
+ seq_list[nseq][0]=a;
+ seq_list[nseq][1]=rand()%100000;
+ nseq++;
+ }
+ }
+
+
+ l1[0]=vcalloc (A->nseq, sizeof (int));
+ l2[0]=vcalloc (A->nseq, sizeof (int));
+ nl1[0]=nl2[0]=0;
+
+ sort_int (seq_list, 2, 1, 0,nseq-1);
+ sort_int (sar_list, 2, 1, 0,nsar-1);
+ mid=nsar/2;
+ for (a=0; a<mid; a++)
+ {
+ l1[0][nl1[0]++]=sar_list[a][0];
+ }
+ for (a=0,b=mid; b<nsar; b++, a++)
+ {
+ l2[0][nl2[0]++]=sar_list[b][0];
+ }
+
+ mid=nseq/2;
+ for (a=0; a<mid; a++)
+ {
+ l1[0][nl1[0]++]=seq_list[a][0];
+ }
+ for (a=0,b=mid; b<nseq; b++, a++)
+ {
+ l2[0][nl2[0]++]=seq_list[b][0];
+ }
+
+
+ free_int (seq_list, -1);
+ free_int (sar_list, -1);
+ return 1;
+}
+
+int aln2jack_group2 (Alignment *A, int seq, int **l1, int *nl1, int **l2, int *nl2)
+{
+ int **list;
+ int a, b, mid;
+
+
+ list=declare_int (A->nseq, 2);
+ l1[0]=vcalloc (A->nseq, sizeof (int));
+ l2[0]=vcalloc (A->nseq, sizeof (int));
+ nl1[0]=nl2[0];
+
+ vsrand (0);
+ for ( a=0; a< A->nseq; a++)
+ {
+ list[a][0]=a;
+ list[a][1]=rand()%100000;
+ }
+ sort_int (list, 2, 1, 0,A->nseq-1);
+ mid=A->nseq/2;
+ for (a=0; a<mid; a++)
+ {
+ l1[0][nl1[0]++]=list[a][0];
+ }
+ for (a=0,b=mid; b<A->nseq; b++, a++)
+ {
+ l2[0][nl2[0]++]=list[b][0];
+ }
+
+ free_int (list, -1);
+ return 1;
+}
+int aln2jack_group1 (Alignment *A, int seq, int **l1, int *nl1, int **l2, int *nl2)
+{
+ int **sim;
+ int **list;
+ int a, mid;
+
+ list=declare_int ( A->nseq, 3);
+ l1[0]=vcalloc (A->nseq, sizeof (int));
+ l2[0]=vcalloc (A->nseq, sizeof (int));
+ nl1[0]=nl2[0];
+
+ sim=aln2sim_mat (A, "idmat");
+ for ( a=0; a< A->nseq; a++)
+ {
+ list[a][0]=seq;
+ list[a][1]=a;
+ list[a][2]=(a==seq)?100:sim[seq][a];
+ }
+ sort_int_inv (list, 3, 2, 0, A->nseq-1);
+ fprintf ( stderr, "\nJacknife fromsequence %s [%d]\n", A->name[seq], seq);
+ mid=A->nseq/2;
+ for (a=0; a< mid; a++)
+ l1[0][nl1[0]++]=list[a][1];
+ for (a=mid; a<A->nseq; a++)
+ l2[0][nl2[0]++]=list[a][1];
+ return 1;
+}
+
+
+int sarset2subsarset ( Alignment *A, Alignment *S, Alignment **subA, Alignment **subS, Alignment *SUB)
+{
+ Alignment *rotS, *intS;
+ int a,b, *list, nl;
+
+ list=vcalloc ( SUB->nseq, sizeof (int));
+ for (nl=0,a=0; a<SUB->nseq; a++)
+ {
+ b=name_is_in_list(SUB->name[a], A->name, A->nseq, 100);
+ if ( b!=-1)list[nl++]=b;
+ }
+
+ subA[0]=extract_sub_aln (A, nl, list);
+ rotS=rotate_aln (S, NULL);
+ intS=extract_sub_aln (rotS, nl, list);
+
+ subS[0]=rotate_aln (intS, NULL);
+
+ for ( a=0; a<S->nseq; a++) sprintf ( (subS[0])->name[a], "%s", S->name[a]);
+
+
+ return 0;
+}
+
+int ***simple_sar_analyze_vot ( Alignment *A, Alignment *SAR, char *mode)
+{
+ int a, b, c, d;
+ int res1, res2, sar1, sar2;
+ float s;
+ int **sim;
+ static float ***result;
+ static int ***iresult;
+ if (!result)
+ {
+ result=declare_arrayN (3,sizeof (float),SAR->nseq, A->len_aln,3);
+ iresult=declare_arrayN (3,sizeof (int),SAR->nseq, A->len_aln,3);
+ }
+
+ sim=aln2sim_mat (A, "idmat");
+
+
+ for (a=0; a<SAR->nseq; a++)
+ for (b=0; b<A->len_aln; b++)
+ result[a][b][0]=1;
+
+ for ( a=0; a< SAR->nseq; a++)
+ for ( b=0; b<A->nseq-1; b++)
+ for ( c=b+1; c< A->nseq; c++)
+ for ( d=0; d<A->len_aln; d++)
+ {
+ res1=A->seq_al[b][d];
+ res2=A->seq_al[c][d];
+
+ sar1=(SAR->seq_al[a][b]=='I')?1:0;
+ sar2=(SAR->seq_al[a][c]=='I')?1:0;
+
+ s=sim[b][c];
+
+
+
+
+ if ( sar1!=sar2 && res1!=res2)
+ result[a][d][0]*=(1/(100-s));
+
+ else if ( sar1==sar2 && sar1==1 && res1==res2)
+ result[a][d][0]*=1/s;
+
+
+
+
+ /*
+ else if ( sar1==sar2 && res1==res2)result[a][d][0]+=(100-s)*(100-s);
+ else if ( sar1==sar2 && res1!=res2)result[a][d][0]-=s*s;
+ else if ( sar1!=sar2 && res1==res2)result[a][d][0]-=(100-s)*(100-s);
+ */
+
+ result[a][d][1]='a';
+ }
+ for ( a=0; a<SAR->nseq; a++)
+ for ( b=0; b<A->len_aln; b++)
+ {
+ fprintf ( stderr, "\n%f", result[a][b][0]);
+ iresult[a][b][0]=100*log(1-result[a][b][0]);
+ }
+ return iresult;
+}
+int display_simple_sar_analyze_pair_col (Alignment *A, Alignment *SAR, char *mode)
+{
+ int **r;
+ int a, b, n, do_tm;
+
+
+ Alignment *rA;
+ int *nI;
+
+
+ strget_param (mode, "_TM_", "0", "%d", &do_tm);
+ r=simple_sar_analyze_pair_col (A, SAR,mode);
+ rA=rotate_aln (A, NULL);
+ n=0;
+
+ nI=vcalloc ( SAR->nseq, sizeof (int));
+ for (a=0; a< SAR->nseq; a++)
+ for (b=0; b<SAR->len_aln; b++) nI[a]+=(SAR->seq_al[a][b]=='I')?1:0;
+
+
+
+ while ( r[n][0]!=-1)
+ {
+ if (r[n][3]>0)
+ {fprintf ( stdout, "COMP S: %3d %3d %s %20s %2d #\n", r[n][3],0,SAR->seq_al[r[n][0]], SAR->name[r[n][0]], nI[r[n][0]]);
+ fprintf ( stdout, "SEQ1 S: %3d %3d %s %20s %2d #\n", r[n][3],r[n][1],rA->seq_al[r[n][1]], SAR->name[r[n][0]],nI[r[n][0]]);
+ fprintf ( stdout, "SEQ2 S: %3d %3d %s %20s %2d #\n\n", r[n][3],r[n][2],rA->seq_al[r[n][2]], SAR->name[r[n][0]],nI[r[n][0]]);
+ }
+ n++;
+ }
+ return 0;
+}
+int display_simple_sar_analyze_col (Alignment *A, Alignment *SAR, char *mode)
+{
+ int ***result, **r2, **r3, **r4, **aa;
+ int a, b, c, n;
+ char *cons;
+ int threshold=20;
+ int do_tm;
+ strget_param (mode, "_TM_", "0", "%d", &do_tm);
+ result=simple_sar_analyze_col (A, SAR,mode);
+ r2=declare_int (A->len_aln*SAR->nseq, 5);
+ r3=declare_int (A->len_aln+1, 5);
+ r4=declare_int (A->len_aln+1, SAR->nseq+1);
+ aa=declare_int (2, 256);
+ cons=vcalloc (A->len_aln+1, sizeof (char));
+ for (a=0; a<A->len_aln; a++){r3[a][0]=a;cons[a]='A';}
+
+
+
+ for (n=0,a=0; a< SAR->nseq; a++)
+ {
+ double sum, sum2;
+ for (sum=0, sum2=0,b=0; b<A->len_aln; b++)
+ {
+ sum+=result[a][b][0];
+ sum2+=result[a][b][0]*result[a][b][0];
+ }
+
+ for (b=0; b<A->len_aln; b++, n++)
+ {
+ r2[n][0]=a;//compound
+ r2[n][1]=b;//pos
+ r2[n][2]=result[a][b][1]; //AA
+ r2[n][3]=result[a][b][0]; //Score
+ r2[n][4]=result[a][b][2]; //(int)10*return_z_score ((double)result[a][b][0], sum, sum2, A->len_aln); //ZScore
+ }
+ }
+ sort_int (r2,5, 3, 0, n-1);//sort on Score (3rd field)
+ for ( a=0; a< n; a++)
+ {
+ int comp, pos, bad;
+
+ comp=r2[a][0];
+ pos=r2[a][1];
+ fprintf ( stdout, "SEQ %5d %5d %5d %s ",r2[a][1]+1,r2[a][3], r2[a][4], (do_tm)?alnpos2hmmtop_pred (A, NULL, r2[a][1], SHORT):"x");
+ for (c=0; c<A->nseq; c++)fprintf (stdout, "%c", A->seq_al[c][r2[a][1]]);
+
+
+
+ bad=0;
+ for (c=0; c< A->nseq; c++)
+ {
+ int activity, res;
+
+ activity=SAR->seq_al[comp][c];
+ res=A->seq_al[c][pos];
+
+ if (activity=='O')aa[0][res]++;
+ if (activity=='I')aa[1][res]++;
+ }
+
+ for (c=0; c< A->nseq; c++)
+ {
+ int activity, res;
+ activity=SAR->seq_al[comp][c];
+ res=A->seq_al[c][pos];
+ bad+=(aa[0][res] && aa[1][res])?1:0;
+ aa[0][res]=aa[1][res]=0;
+ }
+ fprintf ( stdout, " %20s %d |\nCOM %5d %5d %5d %s %s %20s %d |\n\n", SAR->name[r2[a][0]],bad,r2[a][1]+1,r2[a][3],r2[a][4], (do_tm)?alnpos2hmmtop_pred (A, NULL, r2[a][1], SHORT):"x",SAR->seq_al[r2[a][0]], SAR->name[r2[a][0]], bad);
+
+
+ if (r2[a][4]>threshold)
+ {
+ cons[r2[a][1]]++;
+ r3[r2[a][1]][1]++;
+ r3[r2[a][1]][2]+=r2[a][3];
+ r4[r2[a][1]][r2[a][0]]=1;
+ }
+ }
+ sort_int (r3, 3,1,0, A->len_aln-1);
+
+ for (a=0; a<A->len_aln; a++)
+ {
+ if (r3[a][1]>0)
+ {
+ fprintf ( stdout, "\nPOS %4d %4d %4d %c ", r3[a][0]+1, r3[a][1], r3[a][2], cons[r3[a][0]]);
+ for (b=0; b<SAR->nseq; b++)fprintf ( stdout, "%d", r4[r3[a][0]][b]);
+ if (do_tm)fprintf ( stdout, " %s",alnpos2hmmtop_pred (A, NULL, r3[a][0], VERBOSE));
+ }
+ }
+ for (a=0; a< A->nseq; a++)fprintf ( stdout, "\n#MSA >%s\n#MSA %s",A->name[a], A->seq_al[a]);
+ fprintf ( stdout, "\n#MSA >cons\n#MSA %s", cons);
+
+ return 0;
+}
+int *** simple_sar_predict (Alignment *A, Alignment *SAR, char *mode)
+{
+ //This function estimates the z score of every poition with every compound
+ //The best Z-score position is then used for the prediction
+
+ int a, b, c, nts, pos, Rscore,Zscore;
+ int ***r;
+ int ***pred;
+ int **aa;
+
+
+ aa=declare_int (2,256);
+ pred=declare_arrayN (3, sizeof (int),SAR->nseq, A->nseq, 5);
+
+
+ r=simple_sar_analyze_col (A, SAR, mode);
+ nts=SAR->len_aln; //number_trainning_sequences;
+
+ for (a=0; a<SAR->nseq; a++)
+ {
+ sort_int (r[a],4, 2, 0, A->len_aln-1);
+
+ pos=r[a][A->len_aln-1][3]; //Best Position
+ Zscore=r[a][A->len_aln-1][2]; //Best Z-Score
+ Rscore=r[a][A->len_aln-1][0]; //Best Z-Score
+
+
+ for (c=0; c<nts; c++)
+ {
+
+ if (SAR->seq_al[a][c]=='I')aa[1][(int)A->seq_al[c][pos]]++;//Build Positive Alphabet for Compound a
+ if (SAR->seq_al[a][c]=='O')aa[0][(int)A->seq_al[c][pos]]++;//Build Positive Alphabet for Compound a
+ }
+ for (c=nts; c<A->nseq; c++)
+ {
+ pred[a][c][0]=pos;
+ pred[a][c][1]=Zscore;
+ pred[a][c][2]=Rscore;
+ if (aa[1][(int)A->seq_al[c][pos]]>0)
+ {
+
+ pred[a][c][3]=aa[1][(int)A->seq_al[c][pos]];
+ pred[a][c][4]=aa[0][(int)A->seq_al[c][pos]];
+ }
+ }
+ for (c=0; c<nts; c++)aa[0][(int)A->seq_al[c][pos]]=aa[1][(int)A->seq_al[c][pos]]=0;
+ }
+
+ for ( a=nts; a< A->nseq; a++)
+ {
+ for ( b=0; b<SAR->nseq; b++)
+ {
+ fprintf ( stdout, ">%-25s %-25s Pos %3d ZScore %3d Rscore %3d Activity +: %d -: %d ", A->name [a], SAR->name[b], pred[b][a][0],pred[b][a][1], pred[b][a][2], pred[b][a][3], pred[b][a][4]);
+ if (pred[b][a][4]==0)for (c=0; c<pred[b][a][3]; c++)fprintf ( stdout, "*");
+ fprintf ( stdout, "\n" );
+ for (c=0; c<A->nseq; c++)fprintf ( stdout, "%c", A->seq_al[c][ pred[b][a][0]]);
+ fprintf ( stdout, " %s\n", SAR->name[b]);
+ for (c=0; c<A->nseq-1; c++)fprintf ( stdout, "%c", SAR->seq_al[b][c]);
+ fprintf ( stdout, " %s\n", SAR->name[b]); fprintf ( stdout, "\n");
+
+ }
+ }
+ return pred;
+}
+int *pair_seq2seq (int *iseq, char *seq1, char *seq2);
+int **simple_sar_analyze_pair_col ( Alignment *inA, Alignment *SAR, char *mode)
+{
+
+ int a, b, c, n, n2;
+ int *iseq=NULL;
+ static int **result, **fresult;
+ int sar_mode=1;
+ int maxgapratio=0;
+ int nresults=10;
+ double sum, sum2, score;
+ Alignment *A;
+ char aa;
+
+ if (!result)
+ {
+ result=declare_int (inA->len_aln*inA->len_aln,5);
+
+ fresult=declare_int (inA->len_aln*nresults*SAR->nseq, 5);
+
+ }
+
+ A=rotate_aln (inA, NULL);
+
+
+ for (n2=0,a=0; a<SAR->nseq; a++)
+ {
+
+ for (n=0, sum=0, sum2=0,b=0; b<A->nseq-1; b++)
+ {
+ for ( c=b+1; c<A->nseq; c++, n++)
+ {
+
+ iseq=pair_seq2seq (iseq,A->seq_al[b], A->seq_al[c]);
+ if ( sar_mode==1)
+ score=sar_vs_iseq1(SAR->seq_al[a],iseq,maxgapratio,NULL,&aa);
+ else if (sar_mode==4)
+ score=sar_vs_iseq4(SAR->seq_al[a],iseq,maxgapratio,NULL,&aa);
+ //HERE ("%d", (int)score);
+ result[n][0]=a;//compound;
+ result[n][1]=b; //pos1
+ result[n][2]=c; //pos2
+ result[n][3]=(int)score;
+
+ sum+=score;
+ sum2+=score*score;
+ }
+ }
+ for (b=0; b<n; b++)
+ result[c][4]=(int)10*return_z_score ((double)result[c][3], sum, sum2,n); //ZScore
+ sort_int (result,5,3,0,n-1);
+ for ( b=n-nresults; b<n; b++, n2++)
+ {
+
+ for ( c=0; c<5; c++)fresult[n2][c]=result[b][c];
+ }
+ }
+ sort_int (fresult,5,3,0,n2-1);
+ fresult[n2][0]=-1; //Terminator;
+
+ return fresult;
+}
+
+int *pair_seq2seq (int *iseq, char *seq1, char *seq2)
+{
+ static int **lu;
+ int a;
+ if (!lu)
+ {
+ int a, b, n=0;
+ lu=declare_int (256,256);
+ for ( n=a=0; a<256; a++)
+ for (b=0; b<256; b++)
+ lu[a][b]=n++;
+ }
+
+ if (!iseq)iseq=vcalloc ( strlen (seq1)+1, sizeof (int));
+ for ( a=0; a< strlen (seq1); a++)
+ {
+ if (is_gap(seq1[a]) || is_gap(seq2[a]))iseq[a]=-1;
+ else iseq[a]=lu[(int)seq1[a]][(int)seq2[a]];
+ }
+
+ return iseq;
+}
+int ***simple_sar_analyze_col ( Alignment *inA, Alignment *SAR, char *mode)
+{
+ Alignment *A;
+ double score=0, best_score=0;
+ int best_pos=0;
+ int a, b;
+
+ static int ***result;
+ int **sim;
+ char aa;
+ int sar_mode=3;
+ int maxgapratio=0;
+ double sum, sum2;
+
+
+ strget_param (mode, "_METHOD_", "3", "%d", &sar_mode);
+ if (!result)
+ result=declare_arrayN (3,sizeof (int),SAR->nseq, inA->len_aln,4);
+
+
+ sim=aln2sim_mat (inA, "idmat");
+ A=rotate_aln (inA, NULL);
+
+
+ for ( a=0; a<SAR->nseq; a++)
+ {
+ best_pos=best_score=0;
+ for ( sum=0, sum2=0,b=0; b<A->nseq; b++)
+ {
+
+ if ( sar_mode==1)
+ score=sar_vs_seq1(SAR->seq_al[a], A->seq_al[b],maxgapratio, sim, &aa);
+ else if ( sar_mode==2)
+ score=sar_vs_seq2(SAR->seq_al[a], A->seq_al[b],maxgapratio, sim, &aa);
+ else if (sar_mode ==3)
+ score=sar_vs_seq3(SAR->seq_al[a], A->seq_al[b],maxgapratio, sim, &aa);
+ else if (sar_mode ==4)
+ score=sar_vs_seq4(SAR->seq_al[a], A->seq_al[b],maxgapratio, sim, &aa);
+ else if (sar_mode ==5)
+ score=sar_vs_seq5(SAR->seq_al[a], A->seq_al[b],maxgapratio, sim, &aa);
+
+
+ result[a][b][0]+=score*10;
+ result[a][b][1]=aa;
+ result[a][b][3]=b;
+ sum+=result[a][b][0];
+ sum2+=result[a][b][0]*result[a][b][0];
+
+ }
+ for ( b=0; b< A->nseq; b++)result[a][b][2]=10*return_z_score ((double)result[a][b][0], sum, sum2, A->nseq); //Score
+ }
+
+ return result;
+
+ }
+int *seq2iseq ( char *seq);
+double sar_vs_seq4 ( char *sar, char *seq, float gl, int **sim, char *best_aa)
+{
+
+ return sar_vs_iseq4 (sar, seq2iseq(seq), gl, sim, best_aa);
+}
+double sar_vs_seq1 ( char *sar, char *seq, float gl, int **sim, char *best_aa)
+{
+
+ return sar_vs_iseq1 (sar, seq2iseq(seq), gl, sim, best_aa);
+}
+
+int *seq2iseq ( char *seq)
+{
+ static int *iseq, clen;
+ int a;
+
+ if (!iseq || clen<strlen (seq))
+ {
+ clen=strlen (seq);
+ vfree (iseq);
+ iseq=vcalloc (clen+1, sizeof (int));
+ }
+ for (a=0; a<strlen (seq); a++)
+ {
+ iseq[a]=seq[a];
+ }
+ return iseq;
+}
+
+double sar_vs_iseq1 ( char *sar, int *seq, float gl, int **sim, char *best_aa)
+{
+ double score=0, return_score=0;
+ int RN,N11, Nmsa, Nsar, N, N10, N01, N00;
+ int a, b, r, s, res, res1;
+ double Ng=0;
+ static int *aa, *aal, naa;
+
+ /*measure the E-Value for every amino acid. Returns the best one*/
+
+
+
+ N=strlen (sar);
+ for (a=0; a<N; a++)
+ Ng+=(is_gap(seq[a])|| seq[a]==-1);
+ Ng/=N;
+
+ if (Ng>gl) return 0;
+
+ if (!aa)
+ {
+ aa=vcalloc (256*256, sizeof(int));
+ aal=vcalloc (N, sizeof (int));
+ }
+ naa=0;
+ for ( a=0; a<N; a++)
+ {
+ if (!aa[seq[a]])
+ {
+ aal[naa++]=seq[a];
+ aa[seq[a]]=1;
+ }
+ }
+
+ best_aa[0]='-';
+ for (a=0; a<naa; a++)
+ {
+
+ res=aal[a];
+
+ if (res==-1 || !aa[res]);
+ else
+ {
+ RN=Nmsa=Nsar=N11=N10=N01=N00=0;
+
+ for (b=0; b<N; b++)
+ {
+
+ res1=seq[b];
+ if (res1=='-' || res1==-1)r=0;
+ else r=(res1==res)?1:0;
+
+
+ s=(sar[b]=='I')?1:0;
+
+ Nmsa+=r; Nsar+=s;
+ N11+=(r && s)?1:0;
+ N01+=(!r &&s)?1:0;
+ N10+=(r && !s)?1:0;
+ N00+=(!r && !s)?1:0;
+ RN++;
+ }
+
+ if (N11)
+ {
+ score=evaluate_sar_score1 ( RN, N11, Nmsa, Nsar);
+
+ }
+ else
+ {
+ score=0;
+ }
+
+ if ( score>return_score)
+ {
+ best_aa[0]=res;
+ return_score=score;
+ }
+ }
+ }
+ for ( a=0; a<N; a++)aa[seq[a]]=0;
+
+ return return_score;
+}
+double sar_vs_seq5 ( char *sar, char *seq, float gl, int **sim, char *best_aa)
+{
+ double score=0;
+ int N11, Nmsa, Nsar, N, N10, N01, N00;
+ int a, b, r, s;
+ double Ng=0;
+ int *aa;
+
+ //measure the E-Value if all the 1AA are considered like alphabet 1*/
+ //This function is testing ONE Compound (sar) against one column of MSA (seq) containing N sequences
+ //Returns 0 if the active and inactive alphabet overlap
+ N=strlen (sar);
+ //N is the nmber of sequences
+
+ for (a=0; a<N; a++)
+ Ng+=is_gap(seq[a]);
+ Ng/=N;
+
+ if (Ng>gl) return 0;
+
+ //Identify all the AA associated with a I (Positive alphabet)
+ aa=vcalloc ( 256, sizeof (int));
+ for (b=0; b<N; b++)
+ {
+ s=(sar[b]=='I')?1:0;
+ if (s)aa[(int)seq[b]]=1;
+ }
+ N11=N10=N01=N00=Nmsa=Nsar=0;
+
+
+ for (b=0; b<N; b++)
+ {
+
+ r=aa[(int)seq[b]];
+ s=(sar[b]=='I')?1:0;
+
+ Nmsa+=r; Nsar+=s;
+ N11+=(r && s)?1:0;
+ N01+=(!r &&s)?1:0;
+ N10+=(r && !s)?1:0;
+ N00+=(!r && !s)?1:0;
+ }
+ if (N10>=1 || N01>=1) return 0;
+ if (N11)
+ {
+ score=evaluate_sar_score1 ( N, N11, Nmsa, Nsar);
+ }
+ else score=0;
+
+ vfree (aa);
+ return score;
+
+}
+
+
+
+double sar_vs_iseq4 ( char *sar, int *seq, float gl, int **sim, char *best_aa)
+{
+ int N, Ni, No;
+ int a, b,c, r, s;
+ double Ng=0;
+ static int **aa;
+
+ /*Correlation between AA conservation and Activity*/
+
+ N=strlen (sar);
+ for (a=0; a<N; a++)
+ Ng+=(is_gap(seq[a]) || seq[a]==-1)?1:0;
+ Ng/=N;
+ if (gl<1)Ng*=100;
+
+ if (Ng>gl) return 0;
+
+
+ if (!aa)aa=declare_int(2,257*257);
+ for (No=Ni=b=0; b<N; b++)
+ {
+
+ s=(sar[b]=='I')?1:0;
+ if (s){aa[1][seq[b]]++;Ni++;}
+ else {aa[0][seq[b]]++;No++;}
+ }
+ for (r=0,b=0; b<N; b++)
+ {
+ if (aa[1][(int) seq[b]]==Ni)
+ {
+ r=(No-aa[0][(int)seq [b]])*100/No;
+ break;
+ }
+
+ }
+ for (c=0; c<N; c++)aa[0][seq[c]]=aa[1][seq[c]]=0;
+
+ return r;
+}
+
+double sar_vs_seq3 ( char *sar, char *seq, float gl, int **sim, char *best_aa)
+{
+ double score=0;
+ int N11, Nmsa, Nsar, N, N10, N01, N00;
+ int a, b, r, s;
+ double Ng=0;
+ int *aa;
+
+ //measure the E-Value if all the 1AA are considered like alphabet 1*/
+ //This function is testing ONE Compound (sar) against one column of MSA (seq) containing N sequences
+
+ N=strlen (sar);
+ //N is the nmber of sequences
+
+ for (a=0; a<N; a++)
+ Ng+=is_gap(seq[a]);
+ Ng/=N;
+
+ if (Ng>gl) return 0;
+
+ //Identify all the AA associated with a I (Positive alphabet)
+ aa=vcalloc ( 256, sizeof (int));
+ for (b=0; b<N; b++)
+ {
+ s=(sar[b]=='I')?1:0;
+ if (s)aa[(int)seq[b]]=1;
+ }
+ N11=N10=N01=N00=Nmsa=Nsar=0;
+
+
+ for (b=0; b<N; b++)
+ {
+
+ r=aa[(int)seq[b]];
+ s=(sar[b]=='I')?1:0;
+
+ Nmsa+=r; Nsar+=s;
+ N11+=(r && s)?1:0;
+ N01+=(!r &&s)?1:0;
+ N10+=(r && !s)?1:0;
+ N00+=(!r && !s)?1:0;
+ }
+
+ if (N11)
+ {
+ score=evaluate_sar_score1 ( N, N11, Nmsa, Nsar);
+ }
+ else score=0;
+
+ vfree (aa);
+ return score;
+
+}
+
+double sar_vs_seq2 ( char *sar, char *seq, float gl, int **sim_mat, char *best_aa)
+{
+ double score=0, return_score=0;
+ int L,N11, Nmsa, Nsar,N10, N01, N;
+ int a, b,c,d, r1, s1,r2, s2, res;
+ double Ng=0;
+ int sim, diff, w;
+ char string[5];
+
+ /*Weighted E-Value Similarity*/
+ L=strlen (sar);
+ for (a=0; a<L; a++)
+ Ng+=is_gap(seq[a]);
+ Ng/=L;
+
+ if (Ng>gl) return 0;
+ for (a=0; a<26; a++)
+ {
+
+ N=Nmsa=Nsar=N11=N10=N01=0;
+ res='a'+a;
+ for (d=0,b=0; b<L; b++)d+=((tolower(seq[b]))==res)?1:0;
+ if ( d==0) continue;
+
+ for (b=0; b<L; b++)
+ {
+ r1=(tolower(seq[b])==res)?1:0;
+ s1=(sar[b]=='I')?1:0;
+ for ( c=0; c<L; c++)
+ {
+ r2=(tolower(seq[c])==res)?1:0;
+ s2=(sar[c]=='I')?1:0;
+
+ sprintf ( string, "%d%d%d%d", r1,s1, r2, s2);
+ sim= sim_mat[b][c]/10;
+ diff=10-sim;
+
+ if (strm (string, "0000")) {w=diff;N+=2*w;}
+ else if ( strm (string, "0011")){w=sim ;N+=2*w ; N11+=w ;N10+=0 ;N01+=w ;Nmsa+=w ;Nsar+=w;}
+ else if ( strm (string, "1010")){w=diff;N+=2*w ; N11+=0 ;N10+=2*w ;N01+=0 ;Nmsa+=2*w ;Nsar+=0;}
+ else if ( strm (string, "0101")){w=diff;N+=2*w; N11+=0 ;N10+=0 ;N01+=2*w ;Nmsa+=0 ;Nsar+=2*w;}
+ else if ( strm (string, "1111")){w=diff;N+=2*w; N11+=2*w;N10+=0 ;N01+=0 ;Nmsa+=2*w ;Nsar+=2*w;}
+ else if ( strm (string, "1001")){w=sim; N+=2*w; N11+=0 ;N10+=w ;N01+=w ;Nmsa+=w;Nsar+=w;}
+ else if ( strm (string, "0110")){w=sim; N+=2*w; N11+=0 ;N10+=w ;N01+=w ;Nmsa+=w;Nsar+=w;}
+ }
+ }
+ if (N11)
+ {
+
+ score=evaluate_sar_score1 ( N, N11, Nmsa, Nsar);
+ }
+ return_score=MAX(return_score, score);
+ }
+ if ( return_score <0)fprintf ( stderr, "\n%.2f", return_score);
+ return return_score;
+}
+
+float get_sar_sim (char *seq1, char *seq2)
+{
+ int a, l, s, r;
+ int n11=0, n10=0, n01=0, n00=0;
+
+
+ l=strlen (seq1);
+ for ( a=0; a<l; a++)
+ {
+ s=(seq1[a]=='O')?0:1;
+ r=(seq2[a]=='O')?0:1;
+
+ n00+=(!s && !r)?1:0;
+ n11+=(s && r)?1:0;
+ n01+=(!s && r)?1:0;
+ n10+=(s && !r)?1:0;
+ }
+ if ( n11==0) return 0;
+ else return ((float)(n11)*100)/(float)(n11+n10+n01);
+}
+float get_sar_sim2 (char *seq1, char *seq2)
+{
+ int a, l, s, r;
+ int n11=0, n10=0, n01=0, n00=0, Ns=0, Nr=0;
+ float score1;
+
+ l=strlen (seq1);
+ for ( a=0; a<l; a++)
+ {
+ s=(seq1[a]=='O')?0:1;
+ r=(seq2[a]=='O')?0:1;
+
+ Ns+=s;
+ Nr+=r;
+
+ n00+=(!s && !r)?1:0;
+ n11+=(s && r)?1:0;
+ n01+=(!s && r)?1:0;
+ n10+=(s && !r)?1:0;
+ }
+ if ( n11==0) return 0;
+
+
+
+
+ score1=evaluate_sar_score1 (l, n11, Ns, Nr);
+
+ return score1;
+}
+float sar_aln2cor (Alignment *A);
+int sar_aln2ev (Alignment *A);
+
+int sarseq2ev ( char *s1, char *s2, int MODE);
+char* sarseq2anti_sarseq (char *seq_in, char *seq_out);
+Sequence * compare_sar_sequence( Sequence *S1, Sequence *S2, int depth)
+{
+ int a, b, c, d, e, f;
+ char *s1, *s2, *s3,*s4,*s5, *n1, *n2, *n3, *n4, *n5;
+ int max=0;;
+ int max_depth=5;
+ int score;
+ char *a1, *a0, *a2, *a3, *a4;
+ Alignment *A;
+
+ if ( depth>max_depth)
+ {
+ printf_exit (EXIT_FAILURE, stderr,"maximum depth: %d", max_depth);
+ }
+ if ( depth==0) depth=2;
+ A=declare_aln2 (strlen (S1->seq[0]),depth);
+ a0=A->seq_al[0];
+ a1=A->seq_al[1];
+ A->len_aln=strlen (S1->seq[0]);
+
+ for (a=0; a< S1->nseq; a++)
+ for ( b=0; b<S2->nseq; b++)
+ {
+ A->nseq=2;
+ sprintf (a0, "%s", S1->seq[a]);
+ sprintf (a1, "%s", S2->seq[b]);
+
+ if ( strlen (a0)!=strlen (a1))
+ {
+ add_warning (stderr, "WARNING %s (%d) and %s (%d) do not have the same length", S1->name[a], strlen (S1->seq[a]), S2->name[b], strlen (S2->seq[b]));
+ myexit (EXIT_FAILURE);
+ }
+
+ fprintf ( stdout, ">2 %15s %15s CORR: %.3f EVAL: %5d\n",S1->name[a], S2->name[b], sar_aln2cor (A), sar_aln2ev (A));
+ sarseq2anti_sarseq (S1->seq[a],a0);
+ fprintf ( stdout, ">2 %15s %15s ANTI: %.3f EVAL: %5d\n", S1->name[a], S2->name[b], sar_aln2cor (A), sar_aln2ev (A));
+ if ( depth >=3)
+ {
+ A->nseq=3;
+ a2=A->seq_al[2];
+ for (c=b+1; c<S2->nseq; c++)
+ {
+ sprintf (a0, "%s", S1->seq[a]);
+ sprintf (a1, "%s", S2->seq[b]);
+ sprintf (a2, "%s", S2->seq[c]);
+ fprintf ( stdout, ">2 %15s %15s %15s CORR: %.3f EVAL: %5d\n",S1->name[a], S2->name[b],S2->name[c], sar_aln2cor (A), sar_aln2ev (A));
+ sarseq2anti_sarseq (S1->seq[a],a0);
+ fprintf ( stdout, ">2 %15s %15s %15s ANTI: %.3f EVAL: %5d\n", S1->name[a], S2->name[b], S2->name[c],sar_aln2cor (A), sar_aln2ev (A));
+ }
+ if ( depth>=4)
+ {
+ A->nseq=4;
+ a3=A->seq_al[2];
+ for (d=c+1; d<S2->nseq; d++)
+ {
+ sprintf (a0, "%s", S1->seq[a]);
+ sprintf (a1, "%s", S2->seq[b]);
+ sprintf (a2, "%s", S2->seq[c]);
+ sprintf (a3, "%s", S2->seq[d]);
+
+ fprintf ( stdout, ">2 %15s %15s %15s %15s CORR: %.3f EVAL: %5d\n",S1->name[a], S2->name[b],S2->name[c],S2->name[d], sar_aln2cor (A), sar_aln2ev (A));
+ sarseq2anti_sarseq (S1->seq[a],a0);
+ fprintf ( stdout, ">2 %15s %15s %15s %15s ANTI: %.3f EVAL: %5d\n", S1->name[a], S2->name[b], S2->name[c],S2->name[d],sar_aln2cor (A), sar_aln2ev (A));
+ }
+ if (depth>=5)
+ {
+ A->nseq=5;
+ a4=A->seq_al[3];
+ for (e=d+1; e<S2->nseq; e++)
+ {
+ sprintf (a0, "%s", S1->seq[a]);
+ sprintf (a1, "%s", S2->seq[b]);
+ sprintf (a2, "%s", S2->seq[c]);
+ sprintf (a3, "%s", S2->seq[d]);
+ sprintf (a4, "%s", S2->seq[d]);
+ }
+ }
+ }
+ }
+ }
+
+ return S1;
+}
+char* sarseq2anti_sarseq (char *seq_in, char *seq_out)
+{
+ int a;
+ if (!seq_out)seq_out=vcalloc (strlen (seq_in)+1, sizeof (char));
+ for (a=0; a<strlen (seq_in); a++)seq_out[a]=(seq_in[a]=='I')?'O':'I';
+ return seq_out;
+}
+float sar_aln2cor (Alignment *A)
+{
+ float n1, n11, tot_cor;
+ int a, b, c,n;
+
+ tot_cor=0;
+ for (n=0,a=0; a<A->nseq-1; a++)
+ for (b=a+1; b<A->nseq; b++)
+ {
+ for (n11=n1=0,c=0; c<A->len_aln; c++)
+ {
+ n11+=(A->seq_al[a][c]=='I' && A->seq_al[b][c]=='I');
+ n1+= (A->seq_al[a][c]=='I' || A->seq_al[b][c]=='I');
+ }
+ tot_cor+=(n1==0)?0:n11/n1;
+ n++;
+ }
+ tot_cor/=n;
+ return tot_cor;
+}
+int sarseq_pair2ev ( char *s1, char *s2,int mode);
+int sar_aln2ev (Alignment *A)
+{
+ float n1, n11;
+ int a, b, c, tot=0, n=0;
+
+ tot=0;
+ for (a=0; a<A->nseq-1; a++)
+ for (b=a+1; b<A->nseq; b++)
+ {
+ tot+=sarseq_pair2ev (A->seq_al[a], A->seq_al[b], 1);
+ n++;
+ }
+ return tot;
+}
+int sarseq_pair2ev ( char *s1, char *s2,int mode)
+{
+ int l, t1, t2, t11,a, n1, n2, s;
+ if ( (l=strlen (s1))!=strlen (s2))
+ {
+ return -1;
+ }
+ if (mode==2)
+ {
+ t1=l/2;
+ t2=l/2;
+ t11=l/2;
+ }
+ else
+ {
+ for (t1=t2=t11=0,a=0; a<l; a++)
+ {
+ if ( mode==1)//CORRELATED
+ {
+ n1=(s1[a]=='I')?1:0;
+ n2=(s2[a]=='I')?1:0;
+ }
+ else if ( mode==0)//ANTICORRELATED
+ {
+ n1=(s1[a]=='I')?1:0;
+ n2=(s2[a]=='O')?1:0;
+ }
+ t1+=n1;
+ t2+=n2;
+
+ t11+=(n1&&n2)?1:0;
+ }
+ }
+
+
+ s=(int)(100*evaluate_sar_score1 (l, t11, t1, t2));
+ return s;
+}
+
+double evaluate_sar_score1 ( int N, int n11, int n1msa, int n1sar)
+{
+ double p, p1, p2;
+ int n10, n01;
+
+ n10=n1msa-n11;
+ n01=n1sar-n11;
+
+ if ( n11==0)return 0;
+ // if ( (n10)>n11 || n01>n11)return 0;
+
+ p1= M_chooses_Nlog (n1msa, N) + M_chooses_Nlog (n1sar-n11, N-n1msa) + M_chooses_Nlog (n11, n1msa);
+ p2=(M_chooses_Nlog (n1msa, N)+ M_chooses_Nlog (n1sar, N));
+ p=(p1-p2);
+
+ return -p;
+
+}
+double evaluate_sar_score2 ( int N, int n11, int n1msa, int n1sar)
+{
+
+
+ return n11-((n1msa-n11)+(n1sar-n11));
+
+ if ( n11<n1msa) return 0;
+ else if ( n11<n1sar) return 0;
+ else if ( n11==N)return 0;
+ return n11;
+}
+
+
+int benchmark_sar( int value)
+{
+ static int v[1000];
+ static int a;
+
+ if (a==0)
+ {
+ for (a=0; a< 1000; a++)v[a]=0;
+ v[2]=1;
+ v[3]=2;
+ v[6]=2;
+ v[7]=1;
+ v[8]=2;
+ v[9]=1;
+ v[10]=1;
+ v[11]=1;
+ v[12]=2;
+ v[30]=2;
+ v[31]=1;
+ v[32]=2;
+ v[33]=1;
+ v[34]=2;
+ v[35]=1;
+ v[36]=1;
+ v[37]=2;
+ v[43]=2;
+ v[44]=1;
+ v[45]=2;
+ v[73]=2;
+ v[74]=1;
+ v[75]=1;
+ v[76]=2;
+ v[80]=2;
+ v[81]=1;
+ v[82]=2;
+ v[83]=1;
+ v[85]=2;
+ v[86]=1;
+ v[87]=1;
+ v[88]=2;
+ v[89]=2;
+ v[90]=1;
+ v[91]=2;
+ v[92]=1;
+ v[93]=2;
+ v[103]=2;
+ v[104]=1;
+ v[105]=1;
+ v[106]=1;
+ v[107]=2;
+ v[130]=2;
+ v[131]=1;
+ v[132]=2;
+ v[133]=1;
+ v[134]=1;
+ v[135]=1;
+ v[136]=2;
+ v[137]=1;
+ v[138]=2;
+ v[271]=2;
+ v[272]=1;
+ v[273]=2;
+ v[281]=2;
+ v[282]=1;
+ v[283]=2;
+ v[284]=1;
+ v[285]=1;
+ v[286]=1;
+ v[287]=2;
+ v[319]=2;
+ v[320]=1;
+ v[321]=1;
+ v[322]=1;
+ v[323]=1;
+ v[324]=2;
+ v[325]=1;
+ v[326]=2;
+ v[327]=1;
+ v[328]=2;
+ v[356]=2;
+ v[357]=1;
+ v[358]=1;
+ v[359]=2;
+ v[377]=2;
+ v[378]=1;
+ v[379]=2;
+ v[386]=3;
+ v[388]=2;
+ v[389]=1;
+ v[390]=1;
+ v[391]=1;
+ v[392]=2;
+ v[393]=2;
+ v[394]=2;
+ v[395]=1;
+ v[396]=1;
+ v[397]=2;
+ v[399]=2;
+ v[400]=1;
+ v[401]=2;
+ v[414]=2;
+ v[415]=1;
+ v[416]=2;
+ v[420]=2;
+ v[421]=1;
+ v[422]=1;
+ v[423]=1;
+ v[424]=2;
+ v[425]=1;
+ v[426]=2;
+ }
+ return v[value];
+}
+
+Alignment *weight2sar (Alignment *A, Alignment *SAR, char *weight_file, int limit)
+{
+ int a, b, c;
+ int ***weight;
+ char ***list;
+ float score;
+
+ weight=vcalloc (SAR->nseq, sizeof (int**));
+
+
+ list=file2list (weight_file, " ");
+
+ a=b=0;
+ for (a=0; a< SAR->nseq; a++)
+ {
+ b=c=0;
+ while (list[b])
+ {
+ if ( strm (list[b][1], SAR->name[a]) && atoi (list[b][3])>0)c++;
+ b++;
+ }
+
+ weight[a]=declare_int (c+1, 3);
+ fprintf ( stderr, "\n%s %d", SAR->name[a], c);
+ b=c=0;
+ while (list[b])
+ {
+ if ( strm (list[b][1], SAR->name[a]) && atoi (list[b][3])>0)
+ {
+ weight[a][c][0]=atoi(list[b][2])-1;
+ weight[a][c][1]=list[b][5][0];
+ weight[a][c][2]=atoi (list[b][3]);
+ c++;
+ }
+ b++;
+ }
+ weight[a][c][0]=-1;
+ }
+
+ for (a=0; a<A->nseq; a++)
+ {
+ fprintf ( stdout, ">%s\n", A->name[a]);
+ for ( b=0; b< SAR->nseq; b++)
+ {
+ score=seq2weighted_sar_score(A->seq_al[a], weight[b]);
+ fprintf ( stdout, "%c", (score>limit)?'I':'O');
+ }
+ fprintf (stdout, "\n");
+ }
+ myexit (EXIT_SUCCESS);
+ return A;
+}
+
+Alignment *display_sar ( Alignment *A, Alignment *SAR, char *compound)
+{
+ int a,c;
+ char name[100];
+
+ c=name_is_in_list ( compound, SAR->name, SAR->nseq, 100);
+ if ( c==-1)return A;
+
+ for ( a=0; a< A->nseq; a++)
+ {
+ sprintf (name, "%s", A->name[a]);
+ sprintf ( A->name[a], "%c_%s_%s", SAR->seq_al[c][a], name,compound);
+ }
+ return A;
+}
+Alignment *aln2weighted_sar_score ( Alignment *A,Alignment *SAR, char *weight_file, char *compound)
+{
+
+ int a, b, c=0;
+ int **weight;
+
+ int score;
+ char reactivity;
+ char ***list;
+
+
+ if ( SAR)
+ {
+ c=name_is_in_list (compound, SAR->name, SAR->nseq, 100);
+ }
+
+ list=file2list (weight_file, " ");
+ a=b=0;
+ while (list[a])
+ {
+ if (strm (list[a][1], compound))b++;
+ a++;
+ }
+ weight=declare_int ( b+1, 3);
+
+
+ a=b=0;
+ while (list[a])
+ {
+ if ( !strm (list[a][1], compound) || strm ("TOTPOS", list[a][1]));
+ else
+ {
+ weight[b][0]=atoi(list[a][2])-1;
+ weight[b][1]=list[a][5][0];
+ weight[b][2]=atoi(list[a][3]);
+ b++;
+ }
+ a++;
+ }
+ weight[b][0]=-1;
+ for ( a=0; a< A->nseq; a++)
+ {
+ score=seq2weighted_sar_score (A->seq_al[a], weight);
+ reactivity=(!SAR || c==-1)?'U':SAR->seq_al[c][a];
+
+ sprintf (A->seq_comment[a], "Compound %-15s Reactivity %c SAR_SCORE %5d", compound,reactivity, (int) score);
+
+ }
+ return A;
+}
+
+float seq2weighted_sar_score ( char *seq, int **weight)
+{
+ int a, p, r, w;
+ float score=0;
+
+ a=0;
+ while (weight[a][0]!=-1)
+ {
+ p=weight[a][0];
+ r=weight[a][1];
+ w=weight[a][2];
+
+ if ( is_gap(seq[p]));
+ else if ( tolower(seq[p])==r)score+=w;
+ a++;
+ }
+ return score;
+ }
+
+Alignment * sar2simpred (Alignment *A, Alignment *SAR, char *posfile, char *compound, int L1,int L2 )
+{
+ int a, b, c, c1, c2;
+ int **sim, **sim_ref, npred=0;
+ float n11, n10, n01, n00;
+ float sn, sp;
+
+ int tot_sim=0;
+ int N11=1, N01=2, N10=3, NXX=4, SIM=5;
+ float ***tot;
+ int i1, i2;
+
+
+ n11=n10=n01=n00=0;
+ tot=declare_arrayN(3,sizeof (float), 10, 6, 2);
+
+ sim_ref=aln2sim_mat (A, "idmat");
+ if (strm (posfile, "all"))
+ sim=sim_ref;
+ else
+ {
+ Alignment *B;
+ B=copy_aln ( A,NULL);
+ B=extract_aln3(B,posfile);
+
+ /*if (B->len_aln==0)L1=100;
+ else
+ L1=((B->len_aln-1)*100)/B->len_aln;
+
+ if (L1<=0)L1=100;
+ */
+ sim=aln2sim_mat (B, "idmat");
+ }
+
+ for (a=0; a< A->nseq-1; a++)
+ {
+ for ( b=a+1; b< A->nseq; b++)
+ {
+ for ( c=0; c<SAR->nseq; c++)
+ {
+ if ( (strm (compound, SAR->name[c]) || strm ( compound, "all")))
+ {
+ /*if ( sim_ref[a][b]<30 || sim_ref[a][b]>60)continue;*/
+ i1=0; /*sim_ref[a][b]/10;if (i1==10)i1--;*/
+
+ i2=sim[a][b];
+
+
+ c1=(SAR->seq_al[c][a]=='I')?1:0;
+ c2=(SAR->seq_al[c][b]=='I')?1:0;
+
+ n11=(c1 && c2)?1:0;
+ n01=(!c1 && c2)?1:0;
+ n10=(c1 && !c2)?1:0;
+ n00=(!c1 && !c2)?1:0;
+
+ tot[i1][N11][0]+=n11;
+ tot[i1][N01][0]+=n01;
+ tot[i1][N10][0]+=n10;
+ /*tot[i1][N00][0]+=n00;*/
+ tot[i1][NXX][0]++;
+ tot[i1][SIM][0]+=sim_ref[a][b];
+
+ if ( i2>=L1)
+ {
+ tot[i1][N11][1]+=n11;
+ tot[i1][N01][1]+=n01;
+ tot[i1][N10][1]+=n10;
+ /*tot[i1][N00][1]+=n00;*/
+ tot[i1][NXX][1]++;
+ tot[i1][SIM][1]+=sim_ref[a][b];
+ }
+ }
+ }
+ }
+ }
+
+ for (a=0; a<1; a++)
+ {
+ sp=(tot[a][N11][0])/(tot[a][N11][0]+tot[a][N10][0]);
+ fprintf ( stdout, "\n%15s N11 %5d SP %.2f ",compound, (int)tot[a][N11][0],sp);
+ sp=((tot[a][N11][1]+tot[a][N10][1])==0)?1:(tot[a][N11][1])/(tot[a][N11][1]+tot[a][N10][1]);
+ sn=(tot[a][N11][0]==0)?1:(tot[a][N11][1]/tot[a][N11][0]);
+ fprintf ( stdout, " N11 %5d SP %.2f SN %.2f SIM %.2f", (int)tot[a][N11][1], sp,sn, (tot[a][SIM][1]/tot[a][NXX][1]));
+ }
+
+ myexit (EXIT_FAILURE);
+ sp=((n11+n01)==0)?1:n11/(n11+n01);
+ sn=((n11+n01)==0)?1:n11/(n11+n10);
+
+ fprintf ( stdout, "\nLimit: %d NPRED %d AVGSIM %d SN %.2f SP %.2f TP %d FP %d FN %d",L1, npred, tot_sim, sn, sp, (int)n11, (int)n01, (int)n10);
+ myexit (EXIT_SUCCESS);
+ return A;
+}
+
+Alignment * sar2simpred2 (Alignment *A, Alignment *SAR, char *seqlist, char *posfile, char *compound, int L )
+{
+ int a,b, c,c1, c2, p, s;
+ float n11, n10, n01, n00, n, sn2, prediction,sp, n1, n0, t, entropy, Delta;
+ int *rlist, *tlist, *pred, *npred, tsim, psim;
+ int **sim, **sim_ref;
+ int nr=0;
+ int nrs;
+ char *out;
+ int delta_max;
+ Alignment *B;
+ int printall=1;
+
+ out=vcalloc (A->nseq+1, sizeof (char));
+ rlist=vcalloc ( A->nseq, sizeof (int));
+ tlist=vcalloc ( A->nseq, sizeof (int));
+ pred=vcalloc(2, sizeof (int));
+ npred=vcalloc(2, sizeof (int));
+
+ nrs=0;
+ if ( strm (seqlist, "first"))
+ {
+ for ( a=0; a<SAR->nseq; a++)
+ {
+ if ( strm ( compound, SAR->name[a]))
+ {
+ for ( b=0; b<A->nseq; b++)
+ {
+ if ( SAR->seq_al[a][b]=='I')
+ {
+ fprintf ( stderr, "COMP: %s REF SEQ: %s\n", A->name[b], compound);
+ rlist[nrs]=b;
+ tlist[rlist[nrs]]=1;
+ nrs++;
+ break;
+ }
+ }
+ }
+ }
+ }
+ else if (strm (seqlist, "all"))
+ {
+ for ( a=0; a< A->nseq; a++)
+ {
+ rlist[nrs]=a;
+ tlist[rlist[a]]=1;
+ nrs++;
+ }
+ }
+ else if ((a=name_is_in_list ( seqlist, A->name, A->nseq, 100))!=-1)
+ {
+ rlist[nrs]=a;
+ tlist[rlist[nrs]]=1;
+ nrs++;
+ }
+ else
+ {
+ Alignment *R;
+ R=main_read_aln (seqlist, NULL);
+ for (a=0; a<R->nseq; a++)
+ {
+ rlist[a]=name_is_in_list( R->name[a], A->name, A->nseq, 100);
+ tlist[rlist[a]]=1;
+ }
+ free_aln (R);
+ }
+
+ c=name_is_in_list ( compound, SAR->name, SAR->nseq, 100);
+
+ sim_ref=aln2sim_mat (A, "idmat");
+ if (strm (posfile, "all"))
+ {
+ sim=sim_ref;
+ B=A;
+ }
+ else
+ {
+ B=copy_aln ( A,NULL);
+ B=extract_aln3(B,posfile);
+ sim=aln2sim_mat (B, "idmat");
+ }
+
+ n11=n10=n01=n00=n=n1=n0=0;
+ delta_max=0;
+ for (a=0; a<A->nseq; a++)
+ {
+ if ( tlist[a] && !strm (seqlist, "all"))
+ out[a]=(SAR->seq_al[c][a]=='I')?'Z':'z';/*SAR->seq_al[c][a];*/
+ else
+ {
+
+ pred[0]=pred[1]=0;
+ npred[0]=npred[1]=1;
+ c1=(SAR->seq_al[c][a]=='I')?1:0;
+ for (nr=0,tsim=0,psim=0,b=0; b<nrs; b++)
+ {
+ if ( SAR->seq_al[c][rlist[b]]=='o');
+ else
+ {
+ c2=(SAR->seq_al[c][rlist[b]]=='I')?1:0;
+ nr+=c2;
+ s=sim[a][rlist[b]];
+ tsim+=sim_ref[a][rlist[b]];
+ psim+=sim[a][rlist[b]];
+ if (s>=L)
+ {
+ pred[c2]+=s;
+ npred[c2]++;
+ }
+ }
+ }
+
+ if (c1==0)n0++;
+ else n1++;
+ t++;
+
+
+ Delta=pred[1]-pred[0];
+
+ if (Delta<-delta_max){p=0;out[a]= (c1==0)?'O':'o';}
+ else if (Delta>delta_max){p=1;out[a]=(c1==1)?'I':'i';}
+ else {p=-1; out[a]=(c1==1)?'U':'u';}
+
+ if ( p==-1);
+ else if ( p && c1)n11++;
+ else if ( p && !c1)n10++;
+ else if ( !p && !c1)n00++;
+ else if ( !p && c1)n01++;
+
+ if (p!=-1)n++;
+ if (printall)fprintf ( stdout, ">%-15s %d %c OVERALL_SIM:%d POSITION_SIM %d\n%s\n", B->name[a], c1, out[a],tsim/nrs,psim/nrs,B->seq_al[a]);
+ }
+ }
+ sp=((n11+n10)==0)?1:n11/(n11+n10);
+ sn2=((n1)==0)?1:n11/n1;
+ prediction=(n11+n00)/(n1+n0);
+ entropy=(float)(M_chooses_Nlog (nr, nrs)/M_chooses_Nlog(nrs/2, nrs));
+
+ fprintf ( stdout, ">%-15s Sp %.2f Sn %.2f Pred %.2f E %.2f\n", compound,sp, sn2,prediction,entropy );
+ fprintf ( stdout, "%s\n", out);
+
+ myexit (EXIT_SUCCESS);
+ return A;
+}
+/************************************************************************************/
+/* ALIGNMENT ANALYZE : SAR FOR OR */
+/************************************************************************************/
+
+void display_or_help();
+void display_or_help()
+{
+ fprintf ( stdout, "\nor_sar options:");
+ fprintf ( stdout, "\n_ORCL_: Command_line in a file");
+
+ fprintf ( stdout, "\n_ROTATE_ : rotate the sar matrix (if each entry is a compound rather than a sequence)");
+ fprintf ( stdout, "\n_JNIT_ : number cycles of Jacknife");
+ fprintf ( stdout, "\n_JNSEQ_ : Number of sequences picked up in alignment [0 to keep them all, 10 by default]");
+ fprintf ( stdout, "\n_JSAR_ : Number of compounds picked up in the SAR matrix [0 to keep them all => default");
+ fprintf ( stdout, "\n_JRSAR_ : Randomization of the SAR file between each JNIT iteration: S, C, R, SC, SR... (S: seq, C, column, R: residue");
+ fprintf ( stdout, "\n_JRALN_ : Randomization of the ALN file between each JNIT iteration: S, C, R, SC, SR... (S: seq, C, column, R: residue");
+
+
+ fprintf ( stdout, "\n_NPOS_ : Number of positions used to make the prediction [4 by default]");
+ fprintf ( stdout, "\n_DEPTH_ : Depth of the motif degenerated alphabet [Default=2]");
+ fprintf ( stdout, "\n_POSFILE_ : Predefined list of positions in a file<P: pos score");
+ fprintf ( stdout, "\n_RSAR_ : Randomization of the SAR file: S, C, R, SC, SR... (S: seq, C, column, R: residue");
+ fprintf ( stdout, "\n_RALN_ : Randomization of the SAR file: S, C, R, SC, SR... (S: seq, C, column, R: residue");
+
+ fprintf ( stdout, "\n_SARTHR_ : E-value Threshold for the filtered displays in comploo mode (def: 3)");
+
+
+
+ fprintf ( stdout, "\n_MODE_ : jack, loo (leave one out), pos, sim, predict, self");
+ fprintf ( stdout, "\n_COMPLIST_: Compound_list provided in a FASTA file. Each compound MUST correspond to the corresponding column in SAR");
+ fprintf ( stdout, "\n_OUTALN_ : print the alignment corresponding to the list of positions");
+ fprintf ( stdout, "\n_OUTTREE : Make the tree (default nj) corrwesponding to _OUTALN_");
+ fprintf ( stdout, "\n\n");
+ myexit (EXIT_SUCCESS);
+}
+Alignment * simple_sar_or(Alignment *inA, Alignment *inS, char *param);
+void sar2jack (Alignment *A, Alignment *S, int nseq, int sarlen);
+
+
+void sar2jack (Alignment *A, Alignment *S, int nseq, int sarlen)
+{
+ A=aln2jacknife (A, nseq, 0);
+ S=reorder_aln (S, A->name, A->nseq);
+ S=aln2jacknife (S, 0, sarlen);
+}
+Alignment *or_scan (Alignment *A,Alignment *S, char *pmode)
+{
+ int l, a,ax,cx, b;
+ char mode[100];
+ int start, offset,w;
+ int nl, *poslist;
+
+
+ fprintf ( stdout, "\nPARAMETERS: %s\n", pmode);
+ fprintf ( stderr, "\nPARAMETERS: %s\n", pmode);
+
+ strget_param (pmode, "_SW_", "15", "%d",&w);
+ strget_param (pmode, "_SMINW_", "5", "%d",&start);
+ strget_param (pmode, "_SSTART_", "5", "%d",&offset);
+ strget_param (pmode, "_SMODE_", "single", "%s",&mode);
+
+
+ l=intlen (A->len_aln);
+ poslist=vcalloc ( A->len_aln, sizeof (int));
+ nl=0;
+
+ for ( a=0; a< A->len_aln; a++)poslist[nl++]=a+1;
+
+ if ( strm (mode, "single"))
+ {
+ for ( a=0; a<A->len_aln-2; a++)
+ {
+ int c, gap, score;
+ for (gap=0,c=0; c<A->nseq; c++)gap+=(A->seq_al[c][a]=='-');
+ if ( !gap)
+ {
+ Alignment *B;
+ B=extract_aln (A, a+1, a+2);
+ B=or_sar (B, S, pmode, NO_PRINT);
+ score=B->score_aln;
+ free_aln (B);
+ }
+ else
+ {
+ score=0;
+ }
+ fprintf ( stdout, "P: %*d S: %4d\n",l,a+1, score);
+ //fprintf ( stderr, "P: %*d S: %4d\n",l,a+1, score);
+
+ }
+ }
+ else if strm (mode, "scan")
+ {
+
+ for ( ax=0; ax<nl; ax++)
+ {
+ int best_score=0;
+ int best_pos=0, best_w=0, best_start=0, best_end=0;
+ a=poslist[ax];
+ for (b=start; b<=w; b++)
+ {
+ Alignment *B;
+ int pstart, pend;
+ pstart=a-b;
+ pend=a+b;
+
+ if (pstart<1 || pstart>A->len_aln)continue;
+ if (pend<1 || pend>A->len_aln)continue;
+ B=extract_aln (A, a-w, a+w);
+
+ B=or_sar (B, S,pmode, NO_PRINT);
+
+
+ if (B->score_aln>=best_score){best_score=B->score_aln; best_pos=a;best_w=b;best_start=pstart; best_end=pend;}
+ free_aln (B);
+ }
+ fprintf ( stdout, "P: %*d I: %*d %*d Score: %5d L: %2d\n", l,best_pos, l,best_start+offset, l,best_end, best_score,(best_w*2)+1 );
+ }
+ }
+ else if ( strm ( mode, "scan_comp"))
+ {
+ Alignment *NS=NULL;
+ int *tresults;
+ int s, n=0, p1;
+ int nbest;
+ int *poscache;
+ float sc, eval;
+ int **index;
+ int **poslist;
+ int len=1;
+ float best_sc, best_eval;
+ int best_pos;
+ char *best_word;
+ int sth;
+ int sev;
+
+ index=aln2pos_simple (A, A->nseq);
+ strget_param (pmode, "_NPOS_", "1", "%d", &len);
+ strget_param (pmode, "_STH_", "0", "%d", &sth);
+ strget_param (pmode, "_SEV_", "0", "%d", &sev);
+
+ if (sev==0)for (a=0, sev=1; a<len; a++)sev*=A->len_aln;
+
+ tresults=vcalloc ( A->len_aln, sizeof (int));
+ poscache=vcalloc ( A->len_aln, sizeof (int));
+ poslist=generate_array_int_list (len, 0, A->len_aln-1,1, &n, NULL);
+ for (s=0; s<S->len_aln; s++)
+ {
+
+ char *word;
+ NS=aln2block (S, s+1, s+2, NS);
+ fprintf ( stderr, "\nProcess: %s ...\n", get_compound_name(s, mode));
+
+ for (best_sc=0,best_pos=0,best_word=NULL,a=0; a<n; a++)
+ {
+ for (b=0; b<len; b++)poscache[poslist[a][b]]=1;
+ word=or_id_evaluate2 (A,NS, pmode,poscache, NO_PRINT, &sc);
+ //if (word)HERE ("%s", word);
+ eval=0;
+ for (b=0; b<len; b++)poscache[poslist[a][b]]=0;
+ if (word && sc>best_sc)
+ {
+ best_sc=sc;
+ best_eval=eval;
+ best_word=word;
+ best_pos=a;
+ nbest=1;
+ }
+ else if ( word && sc>=best_sc)
+ {
+ nbest++;
+ }
+ else
+ vfree (word);
+ }
+ nbest/=len;
+ for (a=0; a<=A->nseq; a++)
+ {
+
+
+
+ fprintf (stdout, "\n>%s PPPP: S: %s SC: %d EV: %d NBest: %d W: %s", get_compound_name(s, mode),(a==A->nseq)?"cons":A->name[a],(int)best_sc, (int) best_eval, nbest,best_word);
+ for ( b=0; b<len; b++)
+ {
+ if ( a==A->nseq)
+ {
+ p1=poslist[best_pos][b];
+ if (best_sc>sth && nbest<sev)tresults[p1]++;
+ p1++;
+ }
+ else
+ {
+ p1=index[a][poslist[best_pos][b]];
+ if (p1>0)p1++;
+ }
+ fprintf ( stdout, " %d ", p1);
+
+ }
+ }
+
+ }
+ for ( p1=0; p1<A->len_aln;p1++)
+ fprintf ( stdout, "\n>TOT_P: %d %d", p1+1, tresults[p1]);
+ }
+
+ else if ( strm ( mode, "scan_comp_old"))
+ {
+ Alignment *NS=NULL, *BLOCK=NULL;
+ int **results, *tresults;
+ int s, n, p1, p2;
+ int npos=1;
+ int *poscache;
+
+ results=declare_int (A->len_aln*A->len_aln, 3);
+ tresults=vcalloc ( A->len_aln, sizeof (int));
+ poscache=vcalloc ( A->len_aln, sizeof (int));
+ for (s=0; s<S->len_aln; s++)
+ {
+ int count;
+ NS=aln2block (S, s+1, s+2, NS);
+ fprintf ( stderr, "\nProcess: %s ...", get_compound_name(s, mode));
+ for (n=0,p1=0; p1<A->len_aln-w; p1++, count ++)
+ {
+ if ( count == 50){fprintf ( stderr, "*");count=0;}
+ for ( p2=p1; p2<A->len_aln-w; p2++, n++)
+ {
+ poscache[p1]=1;
+ poscache[p2]=1;
+
+ BLOCK=alnpos2block (A,poscache,BLOCK);
+
+ if ( aln2ngap(BLOCK)<=0)BLOCK=or_sar (BLOCK,NS, pmode, NO_PRINT);
+ else BLOCK->score_aln=0;
+
+ //if ( BLOCK->score_aln>0)HERE ("P: %d %d %d", p1, p2,BLOCK->score_aln);
+ results[n][0]=p1;
+ results[n][1]=p2;
+ results[n][2]=BLOCK->score_aln;
+ poscache[p1]=poscache[p2]=0;
+
+ }
+ }
+ sort_int_inv (results, 3, 2, 0, n-1);
+ for (p1=0; p1<npos; p1++)
+ {
+ fprintf ( stdout, "\n>%s PPPP: %d %d SC: %d", get_compound_name(s, mode), results[p1][0]+1,results[p1][1]+1, results[p1][2]);
+ fprintf ( stderr, "\n>%s PPPP: %d %d SC: %d", get_compound_name(s, mode), results[p1][0]+1,results[p1][1]+1, results[p1][2]);
+ tresults[results[p1][0]]++;
+ tresults[results[p1][1]]++;
+ }
+ }
+ for ( p1=0; p1<A->len_aln;p1++)
+ if ( tresults[p1])fprintf ( stdout, "\n>TOT_P: %d %d", p1+1, tresults[p1]);
+ }
+
+ myexit (EXIT_SUCCESS);
+}
+
+
+ORP * or_sar_compound(Alignment *A, Alignment *S, char *mode, int print);
+ORP* set_orp_name ( ORP* P, char *name);
+ORP* set_orp_offset ( ORP* P,int offset);
+Alignment * or_sar(Alignment *inA, Alignment *inS, char *mode, int print)
+{
+ char rsar[4],raln[4], sarmode[100];
+ int rotate, a, b, start, end;
+ ORP **R, *ORS;
+ Alignment *A, *S;
+
+ if ( mode && (strstr (mode, "help") || strstr (mode, "HELP")))
+ display_or_help();
+
+ strget_param (mode, "_SARMODE_", "single", "%s", &sarmode);//single | all
+ strget_param (mode, "_RSAR_", "NO", "%s", rsar);
+ strget_param (mode, "_RALN_", "NO", "%s", raln);
+ strget_param (mode, "_ROTATE_", "0", "%d", &rotate);
+
+ strget_param (mode, "_START_", "1", "%d", &start);start--;
+ strget_param (mode, "_END_", "0", "%d", &end);
+
+ A=copy_aln (inA, NULL);
+ S=copy_aln (inS, NULL);
+
+ if ( end==0)end=A->len_aln;
+ if ( start!=0 || end!=A->len_aln)
+ {
+ int c;
+ if ( start>=A->len_aln || end<=start || end >A->len_aln)
+ printf_exit (EXIT_FAILURE, stderr, "ERROR: _START_%d and _END_%d are incompatible with the aln [len=%d][FATAL:%s]", start, end, A->len_aln, PROGRAM);
+ for (b=0; b<A->nseq; b++)
+ for (c=0,a=start; a<end; a++, c++)
+ A->seq_al[b][c]=A->seq_al[b][a];
+ A->len_aln=c;
+ for (b=0; b<A->nseq; b++)A->seq_al[b][c]='\0';
+ }
+
+
+ S=aln2random_aln (S, rsar);
+ A=aln2random_aln (A, raln);
+
+
+
+ if (rotate)
+ {
+ Alignment *rS;
+
+ if (S->len_aln!=A->nseq)
+ printf_exit ( EXIT_FAILURE,stderr, "ERROR: Alignment and SAR matrix are incompatible [FATAL:%s]", PROGRAM);
+
+ rS=rotate_aln (S, NULL);
+
+ for (a=0; a< A->nseq; a++)sprintf (rS->name[a], "%s", A->name[a]);
+ free_aln (S);
+ S=rS;
+ }
+
+ R=vcalloc ( S->len_aln+2, sizeof (ORP*));
+ if (strm (sarmode, "all"))R[0]=or_sar_compound (A, S, mode, print);
+ else if ( strm (sarmode, "single"))
+ {
+ for ( a=0; a<S->len_aln; a++)
+ {
+ Alignment *NS=NULL;
+
+ NS=aln2block (S, a+1, a+2, NS);
+
+ R[a]=or_sar_compound (A, NS, mode, NO_PRINT);
+ set_orp_name ((R[a]), get_compound_name (a, mode));
+ set_orp_offset(R[a], start);
+ display_or_summary (R[a], mode, stdout, print);
+
+ }
+ }
+
+
+
+ ORS=combine_n_predictions (R, A, S);
+
+ display_or_summary (ORS, mode, stdout, print);
+ inA->score_aln=(int)(ORS->best*(float)1000);
+
+ free_orp_list (R);
+ free_orp(ORS);
+
+ return inA;
+ }
+ORP* set_orp_offset ( ORP* P,int offset)
+{
+ if (!P) return NULL;
+ else
+ {
+ P->offset=offset;
+ return set_orp_offset(P->PR, offset);
+ }
+}
+
+ORP* set_orp_name ( ORP* P, char *name)
+{
+ if (!P) return NULL;
+ else
+ {
+ sprintf ( P->name, "%s", name);
+ return set_orp_name(P->PR, name);
+ }
+}
+
+ORP * combine_n_predictions (ORP**R, Alignment *A, Alignment *S)
+{
+ int a=0;
+ ORP*N=NULL;
+ while (R[a])
+ {
+ N=combine_2_predictions (R[a++], N, A, S);
+ }
+ sprintf ( N->name, "ALL");
+ sprintf ( N->mode, "COMBINED");
+
+ return N;
+}
+
+ORP *combine_2_predictions ( ORP*IN, ORP *TO,Alignment *A, Alignment *S)
+{
+ int a;
+
+ if ( !TO)
+ {
+ TO=declare_or_prediction (IN->ncomp, IN->nseq, IN->len);
+ TO->A=A;
+ TO->S=S;
+ TO->P=copy_aln(S, NULL);
+ TO->offset=IN->offset;
+ TO->ncomp=0;
+ }
+
+ for (a=0; a< IN->len; a++)
+ {
+ TO->pos[a]+=IN->pos[a];
+ }
+
+ TO->fp+=IN->fp;
+ TO->fn+=IN->fn;
+ TO->tp+=IN->tp;
+ TO->tn+=IN->tn;
+ rates2sensitivity (TO->tp, TO->tn, TO->fp, TO->fn, &(TO->sp), &(TO->sn), &(TO->sen2), &(TO->best));
+
+
+ for (a=0; a<(TO->A)->nseq; a++)
+ {
+ (TO->P)->seq_al[a][TO->ncomp]=(IN->P)->seq_al[a][0];
+ //(TO->S)->seq_al[a][TO->ncomp]=(IN->S)->seq_al[a][0];
+ }
+ TO->ncomp++;
+ (TO->P)->len_aln=TO->ncomp;
+ (TO->A)->score_aln=TO->best;
+
+ return TO;
+}
+
+
+
+ORP * display_or_summary (ORP *CP, char *mode, FILE *fp, int print)
+{
+ int a;
+ char *pred;
+ char *exp;
+ char *motif;
+ Alignment *A, *P, *S;
+
+
+
+ A=CP->A;
+ P=CP->P;
+ S=CP->S;
+
+
+
+ pred=vcalloc ( P->nseq*P->len_aln*2, sizeof (char));
+ exp=vcalloc ( P->nseq*P->len_aln*2, sizeof (char));
+ motif=vcalloc (CP->len+1, sizeof (char));
+
+
+
+ if (P && S)
+ {
+ for ( a=0; a<P->nseq; a++)
+ {
+ strcat (pred,P->seq_al[a]);
+ strcat (exp, S->seq_al[a]);
+ }
+ CP->evalue=profile2evalue(pred, exp);
+ }
+ a=0;
+ while ( CP->motif && CP->motif[a] && CP->motif[0][a][0])strcat (motif, CP->motif[0][a++]);
+
+ if ( print==PRINT)
+ {
+ fprintf (fp, "\n>%-10s Mode: %s Accuracy: %6.2f E-value: %6.2f Motif: ",CP->name,CP->mode, CP->best, CP->evalue);
+
+ if (motif[0])
+ {
+ fprintf (fp, " %s",motif);
+ }
+ for ( a=0; a<CP->len; a++)
+ {
+ if ( CP->pos[a]) fprintf ( fp, "\n>%-10s Mode: %s P: cons %3d SC: %4d", CP->name, CP->mode, a+1+CP->offset, CP->pos[a]);
+ }
+ fprintf ( fp, "\n");
+ }
+ vfree (pred); vfree(motif); vfree(exp);
+ if (CP->PR)display_or_summary (CP->PR, mode, fp, print);
+ return CP;
+}
+
+
+ORP * or_sar_compound(Alignment *A, Alignment *S, char *mode, int print)
+{
+ char rmode[100];
+ Alignment *P=NULL;
+ ORP *PR=NULL;
+
+ strget_param (mode, "_MODE_", "predict", "%s", rmode);
+
+
+
+ if (strm (rmode, "jack"))P=or_jack (A, S, mode);
+ else if (strm (rmode, "loo")) PR=or_loo (A, S, mode,NULL, print);
+ else if (strm (rmode, "comploo")) P=or_comp_loo (A, S, mode,NULL, print);
+ else if ( strm (rmode, "comppos")){or_comp_pos ( A, S, mode,print);myexit (EXIT_FAILURE); return NULL;}
+ else if ( strm (rmode, "pos"))P=or_aln2pos_aln ( A, S, mode);
+ else if ( strm (rmode, "predict"))P=or_predict ( A, S, mode);
+ else if ( strm (rmode, "self"))PR=or_self_predict ( A, S, mode, NULL, print);
+ else if ( strm (rmode, "sim"))P=or_sim ( A, S, mode);
+
+ else if ( strm (rmode, "test"))P=or_test ( A, S, mode);
+ else
+ {
+ printf_exit (EXIT_FAILURE, stderr, "ERROR: %s is an unknown mode of or_sar [FATAL:%s]\n", rmode,PROGRAM);
+ return NULL;
+ }
+
+ if (!PR)
+ {
+ PR=vcalloc (1, sizeof (ORP));
+ PR->P=P;
+ }
+ return PR;
+}
+Alignment * or_test ( Alignment *inA, Alignment *inS, char *mode)
+{
+ return inA;
+}
+
+
+
+float or_id_evaluate ( Alignment *A, Alignment *S, char *mode, int *pos, int print)
+{
+ char *w;
+ float score;
+
+ w=or_id_evaluate2(A,S, mode, pos,print, &score);
+
+ vfree (w);
+ return score;
+}
+char* or_id_evaluate2 ( Alignment *A, Alignment *S, char *mode, int *pos, int print, float *rscore)
+{
+ static char **words;
+ static int *plist;
+ char *bword;
+
+ int res, p,nl, w, c, s, exp, pred;
+ int tp, tn, fp, fn;
+ float sn, sp, sen2, best, best_score;
+
+ if (!A)
+ {free_char (words, -1);
+ vfree (plist);
+ return NULL;
+ }
+ rscore[0]=0;
+
+ plist=pos2list (pos, A->len_aln, &nl);
+ words=declare_char (A->nseq, nl+1);
+ bword=vcalloc (nl+1, sizeof (char));
+ for (p=0; p<nl; p++)
+ {
+ for (s=0; s< A->nseq; s++)
+ {
+ res=A->seq_al[s][plist[p]];
+ if (res=='-'){or_id_evaluate2 (NULL, NULL, NULL, NULL, 0, NULL);vfree (bword);return 0;}
+ words[s][p]=res;
+ }
+ }
+
+ for (best_score=0,w=0; w<A->nseq; w++)
+ {
+ tp=fp=fn=tn=0;
+
+ for (c=0; c<S->len_aln; c++)
+ {
+ for (s=0; s<A->nseq; s++)
+ {
+ exp=(S->seq_al[s][c]=='I')?1:0;
+ pred=strm (words[w], words[s]);
+ if ( exp && pred)tp++;
+ else if ( exp && !pred)fn++;
+ else if (!exp && !pred)tn++;
+ else if (!exp && pred)fp++;
+ }
+ }
+ rates2sensitivity (tp, tn, fp,fn,&sp, &sn, &sen2, &best);
+ if ( best>best_score)
+ {
+ best_score=best;
+ sprintf (bword, "%s", words[w]);
+ }
+ }
+ rscore[0]=(float)1000*best_score;
+ or_id_evaluate2 (NULL, NULL, NULL, NULL, 0, NULL);
+ return bword;
+}
+
+float or_loo_evaluate2 ( Alignment *A, Alignment *S, char *mode, int *pos, int print)
+{
+ int c, s, p, res, sar;
+
+ int *plist, nl;
+ int tp, tn, fp, fn;
+ float sn, sp, sen2, best;
+ char **words, **positive, **negative;
+
+ tp=tn=fp=fn=0;
+ plist=pos2list (pos, A->len_aln, &nl);
+
+ words=declare_char (A->nseq, nl+1);
+
+ for (p=0; p<nl; p++)
+ {
+ for (s=0; s< A->nseq; s++)
+ {
+ res=A->seq_al[s][plist[p]];
+ if (res=='-'){vfree (plist);free_char (words, -1); return 0;}
+ words[s][p]=res;
+ }
+ }
+ positive=vcalloc ( A->nseq, sizeof (char*));
+ negative=vcalloc ( A->nseq, sizeof (char*));
+ for (c=0; c<S->len_aln; c++)
+ {
+ //Fill the match matrix
+ for (p=0; p<nl; p++)
+ {
+ for (s=0; s< A->nseq; s++)
+ {
+ sar=S->seq_al[s][c];
+ if (sar=='I')positive[s]=words[s];
+ else if ( sar=='O')negative[s]=words[s];
+ }
+ }
+
+ //Evaluate the scores
+ for (s=0; s< A->nseq; s++)
+ {
+ int pos=0, neg=0, pred;
+ sar=S->seq_al[s][c];
+ positive[s]=negative[s]=NULL;
+
+ if ( name_is_in_list (words[s], positive, A->nseq, nl+1)!=-1)
+ pos=1;
+ if ( name_is_in_list (words[s], negative, A->nseq, nl+1)!=-1)
+ neg=1;
+
+ if (pos & !neg) pred=1;
+ else pred=0;
+
+ if ( pred && sar=='I')tp++;
+ else if (!pred && sar=='I')fn++;
+ else if (!pred && sar=='O')tn++;
+ else if ( pred && sar=='O')fp++;
+
+ if ( sar=='I')positive[s]=words [s];
+ else negative[s]=words[s];
+ }
+ }
+
+ vfree (negative); vfree (positive);
+ vfree (plist); free_char (words, -1);
+ rates2sensitivity (tp, tn, fp,fn,&sp, &sn, &sen2, &best);
+
+ return (float)1000*best;
+}
+float or_loo_evaluate ( Alignment *A, Alignment *S, char *mode, int *pos, int print)
+{
+ int c, s, p, res, sar;
+ int **matP,**matN;
+ int *plist, nl;
+ int tp, tn, fp, fn;
+ float sn, sp, sen2, best;
+
+ tp=tn=fp=fn=0;
+ plist=pos2list (pos, A->len_aln, &nl);
+ matP=declare_int (nl, 256);
+ matN=declare_int (nl, 256);
+
+ for (c=0; c<S->len_aln; c++)
+ {
+ //Fill the match matrix
+ for (p=0; p<nl; p++)
+ {
+ for (s=0; s< A->nseq; s++)
+ {
+ res=A->seq_al[s][plist[p]];
+ sar=S->seq_al[s][c];
+ if (res=='-'){vfree (plist); free_int (matP, -1);free_int (matN, -1); return 0;}
+ if (sar=='I')matP[p][res]++;
+ if (sar=='O')matN[p][res]++;
+ }
+ }
+
+ //Evaluate the scores
+ for (s=0; s< A->nseq; s++)
+ {
+ int scoreP, scoreN;
+ int pred, valP, valN;
+
+ sar=S->seq_al[s][c];
+ for (scoreN=0,scoreP=0,p=0; p<nl; p++)
+ {
+ res=A->seq_al[s][plist[p]];
+
+ valP=matP[p][res]-(sar=='I')?1:0;
+ scoreP+=(valP>0)?1:0;
+
+ valN=matN[p][res]-(sar=='O')?1:0;
+ scoreN+=(valN>0)?1:0;
+ }
+
+ if ( scoreP==nl && scoreN<nl)pred=1;
+ else pred=0;
+
+
+ if ( pred && sar=='I')tp++;
+ else if (!pred && sar=='I')fn++;
+ else if (!pred && sar=='O')tn++;
+ else if ( pred && sar=='O')fp++;
+ }
+
+ //reset the matrix
+ for (p=0; p<nl; p++)
+ {
+ for (s=0; s< A->nseq; s++)
+ {
+ res=A->seq_al[s][plist[p]];
+ sar=S->seq_al[s][c];
+ if (sar=='I')matP[p][res]=0;
+ else matN[p][res]=0;
+ }
+ }
+ }
+
+ vfree (plist); free_int (matP, -1);free_int (matN, -1);
+ rates2sensitivity (tp, tn, fp,fn,&sp, &sn, &sen2, &best);
+
+ return (float)1000*best;
+}
+int* or_comp_pos ( Alignment *inA, Alignment *inS, char *mode,int print)
+{
+ Alignment *A=NULL, *S=NULL, *inS2=NULL;
+ int a, b, c;
+ int *main_pos, *pos=NULL;
+
+ set_sar (inA, inS, mode);
+ main_pos=vcalloc ( inA->len_aln, sizeof (int));
+
+ inS2=copy_aln (inS, NULL);
+ inS2->len_aln=1;
+
+
+
+ //Run every SAR, one at a time
+ for ( c=0; c< inS->len_aln; c++)
+ {
+ int max, p;
+
+ fprintf ( stdout, ">%d\n", c);
+ for (a=0; a< inS->nseq; a++)
+ {
+ inS2->seq_al[a][0]=inS->seq_al[a][c];
+ inS2->seq_al[a][1]='\0';
+ }
+
+ vfree (pos);
+ free_aln (S);
+ free_aln (A);
+
+ pos=vcalloc (inA->len_aln, sizeof (int));
+ A=copy_aln (inA, NULL);
+ S=copy_aln (inS2, NULL);
+ set_sar (A,S, mode);
+ pos=aln2predictive_positions (A, S, mode,PRINT);
+
+ for (max=0,b=0; b<A->len_aln; b++)
+ {
+ main_pos[b]+=pos[b];
+ if (main_pos[b]>max)
+ {
+ max=main_pos[b];
+ p=b+1;
+ }
+ }
+
+
+ for (a=0; a<A->nseq; a++)
+ {
+ fprintf ( stdout, "\t");
+ for ( b=0; b<A->len_aln; b++)
+ if ( pos[b]) fprintf ( stdout, "%c", A->seq_al[a][b]);
+ fprintf ( stdout, " %c\n", inS2->seq_al[a][0]);
+ }
+ fprintf ( stdout, "\n\tBest: %d %d\n", p, max);
+
+ }
+
+ if (print==PRINT)
+ {
+ for ( a=0; a<inA->len_aln; a++)fprintf ( stdout, "\nP2: cons %4d %4d [FINAL]", a+1, main_pos[a]);
+ }
+ return main_pos;
+}
+Alignment * or_comp_loo ( Alignment *inA, Alignment *inS, char *mode, int *pos,int print)
+{
+ int a, b,c, n;
+ char **keep, **remove;
+ Alignment *A, *S, *P, *P1, *SEQ, *inS2;
+ int **main_pos, *compound_pos, **comp_list;
+ int pos_exists=0;
+ char *comp_pred, *comp_exp;
+ int sar_threshold;
+
+ strget_param (mode, "_SARTHRES_", "3", "%d", &sar_threshold);
+
+ if (pos)pos_exists=1;
+ set_sar (inA, inS, mode);
+ P=copy_aln (inS, NULL);
+ keep=declare_char (inA->nseq, MAXNAMES);
+ remove=declare_char (inA->nseq, MAXNAMES);
+
+ main_pos=declare_int ( inA->len_aln,4);
+ comp_list=declare_int (inA->len_aln, sizeof (int*));
+ inS2=copy_aln (inS, NULL);
+ inS2->len_aln=1;
+
+
+ comp_pred=vcalloc ( inA->nseq+1, sizeof (int));
+ comp_exp=vcalloc ( inA->nseq+1, sizeof (int));
+ compound_pos=NULL;
+ //Run every SAR, one at a time
+ for ( c=0; c< inS->len_aln; c++)
+ {
+ for (a=0; a< inS->nseq; a++)
+ {
+ inS2->seq_al[a][0]=inS->seq_al[a][c];
+ inS2->seq_al[a][1]='\0';
+ }
+ vfree (compound_pos);
+ compound_pos=vcalloc (inA->len_aln, sizeof (int));
+ for (a=0; a<inA->nseq; a++)
+ {
+ char ***motifs;
+
+ A=copy_aln (inA, NULL);
+ S=copy_aln (inS2, NULL);
+
+ for (n=0,b=0; b<A->nseq; b++)
+ {
+ if (b!=a)sprintf (keep[n++ ], "%s", A->name [b]);
+ }
+ sprintf ( remove[0], "%s", A->name[a]);
+ reorder_aln (A,keep, A->nseq-1);
+
+ set_sar (A,S, mode);
+ if (!pos_exists)
+ {
+ pos=aln2predictive_positions (A, S, mode,NO_PRINT);
+ }
+ for (b=0; b<A->len_aln; b++)
+ {
+ compound_pos[b]+=pos[b];
+ }
+
+ motifs=compounds2motifs (A, S, pos,0, mode, NO_PRINT);
+
+ SEQ=copy_aln (inA, NULL);
+ SEQ=reorder_aln (SEQ, remove, 1);
+
+ P1=aln2prediction (SEQ, motifs, pos);
+ comp_pred[a]=P1->seq_al[0][0];
+ comp_exp[a]=inS2->seq_al[a][0];
+ P->seq_al[a][c]=P1->seq_al[0][0];
+
+
+ free_aln (SEQ);
+ free_aln (S);
+ free_aln (A);
+ free_aln (P1);
+ free_arrayN( (void *)motifs, 3);
+ if (!pos_exists)vfree (pos);
+ }
+ if (print==PRINT)
+ fprintf ( stdout, ">%-15s SC: %.2f E; %.2f\n%s\n%s\n", get_compound_name(c, mode),profile2sensitivity (comp_pred, comp_exp, NULL, NULL, NULL, NULL),profile2evalue(comp_pred, comp_exp),comp_pred, comp_exp);
+ for (b=0; b<A->len_aln; b++)
+ {
+ main_pos[b][2]+=compound_pos[b];
+ if (compound_pos[b])main_pos[b][3]++;
+ if (profile2evalue(comp_pred, comp_exp)>sar_threshold)
+ {
+ main_pos[b][0]+=compound_pos[b];
+ if (compound_pos[b])
+ {
+ main_pos[b][1]++;
+ comp_list[b][0]++;
+ comp_list[b]=vrealloc (comp_list[b], sizeof (int)*(comp_list[b][0]+1));
+ comp_list[b][comp_list[b][0]]=c;
+ }
+ }
+ }
+
+ }
+
+ P->score_aln=(int)((float)1000*evaluate_prediction (P, inS, mode,print));
+
+ if (print==PRINT)
+ {
+ for ( a=0; a<inA->len_aln; a++)fprintf ( stdout, "\nP: cons %4d RS: %4d RC: %5d FC: %4d %4d[FINAL]", a+1, main_pos[a][2], main_pos[a][3], main_pos[a][0], main_pos[a][1]);
+
+ for ( a=0; a<inA->len_aln; a++)
+ {
+ fprintf ( stdout, "\nP: cons %4d RS: %4d RC: %5d FC: %4d %4d CLIST: ", a+1, main_pos[a][2], main_pos[a][3], main_pos[a][0], main_pos[a][1]);
+ for ( c=1; c<=comp_list[a][0]; c++)
+ {
+ fprintf ( stdout, "%s ", get_compound_name(comp_list[a][c], mode));
+ }
+ fprintf ( stdout, " [COMP_LIST]");
+ }
+ }
+ free_int (main_pos, -1);
+ free_int (comp_list, -1);
+ return P;
+}
+
+ORP* or_loo ( Alignment *inA, Alignment *inS, char *mode, int *pos,int print)
+{
+ int a, b, n;
+ char **keep, **remove;
+ Alignment *A, *S, *P, *P1, *SEQ;
+
+ int pos_exists=0;
+ ORP *PR;
+
+
+
+ if (pos)pos_exists=1;
+ set_sar (inA, inS, mode);
+ PR=declare_or_prediction (inS->nseq, inA->nseq, inA->len_aln);
+ sprintf (PR->mode, "loo ");
+ P=copy_aln (inS, NULL);
+
+
+ keep=declare_char (inA->nseq, MAXNAMES);
+ remove=declare_char (inA->nseq, MAXNAMES);
+
+
+ PR->A=inA;
+ PR->P=P;
+ PR->S=inS;
+
+ for (a=0; a<inA->nseq; a++)
+ {
+ char ***motifs;
+
+ A=copy_aln (inA, NULL);
+ S=copy_aln (inS, NULL);
+
+ for (n=0,b=0; b<A->nseq; b++)
+ {
+ if (b!=a)sprintf (keep[n++ ], "%s", A->name [b]);
+ }
+ sprintf ( remove[0], "%s", A->name[a]);
+ reorder_aln (A,keep, A->nseq-1);
+
+ set_sar (A,S, mode);
+
+ if (!pos_exists)
+ {
+ pos=aln2predictive_positions (A, S, mode,print);
+
+ }
+
+ for (b=0; b<A->len_aln; b++)
+ {
+ PR->pos[b]+=pos[b];
+ }
+
+ motifs=compounds2motifs (A, S, pos,0, mode, print);
+
+ SEQ=copy_aln (inA, NULL);
+ SEQ=reorder_aln (SEQ, remove, 1);
+ SEQ->nseq=1;
+
+ P1=aln2prediction (SEQ, motifs, pos);
+
+
+ if (print==PRINT)
+ {
+ fprintf ( stdout, "\n%s\nPred: %s\nReal: %s\n", P1->name[0], P1->seq_al[0], inS->seq_al[a]);
+ }
+ sprintf ( P->seq_al[a], "%s", P1->seq_al[0]);
+ free_aln (P1);
+
+
+ free_aln (SEQ);
+ free_aln (S);
+ free_aln (A);
+
+ free_arrayN( (void *)motifs, 3);
+ if (!pos_exists)vfree (pos);
+
+ }
+ free_char (keep, -1);
+ free_char (remove, -1);
+
+
+ PR=new_evaluate_prediction (PR, mode,print);
+
+
+ PR->PR=or_self_predict(inA, inS, mode,NULL, print);
+
+ if (print==PRINT)for ( a=0; a<inA->len_aln; a++)fprintf ( stdout, "\nP: cons %d %d [FINAL]", a+1, PR->pos[a]);
+
+
+
+ return PR;
+}
+
+
+
+Alignment * or_jack(Alignment *inA, Alignment *inS, char *mode)
+{
+ int a,b;
+ int n_cycles=100;
+ int subnseq=10;
+ int subsar=0;
+ Alignment *A, *S;
+ int *main_pos,*pos;
+ char jrsar[10], jraln[10];
+
+ strget_param (mode, "_JNIT_", "100", "%d", &n_cycles);
+ strget_param (mode, "_JNSEQ_", "10", "%d", &subnseq);
+ strget_param (mode, "_JNAR_", "0", "%d", &subsar);
+
+ strget_param (mode, "_JRSAR_", "NO", "%s", jrsar);
+ strget_param (mode, "_JRALN_", "NO", "%s", jraln);
+
+
+
+ main_pos=vcalloc ( inA->len_aln, sizeof (int));
+ for (a=0; a<n_cycles; a++)
+ {
+
+ A=copy_aln (inA, NULL);
+ S=copy_aln (inS, NULL);
+
+ S=aln2random_aln (S, jrsar);
+ A=aln2random_aln (A, jraln);
+
+ set_sar (A,S, mode);
+ sar2jack (A, S,subnseq,subsar);
+
+
+
+ pos=aln2predictive_positions (A, S,mode, PRINT);
+
+ for (b=0; b<A->len_aln; b++)main_pos[b]+=pos[b];
+ vfree (pos);
+
+ }
+ display_pos (A, S, main_pos, mode);
+
+
+ return inA;
+}
+
+Alignment * display_pos (Alignment *A, Alignment *S, int *pos,char *mode)
+{
+ Alignment *B;
+ int a, b;
+ int **index;
+
+ int intl;
+
+ intl=intlen (A->len_aln);
+ index=aln2pos_simple (A, A->nseq);
+ B=copy_aln (A,NULL);
+ B->len_aln=0;
+ for ( a=0; a<A->len_aln; a++)
+ fprintf ( stdout, "\nP: cons %*d %*d S: %4d [DISPLAY_FULL_POS]", intl,a+1,intl, a+2, pos[a]);
+ fprintf ( stdout, "\n\n");
+ for (a=0; a<A->len_aln; a++)
+ {
+ if (pos[a])
+ {
+ for ( b=0; b<A->nseq; b++)
+ {
+ B->seq_al[b][B->len_aln]=A->seq_al[b][a];
+ if (index[b][a]>0)fprintf ( stdout, "\nP: %s %d %d S: %d [DISPLAY_POS]",A->name[b], index[b][a], index[b][a]+1, pos[a]);
+ }
+ B->len_aln++;
+ fprintf ( stdout, "\nP: cons %d %d S: %d [DISPLAY_POS]", a+1, a+2, pos[a]);
+ }
+ }
+ fprintf ( stdout, "\n");
+ for (a=0; a<B->nseq; a++)B->seq_al[a][B->len_aln]='\0';
+ return B;
+}
+Alignment * or_aln2pos_aln (Alignment *A, Alignment *S, char *mode)
+{
+ Alignment *B;
+
+ int *pos;
+ char outaln[100], outtree[100];
+
+
+ strget_param (mode, "_OUTALN_", "NO", "%s", outaln);
+ strget_param (mode, "_OUTTREE_", "NO", "%s", outtree);
+
+ set_sar (A, S, mode);
+ pos=aln2predictive_positions (A, S,mode, PRINT);
+
+ B=display_pos (A, S, pos, mode);
+
+
+ if (!strm(outaln, "NO")) vfclose (output_aln (B, vfopen (outaln, "w")));
+ if (!strm(outtree, "NO"))vfclose (print_tree (aln2tree(B), "newick", vfopen (outtree, "w")));
+
+ return B;
+}
+Alignment * or_sim(Alignment *A, Alignment *S, char *mode)
+{
+ //Predict all the sequences that are not both in inS and inA
+ int *pos;
+
+ set_sar (A, S, mode);
+ pos=aln2predictive_positions (A, S,mode, PRINT);
+ fprintf ( stdout, "R: %.3f", pos2sim (A,S, pos));
+
+ myexit (EXIT_SUCCESS);
+ return A;
+}
+ORP* or_self_predict(Alignment *A, Alignment *S, char *mode,int *pos, int print)
+{
+ //Predict all the sequences that are not both in inS and inA
+ Alignment *P;
+ char ***motifs;
+
+
+ int a;
+
+ int pre_set_pos=0;
+ ORP *PR;
+
+
+ set_sar (A, S, mode);
+ PR=declare_or_prediction (S->nseq, A->nseq, A->len_aln);
+ sprintf (PR->mode, "self");
+ PR->A=A;
+ PR->S=S;
+
+ if (!pos)
+ {
+ pos=aln2predictive_positions (A, S,mode,print);
+ pre_set_pos=0;
+ }
+ else
+ pre_set_pos=1;
+
+ for (a=0; a< A->len_aln; a++)
+ PR->pos[a]=pos[a];
+
+
+ PR->motif=motifs=compounds2motifs (A, S, pos,0, mode, print);
+ P=PR->P=aln2prediction (A, motifs, pos);
+
+ if (!pre_set_pos)vfree (pos);
+
+ PR=new_evaluate_prediction (PR, mode,print);
+ return PR;
+}
+
+
+Alignment * or_predict(Alignment *inA, Alignment *inS, char *mode)
+{
+ //Predict all the sequences that are not both in inS and inA
+ Alignment *P, *A, *S, *T;
+ char ***motifs;
+ int *pos;
+
+ int a, b;
+
+
+
+
+
+
+ A=copy_aln (inA, NULL);
+ S=copy_aln (inS, NULL);
+ set_sar (A, S, mode);
+
+ pos=aln2predictive_positions (A, S,mode,PRINT);
+ motifs=compounds2motifs (A, S, pos,0, mode, PRINT);
+ T=get_prediction_target (inA, inS, mode);
+
+
+ P=aln2prediction (T, motifs, pos);
+ //recall=evaluate_prediction (S, P, mode);
+ for ( a=0; a<P->len_aln; a++)
+ {
+ for (b=0; b<P->nseq; b++)
+ {
+ if (tolower(P->seq_al[b][a])=='i')fprintf (stdout, "\n>%20s %20s %c", T->name [0],get_compound_name (a, mode), P->seq_al[b][a]);
+ }
+ }
+ fprintf ( stdout, "\n");
+ return P;
+}
+
+Alignment *get_prediction_target (Alignment *A, Alignment *S, char *param)
+{
+ char **name;
+ int n, a;
+ Alignment *T;
+
+ T=copy_aln (A, NULL);
+ name=declare_char (A->nseq, 100);
+ for (n=0,a=0; a< A->nseq; a++)
+ {
+ if ( name_is_in_list (A->name[a], S->name, S->nseq, 100)==-1)
+ {
+ sprintf (name[n++], "%s", A->name[a]);
+ }
+ }
+ T=reorder_aln (T,name, n);
+ return T;
+}
+
+Alignment *set_sar (Alignment *A, Alignment *S, char *param)
+{
+ char **name;
+ int n, a;
+
+ name=declare_char (A->nseq, 100);
+ for (n=0,a=0; a< A->nseq; a++)
+ {
+ if ( name_is_in_list (A->name[a], S->name, S->nseq, 100)!=-1)
+ {
+ sprintf (name[n++], "%s", A->name[a]);
+ }
+ }
+ A=reorder_aln (A,name, n);
+ S=reorder_aln (S,name, n);
+ free_char (name, -1);
+ return S;
+}
+
+ORP* new_evaluate_prediction (ORP *PR, char *mode, int print)
+{
+ int a,b, i, r, p;
+ int tp, tn, fp, fn;
+ float sn, sp, sen2, best;
+ float tot_best_seq=0;
+ float tot_best_comp=0;
+ Alignment *P, *R;
+
+ int ns=0;
+ float *recall;
+
+
+ P=PR->P;
+ R=PR->S;
+
+ recall=vcalloc (P->len_aln, sizeof (float));
+ if (P->len_aln!=R->len_aln)
+ {
+ HERE ("Mismatch between number of compounds in prediction and reference");
+ myexit (EXIT_FAILURE);
+ }
+ if (print==PRINT)fprintf ( stdout, "\n");
+
+ for (a=0; a<P->nseq; a++)
+ {
+ tp=tn=fp=fn=0;
+ if ((i=name_is_in_list (P->name[a], R->name, R->nseq, 100))!=-1)
+ {
+
+ for (b=0;b<P->len_aln; b++)
+ {
+ r=R->seq_al[i][b];
+ p=P->seq_al[a][b];
+
+ if ( p=='I' && r=='I')tp++;
+ else if ( p=='I' && r=='O')fp++;
+ else if ( p=='O' && r=='I')fn++;
+ else if ( p=='O' && r=='O')tn++;
+ }
+ rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best);
+ if (print==PRINT)fprintf (stdout, ">%-s sp: %.2f sn: %.2f sn2: %.2f best: %.2f [SEQ]\n",P->name[a], sp, sn, sen2, best);
+ if ( best>0)
+ {
+ ns++;
+ tot_best_seq+=best;
+ }
+ }
+ }
+ if (ns)
+ {
+ tot_best_seq/=ns;
+ }
+ if (print==PRINT)fprintf ( stdout, ">TotSeq sp: %.2f N: %d[SEQ]\n",tot_best_seq, ns);
+
+ tot_best_comp=0;
+ for (ns=0,b=0; b<P->len_aln; b++)
+ {
+ tp=tn=fp=fn=0;
+ for (a=0; a<P->nseq;a++)
+ {
+ if ((i=name_is_in_list (P->name[a], R->name, R->nseq, 100))!=-1)
+ {
+ r=R->seq_al[i][b];
+ p=P->seq_al[a][b];
+
+ if ( p=='I' && r=='I'){PR->tp++;tp++;}
+ else if ( p=='I' && r=='O'){PR->fp++;fp++;}
+ else if ( p=='O' && r=='I'){PR->fn++;fn++;}
+ else if ( p=='O' && r=='O'){PR->tn++;tn++;}
+ }
+ }
+ rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best);
+
+ if (print==PRINT) fprintf (stdout, ">%-25s sp: %.2f sn: %.2f sen2: %.2f best: %.2f [COMP]\n",get_compound_name (b, mode), PR->sp, PR->sn, PR->sen2,PR->best);
+ if ( best>0)
+ {
+ ns++;
+ tot_best_comp+=best;
+ }
+ }
+
+ if (ns)
+ {
+ tot_best_comp/=ns;
+ }
+ rates2sensitivity (PR->tp, PR->tn, PR->fp,PR->fn,&(PR->sp), &(PR->sn), &(PR->sen2), &(PR->best));
+ if (print==PRINT)fprintf ( stdout, ">FullTot sp: %.2f sn: %.2f sen2: %.2f best: %.2f N: %d[COMP]\n", PR->sp, PR->sn, PR->sen2,PR->best, ns);
+ P->score_aln=(int)((float)1000*(PR->best));
+ return PR;
+}
+float evaluate_prediction (Alignment *R, Alignment *P, char *mode, int print)
+{
+ int a,b, i, r, p;
+ int tp, tn, fp, fn;
+ int tot_tp, tot_tn, tot_fp, tot_fn;
+ float sn, sp, sen2, best;
+ float tot_sp=0;
+ float tot_sn=0;
+ float tot_sen2=0;
+ float tot_best_seq=0;
+ float tot_best_comp=0;
+ float tot_best=0;
+
+ int ns=0;
+ float *recall;
+
+
+
+
+ recall=vcalloc (P->len_aln, sizeof (float));
+ if (P->len_aln!=R->len_aln)
+ {
+ HERE ("Mismatch between number of compounds in prediction and reference");
+ myexit (EXIT_FAILURE);
+ }
+ if (print==PRINT)fprintf ( stdout, "\n");
+ for (a=0; a<P->nseq; a++)
+ {
+ tp=tn=fp=fn=0;
+ if ((i=name_is_in_list (P->name[a], R->name, R->nseq, 100))!=-1)
+ {
+
+ for (b=0;b<P->len_aln; b++)
+ {
+ r=R->seq_al[i][b];
+ p=P->seq_al[a][b];
+
+ if ( p=='I' && r=='I')tp++;
+ else if ( p=='I' && r=='O')fp++;
+ else if ( p=='O' && r=='I')fn++;
+ else if ( p=='O' && r=='O')tn++;
+ }
+ rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best);
+ if (print==PRINT)fprintf (stdout, ">%-s sp: %.2f sn: %.2f sn2: %.2f best: %.2f [SEQ]\n",P->name[a], sp, sn, sen2, best);
+ if ( best>0)
+ {
+ ns++;
+ tot_best_seq+=best;
+ tot_sn+=sn;
+ tot_sp+=sp;
+ tot_sen2+=sen2;
+ }
+ }
+ }
+ if (ns)
+ {
+ tot_best_seq/=ns;
+ tot_sn/=ns;
+ tot_sp/=ns;
+ tot_sen2/=ns;
+ }
+ if (print==PRINT)fprintf ( stdout, ">Tot sp: %.2f sn: %.2f sen2: %.2f best: %.2f N: %d[SEQ]\n", tot_sp, tot_sn, tot_sen2,tot_best_seq, ns);
+
+ tot_fp=tot_fn=tot_tp=tot_tn=0;
+ tot_sp=tot_sn=tot_sen2=tot_best_comp=0;
+ for (ns=0,b=0; b<P->len_aln; b++)
+ {
+ tp=tn=fp=fn=0;
+ for (a=0; a<P->nseq;a++)
+ {
+ if ((i=name_is_in_list (P->name[a], R->name, R->nseq, 100))!=-1)
+ {
+ r=R->seq_al[i][b];
+ p=P->seq_al[a][b];
+
+ if ( p=='I' && r=='I'){tot_tp++;tp++;}
+ else if ( p=='I' && r=='O'){tot_fp++;fp++;}
+ else if ( p=='O' && r=='I'){tot_fn++;fn++;}
+ else if ( p=='O' && r=='O'){tot_tn++;tn++;}
+ }
+ }
+ rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best);
+
+ if (print==PRINT) fprintf (stdout, ">%-25s sp: %.2f sn: %.2f sen2: %.2f best: %.2f [COMP]\n",get_compound_name (b, mode), sp, sn, sen2,best);
+ recall[b]=sen2;
+ if ( best>0)
+ {
+ ns++;
+ tot_best_comp+=best;
+ tot_sn+=sn;
+ tot_sp+=sp;
+ tot_sen2+=sen2;
+ }
+ }
+
+ if (ns)
+ {
+ tot_best_comp/=ns;
+ tot_sn/=ns;
+ tot_sp/=ns;
+ tot_sen2/=ns;
+ }
+ rates2sensitivity (tot_tp, tot_tn, tot_fp,tot_fn,&tot_sp, &tot_sn, &tot_sen2, &tot_best);
+ if (print==PRINT)fprintf ( stdout, ">FullTot sp: %.2f sn: %.2f sen2: %.2f best: %.2f N: %d[COMP]\n", tot_sp, tot_sn, tot_sen2,tot_best, ns);
+ return tot_best;
+}
+
+
+
+Alignment * aln2prediction (Alignment *A,char ***motif, int *pos)
+{
+ int a, b,nc, nl;
+ int *list;
+ char **array, **sar;
+ Alignment *R;
+ Sequence *S;
+ nc=read_array_size ((void *)motif, sizeof (char***));
+
+
+ list=pos2list (pos, A->len_aln, &nl);
+
+
+ array=declare_char (A->nseq, nl+1);
+ sar=declare_char(A->nseq, nc+1);
+ for (a=0; a<A->nseq; a++)
+ {
+ for (b=0; b<nl; b++)
+ array[a][b]=A->seq_al[a][list[b]];
+ }
+
+ for (a=0; a<nc; a++)
+ {
+ for (b=0; b<A->nseq; b++)
+ {
+
+ sar[b][a]=(match_motif (array[b], motif[a]))?'I':'O';
+ }
+ }
+
+
+ S=fill_sequence_struc (A->nseq,sar,A->name);
+ R=seq2aln (S, NULL, KEEP_GAP);
+ free_sequence (S, S->nseq);
+ free_char (sar, -1);
+ vfree (list);
+ free_char (array, -1);
+ return R;
+}
+
+int * file2pos_list (Alignment *A, char *posfile)
+{
+ char ***file;
+ int **index;
+ int *pos;
+ int i, n, p;
+
+ //pos_file:<P: seqname pos score\n>
+ // 1 2 3 4
+
+
+ if ( !check_file_exists (posfile))
+ {
+ printf_exit ( EXIT_FAILURE, stderr, "ERROR: Could not read posfile %s\n", posfile);
+ }
+
+ file=file2list (posfile, " ");
+
+ index=aln2inv_pos (A);
+ pos=vcalloc ( A->len_aln, sizeof (int));
+
+ n=0;
+ while (file[n])
+ {
+
+ if ( !strm (file[n][1], "P:"));
+ else
+ {
+ if ( (strm (file[n][2], "cons")))
+ p=atoi(file[n][3])-1;
+ else
+ {
+ i=name_is_in_list ( file[n][2], A->name, A->nseq, MAXNAMES+1);
+ if (i!=-1)
+ p=index[i][atoi(file[n][3])]-1;
+ else p=-1;
+ }
+ if (p!=-1)pos[p]+=atoi(file[n][4]);
+ }
+ n++;
+ }
+
+
+ free_int (index, -1);
+ free_arrayN ( (char **)file, 3);
+ return pos;
+}
+int * aln2predictive_positions (Alignment *A, Alignment *B, char *mode, int print)
+{
+ char posmode[100];
+
+ if (!mode) return NULL;
+ HERE ("%s", mode);
+ strget_param (mode, "_POSMODE_", "scan", "%s", posmode);
+ if ( strm (posmode, "mat"))return aln2predictive_positions_mat (A, B, mode, print);
+ else if ( strm (posmode, "scan")) return aln2predictive_positions_scan (A, B, mode, print);
+ else
+ {
+ printf_exit (EXIT_FAILURE,stderr, "ERROR: %s is an unknown _POSMODE_ mode",posmode);
+ return NULL;
+ }
+}
+
+int * aln2predictive_positions_mat (Alignment *A, Alignment *B, char *mode, int print)
+ {
+ int a, b, c,gap, res1, res2, sar1, sar2, npos, s, idscore;
+ float id1,id2,id3,nid1,nid2,nid3;
+ int **pos, *fpos;
+ pos=declare_int (A->len_aln,2);
+ fpos=vcalloc ( A->len_aln, sizeof (int));
+
+ strget_param (mode, "_NPOS_", "2", "%d", &npos);
+ for ( a=0; a< A->len_aln; a++)
+ {
+ pos[a][0]=a;
+ id1=id2=id3=nid1=nid2=nid3=0;
+ for ( gap=0,b=0; b<A->nseq; b++)gap+=(A->seq_al[b][a]=='-');
+ if ( gap>0){pos[a][1]=0;continue;}
+
+ for (s=0; s<B->len_aln; s++)
+ {
+ for ( gap=0,b=0; b<A->nseq-1; b++)
+ {
+ sar1=B->seq_al[b][s];
+ res1=A->seq_al[b][a];
+
+ for ( c=b+1; c<A->nseq; c++)
+ {
+ sar2=B->seq_al[c][s];
+ res2=A->seq_al[c][a];
+
+ idscore=(res1==res2)?1:0;
+ if ( sar1 == 'I' && sar2=='I'){id1+=idscore;nid1++;}
+ else if ( sar1 =='0' && sar2=='0'){id2+=idscore;nid2++;}
+ else {id3+=idscore; nid3++;}
+
+ }
+ }
+ id1=(nid1==0)?1:id1/nid1;
+ id2=(nid1==0)?1:id2/nid2;
+ id3=(nid3==0)?1:id3/nid3;
+ pos[a][1]=(int)((float)1000*id1*(1-id3));
+
+ }
+ }
+
+ sort_int (pos, 2,1, 0, A->len_aln-1);
+ for ( a=MAX(0,(A->len_aln-npos));a<A->len_aln; a++)
+ {
+ fpos[pos[a][0]]=1;
+ }
+
+ free_int (pos, -1);
+ return fpos;
+ }
+int * aln2predictive_positions_scan (Alignment *A, Alignment *B, char *mode, int print)
+{
+ int a, b, c, best_pos,nl, nplist=0, max, posw;
+ float best_score, score;
+ static int *list, *tpos,**plist,*array;
+ int *pos;
+
+
+ char posfile[100];
+ char predmode[100];
+ char target_posfile[100];
+
+
+
+ if (!A)
+ {
+ vfree (list);
+ vfree (tpos);
+
+ free_int (plist, -1);
+ vfree (array);
+ return NULL;
+ }
+
+ strget_param (mode, "_PREDMODE_", "ID", "%s", predmode);
+ strget_param (mode, "_POSW_", "1", "%d", &posw);
+ strget_param (mode, "_NPOS_", "2", "%d", &max);
+ strget_param (mode, "_POSFILE_", "NO", "%s", posfile);
+ strget_param (mode, "_TPOSFILE_", "NO", "%s", target_posfile);
+
+ if ( !strm(posfile, "NO"))return file2pos_list (A,posfile);
+ if ( !strm(target_posfile, "NO"))tpos=file2pos_list (A,target_posfile);
+ else
+ {
+ tpos=vcalloc (A->len_aln, sizeof (int));
+ for (a=0; a<A->len_aln; a++)tpos[a]=1;
+ }
+
+ //Declare the positions that are going to be scanned
+
+
+ if (posw==1)
+ {
+ plist=declare_int (A->len_aln, 2);
+ nplist=0;
+ for (a=0; a<A->len_aln; a++)
+ {
+ if(tpos[a])
+ {
+ plist[nplist][0]=1;
+ plist[nplist][1]=a;
+ nplist++;
+ }
+ }
+ }
+ else if ( posw==2)
+ {
+ nplist=0;
+ plist=declare_int (A->len_aln*A->len_aln, 3);
+ for (a=0; a<A->len_aln; a++)
+ for (b=0; b<A->len_aln; b++)
+ {
+ plist[nplist][1]=a;
+ plist[nplist][2]=b;
+ plist[nplist][0]=2;
+ nplist++;
+ }
+ }
+ else if ( posw==3)
+ {
+ nplist=0;
+ plist=declare_int (A->len_aln*A->len_aln*A->len_aln, 3);
+ for (a=0; a<A->len_aln; a++)
+ for (b=0; b<A->len_aln; b++)
+ {
+ plist[nplist][1]=a;
+ plist[nplist][2]=b;
+ plist[nplist][3]=0;
+
+
+ plist[nplist][0]=3;
+ nplist++;
+ }
+ }
+
+
+ pos=vcalloc ( A->len_aln, sizeof (int));
+ if (max==0)max=A->len_aln;
+ else if ( max==-1)
+ {
+ for (a=0; a<A->len_aln; a++)if (tpos[a]){pos[a]=1;}
+ aln2predictive_positions_scan (NULL, NULL, NULL, 0);
+ return pos;
+ }
+
+
+
+ pos=vcalloc ( A->len_aln, sizeof (int));
+ list=vcalloc (A->len_aln, sizeof (int));
+ nl=0;
+
+
+
+ for (a=0; a< max; a++)
+ {
+ int previous_best_pos=-1;
+ for (best_score=-9999,best_pos=0,b=0; b<nplist; b++)
+ {
+ for (c=0; c<nl; c++)pos[list[c]]=1;
+ for (c=1; c<=plist[b][0]; c++)pos[plist[b][c]]=1;
+
+ if (strm(predmode, "R"))score=sar_aln2r(A,B,pos,0);
+ else if ( strm (predmode, "ID"))
+ {
+ score =or_id_evaluate (A, B, mode, pos,NO_PRINT);
+ }
+ else if ( strm (predmode, "BP2"))score =or_loo_evaluate2 (A, B, mode, pos,NO_PRINT);
+ else
+ {
+ HERE ("Unknown mode: %s", predmode);
+ myexit (EXIT_FAILURE);
+ }
+ if ( score>best_score)
+ {
+ best_score=score;
+ best_pos=b;
+ }
+ for (c=1; c<=plist[b][0]; c++)pos[plist[b][c]]=0;
+
+ }
+ if (best_pos==previous_best_pos)break;
+ else previous_best_pos=best_pos;
+
+ //update the best_pos_list
+ for (b=1; b<=plist[best_pos][0]; b++)
+ list[nl++]=plist[best_pos][b];
+
+
+ if ( print==PRINT)
+ {
+ for (b=0; b<nl; b++) pos[list[b]]=1;
+ fprintf ( stdout, "\nP_current: ");
+ for ( c=1; c<=plist[best_pos][0]; c++)fprintf ( stdout, "%d ",plist[best_pos][c]+1);
+ fprintf ( stdout, " S: %.3f D: %d R:%d", best_score, (int)sar_aln2delta(A,B, pos,0), nl);
+ }
+ for (b=0; b<nl; b++) pos[list[b]]=0;
+ }
+
+ for (a=0; a<nl; a++)
+ pos[list[a]]=1;
+ if (print==PRINT)fprintf ( stdout, "\nR_best: %.3f with %d pos" ,best_score, nl);
+
+ aln2predictive_positions_scan (NULL, NULL, NULL, 0);
+
+ return pos;
+}
+
+char *** compounds2motifs (Alignment *A, Alignment *B, int *pos, int depth, char *mode, int print)
+{
+ char ***motifs;
+ int a;
+
+ motifs=vcalloc (B->len_aln, sizeof (char**));
+ for (a=0; a<B->len_aln; a++)
+ {
+
+ motifs[a]=compound2motif (A, B, pos, depth, a, mode, print);
+ }
+
+ return motifs;
+}
+char ** compound2regexp_motif (Alignment *A, Alignment *B, int *pos, int depth, int c, char *mode, int print);
+char ** compound2word_motif (Alignment *A, Alignment *B, int *pos, int depth, int c, char *mode, int print);
+
+char ** compound2motif (Alignment *A, Alignment *B, int *pos, int depth, int c, char *mode, int print)
+{
+ char mmode[100];
+
+ strget_param (mode, "_MOTIFMODE_", "word", "%s", mmode); //words, regexp
+ if ( strm (mmode, "regexp"))return compound2regexp_motif (A,B,pos, depth, c, mode, print);
+ else if ( strm (mmode, "word"))return compound2word_motif (A,B,pos, depth, c, mode, print);
+ else return NULL;}
+char ** compound2word_motif (Alignment *A, Alignment *B, int *pos, int depth, int c, char *mode, int print)
+{
+ int a,l;
+ char *word, **motif;
+ float score;
+
+
+ word=or_id_evaluate2 (A, B, mode, pos,print, &score);
+ if ( !word) return NULL;
+ l=strlen (word);
+
+ motif=declare_char (l+1, 2);
+ for (a=0; a<l; a++)motif[a][0]=word[a];
+
+ if (print==PRINT)
+ {
+ fprintf ( stdout, "\nMotifCompound %25s best: %.2f motif: ", get_compound_name(c, mode),score);
+ for (a=0; a<l; a++)
+ {
+ fprintf ( stdout, "[%2s]",motif[a]);
+ }
+ }
+ vfree (word);
+ return motif;
+}
+
+
+
+char ** compound2regexp_motif (Alignment *A, Alignment *B, int *pos, int depth, int c, char *mode, int print)
+{
+ static Alignment *I;
+ static Alignment *O;
+ int a, b, o, i;
+
+ float tp,tn,fp,fn,best, sp, sn, sen2, best_sn, best_sp, best_sen2;
+ float best_pred=-1;
+ int best_motif=0;
+
+
+
+ char ***alp=NULL;
+ int *alp_size=NULL;
+
+ char *motif_file;
+ int n;
+ char **m, **m2;
+
+ char *buf=NULL;
+ char *best_buf;
+ FILE *fpp;
+
+ free_aln (I);
+ free_aln (O);
+
+ I=copy_aln(A, NULL);
+ O=copy_aln(A, NULL);
+
+ if (depth==0)
+ strget_param (mode, "_DEPTH_", "2", "%d", &depth);
+
+ I->nseq=O->nseq=I->len_aln=O->len_aln=0;
+ for (a=0; a<A->len_aln; a++)
+ {
+ if (pos[a])
+ {
+ for (i=o=0,b=0; b<A->nseq; b++)
+ {
+ if ( is_gap(A->seq_al[b][a]))return 0;
+ if (B->seq_al[b][c]=='I')I->seq_al[i++][I->len_aln]=A->seq_al[b][a];
+ else O->seq_al[o++][O->len_aln]=A->seq_al[b][a];
+ }
+ I->len_aln++;
+ O->len_aln++;
+ }
+ }
+
+ if (O->len_aln==0 || I->len_aln==0) return 0;
+ O->nseq=o;
+ I->nseq=i;
+ for (a=0; a<o; a++)O->seq_al[a][O->len_aln]='\0';
+ for (a=0; a<i; a++)I->seq_al[a][I->len_aln]='\0';
+
+ if (!I->nseq) return NULL;
+
+
+
+ best_pred=best_motif=best_sn=best_sp=best_sen2=0;
+
+ motif_file=vtmpnam (NULL);
+
+ n=0;
+ if (depth>0)
+ {
+ alp=vcalloc ( sizeof (char**), I->len_aln);
+ alp_size= vcalloc ( I->len_aln, sizeof (int));
+ for (a=0; a<I->len_aln; a++)
+ {
+ char *col;
+ alp[a]=string2alphabet ( (col=aln_column2string (I,a)),depth, &alp_size[a]);
+ vfree (col);
+ }
+ generate_array_string_list (I->len_aln, alp, alp_size, &n, motif_file, OVERLAP);
+ }
+ else
+ {
+ int *used;
+ char r;
+
+ used=vcalloc (256, sizeof (int));
+ fpp=vfopen (motif_file,"w");
+ for (a=0;a<I->len_aln; a++)
+ {
+ for (b=0; b<I->nseq; b++)
+ {
+ r=I->seq_al[b][a];
+ if (!used[(int)r]){fprintf (fpp, "%c", r);used[(int)r]=1;}
+ }
+ for (b=0; b<I->nseq; b++)
+ {
+ r=I->seq_al[b][a];
+ used[(int)r]=0;
+ }
+ fprintf (fpp, " ");
+ }
+ fprintf (fpp, "\n");
+ vfree (used);
+ vfclose (fpp);
+
+ n=1;
+ depth=I->nseq;
+ }
+
+ buf=vcalloc (2*(I->len_aln*depth)+1, sizeof (char));
+ best_buf=vcalloc (2*(I->len_aln*depth)+1, sizeof (char));
+ fpp=vfopen (motif_file, "r");
+
+ for (a=0; a<n; a++)
+ {
+
+ buf=vfgets (buf, fpp);
+ m2=string2list (buf);
+ m2++;
+
+
+ tp=tn=fp=fn=0;
+
+ for (b=0; b<I->nseq; b++)
+ {
+ if (match_motif (I->seq_al[b], m2))tp++;
+ else fn++;
+ }
+ for (b=0; b<O->nseq; b++)
+ {
+ if (match_motif (O->seq_al[b], m2))fp++;
+ else tn++;
+ }
+ rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best);
+
+ if (best>= best_pred)
+ {
+ best_pred=best;
+ best_sp=sp;
+ best_sen2=sen2;
+ best_sn=sn;
+ sprintf (best_buf, "%s", buf);
+ }
+ m2--;
+ free_char (m2, -1);
+ }
+ vfclose (fpp);
+ if (print==PRINT)fprintf ( stdout, "\nMotifCompound %25s sp: %.2f sn: %.2f sen2: %.2f best: %.2f motif: ", get_compound_name(c, mode), best_sp, best_sn, best_sen2, best_pred);
+ m2=string2list (best_buf);
+ m=declare_char (I->len_aln+1, depth+1);
+
+ for (a=0; a<I->len_aln; a++)
+ {
+ sprintf (m[a], "%s", m2[a+1]);
+ if (print==PRINT) fprintf ( stdout, "[%2s]",m[a]);
+ }
+ if (print==PRINT)fprintf ( stdout, " N-motifs %d", n);
+ free_char (m2, -1);
+
+ if (alp)free_arrayN((void ***) alp, 3);
+ if (alp_size)vfree (alp_size);
+ vfree (buf); vfree(best_buf);
+
+ return m;
+}
+
+double pos2sim (Alignment *A, Alignment *B, int *pos)
+{
+ return sar_aln2r (A, B,pos, PRINT);
+}
+double sar_aln2r (Alignment *A, Alignment *B, int *pos, int print)
+{
+ int a, b, c, d,r1, r2, n, score, sim;
+ double *r, result;
+ static double **slist;
+ int declare=0;
+ static int **M;
+
+
+
+ if (!M)M=read_matrice ("blosum62mt");
+ if (!slist)
+ {
+ int maxslist;
+ maxslist=A->nseq*A->nseq*10;
+ slist=declare_double (maxslist, 2);
+ }
+
+ if (pos==NULL)
+ {
+
+ declare=1;
+ pos=vcalloc ( A->len_aln+1, sizeof (int));
+ for (a=0; a<A->len_aln; a++)pos[a]=1;
+ pos[a]=-1;
+
+ }
+
+ for (n=0,a=0; a< A->nseq-1; a++)
+ {
+
+ for (b=a+1; b<A->nseq; b++)
+ {
+
+
+ for (sim=d=0,c=0; c<A->len_aln; c++)
+ {
+
+ if (pos[c]==0)continue;
+
+ r1=A->seq_al[a][c];
+ r2=A->seq_al[b][c];
+ if (is_gap(r1) || is_gap(r2))return 0;
+
+ sim+=M[r1-'A'][r2-'A']*pos[c];
+ d+=MAX((M[r1-'A'][r1-'A']),(M[r2-'A'][r2-'A']));
+ }
+ sim=(d==0)?0:(100*sim)/d;
+ score=(int)get_sar_sim(B->seq_al[a], B->seq_al[b]);
+ slist[n][0]=(double)sim;
+ slist[n][1]=(double)score;
+ if (print==PRINT)fprintf ( stdout, "SIM: %d %d [%s %s]\n", sim, score, A->name[a], A->name[b]);
+ n++;
+ }
+ }
+
+ r=return_r(slist, n);
+ for (a=0; a<n; a++)slist[a][0]=slist[a][1]=0;
+ result=r[0];
+ vfree (r);
+ if (declare) vfree (pos);
+
+ return result;
+}
+
+
+double sar_aln2delta (Alignment *A, Alignment *B, int *pos, int print)
+{
+ static Alignment *I;
+ static Alignment *O;
+ int a, b, c, o, i;
+ double delta=0;
+ if (!I)
+ {
+ I=copy_aln(A, NULL);
+ O=copy_aln(A, NULL);
+ }
+
+
+
+ for (c=0; c<B->len_aln; c++)
+ {
+
+ I->nseq=O->nseq=I->len_aln=O->len_aln=0;
+ for (a=0; a<A->len_aln; a++)
+ {
+ if (pos[a])
+ {
+ for (i=o=0,b=0; b<B->nseq; b++)
+ {
+ if ( is_gap(A->seq_al[b][a]))return 0;
+ if (B->seq_al[b][c]=='I')I->seq_al[i++][I->len_aln]=A->seq_al[b][a];
+ else O->seq_al[o++][O->len_aln]=A->seq_al[b][a];
+ }
+ I->len_aln++;
+ O->len_aln++;
+ }
+ }
+ if (O->len_aln==0 || I->len_aln==0) return 0;
+ O->nseq=o;
+ I->nseq=i;
+ for (a=0; a<o; a++)O->seq_al[a][O->len_aln]='\0';
+ for (a=0; a<i; a++)I->seq_al[a][I->len_aln]='\0';
+
+ delta+=aln2sim(I,"blosum62mt")-aln2sim(O, "blosum62mt");
+
+ }
+
+ return delta;
+}
+
+char * get_compound_name (int c, char *mode)
+{
+ static int isset;
+ static Alignment *S;
+ static char *lname;
+
+ if (!isset)
+ {
+ char *comp_list;
+ isset=1;
+ lname=vcalloc (100, sizeof (char));
+
+ if (!mode);
+ else
+ {
+ strget_param (mode, "_COMPLIST_", "NO", "%s", comp_list=vcalloc (100, sizeof (char)));
+ if (strm(comp_list, "NO"));
+ else
+ {
+ S=main_read_aln (comp_list, NULL);
+ vfree (comp_list);
+ }
+ }
+ }
+ if (!S || c>=S->nseq)sprintf (lname, "%d", c);
+ else
+ {
+ sprintf (lname, "%s", S->name [c]);
+ }
+ return lname;
+}
+ORP * declare_or_prediction ( int ncomp, int nseq, int len)
+{
+ ORP *P;
+ P=vcalloc ( 1, sizeof (ORP));
+ P->ncomp=ncomp;
+ P->nseq=nseq;
+ P->len=len;
+ P->PR=NULL;
+
+ P->pos=vcalloc (len+1, sizeof (int));
+
+ return P;
+}
+
+void free_orp_list ( ORP**P)
+{
+ int a=0;
+ while (P[a])
+ {
+ free_orp(P[a++]);
+ }
+}
+void free_orp ( ORP*P)
+{
+ if (!P) return;
+ free_aln (P->A);
+ free_aln (P->S);
+ free_aln (P->P);
+ vfree (P->pos);
+ free_arrayN((void **)P->motif, 3);
+ if (P->PR)free_orp(P->PR);
+ vfree (P);
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+/******************************COPYRIGHT NOTICE*******************************/
+/*© Centro de Regulacio Genomica */
+/*and */
+/*Cedric Notredame */
+/*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
+/*All rights reserved.*/
+/*This file is part of T-COFFEE.*/
+/**/
+/* T-COFFEE is free software; you can redistribute it and/or modify*/
+/* it under the terms of the GNU General Public License as published by*/
+/* the Free Software Foundation; either version 2 of the License, or*/
+/* (at your option) any later version.*/
+/**/
+/* T-COFFEE is distributed in the hope that it will be useful,*/
+/* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
+/* GNU General Public License for more details.*/
+/**/
+/* You should have received a copy of the GNU General Public License*/
+/* along with Foobar; if not, write to the Free Software*/
+/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
+/*............................................... |*/
+/* If you need some more information*/
+/* cedric.notredame@europe.com*/
+/*............................................... |*/
+/**/
+/**/
+/* */
+/******************************COPYRIGHT NOTICE*******************************/