JPRED-2 Add sources of all binaries (except alscript) to Git
[jpred.git] / sources / ncoils / ncoils.c
diff --git a/sources/ncoils/ncoils.c b/sources/ncoils/ncoils.c
new file mode 100644 (file)
index 0000000..8371df9
--- /dev/null
@@ -0,0 +1,276 @@
+#include <ncoils.h>
+
+/* Rob Russell's attempt to make a COILS program */
+
+main(int argc, char *argv[]) {
+
+       int i,j,k,l;
+       int verb;
+       int window,pt;
+       int which,weighted;
+       int nseq;
+       int t,tc;
+       int seqlen;
+       int mode;
+       int min_seg;
+
+       char heptfile[1000];
+       char *buff;
+       static char *env;
+       char *seq,*title,*ident;
+
+       float min_P;
+
+       struct hept_pref *h;
+
+       FILE *MAT;
+
+
+       /* defaults */
+       window = 21;
+       weighted = 0;
+       verb = 0;
+       mode = 0; /* 0 = column mode, 1 = fasta, 2 = concise */
+       min_P = 0.5;
+
+       if((env=getenv("COILSDIR"))==NULL) {
+               fprintf(stderr,"error: environment variable COILSDIR must be set\n");
+               exit(-1);
+       }
+
+       strcpy(&heptfile[0],env);
+       strcpy(&heptfile[strlen(heptfile)],"/new.mat");
+
+
+       for(i=1; i<argc; ++i) {
+           if(argv[i][0]!='-') exit_error();
+          if(strcmp(&argv[i][1],"m")==0) {
+             if((i+1)>=argc) exit_error();
+             strcpy(&heptfile[0],argv[i+1]);
+             i++;
+          } else if(strcmp(&argv[i][1],"win")==0) {
+             if((i+1)>=argc) exit_error();
+             sscanf(argv[i+1],"%d",&window);
+             i++;
+          } else if(strcmp(&argv[i][1],"c")==0) {
+             mode=2;
+          } else if(strcmp(&argv[i][1],"min_seg")==0) {
+             if((i+1)>=argc) exit_error();
+             sscanf(argv[i+1],"%d",&min_seg);
+             i++;
+          } else if(strcmp(&argv[i][1],"c")==0) {
+             mode=2;
+          } else if((strcmp(&argv[i][1],"f")==0) || (strcmp(&argv[i][1],"fasta")==0)) {
+            mode=1;
+          } else if((strcmp(&argv[i][1],"min_P")==0)) {
+             if((i+1)>=argc) exit_error();
+             sscanf(argv[i+1],"%f",&min_P);
+             i++;
+          } else if(strcmp(&argv[i][1],"help")==0) {
+            exit_error();
+          } else if(strcmp(&argv[i][1],"w")==0) {
+            weighted=1;
+          } else if(strcmp(&argv[i][1],"V")==0 || strcmp(&argv[i][1],"v")==0) {
+            verb=1;
+           } else {
+            fprintf(stderr," can't understand flag/field %s\n",argv[i]);
+             exit_error();
+           }
+        }
+
+       if(verb) printf("Matrix file is %s\n",heptfile);
+       /* Read in matrix */
+       if((MAT=fopen(heptfile,"r"))==NULL) {
+               fprintf(stderr,"Error reading %s\n",heptfile);
+               exit(-1);
+       }
+       h = read_matrix(MAT);
+       if(verb) {
+          for(i=0; i<strlen(AAs); ++i) if(AAs[i]!='_') {
+               pt=(int)(AAs[i]-'A');
+               printf("AA %c %4.2f %4.2f %4.2f %4.2f %4.2f %4.2f %4.2f\n",AAs[i],
+                  h->m[pt][0],h->m[pt][1],h->m[pt][2],h->m[pt][3],h->m[pt][4],
+                  h->m[pt][5],h->m[pt][6]);
+          }
+          for(i=0; i<h->n; ++i) {
+               printf("Window %4d %1d %f %f %f %f %f\n",
+                       h->f[i].win,h->f[i].w,h->f[i].m_cc,h->f[i].sd_cc,h->f[i].m_g,h->f[i].sd_g,h->f[i].sc);
+          }
+       }
+
+       /* See if there is a file for our chosen window length/weight scheme */
+       which = -1;
+       for(i=0; i<h->n; ++i) {
+               if((h->f[i].win == window) && (h->f[i].w == weighted)) { /* match */
+                       if(verb) printf("Found fitting data for win %4d w %d\n",window,weighted);
+                       which = i;
+               }
+       }
+
+       /* Read in a sequence from the standard input */
+       nseq = 0;
+       ident = (char*) malloc(100*sizeof(char));
+       title = (char*) malloc(100000*sizeof(char));
+       buff  = (char*) malloc(100000*sizeof(char));
+       t = 0;
+       tc = 0;
+       while(fgets(buff,99999,stdin)!=NULL) {
+               /* There is a memory problem - the matrix data gets corrupted under OSF after this fgets */
+               for(i=0; i<strlen(buff); ++i) if(buff[i]=='\n' || buff[i]=='\r') buff[i]='\0';
+               if(buff[0]=='>') {
+                       if(nseq>0) { 
+                               pred_coils(seq,ident,title,h,window,which,weighted,mode,min_P,&t,&tc,min_seg); 
+                               free(seq);
+                       }
+                        seqlen = 0;
+                       i=1;
+                       while((buff[i]!=' ') && (buff[i]!='\0') && (buff[i]!='\n') && (buff[i]!='\r')) { ident[i-1]=buff[i]; i++; }
+                       ident[i-1]='\0';
+                       i++; j=0;
+                       seq = (char*)malloc(sizeof(char));
+                       seq[0]='\0';
+                       while(buff[i]!='\n' && buff[i]!='\0' && buff[i]!='\r') {
+                               title[j]=buff[i]; i++;
+                               j++;
+                       }
+                       title[j]='\0';
+                       nseq++;
+               } else {
+/*                     printf("Adding |%s| to |%s| = \n",buff,seq);  */
+                       seq=(char*)realloc(seq,(seqlen+strlen(buff)+1)*sizeof(char));
+                       strcpy(&seq[seqlen],buff); 
+                        seqlen=strlen(seq);
+/*                     printf("       |%s|\n",seq);  */
+               }
+       }
+       if(nseq>0) { 
+               pred_coils(seq,ident,title,h,window,which,weighted,mode,min_P,&t,&tc,min_seg); 
+               free(seq);
+       }
+       fprintf(stderr,"%8d sequences %8d aas %8d in coil\n",nseq,t,tc);
+       free(title); free(ident); 
+
+       exit(0);
+
+}
+
+void exit_error() {
+       fprintf(stderr,"NCOILS: R.B. Russell, A.N. Lupas, 1999\n");
+       fprintf(stderr," based on Lupas, Van Dyck & Stock (1991) Science 252,1162-1164\n");
+       fprintf(stderr,"format: ncoils [options] < [sequence file]\n");
+       fprintf(stderr,"       -f or -fasta        [fasta output - coils as 'x', like '-x' in seg]\n");
+       fprintf(stderr,"       -c                  [concise mode - which sequences have any coils (and how many)]\n");
+       fprintf(stderr,"       -min_seg <int>      [for concise mode - only report sequence if >= min coil segments]\n");
+       fprintf(stderr,"       -min_P <float>      [minimum P to define coil segment; DEFAULT = 0.5]\n");
+       fprintf(stderr,"       -win <int>          [window size; DEFAULT = 21]\n");
+       fprintf(stderr,"       -w                  [weight heptad positions a&d the same as b,c,e,f,g]\n");
+       fprintf(stderr,"       -v                  [verbose/debug mode - print extra junk]\n");
+       fprintf(stderr,"       -max_seq_len <int>  [longest sequence tolerated; DEFAULT = 100 000]\n");
+       exit(-1); 
+}
+
+void pred_coils(char *seq,char *ident,char *title,struct hept_pref *h,int win, int which, int weighted,int mode, float min_P, int *t, int *tc, int min_seg) {
+
+       int i,j;
+       int len,pos,aa_pt;
+       int pt;
+       int total_coil_segments;
+       int are_there_coils;
+
+       float actual_win;
+       float this_score,Gg,Gcc,power;
+       float t1,t2,t3,t4;
+       float *score;
+       float *P;
+
+       char *hept_seq;
+       
+       len=strlen(seq);
+
+       score    = (float*)malloc(len*sizeof(float));
+       P        = (float*)malloc(len*sizeof(float));
+       hept_seq =  (char*)malloc(len*sizeof(char));
+
+/*     printf("Sequence is %s length is %d\n",seq,len); */
+       for(i=0; i<len; ++i) { P[i]=0.0; score[i] = 0.0; hept_seq[i] = 'x'; }
+
+       for(i=0; i<(len-win+1); ++i) {
+               this_score = 1.0;
+               actual_win=0.0;
+                for(j=0; ((j<win) && ((i+j)<len)); ++j) {
+                  aa_pt = (int)(seq[i+j]-'A');
+                  if((aa_pt>=0) && (aa_pt<26) && (AAs[aa_pt]!='_')) {
+                       pos = j%7; /* where are we in the heptad?  pos modulus 7 */
+/*                     printf("AA %c in hept %c %7.5f\n",seq[i+j],('a'+pos),h->m[aa_pt][pos]);  */
+                       if(weighted && (pos==0 || pos==3)) { power = 2.5; }
+                       else { power = 1.0; }
+                       actual_win+=power;
+                       if(h->m[aa_pt][pos]!=-1) {
+                               this_score*=pow(h->m[aa_pt][pos],power);
+                       } else {
+                               this_score*=pow(h->smallest,power);
+                       }
+                   }
+                }
+               if(actual_win>0) {
+                  this_score = pow(this_score,(1/(float)actual_win));
+               } else {
+                  this_score = 0;
+               }
+                for(j=0; ((j<win) && ((i+j)<len)); ++j) {
+                   aa_pt = (int)(seq[i+j]-'A');
+                   if((aa_pt>=0) && (aa_pt<26) && (AAs[aa_pt]!='_')) {
+                       pos = j%7; /* where are we in the heptad?  pos modulus 7 */
+                       if(this_score>score[i+j]) { score[i+j]=this_score; hept_seq[i+j]='a'+pos; }
+                   }
+               }
+       }
+
+
+       if(mode==1) {
+               printf(">%s %s\n",ident,title);
+       }
+       are_there_coils=0;
+       total_coil_segments=0;
+       for(i=0; i<len; ++i) {
+               /* Calculate P */
+               t1 = 1/(h->f[which].sd_cc);
+               t2 = (score[i]-(h->f[which].m_cc))/h->f[which].sd_cc;
+               t3 = fabs(t2);
+               t4 = pow(t3,2);
+               t4 = t3*t3;
+               Gcc = t1 * exp(-0.5*t4);
+/*             printf("Gcc %f %f %f %f %f\n",t1cc,t2cc,t3cc,t4cc,Gcc); */
+               t1 = 1/(h->f[which].sd_g);
+               t2 = (score[i]-(h->f[which].m_g))/h->f[which].sd_g;
+               t3 = fabs(t2);
+               t4 = pow(t3,2);
+               t4 = t3*t3;
+               Gg = t1 * exp(-0.5*t4);
+/*             printf("Gg %f %f %f %f %f\n",t1g,t2g,t3g,t4g,Gg); */
+               P[i] = Gcc/(h->f[which].sc*Gg+Gcc);
+               if(P[i]>=min_P) {
+                   are_there_coils=1;
+                   if((i==0) || (P[i-1]<min_P)) { total_coil_segments++; }
+                  (*tc)++; 
+                }
+               (*t)++;
+               if(mode==1) {
+                       if(P[i]>=min_P) { printf("x"); }
+                       else { printf("%c",seq[i]); }
+                       if(((i+1)%60)==0) { printf("\n"); }
+               } else if(mode==0) {
+                       printf("%4d %c %c %7.3f %7.3f (%7.3f %7.3f)\n",i+1,seq[i],hept_seq[i],score[i],P[i],Gcc,Gg);
+               }
+       }
+       if(mode==1) { printf("\n"); } 
+        if((mode==2) && (are_there_coils==1) && (total_coil_segments>=min_seg)) {
+               if(total_coil_segments==1) {
+                       printf("Pred %4d coil segment  : %s %s\n",total_coil_segments,ident,title);
+               } else {
+                       printf("Pred %4d coil segments : %s %s\n",total_coil_segments,ident,title);
+               }
+       }
+
+       free(P); free(score); free(hept_seq);
+}