JPRED-2 Add sources of all binaries (except alscript) to Git
[jpred.git] / sources / ncoils / ncoils.c
1 #include <ncoils.h>
2
3 /* Rob Russell's attempt to make a COILS program */
4
5 main(int argc, char *argv[]) {
6
7         int i,j,k,l;
8         int verb;
9         int window,pt;
10         int which,weighted;
11         int nseq;
12         int t,tc;
13         int seqlen;
14         int mode;
15         int min_seg;
16
17         char heptfile[1000];
18         char *buff;
19         static char *env;
20         char *seq,*title,*ident;
21
22         float min_P;
23
24         struct hept_pref *h;
25
26         FILE *MAT;
27
28
29         /* defaults */
30         window = 21;
31         weighted = 0;
32         verb = 0;
33         mode = 0; /* 0 = column mode, 1 = fasta, 2 = concise */
34         min_P = 0.5;
35
36         if((env=getenv("COILSDIR"))==NULL) {
37                 fprintf(stderr,"error: environment variable COILSDIR must be set\n");
38                 exit(-1);
39         }
40
41         strcpy(&heptfile[0],env);
42         strcpy(&heptfile[strlen(heptfile)],"/new.mat");
43
44
45         for(i=1; i<argc; ++i) {
46            if(argv[i][0]!='-') exit_error();
47            if(strcmp(&argv[i][1],"m")==0) {
48              if((i+1)>=argc) exit_error();
49              strcpy(&heptfile[0],argv[i+1]);
50              i++;
51            } else if(strcmp(&argv[i][1],"win")==0) {
52              if((i+1)>=argc) exit_error();
53              sscanf(argv[i+1],"%d",&window);
54              i++;
55            } else if(strcmp(&argv[i][1],"c")==0) {
56              mode=2;
57            } else if(strcmp(&argv[i][1],"min_seg")==0) {
58              if((i+1)>=argc) exit_error();
59              sscanf(argv[i+1],"%d",&min_seg);
60              i++;
61            } else if(strcmp(&argv[i][1],"c")==0) {
62              mode=2;
63            } else if((strcmp(&argv[i][1],"f")==0) || (strcmp(&argv[i][1],"fasta")==0)) {
64              mode=1;
65            } else if((strcmp(&argv[i][1],"min_P")==0)) {
66              if((i+1)>=argc) exit_error();
67              sscanf(argv[i+1],"%f",&min_P);
68              i++;
69            } else if(strcmp(&argv[i][1],"help")==0) {
70              exit_error();
71            } else if(strcmp(&argv[i][1],"w")==0) {
72              weighted=1;
73            } else if(strcmp(&argv[i][1],"V")==0 || strcmp(&argv[i][1],"v")==0) {
74              verb=1;
75            } else {
76              fprintf(stderr," can't understand flag/field %s\n",argv[i]);
77              exit_error();
78            }
79         }
80
81         if(verb) printf("Matrix file is %s\n",heptfile);
82         /* Read in matrix */
83         if((MAT=fopen(heptfile,"r"))==NULL) {
84                 fprintf(stderr,"Error reading %s\n",heptfile);
85                 exit(-1);
86         }
87         h = read_matrix(MAT);
88         if(verb) {
89            for(i=0; i<strlen(AAs); ++i) if(AAs[i]!='_') {
90                 pt=(int)(AAs[i]-'A');
91                 printf("AA %c %4.2f %4.2f %4.2f %4.2f %4.2f %4.2f %4.2f\n",AAs[i],
92                    h->m[pt][0],h->m[pt][1],h->m[pt][2],h->m[pt][3],h->m[pt][4],
93                    h->m[pt][5],h->m[pt][6]);
94            }
95            for(i=0; i<h->n; ++i) {
96                 printf("Window %4d %1d %f %f %f %f %f\n",
97                         h->f[i].win,h->f[i].w,h->f[i].m_cc,h->f[i].sd_cc,h->f[i].m_g,h->f[i].sd_g,h->f[i].sc);
98            }
99         }
100
101         /* See if there is a file for our chosen window length/weight scheme */
102         which = -1;
103         for(i=0; i<h->n; ++i) {
104                 if((h->f[i].win == window) && (h->f[i].w == weighted)) { /* match */
105                         if(verb) printf("Found fitting data for win %4d w %d\n",window,weighted);
106                         which = i;
107                 }
108         }
109
110         /* Read in a sequence from the standard input */
111         nseq = 0;
112         ident = (char*) malloc(100*sizeof(char));
113         title = (char*) malloc(100000*sizeof(char));
114         buff  = (char*) malloc(100000*sizeof(char));
115         t = 0;
116         tc = 0;
117         while(fgets(buff,99999,stdin)!=NULL) {
118                 /* There is a memory problem - the matrix data gets corrupted under OSF after this fgets */
119                 for(i=0; i<strlen(buff); ++i) if(buff[i]=='\n' || buff[i]=='\r') buff[i]='\0';
120                 if(buff[0]=='>') {
121                         if(nseq>0) { 
122                                 pred_coils(seq,ident,title,h,window,which,weighted,mode,min_P,&t,&tc,min_seg); 
123                                 free(seq);
124                         }
125                         seqlen = 0;
126                         i=1;
127                         while((buff[i]!=' ') && (buff[i]!='\0') && (buff[i]!='\n') && (buff[i]!='\r')) { ident[i-1]=buff[i]; i++; }
128                         ident[i-1]='\0';
129                         i++; j=0;
130                         seq = (char*)malloc(sizeof(char));
131                         seq[0]='\0';
132                         while(buff[i]!='\n' && buff[i]!='\0' && buff[i]!='\r') {
133                                 title[j]=buff[i]; i++;
134                                 j++;
135                         }
136                         title[j]='\0';
137                         nseq++;
138                 } else {
139 /*                      printf("Adding |%s| to |%s| = \n",buff,seq);  */
140                         seq=(char*)realloc(seq,(seqlen+strlen(buff)+1)*sizeof(char));
141                         strcpy(&seq[seqlen],buff); 
142                         seqlen=strlen(seq);
143 /*                      printf("       |%s|\n",seq);  */
144                 }
145         }
146         if(nseq>0) { 
147                 pred_coils(seq,ident,title,h,window,which,weighted,mode,min_P,&t,&tc,min_seg); 
148                 free(seq);
149         }
150         fprintf(stderr,"%8d sequences %8d aas %8d in coil\n",nseq,t,tc);
151         free(title); free(ident); 
152
153         exit(0);
154
155 }
156
157 void exit_error() {
158         fprintf(stderr,"NCOILS: R.B. Russell, A.N. Lupas, 1999\n");
159         fprintf(stderr," based on Lupas, Van Dyck & Stock (1991) Science 252,1162-1164\n");
160         fprintf(stderr,"format: ncoils [options] < [sequence file]\n");
161         fprintf(stderr,"       -f or -fasta        [fasta output - coils as 'x', like '-x' in seg]\n");
162         fprintf(stderr,"       -c                  [concise mode - which sequences have any coils (and how many)]\n");
163         fprintf(stderr,"       -min_seg <int>      [for concise mode - only report sequence if >= min coil segments]\n");
164         fprintf(stderr,"       -min_P <float>      [minimum P to define coil segment; DEFAULT = 0.5]\n");
165         fprintf(stderr,"       -win <int>          [window size; DEFAULT = 21]\n");
166         fprintf(stderr,"       -w                  [weight heptad positions a&d the same as b,c,e,f,g]\n");
167         fprintf(stderr,"       -v                  [verbose/debug mode - print extra junk]\n");
168         fprintf(stderr,"       -max_seq_len <int>  [longest sequence tolerated; DEFAULT = 100 000]\n");
169         exit(-1); 
170 }
171
172 void pred_coils(char *seq,char *ident,char *title,struct hept_pref *h,int win, int which, int weighted,int mode, float min_P, int *t, int *tc, int min_seg) {
173
174         int i,j;
175         int len,pos,aa_pt;
176         int pt;
177         int total_coil_segments;
178         int are_there_coils;
179
180         float actual_win;
181         float this_score,Gg,Gcc,power;
182         float t1,t2,t3,t4;
183         float *score;
184         float *P;
185
186         char *hept_seq;
187         
188         len=strlen(seq);
189
190         score    = (float*)malloc(len*sizeof(float));
191         P        = (float*)malloc(len*sizeof(float));
192         hept_seq =  (char*)malloc(len*sizeof(char));
193
194 /*      printf("Sequence is %s length is %d\n",seq,len); */
195         for(i=0; i<len; ++i) { P[i]=0.0; score[i] = 0.0; hept_seq[i] = 'x'; }
196
197         for(i=0; i<(len-win+1); ++i) {
198                 this_score = 1.0;
199                 actual_win=0.0;
200                 for(j=0; ((j<win) && ((i+j)<len)); ++j) {
201                    aa_pt = (int)(seq[i+j]-'A');
202                    if((aa_pt>=0) && (aa_pt<26) && (AAs[aa_pt]!='_')) {
203                         pos = j%7; /* where are we in the heptad?  pos modulus 7 */
204 /*                      printf("AA %c in hept %c %7.5f\n",seq[i+j],('a'+pos),h->m[aa_pt][pos]);  */
205                         if(weighted && (pos==0 || pos==3)) { power = 2.5; }
206                         else { power = 1.0; }
207                         actual_win+=power;
208                         if(h->m[aa_pt][pos]!=-1) {
209                                 this_score*=pow(h->m[aa_pt][pos],power);
210                         } else {
211                                 this_score*=pow(h->smallest,power);
212                         }
213                     }
214                 }
215                 if(actual_win>0) {
216                    this_score = pow(this_score,(1/(float)actual_win));
217                 } else {
218                    this_score = 0;
219                 }
220                 for(j=0; ((j<win) && ((i+j)<len)); ++j) {
221                     aa_pt = (int)(seq[i+j]-'A');
222                     if((aa_pt>=0) && (aa_pt<26) && (AAs[aa_pt]!='_')) {
223                         pos = j%7; /* where are we in the heptad?  pos modulus 7 */
224                         if(this_score>score[i+j]) { score[i+j]=this_score; hept_seq[i+j]='a'+pos; }
225                     }
226                 }
227        }
228
229
230         if(mode==1) {
231                 printf(">%s %s\n",ident,title);
232         }
233         are_there_coils=0;
234         total_coil_segments=0;
235         for(i=0; i<len; ++i) {
236                 /* Calculate P */
237                 t1 = 1/(h->f[which].sd_cc);
238                 t2 = (score[i]-(h->f[which].m_cc))/h->f[which].sd_cc;
239                 t3 = fabs(t2);
240                 t4 = pow(t3,2);
241                 t4 = t3*t3;
242                 Gcc = t1 * exp(-0.5*t4);
243 /*              printf("Gcc %f %f %f %f %f\n",t1cc,t2cc,t3cc,t4cc,Gcc); */
244                 t1 = 1/(h->f[which].sd_g);
245                 t2 = (score[i]-(h->f[which].m_g))/h->f[which].sd_g;
246                 t3 = fabs(t2);
247                 t4 = pow(t3,2);
248                 t4 = t3*t3;
249                 Gg = t1 * exp(-0.5*t4);
250 /*              printf("Gg %f %f %f %f %f\n",t1g,t2g,t3g,t4g,Gg); */
251                 P[i] = Gcc/(h->f[which].sc*Gg+Gcc);
252                 if(P[i]>=min_P) {
253                    are_there_coils=1;
254                    if((i==0) || (P[i-1]<min_P)) { total_coil_segments++; }
255                    (*tc)++; 
256                 }
257                 (*t)++;
258                 if(mode==1) {
259                         if(P[i]>=min_P) { printf("x"); }
260                         else { printf("%c",seq[i]); }
261                         if(((i+1)%60)==0) { printf("\n"); }
262                 } else if(mode==0) {
263                         printf("%4d %c %c %7.3f %7.3f (%7.3f %7.3f)\n",i+1,seq[i],hept_seq[i],score[i],P[i],Gcc,Gg);
264                 }
265         }
266         if(mode==1) { printf("\n"); } 
267         if((mode==2) && (are_there_coils==1) && (total_coil_segments>=min_seg)) {
268                 if(total_coil_segments==1) {
269                         printf("Pred %4d coil segment  : %s %s\n",total_coil_segments,ident,title);
270                 } else {
271                         printf("Pred %4d coil segments : %s %s\n",total_coil_segments,ident,title);
272                 }
273         }
274
275         free(P); free(score); free(hept_seq);
276 }