9 #define AA "GAVLIFPSTCMWYNQDEKRH"
22 #define MAX(a,b) ((a)>(b)?(a):(b))
23 #define MIN(a,b) ((a)<(b)?(a):(b))
51 void read_mat(char *path, char *fn, double **MAT, double *matave);
52 int getargs(char *line,char *args[],int max);
53 void read_ref(char *path, char *fn, double **REF);
54 void Get_Histo( P_STR *P, char *path, char *);
55 double **DMatrix(int n_rows, int n_cols);
56 void *my_malloc(size_t size);
57 void IUPred(SEQ_STR *SEQ, P_STR *P);
58 void getRegions(SEQ_STR *SEQ );
59 void Get_Seq(char *fn, SEQ_STR *SEQ);
69 int main(int argc, char **argv)
80 printf(" Usage: %s seqfile type \n",argv[0]);
81 printf(" where type stands for one of the options of \n");
82 printf(" \"long\", \"short\" or \"glob\"\n");
85 if ((path=getenv("IUPred_PATH"))==NULL) {
86 fprintf(stderr,"IUPred_PATH environment variable is not set\n");
90 printf("# IUPred \n");
91 printf("# Copyright (c) Zsuzsanna Dosztanyi, 2005\n");
93 printf("# Z. Dosztanyi, V. Csizmok, P. Tompa and I. Simon\n");
94 printf("# J. Mol. Biol. (2005) 347, 827-839. \n");
101 if ((strncmp(argv[2],"long",4))==0) {
104 else if ((strncmp(argv[2],"short",5))==0) {
107 else if ((strncmp(argv[2],"glob",4))==0) {
111 printf("Wrong argument\n");exit(1);
115 SEQ=malloc(sizeof(SEQ_STR));
116 Get_Seq(argv[1],SEQ);
117 if (SEQ->le==0) {printf(" Sequence length 0\n");exit(1);}
120 printf("%s %d\n%s\n",SEQ->name,SEQ->le,SEQ->seq);
124 P=malloc(sizeof(P_STR));
125 P->CC= DMatrix(AAN,AAN);
134 read_ref(path,"ss",P->CC);
135 Get_Histo(P, path, "histo");
140 printf("# Prediction output \n");
141 printf("# %s\n",SEQ->name);
142 for (i=0;i<SEQ->le;i++)
143 printf("%5d %c %10.4f\n",i+1,SEQ->seq[i],SEQ->en[i]);
152 read_ref(path,"ss_casp",P->CC);
153 Get_Histo(P, path, "histo_casp");
157 printf("# Prediction output \n");
158 printf("# %s\n",SEQ->name);
159 for (i=0;i<SEQ->le;i++)
160 printf("%5d %c %10.4f\n",i+1,SEQ->seq[i],SEQ->en[i]);
172 read_ref(path,"ss",P->CC);
173 Get_Histo(P,path,"histo");
182 globseq=malloc((SEQ->le+1)*sizeof(char));
183 for (i=0;i<SEQ->le;i++) globseq[i]=tolower(SEQ->seq[i]);
185 printf("# Prediction output \n");
186 printf("# %s\n",SEQ->name);
187 printf("Number of globular domains: %5d \n",SEQ->ngr);
188 for (i=0;i<SEQ->ngr;i++) {
189 printf(" globular domain %5d. %d - %d \n",
190 i+1,SEQ->gr[i][0]+1,SEQ->gr[i][1]+1);
191 for (j=SEQ->gr[i][0];j<SEQ->gr[i][1]+1;j++) {
192 globseq[j]=toupper(globseq[j]);
195 printf(">%s\n",SEQ->name);
196 for (i=0;i<SEQ->le;i++) {
197 if ((i>0)&&(i%60==0)) printf("\n");
198 else if ((i>0)&&(i%10==0)) printf(" ");
199 printf("%c",globseq[i]);
206 for (i=0;i<SEQ->le;i++)
207 printf("%5d %c %10.4f\n",i,SEQ->seq[i],SEQ->en[i]);
212 free(SEQ->eprof);free(SEQ->en);free(SEQ->smp);
220 void IUPred(SEQ_STR *SEQ, P_STR *P)
225 double min, max, step;
228 min=P->min; max=P->max;step=P->step;
230 SEQ->eprof=malloc(naa*sizeof(double));
231 for (i=0;i<naa;i++) SEQ->eprof[i]=0;
232 SEQ->en=malloc(naa*sizeof(double));
233 for (i=0;i<naa;i++) SEQ->en[i]=0;
234 SEQ->smp=malloc(naa*sizeof(double));
235 for (i=0;i<naa;i++) SEQ->smp[i]=0;
239 for (i=0;i<naa;i++) {
240 a1=strchr(AA,((toupper(SEQ->seq[i]))))-AA;
241 if ((a1<0) || (a1>=AAN)) continue;
243 for (j=0;j<naa;j++) if (((abs(i-j))>LC)&&((abs(i-j))<UC)) {
244 a2=strchr(AA,((toupper(SEQ->seq[j]))))-AA;
245 if ((a2<0) || (a2>=AAN)) continue;
246 SEQ->eprof[i]+=P->CC[a1][a2];
249 SEQ->expscore+=SEQ->eprof[i]/(naa*n2);
255 for (i=0;i<naa;i++) {
257 for (j=MAX(0,i-WS);j<=MIN(naa,i+WS+1);j++) {
258 SEQ->smp[i]+=SEQ->eprof[j];
264 for (i=0;i<naa;i++) {
266 for (j=i-WS;j<i+WS;j++) {
267 if ((j<0)||(j>=naa)) SEQ->smp[i]+=EP;
268 else SEQ->smp[i]+=SEQ->eprof[j];
276 for (i=0;i<naa;i++) {
278 if (SEQ->smp[i]<=min+2*step) SEQ->en[i]=1;
279 if (SEQ->smp[i]>=max-2*step) SEQ->en[i]=0;
280 if ((SEQ->smp[i]>min+2*step)&&(SEQ->smp[i]<max-2*step)) {
281 p=(int)((SEQ->smp[i]-min)*(1.0/step));
282 SEQ->en[i]=P->distro[p];
286 printf("%5d %10.4f %10.4f %10.4f\n",
287 i,SEQ->eprof[i], SEQ->smp[i],SEQ->en[i]);
297 void getRegions(SEQ_STR *SEQ )
317 for (i=0;i<naa;i++) {
319 if ((in_GR==1)&&(SEQ->smp[i]<=Min_Ene)) {
320 GR=realloc(GR,(nr+1)*sizeof(int *));
321 GR[nr]=malloc(2*sizeof(int));
329 if ((SEQ->smp[i]>Min_Ene)&&(in_GR==0)) {
336 GR=realloc(GR,(nr+1)*sizeof(int *));
337 GR[nr]=malloc(2*sizeof(int));
349 beg=GR[0][0];end=GR[0][1];
352 if ((kk<nr)&&(GR[kk][0]-end)<JOIN) {
357 else if ((end-beg+1)<DEL) {
365 mGR=realloc(mGR,(mnr+1)*sizeof(int*));
366 mGR[mnr]=malloc(2*sizeof(int));
380 for (i=0;i<nr;i++) free(GR[i]);
391 void Get_Histo(P_STR *P, char *path, char *fn)
396 double v, min, max, cutoff, c;
400 sl=strlen(path)+strlen(fn)+2;
401 fullfn=malloc(sl*sizeof(char));
402 sprintf(fullfn,"%s/%s",path,fn);
404 if ((f=fopen(fullfn,"r"))==NULL) {
405 printf("Could not open %s\n",fullfn);
411 fscanf(f,"%*s %lf %lf %d\n",&min, &max, &nb);
413 P->distro=malloc(nb*sizeof(double ));
414 for (i=0;i<nb;i++) P->distro[i]=0;
417 for (i=0,set=0;i<nb;i++) {
420 if (ln[0]=='#') continue;
422 sscanf(ln,"%*s %lf %*s %*s %lf\n", &c,&v);
423 if ((set==0)&&(v<=0.5)) {set=1;cutoff=c;}
435 P->step=(max-min)/nb;
441 void read_ref(char *path, char *fn, double **REF)
447 char line[1000],s[20];
452 sl=strlen(path)+strlen(fn)+2;
453 fullfn=malloc(sl*sizeof(char));
454 sprintf(fullfn,"%s/%s",path,fn);
456 if ((f=fopen(fullfn,"r"))==NULL) {
457 printf("Could not open %s\n",fullfn);
465 sscanf(line,"%d",&p1);
466 sscanf(&line[8],"%d",&p2);
467 sscanf(&line[17],"%s",s);
475 for (i=0;i<AAN;i++) for (j=0;j<AAN;j++)
486 void read_mat(char *path, char *fn, double **MAT, double *matave)
491 char *args[AAN+2], AAL[AAN],a1,a2;
497 sl=strlen(path)+strlen(fn)+2;
498 fullfn=malloc(sl*sizeof(char));
499 sprintf(fullfn,"%s/%s",path,fn);
501 if ((f=fopen(fullfn,"r"))==NULL) {
502 printf("Could not open %s\n",fullfn);
508 sprintf(AAL,"%s",ln);
516 if (ln[0] == '\n') continue;
517 if (ln[0] == '#') continue;
518 numargs = getargs(ln,args,AAN+2);
522 for (j=0;j<numargs;j++) {
537 for (i=0;i<AAN;i++) for (j=0;j<AAN;j++) *matave+=MAT[i][j];
547 int getargs(char *line,char *args[],int max)
553 for (i=0;i<max;i++) {
554 if ((args[i]=strtok(inptr," \t\n"))==NULL)
564 void *my_malloc(size_t size)
571 new_mem = malloc(size);
572 if (new_mem == NULL) {
573 fprintf(stderr, "can't allocate enough memory: %d bytes\n", size);
580 double **DMatrix(int n_rows, int n_cols)
585 matrix = (double **) my_malloc(n_rows*sizeof(double *));
586 matrix[0] = (double *) my_malloc(n_rows*n_cols*sizeof(double));
588 for (i = 1; i < n_rows; i++)
589 matrix[i] = matrix[i-1] + n_cols;
590 for (i=0;i<n_rows;i++) for (j=0;j<n_cols;j++) matrix[i][j]=0;
597 void Get_Seq(char *fn, SEQ_STR *SEQ)
604 if ((fn==NULL)||(strlen(fn)==0)) {
605 printf("No sequence filename\n"),exit(1);
608 if ((f=fopen(fn,"r"))==NULL) {
609 printf("Could not open %s\n",fn),exit(1);
613 SEQ->seq=calloc(MSL,sizeof(char));
616 sscanf(&line[1],"%s",SEQ->name);
620 while ((c!='>') && (!feof(f))) {
622 if (isalpha((int)(c))) {
625 if ((j>0)&&(j%MSL==0)) {
626 SEQ->seq=realloc(SEQ->seq,(j+MSL)*sizeof(char));
633 printf("%s %5d\n%s\n",SEQ->name,SEQ->le,SEQ->seq);