12 #define AA "GAVLIFPSTCMWYNQDEKRH"
25 #define MAX(a,b) ((a)>(b)?(a):(b))
26 #define MIN(a,b) ((a)<(b)?(a):(b))
54 void read_mat(char *path, char *fn, double **MAT, double *matave);
55 int getargs(char *line,char *args[],int max);
56 void read_ref(char *path, char *fn, double **REF);
57 void Get_Histo( P_STR *PARAMS, char *path, char *);
58 double **DMatrix(int n_rows, int n_cols);
59 void *my_malloc(size_t size);
60 void IUPred(SEQ_STR *SEQ, P_STR *PARAMS);
61 void getRegions(SEQ_STR *SEQ );
62 void Get_Seq(char *fn, SEQ_STR *SEQ);
63 void createOutputFiles(int type);
64 void closeOutputFiles(int type);
77 int main(int argc, char **argv)
86 printf(" Usage: %s seqfile type \n",argv[0]);
87 printf(" where type stands for one of the options of \n");
88 printf(" \"long\", \"short\", \"glob\". If no type is provided the program assumes all types\n");
92 if ((path=getenv("IUPred_PATH"))==NULL) {
93 fprintf(stderr,"IUPred_PATH environment variable is not set\n");
97 printf("# IUPred \n");
98 printf("# Copyright (c) Zsuzsanna Dosztanyi, 2005\n");
100 printf("# Z. Dosztanyi, V. Csizmok, P. Tompa and I. Simon\n");
101 printf("# J. Mol. Biol. (2005) 347, 827-839. \n");
103 printf("# Modified to work within JABAWS (http://www.compbio.dundee.ac.uk/jabaws) framework by \n");
104 printf("# Peter Troshin (pvtroshin@dundee.ac.uk) and Geoff Barton gjbarton@dundee.ac.uk\n");
105 printf("# June, 2011\n");
112 else if ((strncmp(argv[2],"long",4))==0) {
115 else if ((strncmp(argv[2],"short",5))==0) {
118 else if ((strncmp(argv[2],"glob",4))==0) {
122 printf("Cannot recognise disorder type assuming long\n");
128 if ((fasta=fopen(argv[1],"r"))==NULL) {
129 printf("Could not open %s\n",argv[1]),exit(1);
132 /* Creating output files depending on the type */
133 createOutputFiles(type);
135 /* Read input file sequence by sequence */
141 fastaseq = gseq_fasta(fasta);
145 SEQ=malloc(sizeof(SEQ_STR));
147 //printf("No: %s\n",fastaseq->id );
148 //printf("L: %d\n",fastaseq->slen);
150 SEQ->name=fastaseq->id;
151 SEQ->seq=fastaseq->seq;
152 SEQ->le=fastaseq->slen;
155 printf("N: %s\n",SEQ->name);
156 printf("S: %s\n",SEQ->seq);
157 printf("L: %d\n",SEQ->le);
161 if (SEQ->le==0) {printf(" Sequence length 0\n");exit(1);}
164 printf("%s %d\n%s\n",SEQ->name,SEQ->le,SEQ->seq);
168 PARAMS=malloc(sizeof(P_STR));
169 PARAMS->CC= DMatrix(AAN,AAN);
171 /* LONG - DISORDER */
172 if (type==0 || type==3) {
178 read_ref(path,"ss",PARAMS->CC);
179 Get_Histo(PARAMS, path, "histo");
183 fprintf(longout, "# %s\n",SEQ->name);
185 for (i=0;i<SEQ->le;i++)
186 fprintf(longout, "%5d %c %10.4f\n",i+1,SEQ->seq[i],SEQ->en[i]);
189 /* SHORT - DISORDER */
190 if (type==1 || type==3) {
197 read_ref(path,"ss_casp",PARAMS->CC);
198 Get_Histo(PARAMS, path, "histo_casp");
202 fprintf(shortout, "# %s\n",SEQ->name);
203 for (i=0;i<SEQ->le;i++)
204 fprintf(shortout, "%5d %c %10.4f\n",i+1,SEQ->seq[i],SEQ->en[i]);
207 /* GLOB - GLOBULAR DOMAINS */
208 if (type==2 || type==3) {
216 read_ref(path,"ss",PARAMS->CC);
217 Get_Histo(PARAMS,path,"histo");
226 globseq=malloc((SEQ->le+1)*sizeof(char));
227 for (i=0;i<SEQ->le;i++) globseq[i]=tolower(SEQ->seq[i]);
229 fprintf(globout,"# %s\n",SEQ->name);
230 fprintf(globout,"Number of globular domains: %5d \n",SEQ->ngr);
231 for (i=0;i<SEQ->ngr;i++) {
232 fprintf(globout," globular domain %5d. %d - %d \n",
233 i+1,SEQ->gr[i][0]+1,SEQ->gr[i][1]+1);
234 for (j=SEQ->gr[i][0];j<SEQ->gr[i][1]+1;j++) {
235 globseq[j]=toupper(globseq[j]);
238 fprintf(globout,">%s\n",SEQ->name);
239 for (i=0;i<SEQ->le;i++) {
240 if ((i>0)&&(i%60==0)) fprintf(globout,"\n");
241 else if ((i>0)&&(i%10==0)) fprintf(globout," ");
242 fprintf(globout,"%c",globseq[i]);
244 fprintf(globout,"\n");
249 for (i=0;i<SEQ->le;i++)
250 printf("%5d %c %10.4f\n",i,SEQ->seq[i],SEQ->en[i]);
259 } while(fastaseq != NULL);
262 closeOutputFiles(type);
268 void closeOutputFiles(int type) {
285 void createOutputFiles(int type) {
287 longout = fopen("out.long", "w");
290 shortout = fopen("out.short", "w");
293 globout = fopen("out.glob", "w");
296 longout = fopen("out.long", "w");
297 shortout = fopen("out.short", "w");
298 globout = fopen("out.glob", "w");
302 void IUPred(SEQ_STR *SEQ, P_STR *PARAMS)
307 double min, max, step;
310 min=PARAMS->min; max=PARAMS->max;step=PARAMS->step;
312 SEQ->eprof=malloc(naa*sizeof(double));
313 for (i=0;i<naa;i++) SEQ->eprof[i]=0;
314 SEQ->en=malloc(naa*sizeof(double));
315 for (i=0;i<naa;i++) SEQ->en[i]=0;
316 SEQ->smp=malloc(naa*sizeof(double));
318 for (i=0;i<naa;i++) SEQ->smp[i]=0;
321 for (i=0;i<naa;i++) {
322 a1=strchr(AA,((toupper(SEQ->seq[i]))))-AA;
323 if ((a1<0) || (a1>=AAN)) continue;
325 for (j=0;j<naa;j++) if (((abs(i-j))>LC)&&((abs(i-j))<UC)) {
326 a2=strchr(AA,((toupper(SEQ->seq[j]))))-AA;
327 if ((a2<0) || (a2>=AAN)) continue;
328 SEQ->eprof[i]+=PARAMS->CC[a1][a2];
331 SEQ->expscore+=SEQ->eprof[i]/(naa*n2);
337 for (i=0;i<naa;i++) {
339 for (j=MAX(0,i-WS);j<=MIN(naa,i+WS+1);j++) {
340 SEQ->smp[i]+=SEQ->eprof[j];
346 for (i=0;i<naa;i++) {
348 for (j=i-WS;j<i+WS;j++) {
349 if ((j<0)||(j>=naa)) SEQ->smp[i]+=EP;
350 else SEQ->smp[i]+=SEQ->eprof[j];
358 for (i=0;i<naa;i++) {
360 if (SEQ->smp[i]<=min+2*step) SEQ->en[i]=1;
361 if (SEQ->smp[i]>=max-2*step) SEQ->en[i]=0;
362 if ((SEQ->smp[i]>min+2*step)&&(SEQ->smp[i]<max-2*step)) {
363 p=(int)((SEQ->smp[i]-min)*(1.0/step));
364 SEQ->en[i]=PARAMS->distro[p];
368 printf("%5d %10.4f %10.4f %10.4f\n",
369 i,SEQ->eprof[i], SEQ->smp[i],SEQ->en[i]);
378 void getRegions(SEQ_STR *SEQ )
398 for (i=0;i<naa;i++) {
400 if ((in_GR==1)&&(SEQ->smp[i]<=Min_Ene)) {
401 GR=realloc(GR,(nr+1)*sizeof(int *));
402 GR[nr]=malloc(2*sizeof(int));
410 if ((SEQ->smp[i]>Min_Ene)&&(in_GR==0)) {
417 GR=realloc(GR,(nr+1)*sizeof(int *));
418 GR[nr]=malloc(2*sizeof(int));
430 beg=GR[0][0];end=GR[0][1];
433 if ((kk<nr)&&(GR[kk][0]-end)<JOIN) {
438 else if ((end-beg+1)<DEL) {
446 mGR=realloc(mGR,(mnr+1)*sizeof(int*));
447 mGR[mnr]=malloc(2*sizeof(int));
461 for (i=0;i<nr;i++) free(GR[i]);
472 void Get_Histo(P_STR *PARAMS, char *path, char *fn)
477 double v, min, max, cutoff, c;
481 sl=strlen(path)+strlen(fn)+2;
482 fullfn=malloc(sl*sizeof(char));
483 sprintf(fullfn,"%s/%s",path,fn);
485 if ((f=fopen(fullfn,"r"))==NULL) {
486 printf("Could not open %s\n",fullfn);
492 fscanf(f,"%*s %lf %lf %d\n",&min, &max, &nb);
494 PARAMS->distro=malloc(nb*sizeof(double ));
495 for (i=0;i<nb;i++) PARAMS->distro[i]=0;
498 for (i=0,set=0;i<nb;i++) {
501 if (ln[0]=='#') continue;
503 sscanf(ln,"%*s %lf %*s %*s %lf\n", &c,&v);
504 if ((set==0)&&(v<=0.5)) {set=1;cutoff=c;}
512 PARAMS->cutoff=cutoff;
515 PARAMS->step=(max-min)/nb;
516 PARAMS->cutoff-=PARAMS->step;
521 void read_ref(char *path, char *fn, double **REF)
527 char line[1000],s[20];
532 sl=strlen(path)+strlen(fn)+2;
533 fullfn=malloc(sl*sizeof(char));
534 sprintf(fullfn,"%s/%s",path,fn);
536 if ((f=fopen(fullfn,"r"))==NULL) {
537 printf("Could not open %s\n",fullfn);
545 sscanf(line,"%d",&p1);
546 sscanf(&line[8],"%d",&p2);
547 sscanf(&line[17],"%s",s);
555 for (i=0;i<AAN;i++) for (j=0;j<AAN;j++)
566 void read_mat(char *path, char *fn, double **MAT, double *matave)
571 char *args[AAN+2], AAL[AAN],a1,a2;
577 sl=strlen(path)+strlen(fn)+2;
578 fullfn=malloc(sl*sizeof(char));
579 sprintf(fullfn,"%s/%s",path,fn);
581 if ((f=fopen(fullfn,"r"))==NULL) {
582 printf("Could not open %s\n",fullfn);
588 sprintf(AAL,"%s",ln);
596 if (ln[0] == '\n') continue;
597 if (ln[0] == '#') continue;
598 numargs = getargs(ln,args,AAN+2);
602 for (j=0;j<numargs;j++) {
617 for (i=0;i<AAN;i++) for (j=0;j<AAN;j++) *matave+=MAT[i][j];
627 int getargs(char *line,char *args[],int max)
633 for (i=0;i<max;i++) {
634 if ((args[i]=strtok(inptr," \t\n"))==NULL)
644 void *my_malloc(size_t size)
651 new_mem = malloc(size);
652 if (new_mem == NULL) {
653 fprintf(stderr, "can't allocate enough memory: %d bytes\n", size);
660 double **DMatrix(int n_rows, int n_cols)
665 matrix = (double **) my_malloc(n_rows*sizeof(double *));
666 matrix[0] = (double *) my_malloc(n_rows*n_cols*sizeof(double));
668 for (i = 1; i < n_rows; i++)
669 matrix[i] = matrix[i-1] + n_cols;
670 for (i=0;i<n_rows;i++) for (j=0;j<n_cols;j++) matrix[i][j]=0;
678 void Get_Seq(char *fn, SEQ_STR *SEQ)
685 if ((fn==NULL)||(strlen(fn)==0)) {
686 printf("No sequence filename\n"),exit(1);
689 if ((f=fopen(fn,"r"))==NULL) {
690 printf("Could not open %s\n",fn),exit(1);
694 SEQ->seq=calloc(MSL,sizeof(char));
697 sscanf(&line[1],"%s",SEQ->name);
701 while ((c!='>') && (!feof(f))) {
703 if (isalpha((int)(c))) {
706 if ((j>0)&&(j%MSL==0)) {
707 SEQ->seq=realloc(SEQ->seq,(j+MSL)*sizeof(char));
714 printf("%s %5d\n%s\n",SEQ->name,SEQ->le,SEQ->seq);