#include #include #include #include #include #include #include "getquery.h" #include "gjutil.h" #define AA "GAVLIFPSTCMWYNQDEKRH" #define AAN 20 #define MSL 40000 #define ML 1000 #define DLC 1 #define DUC 100 #define DWS 10 #define DMin_Ene 0.3 #define DJOIN 45 #define DDEL 35 #define MAX(a,b) ((a)>(b)?(a):(b)) #define MIN(a,b) ((a)<(b)?(a):(b)) typedef struct { char *name; int le; char *seq; double expscore; double *eprof; double *smp; double *en; int ngr; int **gr; } SEQ_STR; typedef struct { double **CC; double *distro; double min, max; double step; double cutoff; int nb; } P_STR; void read_mat(char *path, char *fn, double **MAT, double *matave); int getargs(char *line,char *args[],int max); void read_ref(char *path, char *fn, double **REF); void Get_Histo( P_STR *PARAMS, char *path, char *); double **DMatrix(int n_rows, int n_cols); void *my_malloc(size_t size); void IUPred(SEQ_STR *SEQ, P_STR *PARAMS); void getRegions(SEQ_STR *SEQ ); void Get_Seq(char *fn, SEQ_STR *SEQ); void createOutputFiles(int type); void closeOutputFiles(int type); int LC, UC, WS; double Min_Ene; int JOIN, DEL; int Flag_EP; double EP; FILE *shortout; FILE *longout; FILE *globout; int main(int argc, char **argv) { P_STR *PARAMS; SEQ_STR *SEQ; int i,j; int type; char *path; if (argc<2) { printf(" Usage: %s seqfile type \n",argv[0]); printf(" where type stands for one of the options of \n"); printf(" \"long\", \"short\", \"glob\". If no type is provided the program assumes all types\n"); exit(1); } if ((path=getenv("IUPred_PATH"))==NULL) { fprintf(stderr,"IUPred_PATH environment variable is not set\n"); path="./"; } printf("# IUPred \n"); printf("# Copyright (c) Zsuzsanna Dosztanyi, 2005\n"); printf("#\n"); printf("# Z. Dosztanyi, V. Csizmok, P. Tompa and I. Simon\n"); printf("# J. Mol. Biol. (2005) 347, 827-839. \n"); printf("#\n"); printf("# Modified to work within JABAWS (http://www.compbio.dundee.ac.uk/jabaws) framework by \n"); printf("# Peter Troshin (pvtroshin@dundee.ac.uk) and Geoff Barton gjbarton@dundee.ac.uk\n"); printf("# June, 2011\n"); printf("#\n"); printf("#\n"); if(argv[2]==NULL) { type=3; } else if ((strncmp(argv[2],"long",4))==0) { type=0; } else if ((strncmp(argv[2],"short",5))==0) { type=1; } else if ((strncmp(argv[2],"glob",4))==0) { type=2; } else { printf("Cannot recognise disorder type assuming long\n"); type=0; } FILE *fasta; if ((fasta=fopen(argv[1],"r"))==NULL) { printf("Could not open %s\n",argv[1]),exit(1); } /* Creating output files depending on the type */ createOutputFiles(type); /* Read input file sequence by sequence */ SEQS *fastaseq; do { fastaseq = gseq_fasta(fasta); if(fastaseq==NULL) { break; } SEQ=malloc(sizeof(SEQ_STR)); //printf("No: %s\n",fastaseq->id ); //printf("L: %d\n",fastaseq->slen); SEQ->name=fastaseq->id; SEQ->seq=fastaseq->seq; SEQ->le=fastaseq->slen; #ifdef DEBUG printf("N: %s\n",SEQ->name); printf("S: %s\n",SEQ->seq); printf("L: %d\n",SEQ->le); #endif if (SEQ->le==0) {printf(" Sequence length 0\n");exit(1);} #ifdef DEBUG printf("%s %d\n%s\n",SEQ->name,SEQ->le,SEQ->seq); #endif PARAMS=malloc(sizeof(P_STR)); PARAMS->CC= DMatrix(AAN,AAN); /* LONG - DISORDER */ if (type==0 || type==3) { LC=1; UC=100; WS=10; Flag_EP=0; read_ref(path,"ss",PARAMS->CC); Get_Histo(PARAMS, path, "histo"); IUPred(SEQ,PARAMS); fprintf(longout, "# %s\n",SEQ->name); for (i=0;ile;i++) fprintf(longout, "%5d %c %10.4f\n",i+1,SEQ->seq[i],SEQ->en[i]); } /* SHORT - DISORDER */ if (type==1 || type==3) { LC=1; UC=25; WS=10; Flag_EP=1; EP=-1.26; read_ref(path,"ss_casp",PARAMS->CC); Get_Histo(PARAMS, path, "histo_casp"); IUPred(SEQ,PARAMS); fprintf(shortout, "# %s\n",SEQ->name); for (i=0;ile;i++) fprintf(shortout, "%5d %c %10.4f\n",i+1,SEQ->seq[i],SEQ->en[i]); } /* GLOB - GLOBULAR DOMAINS */ if (type==2 || type==3) { char *globseq; LC=1; UC=100; WS=15; Flag_EP=0; read_ref(path,"ss",PARAMS->CC); Get_Histo(PARAMS,path,"histo"); IUPred(SEQ,PARAMS); Min_Ene=DMin_Ene; JOIN=DJOIN; DEL=DDEL; getRegions(SEQ); globseq=malloc((SEQ->le+1)*sizeof(char)); for (i=0;ile;i++) globseq[i]=tolower(SEQ->seq[i]); fprintf(globout,"# %s\n",SEQ->name); fprintf(globout,"Number of globular domains: %5d \n",SEQ->ngr); for (i=0;ingr;i++) { fprintf(globout," globular domain %5d. %d - %d \n", i+1,SEQ->gr[i][0]+1,SEQ->gr[i][1]+1); for (j=SEQ->gr[i][0];jgr[i][1]+1;j++) { globseq[j]=toupper(globseq[j]); } } fprintf(globout,">%s\n",SEQ->name); for (i=0;ile;i++) { if ((i>0)&&(i%60==0)) fprintf(globout,"\n"); else if ((i>0)&&(i%10==0)) fprintf(globout," "); fprintf(globout,"%c",globseq[i]); } fprintf(globout,"\n"); free(globseq); } #ifdef DEBUG for (i=0;ile;i++) printf("%5d %c %10.4f\n",i,SEQ->seq[i],SEQ->en[i]); #endif free(SEQ->name); free(SEQ->seq); free(SEQ); free(fastaseq); } while(fastaseq != NULL); fclose(fasta); closeOutputFiles(type); return 0; } void closeOutputFiles(int type) { if (type==0) { fclose(longout); } else if (type==1) { fclose(shortout); } else if (type==2) { fclose(globout); } else if (type==3) { fclose(longout); fclose(shortout); fclose(globout); } } void createOutputFiles(int type) { if (type==0) { longout = fopen("out.long", "w"); } else if (type==1) { shortout = fopen("out.short", "w"); } else if (type==2) { globout = fopen("out.glob", "w"); } else if (type==3) { longout = fopen("out.long", "w"); shortout = fopen("out.short", "w"); globout = fopen("out.glob", "w"); } } void IUPred(SEQ_STR *SEQ, P_STR *PARAMS) { int i,j, a1, a2, p; int naa; double n2; double min, max, step; naa=SEQ->le; min=PARAMS->min; max=PARAMS->max;step=PARAMS->step; SEQ->eprof=malloc(naa*sizeof(double)); for (i=0;ieprof[i]=0; SEQ->en=malloc(naa*sizeof(double)); for (i=0;ien[i]=0; SEQ->smp=malloc(naa*sizeof(double)); for (i=0;ismp[i]=0; SEQ->expscore=0; for (i=0;iseq[i]))))-AA; if ((a1<0) || (a1>=AAN)) continue; n2=0; for (j=0;jLC)&&((abs(i-j))seq[j]))))-AA; if ((a2<0) || (a2>=AAN)) continue; SEQ->eprof[i]+=PARAMS->CC[a1][a2]; n2++; } SEQ->expscore+=SEQ->eprof[i]/(naa*n2); SEQ->eprof[i]/=n2; } if (Flag_EP==0) { for (i=0;ismp[i]+=SEQ->eprof[j]; n2++; } SEQ->smp[i]/=n2; } } else { for (i=0;i=naa)) SEQ->smp[i]+=EP; else SEQ->smp[i]+=SEQ->eprof[j]; n2++; } SEQ->smp[i]/=n2; } } for (i=0;ismp[i]<=min+2*step) SEQ->en[i]=1; if (SEQ->smp[i]>=max-2*step) SEQ->en[i]=0; if ((SEQ->smp[i]>min+2*step)&&(SEQ->smp[i]smp[i]-min)*(1.0/step)); SEQ->en[i]=PARAMS->distro[p]; } #ifdef DEBUG printf("%5d %10.4f %10.4f %10.4f\n", i,SEQ->eprof[i], SEQ->smp[i],SEQ->en[i]); #endif } } void getRegions(SEQ_STR *SEQ ) { int naa; int i, k,kk; int **GR, **mGR; int in_GR; int nr, mnr; int beg_GR, end_GR; int beg,end; naa=SEQ->le; GR=NULL; nr=0; in_GR=0; beg_GR=end_GR=0; for (i=0;ismp[i]<=Min_Ene)) { GR=realloc(GR,(nr+1)*sizeof(int *)); GR[nr]=malloc(2*sizeof(int)); GR[nr][0]=beg_GR; GR[nr][1]=end_GR; in_GR=0; nr++; } else if (in_GR==1) end_GR++; if ((SEQ->smp[i]>Min_Ene)&&(in_GR==0)) { beg_GR=i; end_GR=i; in_GR=1; } } if (in_GR==1) { GR=realloc(GR,(nr+1)*sizeof(int *)); GR[nr]=malloc(2*sizeof(int)); GR[nr][0]=beg_GR; GR[nr][1]=end_GR; in_GR=0; nr++; } mnr=0; k=0;mGR=NULL; kk=k+1; if (nr>0) { beg=GR[0][0];end=GR[0][1]; } while (kngr=mnr; SEQ->gr=mGR; } void Get_Histo(P_STR *PARAMS, char *path, char *fn) { FILE *f; char ln[ML]; int i,nb, set; double v, min, max, cutoff, c; char *fullfn; int sl; sl=strlen(path)+strlen(fn)+2; fullfn=malloc(sl*sizeof(char)); sprintf(fullfn,"%s/%s",path,fn); if ((f=fopen(fullfn,"r"))==NULL) { printf("Could not open %s\n",fullfn); exit(1); } fscanf(f,"%*s %lf %lf %d\n",&min, &max, &nb); PARAMS->distro=malloc(nb*sizeof(double )); for (i=0;idistro[i]=0; for (i=0,set=0;idistro[i]=v; } fclose(f); PARAMS->max=max; PARAMS->min=min; PARAMS->nb=nb; PARAMS->cutoff=cutoff; PARAMS->step=(max-min)/nb; PARAMS->cutoff-=PARAMS->step; } void read_ref(char *path, char *fn, double **REF) { FILE *f; int p1,p2; double v; char line[1000],s[20]; int i,j; char *fullfn; int sl; sl=strlen(path)+strlen(fn)+2; fullfn=malloc(sl*sizeof(char)); sprintf(fullfn,"%s/%s",path,fn); if ((f=fopen(fullfn,"r"))==NULL) { printf("Could not open %s\n",fullfn); exit(1); } while (!feof(f)) { fgets(line,1000,f); sscanf(line,"%d",&p1); sscanf(&line[8],"%d",&p2); sscanf(&line[17],"%s",s); v=atof(s); REF[p1][p2]=v; } if (REF[9][9]<0) { for (i=0;iseq=calloc(MSL,sizeof(char)); fgets(line,ML,f); sscanf(&line[1],"%s",SEQ->name); j=0; while ((c!='>') && (!feof(f))) { c=fgetc(f); if (isalpha((int)(c))) { SEQ->seq[j]=c; j++; if ((j>0)&&(j%MSL==0)) { SEQ->seq=realloc(SEQ->seq,(j+MSL)*sizeof(char)); } } } SEQ->le=j; #ifdef DEBUG printf("%s %5d\n%s\n",SEQ->name,SEQ->le,SEQ->seq); #endif } */