--- /dev/null
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <math.h>
+
+#include <ctype.h>
+#include "getquery.h"
+#include "gjutil.h"
+
+
+#define AA "GAVLIFPSTCMWYNQDEKRH"
+#define AAN 20
+#define MSL 40000
+#define ML 1000
+
+#define DLC 1
+#define DUC 100
+#define DWS 10
+#define DMin_Ene 0.3
+#define DJOIN 45
+#define DDEL 35
+
+
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#define MIN(a,b) ((a)<(b)?(a):(b))
+
+
+typedef struct
+{
+ char *name;
+ int le;
+ char *seq;
+ double expscore;
+ double *eprof;
+ double *smp;
+ double *en;
+ int ngr;
+ int **gr;
+} SEQ_STR;
+
+
+typedef struct
+{
+ double **CC;
+ double *distro;
+ double min, max;
+ double step;
+ double cutoff;
+ int nb;
+} P_STR;
+
+
+void read_mat(char *path, char *fn, double **MAT, double *matave);
+int getargs(char *line,char *args[],int max);
+void read_ref(char *path, char *fn, double **REF);
+void Get_Histo( P_STR *PARAMS, char *path, char *);
+double **DMatrix(int n_rows, int n_cols);
+void *my_malloc(size_t size);
+void IUPred(SEQ_STR *SEQ, P_STR *PARAMS);
+void getRegions(SEQ_STR *SEQ );
+void Get_Seq(char *fn, SEQ_STR *SEQ);
+void createOutputFiles(int type);
+void closeOutputFiles(int type);
+
+
+int LC, UC, WS;
+double Min_Ene;
+int JOIN, DEL;
+int Flag_EP;
+double EP;
+FILE *shortout;
+FILE *longout;
+FILE *globout;
+
+
+int main(int argc, char **argv)
+{
+ P_STR *PARAMS;
+ SEQ_STR *SEQ;
+ int i,j;
+ int type;
+ char *path;
+
+ if (argc<2) {
+ printf(" Usage: %s seqfile type \n",argv[0]);
+ printf(" where type stands for one of the options of \n");
+ printf(" \"long\", \"short\", \"glob\". If no type is provided the program assumes all types\n");
+ exit(1);
+ }
+
+ if ((path=getenv("IUPred_PATH"))==NULL) {
+ fprintf(stderr,"IUPred_PATH environment variable is not set\n");
+ path="./";
+ }
+
+ printf("# IUPred \n");
+ printf("# Copyright (c) Zsuzsanna Dosztanyi, 2005\n");
+ printf("#\n");
+ printf("# Z. Dosztanyi, V. Csizmok, P. Tompa and I. Simon\n");
+ printf("# J. Mol. Biol. (2005) 347, 827-839. \n");
+ printf("#\n");
+ printf("# Modified to work within JABAWS (http://www.compbio.dundee.ac.uk/jabaws) framework by \n");
+ printf("# Peter Troshin (pvtroshin@dundee.ac.uk) and Geoff Barton gjbarton@dundee.ac.uk\n");
+ printf("# June, 2011\n");
+ printf("#\n");
+ printf("#\n");
+
+ if(argv[2]==NULL) {
+ type=3;
+ }
+ else if ((strncmp(argv[2],"long",4))==0) {
+ type=0;
+ }
+ else if ((strncmp(argv[2],"short",5))==0) {
+ type=1;
+ }
+ else if ((strncmp(argv[2],"glob",4))==0) {
+ type=2;
+ }
+ else {
+ printf("Cannot recognise disorder type assuming long\n");
+ type=0;
+ }
+
+ FILE *fasta;
+
+ if ((fasta=fopen(argv[1],"r"))==NULL) {
+ printf("Could not open %s\n",argv[1]),exit(1);
+ }
+
+/* Creating output files depending on the type */
+ createOutputFiles(type);
+
+/* Read input file sequence by sequence */
+
+ SEQS *fastaseq;
+
+ do {
+
+ fastaseq = gseq_fasta(fasta);
+ if(fastaseq==NULL) {
+ break;
+ }
+ SEQ=malloc(sizeof(SEQ_STR));
+
+ //printf("No: %s\n",fastaseq->id );
+ //printf("L: %d\n",fastaseq->slen);
+
+ SEQ->name=fastaseq->id;
+ SEQ->seq=fastaseq->seq;
+ SEQ->le=fastaseq->slen;
+
+#ifdef DEBUG
+ printf("N: %s\n",SEQ->name);
+ printf("S: %s\n",SEQ->seq);
+ printf("L: %d\n",SEQ->le);
+#endif
+
+
+ if (SEQ->le==0) {printf(" Sequence length 0\n");exit(1);}
+
+#ifdef DEBUG
+ printf("%s %d\n%s\n",SEQ->name,SEQ->le,SEQ->seq);
+#endif
+
+
+ PARAMS=malloc(sizeof(P_STR));
+ PARAMS->CC= DMatrix(AAN,AAN);
+
+ /* LONG - DISORDER */
+ if (type==0 || type==3) {
+ LC=1;
+ UC=100;
+ WS=10;
+ Flag_EP=0;
+
+ read_ref(path,"ss",PARAMS->CC);
+ Get_Histo(PARAMS, path, "histo");
+
+ IUPred(SEQ,PARAMS);
+
+ fprintf(longout, "# %s\n",SEQ->name);
+
+ for (i=0;i<SEQ->le;i++)
+ fprintf(longout, "%5d %c %10.4f\n",i+1,SEQ->seq[i],SEQ->en[i]);
+
+}
+/* SHORT - DISORDER */
+ if (type==1 || type==3) {
+ LC=1;
+ UC=25;
+ WS=10;
+ Flag_EP=1;
+ EP=-1.26;
+
+ read_ref(path,"ss_casp",PARAMS->CC);
+ Get_Histo(PARAMS, path, "histo_casp");
+
+ IUPred(SEQ,PARAMS);
+
+ fprintf(shortout, "# %s\n",SEQ->name);
+ for (i=0;i<SEQ->le;i++)
+ fprintf(shortout, "%5d %c %10.4f\n",i+1,SEQ->seq[i],SEQ->en[i]);
+}
+
+/* GLOB - GLOBULAR DOMAINS */
+ if (type==2 || type==3) {
+ char *globseq;
+
+ LC=1;
+ UC=100;
+ WS=15;
+ Flag_EP=0;
+
+ read_ref(path,"ss",PARAMS->CC);
+ Get_Histo(PARAMS,path,"histo");
+
+ IUPred(SEQ,PARAMS);
+
+ Min_Ene=DMin_Ene;
+ JOIN=DJOIN;
+ DEL=DDEL;
+
+ getRegions(SEQ);
+ globseq=malloc((SEQ->le+1)*sizeof(char));
+ for (i=0;i<SEQ->le;i++) globseq[i]=tolower(SEQ->seq[i]);
+
+ fprintf(globout,"# %s\n",SEQ->name);
+ fprintf(globout,"Number of globular domains: %5d \n",SEQ->ngr);
+ for (i=0;i<SEQ->ngr;i++) {
+ fprintf(globout," globular domain %5d. %d - %d \n",
+ i+1,SEQ->gr[i][0]+1,SEQ->gr[i][1]+1);
+ for (j=SEQ->gr[i][0];j<SEQ->gr[i][1]+1;j++) {
+ globseq[j]=toupper(globseq[j]);
+ }
+ }
+ fprintf(globout,">%s\n",SEQ->name);
+ for (i=0;i<SEQ->le;i++) {
+ if ((i>0)&&(i%60==0)) fprintf(globout,"\n");
+ else if ((i>0)&&(i%10==0)) fprintf(globout," ");
+ fprintf(globout,"%c",globseq[i]);
+ }
+ fprintf(globout,"\n");
+ free(globseq);
+ }
+
+#ifdef DEBUG
+ for (i=0;i<SEQ->le;i++)
+ printf("%5d %c %10.4f\n",i,SEQ->seq[i],SEQ->en[i]);
+#endif
+
+
+ free(SEQ->name);
+ free(SEQ->seq);
+ free(SEQ);
+ free(fastaseq);
+
+ } while(fastaseq != NULL);
+
+ fclose(fasta);
+ closeOutputFiles(type);
+
+ return 0;
+}
+
+
+void closeOutputFiles(int type) {
+ if (type==0) {
+ fclose(longout);
+ }
+ else if (type==1) {
+ fclose(shortout);
+ }
+ else if (type==2) {
+ fclose(globout);
+ }
+ else if (type==3) {
+ fclose(longout);
+ fclose(shortout);
+ fclose(globout);
+ }
+}
+
+void createOutputFiles(int type) {
+ if (type==0) {
+ longout = fopen("out.long", "w");
+ }
+ else if (type==1) {
+ shortout = fopen("out.short", "w");
+ }
+ else if (type==2) {
+ globout = fopen("out.glob", "w");
+ }
+ else if (type==3) {
+ longout = fopen("out.long", "w");
+ shortout = fopen("out.short", "w");
+ globout = fopen("out.glob", "w");
+ }
+}
+
+void IUPred(SEQ_STR *SEQ, P_STR *PARAMS)
+{
+ int i,j, a1, a2, p;
+ int naa;
+ double n2;
+ double min, max, step;
+
+ naa=SEQ->le;
+ min=PARAMS->min; max=PARAMS->max;step=PARAMS->step;
+
+ SEQ->eprof=malloc(naa*sizeof(double));
+ for (i=0;i<naa;i++) SEQ->eprof[i]=0;
+ SEQ->en=malloc(naa*sizeof(double));
+ for (i=0;i<naa;i++) SEQ->en[i]=0;
+ SEQ->smp=malloc(naa*sizeof(double));
+
+ for (i=0;i<naa;i++) SEQ->smp[i]=0;
+
+ SEQ->expscore=0;
+ for (i=0;i<naa;i++) {
+ a1=strchr(AA,((toupper(SEQ->seq[i]))))-AA;
+ if ((a1<0) || (a1>=AAN)) continue;
+ n2=0;
+ for (j=0;j<naa;j++) if (((abs(i-j))>LC)&&((abs(i-j))<UC)) {
+ a2=strchr(AA,((toupper(SEQ->seq[j]))))-AA;
+ if ((a2<0) || (a2>=AAN)) continue;
+ SEQ->eprof[i]+=PARAMS->CC[a1][a2];
+ n2++;
+ }
+ SEQ->expscore+=SEQ->eprof[i]/(naa*n2);
+ SEQ->eprof[i]/=n2;
+
+ }
+
+ if (Flag_EP==0) {
+ for (i=0;i<naa;i++) {
+ n2=0;
+ for (j=MAX(0,i-WS);j<=MIN(naa,i+WS+1);j++) {
+ SEQ->smp[i]+=SEQ->eprof[j];
+ n2++;
+ }
+ SEQ->smp[i]/=n2;
+ }
+ } else {
+ for (i=0;i<naa;i++) {
+ n2=0;
+ for (j=i-WS;j<i+WS;j++) {
+ if ((j<0)||(j>=naa)) SEQ->smp[i]+=EP;
+ else SEQ->smp[i]+=SEQ->eprof[j];
+
+ n2++;
+ }
+ SEQ->smp[i]/=n2;
+ }
+ }
+
+ for (i=0;i<naa;i++) {
+
+ if (SEQ->smp[i]<=min+2*step) SEQ->en[i]=1;
+ if (SEQ->smp[i]>=max-2*step) SEQ->en[i]=0;
+ if ((SEQ->smp[i]>min+2*step)&&(SEQ->smp[i]<max-2*step)) {
+ p=(int)((SEQ->smp[i]-min)*(1.0/step));
+ SEQ->en[i]=PARAMS->distro[p];
+ }
+
+#ifdef DEBUG
+ printf("%5d %10.4f %10.4f %10.4f\n",
+ i,SEQ->eprof[i], SEQ->smp[i],SEQ->en[i]);
+
+#endif
+ }
+
+}
+
+
+
+void getRegions(SEQ_STR *SEQ )
+{
+ int naa;
+ int i, k,kk;
+ int **GR, **mGR;
+ int in_GR;
+ int nr, mnr;
+ int beg_GR, end_GR;
+ int beg,end;
+
+
+ naa=SEQ->le;
+
+
+ GR=NULL;
+ nr=0;
+ in_GR=0;
+ beg_GR=end_GR=0;
+
+
+ for (i=0;i<naa;i++) {
+
+ if ((in_GR==1)&&(SEQ->smp[i]<=Min_Ene)) {
+ GR=realloc(GR,(nr+1)*sizeof(int *));
+ GR[nr]=malloc(2*sizeof(int));
+ GR[nr][0]=beg_GR;
+ GR[nr][1]=end_GR;
+ in_GR=0;
+ nr++;
+ }
+ else if (in_GR==1)
+ end_GR++;
+ if ((SEQ->smp[i]>Min_Ene)&&(in_GR==0)) {
+ beg_GR=i;
+ end_GR=i;
+ in_GR=1;
+ }
+ }
+ if (in_GR==1) {
+ GR=realloc(GR,(nr+1)*sizeof(int *));
+ GR[nr]=malloc(2*sizeof(int));
+ GR[nr][0]=beg_GR;
+ GR[nr][1]=end_GR;
+ in_GR=0;
+ nr++;
+ }
+
+
+ mnr=0;
+ k=0;mGR=NULL;
+ kk=k+1;
+ if (nr>0) {
+ beg=GR[0][0];end=GR[0][1];
+ }
+ while (k<nr) {
+ if ((kk<nr)&&(GR[kk][0]-end)<JOIN) {
+ beg=GR[k][0];
+ end=GR[kk][1];
+ kk++;
+ }
+ else if ((end-beg+1)<DEL) {
+ k++;
+ if (k<nr) {
+ beg=GR[k][0];
+ end=GR[k][1];
+ }
+ }
+ else {
+ mGR=realloc(mGR,(mnr+1)*sizeof(int*));
+ mGR[mnr]=malloc(2*sizeof(int));
+ mGR[mnr][0]=beg;
+ mGR[mnr][1]=end;
+ mnr++;
+ k=kk;
+ kk++;
+ if (k<nr) {
+ beg=GR[k][0];
+ end=GR[k][1];
+ }
+ }
+ }
+
+
+ for (i=0;i<nr;i++) free(GR[i]);
+ free(GR);
+
+ SEQ->ngr=mnr;
+ SEQ->gr=mGR;
+
+
+
+}
+
+
+void Get_Histo(P_STR *PARAMS, char *path, char *fn)
+{
+ FILE *f;
+ char ln[ML];
+ int i,nb, set;
+ double v, min, max, cutoff, c;
+ char *fullfn;
+ int sl;
+
+ sl=strlen(path)+strlen(fn)+2;
+ fullfn=malloc(sl*sizeof(char));
+ sprintf(fullfn,"%s/%s",path,fn);
+
+ if ((f=fopen(fullfn,"r"))==NULL) {
+ printf("Could not open %s\n",fullfn);
+ exit(1);
+ }
+
+
+
+ fscanf(f,"%*s %lf %lf %d\n",&min, &max, &nb);
+
+ PARAMS->distro=malloc(nb*sizeof(double ));
+ for (i=0;i<nb;i++) PARAMS->distro[i]=0;
+
+
+ for (i=0,set=0;i<nb;i++) {
+ fgets(ln,ML,f);
+ if (feof(f)) break;
+ if (ln[0]=='#') continue;
+
+ sscanf(ln,"%*s %lf %*s %*s %lf\n", &c,&v);
+ if ((set==0)&&(v<=0.5)) {set=1;cutoff=c;}
+ PARAMS->distro[i]=v;
+ }
+
+ fclose(f);
+ PARAMS->max=max;
+ PARAMS->min=min;
+ PARAMS->nb=nb;
+ PARAMS->cutoff=cutoff;
+
+
+ PARAMS->step=(max-min)/nb;
+ PARAMS->cutoff-=PARAMS->step;
+
+}
+
+
+void read_ref(char *path, char *fn, double **REF)
+{
+
+ FILE *f;
+ int p1,p2;
+ double v;
+ char line[1000],s[20];
+ int i,j;
+ char *fullfn;
+ int sl;
+
+ sl=strlen(path)+strlen(fn)+2;
+ fullfn=malloc(sl*sizeof(char));
+ sprintf(fullfn,"%s/%s",path,fn);
+
+ if ((f=fopen(fullfn,"r"))==NULL) {
+ printf("Could not open %s\n",fullfn);
+ exit(1);
+ }
+
+
+ while (!feof(f)) {
+ fgets(line,1000,f);
+
+ sscanf(line,"%d",&p1);
+ sscanf(&line[8],"%d",&p2);
+ sscanf(&line[17],"%s",s);
+ v=atof(s);
+ REF[p1][p2]=v;
+
+ }
+
+
+ if (REF[9][9]<0) {
+ for (i=0;i<AAN;i++) for (j=0;j<AAN;j++)
+ REF[i][j]*=-1;
+ }
+
+
+ fclose(f);
+
+}
+
+
+
+void read_mat(char *path, char *fn, double **MAT, double *matave)
+{
+ FILE *f;
+ char ln[ML];
+ int numargs;
+ char *args[AAN+2], AAL[AAN],a1,a2;
+ int i,j,k ,p, q;
+ double val;
+ int sl;
+ char *fullfn;
+
+ sl=strlen(path)+strlen(fn)+2;
+ fullfn=malloc(sl*sizeof(char));
+ sprintf(fullfn,"%s/%s",path,fn);
+
+ if ((f=fopen(fullfn,"r"))==NULL) {
+ printf("Could not open %s\n",fullfn);
+ exit(1);
+ }
+
+
+ fgets(ln,ML,f);
+ sprintf(AAL,"%s",ln);
+ i=0;
+ while (!feof(f)) {
+ fgets(ln,ML,f);
+ k=0;j=0;
+ if (feof(f)) break;
+
+
+ if (ln[0] == '\n') continue;
+ if (ln[0] == '#') continue;
+ numargs = getargs(ln,args,AAN+2);
+
+
+
+ for (j=0;j<numargs;j++) {
+ val=atof(args[j]);
+ a1=AAL[i];
+ a2=AAL[j];
+ p=strchr(AA,a1)-AA;
+ q=strchr(AA,a2)-AA;
+
+ MAT[p][q]=val;
+
+ }
+ i++;
+
+ }
+
+ *matave=0;
+ for (i=0;i<AAN;i++) for (j=0;j<AAN;j++) *matave+=MAT[i][j];
+ *matave/=(AAN*AAN);
+
+
+ fclose(f);
+
+
+}
+
+
+int getargs(char *line,char *args[],int max)
+{
+ char *inptr;
+ int i;
+
+ inptr=line;
+ for (i=0;i<max;i++) {
+ if ((args[i]=strtok(inptr," \t\n"))==NULL)
+ break;
+ inptr=NULL;
+
+ }
+
+ return(i);
+
+}
+
+void *my_malloc(size_t size)
+{
+
+ void *new_mem;
+
+ if (size == 0)
+ return NULL;
+ new_mem = malloc(size);
+ if (new_mem == NULL) {
+ fprintf(stderr, "can't allocate enough memory: %d bytes\n", size);
+ }
+ return new_mem;
+}
+
+
+
+double **DMatrix(int n_rows, int n_cols)
+{
+ double **matrix;
+ int i,j;
+
+ matrix = (double **) my_malloc(n_rows*sizeof(double *));
+ matrix[0] = (double *) my_malloc(n_rows*n_cols*sizeof(double));
+
+ for (i = 1; i < n_rows; i++)
+ matrix[i] = matrix[i-1] + n_cols;
+ for (i=0;i<n_rows;i++) for (j=0;j<n_cols;j++) matrix[i][j]=0;
+
+ return matrix;
+
+}
+
+
+/*
+void Get_Seq(char *fn, SEQ_STR *SEQ)
+{
+ char line[ML];
+ char c=0;
+ int j;
+ FILE *f;
+
+ if ((fn==NULL)||(strlen(fn)==0)) {
+ printf("No sequence filename\n"),exit(1);
+ }
+
+ if ((f=fopen(fn,"r"))==NULL) {
+ printf("Could not open %s\n",fn),exit(1);
+ }
+
+
+ SEQ->seq=calloc(MSL,sizeof(char));
+
+ fgets(line,ML,f);
+ sscanf(&line[1],"%s",SEQ->name);
+
+
+ j=0;
+ while ((c!='>') && (!feof(f))) {
+ c=fgetc(f);
+ if (isalpha((int)(c))) {
+ SEQ->seq[j]=c;
+ j++;
+ if ((j>0)&&(j%MSL==0)) {
+ SEQ->seq=realloc(SEQ->seq,(j+MSL)*sizeof(char));
+ }
+ }
+ }
+ SEQ->le=j;
+
+#ifdef DEBUG
+ printf("%s %5d\n%s\n",SEQ->name,SEQ->le,SEQ->seq);
+#endif
+
+}
+*/
+