X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;ds=inline;f=website%2Farchive%2Fbinaries%2Fmac%2Fsrc%2Ffasta34%2Fres_stats.c;fp=website%2Farchive%2Fbinaries%2Fmac%2Fsrc%2Ffasta34%2Fres_stats.c;h=b756493c5266be019a4537ad0e27e25712e210e9;hb=dbde3fb6f00b9bb770343631a517c0e599db8528;hp=0000000000000000000000000000000000000000;hpb=85f830bbd51a7277994bd4233141016304e210c9;p=jabaws.git diff --git a/website/archive/binaries/mac/src/fasta34/res_stats.c b/website/archive/binaries/mac/src/fasta34/res_stats.c new file mode 100644 index 0000000..b756493 --- /dev/null +++ b/website/archive/binaries/mac/src/fasta34/res_stats.c @@ -0,0 +1,685 @@ +/* calculate stats from results file using scalesws.c */ + +#include +#include +#include + +#include +#include + +#define MAX_LLEN 200 + +#define LN_FACT 10.0 + +#include "defs.h" +#include "structs.h" +#include "param.h" + +struct beststr { + int score; /* smith-waterman score */ + int sscore; /* duplicate for compatibility with fasta */ + double comp; + double H; + double zscore; + double escore; + int n1; +#ifndef USE_FTELLO + long lseek; /* position in library file */ +#else + off_t lseek; +#endif + int cont; /* offset into sequence */ + int frame; + int lib; + char libstr[13]; +} *bbp, *bestptr, **bptr, *best; + +struct stat_str { + int score; + int n1; + double comp; + double H; +}; + +static struct db_str qtt = {0l, 0l, 0}; + +char gstring2[MAX_STR]; /* string for label */ +char gstring3[MAX_STR]; +char hstring1[MAX_STR]; + +FILE *outfd; + +int nbest; /* number of sequences better than bestcut in best */ +int bestcut=1; /* cut off for getting into MAXBEST */ +int bestfull; + +int dohist = 0; +int zsflag = 1; +int outtty=1; +int llen=40; + +/* statistics functions */ +extern void +process_hist(struct stat_str *sptr, int nstat, struct pstruct pst, + struct hist_str *hist, void **); +extern void addhistz(double, struct hist_str *); /* scaleswn.c */ +void selectbestz(struct beststr **, int, int ); + +extern double zs_to_E(double, int, int, long, struct db_str); +extern double zs_to_Ec(double zs, long entries); + +extern double (*find_zp)(int score, int length, double comp, void *); + +void prhist(FILE *, struct mngmsg, struct pstruct, struct hist_str, + int, struct db_str, char *); + +int nshow=20, mshow=50, ashow= -1; +double e_cut=10.0; + +main(argc, argv) + int argc; char **argv; +{ + FILE *fin; + char line[512]; + int max, icol, iarg, i, qsfnum, lsfnum, n0, n1, s[3], frame; + double comp, H; + int idup, ndup, max_s; + char libstr[20], *bp; + char bin_file[80]; + FILE *bout=NULL; + struct mngmsg m_msg; /* Message from host to manager */ + struct pstruct pst; + struct stat_str *stats; + int nstats; + double zscor, mu, var; + +#if defined(UNIX) + outtty = isatty(1); +#else + outtty = 1; +#endif + + if (argc < 2 ) { + fprintf(stderr," useage - res_stats -c col -r bin_file file\n"); + exit(1); + } + + m_msg.db.length = qtt.length = 0l; + m_msg.db.entries = m_msg.db.carry = qtt.entries = qtt.carry = 0; + m_msg.pstat_void = NULL; + m_msg.hist.hist_a = NULL; + m_msg.nohist = 0; + m_msg.markx = 0; + + pst.n0 = 200; /* sensible dummy value */ + pst.zsflag = 1; + pst.dnaseq = 0; + pst.histint = 2; + + bin_file[0]='\0'; + icol = 1; + iarg = 1; + ndup = 1; + while (1) { + if (argv[iarg][0]=='-' && argv[iarg][1]=='c') { + sscanf(argv[iarg+1],"%d",&icol); + iarg += 2; + } + else if (argv[iarg][0]=='-' && argv[iarg][1]=='r') { + strncpy(bin_file,argv[iarg+1],sizeof(bin_file)); + iarg += 2; + } + else if (argv[iarg][0]=='-' && argv[iarg][1]=='z') { + sscanf(argv[iarg+1],"%d",&pst.zsflag); + iarg += 2; + } + else if (argv[iarg][0]=='-' && argv[iarg][1]=='n') { + pst.dnaseq = 1; + iarg += 1; + } + else if (argv[iarg][0]=='-' && argv[iarg][1]=='s') { + sscanf(argv[iarg+1],"%d",&ndup); + iarg += 2; + } + else if (argv[iarg][0]=='-' && argv[iarg][1]=='q') { + outtty = 0; + iarg += 1; + } + else break; + } + + icol--; + + if ((fin=fopen(argv[iarg],"r"))==NULL) { + fprintf(stderr," cannot open %s\n",argv[1]); + exit(1); + } + + if (bin_file[0]!='\0' && ((bout=fopen(bin_file,"w"))==NULL)) { + fprintf(stderr,"cannot open %s for output\n",bin_file); + } + + if ((stats = + (struct stat_str *)malloc((MAXSTATS)*sizeof(struct stat_str)))==NULL) + s_abort ("Cannot allocate stats struct",""); + nstats = 0; + + initbest(MAXBEST+1); /* +1 required for select() */ + + for (nbest=0; nbest 0) pst.n0 = n0; + + while (fgets(line,sizeof(line),fin)!=NULL) { + if (line[0]=='/' && line[1]=='*') { + fputs(line,stdout); + strncpy(gstring2,line,sizeof(gstring2)); + if ((bp=strchr(gstring2,'\n'))!=NULL) *bp = '\0'; + break; + } + if (line[0]==';') { + if ((bp=strchr(line,'|'))!=NULL) qsfnum = atoi(bp+1); + else continue; + if ((bp=strchr(line,'('))!=NULL) { + n0 = atoi(bp+1); + pst.n0 = n0; + } + else { + fprintf(stderr, "cannot find n0:\n %s\n",line); + continue; + } + } + else { + sscanf(line,"%s %d %d %d %lf %lf %d %d %d", + libstr,&lsfnum,&n1,&frame,&comp, &H, &s[0],&s[1],&s[2]); + if (lsfnum==0 && n1==0) { + fputs(line,stderr); + continue; + } + if (n1 < 10 || s[icol]<=0) fputs(line,stderr); + idup++; + + if (s[icol] > max_s) max_s = s[icol]; + if (idup < ndup) continue; + + m_msg.db.entries++; + m_msg.db.length += n1; + + if (dohist) addhistz(zscor=(*find_zp)(max_s,n1,comp,m_msg.pstat_void), + &m_msg.hist); + else zscor = (double)max_s; + + if (nstats < MAXSTATS) { + stats[nstats].n1 = n1; + stats[nstats].comp = comp; + stats[nstats].H = H; + stats[nstats++].score = max_s; + } + + else if (!dohist) { + /* do_bout(bout,stats,nstats); */ + process_hist(stats,nstats,pst,&m_msg.hist, &m_msg.pstat_void); + for (i=0; izscore = + (*find_zp)(bptr[i]->score,bptr[i]->n1,bptr[i]->comp, + m_msg.pstat_void); + dohist = 1; + } + + if (dohist) { + zscor =(*find_zp)(max_s,n1,comp,m_msg.pstat_void); + addhistz(zscor,&m_msg.hist); + } + else zscor = (double)max_s; + + if (nbest >= MAXBEST) { + bestfull = nbest-MAXBEST/4; + selectz(bestfull-1,nbest); + bestcut = (int)(bptr[bestfull-1]->zscore+0.5); + nbest = bestfull; + } + bestptr = bptr[nbest]; + bestptr->score = max_s; + bestptr->sscore = max_s; + bestptr->n1 = n1; + bestptr->comp = comp; + bestptr->H = H; + bestptr->lib = lsfnum; + bestptr->zscore = zscor; + strncpy(bestptr->libstr,libstr,12); + bestptr->libstr[12]='\0'; + nbest++; + + max_s = -1; + idup = 0; + } + } /* done with reading results */ + + if (!dohist) { + if (nbest < 20) { + zsflag = 0; + } + else { + /* do_bout(bout,stats,nstats); */ + process_hist(stats,nstats,pst,&m_msg.hist,&m_msg.pstat_void); + for (i=0; izscore = + (*find_zp)(bptr[i]->score,bptr[i]->n1,bptr[i]->comp,m_msg.pstat_void); + dohist = 1; + } + } + + printf(" using n0: %d\n",pst.n0); + + /* print histogram, statistics */ + + m_msg.nbr_seq = m_msg.db.entries; + pst.zdb_size = m_msg.db.entries; + /* get_param(&pst, gstring2,gstring3); */ + + prhist(stdout,m_msg,pst,m_msg.hist,nstats,m_msg.db,gstring2); + + if (!zsflag) sortbest(); + else { + sortbestz(bptr,nbest); + for (i=0; iescore = zs_to_E(bptr[i]->zscore,bptr[i]->n1,pst.dnaseq, + pst.zdb_size, m_msg.db); + } + + outfd = stdout; + showbest(m_msg.db); /* display best matches */ +} + +initbest(nbest) /* allocate arrays for best sort */ + int nbest; +{ + + if ((best=(struct beststr *)calloc((size_t)nbest,sizeof(struct beststr))) + == NULL) {fprintf(stderr,"cannot allocate best struct\n"); exit(1);} + if ((bptr=(struct beststr **)calloc((size_t)nbest,sizeof(struct beststr *))) + == NULL) {fprintf(stderr,"cannot allocate bptr\n"); exit(1);} +} + +void +prhist(FILE *fd, struct mngmsg m_msg, + struct pstruct pst, + struct hist_str hist, + int nstats, + struct db_str ntt, + char *gstring2) +{ + int i,j,hl,hll, el, ell, ev; + char hline[80], pch, *bp; + int mh1, mht; + int maxval, maxvalt, dotsiz, ddotsiz,doinset; + double cur_e, prev_e, f_int; + double max_dev, x_tmp; + double db_tt; + int n_chi_sq, cum_hl, max_i; + + + fprintf(fd,"\n"); + + if (pst.zsflag < 0 || nstats <= 10) { + fprintf(fd, "%7ld residues in %5ld sequences\n", ntt.length,ntt.entries); + fprintf(fd,"\n%s\n",gstring2); + return; + } + + max_dev = 0.0; + mh1 = hist.maxh-1; + mht = (3*hist.maxh-3)/4 - 1; + + if (!m_msg.nohist && mh1 > 0) { + for (i=0,maxval=0,maxvalt=0; i maxval) maxval = hist.hist_a[i]; + if (i >= mht && hist.hist_a[i]>maxvalt) maxvalt = hist.hist_a[i]; + } + n_chi_sq = 0; + cum_hl = -hist.hist_a[0]; + dotsiz = (maxval-1)/60+1; + ddotsiz = (maxvalt-1)/50+1; + doinset = (ddotsiz < dotsiz && dotsiz > 2); + + if (pst.zsflag>=0) + fprintf(fd," opt E()\n"); + else + fprintf(fd," opt\n"); + + prev_e = zs_to_Ec((double)(hist.min_hist-hist.histint/2),hist.entries); + for (i=0; i<=mh1; i++) { + pch = (i==mh1) ? '>' : ' '; + pch = (i==0) ? '<' : pch; + hll = hl = hist.hist_a[i]; + if (pst.zsflag>=0) { + cum_hl += hl; + f_int = (double)(i*hist.histint+hist.min_hist)+(double)hist.histint/2.0; + cur_e = (double)zs_to_Ec(f_int,hist.entries); + ev = el = ell = (int)(cur_e - prev_e + 0.5); + if (hl > 0 && i > 5 && i < (90-hist.min_hist)/hist.histint) { + x_tmp = fabs(cum_hl - cur_e); + if ( x_tmp > max_dev) { + max_dev = x_tmp; + max_i = i; + } + n_chi_sq++; + } + if ((el=(el+dotsiz-1)/dotsiz) > 60) el = 60; + if ((ell=(ell+ddotsiz-1)/ddotsiz) > 40) ell = 40; + fprintf(fd,"%c%3d %5d %5d:", + pch,(i 60) hl = 60; + if ((hll=(hll+ddotsiz-1)/ddotsiz) > 40) hll = 40; + for (j=0; j=0) { + if (el <= hl ) { + if (el > 0) hline[el-1]='*'; + hline[hl]='\0'; + } + else { + for (j = hl; j < el; j++) hline[j]=' '; + hline[el-1]='*'; + hline[hl=el]='\0'; + } + } + else hline[hl] = 0; + if (i==1) { + for (j=hl; j<10; j++) hline[j]=' '; + sprintf(&hline[10]," one = represents %d library sequences",dotsiz); + } + if (doinset && i == mht-2) { + for (j = hl; j < 10; j++) hline[j]=' '; + sprintf(&hline[10]," inset = represents %d library sequences",ddotsiz); + } + if (i >= mht&& doinset ) { + for (j = hl; j < 10; j++) hline[j]=' '; + hline[10]=':'; + for (j = 11; j<11+hll; j++) hline[j]='='; + hline[11+hll]='\0'; + if (pst.zsflag>=0) { + if (ell <= hll) hline[10+ell]='*'; + else { + for (j = 11+hll; j < 10+ell; j++) hline[j]=' '; + hline[10+ell] = '*'; + hline[11+ell] = '\0'; + } + } + } + + fprintf(fd,"%s\n",hline); + prev_e = cur_e; + } + } + + if (ntt.carry==0) { + fprintf(fd, "%7ld residues in %5ld sequences\n", ntt.length, ntt.entries); + } + else { + db_tt = (double)ntt.carry*(double)LONG_MAX + (double)ntt.length; + fprintf(fd, "%.0f residues in %5ld library sequences\n", db_tt, ntt.entries); + } + + if (pst.zsflag>=0) { + if (MAXSTATS < hist.entries) + fprintf(fd," statistics extrapolated from %d to %ld sequences\n", + MAXSTATS,hist.entries); + /* summ_stats(stat_info); */ + fprintf(fd," %s\n",hist.stat_info); + if (!m_msg.nohist && cum_hl > 0) + fprintf(fd," Kolmogorov-Smirnov statistic: %6.4f (N=%d) at %3d\n", + max_dev/(double)cum_hl, n_chi_sq,max_i*hist.histint+hist.min_hist); + if (m_msg.markx & MX_M10FORM) { + while ((bp=strchr(hist.stat_info,'\n'))!=NULL) *bp=' '; + if (cum_hl <= 0) cum_hl = -1; + sprintf(hstring1,"; mp_extrap: %d %ld\n; mp_stats: %s\n; mp_KS: %6.4f (N=%d) at %3d\n", + MAXSTATS,hist.entries,hist.stat_info,max_dev/(double)cum_hl, n_chi_sq,max_i*hist.histint+hist.min_hist); + } + } + fprintf(fd,"\n%s\n",gstring2); + fflush(fd); +} + +showbest(struct db_str ntt) + { + int ib, istart, istop; + char bline[200], fmt[40], pad[200]; + char rline[20]; + int ntmp; + int lcont, ccont, loff; + int hcutoff; + + sprintf(fmt,"%%-%ds (%%3d)",llen-10); + + nshow = min(20,nbest); + mshow = min(20,nbest); + + if (outtty) { + printf(" How many scores would you like to see? [%d] ",nshow); + fflush(stdout); + if (fgets(rline,sizeof(rline),stdin)==NULL) exit(0); + if (rline[0]!='\n' && rline[0]!=0) sscanf(rline,"%d",&nshow); + if (nshow<=0) nshow = min(20,nbest); + } + else nshow=mshow; + + memset(pad,' ',llen-10); + pad[llen-31]='\0'; + if (zsflag) + fprintf(outfd,"The best scores are:%s s-w Z-score E(%ld)\n",pad,ntt.entries); + else + fprintf(outfd,"The best scores are:%s s-w\n",pad); + + if (outfd != stdout) + if (zsflag) + fprintf(stdout,"The best scores are:%s s-w Z-score E(%ld)\n",pad,ntt.entries); + else + fprintf(stdout,"The best scores are:%s s-w\n",pad); + + istart = 0; + l1: istop = min(nbest,nshow); + for (ib=istart; ibescore > e_cut) { + nshow = ib; + goto done; + } + + sprintf(bline,"%-12s %d",bbp->libstr,bbp->lib); + bline[13]='\0'; + + fprintf(outfd,fmt,bline,bbp->n1); + + if (zsflag) + fprintf(outfd,"%4d %4.1f %6.2g\n", + bbp->score,bbp->zscore, + bbp->escore); + else + fprintf(outfd,"%4d\n",bbp->score); + + if (outfd!=stdout) { + fprintf(stdout,fmt,bline,bbp->n1); + if (zsflag) + printf("%4d %4.1f %6.2g\n", + bbp->score,bbp->zscore, + bbp->escore); + else + printf("%4d\n",bbp->score); + } + } + + fflush(outfd); if (outfd!=stdout) fflush(stdout); + + if (outtty) { + printf(" More scores? [0] "); + fflush(stdout); + if (fgets(rline,sizeof(rline),stdin)==NULL) exit(0); + ntmp = 0; + if (rline[0]!='\n' && rline[0]!=0) sscanf(rline,"%d",&ntmp); + if (ntmp<=0) ntmp = 0; + if (ntmp>0) { + istart = istop; + nshow += ntmp; + mshow += ntmp; + goto l1; + } + } + else if (zsflag && bbp->escore < e_cut) { + istart=istop; + nshow += 10; + goto l1; + } + + done: + if (outfd!=stdout) fprintf(outfd,"\n"); +} + +selectz(k,n) /* k is rank in array */ + int k,n; +{ + int t, i, j, l, r; + double v; + struct beststr *tmptr; + + l=0; r=n-1; + + while ( r > l ) { + i = l-1; + j = r; + v = bptr[r]->zscore; + do { + while (bptr[++i]->zscore > v ) ; + while (bptr[--j]->zscore < v ) ; + tmptr = bptr[i]; bptr[i]=bptr[j]; bptr[j]=tmptr; + } while (j > i); + bptr[j]=bptr[i]; bptr[i]=bptr[r]; bptr[r]=tmptr; + if (i>=k) r = i-1; + if (i<=k) l = i+1; + } +} + +sortbest() +{ + int cmps(), cmp1(), cmpa(), cmpz(); + ksort(bptr,nbest,cmps); +} + +sortbeste() +{ + int cmpe(); + ksort(bptr,nbest,cmpe); +} + +sortbestz() +{ + int cmpz(); + ksort(bptr,nbest,cmpz); +} + +cmps(ptr1,ptr2) + struct beststr *ptr1, *ptr2; +{ + if (ptr1->score < ptr2->score) return (1); + else if (ptr1->score > ptr2->score) return (-1); + else return (0); +} + +cmpe(ptr1,ptr2) + struct beststr *ptr1, *ptr2; +{ + if (ptr1->escore < ptr2->escore) return (-1); + else if (ptr1->escore > ptr2->escore) return (1); + else return (0); +} + +cmpz(ptr1,ptr2) + struct beststr *ptr1, *ptr2; +{ + if (ptr1->zscore < ptr2->zscore) return (1); + else if (ptr1->zscore > ptr2->zscore) return (-1); + else return (0); +} + +ksort(v,n,comp) + char *v[]; int n, (*comp)(); +{ + int gap, i, j; + char *tmp; + + for (gap=n/2; gap>0; gap/=2) + for (i=gap; i=0; j -= gap) { + if ((*comp)(v[j],v[j+gap]) <=0) + break; + tmp = v[j]; v[j]=v[j+gap]; v[j+gap]=tmp; + } +} + +/* +do_bout(FILE *bout,struct stat_str **bptr, int nbest) +{ + int i, min_hist, max_hist; + double mu, var; + + if (bout==NULL) return; + + inithist(); + for (i = 0; iscore,bptr[i]->n1); + + for (i=0; i0) { + min_hist=i; + break; + } + + for (i=MAX_LLEN-1; i>=0; i--) + if (llen_hist[i]>0) { + max_hist=i; + break; + } + + for (i=min_hist; i<=max_hist; i++) { + mu=(double)score_sums[i]/(double)llen_hist[i]; + if (llen_hist[i]>1) { + var = ((double)score2_sums[i]-(double)llen_hist[i]*mu*mu)/ + (double)(llen_hist[i]-1); + + fprintf(bout,"%d\t%d\t%.1f\t%.1f\t%.1f\t%.4f\t%.4f\n", + i,llen_hist[i],exp(((double)(i))/LN_FACT), + score_sums[i],score2_sums[i],mu,var); + } + } + free_hist(); + fclose(bout); +} +*/ + +s_abort() +{ + exit(1); +}