1 /* calculate stats from results file using scalesws.c */
19 int score; /* smith-waterman score */
20 int sscore; /* duplicate for compatibility with fasta */
27 long lseek; /* position in library file */
31 int cont; /* offset into sequence */
35 } *bbp, *bestptr, **bptr, *best;
44 static struct db_str qtt = {0l, 0l, 0};
46 char gstring2[MAX_STR]; /* string for label */
47 char gstring3[MAX_STR];
48 char hstring1[MAX_STR];
52 int nbest; /* number of sequences better than bestcut in best */
53 int bestcut=1; /* cut off for getting into MAXBEST */
61 /* statistics functions */
63 process_hist(struct stat_str *sptr, int nstat, struct pstruct pst,
64 struct hist_str *hist, void **);
65 extern void addhistz(double, struct hist_str *); /* scaleswn.c */
66 void selectbestz(struct beststr **, int, int );
68 extern double zs_to_E(double, int, int, long, struct db_str);
69 extern double zs_to_Ec(double zs, long entries);
71 extern double (*find_zp)(int score, int length, double comp, void *);
73 void prhist(FILE *, struct mngmsg, struct pstruct, struct hist_str,
74 int, struct db_str, char *);
76 int nshow=20, mshow=50, ashow= -1;
80 int argc; char **argv;
84 int max, icol, iarg, i, qsfnum, lsfnum, n0, n1, s[3], frame;
86 int idup, ndup, max_s;
90 struct mngmsg m_msg; /* Message from host to manager */
92 struct stat_str *stats;
94 double zscor, mu, var;
103 fprintf(stderr," useage - res_stats -c col -r bin_file file\n");
107 m_msg.db.length = qtt.length = 0l;
108 m_msg.db.entries = m_msg.db.carry = qtt.entries = qtt.carry = 0;
109 m_msg.pstat_void = NULL;
110 m_msg.hist.hist_a = NULL;
114 pst.n0 = 200; /* sensible dummy value */
124 if (argv[iarg][0]=='-' && argv[iarg][1]=='c') {
125 sscanf(argv[iarg+1],"%d",&icol);
128 else if (argv[iarg][0]=='-' && argv[iarg][1]=='r') {
129 strncpy(bin_file,argv[iarg+1],sizeof(bin_file));
132 else if (argv[iarg][0]=='-' && argv[iarg][1]=='z') {
133 sscanf(argv[iarg+1],"%d",&pst.zsflag);
136 else if (argv[iarg][0]=='-' && argv[iarg][1]=='n') {
140 else if (argv[iarg][0]=='-' && argv[iarg][1]=='s') {
141 sscanf(argv[iarg+1],"%d",&ndup);
144 else if (argv[iarg][0]=='-' && argv[iarg][1]=='q') {
153 if ((fin=fopen(argv[iarg],"r"))==NULL) {
154 fprintf(stderr," cannot open %s\n",argv[1]);
158 if (bin_file[0]!='\0' && ((bout=fopen(bin_file,"w"))==NULL)) {
159 fprintf(stderr,"cannot open %s for output\n",bin_file);
163 (struct stat_str *)malloc((MAXSTATS)*sizeof(struct stat_str)))==NULL)
164 s_abort ("Cannot allocate stats struct","");
167 initbest(MAXBEST+1); /* +1 required for select() */
169 for (nbest=0; nbest<MAXBEST+1; nbest++)
170 bptr[nbest] = &best[nbest];
172 best[-1].score= BIGNUM;
180 /* read the best scores from the results file */
185 /* get first line with sequence length */
186 fgets(line,sizeof(line),fin);
187 sscanf(line,"%d",&n0);
188 if (n0 > 0) pst.n0 = n0;
190 while (fgets(line,sizeof(line),fin)!=NULL) {
191 if (line[0]=='/' && line[1]=='*') {
193 strncpy(gstring2,line,sizeof(gstring2));
194 if ((bp=strchr(gstring2,'\n'))!=NULL) *bp = '\0';
198 if ((bp=strchr(line,'|'))!=NULL) qsfnum = atoi(bp+1);
200 if ((bp=strchr(line,'('))!=NULL) {
205 fprintf(stderr, "cannot find n0:\n %s\n",line);
210 sscanf(line,"%s %d %d %d %lf %lf %d %d %d",
211 libstr,&lsfnum,&n1,&frame,&comp, &H, &s[0],&s[1],&s[2]);
212 if (lsfnum==0 && n1==0) {
216 if (n1 < 10 || s[icol]<=0) fputs(line,stderr);
219 if (s[icol] > max_s) max_s = s[icol];
220 if (idup < ndup) continue;
223 m_msg.db.length += n1;
225 if (dohist) addhistz(zscor=(*find_zp)(max_s,n1,comp,m_msg.pstat_void),
227 else zscor = (double)max_s;
229 if (nstats < MAXSTATS) {
230 stats[nstats].n1 = n1;
231 stats[nstats].comp = comp;
233 stats[nstats++].score = max_s;
237 /* do_bout(bout,stats,nstats); */
238 process_hist(stats,nstats,pst,&m_msg.hist, &m_msg.pstat_void);
239 for (i=0; i<nbest; i++)
241 (*find_zp)(bptr[i]->score,bptr[i]->n1,bptr[i]->comp,
247 zscor =(*find_zp)(max_s,n1,comp,m_msg.pstat_void);
248 addhistz(zscor,&m_msg.hist);
250 else zscor = (double)max_s;
252 if (nbest >= MAXBEST) {
253 bestfull = nbest-MAXBEST/4;
254 selectz(bestfull-1,nbest);
255 bestcut = (int)(bptr[bestfull-1]->zscore+0.5);
258 bestptr = bptr[nbest];
259 bestptr->score = max_s;
260 bestptr->sscore = max_s;
262 bestptr->comp = comp;
264 bestptr->lib = lsfnum;
265 bestptr->zscore = zscor;
266 strncpy(bestptr->libstr,libstr,12);
267 bestptr->libstr[12]='\0';
273 } /* done with reading results */
280 /* do_bout(bout,stats,nstats); */
281 process_hist(stats,nstats,pst,&m_msg.hist,&m_msg.pstat_void);
282 for (i=0; i<nbest; i++)
284 (*find_zp)(bptr[i]->score,bptr[i]->n1,bptr[i]->comp,m_msg.pstat_void);
289 printf(" using n0: %d\n",pst.n0);
291 /* print histogram, statistics */
293 m_msg.nbr_seq = m_msg.db.entries;
294 pst.zdb_size = m_msg.db.entries;
295 /* get_param(&pst, gstring2,gstring3); */
297 prhist(stdout,m_msg,pst,m_msg.hist,nstats,m_msg.db,gstring2);
299 if (!zsflag) sortbest();
301 sortbestz(bptr,nbest);
302 for (i=0; i<nbest; i++)
303 bptr[i]->escore = zs_to_E(bptr[i]->zscore,bptr[i]->n1,pst.dnaseq,
304 pst.zdb_size, m_msg.db);
308 showbest(m_msg.db); /* display best matches */
311 initbest(nbest) /* allocate arrays for best sort */
315 if ((best=(struct beststr *)calloc((size_t)nbest,sizeof(struct beststr)))
316 == NULL) {fprintf(stderr,"cannot allocate best struct\n"); exit(1);}
317 if ((bptr=(struct beststr **)calloc((size_t)nbest,sizeof(struct beststr *)))
318 == NULL) {fprintf(stderr,"cannot allocate bptr\n"); exit(1);}
322 prhist(FILE *fd, struct mngmsg m_msg,
324 struct hist_str hist,
329 int i,j,hl,hll, el, ell, ev;
330 char hline[80], pch, *bp;
332 int maxval, maxvalt, dotsiz, ddotsiz,doinset;
333 double cur_e, prev_e, f_int;
334 double max_dev, x_tmp;
336 int n_chi_sq, cum_hl, max_i;
341 if (pst.zsflag < 0 || nstats <= 10) {
342 fprintf(fd, "%7ld residues in %5ld sequences\n", ntt.length,ntt.entries);
343 fprintf(fd,"\n%s\n",gstring2);
349 mht = (3*hist.maxh-3)/4 - 1;
351 if (!m_msg.nohist && mh1 > 0) {
352 for (i=0,maxval=0,maxvalt=0; i<hist.maxh; i++) {
353 if (hist.hist_a[i] > maxval) maxval = hist.hist_a[i];
354 if (i >= mht && hist.hist_a[i]>maxvalt) maxvalt = hist.hist_a[i];
357 cum_hl = -hist.hist_a[0];
358 dotsiz = (maxval-1)/60+1;
359 ddotsiz = (maxvalt-1)/50+1;
360 doinset = (ddotsiz < dotsiz && dotsiz > 2);
363 fprintf(fd," opt E()\n");
365 fprintf(fd," opt\n");
367 prev_e = zs_to_Ec((double)(hist.min_hist-hist.histint/2),hist.entries);
368 for (i=0; i<=mh1; i++) {
369 pch = (i==mh1) ? '>' : ' ';
370 pch = (i==0) ? '<' : pch;
371 hll = hl = hist.hist_a[i];
374 f_int = (double)(i*hist.histint+hist.min_hist)+(double)hist.histint/2.0;
375 cur_e = (double)zs_to_Ec(f_int,hist.entries);
376 ev = el = ell = (int)(cur_e - prev_e + 0.5);
377 if (hl > 0 && i > 5 && i < (90-hist.min_hist)/hist.histint) {
378 x_tmp = fabs(cum_hl - cur_e);
379 if ( x_tmp > max_dev) {
385 if ((el=(el+dotsiz-1)/dotsiz) > 60) el = 60;
386 if ((ell=(ell+ddotsiz-1)/ddotsiz) > 40) ell = 40;
387 fprintf(fd,"%c%3d %5d %5d:",
388 pch,(i<mh1)?(i)*hist.histint+hist.min_hist :
389 mh1*hist.histint+hist.min_hist,hl,ev);
391 else fprintf(fd,"%c%3d %5d :",
392 pch,(i<mh1)?(i)*hist.histint+hist.min_hist :
393 mh1*hist.histint+hist.min_hist,hl);
395 if ((hl=(hl+dotsiz-1)/dotsiz) > 60) hl = 60;
396 if ((hll=(hll+ddotsiz-1)/ddotsiz) > 40) hll = 40;
397 for (j=0; j<hl; j++) hline[j]='=';
400 if (el > 0) hline[el-1]='*';
404 for (j = hl; j < el; j++) hline[j]=' ';
411 for (j=hl; j<10; j++) hline[j]=' ';
412 sprintf(&hline[10]," one = represents %d library sequences",dotsiz);
414 if (doinset && i == mht-2) {
415 for (j = hl; j < 10; j++) hline[j]=' ';
416 sprintf(&hline[10]," inset = represents %d library sequences",ddotsiz);
418 if (i >= mht&& doinset ) {
419 for (j = hl; j < 10; j++) hline[j]=' ';
421 for (j = 11; j<11+hll; j++) hline[j]='=';
424 if (ell <= hll) hline[10+ell]='*';
426 for (j = 11+hll; j < 10+ell; j++) hline[j]=' ';
428 hline[11+ell] = '\0';
433 fprintf(fd,"%s\n",hline);
439 fprintf(fd, "%7ld residues in %5ld sequences\n", ntt.length, ntt.entries);
442 db_tt = (double)ntt.carry*(double)LONG_MAX + (double)ntt.length;
443 fprintf(fd, "%.0f residues in %5ld library sequences\n", db_tt, ntt.entries);
447 if (MAXSTATS < hist.entries)
448 fprintf(fd," statistics extrapolated from %d to %ld sequences\n",
449 MAXSTATS,hist.entries);
450 /* summ_stats(stat_info); */
451 fprintf(fd," %s\n",hist.stat_info);
452 if (!m_msg.nohist && cum_hl > 0)
453 fprintf(fd," Kolmogorov-Smirnov statistic: %6.4f (N=%d) at %3d\n",
454 max_dev/(double)cum_hl, n_chi_sq,max_i*hist.histint+hist.min_hist);
455 if (m_msg.markx & MX_M10FORM) {
456 while ((bp=strchr(hist.stat_info,'\n'))!=NULL) *bp=' ';
457 if (cum_hl <= 0) cum_hl = -1;
458 sprintf(hstring1,"; mp_extrap: %d %ld\n; mp_stats: %s\n; mp_KS: %6.4f (N=%d) at %3d\n",
459 MAXSTATS,hist.entries,hist.stat_info,max_dev/(double)cum_hl, n_chi_sq,max_i*hist.histint+hist.min_hist);
462 fprintf(fd,"\n%s\n",gstring2);
466 showbest(struct db_str ntt)
468 int ib, istart, istop;
469 char bline[200], fmt[40], pad[200];
472 int lcont, ccont, loff;
475 sprintf(fmt,"%%-%ds (%%3d)",llen-10);
477 nshow = min(20,nbest);
478 mshow = min(20,nbest);
481 printf(" How many scores would you like to see? [%d] ",nshow);
483 if (fgets(rline,sizeof(rline),stdin)==NULL) exit(0);
484 if (rline[0]!='\n' && rline[0]!=0) sscanf(rline,"%d",&nshow);
485 if (nshow<=0) nshow = min(20,nbest);
489 memset(pad,' ',llen-10);
492 fprintf(outfd,"The best scores are:%s s-w Z-score E(%ld)\n",pad,ntt.entries);
494 fprintf(outfd,"The best scores are:%s s-w\n",pad);
498 fprintf(stdout,"The best scores are:%s s-w Z-score E(%ld)\n",pad,ntt.entries);
500 fprintf(stdout,"The best scores are:%s s-w\n",pad);
503 l1: istop = min(nbest,nshow);
504 for (ib=istart; ib<istop; ib++) {
507 if (!outtty && zsflag && bbp->escore > e_cut) {
512 sprintf(bline,"%-12s %d",bbp->libstr,bbp->lib);
515 fprintf(outfd,fmt,bline,bbp->n1);
518 fprintf(outfd,"%4d %4.1f %6.2g\n",
519 bbp->score,bbp->zscore,
522 fprintf(outfd,"%4d\n",bbp->score);
525 fprintf(stdout,fmt,bline,bbp->n1);
527 printf("%4d %4.1f %6.2g\n",
528 bbp->score,bbp->zscore,
531 printf("%4d\n",bbp->score);
535 fflush(outfd); if (outfd!=stdout) fflush(stdout);
538 printf(" More scores? [0] ");
540 if (fgets(rline,sizeof(rline),stdin)==NULL) exit(0);
542 if (rline[0]!='\n' && rline[0]!=0) sscanf(rline,"%d",&ntmp);
543 if (ntmp<=0) ntmp = 0;
551 else if (zsflag && bbp->escore < e_cut) {
558 if (outfd!=stdout) fprintf(outfd,"\n");
561 selectz(k,n) /* k is rank in array */
566 struct beststr *tmptr;
575 while (bptr[++i]->zscore > v ) ;
576 while (bptr[--j]->zscore < v ) ;
577 tmptr = bptr[i]; bptr[i]=bptr[j]; bptr[j]=tmptr;
579 bptr[j]=bptr[i]; bptr[i]=bptr[r]; bptr[r]=tmptr;
587 int cmps(), cmp1(), cmpa(), cmpz();
588 ksort(bptr,nbest,cmps);
594 ksort(bptr,nbest,cmpe);
600 ksort(bptr,nbest,cmpz);
604 struct beststr *ptr1, *ptr2;
606 if (ptr1->score < ptr2->score) return (1);
607 else if (ptr1->score > ptr2->score) return (-1);
612 struct beststr *ptr1, *ptr2;
614 if (ptr1->escore < ptr2->escore) return (-1);
615 else if (ptr1->escore > ptr2->escore) return (1);
620 struct beststr *ptr1, *ptr2;
622 if (ptr1->zscore < ptr2->zscore) return (1);
623 else if (ptr1->zscore > ptr2->zscore) return (-1);
628 char *v[]; int n, (*comp)();
633 for (gap=n/2; gap>0; gap/=2)
634 for (i=gap; i<n; i++)
635 for (j=i-gap; j>=0; j -= gap) {
636 if ((*comp)(v[j],v[j+gap]) <=0)
638 tmp = v[j]; v[j]=v[j+gap]; v[j+gap]=tmp;
643 do_bout(FILE *bout,struct stat_str **bptr, int nbest)
645 int i, min_hist, max_hist;
648 if (bout==NULL) return;
651 for (i = 0; i<nbest; i++)
652 addhist(bptr[i]->score,bptr[i]->n1);
654 for (i=0; i<MAX_LLEN; i++)
655 if (llen_hist[i]>0) {
660 for (i=MAX_LLEN-1; i>=0; i--)
661 if (llen_hist[i]>0) {
666 for (i=min_hist; i<=max_hist; i++) {
667 mu=(double)score_sums[i]/(double)llen_hist[i];
668 if (llen_hist[i]>1) {
669 var = ((double)score2_sums[i]-(double)llen_hist[i]*mu*mu)/
670 (double)(llen_hist[i]-1);
672 fprintf(bout,"%d\t%d\t%.1f\t%.1f\t%.1f\t%.4f\t%.4f\n",
673 i,llen_hist[i],exp(((double)(i))/LN_FACT),
674 score_sums[i],score2_sums[i],mu,var);