/* copyright (c) 1996, 1997, 1998, 1999 William R. Pearson and the U. of Virginia */ /* $Name: fa_34_26_5 $ - $Id: showsum.c,v 1.21 2006/06/22 15:00:51 wrp Exp $ */ /* 10 December 1999 -- code modified to reflect the fact that there may be two scores for each sequence - e.g. forward and reverse strand - and only one of them - presumably the best - is a related score. */ /* showsum.c should report statistics for success - given the sorted results (1) find the highest scoring unrelated sequence: unf_score0 find the number of related sequences missed: relm_num0 (2) find the 0.5% highest scoring unrelated sequence: unf_score05 find the number of related sequences missed: relm_num05 (3) find the score where the number of related sequences missed and the number of unrelated sequences found matches; report the score and the number: equ_score, equ_num; The query sequence library number will be put in qsfnum. */ #include #include #include #include #include "defs.h" #include "param.h" #ifndef PCOMPLIB #include "mw.h" #else #include "p_mw.h" #endif #include "structs.h" #ifndef SFCHAR #define SFCHAR ':' #define NSFCHAR '!' #endif #ifdef PCOMPLIB #define BSFNUM(i) bptr[i]->desptr->sfnum #define QSFNUM qsfnum #define NQSFNUM qsfnum_n #else #define BSFNUM(i) bptr[i]->sfnum #define QSFNUM m_msg->qsfnum #define NQSFNUM m_msg->qsfnum_n #endif #define MAX_BLINE 200 double E_to_zs(double, long); double zs_to_E(double,int,int,long,struct db_str db); double zs_to_bit(double,int,int); #ifdef PVM_SRC void sf_sort(int *s, int n); #endif void lnum_sort(struct beststr **s, int n); void showbest (FILE *fp, #ifndef PCOMPLIB unsigned char **aa0, unsigned char *aa1, int maxn, #endif struct beststr **bptr,int nbest, int qlib, struct mngmsg *m_msg, struct pstruct pst, struct db_str db, char *gstring2 #ifndef PCOMPLIB ,void *f_str #endif ) { int i, j, k, rel_tot; int irelv; int unf_num0, relm_num0; int unf_num01,relm_num01; int unf_num02, relm_num02; int unf_num05, relm_num05; int unf_num100, relm_num100; int equ_num, rel_3_num, rel_1_num; double unf_score0, unf_score01, unf_score02 ,unf_score05; double unf_score100, equ_score, rel_3_score, rel_1_score; double unf_score0_b, unf_score01_b, unf_score02_b ,unf_score05_b; double unf_score100_b, equ_score_b, rel_3_score_b, rel_1_score_b; char *bp; #ifdef PCOMPLIB int qsfnum[10],qsfnum_n[10],isf,nsf,nsf_n; char *bp1, *bpn, *tp; char sfstr[MAX_FN]; #endif #ifdef PCOMPLIB /* not done here because done in pvcomplib.c */ if ((bp=strchr(m_msg->qtitle,SFCHAR))!=NULL) { strncpy(sfstr,bp+1,sizeof(sfstr)); sfstr[sizeof(sfstr)-1]='\0'; if ((bp1=strchr(sfstr,SFCHAR)) != NULL) { /* look for second | */ if ((bpn=strchr(sfstr,NSFCHAR))!=NULL) *bpn = '\0'; *bp1='\0'; tp = strtok(sfstr," \t"); qsfnum[0]=atoi(tp); isf = 1; while ((tp=strtok(NULL," \t"))!=NULL) { qsfnum[isf++] = atoi(tp); if (isf >= 10) { fprintf(stderr," error - too many superfamilies: %d\n %s\n", isf,m_msg->qtitle); break; } } qsfnum[nsf=isf]=0; sf_sort(qsfnum,nsf); /* now get negatives */ qsfnum_n[0]= nsf_n = 0; if (bpn != NULL) { tp = strtok(bpn+1," \t"); qsfnum_n[0]=atoi(tp); isf = 1; while ((tp=strtok(NULL," \t"))!=NULL) { qsfnum_n[isf++] = atoi(tp); if (isf >= 10) { fprintf(stderr, " error - too many negative superfamilies: %d\n %s\n", isf,m_msg->qtitle); break; } } qsfnum[nsf_n=isf]=0; sf_sort(qsfnum_n,nsf_n); } } else { /* only one sfnum */ sscanf(bp+1,"%d",qsfnum); qsfnum[1]=0; qsfnum_n[0]= nsf_n = 0; } } else { fprintf(stderr," no query superfamily number\n %s\n",m_msg->qtitle); return; } #endif if (m_msg->qframe > 1 || m_msg->nframe > 1) { /* this code is included for cases where there are several scores - forward and reverse, or six in the case of tfastf33s, for each sequence lnum_sort sorts the library by lseek position, which will be the same for the same sequence */ lnum_sort(bptr,nbest); /* merge, saving the best score */ i = j = 0; /* i has the source position we are currently examining k has the adjacent alternative scores ( k > i) j has the destination */ while (ilseek == bptr[k]->lseek; k++) { if (bptr[i]->zscore < bptr[k]->zscore) bptr[i] = bptr[k]; } bptr[j++]=bptr[i]; i = k; } if (j != m_msg->nbr_seq) { fprintf(stderr,"*** warning ***, nbest (%d/%d) != nbr_seq (%d)\n", j,nbest,m_msg->nbr_seq); fprintf(stdout,"*** warning ***, nbest (%d/%d) != nbr_seq (%d)\n", j,nbest,m_msg->nbr_seq); } nbest = j; if (pst.zsflag >=0) sortbeste(bptr, nbest); else sortbest(bptr,nbest,pst.score_ix); } /* fprintf(stderr," %1d label is %s (%s)\n",irelv,labptr,label); */ /* get the query superfamily */ for (i=0; izscore; unf_score0_b=zs_to_bit(bptr[i]->zscore,m_msg->n0,bptr[i]->n1); break; } } if (i>=nbest) { fprintf(stderr," %s: %d\n error - no unrelated sequences\n", m_msg->qtitle,QSFNUM[0]); return; } for (i=rel_tot=relm_num0=0; i0 ) { rel_tot++; /* total related */ if (bptr[i]->zscore <= unf_score0) relm_num0++; #ifdef DEBUG if (pst.debug_lib) fprintf(stderr,"%d\t%l\t%.1f\n",i,bptr[i]->lseek,bptr[i]->zscore); #endif } } /* relm_num0, unf_num0, unf_score0 done */ /* now calculate number missed at various expectation value cutoffs */ /* calculate z-score cutoff for E()=0.01, 0.02, 0.05 */ unf_score01 = E_to_zs(0.01,db.entries); unf_score02 = E_to_zs(0.02,db.entries); unf_score05 = E_to_zs(0.05,db.entries); unf_score100 = E_to_zs(1.00,db.entries); /* relm_num01, unf_num01, unf_score01 done */ for (i=unf_num01=0,relm_num01=rel_tot; izscore >= unf_score01; i++) { /* if (sfn_cmp(BSFNUM(i),NQSFNUM)) continue; */ if (sfn_cmp(BSFNUM(i),QSFNUM)==0) { if (sfn_cmp(BSFNUM(i),NQSFNUM)==0) unf_num01++; } else relm_num01--; } unf_score01_b=zs_to_bit(bptr[i]->zscore,m_msg->n0,bptr[i]->n1); for (i=unf_num02=0,relm_num02=rel_tot; izscore >= unf_score02; i++) { /* if (sfn_cmp(BSFNUM(i),NQSFNUM)) continue; */ if (sfn_cmp(BSFNUM(i),QSFNUM)==0) { if (sfn_cmp(BSFNUM(i),NQSFNUM)==0) unf_num02++; } else relm_num02--; } unf_score02_b=zs_to_bit(bptr[i]->zscore,m_msg->n0,bptr[i]->n1); for (i=unf_num05=0,relm_num05=rel_tot; izscore >= unf_score05; i++) { /* if (sfn_cmp(BSFNUM(i),NQSFNUM)) continue; */ if (sfn_cmp(BSFNUM(i),QSFNUM)==0) { if (sfn_cmp(BSFNUM(i),NQSFNUM)==0) unf_num05++; } else relm_num05--; } unf_score05_b=zs_to_bit(bptr[i]->zscore,m_msg->n0,bptr[i]->n1); for (i=unf_num100=0,relm_num100=rel_tot; izscore >= unf_score100; i++) { /* if (sfn_cmp(BSFNUM(i),NQSFNUM)) continue; */ if (sfn_cmp(BSFNUM(i),QSFNUM)==0) { if (sfn_cmp(BSFNUM(i),NQSFNUM)==0) unf_num100++; } else relm_num100--; } unf_score100_b=zs_to_bit(bptr[i]->zscore,m_msg->n0,bptr[i]->n1); /* the final criterion finds the score and the number of sequences where the number of unrelated sequences found == the number of related sequences missed. */ equ_num=0; i = 0; j=nbest-1; /* j is counting up the list of scores (actually down the array) from the lowest scoring related sequence i is counting down the list of scores (actually up the array) from the highest scoring unrelated sequence */ for (i=0, j=nbest-1; j>=0 && i=0 && ( sfn_cmp(BSFNUM(j),QSFNUM)==0)) j--; /* fprintf(stderr,"i: %3d %3d %4d; j: %3d %3d %4d\n",i,bptr[i]->zscore, BSFNUM(i),j,bptr[j]->zscore,BSFNUM(j)); */ /* if unrelated [i] score <= related [j] score, quit */ if (bptr[i]->zscore <= bptr[j]->zscore) break; equ_num++; } equ_score = 0.0; if (i>=nbest || j<0) { #ifndef PCOMPLIB if (pst.debug_lib) #endif fprintf(stderr," i (%3d), j (%3d) off end\n %s\n", i, j,m_msg->qtitle); equ_num = rel_tot+1; equ_score = 0.0; } else { equ_score=bptr[i]->zscore; equ_score_b =zs_to_bit(bptr[i]->zscore,m_msg->n0,bptr[i]->n1); } /* get the lowest scoring related */ for (i=0,rel_1_num=rel_tot-1; i 0; i++) { /* if (sfn_cmp(BSFNUM(i),NQSFNUM)) continue; */ if (sfn_cmp(BSFNUM(i),QSFNUM)) rel_1_num--; } rel_1_num = i; rel_1_score = bptr[i]->zscore; rel_1_score_b=zs_to_bit(bptr[i]->zscore,m_msg->n0,bptr[i]->n1); /* get the 3rd lowest scoring related */ for (i=0,rel_3_num=rel_tot-3; i 0; i++) { /* if (sfn_cmp(BSFNUM(i),NQSFNUM)) continue; */ if (sfn_cmp(BSFNUM(i),QSFNUM)) rel_3_num--; } rel_3_num = i; rel_3_score = bptr[i]->zscore; rel_3_score_b=zs_to_bit(bptr[i]->zscore,m_msg->n0,bptr[i]->n1); fprintf(fp,"%3d>%s - %d (%d/%d)\n", qlib,m_msg->qtitle, QSFNUM[0],rel_tot,nbest); fprintf(fp," 0.0 criterion- relm: %3d pos: %3d score: %5.1f exp: %6.4g\n", relm_num0, unf_num0+1, unf_score0_b, zs_to_E(unf_score0,m_msg->n0,pst.dnaseq,pst.zdb_size,db)); fprintf(fp," 0.01 criterion- relm: %3d unf: %3d score: %5.1f exp: %6.4g\n", relm_num01, unf_num01, unf_score01_b, zs_to_E(unf_score01,m_msg->n0,pst.dnaseq,pst.zdb_size,db)); fprintf(fp," 0.02 criterion- relm: %3d unf: %3d score: %5.1f exp: %6.4g\n", relm_num02, unf_num02, unf_score02_b, zs_to_E(unf_score02,m_msg->n0,pst.dnaseq,pst.zdb_size,db)); fprintf(fp," 0.05 criterion- relm: %3d unf: %3d score: %5.1f exp: %6.4g\n", relm_num05, unf_num05, unf_score05_b, zs_to_E(unf_score05,m_msg->n0,pst.dnaseq,pst.zdb_size,db)); fprintf(fp," 1.00 criterion- relm: %3d unf: %3d score: %5.1f exp: %6.4g\n", relm_num100, unf_num100, unf_score100_b, zs_to_E(unf_score100,m_msg->n0,pst.dnaseq,pst.zdb_size,db)); fprintf(fp," equ num: %3d score: %5.1f exp: %6.4g\n",equ_num,equ_score_b, zs_to_E(equ_score,m_msg->n0,pst.dnaseq,pst.zdb_size,db)); fprintf(fp," rel[-1]: %3d score: %5.1f exp: %6.4g\n",rel_1_num+1,rel_1_score_b, zs_to_E(rel_1_score,m_msg->n0,pst.dnaseq,pst.zdb_size,db)); fprintf(fp," rel[-3]: %3d score: %5.1f exp: %6.4g\n",rel_3_num+1,rel_3_score_b, zs_to_E(rel_3_score,m_msg->n0,pst.dnaseq,pst.zdb_size,db)); /* fprintf(fp,"/ ** %s ** /\n",gstring2); fflush(fp); */ m_msg->nshow = m_msg->ashow; } #ifdef PCOMPLIB void showalign() {} #if !defined(MPI_SRC) && !defined(PCOMPLIB) void sf_sort(int *s, int n) { int gap, i, j; int itmp; for (i=0; is[i+1]) goto l2; return; l2: for (gap=n/2; gap>0; gap/=2) for (i=gap; i=0; j -= gap) { if (s[j] <= s[j+gap]) break; itmp = s[j]; s[j]=s[j+gap]; s[j+gap]=itmp; } } #endif #endif void lnum_sort(struct beststr **s, int n) { int gap, i, j; struct beststr *btmp; for (i=0; ilseek > s[i+1]->lseek) goto l2; return; l2: for (gap=n/2; gap>0; gap/=2) for (i=gap; i=0; j -= gap) { if (s[j]->lseek <= s[j+gap]->lseek) break; btmp = s[j]; s[j]=s[j+gap]; s[j+gap]=btmp; } } #ifdef MPI_SRC void aancpy(char *to, char *from, int count, struct pstruct pst) { char *tp, *sq; int nsq; if (pst.ext_sq_set) { nsq = pst.nsqx; sq = pst.sqx; } else { nsq = pst.nsq; sq = pst.sq; } tp=to; while (count-- && *from) { if (*from <= nsq) *tp++ = sq[*(from++)]; else *tp++ = *from++; } *tp='\0'; } #endif