2 /* copyright (c) 1996, 1997, 1998, 1999 William R. Pearson and the
5 /* $Name: fa_34_26_5 $ - $Id: compacc.c,v 1.61 2007/04/26 18:37:18 wrp Exp $ */
7 /* Concurrent read version */
11 #if defined(UNIX) || defined(WIN32)
12 #include <sys/types.h>
42 extern int pinums[],hosttid;
49 extern time_t tdone, tstart; /* Timing */
53 /* because it is used to pre-allocate space, maxn has various
54 constraints. For "simple" comparisons, it is simply the length of
55 the longest library sequence. But for translated comparisons, it
56 must be 3 or 6X the length of the query sequence.
58 In addition, however, it can be reduced to make certain that
59 sequences are read in smaller chunks. And, maxn affect how large
60 overlaps must be when sequences are read in chunks.
64 reset_maxn(struct mngmsg *m_msg, int maxn) {
66 /* reduce maxn if requested */
67 if (m_msg->maxn > 0 && m_msg->maxn < maxn) maxn = m_msg->maxn;
69 if (m_msg->qdnaseq==m_msg->ldnaseq || m_msg->qdnaseq==SEQT_DNA ||
70 m_msg->qdnaseq == SEQT_RNA) {/* !TFAST - either FASTA or FASTX*/
72 if (m_msg->n0> m_msg->max_tot/3) {
73 fprintf(stderr," query sequence is too long %d > %d %s\n",
79 m_msg->loff = m_msg->n0;
80 m_msg->maxt3 = maxn-m_msg->loff;
83 if (m_msg->n0 > MAXTST) {
84 fprintf(stderr," query sequence is too long %d %s\n",m_msg->n0,m_msg->sqnam);
88 if (m_msg->n0*3 > maxn ) { /* n0*3 for the three frames - this
89 will only happen if maxn has been
92 if (m_msg->n0*4+2 < m_msg->max_tot) { /* m_msg0*3 + m_msg0 */
94 " query sequence too long for library segment: %d - resetting to %d\n",
96 maxn = m_msg->maxn = m_msg->n0*3;
99 fprintf(stderr," query sequence too long for translated search: %d * 4 > %d %s\n",
100 m_msg->n0,maxn, m_msg->sqnam);
105 /* set up some constants for overlaps */
106 m_msg->loff = 3*m_msg->n0;
107 m_msg->maxt3 = maxn-m_msg->loff-3;
108 m_msg->maxt3 -= m_msg->maxt3%3;
111 maxn = maxn - 3; maxn -= maxn%3; maxn++;
118 scanseq(unsigned char *seq, int n, char *str) {
120 char aaray[128]; /* this must be set > nsq */
122 for (i=0; i<128; i++) aaray[i]=0;
123 for (i=0; (size_t)i < strlen(str); i++) aaray[qascii[str[i]]]=1;
124 for (i=tot=0; i<n; i++) tot += aaray[seq[i]];
128 /* subs_env takes a string, possibly with ${ENV}, and looks up all the
129 potential environment variables and substitutes them into the
132 void subs_env(char *dest, char *src, int dest_size) {
133 char *last_src, *bp, *bp1;
137 if ((bp = strchr(src,'$'))==NULL) {
138 strncpy(dest, src, dest_size);
139 dest[dest_size-1] = '\0';
143 while (strlen(dest) < dest_size-1 && bp != NULL ) {
144 /* copy stuff before ${*/
146 strncpy(dest, last_src, dest_size);
150 if (*(bp+1) != '{') {
151 strncat(dest, "$", dest_size - strlen(dest) -1);
152 dest[dest_size-1] = '\0';
155 else { /* have ${ENV} - put it in */
156 if ((bp1 = strchr(bp+2,'}'))==NULL) {
157 fprintf(stderr, "Unterminated ENV: %s\n",src);
162 if (getenv(bp+2)!=NULL) {
163 strncat(dest, getenv(bp+2), dest_size - strlen(dest) - 1);
164 dest[dest_size-1] = '\0';
167 bp = bp1+1; /* bump bp even if getenv == NULL */
172 /* now get the next ${ENV} if present */
173 bp = strchr(last_src,'$');
175 /* now copy the last stuff */
176 strncat(dest, last_src, dest_size - strlen(dest) - 1);
177 dest[dest_size-1]='\0';
182 void selectbest(bptr,k,n) /* k is rank in array */
183 struct beststr **bptr;
187 struct beststr *tmptr;
192 v = bptr[r]->score[0];
196 while (bptr[++i]->score[0] > v) ;
197 while (bptr[--j]->score[0] < v) ;
198 tmptr = bptr[i]; bptr[i]=bptr[j]; bptr[j]=tmptr;
200 bptr[j]=bptr[i]; bptr[i]=bptr[r]; bptr[r]=tmptr;
206 void selectbestz(bptr,k,n) /* k is rank in array */
207 struct beststr **bptr;
211 struct beststr *tmptr;
221 while (bptr[++i]->zscore > v) ;
222 while (bptr[--j]->zscore < v) ;
223 tmptr = bptr[i]; bptr[i]=bptr[j]; bptr[j]=tmptr;
225 bptr[j]=bptr[i]; bptr[i]=bptr[r]; bptr[r]=tmptr;
231 /* improved shellsort with high-performance increments */
233 shellsort(itemType a[], int l, int r)
234 { int i, j, k, h; itemType v;
235 int incs[16] = { 1391376, 463792, 198768, 86961, 33936,
236 13776, 4592, 1968, 861, 336,
237 112, 48, 21, 7, 3, 1 };
238 for ( k = 0; k < 16; k++)
239 for (h = incs[k], i = l+h; i <= r; i++)
242 while (j > h && a[j-h] > v)
243 { a[j] = a[j-h]; j -= h; }
249 /* ?improved? version of sortbestz using optimal increments and fewer
251 void sortbestz(struct beststr **bptr, int nbest)
256 int incs[16] = { 1391376, 463792, 198768, 86961, 33936,
257 13776, 4592, 1968, 861, 336,
258 112, 48, 21, 7, 3, 1 };
260 for ( k = 0; k < 16; k++) {
262 for (i=gap; i < nbest; i++) {
266 while ( j >= gap && bptr[j-gap]->zscore < v) {
267 bptr[j] = bptr[j - gap];
276 void sortbeste(struct beststr **bptr, int nbest)
281 int incs[16] = { 1391376, 463792, 198768, 86961, 33936,
282 13776, 4592, 1968, 861, 336,
283 112, 48, 21, 7, 3, 1 };
285 for ( k = 0; k < 16; k++) {
287 for (i=gap; i < nbest; i++) {
291 while ( j >= gap && bptr[j-gap]->escore > v) {
292 bptr[j] = bptr[j - gap];
299 /* sometimes there are many high scores with E()==0.0, sort
300 those by z() score */
303 while (j < nbest && bptr[j]->escore <= 2.0*DBL_MIN ) {j++;}
304 if (j > 1) sortbestz(bptr,j);
307 extern double zs_to_Ec(double zs, long entries);
310 extern double ks_dev;
312 extern char hstring1[];
315 prhist(FILE *fd, struct mngmsg m_msg,
317 struct hist_str hist,
322 int i,j,hl,hll, el, ell, ev;
323 char hline[80], pch, *bp;
325 int maxval, maxvalt, dotsiz, ddotsiz,doinset;
326 double cur_e, prev_e, f_int;
327 double max_dev, x_tmp;
329 int n_chi_sq, cum_hl=0, max_i;
334 if (pst.zsflag_f < 0) {
335 fprintf(fd, "%7ld residues in %5ld sequences\n", ntt.length,ntt.entries);
336 fprintf(fd,"\n%s\n",gstring2);
343 mht = (3*hist.maxh-3)/4 - 1;
345 if (!m_msg.nohist && mh1 > 0) {
346 for (i=0,maxval=0,maxvalt=0; i<hist.maxh; i++) {
347 if (hist.hist_a[i] > maxval) maxval = hist.hist_a[i];
348 if (i >= mht && hist.hist_a[i]>maxvalt) maxvalt = hist.hist_a[i];
351 cum_hl = -hist.hist_a[0];
352 dotsiz = (maxval-1)/60+1;
353 ddotsiz = (maxvalt-1)/50+1;
354 doinset = (ddotsiz < dotsiz && dotsiz > 2);
357 fprintf(fd," opt E()\n");
359 fprintf(fd," opt\n");
361 prev_e = zs_to_Ec((double)(hist.min_hist-hist.histint/2),hist.entries);
362 for (i=0; i<=mh1; i++) {
363 pch = (i==mh1) ? '>' : ' ';
364 pch = (i==0) ? '<' : pch;
365 hll = hl = hist.hist_a[i];
366 if (pst.zsflag_f>=0) {
368 f_int = (double)(i*hist.histint+hist.min_hist)+(double)hist.histint/2.0;
369 cur_e = zs_to_Ec(f_int,hist.entries);
370 ev = el = ell = (int)(cur_e - prev_e + 0.5);
371 if (hl > 0 && i > 5 && i < (90-hist.min_hist)/hist.histint) {
372 x_tmp = fabs(cum_hl - cur_e);
373 if ( x_tmp > max_dev) {
379 if ((el=(el+dotsiz-1)/dotsiz) > 60) el = 60;
380 if ((ell=(ell+ddotsiz-1)/ddotsiz) > 40) ell = 40;
381 fprintf(fd,"%c%3d %5d %5d:",
382 pch,(i<mh1)?(i)*hist.histint+hist.min_hist :
383 mh1*hist.histint+hist.min_hist,hl,ev);
385 else fprintf(fd,"%c%3d %5d :",
386 pch,(i<mh1)?(i)*hist.histint+hist.min_hist :
387 mh1*hist.histint+hist.min_hist,hl);
389 if ((hl=(hl+dotsiz-1)/dotsiz) > 60) hl = 60;
390 if ((hll=(hll+ddotsiz-1)/ddotsiz) > 40) hll = 40;
391 for (j=0; j<hl; j++) hline[j]='=';
392 if (pst.zsflag_f>=0) {
394 if (el > 0) hline[el-1]='*';
398 for (j = hl; j < el; j++) hline[j]=' ';
405 for (j=hl; j<10; j++) hline[j]=' ';
406 sprintf(&hline[10]," one = represents %d library sequences",dotsiz);
408 if (doinset && i == mht-2) {
409 for (j = hl; j < 10; j++) hline[j]=' ';
410 sprintf(&hline[10]," inset = represents %d library sequences",ddotsiz);
412 if (i >= mht&& doinset ) {
413 for (j = hl; j < 10; j++) hline[j]=' ';
415 for (j = 11; j<11+hll; j++) hline[j]='=';
417 if (pst.zsflag_f>=0) {
418 if (ell <= hll) hline[10+ell]='*';
420 for (j = 11+hll; j < 10+ell; j++) hline[j]=' ';
422 hline[11+ell] = '\0';
427 fprintf(fd,"%s\n",hline);
434 fprintf(fd, "%7ld residues in %5ld sequences\n", ntt.length, ntt.entries);
437 db_tt = (double)ntt.carry*(double)LONG_MAX + (double)ntt.length;
438 fprintf(fd, "%.0f residues in %5ld library sequences\n", db_tt, ntt.entries);
441 if (pst.zsflag_f>=0) {
442 if (MAXSTATS < hist.entries)
444 fprintf(fd," statistics sampled from %d to %ld sequences\n",
445 MAXSTATS,hist.entries);
447 fprintf(fd," statistics extrapolated from %d to %ld sequences\n",
448 MAXSTATS,hist.entries);
450 /* summ_stats(stat_info); */
451 fprintf(fd," %s\n",hist.stat_info);
452 if (!m_msg.nohist && cum_hl > 0)
453 fprintf(fd," Kolmogorov-Smirnov statistic: %6.4f (N=%d) at %3d\n",
454 max_dev/(float)cum_hl, n_chi_sq,max_i*hist.histint+hist.min_hist);
455 if (m_msg.markx & MX_M10FORM) {
456 while ((bp=strchr(hist.stat_info,'\n'))!=NULL) *bp=' ';
457 if (cum_hl <= 0) cum_hl = -1;
458 sprintf(hstring1,"; mp_extrap: %d %ld\n; mp_stats: %s\n; mp_KS: %6.4f (N=%d) at %3d\n",
459 MAXSTATS,hist.entries,hist.stat_info,max_dev/(float)cum_hl, n_chi_sq,max_i*hist.histint+hist.min_hist);
462 fprintf(fd,"\n%s\n",gstring2);
466 extern char prog_name[], *verstr;
468 void s_abort (char *p, char *p1)
472 fprintf (stderr, "\n***[%s] %s%s***\n", prog_name, p, p1);
475 for (i=FIRSTNODE; i< nnodes; i++) pvm_kill(pinums[i]);
479 MPI_Abort(MPI_COMM_WORLD,1);
487 void w_abort (char *p, char *p1)
489 fprintf (stderr, "\n***[%s] %s%s***\n\n", prog_name, p, p1);
495 /* copies from from to to shuffling */
497 extern int nrand(int);
500 shuffle(unsigned char *from, unsigned char *to, int n)
502 int i,j; unsigned char tmp;
504 if (from != to) memcpy((void *)to,(void *)from,n);
506 for (i=n; i>0; i--) {
515 /* copies from from to from shuffling, ieven changed for threads */
517 wshuffle(unsigned char *from, unsigned char *to, int n, int wsiz, int *ieven)
520 unsigned char tmp, *top;
522 memcpy((void *)to,(void *)from,n);
527 for (k=0; k<(n-wsiz); k += wsiz) {
529 for (i=wsiz; i>0; i--) {
537 for (i=mm; i>0; i--) {
546 for (k=n; k>=wsiz; k -= wsiz) {
548 for (i=wsiz; i>0; i--) {
556 for (i=mm; i>0; i--) {
570 sfn_cmp(int *q, int *s)
572 if (*q == *s) return *q;
574 if (*q == *s) return *q;
575 else if (*q < *s) q++;
576 else if (*q > *s) s++;
586 revcomp(unsigned char *seq, int n, int *c_nt)
591 for (i=0, ni = n-1; i< n/2; i++,ni--) {
593 seq[i] = c_nt[seq[ni]];
598 seq[i] = c_nt[seq[i]];
606 /* init_stage2 sets up the data structures necessary to send a subset
607 of sequences to the nodes, and then collects the results
610 /* wstage2[] FIRSTNODE .. nnodes has the next sequence to be do_opt()/do_walign()ed */
611 /* wstage2p[] is a list of sequence numbers/frames, to be sent to workers */
612 /* wstage2b[] is a list of bptr's that shares the index with wstage2p[] */
614 static int wstage2[MAXWRKR +1]; /* count of second stage scores */
615 static struct stage2_str *wstage2p[MAXWRKR+1]; /* list of second stage sequences */
616 static int wstage2i[MAXWRKR+1]; /* index into second stage sequences */
617 static struct beststr *bbptr,
618 **wstage2b[MAXWRKR+1]; /* reverse pointers to bestr */
621 do_stage2(struct beststr **bptr, int nbest, struct mngmsg m_msg0,
622 int s_func, struct qmng_str *qm_msp) {
624 int i, is, ib, iw, nres;
625 int node, snode, node_done;
626 int bufid, numt, tid;
628 struct comstr2 bestr2[BFR2+1]; /* temporary structure array */
629 char *seqc_buff, *seqc;
630 int seqc_buff_len, aln_code_n;
632 MPI_Status mpi_status;
635 /* initialize the counter for each worker to 0 */
636 for (iw = FIRSTNODE; iw < nnodes; iw++) wstage2[iw] = 0;
638 /* for each result, bump the counter for the worker that has
640 for (ib = 0; ib < nbest; ib++ ) { wstage2[bptr[ib]->wrkr]++; }
642 /* now allocate enough space to send each worker a
643 list of its sequences stage2_str {seqnm, frame} */
644 for (iw = FIRSTNODE; iw < nnodes; iw++) {
647 (struct stage2_str *)
648 calloc(wstage2[iw],sizeof(struct stage2_str)))==NULL) {
649 sprintf(errstr," cannot allocate sequence listp %d %d",
654 /* allocate space to remember the bptr's for each result */
655 if ((wstage2b[iw]=(struct beststr **)
656 calloc(wstage2[iw],sizeof(struct beststr *)))==NULL) {
657 sprintf(errstr," cannot allocate sequence listb %d %d",
669 /* for each result, set wstage2p[worker][result_index_in_worker] */
670 for (is = 0; is < nbest; is++) {
672 wstage2p[iw][wstage2i[iw]].seqnm = bptr[is]->seqnm;
673 wstage2p[iw][wstage2i[iw]].frame = bptr[is]->frame;
674 wstage2b[iw][wstage2i[iw]] = bptr[is];
679 /* at this point, wstage2i[iw] should equal wstage2[iw] */
681 for (node = FIRSTNODE; node < nnodes; node++) {
683 /* fprintf(stderr,"node: %d stage2: %d\n",node,wstage2[node]); */
685 /* if a worker has no results, move on */
686 if (wstage2[node]<=0) { node_done++; continue;}
688 qm_msp->slist = wstage2[node]; /* set number of results to return */
689 qm_msp->s_func = s_func; /* set s_funct for do_opt/do_walign */
691 pvm_initsend(PvmDataRaw);
692 pvm_pkbyte((char *)qm_msp,sizeof(struct qmng_str),1);
693 pvm_send(pinums[node],MSEQTYPE); /* send qm_msp */
694 pvm_initsend(PvmDataRaw); /* send the list of seqnm/frame */
695 pvm_pkbyte((char *)wstage2p[node],wstage2[node]*sizeof(struct stage2_str),1);
696 pvm_send(pinums[node],LISTTYPE);
699 MPI_Send(qm_msp,sizeof(struct qmng_str),MPI_BYTE,node,MSEQTYPE,
701 MPI_Send((char *)wstage2p[node],wstage2[node]*
702 sizeof(struct stage2_str),MPI_BYTE,node,LISTTYPE,
707 /* all the workers have their list of sequences */
708 /* reset the index of results to obtain */
709 for (iw = 0; iw < nnodes; iw++) wstage2i[iw]=0;
711 while (node_done < nnodes-FIRSTNODE) {
713 bufid = pvm_recv(-1,LISTRTYPE); /* wait for results */
714 pvm_bufinfo(bufid,NULL,NULL,&tid);
715 /* get a chunk of comstr2 results */
716 pvm_upkbyte((char *)&bestr2[0],sizeof(struct comstr2)*(BFR2+1),1);
717 snode = (iw=tidtonode(tid));
721 MPI_Recv((char *)&bestr2[0],sizeof(struct comstr2)*(BFR2+1),
722 MPI_BYTE,MPI_ANY_SOURCE,LISTRTYPE,MPI_COMM_WORLD,
724 snode = mpi_status.MPI_SOURCE;
729 if (s_func == DO_OPT_FLG && m_msg0.show_code==SHOW_CODE_ALIGN) {
731 bufid = pvm_recv(tid,CODERTYPE);
732 pvm_upkint(&seqc_buff_len,1,1); /* get the code string length */
735 MPI_Recv((char *)&seqc_buff_len,1,MPI_INT, snode,
736 CODERTYPE,MPI_COMM_WORLD, &mpi_status);
739 seqc=seqc_buff = NULL;
740 if (seqc_buff_len > 0) { /* allocate space for it */
741 if ((seqc=seqc_buff=calloc(seqc_buff_len,sizeof(char)))==NULL) {
742 fprintf(stderr,"Cannot allocate seqc_buff: %d\n",seqc_buff_len);
747 pvm_upkbyte(seqc_buff,seqc_buff_len*sizeof(char),1);
750 MPI_Recv((char *)seqc_buff,seqc_buff_len*sizeof(char),
751 MPI_BYTE,snode,CODERTYPE,MPI_COMM_WORLD, &mpi_status);
760 /* get number of results in this message */
761 nres = bestr2[BFR2].seqnm & ~FINISHED;
762 /* check to see if finished */
763 if (bestr2[BFR2].seqnm&FINISHED) {node_done++;}
767 /* count through results from a specific worker */
768 for (i=0,is=wstage2i[iw]; i < nres; i++,is++) {
770 /* get the (saved) bptr for this result */
771 bbptr=wstage2b[iw][is];
772 /* consistency check seqnm's must agree */
773 if (wstage2p[iw][is].seqnm == bbptr->seqnm) {
774 if (s_func == DO_CALC_FLG && m_msg0.last_calc_flg) {
775 bbptr->score[0] = bestr2[i].score[0];
776 bbptr->score[1] = bestr2[i].score[1];
777 bbptr->score[2] = bestr2[i].score[2];
778 bbptr->escore = bestr2[i].escore;
779 bbptr->segnum = bestr2[i].segnum;
780 bbptr->seglen = bestr2[i].seglen;
782 else if (m_msg0.stages > 1) {
783 bbptr->score[0] = bestr2[i].score[0];
784 bbptr->score[1] = bestr2[i].score[1];
785 bbptr->score[2] = bestr2[i].score[2];
788 if (s_func == DO_OPT_FLG && m_msg0.markx & MX_M9SUMM) {
789 /* get score, alignment information, percents */
790 bbptr->sw_score = bestr2[i].sw_score;
791 memcpy(bbptr->aln_d,&bestr2[i].aln_d,sizeof(struct a_struct));
792 bbptr->percent = bestr2[i].percent;
793 bbptr->gpercent = bestr2[i].gpercent;
795 if (m_msg0.show_code == 2) { /* if show code */
796 /* length of encoding */
797 aln_code_n = bbptr->aln_code_n = bestr2[i].aln_code_n;
798 if (aln_code_n > 0) {
799 if ((bbptr->aln_code =
800 (char *)calloc(aln_code_n+1,sizeof(char)))==NULL) {
801 fprintf(stderr,"cannot allocate seq_code[%d:%d]: %d\n",
802 bbptr->wrkr,bbptr->seqnm,aln_code_n);
803 seqc += aln_code_n+1;
804 bbptr->aln_code_n = 0;
807 strncpy(bbptr->aln_code,seqc,aln_code_n);
808 bbptr->aln_code[aln_code_n]='\0';
809 seqc += aln_code_n+1;
813 fprintf(stderr," aln_code_n <=0: %d\n",aln_code_n);
818 else fprintf(stderr,"phase error in phase II return %d %d", iw,i);
820 if (seqc_buff != NULL) {
824 wstage2i[iw] += nres;
827 for (iw=FIRSTNODE; iw < nnodes; iw++) {
828 if ((void *)wstage2p[iw]!=NULL) free((void *)wstage2p[iw]);
829 if ((void *)wstage2b[iw]!=NULL) free((void *)wstage2b[iw]);