1 /************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 ************************************************************/
12 * SRE, Tue Jan 7 17:19:20 1997 [St. Louis]
14 * Search a sequence database with a profile HMM.
15 * Conditionally includes PVM parallelization when HMMER_PVM is defined
16 * at compile time; hmmsearch --pvm runs the PVM version.
18 * CVS $Id: hmmsearch.c,v 1.1.1.1 2005/03/22 08:34:05 cmzmasek Exp $
33 #include "squid.h" /* general sequence analysis library */
34 #include "config.h" /* compile-time configuration constants */
35 #include "structs.h" /* data structures, macros, #define's */
36 #include "funcs.h" /* function declarations */
37 #include "globals.h" /* alphabet global variables */
38 #include "version.h" /* version info */
40 static char banner[] = "hmmsearch - search a sequence database with a profile HMM";
42 static char usage[] = "\
43 Usage: hmmsearch [-options] <hmmfile> <sequence file or database>\n\
44 Available options are:\n\
45 -h : help; print brief help on version and usage\n\
46 -A <n> : sets alignment output limit to <n> best domain alignments\n\
47 -E <x> : sets E value cutoff (globE) to <= x\n\
48 -T <x> : sets T bit threshold (globT) to >= x\n\
49 -Z <n> : sets Z (# seqs) for E-value calculation\n\
52 static char experts[] = "\
53 --compat : make best effort to use last version's output style\n\
54 --cpu <n> : run <n> threads in parallel (if threaded)\n\
55 --cut_ga : use Pfam GA gathering threshold cutoffs\n\
56 --cut_nc : use Pfam NC noise threshold cutoffs\n\
57 --cut_tc : use Pfam TC trusted threshold cutoffs\n\
58 --domE <x>: sets domain Eval cutoff (2nd threshold) to <= x\n\
59 --domT <x>: sets domain T bit thresh (2nd threshold) to >= x\n\
60 --forward : use the full Forward() algorithm instead of Viterbi\n\
61 --informat <s>: sequence file is in format <s>, not FASTA\n\
62 --null2 : turn OFF the post hoc second null model\n\
63 --pvm : run on a Parallel Virtual Machine (PVM)\n\
64 --xnu : turn ON XNU filtering of target protein sequences\n\
67 static struct opt_s OPTIONS[] = {
68 { "-h", TRUE, sqdARG_NONE },
69 { "-A", TRUE, sqdARG_INT },
70 { "-E", TRUE, sqdARG_FLOAT},
71 { "-T", TRUE, sqdARG_FLOAT},
72 { "-Z", TRUE, sqdARG_INT },
73 { "--compat", FALSE, sqdARG_NONE },
74 { "--cpu", FALSE, sqdARG_INT },
75 { "--cut_ga", FALSE, sqdARG_NONE },
76 { "--cut_nc", FALSE, sqdARG_NONE },
77 { "--cut_tc", FALSE, sqdARG_NONE },
78 { "--domE", FALSE, sqdARG_FLOAT},
79 { "--domT", FALSE, sqdARG_FLOAT},
80 { "--forward", FALSE, sqdARG_NONE },
81 { "--informat",FALSE, sqdARG_STRING},
82 { "--null2", FALSE, sqdARG_NONE },
83 { "--pvm", FALSE, sqdARG_NONE },
84 { "--xnu", FALSE, sqdARG_NONE },
87 #define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
91 /* POSIX threads version:
92 * the threads share a workpool_s structure amongst themselves,
93 * for obtaining locks on input HMM file and output histogram and
97 /* Shared configuration resources which don't change:
99 struct plan7_s *hmm; /* HMM to search with */
100 int do_xnu; /* TRUE to apply XNU filter */
101 int do_forward; /* TRUE to score using Forward */
102 int do_null2; /* TRUE to apply null2 ad hoc correction */
103 struct threshold_s *thresh; /* score/evalue threshold info */
105 /* Shared (mutex-protected) input resources:
107 SQFILE *sqfp; /* ptr to open sequence file */
108 int nseq; /* number of seqs searched so far */
109 pthread_mutex_t input_lock; /* mutex for locking input */
111 /* Shared (mutex-protected) output resources:
113 struct tophit_s *ghit; /* per-sequence top hits */
114 struct tophit_s *dhit; /* per-domain top hits */
115 struct histogram_s *hist; /* histogram of scores */
116 pthread_mutex_t output_lock; /* mutex for locking output */
118 /* Thread pool information
120 pthread_t *thread; /* our pool of threads */
121 int num_threads; /* number of threads */
123 static struct workpool_s *workpool_start(struct plan7_s *hmm, SQFILE *sqfp,
124 int do_xnu, int do_forward, int do_null2,
125 struct threshold_s *thresh,
126 struct tophit_s *ghit, struct tophit_s *dhit,
127 struct histogram_s *hist, int num_threads);
128 static void workpool_stop(struct workpool_s *wpool);
129 static void workpool_free(struct workpool_s *wpool);
130 static void *worker_thread(void *ptr);
131 #endif /* HMMER_THREADS */
133 static void main_loop_serial(struct plan7_s *hmm, SQFILE *sqfp, struct threshold_s *thresh, int do_forward,
134 int do_null2, int do_xnu, int num_threads,
135 struct histogram_s *histogram, struct tophit_s *ghit,
136 struct tophit_s *dhit, int *ret_nseq);
138 static void main_loop_pvm(struct plan7_s *hmm, SQFILE *sqfp, struct threshold_s *thresh, int do_forward,
139 int do_null2, int do_xnu, struct histogram_s *histogram,
140 struct tophit_s *ghit, struct tophit_s *dhit, int *ret_nseq);
145 main(int argc, char **argv)
147 char *hmmfile; /* file to read HMM(s) from */
148 HMMFILE *hmmfp; /* opened hmmfile for reading */
149 char *seqfile; /* file to read target sequence(s) from */
150 SQFILE *sqfp; /* opened seqfile for reading */
151 int format; /* format of seqfile */
153 struct plan7_s *hmm; /* HMM to search with */
154 struct histogram_s *histogram;/* histogram of all scores */
155 struct fancyali_s *ali; /* displayed alignment info */
156 struct tophit_s *ghit; /* list of top hits for whole sequences */
157 struct tophit_s *dhit; /* list of top hits for domains */
159 float sc; /* score of an HMM search */
160 double pvalue; /* pvalue of an HMM score */
161 double evalue; /* evalue of an HMM score */
162 double motherp; /* pvalue of a whole seq HMM score */
163 float mothersc; /* score of a whole seq parent of domain */
164 int sqfrom, sqto; /* coordinates in sequence */
165 int hmmfrom, hmmto; /* coordinate in HMM */
166 char *name, *acc, *desc; /* hit sequence name and description */
167 int sqlen; /* length of seq that was hit */
168 int nseq; /* number of sequences searched */
169 int Z; /* # of seqs for purposes of E-val calc */
170 int domidx; /* number of this domain */
171 int ndom; /* total # of domains in this seq */
172 int namewidth; /* max width of sequence name */
173 int descwidth; /* max width of description */
174 int nreported; /* # of hits reported in a list */
176 int Alimit; /* A parameter limiting output alignments */
177 struct threshold_s thresh; /* contains all threshold (cutoff) info */
179 char *optname; /* name of option found by Getopt() */
180 char *optarg; /* argument found by Getopt() */
181 int optind; /* index in argv[] */
182 int do_null2; /* TRUE to adjust scores with null model #2 */
183 int do_forward; /* TRUE to use Forward() not Viterbi() */
184 int do_xnu; /* TRUE to filter sequences thru XNU */
185 int do_pvm; /* TRUE to run on Parallel Virtual Machine */
186 int be_backwards; /* TRUE to be backwards-compatible in output*/
187 int num_threads; /* number of worker threads */
189 /***********************************************
191 ***********************************************/
193 format = SQFILE_UNKNOWN; /* default: autodetect seq file format w/ Babelfish */
201 Alimit = INT_MAX; /* no limit on alignment output */
202 thresh.globE = 10.0; /* use a reasonable Eval threshold; */
203 thresh.globT = -FLT_MAX; /* but no bit threshold, */
204 thresh.domT = -FLT_MAX; /* no domain bit threshold, */
205 thresh.domE = FLT_MAX; /* and no domain Eval threshold. */
206 thresh.autocut = CUT_NONE; /* and no Pfam cutoffs used */
207 thresh.Z = 0; /* Z not preset; use actual # of seqs */
210 num_threads = ThreadNumber(); /* only matters if we're threaded */
215 while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
216 &optind, &optname, &optarg)) {
217 if (strcmp(optname, "-A") == 0) Alimit = atoi(optarg);
218 else if (strcmp(optname, "-E") == 0) thresh.globE = atof(optarg);
219 else if (strcmp(optname, "-T") == 0) thresh.globT = atof(optarg);
220 else if (strcmp(optname, "-Z") == 0) thresh.Z = atoi(optarg);
221 else if (strcmp(optname, "--compat") == 0) be_backwards = TRUE;
222 else if (strcmp(optname, "--cpu") == 0) num_threads = atoi(optarg);
223 else if (strcmp(optname, "--cut_ga") == 0) thresh.autocut = CUT_GA;
224 else if (strcmp(optname, "--cut_nc") == 0) thresh.autocut = CUT_NC;
225 else if (strcmp(optname, "--cut_tc") == 0) thresh.autocut = CUT_TC;
226 else if (strcmp(optname, "--domE") == 0) thresh.domE = atof(optarg);
227 else if (strcmp(optname, "--domT") == 0) thresh.domT = atof(optarg);
228 else if (strcmp(optname, "--forward") == 0) do_forward = TRUE;
229 else if (strcmp(optname, "--null2") == 0) do_null2 = FALSE;
230 else if (strcmp(optname, "--pvm") == 0) do_pvm = TRUE;
231 else if (strcmp(optname, "--xnu") == 0) do_xnu = TRUE;
232 else if (strcmp(optname, "--informat") == 0) {
233 format = String2SeqfileFormat(optarg);
234 if (format == SQFILE_UNKNOWN)
235 Die("unrecognized sequence file format \"%s\"", optarg);
237 else if (strcmp(optname, "-h") == 0) {
238 Banner(stdout, banner);
244 if (argc - optind != 2)
245 Die("Incorrect number of arguments.\n%s\n", usage);
247 hmmfile = argv[optind++];
248 seqfile = argv[optind++];
251 if (do_pvm) Die("PVM support is not compiled into your HMMER software; --pvm doesn't work.");
253 #ifndef HMMER_THREADS
254 if (num_threads) Die("Posix threads support is not compiled into HMMER; --cpu doesn't have any effect");
258 /***********************************************
259 * Open sequence database (might be in BLASTDB or current directory)
260 ***********************************************/
262 if ((sqfp = SeqfileOpen(seqfile, format, "BLASTDB")) == NULL)
263 Die("Failed to open sequence database file %s\n%s\n", seqfile, usage);
265 /***********************************************
266 * Open HMM file (might be in HMMERDB or current directory).
267 * Read a single HMM from it. (Config HMM, if necessary).
268 * Alphabet globals are set by reading the HMM.
269 ***********************************************/
271 if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL)
272 Die("Failed to open HMM file %s\n%s", hmmfile, usage);
273 if (!HMMFileRead(hmmfp, &hmm))
274 Die("Failed to read any HMMs from %s\n", hmmfile);
276 Die("HMM file %s corrupt or in incorrect format? Parse failed", hmmfile);
277 P7Logoddsify(hmm, !do_forward);
279 if (do_xnu && Alphabet_type == hmmNUCLEIC)
280 Die("The HMM is a DNA model, and you can't use the --xnu filter on DNA data");
282 /*****************************************************************
283 * Set up optional Pfam score thresholds.
284 * Can do this before starting any searches, since we'll only use 1 HMM.
285 *****************************************************************/
287 if (! SetAutocuts(&thresh, hmm))
288 Die("HMM %s did not contain the GA, TC, or NC cutoffs you needed",
291 /***********************************************
293 ***********************************************/
295 Banner(stdout, banner);
296 printf( "HMM file: %s [%s]\n", hmmfile, hmm->name);
297 printf( "Sequence database: %s\n", seqfile);
299 printf( "PVM: ACTIVE\n");
300 printf( "per-sequence score cutoff: ");
301 if (thresh.globT == -FLT_MAX) printf("[none]\n");
303 printf(">= %.1f", thresh.globT);
304 if (thresh.autocut == CUT_GA) printf(" [GA1]\n");
305 else if (thresh.autocut == CUT_NC) printf(" [NC1]\n");
306 else if (thresh.autocut == CUT_TC) printf(" [TC1]\n");
309 printf( "per-domain score cutoff: ");
310 if (thresh.domT == -FLT_MAX) printf("[none]\n");
312 printf(">= %.1f", thresh.domT);
313 if (thresh.autocut == CUT_GA) printf(" [GA2]\n");
314 else if (thresh.autocut == CUT_NC) printf(" [NC2]\n");
315 else if (thresh.autocut == CUT_TC) printf(" [TC2]\n");
318 printf( "per-sequence Eval cutoff: ");
319 if (thresh.globE == FLT_MAX) printf("[none]\n");
320 else printf("<= %-10.2g\n", thresh.globE);
322 printf( "per-domain Eval cutoff: ");
323 if (thresh.domE == FLT_MAX) printf("[none]\n");
324 else printf("<= %10.2g\n", thresh.domE);
325 printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n");
327 /***********************************************
328 * Search HMM against each sequence
329 ***********************************************/
331 /* set up structures for storing output */
332 histogram = AllocHistogram(-200, 200, 100); /* keeps full histogram */
333 ghit = AllocTophits(200); /* per-seq hits: 200=lumpsize */
334 dhit = AllocTophits(200); /* domain hits: 200=lumpsize */
337 main_loop_serial(hmm, sqfp, &thresh, do_forward, do_null2, do_xnu, num_threads,
338 histogram, ghit, dhit, &nseq);
341 main_loop_pvm(hmm, sqfp, &thresh, do_forward, do_null2, do_xnu,
342 histogram, ghit, dhit, &nseq);
345 /***********************************************
346 * Process hit lists, produce text output
347 ***********************************************/
349 /* Set the theoretical EVD curve in our histogram using
350 * calibration in the HMM, if available.
352 if (hmm->flags & PLAN7_STATS)
353 ExtremeValueSetHistogram(histogram, hmm->mu, hmm->lambda,
354 histogram->lowscore, histogram->highscore, 0);
355 if (!thresh.Z) thresh.Z = nseq; /* set Z for good now that we're done. */
357 /* Format and report our output
359 /* 1. Report overall sequence hits (sorted on E-value) */
362 printf("\nQuery HMM: %s|%s|%s\n",
364 hmm->flags & PLAN7_ACC ? hmm->acc : "",
365 hmm->flags & PLAN7_DESC ? hmm->desc : "");
369 printf("\nQuery HMM: %s\n", hmm->name);
370 printf("Accession: %s\n", hmm->flags & PLAN7_ACC ? hmm->acc : "[none]");
371 printf("Description: %s\n", hmm->flags & PLAN7_DESC ? hmm->desc : "[none]");
374 if (hmm->flags & PLAN7_STATS)
375 printf(" [HMM has been calibrated; E-values are empirical estimates]\n");
377 printf(" [No calibration for HMM; E-values are upper bounds]\n");
379 FullSortTophits(ghit);
380 namewidth = MAX(8, TophitsMaxName(ghit)); /* cannot truncate name. */
381 descwidth = MAX(52-namewidth, 11);/* may truncate desc, but need strlen("Description") */
383 printf("\nScores for complete sequences (score includes all domains):\n");
384 printf("%-*s %-*s %7s %10s %3s\n", namewidth, "Sequence", descwidth, "Description", "Score", "E-value", " N ");
385 printf("%-*s %-*s %7s %10s %3s\n", namewidth, "--------", descwidth, "-----------", "-----", "-------", "---");
386 for (i = 0, nreported = 0; i < ghit->num; i++)
389 GetRankedHit(ghit, i,
390 &pvalue, &sc, NULL, NULL,
392 NULL, NULL, NULL, /* sequence positions */
393 NULL, NULL, NULL, /* HMM positions */
394 NULL, &ndom, /* domain info */
395 NULL); /* alignment info */
396 evalue = pvalue * (double) thresh.Z;
398 /* safedesc is a workaround for an apparent Linux printf()
399 * bug with the *.*s format. dbmalloc crashes with a memchr() ptr out of bounds
400 * flaw if the malloc'ed space for desc is short. The workaround
401 * is to make sure the ptr for *.* has a big malloc space.
403 if (desc != NULL && strlen(desc) < 80)
405 safedesc = MallocOrDie(sizeof(char) * 80);
406 strcpy(safedesc, desc);
408 else safedesc = Strdup(desc);
410 if (evalue <= thresh.globE && sc >= thresh.globT) {
411 printf("%-*s %-*.*s %7.1f %10.2g %3d\n",
413 descwidth, descwidth, safedesc != NULL ? safedesc : "",
419 if (nreported == 0) printf("\t[no hits above thresholds]\n");
422 /* 2. Report domain hits (also sorted on E-value) */
423 FullSortTophits(dhit);
424 namewidth = MAX(8, TophitsMaxName(dhit));
426 printf("\nParsed for domains:\n");
427 printf("%-*s %7s %5s %5s %5s %5s %7s %8s\n",
428 namewidth, "Sequence", "Domain ", "seq-f", "seq-t", "hmm-f", "hmm-t", "score", "E-value");
429 printf("%-*s %7s %5s %5s %5s %5s %7s %8s\n",
430 namewidth, "--------", "-------", "-----", "-----", "-----", "-----", "-----", "-------");
432 for (i = 0, nreported = 0; i < dhit->num; i++)
434 GetRankedHit(dhit, i,
435 &pvalue, &sc, &motherp, &mothersc,
437 &sqfrom, &sqto, &sqlen, /* seq position info */
438 &hmmfrom, &hmmto, NULL, /* HMM position info */
439 &domidx, &ndom, /* domain info */
440 NULL); /* alignment info */
441 evalue = pvalue * (double) thresh.Z;
443 if (motherp * (double) thresh.Z > thresh.globE || mothersc < thresh.globT)
445 else if (evalue <= thresh.domE && sc >= thresh.domT) {
446 printf("%-*s %3d/%-3d %5d %5d %c%c %5d %5d %c%c %7.1f %8.2g\n",
450 sqfrom == 1 ? '[' : '.', sqto == sqlen ? ']' : '.',
452 hmmfrom == 1 ? '[':'.', hmmto == hmm->M ? ']' : '.',
457 if (nreported == 0) printf("\t[no hits above thresholds]\n");
460 /* 3. Alignment output, also by domain.
461 * dhits is already sorted and namewidth is set, from above code.
462 * Number of displayed alignments is limited by Alimit parameter;
463 * also by domE (evalue threshold), domT (score theshold).
467 printf("\nAlignments of top-scoring domains:\n");
468 for (i = 0, nreported = 0; i < dhit->num; i++)
470 if (nreported == Alimit) break; /* limit to Alimit output alignments */
471 GetRankedHit(dhit, i,
472 &pvalue, &sc, &motherp, &mothersc,
474 &sqfrom, &sqto, &sqlen, /* seq position info */
475 &hmmfrom, &hmmto, NULL, /* HMM position info */
476 &domidx, &ndom, /* domain info */
477 &ali); /* alignment info */
478 evalue = pvalue * (double) thresh.Z;
480 if (motherp * (double) thresh.Z > thresh.globE || mothersc < thresh.globT)
482 else if (evalue <= thresh.domE && sc >= thresh.domT)
484 printf("%s: domain %d of %d, from %d to %d: score %.1f, E = %.2g\n",
485 name, domidx, ndom, sqfrom, sqto, sc, evalue);
486 PrintFancyAli(stdout, ali);
490 if (nreported == 0) printf("\t[no hits above thresholds]\n");
491 if (nreported == Alimit) printf("\t[output cut off at A = %d top alignments]\n", Alimit);
494 /* 4. Histogram output */
495 printf("\nHistogram of all scores:\n");
496 PrintASCIIHistogram(stdout, histogram);
498 /* 5. Tophits summaries, while developing...
500 printf("\nTotal sequences searched: %d\n", nseq);
501 printf("\nWhole sequence top hits:\n");
502 TophitsReport(ghit, thresh.globE, nseq);
503 printf("\nDomain top hits:\n");
504 TophitsReport(dhit, thresh.domE, nseq);
506 /***********************************************
508 ***********************************************/
510 FreeHistogram(histogram);
522 /* Function: main_loop_serial()
523 * Date: SRE, Wed Sep 23 10:20:49 1998 [St. Louis]
525 * Purpose: Search an HMM against a sequence database.
526 * main loop for the serial (non-PVM, non-threads)
529 * In: HMM and open sqfile, plus options
530 * Out: histogram, global hits list, domain hits list, nseq.
532 * Args: hmm - the HMM to search with.
533 * sqfp - open SQFILE for sequence database
534 * thresh - score/evalue threshold info
535 * do_forward - TRUE to score using Forward()
536 * do_null2 - TRUE to use ad hoc null2 score correction
537 * do_xnu - TRUE to apply XNU mask
538 * num_threads- number of worker threads to start, or 0
539 * histogram - RETURN: score histogram
540 * ghit - RETURN: ranked global scores
541 * dhit - RETURN: ranked domain scores
542 * ret_nseq - RETURN: actual number of seqs searched
547 main_loop_serial(struct plan7_s *hmm, SQFILE *sqfp, struct threshold_s *thresh, int do_forward,
548 int do_null2, int do_xnu, int num_threads,
549 struct histogram_s *histogram,
550 struct tophit_s *ghit, struct tophit_s *dhit, int *ret_nseq)
553 struct workpool_s *wpool; /* pool of worker threads */
555 struct p7trace_s *tr; /* traceback */
556 char *seq; /* target sequence */
557 char *dsq; /* digitized target sequence */
558 SQINFO sqinfo; /* optional info for seq */
559 float sc; /* score of an HMM search */
560 double pvalue; /* pvalue of an HMM score */
561 double evalue; /* evalue of an HMM score */
563 int nseq; /* number of sequences searched */
566 wpool = workpool_start(hmm, sqfp, do_xnu, do_forward, do_null2, thresh,
567 ghit, dhit, histogram, num_threads);
568 workpool_stop(wpool);
570 workpool_free(wpool);
572 #else /* unthreaded code: */
574 while (ReadSeq(sqfp, sqfp->format, &seq, &sqinfo))
576 /* Silently skip length 0 seqs.
577 * What, you think this doesn't occur? Welcome to genomics,
580 if (sqinfo.len == 0) continue;
583 dsq = DigitizeSequence(seq, sqinfo.len);
585 if (do_xnu && Alphabet_type == hmmAMINO) XNU(dsq, sqinfo.len);
587 /* 1. Recover a trace by Viterbi.
589 if (P7ViterbiSize(sqinfo.len, hmm->M) <= RAMLIMIT)
590 sc = P7Viterbi(dsq, sqinfo.len, hmm, &tr);
592 sc = P7SmallViterbi(dsq, sqinfo.len, hmm, &tr);
594 /* 2. If we're using Forward scores, calculate the
595 * whole sequence score; this overrides anything
596 * PostprocessSignificantHit() is going to do to the per-seq score.
599 sc = P7Forward(dsq, sqinfo.len, hmm, NULL);
600 if (do_null2) sc -= TraceScoreCorrection(hmm, tr, dsq);
604 P7PrintTrace(stdout, tr, hmm, dsq);
607 /* 2. Store score/pvalue for global alignment; will sort on score,
608 * which in hmmsearch is monotonic with E-value.
609 * Keep all domains in a significant sequence hit.
610 * We can only make a lower bound estimate of E-value since
611 * we don't know the final value of nseq yet, so the list
612 * of hits we keep in memory is >= the list we actually
615 pvalue = PValue(hmm, sc);
616 evalue = thresh->Z ? (double) thresh->Z * pvalue : (double) nseq * pvalue;
617 if (sc >= thresh->globT && evalue <= thresh->globE)
619 PostprocessSignificantHit(ghit, dhit,
620 tr, hmm, dsq, sqinfo.len,
622 sqinfo.flags & SQINFO_ACC ? sqinfo.acc : NULL,
623 sqinfo.flags & SQINFO_DESC ? sqinfo.desc : NULL,
627 FALSE); /* FALSE-> not hmmpfam mode, hmmsearch mode */
629 AddToHistogram(histogram, sc);
630 FreeSequence(seq, &sqinfo);
643 /*****************************************************************
644 * PVM specific functions
645 ****************************************************************/
647 /* Function: main_loop_pvm()
648 * Date: SRE, Wed Sep 23 10:36:44 1998 [St. Louis]
650 * Purpose: Search an HMM against a sequence database.
651 * main loop for the PVM version.
653 * In: HMM and open sqfile, plus options
654 * Out: histogram, global hits list, domain hits list, nseq.
656 * Args: hmm - the HMM to search with. scoring form.
657 * sqfp - open SQFILE for sequence database
658 * thresh - score/evalue threshold information
659 * do_forward - TRUE to score using Forward()
660 * do_null2 - TRUE to use ad hoc null2 score correction
661 * do_xnu - TRUE to apply XNU mask
662 * histogram - RETURN: score histogram
663 * ghit - RETURN: ranked global scores
664 * dhit - RETURN: ranked domain scores
665 * ret_nseq - RETURN: actual number of seqs searched
670 main_loop_pvm(struct plan7_s *hmm, SQFILE *sqfp, struct threshold_s *thresh, int do_forward,
671 int do_null2, int do_xnu, struct histogram_s *histogram,
672 struct tophit_s *ghit, struct tophit_s *dhit, int *ret_nseq)
674 char *seq; /* target sequence */
675 char *dsq; /* digitized target seq */
676 SQINFO sqinfo; /* optional info about target seq */
677 int master_tid; /* master's (my) PVM TID */
678 int *slave_tid; /* array of slave TID's */
679 int nslaves; /* number of slaves */
680 int code; /* status code rec'd from a slave */
681 int nseq; /* number of sequences searched */
682 int sent_trace; /* TRUE if slave gave us a trace */
683 char **dsqlist; /* remember what seqs slaves are doing */
684 char **namelist; /* remember what seq names slaves are doing */
685 char **acclist ; /* remember what seq accessions slaves are doing */
686 char **desclist; /* remember what seq desc's slaves are doing */
687 int *lenlist; /* remember lengths of seqs slaves are doing */
688 int slaveidx; /* counter for slaves */
689 float sc; /* score of an alignment */
690 double pvalue; /* P-value of a score of an alignment */
691 struct p7trace_s *tr; /* Viterbi traceback of an alignment */
692 int i; /* generic counter */
696 SQD_DPRINTF1(("Requesting master TID...\n"));
697 master_tid = pvm_mytid();
699 pvm_catchout(stderr); /* catch output for debugging */
701 SQD_DPRINTF1(("Spawning slaves...\n"));
702 PVMSpawnSlaves("hmmsearch-pvm", &slave_tid, &nslaves);
703 SQD_DPRINTF1(("Spawned a total of %d slaves...\n", nslaves));
705 /* Initialize the slaves by broadcast.
707 SQD_DPRINTF1(("Broadcasting to %d slaves...\n", nslaves));
708 pvm_initsend(PvmDataDefault);
709 pvm_pkfloat(&(thresh->globT), 1, 1);
710 pvm_pkdouble(&(thresh->globE), 1, 1);
711 pvm_pkint(&(thresh->Z), 1, 1);
712 pvm_pkint(&do_forward, 1, 1);
713 pvm_pkint(&do_null2, 1, 1);
714 pvm_pkint(&Alphabet_type, 1, 1);
716 pvm_mcast(slave_tid, nslaves, HMMPVM_INIT);
717 SQD_DPRINTF1(("Slaves should be ready...\n"));
719 /* Confirm slaves' OK status.
721 PVMConfirmSlaves(slave_tid, nslaves);
722 SQD_DPRINTF1(("Slaves confirm that they're ok...\n"));
724 /* Alloc arrays for remembering what seq each
725 * slave was working on.
727 namelist = MallocOrDie(sizeof(char *) * nslaves);
728 acclist = MallocOrDie(sizeof(char *) * nslaves);
729 desclist = MallocOrDie(sizeof(char *) * nslaves);
730 dsqlist = MallocOrDie(sizeof(char *) * nslaves);
731 lenlist = MallocOrDie(sizeof(int) * nslaves);
734 * Give them all a sequence number and a digitized sequence
736 * A side effect of the seq number is that we assign each slave
737 * a number from 0..nslaves-1.
739 for (nseq = 0; nseq < nslaves; nseq++)
741 if (! ReadSeq(sqfp, sqfp->format, &seq, &sqinfo)) break;
742 if (sqinfo.len == 0) { nseq--; continue; }
744 dsq = DigitizeSequence(seq, sqinfo.len);
745 if (do_xnu && Alphabet_type == hmmAMINO) XNU(dsq, sqinfo.len);
747 pvm_initsend(PvmDataDefault);
748 pvm_pkint(&nseq, 1, 1);
749 pvm_pkint(&(sqinfo.len), 1, 1);
750 pvm_pkbyte(dsq, sqinfo.len+2, 1);
751 pvm_send(slave_tid[nseq], HMMPVM_WORK);
752 SQD_DPRINTF1(("sent a dsq : %d bytes\n", sqinfo.len+2));
754 namelist[nseq] = Strdup(sqinfo.name);
755 acclist[nseq] = (sqinfo.flags & SQINFO_ACC) ? Strdup(sqinfo.acc) : NULL;
756 desclist[nseq] = (sqinfo.flags & SQINFO_DESC) ? Strdup(sqinfo.desc) : NULL;
757 lenlist[nseq] = sqinfo.len;
760 FreeSequence(seq, &sqinfo);
762 SQD_DPRINTF1(("%d slaves are loaded\n", nseq));
764 /* main receive/send loop
766 while (ReadSeq(sqfp, sqfp->format, &seq, &sqinfo))
768 if (sqinfo.len == 0) { continue; }
770 /* check slaves before blocking */
771 PVMCheckSlaves(slave_tid, nslaves);
774 SQD_DPRINTF1(("Waiting for a slave to give me output...\n"));
775 pvm_recv(-1, HMMPVM_RESULTS);
776 pvm_upkint(&slaveidx, 1, 1); /* # of slave who's sending us stuff */
777 pvm_upkfloat(&sc, 1, 1); /* score */
778 pvm_upkdouble(&pvalue, 1, 1); /* P-value */
779 pvm_upkint(&sent_trace, 1, 1); /* TRUE if trace is coming */
780 tr = (sent_trace) ? PVMUnpackTrace() : NULL;
781 SQD_DPRINTF1(("Slave %d finished %s for me...\n", slaveidx, namelist[slaveidx]));
784 dsq = DigitizeSequence(seq, sqinfo.len);
785 if (do_xnu) XNU(dsq, sqinfo.len);
787 pvm_initsend(PvmDataDefault);
788 pvm_pkint(&nseq, 1, 1);
789 pvm_pkint(&(sqinfo.len), 1, 1);
790 pvm_pkbyte(dsq, sqinfo.len+2, 1);
791 pvm_send(slave_tid[slaveidx], HMMPVM_WORK);
796 PostprocessSignificantHit(ghit, dhit,
797 tr, hmm, dsqlist[slaveidx], lenlist[slaveidx],
798 namelist[slaveidx], acclist[slaveidx], desclist[slaveidx],
802 FALSE); /* FALSE-> not hmmpfam mode, hmmsearch mode */
805 AddToHistogram(histogram, sc);
807 /* record seq info for seq we just sent */
808 free(namelist[slaveidx]);
809 if (acclist[slaveidx] != NULL) free(acclist[slaveidx]);
810 if (desclist[slaveidx] != NULL) free(desclist[slaveidx]);
811 free(dsqlist[slaveidx]);
813 dsqlist[slaveidx] = dsq;
814 namelist[slaveidx] = Strdup(sqinfo.name);
815 acclist[slaveidx] = (sqinfo.flags & SQINFO_ACC) ? Strdup(sqinfo.acc) : NULL;
816 desclist[slaveidx] = (sqinfo.flags & SQINFO_DESC) ? Strdup(sqinfo.desc) : NULL;
817 lenlist[slaveidx] = sqinfo.len;
819 FreeSequence(seq, &sqinfo);
821 SQD_DPRINTF1(("End of receive/send loop\n"));
823 /* Collect the output. All n slaves are still working.
825 for (i = 0; i < nslaves && i < nseq; i++)
827 /* don't check slaves (they're exiting normally);
828 window of vulnerability here to slave crashes */
830 pvm_recv(-1, HMMPVM_RESULTS);
831 pvm_upkint(&slaveidx, 1, 1); /* # of slave who's sending us stuff */
832 pvm_upkfloat(&sc, 1, 1); /* score */
833 pvm_upkdouble(&pvalue, 1, 1); /* P-value */
834 pvm_upkint(&sent_trace, 1, 1); /* TRUE if trace is coming */
835 tr = (sent_trace) ? PVMUnpackTrace() : NULL;
836 SQD_DPRINTF1(("Slave %d finished %s for me...\n", slaveidx, namelist[slaveidx]));
841 PostprocessSignificantHit(ghit, dhit,
842 tr, hmm, dsqlist[slaveidx], lenlist[slaveidx],
843 namelist[slaveidx], acclist[slaveidx], desclist[slaveidx],
847 FALSE); /* FALSE-> not hmmpfam mode, hmmsearch mode */
850 AddToHistogram(histogram, sc);
853 free(namelist[slaveidx]);
854 if (acclist[slaveidx] != NULL) free(acclist[slaveidx]);
855 if (desclist[slaveidx] != NULL) free(desclist[slaveidx]);
856 free(dsqlist[slaveidx]);
858 /* send cleanup/shutdown flag to slave */
859 pvm_initsend(PvmDataDefault);
861 pvm_pkint(&code, 1, 1);
862 pvm_send(slave_tid[slaveidx], HMMPVM_WORK);
866 /* Cleanup; quit the VM; and return
878 #endif /* HMMER_PVM */
881 /*****************************************************************
882 * POSIX threads implementation.
885 * workpool_start() (makes a workpool_s structure. Starts calculations.)
886 * workpool_stop() (waits for threads to finish.)
887 * workpool_free() (destroys the structure)
890 * worker_thread() (the actual parallelized worker thread).
891 *****************************************************************/
893 /* Function: workpool_start()
894 * Date: SRE, Mon Oct 5 16:44:53 1998
896 * Purpose: Initialize a workpool_s structure, and return it.
898 * Args: sqfp - open sequence file, at start
899 * do_xnu - TRUE to apply XNU filter
900 * do_forward - TRUE to score using Forward
901 * do_null2 - TRUE to apply null2 ad hoc correction
902 * thresh - score/evalue threshold info
903 * ghit - per-seq hit list
904 * dhit - per-domain hit list
905 * hist - histogram (alloced but empty)
906 * num_threads- number of worker threads to run.
908 * Returns: ptr to struct workpool_s.
909 * Caller must wait for threads to finish with workpool_stop(),
910 * then free the structure with workpool_free().
912 static struct workpool_s *
913 workpool_start(struct plan7_s *hmm, SQFILE *sqfp, int do_xnu,
914 int do_forward, int do_null2, struct threshold_s *thresh,
915 struct tophit_s *ghit, struct tophit_s *dhit,
916 struct histogram_s *hist, int num_threads)
918 struct workpool_s *wpool;
923 wpool = MallocOrDie(sizeof(struct workpool_s));
924 wpool->thread = MallocOrDie(num_threads * sizeof(pthread_t));
927 wpool->do_xnu = do_xnu;
928 wpool->do_forward = do_forward;
929 wpool->do_null2 = do_null2;
930 wpool->thresh = thresh;
934 if ((rtn = pthread_mutex_init(&(wpool->input_lock), NULL)) != 0)
935 Die("pthread_mutex_init FAILED; %s\n", strerror(rtn));
940 if ((rtn = pthread_mutex_init(&(wpool->output_lock), NULL)) != 0)
941 Die("pthread_mutex_init FAILED; %s\n", strerror(rtn));
943 wpool->num_threads= num_threads;
945 /* Create slave threads. See comments in hmmcalibrate.c at this
946 * step, regarding concurrency, system scope, and portability
947 * amongst various UNIX implementations of pthreads.
949 pthread_attr_init(&attr);
951 #ifdef HAVE_PTHREAD_ATTR_SETSCOPE
952 pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM);
955 #ifdef HAVE_PTHREAD_SETCONCURRENCY
956 pthread_setconcurrency(num_threads+1);
958 /* pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM); */
959 for (i = 0; i < num_threads; i++)
960 if ((rtn = pthread_create(&(wpool->thread[i]), &attr,
961 worker_thread , (void *) wpool)) != 0)
962 Die("Failed to create thread %d; return code %d\n", i, rtn);
964 pthread_attr_destroy(&attr);
967 /* Function: workpool_stop()
968 * Date: SRE, Thu Jul 16 11:20:16 1998 [St. Louis]
970 * Purpose: Waits for threads in a workpool to finish.
972 * Args: wpool -- ptr to the workpool structure
977 workpool_stop(struct workpool_s *wpool)
980 /* wait for threads to stop */
981 for (i = 0; i < wpool->num_threads; i++)
982 if (pthread_join(wpool->thread[i],NULL) != 0)
983 Die("pthread_join failed");
987 /* Function: workpool_free()
988 * Date: SRE, Thu Jul 16 11:26:27 1998 [St. Louis]
990 * Purpose: Free a workpool_s structure, after the threads
993 * Args: wpool -- ptr to the workpool.
998 workpool_free(struct workpool_s *wpool)
1000 free(wpool->thread);
1006 /* Function: worker_thread()
1007 * Date: SRE, Mon Sep 28 10:48:29 1998 [St. Louis]
1009 * Purpose: The procedure executed by the worker threads.
1011 * Args: ptr - (void *) that is recast to a pointer to
1017 worker_thread(void *ptr)
1019 struct workpool_s *wpool; /* our working threads structure */
1020 char *seq; /* target sequence */
1021 SQINFO sqinfo; /* information assoc w/ seq */
1022 char *dsq; /* digitized sequence */
1023 struct p7trace_s *tr; /* traceback from an alignment */
1024 float sc; /* score of an alignment */
1025 int rtn; /* a return code from pthreads lib */
1026 double pvalue; /* P-value of score */
1027 double evalue; /* E-value of score */
1029 wpool = (struct workpool_s *) ptr;
1032 /* 1. acquire lock on sequence input, and get
1033 * the next seq to work on.
1035 /* acquire a lock */
1036 if ((rtn = pthread_mutex_lock(&(wpool->input_lock))) != 0)
1037 Die("pthread_mutex_lock failure: %s\n", strerror(rtn));
1038 if (! ReadSeq(wpool->sqfp, wpool->sqfp->format, &seq, &sqinfo))
1039 { /* we're done. release lock, exit thread */
1040 if ((rtn = pthread_mutex_unlock(&(wpool->input_lock))) != 0)
1041 Die("pthread_mutex_unlock failure: %s\n", strerror(rtn));
1044 SQD_DPRINTF1(("a thread is working on %s\n", sqinfo.name));
1046 /* release the lock */
1047 if ((rtn = pthread_mutex_unlock(&(wpool->input_lock))) != 0)
1048 Die("pthread_mutex_unlock failure: %s\n", strerror(rtn));
1050 if (sqinfo.len == 0) continue; /* silent skip of len=0 seqs (wormpep!?!) */
1052 dsq = DigitizeSequence(seq, sqinfo.len);
1053 if (wpool->do_xnu) XNU(dsq, sqinfo.len);
1055 /* 1. Recover a trace by Viterbi.
1057 if (P7ViterbiSize(sqinfo.len, wpool->hmm->M) <= RAMLIMIT)
1058 sc = P7Viterbi(dsq, sqinfo.len, wpool->hmm, &tr);
1060 sc = P7SmallViterbi(dsq, sqinfo.len, wpool->hmm, &tr);
1062 /* 2. If we're using Forward scores, do another DP
1063 * to get it; else, we already have a Viterbi score
1066 if (wpool->do_forward) sc = P7Forward(dsq, sqinfo.len, wpool->hmm, NULL);
1067 if (wpool->do_null2) sc -= TraceScoreCorrection(wpool->hmm, tr, dsq);
1069 /* 3. Save the output in tophits and histogram structures, after acquiring a lock
1071 if ((rtn = pthread_mutex_lock(&(wpool->output_lock))) != 0)
1072 Die("pthread_mutex_lock failure: %s\n", strerror(rtn));
1073 SQD_DPRINTF1(("seq %s scores %f\n", sqinfo.name, sc));
1075 pvalue = PValue(wpool->hmm, sc);
1076 evalue = wpool->thresh->Z ? (double) wpool->thresh->Z * pvalue : (double) wpool->nseq * pvalue;
1078 if (sc >= wpool->thresh->globT && evalue <= wpool->thresh->globE)
1080 PostprocessSignificantHit(wpool->ghit, wpool->dhit,
1081 tr, wpool->hmm, dsq, sqinfo.len,
1083 sqinfo.flags & SQINFO_ACC ? sqinfo.acc : NULL,
1084 sqinfo.flags & SQINFO_DESC ? sqinfo.desc : NULL,
1085 wpool->do_forward, sc,
1088 FALSE); /* FALSE-> not hmmpfam mode, hmmsearch mode */
1090 AddToHistogram(wpool->hist, sc);
1091 if ((rtn = pthread_mutex_unlock(&(wpool->output_lock))) != 0)
1092 Die("pthread_mutex_unlock failure: %s\n", strerror(rtn));
1095 FreeSequence(seq, &sqinfo);
1097 } /* end 'infinite' loop over seqs in this thread */
1100 #endif /* HMMER_THREADS */