1 /************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 ************************************************************/
12 * SRE, Mon Aug 25 17:03:14 1997 [Denver]
14 * Search a single sequence against an HMM database.
15 * Conditionally includes PVM parallelization when HMMER_PVM is defined
16 * at compile time; hmmpfam --pvm runs the PVM version.
18 * CVS $Id: hmmpfam.c,v 1.1.1.1 2005/03/22 08:34:13 cmzmasek Exp $
33 #include "squid.h" /* general sequence analysis library */
34 #include "config.h" /* compile-time configuration constants */
35 #include "structs.h" /* data structures, macros, #define's */
36 #include "funcs.h" /* function declarations */
37 #include "globals.h" /* alphabet global variables */
38 #include "version.h" /* version info */
40 static char banner[] = "hmmpfam - search one or more sequences against HMM database";
42 static char usage[] = "\
43 Usage: hmmpfam [-options] <hmm database> <sequence file or database>\n\
44 Available options are:\n\
45 -h : help; print brief help on version and usage\n\
46 -n : nucleic acid models/sequence (default protein)\n\
47 -A <n> : sets alignment output limit to <n> best domain alignments\n\
48 -E <x> : sets E value cutoff (globE) to <x>; default 10\n\
49 -T <x> : sets T bit threshold (globT) to <x>; no threshold by default\n\
50 -Z <n> : sets Z (# models) for E-value calculation\n\
53 static char experts[] = "\
54 --acc : use HMM accession numbers instead of names in output\n\
55 --compat : make best effort to use last version's output style\n\
56 --cpu <n> : run <n> threads in parallel (if threaded)\n\
57 --cut_ga : use Pfam GA gathering threshold cutoffs\n\
58 --cut_nc : use Pfam NC noise threshold cutoffs\n\
59 --cut_tc : use Pfam TC trusted threshold cutoffs\n\
60 --domE <x> : sets domain Eval cutoff (2nd threshold) to <x>\n\
61 --domT <x> : sets domain T bit thresh (2nd threshold) to <x>\n\
62 --forward : use the full Forward() algorithm instead of Viterbi\n\
63 --informat <s>: sequence file is in format <s>, not FASTA\n\
64 --null2 : turn OFF the post hoc second null model\n\
65 --pvm : run on a PVM (Parallel Virtual Machine) cluster\n\
66 --xnu : turn ON XNU filtering of query protein sequence\n\
70 static struct opt_s OPTIONS[] = {
71 { "-h", TRUE, sqdARG_NONE },
72 { "-n", TRUE, sqdARG_NONE },
73 { "-A", TRUE, sqdARG_INT },
74 { "-E", TRUE, sqdARG_FLOAT},
75 { "-T", TRUE, sqdARG_FLOAT},
76 { "-Z", TRUE, sqdARG_INT },
77 { "--acc", FALSE, sqdARG_NONE },
78 { "--compat", FALSE, sqdARG_NONE },
79 { "--cpu", FALSE, sqdARG_INT },
80 { "--cut_ga", FALSE, sqdARG_NONE },
81 { "--cut_nc", FALSE, sqdARG_NONE },
82 { "--cut_tc", FALSE, sqdARG_NONE },
83 { "--domE", FALSE, sqdARG_FLOAT},
84 { "--domT", FALSE, sqdARG_FLOAT},
85 { "--forward", FALSE, sqdARG_NONE },
86 { "--informat",FALSE, sqdARG_STRING},
87 { "--null2", FALSE, sqdARG_NONE },
88 { "--pvm", FALSE, sqdARG_NONE },
89 { "--xnu", FALSE, sqdARG_NONE },
91 #define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
96 /* POSIX threads version:
97 * the threads share a workpool_s structure amongst themselves,
98 * for obtaining locks on input HMM file and output histogram and
102 /* Shared configuration resources that don't change:
104 char *hmmfile; /* name of HMM file */
105 char *dsq; /* digitized query sequence */
106 char *seqname; /* sequence name */
107 int L; /* length of dsq */
108 int do_forward; /* TRUE to score using Forward */
109 int do_null2; /* TRUE to apply null2 correction */
110 struct threshold_s *thresh; /* score/evalue cutoff information */
112 /* Shared (mutex-protected) input resources:
114 HMMFILE *hmmfp; /* ptr to open HMM file */
115 int nhmm; /* number of HMMs searched so far */
116 pthread_mutex_t input_lock; /* mutex for locking input */
118 /* Shared (mutex-protected) output resources:
120 struct tophit_s *ghit; /* per-sequence top hits */
121 struct tophit_s *dhit; /* per-domain top hits */
122 pthread_mutex_t output_lock; /* mutex for locking output */
124 /* Thread pool information
126 pthread_t *thread; /* our pool of threads */
127 int num_threads; /* number of threads */
130 static struct workpool_s *workpool_start(char *hmmfile, HMMFILE *hmmfp,
131 char *dsq, char *seqname, int L,
132 int do_forward, int do_null2,
133 struct threshold_s *thresh,
134 struct tophit_s *ghit, struct tophit_s *dhit,
136 static void workpool_stop(struct workpool_s *wpool);
137 static void workpool_free(struct workpool_s *wpool);
138 static void *worker_thread(void *ptr);
139 #endif /* HMMER_THREADS */
143 static void main_loop_pvm(char *hmmfile, HMMFILE *hmmfp, char *seq, SQINFO *sqinfo,
144 struct threshold_s *thresh, int do_xnu, int do_forward, int do_null2,
145 struct tophit_s *ghit, struct tophit_s *dhit, int *ret_nhmm);
147 static void main_loop_serial(char *hmmfile, HMMFILE *hmmfp, char *seq, SQINFO *sqinfo,
148 struct threshold_s *thresh, int do_xnu, int do_forward, int do_null2,
150 struct tophit_s *ghit, struct tophit_s *dhit, int *nhmm);
153 main(int argc, char **argv)
155 char *hmmfile; /* file to read HMMs from */
156 HMMFILE *hmmfp; /* opened hmmfile for reading */
157 char *seqfile; /* file to read target sequence from */
158 SQFILE *sqfp; /* opened seqfile for reading */
159 int format; /* format of seqfile */
160 char *seq; /* target sequence */
161 SQINFO sqinfo; /* optional info for seq */
162 struct fancyali_s *ali; /* an alignment for display */
163 struct tophit_s *ghit; /* list of top hits and alignments for seq */
164 struct tophit_s *dhit; /* list of top hits/alignments for domains */
166 float sc; /* log-odds score in bits */
167 double pvalue; /* pvalue of an HMM score */
168 double evalue; /* evalue of an HMM score */
169 double motherp; /* pvalue of a whole seq HMM score */
170 float mothersc; /* score of a whole seq parent of domain */
171 int sqfrom, sqto; /* coordinates in sequence */
172 int hmmfrom, hmmto; /* coordinate in HMM */
173 char *name, *acc, *desc; /* hit HMM name, accession, description */
174 int hmmlen; /* length of HMM hit */
175 int nhmm; /* number of HMMs searched */
176 int domidx; /* number of this domain */
177 int ndom; /* total # of domains in this seq */
178 int namewidth; /* max width of printed HMM name */
179 int descwidth; /* max width of printed description */
181 int Alimit; /* A parameter limiting output alignments */
182 struct threshold_s thresh; /* contains all threshold (cutoff) info */
184 char *optname; /* name of option found by Getopt() */
185 char *optarg; /* argument found by Getopt() */
186 int optind; /* index in argv[] */
187 int do_forward; /* TRUE to use Forward() not Viterbi() */
188 int do_nucleic; /* TRUE to do DNA/RNA instead of protein */
189 int do_null2; /* TRUE to adjust scores with null model #2 */
190 int do_pvm; /* TRUE to run on PVM */
191 int do_xnu; /* TRUE to do XNU filtering */
192 int be_backwards; /* TRUE to be backwards-compatible in output*/
193 int show_acc; /* TRUE to sub HMM accessions for names */
197 int num_threads; /* number of worker threads */
199 /***********************************************
201 ***********************************************/
203 format = SQFILE_UNKNOWN; /* default: autodetect format w/ Babelfish */
212 Alimit = INT_MAX; /* no limit on alignment output */
213 thresh.globE = 10.0; /* use a reasonable Eval threshold; */
214 thresh.globT = -FLT_MAX; /* but no bit threshold, */
215 thresh.domT = -FLT_MAX; /* no domain bit threshold, */
216 thresh.domE = FLT_MAX; /* and no domain Eval threshold. */
217 thresh.autocut = CUT_NONE; /* and no Pfam cutoffs used. */
218 thresh.Z = 0; /* Z not preset, so determined by # of HMMs */
221 num_threads = ThreadNumber(); /* only matters if we're threaded */
226 while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
227 &optind, &optname, &optarg)) {
228 if (strcmp(optname, "-n") == 0) do_nucleic = TRUE;
229 else if (strcmp(optname, "-A") == 0) Alimit = atoi(optarg);
230 else if (strcmp(optname, "-E") == 0) thresh.globE = atof(optarg);
231 else if (strcmp(optname, "-T") == 0) thresh.globT = atof(optarg);
232 else if (strcmp(optname, "-Z") == 0) thresh.Z = atoi(optarg);
233 else if (strcmp(optname, "--acc") == 0) show_acc = TRUE;
234 else if (strcmp(optname, "--compat") == 0) be_backwards = TRUE;
235 else if (strcmp(optname, "--cpu") == 0) num_threads = atoi(optarg);
236 else if (strcmp(optname, "--cut_ga") == 0) thresh.autocut = CUT_GA;
237 else if (strcmp(optname, "--cut_nc") == 0) thresh.autocut = CUT_NC;
238 else if (strcmp(optname, "--cut_tc") == 0) thresh.autocut = CUT_TC;
239 else if (strcmp(optname, "--domE") == 0) thresh.domE = atof(optarg);
240 else if (strcmp(optname, "--domT") == 0) thresh.domT = atof(optarg);
241 else if (strcmp(optname, "--forward") == 0) do_forward = TRUE;
242 else if (strcmp(optname, "--null2") == 0) do_null2 = FALSE;
243 else if (strcmp(optname, "--pvm") == 0) do_pvm = TRUE;
244 else if (strcmp(optname, "--xnu") == 0) do_xnu = TRUE;
245 else if (strcmp(optname, "--informat") == 0) {
246 format = String2SeqfileFormat(optarg);
247 if (format == SQFILE_UNKNOWN)
248 Die("unrecognized sequence file format \"%s\"", optarg);
250 else if (strcmp(optname, "-h") == 0) {
251 Banner(stdout, banner);
257 if (argc - optind != 2)
258 Die("Incorrect number of arguments.\n%s\n", usage);
260 hmmfile = argv[optind++];
261 seqfile = argv[optind++];
264 if (do_pvm) Die("PVM support is not compiled into HMMER; --pvm doesn't work.");
266 #ifndef HMMER_THREADS
267 if (num_threads) Die("Posix threads support is not compiled into HMMER; --cpu doesn't have any effect");
270 /***********************************************
271 * Open sequence database (must be in curr directory);
272 * get target sequence.
273 ***********************************************/
275 if (do_nucleic) SetAlphabet(hmmNUCLEIC);
276 else SetAlphabet(hmmAMINO);
278 if (do_nucleic && do_xnu)
279 Die("You can't use -n and --xnu together: I can't xnu DNA data.");
281 if ((sqfp = SeqfileOpen(seqfile, format, NULL)) == NULL)
282 Die("Failed to open sequence file %s\n%s\n", seqfile, usage);
284 /***********************************************
285 * Open HMM database (might be in HMMERDB or current directory)
286 ***********************************************/
288 if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL)
289 Die("Failed to open HMM database %s\n%s", hmmfile, usage);
291 /***********************************************
293 ***********************************************/
295 Banner(stdout, banner);
296 printf( "HMM file: %s\n", hmmfile);
297 printf( "Sequence file: %s\n", seqfile);
299 printf( "PVM: ACTIVE\n");
300 printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n");
302 /***********************************************
303 * Search each HMM against each sequence
304 ***********************************************/
306 while (ReadSeq(sqfp, format, &seq, &sqinfo))
308 ghit = AllocTophits(20); /* keeps full seq scores */
309 dhit = AllocTophits(20); /* keeps domain scores */
311 /* 1. Search sequence against all HMMs.
312 * Significant scores+alignments accumulate in ghit, dhit.
315 main_loop_serial(hmmfile, hmmfp, seq, &sqinfo,
316 &thresh, do_xnu, do_forward, do_null2, num_threads,
321 SQD_DPRINTF1(("Entering PVM main loop\n"));
322 main_loop_pvm(hmmfile, hmmfp, seq, &sqinfo,
323 &thresh, do_xnu, do_forward, do_null2,
327 else Die("wait. that can't happen. I didn't do anything.");
329 /* set Z for good now that we're done */
330 if (!thresh.Z) thresh.Z = nhmm;
332 /* 2. (Done searching all HMMs for this query seq; start output)
333 * Report the overall sequence hits, sorted by significance.
337 printf("Query: %s %s\n", sqinfo.name,
338 sqinfo.flags & SQINFO_DESC ? sqinfo.desc : "");
342 printf("\nQuery sequence: %s\n", sqinfo.name);
343 printf("Accession: %s\n", sqinfo.flags &SQINFO_ACC ? sqinfo.acc : "[none]");
344 printf("Description: %s\n", sqinfo.flags &SQINFO_DESC? sqinfo.desc : "[none]");
346 /* We'll now sort the global hit list by evalue...
347 * (not score! that was bug #12. in hmmpfam, score and evalue are not
350 FullSortTophits(ghit);
351 namewidth = MAX(8, TophitsMaxName(ghit)); /* must print whole name, no truncation */
352 descwidth = MAX(52-namewidth, 11); /* may truncate desc, but avoid neg len! */
354 printf("\nScores for sequence family classification (score includes all domains):\n");
355 printf("%-*s %-*s %7s %10s %3s\n", namewidth, "Model", descwidth, "Description", "Score", "E-value", " N ");
356 printf("%-*s %-*s %7s %10s %3s\n", namewidth, "--------", descwidth, "-----------", "-----", "-------", "---");
357 for (i = 0, nreported = 0; i < ghit->num; i++)
360 GetRankedHit(ghit, i,
361 &pvalue, &sc, NULL, NULL,
363 NULL, NULL, NULL, /* seq positions */
364 NULL, NULL, NULL, /* HMM positions */
365 NULL, &ndom, /* domain info */
366 NULL); /* alignment info*/
368 evalue = pvalue * (double) thresh.Z;
370 /* safedesc is a workaround for an apparent Linux printf()
371 * bug with the *.*s format. dbmalloc crashes with a memchr() ptr out of bounds
372 * flaw if the malloc'ed space for desc is short. The workaround
373 * is to make sure the ptr for *.* has a big malloc space.
375 if (desc != NULL && strlen(desc) < 80)
377 safedesc = MallocOrDie(sizeof(char) * 80);
378 strcpy(safedesc, desc);
380 else safedesc = Strdup(desc);
382 /* sneaky trick warning:
383 * if we're using dynamic Pfam score cutoffs (GA, TC, NC),
384 * then the list of hits is already correct and does not
385 * need any score cutoffs. Unset the thresholds. They'll
386 * be reset in the main_loop if we still have sequences
389 if (thresh.autocut != CUT_NONE) {
390 thresh.globE = thresh.domE = FLT_MAX;
391 thresh.globT = thresh.domT = -FLT_MAX;
394 if (evalue <= thresh.globE && sc >= thresh.globT)
396 printf("%-*s %-*.*s %7.1f %10.2g %3d\n",
398 (show_acc && acc != NULL) ? acc : name,
399 descwidth, descwidth, safedesc != NULL ? safedesc : "",
405 if (nreported == 0) printf("\t[no hits above thresholds]\n");
407 /* 3. Report domain hits (sorted on sqto coordinate)
409 FullSortTophits(dhit);
410 namewidth = MAX(8, TophitsMaxName(dhit)); /* must print whole name, no truncation */
412 printf("\nParsed for domains:\n");
413 printf("%-*s %7s %5s %5s %5s %5s %7s %8s\n",
414 namewidth, "Model", "Domain ", "seq-f", "seq-t", "hmm-f", "hmm-t", "score", "E-value");
415 printf("%-*s %7s %5s %5s %5s %5s %7s %8s\n",
416 namewidth, "--------", "-------", "-----", "-----", "-----", "-----", "-----", "-------");
418 for (i = 0, nreported = 0; i < dhit->num; i++)
420 GetRankedHit(dhit, i,
421 &pvalue, &sc, &motherp, &mothersc,
423 &sqfrom, &sqto, NULL,
424 &hmmfrom, &hmmto, &hmmlen,
427 evalue = pvalue * (double) thresh.Z;
429 /* Does the "mother" (complete) sequence satisfy global thresholds? */
430 if (motherp * (double)thresh. Z > thresh.globE || mothersc < thresh.globT)
432 else if (evalue <= thresh.domE && sc >= thresh.domT) {
433 printf("%-*s %3d/%-3d %5d %5d %c%c %5d %5d %c%c %7.1f %8.2g\n",
435 (show_acc && acc != NULL) ? acc : name,
438 sqfrom == 1 ? '[' : '.', sqto == sqinfo.len ? ']' : '.',
440 hmmfrom == 1 ? '[':'.', hmmto == hmmlen ? ']' : '.',
445 if (nreported == 0) printf("\t[no hits above thresholds]\n");
448 /* 3. Alignment output, also by domain.
449 * dhits is already sorted and namewidth is set, from above code.
450 * Number of displayed alignments is limited by Alimit parameter;
451 * also by domE (evalue threshold), domT (score theshold).
455 printf("\nAlignments of top-scoring domains:\n");
456 for (i = 0, nreported = 0; i < dhit->num; i++)
458 if (nreported == Alimit) break; /* limit to Alimit output alignments */
459 GetRankedHit(dhit, i,
460 &pvalue, &sc, &motherp, &mothersc,
462 &sqfrom, &sqto, NULL, /* seq position info */
463 &hmmfrom, &hmmto, &hmmlen, /* HMM position info */
464 &domidx, &ndom, /* domain info */
465 &ali); /* alignment info */
466 evalue = pvalue * (double) thresh.Z;
468 if (motherp * (double) thresh.Z > thresh.globE || mothersc < thresh.globT)
470 else if (evalue <= thresh.domE && sc >= thresh.domT)
472 printf("%s: domain %d of %d, from %d to %d: score %.1f, E = %.2g\n",
473 (show_acc && acc != NULL) ? acc : name,
474 domidx, ndom, sqfrom, sqto, sc, evalue);
475 PrintFancyAli(stdout, ali);
479 if (nreported == 0) printf("\t[no hits above thresholds]\n");
480 if (nreported == Alimit) printf("\t[output cut off at A = %d top alignments]\n", Alimit);
485 FreeSequence(seq, &sqinfo);
489 HMMFileRewind(hmmfp);
492 /***********************************************
494 ***********************************************/
503 /* Function: main_loop_serial()
504 * Date: SRE, Fri Aug 7 13:46:48 1998 [St. Louis]
506 * Purpose: Search a sequence against an HMM database;
507 * main loop for the serial (non-PVM, non-threads)
510 * On return, ghit and dhit contain info for all hits
511 * that satisfy the set thresholds. If an evalue
512 * cutoff is used at all, the lists will be overestimated --
513 * because the evalue will be underestimated until
514 * we know the final Z. (Thus the main program must recheck
515 * thresholds before printing any results.) If only
516 * score cutoffs are used, then the lists are correct,
517 * and may be printed exactly as they come (after
518 * appropriate sorting, anyway). This is especially
519 * important for dynamic thresholding using Pfam
520 * score cutoffs -- the main caller cannot afford to
521 * rescan the HMM file just to get the GA/TC/NC cutoffs
522 * back out for each HMM, and neither do I want to
523 * burn the space to store them as I make a pass thru
526 * Args: hmmfile - name of HMM file
527 * hmmfp - open HMM file (and at start of file)
528 * dsq - digitized sequence
529 * sqinfo - ptr to SQINFO optional info for dsq
530 * thresh - score/evalue threshold information
531 * do_xnu - TRUE to apply XNU filter to sequence
532 * do_forward - TRUE to use Forward() scores
533 * do_null2 - TRUE to adjust scores w/ ad hoc null2 model
534 * num_threads- number of threads, if threaded
535 * ghit - global hits list
536 * dhit - domain hits list
537 * ret_nhmm - number of HMMs searched.
542 main_loop_serial(char *hmmfile, HMMFILE *hmmfp, char *seq, SQINFO *sqinfo,
543 struct threshold_s *thresh, int do_xnu, int do_forward, int do_null2,
545 struct tophit_s *ghit, struct tophit_s *dhit, int *ret_nhmm)
547 char *dsq; /* digitized sequence */
548 int nhmm; /* number of HMMs searched */
550 struct workpool_s *wpool; /* pool of worker threads */
552 struct plan7_s *hmm; /* current HMM to search with */
553 struct p7trace_s *tr; /* traceback of alignment */
554 float sc; /* an alignment score */
555 double pvalue; /* pvalue of an HMM score */
556 double evalue; /* evalue of an HMM score */
560 dsq = DigitizeSequence(seq, sqinfo->len);
561 if (do_xnu && Alphabet_type == hmmAMINO) XNU(dsq, sqinfo->len);
564 if (num_threads > 0) {
565 wpool = workpool_start(hmmfile, hmmfp, dsq, sqinfo->name, sqinfo->len,
566 do_forward, do_null2, thresh,
567 ghit, dhit, num_threads);
568 workpool_stop(wpool);
570 workpool_free(wpool);
577 /* unthreaded code: */
579 while (HMMFileRead(hmmfp, &hmm)) {
581 Die("HMM file %s may be corrupt or in incorrect format; parse failed", hmmfile);
582 P7Logoddsify(hmm, !(do_forward));
584 if (! SetAutocuts(thresh, hmm))
585 Die("HMM %s did not contain the GA, TC, or NC cutoffs you needed",
588 /* Score sequence, do alignment (Viterbi), recover trace
590 if (P7ViterbiSize(sqinfo->len, hmm->M) <= RAMLIMIT)
591 sc = P7Viterbi(dsq, sqinfo->len, hmm, &tr);
593 sc = P7SmallViterbi(dsq, sqinfo->len, hmm, &tr);
595 /* Implement do_forward; we'll override the whole_sc with a P7Forward()
597 * HMMER is so trace- (alignment-) dependent that this gets a bit hacky.
598 * Some important implications:
599 * 1) if --do_forward is selected, the domain (Viterbi) scores do not
600 * necessarily add up to the whole sequence (Forward) score.
601 * 2) The implementation of null2 for a Forward score is undefined,
602 * since the null2 correction is trace-dependent. As a total hack,
603 * we use a null2 correction derived from the whole trace
604 * (which was the behavior of HMMER 2.1.1 and earlier, anyway).
605 * This could put the sum of domain scores and whole seq score even
606 * further in disagreement.
608 * Note that you can't move the Forward calculation into
609 * PostprocessSignificantHit(). The Forward score will exceed the
610 * Viterbi score, so you can't apply thresholds until you
611 * know the Forward score. Also, since PostprocessSignificantHit()
612 * is wrapped by a mutex in the threaded implementation,
613 * you'd destroy all useful parallelism if PostprocessSignificantHit()
614 * did anything compute intensive.
617 sc = P7Forward(dsq, sqinfo->len, hmm, NULL);
618 if (do_null2) sc -= TraceScoreCorrection(hmm, tr, dsq);
621 /* Store scores/pvalue for each HMM aligned to this sequence, overall
623 pvalue = PValue(hmm, sc);
624 evalue = thresh->Z ? (double) thresh->Z * pvalue : (double) nhmm * pvalue;
625 if (sc >= thresh->globT && evalue <= thresh->globE) {
626 PostprocessSignificantHit(ghit, dhit,
627 tr, hmm, dsq, sqinfo->len,
628 sqinfo->name, NULL, NULL, /* won't need acc or desc even if we have 'em */
632 TRUE); /* TRUE -> hmmpfam mode */
646 /*****************************************************************
647 * PVM specific functions
648 ****************************************************************/
650 /* Function: main_loop_pvm()
651 * Date: SRE, Fri Aug 7 13:58:34 1998 [St. Louis]
653 * Purpose: Search a sequence against an HMM database;
654 * main loop for the PVM version.
656 * Args: hmmfile - name of HMM file
657 * hmmfp - open HMM file (and at start of file)
658 * seq - sequence to search against
659 * sqinfo - ptr to SQINFO optional info for dsq
660 * thresh - score/evalue threshold settings
661 * do_xnu - TRUE to apply XNU filter to sequence
662 * do_forward - TRUE to use Forward() scores
663 * do_null2 - TRUE to adjust scores w/ ad hoc null2 model
664 * ghit - global hits list
665 * dhit - domain hits list
666 * nhmm - number of HMMs searched.
671 main_loop_pvm(char *hmmfile, HMMFILE *hmmfp, char *seq, SQINFO *sqinfo,
672 struct threshold_s *thresh, int do_xnu, int do_forward, int do_null2,
673 struct tophit_s *ghit, struct tophit_s *dhit, int *ret_nhmm)
675 struct plan7_s *hmm; /* HMM that was searched with */
676 struct p7trace_s *tr; /* a traceback structure */
677 char *dsq; /* digitized sequence */
678 float sc; /* score of an HMM match */
679 int master_tid; /* master's ID */
680 int *slave_tid; /* array of slave IDs */
681 int *hmmlist; /* array of hmm indexes being worked on by slaves */
682 int nslaves; /* number of slaves in virtual machine */
683 int nhmm; /* number of HMMs searched */
684 int slaveidx; /* index of a slave wanting work */
686 int sent_trace; /* TRUE if slave sent us a trace */
687 char slavename[32]; /* name of HMM that slave actually did */
688 double pvalue; /* pvalue of HMM score */
693 if (hmmfp->ssi == NULL)
694 Die("HMM file %s needs an SSI index to use PVM. See: hmmindex.", hmmfile);
698 dsq = DigitizeSequence(seq, sqinfo->len);
699 if (do_xnu && Alphabet_type == hmmAMINO) XNU(dsq, sqinfo->len);
703 master_tid = pvm_mytid();
705 pvm_catchout(stderr); /* catch output for debugging */
707 SQD_DPRINTF1(("Spawning slaves...\n"));
708 PVMSpawnSlaves("hmmpfam-pvm", &slave_tid, &nslaves);
709 hmmlist = MallocOrDie(sizeof(int) * nslaves);
710 SQD_DPRINTF1(("Spawned a total of %d slaves...\n", nslaves));
712 /* Initialize the slaves
714 SQD_DPRINTF1(("Broadcasting to %d slaves...\n", nslaves));
715 pvm_initsend(PvmDataDefault);
716 arglen = strlen(hmmfile);
717 pvm_pkint(&arglen, 1, 1);
719 pvm_pkint(&(sqinfo->len), 1, 1);
721 pvm_pkfloat(&(thresh->globT), 1, 1);
722 pvm_pkdouble(&(thresh->globE), 1, 1);
723 pvm_pkint(&(thresh->Z), 1, 1);
724 pvm_pkint((int *)&(thresh->autocut), 1, 1);
725 pvm_pkint(&do_xnu, 1, 1);
726 pvm_pkint(&do_forward, 1, 1);
727 pvm_pkint(&do_null2, 1, 1);
728 pvm_pkint(&Alphabet_type, 1, 1);
729 pvm_mcast(slave_tid, nslaves, HMMPVM_INIT);
730 SQD_DPRINTF1(("Slaves should be ready...\n"));
731 /* get their OK codes. */
732 PVMConfirmSlaves(slave_tid, nslaves);
733 SQD_DPRINTF1(("Slaves confirm that they're ok...\n"));
736 * For efficiency reasons, we don't want the master to
737 * load HMMs from disk until she absolutely needs them.
739 for (nhmm = 0; nhmm < nslaves && nhmm < hmmfp->ssi->nprimary; nhmm++) {
740 pvm_initsend(PvmDataDefault);
741 pvm_pkint(&nhmm, 1, 1); /* side effect: also tells him what number he is. */
742 pvm_send(slave_tid[nhmm], HMMPVM_WORK);
743 hmmlist[nhmm] = nhmm;
745 SQD_DPRINTF1(("%d slaves are loaded\n", nhmm));
750 for (; nhmm < hmmfp->ssi->nprimary; nhmm++)
752 /* check slaves before blocking */
753 PVMCheckSlaves(slave_tid, nslaves);
755 SQD_DPRINTF1(("Waiting for a slave to give me output...\n"));
756 pvm_recv(-1, HMMPVM_RESULTS);
757 pvm_upkint(&slaveidx, 1, 1); /* # of slave who's sending us stuff */
758 pvm_upkstr(slavename); /* name of HMM that slave did */
759 pvm_upkfloat(&sc, 1, 1); /* score */
760 pvm_upkdouble(&pvalue, 1, 1); /* P-value */
761 pvm_upkint(&sent_trace, 1, 1); /* TRUE if trace is coming */
762 tr = (sent_trace) ? PVMUnpackTrace() : NULL;
763 SQD_DPRINTF1(("Slave %d finished %s for me...\n", slaveidx, slavename));
766 pvm_initsend(PvmDataDefault);
767 pvm_pkint(&nhmm, 1, 1);
768 pvm_send(slave_tid[slaveidx], HMMPVM_WORK);
769 SQD_DPRINTF1(("Assigned %d -> slave %d\n", nhmm, slaveidx));
772 /* 1b. Store scores/pvalue for each HMM aligned to this sequence, overall
774 SQD_DPRINTF1(("%15s : %2d : %f\n", slavename, slaveidx, sc));
777 /* now load the HMM, because the hit is significant */
778 HMMFilePositionByIndex(hmmfp, hmmlist[slaveidx]);
779 if (!HMMFileRead(hmmfp, &hmm))
780 { pvm_exit(); Die("Unexpected failure to read HMM file %s", hmmfile); }
782 { pvm_exit(); Die("HMM file %s may be corrupt; parse failed", hmmfile); }
783 P7Logoddsify(hmm, TRUE);
784 if (! SetAutocuts(thresh, hmm))
785 Die("HMM %s did not contain your GA, NC, or TC cutoffs", hmm->name);
787 PostprocessSignificantHit(ghit, dhit,
788 tr, hmm, dsq, sqinfo->len,
790 sqinfo->flags & SQINFO_ACC ? sqinfo->acc : NULL,
791 sqinfo->flags & SQINFO_DESC ? sqinfo->desc : NULL,
795 TRUE); /* TRUE -> hmmpfam mode */
800 hmmlist[slaveidx] = nhmm;
803 /* Collect the output. all n slaves are still working, so wait for them.
805 for (slave = 0; slave < nslaves && slave < nhmm; slave++)
807 /* don't check slaves (they're exiting normally);
808 window of vulnerability here to slave crashes */
810 pvm_recv(-1, HMMPVM_RESULTS);
811 pvm_upkint(&slaveidx, 1, 1); /* slave who's sending us stuff */
812 pvm_upkstr(slavename);
813 pvm_upkfloat(&sc, 1, 1); /* one score */
814 pvm_upkdouble(&pvalue, 1, 1); /* P-value */
815 pvm_upkint(&sent_trace, 1, 1); /* TRUE if trace is coming */
816 tr = (sent_trace) ? PVMUnpackTrace() : NULL;
819 SQD_DPRINTF1(("%15s : %2d : %f\n", slavename, slaveidx, sc));
822 /* now load the HMM, because the hit is significant */
823 HMMFilePositionByIndex(hmmfp, hmmlist[slaveidx]);
824 if (!HMMFileRead(hmmfp, &hmm))
825 { pvm_exit(); Die("Unexpected failure to read HMM file %s", hmmfile);}
827 { pvm_exit(); Die("HMM file %s may be corrupt; parse failed", hmmfile); }
828 P7Logoddsify(hmm, TRUE);
829 if (! SetAutocuts(thresh, hmm))
830 Die("HMM %s did not contain your GA, NC, or TC cutoffs", hmm->name);
832 PostprocessSignificantHit(ghit, dhit,
833 tr, hmm, dsq, sqinfo->len,
834 sqinfo->name, NULL, NULL, /* won't need acc or desc even if we have 'em */
838 TRUE); /* TRUE -> hmmpfam mode */
843 /* send cleanup/shutdown flag */
844 pvm_initsend(PvmDataDefault);
846 pvm_pkint(&msg, 1, 1);
847 pvm_send(slave_tid[slaveidx], HMMPVM_WORK);
850 /* Cleanup; quit the VM; and return
864 /*****************************************************************
865 * POSIX threads implementation.
868 * workpool_start() (makes a workpool_s structure. Starts calculations.)
869 * workpool_stop() (waits for threads to finish.)
870 * workpool_free() (destroys the structure)
873 * worker_thread() (the actual parallelized worker thread).
874 *****************************************************************/
876 /* Function: workpool_start()
877 * Date: SRE, Mon Sep 28 11:10:58 1998 [St. Louis]
879 * Purpose: Initialize a workpool_s structure, and return it.
881 * Args: hmmfile - name of HMM file
882 * hmmfp - open HMM file, at start
883 * dsq - ptr to sequence to search
884 * seqname - ptr to name of dsq
886 * do_forward - TRUE to score using Forward
887 * do_null2 - TRUE to apply null2 ad hoc correction
888 * threshold - evalue/score threshold settings
889 * ghit - per-seq hit list
890 * dhit - per-domain hit list
891 * num_threads- number of worker threads to run.
893 * Returns: ptr to struct workpool_s.
894 * Caller must wait for threads to finish with workpool_stop(),
895 * then free the structure with workpool_free().
897 static struct workpool_s *
898 workpool_start(char *hmmfile, HMMFILE *hmmfp, char *dsq, char *seqname, int L,
899 int do_forward, int do_null2, struct threshold_s *thresh,
900 struct tophit_s *ghit, struct tophit_s *dhit,
903 struct workpool_s *wpool;
908 wpool = MallocOrDie(sizeof(struct workpool_s));
909 wpool->thread = MallocOrDie(num_threads * sizeof(pthread_t));
910 wpool->hmmfile = hmmfile;
913 wpool->seqname = seqname;
914 wpool->do_forward = do_forward;
915 wpool->do_null2 = do_null2;
916 wpool->thresh = thresh;
918 wpool->hmmfp = hmmfp;
920 if ((rtn = pthread_mutex_init(&(wpool->input_lock), NULL)) != 0)
921 Die("pthread_mutex_init FAILED; %s\n", strerror(rtn));
925 if ((rtn = pthread_mutex_init(&(wpool->output_lock), NULL)) != 0)
926 Die("pthread_mutex_init FAILED; %s\n", strerror(rtn));
928 wpool->num_threads= num_threads;
930 /* Create slave threads. See comments in hmmcalibrate.c at
931 * this step regarding concurrency and system scope.
933 pthread_attr_init(&attr);
935 #ifdef HAVE_PTHREAD_ATTR_SETSCOPE
936 pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM);
939 #ifdef HAVE_PTHREAD_SETCONCURRENCY
940 pthread_setconcurrency(num_threads+1);
942 for (i = 0; i < num_threads; i++)
943 if ((rtn = pthread_create(&(wpool->thread[i]), &attr,
944 worker_thread , (void *) wpool)) != 0)
945 Die("Failed to create thread %d; return code %d\n", i, rtn);
947 pthread_attr_destroy(&attr);
951 /* Function: workpool_stop()
952 * Date: SRE, Thu Jul 16 11:20:16 1998 [St. Louis]
954 * Purpose: Waits for threads in a workpool to finish.
956 * Args: wpool -- ptr to the workpool structure
961 workpool_stop(struct workpool_s *wpool)
964 /* wait for threads to stop */
965 for (i = 0; i < wpool->num_threads; i++)
966 if (pthread_join(wpool->thread[i],NULL) != 0)
967 Die("pthread_join failed");
971 /* Function: workpool_free()
972 * Date: SRE, Thu Jul 16 11:26:27 1998 [St. Louis]
974 * Purpose: Free a workpool_s structure, after the threads
977 * Args: wpool -- ptr to the workpool.
982 workpool_free(struct workpool_s *wpool)
990 /* Function: worker_thread()
991 * Date: SRE, Mon Sep 28 10:48:29 1998 [St. Louis]
993 * Purpose: The procedure executed by the worker threads.
995 * Args: ptr - (void *) that is recast to a pointer to
1001 worker_thread(void *ptr)
1003 struct workpool_s *wpool; /* our working threads structure */
1004 struct plan7_s *hmm; /* an HMM to search with */
1005 struct p7trace_s *tr; /* traceback from an alignment */
1006 float sc; /* score of an alignment */
1007 int rtn; /* a return code from pthreads lib */
1008 double pvalue; /* P-value of score */
1009 double evalue; /* E-value of a score */
1010 struct threshold_s thresh; /* a local copy of thresholds */
1012 wpool = (struct workpool_s *) ptr;
1013 /* Because we might dynamically change the thresholds using
1014 * Pfam GA/NC/TC cutoffs, we make a local copy of the threshold
1015 * structure in this thread.
1017 thresh.globT = wpool->thresh->globT;
1018 thresh.globE = wpool->thresh->globE;
1019 thresh.domT = wpool->thresh->domT;
1020 thresh.domE = wpool->thresh->domE;
1021 thresh.autocut = wpool->thresh->autocut;
1022 thresh.Z = wpool->thresh->Z;
1025 /* 1. acquire lock on HMM input, and get
1026 * the next HMM to work on.
1028 /* acquire a lock */
1029 if ((rtn = pthread_mutex_lock(&(wpool->input_lock))) != 0)
1030 Die("pthread_mutex_lock failure: %s\n", strerror(rtn));
1033 if (! HMMFileRead(wpool->hmmfp, &hmm))
1034 { /* we're done. release lock, exit thread */
1035 if ((rtn = pthread_mutex_unlock(&(wpool->input_lock))) != 0)
1036 Die("pthread_mutex_unlock failure: %s\n", strerror(rtn));
1039 SQD_DPRINTF1(("a thread is working on %s\n", hmm->name));
1040 /* release the lock */
1041 if ((rtn = pthread_mutex_unlock(&(wpool->input_lock))) != 0)
1042 Die("pthread_mutex_unlock failure: %s\n", strerror(rtn));
1045 Die("HMM file %s may be corrupt or in incorrect format; parse failed", wpool->hmmfile);
1046 P7Logoddsify(hmm, !(wpool->do_forward));
1048 if (!SetAutocuts(&thresh, hmm))
1049 Die("HMM %s did not have the right GA, NC, or TC cutoffs", hmm->name);
1051 /* 2. We have an HMM in score form.
1052 * Score the sequence.
1054 if (P7ViterbiSize(wpool->L, hmm->M) <= RAMLIMIT)
1055 sc = P7Viterbi(wpool->dsq, wpool->L, hmm, &tr);
1057 sc = P7SmallViterbi(wpool->dsq, wpool->L, hmm, &tr);
1059 /* The Forward score override (see comments in serial vers)
1061 if (wpool->do_forward) {
1062 sc = P7Forward(wpool->dsq, wpool->L, hmm, NULL);
1063 if (wpool->do_null2) sc -= TraceScoreCorrection(hmm, tr, wpool->dsq);
1066 /* 3. Save the output in tophits structures, after acquiring a lock
1068 if ((rtn = pthread_mutex_lock(&(wpool->output_lock))) != 0)
1069 Die("pthread_mutex_lock failure: %s\n", strerror(rtn));
1070 SQD_DPRINTF1(("model %s scores %f\n", hmm->name, sc));
1072 pvalue = PValue(hmm, sc);
1073 evalue = thresh.Z ? (double) thresh.Z * pvalue : (double) wpool->nhmm * pvalue;
1074 if (sc >= thresh.globT && evalue <= thresh.globE)
1076 PostprocessSignificantHit(wpool->ghit, wpool->dhit,
1077 tr, hmm, wpool->dsq, wpool->L,
1079 NULL, NULL, /* won't need seq's acc or desc */
1080 wpool->do_forward, sc,
1083 TRUE); /* TRUE -> hmmpfam mode */
1085 if ((rtn = pthread_mutex_unlock(&(wpool->output_lock))) != 0)
1086 Die("pthread_mutex_unlock failure: %s\n", strerror(rtn));
1091 } /* end 'infinite' loop over HMMs in this thread */
1094 #endif /* HMMER_THREADS */