1 /************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 ************************************************************/
13 * Routines for storing, sorting, displaying high scoring hits
16 *****************************************************************************
20 * AllocTophits() - allocation
21 * FreeTophits() - free'ing
22 * RegisterHit() - put information about a hit in the list
23 * GetRankedHit() - recovers information about a hit
24 * FullSortTophits() - sorts the top H hits.
26 *****************************************************************************
27 * Brief example of use:
29 * struct tophit_s *yourhits; // list of hits
30 * struct fancyali_s *ali; // (optional structure) alignment of a hit
32 * yourhits = AllocTophits(200);
33 * (for every hit in a search) {
35 * ali = Trace2FancyAli(); // You provide a function/structure here
36 * if (score > threshold)
37 * RegisterHit(yourhits, ...)
40 * FullSortTophits(yourhits); // Sort hits by evalue
41 * for (i = 0; i < 100; i++) // Recover hits out in ranked order
43 * GetRankedHit(yourhits, i, ...);
44 * // Presumably you'd print here...
46 * FreeTophits(yourhits);
47 ***************************************************************************
49 * Estimated storage per hit:
52 * name/acc/desc: 192 bytes
53 * alignment: 1000 bytes total = ~1200 bytes with alignment;
54 * = ~200 bytes without
55 * Designed for: 10^5 hits (20 MB) or 10^4 alignments (10 MB)
64 /* Function: AllocTophits()
66 * Purpose: Allocate a struct tophit_s, for maintaining
67 * a list of top-scoring hits in a database search.
69 * Args: lumpsize - allocation lumpsize
71 * Return: An allocated struct hit_s. Caller must free.
74 AllocTophits(int lumpsize)
76 struct tophit_s *hitlist;
78 hitlist = MallocOrDie (sizeof(struct tophit_s));
80 hitlist->unsrt = MallocOrDie (lumpsize * sizeof(struct hit_s));
81 hitlist->alloc = lumpsize;
83 hitlist->lump = lumpsize;
87 GrowTophits(struct tophit_s *h)
89 h->unsrt = ReallocOrDie(h->unsrt,(h->alloc + h->lump) * sizeof(struct hit_s));
93 FreeTophits(struct tophit_s *h)
96 for (pos = 0; pos < h->num; pos++)
98 if (h->unsrt[pos].ali != NULL) FreeFancyAli(h->unsrt[pos].ali);
99 if (h->unsrt[pos].name != NULL) free(h->unsrt[pos].name);
100 if (h->unsrt[pos].acc != NULL) free(h->unsrt[pos].acc);
101 if (h->unsrt[pos].desc != NULL) free(h->unsrt[pos].desc);
104 if (h->hit != NULL) free(h->hit);
111 struct fancyali_s *ali;
113 ali = MallocOrDie (sizeof(struct fancyali_s));
114 ali->rfline = ali->csline = ali->model = ali->mline = ali->aseq = NULL;
115 ali->query = ali->target = NULL;
116 ali->sqfrom = ali->sqto = 0;
120 FreeFancyAli(struct fancyali_s *ali)
123 if (ali->rfline != NULL) free(ali->rfline);
124 if (ali->csline != NULL) free(ali->csline);
125 if (ali->model != NULL) free(ali->model);
126 if (ali->mline != NULL) free(ali->mline);
127 if (ali->aseq != NULL) free(ali->aseq);
128 if (ali->query != NULL) free(ali->query);
129 if (ali->target != NULL) free(ali->target);
134 /* Function: RegisterHit()
136 * Purpose: Add a new hit to a list of top hits.
138 * "ali", if provided, is a pointer to allocated memory
139 * for an alignment output structure.
140 * Management is turned over to the top hits structure.
141 * Caller should not free them; they will be free'd by
142 * the FreeTophits() call.
144 * In contrast, "name", "acc", and "desc" are copied, so caller
145 * is still responsible for these.
147 * Number of args is unwieldy.
149 * Args: h - active top hit list
150 * key - value to sort by: bigger is better
151 * pvalue - P-value of this hit
152 * score - score of this hit
153 * motherp - P-value of parent whole sequence
154 * mothersc - score of parent whole sequence
155 * name - name of target
156 * acc - accession of target (may be NULL)
157 * desc - description of target (may be NULL)
158 * sqfrom - 1..L pos in target seq of start
159 * sqto - 1..L pos; sqfrom > sqto if rev comp
160 * sqlen - length of sequence, L
161 * hmmfrom - 0..M+1 pos in HMM of start
162 * hmmto - 0..M+1 pos in HMM of end
163 * hmmlen - length of HMM, M
164 * domidx - number of this domain
165 * ndom - total # of domains in sequence
166 * ali - optional printable alignment info
169 * hitlist is modified and possibly reallocated internally.
172 RegisterHit(struct tophit_s *h, double key,
173 double pvalue, float score, double motherp, float mothersc,
174 char *name, char *acc, char *desc,
175 int sqfrom, int sqto, int sqlen,
176 int hmmfrom, int hmmto, int hmmlen,
177 int domidx, int ndom,
178 struct fancyali_s *ali)
180 /* Check to see if list is full and we must realloc.
182 if (h->num == h->alloc) GrowTophits(h);
184 h->unsrt[h->num].name = Strdup(name);
185 h->unsrt[h->num].acc = Strdup(acc);
186 h->unsrt[h->num].desc = Strdup(desc);
187 h->unsrt[h->num].sortkey = key;
188 h->unsrt[h->num].pvalue = pvalue;
189 h->unsrt[h->num].score = score;
190 h->unsrt[h->num].motherp = motherp;
191 h->unsrt[h->num].mothersc= mothersc;
192 h->unsrt[h->num].sqfrom = sqfrom;
193 h->unsrt[h->num].sqto = sqto;
194 h->unsrt[h->num].sqlen = sqlen;
195 h->unsrt[h->num].hmmfrom = hmmfrom;
196 h->unsrt[h->num].hmmto = hmmto;
197 h->unsrt[h->num].hmmlen = hmmlen;
198 h->unsrt[h->num].domidx = domidx;
199 h->unsrt[h->num].ndom = ndom;
200 h->unsrt[h->num].ali = ali;
205 /* Function: GetRankedHit()
206 * Date: SRE, Tue Oct 28 10:06:48 1997 [Newton Institute, Cambridge UK]
208 * Purpose: Recover the data from the i'th ranked hit.
209 * Any of the data ptrs may be passed as NULL for fields
210 * you don't want. hitlist must have been sorted first.
212 * name, acc, desc, and ali are returned as pointers, not copies;
216 GetRankedHit(struct tophit_s *h, int rank,
217 double *r_pvalue, float *r_score,
218 double *r_motherp, float *r_mothersc,
219 char **r_name, char **r_acc, char **r_desc,
220 int *r_sqfrom, int *r_sqto, int *r_sqlen,
221 int *r_hmmfrom, int *r_hmmto, int *r_hmmlen,
222 int *r_domidx, int *r_ndom,
223 struct fancyali_s **r_ali)
225 if (r_pvalue != NULL) *r_pvalue = h->hit[rank]->pvalue;
226 if (r_score != NULL) *r_score = h->hit[rank]->score;
227 if (r_motherp != NULL) *r_motherp = h->hit[rank]->motherp;
228 if (r_mothersc!= NULL) *r_mothersc= h->hit[rank]->mothersc;
229 if (r_name != NULL) *r_name = h->hit[rank]->name;
230 if (r_acc != NULL) *r_acc = h->hit[rank]->acc;
231 if (r_desc != NULL) *r_desc = h->hit[rank]->desc;
232 if (r_sqfrom != NULL) *r_sqfrom = h->hit[rank]->sqfrom;
233 if (r_sqto != NULL) *r_sqto = h->hit[rank]->sqto;
234 if (r_sqlen != NULL) *r_sqlen = h->hit[rank]->sqlen;
235 if (r_hmmfrom != NULL) *r_hmmfrom = h->hit[rank]->hmmfrom;
236 if (r_hmmto != NULL) *r_hmmto = h->hit[rank]->hmmto;
237 if (r_hmmlen != NULL) *r_hmmlen = h->hit[rank]->hmmlen;
238 if (r_domidx != NULL) *r_domidx = h->hit[rank]->domidx;
239 if (r_ndom != NULL) *r_ndom = h->hit[rank]->ndom;
240 if (r_ali != NULL) *r_ali = h->hit[rank]->ali;
243 /* Function: TophitsMaxName()
245 * Purpose: Returns the maximum name length in a top hits list;
246 * doesn't need to be sorted yet.
249 TophitsMaxName(struct tophit_s *h)
255 for (i = 0; i < h->num; i++)
257 len = strlen(h->unsrt[i].name);
258 if (len > maxlen) maxlen = len;
263 /* Function: FullSortTophits()
265 * Purpose: Completely sort the top hits list. Calls
266 * qsort() to do the sorting, and uses
267 * hit_comparison() to do the comparison.
269 * Args: h - top hits structure
272 hit_comparison(const void *vh1, const void *vh2)
274 /* don't ask. don't change. and, Don't Panic. */
275 struct hit_s *h1 = *((struct hit_s **) vh1);
276 struct hit_s *h2 = *((struct hit_s **) vh2);
278 if (h1->sortkey < h2->sortkey) return 1;
279 else if (h1->sortkey > h2->sortkey) return -1;
280 else if (h1->sortkey == h2->sortkey) return 0;
285 FullSortTophits(struct tophit_s *h)
289 /* If we don't have /any/ hits, then don't
292 if (h->num == 0) return;
294 /* Assign the ptrs in h->hit.
296 h->hit = MallocOrDie(h->num * sizeof(struct hit_s *));
297 for (i = 0; i < h->num; i++)
298 h->hit[i] = &(h->unsrt[i]);
300 /* Sort the pointers. Don't bother if we've only got one.
303 qsort(h->hit, h->num, sizeof(struct hit_s *), hit_comparison);
308 /* Function: TophitsReport()
309 * Date: Thu Dec 18 13:19:18 1997
311 * Purpose: Generate a printout summarizing how much
312 * memory is used by a tophits structure,
313 * how many hits are stored, and how much
314 * waste there is from not knowing nseqs.
316 * Args: h - the sorted tophits list
317 * E - the cutoff in Evalue
318 * nseq - the final number of seqs used for Eval
321 * Prints information on stdout
324 TophitsReport(struct tophit_s *h, double E, int nseq)
331 /* Count up how much memory is used
334 memused = sizeof(struct hit_s) * h->alloc + sizeof(struct tophit_s);
335 for (i = 0; i < h->num; i++)
337 if (h->unsrt[i].name != NULL)
338 memused += strlen(h->unsrt[i].name) + 1;
339 if (h->unsrt[i].acc != NULL)
340 memused += strlen(h->unsrt[i].acc) + 1;
341 if (h->unsrt[i].desc != NULL)
342 memused += strlen(h->unsrt[i].desc) + 1;
343 if (h->unsrt[i].ali != NULL)
345 memused += sizeof(struct fancyali_s);
347 if (h->unsrt[i].ali->rfline != NULL) x++;
348 if (h->unsrt[i].ali->csline != NULL) x++;
349 if (h->unsrt[i].ali->model != NULL) x++;
350 if (h->unsrt[i].ali->mline != NULL) x++;
351 if (h->unsrt[i].ali->aseq != NULL) x++;
352 memused += x * (h->unsrt[i].ali->len + 1);
354 if (h->unsrt[i].ali->query != NULL)
355 memused += strlen(h->unsrt[i].ali->query) + 1;
356 if (h->unsrt[i].ali->target != NULL)
357 memused += strlen(h->unsrt[i].ali->target) + 1;
361 /* Count how many hits actually satisfy the E cutoff.
364 for (i = 0; i < h->num; i++)
366 if (h->hit[i]->pvalue * (double) nseq >= E) break;
370 /* Format and print a summary
372 printf("tophits_s report:\n");
373 printf(" Total hits: %d\n", h->num);
374 printf(" Satisfying E cutoff: %d\n", n);
375 printf(" Total memory: %dK\n", memused / 1000);