1 /*****************************************************************
2 * SQUID - a library of functions for biological sequence analysis
3 * Copyright (C) 1992-2002 Washington University School of Medicine
5 * This source code is freely distributed under the terms of the
6 * GNU General Public License. See the files COPYRIGHT and LICENSE
8 *****************************************************************/
11 * SRE, Thu Mar 3 07:56:01 1994
13 * Calculate weights for sequences in an alignment.
14 * RCS $Id: weight.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: weight.c,v 1.9 2002/10/09 14:26:09 eddy Exp)
20 #include "sre_random.h"
22 static void upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node);
23 static void downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt,
24 float *fwt, int node);
25 static float simple_distance(char *s1, char *s2);
26 static int simple_diffmx(char **aseqs,int num, float ***ret_dmx);
28 /* Function: GSCWeights()
30 * Purpose: Use Erik's tree-based algorithm to set weights for
31 * sequences in an alignment. upweight() and downweight()
32 * are derived from Graeme Mitchison's code.
34 * Args: aseq - array of (0..nseq-1) aligned sequences
35 * nseq - number of seqs in alignment
36 * alen - length of alignment
37 * wgt - allocated [0..nseq-1] array of weights to be returned
43 GSCWeights(char **aseq, int nseq, int alen, float *wgt)
45 float **dmx; /* distance (difference) matrix */
47 float *lwt, *rwt; /* weight on left, right of this tree node */
48 float *fwt; /* final weight assigned to this node */
53 if (nseq == 1) { wgt[0] = 1.0; return; }
55 /* I use a simple fractional difference matrix derived by
56 * pairwise identity. Perhaps I should include a Poisson
57 * distance correction.
59 MakeDiffMx(aseq, nseq, &dmx);
60 if (! Cluster(dmx, nseq, CLUSTER_MIN, &tree)) Die("Cluster() failed");
64 lwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));
65 rwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));
66 fwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));
68 /* lwt and rwt are the total branch weight to the left and
69 * right of a node or sequence. They are 0..2N-2. 0..N-1 are
70 * the sequences; these have weight 0. N..2N-2 are the actual
73 for (i = 0; i < nseq; i++)
74 lwt[i] = rwt[i] = 0.0;
75 /* recursively calculate rwt, lwt, starting
76 at node nseq (the root) */
77 upweight(tree, nseq, lwt, rwt, nseq);
79 /* recursively distribute weight across the
82 downweight(tree, nseq, lwt, rwt, fwt, nseq);
83 /* collect the weights */
84 for (i = 0; i < nseq; i++)
88 FreePhylo(tree, nseq);
89 free(lwt); free(rwt); free(fwt);
93 upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node)
97 ld = tree[node-nseq].left;
98 if (ld >= nseq) upweight(tree, nseq, lwt, rwt, ld);
99 rd = tree[node-nseq].right;
100 if (rd >= nseq) upweight(tree, nseq, lwt, rwt, rd);
101 lwt[node] = lwt[ld] + rwt[ld] + tree[node-nseq].lblen;
102 rwt[node] = lwt[rd] + rwt[rd] + tree[node-nseq].rblen;
107 downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, float *fwt, int node)
112 ld = tree[node-nseq].left;
113 rd = tree[node-nseq].right;
114 if (lwt[node] + rwt[node] > 0.0)
116 fwt[ld] = fwt[node] * (lwt[node] / (lwt[node] + rwt[node]));
117 fwt[rd] = fwt[node] * (rwt[node] / (lwt[node] + rwt[node]));
121 lnum = (ld >= nseq) ? tree[ld-nseq].incnum : 1.0;
122 rnum = (rd >= nseq) ? tree[rd-nseq].incnum : 1.0;
123 fwt[ld] = fwt[node] * lnum / (lnum + rnum);
124 fwt[rd] = fwt[node] * rnum / (lnum + rnum);
127 if (ld >= nseq) downweight(tree, nseq, lwt, rwt, fwt, ld);
128 if (rd >= nseq) downweight(tree, nseq, lwt, rwt, fwt, rd);
134 /* Function: VoronoiWeights()
136 * Purpose: Calculate weights using the scheme of Sibbald &
137 * Argos (JMB 216:813-818 1990). The scheme is
138 * slightly modified because the original algorithm
139 * actually doesn't work on gapped alignments.
140 * The sequences are assumed to be protein.
142 * Args: aseq - array of (0..nseq-1) aligned sequences
143 * nseq - number of sequences
144 * alen - length of alignment
145 * wgt - allocated [0..nseq-1] array of weights to be returned
151 VoronoiWeights(char **aseq, int nseq, int alen, float *wgt)
153 float **dmx; /* distance (difference) matrix */
154 float *halfmin; /* 1/2 minimum distance to other seqs */
155 char **psym; /* symbols seen in each column */
156 int *nsym; /* # syms seen in each column */
157 int symseen[27]; /* flags for observed syms */
158 char *randseq; /* randomly generated sequence */
159 int acol; /* pos in aligned columns */
160 int idx; /* index in sequences */
161 int symidx; /* 0..25 index for symbol */
162 int i; /* generic counter */
163 float min; /* minimum distance */
164 float dist; /* distance between random and real */
165 float challenge, champion; /* for resolving ties */
166 int itscale; /* how many iterations per seq */
168 int best; /* index of nearest real sequence */
170 /* Sanity check first
172 if (nseq == 1) { wgt[0] = 1.0; return; }
176 /* Precalculate 1/2 minimum distance to other
177 * sequences for each sequence
179 if (! simple_diffmx(aseq, nseq, &dmx))
180 Die("simple_diffmx() failed");
181 halfmin = MallocOrDie (sizeof(float) * nseq);
182 for (idx = 0; idx < nseq; idx++)
184 for (min = 1.0, i = 0; i < nseq; i++)
186 if (i == idx) continue;
187 if (dmx[idx][i] < min) min = dmx[idx][i];
189 halfmin[idx] = min / 2.0;
191 Free2DArray((void **) dmx, nseq);
193 /* Set up the random sequence generating model.
195 psym = MallocOrDie (alen * sizeof(char *));
196 nsym = MallocOrDie (alen * sizeof(int));
197 for (acol = 0; acol < alen; acol++)
198 psym[acol] = MallocOrDie (27 * sizeof(char));
200 /* #ifdef ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */
201 for (acol = 0; acol < alen; acol++)
203 memset(symseen, 0, sizeof(int) * 27);
204 for (idx = 0; idx < nseq; idx++)
205 if (! isgap(aseq[idx][acol]))
207 if (isupper((int) aseq[idx][acol]))
208 symidx = aseq[idx][acol] - 'A';
210 symidx = aseq[idx][acol] - 'a';
211 if (symidx >= 0 && symidx < 26)
215 symseen[26] = 1; /* a gap */
217 for (nsym[acol] = 0, i = 0; i < 26; i++)
220 psym[acol][nsym[acol]] = 'A'+i;
223 if (symseen[26]) { psym[acol][nsym[acol]] = ' '; nsym[acol]++; }
225 /* #endif ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */
227 /* Note: the original Sibbald&Argos algorithm calls for
228 * bounding the sampled space using a template-like random
229 * sequence generator. However, this leads to one minor
230 * and one major problem. The minor problem is that
231 * exceptional amino acids in a column can have a
232 * significant effect by altering the amount of sampled
233 * sequence space; the larger the data set, the worse
234 * this problem becomes. The major problem is that
235 * there is no reasonable way to deal with gaps.
236 * Gapped sequences simply inhabit a different dimensionality
237 * and it's pretty painful to imagine calculating Voronoi
238 * volumes when the N in your N-space is varying.
239 * Note that all the examples shown by Sibbald and Argos
240 * are *ungapped* examples.
242 * The best way I've found to circumvent this problem is
243 * just not to bound the sampled space; count gaps as
244 * symbols and generate completely random sequences.
246 #ifdef ALL_SEQUENCE_SPACE
247 for (acol = 0; acol < alen; acol++)
249 strcpy(psym[acol], "ACDEFGHIKLMNPQRSTVWY ");
254 /* Sibbald and Argos algorithm:
255 * 1) assign all seqs weight 0.
256 * 2) generate a "random" sequence
257 * 3) calculate distance to every other sequence
258 * (if we get a distance < 1/2 minimum distance
259 * to other real seqs, we can stop)
260 * 4) if unique closest sequence, increment its weight 1.
261 * if multiple closest seq, choose one randomly
262 * 5) repeat 2-4 for lots of iterations
263 * 6) normalize all weights to sum to nseq.
265 randseq = MallocOrDie ((alen+1) * sizeof(char));
267 best = 42.; /* solely to silence GCC uninit warnings. */
268 FSet(wgt, nseq, 0.0);
269 for (iteration = 0; iteration < itscale * nseq; iteration++)
271 for (acol = 0; acol < alen; acol++)
272 randseq[acol] = (nsym[acol] == 0) ? ' ' : psym[acol][CHOOSE(nsym[acol])];
273 randseq[acol] = '\0';
275 champion = sre_random();
276 for (min = 1.0, idx = 0; idx < nseq; idx++)
278 dist = simple_distance(aseq[idx], randseq);
279 if (dist < halfmin[idx])
285 { champion = sre_random(); best = idx; min = dist; }
286 else if (dist == min)
288 challenge = sre_random();
289 if (challenge > champion)
290 { champion = challenge; best = idx; min = dist; }
296 for (idx = 0; idx < nseq; idx++)
297 wgt[idx] = wgt[idx] / (float) itscale;
302 Free2DArray((void **) psym, alen);
306 /* Function: simple_distance()
308 * Purpose: For two identical-length null-terminated strings, return
309 * the fractional difference between them. (0..1)
310 * (Gaps don't count toward anything.)
313 simple_distance(char *s1, char *s2)
318 for (; *s1 != '\0'; s1++, s2++)
320 if (isgap(*s1) || isgap(*s2)) continue;
321 if (*s1 != *s2) diff++;
324 return (valid > 0 ? ((float) diff / (float) valid) : 0.0);
327 /* Function: simple_diffmx()
329 * Purpose: Given a set of flushed, aligned sequences, construct
330 * an NxN fractional difference matrix using the
331 * simple_distance rule.
333 * Args: aseqs - flushed, aligned sequences
334 * num - number of aseqs
335 * ret_dmx - RETURN: difference matrix (caller must free)
337 * Return: 1 on success, 0 on failure.
340 simple_diffmx(char **aseqs,
344 float **dmx; /* RETURN: distance matrix */
345 int i,j; /* counters over sequences */
349 if ((dmx = (float **) malloc (sizeof(float *) * num)) == NULL)
350 Die("malloc failed");
351 for (i = 0; i < num; i++)
352 if ((dmx[i] = (float *) malloc (sizeof(float) * num)) == NULL)
353 Die("malloc failed");
355 /* Calculate distances, symmetric matrix
357 for (i = 0; i < num; i++)
358 for (j = i; j < num; j++)
359 dmx[i][j] = dmx[j][i] = simple_distance(aseqs[i], aseqs[j]);
369 /* Function: BlosumWeights()
370 * Date: SRE, Fri Jul 16 17:33:59 1999 (St. Louis)
372 * Purpose: Assign weights to a set of aligned sequences
373 * using the BLOSUM rule:
374 * - do single linkage clustering at some pairwise identity
375 * - in each cluster, give each sequence 1/clustsize
378 * The clusters have no pairwise link >= maxid.
380 * O(N) in memory. Probably ~O(NlogN) in time; O(N^2)
381 * in worst case, which is no links between sequences
382 * (e.g., values of maxid near 1.0).
384 * Args: aseqs - alignment
385 * nseq - number of seqs in alignment
386 * alen - # of columns in alignment
387 * maxid - fractional identity (e.g. 0.62 for BLOSUM62)
388 * wgt - [0..nseq-1] array of weights to be returned
391 BlosumWeights(char **aseqs, int nseq, int alen, float maxid, float *wgt)
394 int *nmem; /* number of seqs in each cluster */
395 int i; /* loop counter */
397 SingleLinkCluster(aseqs, nseq, alen, maxid, &c, &nc);
399 FSet(wgt, nseq, 1.0);
400 nmem = MallocOrDie(sizeof(int) * nc);
402 for (i = 0; i < nc; i++) nmem[i] = 0;
403 for (i = 0; i < nseq; i++) nmem[c[i]]++;
404 for (i = 0; i < nseq; i++) wgt[i] = 1. / (float) nmem[c[i]];
412 /* Function: PositionBasedWeights()
413 * Date: SRE, Fri Jul 16 17:47:22 1999 [St. Louis]
415 * Purpose: Implementation of Henikoff and Henikoff position-based
416 * weights (JMB 243:574-578, 1994) [Henikoff94b].
418 * A significant advantage of this approach that Steve and Jorja
419 * don't point out is that it is O(N) in memory, unlike
420 * many other approaches like GSC weights or Voronoi.
422 * A potential disadvantage that they don't point out
423 * is that in the theoretical limit of infinite sequences
424 * in the alignment, weights go flat: eventually every
425 * column has at least one representative of each of 20 aa (or 4 nt)
428 * They also don't give a rule for how to handle gaps.
429 * The rule used here seems the obvious and sensible one
430 * (ignore them). This means that longer sequences
431 * initially get more weight; hence a "double
432 * normalization" in which the weights are first divided
433 * by sequence length (to compensate for that effect),
434 * then normalized to sum to nseq.
437 * Implemented in a way that's alphabet-independent:
438 * it uses the 26 upper case letters as "residues".
439 * Any alphabetic character in aseq is interpreted as
440 * a unique "residue" (case insensitively; lower case
441 * mapped to upper case). All other characters are
442 * interpreted as gaps.
444 * This way, we don't have to pass around any alphabet
445 * type info (DNA vs. RNA vs. protein) and don't have
446 * to deal with remapping IUPAC degenerate codes
447 * probabilistically. However, on the down side,
448 * a sequence with a lot of degenerate IUPAC characters
449 * will get an artifactually high PB weight.
451 * Args: aseq - sequence alignment to weight
452 * nseq - number of sequences in alignment
453 * alen - length of alignment
454 * wgt - RETURN: weights filled in (pre-allocated 0..nseq-1)
457 * wgt is allocated (0..nseq-1) by caller, and filled in here.
460 PositionBasedWeights(char **aseq, int nseq, int alen, float *wgt)
462 int rescount[26]; /* count of A-Z residues in a column */
463 int nres; /* number of different residues in col */
464 int idx, pos; /* indices into aseq */
468 FSet(wgt, nseq, 0.0);
469 for (pos = 0; pos < alen; pos++)
471 for (x = 0; x < 26; x++) rescount[x] = 0;
472 for (idx = 0; idx < nseq; idx++)
473 if (isalpha(aseq[idx][pos]))
474 rescount[toupper(aseq[idx][pos]) - 'A'] ++;
477 for (x = 0; x < 26; x++)
478 if (rescount[x] > 0) nres++;
480 for (idx = 0; idx < nseq; idx++)
481 if (isalpha(aseq[idx][pos]))
482 wgt[idx] += 1. / (float) (nres * rescount[toupper(aseq[idx][pos]) - 'A']);
485 for (idx = 0; idx < nseq; idx++)
486 wgt[idx] /= (float) DealignedLength(aseq[idx]);
487 norm = (float) nseq / FSum(wgt, nseq);
488 FScale(wgt, nseq, norm);
495 /* Function: FilterAlignment()
496 * Date: SRE, Wed Jun 30 09:19:30 1999 [St. Louis]
498 * Purpose: Constructs a new alignment by removing near-identical
499 * sequences from a given alignment (where identity is
500 * calculated *based on the alignment*).
501 * Does not affect the given alignment.
502 * Keeps earlier sequence, discards later one.
504 * Usually called as an ad hoc sequence "weighting" mechanism.
507 * Unparsed Stockholm markup is not propagated into the
510 * Args: msa -- original alignment
511 * cutoff -- fraction identity cutoff. 0.8 removes sequences > 80% id.
512 * ret_new -- RETURN: new MSA, usually w/ fewer sequences
515 * ret_new must be free'd by caller: MSAFree().
518 FilterAlignment(MSA *msa, float cutoff, MSA **ret_new)
520 int nnew; /* number of seqs in new alignment */
527 /* find which seqs to keep (list) */
528 /* diff matrix; allow ragged ends */
529 list = MallocOrDie (sizeof(int) * msa->nseq);
530 useme = MallocOrDie (sizeof(int) * msa->nseq);
531 for (i = 0; i < msa->nseq; i++) useme[i] = FALSE;
534 for (i = 0; i < msa->nseq; i++)
537 for (j = 0; j < nnew; j++)
539 ident = PairwiseIdentity(msa->aseq[i], msa->aseq[list[j]]);
543 printf("removing %12s -- fractional identity %.2f to %s\n",
544 msa->sqname[i], ident,
545 msa->sqname[list[j]]);
549 if (remove == FALSE) {
555 MSASmallerAlignment(msa, useme, ret_new);
562 /* Function: SampleAlignment()
563 * Date: SRE, Wed Jun 30 10:13:56 1999 [St. Louis]
565 * Purpose: Constructs a new, smaller alignment by sampling a given
566 * number of sequences at random. Does not change the
567 * alignment nor the order of the sequences.
569 * If you ask for a sample that is larger than nseqs,
570 * it silently returns the original alignment.
572 * Not really a weighting method, but this is as good
573 * a place as any to keep it, since it's similar in
574 * construction to FilterAlignment().
576 * Args: msa -- original alignment
577 * sample -- number of sequences in new alignment (0 < sample <= nseq)
578 * ret_new -- RETURN: new MSA
581 * ret_new must be free'd by caller: MSAFree().
584 SampleAlignment(MSA *msa, int sample, MSA **ret_new)
586 int *list; /* array for random selection w/o replace */
587 int *useme; /* array of flags 0..nseq-1: TRUE to use */
593 list = (int *) MallocOrDie (sizeof(int) * msa->nseq);
594 useme = (int *) MallocOrDie (sizeof(int) * msa->nseq);
595 for (i = 0; i < msa->nseq; i++)
603 if (sample >= msa->nseq) sample = msa->nseq;
605 /* random selection w/o replacement */
606 for (len = msa->nseq, i = 0; i < sample; i++)
609 printf("chose %d: %s\n", list[idx], msa->sqname[list[idx]]);
610 useme[list[idx]] = TRUE;
611 list[idx] = list[--len];
614 MSASmallerAlignment(msa, useme, ret_new);
621 /* Function: SingleLinkCluster()
622 * Date: SRE, Fri Jul 16 15:02:57 1999 [St. Louis]
624 * Purpose: Perform simple single link clustering of seqs in a
625 * sequence alignment. A pairwise identity threshold
626 * defines whether two sequences are linked or not.
628 * Important: runs in O(N) memory, unlike standard
629 * graph decomposition algorithms that use O(N^2)
630 * adjacency matrices or adjacency lists. Requires
631 * O(N^2) time in worst case (which is when you have
632 * no links at all), O(NlogN) in "average"
633 * case, and O(N) in best case (when there is just
634 * one cluster in a completely connected graph.
636 * (Developed because hmmbuild could no longer deal
637 * with GP120, a 16,013 sequence alignment.)
640 * CASE-SENSITIVE. Assumes aseq have been put into
641 * either all lower or all upper case; or at least,
642 * within a column, there's no mixed case.
645 * I don't know if this algorithm is published. I
646 * haven't seen it in graph theory books, but that might
647 * be because it's so obvious that nobody's bothered.
649 * In brief, we're going to do a breadth-first search
650 * of the graph, and we're going to calculate links
651 * on the fly rather than precalculating them into
652 * some sort of standard adjacency structure.
654 * While working, we keep two stacks of maximum length N:
655 * a : list of vertices that are still unconnected.
656 * b : list of vertices that we've connected to
657 * in our current breadth level, but we haven't
658 * yet tested for other connections to a.
659 * The current length (number of elements in) a and b are
662 * We store our results in an array of length N:
663 * c : assigns each vertex to a component. for example
664 * c[4] = 1 means that vertex 4 is in component 1.
665 * nc is the number of components. Components
666 * are numbered from 0 to nc-1. We return c and nc
672 * a <-- all the vertices
679 * while (a is not empty)
680 * pop a vertex off a, push onto b
681 * while (b is not empty)
684 * for each vertex w in a:
685 * compare v,w. If w is linked to v, remove w
686 * from a, push onto b.
690 * Args: aseq - aligned sequences
691 * nseq - number of sequences in aseq
692 * alen - alignment length
693 * maxid - fractional identity threshold 0..1. if id >= maxid, seqs linked
694 * ret_c - RETURN: 0..nseq-1 assignments of seqs to components (clusters)
695 * ret_nc - RETURN: number of components
698 * ret_c is allocated here. Caller free's with free(*ret_c)
701 SingleLinkCluster(char **aseq, int nseq, int alen, float maxid,
702 int **ret_c, int *ret_nc)
704 int *a, na; /* stack of available vertices */
705 int *b, nb; /* stack of working vertices */
706 int *c; /* array of results */
707 int nc; /* total number of components */
708 int v,w; /* index of a working vertices */
709 int i; /* loop counter */
711 /* allocations and initializations
713 a = MallocOrDie (sizeof(int) * nseq);
714 b = MallocOrDie (sizeof(int) * nseq);
715 c = MallocOrDie (sizeof(int) * nseq);
716 for (i = 0; i < nseq; i++) a[i] = i;
725 v = a[na-1]; na--; /* pop a vertex off a, */
726 b[nb] = v; nb++; /* and push onto b */
729 v = b[nb-1]; nb--; /* pop vertex off b */
730 c[v] = nc; /* assign it to component nc */
731 for (i = na-1; i >= 0; i--)/* backwards, becase of deletion/swapping we do*/
732 if (simple_distance(aseq[v], aseq[a[i]]) <= 1. - maxid) /* linked? */
734 w = a[i]; a[i] = a[na-1]; na--; /* delete w from a (note swap) */
735 b[nb] = w; nb++; /* push w onto b */
741 /* Cleanup and return