/***************************************************************** * SQUID - a library of functions for biological sequence analysis * Copyright (C) 1992-2002 Washington University School of Medicine * * This source code is freely distributed under the terms of the * GNU General Public License. See the files COPYRIGHT and LICENSE * for details. *****************************************************************/ /* weight.c * SRE, Thu Mar 3 07:56:01 1994 * * Calculate weights for sequences in an alignment. * RCS $Id: weight.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: weight.c,v 1.9 2002/10/09 14:26:09 eddy Exp) */ #include #include #include "squid.h" #include "sre_random.h" static void upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node); static void downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, float *fwt, int node); static float simple_distance(char *s1, char *s2); static int simple_diffmx(char **aseqs,int num, float ***ret_dmx); /* Function: GSCWeights() * * Purpose: Use Erik's tree-based algorithm to set weights for * sequences in an alignment. upweight() and downweight() * are derived from Graeme Mitchison's code. * * Args: aseq - array of (0..nseq-1) aligned sequences * nseq - number of seqs in alignment * alen - length of alignment * wgt - allocated [0..nseq-1] array of weights to be returned * * Return: (void) * wgt is filled in. */ void GSCWeights(char **aseq, int nseq, int alen, float *wgt) { float **dmx; /* distance (difference) matrix */ struct phylo_s *tree; float *lwt, *rwt; /* weight on left, right of this tree node */ float *fwt; /* final weight assigned to this node */ int i; /* Sanity check first */ if (nseq == 1) { wgt[0] = 1.0; return; } /* I use a simple fractional difference matrix derived by * pairwise identity. Perhaps I should include a Poisson * distance correction. */ MakeDiffMx(aseq, nseq, &dmx); if (! Cluster(dmx, nseq, CLUSTER_MIN, &tree)) Die("Cluster() failed"); /* Allocations */ lwt = MallocOrDie (sizeof(float) * (2 * nseq - 1)); rwt = MallocOrDie (sizeof(float) * (2 * nseq - 1)); fwt = MallocOrDie (sizeof(float) * (2 * nseq - 1)); /* lwt and rwt are the total branch weight to the left and * right of a node or sequence. They are 0..2N-2. 0..N-1 are * the sequences; these have weight 0. N..2N-2 are the actual * tree nodes. */ for (i = 0; i < nseq; i++) lwt[i] = rwt[i] = 0.0; /* recursively calculate rwt, lwt, starting at node nseq (the root) */ upweight(tree, nseq, lwt, rwt, nseq); /* recursively distribute weight across the tree */ fwt[nseq] = nseq; downweight(tree, nseq, lwt, rwt, fwt, nseq); /* collect the weights */ for (i = 0; i < nseq; i++) wgt[i] = fwt[i]; FMX2Free(dmx); FreePhylo(tree, nseq); free(lwt); free(rwt); free(fwt); } static void upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node) { int ld,rd; ld = tree[node-nseq].left; if (ld >= nseq) upweight(tree, nseq, lwt, rwt, ld); rd = tree[node-nseq].right; if (rd >= nseq) upweight(tree, nseq, lwt, rwt, rd); lwt[node] = lwt[ld] + rwt[ld] + tree[node-nseq].lblen; rwt[node] = lwt[rd] + rwt[rd] + tree[node-nseq].rblen; } static void downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, float *fwt, int node) { int ld,rd; float lnum, rnum; ld = tree[node-nseq].left; rd = tree[node-nseq].right; if (lwt[node] + rwt[node] > 0.0) { fwt[ld] = fwt[node] * (lwt[node] / (lwt[node] + rwt[node])); fwt[rd] = fwt[node] * (rwt[node] / (lwt[node] + rwt[node])); } else { lnum = (ld >= nseq) ? tree[ld-nseq].incnum : 1.0; rnum = (rd >= nseq) ? tree[rd-nseq].incnum : 1.0; fwt[ld] = fwt[node] * lnum / (lnum + rnum); fwt[rd] = fwt[node] * rnum / (lnum + rnum); } if (ld >= nseq) downweight(tree, nseq, lwt, rwt, fwt, ld); if (rd >= nseq) downweight(tree, nseq, lwt, rwt, fwt, rd); } /* Function: VoronoiWeights() * * Purpose: Calculate weights using the scheme of Sibbald & * Argos (JMB 216:813-818 1990). The scheme is * slightly modified because the original algorithm * actually doesn't work on gapped alignments. * The sequences are assumed to be protein. * * Args: aseq - array of (0..nseq-1) aligned sequences * nseq - number of sequences * alen - length of alignment * wgt - allocated [0..nseq-1] array of weights to be returned * * Return: void * wgt is filled in. */ void VoronoiWeights(char **aseq, int nseq, int alen, float *wgt) { float **dmx; /* distance (difference) matrix */ float *halfmin; /* 1/2 minimum distance to other seqs */ char **psym; /* symbols seen in each column */ int *nsym; /* # syms seen in each column */ int symseen[27]; /* flags for observed syms */ char *randseq; /* randomly generated sequence */ int acol; /* pos in aligned columns */ int idx; /* index in sequences */ int symidx; /* 0..25 index for symbol */ int i; /* generic counter */ float min; /* minimum distance */ float dist; /* distance between random and real */ float challenge, champion; /* for resolving ties */ int itscale; /* how many iterations per seq */ int iteration; int best; /* index of nearest real sequence */ /* Sanity check first */ if (nseq == 1) { wgt[0] = 1.0; return; } itscale = 50; /* Precalculate 1/2 minimum distance to other * sequences for each sequence */ if (! simple_diffmx(aseq, nseq, &dmx)) Die("simple_diffmx() failed"); halfmin = MallocOrDie (sizeof(float) * nseq); for (idx = 0; idx < nseq; idx++) { for (min = 1.0, i = 0; i < nseq; i++) { if (i == idx) continue; if (dmx[idx][i] < min) min = dmx[idx][i]; } halfmin[idx] = min / 2.0; } Free2DArray((void **) dmx, nseq); /* Set up the random sequence generating model. */ psym = MallocOrDie (alen * sizeof(char *)); nsym = MallocOrDie (alen * sizeof(int)); for (acol = 0; acol < alen; acol++) psym[acol] = MallocOrDie (27 * sizeof(char)); /* #ifdef ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */ for (acol = 0; acol < alen; acol++) { memset(symseen, 0, sizeof(int) * 27); for (idx = 0; idx < nseq; idx++) if (! isgap(aseq[idx][acol])) { if (isupper((int) aseq[idx][acol])) symidx = aseq[idx][acol] - 'A'; else symidx = aseq[idx][acol] - 'a'; if (symidx >= 0 && symidx < 26) symseen[symidx] = 1; } else symseen[26] = 1; /* a gap */ for (nsym[acol] = 0, i = 0; i < 26; i++) if (symseen[i]) { psym[acol][nsym[acol]] = 'A'+i; nsym[acol]++; } if (symseen[26]) { psym[acol][nsym[acol]] = ' '; nsym[acol]++; } } /* #endif ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */ /* Note: the original Sibbald&Argos algorithm calls for * bounding the sampled space using a template-like random * sequence generator. However, this leads to one minor * and one major problem. The minor problem is that * exceptional amino acids in a column can have a * significant effect by altering the amount of sampled * sequence space; the larger the data set, the worse * this problem becomes. The major problem is that * there is no reasonable way to deal with gaps. * Gapped sequences simply inhabit a different dimensionality * and it's pretty painful to imagine calculating Voronoi * volumes when the N in your N-space is varying. * Note that all the examples shown by Sibbald and Argos * are *ungapped* examples. * * The best way I've found to circumvent this problem is * just not to bound the sampled space; count gaps as * symbols and generate completely random sequences. */ #ifdef ALL_SEQUENCE_SPACE for (acol = 0; acol < alen; acol++) { strcpy(psym[acol], "ACDEFGHIKLMNPQRSTVWY "); nsym[acol] = 21; } #endif /* Sibbald and Argos algorithm: * 1) assign all seqs weight 0. * 2) generate a "random" sequence * 3) calculate distance to every other sequence * (if we get a distance < 1/2 minimum distance * to other real seqs, we can stop) * 4) if unique closest sequence, increment its weight 1. * if multiple closest seq, choose one randomly * 5) repeat 2-4 for lots of iterations * 6) normalize all weights to sum to nseq. */ randseq = MallocOrDie ((alen+1) * sizeof(char)); best = 42.; /* solely to silence GCC uninit warnings. */ FSet(wgt, nseq, 0.0); for (iteration = 0; iteration < itscale * nseq; iteration++) { for (acol = 0; acol < alen; acol++) randseq[acol] = (nsym[acol] == 0) ? ' ' : psym[acol][CHOOSE(nsym[acol])]; randseq[acol] = '\0'; champion = sre_random(); for (min = 1.0, idx = 0; idx < nseq; idx++) { dist = simple_distance(aseq[idx], randseq); if (dist < halfmin[idx]) { best = idx; break; } if (dist < min) { champion = sre_random(); best = idx; min = dist; } else if (dist == min) { challenge = sre_random(); if (challenge > champion) { champion = challenge; best = idx; min = dist; } } } wgt[best] += 1.0; } for (idx = 0; idx < nseq; idx++) wgt[idx] = wgt[idx] / (float) itscale; free(randseq); free(nsym); free(halfmin); Free2DArray((void **) psym, alen); } /* Function: simple_distance() * * Purpose: For two identical-length null-terminated strings, return * the fractional difference between them. (0..1) * (Gaps don't count toward anything.) */ static float simple_distance(char *s1, char *s2) { int diff = 0; int valid = 0; for (; *s1 != '\0'; s1++, s2++) { if (isgap(*s1) || isgap(*s2)) continue; if (*s1 != *s2) diff++; valid++; } return (valid > 0 ? ((float) diff / (float) valid) : 0.0); } /* Function: simple_diffmx() * * Purpose: Given a set of flushed, aligned sequences, construct * an NxN fractional difference matrix using the * simple_distance rule. * * Args: aseqs - flushed, aligned sequences * num - number of aseqs * ret_dmx - RETURN: difference matrix (caller must free) * * Return: 1 on success, 0 on failure. */ static int simple_diffmx(char **aseqs, int num, float ***ret_dmx) { float **dmx; /* RETURN: distance matrix */ int i,j; /* counters over sequences */ /* Allocate */ if ((dmx = (float **) malloc (sizeof(float *) * num)) == NULL) Die("malloc failed"); for (i = 0; i < num; i++) if ((dmx[i] = (float *) malloc (sizeof(float) * num)) == NULL) Die("malloc failed"); /* Calculate distances, symmetric matrix */ for (i = 0; i < num; i++) for (j = i; j < num; j++) dmx[i][j] = dmx[j][i] = simple_distance(aseqs[i], aseqs[j]); /* Return */ *ret_dmx = dmx; return 1; } /* Function: BlosumWeights() * Date: SRE, Fri Jul 16 17:33:59 1999 (St. Louis) * * Purpose: Assign weights to a set of aligned sequences * using the BLOSUM rule: * - do single linkage clustering at some pairwise identity * - in each cluster, give each sequence 1/clustsize * total weight. * * The clusters have no pairwise link >= maxid. * * O(N) in memory. Probably ~O(NlogN) in time; O(N^2) * in worst case, which is no links between sequences * (e.g., values of maxid near 1.0). * * Args: aseqs - alignment * nseq - number of seqs in alignment * alen - # of columns in alignment * maxid - fractional identity (e.g. 0.62 for BLOSUM62) * wgt - [0..nseq-1] array of weights to be returned */ void BlosumWeights(char **aseqs, int nseq, int alen, float maxid, float *wgt) { int *c, nc; int *nmem; /* number of seqs in each cluster */ int i; /* loop counter */ SingleLinkCluster(aseqs, nseq, alen, maxid, &c, &nc); FSet(wgt, nseq, 1.0); nmem = MallocOrDie(sizeof(int) * nc); for (i = 0; i < nc; i++) nmem[i] = 0; for (i = 0; i < nseq; i++) nmem[c[i]]++; for (i = 0; i < nseq; i++) wgt[i] = 1. / (float) nmem[c[i]]; free(nmem); free(c); return; } /* Function: PositionBasedWeights() * Date: SRE, Fri Jul 16 17:47:22 1999 [St. Louis] * * Purpose: Implementation of Henikoff and Henikoff position-based * weights (JMB 243:574-578, 1994) [Henikoff94b]. * * A significant advantage of this approach that Steve and Jorja * don't point out is that it is O(N) in memory, unlike * many other approaches like GSC weights or Voronoi. * * A potential disadvantage that they don't point out * is that in the theoretical limit of infinite sequences * in the alignment, weights go flat: eventually every * column has at least one representative of each of 20 aa (or 4 nt) * in it. * * They also don't give a rule for how to handle gaps. * The rule used here seems the obvious and sensible one * (ignore them). This means that longer sequences * initially get more weight; hence a "double * normalization" in which the weights are first divided * by sequence length (to compensate for that effect), * then normalized to sum to nseq. * * Limitations: * Implemented in a way that's alphabet-independent: * it uses the 26 upper case letters as "residues". * Any alphabetic character in aseq is interpreted as * a unique "residue" (case insensitively; lower case * mapped to upper case). All other characters are * interpreted as gaps. * * This way, we don't have to pass around any alphabet * type info (DNA vs. RNA vs. protein) and don't have * to deal with remapping IUPAC degenerate codes * probabilistically. However, on the down side, * a sequence with a lot of degenerate IUPAC characters * will get an artifactually high PB weight. * * Args: aseq - sequence alignment to weight * nseq - number of sequences in alignment * alen - length of alignment * wgt - RETURN: weights filled in (pre-allocated 0..nseq-1) * * Returns: (void) * wgt is allocated (0..nseq-1) by caller, and filled in here. */ void PositionBasedWeights(char **aseq, int nseq, int alen, float *wgt) { int rescount[26]; /* count of A-Z residues in a column */ int nres; /* number of different residues in col */ int idx, pos; /* indices into aseq */ int x; float norm; FSet(wgt, nseq, 0.0); for (pos = 0; pos < alen; pos++) { for (x = 0; x < 26; x++) rescount[x] = 0; for (idx = 0; idx < nseq; idx++) if (isalpha(aseq[idx][pos])) rescount[toupper(aseq[idx][pos]) - 'A'] ++; nres = 0; for (x = 0; x < 26; x++) if (rescount[x] > 0) nres++; for (idx = 0; idx < nseq; idx++) if (isalpha(aseq[idx][pos])) wgt[idx] += 1. / (float) (nres * rescount[toupper(aseq[idx][pos]) - 'A']); } for (idx = 0; idx < nseq; idx++) wgt[idx] /= (float) DealignedLength(aseq[idx]); norm = (float) nseq / FSum(wgt, nseq); FScale(wgt, nseq, norm); return; } /* Function: FilterAlignment() * Date: SRE, Wed Jun 30 09:19:30 1999 [St. Louis] * * Purpose: Constructs a new alignment by removing near-identical * sequences from a given alignment (where identity is * calculated *based on the alignment*). * Does not affect the given alignment. * Keeps earlier sequence, discards later one. * * Usually called as an ad hoc sequence "weighting" mechanism. * * Limitations: * Unparsed Stockholm markup is not propagated into the * new alignment. * * Args: msa -- original alignment * cutoff -- fraction identity cutoff. 0.8 removes sequences > 80% id. * ret_new -- RETURN: new MSA, usually w/ fewer sequences * * Return: (void) * ret_new must be free'd by caller: MSAFree(). */ void FilterAlignment(MSA *msa, float cutoff, MSA **ret_new) { int nnew; /* number of seqs in new alignment */ int *list; int *useme; float ident; int i,j; int remove; /* find which seqs to keep (list) */ /* diff matrix; allow ragged ends */ list = MallocOrDie (sizeof(int) * msa->nseq); useme = MallocOrDie (sizeof(int) * msa->nseq); for (i = 0; i < msa->nseq; i++) useme[i] = FALSE; nnew = 0; for (i = 0; i < msa->nseq; i++) { remove = FALSE; for (j = 0; j < nnew; j++) { ident = PairwiseIdentity(msa->aseq[i], msa->aseq[list[j]]); if (ident > cutoff) { remove = TRUE; printf("removing %12s -- fractional identity %.2f to %s\n", msa->sqname[i], ident, msa->sqname[list[j]]); break; } } if (remove == FALSE) { list[nnew++] = i; useme[i] = TRUE; } } MSASmallerAlignment(msa, useme, ret_new); free(list); free(useme); return; } /* Function: SampleAlignment() * Date: SRE, Wed Jun 30 10:13:56 1999 [St. Louis] * * Purpose: Constructs a new, smaller alignment by sampling a given * number of sequences at random. Does not change the * alignment nor the order of the sequences. * * If you ask for a sample that is larger than nseqs, * it silently returns the original alignment. * * Not really a weighting method, but this is as good * a place as any to keep it, since it's similar in * construction to FilterAlignment(). * * Args: msa -- original alignment * sample -- number of sequences in new alignment (0 < sample <= nseq) * ret_new -- RETURN: new MSA * * Return: (void) * ret_new must be free'd by caller: MSAFree(). */ void SampleAlignment(MSA *msa, int sample, MSA **ret_new) { int *list; /* array for random selection w/o replace */ int *useme; /* array of flags 0..nseq-1: TRUE to use */ int i, idx; int len; /* Allocations */ list = (int *) MallocOrDie (sizeof(int) * msa->nseq); useme = (int *) MallocOrDie (sizeof(int) * msa->nseq); for (i = 0; i < msa->nseq; i++) { list[i] = i; useme[i] = FALSE; } /* Sanity check. */ if (sample >= msa->nseq) sample = msa->nseq; /* random selection w/o replacement */ for (len = msa->nseq, i = 0; i < sample; i++) { idx = CHOOSE(len); printf("chose %d: %s\n", list[idx], msa->sqname[list[idx]]); useme[list[idx]] = TRUE; list[idx] = list[--len]; } MSASmallerAlignment(msa, useme, ret_new); free(list); free(useme); return; } /* Function: SingleLinkCluster() * Date: SRE, Fri Jul 16 15:02:57 1999 [St. Louis] * * Purpose: Perform simple single link clustering of seqs in a * sequence alignment. A pairwise identity threshold * defines whether two sequences are linked or not. * * Important: runs in O(N) memory, unlike standard * graph decomposition algorithms that use O(N^2) * adjacency matrices or adjacency lists. Requires * O(N^2) time in worst case (which is when you have * no links at all), O(NlogN) in "average" * case, and O(N) in best case (when there is just * one cluster in a completely connected graph. * * (Developed because hmmbuild could no longer deal * with GP120, a 16,013 sequence alignment.) * * Limitations: * CASE-SENSITIVE. Assumes aseq have been put into * either all lower or all upper case; or at least, * within a column, there's no mixed case. * * Algorithm: * I don't know if this algorithm is published. I * haven't seen it in graph theory books, but that might * be because it's so obvious that nobody's bothered. * * In brief, we're going to do a breadth-first search * of the graph, and we're going to calculate links * on the fly rather than precalculating them into * some sort of standard adjacency structure. * * While working, we keep two stacks of maximum length N: * a : list of vertices that are still unconnected. * b : list of vertices that we've connected to * in our current breadth level, but we haven't * yet tested for other connections to a. * The current length (number of elements in) a and b are * kept in na, nb. * * We store our results in an array of length N: * c : assigns each vertex to a component. for example * c[4] = 1 means that vertex 4 is in component 1. * nc is the number of components. Components * are numbered from 0 to nc-1. We return c and nc * to our caller. * * The algorithm is: * * Initialisation: * a <-- all the vertices * na <-- N * b <-- empty set * nb <-- 0 * nc <-- 0 * * Then: * while (a is not empty) * pop a vertex off a, push onto b * while (b is not empty) * pop vertex v off b * assign c[v] = nc * for each vertex w in a: * compare v,w. If w is linked to v, remove w * from a, push onto b. * nc++ * q.e.d. :) * * Args: aseq - aligned sequences * nseq - number of sequences in aseq * alen - alignment length * maxid - fractional identity threshold 0..1. if id >= maxid, seqs linked * ret_c - RETURN: 0..nseq-1 assignments of seqs to components (clusters) * ret_nc - RETURN: number of components * * Returns: void. * ret_c is allocated here. Caller free's with free(*ret_c) */ void SingleLinkCluster(char **aseq, int nseq, int alen, float maxid, int **ret_c, int *ret_nc) { int *a, na; /* stack of available vertices */ int *b, nb; /* stack of working vertices */ int *c; /* array of results */ int nc; /* total number of components */ int v,w; /* index of a working vertices */ int i; /* loop counter */ /* allocations and initializations */ a = MallocOrDie (sizeof(int) * nseq); b = MallocOrDie (sizeof(int) * nseq); c = MallocOrDie (sizeof(int) * nseq); for (i = 0; i < nseq; i++) a[i] = i; na = nseq; nb = 0; nc = 0; /* Main algorithm */ while (na > 0) { v = a[na-1]; na--; /* pop a vertex off a, */ b[nb] = v; nb++; /* and push onto b */ while (nb > 0) { v = b[nb-1]; nb--; /* pop vertex off b */ c[v] = nc; /* assign it to component nc */ for (i = na-1; i >= 0; i--)/* backwards, becase of deletion/swapping we do*/ if (simple_distance(aseq[v], aseq[a[i]]) <= 1. - maxid) /* linked? */ { w = a[i]; a[i] = a[na-1]; na--; /* delete w from a (note swap) */ b[nb] = w; nb++; /* push w onto b */ } } nc++; } /* Cleanup and return */ free(a); free(b); *ret_c = c; *ret_nc = nc; return; }