--- /dev/null
+/*****************************************************************
+ * SQUID - a library of functions for biological sequence analysis
+ * Copyright (C) 1992-2002 Washington University School of Medicine
+ *
+ * This source code is freely distributed under the terms of the
+ * GNU General Public License. See the files COPYRIGHT and LICENSE
+ * for details.
+ *****************************************************************/
+
+/* weight.c
+ * SRE, Thu Mar 3 07:56:01 1994
+ *
+ * Calculate weights for sequences in an alignment.
+ * RCS $Id: weight.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: weight.c,v 1.9 2002/10/09 14:26:09 eddy Exp)
+ */
+
+#include <ctype.h>
+#include <string.h>
+#include "squid.h"
+#include "sre_random.h"
+
+static void upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node);
+static void downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt,
+ float *fwt, int node);
+static float simple_distance(char *s1, char *s2);
+static int simple_diffmx(char **aseqs,int num, float ***ret_dmx);
+
+/* Function: GSCWeights()
+ *
+ * Purpose: Use Erik's tree-based algorithm to set weights for
+ * sequences in an alignment. upweight() and downweight()
+ * are derived from Graeme Mitchison's code.
+ *
+ * Args: aseq - array of (0..nseq-1) aligned sequences
+ * nseq - number of seqs in alignment
+ * alen - length of alignment
+ * wgt - allocated [0..nseq-1] array of weights to be returned
+ *
+ * Return: (void)
+ * wgt is filled in.
+ */
+void
+GSCWeights(char **aseq, int nseq, int alen, float *wgt)
+{
+ float **dmx; /* distance (difference) matrix */
+ struct phylo_s *tree;
+ float *lwt, *rwt; /* weight on left, right of this tree node */
+ float *fwt; /* final weight assigned to this node */
+ int i;
+
+ /* Sanity check first
+ */
+ if (nseq == 1) { wgt[0] = 1.0; return; }
+
+ /* I use a simple fractional difference matrix derived by
+ * pairwise identity. Perhaps I should include a Poisson
+ * distance correction.
+ */
+ MakeDiffMx(aseq, nseq, &dmx);
+ if (! Cluster(dmx, nseq, CLUSTER_MIN, &tree)) Die("Cluster() failed");
+
+ /* Allocations
+ */
+ lwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));
+ rwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));
+ fwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));
+
+ /* lwt and rwt are the total branch weight to the left and
+ * right of a node or sequence. They are 0..2N-2. 0..N-1 are
+ * the sequences; these have weight 0. N..2N-2 are the actual
+ * tree nodes.
+ */
+ for (i = 0; i < nseq; i++)
+ lwt[i] = rwt[i] = 0.0;
+ /* recursively calculate rwt, lwt, starting
+ at node nseq (the root) */
+ upweight(tree, nseq, lwt, rwt, nseq);
+
+ /* recursively distribute weight across the
+ tree */
+ fwt[nseq] = nseq;
+ downweight(tree, nseq, lwt, rwt, fwt, nseq);
+ /* collect the weights */
+ for (i = 0; i < nseq; i++)
+ wgt[i] = fwt[i];
+
+ FMX2Free(dmx);
+ FreePhylo(tree, nseq);
+ free(lwt); free(rwt); free(fwt);
+}
+
+static void
+upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node)
+{
+ int ld,rd;
+
+ ld = tree[node-nseq].left;
+ if (ld >= nseq) upweight(tree, nseq, lwt, rwt, ld);
+ rd = tree[node-nseq].right;
+ if (rd >= nseq) upweight(tree, nseq, lwt, rwt, rd);
+ lwt[node] = lwt[ld] + rwt[ld] + tree[node-nseq].lblen;
+ rwt[node] = lwt[rd] + rwt[rd] + tree[node-nseq].rblen;
+}
+
+
+static void
+downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, float *fwt, int node)
+{
+ int ld,rd;
+ float lnum, rnum;
+
+ ld = tree[node-nseq].left;
+ rd = tree[node-nseq].right;
+ if (lwt[node] + rwt[node] > 0.0)
+ {
+ fwt[ld] = fwt[node] * (lwt[node] / (lwt[node] + rwt[node]));
+ fwt[rd] = fwt[node] * (rwt[node] / (lwt[node] + rwt[node]));
+ }
+ else
+ {
+ lnum = (ld >= nseq) ? tree[ld-nseq].incnum : 1.0;
+ rnum = (rd >= nseq) ? tree[rd-nseq].incnum : 1.0;
+ fwt[ld] = fwt[node] * lnum / (lnum + rnum);
+ fwt[rd] = fwt[node] * rnum / (lnum + rnum);
+ }
+
+ if (ld >= nseq) downweight(tree, nseq, lwt, rwt, fwt, ld);
+ if (rd >= nseq) downweight(tree, nseq, lwt, rwt, fwt, rd);
+}
+
+
+
+
+/* Function: VoronoiWeights()
+ *
+ * Purpose: Calculate weights using the scheme of Sibbald &
+ * Argos (JMB 216:813-818 1990). The scheme is
+ * slightly modified because the original algorithm
+ * actually doesn't work on gapped alignments.
+ * The sequences are assumed to be protein.
+ *
+ * Args: aseq - array of (0..nseq-1) aligned sequences
+ * nseq - number of sequences
+ * alen - length of alignment
+ * wgt - allocated [0..nseq-1] array of weights to be returned
+ *
+ * Return: void
+ * wgt is filled in.
+ */
+void
+VoronoiWeights(char **aseq, int nseq, int alen, float *wgt)
+{
+ float **dmx; /* distance (difference) matrix */
+ float *halfmin; /* 1/2 minimum distance to other seqs */
+ char **psym; /* symbols seen in each column */
+ int *nsym; /* # syms seen in each column */
+ int symseen[27]; /* flags for observed syms */
+ char *randseq; /* randomly generated sequence */
+ int acol; /* pos in aligned columns */
+ int idx; /* index in sequences */
+ int symidx; /* 0..25 index for symbol */
+ int i; /* generic counter */
+ float min; /* minimum distance */
+ float dist; /* distance between random and real */
+ float challenge, champion; /* for resolving ties */
+ int itscale; /* how many iterations per seq */
+ int iteration;
+ int best; /* index of nearest real sequence */
+
+ /* Sanity check first
+ */
+ if (nseq == 1) { wgt[0] = 1.0; return; }
+
+ itscale = 50;
+
+ /* Precalculate 1/2 minimum distance to other
+ * sequences for each sequence
+ */
+ if (! simple_diffmx(aseq, nseq, &dmx))
+ Die("simple_diffmx() failed");
+ halfmin = MallocOrDie (sizeof(float) * nseq);
+ for (idx = 0; idx < nseq; idx++)
+ {
+ for (min = 1.0, i = 0; i < nseq; i++)
+ {
+ if (i == idx) continue;
+ if (dmx[idx][i] < min) min = dmx[idx][i];
+ }
+ halfmin[idx] = min / 2.0;
+ }
+ Free2DArray((void **) dmx, nseq);
+
+ /* Set up the random sequence generating model.
+ */
+ psym = MallocOrDie (alen * sizeof(char *));
+ nsym = MallocOrDie (alen * sizeof(int));
+ for (acol = 0; acol < alen; acol++)
+ psym[acol] = MallocOrDie (27 * sizeof(char));
+
+/* #ifdef ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */
+ for (acol = 0; acol < alen; acol++)
+ {
+ memset(symseen, 0, sizeof(int) * 27);
+ for (idx = 0; idx < nseq; idx++)
+ if (! isgap(aseq[idx][acol]))
+ {
+ if (isupper((int) aseq[idx][acol]))
+ symidx = aseq[idx][acol] - 'A';
+ else
+ symidx = aseq[idx][acol] - 'a';
+ if (symidx >= 0 && symidx < 26)
+ symseen[symidx] = 1;
+ }
+ else
+ symseen[26] = 1; /* a gap */
+
+ for (nsym[acol] = 0, i = 0; i < 26; i++)
+ if (symseen[i])
+ {
+ psym[acol][nsym[acol]] = 'A'+i;
+ nsym[acol]++;
+ }
+ if (symseen[26]) { psym[acol][nsym[acol]] = ' '; nsym[acol]++; }
+ }
+/* #endif ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */
+
+ /* Note: the original Sibbald&Argos algorithm calls for
+ * bounding the sampled space using a template-like random
+ * sequence generator. However, this leads to one minor
+ * and one major problem. The minor problem is that
+ * exceptional amino acids in a column can have a
+ * significant effect by altering the amount of sampled
+ * sequence space; the larger the data set, the worse
+ * this problem becomes. The major problem is that
+ * there is no reasonable way to deal with gaps.
+ * Gapped sequences simply inhabit a different dimensionality
+ * and it's pretty painful to imagine calculating Voronoi
+ * volumes when the N in your N-space is varying.
+ * Note that all the examples shown by Sibbald and Argos
+ * are *ungapped* examples.
+ *
+ * The best way I've found to circumvent this problem is
+ * just not to bound the sampled space; count gaps as
+ * symbols and generate completely random sequences.
+ */
+#ifdef ALL_SEQUENCE_SPACE
+ for (acol = 0; acol < alen; acol++)
+ {
+ strcpy(psym[acol], "ACDEFGHIKLMNPQRSTVWY ");
+ nsym[acol] = 21;
+ }
+#endif
+
+ /* Sibbald and Argos algorithm:
+ * 1) assign all seqs weight 0.
+ * 2) generate a "random" sequence
+ * 3) calculate distance to every other sequence
+ * (if we get a distance < 1/2 minimum distance
+ * to other real seqs, we can stop)
+ * 4) if unique closest sequence, increment its weight 1.
+ * if multiple closest seq, choose one randomly
+ * 5) repeat 2-4 for lots of iterations
+ * 6) normalize all weights to sum to nseq.
+ */
+ randseq = MallocOrDie ((alen+1) * sizeof(char));
+
+ best = 42.; /* solely to silence GCC uninit warnings. */
+ FSet(wgt, nseq, 0.0);
+ for (iteration = 0; iteration < itscale * nseq; iteration++)
+ {
+ for (acol = 0; acol < alen; acol++)
+ randseq[acol] = (nsym[acol] == 0) ? ' ' : psym[acol][CHOOSE(nsym[acol])];
+ randseq[acol] = '\0';
+
+ champion = sre_random();
+ for (min = 1.0, idx = 0; idx < nseq; idx++)
+ {
+ dist = simple_distance(aseq[idx], randseq);
+ if (dist < halfmin[idx])
+ {
+ best = idx;
+ break;
+ }
+ if (dist < min)
+ { champion = sre_random(); best = idx; min = dist; }
+ else if (dist == min)
+ {
+ challenge = sre_random();
+ if (challenge > champion)
+ { champion = challenge; best = idx; min = dist; }
+ }
+ }
+ wgt[best] += 1.0;
+ }
+
+ for (idx = 0; idx < nseq; idx++)
+ wgt[idx] = wgt[idx] / (float) itscale;
+
+ free(randseq);
+ free(nsym);
+ free(halfmin);
+ Free2DArray((void **) psym, alen);
+}
+
+
+/* Function: simple_distance()
+ *
+ * Purpose: For two identical-length null-terminated strings, return
+ * the fractional difference between them. (0..1)
+ * (Gaps don't count toward anything.)
+ */
+static float
+simple_distance(char *s1, char *s2)
+{
+ int diff = 0;
+ int valid = 0;
+
+ for (; *s1 != '\0'; s1++, s2++)
+ {
+ if (isgap(*s1) || isgap(*s2)) continue;
+ if (*s1 != *s2) diff++;
+ valid++;
+ }
+ return (valid > 0 ? ((float) diff / (float) valid) : 0.0);
+}
+
+/* Function: simple_diffmx()
+ *
+ * Purpose: Given a set of flushed, aligned sequences, construct
+ * an NxN fractional difference matrix using the
+ * simple_distance rule.
+ *
+ * Args: aseqs - flushed, aligned sequences
+ * num - number of aseqs
+ * ret_dmx - RETURN: difference matrix (caller must free)
+ *
+ * Return: 1 on success, 0 on failure.
+ */
+static int
+simple_diffmx(char **aseqs,
+ int num,
+ float ***ret_dmx)
+{
+ float **dmx; /* RETURN: distance matrix */
+ int i,j; /* counters over sequences */
+
+ /* Allocate
+ */
+ if ((dmx = (float **) malloc (sizeof(float *) * num)) == NULL)
+ Die("malloc failed");
+ for (i = 0; i < num; i++)
+ if ((dmx[i] = (float *) malloc (sizeof(float) * num)) == NULL)
+ Die("malloc failed");
+
+ /* Calculate distances, symmetric matrix
+ */
+ for (i = 0; i < num; i++)
+ for (j = i; j < num; j++)
+ dmx[i][j] = dmx[j][i] = simple_distance(aseqs[i], aseqs[j]);
+
+ /* Return
+ */
+ *ret_dmx = dmx;
+ return 1;
+}
+
+
+
+/* Function: BlosumWeights()
+ * Date: SRE, Fri Jul 16 17:33:59 1999 (St. Louis)
+ *
+ * Purpose: Assign weights to a set of aligned sequences
+ * using the BLOSUM rule:
+ * - do single linkage clustering at some pairwise identity
+ * - in each cluster, give each sequence 1/clustsize
+ * total weight.
+ *
+ * The clusters have no pairwise link >= maxid.
+ *
+ * O(N) in memory. Probably ~O(NlogN) in time; O(N^2)
+ * in worst case, which is no links between sequences
+ * (e.g., values of maxid near 1.0).
+ *
+ * Args: aseqs - alignment
+ * nseq - number of seqs in alignment
+ * alen - # of columns in alignment
+ * maxid - fractional identity (e.g. 0.62 for BLOSUM62)
+ * wgt - [0..nseq-1] array of weights to be returned
+ */
+void
+BlosumWeights(char **aseqs, int nseq, int alen, float maxid, float *wgt)
+{
+ int *c, nc;
+ int *nmem; /* number of seqs in each cluster */
+ int i; /* loop counter */
+
+ SingleLinkCluster(aseqs, nseq, alen, maxid, &c, &nc);
+
+ FSet(wgt, nseq, 1.0);
+ nmem = MallocOrDie(sizeof(int) * nc);
+
+ for (i = 0; i < nc; i++) nmem[i] = 0;
+ for (i = 0; i < nseq; i++) nmem[c[i]]++;
+ for (i = 0; i < nseq; i++) wgt[i] = 1. / (float) nmem[c[i]];
+
+ free(nmem);
+ free(c);
+ return;
+}
+
+
+/* Function: PositionBasedWeights()
+ * Date: SRE, Fri Jul 16 17:47:22 1999 [St. Louis]
+ *
+ * Purpose: Implementation of Henikoff and Henikoff position-based
+ * weights (JMB 243:574-578, 1994) [Henikoff94b].
+ *
+ * A significant advantage of this approach that Steve and Jorja
+ * don't point out is that it is O(N) in memory, unlike
+ * many other approaches like GSC weights or Voronoi.
+ *
+ * A potential disadvantage that they don't point out
+ * is that in the theoretical limit of infinite sequences
+ * in the alignment, weights go flat: eventually every
+ * column has at least one representative of each of 20 aa (or 4 nt)
+ * in it.
+ *
+ * They also don't give a rule for how to handle gaps.
+ * The rule used here seems the obvious and sensible one
+ * (ignore them). This means that longer sequences
+ * initially get more weight; hence a "double
+ * normalization" in which the weights are first divided
+ * by sequence length (to compensate for that effect),
+ * then normalized to sum to nseq.
+ *
+ * Limitations:
+ * Implemented in a way that's alphabet-independent:
+ * it uses the 26 upper case letters as "residues".
+ * Any alphabetic character in aseq is interpreted as
+ * a unique "residue" (case insensitively; lower case
+ * mapped to upper case). All other characters are
+ * interpreted as gaps.
+ *
+ * This way, we don't have to pass around any alphabet
+ * type info (DNA vs. RNA vs. protein) and don't have
+ * to deal with remapping IUPAC degenerate codes
+ * probabilistically. However, on the down side,
+ * a sequence with a lot of degenerate IUPAC characters
+ * will get an artifactually high PB weight.
+ *
+ * Args: aseq - sequence alignment to weight
+ * nseq - number of sequences in alignment
+ * alen - length of alignment
+ * wgt - RETURN: weights filled in (pre-allocated 0..nseq-1)
+ *
+ * Returns: (void)
+ * wgt is allocated (0..nseq-1) by caller, and filled in here.
+ */
+void
+PositionBasedWeights(char **aseq, int nseq, int alen, float *wgt)
+{
+ int rescount[26]; /* count of A-Z residues in a column */
+ int nres; /* number of different residues in col */
+ int idx, pos; /* indices into aseq */
+ int x;
+ float norm;
+
+ FSet(wgt, nseq, 0.0);
+ for (pos = 0; pos < alen; pos++)
+ {
+ for (x = 0; x < 26; x++) rescount[x] = 0;
+ for (idx = 0; idx < nseq; idx++)
+ if (isalpha(aseq[idx][pos]))
+ rescount[toupper(aseq[idx][pos]) - 'A'] ++;
+
+ nres = 0;
+ for (x = 0; x < 26; x++)
+ if (rescount[x] > 0) nres++;
+
+ for (idx = 0; idx < nseq; idx++)
+ if (isalpha(aseq[idx][pos]))
+ wgt[idx] += 1. / (float) (nres * rescount[toupper(aseq[idx][pos]) - 'A']);
+ }
+
+ for (idx = 0; idx < nseq; idx++)
+ wgt[idx] /= (float) DealignedLength(aseq[idx]);
+ norm = (float) nseq / FSum(wgt, nseq);
+ FScale(wgt, nseq, norm);
+ return;
+}
+
+
+
+
+/* Function: FilterAlignment()
+ * Date: SRE, Wed Jun 30 09:19:30 1999 [St. Louis]
+ *
+ * Purpose: Constructs a new alignment by removing near-identical
+ * sequences from a given alignment (where identity is
+ * calculated *based on the alignment*).
+ * Does not affect the given alignment.
+ * Keeps earlier sequence, discards later one.
+ *
+ * Usually called as an ad hoc sequence "weighting" mechanism.
+ *
+ * Limitations:
+ * Unparsed Stockholm markup is not propagated into the
+ * new alignment.
+ *
+ * Args: msa -- original alignment
+ * cutoff -- fraction identity cutoff. 0.8 removes sequences > 80% id.
+ * ret_new -- RETURN: new MSA, usually w/ fewer sequences
+ *
+ * Return: (void)
+ * ret_new must be free'd by caller: MSAFree().
+ */
+void
+FilterAlignment(MSA *msa, float cutoff, MSA **ret_new)
+{
+ int nnew; /* number of seqs in new alignment */
+ int *list;
+ int *useme;
+ float ident;
+ int i,j;
+ int remove;
+
+ /* find which seqs to keep (list) */
+ /* diff matrix; allow ragged ends */
+ list = MallocOrDie (sizeof(int) * msa->nseq);
+ useme = MallocOrDie (sizeof(int) * msa->nseq);
+ for (i = 0; i < msa->nseq; i++) useme[i] = FALSE;
+
+ nnew = 0;
+ for (i = 0; i < msa->nseq; i++)
+ {
+ remove = FALSE;
+ for (j = 0; j < nnew; j++)
+ {
+ ident = PairwiseIdentity(msa->aseq[i], msa->aseq[list[j]]);
+ if (ident > cutoff)
+ {
+ remove = TRUE;
+ printf("removing %12s -- fractional identity %.2f to %s\n",
+ msa->sqname[i], ident,
+ msa->sqname[list[j]]);
+ break;
+ }
+ }
+ if (remove == FALSE) {
+ list[nnew++] = i;
+ useme[i] = TRUE;
+ }
+ }
+
+ MSASmallerAlignment(msa, useme, ret_new);
+ free(list);
+ free(useme);
+ return;
+}
+
+
+/* Function: SampleAlignment()
+ * Date: SRE, Wed Jun 30 10:13:56 1999 [St. Louis]
+ *
+ * Purpose: Constructs a new, smaller alignment by sampling a given
+ * number of sequences at random. Does not change the
+ * alignment nor the order of the sequences.
+ *
+ * If you ask for a sample that is larger than nseqs,
+ * it silently returns the original alignment.
+ *
+ * Not really a weighting method, but this is as good
+ * a place as any to keep it, since it's similar in
+ * construction to FilterAlignment().
+ *
+ * Args: msa -- original alignment
+ * sample -- number of sequences in new alignment (0 < sample <= nseq)
+ * ret_new -- RETURN: new MSA
+ *
+ * Return: (void)
+ * ret_new must be free'd by caller: MSAFree().
+ */
+void
+SampleAlignment(MSA *msa, int sample, MSA **ret_new)
+{
+ int *list; /* array for random selection w/o replace */
+ int *useme; /* array of flags 0..nseq-1: TRUE to use */
+ int i, idx;
+ int len;
+
+ /* Allocations
+ */
+ list = (int *) MallocOrDie (sizeof(int) * msa->nseq);
+ useme = (int *) MallocOrDie (sizeof(int) * msa->nseq);
+ for (i = 0; i < msa->nseq; i++)
+ {
+ list[i] = i;
+ useme[i] = FALSE;
+ }
+
+ /* Sanity check.
+ */
+ if (sample >= msa->nseq) sample = msa->nseq;
+
+ /* random selection w/o replacement */
+ for (len = msa->nseq, i = 0; i < sample; i++)
+ {
+ idx = CHOOSE(len);
+ printf("chose %d: %s\n", list[idx], msa->sqname[list[idx]]);
+ useme[list[idx]] = TRUE;
+ list[idx] = list[--len];
+ }
+
+ MSASmallerAlignment(msa, useme, ret_new);
+ free(list);
+ free(useme);
+ return;
+}
+
+
+/* Function: SingleLinkCluster()
+ * Date: SRE, Fri Jul 16 15:02:57 1999 [St. Louis]
+ *
+ * Purpose: Perform simple single link clustering of seqs in a
+ * sequence alignment. A pairwise identity threshold
+ * defines whether two sequences are linked or not.
+ *
+ * Important: runs in O(N) memory, unlike standard
+ * graph decomposition algorithms that use O(N^2)
+ * adjacency matrices or adjacency lists. Requires
+ * O(N^2) time in worst case (which is when you have
+ * no links at all), O(NlogN) in "average"
+ * case, and O(N) in best case (when there is just
+ * one cluster in a completely connected graph.
+ *
+ * (Developed because hmmbuild could no longer deal
+ * with GP120, a 16,013 sequence alignment.)
+ *
+ * Limitations:
+ * CASE-SENSITIVE. Assumes aseq have been put into
+ * either all lower or all upper case; or at least,
+ * within a column, there's no mixed case.
+ *
+ * Algorithm:
+ * I don't know if this algorithm is published. I
+ * haven't seen it in graph theory books, but that might
+ * be because it's so obvious that nobody's bothered.
+ *
+ * In brief, we're going to do a breadth-first search
+ * of the graph, and we're going to calculate links
+ * on the fly rather than precalculating them into
+ * some sort of standard adjacency structure.
+ *
+ * While working, we keep two stacks of maximum length N:
+ * a : list of vertices that are still unconnected.
+ * b : list of vertices that we've connected to
+ * in our current breadth level, but we haven't
+ * yet tested for other connections to a.
+ * The current length (number of elements in) a and b are
+ * kept in na, nb.
+ *
+ * We store our results in an array of length N:
+ * c : assigns each vertex to a component. for example
+ * c[4] = 1 means that vertex 4 is in component 1.
+ * nc is the number of components. Components
+ * are numbered from 0 to nc-1. We return c and nc
+ * to our caller.
+ *
+ * The algorithm is:
+ *
+ * Initialisation:
+ * a <-- all the vertices
+ * na <-- N
+ * b <-- empty set
+ * nb <-- 0
+ * nc <-- 0
+ *
+ * Then:
+ * while (a is not empty)
+ * pop a vertex off a, push onto b
+ * while (b is not empty)
+ * pop vertex v off b
+ * assign c[v] = nc
+ * for each vertex w in a:
+ * compare v,w. If w is linked to v, remove w
+ * from a, push onto b.
+ * nc++
+ * q.e.d. :)
+ *
+ * Args: aseq - aligned sequences
+ * nseq - number of sequences in aseq
+ * alen - alignment length
+ * maxid - fractional identity threshold 0..1. if id >= maxid, seqs linked
+ * ret_c - RETURN: 0..nseq-1 assignments of seqs to components (clusters)
+ * ret_nc - RETURN: number of components
+ *
+ * Returns: void.
+ * ret_c is allocated here. Caller free's with free(*ret_c)
+ */
+void
+SingleLinkCluster(char **aseq, int nseq, int alen, float maxid,
+ int **ret_c, int *ret_nc)
+{
+ int *a, na; /* stack of available vertices */
+ int *b, nb; /* stack of working vertices */
+ int *c; /* array of results */
+ int nc; /* total number of components */
+ int v,w; /* index of a working vertices */
+ int i; /* loop counter */
+
+ /* allocations and initializations
+ */
+ a = MallocOrDie (sizeof(int) * nseq);
+ b = MallocOrDie (sizeof(int) * nseq);
+ c = MallocOrDie (sizeof(int) * nseq);
+ for (i = 0; i < nseq; i++) a[i] = i;
+ na = nseq;
+ nb = 0;
+ nc = 0;
+
+ /* Main algorithm
+ */
+ while (na > 0)
+ {
+ v = a[na-1]; na--; /* pop a vertex off a, */
+ b[nb] = v; nb++; /* and push onto b */
+ while (nb > 0)
+ {
+ v = b[nb-1]; nb--; /* pop vertex off b */
+ c[v] = nc; /* assign it to component nc */
+ for (i = na-1; i >= 0; i--)/* backwards, becase of deletion/swapping we do*/
+ if (simple_distance(aseq[v], aseq[a[i]]) <= 1. - maxid) /* linked? */
+ {
+ w = a[i]; a[i] = a[na-1]; na--; /* delete w from a (note swap) */
+ b[nb] = w; nb++; /* push w onto b */
+ }
+ }
+ nc++;
+ }
+
+ /* Cleanup and return
+ */
+ free(a);
+ free(b);
+ *ret_c = c;
+ *ret_nc = nc;
+ return;
+}