website/archive/binaries/mac/src/clustalo/src/squid/weight.c

   1 /*****************************************************************
   2  * SQUID - a library of functions for biological sequence analysis
   3  * Copyright (C) 1992-2002 Washington University School of Medicine
   4  *
   5  *     This source code is freely distributed under the terms of the
   6  *     GNU General Public License. See the files COPYRIGHT and LICENSE
   7  *     for details.
   8  *****************************************************************/
   9
  10 /* weight.c
  11  * SRE, Thu Mar  3 07:56:01 1994
  12  *
  13  * Calculate weights for sequences in an alignment.
  14  * RCS $Id: weight.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: weight.c,v 1.9 2002/10/09 14:26:09 eddy Exp)
  15  */
  16
  17 #include <ctype.h>
  18 #include <string.h>
  19 #include "squid.h"
  20 #include "sre_random.h"
  21
  22 static void upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node);
  23 static void downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt,
  24                        float *fwt, int node);
  25 static float simple_distance(char *s1, char *s2);
  26 static int    simple_diffmx(char **aseqs,int num, float ***ret_dmx);
  27
  28 /* Function: GSCWeights()
  29  *
  30  * Purpose:  Use Erik's tree-based algorithm to set weights for
  31  *           sequences in an alignment. upweight() and downweight()
  32  *           are derived from Graeme Mitchison's code.
  33  *
  34  * Args:     aseq        - array of (0..nseq-1) aligned sequences
  35  *           nseq        - number of seqs in alignment
  36  *           alen        - length of alignment
  37  *           wgt         - allocated [0..nseq-1] array of weights to be returned
  38  *
  39  * Return:   (void)
  40  *           wgt is filled in.
  41  */
  42 void
  43 GSCWeights(char **aseq, int nseq, int alen, float *wgt)
  44 {
  45   float **dmx;                 /* distance (difference) matrix */
  46   struct phylo_s *tree;
  47   float  *lwt, *rwt;           /* weight on left, right of this tree node */
  48   float  *fwt;                 /* final weight assigned to this node */
  49   int      i;
  50
  51   /* Sanity check first
  52    */
  53   if (nseq == 1) { wgt[0] = 1.0; return; }
  54
  55   /* I use a simple fractional difference matrix derived by
  56    * pairwise identity. Perhaps I should include a Poisson
  57    * distance correction.
  58    */
  59   MakeDiffMx(aseq, nseq, &dmx);
  60   if (! Cluster(dmx, nseq, CLUSTER_MIN, &tree))  Die("Cluster() failed");
  61
  62   /* Allocations
  63    */
  64   lwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));
  65   rwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));
  66   fwt = MallocOrDie (sizeof(float) * (2 * nseq - 1));
  67
  68   /* lwt and rwt are the total branch weight to the left and
  69    * right of a node or sequence. They are 0..2N-2.  0..N-1 are
  70    * the sequences; these have weight 0. N..2N-2 are the actual
  71    * tree nodes.
  72    */
  73   for (i = 0; i < nseq; i++)
  74     lwt[i] = rwt[i] = 0.0;
  75                                 /* recursively calculate rwt, lwt, starting
  76                                    at node nseq (the root) */
  77   upweight(tree, nseq, lwt, rwt, nseq);
  78
  79                                 /* recursively distribute weight across the
  80                                    tree */
  81   fwt[nseq] = nseq;
  82   downweight(tree, nseq, lwt, rwt, fwt, nseq);
  83                                 /* collect the weights */
  84   for (i = 0; i < nseq; i++)
  85     wgt[i]  = fwt[i];
  86
  87   FMX2Free(dmx);
  88   FreePhylo(tree, nseq);
  89   free(lwt); free(rwt); free(fwt);
  90 }
  91
  92 static void
  93 upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node)
  94 {
  95   int ld,rd;
  96
  97   ld = tree[node-nseq].left;
  98   if (ld >= nseq) upweight(tree, nseq, lwt, rwt, ld);
  99   rd = tree[node-nseq].right;
 100   if (rd >= nseq) upweight(tree, nseq, lwt, rwt, rd);
 101   lwt[node] = lwt[ld] + rwt[ld] + tree[node-nseq].lblen;
 102   rwt[node] = lwt[rd] + rwt[rd] + tree[node-nseq].rblen;
 103 }
 104
 105
 106 static void
 107 downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, float *fwt, int node)
 108 {
 109   int ld,rd;
 110   float lnum, rnum;
 111
 112   ld = tree[node-nseq].left;
 113   rd = tree[node-nseq].right;
 114   if (lwt[node] + rwt[node] > 0.0)
 115     {
 116       fwt[ld] = fwt[node] * (lwt[node] / (lwt[node] + rwt[node]));
 117       fwt[rd] = fwt[node] * (rwt[node] / (lwt[node] + rwt[node]));
 118     }
 119   else
 120     {
 121       lnum = (ld >= nseq) ? tree[ld-nseq].incnum : 1.0;
 122       rnum = (rd >= nseq) ? tree[rd-nseq].incnum : 1.0;
 123       fwt[ld] = fwt[node] * lnum / (lnum + rnum);
 124       fwt[rd] = fwt[node] * rnum / (lnum + rnum);
 125     }
 126
 127   if (ld >= nseq) downweight(tree, nseq, lwt, rwt, fwt, ld);
 128   if (rd >= nseq) downweight(tree, nseq, lwt, rwt, fwt, rd);
 129 }
 130
 131
 132
 133
 134 /* Function: VoronoiWeights()
 135  *
 136  * Purpose:  Calculate weights using the scheme of Sibbald &
 137  *           Argos (JMB 216:813-818 1990). The scheme is
 138  *           slightly modified because the original algorithm
 139  *           actually doesn't work on gapped alignments.
 140  *           The sequences are assumed to be protein.
 141  *
 142  * Args:     aseq  - array of (0..nseq-1) aligned sequences
 143  *           nseq  - number of sequences
 144  *           alen  - length of alignment
 145  *           wgt   - allocated [0..nseq-1] array of weights to be returned
 146  *
 147  * Return:   void
 148  *           wgt is filled in.
 149  */
 150 void
 151 VoronoiWeights(char **aseq, int nseq, int alen, float *wgt)
 152 {
 153   float **dmx;                 /* distance (difference) matrix    */
 154   float  *halfmin;             /* 1/2 minimum distance to other seqs */
 155   char   **psym;                /* symbols seen in each column     */
 156   int     *nsym;                /* # syms seen in each column      */
 157   int      symseen[27];         /* flags for observed syms         */
 158   char    *randseq;             /* randomly generated sequence     */
 159   int      acol;                /* pos in aligned columns          */
 160   int      idx;                 /* index in sequences              */
 161   int      symidx;              /* 0..25 index for symbol          */
 162   int      i;                   /* generic counter                 */
 163   float   min;                  /* minimum distance                */
 164   float   dist;         /* distance between random and real */
 165   float   challenge, champion; /* for resolving ties              */
 166   int     itscale;              /* how many iterations per seq     */
 167   int     iteration;
 168   int     best;                 /* index of nearest real sequence  */
 169
 170   /* Sanity check first
 171    */
 172   if (nseq == 1) { wgt[0] = 1.0; return; }
 173
 174   itscale = 50;
 175
 176   /* Precalculate 1/2 minimum distance to other
 177    * sequences for each sequence
 178    */
 179   if (! simple_diffmx(aseq, nseq, &dmx))
 180     Die("simple_diffmx() failed");
 181   halfmin = MallocOrDie (sizeof(float) * nseq);
 182   for (idx = 0; idx < nseq; idx++)
 183     {
 184       for (min = 1.0, i = 0; i < nseq; i++)
 185         {
 186           if (i == idx) continue;
 187           if (dmx[idx][i] < min) min = dmx[idx][i];
 188         }
 189       halfmin[idx] = min / 2.0;
 190     }
 191   Free2DArray((void **) dmx, nseq);
 192
 193   /* Set up the random sequence generating model.
 194    */
 195   psym = MallocOrDie (alen * sizeof(char *));
 196   nsym = MallocOrDie (alen * sizeof(int));
 197   for (acol = 0; acol < alen; acol++)
 198     psym[acol] = MallocOrDie (27 * sizeof(char));
 199
 200 /* #ifdef ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */
 201   for (acol = 0; acol < alen; acol++)
 202     {
 203       memset(symseen, 0, sizeof(int) * 27);
 204       for (idx = 0; idx < nseq; idx++)
 205         if (! isgap(aseq[idx][acol]))
 206           {
 207             if (isupper((int) aseq[idx][acol]))
 208               symidx = aseq[idx][acol] - 'A';
 209             else
 210               symidx = aseq[idx][acol] - 'a';
 211             if (symidx >= 0 && symidx < 26)
 212               symseen[symidx] = 1;
 213           }
 214         else
 215           symseen[26] = 1;      /* a gap */
 216
 217       for (nsym[acol] = 0, i = 0; i < 26; i++)
 218         if (symseen[i])
 219           {
 220             psym[acol][nsym[acol]] = 'A'+i;
 221             nsym[acol]++;
 222           }
 223       if (symseen[26]) { psym[acol][nsym[acol]] = ' '; nsym[acol]++; }
 224     }
 225 /* #endif ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */
 226
 227   /* Note: the original Sibbald&Argos algorithm calls for
 228    * bounding the sampled space using a template-like random
 229    * sequence generator. However, this leads to one minor
 230    * and one major problem. The minor problem is that
 231    * exceptional amino acids in a column can have a
 232    * significant effect by altering the amount of sampled
 233    * sequence space; the larger the data set, the worse
 234    * this problem becomes. The major problem is that
 235    * there is no reasonable way to deal with gaps.
 236    * Gapped sequences simply inhabit a different dimensionality
 237    * and it's pretty painful to imagine calculating Voronoi
 238    * volumes when the N in your N-space is varying.
 239    * Note that all the examples shown by Sibbald and Argos
 240    * are *ungapped* examples.
 241    *
 242    * The best way I've found to circumvent this problem is
 243    * just not to bound the sampled space; count gaps as
 244    * symbols and generate completely random sequences.
 245    */
 246 #ifdef ALL_SEQUENCE_SPACE
 247   for (acol = 0; acol < alen; acol++)
 248     {
 249       strcpy(psym[acol], "ACDEFGHIKLMNPQRSTVWY ");
 250       nsym[acol] = 21;
 251     }
 252 #endif
 253
 254   /* Sibbald and Argos algorithm:
 255    *   1) assign all seqs weight 0.
 256    *   2) generate a "random" sequence
 257    *   3) calculate distance to every other sequence
 258    *      (if we get a distance < 1/2 minimum distance
 259    *       to other real seqs, we can stop)
 260    *   4) if unique closest sequence, increment its weight 1.
 261    *      if multiple closest seq, choose one randomly
 262    *   5) repeat 2-4 for lots of iterations
 263    *   6) normalize all weights to sum to nseq.
 264    */
 265   randseq = MallocOrDie ((alen+1) * sizeof(char));
 266
 267   best = 42.;                   /* solely to silence GCC uninit warnings. */
 268   FSet(wgt, nseq, 0.0);
 269   for (iteration = 0; iteration < itscale * nseq; iteration++)
 270     {
 271       for (acol = 0; acol < alen; acol++)
 272         randseq[acol] = (nsym[acol] == 0) ? ' ' : psym[acol][CHOOSE(nsym[acol])];
 273       randseq[acol] = '\0';
 274
 275       champion = sre_random();
 276       for (min = 1.0, idx = 0; idx < nseq; idx++)
 277         {
 278           dist = simple_distance(aseq[idx], randseq);
 279           if (dist < halfmin[idx])
 280             {
 281               best = idx;
 282               break;
 283             }
 284           if (dist < min)
 285             { champion = sre_random(); best = idx; min = dist; }
 286           else if (dist == min)
 287             {
 288               challenge = sre_random();
 289               if (challenge > champion)
 290                 { champion = challenge; best = idx; min = dist; }
 291             }
 292         }
 293       wgt[best] += 1.0;
 294     }
 295
 296   for (idx = 0; idx < nseq; idx++)
 297     wgt[idx] = wgt[idx] / (float) itscale;
 298
 299   free(randseq);
 300   free(nsym);
 301   free(halfmin);
 302   Free2DArray((void **) psym, alen);
 303 }
 304
 305
 306 /* Function: simple_distance()
 307  *
 308  * Purpose:  For two identical-length null-terminated strings, return
 309  *           the fractional difference between them. (0..1)
 310  *           (Gaps don't count toward anything.)
 311  */
 312 static float
 313 simple_distance(char *s1, char *s2)
 314 {
 315   int diff  = 0;
 316   int valid = 0;
 317
 318   for (; *s1 != '\0'; s1++, s2++)
 319     {
 320       if (isgap(*s1) || isgap(*s2)) continue;
 321       if (*s1 != *s2) diff++;
 322       valid++;
 323     }
 324   return (valid > 0 ? ((float) diff / (float) valid) : 0.0);
 325 }
 326
 327 /* Function: simple_diffmx()
 328  *
 329  * Purpose:  Given a set of flushed, aligned sequences, construct
 330  *           an NxN fractional difference matrix using the
 331  *           simple_distance rule.
 332  *
 333  * Args:     aseqs        - flushed, aligned sequences
 334  *           num          - number of aseqs
 335  *           ret_dmx      - RETURN: difference matrix (caller must free)
 336  *
 337  * Return:   1 on success, 0 on failure.
 338  */
 339 static int
 340 simple_diffmx(char    **aseqs,
 341               int       num,
 342               float ***ret_dmx)
 343 {
 344   float **dmx;                 /* RETURN: distance matrix           */
 345   int      i,j;                 /* counters over sequences           */
 346
 347   /* Allocate
 348    */
 349   if ((dmx = (float **) malloc (sizeof(float *) * num)) == NULL)
 350     Die("malloc failed");
 351   for (i = 0; i < num; i++)
 352     if ((dmx[i] = (float *) malloc (sizeof(float) * num)) == NULL)
 353       Die("malloc failed");
 354
 355   /* Calculate distances, symmetric matrix
 356    */
 357   for (i = 0; i < num; i++)
 358     for (j = i; j < num; j++)
 359       dmx[i][j] = dmx[j][i] = simple_distance(aseqs[i], aseqs[j]);
 360
 361   /* Return
 362    */
 363   *ret_dmx = dmx;
 364   return 1;
 365 }
 366
 367
 368
 369 /* Function: BlosumWeights()
 370  * Date:     SRE, Fri Jul 16 17:33:59 1999 (St. Louis)
 371  *
 372  * Purpose:  Assign weights to a set of aligned sequences
 373  *           using the BLOSUM rule:
 374  *             - do single linkage clustering at some pairwise identity
 375  *             - in each cluster, give each sequence 1/clustsize
 376  *               total weight.
 377  *
 378  *           The clusters have no pairwise link >= maxid.
 379  *
 380  *           O(N) in memory. Probably ~O(NlogN) in time; O(N^2)
 381  *           in worst case, which is no links between sequences
 382  *           (e.g., values of maxid near 1.0).
 383  *
 384  * Args:     aseqs - alignment
 385  *           nseq  - number of seqs in alignment
 386  *           alen  - # of columns in alignment
 387  *           maxid - fractional identity (e.g. 0.62 for BLOSUM62)
 388  *           wgt   - [0..nseq-1] array of weights to be returned
 389  */
 390 void
 391 BlosumWeights(char **aseqs, int nseq, int alen, float maxid, float *wgt)
 392 {
 393   int  *c, nc;
 394   int  *nmem;        /* number of seqs in each cluster */
 395   int   i;           /* loop counter */
 396
 397   SingleLinkCluster(aseqs, nseq, alen, maxid, &c, &nc);
 398
 399   FSet(wgt, nseq, 1.0);
 400   nmem = MallocOrDie(sizeof(int) * nc);
 401
 402   for (i = 0; i < nc;   i++) nmem[i] = 0;
 403   for (i = 0; i < nseq; i++) nmem[c[i]]++;
 404   for (i = 0; i < nseq; i++) wgt[i] = 1. / (float) nmem[c[i]];
 405
 406   free(nmem);
 407   free(c);
 408   return;
 409 }
 410
 411
 412 /* Function: PositionBasedWeights()
 413  * Date:     SRE, Fri Jul 16 17:47:22 1999 [St. Louis]
 414  *
 415  * Purpose:  Implementation of Henikoff and Henikoff position-based
 416  *           weights (JMB 243:574-578, 1994) [Henikoff94b].
 417  *
 418  *           A significant advantage of this approach that Steve and Jorja
 419  *           don't point out is that it is O(N) in memory, unlike
 420  *           many other approaches like GSC weights or Voronoi.
 421  *
 422  *           A potential disadvantage that they don't point out
 423  *           is that in the theoretical limit of infinite sequences
 424  *           in the alignment, weights go flat: eventually every
 425  *           column has at least one representative of each of 20 aa (or 4 nt)
 426  *           in it.
 427  *
 428  *           They also don't give a rule for how to handle gaps.
 429  *           The rule used here seems the obvious and sensible one
 430  *           (ignore them). This means that longer sequences
 431  *           initially get more weight; hence a "double
 432  *           normalization" in which the weights are first divided
 433  *           by sequence length (to compensate for that effect),
 434  *           then normalized to sum to nseq.
 435  *
 436  * Limitations:
 437  *           Implemented in a way that's alphabet-independent:
 438  *           it uses the 26 upper case letters as "residues".
 439  *           Any alphabetic character in aseq is interpreted as
 440  *           a unique "residue" (case insensitively; lower case
 441  *           mapped to upper case). All other characters are
 442  *           interpreted as gaps.
 443  *
 444  *           This way, we don't have to pass around any alphabet
 445  *           type info (DNA vs. RNA vs. protein) and don't have
 446  *           to deal with remapping IUPAC degenerate codes
 447  *           probabilistically. However, on the down side,
 448  *           a sequence with a lot of degenerate IUPAC characters
 449  *           will get an artifactually high PB weight.
 450  *
 451  * Args:     aseq   - sequence alignment to weight
 452  *           nseq   - number of sequences in alignment
 453  *           alen   - length of alignment
 454  *           wgt    - RETURN: weights filled in (pre-allocated 0..nseq-1)
 455  *
 456  * Returns:  (void)
 457  *           wgt is allocated (0..nseq-1) by caller, and filled in here.
 458  */
 459 void
 460 PositionBasedWeights(char **aseq, int nseq, int alen, float *wgt)
 461 {
 462   int  rescount[26];            /* count of A-Z residues in a column   */
 463   int  nres;                    /* number of different residues in col */
 464   int  idx, pos;                /* indices into aseq                   */
 465   int  x;
 466   float norm;
 467
 468   FSet(wgt, nseq, 0.0);
 469   for (pos = 0; pos < alen; pos++)
 470     {
 471       for (x = 0; x < 26; x++) rescount[x] = 0;
 472       for (idx = 0; idx < nseq; idx++)
 473         if (isalpha(aseq[idx][pos]))
 474           rescount[toupper(aseq[idx][pos]) - 'A'] ++;
 475
 476       nres = 0;
 477       for (x = 0; x < 26; x++)
 478         if (rescount[x] > 0) nres++;
 479
 480       for (idx = 0; idx < nseq; idx++)
 481         if (isalpha(aseq[idx][pos]))
 482           wgt[idx] += 1. / (float) (nres * rescount[toupper(aseq[idx][pos]) - 'A']);
 483     }
 484
 485   for (idx = 0; idx < nseq; idx++)
 486     wgt[idx] /= (float) DealignedLength(aseq[idx]);
 487   norm = (float) nseq / FSum(wgt, nseq);
 488   FScale(wgt, nseq, norm);
 489   return;
 490 }
 491
 492
 493
 494
 495 /* Function: FilterAlignment()
 496  * Date:     SRE, Wed Jun 30 09:19:30 1999 [St. Louis]
 497  *
 498  * Purpose:  Constructs a new alignment by removing near-identical
 499  *           sequences from a given alignment (where identity is
 500  *           calculated *based on the alignment*).
 501  *           Does not affect the given alignment.
 502  *           Keeps earlier sequence, discards later one.
 503  *
 504  *           Usually called as an ad hoc sequence "weighting" mechanism.
 505  *
 506  * Limitations:
 507  *           Unparsed Stockholm markup is not propagated into the
 508  *           new alignment.
 509  *
 510  * Args:     msa      -- original alignment
 511  *           cutoff   -- fraction identity cutoff. 0.8 removes sequences > 80% id.
 512  *           ret_new  -- RETURN: new MSA, usually w/ fewer sequences
 513  *
 514  * Return:   (void)
 515  *           ret_new must be free'd by caller: MSAFree().
 516  */
 517 void
 518 FilterAlignment(MSA *msa, float cutoff, MSA **ret_new)
 519 {
 520   int     nnew;                 /* number of seqs in new alignment */
 521   int    *list;
 522   int    *useme;
 523   float   ident;
 524   int     i,j;
 525   int     remove;
 526
 527                                 /* find which seqs to keep (list) */
 528                                 /* diff matrix; allow ragged ends */
 529   list  = MallocOrDie (sizeof(int) * msa->nseq);
 530   useme = MallocOrDie (sizeof(int) * msa->nseq);
 531   for (i = 0; i < msa->nseq; i++) useme[i] = FALSE;
 532
 533   nnew = 0;
 534   for (i = 0; i < msa->nseq; i++)
 535     {
 536       remove = FALSE;
 537       for (j = 0; j < nnew; j++)
 538         {
 539           ident = PairwiseIdentity(msa->aseq[i], msa->aseq[list[j]]);
 540           if (ident > cutoff)
 541             {
 542               remove = TRUE;
 543               printf("removing %12s -- fractional identity %.2f to %s\n",
 544                      msa->sqname[i], ident,
 545                      msa->sqname[list[j]]);
 546               break;
 547             }
 548         }
 549       if (remove == FALSE) {
 550         list[nnew++] = i;
 551         useme[i]     = TRUE;
 552       }
 553     }
 554
 555   MSASmallerAlignment(msa, useme, ret_new);
 556   free(list);
 557   free(useme);
 558   return;
 559 }
 560
 561
 562 /* Function: SampleAlignment()
 563  * Date:     SRE, Wed Jun 30 10:13:56 1999 [St. Louis]
 564  *
 565  * Purpose:  Constructs a new, smaller alignment by sampling a given
 566  *           number of sequences at random. Does not change the
 567  *           alignment nor the order of the sequences.
 568  *
 569  *           If you ask for a sample that is larger than nseqs,
 570  *           it silently returns the original alignment.
 571  *
 572  *           Not really a weighting method, but this is as good
 573  *           a place as any to keep it, since it's similar in
 574  *           construction to FilterAlignment().
 575  *
 576  * Args:     msa      -- original alignment
 577  *           sample   -- number of sequences in new alignment (0 < sample <= nseq)
 578  *           ret_new  -- RETURN: new MSA
 579  *
 580  * Return:   (void)
 581  *           ret_new must be free'd by caller: MSAFree().
 582  */
 583 void
 584 SampleAlignment(MSA *msa, int sample, MSA **ret_new)
 585 {
 586   int    *list;                 /* array for random selection w/o replace */
 587   int    *useme;                /* array of flags 0..nseq-1: TRUE to use */
 588   int     i, idx;
 589   int     len;
 590
 591   /* Allocations
 592    */
 593   list  = (int *) MallocOrDie (sizeof(int) * msa->nseq);
 594   useme = (int *) MallocOrDie (sizeof(int) * msa->nseq);
 595   for (i = 0; i < msa->nseq; i++)
 596     {
 597       list[i]  = i;
 598       useme[i] = FALSE;
 599     }
 600
 601   /* Sanity check.
 602    */
 603   if (sample >= msa->nseq) sample = msa->nseq;
 604
 605                                 /* random selection w/o replacement */
 606   for (len = msa->nseq, i = 0; i < sample; i++)
 607     {
 608       idx = CHOOSE(len);
 609       printf("chose %d: %s\n", list[idx], msa->sqname[list[idx]]);
 610       useme[list[idx]] = TRUE;
 611       list[idx] = list[--len];
 612     }
 613
 614   MSASmallerAlignment(msa, useme, ret_new);
 615   free(list);
 616   free(useme);
 617   return;
 618 }
 619
 620
 621 /* Function: SingleLinkCluster()
 622  * Date:     SRE, Fri Jul 16 15:02:57 1999 [St. Louis]
 623  *
 624  * Purpose:  Perform simple single link clustering of seqs in a
 625  *           sequence alignment. A pairwise identity threshold
 626  *           defines whether two sequences are linked or not.
 627  *
 628  *           Important: runs in O(N) memory, unlike standard
 629  *           graph decomposition algorithms that use O(N^2)
 630  *           adjacency matrices or adjacency lists. Requires
 631  *           O(N^2) time in worst case (which is when you have
 632  *           no links at all), O(NlogN) in "average"
 633  *           case, and O(N) in best case (when there is just
 634  *           one cluster in a completely connected graph.
 635  *
 636  *           (Developed because hmmbuild could no longer deal
 637  *           with GP120, a 16,013 sequence alignment.)
 638  *
 639  * Limitations:
 640  *           CASE-SENSITIVE. Assumes aseq have been put into
 641  *           either all lower or all upper case; or at least,
 642  *           within a column, there's no mixed case.
 643  *
 644  * Algorithm:
 645  *           I don't know if this algorithm is published. I
 646  *           haven't seen it in graph theory books, but that might
 647  *           be because it's so obvious that nobody's bothered.
 648  *
 649  *           In brief, we're going to do a breadth-first search
 650  *           of the graph, and we're going to calculate links
 651  *           on the fly rather than precalculating them into
 652  *           some sort of standard adjacency structure.
 653  *
 654  *           While working, we keep two stacks of maximum length N:
 655  *                a : list of vertices that are still unconnected.
 656  *                b : list of vertices that we've connected to
 657  *                    in our current breadth level, but we haven't
 658  *                    yet tested for other connections to a.
 659  *           The current length (number of elements in) a and b are
 660  *           kept in na, nb.
 661  *
 662  *           We store our results in an array of length N:
 663  *                c : assigns each vertex to a component. for example
 664  *                    c[4] = 1 means that vertex 4 is in component 1.
 665  *                    nc is the number of components. Components
 666  *                    are numbered from 0 to nc-1. We return c and nc
 667  *                    to our caller.
 668  *
 669  *           The algorithm is:
 670  *
 671  *           Initialisation:
 672  *                a  <-- all the vertices
 673  *                na <-- N
 674  *                b  <-- empty set
 675  *                nb <-- 0
 676  *                nc <-- 0
 677  *
 678  *           Then:
 679  *                while (a is not empty)
 680  *                  pop a vertex off a, push onto b
 681  *                  while (b is not empty)
 682  *                    pop vertex v off b
 683  *                    assign c[v] = nc
 684  *                    for each vertex w in a:
 685  *                       compare v,w. If w is linked to v, remove w
 686  *                       from a, push onto b.
 687  *                  nc++
 688  *           q.e.d. :)
 689  *
 690  * Args:     aseq   - aligned sequences
 691  *           nseq   - number of sequences in aseq
 692  *           alen   - alignment length
 693  *           maxid  - fractional identity threshold 0..1. if id >= maxid, seqs linked
 694  *           ret_c  - RETURN: 0..nseq-1 assignments of seqs to components (clusters)
 695  *           ret_nc - RETURN: number of components
 696  *
 697  * Returns:  void.
 698  *           ret_c is allocated here. Caller free's with free(*ret_c)
 699  */
 700 void
 701 SingleLinkCluster(char **aseq, int nseq, int alen, float maxid,
 702                   int **ret_c, int *ret_nc)
 703 {
 704   int *a, na;                   /* stack of available vertices */
 705   int *b, nb;                   /* stack of working vertices   */
 706   int *c;                       /* array of results            */
 707   int  nc;                      /* total number of components  */
 708   int  v,w;                     /* index of a working vertices */
 709   int  i;                       /* loop counter */
 710
 711   /* allocations and initializations
 712    */
 713   a = MallocOrDie (sizeof(int) * nseq);
 714   b = MallocOrDie (sizeof(int) * nseq);
 715   c = MallocOrDie (sizeof(int) * nseq);
 716   for (i = 0; i < nseq; i++) a[i] = i;
 717   na = nseq;
 718   nb = 0;
 719   nc = 0;
 720
 721   /* Main algorithm
 722    */
 723   while (na > 0)
 724     {
 725       v = a[na-1]; na--;        /* pop a vertex off a, */
 726       b[nb] = v;   nb++;        /* and push onto b     */
 727       while (nb > 0)
 728         {
 729           v    = b[nb-1]; nb--; /* pop vertex off b          */
 730           c[v] = nc;            /* assign it to component nc */
 731           for (i = na-1; i >= 0; i--)/* backwards, becase of deletion/swapping we do*/
 732             if (simple_distance(aseq[v], aseq[a[i]]) <= 1. - maxid) /* linked? */
 733               {
 734                 w = a[i]; a[i] = a[na-1]; na--; /* delete w from a (note swap) */
 735                 b[nb] = w; nb++;                /* push w onto b */
 736               }
 737         }
 738       nc++;
 739     }
 740
 741   /* Cleanup and return
 742    */
 743   free(a);
 744   free(b);
 745   *ret_c  = c;
 746   *ret_nc = nc;
 747   return;
 748 }