binaries/src/clustalo/src/squid/aligneval.c

   1 /*****************************************************************
   2  * SQUID - a library of functions for biological sequence analysis
   3  * Copyright (C) 1992-2002 Washington University School of Medicine
   4  *
   5  *     This source code is freely distributed under the terms of the
   6  *     GNU General Public License. See the files COPYRIGHT and LICENSE
   7  *     for details.
   8  *****************************************************************/
   9
  10 /* aligneval.c
  11  *
  12  * Comparison of multiple alignments. Three functions are
  13  * provided, using subtly different scoring schemes:
  14  *    CompareMultAlignments()    - basic scoring scheme
  15  *    CompareRefMultAlignments() - only certain "canonical" columns
  16  *                                 are scored
  17  *
  18  * The similarity measure is a fractional alignment identity averaged
  19  * over all sequence pairs. The score for all pairs is:
  20  *      (identically aligned symbols) / (total aligned columns in
  21  *      known alignment)
  22  *
  23  * A column c is identically aligned for sequences i, j if:
  24  *    1) both i,j have a symbol aligned in column c, and the
  25  *       same pair of symbols is aligned somewhere in the test
  26  *       alignment
  27  *    2) S[i][c] is aligned to a gap in sequence j, and that symbol
  28  *       is aligned to a gap in the test alignment
  29  *    3) converse of 2)
  30  *
  31  *
  32  * The algorithm is as follows:
  33  *    1) For each known/test aligned pair of sequences (k1,k2 and t1,t2)
  34  *        construct a list for each sequence, in which for every
  35  *        counted symbol we record the raw index of the symbol in
  36  *        the other sequence that it aligns to, or -1 if it aligns
  37  *        to a gap or uncounted symbol.
  38  *
  39  *    2)  Compare the list for k1 to the list for t1 and count an identity
  40  *        for each correct alignment.
  41  *
  42  *    3) Repeat 2) for comparing k2 to t2. Note that this means correct sym/sym
  43  *       alignments count for 2; correct sym/gap alignments count for 1.
  44  *
  45  *    4) The score is (identities from 2 + identities from 3) /
  46  *       (totals from 2 + totals from 3).
  47  *
  48  * Written originally for koala's ss2 pairwise alignment package.
  49  *
  50  * Sean Eddy, Sun Nov  1 12:45:11 1992
  51  * SRE, Thu Jul 29 16:47:18 1993: major revision: all functions replaced by new algorithm
  52  * CVS $Id: aligneval.c,v 1.7 2002/10/09 14:26:09 eddy Exp)
  53  */
  54
  55
  56 #include <stdio.h>
  57 #include <string.h>
  58 #include <ctype.h>
  59 #include "squid.h"
  60 #include "sre_random.h"
  61
  62 static int make_alilist(char *s1, char *s2, int **ret_s1_list, int *ret_listlen);
  63 static int make_ref_alilist(int *refcoords, char *k1, char *k2, char *s1, char *s2,
  64                             int **ret_s1_list, int *ret_listlen);
  65 static int compare_lists(int *k1, int *k2, int *t1, int *t2, int len1, int len2, float *ret_sc);
  66
  67
  68 /* Function: ComparePairAlignments
  69  *
  70  * Purpose:  Calculate and return a number representing how well two different alignments
  71  *           of a pair of sequences compare. The number is, roughly speaking,
  72  *           the fraction of columns which are identically aligned.
  73  *
  74  *           For all columns c in which either known1[c] or known2[c]
  75  *           is a non-gap, count an identity if those same symbols are
  76  *           aligned somewhere in calc1/calc2. The score is identities/total
  77  *           columns examined. (i.e. fully gapped columns don't count)
  78  *
  79  *           more explicitly, identities come from:
  80  *             both known and test aligned pairs have the same symbol in the first sequence aligned to
  81  *               a gap in the second sequence;
  82  *             both known and test aligned pairs have the same symbol in the second sequence
  83  *               aligned to a gap in the first sequence;
  84  *             the known alignment has symbols aligned at this column, and the test
  85  *               alignment aligns the same two symbols.
  86  *
  87  * Args:     known1, known2: trusted alignment of two sequences
  88  *           calc1, calc2:   test alignment of two sequences
  89  *
  90  * Return:   Returns -1.0 on internal failure.
  91  */
  92 float
  93 ComparePairAlignments(char *known1, char *known2, char *calc1, char *calc2)
  94 {
  95   int *klist1;
  96   int *klist2;
  97   int *tlist1;
  98   int *tlist2;
  99   int len1, len2;
 100   float score;
 101
 102   if (! make_alilist(calc1,  calc2,  &tlist1, &len1)) return -1.0;
 103   if (! make_alilist(calc2,  calc1,  &tlist2, &len2)) return -1.0;
 104   if (! make_alilist(known1, known2, &klist1, &len1)) return -1.0;
 105   if (! make_alilist(known2, known1, &klist2, &len2)) return -1.0;
 106   if (! compare_lists(klist1, klist2, tlist1, tlist2, len1, len2, &score)) return -1.0;
 107
 108   free(klist1);
 109   free(klist2);
 110   free(tlist1);
 111   free(tlist2);
 112   return score;
 113 }
 114
 115
 116
 117 /* Function: CompareRefPairAlignments()
 118  *
 119  * Same as above, but the only columns that count are the ones
 120  * with indices in *refcoord. *refcoord and the known1, known2
 121  * pair must be in sync with each other (come from the same
 122  * multiple sequence alignment)
 123  *
 124  * Args:     ref           - 0..alen-1 array of 1 or 0
 125  *           known1,known2 - trusted alignment
 126  *           calc1, calc2  - test alignment
 127  *
 128  * Return:  the fractional alignment identity on success, -1.0 on failure.
 129  */
 130 float
 131 CompareRefPairAlignments(int  *ref, char *known1, char *known2, char *calc1, char *calc2)
 132 {
 133   int *klist1;
 134   int *klist2;
 135   int *tlist1;
 136   int *tlist2;
 137   int len1, len2;
 138   float score;
 139
 140   if (! make_ref_alilist(ref, known1, known2, calc1,  calc2,  &tlist1, &len1)) return -1.0;
 141   if (! make_ref_alilist(ref, known2, known1, calc2,  calc1,  &tlist2, &len2)) return -1.0;
 142   if (! make_ref_alilist(ref, known1, known2, known1, known2, &klist1, &len1)) return -1.0;
 143   if (! make_ref_alilist(ref, known2, known1, known2, known1, &klist2, &len2)) return -1.0;
 144   if (! compare_lists(klist1, klist2, tlist1, tlist2, len1, len2, &score)) return -1.0;
 145
 146   free(klist1);
 147   free(klist2);
 148   free(tlist1);
 149   free(tlist2);
 150   return score;
 151 }
 152
 153 /* Function: make_alilist()
 154  *
 155  * Purpose:  Construct a list (array) mapping the raw symbols of s1
 156  *           onto the indexes of the aligned symbols in s2 (or -1
 157  *           for gaps in s2). The list (s1_list) will be of the
 158  *           length of s1's raw sequence.
 159  *
 160  * Args:     s1          - sequence to construct the list for
 161  *           s2          - sequence s1 is aligned to
 162  *           ret_s1_list - RETURN: the constructed list (caller must free)
 163  *           ret_listlen - RETURN: length of the list
 164  *
 165  * Returns:  1 on success, 0 on failure
 166  */
 167 static int
 168 make_alilist(char *s1, char *s2, int **ret_s1_list, int *ret_listlen)
 169 {
 170   int *s1_list;
 171   int  col;                     /* column position in alignment */
 172   int  r1, r2;                  /* raw symbol index at current col in s1, s2 */
 173
 174   /* Malloc for s1_list. It can't be longer than s1 itself; we just malloc
 175    * for that (and waste a wee bit of space)
 176    */
 177   s1_list = (int *) MallocOrDie (sizeof(int) * strlen(s1));
 178   r1 = r2 = 0;
 179   for (col = 0; s1[col] != '\0'; col++)
 180     {
 181       /* symbol in s1? Record what it's aligned to, and bump
 182        * the r1 counter.
 183        */
 184       if (! isgap(s1[col]))
 185         {
 186           s1_list[r1] = isgap(s2[col]) ? -1 : r2;
 187           r1++;
 188         }
 189
 190       /* symbol in s2? bump the r2 counter
 191        */
 192       if (! isgap(s2[col]))
 193         r2++;
 194     }
 195
 196   *ret_listlen = r1;
 197   *ret_s1_list = s1_list;
 198   return 1;
 199 }
 200
 201
 202
 203 /* Function: make_ref_alilist()
 204  *
 205  * Purpose:  Construct a list (array) mapping the raw symbols of s1
 206  *           which are under canonical columns of the ref alignment
 207  *           onto the indexes of the aligned symbols in s2 (or -1
 208  *           for gaps in s2 or noncanonical symbols in s2).
 209  *
 210  * Args:     ref:        - array of indices of canonical coords (1 canonical, 0 non)
 211  *           k1          - s1's known alignment (w/ respect to refcoords)
 212  *           k2          - s2's known alignment (w/ respect to refcoords)
 213  *           s1          - sequence to construct the list for
 214  *           s2          - sequence s1 is aligned to
 215  *           ret_s1_list - RETURN: the constructed list (caller must free)
 216  *           ret_listlen - RETURN: length of the list
 217  *
 218  * Returns:  1 on success, 0 on failure
 219  */
 220 /*ARGSUSED*/
 221 static int
 222 make_ref_alilist(int *ref, char *k1, char *k2,
 223                  char *s1, char *s2, int **ret_s1_list, int *ret_listlen)
 224 {
 225   int *s1_list;
 226   int  col;                     /* column position in alignment */
 227   int  r1, r2;                  /* raw symbol index at current col in s1, s2 */
 228   int *canons1;                 /* flag array, 1 if position i in s1 raw seq is canonical */
 229   int  lpos;                    /* position in list */
 230
 231   /* Allocations. No arrays can exceed the length of their
 232    * appropriate parent (s1 or s2)
 233    */
 234   s1_list = (int *) MallocOrDie (sizeof(int) * strlen(s1));
 235   canons1 = (int *) MallocOrDie (sizeof(int) * strlen(s1));
 236
 237   /* First we use refcoords and k1,k2 to construct an array of 1's
 238    * and 0's, telling us whether s1's raw symbol number i is countable.
 239    * It's countable simply if it's under a canonical column.
 240    */
 241   r1 =  0;
 242   for (col = 0; k1[col] != '\0'; col++)
 243     {
 244       if (! isgap(k1[col]))
 245         {
 246           canons1[r1] = ref[col] ? 1 : 0;
 247           r1++;
 248         }
 249     }
 250
 251   /* Now we can construct the list. We don't count pairs if the sym in s1
 252    * is non-canonical.
 253    * We have to keep separate track of our position in the list (lpos)
 254    * from our positions in the raw sequences (r1,r2)
 255    */
 256   r1 = r2 = lpos = 0;
 257   for (col = 0; s1[col] != '\0'; col++)
 258     {
 259       if (! isgap(s1[col]) && canons1[r1])
 260         {
 261           s1_list[lpos] = isgap(s2[col]) ? -1 : r2;
 262           lpos++;
 263         }
 264
 265       if (! isgap(s1[col]))
 266         r1++;
 267       if (! isgap(s2[col]))
 268         r2++;
 269     }
 270
 271   free(canons1);
 272   *ret_listlen = lpos;
 273   *ret_s1_list = s1_list;
 274   return 1;
 275 }
 276
 277 /* Function: compare_lists()
 278  *
 279  * Purpose:  Given four alignment lists (k1,k2, t1,t2), calculate the
 280  *           alignment score.
 281  *
 282  * Args:     k1   - list of k1's alignment to k2
 283  *           k2   - list of k2's alignment to k1
 284  *           t1   - list of t1's alignment to t2
 285  *           t2   - list of t2's alignment to t2
 286  *           len1 - length of k1, t1 lists (same by definition)
 287  *           len2 - length of k2, t2 lists (same by definition)
 288  *           ret_sc - RETURN: identity score of alignment
 289  *
 290  * Return:   1 on success, 0 on failure.
 291  */
 292 static int
 293 compare_lists(int *k1, int *k2, int *t1, int *t2, int len1, int len2, float *ret_sc)
 294 {
 295   float id;
 296   float tot;
 297   int   i;
 298
 299   id = tot = 0.0;
 300   for (i = 0; i < len1; i++)
 301     {
 302       tot += 1.0;
 303       if (t1[i] == k1[i]) id += 1.0;
 304     }
 305
 306   for ( i = 0; i < len2; i++)
 307     {
 308       tot += 1.0;
 309       if (k2[i] == t2[i]) id += 1.0;
 310     }
 311
 312   *ret_sc = id / tot;
 313   return 1;
 314 }
 315
 316
 317 /* Function: CompareMultAlignments
 318  *
 319  * Purpose:  Invokes pairwise alignment comparison for every possible pair,
 320  *           and returns the average score over all N(N-1) of them or -1.0
 321  *           on an internal failure.
 322  *
 323  *           Can be slow for large N, since it's quadratic.
 324  *
 325  * Args:     kseqs  - trusted multiple alignment
 326  *           tseqs  - test multiple alignment
 327  *           N      - number of sequences
 328  *
 329  * Return:   average identity score, or -1.0 on failure.
 330  */
 331 float
 332 CompareMultAlignments(char **kseqs, char **tseqs, int N)
 333 {
 334   int    i, j;                  /* counters for sequences */
 335   float  score;
 336   float  tot_score = 0.0;
 337                                 /* do all pairwise comparisons */
 338   for (i = 0; i < N; i++)
 339     for (j = i+1; j < N; j++)
 340       {
 341         score = ComparePairAlignments(kseqs[i], kseqs[j], tseqs[i], tseqs[j]);
 342         if (score < 0.0) return -1.0;
 343         tot_score += score;
 344       }
 345   return ((tot_score * 2.0) / ((float) N * ((float) N - 1.0)));
 346 }
 347
 348
 349
 350 /* Function: CompareRefMultAlignments()
 351  *
 352  * Purpose:  Same as above, except an array of reference coords for
 353  *           the canonical positions of the known alignment is also
 354  *           provided.
 355  *
 356  * Args:     ref      : 0..alen-1 array of 1/0 flags, 1 if canon
 357  *           kseqs    : trusted alignment
 358  *           tseqs    : test alignment
 359  *           N        : number of sequences
 360  *
 361  * Return:   average identity score, or -1.0 on failure
 362  */
 363 float
 364 CompareRefMultAlignments(int   *ref, char **kseqs, char **tseqs, int N)
 365 {
 366   int    i, j;                  /* counters for sequences */
 367   float  score;
 368   float  tot_score = 0.0;
 369
 370                                 /* do all pairwise comparisons */
 371   for (i = 0; i < N; i++)
 372     for (j = i+1; j < N; j++)
 373       {
 374         score = CompareRefPairAlignments(ref, kseqs[i], kseqs[j], tseqs[i], tseqs[j]);
 375         if (score < 0.0) return -1.0;
 376         tot_score += score;
 377       }
 378   return ((tot_score * 2.0)/ ((float) N * ((float) N - 1.0)));
 379 }
 380
 381 /* Function: PairwiseIdentity()
 382  *
 383  * Purpose:  Calculate the pairwise fractional identity between
 384  *           two aligned sequences s1 and s2. This is simply
 385  *           (idents / MIN(len1, len2)).
 386  *
 387  *           Note how many ways there are to calculate pairwise identity,
 388  *           because of the variety of choices for the denominator:
 389  *           idents/(idents+mismat) has the disadvantage that artifactual
 390  *             gappy alignments would have high "identities".
 391  *           idents/(AVG|MAX)(len1,len2) both have the disadvantage that
 392  *             alignments of fragments to longer sequences would have
 393  *             artifactually low "identities".
 394  *
 395  *  Original Case sensitive; also, watch out in nucleic acid alignments;
 396  *           U/T RNA/DNA alignments will be counted as mismatches!
 397  *
 398  *  Clustal Omega patch: Case insensitive and T and U are treated the same
 399  */
 400 float
 401 PairwiseIdentity(char *s1, char *s2)
 402 {
 403   int     idents;               /* total identical positions  */
 404   int     len1, len2;           /* lengths of seqs            */
 405   int     x;                    /* position in aligned seqs   */
 406
 407   idents = len1 = len2 = 0;
 408   for (x = 0; s1[x] != '\0' && s2[x] != '\0'; x++)
 409     {
 410 #ifdef CLUSTALO
 411       char c1 = toupper(s1[x]);
 412       char c2 = toupper(s2[x]);
 413       if (c1=='U')
 414           c1 = 'T';
 415       if (c2=='U')
 416           c2 = 'T';
 417
 418       if (!isgap(c1)) {
 419           len1++;
 420           if (c1 == c2)
 421               idents++;
 422       }
 423       if (!isgap(c2))
 424           len2++;
 425 #else
 426       if (!isgap(s1[x])) {
 427         len1++;
 428         if (s1[x] == s2[x]) idents++;
 429       }
 430       if (!isgap(s2[x])) len2++;
 431 #endif
 432     }
 433   if (len2 < len1) len1 = len2;
 434   return (len1 == 0 ? 0.0 : (float) idents / (float) len1);
 435 }
 436
 437
 438
 439 /* Function: AlignmentIdentityBySampling()
 440  * Date:     SRE, Mon Oct 19 14:29:01 1998 [St. Louis]
 441  *
 442  * Purpose:  Estimate and return the average pairwise
 443  *           fractional identity of an alignment,
 444  *           using sampling.
 445  *
 446  *           For use when there's so many sequences that
 447  *           an all vs. all rigorous calculation will
 448  *           take too long.
 449  *
 450  *           Case sensitive!
 451  *
 452  * Args:     aseq       - aligned sequences
 453  *           L          - length of alignment
 454  *           N          - number of seqs in alignment
 455  *           nsample    - number of samples
 456  *
 457  * Returns:  average fractional identity, 0..1.
 458  */
 459 float
 460 AlignmentIdentityBySampling(char **aseq, int L, int N, int nsample)
 461 {
 462   int x, i, j;                  /* counters */
 463   float sum;
 464
 465   if (N < 2) return 1.0;
 466
 467   sum = 0.;
 468   for (x = 0; x < nsample; x++)
 469     {
 470       i = CHOOSE(N);
 471       do { j = CHOOSE(N); } while (j == i); /* make sure j != i */
 472       sum += PairwiseIdentity(aseq[i], aseq[j]);
 473     }
 474   return sum / (float) nsample;
 475 }
 476
 477 /* Function: MajorityRuleConsensus()
 478  * Date:     SRE, Tue Mar  7 15:30:30 2000 [St. Louis]
 479  *
 480  * Purpose:  Given a set of aligned sequences, produce a
 481  *           majority rule consensus sequence. If >50% nonalphabetic
 482  *           (usually meaning gaps) in the column, ignore the column.
 483  *
 484  * Args:     aseq  - aligned sequences, [0..nseq-1][0..alen-1]
 485  *           nseq  - number of sequences
 486  *           alen  - length of alignment
 487  *
 488  * Returns:  ptr to allocated consensus sequence.
 489  *           Caller is responsible for free'ing this.
 490  */
 491 char *
 492 MajorityRuleConsensus(char **aseq, int nseq, int alen)
 493 {
 494   char *cs;                     /* RETURN: consensus sequence */
 495   int count[27];                /* counts for a..z and gaps in a column */
 496   int idx,apos;                 /* counters for seq, column */
 497   int spos;                     /* position in cs */
 498   int x;                        /* counter for characters */
 499   int sym;
 500   int max, bestx;
 501
 502   cs = MallocOrDie(sizeof(char) * (alen+1));
 503
 504   for (spos=0,apos=0; apos < alen; apos++)
 505     {
 506       for (x = 0; x < 27; x++) count[x] = 0;
 507
 508       for (idx = 0; idx < nseq; idx++)
 509         {
 510           if (isalpha(aseq[idx][apos])) {
 511             sym = toupper(aseq[idx][apos]);
 512             count[sym-'A']++;
 513           } else {
 514             count[26]++;
 515           }
 516         }
 517
 518       if ((float) count[26] / (float) nseq <= 0.5) {
 519         max = bestx = -1;
 520         for (x = 0; x < 26; x++)
 521           if (count[x] > max) { max = count[x]; bestx = x; }
 522         cs[spos++] = (char) ('A' + bestx);
 523       }
 524     }
 525   cs[spos] = '\0';
 526   return cs;
 527 }