forester/archive/RIO/others/hmmer/squid/aligneval.c

   1 /*****************************************************************
   2  * HMMER - Biological sequence analysis with profile HMMs
   3  * Copyright (C) 1992-1999 Washington University School of Medicine
   4  * All Rights Reserved
   5  *
   6  *     This source code is distributed under the terms of the
   7  *     GNU General Public License. See the files COPYING and LICENSE
   8  *     for details.
   9  *****************************************************************/
  10
  11 /* aligneval.c
  12  * RCS $Id: aligneval.c,v 1.1.1.1 2005/03/22 08:34:31 cmzmasek Exp $
  13  *
  14  * Comparison of multiple alignments. Three functions are
  15  * provided, using subtly different scoring schemes:
  16  *    CompareMultAlignments()    - basic scoring scheme
  17  *    CompareRefMultAlignments() - only certain "canonical" columns
  18  *                                 are scored
  19  *
  20  * The similarity measure is a fractional alignment identity averaged
  21  * over all sequence pairs. The score for all pairs is:
  22  *      (identically aligned symbols) / (total aligned columns in
  23  *      known alignment)
  24  *
  25  * A column c is identically aligned for sequences i, j if:
  26  *    1) both i,j have a symbol aligned in column c, and the
  27  *       same pair of symbols is aligned somewhere in the test
  28  *       alignment
  29  *    2) S[i][c] is aligned to a gap in sequence j, and that symbol
  30  *       is aligned to a gap in the test alignment
  31  *    3) converse of 2)
  32  *
  33  *
  34  * The algorithm is as follows:
  35  *    1) For each known/test aligned pair of sequences (k1,k2 and t1,t2)
  36  *        construct a list for each sequence, in which for every
  37  *        counted symbol we record the raw index of the symbol in
  38  *        the other sequence that it aligns to, or -1 if it aligns
  39  *        to a gap or uncounted symbol.
  40  *
  41  *    2)  Compare the list for k1 to the list for t1 and count an identity
  42  *        for each correct alignment.
  43  *
  44  *    3) Repeat 2) for comparing k2 to t2. Note that this means correct sym/sym
  45  *       alignments count for 2; correct sym/gap alignments count for 1.
  46  *
  47  *    4) The score is (identities from 2 + identities from 3) /
  48  *       (totals from 2 + totals from 3).
  49  *
  50  * Written originally for koala's ss2 pairwise alignment package.
  51  *
  52  * Sean Eddy, Sun Nov  1 12:45:11 1992
  53  * SRE, Thu Jul 29 16:47:18 1993: major revision: all functions replaced by new algorithm
  54  * CVS $Id: aligneval.c,v 1.1.1.1 2005/03/22 08:34:31 cmzmasek Exp $
  55  */
  56
  57
  58 #include <stdio.h>
  59 #include <string.h>
  60 #include <ctype.h>
  61 #include "squid.h"
  62
  63 #ifdef MEMDEBUG
  64 #include "dbmalloc.h"
  65 #endif
  66
  67 static int make_alilist(char *s1, char *s2, int **ret_s1_list, int *ret_listlen);
  68 static int make_ref_alilist(int *refcoords, char *k1, char *k2, char *s1, char *s2,
  69                             int **ret_s1_list, int *ret_listlen);
  70 static int compare_lists(int *k1, int *k2, int *t1, int *t2, int len1, int len2, float *ret_sc);
  71
  72
  73 /* Function: ComparePairAlignments
  74  *
  75  * Purpose:  Calculate and return a number representing how well two different alignments
  76  *           of a pair of sequences compare. The number is, roughly speaking,
  77  *           the fraction of columns which are identically aligned.
  78  *
  79  *           For all columns c in which either known1[c] or known2[c]
  80  *           is a non-gap, count an identity if those same symbols are
  81  *           aligned somewhere in calc1/calc2. The score is identities/total
  82  *           columns examined. (i.e. fully gapped columns don't count)
  83  *
  84  *           more explicitly, identities come from:
  85  *             both known and test aligned pairs have the same symbol in the first sequence aligned to
  86  *               a gap in the second sequence;
  87  *             both known and test aligned pairs have the same symbol in the second sequence
  88  *               aligned to a gap in the first sequence;
  89  *             the known alignment has symbols aligned at this column, and the test
  90  *               alignment aligns the same two symbols.
  91  *
  92  * Args:     known1, known2: trusted alignment of two sequences
  93  *           calc1, calc2:   test alignment of two sequences
  94  *
  95  * Return:   Returns -1.0 on internal failure.
  96  */
  97 float
  98 ComparePairAlignments(char *known1, char *known2, char *calc1, char *calc2)
  99 {
 100   int *klist1;
 101   int *klist2;
 102   int *tlist1;
 103   int *tlist2;
 104   int len1, len2;
 105   float score;
 106
 107   if (! make_alilist(calc1,  calc2,  &tlist1, &len1)) return -1.0;
 108   if (! make_alilist(calc2,  calc1,  &tlist2, &len2)) return -1.0;
 109   if (! make_alilist(known1, known2, &klist1, &len1)) return -1.0;
 110   if (! make_alilist(known2, known1, &klist2, &len2)) return -1.0;
 111   if (! compare_lists(klist1, klist2, tlist1, tlist2, len1, len2, &score)) return -1.0;
 112
 113   free(klist1);
 114   free(klist2);
 115   free(tlist1);
 116   free(tlist2);
 117   return score;
 118 }
 119
 120
 121
 122 /* Function: CompareRefPairAlignments()
 123  *
 124  * Same as above, but the only columns that count are the ones
 125  * with indices in *refcoord. *refcoord and the known1, known2
 126  * pair must be in sync with each other (come from the same
 127  * multiple sequence alignment)
 128  *
 129  * Args:     ref           - 0..alen-1 array of 1 or 0
 130  *           known1,known2 - trusted alignment
 131  *           calc1, calc2  - test alignment
 132  *
 133  * Return:  the fractional alignment identity on success, -1.0 on failure.
 134  */
 135 float
 136 CompareRefPairAlignments(int  *ref, char *known1, char *known2, char *calc1, char *calc2)
 137 {
 138   int *klist1;
 139   int *klist2;
 140   int *tlist1;
 141   int *tlist2;
 142   int len1, len2;
 143   float score;
 144
 145   if (! make_ref_alilist(ref, known1, known2, calc1,  calc2,  &tlist1, &len1)) return -1.0;
 146   if (! make_ref_alilist(ref, known2, known1, calc2,  calc1,  &tlist2, &len2)) return -1.0;
 147   if (! make_ref_alilist(ref, known1, known2, known1, known2, &klist1, &len1)) return -1.0;
 148   if (! make_ref_alilist(ref, known2, known1, known2, known1, &klist2, &len2)) return -1.0;
 149   if (! compare_lists(klist1, klist2, tlist1, tlist2, len1, len2, &score)) return -1.0;
 150
 151   free(klist1);
 152   free(klist2);
 153   free(tlist1);
 154   free(tlist2);
 155   return score;
 156 }
 157
 158 /* Function: make_alilist()
 159  *
 160  * Purpose:  Construct a list (array) mapping the raw symbols of s1
 161  *           onto the indexes of the aligned symbols in s2 (or -1
 162  *           for gaps in s2). The list (s1_list) will be of the
 163  *           length of s1's raw sequence.
 164  *
 165  * Args:     s1          - sequence to construct the list for
 166  *           s2          - sequence s1 is aligned to
 167  *           ret_s1_list - RETURN: the constructed list (caller must free)
 168  *           ret_listlen - RETURN: length of the list
 169  *
 170  * Returns:  1 on success, 0 on failure
 171  */
 172 static int
 173 make_alilist(char *s1, char *s2, int **ret_s1_list, int *ret_listlen)
 174 {
 175   int *s1_list;
 176   int  col;                     /* column position in alignment */
 177   int  r1, r2;                  /* raw symbol index at current col in s1, s2 */
 178
 179   /* Malloc for s1_list. It can't be longer than s1 itself; we just malloc
 180    * for that (and waste a wee bit of space)
 181    */
 182   s1_list = (int *) MallocOrDie (sizeof(int) * strlen(s1));
 183   r1 = r2 = 0;
 184   for (col = 0; s1[col] != '\0'; col++)
 185     {
 186       /* symbol in s1? Record what it's aligned to, and bump
 187        * the r1 counter.
 188        */
 189       if (! isgap(s1[col]))
 190         {
 191           s1_list[r1] = isgap(s2[col]) ? -1 : r2;
 192           r1++;
 193         }
 194
 195       /* symbol in s2? bump the r2 counter
 196        */
 197       if (! isgap(s2[col]))
 198         r2++;
 199     }
 200
 201   *ret_listlen = r1;
 202   *ret_s1_list = s1_list;
 203   return 1;
 204 }
 205
 206
 207
 208 /* Function: make_ref_alilist()
 209  *
 210  * Purpose:  Construct a list (array) mapping the raw symbols of s1
 211  *           which are under canonical columns of the ref alignment
 212  *           onto the indexes of the aligned symbols in s2 (or -1
 213  *           for gaps in s2 or noncanonical symbols in s2).
 214  *
 215  * Args:     ref:        - array of indices of canonical coords (1 canonical, 0 non)
 216  *           k1          - s1's known alignment (w/ respect to refcoords)
 217  *           k2          - s2's known alignment (w/ respect to refcoords)
 218  *           s1          - sequence to construct the list for
 219  *           s2          - sequence s1 is aligned to
 220  *           ret_s1_list - RETURN: the constructed list (caller must free)
 221  *           ret_listlen - RETURN: length of the list
 222  *
 223  * Returns:  1 on success, 0 on failure
 224  */
 225 /*ARGSUSED*/
 226 static int
 227 make_ref_alilist(int *ref, char *k1, char *k2,
 228                  char *s1, char *s2, int **ret_s1_list, int *ret_listlen)
 229 {
 230   int *s1_list;
 231   int  col;                     /* column position in alignment */
 232   int  r1, r2;                  /* raw symbol index at current col in s1, s2 */
 233   int *canons1;                 /* flag array, 1 if position i in s1 raw seq is canonical */
 234   int  lpos;                    /* position in list */
 235
 236   /* Allocations. No arrays can exceed the length of their
 237    * appropriate parent (s1 or s2)
 238    */
 239   s1_list = (int *) MallocOrDie (sizeof(int) * strlen(s1));
 240   canons1 = (int *) MallocOrDie (sizeof(int) * strlen(s1));
 241
 242   /* First we use refcoords and k1,k2 to construct an array of 1's
 243    * and 0's, telling us whether s1's raw symbol number i is countable.
 244    * It's countable simply if it's under a canonical column.
 245    */
 246   r1 =  0;
 247   for (col = 0; k1[col] != '\0'; col++)
 248     {
 249       if (! isgap(k1[col]))
 250         {
 251           canons1[r1] = ref[col] ? 1 : 0;
 252           r1++;
 253         }
 254     }
 255
 256   /* Now we can construct the list. We don't count pairs if the sym in s1
 257    * is non-canonical.
 258    * We have to keep separate track of our position in the list (lpos)
 259    * from our positions in the raw sequences (r1,r2)
 260    */
 261   r1 = r2 = lpos = 0;
 262   for (col = 0; s1[col] != '\0'; col++)
 263     {
 264       if (! isgap(s1[col]) && canons1[r1])
 265         {
 266           s1_list[lpos] = isgap(s2[col]) ? -1 : r2;
 267           lpos++;
 268         }
 269
 270       if (! isgap(s1[col]))
 271         r1++;
 272       if (! isgap(s2[col]))
 273         r2++;
 274     }
 275
 276   free(canons1);
 277   *ret_listlen = lpos;
 278   *ret_s1_list = s1_list;
 279   return 1;
 280 }
 281
 282 /* Function: compare_lists()
 283  *
 284  * Purpose:  Given four alignment lists (k1,k2, t1,t2), calculate the
 285  *           alignment score.
 286  *
 287  * Args:     k1   - list of k1's alignment to k2
 288  *           k2   - list of k2's alignment to k1
 289  *           t1   - list of t1's alignment to t2
 290  *           t2   - list of t2's alignment to t2
 291  *           len1 - length of k1, t1 lists (same by definition)
 292  *           len2 - length of k2, t2 lists (same by definition)
 293  *           ret_sc - RETURN: identity score of alignment
 294  *
 295  * Return:   1 on success, 0 on failure.
 296  */
 297 static int
 298 compare_lists(int *k1, int *k2, int *t1, int *t2, int len1, int len2, float *ret_sc)
 299 {
 300   float id;
 301   float tot;
 302   int   i;
 303
 304   id = tot = 0.0;
 305   for (i = 0; i < len1; i++)
 306     {
 307       tot += 1.0;
 308       if (t1[i] == k1[i]) id += 1.0;
 309     }
 310
 311   for ( i = 0; i < len2; i++)
 312     {
 313       tot += 1.0;
 314       if (k2[i] == t2[i]) id += 1.0;
 315     }
 316
 317   *ret_sc = id / tot;
 318   return 1;
 319 }
 320
 321
 322 /* Function: CompareMultAlignments
 323  *
 324  * Purpose:  Invokes pairwise alignment comparison for every possible pair,
 325  *           and returns the average score over all N(N-1) of them or -1.0
 326  *           on an internal failure.
 327  *
 328  *           Can be slow for large N, since it's quadratic.
 329  *
 330  * Args:     kseqs  - trusted multiple alignment
 331  *           tseqs  - test multiple alignment
 332  *           N      - number of sequences
 333  *
 334  * Return:   average identity score, or -1.0 on failure.
 335  */
 336 float
 337 CompareMultAlignments(char **kseqs, char **tseqs, int N)
 338 {
 339   int    i, j;                  /* counters for sequences */
 340   float  score;
 341   float  tot_score = 0.0;
 342                                 /* do all pairwise comparisons */
 343   for (i = 0; i < N; i++)
 344     for (j = i+1; j < N; j++)
 345       {
 346         score = ComparePairAlignments(kseqs[i], kseqs[j], tseqs[i], tseqs[j]);
 347         if (score < 0.0) return -1.0;
 348         tot_score += score;
 349       }
 350   return ((tot_score * 2.0) / ((float) N * ((float) N - 1.0)));
 351 }
 352
 353
 354
 355 /* Function: CompareRefMultAlignments()
 356  *
 357  * Purpose:  Same as above, except an array of reference coords for
 358  *           the canonical positions of the known alignment is also
 359  *           provided.
 360  *
 361  * Args:     ref      : 0..alen-1 array of 1/0 flags, 1 if canon
 362  *           kseqs    : trusted alignment
 363  *           tseqs    : test alignment
 364  *           N        : number of sequences
 365  *
 366  * Return:   average identity score, or -1.0 on failure
 367  */
 368 float
 369 CompareRefMultAlignments(int   *ref, char **kseqs, char **tseqs, int N)
 370 {
 371   int    i, j;                  /* counters for sequences */
 372   float  score;
 373   float  tot_score = 0.0;
 374
 375                                 /* do all pairwise comparisons */
 376   for (i = 0; i < N; i++)
 377     for (j = i+1; j < N; j++)
 378       {
 379         score = CompareRefPairAlignments(ref, kseqs[i], kseqs[j], tseqs[i], tseqs[j]);
 380         if (score < 0.0) return -1.0;
 381         tot_score += score;
 382       }
 383   return ((tot_score * 2.0)/ ((float) N * ((float) N - 1.0)));
 384 }
 385
 386 /* Function: PairwiseIdentity()
 387  *
 388  * Purpose:  Calculate the pairwise fractional identity between
 389  *           two aligned sequences s1 and s2. This is simply
 390  *           (idents / MIN(len1, len2)).
 391  *
 392  *           Note how many ways there are to calculate pairwise identity,
 393  *           because of the variety of choices for the denominator:
 394  *           idents/(idents+mismat) has the disadvantage that artifactual
 395  *             gappy alignments would have high "identities".
 396  *           idents/(AVG|MAX)(len1,len2) both have the disadvantage that
 397  *             alignments of fragments to longer sequences would have
 398  *             artifactually low "identities".
 399  *
 400  *           Case sensitive; also, watch out in nucleic acid alignments;
 401  *           U/T RNA/DNA alignments will be counted as mismatches!
 402  */
 403 float
 404 PairwiseIdentity(char *s1, char *s2)
 405 {
 406   int     idents;               /* total identical positions  */
 407   int     len1, len2;           /* lengths of seqs            */
 408   int     x;                    /* position in aligned seqs   */
 409
 410   idents = len1 = len2 = 0;
 411   for (x = 0; s1[x] != '\0' && s2[x] != '\0'; x++)
 412     {
 413       if (!isgap(s1[x])) {
 414         len1++;
 415         if (s1[x] == s2[x]) idents++;
 416       }
 417       if (!isgap(s2[x])) len2++;
 418     }
 419   if (len2 < len1) len1 = len2;
 420   return (len1 == 0 ? 0.0 : (float) idents / (float) len1);
 421 }
 422
 423
 424
 425 /* Function: AlignmentIdentityBySampling()
 426  * Date:     SRE, Mon Oct 19 14:29:01 1998 [St. Louis]
 427  *
 428  * Purpose:  Estimate and return the average pairwise
 429  *           fractional identity of an alignment,
 430  *           using sampling.
 431  *
 432  *           For use when there's so many sequences that
 433  *           an all vs. all rigorous calculation will
 434  *           take too long.
 435  *
 436  *           Case sensitive!
 437  *
 438  * Args:     aseq       - aligned sequences
 439  *           L          - length of alignment
 440  *           N          - number of seqs in alignment
 441  *           nsample    - number of samples
 442  *
 443  * Returns:  average fractional identity, 0..1.
 444  */
 445 float
 446 AlignmentIdentityBySampling(char **aseq, int L, int N, int nsample)
 447 {
 448   int x, i, j;                  /* counters */
 449   float sum;
 450
 451   if (N < 2) return 1.0;
 452
 453   sum = 0.;
 454   for (x = 0; x < nsample; x++)
 455     {
 456       i = CHOOSE(N);
 457       do { j = CHOOSE(N); } while (j == i); /* make sure j != i */
 458       sum += PairwiseIdentity(aseq[i], aseq[j]);
 459     }
 460   return sum / (float) nsample;
 461 }
 462
 463 /* Function: MajorityRuleConsensus()
 464  * Date:     SRE, Tue Mar  7 15:30:30 2000 [St. Louis]
 465  *
 466  * Purpose:  Given a set of aligned sequences, produce a
 467  *           majority rule consensus sequence. If >50% nonalphabetic
 468  *           (usually meaning gaps) in the column, ignore the column.
 469  *
 470  * Args:     aseq  - aligned sequences, [0..nseq-1][0..alen-1]
 471  *           nseq  - number of sequences
 472  *           alen  - length of alignment
 473  *
 474  * Returns:  ptr to allocated consensus sequence.
 475  *           Caller is responsible for free'ing this.
 476  */
 477 char *
 478 MajorityRuleConsensus(char **aseq, int nseq, int alen)
 479 {
 480   char *cs;                     /* RETURN: consensus sequence */
 481   int count[27];                /* counts for a..z and gaps in a column */
 482   int idx,apos;                 /* counters for seq, column */
 483   int spos;                     /* position in cs */
 484   int x;                        /* counter for characters */
 485   int sym;
 486   int max, bestx;
 487
 488   cs = MallocOrDie(sizeof(char) * (alen+1));
 489
 490   for (spos=0,apos=0; apos < alen; apos++)
 491     {
 492       for (x = 0; x < 27; x++) count[x] = 0;
 493
 494       for (idx = 0; idx < nseq; idx++)
 495         {
 496           if (isalpha(aseq[idx][apos])) {
 497             sym = toupper(aseq[idx][apos]);
 498             count[sym-'A']++;
 499           } else {
 500             count[26]++;
 501           }
 502         }
 503
 504       if ((float) count[26] / (float) nseq <= 0.5) {
 505         max = bestx = -1;
 506         for (x = 0; x < 26; x++)
 507           if (count[x] > max) { max = count[x]; bestx = x; }
 508         cs[spos++] = (char) ('A' + bestx);
 509       }
 510     }
 511   cs[spos] = '\0';
 512   return cs;
 513 }