1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
12 * RCS $Id: aligneval.c,v 1.1.1.1 2005/03/22 08:34:31 cmzmasek Exp $
14 * Comparison of multiple alignments. Three functions are
15 * provided, using subtly different scoring schemes:
16 * CompareMultAlignments() - basic scoring scheme
17 * CompareRefMultAlignments() - only certain "canonical" columns
20 * The similarity measure is a fractional alignment identity averaged
21 * over all sequence pairs. The score for all pairs is:
22 * (identically aligned symbols) / (total aligned columns in
25 * A column c is identically aligned for sequences i, j if:
26 * 1) both i,j have a symbol aligned in column c, and the
27 * same pair of symbols is aligned somewhere in the test
29 * 2) S[i][c] is aligned to a gap in sequence j, and that symbol
30 * is aligned to a gap in the test alignment
34 * The algorithm is as follows:
35 * 1) For each known/test aligned pair of sequences (k1,k2 and t1,t2)
36 * construct a list for each sequence, in which for every
37 * counted symbol we record the raw index of the symbol in
38 * the other sequence that it aligns to, or -1 if it aligns
39 * to a gap or uncounted symbol.
41 * 2) Compare the list for k1 to the list for t1 and count an identity
42 * for each correct alignment.
44 * 3) Repeat 2) for comparing k2 to t2. Note that this means correct sym/sym
45 * alignments count for 2; correct sym/gap alignments count for 1.
47 * 4) The score is (identities from 2 + identities from 3) /
48 * (totals from 2 + totals from 3).
50 * Written originally for koala's ss2 pairwise alignment package.
52 * Sean Eddy, Sun Nov 1 12:45:11 1992
53 * SRE, Thu Jul 29 16:47:18 1993: major revision: all functions replaced by new algorithm
54 * CVS $Id: aligneval.c,v 1.1.1.1 2005/03/22 08:34:31 cmzmasek Exp $
67 static int make_alilist(char *s1, char *s2, int **ret_s1_list, int *ret_listlen);
68 static int make_ref_alilist(int *refcoords, char *k1, char *k2, char *s1, char *s2,
69 int **ret_s1_list, int *ret_listlen);
70 static int compare_lists(int *k1, int *k2, int *t1, int *t2, int len1, int len2, float *ret_sc);
73 /* Function: ComparePairAlignments
75 * Purpose: Calculate and return a number representing how well two different alignments
76 * of a pair of sequences compare. The number is, roughly speaking,
77 * the fraction of columns which are identically aligned.
79 * For all columns c in which either known1[c] or known2[c]
80 * is a non-gap, count an identity if those same symbols are
81 * aligned somewhere in calc1/calc2. The score is identities/total
82 * columns examined. (i.e. fully gapped columns don't count)
84 * more explicitly, identities come from:
85 * both known and test aligned pairs have the same symbol in the first sequence aligned to
86 * a gap in the second sequence;
87 * both known and test aligned pairs have the same symbol in the second sequence
88 * aligned to a gap in the first sequence;
89 * the known alignment has symbols aligned at this column, and the test
90 * alignment aligns the same two symbols.
92 * Args: known1, known2: trusted alignment of two sequences
93 * calc1, calc2: test alignment of two sequences
95 * Return: Returns -1.0 on internal failure.
98 ComparePairAlignments(char *known1, char *known2, char *calc1, char *calc2)
107 if (! make_alilist(calc1, calc2, &tlist1, &len1)) return -1.0;
108 if (! make_alilist(calc2, calc1, &tlist2, &len2)) return -1.0;
109 if (! make_alilist(known1, known2, &klist1, &len1)) return -1.0;
110 if (! make_alilist(known2, known1, &klist2, &len2)) return -1.0;
111 if (! compare_lists(klist1, klist2, tlist1, tlist2, len1, len2, &score)) return -1.0;
122 /* Function: CompareRefPairAlignments()
124 * Same as above, but the only columns that count are the ones
125 * with indices in *refcoord. *refcoord and the known1, known2
126 * pair must be in sync with each other (come from the same
127 * multiple sequence alignment)
129 * Args: ref - 0..alen-1 array of 1 or 0
130 * known1,known2 - trusted alignment
131 * calc1, calc2 - test alignment
133 * Return: the fractional alignment identity on success, -1.0 on failure.
136 CompareRefPairAlignments(int *ref, char *known1, char *known2, char *calc1, char *calc2)
145 if (! make_ref_alilist(ref, known1, known2, calc1, calc2, &tlist1, &len1)) return -1.0;
146 if (! make_ref_alilist(ref, known2, known1, calc2, calc1, &tlist2, &len2)) return -1.0;
147 if (! make_ref_alilist(ref, known1, known2, known1, known2, &klist1, &len1)) return -1.0;
148 if (! make_ref_alilist(ref, known2, known1, known2, known1, &klist2, &len2)) return -1.0;
149 if (! compare_lists(klist1, klist2, tlist1, tlist2, len1, len2, &score)) return -1.0;
158 /* Function: make_alilist()
160 * Purpose: Construct a list (array) mapping the raw symbols of s1
161 * onto the indexes of the aligned symbols in s2 (or -1
162 * for gaps in s2). The list (s1_list) will be of the
163 * length of s1's raw sequence.
165 * Args: s1 - sequence to construct the list for
166 * s2 - sequence s1 is aligned to
167 * ret_s1_list - RETURN: the constructed list (caller must free)
168 * ret_listlen - RETURN: length of the list
170 * Returns: 1 on success, 0 on failure
173 make_alilist(char *s1, char *s2, int **ret_s1_list, int *ret_listlen)
176 int col; /* column position in alignment */
177 int r1, r2; /* raw symbol index at current col in s1, s2 */
179 /* Malloc for s1_list. It can't be longer than s1 itself; we just malloc
180 * for that (and waste a wee bit of space)
182 s1_list = (int *) MallocOrDie (sizeof(int) * strlen(s1));
184 for (col = 0; s1[col] != '\0'; col++)
186 /* symbol in s1? Record what it's aligned to, and bump
189 if (! isgap(s1[col]))
191 s1_list[r1] = isgap(s2[col]) ? -1 : r2;
195 /* symbol in s2? bump the r2 counter
197 if (! isgap(s2[col]))
202 *ret_s1_list = s1_list;
208 /* Function: make_ref_alilist()
210 * Purpose: Construct a list (array) mapping the raw symbols of s1
211 * which are under canonical columns of the ref alignment
212 * onto the indexes of the aligned symbols in s2 (or -1
213 * for gaps in s2 or noncanonical symbols in s2).
215 * Args: ref: - array of indices of canonical coords (1 canonical, 0 non)
216 * k1 - s1's known alignment (w/ respect to refcoords)
217 * k2 - s2's known alignment (w/ respect to refcoords)
218 * s1 - sequence to construct the list for
219 * s2 - sequence s1 is aligned to
220 * ret_s1_list - RETURN: the constructed list (caller must free)
221 * ret_listlen - RETURN: length of the list
223 * Returns: 1 on success, 0 on failure
227 make_ref_alilist(int *ref, char *k1, char *k2,
228 char *s1, char *s2, int **ret_s1_list, int *ret_listlen)
231 int col; /* column position in alignment */
232 int r1, r2; /* raw symbol index at current col in s1, s2 */
233 int *canons1; /* flag array, 1 if position i in s1 raw seq is canonical */
234 int lpos; /* position in list */
236 /* Allocations. No arrays can exceed the length of their
237 * appropriate parent (s1 or s2)
239 s1_list = (int *) MallocOrDie (sizeof(int) * strlen(s1));
240 canons1 = (int *) MallocOrDie (sizeof(int) * strlen(s1));
242 /* First we use refcoords and k1,k2 to construct an array of 1's
243 * and 0's, telling us whether s1's raw symbol number i is countable.
244 * It's countable simply if it's under a canonical column.
247 for (col = 0; k1[col] != '\0'; col++)
249 if (! isgap(k1[col]))
251 canons1[r1] = ref[col] ? 1 : 0;
256 /* Now we can construct the list. We don't count pairs if the sym in s1
258 * We have to keep separate track of our position in the list (lpos)
259 * from our positions in the raw sequences (r1,r2)
262 for (col = 0; s1[col] != '\0'; col++)
264 if (! isgap(s1[col]) && canons1[r1])
266 s1_list[lpos] = isgap(s2[col]) ? -1 : r2;
270 if (! isgap(s1[col]))
272 if (! isgap(s2[col]))
278 *ret_s1_list = s1_list;
282 /* Function: compare_lists()
284 * Purpose: Given four alignment lists (k1,k2, t1,t2), calculate the
287 * Args: k1 - list of k1's alignment to k2
288 * k2 - list of k2's alignment to k1
289 * t1 - list of t1's alignment to t2
290 * t2 - list of t2's alignment to t2
291 * len1 - length of k1, t1 lists (same by definition)
292 * len2 - length of k2, t2 lists (same by definition)
293 * ret_sc - RETURN: identity score of alignment
295 * Return: 1 on success, 0 on failure.
298 compare_lists(int *k1, int *k2, int *t1, int *t2, int len1, int len2, float *ret_sc)
305 for (i = 0; i < len1; i++)
308 if (t1[i] == k1[i]) id += 1.0;
311 for ( i = 0; i < len2; i++)
314 if (k2[i] == t2[i]) id += 1.0;
322 /* Function: CompareMultAlignments
324 * Purpose: Invokes pairwise alignment comparison for every possible pair,
325 * and returns the average score over all N(N-1) of them or -1.0
326 * on an internal failure.
328 * Can be slow for large N, since it's quadratic.
330 * Args: kseqs - trusted multiple alignment
331 * tseqs - test multiple alignment
332 * N - number of sequences
334 * Return: average identity score, or -1.0 on failure.
337 CompareMultAlignments(char **kseqs, char **tseqs, int N)
339 int i, j; /* counters for sequences */
341 float tot_score = 0.0;
342 /* do all pairwise comparisons */
343 for (i = 0; i < N; i++)
344 for (j = i+1; j < N; j++)
346 score = ComparePairAlignments(kseqs[i], kseqs[j], tseqs[i], tseqs[j]);
347 if (score < 0.0) return -1.0;
350 return ((tot_score * 2.0) / ((float) N * ((float) N - 1.0)));
355 /* Function: CompareRefMultAlignments()
357 * Purpose: Same as above, except an array of reference coords for
358 * the canonical positions of the known alignment is also
361 * Args: ref : 0..alen-1 array of 1/0 flags, 1 if canon
362 * kseqs : trusted alignment
363 * tseqs : test alignment
364 * N : number of sequences
366 * Return: average identity score, or -1.0 on failure
369 CompareRefMultAlignments(int *ref, char **kseqs, char **tseqs, int N)
371 int i, j; /* counters for sequences */
373 float tot_score = 0.0;
375 /* do all pairwise comparisons */
376 for (i = 0; i < N; i++)
377 for (j = i+1; j < N; j++)
379 score = CompareRefPairAlignments(ref, kseqs[i], kseqs[j], tseqs[i], tseqs[j]);
380 if (score < 0.0) return -1.0;
383 return ((tot_score * 2.0)/ ((float) N * ((float) N - 1.0)));
386 /* Function: PairwiseIdentity()
388 * Purpose: Calculate the pairwise fractional identity between
389 * two aligned sequences s1 and s2. This is simply
390 * (idents / MIN(len1, len2)).
392 * Note how many ways there are to calculate pairwise identity,
393 * because of the variety of choices for the denominator:
394 * idents/(idents+mismat) has the disadvantage that artifactual
395 * gappy alignments would have high "identities".
396 * idents/(AVG|MAX)(len1,len2) both have the disadvantage that
397 * alignments of fragments to longer sequences would have
398 * artifactually low "identities".
400 * Case sensitive; also, watch out in nucleic acid alignments;
401 * U/T RNA/DNA alignments will be counted as mismatches!
404 PairwiseIdentity(char *s1, char *s2)
406 int idents; /* total identical positions */
407 int len1, len2; /* lengths of seqs */
408 int x; /* position in aligned seqs */
410 idents = len1 = len2 = 0;
411 for (x = 0; s1[x] != '\0' && s2[x] != '\0'; x++)
415 if (s1[x] == s2[x]) idents++;
417 if (!isgap(s2[x])) len2++;
419 if (len2 < len1) len1 = len2;
420 return (len1 == 0 ? 0.0 : (float) idents / (float) len1);
425 /* Function: AlignmentIdentityBySampling()
426 * Date: SRE, Mon Oct 19 14:29:01 1998 [St. Louis]
428 * Purpose: Estimate and return the average pairwise
429 * fractional identity of an alignment,
432 * For use when there's so many sequences that
433 * an all vs. all rigorous calculation will
438 * Args: aseq - aligned sequences
439 * L - length of alignment
440 * N - number of seqs in alignment
441 * nsample - number of samples
443 * Returns: average fractional identity, 0..1.
446 AlignmentIdentityBySampling(char **aseq, int L, int N, int nsample)
448 int x, i, j; /* counters */
451 if (N < 2) return 1.0;
454 for (x = 0; x < nsample; x++)
457 do { j = CHOOSE(N); } while (j == i); /* make sure j != i */
458 sum += PairwiseIdentity(aseq[i], aseq[j]);
460 return sum / (float) nsample;
463 /* Function: MajorityRuleConsensus()
464 * Date: SRE, Tue Mar 7 15:30:30 2000 [St. Louis]
466 * Purpose: Given a set of aligned sequences, produce a
467 * majority rule consensus sequence. If >50% nonalphabetic
468 * (usually meaning gaps) in the column, ignore the column.
470 * Args: aseq - aligned sequences, [0..nseq-1][0..alen-1]
471 * nseq - number of sequences
472 * alen - length of alignment
474 * Returns: ptr to allocated consensus sequence.
475 * Caller is responsible for free'ing this.
478 MajorityRuleConsensus(char **aseq, int nseq, int alen)
480 char *cs; /* RETURN: consensus sequence */
481 int count[27]; /* counts for a..z and gaps in a column */
482 int idx,apos; /* counters for seq, column */
483 int spos; /* position in cs */
484 int x; /* counter for characters */
488 cs = MallocOrDie(sizeof(char) * (alen+1));
490 for (spos=0,apos=0; apos < alen; apos++)
492 for (x = 0; x < 27; x++) count[x] = 0;
494 for (idx = 0; idx < nseq; idx++)
496 if (isalpha(aseq[idx][apos])) {
497 sym = toupper(aseq[idx][apos]);
504 if ((float) count[26] / (float) nseq <= 0.5) {
506 for (x = 0; x < 26; x++)
507 if (count[x] > max) { max = count[x]; bestx = x; }
508 cs[spos++] = (char) ('A' + bestx);