1 /*****************************************************************
2 * SQUID - a library of functions for biological sequence analysis
3 * Copyright (C) 1992-2002 Washington University School of Medicine
5 * This source code is freely distributed under the terms of the
6 * GNU General Public License. See the files COPYRIGHT and LICENSE
8 *****************************************************************/
11 * SRE, Mon Jul 12 11:57:37 1993
12 * RCS $Id: alignio.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: alignio.c,v 1.11 2002/10/09 14:26:09 eddy Exp)
14 * Input/output of sequence alignments.
22 #include "sre_random.h"
24 /* Function: AllocAlignment()
26 * Purpose: Allocate space for an alignment, given the number
27 * of sequences and the alignment length in columns.
29 * Args: nseq - number of sequences
30 * alen - width of alignment
31 * ret_aseq - RETURN: alignment itself
32 * ainfo - RETURN: other info associated with alignment
35 * aseq, ainfo free'd by caller: FreeAlignment(aseq, &ainfo).
36 * note that ainfo itself is alloc'ed in caller, usually
37 * just by a "AINFO ainfo" definition.
40 AllocAlignment(int nseq, int alen, char ***ret_aseq, AINFO *ainfo)
47 aseq = (char **) MallocOrDie (sizeof(char *) * nseq);
48 for (idx = 0; idx < nseq; idx++)
49 aseq[idx] = (char *) MallocOrDie (sizeof(char) * (alen+1));
54 ainfo->wgt = (float *) MallocOrDie (sizeof(float) * nseq);
55 FSet(ainfo->wgt, nseq, 1.0);
57 ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO) * nseq);
58 for (idx = 0; idx < nseq; idx++)
59 ainfo->sqinfo[idx].flags = 0;
65 /* Function: InitAinfo()
66 * Date: SRE, Tue Jan 19 10:16:02 1999 [St. Louis]
68 * Purpose: Initialize the fields in ainfo structure to
69 * default (null) values. Does nothing with
70 * fields that are dependent on nseq or alen.
72 * Args: ainfo - optional info structure for an alignment
74 * Returns: (void). ainfo is modified.
77 InitAinfo(AINFO *ainfo)
87 ainfo->tc1 = ainfo->tc2 = 0.0;
88 ainfo->nc1 = ainfo->nc2 = 0.0;
89 ainfo->ga1 = ainfo->ga2 = 0.0;
93 /* Function: FreeAlignment()
95 * Purpose: Free the space allocated to alignment, names, and optional
98 * Args: aseqs - sequence alignment
99 * ainfo - associated alignment data.
102 FreeAlignment(char **aseqs, AINFO *ainfo)
106 for (i = 0; i < ainfo->nseq; i++)
108 if (ainfo->sqinfo[i].flags & SQINFO_SS) free(ainfo->sqinfo[i].ss);
109 if (ainfo->sqinfo[i].flags & SQINFO_SA) free(ainfo->sqinfo[i].sa);
111 if (ainfo->cs != NULL) free(ainfo->cs);
112 if (ainfo->rf != NULL) free(ainfo->rf);
113 if (ainfo->name != NULL) free(ainfo->name);
114 if (ainfo->desc != NULL) free(ainfo->desc);
115 if (ainfo->acc != NULL) free(ainfo->acc);
116 if (ainfo->au != NULL) free(ainfo->au);
120 Free2DArray((void **) aseqs, ainfo->nseq);
125 /* Function: SAMizeAlignment()
126 * Date: SRE, Tue Jun 30 09:49:40 1998 [St. Louis]
128 * Purpose: Make a "best effort" attempt to convert an alignment
129 * to SAM gap format: - in delete col, . in insert col.
130 * Only works if alignment adheres to SAM's upper/lower
131 * case convention, which is true for instance of old
134 * Args: aseq - alignment to convert
135 * nseq - number of seqs in alignment
136 * alen - length of alignment
141 SAMizeAlignment(char **aseq, int nseq, int alen)
143 int col; /* counter for aligned columns */
144 int i; /* counter for seqs */
145 int sawlower, sawupper, sawgap;
148 for (col = 0; col < alen; col++)
150 sawlower = sawupper = sawgap = 0;
151 /* pass 1: do we see only upper or lower? */
152 for (i = 0; i < nseq; i++)
154 if (isgap(aseq[i][col])) { sawgap = 1; continue; }
155 if (isupper((int) aseq[i][col])) { sawupper = 1; continue; }
156 if (islower((int) aseq[i][col])) sawlower = 1;
158 /* select gap character for column */
159 gapchar = '-'; /* default */
160 if (sawlower && ! sawupper) gapchar = '.';
162 /* pass 2: set gap char */
163 for (i = 0; i < nseq; i++)
164 if (isgap(aseq[i][col])) aseq[i][col] = gapchar;
169 /* Function: SAMizeAlignmentByGapFrac()
170 * Date: SRE, Tue Jun 30 10:58:38 1998 [St. Louis]
172 * Purpose: Convert an alignment to SAM's gap and case
173 * conventions, using gap fraction in a column
174 * to choose match versus insert columns. In match columns,
175 * residues are upper case and gaps are '-'.
176 * In insert columns, residues are lower case and
179 * Args: aseq - aligned sequences
180 * nseq - number of sequences
181 * alen - length of alignment
182 * maxgap - if more gaps than this fraction, column is insert.
184 * Returns: (void) Characters in aseq may be altered.
187 SAMizeAlignmentByGapFrac(char **aseq, int nseq, int alen, float maxgap)
189 int apos; /* counter over columns */
190 int idx; /* counter over sequences */
191 int ngap; /* number of gaps seen */
193 for (apos = 0; apos < alen; apos++)
197 for (idx = 0; idx < nseq; idx++)
198 if (isgap(aseq[idx][apos])) ngap++;
200 /* convert to SAM conventions */
201 if ((float) ngap / (float) nseq > maxgap)
202 { /* insert column */
203 for (idx = 0; idx < nseq; idx++)
204 if (isgap(aseq[idx][apos])) aseq[idx][apos] = '.';
205 else aseq[idx][apos] = (char) tolower((int) aseq[idx][apos]);
209 for (idx = 0; idx < nseq; idx++)
210 if (isgap(aseq[idx][apos])) aseq[idx][apos] = '-';
211 else aseq[idx][apos] = (char) toupper((int) aseq[idx][apos]);
219 /* Function: MakeAlignedString()
221 * Purpose: Given a raw string of some type (secondary structure, say),
222 * align it to a given aseq by putting gaps wherever the
225 * Args: aseq: template for alignment
226 * alen: length of aseq
227 * ss: raw string to align to aseq
228 * ret_s: RETURN: aligned ss
230 * Return: 1 on success, 0 on failure (and squid_errno is set.)
231 * ret_ss is malloc'ed here and must be free'd by caller.
234 MakeAlignedString(char *aseq, int alen, char *ss, char **ret_s)
239 new = (char *) MallocOrDie ((alen+1) * sizeof(char));
240 for (apos = rpos = 0; apos < alen; apos++)
241 if (! isgap(aseq[apos]))
243 new[apos] = ss[rpos];
250 if (rpos != strlen(ss))
251 { squid_errno = SQERR_PARAMETER; free(new); return 0; }
257 /* Function: MakeDealignedString()
259 * Purpose: Given an aligned string of some type (either sequence or
260 * secondary structure, for instance), dealign it relative
261 * to a given aseq. Return a ptr to the new string.
263 * Args: aseq : template alignment
264 * alen : length of aseq
265 * ss: : string to make dealigned copy of; same length as aseq
266 * ret_s : RETURN: dealigned copy of ss
268 * Return: 1 on success, 0 on failure (and squid_errno is set)
269 * ret_s is alloc'ed here and must be freed by caller
272 MakeDealignedString(char *aseq, int alen, char *ss, char **ret_s)
277 new = (char *) MallocOrDie ((alen+1) * sizeof(char));
278 for (apos = rpos = 0; apos < alen; apos++)
279 if (! isgap(aseq[apos]))
281 new[rpos] = ss[apos];
285 if (alen != strlen(ss))
286 { squid_errno = SQERR_PARAMETER; free(new); return 0; }
292 /* Function: DealignedLength()
294 * Purpose: Count the number of non-gap symbols in seq.
295 * (i.e. find the length of the unaligned sequence)
297 * Args: aseq - aligned sequence to count symbols in, \0 terminated
299 * Return: raw length of seq.
302 DealignedLength(char *aseq)
305 for (rlen = 0; *aseq; aseq++)
306 if (! isgap(*aseq)) rlen++;
311 /* Function: WritePairwiseAlignment()
313 * Purpose: Write a nice formatted pairwise alignment out,
314 * with a BLAST-style middle line showing identities
315 * as themselves (single letter) and conservative
318 * Args: ofp - open fp to write to (stdout, perhaps)
319 * aseq1, aseq2 - alignments to write (not necessarily
320 * flushed right with gaps)
321 * name1, name2 - names of sequences
322 * spos1, spos2 - starting position in each (raw) sequence
323 * pam - PAM matrix; positive values define
324 * conservative changes
325 * indent - how many extra spaces to print on left
327 * Return: 1 on success, 0 on failure
330 WritePairwiseAlignment(FILE *ofp,
331 char *aseq1, char *name1, int spos1,
332 char *aseq2, char *name2, int spos2,
333 int **pam, int indent)
335 char sname1[11]; /* shortened name */
337 int still_going; /* True if writing another block */
338 char buf1[61]; /* buffer for writing seq1; CPL+1*/
339 char bufmid[61]; /* buffer for writing consensus */
341 char *s1, *s2; /* ptrs into each sequence */
342 int count1, count2; /* number of symbols we're writing */
343 int rpos1, rpos2; /* position in raw seqs */
344 int rawcount1, rawcount2; /* number of nongap symbols written */
347 strncpy(sname1, name1, 10);
349 strtok(sname1, WHITESPACE);
351 strncpy(sname2, name2, 10);
353 strtok(sname2, WHITESPACE);
365 /* get next line's worth from both */
366 strncpy(buf1, s1, 60); buf1[60] = '\0';
367 strncpy(buf2, s2, 60); buf2[60] = '\0';
368 count1 = strlen(buf1);
369 count2 = strlen(buf2);
371 /* is there still more to go? */
372 if ((count1 == 60 && s1[60] != '\0') ||
373 (count2 == 60 && s2[60] != '\0'))
376 /* shift seq ptrs by a line */
380 /* assemble the consensus line */
381 for (apos = 0; apos < count1 && apos < count2; apos++)
383 if (!isgap(buf1[apos]) && !isgap(buf2[apos]))
385 if (buf1[apos] == buf2[apos])
386 bufmid[apos] = buf1[apos];
387 else if (pam[buf1[apos] - 'A'][buf2[apos] - 'A'] > 0)
398 for (apos = 0; apos < count1; apos++)
399 if (!isgap(buf1[apos])) rawcount1++;
402 for (apos = 0; apos < count2; apos++)
403 if (!isgap(buf2[apos])) rawcount2++;
405 (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "",
406 sname1, rpos1, buf1, rpos1 + rawcount1 -1);
407 (void) fprintf(ofp, "%*s %s\n", indent, "",
409 (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "",
410 sname2, rpos2, buf2, rpos2 + rawcount2 -1);
411 (void) fprintf(ofp, "\n");
421 /* Function: MingapAlignment()
423 * Purpose: Remove all-gap columns from a multiple sequence alignment
424 * and its associated data. The alignment is assumed to be
425 * flushed (all aseqs the same length).
428 MingapAlignment(char **aseqs, AINFO *ainfo)
430 int apos; /* position in original alignment */
431 int mpos; /* position in new alignment */
434 /* We overwrite aseqs, using its allocated memory.
436 for (apos = 0, mpos = 0; aseqs[0][apos] != '\0'; apos++)
438 /* check for all-gap in column */
439 for (idx = 0; idx < ainfo->nseq; idx++)
440 if (! isgap(aseqs[idx][apos]))
442 if (idx == ainfo->nseq) continue;
444 /* shift alignment and ainfo */
447 for (idx = 0; idx < ainfo->nseq; idx++)
448 aseqs[idx][mpos] = aseqs[idx][apos];
450 if (ainfo->cs != NULL) ainfo->cs[mpos] = ainfo->cs[apos];
451 if (ainfo->rf != NULL) ainfo->rf[mpos] = ainfo->rf[apos];
455 /* null terminate everything */
456 for (idx = 0; idx < ainfo->nseq; idx++)
457 aseqs[idx][mpos] = '\0';
458 ainfo->alen = mpos; /* set new length */
459 if (ainfo->cs != NULL) ainfo->cs[mpos] = '\0';
460 if (ainfo->rf != NULL) ainfo->rf[mpos] = '\0';
466 /* Function: RandomAlignment()
468 * Purpose: Create a random alignment from raw sequences.
470 * Ideally, we would like to sample an alignment from the
471 * space of possible alignments according to its probability,
472 * given a prior probability distribution for alignments.
473 * I don't see how to describe such a distribution, let alone
476 * This is a rough approximation that tries to capture some
477 * desired properties. We assume the alignment is generated
478 * by a simple HMM composed of match and insert states.
479 * Given parameters (pop, pex) for the probability of opening
480 * and extending an insertion, we can find the expected number
481 * of match states, M, in the underlying model for each sequence.
482 * We use an average M taken over all the sequences (this is
483 * an approximation. The expectation of M given all the sequence
484 * lengths is a nasty-looking summation.)
486 * M = len / ( 1 + pop ( 1 + 1/ (1-pex) ) )
488 * Then, we assign positions in each raw sequence onto the M match
489 * states and M+1 insert states of this "HMM", by rolling random
490 * numbers and inserting the (rlen-M) inserted positions randomly
491 * into the insert slots, taking into account the relative probability
492 * of open vs. extend.
494 * The resulting alignment has two desired properties: insertions
495 * tend to follow the HMM-like exponential distribution, and
496 * the "sparseness" of the alignment is controllable through
499 * Args: rseqs - raw sequences to "align", 0..nseq-1
500 * sqinfo - array of 0..nseq-1 info structures for the sequences
501 * nseq - number of sequences
502 * pop - probability to open insertion (0<pop<1)
503 * pex - probability to extend insertion (0<pex<1)
504 * ret_aseqs - RETURN: alignment (flushed)
505 * ainfo - fill in: alignment info
507 * Return: 1 on success, 0 on failure. Sets squid_errno to indicate cause
511 RandomAlignment(char **rseqs, SQINFO *sqinfo, int nseq, float pop, float pex,
512 char ***ret_aseqs, AINFO *ainfo)
514 char **aseqs; /* RETURN: alignment */
515 int alen; /* length of alignment */
516 int *rlen; /* lengths of each raw sequence */
517 int M; /* length of "model" */
518 int **ins; /* insertion counts, 0..nseq-1 by 0..M */
519 int *master_ins; /* max insertion counts, 0..M */
525 /* calculate expected length of model, M
527 rlen = (int *) MallocOrDie (sizeof(int) * nseq);
530 for (idx = 0; idx < nseq; idx++)
532 rlen[idx] = strlen(rseqs[idx]);
534 minlen = (rlen[idx] < minlen) ? rlen[idx] : minlen;
536 M = (int) ((float) M / (1.0 + pop * (1.0 + 1.0 / (1.0 - pex))));
538 if (M > minlen) M = minlen;
540 /* make arrays that count insertions in M+1 possible insert states
542 ins = (int **) MallocOrDie (sizeof(int *) * nseq);
543 master_ins = (int *) MallocOrDie (sizeof(int) * (M+1));
544 for (idx = 0; idx < nseq; idx++)
546 ins[idx] = (int *) MallocOrDie (sizeof(int) * (M+1));
547 for (rpos = 0; rpos <= M; rpos++)
551 pop = pop / (pop+pex);
553 /* make insertions for individual sequences */
554 for (idx = 0; idx < nseq; idx++)
557 for (rpos = 0; rpos < rlen[idx]-M; rpos++)
559 if (sre_random() < pop || apos == -1) /* open insertion */
560 apos = CHOOSE(M+1); /* choose 0..M */
564 /* calculate master_ins, max inserts */
566 for (apos = 0; apos <= M; apos++)
568 master_ins[apos] = 0;
569 for (idx = 0; idx < nseq; idx++)
570 if (ins[idx][apos] > master_ins[apos])
571 master_ins[apos] = ins[idx][apos];
572 alen += master_ins[apos];
576 /* Now, construct alignment
578 aseqs = (char **) MallocOrDie (sizeof (char *) * nseq);
579 for (idx = 0; idx < nseq; idx++)
580 aseqs[idx] = (char *) MallocOrDie (sizeof(char) * (alen+1));
581 for (idx = 0; idx < nseq; idx++)
585 for (statepos = 0; statepos <= M; statepos++)
587 for (count = 0; count < ins[idx][statepos]; count++)
588 aseqs[idx][apos++] = rseqs[idx][rpos++];
589 for (; count < master_ins[statepos]; count++)
590 aseqs[idx][apos++] = ' ';
593 aseqs[idx][apos++] = rseqs[idx][rpos++];
595 aseqs[idx][alen] = '\0';
600 ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO) * nseq);
601 for (idx = 0; idx < nseq; idx++)
602 SeqinfoCopy(&(ainfo->sqinfo[idx]), &(sqinfo[idx]));
606 Free2DArray((void **) ins, nseq);
611 /* Function: AlignmentHomogenousGapsym()
612 * Date: SRE, Sun Mar 19 19:37:12 2000 [wren, St. Louis]
614 * Purpose: Sometimes we've got to convert alignments to
615 * a lowest common denominator, and we need
616 * a single specific gap character -- for example,
617 * PSI-BLAST blastpgp -B takes a very simplistic
618 * alignment input format which appears to only
619 * allow '-' as a gap symbol.
621 * Anything matching the isgap() macro is
624 * Args: aseq - aligned character strings, [0..nseq-1][0..alen-1]
625 * nseq - number of aligned strings
626 * alen - length of alignment
627 * gapsym - character to use for gaps.
629 * Returns: void ("never fails")
632 AlignmentHomogenousGapsym(char **aseq, int nseq, int alen, char gapsym)
636 for (i = 0; i < nseq; i++)
637 for (apos = 0; apos < alen; apos++)
638 if (isgap(aseq[i][apos])) aseq[i][apos] = gapsym;