1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
12 * SRE, Mon Jul 12 11:57:37 1993
13 * RCS $Id: alignio.c,v 1.1.1.1 2005/03/22 08:34:27 cmzmasek Exp $
15 * Input/output of sequence alignments.
28 /* Function: AllocAlignment()
30 * Purpose: Allocate space for an alignment, given the number
31 * of sequences and the alignment length in columns.
33 * Args: nseq - number of sequences
34 * alen - width of alignment
35 * ret_aseq - RETURN: alignment itself
36 * ainfo - RETURN: other info associated with alignment
39 * aseq, ainfo free'd by caller: FreeAlignment(aseq, &ainfo).
40 * note that ainfo itself is alloc'ed in caller, usually
41 * just by a "AINFO ainfo" definition.
44 AllocAlignment(int nseq, int alen, char ***ret_aseq, AINFO *ainfo)
51 aseq = (char **) MallocOrDie (sizeof(char *) * nseq);
52 for (idx = 0; idx < nseq; idx++)
53 aseq[idx] = (char *) MallocOrDie (sizeof(char) * (alen+1));
58 ainfo->wgt = (float *) MallocOrDie (sizeof(float) * nseq);
59 FSet(ainfo->wgt, nseq, 1.0);
61 ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO) * nseq);
62 for (idx = 0; idx < nseq; idx++)
63 ainfo->sqinfo[idx].flags = 0;
69 /* Function: InitAinfo()
70 * Date: SRE, Tue Jan 19 10:16:02 1999 [St. Louis]
72 * Purpose: Initialize the fields in ainfo structure to
73 * default (null) values. Does nothing with
74 * fields that are dependent on nseq or alen.
76 * Args: ainfo - optional info structure for an alignment
78 * Returns: (void). ainfo is modified.
81 InitAinfo(AINFO *ainfo)
91 ainfo->tc1 = ainfo->tc2 = 0.0;
92 ainfo->nc1 = ainfo->nc2 = 0.0;
93 ainfo->ga1 = ainfo->ga2 = 0.0;
97 /* Function: FreeAlignment()
99 * Purpose: Free the space allocated to alignment, names, and optional
102 * Args: aseqs - sequence alignment
103 * ainfo - associated alignment data.
106 FreeAlignment(char **aseqs, AINFO *ainfo)
110 for (i = 0; i < ainfo->nseq; i++)
112 if (ainfo->sqinfo[i].flags & SQINFO_SS) free(ainfo->sqinfo[i].ss);
113 if (ainfo->sqinfo[i].flags & SQINFO_SA) free(ainfo->sqinfo[i].sa);
115 if (ainfo->cs != NULL) free(ainfo->cs);
116 if (ainfo->rf != NULL) free(ainfo->rf);
117 if (ainfo->name != NULL) free(ainfo->name);
118 if (ainfo->desc != NULL) free(ainfo->desc);
119 if (ainfo->acc != NULL) free(ainfo->acc);
120 if (ainfo->au != NULL) free(ainfo->au);
124 Free2DArray((void **) aseqs, ainfo->nseq);
129 /* Function: SAMizeAlignment()
130 * Date: SRE, Tue Jun 30 09:49:40 1998 [St. Louis]
132 * Purpose: Make a "best effort" attempt to convert an alignment
133 * to SAM gap format: - in delete col, . in insert col.
134 * Only works if alignment adheres to SAM's upper/lower
135 * case convention, which is true for instance of old
138 * Args: aseq - alignment to convert
139 * nseq - number of seqs in alignment
140 * alen - length of alignment
145 SAMizeAlignment(char **aseq, int nseq, int alen)
147 int col; /* counter for aligned columns */
148 int i; /* counter for seqs */
149 int sawlower, sawupper, sawgap;
152 for (col = 0; col < alen; col++)
154 sawlower = sawupper = sawgap = 0;
155 /* pass 1: do we see only upper or lower? */
156 for (i = 0; i < nseq; i++)
158 if (isgap(aseq[i][col])) { sawgap = 1; continue; }
159 if (isupper((int) aseq[i][col])) { sawupper = 1; continue; }
160 if (islower((int) aseq[i][col])) sawlower = 1;
162 /* select gap character for column */
163 gapchar = '-'; /* default */
164 if (sawlower && ! sawupper) gapchar = '.';
166 /* pass 2: set gap char */
167 for (i = 0; i < nseq; i++)
168 if (isgap(aseq[i][col])) aseq[i][col] = gapchar;
173 /* Function: SAMizeAlignmentByGapFrac()
174 * Date: SRE, Tue Jun 30 10:58:38 1998 [St. Louis]
176 * Purpose: Convert an alignment to SAM's gap and case
177 * conventions, using gap fraction in a column
178 * to choose match versus insert columns. In match columns,
179 * residues are upper case and gaps are '-'.
180 * In insert columns, residues are lower case and
183 * Args: aseq - aligned sequences
184 * nseq - number of sequences
185 * alen - length of alignment
186 * maxgap - if more gaps than this fraction, column is insert.
188 * Returns: (void) Characters in aseq may be altered.
191 SAMizeAlignmentByGapFrac(char **aseq, int nseq, int alen, float maxgap)
193 int apos; /* counter over columns */
194 int idx; /* counter over sequences */
195 int ngap; /* number of gaps seen */
197 for (apos = 0; apos < alen; apos++)
201 for (idx = 0; idx < nseq; idx++)
202 if (isgap(aseq[idx][apos])) ngap++;
204 /* convert to SAM conventions */
205 if ((float) ngap / (float) nseq > maxgap)
206 { /* insert column */
207 for (idx = 0; idx < nseq; idx++)
208 if (isgap(aseq[idx][apos])) aseq[idx][apos] = '.';
209 else aseq[idx][apos] = (char) tolower((int) aseq[idx][apos]);
213 for (idx = 0; idx < nseq; idx++)
214 if (isgap(aseq[idx][apos])) aseq[idx][apos] = '-';
215 else aseq[idx][apos] = (char) toupper((int) aseq[idx][apos]);
223 /* Function: MakeAlignedString()
225 * Purpose: Given a raw string of some type (secondary structure, say),
226 * align it to a given aseq by putting gaps wherever the
229 * Args: aseq: template for alignment
230 * alen: length of aseq
231 * ss: raw string to align to aseq
232 * ret_s: RETURN: aligned ss
234 * Return: 1 on success, 0 on failure (and squid_errno is set.)
235 * ret_ss is malloc'ed here and must be free'd by caller.
238 MakeAlignedString(char *aseq, int alen, char *ss, char **ret_s)
243 new = (char *) MallocOrDie ((alen+1) * sizeof(char));
244 for (apos = rpos = 0; apos < alen; apos++)
245 if (! isgap(aseq[apos]))
247 new[apos] = ss[rpos];
254 if (rpos != strlen(ss))
255 { squid_errno = SQERR_PARAMETER; free(new); return 0; }
261 /* Function: MakeDealignedString()
263 * Purpose: Given an aligned string of some type (either sequence or
264 * secondary structure, for instance), dealign it relative
265 * to a given aseq. Return a ptr to the new string.
267 * Args: aseq : template alignment
268 * alen : length of aseq
269 * ss: : string to make dealigned copy of; same length as aseq
270 * ret_s : RETURN: dealigned copy of ss
272 * Return: 1 on success, 0 on failure (and squid_errno is set)
273 * ret_s is alloc'ed here and must be freed by caller
276 MakeDealignedString(char *aseq, int alen, char *ss, char **ret_s)
281 new = (char *) MallocOrDie ((alen+1) * sizeof(char));
282 for (apos = rpos = 0; apos < alen; apos++)
283 if (! isgap(aseq[apos]))
285 new[rpos] = ss[apos];
289 if (alen != strlen(ss))
290 { squid_errno = SQERR_PARAMETER; free(new); return 0; }
296 /* Function: DealignedLength()
298 * Purpose: Count the number of non-gap symbols in seq.
299 * (i.e. find the length of the unaligned sequence)
301 * Args: aseq - aligned sequence to count symbols in, \0 terminated
303 * Return: raw length of seq.
306 DealignedLength(char *aseq)
309 for (rlen = 0; *aseq; aseq++)
310 if (! isgap(*aseq)) rlen++;
315 /* Function: WritePairwiseAlignment()
317 * Purpose: Write a nice formatted pairwise alignment out,
318 * with a BLAST-style middle line showing identities
319 * as themselves (single letter) and conservative
322 * Args: ofp - open fp to write to (stdout, perhaps)
323 * aseq1, aseq2 - alignments to write (not necessarily
324 * flushed right with gaps)
325 * name1, name2 - names of sequences
326 * spos1, spos2 - starting position in each (raw) sequence
327 * pam - PAM matrix; positive values define
328 * conservative changes
329 * indent - how many extra spaces to print on left
331 * Return: 1 on success, 0 on failure
334 WritePairwiseAlignment(FILE *ofp,
335 char *aseq1, char *name1, int spos1,
336 char *aseq2, char *name2, int spos2,
337 int **pam, int indent)
339 char sname1[11]; /* shortened name */
341 int still_going; /* True if writing another block */
342 char buf1[61]; /* buffer for writing seq1; CPL+1*/
343 char bufmid[61]; /* buffer for writing consensus */
345 char *s1, *s2; /* ptrs into each sequence */
346 int count1, count2; /* number of symbols we're writing */
347 int rpos1, rpos2; /* position in raw seqs */
348 int rawcount1, rawcount2; /* number of nongap symbols written */
351 strncpy(sname1, name1, 10);
353 strtok(sname1, WHITESPACE);
355 strncpy(sname2, name2, 10);
357 strtok(sname2, WHITESPACE);
369 /* get next line's worth from both */
370 strncpy(buf1, s1, 60); buf1[60] = '\0';
371 strncpy(buf2, s2, 60); buf2[60] = '\0';
372 count1 = strlen(buf1);
373 count2 = strlen(buf2);
375 /* is there still more to go? */
376 if ((count1 == 60 && s1[60] != '\0') ||
377 (count2 == 60 && s2[60] != '\0'))
380 /* shift seq ptrs by a line */
384 /* assemble the consensus line */
385 for (apos = 0; apos < count1 && apos < count2; apos++)
387 if (!isgap(buf1[apos]) && !isgap(buf2[apos]))
389 if (buf1[apos] == buf2[apos])
390 bufmid[apos] = buf1[apos];
391 else if (pam[buf1[apos] - 'A'][buf2[apos] - 'A'] > 0)
402 for (apos = 0; apos < count1; apos++)
403 if (!isgap(buf1[apos])) rawcount1++;
406 for (apos = 0; apos < count2; apos++)
407 if (!isgap(buf2[apos])) rawcount2++;
409 (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "",
410 sname1, rpos1, buf1, rpos1 + rawcount1 -1);
411 (void) fprintf(ofp, "%*s %s\n", indent, "",
413 (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "",
414 sname2, rpos2, buf2, rpos2 + rawcount2 -1);
415 (void) fprintf(ofp, "\n");
425 /* Function: MingapAlignment()
427 * Purpose: Remove all-gap columns from a multiple sequence alignment
428 * and its associated data. The alignment is assumed to be
429 * flushed (all aseqs the same length).
432 MingapAlignment(char **aseqs, AINFO *ainfo)
434 int apos; /* position in original alignment */
435 int mpos; /* position in new alignment */
438 /* We overwrite aseqs, using its allocated memory.
440 for (apos = 0, mpos = 0; aseqs[0][apos] != '\0'; apos++)
442 /* check for all-gap in column */
443 for (idx = 0; idx < ainfo->nseq; idx++)
444 if (! isgap(aseqs[idx][apos]))
446 if (idx == ainfo->nseq) continue;
448 /* shift alignment and ainfo */
451 for (idx = 0; idx < ainfo->nseq; idx++)
452 aseqs[idx][mpos] = aseqs[idx][apos];
454 if (ainfo->cs != NULL) ainfo->cs[mpos] = ainfo->cs[apos];
455 if (ainfo->rf != NULL) ainfo->rf[mpos] = ainfo->rf[apos];
459 /* null terminate everything */
460 for (idx = 0; idx < ainfo->nseq; idx++)
461 aseqs[idx][mpos] = '\0';
462 ainfo->alen = mpos; /* set new length */
463 if (ainfo->cs != NULL) ainfo->cs[mpos] = '\0';
464 if (ainfo->rf != NULL) ainfo->rf[mpos] = '\0';
470 /* Function: RandomAlignment()
472 * Purpose: Create a random alignment from raw sequences.
474 * Ideally, we would like to sample an alignment from the
475 * space of possible alignments according to its probability,
476 * given a prior probability distribution for alignments.
477 * I don't see how to describe such a distribution, let alone
480 * This is a rough approximation that tries to capture some
481 * desired properties. We assume the alignment is generated
482 * by a simple HMM composed of match and insert states.
483 * Given parameters (pop, pex) for the probability of opening
484 * and extending an insertion, we can find the expected number
485 * of match states, M, in the underlying model for each sequence.
486 * We use an average M taken over all the sequences (this is
487 * an approximation. The expectation of M given all the sequence
488 * lengths is a nasty-looking summation.)
490 * M = len / ( 1 + pop ( 1 + 1/ (1-pex) ) )
492 * Then, we assign positions in each raw sequence onto the M match
493 * states and M+1 insert states of this "HMM", by rolling random
494 * numbers and inserting the (rlen-M) inserted positions randomly
495 * into the insert slots, taking into account the relative probability
496 * of open vs. extend.
498 * The resulting alignment has two desired properties: insertions
499 * tend to follow the HMM-like exponential distribution, and
500 * the "sparseness" of the alignment is controllable through
503 * Args: rseqs - raw sequences to "align", 0..nseq-1
504 * sqinfo - array of 0..nseq-1 info structures for the sequences
505 * nseq - number of sequences
506 * pop - probability to open insertion (0<pop<1)
507 * pex - probability to extend insertion (0<pex<1)
508 * ret_aseqs - RETURN: alignment (flushed)
509 * ainfo - fill in: alignment info
511 * Return: 1 on success, 0 on failure. Sets squid_errno to indicate cause
515 RandomAlignment(char **rseqs, SQINFO *sqinfo, int nseq, float pop, float pex,
516 char ***ret_aseqs, AINFO *ainfo)
518 char **aseqs; /* RETURN: alignment */
519 int alen; /* length of alignment */
520 int *rlen; /* lengths of each raw sequence */
521 int M; /* length of "model" */
522 int **ins; /* insertion counts, 0..nseq-1 by 0..M */
523 int *master_ins; /* max insertion counts, 0..M */
529 /* calculate expected length of model, M
531 rlen = (int *) MallocOrDie (sizeof(int) * nseq);
534 for (idx = 0; idx < nseq; idx++)
536 rlen[idx] = strlen(rseqs[idx]);
538 minlen = (rlen[idx] < minlen) ? rlen[idx] : minlen;
540 M = (int) ((float) M / (1.0 + pop * (1.0 + 1.0 / (1.0 - pex))));
542 if (M > minlen) M = minlen;
544 /* make arrays that count insertions in M+1 possible insert states
546 ins = (int **) MallocOrDie (sizeof(int *) * nseq);
547 master_ins = (int *) MallocOrDie (sizeof(int) * (M+1));
548 for (idx = 0; idx < nseq; idx++)
550 ins[idx] = (int *) MallocOrDie (sizeof(int) * (M+1));
551 for (rpos = 0; rpos <= M; rpos++)
555 pop = pop / (pop+pex);
557 /* make insertions for individual sequences */
558 for (idx = 0; idx < nseq; idx++)
561 for (rpos = 0; rpos < rlen[idx]-M; rpos++)
563 if (sre_random() < pop || apos == -1) /* open insertion */
564 apos = CHOOSE(M+1); /* choose 0..M */
568 /* calculate master_ins, max inserts */
570 for (apos = 0; apos <= M; apos++)
572 master_ins[apos] = 0;
573 for (idx = 0; idx < nseq; idx++)
574 if (ins[idx][apos] > master_ins[apos])
575 master_ins[apos] = ins[idx][apos];
576 alen += master_ins[apos];
580 /* Now, construct alignment
582 aseqs = (char **) MallocOrDie (sizeof (char *) * nseq);
583 for (idx = 0; idx < nseq; idx++)
584 aseqs[idx] = (char *) MallocOrDie (sizeof(char) * (alen+1));
585 for (idx = 0; idx < nseq; idx++)
589 for (statepos = 0; statepos <= M; statepos++)
591 for (count = 0; count < ins[idx][statepos]; count++)
592 aseqs[idx][apos++] = rseqs[idx][rpos++];
593 for (; count < master_ins[statepos]; count++)
594 aseqs[idx][apos++] = ' ';
597 aseqs[idx][apos++] = rseqs[idx][rpos++];
599 aseqs[idx][alen] = '\0';
604 ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO) * nseq);
605 for (idx = 0; idx < nseq; idx++)
606 SeqinfoCopy(&(ainfo->sqinfo[idx]), &(sqinfo[idx]));
610 Free2DArray((void **) ins, nseq);
615 /* Function: AlignmentHomogenousGapsym()
616 * Date: SRE, Sun Mar 19 19:37:12 2000 [wren, St. Louis]
618 * Purpose: Sometimes we've got to convert alignments to
619 * a lowest common denominator, and we need
620 * a single specific gap character -- for example,
621 * PSI-BLAST blastpgp -B takes a very simplistic
622 * alignment input format which appears to only
623 * allow '-' as a gap symbol.
625 * Anything matching the isgap() macro is
628 * Args: aseq - aligned character strings, [0..nseq-1][0..alen-1]
629 * nseq - number of aligned strings
630 * alen - length of alignment
631 * gapsym - character to use for gaps.
633 * Returns: void ("never fails")
636 AlignmentHomogenousGapsym(char **aseq, int nseq, int alen, char gapsym)
640 for (i = 0; i < nseq; i++)
641 for (apos = 0; apos < alen; apos++)
642 if (isgap(aseq[i][apos])) aseq[i][apos] = gapsym;