--- /dev/null
+/*****************************************************************
+ * SQUID - a library of functions for biological sequence analysis
+ * Copyright (C) 1992-2002 Washington University School of Medicine
+ *
+ * This source code is freely distributed under the terms of the
+ * GNU General Public License. See the files COPYRIGHT and LICENSE
+ * for details.
+ *****************************************************************/
+
+/* alignio.c
+ * SRE, Mon Jul 12 11:57:37 1993
+ * RCS $Id: alignio.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: alignio.c,v 1.11 2002/10/09 14:26:09 eddy Exp)
+ *
+ * Input/output of sequence alignments.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include "squid.h"
+#include "sre_random.h"
+
+/* Function: AllocAlignment()
+ *
+ * Purpose: Allocate space for an alignment, given the number
+ * of sequences and the alignment length in columns.
+ *
+ * Args: nseq - number of sequences
+ * alen - width of alignment
+ * ret_aseq - RETURN: alignment itself
+ * ainfo - RETURN: other info associated with alignment
+ *
+ * Return: (void)
+ * aseq, ainfo free'd by caller: FreeAlignment(aseq, &ainfo).
+ * note that ainfo itself is alloc'ed in caller, usually
+ * just by a "AINFO ainfo" definition.
+ */
+void
+AllocAlignment(int nseq, int alen, char ***ret_aseq, AINFO *ainfo)
+{
+ char **aseq;
+ int idx;
+
+ InitAinfo(ainfo);
+
+ aseq = (char **) MallocOrDie (sizeof(char *) * nseq);
+ for (idx = 0; idx < nseq; idx++)
+ aseq[idx] = (char *) MallocOrDie (sizeof(char) * (alen+1));
+
+ ainfo->alen = alen;
+ ainfo->nseq = nseq;
+
+ ainfo->wgt = (float *) MallocOrDie (sizeof(float) * nseq);
+ FSet(ainfo->wgt, nseq, 1.0);
+
+ ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO) * nseq);
+ for (idx = 0; idx < nseq; idx++)
+ ainfo->sqinfo[idx].flags = 0;
+
+ *ret_aseq = aseq;
+}
+
+
+/* Function: InitAinfo()
+ * Date: SRE, Tue Jan 19 10:16:02 1999 [St. Louis]
+ *
+ * Purpose: Initialize the fields in ainfo structure to
+ * default (null) values. Does nothing with
+ * fields that are dependent on nseq or alen.
+ *
+ * Args: ainfo - optional info structure for an alignment
+ *
+ * Returns: (void). ainfo is modified.
+ */
+void
+InitAinfo(AINFO *ainfo)
+{
+ ainfo->name = NULL;
+ ainfo->desc = NULL;
+ ainfo->cs = NULL;
+ ainfo->rf = NULL;
+ ainfo->acc = NULL;
+ ainfo->au = NULL;
+ ainfo->flags = 0;
+
+ ainfo->tc1 = ainfo->tc2 = 0.0;
+ ainfo->nc1 = ainfo->nc2 = 0.0;
+ ainfo->ga1 = ainfo->ga2 = 0.0;
+}
+
+
+/* Function: FreeAlignment()
+ *
+ * Purpose: Free the space allocated to alignment, names, and optional
+ * information.
+ *
+ * Args: aseqs - sequence alignment
+ * ainfo - associated alignment data.
+ */
+void
+FreeAlignment(char **aseqs, AINFO *ainfo)
+{
+ int i;
+
+ for (i = 0; i < ainfo->nseq; i++)
+ {
+ if (ainfo->sqinfo[i].flags & SQINFO_SS) free(ainfo->sqinfo[i].ss);
+ if (ainfo->sqinfo[i].flags & SQINFO_SA) free(ainfo->sqinfo[i].sa);
+ }
+ if (ainfo->cs != NULL) free(ainfo->cs);
+ if (ainfo->rf != NULL) free(ainfo->rf);
+ if (ainfo->name != NULL) free(ainfo->name);
+ if (ainfo->desc != NULL) free(ainfo->desc);
+ if (ainfo->acc != NULL) free(ainfo->acc);
+ if (ainfo->au != NULL) free(ainfo->au);
+
+ free(ainfo->sqinfo);
+ free(ainfo->wgt);
+ Free2DArray((void **) aseqs, ainfo->nseq);
+}
+
+
+
+/* Function: SAMizeAlignment()
+ * Date: SRE, Tue Jun 30 09:49:40 1998 [St. Louis]
+ *
+ * Purpose: Make a "best effort" attempt to convert an alignment
+ * to SAM gap format: - in delete col, . in insert col.
+ * Only works if alignment adheres to SAM's upper/lower
+ * case convention, which is true for instance of old
+ * HMMER alignments.
+ *
+ * Args: aseq - alignment to convert
+ * nseq - number of seqs in alignment
+ * alen - length of alignment
+ *
+ * Returns: (void)
+ */
+void
+SAMizeAlignment(char **aseq, int nseq, int alen)
+{
+ int col; /* counter for aligned columns */
+ int i; /* counter for seqs */
+ int sawlower, sawupper, sawgap;
+ char gapchar;
+
+ for (col = 0; col < alen; col++)
+ {
+ sawlower = sawupper = sawgap = 0;
+ /* pass 1: do we see only upper or lower? */
+ for (i = 0; i < nseq; i++)
+ {
+ if (isgap(aseq[i][col])) { sawgap = 1; continue; }
+ if (isupper((int) aseq[i][col])) { sawupper = 1; continue; }
+ if (islower((int) aseq[i][col])) sawlower = 1;
+ }
+ /* select gap character for column */
+ gapchar = '-'; /* default */
+ if (sawlower && ! sawupper) gapchar = '.';
+
+ /* pass 2: set gap char */
+ for (i = 0; i < nseq; i++)
+ if (isgap(aseq[i][col])) aseq[i][col] = gapchar;
+ }
+}
+
+
+/* Function: SAMizeAlignmentByGapFrac()
+ * Date: SRE, Tue Jun 30 10:58:38 1998 [St. Louis]
+ *
+ * Purpose: Convert an alignment to SAM's gap and case
+ * conventions, using gap fraction in a column
+ * to choose match versus insert columns. In match columns,
+ * residues are upper case and gaps are '-'.
+ * In insert columns, residues are lower case and
+ * gaps are '.'
+ *
+ * Args: aseq - aligned sequences
+ * nseq - number of sequences
+ * alen - length of alignment
+ * maxgap - if more gaps than this fraction, column is insert.
+ *
+ * Returns: (void) Characters in aseq may be altered.
+ */
+void
+SAMizeAlignmentByGapFrac(char **aseq, int nseq, int alen, float maxgap)
+{
+ int apos; /* counter over columns */
+ int idx; /* counter over sequences */
+ int ngap; /* number of gaps seen */
+
+ for (apos = 0; apos < alen; apos++)
+ {
+ /* count gaps */
+ ngap = 0;
+ for (idx = 0; idx < nseq; idx++)
+ if (isgap(aseq[idx][apos])) ngap++;
+
+ /* convert to SAM conventions */
+ if ((float) ngap / (float) nseq > maxgap)
+ { /* insert column */
+ for (idx = 0; idx < nseq; idx++)
+ if (isgap(aseq[idx][apos])) aseq[idx][apos] = '.';
+ else aseq[idx][apos] = (char) tolower((int) aseq[idx][apos]);
+ }
+ else
+ { /* match column */
+ for (idx = 0; idx < nseq; idx++)
+ if (isgap(aseq[idx][apos])) aseq[idx][apos] = '-';
+ else aseq[idx][apos] = (char) toupper((int) aseq[idx][apos]);
+ }
+ }
+}
+
+
+
+
+/* Function: MakeAlignedString()
+ *
+ * Purpose: Given a raw string of some type (secondary structure, say),
+ * align it to a given aseq by putting gaps wherever the
+ * aseq has gaps.
+ *
+ * Args: aseq: template for alignment
+ * alen: length of aseq
+ * ss: raw string to align to aseq
+ * ret_s: RETURN: aligned ss
+ *
+ * Return: 1 on success, 0 on failure (and squid_errno is set.)
+ * ret_ss is malloc'ed here and must be free'd by caller.
+ */
+int
+MakeAlignedString(char *aseq, int alen, char *ss, char **ret_s)
+{
+ char *new;
+ int apos, rpos;
+
+ new = (char *) MallocOrDie ((alen+1) * sizeof(char));
+ for (apos = rpos = 0; apos < alen; apos++)
+ if (! isgap(aseq[apos]))
+ {
+ new[apos] = ss[rpos];
+ rpos++;
+ }
+ else
+ new[apos] = '.';
+ new[apos] = '\0';
+
+ if (rpos != strlen(ss))
+ { squid_errno = SQERR_PARAMETER; free(new); return 0; }
+ *ret_s = new;
+ return 1;
+}
+
+
+/* Function: MakeDealignedString()
+ *
+ * Purpose: Given an aligned string of some type (either sequence or
+ * secondary structure, for instance), dealign it relative
+ * to a given aseq. Return a ptr to the new string.
+ *
+ * Args: aseq : template alignment
+ * alen : length of aseq
+ * ss: : string to make dealigned copy of; same length as aseq
+ * ret_s : RETURN: dealigned copy of ss
+ *
+ * Return: 1 on success, 0 on failure (and squid_errno is set)
+ * ret_s is alloc'ed here and must be freed by caller
+ */
+int
+MakeDealignedString(char *aseq, int alen, char *ss, char **ret_s)
+{
+ char *new;
+ int apos, rpos;
+
+ new = (char *) MallocOrDie ((alen+1) * sizeof(char));
+ for (apos = rpos = 0; apos < alen; apos++)
+ if (! isgap(aseq[apos]))
+ {
+ new[rpos] = ss[apos];
+ rpos++;
+ }
+ new[rpos] = '\0';
+ if (alen != strlen(ss))
+ { squid_errno = SQERR_PARAMETER; free(new); return 0; }
+ *ret_s = new;
+ return 1;
+}
+
+
+/* Function: DealignedLength()
+ *
+ * Purpose: Count the number of non-gap symbols in seq.
+ * (i.e. find the length of the unaligned sequence)
+ *
+ * Args: aseq - aligned sequence to count symbols in, \0 terminated
+ *
+ * Return: raw length of seq.
+ */
+int
+DealignedLength(char *aseq)
+{
+ int rlen;
+ for (rlen = 0; *aseq; aseq++)
+ if (! isgap(*aseq)) rlen++;
+ return rlen;
+}
+
+
+/* Function: WritePairwiseAlignment()
+ *
+ * Purpose: Write a nice formatted pairwise alignment out,
+ * with a BLAST-style middle line showing identities
+ * as themselves (single letter) and conservative
+ * changes as '+'.
+ *
+ * Args: ofp - open fp to write to (stdout, perhaps)
+ * aseq1, aseq2 - alignments to write (not necessarily
+ * flushed right with gaps)
+ * name1, name2 - names of sequences
+ * spos1, spos2 - starting position in each (raw) sequence
+ * pam - PAM matrix; positive values define
+ * conservative changes
+ * indent - how many extra spaces to print on left
+ *
+ * Return: 1 on success, 0 on failure
+ */
+int
+WritePairwiseAlignment(FILE *ofp,
+ char *aseq1, char *name1, int spos1,
+ char *aseq2, char *name2, int spos2,
+ int **pam, int indent)
+{
+ char sname1[11]; /* shortened name */
+ char sname2[11];
+ int still_going; /* True if writing another block */
+ char buf1[61]; /* buffer for writing seq1; CPL+1*/
+ char bufmid[61]; /* buffer for writing consensus */
+ char buf2[61];
+ char *s1, *s2; /* ptrs into each sequence */
+ int count1, count2; /* number of symbols we're writing */
+ int rpos1, rpos2; /* position in raw seqs */
+ int rawcount1, rawcount2; /* number of nongap symbols written */
+ int apos;
+
+ strncpy(sname1, name1, 10);
+ sname1[10] = '\0';
+ strtok(sname1, WHITESPACE);
+
+ strncpy(sname2, name2, 10);
+ sname2[10] = '\0';
+ strtok(sname2, WHITESPACE);
+
+ s1 = aseq1;
+ s2 = aseq2;
+ rpos1 = spos1;
+ rpos2 = spos2;
+
+ still_going = TRUE;
+ while (still_going)
+ {
+ still_going = FALSE;
+
+ /* get next line's worth from both */
+ strncpy(buf1, s1, 60); buf1[60] = '\0';
+ strncpy(buf2, s2, 60); buf2[60] = '\0';
+ count1 = strlen(buf1);
+ count2 = strlen(buf2);
+
+ /* is there still more to go? */
+ if ((count1 == 60 && s1[60] != '\0') ||
+ (count2 == 60 && s2[60] != '\0'))
+ still_going = TRUE;
+
+ /* shift seq ptrs by a line */
+ s1 += count1;
+ s2 += count2;
+
+ /* assemble the consensus line */
+ for (apos = 0; apos < count1 && apos < count2; apos++)
+ {
+ if (!isgap(buf1[apos]) && !isgap(buf2[apos]))
+ {
+ if (buf1[apos] == buf2[apos])
+ bufmid[apos] = buf1[apos];
+ else if (pam[buf1[apos] - 'A'][buf2[apos] - 'A'] > 0)
+ bufmid[apos] = '+';
+ else
+ bufmid[apos] = ' ';
+ }
+ else
+ bufmid[apos] = ' ';
+ }
+ bufmid[apos] = '\0';
+
+ rawcount1 = 0;
+ for (apos = 0; apos < count1; apos++)
+ if (!isgap(buf1[apos])) rawcount1++;
+
+ rawcount2 = 0;
+ for (apos = 0; apos < count2; apos++)
+ if (!isgap(buf2[apos])) rawcount2++;
+
+ (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "",
+ sname1, rpos1, buf1, rpos1 + rawcount1 -1);
+ (void) fprintf(ofp, "%*s %s\n", indent, "",
+ bufmid);
+ (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "",
+ sname2, rpos2, buf2, rpos2 + rawcount2 -1);
+ (void) fprintf(ofp, "\n");
+
+ rpos1 += rawcount1;
+ rpos2 += rawcount2;
+ }
+
+ return 1;
+}
+
+
+/* Function: MingapAlignment()
+ *
+ * Purpose: Remove all-gap columns from a multiple sequence alignment
+ * and its associated data. The alignment is assumed to be
+ * flushed (all aseqs the same length).
+ */
+int
+MingapAlignment(char **aseqs, AINFO *ainfo)
+{
+ int apos; /* position in original alignment */
+ int mpos; /* position in new alignment */
+ int idx;
+
+ /* We overwrite aseqs, using its allocated memory.
+ */
+ for (apos = 0, mpos = 0; aseqs[0][apos] != '\0'; apos++)
+ {
+ /* check for all-gap in column */
+ for (idx = 0; idx < ainfo->nseq; idx++)
+ if (! isgap(aseqs[idx][apos]))
+ break;
+ if (idx == ainfo->nseq) continue;
+
+ /* shift alignment and ainfo */
+ if (mpos != apos)
+ {
+ for (idx = 0; idx < ainfo->nseq; idx++)
+ aseqs[idx][mpos] = aseqs[idx][apos];
+
+ if (ainfo->cs != NULL) ainfo->cs[mpos] = ainfo->cs[apos];
+ if (ainfo->rf != NULL) ainfo->rf[mpos] = ainfo->rf[apos];
+ }
+ mpos++;
+ }
+ /* null terminate everything */
+ for (idx = 0; idx < ainfo->nseq; idx++)
+ aseqs[idx][mpos] = '\0';
+ ainfo->alen = mpos; /* set new length */
+ if (ainfo->cs != NULL) ainfo->cs[mpos] = '\0';
+ if (ainfo->rf != NULL) ainfo->rf[mpos] = '\0';
+ return 1;
+}
+
+
+
+/* Function: RandomAlignment()
+ *
+ * Purpose: Create a random alignment from raw sequences.
+ *
+ * Ideally, we would like to sample an alignment from the
+ * space of possible alignments according to its probability,
+ * given a prior probability distribution for alignments.
+ * I don't see how to describe such a distribution, let alone
+ * sample it.
+ *
+ * This is a rough approximation that tries to capture some
+ * desired properties. We assume the alignment is generated
+ * by a simple HMM composed of match and insert states.
+ * Given parameters (pop, pex) for the probability of opening
+ * and extending an insertion, we can find the expected number
+ * of match states, M, in the underlying model for each sequence.
+ * We use an average M taken over all the sequences (this is
+ * an approximation. The expectation of M given all the sequence
+ * lengths is a nasty-looking summation.)
+ *
+ * M = len / ( 1 + pop ( 1 + 1/ (1-pex) ) )
+ *
+ * Then, we assign positions in each raw sequence onto the M match
+ * states and M+1 insert states of this "HMM", by rolling random
+ * numbers and inserting the (rlen-M) inserted positions randomly
+ * into the insert slots, taking into account the relative probability
+ * of open vs. extend.
+ *
+ * The resulting alignment has two desired properties: insertions
+ * tend to follow the HMM-like exponential distribution, and
+ * the "sparseness" of the alignment is controllable through
+ * pop and pex.
+ *
+ * Args: rseqs - raw sequences to "align", 0..nseq-1
+ * sqinfo - array of 0..nseq-1 info structures for the sequences
+ * nseq - number of sequences
+ * pop - probability to open insertion (0<pop<1)
+ * pex - probability to extend insertion (0<pex<1)
+ * ret_aseqs - RETURN: alignment (flushed)
+ * ainfo - fill in: alignment info
+ *
+ * Return: 1 on success, 0 on failure. Sets squid_errno to indicate cause
+ * of failure.
+ */
+int
+RandomAlignment(char **rseqs, SQINFO *sqinfo, int nseq, float pop, float pex,
+ char ***ret_aseqs, AINFO *ainfo)
+{
+ char **aseqs; /* RETURN: alignment */
+ int alen; /* length of alignment */
+ int *rlen; /* lengths of each raw sequence */
+ int M; /* length of "model" */
+ int **ins; /* insertion counts, 0..nseq-1 by 0..M */
+ int *master_ins; /* max insertion counts, 0..M */
+ int apos, rpos, idx;
+ int statepos;
+ int count;
+ int minlen;
+
+ /* calculate expected length of model, M
+ */
+ rlen = (int *) MallocOrDie (sizeof(int) * nseq);
+ M = 0;
+ minlen = 9999999;
+ for (idx = 0; idx < nseq; idx++)
+ {
+ rlen[idx] = strlen(rseqs[idx]);
+ M += rlen[idx];
+ minlen = (rlen[idx] < minlen) ? rlen[idx] : minlen;
+ }
+ M = (int) ((float) M / (1.0 + pop * (1.0 + 1.0 / (1.0 - pex))));
+ M /= nseq;
+ if (M > minlen) M = minlen;
+
+ /* make arrays that count insertions in M+1 possible insert states
+ */
+ ins = (int **) MallocOrDie (sizeof(int *) * nseq);
+ master_ins = (int *) MallocOrDie (sizeof(int) * (M+1));
+ for (idx = 0; idx < nseq; idx++)
+ {
+ ins[idx] = (int *) MallocOrDie (sizeof(int) * (M+1));
+ for (rpos = 0; rpos <= M; rpos++)
+ ins[idx][rpos] = 0;
+ }
+ /* normalize */
+ pop = pop / (pop+pex);
+ pex = 1.0 - pop;
+ /* make insertions for individual sequences */
+ for (idx = 0; idx < nseq; idx++)
+ {
+ apos = -1;
+ for (rpos = 0; rpos < rlen[idx]-M; rpos++)
+ {
+ if (sre_random() < pop || apos == -1) /* open insertion */
+ apos = CHOOSE(M+1); /* choose 0..M */
+ ins[idx][apos]++;
+ }
+ }
+ /* calculate master_ins, max inserts */
+ alen = M;
+ for (apos = 0; apos <= M; apos++)
+ {
+ master_ins[apos] = 0;
+ for (idx = 0; idx < nseq; idx++)
+ if (ins[idx][apos] > master_ins[apos])
+ master_ins[apos] = ins[idx][apos];
+ alen += master_ins[apos];
+ }
+
+
+ /* Now, construct alignment
+ */
+ aseqs = (char **) MallocOrDie (sizeof (char *) * nseq);
+ for (idx = 0; idx < nseq; idx++)
+ aseqs[idx] = (char *) MallocOrDie (sizeof(char) * (alen+1));
+ for (idx = 0; idx < nseq; idx++)
+ {
+ apos = rpos = 0;
+
+ for (statepos = 0; statepos <= M; statepos++)
+ {
+ for (count = 0; count < ins[idx][statepos]; count++)
+ aseqs[idx][apos++] = rseqs[idx][rpos++];
+ for (; count < master_ins[statepos]; count++)
+ aseqs[idx][apos++] = ' ';
+
+ if (statepos != M)
+ aseqs[idx][apos++] = rseqs[idx][rpos++];
+ }
+ aseqs[idx][alen] = '\0';
+ }
+ ainfo->flags = 0;
+ ainfo->alen = alen;
+ ainfo->nseq = nseq;
+ ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO) * nseq);
+ for (idx = 0; idx < nseq; idx++)
+ SeqinfoCopy(&(ainfo->sqinfo[idx]), &(sqinfo[idx]));
+
+ free(rlen);
+ free(master_ins);
+ Free2DArray((void **) ins, nseq);
+ *ret_aseqs = aseqs;
+ return 1;
+}
+
+/* Function: AlignmentHomogenousGapsym()
+ * Date: SRE, Sun Mar 19 19:37:12 2000 [wren, St. Louis]
+ *
+ * Purpose: Sometimes we've got to convert alignments to
+ * a lowest common denominator, and we need
+ * a single specific gap character -- for example,
+ * PSI-BLAST blastpgp -B takes a very simplistic
+ * alignment input format which appears to only
+ * allow '-' as a gap symbol.
+ *
+ * Anything matching the isgap() macro is
+ * converted.
+ *
+ * Args: aseq - aligned character strings, [0..nseq-1][0..alen-1]
+ * nseq - number of aligned strings
+ * alen - length of alignment
+ * gapsym - character to use for gaps.
+ *
+ * Returns: void ("never fails")
+ */
+void
+AlignmentHomogenousGapsym(char **aseq, int nseq, int alen, char gapsym)
+{
+ int i, apos;
+
+ for (i = 0; i < nseq; i++)
+ for (apos = 0; apos < alen; apos++)
+ if (isgap(aseq[i][apos])) aseq[i][apos] = gapsym;
+}