+++ /dev/null
-/*****************************************************************
- * SQUID - a library of functions for biological sequence analysis
- * Copyright (C) 1992-2002 Washington University School of Medicine
- *
- * This source code is freely distributed under the terms of the
- * GNU General Public License. See the files COPYRIGHT and LICENSE
- * for details.
- *****************************************************************/
-
-/* alignio.c
- * SRE, Mon Jul 12 11:57:37 1993
- * RCS $Id: alignio.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: alignio.c,v 1.11 2002/10/09 14:26:09 eddy Exp)
- *
- * Input/output of sequence alignments.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#include "squid.h"
-#include "sre_random.h"
-
-/* Function: AllocAlignment()
- *
- * Purpose: Allocate space for an alignment, given the number
- * of sequences and the alignment length in columns.
- *
- * Args: nseq - number of sequences
- * alen - width of alignment
- * ret_aseq - RETURN: alignment itself
- * ainfo - RETURN: other info associated with alignment
- *
- * Return: (void)
- * aseq, ainfo free'd by caller: FreeAlignment(aseq, &ainfo).
- * note that ainfo itself is alloc'ed in caller, usually
- * just by a "AINFO ainfo" definition.
- */
-void
-AllocAlignment(int nseq, int alen, char ***ret_aseq, AINFO *ainfo)
-{
- char **aseq;
- int idx;
-
- InitAinfo(ainfo);
-
- aseq = (char **) MallocOrDie (sizeof(char *) * nseq);
- for (idx = 0; idx < nseq; idx++)
- aseq[idx] = (char *) MallocOrDie (sizeof(char) * (alen+1));
-
- ainfo->alen = alen;
- ainfo->nseq = nseq;
-
- ainfo->wgt = (float *) MallocOrDie (sizeof(float) * nseq);
- FSet(ainfo->wgt, nseq, 1.0);
-
- ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO) * nseq);
- for (idx = 0; idx < nseq; idx++)
- ainfo->sqinfo[idx].flags = 0;
-
- *ret_aseq = aseq;
-}
-
-
-/* Function: InitAinfo()
- * Date: SRE, Tue Jan 19 10:16:02 1999 [St. Louis]
- *
- * Purpose: Initialize the fields in ainfo structure to
- * default (null) values. Does nothing with
- * fields that are dependent on nseq or alen.
- *
- * Args: ainfo - optional info structure for an alignment
- *
- * Returns: (void). ainfo is modified.
- */
-void
-InitAinfo(AINFO *ainfo)
-{
- ainfo->name = NULL;
- ainfo->desc = NULL;
- ainfo->cs = NULL;
- ainfo->rf = NULL;
- ainfo->acc = NULL;
- ainfo->au = NULL;
- ainfo->flags = 0;
-
- ainfo->tc1 = ainfo->tc2 = 0.0;
- ainfo->nc1 = ainfo->nc2 = 0.0;
- ainfo->ga1 = ainfo->ga2 = 0.0;
-}
-
-
-/* Function: FreeAlignment()
- *
- * Purpose: Free the space allocated to alignment, names, and optional
- * information.
- *
- * Args: aseqs - sequence alignment
- * ainfo - associated alignment data.
- */
-void
-FreeAlignment(char **aseqs, AINFO *ainfo)
-{
- int i;
-
- for (i = 0; i < ainfo->nseq; i++)
- {
- if (ainfo->sqinfo[i].flags & SQINFO_SS) free(ainfo->sqinfo[i].ss);
- if (ainfo->sqinfo[i].flags & SQINFO_SA) free(ainfo->sqinfo[i].sa);
- }
- if (ainfo->cs != NULL) free(ainfo->cs);
- if (ainfo->rf != NULL) free(ainfo->rf);
- if (ainfo->name != NULL) free(ainfo->name);
- if (ainfo->desc != NULL) free(ainfo->desc);
- if (ainfo->acc != NULL) free(ainfo->acc);
- if (ainfo->au != NULL) free(ainfo->au);
-
- free(ainfo->sqinfo);
- free(ainfo->wgt);
- Free2DArray((void **) aseqs, ainfo->nseq);
-}
-
-
-
-/* Function: SAMizeAlignment()
- * Date: SRE, Tue Jun 30 09:49:40 1998 [St. Louis]
- *
- * Purpose: Make a "best effort" attempt to convert an alignment
- * to SAM gap format: - in delete col, . in insert col.
- * Only works if alignment adheres to SAM's upper/lower
- * case convention, which is true for instance of old
- * HMMER alignments.
- *
- * Args: aseq - alignment to convert
- * nseq - number of seqs in alignment
- * alen - length of alignment
- *
- * Returns: (void)
- */
-void
-SAMizeAlignment(char **aseq, int nseq, int alen)
-{
- int col; /* counter for aligned columns */
- int i; /* counter for seqs */
- int sawlower, sawupper, sawgap;
- char gapchar;
-
- for (col = 0; col < alen; col++)
- {
- sawlower = sawupper = sawgap = 0;
- /* pass 1: do we see only upper or lower? */
- for (i = 0; i < nseq; i++)
- {
- if (isgap(aseq[i][col])) { sawgap = 1; continue; }
- if (isupper((int) aseq[i][col])) { sawupper = 1; continue; }
- if (islower((int) aseq[i][col])) sawlower = 1;
- }
- /* select gap character for column */
- gapchar = '-'; /* default */
- if (sawlower && ! sawupper) gapchar = '.';
-
- /* pass 2: set gap char */
- for (i = 0; i < nseq; i++)
- if (isgap(aseq[i][col])) aseq[i][col] = gapchar;
- }
-}
-
-
-/* Function: SAMizeAlignmentByGapFrac()
- * Date: SRE, Tue Jun 30 10:58:38 1998 [St. Louis]
- *
- * Purpose: Convert an alignment to SAM's gap and case
- * conventions, using gap fraction in a column
- * to choose match versus insert columns. In match columns,
- * residues are upper case and gaps are '-'.
- * In insert columns, residues are lower case and
- * gaps are '.'
- *
- * Args: aseq - aligned sequences
- * nseq - number of sequences
- * alen - length of alignment
- * maxgap - if more gaps than this fraction, column is insert.
- *
- * Returns: (void) Characters in aseq may be altered.
- */
-void
-SAMizeAlignmentByGapFrac(char **aseq, int nseq, int alen, float maxgap)
-{
- int apos; /* counter over columns */
- int idx; /* counter over sequences */
- int ngap; /* number of gaps seen */
-
- for (apos = 0; apos < alen; apos++)
- {
- /* count gaps */
- ngap = 0;
- for (idx = 0; idx < nseq; idx++)
- if (isgap(aseq[idx][apos])) ngap++;
-
- /* convert to SAM conventions */
- if ((float) ngap / (float) nseq > maxgap)
- { /* insert column */
- for (idx = 0; idx < nseq; idx++)
- if (isgap(aseq[idx][apos])) aseq[idx][apos] = '.';
- else aseq[idx][apos] = (char) tolower((int) aseq[idx][apos]);
- }
- else
- { /* match column */
- for (idx = 0; idx < nseq; idx++)
- if (isgap(aseq[idx][apos])) aseq[idx][apos] = '-';
- else aseq[idx][apos] = (char) toupper((int) aseq[idx][apos]);
- }
- }
-}
-
-
-
-
-/* Function: MakeAlignedString()
- *
- * Purpose: Given a raw string of some type (secondary structure, say),
- * align it to a given aseq by putting gaps wherever the
- * aseq has gaps.
- *
- * Args: aseq: template for alignment
- * alen: length of aseq
- * ss: raw string to align to aseq
- * ret_s: RETURN: aligned ss
- *
- * Return: 1 on success, 0 on failure (and squid_errno is set.)
- * ret_ss is malloc'ed here and must be free'd by caller.
- */
-int
-MakeAlignedString(char *aseq, int alen, char *ss, char **ret_s)
-{
- char *new;
- int apos, rpos;
-
- new = (char *) MallocOrDie ((alen+1) * sizeof(char));
- for (apos = rpos = 0; apos < alen; apos++)
- if (! isgap(aseq[apos]))
- {
- new[apos] = ss[rpos];
- rpos++;
- }
- else
- new[apos] = '.';
- new[apos] = '\0';
-
- if (rpos != strlen(ss))
- { squid_errno = SQERR_PARAMETER; free(new); return 0; }
- *ret_s = new;
- return 1;
-}
-
-
-/* Function: MakeDealignedString()
- *
- * Purpose: Given an aligned string of some type (either sequence or
- * secondary structure, for instance), dealign it relative
- * to a given aseq. Return a ptr to the new string.
- *
- * Args: aseq : template alignment
- * alen : length of aseq
- * ss: : string to make dealigned copy of; same length as aseq
- * ret_s : RETURN: dealigned copy of ss
- *
- * Return: 1 on success, 0 on failure (and squid_errno is set)
- * ret_s is alloc'ed here and must be freed by caller
- */
-int
-MakeDealignedString(char *aseq, int alen, char *ss, char **ret_s)
-{
- char *new;
- int apos, rpos;
-
- new = (char *) MallocOrDie ((alen+1) * sizeof(char));
- for (apos = rpos = 0; apos < alen; apos++)
- if (! isgap(aseq[apos]))
- {
- new[rpos] = ss[apos];
- rpos++;
- }
- new[rpos] = '\0';
- if (alen != strlen(ss))
- { squid_errno = SQERR_PARAMETER; free(new); return 0; }
- *ret_s = new;
- return 1;
-}
-
-
-/* Function: DealignedLength()
- *
- * Purpose: Count the number of non-gap symbols in seq.
- * (i.e. find the length of the unaligned sequence)
- *
- * Args: aseq - aligned sequence to count symbols in, \0 terminated
- *
- * Return: raw length of seq.
- */
-int
-DealignedLength(char *aseq)
-{
- int rlen;
- for (rlen = 0; *aseq; aseq++)
- if (! isgap(*aseq)) rlen++;
- return rlen;
-}
-
-
-/* Function: WritePairwiseAlignment()
- *
- * Purpose: Write a nice formatted pairwise alignment out,
- * with a BLAST-style middle line showing identities
- * as themselves (single letter) and conservative
- * changes as '+'.
- *
- * Args: ofp - open fp to write to (stdout, perhaps)
- * aseq1, aseq2 - alignments to write (not necessarily
- * flushed right with gaps)
- * name1, name2 - names of sequences
- * spos1, spos2 - starting position in each (raw) sequence
- * pam - PAM matrix; positive values define
- * conservative changes
- * indent - how many extra spaces to print on left
- *
- * Return: 1 on success, 0 on failure
- */
-int
-WritePairwiseAlignment(FILE *ofp,
- char *aseq1, char *name1, int spos1,
- char *aseq2, char *name2, int spos2,
- int **pam, int indent)
-{
- char sname1[11]; /* shortened name */
- char sname2[11];
- int still_going; /* True if writing another block */
- char buf1[61]; /* buffer for writing seq1; CPL+1*/
- char bufmid[61]; /* buffer for writing consensus */
- char buf2[61];
- char *s1, *s2; /* ptrs into each sequence */
- int count1, count2; /* number of symbols we're writing */
- int rpos1, rpos2; /* position in raw seqs */
- int rawcount1, rawcount2; /* number of nongap symbols written */
- int apos;
-
- strncpy(sname1, name1, 10);
- sname1[10] = '\0';
- strtok(sname1, WHITESPACE);
-
- strncpy(sname2, name2, 10);
- sname2[10] = '\0';
- strtok(sname2, WHITESPACE);
-
- s1 = aseq1;
- s2 = aseq2;
- rpos1 = spos1;
- rpos2 = spos2;
-
- still_going = TRUE;
- while (still_going)
- {
- still_going = FALSE;
-
- /* get next line's worth from both */
- strncpy(buf1, s1, 60); buf1[60] = '\0';
- strncpy(buf2, s2, 60); buf2[60] = '\0';
- count1 = strlen(buf1);
- count2 = strlen(buf2);
-
- /* is there still more to go? */
- if ((count1 == 60 && s1[60] != '\0') ||
- (count2 == 60 && s2[60] != '\0'))
- still_going = TRUE;
-
- /* shift seq ptrs by a line */
- s1 += count1;
- s2 += count2;
-
- /* assemble the consensus line */
- for (apos = 0; apos < count1 && apos < count2; apos++)
- {
- if (!isgap(buf1[apos]) && !isgap(buf2[apos]))
- {
- if (buf1[apos] == buf2[apos])
- bufmid[apos] = buf1[apos];
- else if (pam[buf1[apos] - 'A'][buf2[apos] - 'A'] > 0)
- bufmid[apos] = '+';
- else
- bufmid[apos] = ' ';
- }
- else
- bufmid[apos] = ' ';
- }
- bufmid[apos] = '\0';
-
- rawcount1 = 0;
- for (apos = 0; apos < count1; apos++)
- if (!isgap(buf1[apos])) rawcount1++;
-
- rawcount2 = 0;
- for (apos = 0; apos < count2; apos++)
- if (!isgap(buf2[apos])) rawcount2++;
-
- (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "",
- sname1, rpos1, buf1, rpos1 + rawcount1 -1);
- (void) fprintf(ofp, "%*s %s\n", indent, "",
- bufmid);
- (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "",
- sname2, rpos2, buf2, rpos2 + rawcount2 -1);
- (void) fprintf(ofp, "\n");
-
- rpos1 += rawcount1;
- rpos2 += rawcount2;
- }
-
- return 1;
-}
-
-
-/* Function: MingapAlignment()
- *
- * Purpose: Remove all-gap columns from a multiple sequence alignment
- * and its associated data. The alignment is assumed to be
- * flushed (all aseqs the same length).
- */
-int
-MingapAlignment(char **aseqs, AINFO *ainfo)
-{
- int apos; /* position in original alignment */
- int mpos; /* position in new alignment */
- int idx;
-
- /* We overwrite aseqs, using its allocated memory.
- */
- for (apos = 0, mpos = 0; aseqs[0][apos] != '\0'; apos++)
- {
- /* check for all-gap in column */
- for (idx = 0; idx < ainfo->nseq; idx++)
- if (! isgap(aseqs[idx][apos]))
- break;
- if (idx == ainfo->nseq) continue;
-
- /* shift alignment and ainfo */
- if (mpos != apos)
- {
- for (idx = 0; idx < ainfo->nseq; idx++)
- aseqs[idx][mpos] = aseqs[idx][apos];
-
- if (ainfo->cs != NULL) ainfo->cs[mpos] = ainfo->cs[apos];
- if (ainfo->rf != NULL) ainfo->rf[mpos] = ainfo->rf[apos];
- }
- mpos++;
- }
- /* null terminate everything */
- for (idx = 0; idx < ainfo->nseq; idx++)
- aseqs[idx][mpos] = '\0';
- ainfo->alen = mpos; /* set new length */
- if (ainfo->cs != NULL) ainfo->cs[mpos] = '\0';
- if (ainfo->rf != NULL) ainfo->rf[mpos] = '\0';
- return 1;
-}
-
-
-
-/* Function: RandomAlignment()
- *
- * Purpose: Create a random alignment from raw sequences.
- *
- * Ideally, we would like to sample an alignment from the
- * space of possible alignments according to its probability,
- * given a prior probability distribution for alignments.
- * I don't see how to describe such a distribution, let alone
- * sample it.
- *
- * This is a rough approximation that tries to capture some
- * desired properties. We assume the alignment is generated
- * by a simple HMM composed of match and insert states.
- * Given parameters (pop, pex) for the probability of opening
- * and extending an insertion, we can find the expected number
- * of match states, M, in the underlying model for each sequence.
- * We use an average M taken over all the sequences (this is
- * an approximation. The expectation of M given all the sequence
- * lengths is a nasty-looking summation.)
- *
- * M = len / ( 1 + pop ( 1 + 1/ (1-pex) ) )
- *
- * Then, we assign positions in each raw sequence onto the M match
- * states and M+1 insert states of this "HMM", by rolling random
- * numbers and inserting the (rlen-M) inserted positions randomly
- * into the insert slots, taking into account the relative probability
- * of open vs. extend.
- *
- * The resulting alignment has two desired properties: insertions
- * tend to follow the HMM-like exponential distribution, and
- * the "sparseness" of the alignment is controllable through
- * pop and pex.
- *
- * Args: rseqs - raw sequences to "align", 0..nseq-1
- * sqinfo - array of 0..nseq-1 info structures for the sequences
- * nseq - number of sequences
- * pop - probability to open insertion (0<pop<1)
- * pex - probability to extend insertion (0<pex<1)
- * ret_aseqs - RETURN: alignment (flushed)
- * ainfo - fill in: alignment info
- *
- * Return: 1 on success, 0 on failure. Sets squid_errno to indicate cause
- * of failure.
- */
-int
-RandomAlignment(char **rseqs, SQINFO *sqinfo, int nseq, float pop, float pex,
- char ***ret_aseqs, AINFO *ainfo)
-{
- char **aseqs; /* RETURN: alignment */
- int alen; /* length of alignment */
- int *rlen; /* lengths of each raw sequence */
- int M; /* length of "model" */
- int **ins; /* insertion counts, 0..nseq-1 by 0..M */
- int *master_ins; /* max insertion counts, 0..M */
- int apos, rpos, idx;
- int statepos;
- int count;
- int minlen;
-
- /* calculate expected length of model, M
- */
- rlen = (int *) MallocOrDie (sizeof(int) * nseq);
- M = 0;
- minlen = 9999999;
- for (idx = 0; idx < nseq; idx++)
- {
- rlen[idx] = strlen(rseqs[idx]);
- M += rlen[idx];
- minlen = (rlen[idx] < minlen) ? rlen[idx] : minlen;
- }
- M = (int) ((float) M / (1.0 + pop * (1.0 + 1.0 / (1.0 - pex))));
- M /= nseq;
- if (M > minlen) M = minlen;
-
- /* make arrays that count insertions in M+1 possible insert states
- */
- ins = (int **) MallocOrDie (sizeof(int *) * nseq);
- master_ins = (int *) MallocOrDie (sizeof(int) * (M+1));
- for (idx = 0; idx < nseq; idx++)
- {
- ins[idx] = (int *) MallocOrDie (sizeof(int) * (M+1));
- for (rpos = 0; rpos <= M; rpos++)
- ins[idx][rpos] = 0;
- }
- /* normalize */
- pop = pop / (pop+pex);
- pex = 1.0 - pop;
- /* make insertions for individual sequences */
- for (idx = 0; idx < nseq; idx++)
- {
- apos = -1;
- for (rpos = 0; rpos < rlen[idx]-M; rpos++)
- {
- if (sre_random() < pop || apos == -1) /* open insertion */
- apos = CHOOSE(M+1); /* choose 0..M */
- ins[idx][apos]++;
- }
- }
- /* calculate master_ins, max inserts */
- alen = M;
- for (apos = 0; apos <= M; apos++)
- {
- master_ins[apos] = 0;
- for (idx = 0; idx < nseq; idx++)
- if (ins[idx][apos] > master_ins[apos])
- master_ins[apos] = ins[idx][apos];
- alen += master_ins[apos];
- }
-
-
- /* Now, construct alignment
- */
- aseqs = (char **) MallocOrDie (sizeof (char *) * nseq);
- for (idx = 0; idx < nseq; idx++)
- aseqs[idx] = (char *) MallocOrDie (sizeof(char) * (alen+1));
- for (idx = 0; idx < nseq; idx++)
- {
- apos = rpos = 0;
-
- for (statepos = 0; statepos <= M; statepos++)
- {
- for (count = 0; count < ins[idx][statepos]; count++)
- aseqs[idx][apos++] = rseqs[idx][rpos++];
- for (; count < master_ins[statepos]; count++)
- aseqs[idx][apos++] = ' ';
-
- if (statepos != M)
- aseqs[idx][apos++] = rseqs[idx][rpos++];
- }
- aseqs[idx][alen] = '\0';
- }
- ainfo->flags = 0;
- ainfo->alen = alen;
- ainfo->nseq = nseq;
- ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO) * nseq);
- for (idx = 0; idx < nseq; idx++)
- SeqinfoCopy(&(ainfo->sqinfo[idx]), &(sqinfo[idx]));
-
- free(rlen);
- free(master_ins);
- Free2DArray((void **) ins, nseq);
- *ret_aseqs = aseqs;
- return 1;
-}
-
-/* Function: AlignmentHomogenousGapsym()
- * Date: SRE, Sun Mar 19 19:37:12 2000 [wren, St. Louis]
- *
- * Purpose: Sometimes we've got to convert alignments to
- * a lowest common denominator, and we need
- * a single specific gap character -- for example,
- * PSI-BLAST blastpgp -B takes a very simplistic
- * alignment input format which appears to only
- * allow '-' as a gap symbol.
- *
- * Anything matching the isgap() macro is
- * converted.
- *
- * Args: aseq - aligned character strings, [0..nseq-1][0..alen-1]
- * nseq - number of aligned strings
- * alen - length of alignment
- * gapsym - character to use for gaps.
- *
- * Returns: void ("never fails")
- */
-void
-AlignmentHomogenousGapsym(char **aseq, int nseq, int alen, char gapsym)
-{
- int i, apos;
-
- for (i = 0; i < nseq; i++)
- for (apos = 0; apos < alen; apos++)
- if (isgap(aseq[i][apos])) aseq[i][apos] = gapsym;
-}