/***************************************************************** * SQUID - a library of functions for biological sequence analysis * Copyright (C) 1992-2002 Washington University School of Medicine * * This source code is freely distributed under the terms of the * GNU General Public License. See the files COPYRIGHT and LICENSE * for details. *****************************************************************/ /* alignio.c * SRE, Mon Jul 12 11:57:37 1993 * RCS $Id: alignio.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: alignio.c,v 1.11 2002/10/09 14:26:09 eddy Exp) * * Input/output of sequence alignments. */ #include #include #include #include #include "squid.h" #include "sre_random.h" /* Function: AllocAlignment() * * Purpose: Allocate space for an alignment, given the number * of sequences and the alignment length in columns. * * Args: nseq - number of sequences * alen - width of alignment * ret_aseq - RETURN: alignment itself * ainfo - RETURN: other info associated with alignment * * Return: (void) * aseq, ainfo free'd by caller: FreeAlignment(aseq, &ainfo). * note that ainfo itself is alloc'ed in caller, usually * just by a "AINFO ainfo" definition. */ void AllocAlignment(int nseq, int alen, char ***ret_aseq, AINFO *ainfo) { char **aseq; int idx; InitAinfo(ainfo); aseq = (char **) MallocOrDie (sizeof(char *) * nseq); for (idx = 0; idx < nseq; idx++) aseq[idx] = (char *) MallocOrDie (sizeof(char) * (alen+1)); ainfo->alen = alen; ainfo->nseq = nseq; ainfo->wgt = (float *) MallocOrDie (sizeof(float) * nseq); FSet(ainfo->wgt, nseq, 1.0); ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO) * nseq); for (idx = 0; idx < nseq; idx++) ainfo->sqinfo[idx].flags = 0; *ret_aseq = aseq; } /* Function: InitAinfo() * Date: SRE, Tue Jan 19 10:16:02 1999 [St. Louis] * * Purpose: Initialize the fields in ainfo structure to * default (null) values. Does nothing with * fields that are dependent on nseq or alen. * * Args: ainfo - optional info structure for an alignment * * Returns: (void). ainfo is modified. */ void InitAinfo(AINFO *ainfo) { ainfo->name = NULL; ainfo->desc = NULL; ainfo->cs = NULL; ainfo->rf = NULL; ainfo->acc = NULL; ainfo->au = NULL; ainfo->flags = 0; ainfo->tc1 = ainfo->tc2 = 0.0; ainfo->nc1 = ainfo->nc2 = 0.0; ainfo->ga1 = ainfo->ga2 = 0.0; } /* Function: FreeAlignment() * * Purpose: Free the space allocated to alignment, names, and optional * information. * * Args: aseqs - sequence alignment * ainfo - associated alignment data. */ void FreeAlignment(char **aseqs, AINFO *ainfo) { int i; for (i = 0; i < ainfo->nseq; i++) { if (ainfo->sqinfo[i].flags & SQINFO_SS) free(ainfo->sqinfo[i].ss); if (ainfo->sqinfo[i].flags & SQINFO_SA) free(ainfo->sqinfo[i].sa); } if (ainfo->cs != NULL) free(ainfo->cs); if (ainfo->rf != NULL) free(ainfo->rf); if (ainfo->name != NULL) free(ainfo->name); if (ainfo->desc != NULL) free(ainfo->desc); if (ainfo->acc != NULL) free(ainfo->acc); if (ainfo->au != NULL) free(ainfo->au); free(ainfo->sqinfo); free(ainfo->wgt); Free2DArray((void **) aseqs, ainfo->nseq); } /* Function: SAMizeAlignment() * Date: SRE, Tue Jun 30 09:49:40 1998 [St. Louis] * * Purpose: Make a "best effort" attempt to convert an alignment * to SAM gap format: - in delete col, . in insert col. * Only works if alignment adheres to SAM's upper/lower * case convention, which is true for instance of old * HMMER alignments. * * Args: aseq - alignment to convert * nseq - number of seqs in alignment * alen - length of alignment * * Returns: (void) */ void SAMizeAlignment(char **aseq, int nseq, int alen) { int col; /* counter for aligned columns */ int i; /* counter for seqs */ int sawlower, sawupper, sawgap; char gapchar; for (col = 0; col < alen; col++) { sawlower = sawupper = sawgap = 0; /* pass 1: do we see only upper or lower? */ for (i = 0; i < nseq; i++) { if (isgap(aseq[i][col])) { sawgap = 1; continue; } if (isupper((int) aseq[i][col])) { sawupper = 1; continue; } if (islower((int) aseq[i][col])) sawlower = 1; } /* select gap character for column */ gapchar = '-'; /* default */ if (sawlower && ! sawupper) gapchar = '.'; /* pass 2: set gap char */ for (i = 0; i < nseq; i++) if (isgap(aseq[i][col])) aseq[i][col] = gapchar; } } /* Function: SAMizeAlignmentByGapFrac() * Date: SRE, Tue Jun 30 10:58:38 1998 [St. Louis] * * Purpose: Convert an alignment to SAM's gap and case * conventions, using gap fraction in a column * to choose match versus insert columns. In match columns, * residues are upper case and gaps are '-'. * In insert columns, residues are lower case and * gaps are '.' * * Args: aseq - aligned sequences * nseq - number of sequences * alen - length of alignment * maxgap - if more gaps than this fraction, column is insert. * * Returns: (void) Characters in aseq may be altered. */ void SAMizeAlignmentByGapFrac(char **aseq, int nseq, int alen, float maxgap) { int apos; /* counter over columns */ int idx; /* counter over sequences */ int ngap; /* number of gaps seen */ for (apos = 0; apos < alen; apos++) { /* count gaps */ ngap = 0; for (idx = 0; idx < nseq; idx++) if (isgap(aseq[idx][apos])) ngap++; /* convert to SAM conventions */ if ((float) ngap / (float) nseq > maxgap) { /* insert column */ for (idx = 0; idx < nseq; idx++) if (isgap(aseq[idx][apos])) aseq[idx][apos] = '.'; else aseq[idx][apos] = (char) tolower((int) aseq[idx][apos]); } else { /* match column */ for (idx = 0; idx < nseq; idx++) if (isgap(aseq[idx][apos])) aseq[idx][apos] = '-'; else aseq[idx][apos] = (char) toupper((int) aseq[idx][apos]); } } } /* Function: MakeAlignedString() * * Purpose: Given a raw string of some type (secondary structure, say), * align it to a given aseq by putting gaps wherever the * aseq has gaps. * * Args: aseq: template for alignment * alen: length of aseq * ss: raw string to align to aseq * ret_s: RETURN: aligned ss * * Return: 1 on success, 0 on failure (and squid_errno is set.) * ret_ss is malloc'ed here and must be free'd by caller. */ int MakeAlignedString(char *aseq, int alen, char *ss, char **ret_s) { char *new; int apos, rpos; new = (char *) MallocOrDie ((alen+1) * sizeof(char)); for (apos = rpos = 0; apos < alen; apos++) if (! isgap(aseq[apos])) { new[apos] = ss[rpos]; rpos++; } else new[apos] = '.'; new[apos] = '\0'; if (rpos != strlen(ss)) { squid_errno = SQERR_PARAMETER; free(new); return 0; } *ret_s = new; return 1; } /* Function: MakeDealignedString() * * Purpose: Given an aligned string of some type (either sequence or * secondary structure, for instance), dealign it relative * to a given aseq. Return a ptr to the new string. * * Args: aseq : template alignment * alen : length of aseq * ss: : string to make dealigned copy of; same length as aseq * ret_s : RETURN: dealigned copy of ss * * Return: 1 on success, 0 on failure (and squid_errno is set) * ret_s is alloc'ed here and must be freed by caller */ int MakeDealignedString(char *aseq, int alen, char *ss, char **ret_s) { char *new; int apos, rpos; new = (char *) MallocOrDie ((alen+1) * sizeof(char)); for (apos = rpos = 0; apos < alen; apos++) if (! isgap(aseq[apos])) { new[rpos] = ss[apos]; rpos++; } new[rpos] = '\0'; if (alen != strlen(ss)) { squid_errno = SQERR_PARAMETER; free(new); return 0; } *ret_s = new; return 1; } /* Function: DealignedLength() * * Purpose: Count the number of non-gap symbols in seq. * (i.e. find the length of the unaligned sequence) * * Args: aseq - aligned sequence to count symbols in, \0 terminated * * Return: raw length of seq. */ int DealignedLength(char *aseq) { int rlen; for (rlen = 0; *aseq; aseq++) if (! isgap(*aseq)) rlen++; return rlen; } /* Function: WritePairwiseAlignment() * * Purpose: Write a nice formatted pairwise alignment out, * with a BLAST-style middle line showing identities * as themselves (single letter) and conservative * changes as '+'. * * Args: ofp - open fp to write to (stdout, perhaps) * aseq1, aseq2 - alignments to write (not necessarily * flushed right with gaps) * name1, name2 - names of sequences * spos1, spos2 - starting position in each (raw) sequence * pam - PAM matrix; positive values define * conservative changes * indent - how many extra spaces to print on left * * Return: 1 on success, 0 on failure */ int WritePairwiseAlignment(FILE *ofp, char *aseq1, char *name1, int spos1, char *aseq2, char *name2, int spos2, int **pam, int indent) { char sname1[11]; /* shortened name */ char sname2[11]; int still_going; /* True if writing another block */ char buf1[61]; /* buffer for writing seq1; CPL+1*/ char bufmid[61]; /* buffer for writing consensus */ char buf2[61]; char *s1, *s2; /* ptrs into each sequence */ int count1, count2; /* number of symbols we're writing */ int rpos1, rpos2; /* position in raw seqs */ int rawcount1, rawcount2; /* number of nongap symbols written */ int apos; strncpy(sname1, name1, 10); sname1[10] = '\0'; strtok(sname1, WHITESPACE); strncpy(sname2, name2, 10); sname2[10] = '\0'; strtok(sname2, WHITESPACE); s1 = aseq1; s2 = aseq2; rpos1 = spos1; rpos2 = spos2; still_going = TRUE; while (still_going) { still_going = FALSE; /* get next line's worth from both */ strncpy(buf1, s1, 60); buf1[60] = '\0'; strncpy(buf2, s2, 60); buf2[60] = '\0'; count1 = strlen(buf1); count2 = strlen(buf2); /* is there still more to go? */ if ((count1 == 60 && s1[60] != '\0') || (count2 == 60 && s2[60] != '\0')) still_going = TRUE; /* shift seq ptrs by a line */ s1 += count1; s2 += count2; /* assemble the consensus line */ for (apos = 0; apos < count1 && apos < count2; apos++) { if (!isgap(buf1[apos]) && !isgap(buf2[apos])) { if (buf1[apos] == buf2[apos]) bufmid[apos] = buf1[apos]; else if (pam[buf1[apos] - 'A'][buf2[apos] - 'A'] > 0) bufmid[apos] = '+'; else bufmid[apos] = ' '; } else bufmid[apos] = ' '; } bufmid[apos] = '\0'; rawcount1 = 0; for (apos = 0; apos < count1; apos++) if (!isgap(buf1[apos])) rawcount1++; rawcount2 = 0; for (apos = 0; apos < count2; apos++) if (!isgap(buf2[apos])) rawcount2++; (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "", sname1, rpos1, buf1, rpos1 + rawcount1 -1); (void) fprintf(ofp, "%*s %s\n", indent, "", bufmid); (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "", sname2, rpos2, buf2, rpos2 + rawcount2 -1); (void) fprintf(ofp, "\n"); rpos1 += rawcount1; rpos2 += rawcount2; } return 1; } /* Function: MingapAlignment() * * Purpose: Remove all-gap columns from a multiple sequence alignment * and its associated data. The alignment is assumed to be * flushed (all aseqs the same length). */ int MingapAlignment(char **aseqs, AINFO *ainfo) { int apos; /* position in original alignment */ int mpos; /* position in new alignment */ int idx; /* We overwrite aseqs, using its allocated memory. */ for (apos = 0, mpos = 0; aseqs[0][apos] != '\0'; apos++) { /* check for all-gap in column */ for (idx = 0; idx < ainfo->nseq; idx++) if (! isgap(aseqs[idx][apos])) break; if (idx == ainfo->nseq) continue; /* shift alignment and ainfo */ if (mpos != apos) { for (idx = 0; idx < ainfo->nseq; idx++) aseqs[idx][mpos] = aseqs[idx][apos]; if (ainfo->cs != NULL) ainfo->cs[mpos] = ainfo->cs[apos]; if (ainfo->rf != NULL) ainfo->rf[mpos] = ainfo->rf[apos]; } mpos++; } /* null terminate everything */ for (idx = 0; idx < ainfo->nseq; idx++) aseqs[idx][mpos] = '\0'; ainfo->alen = mpos; /* set new length */ if (ainfo->cs != NULL) ainfo->cs[mpos] = '\0'; if (ainfo->rf != NULL) ainfo->rf[mpos] = '\0'; return 1; } /* Function: RandomAlignment() * * Purpose: Create a random alignment from raw sequences. * * Ideally, we would like to sample an alignment from the * space of possible alignments according to its probability, * given a prior probability distribution for alignments. * I don't see how to describe such a distribution, let alone * sample it. * * This is a rough approximation that tries to capture some * desired properties. We assume the alignment is generated * by a simple HMM composed of match and insert states. * Given parameters (pop, pex) for the probability of opening * and extending an insertion, we can find the expected number * of match states, M, in the underlying model for each sequence. * We use an average M taken over all the sequences (this is * an approximation. The expectation of M given all the sequence * lengths is a nasty-looking summation.) * * M = len / ( 1 + pop ( 1 + 1/ (1-pex) ) ) * * Then, we assign positions in each raw sequence onto the M match * states and M+1 insert states of this "HMM", by rolling random * numbers and inserting the (rlen-M) inserted positions randomly * into the insert slots, taking into account the relative probability * of open vs. extend. * * The resulting alignment has two desired properties: insertions * tend to follow the HMM-like exponential distribution, and * the "sparseness" of the alignment is controllable through * pop and pex. * * Args: rseqs - raw sequences to "align", 0..nseq-1 * sqinfo - array of 0..nseq-1 info structures for the sequences * nseq - number of sequences * pop - probability to open insertion (0 minlen) M = minlen; /* make arrays that count insertions in M+1 possible insert states */ ins = (int **) MallocOrDie (sizeof(int *) * nseq); master_ins = (int *) MallocOrDie (sizeof(int) * (M+1)); for (idx = 0; idx < nseq; idx++) { ins[idx] = (int *) MallocOrDie (sizeof(int) * (M+1)); for (rpos = 0; rpos <= M; rpos++) ins[idx][rpos] = 0; } /* normalize */ pop = pop / (pop+pex); pex = 1.0 - pop; /* make insertions for individual sequences */ for (idx = 0; idx < nseq; idx++) { apos = -1; for (rpos = 0; rpos < rlen[idx]-M; rpos++) { if (sre_random() < pop || apos == -1) /* open insertion */ apos = CHOOSE(M+1); /* choose 0..M */ ins[idx][apos]++; } } /* calculate master_ins, max inserts */ alen = M; for (apos = 0; apos <= M; apos++) { master_ins[apos] = 0; for (idx = 0; idx < nseq; idx++) if (ins[idx][apos] > master_ins[apos]) master_ins[apos] = ins[idx][apos]; alen += master_ins[apos]; } /* Now, construct alignment */ aseqs = (char **) MallocOrDie (sizeof (char *) * nseq); for (idx = 0; idx < nseq; idx++) aseqs[idx] = (char *) MallocOrDie (sizeof(char) * (alen+1)); for (idx = 0; idx < nseq; idx++) { apos = rpos = 0; for (statepos = 0; statepos <= M; statepos++) { for (count = 0; count < ins[idx][statepos]; count++) aseqs[idx][apos++] = rseqs[idx][rpos++]; for (; count < master_ins[statepos]; count++) aseqs[idx][apos++] = ' '; if (statepos != M) aseqs[idx][apos++] = rseqs[idx][rpos++]; } aseqs[idx][alen] = '\0'; } ainfo->flags = 0; ainfo->alen = alen; ainfo->nseq = nseq; ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO) * nseq); for (idx = 0; idx < nseq; idx++) SeqinfoCopy(&(ainfo->sqinfo[idx]), &(sqinfo[idx])); free(rlen); free(master_ins); Free2DArray((void **) ins, nseq); *ret_aseqs = aseqs; return 1; } /* Function: AlignmentHomogenousGapsym() * Date: SRE, Sun Mar 19 19:37:12 2000 [wren, St. Louis] * * Purpose: Sometimes we've got to convert alignments to * a lowest common denominator, and we need * a single specific gap character -- for example, * PSI-BLAST blastpgp -B takes a very simplistic * alignment input format which appears to only * allow '-' as a gap symbol. * * Anything matching the isgap() macro is * converted. * * Args: aseq - aligned character strings, [0..nseq-1][0..alen-1] * nseq - number of aligned strings * alen - length of alignment * gapsym - character to use for gaps. * * Returns: void ("never fails") */ void AlignmentHomogenousGapsym(char **aseq, int nseq, int alen, char gapsym) { int i, apos; for (i = 0; i < nseq; i++) for (apos = 0; apos < alen; apos++) if (isgap(aseq[i][apos])) aseq[i][apos] = gapsym; }