/***************************************************************** * SQUID - a library of functions for biological sequence analysis * Copyright (C) 1992-2002 Washington University School of Medicine * * This source code is freely distributed under the terms of the * GNU General Public License. See the files COPYRIGHT and LICENSE * for details. *****************************************************************/ /* msa.c * SRE, Mon May 17 10:48:47 1999 * * SQUID's interface for multiple sequence alignment * manipulation: access to the MSA object. * * RCS $Id: msa.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: msa.c,v 1.18 2002/10/12 04:40:35 eddy Exp) */ #include #include #include #include "squid.h" #include "msa.h" /* multiple sequence alignment object support */ #include "gki.h" /* string indexing hashtable code */ #include "ssi.h" /* SSI sequence file indexing code */ /* Function: MSAAlloc() * Date: SRE, Tue May 18 10:45:47 1999 [St. Louis] * * Purpose: Allocate an MSA structure, return a pointer * to it. * * Designed to be used in three ways: * 1) We know exactly the dimensions of the alignment: * both nseq and alen. * msa = MSAAlloc(nseq, alen); * * 2) We know the number of sequences but not alen. * (We add sequences later.) * msa = MSAAlloc(nseq, 0); * * 3) We even don't know the number of sequences, so * we'll have to dynamically expand allocations. * We provide a blocksize for the allocation expansion, * and expand when needed. * msa = MSAAlloc(10, 0); * if (msa->nseq == msa->nseqalloc) MSAExpand(msa); * * Args: nseq - number of sequences, or nseq allocation blocksize * alen - length of alignment in columns, or 0 * * Returns: pointer to new MSA object, w/ all values initialized. * Note that msa->nseq is initialized to 0, though space * is allocated. * * Diagnostics: "always works". Die()'s on memory allocation failure. * */ MSA * MSAAlloc(int nseq, int alen) { MSA *msa; int i; msa = MallocOrDie(sizeof(MSA)); msa->aseq = MallocOrDie(sizeof(char *) * nseq); msa->sqname = MallocOrDie(sizeof(char *) * nseq); msa->sqlen = MallocOrDie(sizeof(int) * nseq); msa->wgt = MallocOrDie(sizeof(float) * nseq); for (i = 0; i < nseq; i++) { msa->sqname[i] = NULL; msa->sqlen[i] = 0; msa->wgt[i] = -1.0; if (alen != 0) msa->aseq[i] = MallocOrDie(sizeof(char) * (alen+1)); else msa->aseq[i] = NULL; } msa->alen = alen; msa->nseq = 0; msa->nseqalloc = nseq; msa->nseqlump = nseq; msa->flags = 0; msa->type = kOtherSeq; msa->name = NULL; msa->desc = NULL; msa->acc = NULL; msa->au = NULL; msa->ss_cons = NULL; msa->sa_cons = NULL; msa->rf = NULL; msa->sqacc = NULL; msa->sqdesc = NULL; msa->ss = NULL; msa->sslen = NULL; msa->sa = NULL; msa->salen = NULL; msa->index = GKIInit(); msa->lastidx = 0; for (i = 0; i < MSA_MAXCUTOFFS; i++) { msa->cutoff[i] = 0.; msa->cutoff_is_set[i] = FALSE; } /* Initialize unparsed optional markup */ msa->comment = NULL; msa->ncomment = 0; msa->alloc_ncomment = 0; msa->gf_tag = NULL; msa->gf = NULL; msa->ngf = 0; msa->gs_tag = NULL; msa->gs = NULL; msa->gs_idx = NULL; msa->ngs = 0; msa->gc_tag = NULL; msa->gc = NULL; msa->gc_idx = NULL; msa->ngc = 0; msa->gr_tag = NULL; msa->gr = NULL; msa->gr_idx = NULL; msa->ngr = 0; /* Done. Return the alloced, initialized structure */ return msa; } /* Function: MSAExpand() * Date: SRE, Tue May 18 11:06:53 1999 [St. Louis] * * Purpose: Increase the sequence allocation in an MSA * by msa->nseqlump. (Typically used when we're reading * in an alignment sequentially from a file, * so we don't know nseq until we're done.) * * Args: msa - the MSA object * * Returns: (void) * */ void MSAExpand(MSA *msa) { int i,j; msa->nseqalloc += msa->nseqlump; msa->aseq = ReallocOrDie(msa->aseq, sizeof(char *) * msa->nseqalloc); msa->sqname = ReallocOrDie(msa->sqname, sizeof(char *) * msa->nseqalloc); msa->sqlen = ReallocOrDie(msa->sqlen, sizeof(char *) * msa->nseqalloc); msa->wgt = ReallocOrDie(msa->wgt, sizeof(float) * msa->nseqalloc); if (msa->ss != NULL) { msa->ss = ReallocOrDie(msa->ss, sizeof(char *) * msa->nseqalloc); msa->sslen = ReallocOrDie(msa->sslen, sizeof(int) * msa->nseqalloc); } if (msa->sa != NULL) { msa->sa = ReallocOrDie(msa->sa, sizeof(char *) * msa->nseqalloc); msa->salen = ReallocOrDie(msa->salen, sizeof(int) * msa->nseqalloc); } if (msa->sqacc != NULL) msa->sqacc = ReallocOrDie(msa->sqacc, sizeof(char *) * msa->nseqalloc); if (msa->sqdesc != NULL) msa->sqdesc =ReallocOrDie(msa->sqdesc,sizeof(char *) * msa->nseqalloc); for (i = msa->nseqalloc-msa->nseqlump; i < msa->nseqalloc; i++) { msa->sqname[i] = NULL; msa->wgt[i] = -1.0; if (msa->sqacc != NULL) msa->sqacc[i] = NULL; if (msa->sqdesc != NULL) msa->sqdesc[i] = NULL; if (msa->alen != 0) msa->aseq[i] = ReallocOrDie(msa->aseq[i], sizeof(char) * (msa->alen+1)); else msa->aseq[i] = NULL; msa->sqlen[i] = 0; if (msa->ss != NULL) { if (msa->alen != 0) msa->ss[i] = ReallocOrDie(msa->ss[i], sizeof(char) * (msa->alen+1)); else msa->ss[i] = NULL; msa->sslen[i] = 0; } if (msa->sa != NULL) { if (msa->alen != 0) msa->sa[i] = ReallocOrDie(msa->ss[i], sizeof(char) * (msa->alen+1)); else msa->sa[i] = NULL; msa->salen[i] = 0; } } /* Reallocate and re-init for unparsed #=GS tags, if we have some. * gs is [0..ngs-1][0..nseq-1][], so we're reallocing the middle * set of pointers. */ if (msa->gs != NULL) for (i = 0; i < msa->ngs; i++) { if (msa->gs[i] != NULL) { msa->gs[i] = ReallocOrDie(msa->gs[i], sizeof(char *) * msa->nseqalloc); for (j = msa->nseqalloc-msa->nseqlump; j < msa->nseqalloc; j++) msa->gs[i][j] = NULL; } } /* Reallocate and re-init for unparsed #=GR tags, if we have some. * gr is [0..ngs-1][0..nseq-1][], so we're reallocing the middle * set of pointers. */ if (msa->gr != NULL) for (i = 0; i < msa->ngr; i++) { if (msa->gr[i] != NULL) { msa->gr[i] = ReallocOrDie(msa->gr[i], sizeof(char *) * msa->nseqalloc); for (j = msa->nseqalloc-msa->nseqlump; j < msa->nseqalloc; j++) msa->gr[i][j] = NULL; } } return; } /* Function: MSAFree() * Date: SRE, Tue May 18 11:20:16 1999 [St. Louis] * * Purpose: Free a multiple sequence alignment structure. * * Args: msa - the alignment * * Returns: (void) */ void MSAFree(MSA *msa) { Free2DArray((void **) msa->aseq, msa->nseq); Free2DArray((void **) msa->sqname, msa->nseq); Free2DArray((void **) msa->sqacc, msa->nseq); Free2DArray((void **) msa->sqdesc, msa->nseq); Free2DArray((void **) msa->ss, msa->nseq); Free2DArray((void **) msa->sa, msa->nseq); if (msa->sqlen != NULL) free(msa->sqlen); if (msa->wgt != NULL) free(msa->wgt); if (msa->name != NULL) free(msa->name); if (msa->desc != NULL) free(msa->desc); if (msa->acc != NULL) free(msa->acc); if (msa->au != NULL) free(msa->au); if (msa->ss_cons != NULL) free(msa->ss_cons); if (msa->sa_cons != NULL) free(msa->sa_cons); if (msa->rf != NULL) free(msa->rf); if (msa->sslen != NULL) free(msa->sslen); if (msa->salen != NULL) free(msa->salen); Free2DArray((void **) msa->comment, msa->ncomment); Free2DArray((void **) msa->gf_tag, msa->ngf); Free2DArray((void **) msa->gf, msa->ngf); Free2DArray((void **) msa->gs_tag, msa->ngs); Free3DArray((void ***)msa->gs, msa->ngs, msa->nseq); Free2DArray((void **) msa->gc_tag, msa->ngc); Free2DArray((void **) msa->gc, msa->ngc); Free2DArray((void **) msa->gr_tag, msa->ngr); Free3DArray((void ***)msa->gr, msa->ngr, msa->nseq); GKIFree(msa->index); GKIFree(msa->gs_idx); GKIFree(msa->gc_idx); GKIFree(msa->gr_idx); free(msa); } /* Function: MSASetSeqAccession() * Date: SRE, Mon Jun 21 04:13:33 1999 [Sanger Centre] * * Purpose: Set a sequence accession in an MSA structure. * Handles some necessary allocation/initialization. * * Args: msa - multiple alignment to add accession to * seqidx - index of sequence to attach accession to * acc - accession * * Returns: void */ void MSASetSeqAccession(MSA *msa, int seqidx, char *acc) { int x; if (msa->sqacc == NULL) { msa->sqacc = MallocOrDie(sizeof(char *) * msa->nseqalloc); for (x = 0; x < msa->nseqalloc; x++) msa->sqacc[x] = NULL; } msa->sqacc[seqidx] = sre_strdup(acc, -1); } /* Function: MSASetSeqDescription() * Date: SRE, Mon Jun 21 04:21:09 1999 [Sanger Centre] * * Purpose: Set a sequence description in an MSA structure. * Handles some necessary allocation/initialization. * * Args: msa - multiple alignment to add accession to * seqidx - index of sequence to attach accession to * desc - description * * Returns: void */ void MSASetSeqDescription(MSA *msa, int seqidx, char *desc) { int x; if (msa->sqdesc == NULL) { msa->sqdesc = MallocOrDie(sizeof(char *) * msa->nseqalloc); for (x = 0; x < msa->nseqalloc; x++) msa->sqdesc[x] = NULL; } msa->sqdesc[seqidx] = sre_strdup(desc, -1); } /* Function: MSAAddComment() * Date: SRE, Tue Jun 1 17:37:21 1999 [St. Louis] * * Purpose: Add an (unparsed) comment line to the MSA structure, * allocating as necessary. * * Args: msa - a multiple alignment * s - comment line to add * * Returns: (void) */ void MSAAddComment(MSA *msa, char *s) { /* If this is our first recorded comment, we need to malloc(); * and if we've filled available space, we need to realloc(). * Note the arbitrary lumpsize of 10 lines per allocation... */ if (msa->comment == NULL) { msa->comment = MallocOrDie (sizeof(char *) * 10); msa->alloc_ncomment = 10; } if (msa->ncomment == msa->alloc_ncomment) { msa->alloc_ncomment += 10; msa->comment = ReallocOrDie(msa->comment, sizeof(char *) * msa->alloc_ncomment); } msa->comment[msa->ncomment] = sre_strdup(s, -1); msa->ncomment++; return; } /* Function: MSAAddGF() * Date: SRE, Wed Jun 2 06:53:54 1999 [bus to Madison] * * Purpose: Add an unparsed #=GF markup line to the MSA * structure, allocating as necessary. * * Args: msa - a multiple alignment * tag - markup tag (e.g. "AU") * value - free text markup (e.g. "Alex Bateman") * * Returns: (void) */ void MSAAddGF(MSA *msa, char *tag, char *value) { /* If this is our first recorded unparsed #=GF line, we need to malloc(); * if we've filled availabl space If we already have a hash index, and the GF * Note the arbitrary lumpsize of 10 lines per allocation... */ if (msa->gf_tag == NULL) { msa->gf_tag = MallocOrDie (sizeof(char *) * 10); msa->gf = MallocOrDie (sizeof(char *) * 10); msa->alloc_ngf = 10; } if (msa->ngf == msa->alloc_ngf) { msa->alloc_ngf += 10; msa->gf_tag = ReallocOrDie(msa->gf_tag, sizeof(char *) * msa->alloc_ngf); msa->gf = ReallocOrDie(msa->gf, sizeof(char *) * msa->alloc_ngf); } msa->gf_tag[msa->ngf] = sre_strdup(tag, -1); msa->gf[msa->ngf] = sre_strdup(value, -1); msa->ngf++; return; } /* Function: MSAAddGS() * Date: SRE, Wed Jun 2 06:57:03 1999 [St. Louis] * * Purpose: Add an unparsed #=GS markup line to the MSA * structure, allocating as necessary. * * It's possible that we could get more than one * of the same type of GS tag per sequence; for * example, "DR PDB;" structure links in Pfam. * Hack: handle these by appending to the string, * in a \n separated fashion. * * Args: msa - multiple alignment structure * tag - markup tag (e.g. "AC") * sqidx - index of sequence to assoc markup with (0..nseq-1) * value - markup (e.g. "P00666") * * Returns: 0 on success */ void MSAAddGS(MSA *msa, char *tag, int sqidx, char *value) { int tagidx; int i; /* Is this an unparsed tag name that we recognize? * If not, handle adding it to index, and reallocating * as needed. */ if (msa->gs_tag == NULL) /* first tag? init w/ malloc */ { msa->gs_idx = GKIInit(); tagidx = GKIStoreKey(msa->gs_idx, tag); SQD_DASSERT1((tagidx == 0)); msa->gs_tag = MallocOrDie(sizeof(char *)); msa->gs = MallocOrDie(sizeof(char **)); msa->gs[0] = MallocOrDie(sizeof(char *) * msa->nseqalloc); for (i = 0; i < msa->nseqalloc; i++) msa->gs[0][i] = NULL; } else { /* new tag? */ tagidx = GKIKeyIndex(msa->gs_idx, tag); if (tagidx < 0) { /* it's a new tag name; realloc */ tagidx = GKIStoreKey(msa->gs_idx, tag); /* since we alloc in blocks of 1, we always realloc upon seeing a new tag. */ SQD_DASSERT1((tagidx == msa->ngs)); msa->gs_tag = ReallocOrDie(msa->gs_tag, (msa->ngs+1) * sizeof(char *)); msa->gs = ReallocOrDie(msa->gs, (msa->ngs+1) * sizeof(char **)); msa->gs[msa->ngs] = MallocOrDie(sizeof(char *) * msa->nseqalloc); for (i = 0; i < msa->nseqalloc; i++) msa->gs[msa->ngs][i] = NULL; } } if (tagidx == msa->ngs) { msa->gs_tag[tagidx] = sre_strdup(tag, -1); msa->ngs++; } if (msa->gs[tagidx][sqidx] == NULL) /* first annotation of this seq with this tag? */ msa->gs[tagidx][sqidx] = sre_strdup(value, -1); else { /* >1 annotation of this seq with this tag; append */ int len; if ((len = sre_strcat(&(msa->gs[tagidx][sqidx]), -1, "\n", 1)) < 0) Die("failed to sre_strcat()"); if (sre_strcat(&(msa->gs[tagidx][sqidx]), len, value, -1) < 0) Die("failed to sre_strcat()"); } return; } /* Function: MSAAppendGC() * Date: SRE, Thu Jun 3 06:25:14 1999 [Madison] * * Purpose: Add an unparsed #=GC markup line to the MSA * structure, allocating as necessary. * * When called multiple times for the same tag, * appends value strings together -- used when * parsing multiblock alignment files, for * example. * * Args: msa - multiple alignment structure * tag - markup tag (e.g. "CS") * value - markup, one char per aligned column * * Returns: (void) */ void MSAAppendGC(MSA *msa, char *tag, char *value) { int tagidx; /* Is this an unparsed tag name that we recognize? * If not, handle adding it to index, and reallocating * as needed. */ if (msa->gc_tag == NULL) /* first tag? init w/ malloc */ { msa->gc_tag = MallocOrDie(sizeof(char *)); msa->gc = MallocOrDie(sizeof(char *)); msa->gc_idx = GKIInit(); tagidx = GKIStoreKey(msa->gc_idx, tag); SQD_DASSERT1((tagidx == 0)); msa->gc[0] = NULL; } else { /* new tag? */ tagidx = GKIKeyIndex(msa->gc_idx, tag); if (tagidx < 0) { /* it's a new tag name; realloc */ tagidx = GKIStoreKey(msa->gc_idx, tag); /* since we alloc in blocks of 1, we always realloc upon seeing a new tag. */ SQD_DASSERT1((tagidx == msa->ngc)); msa->gc_tag = ReallocOrDie(msa->gc_tag, (msa->ngc+1) * sizeof(char **)); msa->gc = ReallocOrDie(msa->gc, (msa->ngc+1) * sizeof(char **)); msa->gc[tagidx] = NULL; } } if (tagidx == msa->ngc) { msa->gc_tag[tagidx] = sre_strdup(tag, -1); msa->ngc++; } sre_strcat(&(msa->gc[tagidx]), -1, value, -1); return; } /* Function: MSAGetGC() * Date: SRE, Fri Aug 13 13:25:57 1999 [St. Louis] * * Purpose: Given a tagname for a miscellaneous #=GC column * annotation, return a pointer to the annotation * string. * * Args: msa - alignment and its annotation * tag - name of the annotation * * Returns: ptr to the annotation string. Caller does *not* * free; is managed by msa object still. */ char * MSAGetGC(MSA *msa, char *tag) { int tagidx; if (msa->gc_idx == NULL) return NULL; if ((tagidx = GKIKeyIndex(msa->gc_idx, tag)) < 0) return NULL; return msa->gc[tagidx]; } /* Function: MSAAppendGR() * Date: SRE, Thu Jun 3 06:34:38 1999 [Madison] * * Purpose: Add an unparsed #=GR markup line to the * MSA structure, allocating as necessary. * * When called multiple times for the same tag, * appends value strings together -- used when * parsing multiblock alignment files, for * example. * * Args: msa - multiple alignment structure * tag - markup tag (e.g. "SS") * sqidx - index of seq to assoc markup with (0..nseq-1) * value - markup, one char per aligned column * * Returns: (void) */ void MSAAppendGR(MSA *msa, char *tag, int sqidx, char *value) { int tagidx; int i; /* Is this an unparsed tag name that we recognize? * If not, handle adding it to index, and reallocating * as needed. */ if (msa->gr_tag == NULL) /* first tag? init w/ malloc */ { msa->gr_tag = MallocOrDie(sizeof(char *)); msa->gr = MallocOrDie(sizeof(char **)); msa->gr[0] = MallocOrDie(sizeof(char *) * msa->nseqalloc); for (i = 0; i < msa->nseqalloc; i++) msa->gr[0][i] = NULL; msa->gr_idx = GKIInit(); tagidx = GKIStoreKey(msa->gr_idx, tag); SQD_DASSERT1((tagidx == 0)); } else { /* new tag? */ tagidx = GKIKeyIndex(msa->gr_idx, tag); if (tagidx < 0) { /* it's a new tag name; realloc */ tagidx = GKIStoreKey(msa->gr_idx, tag); /* since we alloc in blocks of 1, we always realloc upon seeing a new tag. */ SQD_DASSERT1((tagidx == msa->ngr)); msa->gr_tag = ReallocOrDie(msa->gr_tag, (msa->ngr+1) * sizeof(char *)); msa->gr = ReallocOrDie(msa->gr, (msa->ngr+1) * sizeof(char **)); msa->gr[msa->ngr] = MallocOrDie(sizeof(char *) * msa->nseqalloc); for (i = 0; i < msa->nseqalloc; i++) msa->gr[msa->ngr][i] = NULL; } } if (tagidx == msa->ngr) { msa->gr_tag[tagidx] = sre_strdup(tag, -1); msa->ngr++; } sre_strcat(&(msa->gr[tagidx][sqidx]), -1, value, -1); return; } /* Function: MSAVerifyParse() * Date: SRE, Sat Jun 5 14:24:24 1999 [Madison, 1999 worm mtg] * * Purpose: Last function called after a multiple alignment is * parsed. Checks that parse was successful; makes sure * required information is present; makes sure required * information is consistent. Some fields that are * only use during parsing may be freed (sqlen, for * example). * * Some fields in msa may be modified (msa->alen is set, * for example). * * Args: msa - the multiple alignment * sqname, aseq must be set * nseq must be correct * alen need not be set; will be set here. * wgt will be set here if not already set * * Returns: (void) * Will Die() here with diagnostics on error. * * Example: */ void MSAVerifyParse(MSA *msa) { int idx; if (msa->nseq == 0) Die("Parse error: no sequences were found for alignment %s", msa->name != NULL ? msa->name : ""); msa->alen = msa->sqlen[0]; /* We can rely on msa->sqname[] being valid for any index, * because of the way the line parsers always store any name * they add to the index. */ for (idx = 0; idx < msa->nseq; idx++) { /* aseq is required. */ if (msa->aseq[idx] == NULL) Die("Parse error: No sequence for %s in alignment %s", msa->sqname[idx], msa->name != NULL ? msa->name : ""); /* either all weights must be set, or none of them */ if ((msa->flags & MSA_SET_WGT) && msa->wgt[idx] == -1.0) Die("Parse error: some weights are set, but %s doesn't have one in alignment %s", msa->sqname[idx], msa->name != NULL ? msa->name : ""); /* all aseq must be same length. */ if (msa->sqlen[idx] != msa->alen) Die("Parse error: sequence %s: length %d, expected %d in alignment %s", msa->sqname[idx], msa->sqlen[idx], msa->alen, msa->name != NULL ? msa->name : ""); /* if SS is present, must have length right */ if (msa->ss != NULL && msa->ss[idx] != NULL && msa->sslen[idx] != msa->alen) Die("Parse error: #=GR SS annotation for %s: length %d, expected %d in alignment %s", msa->sqname[idx], msa->sslen[idx], msa->alen, msa->name != NULL ? msa->name : ""); /* if SA is present, must have length right */ if (msa->sa != NULL && msa->sa[idx] != NULL && msa->salen[idx] != msa->alen) Die("Parse error: #=GR SA annotation for %s: length %d, expected %d in alignment %s", msa->sqname[idx], msa->salen[idx], msa->alen, msa->name != NULL ? msa->name : ""); } /* if cons SS is present, must have length right */ if (msa->ss_cons != NULL && strlen(msa->ss_cons) != msa->alen) Die("Parse error: #=GC SS_cons annotation: length %d, expected %d in alignment %s", strlen(msa->ss_cons), msa->alen, msa->name != NULL ? msa->name : ""); /* if cons SA is present, must have length right */ if (msa->sa_cons != NULL && strlen(msa->sa_cons) != msa->alen) Die("Parse error: #=GC SA_cons annotation: length %d, expected %d in alignment %s", strlen(msa->sa_cons), msa->alen, msa->name != NULL ? msa->name : ""); /* if RF is present, must have length right */ if (msa->rf != NULL && strlen(msa->rf) != msa->alen) Die("Parse error: #=GC RF annotation: length %d, expected %d in alignment %s", strlen(msa->rf), msa->alen, msa->name != NULL ? msa->name : ""); /* Check that all or no weights are set */ if (!(msa->flags & MSA_SET_WGT)) FSet(msa->wgt, msa->nseq, 1.0); /* default weights */ /* Clean up a little from the parser */ if (msa->sqlen != NULL) { free(msa->sqlen); msa->sqlen = NULL; } if (msa->sslen != NULL) { free(msa->sslen); msa->sslen = NULL; } if (msa->salen != NULL) { free(msa->salen); msa->salen = NULL; } return; } /* Function: MSAFileOpen() * Date: SRE, Tue May 18 13:22:01 1999 [St. Louis] * * Purpose: Open an alignment database file and prepare * for reading one alignment, or sequentially * in the (rare) case of multiple MSA databases * (e.g. Stockholm format). * * Args: filename - name of file to open * if "-", read stdin * if it ends in ".gz", read from pipe to gunzip -dc * format - format of file (e.g. MSAFILE_STOCKHOLM) * env - environment variable for path (e.g. BLASTDB) * * Returns: opened MSAFILE * on success. * NULL on failure: * usually, because the file doesn't exist; * for gzip'ed files, may also mean that gzip isn't in the path. */ MSAFILE * MSAFileOpen(char *filename, int format, char *env) { MSAFILE *afp; afp = MallocOrDie(sizeof(MSAFILE)); if (strcmp(filename, "-") == 0) { afp->f = stdin; afp->do_stdin = TRUE; afp->do_gzip = FALSE; afp->fname = sre_strdup("[STDIN]", -1); afp->ssi = NULL; /* can't index stdin because we can't seek*/ } #ifndef SRE_STRICT_ANSI /* popen(), pclose() aren't portable to non-POSIX systems; disable */ else if (Strparse("^.*\\.gz$", filename, 0)) { char cmd[256]; /* Note that popen() will return "successfully" * if file doesn't exist, because gzip works fine * and prints an error! So we have to check for * existence of file ourself. */ if (! FileExists(filename)) Die("%s: file does not exist", filename); if (strlen(filename) + strlen("gzip -dc ") >= 256) Die("filename > 255 char in MSAFileOpen()"); sprintf(cmd, "gzip -dc %s", filename); if ((afp->f = popen(cmd, "r")) == NULL) return NULL; afp->do_stdin = FALSE; afp->do_gzip = TRUE; afp->fname = sre_strdup(filename, -1); /* we can't index a .gz file, because we can't seek in a pipe afaik */ afp->ssi = NULL; } #endif /*SRE_STRICT_ANSI*/ else { char *ssifile; char *dir; /* When we open a file, it may be either in the current * directory, or in the directory indicated by the env * argument - and we have to construct the SSI filename accordingly. */ if ((afp->f = fopen(filename, "r")) != NULL) { ssifile = MallocOrDie(sizeof(char) * (strlen(filename) + 5)); sprintf(ssifile, "%s.ssi", filename); } else if ((afp->f = EnvFileOpen(filename, env, &dir)) != NULL) { char *full; full = FileConcat(dir, filename); ssifile = MallocOrDie(sizeof(char) * (strlen(full) + strlen(filename) + 5)); sprintf(ssifile, "%s.ssi", full); free(dir); } else return NULL; afp->do_stdin = FALSE; afp->do_gzip = FALSE; afp->fname = sre_strdup(filename, -1); afp->ssi = NULL; /* Open the SSI index file. If it doesn't exist, or * it's corrupt, or some error happens, afp->ssi stays NULL. */ SSIOpen(ssifile, &(afp->ssi)); free(ssifile); } /* Invoke autodetection if we haven't already been told what * to expect. */ if (format == MSAFILE_UNKNOWN) { if (afp->do_stdin == TRUE || afp->do_gzip) Die("Can't autodetect alignment file format from a stdin or gzip pipe"); format = MSAFileFormat(afp); if (format == MSAFILE_UNKNOWN) Die("Can't determine format of multiple alignment file %s", afp->fname); } afp->format = format; afp->linenumber = 0; afp->buf = NULL; afp->buflen = 0; return afp; } /* Function: MSAFilePositionByKey() * MSAFilePositionByIndex() * MSAFileRewind() * * Date: SRE, Tue Nov 9 19:02:54 1999 [St. Louis] * * Purpose: Family of functions for repositioning in * open MSA files; analogous to a similarly * named function series in HMMER's hmmio.c. * * Args: afp - open alignment file * offset - disk offset in bytes * key - key to look up in SSI indices * idx - index of alignment. * * Returns: 0 on failure. * 1 on success. * If called on a non-fseek()'able file (e.g. a gzip'ed * or pipe'd alignment), returns 0 as a failure flag. */ int MSAFileRewind(MSAFILE *afp) { if (afp->do_gzip || afp->do_stdin) return 0; rewind(afp->f); return 1; } int MSAFilePositionByKey(MSAFILE *afp, char *key) { int fh; /* filehandle is ignored */ SSIOFFSET offset; /* offset of the key alignment */ if (afp->ssi == NULL) return 0; if (SSIGetOffsetByName(afp->ssi, key, &fh, &offset) != 0) return 0; if (SSISetFilePosition(afp->f, &offset) != 0) return 0; return 1; } int MSAFilePositionByIndex(MSAFILE *afp, int idx) { int fh; /* filehandled is passed but ignored */ SSIOFFSET offset; /* disk offset of desired alignment */ if (afp->ssi == NULL) return 0; if (SSIGetOffsetByNumber(afp->ssi, idx, &fh, &offset) != 0) return 0; if (SSISetFilePosition(afp->f, &offset) != 0) return 0; return 1; } /* Function: MSAFileRead() * Date: SRE, Fri May 28 16:01:43 1999 [St. Louis] * * Purpose: Read the next msa from an open alignment file. * This is a wrapper around format-specific calls. * * Args: afp - open alignment file * * Returns: next alignment, or NULL if out of alignments */ MSA * MSAFileRead(MSAFILE *afp) { MSA *msa = NULL; switch (afp->format) { case MSAFILE_STOCKHOLM: msa = ReadStockholm(afp); break; case MSAFILE_MSF: msa = ReadMSF(afp); break; case MSAFILE_A2M: msa = ReadA2M(afp); break; case MSAFILE_CLUSTAL: msa = ReadClustal(afp); break; case MSAFILE_SELEX: msa = ReadSELEX(afp); break; case MSAFILE_PHYLIP: msa = ReadPhylip(afp); break; default: Die("MSAFILE corrupted: bad format index"); } return msa; } /* Function: MSAFileClose() * Date: SRE, Tue May 18 14:05:28 1999 [St. Louis] * * Purpose: Close an open MSAFILE. * * Args: afp - ptr to an open MSAFILE. * * Returns: void */ void MSAFileClose(MSAFILE *afp) { #ifndef SRE_STRICT_ANSI /* gzip functionality only on POSIX systems */ if (afp->do_gzip) pclose(afp->f); #endif if (! afp->do_stdin) fclose(afp->f); if (afp->buf != NULL) free(afp->buf); if (afp->ssi != NULL) SSIClose(afp->ssi); if (afp->fname != NULL) free(afp->fname); free(afp); } char * MSAFileGetLine(MSAFILE *afp) { char *s; if ((s = sre_fgets(&(afp->buf), &(afp->buflen), afp->f)) == NULL) return NULL; afp->linenumber++; return afp->buf; } void MSAFileWrite(FILE *fp, MSA *msa, int outfmt, int do_oneline) { switch (outfmt) { #ifdef CLUSTALO case MSAFILE_A2M: WriteA2M(stdout, msa, 0); break; case MSAFILE_VIENNA: WriteA2M(stdout, msa, 1); break; #else case MSAFILE_A2M: WriteA2M(stdout, msa); break; #endif case MSAFILE_CLUSTAL: WriteClustal(stdout, msa); break; case MSAFILE_MSF: WriteMSF(stdout, msa); break; case MSAFILE_PHYLIP: WritePhylip(stdout, msa); break; case MSAFILE_SELEX: WriteSELEX(stdout, msa); break; case MSAFILE_STOCKHOLM: if (do_oneline) WriteStockholmOneBlock(stdout, msa); else WriteStockholm(stdout, msa); break; default: Die("can't write. no such alignment format %d\n", outfmt); } } /* Function: MSAGetSeqidx() * Date: SRE, Wed May 19 15:08:25 1999 [St. Louis] * * Purpose: From a sequence name, return seqidx appropriate * for an MSA structure. * * 1) try to guess the index. (pass -1 if you can't guess) * 2) Look up name in msa's hashtable. * 3) If it's a new name, store in msa's hashtable; * expand allocs as needed; * save sqname. * * Args: msa - alignment object * name - a sequence name * guess - a guess at the right index, or -1 if no guess. * * Returns: seqidx */ int MSAGetSeqidx(MSA *msa, char *name, int guess) { int seqidx; /* can we guess? */ if (guess >= 0 && guess < msa->nseq && strcmp(name, msa->sqname[guess]) == 0) return guess; /* else, a lookup in the index */ if ((seqidx = GKIKeyIndex(msa->index, name)) >= 0) return seqidx; /* else, it's a new name */ seqidx = GKIStoreKey(msa->index, name); if (seqidx >= msa->nseqalloc) MSAExpand(msa); msa->sqname[seqidx] = sre_strdup(name, -1); msa->nseq++; return seqidx; } /* Function: MSAFromAINFO() * Date: SRE, Mon Jun 14 11:22:24 1999 [St. Louis] * * Purpose: Convert the old aseq/ainfo alignment structure * to new MSA structure. Enables more rapid conversion * of codebase to the new world order. * * Args: aseq - [0..nseq-1][0..alen-1] alignment * ainfo - old-style optional info * * Returns: MSA * */ MSA * MSAFromAINFO(char **aseq, AINFO *ainfo) { MSA *msa; int i, j; msa = MSAAlloc(ainfo->nseq, ainfo->alen); for (i = 0; i < ainfo->nseq; i++) { strcpy(msa->aseq[i], aseq[i]); msa->wgt[i] = ainfo->wgt[i]; msa->sqname[i] = sre_strdup(ainfo->sqinfo[i].name, -1); msa->sqlen[i] = msa->alen; GKIStoreKey(msa->index, msa->sqname[i]); if (ainfo->sqinfo[i].flags & SQINFO_ACC) MSASetSeqAccession(msa, i, ainfo->sqinfo[i].acc); if (ainfo->sqinfo[i].flags & SQINFO_DESC) MSASetSeqDescription(msa, i, ainfo->sqinfo[i].desc); if (ainfo->sqinfo[i].flags & SQINFO_SS) { if (msa->ss == NULL) { msa->ss = MallocOrDie(sizeof(char *) * msa->nseqalloc); msa->sslen = MallocOrDie(sizeof(int) * msa->nseqalloc); for (j = 0; j < msa->nseqalloc; j++) { msa->ss[j] = NULL; msa->sslen[j] = 0; } } MakeAlignedString(msa->aseq[i], msa->alen, ainfo->sqinfo[i].ss, &(msa->ss[i])); msa->sslen[i] = msa->alen; } if (ainfo->sqinfo[i].flags & SQINFO_SA) { if (msa->sa == NULL) { msa->sa = MallocOrDie(sizeof(char *) * msa->nseqalloc); msa->salen = MallocOrDie(sizeof(int) * msa->nseqalloc); for (j = 0; j < msa->nseqalloc; j++) { msa->sa[j] = NULL; msa->salen[j] = 0; } } MakeAlignedString(msa->aseq[i], msa->alen, ainfo->sqinfo[i].sa, &(msa->sa[i])); msa->salen[i] = msa->alen; } } /* note that sre_strdup() returns NULL when passed NULL */ msa->name = sre_strdup(ainfo->name, -1); msa->desc = sre_strdup(ainfo->desc, -1); msa->acc = sre_strdup(ainfo->acc, -1); msa->au = sre_strdup(ainfo->au, -1); msa->ss_cons = sre_strdup(ainfo->cs, -1); msa->rf = sre_strdup(ainfo->rf, -1); if (ainfo->flags & AINFO_TC) { msa->cutoff[MSA_CUTOFF_TC1] = ainfo->tc1; msa->cutoff_is_set[MSA_CUTOFF_TC1] = TRUE; msa->cutoff[MSA_CUTOFF_TC2] = ainfo->tc2; msa->cutoff_is_set[MSA_CUTOFF_TC2] = TRUE; } if (ainfo->flags & AINFO_NC) { msa->cutoff[MSA_CUTOFF_NC1] = ainfo->nc1; msa->cutoff_is_set[MSA_CUTOFF_NC1] = TRUE; msa->cutoff[MSA_CUTOFF_NC2] = ainfo->nc2; msa->cutoff_is_set[MSA_CUTOFF_NC2] = TRUE; } if (ainfo->flags & AINFO_GA) { msa->cutoff[MSA_CUTOFF_GA1] = ainfo->ga1; msa->cutoff_is_set[MSA_CUTOFF_GA1] = TRUE; msa->cutoff[MSA_CUTOFF_GA2] = ainfo->ga2; msa->cutoff_is_set[MSA_CUTOFF_GA2] = TRUE; } msa->nseq = ainfo->nseq; msa->alen = ainfo->alen; return msa; } /* Function: MSAFileFormat() * Date: SRE, Fri Jun 18 14:26:49 1999 [Sanger Centre] * * Purpose: (Attempt to) determine the format of an alignment file. * Since it rewinds the file pointer when it's done, * cannot be used on a pipe or gzip'ed file. Works by * calling SeqfileFormat() from sqio.c, then making sure * that the format is indeed an alignment. If the format * comes back as FASTA, it assumes that the format as A2M * (e.g. aligned FASTA). * * Args: fname - file to evaluate * * Returns: format code; e.g. MSAFILE_STOCKHOLM */ int MSAFileFormat(MSAFILE *afp) { int fmt; fmt = SeqfileFormat(afp->f); if (fmt == SQFILE_FASTA) fmt = MSAFILE_A2M; if (fmt != MSAFILE_UNKNOWN && ! IsAlignmentFormat(fmt)) Die("File %s does not appear to be an alignment file;\n\ rather, it appears to be an unaligned file in %s format.\n\ I'm expecting an alignment file in this context.\n", afp->fname, SeqfileFormat2String(fmt)); return fmt; } /* Function: MSAMingap() * Date: SRE, Mon Jun 28 18:57:54 1999 [on jury duty, St. Louis Civil Court] * * Purpose: Remove all-gap columns from a multiple sequence alignment * and its associated per-residue data. * * Args: msa - the alignment * * Returns: (void) */ void MSAMingap(MSA *msa) { int *useme; /* array of TRUE/FALSE flags for which columns to keep */ int apos; /* position in original alignment */ int idx; /* sequence index */ useme = MallocOrDie(sizeof(int) * msa->alen); for (apos = 0; apos < msa->alen; apos++) { for (idx = 0; idx < msa->nseq; idx++) if (! isgap(msa->aseq[idx][apos])) break; if (idx == msa->nseq) useme[apos] = FALSE; else useme[apos] = TRUE; } MSAShorterAlignment(msa, useme); free(useme); return; } /* Function: MSANogap() * Date: SRE, Wed Nov 17 09:59:51 1999 [St. Louis] * * Purpose: Remove all columns from a multiple sequence alignment that * contain any gaps -- used for filtering before phylogenetic * analysis. * * Args: msa - the alignment * * Returns: (void). The alignment is modified, so if you want to keep * the original for something, make a copy. */ void MSANogap(MSA *msa) { int *useme; /* array of TRUE/FALSE flags for which columns to keep */ int apos; /* position in original alignment */ int idx; /* sequence index */ useme = MallocOrDie(sizeof(int) * msa->alen); for (apos = 0; apos < msa->alen; apos++) { for (idx = 0; idx < msa->nseq; idx++) if (isgap(msa->aseq[idx][apos])) break; if (idx == msa->nseq) useme[apos] = TRUE; else useme[apos] = FALSE; } MSAShorterAlignment(msa, useme); free(useme); return; } /* Function: MSAShorterAlignment() * Date: SRE, Wed Nov 17 09:49:32 1999 [St. Louis] * * Purpose: Given an array "useme" (0..alen-1) of TRUE/FALSE flags, * where TRUE means "keep this column in the new alignment": * Remove all columns annotated as "FALSE" in the useme * array. * * Args: msa - the alignment. The alignment is changed, so * if you don't want the original screwed up, make * a copy of it first. * useme - TRUE/FALSE flags for columns to keep: 0..alen-1 * * Returns: (void) */ void MSAShorterAlignment(MSA *msa, int *useme) { int apos; /* position in original alignment */ int mpos; /* position in new alignment */ int idx; /* sequence index */ int i; /* markup index */ /* Since we're minimizing, we can overwrite, using already allocated * memory. */ for (apos = 0, mpos = 0; apos < msa->alen; apos++) { if (useme[apos] == FALSE) continue; /* shift alignment and associated per-column+per-residue markup */ if (mpos != apos) { for (idx = 0; idx < msa->nseq; idx++) { msa->aseq[idx][mpos] = msa->aseq[idx][apos]; if (msa->ss != NULL && msa->ss[idx] != NULL) msa->ss[idx][mpos] = msa->ss[idx][apos]; if (msa->sa != NULL && msa->sa[idx] != NULL) msa->sa[idx][mpos] = msa->sa[idx][apos]; for (i = 0; i < msa->ngr; i++) if (msa->gr[i][idx] != NULL) msa->gr[i][idx][mpos] = msa->gr[i][idx][apos]; } if (msa->ss_cons != NULL) msa->ss_cons[mpos] = msa->ss_cons[apos]; if (msa->sa_cons != NULL) msa->sa_cons[mpos] = msa->sa_cons[apos]; if (msa->rf != NULL) msa->rf[mpos] = msa->rf[apos]; for (i = 0; i < msa->ngc; i++) msa->gc[i][mpos] = msa->gc[i][apos]; } mpos++; } msa->alen = mpos; /* set new length */ /* null terminate everything */ for (idx = 0; idx < msa->nseq; idx++) { msa->aseq[idx][mpos] = '\0'; if (msa->ss != NULL && msa->ss[idx] != NULL) msa->ss[idx][mpos] = '\0'; if (msa->sa != NULL && msa->sa[idx] != NULL) msa->sa[idx][mpos] = '\0'; for (i = 0; i < msa->ngr; i++) if (msa->gr[i][idx] != NULL) msa->gr[i][idx][mpos] = '\0'; } if (msa->ss_cons != NULL) msa->ss_cons[mpos] = '\0'; if (msa->sa_cons != NULL) msa->sa_cons[mpos] = '\0'; if (msa->rf != NULL) msa->rf[mpos] = '\0'; for (i = 0; i < msa->ngc; i++) msa->gc[i][mpos] = '\0'; return; } /* Function: MSASmallerAlignment() * Date: SRE, Wed Jun 30 09:56:08 1999 [St. Louis] * * Purpose: Given an array "useme" of TRUE/FALSE flags for * each sequence in an alignment, construct * and return a new alignment containing only * those sequences that are flagged useme=TRUE. * * Used by routines such as MSAFilterAlignment() * and MSASampleAlignment(). * * Limitations: * Does not copy unparsed Stockholm markup. * * Does not make assumptions about meaning of wgt; * if you want the new wgt vector renormalized, do * it yourself with FNorm(new->wgt, new->nseq). * * Args: msa -- the original (larger) alignment * useme -- [0..nseq-1] array of TRUE/FALSE flags; TRUE means include * this seq in new alignment * ret_new -- RETURN: new alignment * * Returns: void * ret_new is allocated here; free with MSAFree() */ void MSASmallerAlignment(MSA *msa, int *useme, MSA **ret_new) { MSA *new; /* RETURN: new alignment */ int nnew; /* number of seqs in new msa (e.g. # of TRUEs) */ int oidx, nidx; /* old, new indices */ int i; nnew = 0; for (oidx = 0; oidx < msa->nseq; oidx++) if (useme[oidx]) nnew++; if (nnew == 0) { *ret_new = NULL; return; } new = MSAAlloc(nnew, 0); nidx = 0; for (oidx = 0; oidx < msa->nseq; oidx++) if (useme[oidx]) { new->aseq[nidx] = sre_strdup(msa->aseq[oidx], msa->alen); new->sqname[nidx] = sre_strdup(msa->sqname[oidx], msa->alen); GKIStoreKey(new->index, msa->sqname[oidx]); new->wgt[nidx] = msa->wgt[oidx]; if (msa->sqacc != NULL) MSASetSeqAccession(new, nidx, msa->sqacc[oidx]); if (msa->sqdesc != NULL) MSASetSeqDescription(new, nidx, msa->sqdesc[oidx]); if (msa->ss != NULL && msa->ss[oidx] != NULL) { if (new->ss == NULL) new->ss = MallocOrDie(sizeof(char *) * new->nseq); new->ss[nidx] = sre_strdup(msa->ss[oidx], -1); } if (msa->sa != NULL && msa->sa[oidx] != NULL) { if (new->sa == NULL) new->sa = MallocOrDie(sizeof(char *) * new->nseq); new->sa[nidx] = sre_strdup(msa->sa[oidx], -1); } nidx++; } new->nseq = nnew; new->alen = msa->alen; new->flags = msa->flags; new->type = msa->type; new->name = sre_strdup(msa->name, -1); new->desc = sre_strdup(msa->desc, -1); new->acc = sre_strdup(msa->acc, -1); new->au = sre_strdup(msa->au, -1); new->ss_cons = sre_strdup(msa->ss_cons, -1); new->sa_cons = sre_strdup(msa->sa_cons, -1); new->rf = sre_strdup(msa->rf, -1); for (i = 0; i < MSA_MAXCUTOFFS; i++) { new->cutoff[i] = msa->cutoff[i]; new->cutoff_is_set[i] = msa->cutoff_is_set[i]; } free(new->sqlen); MSAMingap(new); *ret_new = new; return; } /***************************************************************** * Retrieval routines * * Access to MSA structure data is possible through these routines. * I'm not doing this because of object oriented design, though * it might work in my favor someday. * I'm doing this because lots of MSA data is optional, and * checking through the chain of possible NULLs is a pain. *****************************************************************/ char * MSAGetSeqAccession(MSA *msa, int idx) { if (msa->sqacc != NULL && msa->sqacc[idx] != NULL) return msa->sqacc[idx]; else return NULL; } char * MSAGetSeqDescription(MSA *msa, int idx) { if (msa->sqdesc != NULL && msa->sqdesc[idx] != NULL) return msa->sqdesc[idx]; else return NULL; } char * MSAGetSeqSS(MSA *msa, int idx) { if (msa->ss != NULL && msa->ss[idx] != NULL) return msa->ss[idx]; else return NULL; } char * MSAGetSeqSA(MSA *msa, int idx) { if (msa->sa != NULL && msa->sa[idx] != NULL) return msa->sa[idx]; else return NULL; } /***************************************************************** * Information routines * * Access information about the MSA. *****************************************************************/ /* Function: MSAAverageSequenceLength() * Date: SRE, Sat Apr 6 09:41:34 2002 [St. Louis] * * Purpose: Return the average length of the (unaligned) sequences * in the MSA. * * Args: msa - the alignment * * Returns: average length */ float MSAAverageSequenceLength(MSA *msa) { int i; float avg; avg = 0.; for (i = 0; i < msa->nseq; i++) avg += (float) DealignedLength(msa->aseq[i]); if (msa->nseq == 0) return 0.; else return (avg / msa->nseq); }