1 /*****************************************************************
2 * SQUID - a library of functions for biological sequence analysis
3 * Copyright (C) 1992-2002 Washington University School of Medicine
5 * This source code is freely distributed under the terms of the
6 * GNU General Public License. See the files COPYRIGHT and LICENSE
8 *****************************************************************/
11 * SRE, Mon May 17 10:48:47 1999
13 * SQUID's interface for multiple sequence alignment
14 * manipulation: access to the MSA object.
16 * RCS $Id: msa.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: msa.c,v 1.18 2002/10/12 04:40:35 eddy Exp)
23 #include "msa.h" /* multiple sequence alignment object support */
24 #include "gki.h" /* string indexing hashtable code */
25 #include "ssi.h" /* SSI sequence file indexing code */
27 /* Function: MSAAlloc()
28 * Date: SRE, Tue May 18 10:45:47 1999 [St. Louis]
30 * Purpose: Allocate an MSA structure, return a pointer
33 * Designed to be used in three ways:
34 * 1) We know exactly the dimensions of the alignment:
36 * msa = MSAAlloc(nseq, alen);
38 * 2) We know the number of sequences but not alen.
39 * (We add sequences later.)
40 * msa = MSAAlloc(nseq, 0);
42 * 3) We even don't know the number of sequences, so
43 * we'll have to dynamically expand allocations.
44 * We provide a blocksize for the allocation expansion,
45 * and expand when needed.
46 * msa = MSAAlloc(10, 0);
47 * if (msa->nseq == msa->nseqalloc) MSAExpand(msa);
49 * Args: nseq - number of sequences, or nseq allocation blocksize
50 * alen - length of alignment in columns, or 0
52 * Returns: pointer to new MSA object, w/ all values initialized.
53 * Note that msa->nseq is initialized to 0, though space
56 * Diagnostics: "always works". Die()'s on memory allocation failure.
60 MSAAlloc(int nseq, int alen)
65 msa = MallocOrDie(sizeof(MSA));
66 msa->aseq = MallocOrDie(sizeof(char *) * nseq);
67 msa->sqname = MallocOrDie(sizeof(char *) * nseq);
68 msa->sqlen = MallocOrDie(sizeof(int) * nseq);
69 msa->wgt = MallocOrDie(sizeof(float) * nseq);
71 for (i = 0; i < nseq; i++)
73 msa->sqname[i] = NULL;
77 if (alen != 0) msa->aseq[i] = MallocOrDie(sizeof(char) * (alen+1));
78 else msa->aseq[i] = NULL;
83 msa->nseqalloc = nseq;
87 msa->type = kOtherSeq;
101 msa->index = GKIInit();
104 for (i = 0; i < MSA_MAXCUTOFFS; i++) {
106 msa->cutoff_is_set[i] = FALSE;
109 /* Initialize unparsed optional markup
113 msa->alloc_ncomment = 0;
134 /* Done. Return the alloced, initialized structure
139 /* Function: MSAExpand()
140 * Date: SRE, Tue May 18 11:06:53 1999 [St. Louis]
142 * Purpose: Increase the sequence allocation in an MSA
143 * by msa->nseqlump. (Typically used when we're reading
144 * in an alignment sequentially from a file,
145 * so we don't know nseq until we're done.)
147 * Args: msa - the MSA object
157 msa->nseqalloc += msa->nseqlump;
159 msa->aseq = ReallocOrDie(msa->aseq, sizeof(char *) * msa->nseqalloc);
160 msa->sqname = ReallocOrDie(msa->sqname, sizeof(char *) * msa->nseqalloc);
161 msa->sqlen = ReallocOrDie(msa->sqlen, sizeof(char *) * msa->nseqalloc);
162 msa->wgt = ReallocOrDie(msa->wgt, sizeof(float) * msa->nseqalloc);
164 if (msa->ss != NULL) {
165 msa->ss = ReallocOrDie(msa->ss, sizeof(char *) * msa->nseqalloc);
166 msa->sslen = ReallocOrDie(msa->sslen, sizeof(int) * msa->nseqalloc);
168 if (msa->sa != NULL) {
169 msa->sa = ReallocOrDie(msa->sa, sizeof(char *) * msa->nseqalloc);
170 msa->salen = ReallocOrDie(msa->salen, sizeof(int) * msa->nseqalloc);
172 if (msa->sqacc != NULL)
173 msa->sqacc = ReallocOrDie(msa->sqacc, sizeof(char *) * msa->nseqalloc);
174 if (msa->sqdesc != NULL)
175 msa->sqdesc =ReallocOrDie(msa->sqdesc,sizeof(char *) * msa->nseqalloc);
177 for (i = msa->nseqalloc-msa->nseqlump; i < msa->nseqalloc; i++)
179 msa->sqname[i] = NULL;
182 if (msa->sqacc != NULL) msa->sqacc[i] = NULL;
183 if (msa->sqdesc != NULL) msa->sqdesc[i] = NULL;
186 msa->aseq[i] = ReallocOrDie(msa->aseq[i], sizeof(char) * (msa->alen+1));
187 else msa->aseq[i] = NULL;
190 if (msa->ss != NULL) {
192 msa->ss[i] = ReallocOrDie(msa->ss[i], sizeof(char) * (msa->alen+1));
193 else msa->ss[i] = NULL;
196 if (msa->sa != NULL) {
198 msa->sa[i] = ReallocOrDie(msa->ss[i], sizeof(char) * (msa->alen+1));
205 /* Reallocate and re-init for unparsed #=GS tags, if we have some.
206 * gs is [0..ngs-1][0..nseq-1][], so we're reallocing the middle
210 for (i = 0; i < msa->ngs; i++)
212 if (msa->gs[i] != NULL)
214 msa->gs[i] = ReallocOrDie(msa->gs[i], sizeof(char *) * msa->nseqalloc);
215 for (j = msa->nseqalloc-msa->nseqlump; j < msa->nseqalloc; j++)
216 msa->gs[i][j] = NULL;
220 /* Reallocate and re-init for unparsed #=GR tags, if we have some.
221 * gr is [0..ngs-1][0..nseq-1][], so we're reallocing the middle
225 for (i = 0; i < msa->ngr; i++)
227 if (msa->gr[i] != NULL)
229 msa->gr[i] = ReallocOrDie(msa->gr[i], sizeof(char *) * msa->nseqalloc);
230 for (j = msa->nseqalloc-msa->nseqlump; j < msa->nseqalloc; j++)
231 msa->gr[i][j] = NULL;
238 /* Function: MSAFree()
239 * Date: SRE, Tue May 18 11:20:16 1999 [St. Louis]
241 * Purpose: Free a multiple sequence alignment structure.
243 * Args: msa - the alignment
250 Free2DArray((void **) msa->aseq, msa->nseq);
251 Free2DArray((void **) msa->sqname, msa->nseq);
252 Free2DArray((void **) msa->sqacc, msa->nseq);
253 Free2DArray((void **) msa->sqdesc, msa->nseq);
254 Free2DArray((void **) msa->ss, msa->nseq);
255 Free2DArray((void **) msa->sa, msa->nseq);
257 if (msa->sqlen != NULL) free(msa->sqlen);
258 if (msa->wgt != NULL) free(msa->wgt);
260 if (msa->name != NULL) free(msa->name);
261 if (msa->desc != NULL) free(msa->desc);
262 if (msa->acc != NULL) free(msa->acc);
263 if (msa->au != NULL) free(msa->au);
264 if (msa->ss_cons != NULL) free(msa->ss_cons);
265 if (msa->sa_cons != NULL) free(msa->sa_cons);
266 if (msa->rf != NULL) free(msa->rf);
267 if (msa->sslen != NULL) free(msa->sslen);
268 if (msa->salen != NULL) free(msa->salen);
270 Free2DArray((void **) msa->comment, msa->ncomment);
271 Free2DArray((void **) msa->gf_tag, msa->ngf);
272 Free2DArray((void **) msa->gf, msa->ngf);
273 Free2DArray((void **) msa->gs_tag, msa->ngs);
274 Free3DArray((void ***)msa->gs, msa->ngs, msa->nseq);
275 Free2DArray((void **) msa->gc_tag, msa->ngc);
276 Free2DArray((void **) msa->gc, msa->ngc);
277 Free2DArray((void **) msa->gr_tag, msa->ngr);
278 Free3DArray((void ***)msa->gr, msa->ngr, msa->nseq);
281 GKIFree(msa->gs_idx);
282 GKIFree(msa->gc_idx);
283 GKIFree(msa->gr_idx);
289 /* Function: MSASetSeqAccession()
290 * Date: SRE, Mon Jun 21 04:13:33 1999 [Sanger Centre]
292 * Purpose: Set a sequence accession in an MSA structure.
293 * Handles some necessary allocation/initialization.
295 * Args: msa - multiple alignment to add accession to
296 * seqidx - index of sequence to attach accession to
302 MSASetSeqAccession(MSA *msa, int seqidx, char *acc)
306 if (msa->sqacc == NULL) {
307 msa->sqacc = MallocOrDie(sizeof(char *) * msa->nseqalloc);
308 for (x = 0; x < msa->nseqalloc; x++)
309 msa->sqacc[x] = NULL;
311 msa->sqacc[seqidx] = sre_strdup(acc, -1);
314 /* Function: MSASetSeqDescription()
315 * Date: SRE, Mon Jun 21 04:21:09 1999 [Sanger Centre]
317 * Purpose: Set a sequence description in an MSA structure.
318 * Handles some necessary allocation/initialization.
320 * Args: msa - multiple alignment to add accession to
321 * seqidx - index of sequence to attach accession to
327 MSASetSeqDescription(MSA *msa, int seqidx, char *desc)
331 if (msa->sqdesc == NULL) {
332 msa->sqdesc = MallocOrDie(sizeof(char *) * msa->nseqalloc);
333 for (x = 0; x < msa->nseqalloc; x++)
334 msa->sqdesc[x] = NULL;
336 msa->sqdesc[seqidx] = sre_strdup(desc, -1);
340 /* Function: MSAAddComment()
341 * Date: SRE, Tue Jun 1 17:37:21 1999 [St. Louis]
343 * Purpose: Add an (unparsed) comment line to the MSA structure,
344 * allocating as necessary.
346 * Args: msa - a multiple alignment
347 * s - comment line to add
352 MSAAddComment(MSA *msa, char *s)
354 /* If this is our first recorded comment, we need to malloc();
355 * and if we've filled available space, we need to realloc().
356 * Note the arbitrary lumpsize of 10 lines per allocation...
358 if (msa->comment == NULL) {
359 msa->comment = MallocOrDie (sizeof(char *) * 10);
360 msa->alloc_ncomment = 10;
362 if (msa->ncomment == msa->alloc_ncomment) {
363 msa->alloc_ncomment += 10;
364 msa->comment = ReallocOrDie(msa->comment, sizeof(char *) * msa->alloc_ncomment);
367 msa->comment[msa->ncomment] = sre_strdup(s, -1);
372 /* Function: MSAAddGF()
373 * Date: SRE, Wed Jun 2 06:53:54 1999 [bus to Madison]
375 * Purpose: Add an unparsed #=GF markup line to the MSA
376 * structure, allocating as necessary.
378 * Args: msa - a multiple alignment
379 * tag - markup tag (e.g. "AU")
380 * value - free text markup (e.g. "Alex Bateman")
385 MSAAddGF(MSA *msa, char *tag, char *value)
387 /* If this is our first recorded unparsed #=GF line, we need to malloc();
388 * if we've filled availabl space If we already have a hash index, and the GF
389 * Note the arbitrary lumpsize of 10 lines per allocation...
391 if (msa->gf_tag == NULL) {
392 msa->gf_tag = MallocOrDie (sizeof(char *) * 10);
393 msa->gf = MallocOrDie (sizeof(char *) * 10);
396 if (msa->ngf == msa->alloc_ngf) {
397 msa->alloc_ngf += 10;
398 msa->gf_tag = ReallocOrDie(msa->gf_tag, sizeof(char *) * msa->alloc_ngf);
399 msa->gf = ReallocOrDie(msa->gf, sizeof(char *) * msa->alloc_ngf);
402 msa->gf_tag[msa->ngf] = sre_strdup(tag, -1);
403 msa->gf[msa->ngf] = sre_strdup(value, -1);
410 /* Function: MSAAddGS()
411 * Date: SRE, Wed Jun 2 06:57:03 1999 [St. Louis]
413 * Purpose: Add an unparsed #=GS markup line to the MSA
414 * structure, allocating as necessary.
416 * It's possible that we could get more than one
417 * of the same type of GS tag per sequence; for
418 * example, "DR PDB;" structure links in Pfam.
419 * Hack: handle these by appending to the string,
420 * in a \n separated fashion.
422 * Args: msa - multiple alignment structure
423 * tag - markup tag (e.g. "AC")
424 * sqidx - index of sequence to assoc markup with (0..nseq-1)
425 * value - markup (e.g. "P00666")
427 * Returns: 0 on success
430 MSAAddGS(MSA *msa, char *tag, int sqidx, char *value)
435 /* Is this an unparsed tag name that we recognize?
436 * If not, handle adding it to index, and reallocating
439 if (msa->gs_tag == NULL) /* first tag? init w/ malloc */
441 msa->gs_idx = GKIInit();
442 tagidx = GKIStoreKey(msa->gs_idx, tag);
443 SQD_DASSERT1((tagidx == 0));
444 msa->gs_tag = MallocOrDie(sizeof(char *));
445 msa->gs = MallocOrDie(sizeof(char **));
446 msa->gs[0] = MallocOrDie(sizeof(char *) * msa->nseqalloc);
447 for (i = 0; i < msa->nseqalloc; i++)
448 msa->gs[0][i] = NULL;
453 tagidx = GKIKeyIndex(msa->gs_idx, tag);
454 if (tagidx < 0) { /* it's a new tag name; realloc */
455 tagidx = GKIStoreKey(msa->gs_idx, tag);
456 /* since we alloc in blocks of 1,
457 we always realloc upon seeing
459 SQD_DASSERT1((tagidx == msa->ngs));
460 msa->gs_tag = ReallocOrDie(msa->gs_tag, (msa->ngs+1) * sizeof(char *));
461 msa->gs = ReallocOrDie(msa->gs, (msa->ngs+1) * sizeof(char **));
462 msa->gs[msa->ngs] = MallocOrDie(sizeof(char *) * msa->nseqalloc);
463 for (i = 0; i < msa->nseqalloc; i++)
464 msa->gs[msa->ngs][i] = NULL;
468 if (tagidx == msa->ngs) {
469 msa->gs_tag[tagidx] = sre_strdup(tag, -1);
473 if (msa->gs[tagidx][sqidx] == NULL) /* first annotation of this seq with this tag? */
474 msa->gs[tagidx][sqidx] = sre_strdup(value, -1);
476 /* >1 annotation of this seq with this tag; append */
478 if ((len = sre_strcat(&(msa->gs[tagidx][sqidx]), -1, "\n", 1)) < 0)
479 Die("failed to sre_strcat()");
480 if (sre_strcat(&(msa->gs[tagidx][sqidx]), len, value, -1) < 0)
481 Die("failed to sre_strcat()");
486 /* Function: MSAAppendGC()
487 * Date: SRE, Thu Jun 3 06:25:14 1999 [Madison]
489 * Purpose: Add an unparsed #=GC markup line to the MSA
490 * structure, allocating as necessary.
492 * When called multiple times for the same tag,
493 * appends value strings together -- used when
494 * parsing multiblock alignment files, for
497 * Args: msa - multiple alignment structure
498 * tag - markup tag (e.g. "CS")
499 * value - markup, one char per aligned column
504 MSAAppendGC(MSA *msa, char *tag, char *value)
508 /* Is this an unparsed tag name that we recognize?
509 * If not, handle adding it to index, and reallocating
512 if (msa->gc_tag == NULL) /* first tag? init w/ malloc */
514 msa->gc_tag = MallocOrDie(sizeof(char *));
515 msa->gc = MallocOrDie(sizeof(char *));
516 msa->gc_idx = GKIInit();
517 tagidx = GKIStoreKey(msa->gc_idx, tag);
518 SQD_DASSERT1((tagidx == 0));
523 tagidx = GKIKeyIndex(msa->gc_idx, tag);
524 if (tagidx < 0) { /* it's a new tag name; realloc */
525 tagidx = GKIStoreKey(msa->gc_idx, tag);
526 /* since we alloc in blocks of 1,
527 we always realloc upon seeing
529 SQD_DASSERT1((tagidx == msa->ngc));
530 msa->gc_tag = ReallocOrDie(msa->gc_tag, (msa->ngc+1) * sizeof(char **));
531 msa->gc = ReallocOrDie(msa->gc, (msa->ngc+1) * sizeof(char **));
532 msa->gc[tagidx] = NULL;
536 if (tagidx == msa->ngc) {
537 msa->gc_tag[tagidx] = sre_strdup(tag, -1);
540 sre_strcat(&(msa->gc[tagidx]), -1, value, -1);
544 /* Function: MSAGetGC()
545 * Date: SRE, Fri Aug 13 13:25:57 1999 [St. Louis]
547 * Purpose: Given a tagname for a miscellaneous #=GC column
548 * annotation, return a pointer to the annotation
551 * Args: msa - alignment and its annotation
552 * tag - name of the annotation
554 * Returns: ptr to the annotation string. Caller does *not*
555 * free; is managed by msa object still.
558 MSAGetGC(MSA *msa, char *tag)
562 if (msa->gc_idx == NULL) return NULL;
563 if ((tagidx = GKIKeyIndex(msa->gc_idx, tag)) < 0) return NULL;
564 return msa->gc[tagidx];
568 /* Function: MSAAppendGR()
569 * Date: SRE, Thu Jun 3 06:34:38 1999 [Madison]
571 * Purpose: Add an unparsed #=GR markup line to the
572 * MSA structure, allocating as necessary.
574 * When called multiple times for the same tag,
575 * appends value strings together -- used when
576 * parsing multiblock alignment files, for
579 * Args: msa - multiple alignment structure
580 * tag - markup tag (e.g. "SS")
581 * sqidx - index of seq to assoc markup with (0..nseq-1)
582 * value - markup, one char per aligned column
587 MSAAppendGR(MSA *msa, char *tag, int sqidx, char *value)
592 /* Is this an unparsed tag name that we recognize?
593 * If not, handle adding it to index, and reallocating
596 if (msa->gr_tag == NULL) /* first tag? init w/ malloc */
598 msa->gr_tag = MallocOrDie(sizeof(char *));
599 msa->gr = MallocOrDie(sizeof(char **));
600 msa->gr[0] = MallocOrDie(sizeof(char *) * msa->nseqalloc);
601 for (i = 0; i < msa->nseqalloc; i++)
602 msa->gr[0][i] = NULL;
603 msa->gr_idx = GKIInit();
604 tagidx = GKIStoreKey(msa->gr_idx, tag);
605 SQD_DASSERT1((tagidx == 0));
610 tagidx = GKIKeyIndex(msa->gr_idx, tag);
611 if (tagidx < 0) { /* it's a new tag name; realloc */
612 tagidx = GKIStoreKey(msa->gr_idx, tag);
613 /* since we alloc in blocks of 1,
614 we always realloc upon seeing
616 SQD_DASSERT1((tagidx == msa->ngr));
617 msa->gr_tag = ReallocOrDie(msa->gr_tag, (msa->ngr+1) * sizeof(char *));
618 msa->gr = ReallocOrDie(msa->gr, (msa->ngr+1) * sizeof(char **));
619 msa->gr[msa->ngr] = MallocOrDie(sizeof(char *) * msa->nseqalloc);
620 for (i = 0; i < msa->nseqalloc; i++)
621 msa->gr[msa->ngr][i] = NULL;
625 if (tagidx == msa->ngr) {
626 msa->gr_tag[tagidx] = sre_strdup(tag, -1);
629 sre_strcat(&(msa->gr[tagidx][sqidx]), -1, value, -1);
634 /* Function: MSAVerifyParse()
635 * Date: SRE, Sat Jun 5 14:24:24 1999 [Madison, 1999 worm mtg]
637 * Purpose: Last function called after a multiple alignment is
638 * parsed. Checks that parse was successful; makes sure
639 * required information is present; makes sure required
640 * information is consistent. Some fields that are
641 * only use during parsing may be freed (sqlen, for
644 * Some fields in msa may be modified (msa->alen is set,
647 * Args: msa - the multiple alignment
648 * sqname, aseq must be set
649 * nseq must be correct
650 * alen need not be set; will be set here.
651 * wgt will be set here if not already set
654 * Will Die() here with diagnostics on error.
659 MSAVerifyParse(MSA *msa)
663 if (msa->nseq == 0) Die("Parse error: no sequences were found for alignment %s",
664 msa->name != NULL ? msa->name : "");
666 msa->alen = msa->sqlen[0];
668 /* We can rely on msa->sqname[] being valid for any index,
669 * because of the way the line parsers always store any name
670 * they add to the index.
672 for (idx = 0; idx < msa->nseq; idx++)
674 /* aseq is required. */
675 if (msa->aseq[idx] == NULL)
676 Die("Parse error: No sequence for %s in alignment %s", msa->sqname[idx],
677 msa->name != NULL ? msa->name : "");
678 /* either all weights must be set, or none of them */
679 if ((msa->flags & MSA_SET_WGT) && msa->wgt[idx] == -1.0)
680 Die("Parse error: some weights are set, but %s doesn't have one in alignment %s",
682 msa->name != NULL ? msa->name : "");
683 /* all aseq must be same length. */
684 if (msa->sqlen[idx] != msa->alen)
685 Die("Parse error: sequence %s: length %d, expected %d in alignment %s",
686 msa->sqname[idx], msa->sqlen[idx], msa->alen,
687 msa->name != NULL ? msa->name : "");
688 /* if SS is present, must have length right */
689 if (msa->ss != NULL && msa->ss[idx] != NULL && msa->sslen[idx] != msa->alen)
690 Die("Parse error: #=GR SS annotation for %s: length %d, expected %d in alignment %s",
691 msa->sqname[idx], msa->sslen[idx], msa->alen,
692 msa->name != NULL ? msa->name : "");
693 /* if SA is present, must have length right */
694 if (msa->sa != NULL && msa->sa[idx] != NULL && msa->salen[idx] != msa->alen)
695 Die("Parse error: #=GR SA annotation for %s: length %d, expected %d in alignment %s",
696 msa->sqname[idx], msa->salen[idx], msa->alen,
697 msa->name != NULL ? msa->name : "");
700 /* if cons SS is present, must have length right */
701 if (msa->ss_cons != NULL && strlen(msa->ss_cons) != msa->alen)
702 Die("Parse error: #=GC SS_cons annotation: length %d, expected %d in alignment %s",
703 strlen(msa->ss_cons), msa->alen,
704 msa->name != NULL ? msa->name : "");
706 /* if cons SA is present, must have length right */
707 if (msa->sa_cons != NULL && strlen(msa->sa_cons) != msa->alen)
708 Die("Parse error: #=GC SA_cons annotation: length %d, expected %d in alignment %s",
709 strlen(msa->sa_cons), msa->alen,
710 msa->name != NULL ? msa->name : "");
712 /* if RF is present, must have length right */
713 if (msa->rf != NULL && strlen(msa->rf) != msa->alen)
714 Die("Parse error: #=GC RF annotation: length %d, expected %d in alignment %s",
715 strlen(msa->rf), msa->alen,
716 msa->name != NULL ? msa->name : "");
718 /* Check that all or no weights are set */
719 if (!(msa->flags & MSA_SET_WGT))
720 FSet(msa->wgt, msa->nseq, 1.0); /* default weights */
722 /* Clean up a little from the parser */
723 if (msa->sqlen != NULL) { free(msa->sqlen); msa->sqlen = NULL; }
724 if (msa->sslen != NULL) { free(msa->sslen); msa->sslen = NULL; }
725 if (msa->salen != NULL) { free(msa->salen); msa->salen = NULL; }
733 /* Function: MSAFileOpen()
734 * Date: SRE, Tue May 18 13:22:01 1999 [St. Louis]
736 * Purpose: Open an alignment database file and prepare
737 * for reading one alignment, or sequentially
738 * in the (rare) case of multiple MSA databases
739 * (e.g. Stockholm format).
741 * Args: filename - name of file to open
743 * if it ends in ".gz", read from pipe to gunzip -dc
744 * format - format of file (e.g. MSAFILE_STOCKHOLM)
745 * env - environment variable for path (e.g. BLASTDB)
747 * Returns: opened MSAFILE * on success.
749 * usually, because the file doesn't exist;
750 * for gzip'ed files, may also mean that gzip isn't in the path.
753 MSAFileOpen(char *filename, int format, char *env)
757 afp = MallocOrDie(sizeof(MSAFILE));
758 if (strcmp(filename, "-") == 0)
761 afp->do_stdin = TRUE;
762 afp->do_gzip = FALSE;
763 afp->fname = sre_strdup("[STDIN]", -1);
764 afp->ssi = NULL; /* can't index stdin because we can't seek*/
766 #ifndef SRE_STRICT_ANSI
767 /* popen(), pclose() aren't portable to non-POSIX systems; disable */
768 else if (Strparse("^.*\\.gz$", filename, 0))
772 /* Note that popen() will return "successfully"
773 * if file doesn't exist, because gzip works fine
774 * and prints an error! So we have to check for
775 * existence of file ourself.
777 if (! FileExists(filename))
778 Die("%s: file does not exist", filename);
779 if (strlen(filename) + strlen("gzip -dc ") >= 256)
780 Die("filename > 255 char in MSAFileOpen()");
781 sprintf(cmd, "gzip -dc %s", filename);
782 if ((afp->f = popen(cmd, "r")) == NULL)
785 afp->do_stdin = FALSE;
787 afp->fname = sre_strdup(filename, -1);
788 /* we can't index a .gz file, because we can't seek in a pipe afaik */
791 #endif /*SRE_STRICT_ANSI*/
797 /* When we open a file, it may be either in the current
798 * directory, or in the directory indicated by the env
799 * argument - and we have to construct the SSI filename accordingly.
801 if ((afp->f = fopen(filename, "r")) != NULL)
803 ssifile = MallocOrDie(sizeof(char) * (strlen(filename) + 5));
804 sprintf(ssifile, "%s.ssi", filename);
806 else if ((afp->f = EnvFileOpen(filename, env, &dir)) != NULL)
809 full = FileConcat(dir, filename);
810 ssifile = MallocOrDie(sizeof(char) * (strlen(full) + strlen(filename) + 5));
811 sprintf(ssifile, "%s.ssi", full);
816 afp->do_stdin = FALSE;
817 afp->do_gzip = FALSE;
818 afp->fname = sre_strdup(filename, -1);
821 /* Open the SSI index file. If it doesn't exist, or
822 * it's corrupt, or some error happens, afp->ssi stays NULL.
824 SSIOpen(ssifile, &(afp->ssi));
828 /* Invoke autodetection if we haven't already been told what
831 if (format == MSAFILE_UNKNOWN)
833 if (afp->do_stdin == TRUE || afp->do_gzip)
834 Die("Can't autodetect alignment file format from a stdin or gzip pipe");
835 format = MSAFileFormat(afp);
836 if (format == MSAFILE_UNKNOWN)
837 Die("Can't determine format of multiple alignment file %s", afp->fname);
840 afp->format = format;
849 /* Function: MSAFilePositionByKey()
850 * MSAFilePositionByIndex()
853 * Date: SRE, Tue Nov 9 19:02:54 1999 [St. Louis]
855 * Purpose: Family of functions for repositioning in
856 * open MSA files; analogous to a similarly
857 * named function series in HMMER's hmmio.c.
859 * Args: afp - open alignment file
860 * offset - disk offset in bytes
861 * key - key to look up in SSI indices
862 * idx - index of alignment.
864 * Returns: 0 on failure.
866 * If called on a non-fseek()'able file (e.g. a gzip'ed
867 * or pipe'd alignment), returns 0 as a failure flag.
870 MSAFileRewind(MSAFILE *afp)
872 if (afp->do_gzip || afp->do_stdin) return 0;
877 MSAFilePositionByKey(MSAFILE *afp, char *key)
879 int fh; /* filehandle is ignored */
880 SSIOFFSET offset; /* offset of the key alignment */
882 if (afp->ssi == NULL) return 0;
883 if (SSIGetOffsetByName(afp->ssi, key, &fh, &offset) != 0) return 0;
884 if (SSISetFilePosition(afp->f, &offset) != 0) return 0;
888 MSAFilePositionByIndex(MSAFILE *afp, int idx)
890 int fh; /* filehandled is passed but ignored */
891 SSIOFFSET offset; /* disk offset of desired alignment */
893 if (afp->ssi == NULL) return 0;
894 if (SSIGetOffsetByNumber(afp->ssi, idx, &fh, &offset) != 0) return 0;
895 if (SSISetFilePosition(afp->f, &offset) != 0) return 0;
900 /* Function: MSAFileRead()
901 * Date: SRE, Fri May 28 16:01:43 1999 [St. Louis]
903 * Purpose: Read the next msa from an open alignment file.
904 * This is a wrapper around format-specific calls.
906 * Args: afp - open alignment file
908 * Returns: next alignment, or NULL if out of alignments
911 MSAFileRead(MSAFILE *afp)
915 switch (afp->format) {
916 case MSAFILE_STOCKHOLM: msa = ReadStockholm(afp); break;
917 case MSAFILE_MSF: msa = ReadMSF(afp); break;
918 case MSAFILE_A2M: msa = ReadA2M(afp); break;
919 case MSAFILE_CLUSTAL: msa = ReadClustal(afp); break;
920 case MSAFILE_SELEX: msa = ReadSELEX(afp); break;
921 case MSAFILE_PHYLIP: msa = ReadPhylip(afp); break;
923 Die("MSAFILE corrupted: bad format index");
928 /* Function: MSAFileClose()
929 * Date: SRE, Tue May 18 14:05:28 1999 [St. Louis]
931 * Purpose: Close an open MSAFILE.
933 * Args: afp - ptr to an open MSAFILE.
938 MSAFileClose(MSAFILE *afp)
940 #ifndef SRE_STRICT_ANSI /* gzip functionality only on POSIX systems */
941 if (afp->do_gzip) pclose(afp->f);
943 if (! afp->do_stdin) fclose(afp->f);
944 if (afp->buf != NULL) free(afp->buf);
945 if (afp->ssi != NULL) SSIClose(afp->ssi);
946 if (afp->fname != NULL) free(afp->fname);
951 MSAFileGetLine(MSAFILE *afp)
954 if ((s = sre_fgets(&(afp->buf), &(afp->buflen), afp->f)) == NULL)
961 MSAFileWrite(FILE *fp, MSA *msa, int outfmt, int do_oneline)
965 case MSAFILE_A2M: WriteA2M(stdout, msa, 0); break;
966 case MSAFILE_VIENNA: WriteA2M(stdout, msa, 1); break;
968 case MSAFILE_A2M: WriteA2M(stdout, msa); break;
970 case MSAFILE_CLUSTAL: WriteClustal(stdout, msa); break;
971 case MSAFILE_MSF: WriteMSF(stdout, msa); break;
972 case MSAFILE_PHYLIP: WritePhylip(stdout, msa); break;
973 case MSAFILE_SELEX: WriteSELEX(stdout, msa); break;
974 case MSAFILE_STOCKHOLM:
975 if (do_oneline) WriteStockholmOneBlock(stdout, msa);
976 else WriteStockholm(stdout, msa);
979 Die("can't write. no such alignment format %d\n", outfmt);
983 /* Function: MSAGetSeqidx()
984 * Date: SRE, Wed May 19 15:08:25 1999 [St. Louis]
986 * Purpose: From a sequence name, return seqidx appropriate
987 * for an MSA structure.
989 * 1) try to guess the index. (pass -1 if you can't guess)
990 * 2) Look up name in msa's hashtable.
991 * 3) If it's a new name, store in msa's hashtable;
992 * expand allocs as needed;
995 * Args: msa - alignment object
996 * name - a sequence name
997 * guess - a guess at the right index, or -1 if no guess.
1002 MSAGetSeqidx(MSA *msa, char *name, int guess)
1006 if (guess >= 0 && guess < msa->nseq && strcmp(name, msa->sqname[guess]) == 0)
1008 /* else, a lookup in the index */
1009 if ((seqidx = GKIKeyIndex(msa->index, name)) >= 0)
1011 /* else, it's a new name */
1012 seqidx = GKIStoreKey(msa->index, name);
1013 if (seqidx >= msa->nseqalloc) MSAExpand(msa);
1015 msa->sqname[seqidx] = sre_strdup(name, -1);
1021 /* Function: MSAFromAINFO()
1022 * Date: SRE, Mon Jun 14 11:22:24 1999 [St. Louis]
1024 * Purpose: Convert the old aseq/ainfo alignment structure
1025 * to new MSA structure. Enables more rapid conversion
1026 * of codebase to the new world order.
1028 * Args: aseq - [0..nseq-1][0..alen-1] alignment
1029 * ainfo - old-style optional info
1034 MSAFromAINFO(char **aseq, AINFO *ainfo)
1039 msa = MSAAlloc(ainfo->nseq, ainfo->alen);
1040 for (i = 0; i < ainfo->nseq; i++)
1042 strcpy(msa->aseq[i], aseq[i]);
1043 msa->wgt[i] = ainfo->wgt[i];
1044 msa->sqname[i] = sre_strdup(ainfo->sqinfo[i].name, -1);
1045 msa->sqlen[i] = msa->alen;
1046 GKIStoreKey(msa->index, msa->sqname[i]);
1048 if (ainfo->sqinfo[i].flags & SQINFO_ACC)
1049 MSASetSeqAccession(msa, i, ainfo->sqinfo[i].acc);
1051 if (ainfo->sqinfo[i].flags & SQINFO_DESC)
1052 MSASetSeqDescription(msa, i, ainfo->sqinfo[i].desc);
1054 if (ainfo->sqinfo[i].flags & SQINFO_SS) {
1055 if (msa->ss == NULL) {
1056 msa->ss = MallocOrDie(sizeof(char *) * msa->nseqalloc);
1057 msa->sslen = MallocOrDie(sizeof(int) * msa->nseqalloc);
1058 for (j = 0; j < msa->nseqalloc; j++) {
1063 MakeAlignedString(msa->aseq[i], msa->alen, ainfo->sqinfo[i].ss, &(msa->ss[i]));
1064 msa->sslen[i] = msa->alen;
1067 if (ainfo->sqinfo[i].flags & SQINFO_SA) {
1068 if (msa->sa == NULL) {
1069 msa->sa = MallocOrDie(sizeof(char *) * msa->nseqalloc);
1070 msa->salen = MallocOrDie(sizeof(int) * msa->nseqalloc);
1071 for (j = 0; j < msa->nseqalloc; j++) {
1076 MakeAlignedString(msa->aseq[i], msa->alen, ainfo->sqinfo[i].sa, &(msa->sa[i]));
1077 msa->salen[i] = msa->alen;
1080 /* note that sre_strdup() returns NULL when passed NULL */
1081 msa->name = sre_strdup(ainfo->name, -1);
1082 msa->desc = sre_strdup(ainfo->desc, -1);
1083 msa->acc = sre_strdup(ainfo->acc, -1);
1084 msa->au = sre_strdup(ainfo->au, -1);
1085 msa->ss_cons = sre_strdup(ainfo->cs, -1);
1086 msa->rf = sre_strdup(ainfo->rf, -1);
1087 if (ainfo->flags & AINFO_TC) {
1088 msa->cutoff[MSA_CUTOFF_TC1] = ainfo->tc1; msa->cutoff_is_set[MSA_CUTOFF_TC1] = TRUE;
1089 msa->cutoff[MSA_CUTOFF_TC2] = ainfo->tc2; msa->cutoff_is_set[MSA_CUTOFF_TC2] = TRUE;
1091 if (ainfo->flags & AINFO_NC) {
1092 msa->cutoff[MSA_CUTOFF_NC1] = ainfo->nc1; msa->cutoff_is_set[MSA_CUTOFF_NC1] = TRUE;
1093 msa->cutoff[MSA_CUTOFF_NC2] = ainfo->nc2; msa->cutoff_is_set[MSA_CUTOFF_NC2] = TRUE;
1095 if (ainfo->flags & AINFO_GA) {
1096 msa->cutoff[MSA_CUTOFF_GA1] = ainfo->ga1; msa->cutoff_is_set[MSA_CUTOFF_GA1] = TRUE;
1097 msa->cutoff[MSA_CUTOFF_GA2] = ainfo->ga2; msa->cutoff_is_set[MSA_CUTOFF_GA2] = TRUE;
1099 msa->nseq = ainfo->nseq;
1100 msa->alen = ainfo->alen;
1107 /* Function: MSAFileFormat()
1108 * Date: SRE, Fri Jun 18 14:26:49 1999 [Sanger Centre]
1110 * Purpose: (Attempt to) determine the format of an alignment file.
1111 * Since it rewinds the file pointer when it's done,
1112 * cannot be used on a pipe or gzip'ed file. Works by
1113 * calling SeqfileFormat() from sqio.c, then making sure
1114 * that the format is indeed an alignment. If the format
1115 * comes back as FASTA, it assumes that the format as A2M
1116 * (e.g. aligned FASTA).
1118 * Args: fname - file to evaluate
1120 * Returns: format code; e.g. MSAFILE_STOCKHOLM
1123 MSAFileFormat(MSAFILE *afp)
1127 fmt = SeqfileFormat(afp->f);
1129 if (fmt == SQFILE_FASTA) fmt = MSAFILE_A2M;
1131 if (fmt != MSAFILE_UNKNOWN && ! IsAlignmentFormat(fmt))
1132 Die("File %s does not appear to be an alignment file;\n\
1133 rather, it appears to be an unaligned file in %s format.\n\
1134 I'm expecting an alignment file in this context.\n",
1136 SeqfileFormat2String(fmt));
1141 /* Function: MSAMingap()
1142 * Date: SRE, Mon Jun 28 18:57:54 1999 [on jury duty, St. Louis Civil Court]
1144 * Purpose: Remove all-gap columns from a multiple sequence alignment
1145 * and its associated per-residue data.
1147 * Args: msa - the alignment
1154 int *useme; /* array of TRUE/FALSE flags for which columns to keep */
1155 int apos; /* position in original alignment */
1156 int idx; /* sequence index */
1158 useme = MallocOrDie(sizeof(int) * msa->alen);
1159 for (apos = 0; apos < msa->alen; apos++)
1161 for (idx = 0; idx < msa->nseq; idx++)
1162 if (! isgap(msa->aseq[idx][apos]))
1164 if (idx == msa->nseq) useme[apos] = FALSE; else useme[apos] = TRUE;
1166 MSAShorterAlignment(msa, useme);
1171 /* Function: MSANogap()
1172 * Date: SRE, Wed Nov 17 09:59:51 1999 [St. Louis]
1174 * Purpose: Remove all columns from a multiple sequence alignment that
1175 * contain any gaps -- used for filtering before phylogenetic
1178 * Args: msa - the alignment
1180 * Returns: (void). The alignment is modified, so if you want to keep
1181 * the original for something, make a copy.
1186 int *useme; /* array of TRUE/FALSE flags for which columns to keep */
1187 int apos; /* position in original alignment */
1188 int idx; /* sequence index */
1190 useme = MallocOrDie(sizeof(int) * msa->alen);
1191 for (apos = 0; apos < msa->alen; apos++)
1193 for (idx = 0; idx < msa->nseq; idx++)
1194 if (isgap(msa->aseq[idx][apos]))
1196 if (idx == msa->nseq) useme[apos] = TRUE; else useme[apos] = FALSE;
1198 MSAShorterAlignment(msa, useme);
1204 /* Function: MSAShorterAlignment()
1205 * Date: SRE, Wed Nov 17 09:49:32 1999 [St. Louis]
1207 * Purpose: Given an array "useme" (0..alen-1) of TRUE/FALSE flags,
1208 * where TRUE means "keep this column in the new alignment":
1209 * Remove all columns annotated as "FALSE" in the useme
1212 * Args: msa - the alignment. The alignment is changed, so
1213 * if you don't want the original screwed up, make
1214 * a copy of it first.
1215 * useme - TRUE/FALSE flags for columns to keep: 0..alen-1
1220 MSAShorterAlignment(MSA *msa, int *useme)
1222 int apos; /* position in original alignment */
1223 int mpos; /* position in new alignment */
1224 int idx; /* sequence index */
1225 int i; /* markup index */
1227 /* Since we're minimizing, we can overwrite, using already allocated
1230 for (apos = 0, mpos = 0; apos < msa->alen; apos++)
1232 if (useme[apos] == FALSE) continue;
1234 /* shift alignment and associated per-column+per-residue markup */
1237 for (idx = 0; idx < msa->nseq; idx++)
1239 msa->aseq[idx][mpos] = msa->aseq[idx][apos];
1240 if (msa->ss != NULL && msa->ss[idx] != NULL) msa->ss[idx][mpos] = msa->ss[idx][apos];
1241 if (msa->sa != NULL && msa->sa[idx] != NULL) msa->sa[idx][mpos] = msa->sa[idx][apos];
1243 for (i = 0; i < msa->ngr; i++)
1244 if (msa->gr[i][idx] != NULL) msa->gr[i][idx][mpos] = msa->gr[i][idx][apos];
1247 if (msa->ss_cons != NULL) msa->ss_cons[mpos] = msa->ss_cons[apos];
1248 if (msa->sa_cons != NULL) msa->sa_cons[mpos] = msa->sa_cons[apos];
1249 if (msa->rf != NULL) msa->rf[mpos] = msa->rf[apos];
1251 for (i = 0; i < msa->ngc; i++)
1252 msa->gc[i][mpos] = msa->gc[i][apos];
1257 msa->alen = mpos; /* set new length */
1258 /* null terminate everything */
1259 for (idx = 0; idx < msa->nseq; idx++)
1261 msa->aseq[idx][mpos] = '\0';
1262 if (msa->ss != NULL && msa->ss[idx] != NULL) msa->ss[idx][mpos] = '\0';
1263 if (msa->sa != NULL && msa->sa[idx] != NULL) msa->sa[idx][mpos] = '\0';
1265 for (i = 0; i < msa->ngr; i++)
1266 if (msa->gr[i][idx] != NULL) msa->gr[i][idx][mpos] = '\0';
1269 if (msa->ss_cons != NULL) msa->ss_cons[mpos] = '\0';
1270 if (msa->sa_cons != NULL) msa->sa_cons[mpos] = '\0';
1271 if (msa->rf != NULL) msa->rf[mpos] = '\0';
1273 for (i = 0; i < msa->ngc; i++)
1274 msa->gc[i][mpos] = '\0';
1280 /* Function: MSASmallerAlignment()
1281 * Date: SRE, Wed Jun 30 09:56:08 1999 [St. Louis]
1283 * Purpose: Given an array "useme" of TRUE/FALSE flags for
1284 * each sequence in an alignment, construct
1285 * and return a new alignment containing only
1286 * those sequences that are flagged useme=TRUE.
1288 * Used by routines such as MSAFilterAlignment()
1289 * and MSASampleAlignment().
1292 * Does not copy unparsed Stockholm markup.
1294 * Does not make assumptions about meaning of wgt;
1295 * if you want the new wgt vector renormalized, do
1296 * it yourself with FNorm(new->wgt, new->nseq).
1298 * Args: msa -- the original (larger) alignment
1299 * useme -- [0..nseq-1] array of TRUE/FALSE flags; TRUE means include
1300 * this seq in new alignment
1301 * ret_new -- RETURN: new alignment
1304 * ret_new is allocated here; free with MSAFree()
1307 MSASmallerAlignment(MSA *msa, int *useme, MSA **ret_new)
1309 MSA *new; /* RETURN: new alignment */
1310 int nnew; /* number of seqs in new msa (e.g. # of TRUEs) */
1311 int oidx, nidx; /* old, new indices */
1315 for (oidx = 0; oidx < msa->nseq; oidx++)
1316 if (useme[oidx]) nnew++;
1317 if (nnew == 0) { *ret_new = NULL; return; }
1319 new = MSAAlloc(nnew, 0);
1321 for (oidx = 0; oidx < msa->nseq; oidx++)
1324 new->aseq[nidx] = sre_strdup(msa->aseq[oidx], msa->alen);
1325 new->sqname[nidx] = sre_strdup(msa->sqname[oidx], msa->alen);
1326 GKIStoreKey(new->index, msa->sqname[oidx]);
1327 new->wgt[nidx] = msa->wgt[oidx];
1328 if (msa->sqacc != NULL)
1329 MSASetSeqAccession(new, nidx, msa->sqacc[oidx]);
1330 if (msa->sqdesc != NULL)
1331 MSASetSeqDescription(new, nidx, msa->sqdesc[oidx]);
1332 if (msa->ss != NULL && msa->ss[oidx] != NULL)
1334 if (new->ss == NULL) new->ss = MallocOrDie(sizeof(char *) * new->nseq);
1335 new->ss[nidx] = sre_strdup(msa->ss[oidx], -1);
1337 if (msa->sa != NULL && msa->sa[oidx] != NULL)
1339 if (new->sa == NULL) new->sa = MallocOrDie(sizeof(char *) * new->nseq);
1340 new->sa[nidx] = sre_strdup(msa->sa[oidx], -1);
1346 new->alen = msa->alen;
1347 new->flags = msa->flags;
1348 new->type = msa->type;
1349 new->name = sre_strdup(msa->name, -1);
1350 new->desc = sre_strdup(msa->desc, -1);
1351 new->acc = sre_strdup(msa->acc, -1);
1352 new->au = sre_strdup(msa->au, -1);
1353 new->ss_cons = sre_strdup(msa->ss_cons, -1);
1354 new->sa_cons = sre_strdup(msa->sa_cons, -1);
1355 new->rf = sre_strdup(msa->rf, -1);
1356 for (i = 0; i < MSA_MAXCUTOFFS; i++) {
1357 new->cutoff[i] = msa->cutoff[i];
1358 new->cutoff_is_set[i] = msa->cutoff_is_set[i];
1368 /*****************************************************************
1369 * Retrieval routines
1371 * Access to MSA structure data is possible through these routines.
1372 * I'm not doing this because of object oriented design, though
1373 * it might work in my favor someday.
1374 * I'm doing this because lots of MSA data is optional, and
1375 * checking through the chain of possible NULLs is a pain.
1376 *****************************************************************/
1379 MSAGetSeqAccession(MSA *msa, int idx)
1381 if (msa->sqacc != NULL && msa->sqacc[idx] != NULL)
1382 return msa->sqacc[idx];
1387 MSAGetSeqDescription(MSA *msa, int idx)
1389 if (msa->sqdesc != NULL && msa->sqdesc[idx] != NULL)
1390 return msa->sqdesc[idx];
1395 MSAGetSeqSS(MSA *msa, int idx)
1397 if (msa->ss != NULL && msa->ss[idx] != NULL)
1398 return msa->ss[idx];
1403 MSAGetSeqSA(MSA *msa, int idx)
1405 if (msa->sa != NULL && msa->sa[idx] != NULL)
1406 return msa->sa[idx];
1412 /*****************************************************************
1413 * Information routines
1415 * Access information about the MSA.
1416 *****************************************************************/
1418 /* Function: MSAAverageSequenceLength()
1419 * Date: SRE, Sat Apr 6 09:41:34 2002 [St. Louis]
1421 * Purpose: Return the average length of the (unaligned) sequences
1424 * Args: msa - the alignment
1426 * Returns: average length
1429 MSAAverageSequenceLength(MSA *msa)
1435 for (i = 0; i < msa->nseq; i++)
1436 avg += (float) DealignedLength(msa->aseq[i]);
1438 if (msa->nseq == 0) return 0.;
1439 else return (avg / msa->nseq);