1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
12 * SRE, Mon May 17 10:48:47 1999
14 * SQUID's interface for multiple sequence alignment
15 * manipulation: access to the MSA object.
17 * RCS $Id: msa.c,v 1.1.1.1 2005/03/22 08:34:19 cmzmasek Exp $
24 #include "msa.h" /* multiple sequence alignment object support */
25 #include "gki.h" /* string indexing hashtable code */
26 #include "ssi.h" /* SSI sequence file indexing code */
28 /* Function: MSAAlloc()
29 * Date: SRE, Tue May 18 10:45:47 1999 [St. Louis]
31 * Purpose: Allocate an MSA structure, return a pointer
34 * Designed to be used in three ways:
35 * 1) We know exactly the dimensions of the alignment:
37 * msa = MSAAlloc(nseq, alen);
39 * 2) We know the number of sequences but not alen.
40 * (We add sequences later.)
41 * msa = MSAAlloc(nseq, 0);
43 * 3) We even don't know the number of sequences, so
44 * we'll have to dynamically expand allocations.
45 * We provide a blocksize for the allocation expansion,
46 * and expand when needed.
47 * msa = MSAAlloc(10, 0);
48 * if (msa->nseq == msa->nseqalloc) MSAExpand(msa);
50 * Args: nseq - number of sequences, or nseq allocation blocksize
51 * alen - length of alignment in columns, or 0
53 * Returns: pointer to new MSA object, w/ all values initialized.
54 * Note that msa->nseq is initialized to 0, though space
57 * Diagnostics: "always works". Die()'s on memory allocation failure.
61 MSAAlloc(int nseq, int alen)
66 msa = MallocOrDie(sizeof(MSA));
67 msa->aseq = MallocOrDie(sizeof(char *) * nseq);
68 msa->sqname = MallocOrDie(sizeof(char *) * nseq);
69 msa->sqlen = MallocOrDie(sizeof(int) * nseq);
70 msa->wgt = MallocOrDie(sizeof(float) * nseq);
72 for (i = 0; i < nseq; i++)
74 msa->sqname[i] = NULL;
78 if (alen != 0) msa->aseq[i] = MallocOrDie(sizeof(char) * (alen+1));
79 else msa->aseq[i] = NULL;
84 msa->nseqalloc = nseq;
88 msa->type = kOtherSeq;
102 msa->index = GKIInit();
105 /* Initialize unparsed optional markup
109 msa->alloc_ncomment = 0;
130 /* Done. Return the alloced, initialized structure
135 /* Function: MSAExpand()
136 * Date: SRE, Tue May 18 11:06:53 1999 [St. Louis]
138 * Purpose: Increase the sequence allocation in an MSA
139 * by msa->nseqlump. (Typically used when we're reading
140 * in an alignment sequentially from a file,
141 * so we don't know nseq until we're done.)
143 * Args: msa - the MSA object
153 msa->nseqalloc += msa->nseqlump;
155 msa->aseq = ReallocOrDie(msa->aseq, sizeof(char *) * msa->nseqalloc);
156 msa->sqname = ReallocOrDie(msa->sqname, sizeof(char *) * msa->nseqalloc);
157 msa->sqlen = ReallocOrDie(msa->sqlen, sizeof(char *) * msa->nseqalloc);
158 msa->wgt = ReallocOrDie(msa->wgt, sizeof(float) * msa->nseqalloc);
160 if (msa->ss != NULL) {
161 msa->ss = ReallocOrDie(msa->ss, sizeof(char *) * msa->nseqalloc);
162 msa->sslen = ReallocOrDie(msa->sslen, sizeof(int) * msa->nseqalloc);
164 if (msa->sa != NULL) {
165 msa->sa = ReallocOrDie(msa->sa, sizeof(char *) * msa->nseqalloc);
166 msa->salen = ReallocOrDie(msa->salen, sizeof(int) * msa->nseqalloc);
168 if (msa->sqacc != NULL)
169 msa->sqacc = ReallocOrDie(msa->sqacc, sizeof(char *) * msa->nseqalloc);
170 if (msa->sqdesc != NULL)
171 msa->sqdesc =ReallocOrDie(msa->sqdesc,sizeof(char *) * msa->nseqalloc);
173 for (i = msa->nseqalloc-msa->nseqlump; i < msa->nseqalloc; i++)
175 msa->sqname[i] = NULL;
178 if (msa->sqacc != NULL) msa->sqacc[i] = NULL;
179 if (msa->sqdesc != NULL) msa->sqdesc[i] = NULL;
182 msa->aseq[i] = ReallocOrDie(msa->aseq[i], sizeof(char) * (msa->alen+1));
183 else msa->aseq[i] = NULL;
186 if (msa->ss != NULL) {
188 msa->ss[i] = ReallocOrDie(msa->ss[i], sizeof(char) * (msa->alen+1));
189 else msa->ss[i] = NULL;
192 if (msa->sa != NULL) {
194 msa->sa[i] = ReallocOrDie(msa->ss[i], sizeof(char) * (msa->alen+1));
201 /* Reallocate and re-init for unparsed #=GS tags, if we have some.
202 * gs is [0..ngs-1][0..nseq-1][], so we're reallocing the middle
206 for (i = 0; i < msa->ngs; i++)
208 if (msa->gs[i] != NULL)
210 msa->gs[i] = ReallocOrDie(msa->gs[i], sizeof(char *) * msa->nseqalloc);
211 for (j = msa->nseqalloc-msa->nseqlump; j < msa->nseqalloc; j++)
212 msa->gs[i][j] = NULL;
216 /* Reallocate and re-init for unparsed #=GR tags, if we have some.
217 * gr is [0..ngs-1][0..nseq-1][], so we're reallocing the middle
221 for (i = 0; i < msa->ngr; i++)
223 if (msa->gr[i] != NULL)
225 msa->gr[i] = ReallocOrDie(msa->gr[i], sizeof(char *) * msa->nseqalloc);
226 for (j = msa->nseqalloc-msa->nseqlump; j < msa->nseqalloc; j++)
227 msa->gr[i][j] = NULL;
234 /* Function: MSAFree()
235 * Date: SRE, Tue May 18 11:20:16 1999 [St. Louis]
237 * Purpose: Free a multiple sequence alignment structure.
239 * Args: msa - the alignment
246 Free2DArray((void **) msa->aseq, msa->nseq);
247 Free2DArray((void **) msa->sqname, msa->nseq);
248 Free2DArray((void **) msa->sqacc, msa->nseq);
249 Free2DArray((void **) msa->sqdesc, msa->nseq);
250 Free2DArray((void **) msa->ss, msa->nseq);
251 Free2DArray((void **) msa->sa, msa->nseq);
253 if (msa->sqlen != NULL) free(msa->sqlen);
254 if (msa->wgt != NULL) free(msa->wgt);
256 if (msa->name != NULL) free(msa->name);
257 if (msa->desc != NULL) free(msa->desc);
258 if (msa->acc != NULL) free(msa->acc);
259 if (msa->au != NULL) free(msa->au);
260 if (msa->ss_cons != NULL) free(msa->ss_cons);
261 if (msa->sa_cons != NULL) free(msa->sa_cons);
262 if (msa->rf != NULL) free(msa->rf);
263 if (msa->sslen != NULL) free(msa->sslen);
264 if (msa->salen != NULL) free(msa->salen);
266 Free2DArray((void **) msa->comment, msa->ncomment);
267 Free2DArray((void **) msa->gf_tag, msa->ngf);
268 Free2DArray((void **) msa->gf, msa->ngf);
269 Free2DArray((void **) msa->gs_tag, msa->ngs);
270 Free3DArray((void ***)msa->gs, msa->ngs, msa->nseq);
271 Free2DArray((void **) msa->gc_tag, msa->ngc);
272 Free2DArray((void **) msa->gc, msa->ngc);
273 Free2DArray((void **) msa->gr_tag, msa->ngr);
274 Free3DArray((void ***)msa->gr, msa->ngr, msa->nseq);
277 GKIFree(msa->gs_idx);
278 GKIFree(msa->gc_idx);
279 GKIFree(msa->gr_idx);
285 /* Function: MSASetSeqAccession()
286 * Date: SRE, Mon Jun 21 04:13:33 1999 [Sanger Centre]
288 * Purpose: Set a sequence accession in an MSA structure.
289 * Handles some necessary allocation/initialization.
291 * Args: msa - multiple alignment to add accession to
292 * seqidx - index of sequence to attach accession to
298 MSASetSeqAccession(MSA *msa, int seqidx, char *acc)
302 if (msa->sqacc == NULL) {
303 msa->sqacc = MallocOrDie(sizeof(char *) * msa->nseqalloc);
304 for (x = 0; x < msa->nseqalloc; x++)
305 msa->sqacc[x] = NULL;
307 msa->sqacc[seqidx] = sre_strdup(acc, -1);
310 /* Function: MSASetSeqDescription()
311 * Date: SRE, Mon Jun 21 04:21:09 1999 [Sanger Centre]
313 * Purpose: Set a sequence description in an MSA structure.
314 * Handles some necessary allocation/initialization.
316 * Args: msa - multiple alignment to add accession to
317 * seqidx - index of sequence to attach accession to
323 MSASetSeqDescription(MSA *msa, int seqidx, char *desc)
327 if (msa->sqdesc == NULL) {
328 msa->sqdesc = MallocOrDie(sizeof(char *) * msa->nseqalloc);
329 for (x = 0; x < msa->nseqalloc; x++)
330 msa->sqdesc[x] = NULL;
332 msa->sqdesc[seqidx] = sre_strdup(desc, -1);
336 /* Function: MSAAddComment()
337 * Date: SRE, Tue Jun 1 17:37:21 1999 [St. Louis]
339 * Purpose: Add an (unparsed) comment line to the MSA structure,
340 * allocating as necessary.
342 * Args: msa - a multiple alignment
343 * s - comment line to add
348 MSAAddComment(MSA *msa, char *s)
350 /* If this is our first recorded comment, we need to malloc();
351 * and if we've filled available space, we need to realloc().
352 * Note the arbitrary lumpsize of 10 lines per allocation...
354 if (msa->comment == NULL) {
355 msa->comment = MallocOrDie (sizeof(char *) * 10);
356 msa->alloc_ncomment = 10;
358 if (msa->ncomment == msa->alloc_ncomment) {
359 msa->alloc_ncomment += 10;
360 msa->comment = ReallocOrDie(msa->comment, sizeof(char *) * msa->alloc_ncomment);
363 msa->comment[msa->ncomment] = sre_strdup(s, -1);
368 /* Function: MSAAddGF()
369 * Date: SRE, Wed Jun 2 06:53:54 1999 [bus to Madison]
371 * Purpose: Add an unparsed #=GF markup line to the MSA
372 * structure, allocating as necessary.
374 * Args: msa - a multiple alignment
375 * tag - markup tag (e.g. "AU")
376 * value - free text markup (e.g. "Alex Bateman")
381 MSAAddGF(MSA *msa, char *tag, char *value)
383 /* If this is our first recorded unparsed #=GF line, we need to malloc();
384 * if we've filled availabl space If we already have a hash index, and the GF
385 * Note the arbitrary lumpsize of 10 lines per allocation...
387 if (msa->gf_tag == NULL) {
388 msa->gf_tag = MallocOrDie (sizeof(char *) * 10);
389 msa->gf = MallocOrDie (sizeof(char *) * 10);
392 if (msa->ngf == msa->alloc_ngf) {
393 msa->alloc_ngf += 10;
394 msa->gf_tag = ReallocOrDie(msa->gf_tag, sizeof(char *) * msa->alloc_ngf);
395 msa->gf = ReallocOrDie(msa->gf, sizeof(char *) * msa->alloc_ngf);
398 msa->gf_tag[msa->ngf] = sre_strdup(tag, -1);
399 msa->gf[msa->ngf] = sre_strdup(value, -1);
406 /* Function: MSAAddGS()
407 * Date: SRE, Wed Jun 2 06:57:03 1999 [St. Louis]
409 * Purpose: Add an unparsed #=GS markup line to the MSA
410 * structure, allocating as necessary.
412 * It's possible that we could get more than one
413 * of the same type of GS tag per sequence; for
414 * example, "DR PDB;" structure links in Pfam.
415 * Hack: handle these by appending to the string,
416 * in a \n separated fashion.
418 * Args: msa - multiple alignment structure
419 * tag - markup tag (e.g. "AC")
420 * sqidx - index of sequence to assoc markup with (0..nseq-1)
421 * value - markup (e.g. "P00666")
423 * Returns: 0 on success
426 MSAAddGS(MSA *msa, char *tag, int sqidx, char *value)
431 /* Is this an unparsed tag name that we recognize?
432 * If not, handle adding it to index, and reallocating
435 if (msa->gs_tag == NULL) /* first tag? init w/ malloc */
437 msa->gs_idx = GKIInit();
438 tagidx = GKIStoreKey(msa->gs_idx, tag);
439 SQD_DASSERT1((tagidx == 0));
440 msa->gs_tag = MallocOrDie(sizeof(char *));
441 msa->gs = MallocOrDie(sizeof(char **));
442 msa->gs[0] = MallocOrDie(sizeof(char *) * msa->nseqalloc);
443 for (i = 0; i < msa->nseqalloc; i++)
444 msa->gs[0][i] = NULL;
449 tagidx = GKIKeyIndex(msa->gs_idx, tag);
450 if (tagidx < 0) { /* it's a new tag name; realloc */
451 tagidx = GKIStoreKey(msa->gs_idx, tag);
452 /* since we alloc in blocks of 1,
453 we always realloc upon seeing
455 SQD_DASSERT1((tagidx == msa->ngs));
456 msa->gs_tag = ReallocOrDie(msa->gs_tag, (msa->ngs+1) + sizeof(char *));
457 msa->gs = ReallocOrDie(msa->gs, (msa->ngs+1) + sizeof(char **));
458 msa->gs[msa->ngs] = MallocOrDie(sizeof(char *) * msa->nseqalloc);
459 for (i = 0; i < msa->nseqalloc; i++)
460 msa->gs[msa->ngs][i] = NULL;
464 if (tagidx == msa->ngs) {
465 msa->gs_tag[tagidx] = sre_strdup(tag, -1);
469 if (msa->gs[tagidx][sqidx] == NULL) /* first annotation of this seq with this tag? */
470 msa->gs[tagidx][sqidx] = sre_strdup(value, -1);
472 /* >1 annotation of this seq with this tag; append */
474 if ((len = sre_strcat(&(msa->gs[tagidx][sqidx]), -1, "\n", 1)) < 0)
475 Die("failed to sre_strcat()");
476 if (sre_strcat(&(msa->gs[tagidx][sqidx]), len, value, -1) < 0)
477 Die("failed to sre_strcat()");
482 /* Function: MSAAppendGC()
483 * Date: SRE, Thu Jun 3 06:25:14 1999 [Madison]
485 * Purpose: Add an unparsed #=GC markup line to the MSA
486 * structure, allocating as necessary.
488 * When called multiple times for the same tag,
489 * appends value strings together -- used when
490 * parsing multiblock alignment files, for
493 * Args: msa - multiple alignment structure
494 * tag - markup tag (e.g. "CS")
495 * value - markup, one char per aligned column
500 MSAAppendGC(MSA *msa, char *tag, char *value)
504 /* Is this an unparsed tag name that we recognize?
505 * If not, handle adding it to index, and reallocating
508 if (msa->gc_tag == NULL) /* first tag? init w/ malloc */
510 msa->gc_tag = MallocOrDie(sizeof(char *));
511 msa->gc = MallocOrDie(sizeof(char **));
512 msa->gc_idx = GKIInit();
513 tagidx = GKIStoreKey(msa->gc_idx, tag);
514 SQD_DASSERT1((tagidx == 0));
519 tagidx = GKIKeyIndex(msa->gc_idx, tag);
520 if (tagidx < 0) { /* it's a new tag name; realloc */
521 tagidx = GKIStoreKey(msa->gc_idx, tag);
522 /* since we alloc in blocks of 1,
523 we always realloc upon seeing
525 SQD_DASSERT1((tagidx == msa->ngc));
526 msa->gc_tag = ReallocOrDie(msa->gc_tag, (msa->ngc+1) + sizeof(char *));
527 msa->gc = ReallocOrDie(msa->gc, (msa->ngc+1) + sizeof(char **));
528 msa->gc[tagidx] = NULL;
532 if (tagidx == msa->ngc) {
533 msa->gc_tag[tagidx] = sre_strdup(tag, -1);
536 sre_strcat(&(msa->gc[tagidx]), -1, value, -1);
540 /* Function: MSAGetGC()
541 * Date: SRE, Fri Aug 13 13:25:57 1999 [St. Louis]
543 * Purpose: Given a tagname for a miscellaneous #=GC column
544 * annotation, return a pointer to the annotation
547 * Args: msa - alignment and its annotation
548 * tag - name of the annotation
550 * Returns: ptr to the annotation string. Caller does *not*
551 * free; is managed by msa object still.
554 MSAGetGC(MSA *msa, char *tag)
558 if (msa->gc_idx == NULL) return NULL;
559 if ((tagidx = GKIKeyIndex(msa->gc_idx, tag)) < 0) return NULL;
560 return msa->gc[tagidx];
564 /* Function: MSAAppendGR()
565 * Date: SRE, Thu Jun 3 06:34:38 1999 [Madison]
567 * Purpose: Add an unparsed #=GR markup line to the
568 * MSA structure, allocating as necessary.
570 * When called multiple times for the same tag,
571 * appends value strings together -- used when
572 * parsing multiblock alignment files, for
575 * Args: msa - multiple alignment structure
576 * tag - markup tag (e.g. "SS")
577 * sqidx - index of seq to assoc markup with (0..nseq-1)
578 * value - markup, one char per aligned column
583 MSAAppendGR(MSA *msa, char *tag, int sqidx, char *value)
588 /* Is this an unparsed tag name that we recognize?
589 * If not, handle adding it to index, and reallocating
592 if (msa->gr_tag == NULL) /* first tag? init w/ malloc */
594 msa->gr_tag = MallocOrDie(sizeof(char *));
595 msa->gr = MallocOrDie(sizeof(char **));
596 msa->gr[0] = MallocOrDie(sizeof(char *) * msa->nseqalloc);
597 msa->gr_idx = GKIInit();
598 tagidx = GKIStoreKey(msa->gr_idx, tag);
599 SQD_DASSERT1((tagidx == 0));
604 tagidx = GKIKeyIndex(msa->gr_idx, tag);
605 if (tagidx < 0) { /* it's a new tag name; realloc */
606 tagidx = GKIStoreKey(msa->gr_idx, tag);
607 /* since we alloc in blocks of 1,
608 we always realloc upon seeing
610 SQD_DASSERT1((tagidx == msa->ngr));
611 msa->gr_tag = ReallocOrDie(msa->gr_tag, (msa->ngr+1) + sizeof(char *));
612 msa->gr = ReallocOrDie(msa->gr, (msa->ngr+1) + sizeof(char **));
613 msa->gr[msa->ngr] = MallocOrDie(sizeof(char *) * msa->nseqalloc);
614 for (i = 0; i < msa->nseqalloc; i++)
615 msa->gr[msa->ngr][i] = NULL;
619 if (tagidx == msa->ngr) {
620 msa->gr_tag[tagidx] = sre_strdup(tag, -1);
623 sre_strcat(&(msa->gr[tagidx][sqidx]), -1, value, -1);
628 /* Function: MSAVerifyParse()
629 * Date: SRE, Sat Jun 5 14:24:24 1999 [Madison, 1999 worm mtg]
631 * Purpose: Last function called after a multiple alignment is
632 * parsed. Checks that parse was successful; makes sure
633 * required information is present; makes sure required
634 * information is consistent. Some fields that are
635 * only use during parsing may be freed (sqlen, for
638 * Some fields in msa may be modified (msa->alen is set,
641 * Args: msa - the multiple alignment
642 * sqname, aseq must be set
643 * nseq must be correct
644 * alen need not be set; will be set here.
645 * wgt will be set here if not already set
648 * Will Die() here with diagnostics on error.
653 MSAVerifyParse(MSA *msa)
657 if (msa->nseq == 0) Die("Parse error: no sequences were found for alignment %s",
658 msa->name != NULL ? msa->name : "");
660 msa->alen = msa->sqlen[0];
662 /* We can rely on msa->sqname[] being valid for any index,
663 * because of the way the line parsers always store any name
664 * they add to the index.
666 for (idx = 0; idx < msa->nseq; idx++)
668 /* aseq is required. */
669 if (msa->aseq[idx] == NULL)
670 Die("Parse error: No sequence for %s in alignment %s", msa->sqname[idx],
671 msa->name != NULL ? msa->name : "");
672 /* either all weights must be set, or none of them */
673 if ((msa->flags & MSA_SET_WGT) && msa->wgt[idx] == -1.0)
674 Die("Parse error: some weights are set, but %s doesn't have one in alignment %s",
676 msa->name != NULL ? msa->name : "");
677 /* all aseq must be same length. */
678 if (msa->sqlen[idx] != msa->alen)
679 Die("Parse error: sequence %s: length %d, expected %d in alignment %s",
680 msa->sqname[idx], msa->sqlen[idx], msa->alen,
681 msa->name != NULL ? msa->name : "");
682 /* if SS is present, must have length right */
683 if (msa->ss != NULL && msa->ss[idx] != NULL && msa->sslen[idx] != msa->alen)
684 Die("Parse error: #=GR SS annotation for %s: length %d, expected %d in alignment %s",
685 msa->sqname[idx], msa->sslen[idx], msa->alen,
686 msa->name != NULL ? msa->name : "");
687 /* if SA is present, must have length right */
688 if (msa->sa != NULL && msa->sa[idx] != NULL && msa->salen[idx] != msa->alen)
689 Die("Parse error: #=GR SA annotation for %s: length %d, expected %d in alignment %s",
690 msa->sqname[idx], msa->salen[idx], msa->alen,
691 msa->name != NULL ? msa->name : "");
694 /* if cons SS is present, must have length right */
695 if (msa->ss_cons != NULL && strlen(msa->ss_cons) != msa->alen)
696 Die("Parse error: #=GC SS_cons annotation: length %d, expected %d in alignment %s",
697 strlen(msa->ss_cons), msa->alen,
698 msa->name != NULL ? msa->name : "");
700 /* if cons SA is present, must have length right */
701 if (msa->sa_cons != NULL && strlen(msa->sa_cons) != msa->alen)
702 Die("Parse error: #=GC SA_cons annotation: length %d, expected %d in alignment %s",
703 strlen(msa->sa_cons), msa->alen,
704 msa->name != NULL ? msa->name : "");
706 /* if RF is present, must have length right */
707 if (msa->rf != NULL && strlen(msa->rf) != msa->alen)
708 Die("Parse error: #=GC RF annotation: length %d, expected %d in alignment %s",
709 strlen(msa->rf), msa->alen,
710 msa->name != NULL ? msa->name : "");
712 /* Check that all or no weights are set */
713 if (!(msa->flags & MSA_SET_WGT))
714 FSet(msa->wgt, msa->nseq, 1.0); /* default weights */
716 /* Clean up a little from the parser */
717 if (msa->sqlen != NULL) { free(msa->sqlen); msa->sqlen = NULL; }
718 if (msa->sslen != NULL) { free(msa->sslen); msa->sslen = NULL; }
719 if (msa->salen != NULL) { free(msa->salen); msa->salen = NULL; }
727 /* Function: MSAFileOpen()
728 * Date: SRE, Tue May 18 13:22:01 1999 [St. Louis]
730 * Purpose: Open an alignment database file and prepare
731 * for reading one alignment, or sequentially
732 * in the (rare) case of multiple MSA databases
733 * (e.g. Stockholm format).
735 * Args: filename - name of file to open
737 * if it ends in ".gz", read from pipe to gunzip -dc
738 * format - format of file (e.g. MSAFILE_STOCKHOLM)
739 * env - environment variable for path (e.g. BLASTDB)
741 * Returns: opened MSAFILE * on success.
743 * usually, because the file doesn't exist;
744 * for gzip'ed files, may also mean that gzip isn't in the path.
747 MSAFileOpen(char *filename, int format, char *env)
751 afp = MallocOrDie(sizeof(MSAFILE));
752 if (strcmp(filename, "-") == 0)
755 afp->do_stdin = TRUE;
756 afp->do_gzip = FALSE;
757 afp->fname = sre_strdup("[STDIN]", -1);
758 afp->ssi = NULL; /* can't index stdin because we can't seek*/
760 #ifndef SRE_STRICT_ANSI
761 /* popen(), pclose() aren't portable to non-POSIX systems; disable */
762 else if (Strparse("^.*\\.gz$", filename, 0))
766 /* Note that popen() will return "successfully"
767 * if file doesn't exist, because gzip works fine
768 * and prints an error! So we have to check for
769 * existence of file ourself.
771 if (! FileExists(filename))
772 Die("%s: file does not exist", filename);
773 if (strlen(filename) + strlen("gzip -dc ") >= 256)
774 Die("filename > 255 char in MSAFileOpen()");
775 sprintf(cmd, "gzip -dc %s", filename);
776 if ((afp->f = popen(cmd, "r")) == NULL)
779 afp->do_stdin = FALSE;
781 afp->fname = sre_strdup(filename, -1);
782 /* we can't index a .gz file, because we can't seek in a pipe afaik */
785 #endif /*SRE_STRICT_ANSI*/
791 /* When we open a file, it may be either in the current
792 * directory, or in the directory indicated by the env
793 * argument - and we have to construct the SSI filename accordingly.
795 if ((afp->f = fopen(filename, "r")) != NULL)
797 ssifile = MallocOrDie(sizeof(char) * (strlen(filename) + 5));
798 sprintf(ssifile, "%s.ssi", filename);
800 else if ((afp->f = EnvFileOpen(filename, env, &dir)) != NULL)
803 full = FileConcat(dir, filename);
804 ssifile = MallocOrDie(sizeof(char) * (strlen(full) + strlen(filename) + 5));
805 sprintf(ssifile, "%s.ssi", full);
810 afp->do_stdin = FALSE;
811 afp->do_gzip = FALSE;
812 afp->fname = sre_strdup(filename, -1);
815 /* Open the SSI index file. If it doesn't exist, or
816 * it's corrupt, or some error happens, afp->ssi stays NULL.
818 SSIOpen(ssifile, &(afp->ssi));
822 /* Invoke autodetection if we haven't already been told what
825 if (format == MSAFILE_UNKNOWN)
827 if (afp->do_stdin == TRUE || afp->do_gzip)
828 Die("Can't autodetect alignment file format from a stdin or gzip pipe");
829 format = MSAFileFormat(afp);
830 if (format == MSAFILE_UNKNOWN)
831 Die("Can't determine format of multiple alignment file %s", afp->fname);
834 afp->format = format;
843 /* Function: MSAFilePositionByKey()
844 * MSAFilePositionByIndex()
847 * Date: SRE, Tue Nov 9 19:02:54 1999 [St. Louis]
849 * Purpose: Family of functions for repositioning in
850 * open MSA files; analogous to a similarly
851 * named function series in HMMER's hmmio.c.
853 * Args: afp - open alignment file
854 * offset - disk offset in bytes
855 * key - key to look up in SSI indices
856 * idx - index of alignment.
858 * Returns: 0 on failure.
860 * If called on a non-fseek()'able file (e.g. a gzip'ed
861 * or pipe'd alignment), returns 0 as a failure flag.
864 MSAFileRewind(MSAFILE *afp)
866 if (afp->do_gzip || afp->do_stdin) return 0;
871 MSAFilePositionByKey(MSAFILE *afp, char *key)
873 int fh; /* filehandle is ignored */
874 SSIOFFSET offset; /* offset of the key alignment */
876 if (afp->ssi == NULL) return 0;
877 if (SSIGetOffsetByName(afp->ssi, key, &fh, &offset) != 0) return 0;
878 if (SSISetFilePosition(afp->f, &offset) != 0) return 0;
882 MSAFilePositionByIndex(MSAFILE *afp, int idx)
884 int fh; /* filehandled is passed but ignored */
885 SSIOFFSET offset; /* disk offset of desired alignment */
887 if (afp->ssi == NULL) return 0;
888 if (SSIGetOffsetByNumber(afp->ssi, idx, &fh, &offset) != 0) return 0;
889 if (SSISetFilePosition(afp->f, &offset) != 0) return 0;
894 /* Function: MSAFileRead()
895 * Date: SRE, Fri May 28 16:01:43 1999 [St. Louis]
897 * Purpose: Read the next msa from an open alignment file.
898 * This is a wrapper around format-specific calls.
900 * Args: afp - open alignment file
902 * Returns: next alignment, or NULL if out of alignments
905 MSAFileRead(MSAFILE *afp)
909 switch (afp->format) {
910 case MSAFILE_STOCKHOLM: msa = ReadStockholm(afp); break;
911 case MSAFILE_MSF: msa = ReadMSF(afp); break;
912 case MSAFILE_A2M: msa = ReadA2M(afp); break;
913 case MSAFILE_CLUSTAL: msa = ReadClustal(afp); break;
914 case MSAFILE_SELEX: msa = ReadSELEX(afp); break;
915 case MSAFILE_PHYLIP: msa = ReadPhylip(afp); break;
917 Die("MSAFILE corrupted: bad format index");
922 /* Function: MSAFileClose()
923 * Date: SRE, Tue May 18 14:05:28 1999 [St. Louis]
925 * Purpose: Close an open MSAFILE.
927 * Args: afp - ptr to an open MSAFILE.
932 MSAFileClose(MSAFILE *afp)
934 #ifndef SRE_STRICT_ANSI /* gzip functionality only on POSIX systems */
935 if (afp->do_gzip) pclose(afp->f);
937 if (! afp->do_stdin) fclose(afp->f);
938 if (afp->buf != NULL) free(afp->buf);
939 if (afp->ssi != NULL) SSIClose(afp->ssi);
940 if (afp->fname != NULL) free(afp->fname);
945 MSAFileGetLine(MSAFILE *afp)
948 if ((s = sre_fgets(&(afp->buf), &(afp->buflen), afp->f)) == NULL)
955 MSAFileWrite(FILE *fp, MSA *msa, int outfmt, int do_oneline)
958 case MSAFILE_A2M: WriteA2M(stdout, msa); break;
959 case MSAFILE_CLUSTAL: WriteClustal(stdout, msa); break;
960 case MSAFILE_MSF: WriteMSF(stdout, msa); break;
961 case MSAFILE_PHYLIP: WritePhylip(stdout, msa); break;
962 case MSAFILE_SELEX: WriteSELEX(stdout, msa); break;
963 case MSAFILE_STOCKHOLM:
964 if (do_oneline) WriteStockholmOneBlock(stdout, msa);
965 else WriteStockholm(stdout, msa);
968 Die("can't write. no such alignment format %d\n", outfmt);
972 /* Function: MSAGetSeqidx()
973 * Date: SRE, Wed May 19 15:08:25 1999 [St. Louis]
975 * Purpose: From a sequence name, return seqidx appropriate
976 * for an MSA structure.
978 * 1) try to guess the index. (pass -1 if you can't guess)
979 * 2) Look up name in msa's hashtable.
980 * 3) If it's a new name, store in msa's hashtable;
981 * expand allocs as needed;
984 * Args: msa - alignment object
985 * name - a sequence name
986 * guess - a guess at the right index, or -1 if no guess.
991 MSAGetSeqidx(MSA *msa, char *name, int guess)
995 if (guess >= 0 && guess < msa->nseq && strcmp(name, msa->sqname[guess]) == 0)
997 /* else, a lookup in the index */
998 if ((seqidx = GKIKeyIndex(msa->index, name)) >= 0)
1000 /* else, it's a new name */
1001 seqidx = GKIStoreKey(msa->index, name);
1002 if (seqidx >= msa->nseqalloc) MSAExpand(msa);
1004 msa->sqname[seqidx] = sre_strdup(name, -1);
1010 /* Function: MSAFromAINFO()
1011 * Date: SRE, Mon Jun 14 11:22:24 1999 [St. Louis]
1013 * Purpose: Convert the old aseq/ainfo alignment structure
1014 * to new MSA structure. Enables more rapid conversion
1015 * of codebase to the new world order.
1017 * Args: aseq - [0..nseq-1][0..alen-1] alignment
1018 * ainfo - old-style optional info
1023 MSAFromAINFO(char **aseq, AINFO *ainfo)
1028 msa = MSAAlloc(ainfo->nseq, ainfo->alen);
1029 for (i = 0; i < ainfo->nseq; i++)
1031 strcpy(msa->aseq[i], aseq[i]);
1032 msa->wgt[i] = ainfo->wgt[i];
1033 msa->sqname[i] = sre_strdup(ainfo->sqinfo[i].name, -1);
1034 msa->sqlen[i] = msa->alen;
1035 GKIStoreKey(msa->index, msa->sqname[i]);
1037 if (ainfo->sqinfo[i].flags & SQINFO_ACC)
1038 MSASetSeqAccession(msa, i, ainfo->sqinfo[i].acc);
1040 if (ainfo->sqinfo[i].flags & SQINFO_DESC)
1041 MSASetSeqDescription(msa, i, ainfo->sqinfo[i].desc);
1043 if (ainfo->sqinfo[i].flags & SQINFO_SS) {
1044 if (msa->ss == NULL) {
1045 msa->ss = MallocOrDie(sizeof(char *) * msa->nseqalloc);
1046 msa->sslen = MallocOrDie(sizeof(int) * msa->nseqalloc);
1047 for (j = 0; j < msa->nseqalloc; j++) {
1052 MakeAlignedString(msa->aseq[i], msa->alen, ainfo->sqinfo[i].ss, &(msa->ss[i]));
1053 msa->sslen[i] = msa->alen;
1056 if (ainfo->sqinfo[i].flags & SQINFO_SA) {
1057 if (msa->sa == NULL) {
1058 msa->sa = MallocOrDie(sizeof(char *) * msa->nseqalloc);
1059 msa->salen = MallocOrDie(sizeof(int) * msa->nseqalloc);
1060 for (j = 0; j < msa->nseqalloc; j++) {
1065 MakeAlignedString(msa->aseq[i], msa->alen, ainfo->sqinfo[i].sa, &(msa->sa[i]));
1066 msa->salen[i] = msa->alen;
1069 /* note that sre_strdup() returns NULL when passed NULL */
1070 msa->name = sre_strdup(ainfo->name, -1);
1071 msa->desc = sre_strdup(ainfo->desc, -1);
1072 msa->acc = sre_strdup(ainfo->acc, -1);
1073 msa->au = sre_strdup(ainfo->au, -1);
1074 msa->ss_cons = sre_strdup(ainfo->cs, -1);
1075 msa->rf = sre_strdup(ainfo->rf, -1);
1076 if (ainfo->flags & AINFO_TC)
1077 { msa->tc1 = ainfo->tc1; msa->tc2 = ainfo->tc2; msa->flags |= MSA_SET_TC; }
1078 if (ainfo->flags & AINFO_NC)
1079 { msa->nc1 = ainfo->nc1; msa->nc2 = ainfo->nc2; msa->flags |= MSA_SET_NC; }
1080 if (ainfo->flags & AINFO_GA)
1081 { msa->ga1 = ainfo->ga1; msa->ga2 = ainfo->ga2; msa->flags |= MSA_SET_GA; }
1083 msa->nseq = ainfo->nseq;
1084 msa->alen = ainfo->alen;
1091 /* Function: MSAFileFormat()
1092 * Date: SRE, Fri Jun 18 14:26:49 1999 [Sanger Centre]
1094 * Purpose: (Attempt to) determine the format of an alignment file.
1095 * Since it rewinds the file pointer when it's done,
1096 * cannot be used on a pipe or gzip'ed file. Works by
1097 * calling SeqfileFormat() from sqio.c, then making sure
1098 * that the format is indeed an alignment. If the format
1099 * comes back as FASTA, it assumes that the format as A2M
1100 * (e.g. aligned FASTA).
1102 * Args: fname - file to evaluate
1104 * Returns: format code; e.g. MSAFILE_STOCKHOLM
1107 MSAFileFormat(MSAFILE *afp)
1111 fmt = SeqfileFormat(afp->f);
1113 if (fmt == SQFILE_FASTA) fmt = MSAFILE_A2M;
1115 if (fmt != MSAFILE_UNKNOWN && ! IsAlignmentFormat(fmt))
1116 Die("File %s does not appear to be an alignment file;\n\
1117 rather, it appears to be an unaligned file in %s format.\n\
1118 I'm expecting an alignment file in this context.\n",
1120 SeqfileFormat2String(fmt));
1125 /* Function: MSAMingap()
1126 * Date: SRE, Mon Jun 28 18:57:54 1999 [on jury duty, St. Louis Civil Court]
1128 * Purpose: Remove all-gap columns from a multiple sequence alignment
1129 * and its associated per-residue data.
1131 * Args: msa - the alignment
1138 int *useme; /* array of TRUE/FALSE flags for which columns to keep */
1139 int apos; /* position in original alignment */
1140 int idx; /* sequence index */
1142 useme = MallocOrDie(sizeof(int) * msa->alen);
1143 for (apos = 0; apos < msa->alen; apos++)
1145 for (idx = 0; idx < msa->nseq; idx++)
1146 if (! isgap(msa->aseq[idx][apos]))
1148 if (idx == msa->nseq) useme[apos] = FALSE; else useme[apos] = TRUE;
1150 MSAShorterAlignment(msa, useme);
1155 /* Function: MSANogap()
1156 * Date: SRE, Wed Nov 17 09:59:51 1999 [St. Louis]
1158 * Purpose: Remove all columns from a multiple sequence alignment that
1159 * contain any gaps -- used for filtering before phylogenetic
1162 * Args: msa - the alignment
1164 * Returns: (void). The alignment is modified, so if you want to keep
1165 * the original for something, make a copy.
1170 int *useme; /* array of TRUE/FALSE flags for which columns to keep */
1171 int apos; /* position in original alignment */
1172 int idx; /* sequence index */
1174 useme = MallocOrDie(sizeof(int) * msa->alen);
1175 for (apos = 0; apos < msa->alen; apos++)
1177 for (idx = 0; idx < msa->nseq; idx++)
1178 if (isgap(msa->aseq[idx][apos]))
1180 if (idx == msa->nseq) useme[apos] = TRUE; else useme[apos] = FALSE;
1182 MSAShorterAlignment(msa, useme);
1188 /* Function: MSAShorterAlignment()
1189 * Date: SRE, Wed Nov 17 09:49:32 1999 [St. Louis]
1191 * Purpose: Given an array "useme" (0..alen-1) of TRUE/FALSE flags,
1192 * where TRUE means "keep this column in the new alignment":
1193 * Remove all columns annotated as "FALSE" in the useme
1196 * Args: msa - the alignment. The alignment is changed, so
1197 * if you don't want the original screwed up, make
1198 * a copy of it first.
1199 * useme - TRUE/FALSE flags for columns to keep: 0..alen-1
1204 MSAShorterAlignment(MSA *msa, int *useme)
1206 int apos; /* position in original alignment */
1207 int mpos; /* position in new alignment */
1208 int idx; /* sequence index */
1209 int i; /* markup index */
1211 /* Since we're minimizing, we can overwrite, using already allocated
1214 for (apos = 0, mpos = 0; apos < msa->alen; apos++)
1216 if (useme[apos] == FALSE) continue;
1218 /* shift alignment and associated per-column+per-residue markup */
1221 for (idx = 0; idx < msa->nseq; idx++)
1223 msa->aseq[idx][mpos] = msa->aseq[idx][apos];
1224 if (msa->ss != NULL && msa->ss[idx] != NULL) msa->ss[idx][mpos] = msa->ss[idx][apos];
1225 if (msa->sa != NULL && msa->sa[idx] != NULL) msa->sa[idx][mpos] = msa->sa[idx][apos];
1227 for (i = 0; i < msa->ngr; i++)
1228 if (msa->gr[i][idx] != NULL) msa->gr[i][idx][mpos] = msa->gr[i][idx][apos];
1231 if (msa->ss_cons != NULL) msa->ss_cons[mpos] = msa->ss_cons[apos];
1232 if (msa->sa_cons != NULL) msa->sa_cons[mpos] = msa->sa_cons[apos];
1233 if (msa->rf != NULL) msa->rf[mpos] = msa->rf[apos];
1235 for (i = 0; i < msa->ngc; i++)
1236 msa->gc[i][mpos] = msa->gc[i][apos];
1241 msa->alen = mpos; /* set new length */
1242 /* null terminate everything */
1243 for (idx = 0; idx < msa->nseq; idx++)
1245 msa->aseq[idx][mpos] = '\0';
1246 if (msa->ss != NULL && msa->ss[idx] != NULL) msa->ss[idx][mpos] = '\0';
1247 if (msa->sa != NULL && msa->sa[idx] != NULL) msa->sa[idx][mpos] = '\0';
1249 for (i = 0; i < msa->ngr; i++)
1250 if (msa->gr[i][idx] != NULL) msa->gr[i][idx][mpos] = '\0';
1253 if (msa->ss_cons != NULL) msa->ss_cons[mpos] = '\0';
1254 if (msa->sa_cons != NULL) msa->sa_cons[mpos] = '\0';
1255 if (msa->rf != NULL) msa->rf[mpos] = '\0';
1257 for (i = 0; i < msa->ngc; i++)
1258 msa->gc[i][mpos] = '\0';
1264 /* Function: MSASmallerAlignment()
1265 * Date: SRE, Wed Jun 30 09:56:08 1999 [St. Louis]
1267 * Purpose: Given an array "useme" of TRUE/FALSE flags for
1268 * each sequence in an alignment, construct
1269 * and return a new alignment containing only
1270 * those sequences that are flagged useme=TRUE.
1272 * Used by routines such as MSAFilterAlignment()
1273 * and MSASampleAlignment().
1276 * Does not copy unparsed Stockholm markup.
1278 * Does not make assumptions about meaning of wgt;
1279 * if you want the new wgt vector renormalized, do
1280 * it yourself with FNorm(new->wgt, new->nseq).
1282 * Args: msa -- the original (larger) alignment
1283 * useme -- [0..nseq-1] array of TRUE/FALSE flags; TRUE means include
1284 * this seq in new alignment
1285 * ret_new -- RETURN: new alignment
1288 * ret_new is allocated here; free with MSAFree()
1291 MSASmallerAlignment(MSA *msa, int *useme, MSA **ret_new)
1293 MSA *new; /* RETURN: new alignment */
1294 int nnew; /* number of seqs in new msa (e.g. # of TRUEs) */
1295 int oidx, nidx; /* old, new indices */
1298 for (oidx = 0; oidx < msa->nseq; oidx++)
1299 if (useme[oidx]) nnew++;
1300 if (nnew == 0) { *ret_new = NULL; return; }
1302 new = MSAAlloc(nnew, 0);
1304 for (oidx = 0; oidx < msa->nseq; oidx++)
1307 new->aseq[nidx] = sre_strdup(msa->aseq[oidx], msa->alen);
1308 new->sqname[nidx] = sre_strdup(msa->sqname[oidx], msa->alen);
1309 GKIStoreKey(new->index, msa->sqname[oidx]);
1310 new->wgt[nidx] = msa->wgt[oidx];
1311 if (msa->sqacc != NULL)
1312 MSASetSeqAccession(new, nidx, msa->sqacc[oidx]);
1313 if (msa->sqdesc != NULL)
1314 MSASetSeqDescription(new, nidx, msa->sqdesc[oidx]);
1315 if (msa->ss != NULL && msa->ss[oidx] != NULL)
1317 if (new->ss == NULL) new->ss = MallocOrDie(sizeof(char *) * new->nseq);
1318 new->ss[nidx] = sre_strdup(msa->ss[oidx], -1);
1320 if (msa->sa != NULL && msa->sa[oidx] != NULL)
1322 if (new->sa == NULL) new->sa = MallocOrDie(sizeof(char *) * new->nseq);
1323 new->sa[nidx] = sre_strdup(msa->sa[oidx], -1);
1329 new->alen = msa->alen;
1330 new->flags = msa->flags;
1331 new->type = msa->type;
1332 new->name = sre_strdup(msa->name, -1);
1333 new->desc = sre_strdup(msa->desc, -1);
1334 new->acc = sre_strdup(msa->acc, -1);
1335 new->au = sre_strdup(msa->au, -1);
1336 new->ss_cons = sre_strdup(msa->ss_cons, -1);
1337 new->sa_cons = sre_strdup(msa->sa_cons, -1);
1338 new->rf = sre_strdup(msa->rf, -1);
1339 new->tc1 = msa->tc1;
1340 new->tc2 = msa->tc2;
1341 new->nc1 = msa->nc1;
1342 new->nc2 = msa->nc2;
1343 new->ga1 = msa->ga1;
1344 new->ga2 = msa->ga2;
1353 /*****************************************************************
1354 * Retrieval routines
1356 * Access to MSA structure data is possible through these routines.
1357 * I'm not doing this because of object oriented design, though
1358 * it might work in my favor someday.
1359 * I'm doing this because lots of MSA data is optional, and
1360 * checking through the chain of possible NULLs is a pain.
1361 *****************************************************************/
1364 MSAGetSeqAccession(MSA *msa, int idx)
1366 if (msa->sqacc != NULL && msa->sqacc[idx] != NULL)
1367 return msa->sqacc[idx];
1372 MSAGetSeqDescription(MSA *msa, int idx)
1374 if (msa->sqdesc != NULL && msa->sqdesc[idx] != NULL)
1375 return msa->sqdesc[idx];
1380 MSAGetSeqSS(MSA *msa, int idx)
1382 if (msa->ss != NULL && msa->ss[idx] != NULL)
1383 return msa->ss[idx];
1388 MSAGetSeqSA(MSA *msa, int idx)
1390 if (msa->sa != NULL && msa->sa[idx] != NULL)
1391 return msa->sa[idx];