1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
12 * SRE, Sun Jun 6 17:50:45 1999 [bus from Madison, 1999 worm mtg]
14 * Import/export of ClustalV/W multiple sequence alignment
15 * formatted files. Derivative of msf.c; MSF is a pretty
16 * generic interleaved format.
18 * RCS $Id: clustal.c,v 1.1.1.1 2005/03/22 08:34:27 cmzmasek Exp $
28 #ifdef TESTDRIVE_CLUSTAL
29 /*****************************************************************
31 * cc -DTESTDRIVE_CLUSTAL -g -O2 -Wall -o test clustal.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c -lm
35 main(int argc, char **argv)
43 if ((afp = MSAFileOpen(file, MSAFILE_CLUSTAL, NULL)) == NULL)
44 Die("Couldn't open %s\n", file);
46 while ((msa = ReadClustal(afp)) != NULL)
48 WriteClustal(stdout, msa);
55 /******************************************************************/
56 #endif /* testdrive_clustal */
59 /* Function: ReadClustal()
60 * Date: SRE, Sun Jun 6 17:53:49 1999 [bus from Madison, 1999 worm mtg]
62 * Purpose: Parse an alignment read from an open Clustal format
63 * alignment file. Clustal is a single-alignment format.
64 * Return the alignment, or NULL if we have no data.
66 * Args: afp - open alignment file
68 * Returns: MSA * - an alignment object
69 * caller responsible for an MSAFree()
70 * NULL if no more alignments
73 * Will Die() here with a (potentially) useful message
74 * if a parsing error occurs.
77 ReadClustal(MSAFILE *afp)
87 if (feof(afp->f)) return NULL;
89 /* Skip until we see the CLUSTAL header
91 while ((s = MSAFileGetLine(afp)) != NULL)
93 if (strncmp(s, "CLUSTAL", 7) == 0 &&
94 strstr(s, "multiple sequence alignment") != NULL)
97 if (s == NULL) return NULL;
99 msa = MSAAlloc(10, 0);
101 /* Now we're in the sequence section.
102 * As discussed above, if we haven't seen a sequence name, then we
103 * don't include the sequence in the alignment.
104 * Watch out for conservation markup lines that contain *.: chars
106 while ((s = MSAFileGetLine(afp)) != NULL)
108 if ((name = sre_strtok(&s, WHITESPACE, NULL)) == NULL) continue;
109 if ((seq = sre_strtok(&s, WHITESPACE, &slen)) == NULL) continue;
110 s2 = sre_strtok(&s, "\n", NULL);
112 /* The test for a conservation markup line
114 if (strpbrk(name, ".*:") != NULL && strpbrk(seq, ".*:") != NULL)
117 Die("Parse failed at line %d, file %s: possibly using spaces as gaps",
118 afp->linenumber, afp->fname);
120 /* It's not blank, and it's not a coord line: must be sequence
122 sqidx = MSAGetSeqidx(msa, name, msa->lastidx+1);
123 msa->lastidx = sqidx;
124 msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, slen);
127 MSAVerifyParse(msa); /* verifies, and also sets alen and wgt. */
132 /* Function: WriteClustal()
133 * Date: SRE, Sun Jun 6 18:12:47 1999 [bus from Madison, worm mtg 1999]
135 * Purpose: Write an alignment in Clustal format to an open file.
137 * Args: fp - file that's open for writing.
138 * msa - alignment to write.
143 WriteClustal(FILE *fp, MSA *msa)
145 int idx; /* counter for sequences */
146 int len; /* tmp variable for name lengths */
147 int namelen; /* maximum name length used */
148 int pos; /* position counter */
149 char buf[64]; /* buffer for writing seq */
150 int cpl = 50; /* char per line (< 64) */
152 /* calculate max namelen used */
154 for (idx = 0; idx < msa->nseq; idx++)
155 if ((len = strlen(msa->sqname[idx])) > namelen)
158 fprintf(fp, "CLUSTAL W(1.5) multiple sequence alignment\n");
160 /*****************************************************
161 * Write the sequences
162 *****************************************************/
164 for (pos = 0; pos < msa->alen; pos += cpl)
166 fprintf(fp, "\n"); /* Blank line between sequence blocks */
167 for (idx = 0; idx < msa->nseq; idx++)
169 strncpy(buf, msa->aseq[idx] + pos, cpl);
171 fprintf(fp, "%*s %s\n", namelen, msa->sqname[idx], buf);