1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
12 * SRE, Sun Jul 11 16:17:32 1993
14 * Import/export of GCG MSF multiple sequence alignment
15 * formatted files. Designed using format specifications
16 * kindly provided by Steve Smith of Genetics Computer Group.
18 * RCS $Id: msf.c,v 1.1.1.1 2005/03/22 08:34:20 cmzmasek Exp $
30 /*****************************************************************
32 * cc -DTESTDRIVE_MSF -g -O2 -Wall -o test msf.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c sqio.c alignio.c selex.c interleaved.c types.c -lm
36 main(int argc, char **argv)
44 if ((afp = MSAFileOpen(file, MSAFILE_STOCKHOLM, NULL)) == NULL)
45 Die("Couldn't open %s\n", file);
47 while ((msa = ReadMSF(afp)) != NULL)
49 WriteMSF(stdout, msa);
56 /******************************************************************/
57 #endif /* testdrive_msf */
61 /* Function: ReadMSF()
62 * Date: SRE, Tue Jun 1 08:07:22 1999 [St. Louis]
64 * Purpose: Parse an alignment read from an open MSF format
65 * alignment file. (MSF is a single-alignment format.)
66 * Return the alignment, or NULL if we've already
69 * Args: afp - open alignment file
71 * Returns: MSA * - an alignment object
72 * caller responsible for an MSAFree()
73 * NULL if no more alignments
76 * Will Die() here with a (potentially) useful message
77 * if a parsing error occurs.
94 if (feof(afp->f)) return NULL;
95 if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
97 /* The first line is the header.
98 * This is a new-ish GCG feature. Don't count on it, so
99 * we can be a bit more tolerant towards non-GCG software
100 * generating "MSF" files.
102 msa = MSAAlloc(10, 0);
103 if (strncmp(s, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0) {
105 if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
106 } else if (strncmp(s, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0) {
108 if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
111 /* Now we're in the free text comment section of the MSF file.
112 * It ends when we see the "MSF: Type: Check: .." line.
113 * This line must be present.
117 if ((strstr(s, "..") != NULL && strstr(s, "MSF:") != NULL) &&
118 Strparse("^.+MSF: +([0-9]+) +Type: +([PNX]).+Check: +([0-9]+) +\\.\\.", s, 3))
120 alleged_alen = atoi(sqd_parse[0]);
121 switch (*(sqd_parse[1])) {
122 case 'N' : alleged_type = kRNA; break;
123 case 'P' : alleged_type = kAmino; break;
124 case 'X' : alleged_type = kOtherSeq; break;
125 default : alleged_type = kOtherSeq;
127 alleged_checksum = atoi(sqd_parse[3]);
128 if (msa->type == kOtherSeq) msa->type = alleged_type;
129 break; /* we're done with comment section. */
131 if (! IsBlankline(s))
132 MSAAddComment(msa, s);
133 } while ((s = MSAFileGetLine(afp)) != NULL);
135 /* Now we're in the name section.
136 * GCG has a relatively poorly documented feature: only sequences that
137 * appear in this list will be read from the alignment section. Commenting
138 * out sequences in the name list (by preceding them with "!") is
139 * allowed as a means of manually defining subsets of sequences in
140 * the alignment section. We can support this feature reasonably
141 * easily because of the hash table for names in the MSA: we
142 * only add names to the hash table when we see 'em in the name section.
144 while ((s = MSAFileGetLine(afp)) != NULL)
146 while ((*s == ' ' || *s == '\t') && *s) s++; /* skip leading whitespace */
148 if (*s == '\n') continue; /* skip blank lines */
149 else if (*s == '!') MSAAddComment(msa, s);
150 else if ((sp = strstr(s, "Name:")) != NULL)
152 /* We take the name and the weigh, and that's it */
154 tok = sre_strtok(&sp, " \t", &slen); /* <sequence name> */
155 sqidx = GKIStoreKey(msa->index, tok);
156 if (sqidx >= msa->nseqalloc) MSAExpand(msa);
157 msa->sqname[sqidx] = sre_strdup(tok, slen);
160 if ((sp = strstr(sp, "Weight:")) == NULL)
161 Die("No Weight: on line %d for %s in name section of MSF file %s\n",
162 afp->linenumber, msa->sqname[sqidx], afp->fname);
164 tok = sre_strtok(&sp, " \t", &slen);
165 msa->wgt[sqidx] = atof(tok);
166 msa->flags |= MSA_SET_WGT;
168 else if (strncmp(s, "//", 2) == 0)
172 Die("Invalid line (probably %d) in name section of MSF file %s:\n%s\n",
173 afp->linenumber, afp->fname, s);
174 squid_errno = SQERR_FORMAT; /* NOT THREADSAFE */
180 /* And now we're in the sequence section.
181 * As discussed above, if we haven't seen a sequence name, then we
182 * don't include the sequence in the alignment.
183 * Also, watch out for coordinate-only lines.
185 while ((s = MSAFileGetLine(afp)) != NULL)
188 if ((name = sre_strtok(&sp, " \t", NULL)) == NULL) continue;
189 if ((seq = sre_strtok(&sp, "\n", &slen)) == NULL) continue;
191 /* The test for a coord line: digits starting both fields
193 if (isdigit(*name) && isdigit(*seq))
196 /* It's not blank, and it's not a coord line: must be sequence
198 sqidx = GKIKeyIndex(msa->index, name);
199 if (sqidx < 0) continue; /* not a sequence we recognize */
201 msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, slen);
204 /* We've left blanks in the aseqs; take them back out.
206 for (sqidx = 0; sqidx < msa->nseq; sqidx++)
208 if (msa->aseq[sqidx] == NULL)
209 Die("Didn't find a sequence for %s in MSF file %s\n", msa->sqname[sqidx], afp->fname);
211 for (s = sp = msa->aseq[sqidx]; *s != '\0'; s++)
213 if (*s == ' ' || *s == '\t') {
223 MSAVerifyParse(msa); /* verifies, and also sets alen and wgt. */
228 /* Function: WriteMSF()
229 * Date: SRE, Mon May 31 11:25:18 1999 [St. Louis]
231 * Purpose: Write an alignment in MSF format to an open file.
233 * Args: fp - file that's open for writing.
234 * msa - alignment to write.
236 * Note that msa->type, usually optional, must be
237 * set for WriteMSF to work. If it isn't, a fatal
238 * error is generated.
243 WriteMSF(FILE *fp, MSA *msa)
245 time_t now; /* current time as a time_t */
246 char date[64]; /* today's date in GCG's format "October 3, 1996 15:57" */
247 char **gcg_aseq; /* aligned sequences with gaps converted to GCG format */
248 char **gcg_sqname; /* sequence names with GCG-valid character sets */
249 int idx; /* counter for sequences */
250 char *s; /* pointer into sqname or seq */
251 int len; /* tmp variable for name lengths */
252 int namelen; /* maximum name length used */
253 int pos; /* position counter */
254 char buffer[51]; /* buffer for writing seq */
255 int i; /* another position counter */
257 /*****************************************************************
258 * Make copies of sequence names and sequences.
259 * GCG recommends that name characters should only contain
260 * alphanumeric characters, -, or _
261 * Some GCG and GCG-compatible software is sensitive to this.
262 * We silently convert all other characters to '_'.
264 * For sequences, GCG allows only ~ and . for gaps.
265 * Otherwise, everthing is interpreted as a residue;
266 * so squid's IUPAC-restricted chars are fine. ~ means
267 * an external gap. . means an internal gap.
268 *****************************************************************/
270 /* make copies that we can edit */
271 gcg_aseq = MallocOrDie(sizeof(char *) * msa->nseq);
272 gcg_sqname = MallocOrDie(sizeof(char *) * msa->nseq);
273 for (idx = 0; idx < msa->nseq; idx++)
275 gcg_aseq[idx] = sre_strdup(msa->aseq[idx], msa->alen);
276 gcg_sqname[idx] = sre_strdup(msa->sqname[idx], -1);
278 /* alter names as needed */
279 for (idx = 0; idx < msa->nseq; idx++)
280 for (s = gcg_sqname[idx]; *s != '\0'; s++)
281 if (! isalnum((int) *s) && *s != '-' && *s != '_')
283 /* alter gap chars in seq */
284 for (idx = 0; idx < msa->nseq; idx++)
286 for (s = gcg_aseq[idx]; *s != '\0' && isgap(*s); s++)
288 for (; *s != '\0'; s++)
289 if (isgap(*s)) *s = '.';
290 for (pos = msa->alen-1; pos > 0 && isgap(gcg_aseq[idx][pos]); pos--)
291 gcg_aseq[idx][pos] = '~';
293 /* calculate max namelen used */
295 for (idx = 0; idx < msa->nseq; idx++)
296 if ((len = strlen(msa->sqname[idx])) > namelen)
299 /*****************************************************
300 * Write the MSF header
301 *****************************************************/
302 /* required file type line */
303 if (msa->type == kOtherSeq)
304 msa->type = GuessAlignmentSeqtype(msa->aseq, msa->nseq);
306 if (msa->type == kRNA) fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n");
307 else if (msa->type == kDNA) fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n");
308 else if (msa->type == kAmino) fprintf(fp, "!!AA_MULTIPLE_ALIGNMENT 1.0\n");
309 else if (msa->type == kOtherSeq)
310 Die("WriteMSF(): couldn't guess whether that alignment is RNA or protein.\n");
312 Die("Invalid sequence type %d in WriteMSF()\n", msa->type);
314 /* free text comments */
315 if (msa->ncomment > 0)
317 for (idx = 0; idx < msa->ncomment; idx++)
318 fprintf(fp, "%s\n", msa->comment[idx]);
321 /* required checksum line */
323 if (strftime(date, 64, "%B %d, %Y %H:%M", localtime(&now)) == 0)
324 Die("What time is it on earth? strftime() failed in WriteMSF().\n");
325 fprintf(fp, " %s MSF: %d Type: %c %s Check: %d ..\n",
326 msa->name != NULL ? msa->name : "squid.msf",
328 msa->type == kRNA ? 'N' : 'P',
330 GCGMultchecksum(gcg_aseq, msa->nseq));
333 /*****************************************************
334 * Names/weights section
335 *****************************************************/
337 for (idx = 0; idx < msa->nseq; idx++)
339 fprintf(fp, " Name: %-*.*s Len: %5d Check: %4d Weight: %.2f\n",
343 GCGchecksum(gcg_aseq[idx], msa->alen),
349 /*****************************************************
350 * Write the sequences
351 *****************************************************/
353 for (pos = 0; pos < msa->alen; pos += 50)
355 fprintf(fp, "\n"); /* Blank line between sequence blocks */
357 /* Coordinate line */
358 len = (pos + 50) > msa->alen ? msa->alen - pos : 50;
360 fprintf(fp, "%*s %-6d%*s%6d\n", namelen, "",
362 len + ((len-1)/10) - 12, "",
365 fprintf(fp, "%*s %-6d\n", namelen, "", pos+1);
367 for (idx = 0; idx < msa->nseq; idx++)
369 fprintf(fp, "%-*s ", namelen, gcg_sqname[idx]);
370 /* get next line's worth of 50 from seq */
371 strncpy(buffer, gcg_aseq[idx] + pos, 50);
373 /* draw the sequence line */
374 for (i = 0; i < len; i++)
376 if (! (i % 10)) fputc(' ', fp);
377 fputc(buffer[i], fp);
383 Free2DArray((void **) gcg_aseq, msa->nseq);
384 Free2DArray((void **) gcg_sqname, msa->nseq);