1 /*****************************************************************
2 * SQUID - a library of functions for biological sequence analysis
3 * Copyright (C) 1992-2002 Washington University School of Medicine
5 * This source code is freely distributed under the terms of the
6 * GNU General Public License. See the files COPYRIGHT and LICENSE
8 *****************************************************************/
11 * SRE, Sun Jul 11 16:17:32 1993
13 * Import/export of GCG MSF multiple sequence alignment
14 * formatted files. Designed using format specifications
15 * kindly provided by Steve Smith of Genetics Computer Group.
17 * RCS $Id: msf.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: msf.c,v 1.4 2001/04/23 00:35:33 eddy Exp)
29 /*****************************************************************
31 * cc -DTESTDRIVE_MSF -g -O2 -Wall -o test msf.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c sqio.c alignio.c selex.c interleaved.c types.c -lm
35 main(int argc, char **argv)
43 if ((afp = MSAFileOpen(file, MSAFILE_STOCKHOLM, NULL)) == NULL)
44 Die("Couldn't open %s\n", file);
46 while ((msa = ReadMSF(afp)) != NULL)
48 WriteMSF(stdout, msa);
55 /******************************************************************/
56 #endif /* testdrive_msf */
60 /* Function: ReadMSF()
61 * Date: SRE, Tue Jun 1 08:07:22 1999 [St. Louis]
63 * Purpose: Parse an alignment read from an open MSF format
64 * alignment file. (MSF is a single-alignment format.)
65 * Return the alignment, or NULL if we've already
68 * Args: afp - open alignment file
70 * Returns: MSA * - an alignment object
71 * caller responsible for an MSAFree()
72 * NULL if no more alignments
75 * Will Die() here with a (potentially) useful message
76 * if a parsing error occurs.
93 if (feof(afp->f)) return NULL;
94 if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
96 /* The first line is the header.
97 * This is a new-ish GCG feature. Don't count on it, so
98 * we can be a bit more tolerant towards non-GCG software
99 * generating "MSF" files.
101 msa = MSAAlloc(10, 0);
102 if (strncmp(s, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0) {
104 if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
105 } else if (strncmp(s, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0) {
107 if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
110 /* Now we're in the free text comment section of the MSF file.
111 * It ends when we see the "MSF: Type: Check: .." line.
112 * This line must be present.
116 if ((strstr(s, "..") != NULL && strstr(s, "MSF:") != NULL) &&
117 Strparse("^.+MSF: +([0-9]+) +Type: +([PNX]).+Check: +([0-9]+) +\\.\\.", s, 3))
119 alleged_alen = atoi(sqd_parse[0]);
120 switch (*(sqd_parse[1])) {
121 case 'N' : alleged_type = kRNA; break;
122 case 'P' : alleged_type = kAmino; break;
123 case 'X' : alleged_type = kOtherSeq; break;
124 default : alleged_type = kOtherSeq;
126 alleged_checksum = atoi(sqd_parse[3]);
127 if (msa->type == kOtherSeq) msa->type = alleged_type;
128 break; /* we're done with comment section. */
130 if (! IsBlankline(s))
131 MSAAddComment(msa, s);
132 } while ((s = MSAFileGetLine(afp)) != NULL);
134 /* Now we're in the name section.
135 * GCG has a relatively poorly documented feature: only sequences that
136 * appear in this list will be read from the alignment section. Commenting
137 * out sequences in the name list (by preceding them with "!") is
138 * allowed as a means of manually defining subsets of sequences in
139 * the alignment section. We can support this feature reasonably
140 * easily because of the hash table for names in the MSA: we
141 * only add names to the hash table when we see 'em in the name section.
143 while ((s = MSAFileGetLine(afp)) != NULL)
145 while ((*s == ' ' || *s == '\t') && *s) s++; /* skip leading whitespace */
147 if (*s == '\n') continue; /* skip blank lines */
148 else if (*s == '!') MSAAddComment(msa, s);
149 else if ((sp = strstr(s, "Name:")) != NULL)
151 /* We take the name and the weigh, and that's it */
153 tok = sre_strtok(&sp, " \t", &slen); /* <sequence name> */
154 sqidx = GKIStoreKey(msa->index, tok);
155 if (sqidx >= msa->nseqalloc) MSAExpand(msa);
156 msa->sqname[sqidx] = sre_strdup(tok, slen);
159 if ((sp = strstr(sp, "Weight:")) == NULL)
160 Die("No Weight: on line %d for %s in name section of MSF file %s\n",
161 afp->linenumber, msa->sqname[sqidx], afp->fname);
163 tok = sre_strtok(&sp, " \t", &slen);
164 msa->wgt[sqidx] = atof(tok);
165 msa->flags |= MSA_SET_WGT;
167 else if (strncmp(s, "//", 2) == 0)
171 Die("Invalid line (probably %d) in name section of MSF file %s:\n%s\n",
172 afp->linenumber, afp->fname, s);
173 squid_errno = SQERR_FORMAT; /* NOT THREADSAFE */
179 /* And now we're in the sequence section.
180 * As discussed above, if we haven't seen a sequence name, then we
181 * don't include the sequence in the alignment.
182 * Also, watch out for coordinate-only lines.
184 while ((s = MSAFileGetLine(afp)) != NULL)
187 if ((name = sre_strtok(&sp, " \t", NULL)) == NULL) continue;
188 if ((seq = sre_strtok(&sp, "\n", &slen)) == NULL) continue;
190 /* The test for a coord line: digits starting both fields
192 if (isdigit(*name) && isdigit(*seq))
195 /* It's not blank, and it's not a coord line: must be sequence
197 sqidx = GKIKeyIndex(msa->index, name);
198 if (sqidx < 0) continue; /* not a sequence we recognize */
200 msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, slen);
203 /* We've left blanks in the aseqs; take them back out.
205 for (sqidx = 0; sqidx < msa->nseq; sqidx++)
207 if (msa->aseq[sqidx] == NULL)
208 Die("Didn't find a sequence for %s in MSF file %s\n", msa->sqname[sqidx], afp->fname);
210 for (s = sp = msa->aseq[sqidx]; *s != '\0'; s++)
212 if (*s == ' ' || *s == '\t') {
222 MSAVerifyParse(msa); /* verifies, and also sets alen and wgt. */
227 /* Function: WriteMSF()
228 * Date: SRE, Mon May 31 11:25:18 1999 [St. Louis]
230 * Purpose: Write an alignment in MSF format to an open file.
232 * Args: fp - file that's open for writing.
233 * msa - alignment to write.
235 * Note that msa->type, usually optional, must be
236 * set for WriteMSF to work. If it isn't, a fatal
237 * error is generated.
242 WriteMSF(FILE *fp, MSA *msa)
244 time_t now; /* current time as a time_t */
245 char date[64]; /* today's date in GCG's format "October 3, 1996 15:57" */
246 char **gcg_aseq; /* aligned sequences with gaps converted to GCG format */
247 char **gcg_sqname; /* sequence names with GCG-valid character sets */
248 int idx; /* counter for sequences */
249 char *s; /* pointer into sqname or seq */
250 int len; /* tmp variable for name lengths */
251 int namelen; /* maximum name length used */
252 int pos; /* position counter */
253 char buffer[51]; /* buffer for writing seq */
254 int i; /* another position counter */
256 /*****************************************************************
257 * Make copies of sequence names and sequences.
258 * GCG recommends that name characters should only contain
259 * alphanumeric characters, -, or _
260 * Some GCG and GCG-compatible software is sensitive to this.
261 * We silently convert all other characters to '_'.
263 * For sequences, GCG allows only ~ and . for gaps.
264 * Otherwise, everthing is interpreted as a residue;
265 * so squid's IUPAC-restricted chars are fine. ~ means
266 * an external gap. . means an internal gap.
267 *****************************************************************/
269 /* make copies that we can edit */
270 gcg_aseq = MallocOrDie(sizeof(char *) * msa->nseq);
271 gcg_sqname = MallocOrDie(sizeof(char *) * msa->nseq);
272 for (idx = 0; idx < msa->nseq; idx++)
274 gcg_aseq[idx] = sre_strdup(msa->aseq[idx], msa->alen);
275 gcg_sqname[idx] = sre_strdup(msa->sqname[idx], -1);
277 /* alter names as needed */
278 for (idx = 0; idx < msa->nseq; idx++)
279 for (s = gcg_sqname[idx]; *s != '\0'; s++)
280 if (! isalnum((int) *s) && *s != '-' && *s != '_')
282 /* alter gap chars in seq */
283 for (idx = 0; idx < msa->nseq; idx++)
285 for (s = gcg_aseq[idx]; *s != '\0' && isgap(*s); s++)
287 for (; *s != '\0'; s++)
288 if (isgap(*s)) *s = '.';
289 for (pos = msa->alen-1; pos > 0 && isgap(gcg_aseq[idx][pos]); pos--)
290 gcg_aseq[idx][pos] = '~';
292 /* calculate max namelen used */
294 for (idx = 0; idx < msa->nseq; idx++)
295 if ((len = strlen(msa->sqname[idx])) > namelen)
298 /*****************************************************
299 * Write the MSF header
300 *****************************************************/
301 /* required file type line */
302 if (msa->type == kOtherSeq)
303 msa->type = GuessAlignmentSeqtype(msa->aseq, msa->nseq);
305 if (msa->type == kRNA) fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n");
306 else if (msa->type == kDNA) fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n");
307 else if (msa->type == kAmino) fprintf(fp, "!!AA_MULTIPLE_ALIGNMENT 1.0\n");
308 else if (msa->type == kOtherSeq)
309 Die("WriteMSF(): couldn't guess whether that alignment is RNA or protein.\n");
311 Die("Invalid sequence type %d in WriteMSF()\n", msa->type);
313 /* free text comments */
314 if (msa->ncomment > 0)
316 for (idx = 0; idx < msa->ncomment; idx++)
317 fprintf(fp, "%s\n", msa->comment[idx]);
320 /* required checksum line */
322 if (strftime(date, 64, "%B %d, %Y %H:%M", localtime(&now)) == 0)
323 Die("What time is it on earth? strftime() failed in WriteMSF().\n");
324 fprintf(fp, " %s MSF: %d Type: %c %s Check: %d ..\n",
325 msa->name != NULL ? msa->name : "squid.msf",
327 msa->type == kRNA ? 'N' : 'P',
329 GCGMultchecksum(gcg_aseq, msa->nseq));
332 /*****************************************************
333 * Names/weights section
334 *****************************************************/
336 for (idx = 0; idx < msa->nseq; idx++)
338 fprintf(fp, " Name: %-*.*s Len: %5d Check: %4d Weight: %.2f\n",
342 GCGchecksum(gcg_aseq[idx], msa->alen),
348 /*****************************************************
349 * Write the sequences
350 *****************************************************/
352 for (pos = 0; pos < msa->alen; pos += 50)
354 fprintf(fp, "\n"); /* Blank line between sequence blocks */
356 /* Coordinate line */
357 len = (pos + 50) > msa->alen ? msa->alen - pos : 50;
359 fprintf(fp, "%*s %-6d%*s%6d\n", namelen, "",
361 len + ((len-1)/10) - 12, "",
364 fprintf(fp, "%*s %-6d\n", namelen, "", pos+1);
366 for (idx = 0; idx < msa->nseq; idx++)
368 fprintf(fp, "%-*s ", namelen, gcg_sqname[idx]);
369 /* get next line's worth of 50 from seq */
370 strncpy(buffer, gcg_aseq[idx] + pos, 50);
372 /* draw the sequence line */
373 for (i = 0; i < len; i++)
375 if (! (i % 10)) fputc(' ', fp);
376 fputc(buffer[i], fp);
382 Free2DArray((void **) gcg_aseq, msa->nseq);
383 Free2DArray((void **) gcg_sqname, msa->nseq);