1 /*****************************************************************
2 * SQUID - a library of functions for biological sequence analysis
3 * Copyright (C) 1992-2002 Washington University School of Medicine
5 * This source code is freely distributed under the terms of the
6 * GNU General Public License. See the files COPYRIGHT and LICENSE
8 *****************************************************************/
12 * SRE, Mon Jun 14 11:08:38 1999
13 * SELEX obsolete as the preferred HMMER/SQUID format
14 * replaced by Stockholm format
15 * selex support retained for backwards compatibility
16 * kludged to use the MSA interface
18 * SRE, Mon Jan 30 14:41:49 1995:
19 * #=SA side chain % surface accessibility annotation supported
21 * SRE, Tue Nov 9 17:40:50 1993:
22 * major revision. #= special comments and aliinfo_s optional
23 * alignment info support added. Support for #=CS (consensus
24 * secondary structure), #=SS (individual secondary structure),
25 * #=RF (reference coordinate system), #=SQ (per-sequence header info),
26 * and #=AU ("author") added.
28 * Fri Dec 4 17:43:24 1992, SRE:
29 * Reading and writing aligned sequences to/from disk files.
30 * Implements a new, broader specification of SELEX format
31 * and supercedes alignio.c.
33 * SELEX format is documented in Docs/formats.tex.
34 ****************************************************************************
35 * RCS $Id: selex.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: selex.c,v 1.11 2002/10/12 04:40:35 eddy Exp)
46 static int copy_alignment_line(char *aseq, int apos, int name_rcol,
47 char *buffer, int lcol, int rcol, char gapsym);
48 static void actually_write_selex(FILE *fp, MSA *msa, int cpl);
50 static char commentsyms[] = "%#";
52 /* Function: ReadSELEX()
53 * Date: SRE, Sun Jun 6 18:24:09 1999 [St. Louis]
55 * Purpose: Parse an alignment read from an open SELEX format
56 * alignment file. (SELEX is a single alignment format).
57 * Return the alignment, or NULL if we've already read the
58 * alignment or there's no alignment data in the file.
60 * Limitations: SELEX is the only remaining multipass parser for
61 * alignment files. It cannot read from gzip or from stdin.
62 * It Die()'s here if you try. The reason for this
63 * that SELEX allows space characters as gaps, so we don't
64 * know the borders of an alignment block until we've seen
65 * the whole block. I could rewrite to allow single-pass
66 * parsing (by storing the whole block in memory) but
67 * since SELEX is now legacy, why bother.
69 * Note that the interface is totally kludged: fastest
70 * possible adaptation of old ReadSELEX() to the new
73 * Args: afp - open alignment file
75 * Returns: MSA * - an alignment object
76 * caller responsible for an MSAFree()
77 * NULL if no alignment data.
80 ReadSELEX(MSAFILE *afp)
82 MSA *msa; /* RETURN: mult seq alignment */
83 FILE *fp; /* ptr to opened seqfile */
84 char **aseqs; /* aligned seqs */
85 int num = 0; /* number of seqs read */
86 char buffer[LINEBUFLEN]; /* input buffer for lines */
87 char bufcpy[LINEBUFLEN]; /* strtok'able copy of buffer */
88 struct block_struc { /** alignment data for a block: */
89 int lcol; /* furthest left aligned sym */
90 int rcol; /* furthest right aligned sym */
92 int blocknum; /* number of blocks in file */
93 char *nptr; /* ptr to start of name on line */
94 char *sptr; /* ptr into sequence on line */
95 int currnum; /* num. seqs in given block */
96 int currblock; /* index for blocks */
97 int i; /* loop counter */
98 int seqidx; /* counter for seqs */
99 int alen; /* length of alignment */
100 int warn_names; /* becomes TRUE if names don't match between blocks */
101 int headnum; /* seqidx in per-sequence header info */
106 AINFO base_ainfo, *ainfo; /* hack: used to be passed ptr to AINFO */
109 /* Convert from MSA interface to what old ReadSELEX() did:
110 * - copy our open fp, rather than opening file
111 * - verify that we're not reading a gzip or stdin
113 if (feof(afp->f)) return NULL;
114 if (afp->do_gzip || afp->do_stdin)
115 Die("Can't read a SELEX format alignment from a pipe, stdin, or gzip'ed file");
119 /***************************************************
120 * First pass across file.
121 * Count seqs, get names, determine column info
122 * Determine what sorts of info are active in this file.
123 ***************************************************/
126 /* get first line of the block
127 * (non-comment, non-blank) */
130 if (fgets(buffer, LINEBUFLEN, fp) == NULL)
131 { squid_errno = SQERR_NODATA; return 0; }
132 strcpy(bufcpy, buffer);
135 if (strncmp(buffer, "#=CS", 4) == 0) have_cs = 1;
136 else if (strncmp(buffer, "#=RF", 4) == 0) have_rf = 1;
139 while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL ||
140 (strchr(commentsyms, *nptr) != NULL));
146 /* allocate for info about this block. */
148 blocks = (struct block_struc *) MallocOrDie (sizeof(struct block_struc));
150 blocks = (struct block_struc *) ReallocOrDie (blocks, (blocknum+1) * sizeof(struct block_struc));
151 blocks[blocknum].lcol = LINEBUFLEN+1;
152 blocks[blocknum].rcol = -1;
155 while (nptr != NULL) /* becomes NULL when this block ends. */
157 /* First block only: save names */
161 ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO));
163 ainfo->sqinfo = (SQINFO *) ReallocOrDie (ainfo->sqinfo, (currnum + 1) * sizeof(SQINFO));
165 ainfo->sqinfo[currnum].flags = 0;
166 SetSeqinfoString(&(ainfo->sqinfo[currnum]), nptr, SQINFO_NAME);
168 else /* in each additional block: check names */
170 if (strcmp(ainfo->sqinfo[currnum].name, nptr) != 0)
175 /* check rcol, lcol */
176 if ((sptr = strtok(NULL, WHITESPACE)) != NULL)
178 /* is this the furthest left we've
179 seen word 2 in this block? */
180 if (sptr - bufcpy < blocks[blocknum].lcol)
181 blocks[blocknum].lcol = sptr - bufcpy;
182 /* look for right side in buffer */
183 for (sptr = buffer + strlen(buffer) - 1;
184 strchr(WHITESPACE, *sptr) != NULL;
187 if (sptr - buffer > blocks[blocknum].rcol)
188 blocks[blocknum].rcol = sptr - buffer;
191 /* get the next line; blank line means end of block */
194 if (fgets(buffer, LINEBUFLEN, fp) == NULL)
195 { nptr = NULL; break; }
196 strcpy(bufcpy, buffer);
198 if (strncmp(buffer, "#=SS", 4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SS;
199 else if (strncmp(buffer, "#=SA", 4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SA;
200 else if (strncmp(buffer, "#=CS", 4) == 0) have_cs = 1;
201 else if (strncmp(buffer, "#=RF", 4) == 0) have_rf = 1;
203 if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL)
205 } while (strchr(commentsyms, *nptr) != NULL);
209 /* check that number of sequences matches expected */
212 else if (currnum != num)
213 Die("Parse error in ReadSELEX()");
216 /* get first line of next block
217 * (non-comment, non-blank) */
220 if (fgets(buffer, LINEBUFLEN, fp) == NULL) { nptr = NULL; break; }
221 strcpy(bufcpy, buffer);
223 while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL ||
224 (strchr(commentsyms, *nptr) != NULL));
228 /***************************************************
229 * Get ready for second pass:
230 * figure out the length of the alignment
233 ***************************************************/
236 for (currblock = 0; currblock < blocknum; currblock++)
237 alen += blocks[currblock].rcol - blocks[currblock].lcol + 1;
241 /* allocations. we can't use AllocateAlignment because of
242 * the way we already used ainfo->sqinfo.
244 aseqs = (char **) MallocOrDie (num * sizeof(char *));
246 ainfo->cs = (char *) MallocOrDie ((alen+1) * sizeof(char));
248 ainfo->rf = (char *) MallocOrDie ((alen+1) * sizeof(char));
252 for (i = 0; i < num; i++)
254 aseqs[i] = (char *) MallocOrDie ((alen+1) * sizeof(char));
255 if (ainfo->sqinfo[i].flags & SQINFO_SS)
256 ainfo->sqinfo[i].ss = (char *) MallocOrDie ((alen+1) * sizeof(char));
257 if (ainfo->sqinfo[i].flags & SQINFO_SA)
258 ainfo->sqinfo[i].sa = (char *) MallocOrDie ((alen+1) * sizeof(char));
263 ainfo->wgt = (float *) MallocOrDie (sizeof(float) * num);
264 FSet(ainfo->wgt, num, 1.0);
266 /***************************************************
267 * Second pass across file. Parse header; assemble sequences
268 ***************************************************/
269 /* We've now made a complete first pass over the file. We know how
270 * many blocks it contains, we know the number of seqs in the first
271 * block, and we know every block has the same number of blocks;
272 * so we can be a bit more cavalier about error-checking as we
273 * make the second pass.
281 if (fgets(buffer, LINEBUFLEN, fp) == NULL)
282 Die("Parse error in ReadSELEX()");
283 strcpy(bufcpy, buffer);
284 if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* skip blank lines */
286 if (strcmp(nptr, "#=AU") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
287 ainfo->au = Strdup(sptr);
288 else if (strcmp(nptr, "#=ID") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
289 ainfo->name = Strdup(sptr);
290 else if (strcmp(nptr, "#=AC") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
291 ainfo->acc = Strdup(sptr);
292 else if (strcmp(nptr, "#=DE") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
293 ainfo->desc = Strdup(sptr);
294 else if (strcmp(nptr, "#=GA") == 0)
296 if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
297 Die("Parse error in #=GA line in ReadSELEX()");
298 ainfo->ga1 = atof(sptr);
300 if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
301 Die("Parse error in #=GA line in ReadSELEX()");
302 ainfo->ga2 = atof(sptr);
304 ainfo->flags |= AINFO_GA;
306 else if (strcmp(nptr, "#=TC") == 0)
308 if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
309 Die("Parse error in #=TC line in ReadSELEX()");
310 ainfo->tc1 = atof(sptr);
312 if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
313 Die("Parse error in #=TC line in ReadSELEX()");
314 ainfo->tc2 = atof(sptr);
316 ainfo->flags |= AINFO_TC;
318 else if (strcmp(nptr, "#=NC") == 0)
320 if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
321 Die("Parse error in #=NC line in ReadSELEX()");
322 ainfo->nc1 = atof(sptr);
324 if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
325 Die("Parse error in #=NC line in ReadSELEX()");
326 ainfo->nc2 = atof(sptr);
328 ainfo->flags |= AINFO_NC;
330 else if (strcmp(nptr, "#=SQ") == 0) /* per-sequence header info */
332 /* first field is the name */
333 if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
334 Die("Parse error in #=SQ line in ReadSELEX()");
335 if (strcmp(sptr, ainfo->sqinfo[headnum].name) != 0) warn_names = TRUE;
337 /* second field is the weight */
338 if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
339 Die("Parse error in #=SQ line in ReadSELEX()");
341 Die("Parse error in #=SQ line in ReadSELEX(): weight is not a number");
342 ainfo->wgt[headnum] = atof(sptr);
344 /* third field is database source id */
345 if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
346 Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
347 SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ID);
349 /* fourth field is database accession number */
350 if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
351 Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
352 SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ACC);
354 /* fifth field is start..stop::olen */
355 if ((sptr = strtok(NULL, ".:")) == NULL)
356 Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
357 SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_START);
359 if ((sptr = strtok(NULL, ".:")) == NULL)
360 Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
361 SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_STOP);
363 if ((sptr = strtok(NULL, ":\t ")) == NULL)
364 Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
365 SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_OLEN);
367 /* rest of line is optional description */
368 if ((sptr = strtok(NULL, "\n")) != NULL)
369 SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_DESC);
373 else if (strcmp(nptr, "#=CS") == 0) break;
374 else if (strcmp(nptr, "#=RF") == 0) break;
375 else if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment, non-header */
380 for (currblock = 0 ; currblock < blocknum; currblock++)
382 /* parse the block */
386 /* Consensus structure */
387 if (strcmp(nptr, "#=CS") == 0)
389 if (! copy_alignment_line(ainfo->cs, currlen, strlen(nptr)-1,
390 buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
391 Die("Parse error in #=CS line in ReadSELEX()");
394 /* Reference coordinates */
395 else if (strcmp(nptr, "#=RF") == 0)
397 if (! copy_alignment_line(ainfo->rf, currlen, strlen(nptr)-1,
398 buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
399 Die("Parse error in #=RF line in ReadSELEX()");
401 /* Individual secondary structure */
402 else if (strcmp(nptr, "#=SS") == 0)
404 if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].ss, currlen, strlen(nptr)-1,
405 buffer, blocks[currblock].lcol,
406 blocks[currblock].rcol, (char) '.'))
407 Die("Parse error in #=SS line in ReadSELEX()");
410 /* Side chain % surface accessibility code */
411 else if (strcmp(nptr, "#=SA") == 0)
413 if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].sa, currlen, strlen(nptr)-1,
414 buffer, blocks[currblock].lcol,
415 blocks[currblock].rcol, (char) '.'))
416 Die("Parse error in #=SA line in ReadSELEX()");
418 /* Aligned sequence; avoid unparsed machine comments */
419 else if (strncmp(nptr, "#=", 2) != 0)
421 if (! copy_alignment_line(aseqs[seqidx], currlen, strlen(nptr)-1,
422 buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
423 Die("Parse error in alignment line in ReadSELEX()");
431 if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* EOF */
432 strcpy(bufcpy, buffer);
433 if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) break; /* blank */
434 if (strncmp(buffer, "#=", 2) == 0) break; /* machine comment */
435 if (strchr(commentsyms, *nptr) == NULL) break; /* data */
437 } /* end of a block */
439 currlen += blocks[currblock].rcol - blocks[currblock].lcol + 1;
441 /* get line 1 of next block */
444 if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* no data */
445 strcpy(bufcpy, buffer);
446 if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* blank */
447 if (strncmp(buffer, "#=", 2) == 0) break; /* machine comment */
448 if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment */
450 } /* end of the file */
452 /* Lengths in sqinfo are for raw sequence (ungapped),
453 * and SS, SA are 0..rlen-1 not 0..alen-1.
454 * Only the seqs with structures come out of here with lengths set.
456 for (seqidx = 0; seqidx < num; seqidx++)
459 /* secondary structures */
460 if (ainfo->sqinfo[seqidx].flags & SQINFO_SS)
462 for (apos = rpos = 0; apos < alen; apos++)
463 if (! isgap(aseqs[seqidx][apos]))
465 ainfo->sqinfo[seqidx].ss[rpos] = ainfo->sqinfo[seqidx].ss[apos];
468 ainfo->sqinfo[seqidx].ss[rpos] = '\0';
470 /* Surface accessibility */
471 if (ainfo->sqinfo[seqidx].flags & SQINFO_SA)
473 for (apos = rpos = 0; apos < alen; apos++)
474 if (! isgap(aseqs[seqidx][apos]))
476 ainfo->sqinfo[seqidx].sa[rpos] = ainfo->sqinfo[seqidx].sa[apos];
479 ainfo->sqinfo[seqidx].sa[rpos] = '\0';
483 /* NULL-terminate all the strings */
484 if (ainfo->rf != NULL) ainfo->rf[alen] = '\0';
485 if (ainfo->cs != NULL) ainfo->cs[alen] = '\0';
486 for (seqidx = 0; seqidx < num; seqidx++)
487 aseqs[seqidx][alen] = '\0';
489 /* find raw sequence lengths for sqinfo */
490 for (seqidx = 0; seqidx < num; seqidx++)
493 for (sptr = aseqs[seqidx]; *sptr != '\0'; sptr++)
494 if (!isgap(*sptr)) count++;
495 ainfo->sqinfo[seqidx].len = count;
496 ainfo->sqinfo[seqidx].flags |= SQINFO_LEN;
500 /***************************************************
501 * Garbage collection and return
502 ***************************************************/
506 Warning("sequences may be in different orders in blocks of %s?", afp->fname);
508 Warn("sequences may be in different orders in blocks of %s?", afp->fname);
511 /* Convert back to MSA structure. (Wasteful kludge.)
513 msa = MSAFromAINFO(aseqs, ainfo);
515 FreeAlignment(aseqs, ainfo);
520 /* Function: WriteSELEX()
521 * Date: SRE, Mon Jun 14 13:13:14 1999 [St. Louis]
523 * Purpose: Write a SELEX file in multiblock format.
525 * Args: fp - file that's open for writing
526 * msa - multiple sequence alignment object
531 WriteSELEX(FILE *fp, MSA *msa)
533 actually_write_selex(fp, msa, 50); /* 50 char per block */
536 /* Function: WriteSELEXOneBlock()
537 * Date: SRE, Mon Jun 14 13:14:56 1999 [St. Louis]
539 * Purpose: Write a SELEX alignment file in Pfam's single-block
540 * format style. A wrapper for actually_write_selex().
542 * Args: fp - file that's open for writing
543 * msa- alignment to write
548 WriteSELEXOneBlock(FILE *fp, MSA *msa)
550 actually_write_selex(fp, msa, msa->alen); /* one big block */
554 /* Function: actually_write_selex()
555 * Date: SRE, Mon Jun 14 12:54:46 1999 [St. Louis]
557 * Purpose: Write an alignment in SELEX format to an open
558 * file. This is the function that actually does
559 * the work. The API's WriteSELEX() and
560 * WriteSELEXOneBlock() are wrappers.
562 * Args: fp - file that's open for writing
563 * msa - alignment to write
564 * cpl - characters to write per line in alignment block
569 actually_write_selex(FILE *fp, MSA *msa, int cpl)
577 buf = malloc(sizeof(char) * (cpl+101)); /* 100 chars allowed for name, etc. */
579 /* Figure out how much space we need for name + markup
580 * to keep the alignment in register, for easier human viewing --
581 * even though Stockholm format doesn't care about visual
585 for (i = 0; i < msa->nseq; i++)
586 if ((len = strlen(msa->sqname[i])) > namewidth)
588 if (namewidth < 6) namewidth = 6; /* minimum space for markup tags */
590 /* Free text comments
592 for (i = 0; i < msa->ncomment; i++)
593 fprintf(fp, "# %s\n", msa->comment[i]);
594 if (msa->ncomment > 0) fprintf(fp, "\n");
596 /* Per-file annotation
598 if (msa->name != NULL) fprintf(fp, "#=ID %s\n", msa->name);
599 if (msa->acc != NULL) fprintf(fp, "#=AC %s\n", msa->acc);
600 if (msa->desc != NULL) fprintf(fp, "#=DE %s\n", msa->desc);
601 if (msa->au != NULL) fprintf(fp, "#=AU %s\n", msa->au);
603 /* Thresholds are hacky. Pfam has two. Rfam has one.
605 if (msa->cutoff_is_set[MSA_CUTOFF_GA1] && msa->cutoff_is_set[MSA_CUTOFF_GA2])
606 fprintf(fp, "#=GA %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_GA1], msa->cutoff[MSA_CUTOFF_GA2]);
607 else if (msa->cutoff_is_set[MSA_CUTOFF_GA1])
608 fprintf(fp, "#=GA %.1f\n", msa->cutoff[MSA_CUTOFF_GA1]);
609 if (msa->cutoff_is_set[MSA_CUTOFF_NC1] && msa->cutoff_is_set[MSA_CUTOFF_NC2])
610 fprintf(fp, "#=NC %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_NC1], msa->cutoff[MSA_CUTOFF_NC2]);
611 else if (msa->cutoff_is_set[MSA_CUTOFF_NC1])
612 fprintf(fp, "#=NC %.1f\n", msa->cutoff[MSA_CUTOFF_NC1]);
613 if (msa->cutoff_is_set[MSA_CUTOFF_TC1] && msa->cutoff_is_set[MSA_CUTOFF_TC2])
614 fprintf(fp, "#=TC %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_TC1], msa->cutoff[MSA_CUTOFF_TC2]);
615 else if (msa->cutoff_is_set[MSA_CUTOFF_TC1])
616 fprintf(fp, "#=TC %.1f\n", msa->cutoff[MSA_CUTOFF_TC1]);
618 /* Per-sequence annotation
620 for (i = 0; i < msa->nseq; i++)
621 fprintf(fp, "#=SQ %-*.*s %6.4f %s %s %d..%d::%d %s\n",
622 namewidth, namewidth, msa->sqname[i],
624 "-", /* MSA has no ID field */
625 (msa->sqacc != NULL && msa->sqacc[i] != NULL) ? msa->sqacc[i] : "-",
626 0, 0, 0, /* MSA has no start, stop, olen field */
627 (msa->sqdesc != NULL && msa->sqdesc[i] != NULL) ? msa->sqdesc[i] : "-");
630 /* Alignment section:
632 for (currpos = 0; currpos < msa->alen; currpos += cpl)
634 if (currpos > 0) fprintf(fp, "\n");
636 if (msa->ss_cons != NULL) {
637 strncpy(buf, msa->ss_cons + currpos, cpl);
639 fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, "#=CS", buf);
641 if (msa->rf != NULL) {
642 strncpy(buf, msa->rf + currpos, cpl);
644 fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, "#=RF", buf);
646 for (i = 0; i < msa->nseq; i++)
648 strncpy(buf, msa->aseq[i] + currpos, cpl);
650 fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, msa->sqname[i], buf);
652 if (msa->ss != NULL && msa->ss[i] != NULL) {
653 strncpy(buf, msa->ss[i] + currpos, cpl);
655 fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, "#=SS", buf);
657 if (msa->sa != NULL && msa->sa[i] != NULL) {
658 strncpy(buf, msa->sa[i] + currpos, cpl);
660 fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, "#=SA", buf);
668 /* Function: copy_alignment_line()
670 * Purpose: Given a line from an alignment file, and bounds lcol,rcol
671 * on what part of it may be sequence, save the alignment into
672 * aseq starting at position apos.
674 * name_rcol is set to the rightmost column this aseqs's name
675 * occupies; if name_rcol >= lcol, we have a special case in
676 * which the name intrudes into the sequence zone.
679 copy_alignment_line(char *aseq, int apos, int name_rcol,
680 char *buffer, int lcol, int rcol, char gapsym)
686 s2 = buffer; /* be careful that buffer doesn't end before lcol! */
687 for (i = 0; i < lcol; i++)
690 for (i = lcol; i <= rcol; i++)
694 Warning("TAB characters will corrupt a SELEX alignment! Please remove them first.");
696 Warn("TAB characters will corrupt a SELEX alignment! Please remove them first.");
700 if (name_rcol >= i) /* name intrusion special case: pad left w/ gaps */
702 /* short buffer special case: pad right w/ gaps */
703 else if (*s2 == '\0' || *s2 == '\n')
706 else if (*s2 == ' ') /* new: disallow spaces as gap symbols */
709 else /* normal case: copy buffer into aseq */
722 /* Function: DealignAseqs()
724 * Given an array of (num) aligned sequences aseqs,
725 * strip the gaps. Store the raw sequences in a new allocated array.
727 * Caller is responsible for free'ing the memory allocated to
730 * Returns 1 on success. Returns 0 and sets squid_errno on
734 DealignAseqs(char **aseqs, int num, char ***ret_rseqs)
736 char **rseqs; /* de-aligned sequence array */
737 int idx; /* counter for sequences */
738 int depos; /* position counter for dealigned seq*/
739 int apos; /* position counter for aligned seq */
740 int seqlen; /* length of aligned seq */
743 rseqs = (char **) MallocOrDie (num * sizeof(char *));
745 for (idx = 0; idx < num; idx++)
747 seqlen = strlen(aseqs[idx]);
749 rseqs[idx] = (char *) MallocOrDie ((seqlen + 1) * sizeof(char));
753 for (apos = 0; aseqs[idx][apos] != '\0'; apos++)
754 if (!isgap(aseqs[idx][apos]))
756 rseqs[idx][depos] = aseqs[idx][apos];
759 rseqs[idx][depos] = '\0';
766 /* Function: IsSELEXFormat()
768 * Return TRUE if filename may be in SELEX format.
770 * Accuracy is sacrificed for speed; a TRUE return does
771 * *not* guarantee that the file will pass the stricter
772 * error-checking of ReadSELEX(). All it checks is that
773 * the first 500 non-comment lines of a file are
774 * blank, or if there's a second "word" on the line
775 * it looks like sequence (i.e., it's not kOtherSeq).
777 * Returns TRUE or FALSE.
780 IsSELEXFormat(char *filename)
782 FILE *fp; /* ptr to open sequence file */
783 char buffer[LINEBUFLEN];
784 char *sptr; /* ptr to first word */
788 if ((fp = fopen(filename, "r")) == NULL)
789 { squid_errno = SQERR_NOFILE; return 0; }
792 while (linenum < 500 &&
793 fgets(buffer, LINEBUFLEN, fp) != NULL)
796 /* dead giveaways for extended SELEX */
797 if (strncmp(buffer, "#=AU", 4) == 0) goto DONE;
798 else if (strncmp(buffer, "#=ID", 4) == 0) goto DONE;
799 else if (strncmp(buffer, "#=AC", 4) == 0) goto DONE;
800 else if (strncmp(buffer, "#=DE", 4) == 0) goto DONE;
801 else if (strncmp(buffer, "#=GA", 4) == 0) goto DONE;
802 else if (strncmp(buffer, "#=TC", 4) == 0) goto DONE;
803 else if (strncmp(buffer, "#=NC", 4) == 0) goto DONE;
804 else if (strncmp(buffer, "#=SQ", 4) == 0) goto DONE;
805 else if (strncmp(buffer, "#=SS", 4) == 0) goto DONE;
806 else if (strncmp(buffer, "#=CS", 4) == 0) goto DONE;
807 else if (strncmp(buffer, "#=RF", 4) == 0) goto DONE;
810 if (strchr(commentsyms, *buffer) != NULL) continue;
813 if ((sptr = strtok(buffer, WHITESPACE)) == NULL) continue;
815 /* a one-word line (name only)
816 is possible, though rare */
817 if ((sptr = strtok(NULL, "\n")) == NULL) continue;
819 if (Seqtype(sptr) == kOtherSeq) {fclose(fp); return 0;}