binaries/src/clustalo/src/squid/stockholm.c

   1 /*****************************************************************
   2  * SQUID - a library of functions for biological sequence analysis
   3  * Copyright (C) 1992-2002 Washington University School of Medicine
   4  *
   5  *     This source code is freely distributed under the terms of the
   6  *     GNU General Public License. See the files COPYRIGHT and LICENSE
   7  *     for details.
   8  *****************************************************************/
   9
  10 /* stockholm.c
  11  * SRE, Fri May 28 15:46:41 1999
  12  *
  13  * Reading/writing of Stockholm format multiple sequence alignments.
  14  *
  15  * example of API:
  16  *
  17  * MSA     *msa;
  18  * FILE    *fp;        -- opened for write with fopen()
  19  * MSAFILE *afp;       -- opened for read with MSAFileOpen()
  20  *
  21  * while ((msa = ReadStockholm(afp)) != NULL)
  22  *   {
  23  *      WriteStockholm(fp, msa);
  24  *      MSAFree(msa);
  25  *   }
  26  *
  27  * RCS $Id: stockholm.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: stockholm.c,v 1.7 2002/10/12 04:40:36 eddy Exp)
  28  */
  29 #include <stdio.h>
  30 #include <string.h>
  31 #include "squid.h"
  32 #include "msa.h"
  33
  34 static int  parse_gf(MSA *msa, char *buf);
  35 static int  parse_gs(MSA *msa, char *buf);
  36 static int  parse_gc(MSA *msa, char *buf);
  37 static int  parse_gr(MSA *msa, char *buf);
  38 static int  parse_comment(MSA *msa, char *buf);
  39 static int  parse_sequence(MSA *msa, char *buf);
  40 static void actually_write_stockholm(FILE *fp, MSA *msa, int cpl);
  41
  42 #ifdef TESTDRIVE_STOCKHOLM
  43 /*****************************************************************
  44  * stockholm.c test driver:
  45  * cc -DTESTDRIVE_STOCKHOLM -g -O2 -Wall -o test stockholm.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c -lm
  46  *
  47  */
  48 int
  49 main(int argc, char **argv)
  50 {
  51   MSAFILE *afp;
  52   MSA     *msa;
  53   char    *file;
  54
  55   file = argv[1];
  56
  57   if ((afp = MSAFileOpen(file, MSAFILE_STOCKHOLM, NULL)) == NULL)
  58     Die("Couldn't open %s\n", file);
  59
  60   while ((msa = ReadStockholm(afp)) != NULL)
  61     {
  62       WriteStockholm(stdout, msa);
  63       MSAFree(msa);
  64     }
  65
  66   MSAFileClose(afp);
  67   exit(0);
  68 }
  69 /******************************************************************/
  70 #endif /* testdriver */
  71
  72
  73 /* Function: ReadStockholm()
  74  * Date:     SRE, Fri May 21 17:33:10 1999 [St. Louis]
  75  *
  76  * Purpose:  Parse the next alignment from an open Stockholm
  77  *           format alignment file. Return the alignment, or
  78  *           NULL if there are no more alignments in the file.
  79  *
  80  * Args:     afp  - open alignment file
  81  *
  82  * Returns:  MSA *   - an alignment object.
  83  *                     caller responsible for an MSAFree()
  84  *           NULL if no more alignments
  85  *
  86  * Diagnostics:
  87  *           Will Die() here with a (potentially) useful message
  88  *           if a parsing error occurs
  89  */
  90 MSA *
  91 ReadStockholm(MSAFILE *afp)
  92 {
  93   MSA   *msa;
  94   char  *s;
  95   int    status;
  96
  97   if (feof(afp->f)) return NULL;
  98
  99   /* Initialize allocation of the MSA.
 100    */
 101   msa = MSAAlloc(10, 0);
 102
 103   /* Check the magic Stockholm header line.
 104    * We have to skip blank lines here, else we perceive
 105    * trailing blank lines in a file as a format error when
 106    * reading in multi-record mode.
 107    */
 108   do {
 109     if ((s = MSAFileGetLine(afp)) == NULL) {
 110       MSAFree(msa);
 111       return NULL;
 112     }
 113   } while (IsBlankline(s));
 114
 115   if (strncmp(s, "# STOCKHOLM 1.", 14) != 0)
 116     Die("\
 117 File %s doesn't appear to be in Stockholm format.\n\
 118 Assuming there isn't some other problem with your file (it is an\n\
 119 alignment file, right?), please either:\n\
 120   a) use the Babelfish format autotranslator option (-B, usually);\n\
 121   b) specify the file's format with the --informat option; or\n\
 122   a) reformat the alignment to Stockholm format.\n",
 123         afp->fname);
 124
 125   /* Read the alignment file one line at a time.
 126    */
 127   while ((s = MSAFileGetLine(afp)) != NULL)
 128     {
 129       while (*s == ' ' || *s == '\t') s++;  /* skip leading whitespace */
 130
 131       if (*s == '#') {
 132         if      (strncmp(s, "#=GF", 4) == 0)   status = parse_gf(msa, s);
 133         else if (strncmp(s, "#=GS", 4) == 0)   status = parse_gs(msa, s);
 134         else if (strncmp(s, "#=GC", 4) == 0)   status = parse_gc(msa, s);
 135         else if (strncmp(s, "#=GR", 4) == 0)   status = parse_gr(msa, s);
 136         else                                   status = parse_comment(msa, s);
 137       }
 138       else if (strncmp(s, "//",   2) == 0)   break;
 139       else if (*s == '\n')                   continue;
 140       else                                   status = parse_sequence(msa, s);
 141
 142       if (status == 0)
 143         Die("Stockholm format parse error: line %d of file %s while reading alignment %s",
 144             afp->linenumber, afp->fname, msa->name == NULL? "" : msa->name);
 145     }
 146
 147   if (s == NULL && msa->nseq != 0)
 148     Die ("Didn't find // at end of alignment %s", msa->name == NULL ? "" : msa->name);
 149
 150   if (s == NULL && msa->nseq == 0) {
 151                                 /* probably just some junk at end of file */
 152       MSAFree(msa);
 153       return NULL;
 154     }
 155
 156   MSAVerifyParse(msa);
 157   return msa;
 158 }
 159
 160
 161 /* Function: WriteStockholm()
 162  * Date:     SRE, Mon May 31 19:15:22 1999 [St. Louis]
 163  *
 164  * Purpose:  Write an alignment in standard multi-block
 165  *           Stockholm format to an open file. A wrapper
 166  *           for actually_write_stockholm().
 167  *
 168  * Args:     fp  - file that's open for writing
 169  *           msa - alignment to write
 170  *
 171  * Returns:  (void)
 172  */
 173 void
 174 WriteStockholm(FILE *fp, MSA *msa)
 175 {
 176   actually_write_stockholm(fp, msa, 50); /* 50 char per block */
 177 }
 178
 179 /* Function: WriteStockholmOneBlock()
 180  * Date:     SRE, Mon May 31 19:15:22 1999 [St. Louis]
 181  *
 182  * Purpose:  Write an alignment in Pfam's single-block
 183  *           Stockholm format to an open file. A wrapper
 184  *           for actually_write_stockholm().
 185  *
 186  * Args:     fp  - file that's open for writing
 187  *           msa - alignment to write
 188  *
 189  * Returns:  (void)
 190  */
 191 void
 192 WriteStockholmOneBlock(FILE *fp, MSA *msa)
 193 {
 194   actually_write_stockholm(fp, msa, msa->alen); /* one big block */
 195 }
 196
 197
 198 /* Function: actually_write_stockholm()
 199  * Date:     SRE, Fri May 21 17:39:22 1999 [St. Louis]
 200  *
 201  * Purpose:  Write an alignment in Stockholm format to
 202  *           an open file. This is the function that actually
 203  *           does the work. The API's WriteStockholm()
 204  *           and WriteStockholmOneBlock() are wrappers.
 205  *
 206  * Args:     fp    - file that's open for writing
 207  *           msa   - alignment to write
 208  *           cpl   - characters to write per line in alignment block
 209  *
 210  * Returns:  (void)
 211  */
 212 static void
 213 actually_write_stockholm(FILE *fp, MSA *msa, int cpl)
 214 {
 215   int  i, j;
 216   int  len = 0;
 217   int  namewidth;
 218   int  typewidth = 0;           /* markup tags are up to 5 chars long */
 219   int  markupwidth = 0;         /* #=GR, #=GC are four char wide + 1 space */
 220   char *buf;
 221   int  currpos;
 222   char *s, *tok;
 223
 224   /* Figure out how much space we need for name + markup
 225    * to keep the alignment in register. Required by Stockholm
 226    * spec, even though our Stockholm parser doesn't care (Erik's does).
 227    */
 228   namewidth = 0;
 229   for (i = 0; i < msa->nseq; i++)
 230     if ((len = strlen(msa->sqname[i])) > namewidth)
 231       namewidth = len;
 232
 233   /* Figure out how much space we need for markup tags
 234    *   markupwidth = always 4 if we're doing markup:  strlen("#=GR")
 235    *   typewidth   = longest markup tag
 236    */
 237   if (msa->ss      != NULL) { markupwidth = 4; typewidth = 2; }
 238   if (msa->sa      != NULL) { markupwidth = 4; typewidth = 2; }
 239   for (i = 0; i < msa->ngr; i++)
 240     if ((len = strlen(msa->gr_tag[i])) > typewidth) typewidth = len;
 241
 242   if (msa->rf      != NULL) { markupwidth = 4; if (typewidth < 2) typewidth = 2; }
 243   if (msa->ss_cons != NULL) { markupwidth = 4; if (typewidth < 7) typewidth = 7; }
 244   if (msa->sa_cons != NULL) { markupwidth = 4; if (typewidth < 7) typewidth = 7; }
 245   for (i = 0; i < msa->ngc; i++)
 246     if ((len = strlen(msa->gc_tag[i])) > typewidth) typewidth = len;
 247
 248   buf = MallocOrDie(sizeof(char) * (cpl+namewidth+typewidth+markupwidth+61));
 249
 250   /* Magic Stockholm header
 251    */
 252   fprintf(fp, "# STOCKHOLM 1.0\n");
 253
 254   /* Free text comments
 255    */
 256   for (i = 0;  i < msa->ncomment; i++)
 257     fprintf(fp, "# %s\n", msa->comment[i]);
 258   if (msa->ncomment > 0) fprintf(fp, "\n");
 259
 260   /* GF section: per-file annotation
 261    */
 262   if (msa->name  != NULL)       fprintf(fp, "#=GF ID    %s\n", msa->name);
 263   if (msa->acc   != NULL)       fprintf(fp, "#=GF AC    %s\n", msa->acc);
 264   if (msa->desc  != NULL)       fprintf(fp, "#=GF DE    %s\n", msa->desc);
 265   if (msa->au    != NULL)       fprintf(fp, "#=GF AU    %s\n", msa->au);
 266
 267   /* Thresholds are hacky. Pfam has two. Rfam has one.
 268    */
 269   if      (msa->cutoff_is_set[MSA_CUTOFF_GA1] && msa->cutoff_is_set[MSA_CUTOFF_GA2])
 270     fprintf(fp, "#=GF GA    %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_GA1], msa->cutoff[MSA_CUTOFF_GA2]);
 271   else if (msa->cutoff_is_set[MSA_CUTOFF_GA1])
 272     fprintf(fp, "#=GF GA    %.1f\n", msa->cutoff[MSA_CUTOFF_GA1]);
 273   if      (msa->cutoff_is_set[MSA_CUTOFF_NC1] && msa->cutoff_is_set[MSA_CUTOFF_NC2])
 274     fprintf(fp, "#=GF NC    %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_NC1], msa->cutoff[MSA_CUTOFF_NC2]);
 275   else if (msa->cutoff_is_set[MSA_CUTOFF_NC1])
 276     fprintf(fp, "#=GF NC    %.1f\n", msa->cutoff[MSA_CUTOFF_NC1]);
 277   if      (msa->cutoff_is_set[MSA_CUTOFF_TC1] && msa->cutoff_is_set[MSA_CUTOFF_TC2])
 278     fprintf(fp, "#=GF TC    %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_TC1], msa->cutoff[MSA_CUTOFF_TC2]);
 279   else if (msa->cutoff_is_set[MSA_CUTOFF_TC1])
 280     fprintf(fp, "#=GF TC    %.1f\n", msa->cutoff[MSA_CUTOFF_TC1]);
 281
 282   for (i = 0; i < msa->ngf; i++)
 283     fprintf(fp, "#=GF %-5s %s\n", msa->gf_tag[i], msa->gf[i]);
 284   fprintf(fp, "\n");
 285
 286
 287   /* GS section: per-sequence annotation
 288    */
 289   if (msa->flags & MSA_SET_WGT)
 290     {
 291       for (i = 0; i < msa->nseq; i++)
 292         fprintf(fp, "#=GS %-*.*s WT    %.2f\n", namewidth, namewidth, msa->sqname[i], msa->wgt[i]);
 293       fprintf(fp, "\n");
 294     }
 295   if (msa->sqacc != NULL)
 296     {
 297       for (i = 0; i < msa->nseq; i++)
 298         if (msa->sqacc[i] != NULL)
 299           fprintf(fp, "#=GS %-*.*s AC    %s\n", namewidth, namewidth, msa->sqname[i], msa->sqacc[i]);
 300       fprintf(fp, "\n");
 301     }
 302   if (msa->sqdesc != NULL)
 303     {
 304       for (i = 0; i < msa->nseq; i++)
 305         if (msa->sqdesc[i] != NULL)
 306           fprintf(fp, "#=GS %*.*s DE    %s\n", namewidth, namewidth, msa->sqname[i], msa->sqdesc[i]);
 307       fprintf(fp, "\n");
 308     }
 309   for (i = 0; i < msa->ngs; i++)
 310     {
 311       /* Multiannotated GS tags are possible; for example,
 312        *     #=GS foo DR PDB; 1xxx;
 313        *     #=GS foo DR PDB; 2yyy;
 314        * These are stored, for example, as:
 315        *     msa->gs[0][0] = "PDB; 1xxx;\nPDB; 2yyy;"
 316        * and must be decomposed.
 317        */
 318       for (j = 0; j < msa->nseq; j++)
 319         if (msa->gs[i][j] != NULL)
 320           {
 321             s = msa->gs[i][j];
 322             while ((tok = sre_strtok(&s, "\n", NULL)) != NULL)
 323               fprintf(fp, "#=GS %*.*s %5s %s\n", namewidth, namewidth,
 324                       msa->sqname[j], msa->gs_tag[i], tok);
 325           }
 326       fprintf(fp, "\n");
 327     }
 328
 329   /* Alignment section:
 330    * contains aligned sequence, #=GR annotation, and #=GC annotation
 331    */
 332   for (currpos = 0; currpos < msa->alen; currpos += cpl)
 333     {
 334       if (currpos > 0) fprintf(fp, "\n");
 335       for (i = 0; i < msa->nseq; i++)
 336         {
 337           strncpy(buf, msa->aseq[i] + currpos, cpl);
 338           buf[cpl] = '\0';
 339           fprintf(fp, "%-*.*s  %s\n", namewidth+typewidth+markupwidth, namewidth+typewidth+markupwidth,
 340                   msa->sqname[i], buf);
 341
 342           if (msa->ss != NULL && msa->ss[i] != NULL) {
 343             strncpy(buf, msa->ss[i] + currpos, cpl);
 344             buf[cpl] = '\0';
 345             fprintf(fp, "#=GR %-*.*s SS     %s\n", namewidth, namewidth, msa->sqname[i], buf);
 346           }
 347           if (msa->sa != NULL && msa->sa[i] != NULL) {
 348             strncpy(buf, msa->sa[i] + currpos, cpl);
 349             buf[cpl] = '\0';
 350             fprintf(fp, "#=GR %-*.*s SA     %s\n", namewidth, namewidth, msa->sqname[i], buf);
 351           }
 352           for (j = 0; j < msa->ngr; j++)
 353             if (msa->gr[j][i] != NULL) {
 354               strncpy(buf, msa->gr[j][i] + currpos, cpl);
 355               buf[cpl] = '\0';
 356               fprintf(fp, "#=GR %-*.*s %5s  %s\n",
 357                       namewidth, namewidth, msa->sqname[i], msa->gr_tag[j], buf);
 358             }
 359         }
 360       if (msa->ss_cons != NULL) {
 361         strncpy(buf, msa->ss_cons + currpos, cpl);
 362         buf[cpl] = '\0';
 363         fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "SS_cons", buf);
 364       }
 365
 366       if (msa->sa_cons != NULL) {
 367         strncpy(buf, msa->sa_cons + currpos, cpl);
 368         buf[cpl] = '\0';
 369         fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "SA_cons", buf);
 370       }
 371
 372       if (msa->rf != NULL) {
 373         strncpy(buf, msa->rf + currpos, cpl);
 374         buf[cpl] = '\0';
 375         fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "RF", buf);
 376       }
 377       for (j = 0; j < msa->ngc; j++) {
 378         strncpy(buf, msa->gc[j] + currpos, cpl);
 379         buf[cpl] = '\0';
 380         fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth,
 381                 msa->gc_tag[j], buf);
 382       }
 383     }
 384   fprintf(fp, "//\n");
 385   free(buf);
 386 }
 387
 388
 389
 390
 391
 392 /* Format of a GF line:
 393  *    #=GF <featurename> <text>
 394  */
 395 static int
 396 parse_gf(MSA *msa, char *buf)
 397 {
 398   char *gf;
 399   char *featurename;
 400   char *text;
 401   char *s;
 402
 403   s = buf;
 404   if ((gf          = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 405   if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 406   if ((text        = sre_strtok(&s, "\n",       NULL)) == NULL) return 0;
 407   while (*text && (*text == ' ' || *text == '\t')) text++;
 408
 409   if      (strcmp(featurename, "ID") == 0)
 410     msa->name                 = sre_strdup(text, -1);
 411   else if (strcmp(featurename, "AC") == 0)
 412     msa->acc                  = sre_strdup(text, -1);
 413   else if (strcmp(featurename, "DE") == 0)
 414     msa->desc                 = sre_strdup(text, -1);
 415   else if (strcmp(featurename, "AU") == 0)
 416     msa->au                   = sre_strdup(text, -1);
 417   else if (strcmp(featurename, "GA") == 0)
 418     {                           /* Pfam has GA1, GA2. Rfam just has GA1. */
 419       s = text;
 420       if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 421       msa->cutoff[MSA_CUTOFF_GA1]        = atof(text);
 422       msa->cutoff_is_set[MSA_CUTOFF_GA1] = TRUE;
 423       if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) {
 424         msa->cutoff[MSA_CUTOFF_GA2]        = atof(text);
 425         msa->cutoff_is_set[MSA_CUTOFF_GA2] = TRUE;
 426       }
 427     }
 428   else if (strcmp(featurename, "NC") == 0)
 429     {
 430       s = text;
 431       if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 432       msa->cutoff[MSA_CUTOFF_NC1]        = atof(text);
 433       msa->cutoff_is_set[MSA_CUTOFF_NC1] = TRUE;
 434       if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) {
 435         msa->cutoff[MSA_CUTOFF_NC2]        = atof(text);
 436         msa->cutoff_is_set[MSA_CUTOFF_NC2] = TRUE;
 437       }
 438     }
 439   else if (strcmp(featurename, "TC") == 0)
 440     {
 441       s = text;
 442       if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 443       msa->cutoff[MSA_CUTOFF_TC1]        = atof(text);
 444       msa->cutoff_is_set[MSA_CUTOFF_TC1] = TRUE;
 445       if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) {
 446         msa->cutoff[MSA_CUTOFF_TC2]        = atof(text);
 447         msa->cutoff_is_set[MSA_CUTOFF_TC2] = TRUE;
 448       }
 449     }
 450   else
 451     MSAAddGF(msa, featurename, text);
 452
 453   return 1;
 454 }
 455
 456
 457 /* Format of a GS line:
 458  *    #=GS <seqname> <featurename> <text>
 459  */
 460 static int
 461 parse_gs(MSA *msa, char *buf)
 462 {
 463   char *gs;
 464   char *seqname;
 465   char *featurename;
 466   char *text;
 467   int   seqidx;
 468   char *s;
 469
 470   s = buf;
 471   if ((gs          = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 472   if ((seqname     = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 473   if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 474   if ((text        = sre_strtok(&s, "\n",       NULL)) == NULL) return 0;
 475   while (*text && (*text == ' ' || *text == '\t')) text++;
 476
 477   /* GS usually follows another GS; guess lastidx+1
 478    */
 479   seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx+1);
 480   msa->lastidx = seqidx;
 481
 482   if (strcmp(featurename, "WT") == 0)
 483     {
 484       msa->wgt[seqidx]          = atof(text);
 485       msa->flags |= MSA_SET_WGT;
 486     }
 487
 488   else if (strcmp(featurename, "AC") == 0)
 489     MSASetSeqAccession(msa, seqidx, text);
 490
 491   else if (strcmp(featurename, "DE") == 0)
 492     MSASetSeqDescription(msa, seqidx, text);
 493
 494   else
 495     MSAAddGS(msa, featurename, seqidx, text);
 496
 497   return 1;
 498 }
 499
 500 /* Format of a GC line:
 501  *    #=GC <featurename> <text>
 502  */
 503 static int
 504 parse_gc(MSA *msa, char *buf)
 505 {
 506   char *gc;
 507   char *featurename;
 508   char *text;
 509   char *s;
 510   int   len;
 511
 512   s = buf;
 513   if ((gc          = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 514   if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 515   if ((text        = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0;
 516
 517   if (strcmp(featurename, "SS_cons") == 0)
 518     sre_strcat(&(msa->ss_cons), -1, text, len);
 519   else if (strcmp(featurename, "SA_cons") == 0)
 520     sre_strcat(&(msa->sa_cons), -1, text, len);
 521   else if (strcmp(featurename, "RF") == 0)
 522     sre_strcat(&(msa->rf), -1, text, len);
 523   else
 524     MSAAppendGC(msa, featurename, text);
 525
 526   return 1;
 527 }
 528
 529 /* Format of a GR line:
 530  *    #=GR <seqname> <featurename> <text>
 531  */
 532 static int
 533 parse_gr(MSA *msa, char *buf)
 534 {
 535   char *gr;
 536   char *seqname;
 537   char *featurename;
 538   char *text;
 539   int   seqidx;
 540   int   len;
 541   int   j;
 542   char *s;
 543
 544   s = buf;
 545   if ((gr          = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 546   if ((seqname     = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 547   if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 548   if ((text        = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0;
 549
 550   /* GR usually follows sequence it refers to; guess msa->lastidx */
 551   seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx);
 552   msa->lastidx = seqidx;
 553
 554   if (strcmp(featurename, "SS") == 0)
 555     {
 556       if (msa->ss == NULL)
 557         {
 558           msa->ss    = MallocOrDie(sizeof(char *) * msa->nseqalloc);
 559           msa->sslen = MallocOrDie(sizeof(int)    * msa->nseqalloc);
 560           for (j = 0; j < msa->nseqalloc; j++)
 561             {
 562               msa->ss[j]    = NULL;
 563               msa->sslen[j] = 0;
 564             }
 565         }
 566       msa->sslen[seqidx] = sre_strcat(&(msa->ss[seqidx]), msa->sslen[seqidx], text, len);
 567     }
 568   else if (strcmp(featurename, "SA") == 0)
 569     {
 570       if (msa->sa == NULL)
 571         {
 572           msa->sa    = MallocOrDie(sizeof(char *) * msa->nseqalloc);
 573           msa->salen = MallocOrDie(sizeof(int)    * msa->nseqalloc);
 574           for (j = 0; j < msa->nseqalloc; j++)
 575             {
 576               msa->sa[j]    = NULL;
 577               msa->salen[j] = 0;
 578             }
 579         }
 580       msa->salen[seqidx] = sre_strcat(&(msa->sa[seqidx]), msa->salen[seqidx], text, len);
 581     }
 582   else
 583     MSAAppendGR(msa, featurename, seqidx, text);
 584
 585   return 1;
 586 }
 587
 588
 589 /* comments are simply stored verbatim, not parsed
 590  */
 591 static int
 592 parse_comment(MSA *msa, char *buf)
 593 {
 594   char *s;
 595   char *comment;
 596
 597   s = buf + 1;                                 /* skip leading '#' */
 598   if (*s == '\n') { *s = '\0'; comment = s; }  /* deal with blank comment */
 599   else if ((comment = sre_strtok(&s, "\n", NULL)) == NULL) return 0;
 600
 601   MSAAddComment(msa, comment);
 602   return 1;
 603 }
 604
 605 static int
 606 parse_sequence(MSA *msa, char *buf)
 607 {
 608   char *s;
 609   char *seqname;
 610   char *text;
 611   int   seqidx;
 612   int   len;
 613
 614   s = buf;
 615   if ((seqname     = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 616   if ((text        = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0;
 617
 618   /* seq usually follows another seq; guess msa->lastidx +1 */
 619   seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx+1);
 620   msa->lastidx = seqidx;
 621
 622   msa->sqlen[seqidx] = sre_strcat(&(msa->aseq[seqidx]), msa->sqlen[seqidx], text, len);
 623   return 1;
 624 }
 625
 626
 627