binaries/src/clustalo/src/squid/stockholm.c

   1 /*****************************************************************
   2  * SQUID - a library of functions for biological sequence analysis
   3  * Copyright (C) 1992-2002 Washington University School of Medicine
   4  *
   5  *     This source code is freely distributed under the terms of the
   6  *     GNU General Public License. See the files COPYRIGHT and LICENSE
   7  *     for details.
   8  *****************************************************************/
   9
  10 /* stockholm.c
  11  * SRE, Fri May 28 15:46:41 1999
  12  *
  13  * Reading/writing of Stockholm format multiple sequence alignments.
  14  *
  15  * example of API:
  16  *
  17  * MSA     *msa;
  18  * FILE    *fp;        -- opened for write with fopen()
  19  * MSAFILE *afp;       -- opened for read with MSAFileOpen()
  20  *
  21  * while ((msa = ReadStockholm(afp)) != NULL)
  22  *   {
  23  *      WriteStockholm(fp, msa);
  24  *      MSAFree(msa);
  25  *   }
  26  *
  27  * RCS $Id: stockholm.c 297 2014-10-31 13:02:37Z fabian $ (Original squid RCS Id: stockholm.c,v 1.7 2002/10/12 04:40:36 eddy Exp)
  28  */
  29 #include <stdio.h>
  30 #include <string.h>
  31 #include "squid.h"
  32 #include "msa.h"
  33
  34 static int  parse_gf(MSA *msa, char *buf);
  35 static int  parse_gs(MSA *msa, char *buf);
  36 static int  parse_gc(MSA *msa, char *buf);
  37 static int  parse_gr(MSA *msa, char *buf);
  38 static int  parse_comment(MSA *msa, char *buf);
  39 static int  parse_sequence(MSA *msa, char *buf);
  40 static void actually_write_stockholm(FILE *fp, MSA *msa, int cpl);
  41
  42 #ifdef TESTDRIVE_STOCKHOLM
  43 /*****************************************************************
  44  * stockholm.c test driver:
  45  * cc -DTESTDRIVE_STOCKHOLM -g -O2 -Wall -o test stockholm.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c -lm
  46  *
  47  */
  48 int
  49 main(int argc, char **argv)
  50 {
  51   MSAFILE *afp;
  52   MSA     *msa;
  53   char    *file;
  54
  55   file = argv[1];
  56
  57   if ((afp = MSAFileOpen(file, MSAFILE_STOCKHOLM, NULL)) == NULL)
  58     Die("Couldn't open %s\n", file);
  59
  60   while ((msa = ReadStockholm(afp)) != NULL)
  61     {
  62       WriteStockholm(stdout, msa);
  63       MSAFree(msa);
  64     }
  65
  66   MSAFileClose(afp);
  67   exit(0);
  68 }
  69 /******************************************************************/
  70 #endif /* testdriver */
  71
  72
  73 /* Function: ReadStockholm()
  74  * Date:     SRE, Fri May 21 17:33:10 1999 [St. Louis]
  75  *
  76  * Purpose:  Parse the next alignment from an open Stockholm
  77  *           format alignment file. Return the alignment, or
  78  *           NULL if there are no more alignments in the file.
  79  *
  80  * Args:     afp  - open alignment file
  81  *
  82  * Returns:  MSA *   - an alignment object.
  83  *                     caller responsible for an MSAFree()
  84  *           NULL if no more alignments
  85  *
  86  * Diagnostics:
  87  *           Will Die() here with a (potentially) useful message
  88  *           if a parsing error occurs
  89  */
  90 MSA *
  91 ReadStockholm(MSAFILE *afp)
  92 {
  93   MSA   *msa;
  94   char  *s;
  95   int    status;
  96
  97   if (feof(afp->f)) return NULL;
  98
  99   /* Initialize allocation of the MSA.
 100    */
 101   msa = MSAAlloc(10, 0);
 102
 103   /* Check the magic Stockholm header line.
 104    * We have to skip blank lines here, else we perceive
 105    * trailing blank lines in a file as a format error when
 106    * reading in multi-record mode.
 107    */
 108   do {
 109     if ((s = MSAFileGetLine(afp)) == NULL) {
 110       MSAFree(msa);
 111       return NULL;
 112     }
 113   } while (IsBlankline(s));
 114
 115   if (strncmp(s, "# STOCKHOLM 1.", 14) != 0)
 116     Die("\
 117 File %s doesn't appear to be in Stockholm format.\n\
 118 Assuming there isn't some other problem with your file (it is an\n\
 119 alignment file, right?), please either:\n\
 120   a) use the Babelfish format autotranslator option (-B, usually);\n\
 121   b) specify the file's format with the --informat option; or\n\
 122   a) reformat the alignment to Stockholm format.\n",
 123         afp->fname);
 124
 125   /* Read the alignment file one line at a time.
 126    */
 127   while ((s = MSAFileGetLine(afp)) != NULL)
 128     {
 129       while (*s == ' ' || *s == '\t') s++;  /* skip leading whitespace */
 130
 131       if (*s == '#') {
 132         if      (strncmp(s, "#=GF", 4) == 0)   status = parse_gf(msa, s);
 133         else if (strncmp(s, "#=GS", 4) == 0)   status = parse_gs(msa, s);
 134         else if (strncmp(s, "#=GC", 4) == 0)   status = parse_gc(msa, s);
 135         else if (strncmp(s, "#=GR", 4) == 0)   status = parse_gr(msa, s);
 136         else                                   status = parse_comment(msa, s);
 137       }
 138       else if (strncmp(s, "//",   2) == 0)   break;
 139       else if (*s == '\n')                   continue;
 140       else                                   status = parse_sequence(msa, s);
 141
 142       if (status == 0)
 143         Die("Stockholm format parse error: line %d of file %s while reading alignment %s",
 144             afp->linenumber, afp->fname, msa->name == NULL? "" : msa->name);
 145     }
 146
 147   if (s == NULL && msa->nseq != 0)
 148     Die ("Didn't find // at end of alignment %s", msa->name == NULL ? "" : msa->name);
 149
 150   if (s == NULL && msa->nseq == 0) {
 151                                 /* probably just some junk at end of file */
 152       MSAFree(msa);
 153       return NULL;
 154     }
 155
 156   MSAVerifyParse(msa);
 157   return msa;
 158 }
 159
 160 /* Function: ReadDublin()
 161  * Date:     2014-10-15
 162  *
 163  * Purpose:  Parse the next sequences from an open Dublin
 164  *           format file. Return the sequences, or
 165  *           NULL if there are no more sequences in the file.
 166  *
 167  * Args:     afp  - open alignment file
 168  *
 169  * Returns:  MSA *   - an alignment object.
 170  *                     caller responsible for an MSAFree()
 171  *           NULL if no more alignments
 172  *
 173  * Diagnostics:
 174  *           Will Die() here with a (potentially) useful message
 175  *           if a parsing error occurs
 176  */
 177 MSA *
 178 ReadDublin(MSAFILE *afp)
 179 {
 180   MSA   *msa;
 181   char  *s;
 182   int    status;
 183
 184   if (feof(afp->f)) return NULL;
 185
 186   /* Initialize allocation of the MSA.
 187    */
 188   msa = MSAAlloc(10, 0);
 189
 190   /* Check the magic Dublin header line.
 191    * We have to skip blank lines here, else we perceive
 192    * trailing blank lines in a file as a format error when
 193    * reading in multi-record mode.
 194    */
 195   do {
 196     if ((s = MSAFileGetLine(afp)) == NULL) {
 197       MSAFree(msa);
 198       return NULL;
 199     }
 200   } while (IsBlankline(s));
 201
 202   if (strncmp(s, "# DUBLIN 1.", 11) != 0)
 203     Die("\
 204 File %s doesn't appear to be in Dublin format.\n",
 205         afp->fname);
 206
 207   /* Read the alignment file one line at a time.
 208    */
 209   while ((s = MSAFileGetLine(afp)) != NULL)
 210     {
 211       while (*s == ' ' || *s == '\t') s++;  /* skip leading whitespace */
 212
 213       if (*s == '#') {
 214         if      (strncmp(s, "#=GF", 4) == 0)   status = parse_gf(msa, s);
 215         else if (strncmp(s, "#=GS", 4) == 0)   status = parse_gs(msa, s);
 216         else if (strncmp(s, "#=GC", 4) == 0)   status = parse_gc(msa, s);
 217         else if (strncmp(s, "#=GR", 4) == 0)   status = parse_gr(msa, s);
 218         else                                   status = parse_comment(msa, s);
 219       }
 220       else if (strncmp(s, "//",   2) == 0)   break;
 221       else if (*s == '\n')                   continue;
 222       else                                   status = parse_sequence(msa, s);
 223
 224       if (status == 0)
 225         Die("Dublin format parse error: line %d of file %s while reading alignment %s",
 226             afp->linenumber, afp->fname, msa->name == NULL? "" : msa->name);
 227     }
 228
 229   if (s == NULL && msa->nseq != 0)
 230     Die ("Didn't find // at end of alignment %s", msa->name == NULL ? "" : msa->name);
 231
 232   if (s == NULL && msa->nseq == 0) {
 233                                 /* probably just some junk at end of file */
 234       MSAFree(msa);
 235       return NULL;
 236     }
 237
 238   MSAVerifyParseDub(msa);
 239   return msa;
 240
 241 } /* this is the end of ReadDublin() */
 242
 243
 244 /* Function: WriteStockholm()
 245  * Date:     SRE, Mon May 31 19:15:22 1999 [St. Louis]
 246  *
 247  * Purpose:  Write an alignment in standard multi-block
 248  *           Stockholm format to an open file. A wrapper
 249  *           for actually_write_stockholm().
 250  *
 251  * Args:     fp  - file that's open for writing
 252  *           msa - alignment to write
 253  *
 254  * Returns:  (void)
 255  */
 256 void
 257 WriteStockholm(FILE *fp, MSA *msa)
 258 {
 259   actually_write_stockholm(fp, msa, 50); /* 50 char per block */
 260 }
 261
 262 /* Function: WriteStockholmOneBlock()
 263  * Date:     SRE, Mon May 31 19:15:22 1999 [St. Louis]
 264  *
 265  * Purpose:  Write an alignment in Pfam's single-block
 266  *           Stockholm format to an open file. A wrapper
 267  *           for actually_write_stockholm().
 268  *
 269  * Args:     fp  - file that's open for writing
 270  *           msa - alignment to write
 271  *
 272  * Returns:  (void)
 273  */
 274 void
 275 WriteStockholmOneBlock(FILE *fp, MSA *msa)
 276 {
 277   actually_write_stockholm(fp, msa, msa->alen); /* one big block */
 278 }
 279
 280
 281 /* Function: actually_write_stockholm()
 282  * Date:     SRE, Fri May 21 17:39:22 1999 [St. Louis]
 283  *
 284  * Purpose:  Write an alignment in Stockholm format to
 285  *           an open file. This is the function that actually
 286  *           does the work. The API's WriteStockholm()
 287  *           and WriteStockholmOneBlock() are wrappers.
 288  *
 289  * Args:     fp    - file that's open for writing
 290  *           msa   - alignment to write
 291  *           cpl   - characters to write per line in alignment block
 292  *
 293  * Returns:  (void)
 294  */
 295 static void
 296 actually_write_stockholm(FILE *fp, MSA *msa, int cpl)
 297 {
 298   int  i, j;
 299   int  len = 0;
 300   int  namewidth;
 301   int  typewidth = 0;           /* markup tags are up to 5 chars long */
 302   int  markupwidth = 0;         /* #=GR, #=GC are four char wide + 1 space */
 303   char *buf;
 304   int  currpos;
 305   char *s, *tok;
 306
 307   /* Figure out how much space we need for name + markup
 308    * to keep the alignment in register. Required by Stockholm
 309    * spec, even though our Stockholm parser doesn't care (Erik's does).
 310    */
 311   namewidth = 0;
 312   for (i = 0; i < msa->nseq; i++)
 313     if ((len = strlen(msa->sqname[i])) > namewidth)
 314       namewidth = len;
 315
 316   /* Figure out how much space we need for markup tags
 317    *   markupwidth = always 4 if we're doing markup:  strlen("#=GR")
 318    *   typewidth   = longest markup tag
 319    */
 320   if (msa->ss      != NULL) { markupwidth = 4; typewidth = 2; }
 321   if (msa->sa      != NULL) { markupwidth = 4; typewidth = 2; }
 322   for (i = 0; i < msa->ngr; i++)
 323     if ((len = strlen(msa->gr_tag[i])) > typewidth) typewidth = len;
 324
 325   if (msa->rf      != NULL) { markupwidth = 4; if (typewidth < 2) typewidth = 2; }
 326   if (msa->ss_cons != NULL) { markupwidth = 4; if (typewidth < 7) typewidth = 7; }
 327   if (msa->sa_cons != NULL) { markupwidth = 4; if (typewidth < 7) typewidth = 7; }
 328   for (i = 0; i < msa->ngc; i++)
 329     if ((len = strlen(msa->gc_tag[i])) > typewidth) typewidth = len;
 330
 331   buf = MallocOrDie(sizeof(char) * (cpl+namewidth+typewidth+markupwidth+61));
 332
 333   /* Magic Stockholm header
 334    */
 335   fprintf(fp, "# STOCKHOLM 1.0\n");
 336
 337   /* Free text comments
 338    */
 339   for (i = 0;  i < msa->ncomment; i++)
 340     fprintf(fp, "# %s\n", msa->comment[i]);
 341   if (msa->ncomment > 0) fprintf(fp, "\n");
 342
 343   /* GF section: per-file annotation
 344    */
 345   if (msa->name  != NULL)       fprintf(fp, "#=GF ID    %s\n", msa->name);
 346   if (msa->acc   != NULL)       fprintf(fp, "#=GF AC    %s\n", msa->acc);
 347   if (msa->desc  != NULL)       fprintf(fp, "#=GF DE    %s\n", msa->desc);
 348   if (msa->au    != NULL)       fprintf(fp, "#=GF AU    %s\n", msa->au);
 349
 350   /* Thresholds are hacky. Pfam has two. Rfam has one.
 351    */
 352   if      (msa->cutoff_is_set[MSA_CUTOFF_GA1] && msa->cutoff_is_set[MSA_CUTOFF_GA2])
 353     fprintf(fp, "#=GF GA    %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_GA1], msa->cutoff[MSA_CUTOFF_GA2]);
 354   else if (msa->cutoff_is_set[MSA_CUTOFF_GA1])
 355     fprintf(fp, "#=GF GA    %.1f\n", msa->cutoff[MSA_CUTOFF_GA1]);
 356   if      (msa->cutoff_is_set[MSA_CUTOFF_NC1] && msa->cutoff_is_set[MSA_CUTOFF_NC2])
 357     fprintf(fp, "#=GF NC    %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_NC1], msa->cutoff[MSA_CUTOFF_NC2]);
 358   else if (msa->cutoff_is_set[MSA_CUTOFF_NC1])
 359     fprintf(fp, "#=GF NC    %.1f\n", msa->cutoff[MSA_CUTOFF_NC1]);
 360   if      (msa->cutoff_is_set[MSA_CUTOFF_TC1] && msa->cutoff_is_set[MSA_CUTOFF_TC2])
 361     fprintf(fp, "#=GF TC    %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_TC1], msa->cutoff[MSA_CUTOFF_TC2]);
 362   else if (msa->cutoff_is_set[MSA_CUTOFF_TC1])
 363     fprintf(fp, "#=GF TC    %.1f\n", msa->cutoff[MSA_CUTOFF_TC1]);
 364
 365   for (i = 0; i < msa->ngf; i++)
 366     fprintf(fp, "#=GF %-5s %s\n", msa->gf_tag[i], msa->gf[i]);
 367   fprintf(fp, "\n");
 368
 369
 370   /* GS section: per-sequence annotation
 371    */
 372   if (msa->flags & MSA_SET_WGT)
 373     {
 374       for (i = 0; i < msa->nseq; i++)
 375         fprintf(fp, "#=GS %-*.*s WT    %.2f\n", namewidth, namewidth, msa->sqname[i], msa->wgt[i]);
 376       fprintf(fp, "\n");
 377     }
 378   if (msa->sqacc != NULL)
 379     {
 380       for (i = 0; i < msa->nseq; i++)
 381         if (msa->sqacc[i] != NULL)
 382           fprintf(fp, "#=GS %-*.*s AC    %s\n", namewidth, namewidth, msa->sqname[i], msa->sqacc[i]);
 383       fprintf(fp, "\n");
 384     }
 385   if (msa->sqdesc != NULL)
 386     {
 387       for (i = 0; i < msa->nseq; i++)
 388         if (msa->sqdesc[i] != NULL)
 389           fprintf(fp, "#=GS %*.*s DE    %s\n", namewidth, namewidth, msa->sqname[i], msa->sqdesc[i]);
 390       fprintf(fp, "\n");
 391     }
 392   for (i = 0; i < msa->ngs; i++)
 393     {
 394       /* Multiannotated GS tags are possible; for example,
 395        *     #=GS foo DR PDB; 1xxx;
 396        *     #=GS foo DR PDB; 2yyy;
 397        * These are stored, for example, as:
 398        *     msa->gs[0][0] = "PDB; 1xxx;\nPDB; 2yyy;"
 399        * and must be decomposed.
 400        */
 401       for (j = 0; j < msa->nseq; j++)
 402         if (msa->gs[i][j] != NULL)
 403           {
 404             s = msa->gs[i][j];
 405             while ((tok = sre_strtok(&s, "\n", NULL)) != NULL)
 406               fprintf(fp, "#=GS %*.*s %5s %s\n", namewidth, namewidth,
 407                       msa->sqname[j], msa->gs_tag[i], tok);
 408           }
 409       fprintf(fp, "\n");
 410     }
 411
 412   /* Alignment section:
 413    * contains aligned sequence, #=GR annotation, and #=GC annotation
 414    */
 415   for (currpos = 0; currpos < msa->alen; currpos += cpl)
 416     {
 417       if (currpos > 0) fprintf(fp, "\n");
 418       for (i = 0; i < msa->nseq; i++)
 419         {
 420           strncpy(buf, msa->aseq[i] + currpos, cpl);
 421           buf[cpl] = '\0';
 422           fprintf(fp, "%-*.*s  %s\n", namewidth+typewidth+markupwidth, namewidth+typewidth+markupwidth,
 423                   msa->sqname[i], buf);
 424
 425           if (msa->ss != NULL && msa->ss[i] != NULL) {
 426             strncpy(buf, msa->ss[i] + currpos, cpl);
 427             buf[cpl] = '\0';
 428             fprintf(fp, "#=GR %-*.*s SS     %s\n", namewidth, namewidth, msa->sqname[i], buf);
 429           }
 430           if (msa->sa != NULL && msa->sa[i] != NULL) {
 431             strncpy(buf, msa->sa[i] + currpos, cpl);
 432             buf[cpl] = '\0';
 433             fprintf(fp, "#=GR %-*.*s SA     %s\n", namewidth, namewidth, msa->sqname[i], buf);
 434           }
 435           for (j = 0; j < msa->ngr; j++)
 436             if (msa->gr[j][i] != NULL) {
 437               strncpy(buf, msa->gr[j][i] + currpos, cpl);
 438               buf[cpl] = '\0';
 439               fprintf(fp, "#=GR %-*.*s %5s  %s\n",
 440                       namewidth, namewidth, msa->sqname[i], msa->gr_tag[j], buf);
 441             }
 442         }
 443       if (msa->ss_cons != NULL) {
 444         strncpy(buf, msa->ss_cons + currpos, cpl);
 445         buf[cpl] = '\0';
 446         fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "SS_cons", buf);
 447       }
 448
 449       if (msa->sa_cons != NULL) {
 450         strncpy(buf, msa->sa_cons + currpos, cpl);
 451         buf[cpl] = '\0';
 452         fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "SA_cons", buf);
 453       }
 454
 455       if (msa->rf != NULL) {
 456         strncpy(buf, msa->rf + currpos, cpl);
 457         buf[cpl] = '\0';
 458         fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth, "RF", buf);
 459       }
 460       for (j = 0; j < msa->ngc; j++) {
 461         strncpy(buf, msa->gc[j] + currpos, cpl);
 462         buf[cpl] = '\0';
 463         fprintf(fp, "#=GC %-*.*s %s\n", namewidth+typewidth, namewidth+typewidth,
 464                 msa->gc_tag[j], buf);
 465       }
 466     }
 467   fprintf(fp, "//\n");
 468   free(buf);
 469 }
 470
 471
 472
 473
 474
 475 /* Format of a GF line:
 476  *    #=GF <featurename> <text>
 477  */
 478 static int
 479 parse_gf(MSA *msa, char *buf)
 480 {
 481   char *gf;
 482   char *featurename;
 483   char *text;
 484   char *s;
 485
 486   s = buf;
 487   if ((gf          = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 488   if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 489   if ((text        = sre_strtok(&s, "\n",       NULL)) == NULL) return 0;
 490   while (*text && (*text == ' ' || *text == '\t')) text++;
 491
 492   if      (strcmp(featurename, "ID") == 0)
 493     msa->name                 = sre_strdup(text, -1);
 494   else if (strcmp(featurename, "AC") == 0)
 495     msa->acc                  = sre_strdup(text, -1);
 496   else if (strcmp(featurename, "DE") == 0)
 497     msa->desc                 = sre_strdup(text, -1);
 498   else if (strcmp(featurename, "AU") == 0)
 499     msa->au                   = sre_strdup(text, -1);
 500   else if (strcmp(featurename, "GA") == 0)
 501     {                           /* Pfam has GA1, GA2. Rfam just has GA1. */
 502       s = text;
 503       if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 504       msa->cutoff[MSA_CUTOFF_GA1]        = atof(text);
 505       msa->cutoff_is_set[MSA_CUTOFF_GA1] = TRUE;
 506       if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) {
 507         msa->cutoff[MSA_CUTOFF_GA2]        = atof(text);
 508         msa->cutoff_is_set[MSA_CUTOFF_GA2] = TRUE;
 509       }
 510     }
 511   else if (strcmp(featurename, "NC") == 0)
 512     {
 513       s = text;
 514       if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 515       msa->cutoff[MSA_CUTOFF_NC1]        = atof(text);
 516       msa->cutoff_is_set[MSA_CUTOFF_NC1] = TRUE;
 517       if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) {
 518         msa->cutoff[MSA_CUTOFF_NC2]        = atof(text);
 519         msa->cutoff_is_set[MSA_CUTOFF_NC2] = TRUE;
 520       }
 521     }
 522   else if (strcmp(featurename, "TC") == 0)
 523     {
 524       s = text;
 525       if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 526       msa->cutoff[MSA_CUTOFF_TC1]        = atof(text);
 527       msa->cutoff_is_set[MSA_CUTOFF_TC1] = TRUE;
 528       if ((text = sre_strtok(&s, WHITESPACE, NULL)) != NULL) {
 529         msa->cutoff[MSA_CUTOFF_TC2]        = atof(text);
 530         msa->cutoff_is_set[MSA_CUTOFF_TC2] = TRUE;
 531       }
 532     }
 533   else
 534     MSAAddGF(msa, featurename, text);
 535
 536   return 1;
 537 }
 538
 539
 540 /* Format of a GS line:
 541  *    #=GS <seqname> <featurename> <text>
 542  */
 543 static int
 544 parse_gs(MSA *msa, char *buf)
 545 {
 546   char *gs;
 547   char *seqname;
 548   char *featurename;
 549   char *text;
 550   int   seqidx;
 551   char *s;
 552
 553   s = buf;
 554   if ((gs          = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 555   if ((seqname     = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 556   if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 557   if ((text        = sre_strtok(&s, "\n",       NULL)) == NULL) return 0;
 558   while (*text && (*text == ' ' || *text == '\t')) text++;
 559
 560   /* GS usually follows another GS; guess lastidx+1
 561    */
 562   seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx+1);
 563   msa->lastidx = seqidx;
 564
 565   if (strcmp(featurename, "WT") == 0)
 566     {
 567       msa->wgt[seqidx]          = atof(text);
 568       msa->flags |= MSA_SET_WGT;
 569     }
 570
 571   else if (strcmp(featurename, "AC") == 0)
 572     MSASetSeqAccession(msa, seqidx, text);
 573
 574   else if (strcmp(featurename, "DE") == 0)
 575     MSASetSeqDescription(msa, seqidx, text);
 576
 577   else
 578     MSAAddGS(msa, featurename, seqidx, text);
 579
 580   return 1;
 581 }
 582
 583 /* Format of a GC line:
 584  *    #=GC <featurename> <text>
 585  */
 586 static int
 587 parse_gc(MSA *msa, char *buf)
 588 {
 589   char *gc;
 590   char *featurename;
 591   char *text;
 592   char *s;
 593   int   len;
 594
 595   s = buf;
 596   if ((gc          = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 597   if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 598   if ((text        = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0;
 599
 600   if (strcmp(featurename, "SS_cons") == 0)
 601     sre_strcat(&(msa->ss_cons), -1, text, len);
 602   else if (strcmp(featurename, "SA_cons") == 0)
 603     sre_strcat(&(msa->sa_cons), -1, text, len);
 604   else if (strcmp(featurename, "RF") == 0)
 605     sre_strcat(&(msa->rf), -1, text, len);
 606   else
 607     MSAAppendGC(msa, featurename, text);
 608
 609   return 1;
 610 }
 611
 612 /* Format of a GR line:
 613  *    #=GR <seqname> <featurename> <text>
 614  */
 615 static int
 616 parse_gr(MSA *msa, char *buf)
 617 {
 618   char *gr;
 619   char *seqname;
 620   char *featurename;
 621   char *text;
 622   int   seqidx;
 623   int   len;
 624   int   j;
 625   char *s;
 626
 627   s = buf;
 628   if ((gr          = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 629   if ((seqname     = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 630   if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 631   if ((text        = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0;
 632
 633   /* GR usually follows sequence it refers to; guess msa->lastidx */
 634   seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx);
 635   msa->lastidx = seqidx;
 636
 637   if (strcmp(featurename, "SS") == 0)
 638     {
 639       if (msa->ss == NULL)
 640         {
 641           msa->ss    = MallocOrDie(sizeof(char *) * msa->nseqalloc);
 642           msa->sslen = MallocOrDie(sizeof(int)    * msa->nseqalloc);
 643           for (j = 0; j < msa->nseqalloc; j++)
 644             {
 645               msa->ss[j]    = NULL;
 646               msa->sslen[j] = 0;
 647             }
 648         }
 649       msa->sslen[seqidx] = sre_strcat(&(msa->ss[seqidx]), msa->sslen[seqidx], text, len);
 650     }
 651   else if (strcmp(featurename, "SA") == 0)
 652     {
 653       if (msa->sa == NULL)
 654         {
 655           msa->sa    = MallocOrDie(sizeof(char *) * msa->nseqalloc);
 656           msa->salen = MallocOrDie(sizeof(int)    * msa->nseqalloc);
 657           for (j = 0; j < msa->nseqalloc; j++)
 658             {
 659               msa->sa[j]    = NULL;
 660               msa->salen[j] = 0;
 661             }
 662         }
 663       msa->salen[seqidx] = sre_strcat(&(msa->sa[seqidx]), msa->salen[seqidx], text, len);
 664     }
 665   else if (strcmp(featurename, "CO") == 0)
 666     {
 667       if (msa->co == NULL)
 668         {
 669           msa->co    = MallocOrDie(sizeof(char *) * msa->nseqalloc);
 670           msa->colen = MallocOrDie(sizeof(int)    * msa->nseqalloc);
 671           for (j = 0; j < msa->nseqalloc; j++)
 672             {
 673               msa->co[j]    = NULL;
 674               msa->colen[j] = 0;
 675             }
 676         }
 677       msa->colen[seqidx] = sre_strcat(&(msa->co[seqidx]), msa->colen[seqidx], text, len);
 678     }
 679   else
 680     MSAAppendGR(msa, featurename, seqidx, text);
 681
 682   return 1;
 683 }
 684
 685
 686 /* comments are simply stored verbatim, not parsed
 687  */
 688 static int
 689 parse_comment(MSA *msa, char *buf)
 690 {
 691   char *s;
 692   char *comment;
 693
 694   s = buf + 1;                                 /* skip leading '#' */
 695   if (*s == '\n') { *s = '\0'; comment = s; }  /* deal with blank comment */
 696   else if ((comment = sre_strtok(&s, "\n", NULL)) == NULL) return 0;
 697
 698   MSAAddComment(msa, comment);
 699   return 1;
 700 }
 701
 702 static int
 703 parse_sequence(MSA *msa, char *buf)
 704 {
 705   char *s;
 706   char *seqname;
 707   char *text;
 708   int   seqidx;
 709   int   len;
 710
 711   s = buf;
 712   if ((seqname     = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
 713   if ((text        = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0;
 714
 715   /* seq usually follows another seq; guess msa->lastidx +1 */
 716   seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx+1);
 717   msa->lastidx = seqidx;
 718
 719   msa->sqlen[seqidx] = sre_strcat(&(msa->aseq[seqidx]), msa->sqlen[seqidx], text, len);
 720   return 1;
 721 }
 722
 723
 724