forester/archive/RIO/others/hmmer/squid/selex.c

   1 /*****************************************************************
   2  * HMMER - Biological sequence analysis with profile HMMs
   3  * Copyright (C) 1992-1999 Washington University School of Medicine
   4  * All Rights Reserved
   5  *
   6  *     This source code is distributed under the terms of the
   7  *     GNU General Public License. See the files COPYING and LICENSE
   8  *     for details.
   9  *****************************************************************/
  10
  11 /* selex.c
  12  *
  13  * SRE, Mon Jun 14 11:08:38 1999
  14  * SELEX  obsolete as the preferred HMMER/SQUID format
  15  * replaced by Stockholm format
  16  * selex support retained for backwards compatibility
  17  * kludged to use the MSA interface
  18  *
  19  * SRE, Mon Jan 30 14:41:49 1995:
  20  * #=SA side chain % surface accessibility annotation supported
  21  *
  22  * SRE, Tue Nov  9 17:40:50 1993:
  23  * major revision. #= special comments and aliinfo_s optional
  24  * alignment info support added. Support for #=CS (consensus
  25  * secondary structure), #=SS (individual secondary structure),
  26  * #=RF (reference coordinate system), #=SQ (per-sequence header info),
  27  * and #=AU ("author") added.
  28  *
  29  * Fri Dec  4 17:43:24 1992, SRE:
  30  * Reading and writing aligned sequences to/from disk files.
  31  * Implements a new, broader specification of SELEX format
  32  * and supercedes alignio.c.
  33  *
  34  * SELEX format is documented in Docs/formats.tex.
  35  ****************************************************************************
  36  * RCS $Id: selex.c,v 1.1.1.1 2005/03/22 08:34:24 cmzmasek Exp $
  37  */
  38
  39 #include <stdio.h>
  40 #include <stdlib.h>
  41 #include <string.h>
  42 #include <ctype.h>
  43 #include <memory.h>
  44 #include "squid.h"
  45 #include "msa.h"
  46
  47 static int  copy_alignment_line(char *aseq, int apos, int name_rcol,
  48                                 char *buffer, int lcol, int rcol, char gapsym);
  49 static void actually_write_selex(FILE *fp, MSA *msa, int cpl);
  50
  51 static char commentsyms[] = "%#";
  52
  53 /* Function: ReadSELEX()
  54  * Date:     SRE, Sun Jun  6 18:24:09 1999 [St. Louis]
  55  *
  56  * Purpose:  Parse an alignment read from an open SELEX format
  57  *           alignment file. (SELEX is a single alignment format).
  58  *           Return the alignment, or NULL if we've already read the
  59  *           alignment or there's no alignment data in the file.
  60  *
  61  * Limitations: SELEX is the only remaining multipass parser for
  62  *           alignment files. It cannot read from gzip or from stdin.
  63  *           It Die()'s here if you try. The reason for this
  64  *           that SELEX allows space characters as gaps, so we don't
  65  *           know the borders of an alignment block until we've seen
  66  *           the whole block. I could rewrite to allow single-pass
  67  *           parsing (by storing the whole block in memory) but
  68  *           since SELEX is now legacy, why bother.
  69  *
  70  *           Note that the interface is totally kludged: fastest
  71  *           possible adaptation of old ReadSELEX() to the new
  72  *           MSA interface.
  73  *
  74  * Args:     afp  - open alignment file
  75  *
  76  * Returns:  MSA *  - an alignment object
  77  *                    caller responsible for an MSAFree()
  78  *           NULL if no alignment data.
  79  */
  80 MSA *
  81 ReadSELEX(MSAFILE *afp)
  82 {
  83   MSA     *msa;                 /* RETURN: mult seq alignment   */
  84   FILE    *fp;                  /* ptr to opened seqfile        */
  85   char   **aseqs;               /* aligned seqs                 */
  86   int      num = 0;             /* number of seqs read          */
  87   char     buffer[LINEBUFLEN];  /* input buffer for lines       */
  88   char     bufcpy[LINEBUFLEN];  /* strtok'able copy of buffer   */
  89   struct block_struc {          /** alignment data for a block: */
  90     int lcol;                   /* furthest left aligned sym    */
  91     int rcol;                   /* furthest right aligned sym   */
  92   } *blocks = NULL;
  93   int      blocknum;            /* number of blocks in file     */
  94   char    *nptr;                /* ptr to start of name on line */
  95   char    *sptr;                /* ptr into sequence on line    */
  96   int      currnum;             /* num. seqs in given block     */
  97   int      currblock;           /* index for blocks             */
  98   int      i;                   /* loop counter                 */
  99   int      seqidx;              /* counter for seqs             */
 100   int      alen;                /* length of alignment          */
 101   int      warn_names;          /* becomes TRUE if names don't match between blocks */
 102   int      headnum;             /* seqidx in per-sequence header info */
 103   int      currlen;
 104   int      count;
 105   int      have_cs = 0;
 106   int      have_rf = 0;
 107   AINFO    base_ainfo, *ainfo;  /* hack: used to be passed ptr to AINFO */
 108
 109
 110   /* Convert from MSA interface to what old ReadSELEX() did:
 111    *     - copy our open fp, rather than opening file
 112    *     - verify that we're not reading a gzip or stdin
 113    */
 114   if (feof(afp->f)) return NULL;
 115   if (afp->do_gzip || afp->do_stdin)
 116     Die("Can't read a SELEX format alignment from a pipe, stdin, or gzip'ed file");
 117   fp    = afp->f;
 118   ainfo = &base_ainfo;
 119
 120   /***************************************************
 121    * First pass across file.
 122    * Count seqs, get names, determine column info
 123    * Determine what sorts of info are active in this file.
 124    ***************************************************/
 125
 126   InitAinfo(ainfo);
 127                                 /* get first line of the block
 128                                  * (non-comment, non-blank) */
 129   do
 130     {
 131       if (fgets(buffer, LINEBUFLEN, fp) == NULL)
 132         { squid_errno = SQERR_NODATA; return 0; }
 133       strcpy(bufcpy, buffer);
 134       if (*buffer == '#')
 135         {
 136           if      (strncmp(buffer, "#=CS",    4) == 0) have_cs = 1;
 137           else if (strncmp(buffer, "#=RF",    4) == 0) have_rf = 1;
 138         }
 139     }
 140   while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL ||
 141          (strchr(commentsyms, *nptr) != NULL));
 142
 143   blocknum   = 0;
 144   warn_names = FALSE;
 145   while (!feof(fp))
 146     {
 147                                 /* allocate for info about this block. */
 148       if (blocknum == 0)
 149         blocks = (struct block_struc *) MallocOrDie (sizeof(struct block_struc));
 150       else
 151         blocks = (struct block_struc *) ReallocOrDie (blocks, (blocknum+1) * sizeof(struct block_struc));
 152       blocks[blocknum].lcol = LINEBUFLEN+1;
 153       blocks[blocknum].rcol = -1;
 154
 155       currnum = 0;
 156       while (nptr != NULL)      /* becomes NULL when this block ends. */
 157       {
 158                                 /* First block only: save names */
 159         if (blocknum == 0)
 160           {
 161             if (currnum == 0)
 162               ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO));
 163             else
 164               ainfo->sqinfo = (SQINFO *) ReallocOrDie (ainfo->sqinfo, (currnum + 1) * sizeof(SQINFO));
 165
 166             ainfo->sqinfo[currnum].flags = 0;
 167             SetSeqinfoString(&(ainfo->sqinfo[currnum]), nptr, SQINFO_NAME);
 168           }
 169         else                    /* in each additional block: check names */
 170           {
 171             if (strcmp(ainfo->sqinfo[currnum].name, nptr) != 0)
 172               warn_names = TRUE;
 173           }
 174         currnum++;
 175
 176                                 /* check rcol, lcol */
 177         if ((sptr = strtok(NULL, WHITESPACE)) != NULL)
 178           {
 179                                 /* is this the furthest left we've
 180                                    seen word 2 in this block? */
 181             if (sptr - bufcpy < blocks[blocknum].lcol)
 182               blocks[blocknum].lcol = sptr - bufcpy;
 183                                 /* look for right side in buffer */
 184             for (sptr = buffer + strlen(buffer) - 1;
 185                  strchr(WHITESPACE, *sptr) != NULL;
 186                  sptr --)
 187               /* do nothing */ ;
 188             if (sptr - buffer > blocks[blocknum].rcol)
 189               blocks[blocknum].rcol = sptr - buffer;
 190           }
 191
 192                                 /* get the next line; blank line means end of block */
 193         do
 194           {
 195             if (fgets(buffer, LINEBUFLEN, fp) == NULL)
 196               { nptr = NULL; break; }
 197             strcpy(bufcpy, buffer);
 198
 199             if      (strncmp(buffer, "#=SS",    4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SS;
 200             else if (strncmp(buffer, "#=SA",    4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SA;
 201             else if (strncmp(buffer, "#=CS",    4) == 0) have_cs = 1;
 202             else if (strncmp(buffer, "#=RF",    4) == 0) have_rf = 1;
 203
 204             if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL)
 205               break;
 206           } while (strchr(commentsyms, *nptr) != NULL);
 207       }
 208
 209
 210                                 /* check that number of sequences matches expected */
 211       if (blocknum == 0)
 212         num = currnum;
 213       else if (currnum != num)
 214         Die("Parse error in ReadSELEX()");
 215       blocknum++;
 216
 217                                 /* get first line of next block
 218                                  * (non-comment, non-blank) */
 219       do
 220         {
 221           if (fgets(buffer, LINEBUFLEN, fp) == NULL) { nptr = NULL; break; }
 222           strcpy(bufcpy, buffer);
 223         }
 224       while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL ||
 225              (strchr(commentsyms, *nptr) != NULL));
 226     }
 227
 228
 229   /***************************************************
 230    * Get ready for second pass:
 231    *   figure out the length of the alignment
 232    *   malloc space
 233    *   rewind the file
 234    ***************************************************/
 235
 236   alen = 0;
 237   for (currblock = 0; currblock < blocknum; currblock++)
 238     alen += blocks[currblock].rcol - blocks[currblock].lcol + 1;
 239
 240   rewind(fp);
 241
 242   /* allocations. we can't use AllocateAlignment because of
 243    * the way we already used ainfo->sqinfo.
 244    */
 245   aseqs     = (char **) MallocOrDie (num * sizeof(char *));
 246   if (have_cs)
 247     ainfo->cs = (char *) MallocOrDie ((alen+1) * sizeof(char));
 248   if (have_rf)
 249     ainfo->rf = (char *) MallocOrDie ((alen+1) * sizeof(char));
 250
 251
 252
 253   for (i = 0; i < num; i++)
 254     {
 255       aseqs[i]     = (char *) MallocOrDie ((alen+1) * sizeof(char));
 256       if (ainfo->sqinfo[i].flags & SQINFO_SS)
 257         ainfo->sqinfo[i].ss = (char *) MallocOrDie ((alen+1) * sizeof(char));
 258       if (ainfo->sqinfo[i].flags & SQINFO_SA)
 259         ainfo->sqinfo[i].sa = (char *) MallocOrDie ((alen+1) * sizeof(char));
 260     }
 261
 262   ainfo->alen = alen;
 263   ainfo->nseq = num;
 264   ainfo->wgt  = (float *) MallocOrDie (sizeof(float) * num);
 265   FSet(ainfo->wgt, num, 1.0);
 266
 267   /***************************************************
 268    * Second pass across file. Parse header; assemble sequences
 269    ***************************************************/
 270   /* We've now made a complete first pass over the file. We know how
 271    * many blocks it contains, we know the number of seqs in the first
 272    * block, and we know every block has the same number of blocks;
 273    * so we can be a bit more cavalier about error-checking as we
 274    * make the second pass.
 275    */
 276
 277   /* Look for header
 278    */
 279   headnum = 0;
 280   for (;;)
 281     {
 282       if (fgets(buffer, LINEBUFLEN, fp) == NULL)
 283         Die("Parse error in ReadSELEX()");
 284       strcpy(bufcpy, buffer);
 285       if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* skip blank lines */
 286
 287       if (strcmp(nptr, "#=AU") == 0  && (sptr = strtok(NULL, "\n")) != NULL)
 288         ainfo->au = Strdup(sptr);
 289       else if (strcmp(nptr, "#=ID") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
 290         ainfo->name = Strdup(sptr);
 291       else if (strcmp(nptr, "#=AC") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
 292         ainfo->acc  = Strdup(sptr);
 293       else if (strcmp(nptr, "#=DE") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
 294         ainfo->desc = Strdup(sptr);
 295       else if (strcmp(nptr, "#=GA") == 0)
 296         {
 297           if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
 298             Die("Parse error in #=GA line in ReadSELEX()");
 299           ainfo->ga1 = atof(sptr);
 300
 301           if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
 302             Die("Parse error in #=GA line in ReadSELEX()");
 303           ainfo->ga2 = atof(sptr);
 304
 305           ainfo->flags |= AINFO_GA;
 306         }
 307       else if (strcmp(nptr, "#=TC") == 0)
 308         {
 309           if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
 310             Die("Parse error in #=TC line in ReadSELEX()");
 311           ainfo->tc1 = atof(sptr);
 312
 313           if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
 314             Die("Parse error in #=TC line in ReadSELEX()");
 315           ainfo->tc2 = atof(sptr);
 316
 317           ainfo->flags |= AINFO_TC;
 318         }
 319       else if (strcmp(nptr, "#=NC") == 0)
 320         {
 321           if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
 322             Die("Parse error in #=NC line in ReadSELEX()");
 323           ainfo->nc1 = atof(sptr);
 324
 325           if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
 326             Die("Parse error in #=NC line in ReadSELEX()");
 327           ainfo->nc2 = atof(sptr);
 328
 329           ainfo->flags |= AINFO_NC;
 330         }
 331       else if (strcmp(nptr, "#=SQ") == 0)      /* per-sequence header info */
 332         {
 333                                 /* first field is the name */
 334           if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
 335             Die("Parse error in #=SQ line in ReadSELEX()");
 336           if (strcmp(sptr, ainfo->sqinfo[headnum].name) != 0) warn_names = TRUE;
 337
 338                                 /* second field is the weight */
 339           if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
 340             Die("Parse error in #=SQ line in ReadSELEX()");
 341           if (!IsReal(sptr))
 342             Die("Parse error in #=SQ line in ReadSELEX(): weight is not a number");
 343           ainfo->wgt[headnum] = atof(sptr);
 344
 345                                 /* third field is database source id */
 346           if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
 347             Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
 348           SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ID);
 349
 350                                 /* fourth field is database accession number */
 351           if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
 352             Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
 353           SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ACC);
 354
 355                                 /* fifth field is start..stop::olen */
 356           if ((sptr = strtok(NULL, ".:")) == NULL)
 357             Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
 358           SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_START);
 359
 360           if ((sptr = strtok(NULL, ".:")) == NULL)
 361             Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
 362           SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_STOP);
 363
 364           if ((sptr = strtok(NULL, ":\t ")) == NULL)
 365             Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
 366           SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_OLEN);
 367
 368                                 /* rest of line is optional description */
 369           if ((sptr = strtok(NULL, "\n")) != NULL)
 370             SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_DESC);
 371
 372           headnum++;
 373         }
 374       else if (strcmp(nptr, "#=CS") == 0) break;
 375       else if (strcmp(nptr, "#=RF") == 0) break;
 376       else if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment, non-header */
 377     }
 378
 379
 380   currlen = 0;
 381   for (currblock = 0 ; currblock < blocknum; currblock++)
 382     {
 383                                 /* parse the block */
 384       seqidx = 0;
 385       while (nptr != NULL)
 386         {
 387                                 /* Consensus structure */
 388           if (strcmp(nptr, "#=CS") == 0)
 389             {
 390               if (! copy_alignment_line(ainfo->cs, currlen, strlen(nptr)-1,
 391                                         buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
 392                 Die("Parse error in #=CS line in ReadSELEX()");
 393             }
 394
 395                                 /* Reference coordinates */
 396           else if (strcmp(nptr, "#=RF") == 0)
 397             {
 398               if (! copy_alignment_line(ainfo->rf, currlen, strlen(nptr)-1,
 399                                         buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
 400                 Die("Parse error in #=RF line in ReadSELEX()");
 401             }
 402                                 /* Individual secondary structure */
 403           else if (strcmp(nptr, "#=SS") == 0)
 404             {
 405               if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].ss, currlen, strlen(nptr)-1,
 406                                         buffer, blocks[currblock].lcol,
 407                                         blocks[currblock].rcol, (char) '.'))
 408                 Die("Parse error in #=SS line in ReadSELEX()");
 409             }
 410
 411                                 /* Side chain % surface accessibility code */
 412           else if (strcmp(nptr, "#=SA") == 0)
 413             {
 414               if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].sa, currlen, strlen(nptr)-1,
 415                                         buffer, blocks[currblock].lcol,
 416                                         blocks[currblock].rcol, (char) '.'))
 417                 Die("Parse error in #=SA line in ReadSELEX()");
 418             }
 419                                 /* Aligned sequence; avoid unparsed machine comments */
 420           else if (strncmp(nptr, "#=", 2) != 0)
 421             {
 422               if (! copy_alignment_line(aseqs[seqidx], currlen, strlen(nptr)-1,
 423                                         buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
 424                 Die("Parse error in alignment line in ReadSELEX()");
 425               seqidx++;
 426             }
 427
 428                                 /* get next line */
 429           for (;;)
 430             {
 431               nptr = NULL;
 432               if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* EOF */
 433               strcpy(bufcpy, buffer);
 434               if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) break; /* blank */
 435               if (strncmp(buffer, "#=", 2) == 0) break;      /* machine comment */
 436               if (strchr(commentsyms, *nptr) == NULL) break; /* data */
 437             }
 438         } /* end of a block */
 439
 440       currlen += blocks[currblock].rcol - blocks[currblock].lcol + 1;
 441
 442                                 /* get line 1 of next block */
 443       for (;;)
 444         {
 445           if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* no data */
 446           strcpy(bufcpy, buffer);
 447           if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* blank */
 448           if (strncmp(buffer, "#=", 2) == 0)       break; /* machine comment */
 449           if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment */
 450         }
 451     } /* end of the file */
 452
 453   /* Lengths in sqinfo are for raw sequence (ungapped),
 454    * and SS, SA are 0..rlen-1 not 0..alen-1.
 455    * Only the seqs with structures come out of here with lengths set.
 456    */
 457   for (seqidx = 0; seqidx < num; seqidx++)
 458     {
 459       int apos, rpos;
 460                                 /* secondary structures */
 461       if (ainfo->sqinfo[seqidx].flags & SQINFO_SS)
 462         {
 463           for (apos = rpos = 0; apos < alen; apos++)
 464             if (! isgap(aseqs[seqidx][apos]))
 465               {
 466                 ainfo->sqinfo[seqidx].ss[rpos] = ainfo->sqinfo[seqidx].ss[apos];
 467                 rpos++;
 468               }
 469           ainfo->sqinfo[seqidx].ss[rpos] = '\0';
 470         }
 471                                 /* Surface accessibility */
 472       if (ainfo->sqinfo[seqidx].flags & SQINFO_SA)
 473         {
 474           for (apos = rpos = 0; apos < alen; apos++)
 475             if (! isgap(aseqs[seqidx][apos]))
 476               {
 477                 ainfo->sqinfo[seqidx].sa[rpos] = ainfo->sqinfo[seqidx].sa[apos];
 478                 rpos++;
 479               }
 480           ainfo->sqinfo[seqidx].sa[rpos] = '\0';
 481         }
 482     }
 483
 484                                 /* NULL-terminate all the strings */
 485   if (ainfo->rf != NULL) ainfo->rf[alen] = '\0';
 486   if (ainfo->cs != NULL) ainfo->cs[alen] = '\0';
 487   for (seqidx = 0; seqidx < num; seqidx++)
 488     aseqs[seqidx][alen]            = '\0';
 489
 490                                 /* find raw sequence lengths for sqinfo */
 491   for (seqidx = 0; seqidx < num; seqidx++)
 492     {
 493       count = 0;
 494       for (sptr = aseqs[seqidx]; *sptr != '\0'; sptr++)
 495         if (!isgap(*sptr)) count++;
 496       ainfo->sqinfo[seqidx].len    = count;
 497       ainfo->sqinfo[seqidx].flags |= SQINFO_LEN;
 498     }
 499
 500
 501   /***************************************************
 502    * Garbage collection and return
 503    ***************************************************/
 504   free(blocks);
 505   if (warn_names)
 506     Warn("sequences may be in different orders in blocks of %s?", afp->fname);
 507
 508   /* Convert back to MSA structure. (Wasteful kludge.)
 509    */
 510   msa = MSAFromAINFO(aseqs, ainfo);
 511   MSAVerifyParse(msa);
 512   FreeAlignment(aseqs, ainfo);
 513   return msa;
 514 }
 515
 516
 517 /* Function: WriteSELEX()
 518  * Date:     SRE, Mon Jun 14 13:13:14 1999 [St. Louis]
 519  *
 520  * Purpose:  Write a SELEX file in multiblock format.
 521  *
 522  * Args:     fp  - file that's open for writing
 523  *           msa - multiple sequence alignment object
 524  *
 525  * Returns:  (void)
 526  */
 527 void
 528 WriteSELEX(FILE *fp, MSA *msa)
 529 {
 530   actually_write_selex(fp, msa, 50); /* 50 char per block */
 531 }
 532
 533 /* Function: WriteSELEXOneBlock()
 534  * Date:     SRE, Mon Jun 14 13:14:56 1999 [St. Louis]
 535  *
 536  * Purpose:  Write a SELEX alignment file in Pfam's single-block
 537  *           format style. A wrapper for actually_write_selex().
 538  *
 539  * Args:     fp - file that's open for writing
 540  *           msa- alignment to write
 541  *
 542  * Returns:  (void)
 543  */
 544 void
 545 WriteSELEXOneBlock(FILE *fp, MSA *msa)
 546 {
 547   actually_write_selex(fp, msa, msa->alen); /* one big block */
 548 }
 549
 550
 551 /* Function: actually_write_selex()
 552  * Date:     SRE, Mon Jun 14 12:54:46 1999 [St. Louis]
 553  *
 554  * Purpose:  Write an alignment in SELEX format to an open
 555  *           file. This is the function that actually does
 556  *           the work. The API's WriteSELEX() and
 557  *           WriteSELEXOneBlock() are wrappers.
 558  *
 559  * Args:     fp  - file that's open for writing
 560  *           msa - alignment to write
 561  *           cpl - characters to write per line in alignment block
 562  *
 563  * Returns:  (void)
 564  */
 565 static void
 566 actually_write_selex(FILE *fp, MSA *msa, int cpl)
 567 {
 568   int  i;
 569   int  len = 0;
 570   int  namewidth;
 571   char *buf;
 572   int  currpos;
 573
 574   buf = malloc(sizeof(char) * (cpl+101)); /* 100 chars allowed for name, etc. */
 575
 576   /* Figure out how much space we need for name + markup
 577    * to keep the alignment in register, for easier human viewing --
 578    * even though Stockholm format doesn't care about visual
 579    * alignment.
 580    */
 581   namewidth = 0;
 582   for (i = 0; i < msa->nseq; i++)
 583     if ((len = strlen(msa->sqname[i])) > namewidth)
 584       namewidth = len;
 585   if (namewidth < 6) namewidth = 6; /* minimum space for markup tags */
 586
 587   /* Free text comments
 588    */
 589   for (i = 0;  i < msa->ncomment; i++)
 590     fprintf(fp, "# %s\n", msa->comment[i]);
 591   if (msa->ncomment > 0) fprintf(fp, "\n");
 592
 593   /* Per-file annotation
 594    */
 595   if (msa->name  != NULL)       fprintf(fp, "#=ID %s\n", msa->name);
 596   if (msa->acc   != NULL)       fprintf(fp, "#=AC %s\n", msa->acc);
 597   if (msa->desc  != NULL)       fprintf(fp, "#=DE %s\n", msa->desc);
 598   if (msa->au    != NULL)       fprintf(fp, "#=AU %s\n", msa->au);
 599   if (msa->flags & MSA_SET_GA)  fprintf(fp, "#=GA %.1f %.1f\n", msa->ga1, msa->ga2);
 600   if (msa->flags & MSA_SET_NC)  fprintf(fp, "#=NC %.1f %.1f\n", msa->nc1, msa->nc2);
 601   if (msa->flags & MSA_SET_TC)  fprintf(fp, "#=TC %.1f %.1f\n", msa->tc1, msa->tc2);
 602
 603   /* Per-sequence annotation
 604    */
 605   for (i = 0; i < msa->nseq; i++)
 606     fprintf(fp, "#=SQ %-*.*s %6.4f %s %s %d..%d::%d %s\n",
 607             namewidth, namewidth, msa->sqname[i],
 608             msa->wgt[i],
 609             "-",                /* MSA has no ID field */
 610             (msa->sqacc != NULL && msa->sqacc[i] != NULL) ? msa->sqacc[i] : "-",
 611             0, 0, 0,            /* MSA has no start, stop, olen field */
 612             (msa->sqdesc != NULL && msa->sqdesc[i] != NULL) ? msa->sqdesc[i] : "-");
 613   fprintf(fp, "\n");
 614
 615   /* Alignment section:
 616    */
 617   for (currpos = 0; currpos < msa->alen; currpos += cpl)
 618     {
 619       if (currpos > 0) fprintf(fp, "\n");
 620
 621       if (msa->ss_cons != NULL) {
 622         strncpy(buf, msa->ss_cons + currpos, cpl);
 623         buf[cpl] = '\0';
 624         fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, "#=CS", buf);
 625       }
 626       if (msa->rf != NULL) {
 627         strncpy(buf, msa->rf + currpos, cpl);
 628         buf[cpl] = '\0';
 629         fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, "#=RF", buf);
 630       }
 631       for (i = 0; i < msa->nseq; i++)
 632         {
 633           strncpy(buf, msa->aseq[i] + currpos, cpl);
 634           buf[cpl] = '\0';
 635           fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, msa->sqname[i], buf);
 636
 637           if (msa->ss != NULL && msa->ss[i] != NULL) {
 638             strncpy(buf, msa->ss[i] + currpos, cpl);
 639             buf[cpl] = '\0';
 640             fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, "#=SS", buf);
 641           }
 642           if (msa->sa != NULL && msa->sa[i] != NULL) {
 643             strncpy(buf, msa->sa[i] + currpos, cpl);
 644             buf[cpl] = '\0';
 645             fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, "#=SA", buf);
 646           }
 647         }
 648     }
 649   free(buf);
 650 }
 651
 652
 653 /* Function: copy_alignment_line()
 654  *
 655  * Purpose:  Given a line from an alignment file, and bounds lcol,rcol
 656  *           on what part of it may be sequence, save the alignment into
 657  *           aseq starting at position apos.
 658  *
 659  *           name_rcol is set to the rightmost column this aseqs's name
 660  *           occupies; if name_rcol >= lcol, we have a special case in
 661  *           which the name intrudes into the sequence zone.
 662  */
 663 static int
 664 copy_alignment_line(char *aseq, int apos, int name_rcol,
 665                     char *buffer, int lcol, int rcol, char gapsym)
 666 {
 667   char *s1, *s2;
 668   int   i;
 669
 670   s1 = aseq + apos;
 671   s2 = buffer;                  /* be careful that buffer doesn't end before lcol! */
 672   for (i = 0; i < lcol; i++)
 673     if (*s2) s2++;
 674
 675   for (i = lcol; i <= rcol; i++)
 676     {
 677       if (*s2 == '\t') {
 678         Warn("TAB characters will corrupt a SELEX alignment! Please remove them first.");
 679         return 0;
 680       }
 681       if (name_rcol >= i)       /* name intrusion special case: pad left w/ gaps */
 682         *s1 = gapsym;
 683                                 /* short buffer special case: pad right w/ gaps  */
 684       else if (*s2 == '\0' || *s2 == '\n')
 685         *s1 = gapsym;
 686
 687       else if (*s2 == ' ')      /* new: disallow spaces as gap symbols */
 688         *s1 = gapsym;
 689
 690       else                      /* normal case: copy buffer into aseq */
 691         *s1 = *s2;
 692
 693       s1++;
 694       if (*s2) s2++;
 695     }
 696   return 1;
 697 }
 698
 699
 700
 701
 702
 703 /* Function: DealignAseqs()
 704  *
 705  * Given an array of (num) aligned sequences aseqs,
 706  * strip the gaps. Store the raw sequences in a new allocated array.
 707  *
 708  * Caller is responsible for free'ing the memory allocated to
 709  * rseqs.
 710  *
 711  * Returns 1 on success. Returns 0 and sets squid_errno on
 712  * failure.
 713  */
 714 int
 715 DealignAseqs(char **aseqs, int num, char ***ret_rseqs)
 716 {
 717   char **rseqs;                 /* de-aligned sequence array   */
 718   int    idx;                   /* counter for sequences       */
 719   int    depos;                 /* position counter for dealigned seq*/
 720   int    apos;                  /* position counter for aligned seq */
 721   int    seqlen;                /* length of aligned seq */
 722
 723                                 /* alloc space */
 724   rseqs = (char **) MallocOrDie (num * sizeof(char *));
 725                                 /* main loop */
 726   for (idx = 0; idx < num; idx++)
 727     {
 728       seqlen = strlen(aseqs[idx]);
 729                                 /* alloc space */
 730       rseqs[idx] = (char *) MallocOrDie ((seqlen + 1) * sizeof(char));
 731
 732                                 /* strip gaps */
 733       depos = 0;
 734       for (apos = 0; aseqs[idx][apos] != '\0'; apos++)
 735         if (!isgap(aseqs[idx][apos]))
 736           {
 737             rseqs[idx][depos] = aseqs[idx][apos];
 738             depos++;
 739           }
 740       rseqs[idx][depos] = '\0';
 741     }
 742   *ret_rseqs = rseqs;
 743   return 1;
 744 }
 745
 746
 747 /* Function: IsSELEXFormat()
 748  *
 749  * Return TRUE if filename may be in SELEX format.
 750  *
 751  * Accuracy is sacrificed for speed; a TRUE return does
 752  * *not* guarantee that the file will pass the stricter
 753  * error-checking of ReadSELEX(). All it checks is that
 754  * the first 500 non-comment lines of a file are
 755  * blank, or if there's a second "word" on the line
 756  * it looks like sequence (i.e., it's not kOtherSeq).
 757  *
 758  * Returns TRUE or FALSE.
 759  */
 760 int
 761 IsSELEXFormat(char *filename)
 762 {
 763   FILE *fp;                     /* ptr to open sequence file */
 764   char  buffer[LINEBUFLEN];
 765   char *sptr;                   /* ptr to first word          */
 766   int   linenum;
 767
 768
 769   if ((fp = fopen(filename, "r")) == NULL)
 770     { squid_errno = SQERR_NOFILE; return 0; }
 771
 772   linenum = 0;
 773   while (linenum < 500 &&
 774          fgets(buffer, LINEBUFLEN, fp) != NULL)
 775     {
 776       linenum++;
 777                                 /* dead giveaways for extended SELEX */
 778       if      (strncmp(buffer, "#=AU", 4) == 0) goto DONE;
 779       else if (strncmp(buffer, "#=ID", 4) == 0) goto DONE;
 780       else if (strncmp(buffer, "#=AC", 4) == 0) goto DONE;
 781       else if (strncmp(buffer, "#=DE", 4) == 0) goto DONE;
 782       else if (strncmp(buffer, "#=GA", 4) == 0) goto DONE;
 783       else if (strncmp(buffer, "#=TC", 4) == 0) goto DONE;
 784       else if (strncmp(buffer, "#=NC", 4) == 0) goto DONE;
 785       else if (strncmp(buffer, "#=SQ", 4) == 0) goto DONE;
 786       else if (strncmp(buffer, "#=SS", 4) == 0) goto DONE;
 787       else if (strncmp(buffer, "#=CS", 4) == 0) goto DONE;
 788       else if (strncmp(buffer, "#=RF", 4) == 0) goto DONE;
 789
 790                                 /* a comment? */
 791       if (strchr(commentsyms, *buffer) != NULL) continue;
 792
 793                                 /* a blank line? */
 794       if ((sptr = strtok(buffer, WHITESPACE)) == NULL) continue;
 795
 796                                 /* a one-word line (name only)
 797                                    is possible, though rare */
 798       if ((sptr = strtok(NULL, "\n")) == NULL) continue;
 799
 800       if (Seqtype(sptr) == kOtherSeq) {fclose(fp); return 0;}
 801     }
 802
 803  DONE:
 804   fclose(fp);
 805   return 1;
 806 }
 807
 808
 809
 810
 811
 812
 813
 814