website/archive/binaries/mac/src/clustalo/src/squid/selex.c

   1 /*****************************************************************
   2  * SQUID - a library of functions for biological sequence analysis
   3  * Copyright (C) 1992-2002 Washington University School of Medicine
   4  *
   5  *     This source code is freely distributed under the terms of the
   6  *     GNU General Public License. See the files COPYRIGHT and LICENSE
   7  *     for details.
   8  *****************************************************************/
   9
  10 /* selex.c
  11  *
  12  * SRE, Mon Jun 14 11:08:38 1999
  13  * SELEX  obsolete as the preferred HMMER/SQUID format
  14  * replaced by Stockholm format
  15  * selex support retained for backwards compatibility
  16  * kludged to use the MSA interface
  17  *
  18  * SRE, Mon Jan 30 14:41:49 1995:
  19  * #=SA side chain % surface accessibility annotation supported
  20  *
  21  * SRE, Tue Nov  9 17:40:50 1993:
  22  * major revision. #= special comments and aliinfo_s optional
  23  * alignment info support added. Support for #=CS (consensus
  24  * secondary structure), #=SS (individual secondary structure),
  25  * #=RF (reference coordinate system), #=SQ (per-sequence header info),
  26  * and #=AU ("author") added.
  27  *
  28  * Fri Dec  4 17:43:24 1992, SRE:
  29  * Reading and writing aligned sequences to/from disk files.
  30  * Implements a new, broader specification of SELEX format
  31  * and supercedes alignio.c.
  32  *
  33  * SELEX format is documented in Docs/formats.tex.
  34  ****************************************************************************
  35  * RCS $Id: selex.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: selex.c,v 1.11 2002/10/12 04:40:35 eddy Exp)
  36  */
  37
  38 #include <stdio.h>
  39 #include <stdlib.h>
  40 #include <string.h>
  41 #include <ctype.h>
  42 #include <memory.h>
  43 #include "squid.h"
  44 #include "msa.h"
  45
  46 static int  copy_alignment_line(char *aseq, int apos, int name_rcol,
  47                                 char *buffer, int lcol, int rcol, char gapsym);
  48 static void actually_write_selex(FILE *fp, MSA *msa, int cpl);
  49
  50 static char commentsyms[] = "%#";
  51
  52 /* Function: ReadSELEX()
  53  * Date:     SRE, Sun Jun  6 18:24:09 1999 [St. Louis]
  54  *
  55  * Purpose:  Parse an alignment read from an open SELEX format
  56  *           alignment file. (SELEX is a single alignment format).
  57  *           Return the alignment, or NULL if we've already read the
  58  *           alignment or there's no alignment data in the file.
  59  *
  60  * Limitations: SELEX is the only remaining multipass parser for
  61  *           alignment files. It cannot read from gzip or from stdin.
  62  *           It Die()'s here if you try. The reason for this
  63  *           that SELEX allows space characters as gaps, so we don't
  64  *           know the borders of an alignment block until we've seen
  65  *           the whole block. I could rewrite to allow single-pass
  66  *           parsing (by storing the whole block in memory) but
  67  *           since SELEX is now legacy, why bother.
  68  *
  69  *           Note that the interface is totally kludged: fastest
  70  *           possible adaptation of old ReadSELEX() to the new
  71  *           MSA interface.
  72  *
  73  * Args:     afp  - open alignment file
  74  *
  75  * Returns:  MSA *  - an alignment object
  76  *                    caller responsible for an MSAFree()
  77  *           NULL if no alignment data.
  78  */
  79 MSA *
  80 ReadSELEX(MSAFILE *afp)
  81 {
  82   MSA     *msa;                 /* RETURN: mult seq alignment   */
  83   FILE    *fp;                  /* ptr to opened seqfile        */
  84   char   **aseqs;               /* aligned seqs                 */
  85   int      num = 0;             /* number of seqs read          */
  86   char     buffer[LINEBUFLEN];  /* input buffer for lines       */
  87   char     bufcpy[LINEBUFLEN];  /* strtok'able copy of buffer   */
  88   struct block_struc {          /** alignment data for a block: */
  89     int lcol;                   /* furthest left aligned sym    */
  90     int rcol;                   /* furthest right aligned sym   */
  91   } *blocks = NULL;
  92   int      blocknum;            /* number of blocks in file     */
  93   char    *nptr;                /* ptr to start of name on line */
  94   char    *sptr;                /* ptr into sequence on line    */
  95   int      currnum;             /* num. seqs in given block     */
  96   int      currblock;           /* index for blocks             */
  97   int      i;                   /* loop counter                 */
  98   int      seqidx;              /* counter for seqs             */
  99   int      alen;                /* length of alignment          */
 100   int      warn_names;          /* becomes TRUE if names don't match between blocks */
 101   int      headnum;             /* seqidx in per-sequence header info */
 102   int      currlen;
 103   int      count;
 104   int      have_cs = 0;
 105   int      have_rf = 0;
 106   AINFO    base_ainfo, *ainfo;  /* hack: used to be passed ptr to AINFO */
 107
 108
 109   /* Convert from MSA interface to what old ReadSELEX() did:
 110    *     - copy our open fp, rather than opening file
 111    *     - verify that we're not reading a gzip or stdin
 112    */
 113   if (feof(afp->f)) return NULL;
 114   if (afp->do_gzip || afp->do_stdin)
 115     Die("Can't read a SELEX format alignment from a pipe, stdin, or gzip'ed file");
 116   fp    = afp->f;
 117   ainfo = &base_ainfo;
 118
 119   /***************************************************
 120    * First pass across file.
 121    * Count seqs, get names, determine column info
 122    * Determine what sorts of info are active in this file.
 123    ***************************************************/
 124
 125   InitAinfo(ainfo);
 126                                 /* get first line of the block
 127                                  * (non-comment, non-blank) */
 128   do
 129     {
 130       if (fgets(buffer, LINEBUFLEN, fp) == NULL)
 131         { squid_errno = SQERR_NODATA; return 0; }
 132       strcpy(bufcpy, buffer);
 133       if (*buffer == '#')
 134         {
 135           if      (strncmp(buffer, "#=CS",    4) == 0) have_cs = 1;
 136           else if (strncmp(buffer, "#=RF",    4) == 0) have_rf = 1;
 137         }
 138     }
 139   while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL ||
 140          (strchr(commentsyms, *nptr) != NULL));
 141
 142   blocknum   = 0;
 143   warn_names = FALSE;
 144   while (!feof(fp))
 145     {
 146                                 /* allocate for info about this block. */
 147       if (blocknum == 0)
 148         blocks = (struct block_struc *) MallocOrDie (sizeof(struct block_struc));
 149       else
 150         blocks = (struct block_struc *) ReallocOrDie (blocks, (blocknum+1) * sizeof(struct block_struc));
 151       blocks[blocknum].lcol = LINEBUFLEN+1;
 152       blocks[blocknum].rcol = -1;
 153
 154       currnum = 0;
 155       while (nptr != NULL)      /* becomes NULL when this block ends. */
 156       {
 157                                 /* First block only: save names */
 158         if (blocknum == 0)
 159           {
 160             if (currnum == 0)
 161               ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO));
 162             else
 163               ainfo->sqinfo = (SQINFO *) ReallocOrDie (ainfo->sqinfo, (currnum + 1) * sizeof(SQINFO));
 164
 165             ainfo->sqinfo[currnum].flags = 0;
 166             SetSeqinfoString(&(ainfo->sqinfo[currnum]), nptr, SQINFO_NAME);
 167           }
 168         else                    /* in each additional block: check names */
 169           {
 170             if (strcmp(ainfo->sqinfo[currnum].name, nptr) != 0)
 171               warn_names = TRUE;
 172           }
 173         currnum++;
 174
 175                                 /* check rcol, lcol */
 176         if ((sptr = strtok(NULL, WHITESPACE)) != NULL)
 177           {
 178                                 /* is this the furthest left we've
 179                                    seen word 2 in this block? */
 180             if (sptr - bufcpy < blocks[blocknum].lcol)
 181               blocks[blocknum].lcol = sptr - bufcpy;
 182                                 /* look for right side in buffer */
 183             for (sptr = buffer + strlen(buffer) - 1;
 184                  strchr(WHITESPACE, *sptr) != NULL;
 185                  sptr --)
 186               /* do nothing */ ;
 187             if (sptr - buffer > blocks[blocknum].rcol)
 188               blocks[blocknum].rcol = sptr - buffer;
 189           }
 190
 191                                 /* get the next line; blank line means end of block */
 192         do
 193           {
 194             if (fgets(buffer, LINEBUFLEN, fp) == NULL)
 195               { nptr = NULL; break; }
 196             strcpy(bufcpy, buffer);
 197
 198             if      (strncmp(buffer, "#=SS",    4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SS;
 199             else if (strncmp(buffer, "#=SA",    4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SA;
 200             else if (strncmp(buffer, "#=CS",    4) == 0) have_cs = 1;
 201             else if (strncmp(buffer, "#=RF",    4) == 0) have_rf = 1;
 202
 203             if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL)
 204               break;
 205           } while (strchr(commentsyms, *nptr) != NULL);
 206       }
 207
 208
 209                                 /* check that number of sequences matches expected */
 210       if (blocknum == 0)
 211         num = currnum;
 212       else if (currnum != num)
 213         Die("Parse error in ReadSELEX()");
 214       blocknum++;
 215
 216                                 /* get first line of next block
 217                                  * (non-comment, non-blank) */
 218       do
 219         {
 220           if (fgets(buffer, LINEBUFLEN, fp) == NULL) { nptr = NULL; break; }
 221           strcpy(bufcpy, buffer);
 222         }
 223       while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL ||
 224              (strchr(commentsyms, *nptr) != NULL));
 225     }
 226
 227
 228   /***************************************************
 229    * Get ready for second pass:
 230    *   figure out the length of the alignment
 231    *   malloc space
 232    *   rewind the file
 233    ***************************************************/
 234
 235   alen = 0;
 236   for (currblock = 0; currblock < blocknum; currblock++)
 237     alen += blocks[currblock].rcol - blocks[currblock].lcol + 1;
 238
 239   rewind(fp);
 240
 241   /* allocations. we can't use AllocateAlignment because of
 242    * the way we already used ainfo->sqinfo.
 243    */
 244   aseqs     = (char **) MallocOrDie (num * sizeof(char *));
 245   if (have_cs)
 246     ainfo->cs = (char *) MallocOrDie ((alen+1) * sizeof(char));
 247   if (have_rf)
 248     ainfo->rf = (char *) MallocOrDie ((alen+1) * sizeof(char));
 249
 250
 251
 252   for (i = 0; i < num; i++)
 253     {
 254       aseqs[i]     = (char *) MallocOrDie ((alen+1) * sizeof(char));
 255       if (ainfo->sqinfo[i].flags & SQINFO_SS)
 256         ainfo->sqinfo[i].ss = (char *) MallocOrDie ((alen+1) * sizeof(char));
 257       if (ainfo->sqinfo[i].flags & SQINFO_SA)
 258         ainfo->sqinfo[i].sa = (char *) MallocOrDie ((alen+1) * sizeof(char));
 259     }
 260
 261   ainfo->alen = alen;
 262   ainfo->nseq = num;
 263   ainfo->wgt  = (float *) MallocOrDie (sizeof(float) * num);
 264   FSet(ainfo->wgt, num, 1.0);
 265
 266   /***************************************************
 267    * Second pass across file. Parse header; assemble sequences
 268    ***************************************************/
 269   /* We've now made a complete first pass over the file. We know how
 270    * many blocks it contains, we know the number of seqs in the first
 271    * block, and we know every block has the same number of blocks;
 272    * so we can be a bit more cavalier about error-checking as we
 273    * make the second pass.
 274    */
 275
 276   /* Look for header
 277    */
 278   headnum = 0;
 279   for (;;)
 280     {
 281       if (fgets(buffer, LINEBUFLEN, fp) == NULL)
 282         Die("Parse error in ReadSELEX()");
 283       strcpy(bufcpy, buffer);
 284       if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* skip blank lines */
 285
 286       if (strcmp(nptr, "#=AU") == 0  && (sptr = strtok(NULL, "\n")) != NULL)
 287         ainfo->au = Strdup(sptr);
 288       else if (strcmp(nptr, "#=ID") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
 289         ainfo->name = Strdup(sptr);
 290       else if (strcmp(nptr, "#=AC") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
 291         ainfo->acc  = Strdup(sptr);
 292       else if (strcmp(nptr, "#=DE") == 0 && (sptr = strtok(NULL, "\n")) != NULL)
 293         ainfo->desc = Strdup(sptr);
 294       else if (strcmp(nptr, "#=GA") == 0)
 295         {
 296           if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
 297             Die("Parse error in #=GA line in ReadSELEX()");
 298           ainfo->ga1 = atof(sptr);
 299
 300           if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
 301             Die("Parse error in #=GA line in ReadSELEX()");
 302           ainfo->ga2 = atof(sptr);
 303
 304           ainfo->flags |= AINFO_GA;
 305         }
 306       else if (strcmp(nptr, "#=TC") == 0)
 307         {
 308           if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
 309             Die("Parse error in #=TC line in ReadSELEX()");
 310           ainfo->tc1 = atof(sptr);
 311
 312           if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
 313             Die("Parse error in #=TC line in ReadSELEX()");
 314           ainfo->tc2 = atof(sptr);
 315
 316           ainfo->flags |= AINFO_TC;
 317         }
 318       else if (strcmp(nptr, "#=NC") == 0)
 319         {
 320           if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
 321             Die("Parse error in #=NC line in ReadSELEX()");
 322           ainfo->nc1 = atof(sptr);
 323
 324           if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
 325             Die("Parse error in #=NC line in ReadSELEX()");
 326           ainfo->nc2 = atof(sptr);
 327
 328           ainfo->flags |= AINFO_NC;
 329         }
 330       else if (strcmp(nptr, "#=SQ") == 0)      /* per-sequence header info */
 331         {
 332                                 /* first field is the name */
 333           if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
 334             Die("Parse error in #=SQ line in ReadSELEX()");
 335           if (strcmp(sptr, ainfo->sqinfo[headnum].name) != 0) warn_names = TRUE;
 336
 337                                 /* second field is the weight */
 338           if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
 339             Die("Parse error in #=SQ line in ReadSELEX()");
 340           if (!IsReal(sptr))
 341             Die("Parse error in #=SQ line in ReadSELEX(): weight is not a number");
 342           ainfo->wgt[headnum] = atof(sptr);
 343
 344                                 /* third field is database source id */
 345           if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
 346             Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
 347           SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ID);
 348
 349                                 /* fourth field is database accession number */
 350           if ((sptr = strtok(NULL, WHITESPACE)) == NULL)
 351             Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
 352           SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ACC);
 353
 354                                 /* fifth field is start..stop::olen */
 355           if ((sptr = strtok(NULL, ".:")) == NULL)
 356             Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
 357           SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_START);
 358
 359           if ((sptr = strtok(NULL, ".:")) == NULL)
 360             Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
 361           SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_STOP);
 362
 363           if ((sptr = strtok(NULL, ":\t ")) == NULL)
 364             Die("Parse error in #=SQ line in ReadSELEX(): incomplete line");
 365           SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_OLEN);
 366
 367                                 /* rest of line is optional description */
 368           if ((sptr = strtok(NULL, "\n")) != NULL)
 369             SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_DESC);
 370
 371           headnum++;
 372         }
 373       else if (strcmp(nptr, "#=CS") == 0) break;
 374       else if (strcmp(nptr, "#=RF") == 0) break;
 375       else if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment, non-header */
 376     }
 377
 378
 379   currlen = 0;
 380   for (currblock = 0 ; currblock < blocknum; currblock++)
 381     {
 382                                 /* parse the block */
 383       seqidx = 0;
 384       while (nptr != NULL)
 385         {
 386                                 /* Consensus structure */
 387           if (strcmp(nptr, "#=CS") == 0)
 388             {
 389               if (! copy_alignment_line(ainfo->cs, currlen, strlen(nptr)-1,
 390                                         buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
 391                 Die("Parse error in #=CS line in ReadSELEX()");
 392             }
 393
 394                                 /* Reference coordinates */
 395           else if (strcmp(nptr, "#=RF") == 0)
 396             {
 397               if (! copy_alignment_line(ainfo->rf, currlen, strlen(nptr)-1,
 398                                         buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
 399                 Die("Parse error in #=RF line in ReadSELEX()");
 400             }
 401                                 /* Individual secondary structure */
 402           else if (strcmp(nptr, "#=SS") == 0)
 403             {
 404               if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].ss, currlen, strlen(nptr)-1,
 405                                         buffer, blocks[currblock].lcol,
 406                                         blocks[currblock].rcol, (char) '.'))
 407                 Die("Parse error in #=SS line in ReadSELEX()");
 408             }
 409
 410                                 /* Side chain % surface accessibility code */
 411           else if (strcmp(nptr, "#=SA") == 0)
 412             {
 413               if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].sa, currlen, strlen(nptr)-1,
 414                                         buffer, blocks[currblock].lcol,
 415                                         blocks[currblock].rcol, (char) '.'))
 416                 Die("Parse error in #=SA line in ReadSELEX()");
 417             }
 418                                 /* Aligned sequence; avoid unparsed machine comments */
 419           else if (strncmp(nptr, "#=", 2) != 0)
 420             {
 421               if (! copy_alignment_line(aseqs[seqidx], currlen, strlen(nptr)-1,
 422                                         buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.'))
 423                 Die("Parse error in alignment line in ReadSELEX()");
 424               seqidx++;
 425             }
 426
 427                                 /* get next line */
 428           for (;;)
 429             {
 430               nptr = NULL;
 431               if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* EOF */
 432               strcpy(bufcpy, buffer);
 433               if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) break; /* blank */
 434               if (strncmp(buffer, "#=", 2) == 0) break;      /* machine comment */
 435               if (strchr(commentsyms, *nptr) == NULL) break; /* data */
 436             }
 437         } /* end of a block */
 438
 439       currlen += blocks[currblock].rcol - blocks[currblock].lcol + 1;
 440
 441                                 /* get line 1 of next block */
 442       for (;;)
 443         {
 444           if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* no data */
 445           strcpy(bufcpy, buffer);
 446           if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* blank */
 447           if (strncmp(buffer, "#=", 2) == 0)       break; /* machine comment */
 448           if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment */
 449         }
 450     } /* end of the file */
 451
 452   /* Lengths in sqinfo are for raw sequence (ungapped),
 453    * and SS, SA are 0..rlen-1 not 0..alen-1.
 454    * Only the seqs with structures come out of here with lengths set.
 455    */
 456   for (seqidx = 0; seqidx < num; seqidx++)
 457     {
 458       int apos, rpos;
 459                                 /* secondary structures */
 460       if (ainfo->sqinfo[seqidx].flags & SQINFO_SS)
 461         {
 462           for (apos = rpos = 0; apos < alen; apos++)
 463             if (! isgap(aseqs[seqidx][apos]))
 464               {
 465                 ainfo->sqinfo[seqidx].ss[rpos] = ainfo->sqinfo[seqidx].ss[apos];
 466                 rpos++;
 467               }
 468           ainfo->sqinfo[seqidx].ss[rpos] = '\0';
 469         }
 470                                 /* Surface accessibility */
 471       if (ainfo->sqinfo[seqidx].flags & SQINFO_SA)
 472         {
 473           for (apos = rpos = 0; apos < alen; apos++)
 474             if (! isgap(aseqs[seqidx][apos]))
 475               {
 476                 ainfo->sqinfo[seqidx].sa[rpos] = ainfo->sqinfo[seqidx].sa[apos];
 477                 rpos++;
 478               }
 479           ainfo->sqinfo[seqidx].sa[rpos] = '\0';
 480         }
 481     }
 482
 483                                 /* NULL-terminate all the strings */
 484   if (ainfo->rf != NULL) ainfo->rf[alen] = '\0';
 485   if (ainfo->cs != NULL) ainfo->cs[alen] = '\0';
 486   for (seqidx = 0; seqidx < num; seqidx++)
 487     aseqs[seqidx][alen]            = '\0';
 488
 489                                 /* find raw sequence lengths for sqinfo */
 490   for (seqidx = 0; seqidx < num; seqidx++)
 491     {
 492       count = 0;
 493       for (sptr = aseqs[seqidx]; *sptr != '\0'; sptr++)
 494         if (!isgap(*sptr)) count++;
 495       ainfo->sqinfo[seqidx].len    = count;
 496       ainfo->sqinfo[seqidx].flags |= SQINFO_LEN;
 497     }
 498
 499
 500   /***************************************************
 501    * Garbage collection and return
 502    ***************************************************/
 503   free(blocks);
 504   if (warn_names)
 505 #ifdef CLUSTALO
 506     Warning("sequences may be in different orders in blocks of %s?", afp->fname);
 507 #else
 508     Warn("sequences may be in different orders in blocks of %s?", afp->fname);
 509 #endif
 510
 511   /* Convert back to MSA structure. (Wasteful kludge.)
 512    */
 513   msa = MSAFromAINFO(aseqs, ainfo);
 514   MSAVerifyParse(msa);
 515   FreeAlignment(aseqs, ainfo);
 516   return msa;
 517 }
 518
 519
 520 /* Function: WriteSELEX()
 521  * Date:     SRE, Mon Jun 14 13:13:14 1999 [St. Louis]
 522  *
 523  * Purpose:  Write a SELEX file in multiblock format.
 524  *
 525  * Args:     fp  - file that's open for writing
 526  *           msa - multiple sequence alignment object
 527  *
 528  * Returns:  (void)
 529  */
 530 void
 531 WriteSELEX(FILE *fp, MSA *msa)
 532 {
 533   actually_write_selex(fp, msa, 50); /* 50 char per block */
 534 }
 535
 536 /* Function: WriteSELEXOneBlock()
 537  * Date:     SRE, Mon Jun 14 13:14:56 1999 [St. Louis]
 538  *
 539  * Purpose:  Write a SELEX alignment file in Pfam's single-block
 540  *           format style. A wrapper for actually_write_selex().
 541  *
 542  * Args:     fp - file that's open for writing
 543  *           msa- alignment to write
 544  *
 545  * Returns:  (void)
 546  */
 547 void
 548 WriteSELEXOneBlock(FILE *fp, MSA *msa)
 549 {
 550   actually_write_selex(fp, msa, msa->alen); /* one big block */
 551 }
 552
 553
 554 /* Function: actually_write_selex()
 555  * Date:     SRE, Mon Jun 14 12:54:46 1999 [St. Louis]
 556  *
 557  * Purpose:  Write an alignment in SELEX format to an open
 558  *           file. This is the function that actually does
 559  *           the work. The API's WriteSELEX() and
 560  *           WriteSELEXOneBlock() are wrappers.
 561  *
 562  * Args:     fp  - file that's open for writing
 563  *           msa - alignment to write
 564  *           cpl - characters to write per line in alignment block
 565  *
 566  * Returns:  (void)
 567  */
 568 static void
 569 actually_write_selex(FILE *fp, MSA *msa, int cpl)
 570 {
 571   int  i;
 572   int  len = 0;
 573   int  namewidth;
 574   char *buf;
 575   int  currpos;
 576
 577   buf = malloc(sizeof(char) * (cpl+101)); /* 100 chars allowed for name, etc. */
 578
 579   /* Figure out how much space we need for name + markup
 580    * to keep the alignment in register, for easier human viewing --
 581    * even though Stockholm format doesn't care about visual
 582    * alignment.
 583    */
 584   namewidth = 0;
 585   for (i = 0; i < msa->nseq; i++)
 586     if ((len = strlen(msa->sqname[i])) > namewidth)
 587       namewidth = len;
 588   if (namewidth < 6) namewidth = 6; /* minimum space for markup tags */
 589
 590   /* Free text comments
 591    */
 592   for (i = 0;  i < msa->ncomment; i++)
 593     fprintf(fp, "# %s\n", msa->comment[i]);
 594   if (msa->ncomment > 0) fprintf(fp, "\n");
 595
 596   /* Per-file annotation
 597    */
 598   if (msa->name  != NULL)       fprintf(fp, "#=ID %s\n", msa->name);
 599   if (msa->acc   != NULL)       fprintf(fp, "#=AC %s\n", msa->acc);
 600   if (msa->desc  != NULL)       fprintf(fp, "#=DE %s\n", msa->desc);
 601   if (msa->au    != NULL)       fprintf(fp, "#=AU %s\n", msa->au);
 602
 603   /* Thresholds are hacky. Pfam has two. Rfam has one.
 604    */
 605   if      (msa->cutoff_is_set[MSA_CUTOFF_GA1] && msa->cutoff_is_set[MSA_CUTOFF_GA2])
 606     fprintf(fp, "#=GA %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_GA1], msa->cutoff[MSA_CUTOFF_GA2]);
 607   else if (msa->cutoff_is_set[MSA_CUTOFF_GA1])
 608     fprintf(fp, "#=GA %.1f\n", msa->cutoff[MSA_CUTOFF_GA1]);
 609   if      (msa->cutoff_is_set[MSA_CUTOFF_NC1] && msa->cutoff_is_set[MSA_CUTOFF_NC2])
 610     fprintf(fp, "#=NC %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_NC1], msa->cutoff[MSA_CUTOFF_NC2]);
 611   else if (msa->cutoff_is_set[MSA_CUTOFF_NC1])
 612     fprintf(fp, "#=NC %.1f\n", msa->cutoff[MSA_CUTOFF_NC1]);
 613   if      (msa->cutoff_is_set[MSA_CUTOFF_TC1] && msa->cutoff_is_set[MSA_CUTOFF_TC2])
 614     fprintf(fp, "#=TC %.1f %.1f\n", msa->cutoff[MSA_CUTOFF_TC1], msa->cutoff[MSA_CUTOFF_TC2]);
 615   else if (msa->cutoff_is_set[MSA_CUTOFF_TC1])
 616     fprintf(fp, "#=TC %.1f\n", msa->cutoff[MSA_CUTOFF_TC1]);
 617
 618   /* Per-sequence annotation
 619    */
 620   for (i = 0; i < msa->nseq; i++)
 621     fprintf(fp, "#=SQ %-*.*s %6.4f %s %s %d..%d::%d %s\n",
 622             namewidth, namewidth, msa->sqname[i],
 623             msa->wgt[i],
 624             "-",                /* MSA has no ID field */
 625             (msa->sqacc != NULL && msa->sqacc[i] != NULL) ? msa->sqacc[i] : "-",
 626             0, 0, 0,            /* MSA has no start, stop, olen field */
 627             (msa->sqdesc != NULL && msa->sqdesc[i] != NULL) ? msa->sqdesc[i] : "-");
 628   fprintf(fp, "\n");
 629
 630   /* Alignment section:
 631    */
 632   for (currpos = 0; currpos < msa->alen; currpos += cpl)
 633     {
 634       if (currpos > 0) fprintf(fp, "\n");
 635
 636       if (msa->ss_cons != NULL) {
 637         strncpy(buf, msa->ss_cons + currpos, cpl);
 638         buf[cpl] = '\0';
 639         fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, "#=CS", buf);
 640       }
 641       if (msa->rf != NULL) {
 642         strncpy(buf, msa->rf + currpos, cpl);
 643         buf[cpl] = '\0';
 644         fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, "#=RF", buf);
 645       }
 646       for (i = 0; i < msa->nseq; i++)
 647         {
 648           strncpy(buf, msa->aseq[i] + currpos, cpl);
 649           buf[cpl] = '\0';
 650           fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, msa->sqname[i], buf);
 651
 652           if (msa->ss != NULL && msa->ss[i] != NULL) {
 653             strncpy(buf, msa->ss[i] + currpos, cpl);
 654             buf[cpl] = '\0';
 655             fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, "#=SS", buf);
 656           }
 657           if (msa->sa != NULL && msa->sa[i] != NULL) {
 658             strncpy(buf, msa->sa[i] + currpos, cpl);
 659             buf[cpl] = '\0';
 660             fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, "#=SA", buf);
 661           }
 662         }
 663     }
 664   free(buf);
 665 }
 666
 667
 668 /* Function: copy_alignment_line()
 669  *
 670  * Purpose:  Given a line from an alignment file, and bounds lcol,rcol
 671  *           on what part of it may be sequence, save the alignment into
 672  *           aseq starting at position apos.
 673  *
 674  *           name_rcol is set to the rightmost column this aseqs's name
 675  *           occupies; if name_rcol >= lcol, we have a special case in
 676  *           which the name intrudes into the sequence zone.
 677  */
 678 static int
 679 copy_alignment_line(char *aseq, int apos, int name_rcol,
 680                     char *buffer, int lcol, int rcol, char gapsym)
 681 {
 682   char *s1, *s2;
 683   int   i;
 684
 685   s1 = aseq + apos;
 686   s2 = buffer;                  /* be careful that buffer doesn't end before lcol! */
 687   for (i = 0; i < lcol; i++)
 688     if (*s2) s2++;
 689
 690   for (i = lcol; i <= rcol; i++)
 691     {
 692       if (*s2 == '\t') {
 693 #ifdef CLUSTALO
 694         Warning("TAB characters will corrupt a SELEX alignment! Please remove them first.");
 695 #else
 696         Warn("TAB characters will corrupt a SELEX alignment! Please remove them first.");
 697 #endif
 698         return 0;
 699       }
 700       if (name_rcol >= i)       /* name intrusion special case: pad left w/ gaps */
 701         *s1 = gapsym;
 702                                 /* short buffer special case: pad right w/ gaps  */
 703       else if (*s2 == '\0' || *s2 == '\n')
 704         *s1 = gapsym;
 705
 706       else if (*s2 == ' ')      /* new: disallow spaces as gap symbols */
 707         *s1 = gapsym;
 708
 709       else                      /* normal case: copy buffer into aseq */
 710         *s1 = *s2;
 711
 712       s1++;
 713       if (*s2) s2++;
 714     }
 715   return 1;
 716 }
 717
 718
 719
 720
 721
 722 /* Function: DealignAseqs()
 723  *
 724  * Given an array of (num) aligned sequences aseqs,
 725  * strip the gaps. Store the raw sequences in a new allocated array.
 726  *
 727  * Caller is responsible for free'ing the memory allocated to
 728  * rseqs.
 729  *
 730  * Returns 1 on success. Returns 0 and sets squid_errno on
 731  * failure.
 732  */
 733 int
 734 DealignAseqs(char **aseqs, int num, char ***ret_rseqs)
 735 {
 736   char **rseqs;                 /* de-aligned sequence array   */
 737   int    idx;                   /* counter for sequences       */
 738   int    depos;                 /* position counter for dealigned seq*/
 739   int    apos;                  /* position counter for aligned seq */
 740   int    seqlen;                /* length of aligned seq */
 741
 742                                 /* alloc space */
 743   rseqs = (char **) MallocOrDie (num * sizeof(char *));
 744                                 /* main loop */
 745   for (idx = 0; idx < num; idx++)
 746     {
 747       seqlen = strlen(aseqs[idx]);
 748                                 /* alloc space */
 749       rseqs[idx] = (char *) MallocOrDie ((seqlen + 1) * sizeof(char));
 750
 751                                 /* strip gaps */
 752       depos = 0;
 753       for (apos = 0; aseqs[idx][apos] != '\0'; apos++)
 754         if (!isgap(aseqs[idx][apos]))
 755           {
 756             rseqs[idx][depos] = aseqs[idx][apos];
 757             depos++;
 758           }
 759       rseqs[idx][depos] = '\0';
 760     }
 761   *ret_rseqs = rseqs;
 762   return 1;
 763 }
 764
 765
 766 /* Function: IsSELEXFormat()
 767  *
 768  * Return TRUE if filename may be in SELEX format.
 769  *
 770  * Accuracy is sacrificed for speed; a TRUE return does
 771  * *not* guarantee that the file will pass the stricter
 772  * error-checking of ReadSELEX(). All it checks is that
 773  * the first 500 non-comment lines of a file are
 774  * blank, or if there's a second "word" on the line
 775  * it looks like sequence (i.e., it's not kOtherSeq).
 776  *
 777  * Returns TRUE or FALSE.
 778  */
 779 int
 780 IsSELEXFormat(char *filename)
 781 {
 782   FILE *fp;                     /* ptr to open sequence file */
 783   char  buffer[LINEBUFLEN];
 784   char *sptr;                   /* ptr to first word          */
 785   int   linenum;
 786
 787
 788   if ((fp = fopen(filename, "r")) == NULL)
 789     { squid_errno = SQERR_NOFILE; return 0; }
 790
 791   linenum = 0;
 792   while (linenum < 500 &&
 793          fgets(buffer, LINEBUFLEN, fp) != NULL)
 794     {
 795       linenum++;
 796                                 /* dead giveaways for extended SELEX */
 797       if      (strncmp(buffer, "#=AU", 4) == 0) goto DONE;
 798       else if (strncmp(buffer, "#=ID", 4) == 0) goto DONE;
 799       else if (strncmp(buffer, "#=AC", 4) == 0) goto DONE;
 800       else if (strncmp(buffer, "#=DE", 4) == 0) goto DONE;
 801       else if (strncmp(buffer, "#=GA", 4) == 0) goto DONE;
 802       else if (strncmp(buffer, "#=TC", 4) == 0) goto DONE;
 803       else if (strncmp(buffer, "#=NC", 4) == 0) goto DONE;
 804       else if (strncmp(buffer, "#=SQ", 4) == 0) goto DONE;
 805       else if (strncmp(buffer, "#=SS", 4) == 0) goto DONE;
 806       else if (strncmp(buffer, "#=CS", 4) == 0) goto DONE;
 807       else if (strncmp(buffer, "#=RF", 4) == 0) goto DONE;
 808
 809                                 /* a comment? */
 810       if (strchr(commentsyms, *buffer) != NULL) continue;
 811
 812                                 /* a blank line? */
 813       if ((sptr = strtok(buffer, WHITESPACE)) == NULL) continue;
 814
 815                                 /* a one-word line (name only)
 816                                    is possible, though rare */
 817       if ((sptr = strtok(NULL, "\n")) == NULL) continue;
 818
 819       if (Seqtype(sptr) == kOtherSeq) {fclose(fp); return 0;}
 820     }
 821
 822  DONE:
 823   fclose(fp);
 824   return 1;
 825 }
 826
 827
 828
 829
 830
 831
 832
 833