+++ /dev/null
-/*****************************************************************
- * SQUID - a library of functions for biological sequence analysis
- * Copyright (C) 1992-2002 Washington University School of Medicine
- *
- * This source code is freely distributed under the terms of the
- * GNU General Public License. See the files COPYRIGHT and LICENSE
- * for details.
- *****************************************************************/
-
-/* clustal.c
- * SRE, Sun Jun 6 17:50:45 1999 [bus from Madison, 1999 worm mtg]
- *
- * Import/export of ClustalV/W multiple sequence alignment
- * formatted files. Derivative of msf.c; MSF is a pretty
- * generic interleaved format.
- *
- * RCS $Id: clustal.c 228 2011-03-29 14:05:27Z dave $ (Original squid RCS Id: clustal.c,v 1.1 1999/07/15 22:26:53 eddy Exp)
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#include "squid.h"
-#include "msa.h"
-
-#ifdef CLUSTALO
-/* needed for PACKAGE_VERSION */
-#include "../config.h"
-
-#ifndef min
- #define min( a, b ) ( ((a) < (b)) ? (a) : (b) )
-#endif
-
-/*These are all the positively scoring groups that occur in the Gonnet Pam250
-matrix. There are strong and weak groups, defined as strong score >0.5 and
-weak score =<0.5. Strong matching columns to be assigned ':' and weak matches
-assigned '.' in the clustal output format.
-amino_strong = res_cat1
-amino_weak = res_cat2
-*/
-
-char *amino_strong[] = {"STA", "NEQK", "NHQK", "NDEQ", "QHRK", "MILV",
- "MILF", "HY", "FYW", NULL};
-
-char *amino_weak[] = {"CSA", "ATV", "SAG", "STNK", "STPA", "SGND",
- "SNDEQK", "NDEQHK", "NEQHRK", "FVLIM", "HFY", NULL};
-
-#endif
-
-#ifdef TESTDRIVE_CLUSTAL
-/*****************************************************************
- * msf.c test driver:
- * cc -DTESTDRIVE_CLUSTAL -g -O2 -Wall -o test clustal.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c -lm
- *
- */
-int
-main(int argc, char **argv)
-{
- MSAFILE *afp;
- MSA *msa;
- char *file;
-
- file = argv[1];
-
- if ((afp = MSAFileOpen(file, MSAFILE_CLUSTAL, NULL)) == NULL)
- Die("Couldn't open %s\n", file);
-
- while ((msa = ReadClustal(afp)) != NULL)
- {
- WriteClustal(stdout, msa);
- MSAFree(msa);
- }
-
- MSAFileClose(afp);
- exit(0);
-}
-/******************************************************************/
-#endif /* testdrive_clustal */
-
-
-/* Function: ReadClustal()
- * Date: SRE, Sun Jun 6 17:53:49 1999 [bus from Madison, 1999 worm mtg]
- *
- * Purpose: Parse an alignment read from an open Clustal format
- * alignment file. Clustal is a single-alignment format.
- * Return the alignment, or NULL if we have no data.
- *
- * Args: afp - open alignment file
- *
- * Returns: MSA * - an alignment object
- * caller responsible for an MSAFree()
- * NULL if no more alignments
- *
- * Diagnostics:
- * Will Die() here with a (potentially) useful message
- * if a parsing error occurs.
- */
-MSA *
-ReadClustal(MSAFILE *afp)
-{
- MSA *msa;
- char *s;
- int slen;
- int sqidx;
- char *name;
- char *seq;
- char *s2;
-
- if (feof(afp->f)) return NULL;
-
- /* Skip until we see the CLUSTAL header
- */
- while ((s = MSAFileGetLine(afp)) != NULL)
- {
- if (strncmp(s, "CLUSTAL", 7) == 0 &&
- strstr(s, "multiple sequence alignment") != NULL)
- break;
- }
- if (s == NULL) return NULL;
-
- msa = MSAAlloc(10, 0);
-
- /* Now we're in the sequence section.
- * As discussed above, if we haven't seen a sequence name, then we
- * don't include the sequence in the alignment.
- * Watch out for conservation markup lines that contain *.: chars
- */
- while ((s = MSAFileGetLine(afp)) != NULL)
- {
- if ((name = sre_strtok(&s, WHITESPACE, NULL)) == NULL) continue;
- if ((seq = sre_strtok(&s, WHITESPACE, &slen)) == NULL) continue;
- s2 = sre_strtok(&s, "\n", NULL);
-
- /* The test for a conservation markup line
- */
- if (strpbrk(name, ".*:") != NULL && strpbrk(seq, ".*:") != NULL)
- continue;
- if (s2 != NULL)
- Die("Parse failed at line %d, file %s: possibly using spaces as gaps",
- afp->linenumber, afp->fname);
-
- /* It's not blank, and it's not a coord line: must be sequence
- */
- sqidx = MSAGetSeqidx(msa, name, msa->lastidx+1);
- msa->lastidx = sqidx;
- msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, slen);
- }
-
- MSAVerifyParse(msa); /* verifies, and also sets alen and wgt. */
- return msa;
-}
-
-
-/* Function: WriteClustal()
- * Date: SRE, Sun Jun 6 18:12:47 1999 [bus from Madison, worm mtg 1999]
- *
- * Purpose: Write an alignment in Clustal format to an open file.
- *
- * Args: fp - file that's open for writing.
- * msa - alignment to write.
- *
- * Returns: (void)
- */
-void
-WriteClustal(FILE *fp, MSA *msa)
-{
- int idx; /* counter for sequences */
- int len; /* tmp variable for name lengths */
- int namelen; /* maximum name length used */
- int pos; /* position counter */
- char buf[80]; /* buffer for writing seq */
- int cpl = 60; /* char per line (< 64) */
-
- /* consensus line stuff */
- int subpos;
- char first;
- int bail;
- int strong_bins[9];
- int weak_bins[11];
- int cons;
- int bin;
-
- /* calculate max namelen used */
- namelen = 0;
- for (idx = 0; idx < msa->nseq; idx++)
- if ((len = strlen(msa->sqname[idx])) > namelen)
- namelen = len;
-
-#ifdef CLUSTALO
- fprintf(fp, "CLUSTAL O(%s) multiple sequence alignment\n", PACKAGE_VERSION);
-#else
- fprintf(fp, "CLUSTAL W(1.5) multiple sequence alignment\n");
-#endif
-
- /*****************************************************
- * Write the sequences
- *****************************************************/
-
-#ifdef CLUSTALO
- fprintf(fp, "\n"); /* original had two blank lines */
-#endif
-
- for (pos = 0; pos < msa->alen; pos += cpl)
- {
- fprintf(fp, "\n"); /* Blank line between sequence blocks */
- for (idx = 0; idx < msa->nseq; idx++)
- {
- strncpy(buf, msa->aseq[idx] + pos, cpl);
- buf[cpl] = '\0';
-#ifdef CLUSTALO
- fprintf(fp, "%-*s %s\n", namelen+5, msa->sqname[idx], buf);
-#else
- fprintf(fp, "%*s %s\n", namelen, msa->sqname[idx], buf);
-#endif
- }
-#ifdef CLUSTALO
- /* do consensus dots */
-
- /* print namelen+5 spaces */
- for(subpos = 0; subpos <= namelen+5; subpos++)
- fprintf(fp, " ");
-
- for(subpos = pos; subpos < min(pos + cpl, msa->alen); subpos++)
- {
- /* see if 100% conservation */
- first = msa->aseq[0][subpos];
- bail = 0;
- for (idx = 1; idx < msa->nseq; idx++)
- {
- if(msa->aseq[idx][subpos] != first)
- {
- bail = 1;
- break;
- }
- }
- if(!bail)
- fprintf(fp, "*");
- else
- {
- /* if not then check strong */
- for(bin = 0; bin < 9; bin++)
- strong_bins[bin] = 0; /* clear the bins */
-
- for(idx = 0; idx < msa->nseq; idx++)
- {
- switch(msa->aseq[idx][subpos])
- {
- case 'S': strong_bins[0]++; break;
- case 'T': strong_bins[0]++; break;
- case 'A': strong_bins[0]++; break;
- case 'N': strong_bins[1]++; strong_bins[2]++; strong_bins[3]++; break;
- case 'E': strong_bins[1]++; strong_bins[3]++; break;
- case 'Q': strong_bins[1]++; strong_bins[2]++; strong_bins[3]++; strong_bins[4]++; break;
- case 'K': strong_bins[1]++; strong_bins[2]++; strong_bins[4]++; break;
- case 'D': strong_bins[3]++; break;
- case 'R': strong_bins[4]++; break;
- case 'H': strong_bins[4]++; strong_bins[7]++; break;
- case 'M': strong_bins[5]++; strong_bins[6]++; break;
- case 'I': strong_bins[5]++; strong_bins[6]++; break;
- case 'L': strong_bins[5]++; strong_bins[6]++; break;
- case 'V': strong_bins[5]++; break;
- case 'F': strong_bins[6]++; strong_bins[8]++; break;
- case 'Y': strong_bins[7]++; strong_bins[8]++; break;
- case 'W': strong_bins[8]++; break;
- }
- }
- bail = 0;
- for(bin = 0; bin < 9; bin++)
- if(strong_bins[bin] == msa->nseq)
- {
- bail = 1;
- break;
- }
- if(bail)
- fprintf(fp, ":");
- else
- {
- /* check weak */
- for(bin = 0; bin < 11; bin++)
- weak_bins[bin] = 0; /* clear the bins */
-
- for(idx = 0; idx < msa->nseq; idx++)
- {
- switch(msa->aseq[idx][subpos])
- {
- case 'C': weak_bins[0]++; break;
- case 'S': weak_bins[0]++; weak_bins[2]++; weak_bins[3]++; weak_bins[4]++; weak_bins[5]++; weak_bins[6]++; break;
- case 'A': weak_bins[0]++; weak_bins[1]++; weak_bins[2]++; weak_bins[4]++; break;
- case 'T': weak_bins[1]++; weak_bins[3]++; weak_bins[4]++; break;
- case 'V': weak_bins[1]++; weak_bins[9]++; break;
- case 'G': weak_bins[2]++; break;
- case 'N': weak_bins[3]++; weak_bins[5]++; weak_bins[6]++; weak_bins[7]++; weak_bins[8]++; break;
- case 'K': weak_bins[3]++; weak_bins[6]++; weak_bins[7]++; weak_bins[8]++; break;
- case 'D': weak_bins[5]++; weak_bins[6]++; weak_bins[7]++; break;
- case 'E': weak_bins[6]++; weak_bins[7]++; weak_bins[8]++; break;
- case 'Q': weak_bins[6]++; weak_bins[7]++; weak_bins[8]++; break;
- case 'H': weak_bins[7]++; weak_bins[8]++; weak_bins[10]++; break;
- case 'R': weak_bins[8]++; break;
- case 'F': weak_bins[9]++; weak_bins[10]++; break;
- case 'L': weak_bins[9]++; break;
- case 'I': weak_bins[9]++; break;
- case 'M': weak_bins[9]++; break;
- case 'Y': weak_bins[10]++; break;
- }
- }
- bail = 0;
- for(bin = 0; bin < 11; bin++)
- if(weak_bins[bin] == msa->nseq)
- {
- bail = 1;
- break;
- }
- if(bail)
- fprintf(fp, ".");
- else
- fprintf(fp, " ");
- }
- }
- }
- fprintf(fp,"\n");
-#endif
- }
-
- return;
-}
-
-
-