--- /dev/null
+/*****************************************************************
+ * SQUID - a library of functions for biological sequence analysis
+ * Copyright (C) 1992-2002 Washington University School of Medicine
+ *
+ * This source code is freely distributed under the terms of the
+ * GNU General Public License. See the files COPYRIGHT and LICENSE
+ * for details.
+ *****************************************************************/
+
+/* phylip.c
+ * SRE, Mon Jun 14 14:08:33 1999 [St. Louis]
+ *
+ * Import/export of PHYLIP interleaved multiple sequence alignment
+ * format files.
+ *
+ * RCS $Id: phylip.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: phylip.c,v 1.1 1999/07/15 22:29:20 eddy Exp)
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include "squid.h"
+#include "msa.h"
+
+#ifdef TESTDRIVE_PHYLIP
+/*****************************************************************
+ * phylip.c test driver:
+ *
+ */
+int
+main(int argc, char **argv)
+{
+ MSAFILE *afp;
+ MSA *msa;
+ char *file;
+
+ file = argv[1];
+
+ if ((afp = MSAFileOpen(file, MSAFILE_UNKNOWN, NULL)) == NULL)
+ Die("Couldn't open %s\n", file);
+
+ printf("format %d\n", afp->format);
+
+ while ((msa = ReadPhylip(afp)) != NULL)
+ {
+ WritePhylip(stdout, msa);
+ MSAFree(msa);
+ }
+
+ MSAFileClose(afp);
+ exit(0);
+}
+/******************************************************************/
+#endif /* testdrive_phylip */
+
+
+
+/* Function: ReadPhylip()
+ * Date: SRE, Fri Jun 18 12:59:37 1999 [Sanger Centre]
+ *
+ * Purpose: Parse an alignment from an open Phylip format
+ * alignment file. Phylip is a single-alignment format.
+ * Return the alignment, or NULL if we have no data.
+ *
+ * Args: afp - open alignment file
+ *
+ * Returns: MSA * - an alignment object
+ * Caller responsible for an MSAFree()
+ * NULL if no more alignments
+ */
+MSA *
+ReadPhylip(MSAFILE *afp)
+{
+ MSA *msa;
+ char *s, *s1, *s2;
+ char name[11]; /* seq name max len = 10 char */
+ int nseq, alen;
+ int idx; /* index of current sequence */
+ int slen;
+ int nblock;
+
+ if (feof(afp->f)) return NULL;
+
+ /* Skip until we see a nonblank line; it's the header,
+ * containing nseq/alen
+ */
+ nseq = 0; alen = 0;
+ while ((s = MSAFileGetLine(afp)) != NULL)
+ {
+ if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) continue;
+ if ((s2 = sre_strtok(&s, WHITESPACE, NULL)) == NULL)
+ Die("Failed to parse nseq/alen from first line of PHYLIP file %s\n", afp->fname);
+ if (! IsInt(s1) || ! IsInt(s2))
+ Die("nseq and/or alen not an integer in first line of PHYLIP file %s\n", afp->fname);
+ nseq = atoi(s1);
+ alen = atoi(s2);
+ break;
+ }
+
+ msa = MSAAlloc(nseq, 0);
+ idx = 0;
+ nblock = 0;
+ while ((s = MSAFileGetLine(afp)) != NULL)
+ {
+ /* ignore blank lines. nonblank lines start w/ nonblank char */
+ if (isspace(*s)) continue;
+ /* First block has seq names */
+ if (nblock == 0) {
+ strncpy(name, s, 10);
+ name[10] = '\0';
+ GKIStoreKey(msa->index, name);
+ msa->sqname[idx] = sre_strdup(name, -1);
+ s += 10;
+ }
+ /* be careful of trailing whitespace on lines */
+ if ((s1 = sre_strtok(&s, WHITESPACE, &slen)) == NULL)
+ Die("Failed to parse sequence at line %d of PHYLIP file %s\n",
+ afp->linenumber, afp->fname);
+ msa->sqlen[idx] = sre_strcat(&(msa->aseq[idx]), msa->sqlen[idx], s1, slen);
+
+ idx++;
+ if (idx == nseq) { idx = 0; nblock++; }
+ }
+ msa->nseq = nseq;
+ MSAVerifyParse(msa); /* verifies; sets alen, wgt; frees sqlen[] */
+ return msa;
+}
+
+
+
+/* Function: WritePhylip()
+ * Date: SRE, Fri Jun 18 12:07:41 1999 [Sanger Centre]
+ *
+ * Purpose: Write an alignment in Phylip format to an open file.
+ *
+ * Args: fp - file that's open for writing.
+ * msa - alignment to write.
+ *
+ * Returns: (void)
+ */
+void
+WritePhylip(FILE *fp, MSA *msa)
+{
+ int idx; /* counter for sequences */
+ int cpl = 50; /* 50 seq char per line */
+ char buf[51]; /* buffer for writing seq */
+ int pos;
+
+ /* First line has nseq, alen
+ */
+ fprintf(fp, " %d %d\n", msa->nseq, msa->alen);
+
+ /* Alignment section.
+ * PHYLIP is a multiblock format, blocks (optionally) separated
+ * by blanks; names only attached to first block. Names are
+ * restricted to ten char; we achieve this by simple truncation (!).
+ * (Do we need to convert gap characters from our ./- convention?)
+ */
+ for (pos = 0; pos < msa->alen; pos += cpl)
+ {
+ if (pos > 0) fprintf(fp, "\n");
+
+ for (idx = 0; idx < msa->nseq; idx++)
+ {
+ strncpy(buf, msa->aseq[idx] + pos, cpl);
+ buf[cpl] = '\0';
+ if (pos > 0) fprintf(fp, "%s\n", buf);
+ else fprintf(fp, "%-10.10s%s\n", msa->sqname[idx], buf);
+ }
+ }
+ return;
+}