--- /dev/null
+/*****************************************************************
+ * SQUID - a library of functions for biological sequence analysis
+ * Copyright (C) 1992-2002 Washington University School of Medicine
+ *
+ * This source code is freely distributed under the terms of the
+ * GNU General Public License. See the files COPYRIGHT and LICENSE
+ * for details.
+ *****************************************************************/
+
+/* dayhoff.c
+ *
+ * Routines for dealing with PAM matrices.
+ *
+ * Includes:
+ * ParsePAMFile() -- read a PAM matrix from disk.
+ *
+ *
+ * SRE - Fri Apr 2 11:23:45 1993
+ * RCS $Id: dayhoff.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: dayhoff.c,v 1.5 2002/07/03 15:03:39 eddy Exp)
+ */
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <ctype.h>
+#include "squid.h"
+
+/* Function: ParsePAMFile()
+ *
+ * Purpose: Given a pointer to an open file containing a PAM matrix,
+ * parse the file and allocate and fill a 2D array of
+ * floats containing the matrix. The PAM file is
+ * assumed to be in the format that NCBI distributes
+ * with BLAST. BLOSUM matrices also work fine, as
+ * produced by Henikoff's program "MATBLAS".
+ *
+ * Parses both old format and new format BLAST matrices.
+ * Old format just had rows of integers.
+ * New format includes a leading character on each row.
+ *
+ * The PAM matrix is a 27x27 matrix, 0=A..25=Z,26=*.
+ * Note that it's not a 20x20 matrix as you might expect;
+ * this is for speed of indexing as well as the ability
+ * to deal with ambiguous characters.
+ *
+ * Args: fp - open PAM file
+ * ret_pam - RETURN: pam matrix, integers
+ * ret_scale - RETURN: scale factor for converting
+ * to real Sij. For instance, PAM120 is
+ * given in units of ln(2)/2. This may
+ * be passed as NULL if the caller
+ * doesn't care.
+ *
+ * Returns: 1 on success; 0 on failure and sets squid_errno to
+ * indicate the cause. ret_pam is allocated here and
+ * must be freed by the caller (use FreePAM).
+ */
+int
+ParsePAMFile(FILE *fp, int ***ret_pam, float *ret_scale)
+{
+ int **pam;
+ char buffer[512]; /* input buffer from fp */
+ int order[27]; /* order of fields, obtained from header */
+ int nsymbols; /* total number of symbols in matrix */
+ char *sptr;
+ int idx;
+ int row, col;
+ float scale;
+ int gotscale = FALSE;
+
+ scale = 0.0; /* just to silence gcc uninit warnings */
+ if (fp == NULL) { squid_errno = SQERR_NODATA; return 0; }
+
+ /* Look at the first non-blank, non-comment line in the file.
+ * It gives single-letter codes in the order the PAM matrix
+ * is arrayed in the file.
+ */
+ do {
+ if (fgets(buffer, 512, fp) == NULL)
+ { squid_errno = SQERR_NODATA; return 0; }
+
+ /* Get the scale factor from the header.
+ * For BLOSUM files, we assume the line looks like:
+ * BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
+ * and we assume that the fraction is always 1/x;
+ *
+ * For PAM files, we assume the line looks like:
+ * PAM 120 substitution matrix, scale = ln(2)/2 = 0.346574
+ * and we assume that the number following the final '=' is our scale
+ */
+ if (strstr(buffer, "BLOSUM Clustered Scoring Matrix") != NULL &&
+ (sptr = strchr(buffer, '/')) != NULL)
+ {
+ sptr++;
+ if (! isdigit((int) (*sptr))) { squid_errno = SQERR_FORMAT; return 0; }
+ scale = (float) (log(2.0) / atof(sptr));
+ gotscale = TRUE;
+ }
+ else if (strstr(buffer, "substitution matrix,") != NULL)
+ {
+ while ((sptr = strrchr(buffer, '=')) != NULL) {
+ sptr += 2;
+ if (IsReal(sptr)) {
+ scale = atof(sptr);
+ gotscale = TRUE;
+ break;
+ }
+ }
+ }
+ } while ((sptr = strtok(buffer, " \t\n")) == NULL || *sptr == '#');
+
+ idx = 0;
+ do {
+ order[idx] = (int) *sptr - (int) 'A';
+ if (order[idx] < 0 || order[idx] > 25) order[idx] = 26;
+ idx++;
+ } while ((sptr = strtok(NULL, " \t\n")) != NULL);
+ nsymbols = idx;
+
+ /* Allocate a pam matrix. For speed of indexing, we use
+ * a 27x27 matrix so we can do lookups using the ASCII codes
+ * of amino acid single-letter representations, plus one
+ * extra field to deal with the "*" (terminators).
+ */
+ if ((pam = (int **) calloc (27, sizeof(int *))) == NULL)
+ Die("calloc failed");
+ for (idx = 0; idx < 27; idx++)
+ if ((pam[idx] = (int *) calloc (27, sizeof(int))) == NULL)
+ Die("calloc failed");
+
+ /* Parse the rest of the file.
+ */
+ for (row = 0; row < nsymbols; row++)
+ {
+ if (fgets(buffer, 512, fp) == NULL)
+ { squid_errno = SQERR_NODATA; return 0; }
+
+ if ((sptr = strtok(buffer, " \t\n")) == NULL)
+ { squid_errno = SQERR_NODATA; return 0; }
+ for (col = 0; col < nsymbols; col++)
+ {
+ if (sptr == NULL) { squid_errno = SQERR_NODATA; return 0; }
+
+ /* Watch out for new BLAST format, with leading characters
+ */
+ if (*sptr == '*' || isalpha((int) *sptr))
+ col--; /* hack hack */
+ else
+ pam [order[row]] [order[col]] = atoi(sptr);
+
+ sptr = strtok(NULL, " \t\n");
+ }
+ }
+
+ /* Return
+ */
+ if (ret_scale != NULL)
+ {
+ if (gotscale) *ret_scale = scale;
+ else
+ {
+#ifdef CLUSTALO
+ Warning("Failed to parse PAM matrix scale factor. Defaulting to ln(2)/2!");
+#else
+ Warn("Failed to parse PAM matrix scale factor. Defaulting to ln(2)/2!");
+#endif
+ *ret_scale = log(2.0) / 2.0;
+ }
+ }
+ *ret_pam = pam;
+ return 1;
+}