+++ /dev/null
-/*****************************************************************
- * SQUID - a library of functions for biological sequence analysis
- * Copyright (C) 1992-2002 Washington University School of Medicine
- *
- * This source code is freely distributed under the terms of the
- * GNU General Public License. See the files COPYRIGHT and LICENSE
- * for details.
- *****************************************************************/
-
-/* dayhoff.c
- *
- * Routines for dealing with PAM matrices.
- *
- * Includes:
- * ParsePAMFile() -- read a PAM matrix from disk.
- *
- *
- * SRE - Fri Apr 2 11:23:45 1993
- * RCS $Id: dayhoff.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: dayhoff.c,v 1.5 2002/07/03 15:03:39 eddy Exp)
- */
-
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-#include <ctype.h>
-#include "squid.h"
-
-/* Function: ParsePAMFile()
- *
- * Purpose: Given a pointer to an open file containing a PAM matrix,
- * parse the file and allocate and fill a 2D array of
- * floats containing the matrix. The PAM file is
- * assumed to be in the format that NCBI distributes
- * with BLAST. BLOSUM matrices also work fine, as
- * produced by Henikoff's program "MATBLAS".
- *
- * Parses both old format and new format BLAST matrices.
- * Old format just had rows of integers.
- * New format includes a leading character on each row.
- *
- * The PAM matrix is a 27x27 matrix, 0=A..25=Z,26=*.
- * Note that it's not a 20x20 matrix as you might expect;
- * this is for speed of indexing as well as the ability
- * to deal with ambiguous characters.
- *
- * Args: fp - open PAM file
- * ret_pam - RETURN: pam matrix, integers
- * ret_scale - RETURN: scale factor for converting
- * to real Sij. For instance, PAM120 is
- * given in units of ln(2)/2. This may
- * be passed as NULL if the caller
- * doesn't care.
- *
- * Returns: 1 on success; 0 on failure and sets squid_errno to
- * indicate the cause. ret_pam is allocated here and
- * must be freed by the caller (use FreePAM).
- */
-int
-ParsePAMFile(FILE *fp, int ***ret_pam, float *ret_scale)
-{
- int **pam;
- char buffer[512]; /* input buffer from fp */
- int order[27]; /* order of fields, obtained from header */
- int nsymbols; /* total number of symbols in matrix */
- char *sptr;
- int idx;
- int row, col;
- float scale;
- int gotscale = FALSE;
-
- scale = 0.0; /* just to silence gcc uninit warnings */
- if (fp == NULL) { squid_errno = SQERR_NODATA; return 0; }
-
- /* Look at the first non-blank, non-comment line in the file.
- * It gives single-letter codes in the order the PAM matrix
- * is arrayed in the file.
- */
- do {
- if (fgets(buffer, 512, fp) == NULL)
- { squid_errno = SQERR_NODATA; return 0; }
-
- /* Get the scale factor from the header.
- * For BLOSUM files, we assume the line looks like:
- * BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
- * and we assume that the fraction is always 1/x;
- *
- * For PAM files, we assume the line looks like:
- * PAM 120 substitution matrix, scale = ln(2)/2 = 0.346574
- * and we assume that the number following the final '=' is our scale
- */
- if (strstr(buffer, "BLOSUM Clustered Scoring Matrix") != NULL &&
- (sptr = strchr(buffer, '/')) != NULL)
- {
- sptr++;
- if (! isdigit((int) (*sptr))) { squid_errno = SQERR_FORMAT; return 0; }
- scale = (float) (log(2.0) / atof(sptr));
- gotscale = TRUE;
- }
- else if (strstr(buffer, "substitution matrix,") != NULL)
- {
- while ((sptr = strrchr(buffer, '=')) != NULL) {
- sptr += 2;
- if (IsReal(sptr)) {
- scale = atof(sptr);
- gotscale = TRUE;
- break;
- }
- }
- }
- } while ((sptr = strtok(buffer, " \t\n")) == NULL || *sptr == '#');
-
- idx = 0;
- do {
- order[idx] = (int) *sptr - (int) 'A';
- if (order[idx] < 0 || order[idx] > 25) order[idx] = 26;
- idx++;
- } while ((sptr = strtok(NULL, " \t\n")) != NULL);
- nsymbols = idx;
-
- /* Allocate a pam matrix. For speed of indexing, we use
- * a 27x27 matrix so we can do lookups using the ASCII codes
- * of amino acid single-letter representations, plus one
- * extra field to deal with the "*" (terminators).
- */
- if ((pam = (int **) calloc (27, sizeof(int *))) == NULL)
- Die("calloc failed");
- for (idx = 0; idx < 27; idx++)
- if ((pam[idx] = (int *) calloc (27, sizeof(int))) == NULL)
- Die("calloc failed");
-
- /* Parse the rest of the file.
- */
- for (row = 0; row < nsymbols; row++)
- {
- if (fgets(buffer, 512, fp) == NULL)
- { squid_errno = SQERR_NODATA; return 0; }
-
- if ((sptr = strtok(buffer, " \t\n")) == NULL)
- { squid_errno = SQERR_NODATA; return 0; }
- for (col = 0; col < nsymbols; col++)
- {
- if (sptr == NULL) { squid_errno = SQERR_NODATA; return 0; }
-
- /* Watch out for new BLAST format, with leading characters
- */
- if (*sptr == '*' || isalpha((int) *sptr))
- col--; /* hack hack */
- else
- pam [order[row]] [order[col]] = atoi(sptr);
-
- sptr = strtok(NULL, " \t\n");
- }
- }
-
- /* Return
- */
- if (ret_scale != NULL)
- {
- if (gotscale) *ret_scale = scale;
- else
- {
-#ifdef CLUSTALO
- Warning("Failed to parse PAM matrix scale factor. Defaulting to ln(2)/2!");
-#else
- Warn("Failed to parse PAM matrix scale factor. Defaulting to ln(2)/2!");
-#endif
- *ret_scale = log(2.0) / 2.0;
- }
- }
- *ret_pam = pam;
- return 1;
-}