1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
13 * Routines for dealing with PAM matrices.
16 * ParsePAMFile() -- read a PAM matrix from disk.
19 * SRE - Fri Apr 2 11:23:45 1993
20 * RCS $Id: dayhoff.c,v 1.1.1.1 2005/03/22 08:34:17 cmzmasek Exp $
31 /* Function: ParsePAMFile()
33 * Purpose: Given a pointer to an open file containing a PAM matrix,
34 * parse the file and allocate and fill a 2D array of
35 * floats containing the matrix. The PAM file is
36 * assumed to be in the format that NCBI distributes
37 * with BLAST. BLOSUM matrices also work fine, as
38 * produced by Henikoff's program "MATBLAS".
40 * Parses both old format and new format BLAST matrices.
41 * Old format just had rows of integers.
42 * New format includes a leading character on each row.
44 * The PAM matrix is a 27x27 matrix, 0=A..25=Z,26=*.
45 * Note that it's not a 20x20 matrix as you might expect;
46 * this is for speed of indexing as well as the ability
47 * to deal with ambiguous characters.
49 * Args: fp - open PAM file
50 * ret_pam - RETURN: pam matrix, integers
51 * ret_scale - RETURN: scale factor for converting
52 * to real Sij. For instance, PAM120 is
53 * given in units of ln(2)/2. This may
54 * be passed as NULL if the caller
57 * Returns: 1 on success; 0 on failure and sets squid_errno to
58 * indicate the cause. ret_pam is allocated here and
59 * must be freed by the caller (use FreePAM).
62 ParsePAMFile(FILE *fp, int ***ret_pam, float *ret_scale)
65 char buffer[512]; /* input buffer from fp */
66 int order[27]; /* order of fields, obtained from header */
67 int nsymbols; /* total number of symbols in matrix */
74 if (fp == NULL) { squid_errno = SQERR_NODATA; return 0; }
76 /* Look at the first non-blank, non-comment line in the file.
77 * It gives single-letter codes in the order the PAM matrix
78 * is arrayed in the file.
81 if (fgets(buffer, 512, fp) == NULL)
82 { squid_errno = SQERR_NODATA; return 0; }
84 /* Get the scale factor from the header.
85 * For BLOSUM files, we assume the line looks like:
86 * BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
87 * and we assume that the fraction is always 1/x;
89 * For PAM files, we assume the line looks like:
90 * PAM 120 substitution matrix, scale = ln(2)/2 = 0.346574
91 * and we assume that the number following the final '=' is our scale
93 scale = 0.0; /* just to silence gcc uninit warnings */
94 if (strstr(buffer, "BLOSUM Clustered Scoring Matrix") != NULL &&
95 (sptr = strchr(buffer, '/')) != NULL)
98 if (! isdigit((int) (*sptr))) { squid_errno = SQERR_FORMAT; return 0; }
99 scale = (float) (log(2.0) / atof(sptr));
102 else if (strstr(buffer, "substitution matrix,") != NULL)
104 while ((sptr = strrchr(buffer, '=')) != NULL) {
113 } while ((sptr = strtok(buffer, " \t\n")) == NULL || *sptr == '#');
117 order[idx] = (int) *sptr - (int) 'A';
118 if (order[idx] < 0 || order[idx] > 25) order[idx] = 26;
120 } while ((sptr = strtok(NULL, " \t\n")) != NULL);
123 /* Allocate a pam matrix. For speed of indexing, we use
124 * a 27x27 matrix so we can do lookups using the ASCII codes
125 * of amino acid single-letter representations, plus one
126 * extra field to deal with the "*" (terminators).
128 if ((pam = (int **) calloc (27, sizeof(int *))) == NULL)
129 Die("calloc failed");
130 for (idx = 0; idx < 27; idx++)
131 if ((pam[idx] = (int *) calloc (27, sizeof(int))) == NULL)
132 Die("calloc failed");
134 /* Parse the rest of the file.
136 for (row = 0; row < nsymbols; row++)
138 if (fgets(buffer, 512, fp) == NULL)
139 { squid_errno = SQERR_NODATA; return 0; }
141 if ((sptr = strtok(buffer, " \t\n")) == NULL)
142 { squid_errno = SQERR_NODATA; return 0; }
143 for (col = 0; col < nsymbols; col++)
145 if (sptr == NULL) { squid_errno = SQERR_NODATA; return 0; }
147 /* Watch out for new BLAST format, with leading characters
149 if (*sptr == '*' || isalpha((int) *sptr))
150 col--; /* hack hack */
152 pam [order[row]] [order[col]] = atoi(sptr);
154 sptr = strtok(NULL, " \t\n");
160 if (ret_scale != NULL)
162 if (gotscale) *ret_scale = scale;
165 Warn("Failed to parse PAM matrix scale factor. Defaulting to ln(2)/2!");
166 *ret_scale = log(2.0) / 2.0;