1 /*****************************************************************
2 * SQUID - a library of functions for biological sequence analysis
3 * Copyright (C) 1992-2002 Washington University School of Medicine
5 * This source code is freely distributed under the terms of the
6 * GNU General Public License. See the files COPYRIGHT and LICENSE
8 *****************************************************************/
12 * Routines for dealing with PAM matrices.
15 * ParsePAMFile() -- read a PAM matrix from disk.
18 * SRE - Fri Apr 2 11:23:45 1993
19 * RCS $Id: dayhoff.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: dayhoff.c,v 1.5 2002/07/03 15:03:39 eddy Exp)
30 /* Function: ParsePAMFile()
32 * Purpose: Given a pointer to an open file containing a PAM matrix,
33 * parse the file and allocate and fill a 2D array of
34 * floats containing the matrix. The PAM file is
35 * assumed to be in the format that NCBI distributes
36 * with BLAST. BLOSUM matrices also work fine, as
37 * produced by Henikoff's program "MATBLAS".
39 * Parses both old format and new format BLAST matrices.
40 * Old format just had rows of integers.
41 * New format includes a leading character on each row.
43 * The PAM matrix is a 27x27 matrix, 0=A..25=Z,26=*.
44 * Note that it's not a 20x20 matrix as you might expect;
45 * this is for speed of indexing as well as the ability
46 * to deal with ambiguous characters.
48 * Args: fp - open PAM file
49 * ret_pam - RETURN: pam matrix, integers
50 * ret_scale - RETURN: scale factor for converting
51 * to real Sij. For instance, PAM120 is
52 * given in units of ln(2)/2. This may
53 * be passed as NULL if the caller
56 * Returns: 1 on success; 0 on failure and sets squid_errno to
57 * indicate the cause. ret_pam is allocated here and
58 * must be freed by the caller (use FreePAM).
61 ParsePAMFile(FILE *fp, int ***ret_pam, float *ret_scale)
64 char buffer[512]; /* input buffer from fp */
65 int order[27]; /* order of fields, obtained from header */
66 int nsymbols; /* total number of symbols in matrix */
73 scale = 0.0; /* just to silence gcc uninit warnings */
74 if (fp == NULL) { squid_errno = SQERR_NODATA; return 0; }
76 /* Look at the first non-blank, non-comment line in the file.
77 * It gives single-letter codes in the order the PAM matrix
78 * is arrayed in the file.
81 if (fgets(buffer, 512, fp) == NULL)
82 { squid_errno = SQERR_NODATA; return 0; }
84 /* Get the scale factor from the header.
85 * For BLOSUM files, we assume the line looks like:
86 * BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
87 * and we assume that the fraction is always 1/x;
89 * For PAM files, we assume the line looks like:
90 * PAM 120 substitution matrix, scale = ln(2)/2 = 0.346574
91 * and we assume that the number following the final '=' is our scale
93 if (strstr(buffer, "BLOSUM Clustered Scoring Matrix") != NULL &&
94 (sptr = strchr(buffer, '/')) != NULL)
97 if (! isdigit((int) (*sptr))) { squid_errno = SQERR_FORMAT; return 0; }
98 scale = (float) (log(2.0) / atof(sptr));
101 else if (strstr(buffer, "substitution matrix,") != NULL)
103 while ((sptr = strrchr(buffer, '=')) != NULL) {
112 } while ((sptr = strtok(buffer, " \t\n")) == NULL || *sptr == '#');
116 order[idx] = (int) *sptr - (int) 'A';
117 if (order[idx] < 0 || order[idx] > 25) order[idx] = 26;
119 } while ((sptr = strtok(NULL, " \t\n")) != NULL);
122 /* Allocate a pam matrix. For speed of indexing, we use
123 * a 27x27 matrix so we can do lookups using the ASCII codes
124 * of amino acid single-letter representations, plus one
125 * extra field to deal with the "*" (terminators).
127 if ((pam = (int **) calloc (27, sizeof(int *))) == NULL)
128 Die("calloc failed");
129 for (idx = 0; idx < 27; idx++)
130 if ((pam[idx] = (int *) calloc (27, sizeof(int))) == NULL)
131 Die("calloc failed");
133 /* Parse the rest of the file.
135 for (row = 0; row < nsymbols; row++)
137 if (fgets(buffer, 512, fp) == NULL)
138 { squid_errno = SQERR_NODATA; return 0; }
140 if ((sptr = strtok(buffer, " \t\n")) == NULL)
141 { squid_errno = SQERR_NODATA; return 0; }
142 for (col = 0; col < nsymbols; col++)
144 if (sptr == NULL) { squid_errno = SQERR_NODATA; return 0; }
146 /* Watch out for new BLAST format, with leading characters
148 if (*sptr == '*' || isalpha((int) *sptr))
149 col--; /* hack hack */
151 pam [order[row]] [order[col]] = atoi(sptr);
153 sptr = strtok(NULL, " \t\n");
159 if (ret_scale != NULL)
161 if (gotscale) *ret_scale = scale;
165 Warning("Failed to parse PAM matrix scale factor. Defaulting to ln(2)/2!");
167 Warn("Failed to parse PAM matrix scale factor. Defaulting to ln(2)/2!");
169 *ret_scale = log(2.0) / 2.0;