--- /dev/null
+/* print_pssm.c - 21-Jan-2005
+
+ copyright (c) 2005 - William R. Pearson and the University of Virginia
+
+ read a binary PSSM checkpoint file from blastpgp, and produce an ascii
+ formatted file
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <math.h>
+#include <string.h>
+
+#include "defs.h"
+#include "mm_file.h"
+#include "param.h"
+
+#include "uascii.h"
+#include "upam.h"
+
+void initenv(int, char **, struct pstruct *, char *);
+void read_pssm();
+void alloc_pam();
+int **alloc_pam2p();
+void initpam2();
+void fill_pam();
+double get_lambda();
+
+extern int optind;
+extern char *optarg;
+
+main(int argc, char **argv) {
+
+ char *aa0;
+ char libstr[MAX_FN];
+ char qname[MAX_FN];
+ int sq0off;
+ int i, n0;
+ FILE *fp;
+ struct pstruct pst, *ppst;
+
+ /* stuff from initfa.c/h_init() */
+
+ memcpy(qascii,aascii,sizeof(qascii));
+
+ /* initialize a pam matrix */
+ ppst = &pst;
+ strncpy(ppst->pamfile,"BL50",MAX_FN);
+ standard_pam(ppst->pamfile,ppst,0,0);
+
+ /* this is always protein by default */
+ ppst->nsq = naa;
+ ppst->nsqx = naax;
+ for (i=0; i<=ppst->nsqx; i++) {
+ ppst->sq[i] = aa[i];
+ ppst->hsq[i] = haa[i];
+ ppst->sqx[i]=aax[i]; /* sq = aa */
+ ppst->hsqx[i]=haax[i]; /* hsq = haa */
+ }
+ ppst->sq[ppst->nsqx+1] = ppst->sqx[ppst->nsqx+1] = '\0';
+
+ if ((aa0 = calloc(MAXTST,sizeof(char)))==NULL) {
+ fprintf(stderr,"Cannot allocate aa0\n");
+ exit(1);
+ }
+
+ initenv(argc, argv, &pst, qname);
+ alloc_pam(pst.nsq+1,pst.nsq+1, &pst);
+ initpam2(&pst);
+
+ n0 = getseq (qname, qascii, aa0, MAXTST, libstr,&sq0off);
+
+ if (!pst.pam_pssm) {
+ fprintf(stderr," ** ERROR ** No -P PSSM provided\n");
+ }
+ else {
+ ppst->pam2p[0] = alloc_pam2p(n0,pst.nsq);
+ ppst->pam2p[1] = alloc_pam2p(n0,pst.nsq);
+ if ((fp = fopen(pst.pgpfile,"rb"))!=NULL) {
+ read_pssm(aa0, n0, pst.nsq, pst.pamscale,fp,ppst);
+ }
+ }
+}
+
+void
+initenv(int argc, char **argv, struct pstruct *ppst, char *qname) {
+ char copt;
+
+ pascii = aascii;
+
+ while ((copt = getopt(argc, argv, "P:s:"))!=EOF) {
+ switch (copt) {
+ case 'P':
+ strncpy(ppst->pgpfile,optarg,MAX_FN);
+ ppst->pgpfile[MAX_FN-1]='\0';
+ ppst->pam_pssm = 1;
+ break;
+
+ case 's':
+ strncpy (ppst->pamfile, optarg, 120);
+ ppst->pamfile[120-1]='\0';
+ if (!standard_pam(ppst->pamfile,ppst,0, 0)) {
+ initpam (ppst->pamfile, ppst);
+ }
+ ppst->pam_set=1;
+ break;
+ }
+ }
+ optind--;
+
+ if (argc - optind > 1) strncpy(qname, argv[optind+1], MAX_FN);
+}
+
+
+/*
+ *aa0 - query sequence
+ n0 - length
+ pamscale - scaling for pam matrix - provided by apam.c, either
+ 0.346574 = ln(2)/2 (P120, BL62) or
+ 0.231049 = ln(2)/3 (P250, BL50)
+*/
+
+#define N_EFFECT 20
+
+void
+read_pssm(unsigned char *aa0, int n0, int nsq, double pamscale, FILE *fp, struct pstruct *ppst) {
+ int i, j, len;
+ int qi, rj;
+ int **pam2p;
+ int first, too_high;
+ char *query;
+ double freq, **freq2d, lambda, new_lambda;
+ double scale, scale_high, scale_low;
+
+ pam2p = ppst->pam2p[0];
+
+ if(1 != fread(&len, sizeof(int), 1, fp)) {
+ fprintf(stderr, "error reading from checkpoint file: %d\n", len);
+ exit(1);
+ }
+
+ if(len != n0) {
+ fprintf(stderr, "profile length (%d) and query length (%d) don't match!\n",
+ len,n0);
+ exit(1);
+ }
+
+ /* read over query sequence stored in BLAST profile */
+ if(NULL == (query = (char *) calloc(len, sizeof(char)))) {
+ fprintf(stderr, "Couldn't allocate memory for query!\n");
+ exit(1);
+ }
+
+ if(len != fread(query, sizeof(char), len, fp)) {
+ fprintf(stderr, "Couldn't read query sequence from profile: %s\n", query);
+ exit(1);
+ }
+
+ printf("%d\n%s\n",len,query);
+
+ /* currently we don't do anything with query; ideally, we should
+ check to see that it actually matches aa0 ... */
+
+ /* quick 2d array alloc: */
+ if((freq2d = (double **) calloc(n0, sizeof(double *))) == NULL) {
+ fprintf(stderr, "Couldn't allocate memory for frequencies!\n");
+ exit(1);
+ }
+
+ if((freq2d[0] = (double *) calloc(n0 * N_EFFECT, sizeof(double))) == NULL) {
+ fprintf(stderr, "Couldn't allocate memory for frequencies!\n");
+ exit(1);
+ }
+
+ /* a little pointer arithmetic to fill out 2d array: */
+ for (qi = 1 ; qi < n0 ; qi++) {
+ freq2d[qi] = freq2d[0] + (N_EFFECT * qi);
+ }
+
+ for (qi = 0 ; qi < n0 ; qi++) {
+ printf("%c",query[qi]);
+ for (rj = 0 ; rj < N_EFFECT ; rj++) {
+ if(1 != fread(&freq, sizeof(double), 1, fp)) {
+ fprintf(stderr, "Error while reading frequencies!\n");
+ exit(1);
+ }
+ printf(" %8.7g",freq*10.0);
+
+ if (freq > 1e-12) {
+ freq = log(freq /((double) (rrcounts[rj+1])/(double) rrtotal));
+ freq /= pamscale; /* this gets us close to originial pam scores */
+ freq2d[qi][rj] = freq;
+ }
+ else {freq2d[qi][rj] = freq;}
+ }
+ printf("\n");
+ }
+
+
+ /* now figure out the right scale */
+ scale = 1.0;
+ lambda = get_lambda(ppst->pam2[0], 20, 20, "\0ARNDCQEGHILKMFPSTWYV");
+
+ /* should be near 1.0 because of our initial scaling by ppst->pamscale */
+ fprintf(stderr, "real_lambda: %g\n", lambda);
+
+ /* get initial high/low scale values: */
+ first = 1;
+ while (1) {
+ fill_pam(pam2p, n0, 20, freq2d, scale);
+ new_lambda = get_lambda(pam2p, n0, 20, query);
+
+ if (new_lambda > lambda) {
+ if (first) {
+ first = 0;
+ scale = scale_high = 1.0 + 0.05;
+ scale_low = 1.0;
+ too_high = 1;
+ } else {
+ if (!too_high) break;
+ scale = (scale_high += scale_high - 1.0);
+ }
+ } else if (new_lambda > 0) {
+ if (first) {
+ first = 0;
+ scale_high = 1.0;
+ scale = scale_low = 1.0 - 0.05;
+ too_high = 0;
+ } else {
+ if (too_high) break;
+ scale = (scale_low += scale_low - 1.0);
+ }
+ } else {
+ fprintf(stderr, "new_lambda (%g) <= 0; matrix has positive average score", new_lambda);
+ exit(1);
+ }
+ }
+
+ /* now do binary search between low and high */
+ for (i = 0 ; i < 10 ; i++) {
+ scale = 0.5 * (scale_high + scale_low);
+ fill_pam(pam2p, n0, 20, freq2d, scale);
+ new_lambda = get_lambda(pam2p, n0, 20, query);
+
+ if (new_lambda > lambda) scale_low = scale;
+ else scale_high = scale;
+ }
+
+ scale = 0.5 * (scale_high + scale_low);
+ fill_pam(pam2p, n0, 20, freq2d, scale);
+
+ fprintf(stderr, "final scale: %g\n", scale);
+
+ for (qi = 0 ; qi < n0 ; qi++) {
+ fprintf(stderr, "%4d %c: ", qi+1, query[qi]);
+ for (rj = 1 ; rj <= 20 ; rj++) {
+ fprintf(stderr, "%4d", pam2p[qi][rj]);
+ }
+ fprintf(stderr, "\n");
+ }
+
+ free(freq2d[0]);
+ free(freq2d);
+
+ free(query);
+}
+
+/*
+ * alloc_pam(): allocates memory for the 2D pam matrix as well
+ * as for the integer array used to transmit the pam matrix
+ */
+void
+alloc_pam (int d1, int d2, struct pstruct *ppst)
+{
+ int i, *d2p;
+
+ if ((ppst->pam2[0] = (int **) malloc (d1 * sizeof (int *))) == NULL) {
+ fprintf(stderr,"Cannot allocate 2D pam matrix: %d",d1);
+ exit(1);
+ }
+
+ if ((ppst->pam2[1] = (int **) malloc (d1 * sizeof (int *))) == NULL) {
+ fprintf(stderr,"Cannot allocate 2D pam matrix: %d",d1);
+ exit(1);
+ }
+
+ if ((d2p = pam12 = (int *) malloc (d1 * d2 * sizeof (int))) == NULL) {
+ fprintf(stderr,"Cannot allocate 2D pam matrix: %d",d1);
+ exit(1);
+ }
+
+ for (i = 0; i < d1; i++, d2p += d2)
+ ppst->pam2[0][i] = d2p;
+
+ if ((d2p=pam12x= (int *) malloc (d1 * d2 * sizeof (int))) == NULL) {
+ fprintf(stderr,"Cannot allocate 2d pam matrix: %d",d2);
+ exit(1);
+ }
+
+ for (i = 0; i < d1; i++, d2p += d2)
+ ppst->pam2[1][i] = d2p;
+}
+
+void
+fill_pam(int **pam2p, int n0, int nsq, double **freq2d, double scale) {
+ int i, j;
+ double freq;
+
+ /* fprintf(stderr, "scale: %g\n", scale); */
+
+ /* now fill in the pam matrix: */
+ for (i = 0 ; i < n0 ; i++) {
+ for (j = 1 ; j <=nsq ; j++) {
+ freq = scale * freq2d[i][j-1];
+ if ( freq < 0.0) freq -= 0.5;
+ else freq += 0.5;
+ pam2p[i][j] = (int)(freq);
+ }
+ }
+}
+
+/*
+ * initpam2(struct pstruct pst): Converts 1-D pam matrix to 2-D
+ */
+void initpam2 (struct pstruct *ppst)
+{
+ int i, j, k, nsq, pam_xx, pam_xm;
+ int sa_x, sa_t, tmp;
+
+ nsq = ppst->nsq;
+ sa_x = pascii['X'];
+ sa_t = pascii['*'];
+
+ ppst->pam2[0][0][0] = -BIGNUM;
+ ppst->pam_h = -1; ppst->pam_l = 1;
+
+ k = 0;
+ for (i = 1; i <= nsq; i++) {
+ ppst->pam2[0][0][i] = ppst->pam2[0][i][0] = -BIGNUM;
+ for (j = 1; j <= i; j++) {
+ ppst->pam2[0][j][i] = ppst->pam2[0][i][j] = pam[k++] - ppst->pamoff;
+ if (ppst->pam_l > ppst->pam2[0][i][j]) ppst->pam_l =ppst->pam2[0][i][j];
+ if (ppst->pam_h < ppst->pam2[0][i][j]) ppst->pam_h =ppst->pam2[0][i][j];
+ }
+ }
+
+ ppst->nt_align = (ppst->dnaseq== SEQT_DNA || ppst->dnaseq == SEQT_RNA);
+
+ if (ppst->dnaseq == SEQT_RNA) {
+ tmp = ppst->pam2[0][nascii['G']][nascii['G']] - 1;
+ ppst->pam2[0][nascii['A']][nascii['G']] =
+ ppst->pam2[0][nascii['C']][nascii['T']] =
+ ppst->pam2[0][nascii['C']][nascii['U']] = tmp;
+ }
+
+ if (ppst->pam_x_set) {
+ for (i=1; i<=nsq; i++) {
+ ppst->pam2[0][sa_x][i] = ppst->pam2[0][i][sa_x]=ppst->pam_xm;
+ ppst->pam2[0][sa_t][i] = ppst->pam2[0][i][sa_t]=ppst->pam_xm;
+ }
+ ppst->pam2[0][sa_x][sa_x]=ppst->pam_xx;
+ ppst->pam2[0][sa_t][sa_t]=ppst->pam_xm;
+ }
+ else {
+ ppst->pam_xx = ppst->pam2[0][sa_x][sa_x];
+ ppst->pam_xm = ppst->pam2[0][1][sa_x];
+ }
+}
+
+double
+get_lambda(int **pam2p, int n0, int nsq, char *aa0) {
+ double lambda, H;
+ double *pr, tot, sum;
+ int i, ioff, j, min, max;
+
+ /* get min and max scores */
+ min = BIGNUM;
+ max = -BIGNUM;
+ if(pam2p[0][1] == -BIGNUM) {
+ ioff = 1;
+ n0++;
+ } else {
+ ioff = 0;
+ }
+
+ for (i = ioff ; i < n0 ; i++) {
+ for (j = 1; j <= nsq ; j++) {
+ if (min > pam2p[i][j])
+ min = pam2p[i][j];
+ if (max < pam2p[i][j])
+ max = pam2p[i][j];
+ }
+ }
+
+ /* fprintf(stderr, "min: %d\tmax:%d\n", min, max); */
+
+ if ((pr = (double *) calloc(max - min + 1, sizeof(double))) == NULL) {
+ fprintf(stderr, "Couldn't allocate memory for score probabilities: %d\n", max - min + 1);
+ exit(1);
+ }
+
+ tot = (double) rrtotal * (double) rrtotal * (double) n0;
+ for (i = ioff ; i < n0 ; i++) {
+ for (j = 1; j <= nsq ; j++) {
+ pr[pam2p[i][j] - min] +=
+ (double) ((double) rrcounts[aascii[aa0[i]]] * (double) rrcounts[j]) / tot;
+ }
+ }
+
+ sum = 0.0;
+ for(i = 0 ; i <= max-min ; i++) {
+ sum += pr[i];
+ /* fprintf(stderr, "%3d: %g %g\n", i+min, pr[i], sum); */
+ }
+ /* fprintf(stderr, "sum: %g\n", sum); */
+
+ for(i = 0 ; i <= max-min ; i++) { pr[i] /= sum; }
+
+ if (!karlin(min, max, pr, &lambda, &H)) {
+ fprintf(stderr, "Karlin lambda estimation failed\n");
+ }
+
+ /* fprintf(stderr, "lambda: %g\n", lambda); */
+ free(pr);
+
+ return lambda;
+}
+
+int **
+alloc_pam2p(int len, int nsq) {
+ int i;
+ int **pam2p;
+
+ if ((pam2p = (int **)calloc(len,sizeof(int *)))==NULL) {
+ fprintf(stderr," Cannot allocate pam2p: %d\n",len);
+ return NULL;
+ }
+
+ if((pam2p[0] = (int *)calloc((nsq+1)*len,sizeof(int)))==NULL) {
+ fprintf(stderr, "Cannot allocate pam2p[0]: %d\n", (nsq+1)*len);
+ free(pam2p);
+ return NULL;
+ }
+
+ for (i=1; i<len; i++) {
+ pam2p[i] = pam2p[0] + (i*(nsq+1));
+ }
+
+ return pam2p;
+}
+
+void free_pam2p(int **pam2p) {
+ if (pam2p) {
+ free(pam2p[0]);
+ free(pam2p);
+ }
+}
+