3 /* $Name: fa_34_26_5 $ - $Id: initfa.c,v 1.148 2007/04/26 18:40:58 wrp Exp $ */
5 /* copyright (c) 1996, 1997, 1998 William R. Pearson and the U. of Virginia */
7 /* init??.c files provide function specific initializations */
9 /* h_init() - called from comp_lib.c, comp_thr.c to initialize pstruct ppst
10 which includes the alphabet, and pam matrix
12 alloc_pam() - allocate pam matrix space
13 init_pam2() - convert from 1D to 2D pam
15 init_pamx() - convert from 1D to 2D pam
17 f_initenv() - set up mngmsg and pstruct defaults
18 f_getopt() - read fasta specific command line options
19 f_getarg() - read ktup
21 resetp() - reset the parameters, scoring matrix for DNA-DNA/DNA-prot
23 query_parm() - ask for ktup
24 last_init() - some things must be done last
26 f_initpam() - set some parameters based on the pam matrix
36 #include <sys/types.h>
57 int initpam(char *, struct pstruct *);
58 void init_pam2 (struct pstruct *ppst);
59 void extend_pssm(unsigned char *aa0, int n0, struct pstruct *ppst);
60 void build_xascii(int *qascii, char *save_str);
61 void ann_ascii(int *qascii, char *ann_arr);
62 void re_ascii(int *qascii, int *pascii);
63 extern int nrand(int);
65 /* at some point, all the defaults should be driven from this table */
67 #pgm q_seq l_seq p_seq matrix g_open g_ext fr_shft e_cut ktup
68 # -n/-p -s -e -f -h/-j -E argv[3]
69 fasta prot(0) prot(0) prot(0) bl50 -10 -2 - 10.0 2
70 fasta dna(1) dna(1) dna(1) +5/-4 -14 -4 - 2.0 6
71 ssearch prot(0) prot(0) prot(0) bl50 -10 -2 - 10.0 -
72 ssearch dna(1) dna(1) dna(1) +5/-4 -14 -4 - 2.0 -
73 fastx dna(1) prot(0) prot(0) BL50 -12 -2 -20 5.0 2
74 fasty dna(1) prot(0) prot(0) BL50 -12 -2 -20/-24 5.0 2
75 tfastx dna(1) prot(0) prot(0) BL50 -14 -2 -20 5.0 2
76 tfasty dna(1) prot(0) prot(0) BL50 -14 -2 -20/-24 5.0 2
77 fasts prot(0) prot(0) prot(0) MD20-MS - - - 5.0 -
78 fasts dna(1) dna(1) dna(1) +2/-4 - - - 5.0 1
79 tfasts prot(0) dna(1) prot(0) MD10-MS - - - 2.0 1
80 fastf prot(0) prot(0) prot(0) MD20 - - - 2.0 1
81 tfastf prot(0) dna(1) prot(0) MD10 - - - 1.0 1
82 fastm prot(0) prot(0) prot(0) MD20 - - - 5.0 1
83 fastm dna(1) dna(1) dna(1) +2/-4 - - - 2.0 1
84 tfastm prot(0) dna(1) prot(0) MD10 - - - 2.0 1
103 "\nPlease cite:\n W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448\n",
104 "\nPlease cite:\n T. F. Smith and M. S. Waterman, (1981) J. Mol. Biol. 147:195-197; \n W.R. Pearson (1991) Genomics 11:635-650\n",
105 "\nPlease cite:\n Pearson et al, Genomics (1997) 46:24-36\n",
106 "\nPlease cite:\n Mackey et al. Mol. Cell. Proteomics (2002) 1:139-147\n",
107 "\nPlease cite:\n W.R. Pearson (1996) Meth. Enzymol. 266:227-258\n"
119 #define SSS_PID 10 /* old (slow) non-PG Smith-Waterman */
120 #define TFA_PID FA_PID+10
121 #define TFX_PID FX_PID+10
122 #define TFY_PID FY_PID+10
123 #define TFS_PID FS_PID+10
124 #define TFF_PID FF_PID+10
125 #define TFM_PID FM_PID+10
129 {0, "", "", "", NULL, 400, "", 0, 0, 0, 1.0, 0 }, /* 0 */
130 {FA_PID, "FASTA", "fa",
131 "FASTA searches a protein or DNA sequence data bank",
132 NULL, 401, "BL50", 0, 0, 0, 10.0, 2}, /* 1 - FASTA */
133 {SS_PID, "SSEARCH","gsw","SSEARCH searches a sequence data bank",
134 NULL, 404, "BL50", 0, 0, 0, 10.0, 0}, /* 2 - SSEARCH */
135 {FX_PID, "FASTX","fx",
136 "FASTX compares a DNA sequence to a protein sequence data bank",
137 NULL, 405, "BL50", -2, -20, 0, 5.0, 2}, /* 3 - FASTX */
138 {FY_PID, "FASTY", "fy",
139 "FASTY compares a DNA sequence to a protein sequence data bank",
140 NULL, 405, "BL50", -2, -20, -24, 5.0, 2}, /* 4 - FASTY */
141 {FS_PID, "FASTS", "fs",
142 "FASTS compares linked peptides to a protein data bank",
143 NULL, 400, "MD20-MS", 0, 0, 0, 5.0, 1}, /* 5 - FASTS */
144 {FF_PID, "FASTF", "ff",
145 "FASTF compares mixed peptides to a protein databank",
146 NULL, 400, "MD20", 0, 0, 0, 2.0, 1 }, /* 6 - FASTF */
147 {FM_PID, "FASTM", "fm",
148 "FASTM compares ordered peptides to a protein data bank",
149 NULL, 400, "MD20", 0, 0, 0, 5.0, 1 }, /* 7 - FASTM */
150 {RSS_PID, "PRSS", "rss",
151 "PRSS evaluates statistical signficance using Smith-Waterman",
152 NULL, 401, "BL50", 0, 0, 0, 1000.0, 0 }, /* 8 - PRSS */
153 {RFX_PID,"PRFX", "rfx",
154 "PRFX evaluates statistical signficance using FASTX",
155 NULL, 401, "BL50", -2, -20, -24, 1000.0, 2 }, /* 9 - PRFX */
156 {SSS_PID, "OSEARCH","ssw","OSEARCH searches a sequence data bank",
157 NULL, 404, "BL50", 0, 0, 0, 10.0, 0}, /* 2 - OSEARCH */
158 {TFA_PID, "TFASTA", "tfa",
159 "TFASTA compares a protein to a translated DNA data bank",
160 NULL, 402, "BL50", -2, 0, 0, 5.0, 2 },
161 {0, "", "", "", NULL, 400, "", 0, 0, 0, 1.0, 0 }, /* 0 */
162 {TFX_PID, "TFASTX", "tfx",
163 "TFASTX compares a protein to a translated DNA data bank",
164 NULL, 406, "BL50", -2, -20, 0, 2.0, 2},
165 {TFY_PID, "TFASTY", "tfy",
166 "TFASTY compares a protein to a translated DNA data bank",
167 NULL, 406, "BL50", -2, -20, -24, 2.0, 2},
168 {TFS_PID, "TFASTS", "tfs",
169 "TFASTS compares linked peptides to a translated DNA data bank",
170 NULL, 400, "MD10-MS", 0, 0, 0, 2.0, 2 },
171 {TFF_PID, "TFASTF", "tff",
172 "TFASTF compares mixed peptides to a protein databank",
173 NULL, 400, "MD10", 0, 0, 0, 1.0, 1 },
174 {TFM_PID, "TFASTM", "tfm",
175 "TFASTM compares ordered peptides to a translated DNA databank",
176 NULL, 400, "MD10", 0, 0, 0, 1.0, 1 }
188 int nrelv, srelv, arelv;
189 char *f_id0, *f_id1, *label;
192 /* pgm_id q_seqt l_seqt p_seqt sw_f st qf nf nrv srv arv s_ix */
193 struct msg_def_str msg_def_arr[20] = {
194 {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "", "", ""}, /* ID=0 */
195 {FA_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 3, 1, 3,
197 {SS_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 1, 1, 1,
199 {FX_PID, SEQT_DNA, SEQT_PROT, SEQT_PROT, 1, 1, 2, -1, 3, 1, 3,
201 {FY_PID, SEQT_DNA, SEQT_PROT, SEQT_PROT, 1, 1, 2, -1, 3, 1, 3,
203 {FS_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 3, 2, 3,
204 "fs","fs", "initn init1"},
205 {FF_PID, SEQT_PROT,SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 3, 2, 3,
206 "ff","ff", "initn init1"},
207 {FM_PID, SEQT_PROT,SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 3, 2, 3,
208 "fm","fm","initn init1"},
209 {RSS_PID, SEQT_UNK,SEQT_PROT, SEQT_PROT, 0, 1, 1, -1, 1, 1, 1,
211 {RFX_PID, SEQT_DNA,SEQT_PROT, SEQT_PROT, 0, 1, 2, -1, 3, 1, 3,
213 {SSS_PID, SEQT_UNK,SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 1, 1, 1,
215 {TFA_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 0, 1, 1, 6, 3, 1, 3,
216 "tfa","fa","initn init1"},
217 {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "", "", ""}, /* ID=12 */
218 {TFX_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 2, 3, 2, 3,
219 "tfx","sx","initn opt"},
220 {TFY_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 2, 3, 2, 3,
221 "tfy","sy","initn opt"},
222 {TFS_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 6, 3, 2, 3,
223 "tfs","fs","initn init1"},
224 {TFF_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 6, 3, 2, 3,
225 "tff","ff","initn init1"},
226 {TFM_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 6, 3, 2, 3,
227 "tfm","fm","initn init1"}
237 pgm_def_arr[FA_PID].ref_str = ref_str_a[0];
240 pgm_def_arr[TFA_PID].ref_str = ref_str_a[0];
248 pgm_def_arr[FX_PID].ref_str = ref_str_a[2];
251 pgm_def_arr[RFX_PID].ref_str = ref_str_a[2];
255 pgm_def_arr[TFX_PID].ref_str = ref_str_a[2];
262 pgm_def_arr[FY_PID].ref_str = ref_str_a[2];
265 pgm_def_arr[TFY_PID].ref_str = ref_str_a[2];
272 pgm_def_arr[FS_PID].ref_str = ref_str_a[3];
275 pgm_def_arr[TFS_PID].ref_str = ref_str_a[3];
282 pgm_def_arr[FF_PID].ref_str = ref_str_a[3];
285 pgm_def_arr[TFF_PID].ref_str = ref_str_a[3];
292 pgm_def_arr[FM_PID].ref_str = ref_str_a[3];
295 pgm_def_arr[TFM_PID].ref_str = ref_str_a[3];
301 pgm_def_arr[SS_PID].ref_str = ref_str_a[1];
306 pgm_def_arr[SSS_PID].ref_str = ref_str_a[1];
312 pgm_def_arr[RSS_PID].ref_str = ref_str_a[4];
320 char *iprompt1=" test sequence file name: ";
321 char *iprompt2=" database file name: ";
323 char *verstr="version 34.26.5 April 26, 2007";
325 char *s_optstr = "13Ac:f:g:h:j:k:nopP:r:s:St:Ux:y:";
328 static int ktup_set = 0;
329 static int gap_set=0;
330 static int del_set=0;
331 static int mshuff_set = 0;
332 static int prot2dna = 0;
334 extern int max_workers;
336 extern void s_abort(char *, char *);
337 extern void init_ascii(int ext_sq, int *sascii, int dnaseq);
338 extern int standard_pam(char *smstr, struct pstruct *ppst,
339 int del_set, int gap_set);
340 extern void mk_n_pam(int *arr,int siz, int mat, int mis);
341 extern int karlin(int , int, double *, double *, double *);
342 extern void init_karlin_a(struct pstruct *, double *, double **);
343 extern int do_karlin_a(int **, struct pstruct *, double *,
344 double *, double *, double *, double *);
346 #if defined(TFAST) || defined(FASTX) || defined(FASTY)
347 extern void aainit(int tr_type, int debug);
350 char *iprompt0, *prog_func, *refstr;
353 /* Sets defaults assuming a protein sequence */
354 void h_init (struct pstruct *ppst, struct mngmsg *m_msp, char *pgm_abbr)
356 struct pgm_def_str pgm_def;
359 ppst->pgm_id = pgm_id = get_pgm_id();
360 pgm_def = pgm_def_arr[pgm_id];
362 /* check that pgm_def_arr[] is valid */
363 if (pgm_def.pgm_id != pgm_id) {
365 "**pgm_def integrity failure: def.pgm_id %d != pgm_id %d**\n",
366 pgm_def.pgm_id, pgm_id);
370 /* check that msg_def_arr[] is valid */
371 if (msg_def_arr[pgm_id].pgm_id != pgm_id) {
373 "**msg_def integrity failure: def.pgm_id %d != pgm_id %d**\n",
374 msg_def_arr[pgm_id].pgm_id, pgm_id);
378 strncpy(pgm_abbr,pgm_def.pgm_abbr,MAX_SSTR);
379 iprompt0 = pgm_def.iprompt0;
380 refstr = pgm_def.ref_str;
381 prog_func = pgm_def.prog_func;
383 /* MAXTOT = MAXTST + MAXLIB for everything except TFAST,
384 where it is MAXTST + MAXTRN */
385 m_msp->max_tot = MAXTOT;
387 /* set up DNA query sequence if required*/
388 if (msg_def_arr[pgm_id].q_seqt == SEQT_DNA) {
389 memcpy(qascii,nascii,sizeof(qascii));
390 m_msp->qdnaseq = SEQT_DNA;
392 else { /* when SEQT_UNK, start with protein */
393 memcpy(qascii,aascii,sizeof(qascii));
394 m_msp->qdnaseq = msg_def_arr[pgm_id].q_seqt;
397 #if defined(FASTF) || defined(FASTS) || defined(FASTM)
399 /* also initialize aascii, nascii for databases */
403 /* initialize a pam matrix */
404 strncpy(ppst->pamfile,pgm_def.smstr,MAX_FN);
405 standard_pam(ppst->pamfile,ppst,del_set,gap_set);
408 /* this is always protein by default */
411 for (i=0; i<=ppst->nsqx; i++) {
413 ppst->hsq[i] = haa[i];
414 ppst->sqx[i]=aax[i]; /* sq = aa */
415 ppst->hsqx[i]=haax[i]; /* hsq = haa */
417 ppst->sq[ppst->nsqx+1] = ppst->sqx[ppst->nsqx+1] = '\0';
419 /* set up the c_nt[] mapping */
421 #if defined(FASTS) || defined(FASTF) || defined(FASTM)
422 ppst->c_nt[ESS] = ESS;
425 for (i=1; i<=nnt; i++) {
426 ppst->c_nt[i]=gc_nt[i];
427 ppst->c_nt[i+nnt]=gc_nt[i]+nnt;
432 * alloc_pam(): allocates memory for the 2D pam matrix as well
433 * as for the integer array used to transmit the pam matrix
436 alloc_pam (int d1, int d2, struct pstruct *ppst)
441 if ((ppst->pam2[0] = (int **) malloc (d1 * sizeof (int *))) == NULL) {
442 sprintf(err_str,"Cannot allocate 2D pam matrix: %d",d1);
443 s_abort (err_str,"");
446 if ((ppst->pam2[1] = (int **) malloc (d1 * sizeof (int *))) == NULL) {
447 sprintf(err_str,"Cannot allocate 2D pam matrix: %d",d1);
448 s_abort (err_str,"");
451 if ((d2p = pam12 = (int *) calloc (d1 * d2, sizeof (int))) == NULL) {
452 sprintf(err_str,"Cannot allocate 2D pam matrix: %d",d1);
453 s_abort (err_str,"");
456 for (i = 0; i < d1; i++, d2p += d2)
457 ppst->pam2[0][i] = d2p;
459 if ((d2p=pam12x= (int *) malloc (d1 * d2 * sizeof (int))) == NULL) {
460 sprintf(err_str,"Cannot allocate 2d pam matrix: %d",d2);
461 s_abort (err_str,"");
464 for (i = 0; i < d1; i++, d2p += d2)
465 ppst->pam2[1][i] = d2p;
471 * init_pam2(struct pstruct pst): Converts 1-D pam matrix to 2-D
474 init_pam2 (struct pstruct *ppst) {
479 ppst->pam2[0][0][0] = -BIGNUM;
480 ppst->pam_h = -1; ppst->pam_l = 1;
483 for (i = 1; i <= nsq; i++) {
484 ppst->pam2[0][0][i] = ppst->pam2[0][i][0] = -BIGNUM;
485 for (j = 1; j <= i; j++) {
486 ppst->pam2[0][j][i] = ppst->pam2[0][i][j] = pam[k++] - ppst->pamoff;
487 if (ppst->pam_l > ppst->pam2[0][i][j]) ppst->pam_l =ppst->pam2[0][i][j];
488 if (ppst->pam_h < ppst->pam2[0][i][j]) ppst->pam_h =ppst->pam2[0][i][j];
494 init_pamx (struct pstruct *ppst) {
495 int i, j, k, nsq, pam_xx, pam_xm;
500 ppst->nt_align = (ppst->dnaseq== SEQT_DNA || ppst->dnaseq == SEQT_RNA);
502 if (ppst->nt_align) {
511 if (ppst->dnaseq == SEQT_RNA) {
512 tmp = ppst->pam2[0][nascii['G']][nascii['G']] - 1;
513 ppst->pam2[0][nascii['A']][nascii['G']] =
514 ppst->pam2[0][nascii['C']][nascii['T']] =
515 ppst->pam2[0][nascii['C']][nascii['U']] = tmp;
518 if (ppst->pam_x_set) {
519 for (i=1; i<=nsq; i++) {
520 ppst->pam2[0][sa_x][i] = ppst->pam2[0][i][sa_x]=ppst->pam_xm;
521 ppst->pam2[0][sa_t][i] = ppst->pam2[0][i][sa_t]=ppst->pam_xm;
523 ppst->pam2[0][sa_x][sa_x]=ppst->pam_xx;
524 ppst->pam2[0][sa_t][sa_t]=ppst->pam_xx;
527 ppst->pam_xx = ppst->pam2[0][sa_x][sa_x];
528 ppst->pam_xm = ppst->pam2[0][1][sa_x];
531 pam_xx = ppst->pam_xx;
532 pam_xm = ppst->pam_xm;
534 if (ppst->ext_sq_set) { /* using extended alphabet */
535 /* fill in pam2[1] matrix */
536 ppst->pam2[1][0][0] = -BIGNUM;
537 /* fill in additional parts of the matrix */
538 for (i = 1; i <= nsq; i++) {
540 /* -BIGNUM to all matches vs 0 */
541 ppst->pam2[0][0][i+nsq] = ppst->pam2[0][i+nsq][0] =
542 ppst->pam2[1][0][i+nsq] = ppst->pam2[1][i+nsq][0] =
543 ppst->pam2[1][0][i] = ppst->pam2[1][i][0] = -BIGNUM;
545 for (j = 1; j <= nsq; j++) {
547 /* replicate pam2[0] to i+nsq, j+nsq */
548 ppst->pam2[0][i+nsq][j] = ppst->pam2[0][i][j+nsq] =
549 ppst->pam2[0][i+nsq][j+nsq] = ppst->pam2[1][i][j] =
552 /* set the high portion of pam2[1] to the corresponding value
553 of pam2[1][sa_x][j] */
555 ppst->pam2[1][i+nsq][j] = ppst->pam2[1][i][j+nsq]=
556 ppst->pam2[1][i+nsq][j+nsq]=ppst->pam2[0][sa_x][j];
562 /* function specific initializations */
564 f_initenv (struct mngmsg *m_msp, struct pstruct *ppst, unsigned char **aa0) {
565 struct msg_def_str m_msg_def;
568 pgm_id = ppst->pgm_id;
569 m_msg_def = msg_def_arr[pgm_id];
571 m_msp->last_calc_flg=0;
573 strncpy(m_msp->f_id0,m_msg_def.f_id0,sizeof(m_msp->f_id0));
574 strncpy(m_msp->f_id1,m_msg_def.f_id1,sizeof(m_msp->f_id1));
575 strncpy (m_msp->label, m_msg_def.label, sizeof(m_msp->label));
578 strncpy (m_msp->alab[0],"initn",20);
579 strncpy (m_msp->alab[1],"init1",20);
580 strncpy (m_msp->alab[2],"opt",20);
582 strncpy (m_msp->alab[0],"s-w opt",20);
585 ppst->gdelval += pgm_def_arr[pgm_id].g_open_mod;
586 ppst->sw_flag = m_msg_def.sw_flag;
587 m_msp->e_cut=pgm_def_arr[pgm_id].e_cut;
591 m_msp->qframe = m_msg_def.qframe;
592 ppst->sw_flag = m_msg_def.sw_flag;
593 m_msp->nframe = m_msg_def.nframe;
594 m_msp->nrelv = m_msg_def.nrelv;
595 m_msp->srelv = m_msg_def.srelv;
596 m_msp->arelv = m_msg_def.arelv;
597 m_msp->stages = m_msg_def.stages;
599 m_msp->shuff_wid = 0;
600 m_msp->shuff_max = 200;
603 /* see param.h for the definition of all these */
607 m_msp->escore_flg = 0;
609 /* pam information */
611 #if defined(FASTS) || defined(FASTF) || defined(FASTM)
612 ppst->pam_xx = ppst->pam_xm = 0;
614 ppst->pam_xx = 1; /* set >0 to use pam['X']['X'] value */
615 ppst->pam_xm = -1; /* set >0 to use pam['X']['A-Z'] value */
622 ppst->ext_sq_set = 0;
624 if (pgm_def_arr[ppst->pgm_id].ktup > 0) {
626 ppst->param_u.fa.bestscale = 300;
627 ppst->param_u.fa.bestoff = 36;
628 ppst->param_u.fa.bkfact = 6;
629 ppst->param_u.fa.scfact = 3;
630 ppst->param_u.fa.bktup = 2;
631 ppst->param_u.fa.ktup = 0;
632 ppst->param_u.fa.bestmax = 50;
633 ppst->param_u.fa.pamfact = 1;
634 ppst->param_u.fa.altflag = 0;
635 ppst->param_u.fa.optflag = 1;
636 ppst->param_u.fa.iniflag = 0;
637 ppst->param_u.fa.optcut = 0;
638 ppst->param_u.fa.optcut_set = 0;
639 ppst->param_u.fa.cgap = 0;
640 ppst->param_u.fa.optwid = MAXWINDOW;
645 /* switches for fasta only */
647 static int shift_set=0;
648 static int subs_set=0;
649 static int sw_flag_set=0;
650 static int nframe_set=0;
651 static int wid_set=0;
654 f_getopt (char copt, char *optarg,
655 struct mngmsg *m_msg, struct pstruct *ppst)
660 pgm_id = ppst->pgm_id;
664 if (pgm_def_arr[pgm_id].ktup > 0) {
665 ppst->param_u.fa.iniflag=1;
670 if (pgm_id == TFA_PID) {
671 m_msg->nframe = 3; break;
674 m_msg->nframe = 1; /* for TFASTXY */
675 m_msg->qframe = 1; /* for FASTA, FASTX */
683 if (pgm_def_arr[pgm_id].ktup > 0) {
684 sscanf (optarg, "%d", &ppst->param_u.fa.optcut);
685 ppst->param_u.fa.optcut_set = 1;
689 sscanf (optarg, "%d", &ppst->gdelval);
690 if (ppst->gdelval > 0) ppst->gdelval = -ppst->gdelval;
694 sscanf (optarg, "%d", &ppst->ggapval);
695 if (ppst->ggapval > 0) ppst->ggapval = -ppst->ggapval;
699 sscanf (optarg, "%d", &ppst->gshift);
700 if (ppst->gshift > 0) ppst->gshift = -ppst->gshift;
704 sscanf (optarg, "%d", &ppst->gsubs);
708 sscanf (optarg, "%d", &m_msg->shuff_max);
712 m_msg->qdnaseq = SEQT_DNA;
713 re_ascii(qascii,nascii);
714 strncpy(m_msg->sqnam,"nt",4);
718 if (pgm_def_arr[pgm_id].ktup > 0) {
719 ppst->param_u.fa.optflag = 0;
720 msg_def_arr[pgm_id].nrelv = m_msg->nrelv = 2;
724 m_msg->qdnaseq = SEQT_PROT;
725 ppst->dnaseq = SEQT_PROT;
726 strncpy(m_msg->sqnam,"aa",4);
729 strncpy(ppst->pgpfile,optarg,MAX_FN);
730 if ((bp=strchr(ppst->pgpfile,' '))!=NULL) {
732 ppst->pgpfile_type = atoi(bp+1);
734 else ppst->pgpfile_type = 0;
735 ppst->pgpfile[MAX_FN-1]='\0';
739 sscanf(optarg,"%d/%d",&ppst->p_d_mat,&ppst->p_d_mis);
740 if (ppst->p_d_mat > 0 && ppst->p_d_mis < 0) {
742 strncpy(ppst->pamfile,optarg,40);
746 strncpy (ppst->pamfile, optarg, 120);
747 ppst->pamfile[120-1]='\0';
748 if (!standard_pam(ppst->pamfile,ppst,del_set, gap_set)) {
749 initpam (ppst->pamfile, ppst);
753 case 'S': /* turn on extended alphabet for seg */
754 ppst->ext_sq_set = 1;
757 if (tolower(optarg[0])=='t') {
758 m_msg->term_code = aascii['*']; optarg++;
760 if (*optarg) {sscanf (optarg, "%d", &ppst->tr_type);}
763 m_msg->qdnaseq = SEQT_RNA;
764 memcpy(qascii,nascii,sizeof(qascii));
765 strncpy(m_msg->sqnam,"nt",4);
770 if (strchr(optarg,',')!=NULL) {
771 sscanf (optarg,"%d,%d",&ppst->pam_xx, &ppst->pam_xm);
774 sscanf (optarg,"%d",&ppst->pam_xx);
775 ppst->pam_xm = ppst->pam_xx;
780 if (pgm_def_arr[pgm_id].ktup > 0) {
781 sscanf (optarg, "%d", &ppst->param_u.fa.optwid);
789 f_lastenv (struct mngmsg *m_msg, struct pstruct *ppst)
791 char save_str[MAX_SSTR];
793 #if !defined(FASTM) && !defined(FASTS) && !defined(FASTF)
794 strncpy(save_str,"*",sizeof(save_str));
796 strncpy(save_str,",",sizeof(save_str));
799 if (m_msg->qdnaseq == SEQT_UNK) {
800 build_xascii(qascii,save_str);
801 if (m_msg->ann_flg) ann_ascii(qascii,m_msg->ann_arr);
804 /* this check allows lc DNA sequence queries with FASTX */
805 #if defined(FASTA) && !defined(FASTS) && !defined(FASTM) && !defined(FASTF)
807 init_ascii(ppst->ext_sq_set,qascii,m_msg->qdnaseq);
812 f_getarg (int argc, char **argv, int optind,
813 struct mngmsg *m_msg, struct pstruct *ppst)
816 if (pgm_def_arr[ppst->pgm_id].ktup > 0) {
817 if (argc - optind >= 4) {
818 sscanf (argv[optind + 3], "%d", &ppst->param_u.fa.ktup);
822 ppst->param_u.fa.ktup = -ppst->param_u.fa.bktup;
825 if (ppst->pgm_id == RSS_PID && argc - optind > 3) {
826 sscanf (argv[optind + 3], "%d", &m_msg->shuff_max);
829 if (ppst->pgm_id == RFX_PID && argc - optind > 4) {
830 sscanf (argv[optind + 4], "%d", &m_msg->shuff_max);
834 /* fills in the query ascii mapping from the parameter
839 re_ascii(int *qascii, int *pascii) {
842 for (i=0; i < 128; i++) {
843 if (qascii[i] > '@' || qascii[i] < ESS) {
844 qascii[i] = pascii[i];
850 /* recode has become function specific to accommodate FASTS/M */
851 /* modified 28-Dec-2004 to ensure that all mapped characters
854 recode(unsigned char *seq, int n, int *qascii, int nsqx) {
858 #if defined(FASTS) || defined(FASTM)
862 for (i=0; i < n; i++) {
864 if (seq[i] > '@') seq[i] = qascii[seq[i]];
865 if (seq[i] > nsqx && seq[i]!=ESS) {
866 fprintf(stderr, "*** Warning - unrecognized residue at %d:%c - %2d\n",
868 seq[i] = qascii['X'];
875 /* here we have the query sequence, all the command line options,
876 but we need to set various parameter options based on the type
877 of the query sequence (m_msg->qdnaseq = 0:protein/1:DNA) and
878 the function (FASTA/FASTX/TFASTA)
881 /* this resetp is for conventional a FASTA/TFASTXYZ search */
883 resetp (struct mngmsg *m_msg, struct pstruct *ppst) {
886 pgm_id = ppst->pgm_id;
889 if (m_msg->qdnaseq == SEQT_DNA || m_msg->qdnaseq == SEQT_RNA) {
890 fprintf(stderr," %s compares a protein to a translated\n\
891 DNA sequence library. Do not use a DNA query/scoring matrix.\n",prog_func);
895 #if (defined(FASTX) || defined(FASTY))
896 if (!(m_msg->qdnaseq == SEQT_DNA || m_msg->qdnaseq == SEQT_RNA)) {
897 fprintf(stderr," FASTX/Y compares a DNA sequence to a protein database\n");
898 fprintf(stderr," Use a DNA query\n");
904 /* this code changes parameters for programs (FA_PID, SS_PID, FS_PID,
905 RSS_PID) that can examine either protein (initial state) or DNA
906 Modified May, 2006 to reset e_cut for DNA comparisons.
909 if (msg_def_arr[pgm_id].q_seqt == SEQT_UNK) {
910 if (m_msg->qdnaseq == SEQT_DNA || m_msg->qdnaseq == SEQT_RNA) {
911 msg_def_arr[pgm_id].q_seqt = m_msg->qdnaseq;
912 msg_def_arr[pgm_id].p_seqt = SEQT_DNA;
913 msg_def_arr[pgm_id].l_seqt = SEQT_DNA;
914 if (m_msg->qdnaseq == SEQT_DNA) msg_def_arr[pgm_id].qframe = 2;
915 pgm_def_arr[pgm_id].e_cut /= 5.0;
918 msg_def_arr[pgm_id].q_seqt = SEQT_PROT;
922 ppst->dnaseq = msg_def_arr[pgm_id].p_seqt;
923 if (!sw_flag_set) ppst->sw_flag = msg_def_arr[pgm_id].sw_flag;
924 if (!m_msg->e_cut_set) m_msg->e_cut=pgm_def_arr[pgm_id].e_cut;
926 if (ppst->dnaseq == SEQT_DNA && m_msg->qdnaseq==SEQT_RNA) {
927 ppst->dnaseq = SEQT_RNA;
930 if (ppst->dnaseq==SEQT_DNA) pascii = &nascii[0];
931 else if (ppst->dnaseq==SEQT_RNA) {
933 ppst->sq[nascii['T']] = 'U';
935 else pascii = &aascii[0];
936 m_msg->ldnaseq = msg_def_arr[pgm_id].l_seqt;
937 if (m_msg->ldnaseq & SEQT_DNA) {
938 memcpy(lascii,nascii,sizeof(lascii));
941 init_ascii(ppst->ext_sq_set,lascii,m_msg->ldnaseq);
944 /* no init_ascii() because we translate lower case library sequences */
948 memcpy(lascii,aascii,sizeof(lascii)); /* initialize lib mapping */
950 #if defined(FASTF) || defined(FASTS) || defined(FASTM)
953 init_ascii(ppst->ext_sq_set,lascii,m_msg->ldnaseq);
957 m_msg->qframe = msg_def_arr[pgm_id].qframe;
958 m_msg->nframe = msg_def_arr[pgm_id].nframe;
961 /* the possibilities:
969 if (m_msg->qdnaseq == SEQT_DNA) {
971 if (m_msg->qframe == 1 && m_msg->revcomp==1) {
972 m_msg->qframe = m_msg->revcomp+1;
975 else if (m_msg->qdnaseq == SEQT_RNA) {
976 m_msg->qframe = m_msg->revcomp+1;
980 /* change settings for DNA search */
981 if (ppst->dnaseq == SEQT_DNA || ppst->dnaseq == SEQT_RNA) {
986 ppst->gdelval = -16; /* def. del penalty */
988 ppst->gdelval = -12; /* def. open penalty */
991 if (!gap_set) ppst->ggapval = -4; /* def. gap penalty */
993 if (pgm_def_arr[pgm_id].ktup > 0) {
994 /* these parameters are used to scale optcut, they should be replaced
995 by statistically based parameters */
996 if (!wid_set) ppst->param_u.fa.optwid = 16;
997 ppst->param_u.fa.bestscale = 80;
998 ppst->param_u.fa.bkfact = 5;
999 ppst->param_u.fa.scfact = 1;
1000 ppst->param_u.fa.bktup = 6;
1001 ppst->param_u.fa.bestmax = 80;
1002 ppst->param_u.fa.bestoff = 45;
1006 strncpy(m_msg->f_id1,"bs",sizeof(m_msg->f_id1));
1012 if (ppst->param_u.fa.pamfact >= 0) ppst->param_u.fa.pamfact = 0;
1013 if (ppst->param_u.fa.ktup < 0)
1014 ppst->param_u.fa.ktup = -ppst->param_u.fa.bktup;
1019 for (i=0; i<=ppst->nsqx; i++) {
1020 ppst->hsq[i] = hnt[i];
1021 ppst->sq[i] = nt[i];
1022 ppst->hsqx[i] = hntx[i];
1023 ppst->sqx[i] = ntx[i];
1025 ppst->sq[ppst->nsqx+1] = ppst->sqx[ppst->nsqx+1] = '\0';
1027 if (!ppst->pam_set) {
1029 mk_n_pam(npam,nnt,ppst->p_d_mat,ppst->p_d_mis);
1030 #if !defined(FASTS) && !defined(FASTM)
1031 else if (ppst->pamfile[0]=='\0' || strncmp(ppst->pamfile,"BL50",4)==0) {
1032 strncpy (ppst->pamfile, "+5/-4", sizeof(ppst->pamfile));
1035 else if (strncmp(ppst->pamfile,"MD20",4)==0) {
1036 strncpy (ppst->pamfile, "+2/-2", sizeof(ppst->pamfile));
1039 mk_n_pam(npam,nnt,ppst->p_d_mat,ppst->p_d_mis);
1045 strncpy (m_msg->sqnam, "nt",sizeof(m_msg->sqnam));
1046 strncpy (m_msg->sqtype, "DNA",sizeof(m_msg->sqtype));
1047 } /* end DNA reset */
1049 else { /* other parameters for protein comparison */
1050 if (pgm_def_arr[pgm_id].ktup > 0) {
1052 if (ppst->param_u.fa.ktup==1) ppst->param_u.fa.optwid = 32;
1053 else ppst->param_u.fa.optwid = 16;
1056 if (!del_set) {ppst->gdelval += pgm_def_arr[pgm_id].g_open_mod;}
1057 if (!shift_set) {ppst->gshift = pgm_def_arr[pgm_id].gshift;}
1058 if (!subs_set) {ppst->gsubs = pgm_def_arr[pgm_id].hshift;}
1063 /* query_parm() this function asks for any additional parameters
1064 that have not been provided. Could be null. */
1066 query_parm (struct mngmsg *m_msp, struct pstruct *ppst)
1070 if (pgm_def_arr[ppst->pgm_id].ktup > 0) {
1071 if (ppst->param_u.fa.ktup < 0)
1072 ppst->param_u.fa.ktup = -ppst->param_u.fa.ktup;
1074 if (ppst->param_u.fa.ktup == 0) {
1075 printf (" ktup? (1 to %d) [%d] ", mktup, ppst->param_u.fa.bktup);
1076 if (fgets (qline, sizeof(qline), stdin) == NULL) exit (0);
1077 else sscanf(qline,"%d",&ppst->param_u.fa.ktup);
1079 if (ppst->param_u.fa.ktup == 0)
1080 ppst->param_u.fa.ktup = ppst->param_u.fa.bktup;
1085 if (m_msp->shuff_max < 10) m_msp->shuff_max = 200;
1088 printf(" number of shuffles [%d]? ",m_msp->shuff_max);
1090 if (fgets (qline, sizeof(qline), stdin) == NULL) exit (0);
1091 else sscanf(qline,"%d",&m_msp->shuff_max);
1094 if (ppst->zs_win == 0) {
1095 printf (" local (window) (w) or uniform (u) shuffle [u]? ");
1096 if (fgets (qline, sizeof(qline), stdin) == NULL) exit (0);
1097 else if (qline[0]=='w' || qline[0]=='W') {
1098 m_msp->shuff_wid = 20;
1099 printf(" local shuffle window size [%d]? ",m_msp->shuff_wid);
1100 if (fgets (qline, sizeof(qline), stdin) == NULL) exit (0);
1101 else sscanf(qline,"%d",&m_msp->shuff_wid);
1107 /* last_init() cannot look at aa0, n0, because it is only run once,
1108 it is not run before each new aa0 search */
1110 last_init (struct mngmsg *m_msg, struct pstruct *ppst
1116 int ix_l, ix_i, i, pgm_id;
1118 double aa0_f[MAXSQ];
1120 pgm_id = ppst->pgm_id;
1122 #if defined(FASTF) || defined(FASTS) || defined(FASTM)
1124 m_msg->shuff_max = 2000;
1126 ppst->shuff_node = m_msg->shuff_max/max_workers;
1128 ppst->shuff_node = m_msg->shuff_max/nnodes;
1132 if (m_msg->aln.llen < 1) {
1133 m_msg->aln.llen = 60;
1137 #if defined(FASTX) || defined(FASTY) || defined(TFAST)
1138 /* set up translation tables: faatran.c */
1139 aainit(ppst->tr_type,ppst->debug_lib);
1143 /* a sanity check */
1145 if (m_msg->revcomp && m_msg->qdnaseq!=SEQT_DNA && m_msg->qdnaseq!=SEQT_RNA) {
1146 fprintf(stderr," cannot reverse complement protein\n");
1151 if (pgm_def_arr[pgm_id].ktup > 0) {
1153 if (ppst->param_u.fa.ktup < 0)
1154 ppst->param_u.fa.ktup = -ppst->param_u.fa.ktup;
1156 if (ppst->param_u.fa.ktup < 1 || ppst->param_u.fa.ktup > mktup) {
1157 fprintf(stderr," warning ktup = %d out of range [1..%d], reset to %d\n",
1158 ppst->param_u.fa.ktup, mktup, ppst->param_u.fa.bktup);
1159 ppst->param_u.fa.ktup = ppst->param_u.fa.bktup;
1163 if (pgm_id == TFA_PID) {
1164 m_msg->revcomp *= 3;
1165 if (m_msg->nframe == 3) m_msg->nframe += m_msg->revcomp;
1167 else if (pgm_id == TFX_PID || pgm_id == TFY_PID) {
1168 if (m_msg->nframe == 1) m_msg->nframe += m_msg->revcomp;
1172 /* for fasta/fastx searches, itt iterates the the query strand */
1173 m_msg->nitt1 = m_msg->qframe-1;
1175 /* for tfasta/tfastxy searches, itt iterates the library frames */
1176 m_msg->nitt1 = m_msg->nframe-1;
1179 if (pgm_def_arr[pgm_id].ktup > 0) {
1180 if (ppst->param_u.fa.ktup>=2 && !wid_set) {
1181 ppst->param_u.fa.optwid=16;
1184 m_msg->thr_fact = 32;
1188 m_msg->thr_fact = 16;
1193 m_msg->thr_fact = 8;
1196 m_msg->thr_fact = 4;
1199 else { m_msg->thr_fact = 4;}
1201 else m_msg->thr_fact = 4;
1204 if (m_msg->shuff_max < 10) m_msg->shuff_max = 200;
1205 if (ppst->zsflag < 10) ppst->zsflag += 10;
1206 if (ppst->zs_win > 0) {
1207 m_msg->shuff_wid = ppst->zs_win;
1211 if (pgm_def_arr[ppst->pgm_id].ktup > 0) {
1212 if (ppst->param_u.fa.iniflag) {
1214 strncpy (m_msg->label, "initn init1", sizeof(m_msg->label));
1216 else if (ppst->param_u.fa.optflag) {
1222 if (!ppst->have_pam2) {
1223 alloc_pam (MAXSQ, MAXSQ, ppst);
1229 if (m_msg->qdnaseq == SEQT_PROT) {
1230 /* code to make 'L'/'I' identical scores */
1233 ppst->pam2[0][ix_l][ix_i] = ppst->pam2[0][ix_i][ix_l] =
1234 ppst->pam2[0][ix_l][ix_l] = ppst->pam2[0][ix_i][ix_i] =
1235 (ppst->pam2[0][ix_l][ix_l]+ppst->pam2[0][ix_i][ix_i]+1)/2;
1236 for (i=1; i<=ppst->nsq; i++) {
1237 ppst->pam2[0][i][ix_i] = ppst->pam2[0][i][ix_l] =
1238 (ppst->pam2[0][i][ix_l]+ppst->pam2[0][i][ix_i]+1)/2;
1239 ppst->pam2[0][ix_i][i] = ppst->pam2[0][ix_l][i] =
1240 (ppst->pam2[0][ix_i][i]+ppst->pam2[0][ix_l][i]+1)/2;
1243 /* code to make 'Q'/'K' identical scores */
1247 ppst->pam2[0][ix_l][ix_i] = ppst->pam2[0][ix_i][ix_l] =
1248 ppst->pam2[0][ix_l][ix_l] = ppst->pam2[0][ix_i][ix_i] =
1249 (ppst->pam2[0][ix_l][ix_l]+ppst->pam2[0][ix_i][ix_i]+1)/2;
1250 for (i=1; i<=ppst->nsq; i++) {
1251 ppst->pam2[0][i][ix_i] = ppst->pam2[0][i][ix_l] =
1252 (ppst->pam2[0][i][ix_l]+ppst->pam2[0][i][ix_i]+1)/2;
1253 ppst->pam2[0][ix_i][i] = ppst->pam2[0][ix_l][i] =
1254 (ppst->pam2[0][ix_i][i]+ppst->pam2[0][ix_l][i]+1)/2;
1264 /* once we have a complete pam matrix, we can calculate Lambda and K
1265 for "average" sequences */
1267 init_karlin_a(ppst, aa0_f, &kar_p);
1268 do_karlin_a(ppst->pam2[0], ppst, aa0_f,
1269 kar_p, &m_msg->Lambda, &m_msg->K, &m_msg->H);
1272 #if defined(FASTF) || defined(FASTS) || defined(FASTM)
1273 if (ppst->ext_sq_set) {
1274 fprintf(stderr," -S not available on [t]fast[fs]\n");
1275 ppst->ext_sq_set = 0;
1277 /* reset sascii to ignore -S, map lc */
1278 init_ascii(0,lascii,0);
1283 /* this function is left over from the older FASTA format scoring
1284 matrices that allowed additional parameters (bktup, bkfact) to be
1285 set in the scoring matrix. It is no longer used. A modern version
1286 would set parameters based on lambda and K.
1290 f_initpam (line, ppst)
1292 struct pstruct *ppst;
1294 if (sscanf (line, " %d %d %d %d %d %d %d", &ppst->param_u.fa.scfact,
1295 &ppst->param_u.fa.bestoff, &ppst->param_u.fa.bestscale,
1296 &ppst->param_u.fa.bkfact, &ppst->param_u.fa.bktup,
1297 &ppst->param_u.fa.bestmax, &ppst->histint) != 7)
1299 printf (" bestcut parameters - bad format\n");
1305 /* alloc_pam2 creates a profile structure */
1307 alloc_pam2p(int len, int nsq) {
1311 if ((pam2p = (int **)calloc(len,sizeof(int *)))==NULL) {
1312 fprintf(stderr," Cannot allocate pam2p: %d\n",len);
1316 if((pam2p[0] = (int *)calloc((nsq+1)*len,sizeof(int)))==NULL) {
1317 fprintf(stderr, "Cannot allocate pam2p[0]: %d\n", (nsq+1)*len);
1322 for (i=1; i<len; i++) {
1323 pam2p[i] = pam2p[0] + (i*(nsq+1));
1329 void free_pam2p(int **pam2p) {
1336 /* sortbest has now become comparison function specific so that we can use
1337 a different comparison for fasts/f
1339 #if !defined(FASTS) && !defined (FASTF) && !defined(FASTM)
1346 last_calc(unsigned char *aa0, unsigned char *aa1, int maxn,
1347 struct beststr **bestp_arr, int nbest,
1348 struct mngmsg *m_msg, struct pstruct *pst,
1349 void **f_str, void *rs_str)
1354 void sortbest (bptr, nbest, irelv)
1355 struct beststr **bptr;
1359 struct beststr *tmp;
1361 for (gap = nbest/2; gap > 0; gap /= 2)
1362 for (i = gap; i < nbest; i++)
1363 for (j = i - gap; j >= 0; j-= gap) {
1364 if (bptr[j]->score[irelv] >= bptr[j + gap]->score[irelv]) break;
1366 bptr[j] = bptr[j + gap];
1367 bptr[j + gap] = tmp;
1371 void show_aux(FILE *fp, struct beststr *bptr) {}
1372 void header_aux(FILE *fp) {}
1375 void sortbest (bptr, nbest, irelv)
1376 struct beststr **bptr;
1380 struct beststr *tmp;
1382 for (gap = nbest/2; gap > 0; gap /= 2)
1383 for (i = gap; i < nbest; i++)
1384 for (j = i - gap; j >= 0; j-= gap) {
1385 if (bptr[j]->escore < bptr[j + gap]->escore) break;
1387 bptr[j] = bptr[j + gap];
1388 bptr[j + gap] = tmp;
1392 #if defined(FASTS) || defined(FASTM)
1395 /* this shuffle is for FASTS */
1396 /* convert ',' -> '\0', shuffle each of the substrings */
1398 qshuffle(unsigned char *aa0, int n0, int nm0) {
1400 unsigned char **aa0start, *aap, tmp;
1403 if ((aa0start=(unsigned char **)calloc(nm0+1,
1404 sizeof(unsigned char *)))==NULL) {
1405 fprintf(stderr,"cannot calloc for qshuffle %d\n",nm0);
1410 for (k=1,i=0; i<n0; i++) {
1411 if (aa0[i]==EOSEQ || aa0[i]==ESS) {
1413 aa0start[k++] = &aa0[i+1];
1417 /* aa0start has the beginning of each substring */
1418 for (k=0; k<nm0; k++) {
1420 ns = strlen((char *)aap);
1421 for (i=ns; i>1; i--) {
1430 for (k=1; k<nm0; k++) {
1431 /* aap = aa0start[k];
1432 while (*aap) fputc(pst.sq[*aap++],stderr);
1435 aa0start[k][-1]=ESS;
1445 void qshuffle(unsigned char *aa0, int n0, int nm0) {
1451 nmoff = (n0 - nm0 - 1)/nm0 + 1;
1453 for (i = nmoff-1 ; i > 0 ; i--) {
1455 /* j = nrand(i); if (i == j) continue;*/ /* shuffle columns */
1456 j = (nmoff -1 ) - i;
1457 if (i <= j) break; /* reverse columns */
1459 /* swap all i'th column residues for all j'th column residues */
1460 for(nmpos = 0, k = 0 ; k < nm0 ; k++, nmpos += nmoff+1 ) {
1461 tmp = aa0[nmpos + i];
1462 aa0[nmpos + i] = aa0[nmpos + j];
1463 aa0[nmpos + j] = tmp;
1471 /* show additional best_str values */
1472 void show_aux(FILE *fp, struct beststr *bptr) {
1473 fprintf(fp," %2d %3d",bptr->segnum,bptr->seglen);
1476 void header_aux(FILE *fp) {
1477 fprintf(fp, " sn sl");
1482 fill_pam(int **pam2p, int n0, int nsq, double **freq2d, double scale) {
1486 /* fprintf(stderr, "scale: %g\n", scale); */
1488 /* now fill in the pam matrix: */
1489 for (i = 0 ; i < n0 ; i++) {
1490 for (j = 1 ; j <=20 ; j++) {
1491 freq = scale * freq2d[i][j-1];
1492 if ( freq < 0.0) freq -= 0.5;
1494 pam2p[i][j] = (int)(freq);
1500 get_lambda(int **pam2p, int n0, int nsq, unsigned char *query) {
1502 double *pr, tot, sum;
1503 int i, ioff, j, min, max;
1505 /* get min and max scores */
1508 if(pam2p[0][1] == -BIGNUM) {
1515 for (i = ioff ; i < n0 ; i++) {
1516 for (j = 1; j <= nsq ; j++) {
1517 if (min > pam2p[i][j])
1519 if (max < pam2p[i][j])
1524 /* fprintf(stderr, "min: %d\tmax:%d\n", min, max); */
1526 if ((pr = (double *) calloc(max - min + 1, sizeof(double))) == NULL) {
1527 fprintf(stderr, "Couldn't allocate memory for score probabilities: %d\n", max - min + 1);
1531 tot = (double) rrtotal * (double) rrtotal * (double) n0;
1532 for (i = ioff ; i < n0 ; i++) {
1533 for (j = 1; j <= nsq ; j++) {
1534 pr[pam2p[i][j] - min] +=
1535 (double) ((double) rrcounts[aascii[query[i]]] * (double) rrcounts[j]) / tot;
1540 for(i = 0 ; i <= max-min ; i++) {
1542 /* fprintf(stderr, "%3d: %g %g\n", i+min, pr[i], sum); */
1544 /* fprintf(stderr, "sum: %g\n", sum); */
1546 for(i = 0 ; i <= max-min ; i++) { pr[i] /= sum; }
1548 if (!karlin(min, max, pr, &lambda, &H)) {
1549 fprintf(stderr, "Karlin lambda estimation failed\n");
1552 /* fprintf(stderr, "lambda: %g\n", lambda); */
1559 *aa0 - query sequence
1561 pamscale - scaling for pam matrix - provided by apam.c, either
1562 0.346574 = ln(2)/2 (P120, BL62) or
1563 0.231049 = ln(2)/3 (P250, BL50)
1567 scale_pssm(int **pssm2p, double **freq2d,
1568 unsigned char *query, int n0,
1569 int **pam2, double pamscale);
1571 static unsigned char ustandard_aa[] ="\0ARNDCQEGHILKMFPSTWYV";
1574 read_pssm(unsigned char *aa0, int n0, int nsq,
1576 FILE *fp, int pgpf_type, struct pstruct *ppst) {
1578 int qi, rj; /* qi - index query; rj - index residues (1-20) */
1580 int first, too_high;
1581 unsigned char *query, ctmp;
1583 double freq, **freq2d, lambda, new_lambda;
1584 double scale, scale_high, scale_low;
1586 pam2p = ppst->pam2p[0];
1588 if (pgpf_type == 0) {
1590 if(1 != fread(&len, sizeof(int), 1, fp)) {
1591 fprintf(stderr, "error reading from checkpoint file: %d\n", len);
1596 fprintf(stderr, "profile length (%d) and query length (%d) don't match!\n",
1601 /* read over query sequence stored in BLAST profile */
1602 if(NULL == (query = (unsigned char *) calloc(len+2, sizeof(char)))) {
1603 fprintf(stderr, "Couldn't allocate memory for query!\n");
1607 if(len != fread(query, sizeof(char), len, fp)) {
1608 fprintf(stderr, "Couldn't read query sequence from profile: %s\n", query);
1612 else if (pgpf_type == 1) {
1614 if ((fgets(dline,sizeof(dline),fp) == NULL) ||
1615 (1 != sscanf(dline, "%d",&len))) {
1616 fprintf(stderr, "error reading from checkpoint file: %d\n", len);
1621 fprintf(stderr, "profile length (%d) and query length (%d) don't match!\n",
1626 /* read over query sequence stored in BLAST profile */
1627 if(NULL == (query = (unsigned char *) calloc(len+2, sizeof(char)))) {
1628 fprintf(stderr, "Couldn't allocate memory for query!\n");
1632 if (fgets((char *)query,len+2,fp)==NULL) {
1633 fprintf(stderr, "Couldn't read query sequence from profile: %s\n", query);
1638 fprintf(stderr," Unrecognized PSSM file type: %d\n",pgpf_type);
1642 /* currently we don't do anything with query; ideally, we should
1643 check to see that it actually matches aa0 ... */
1645 /* quick 2d array alloc: */
1646 if((freq2d = (double **) calloc(n0, sizeof(double *))) == NULL) {
1647 fprintf(stderr, "Couldn't allocate memory for frequencies!\n");
1651 if((freq2d[0] = (double *) calloc(n0 * 20, sizeof(double))) == NULL) {
1652 fprintf(stderr, "Couldn't allocate memory for frequencies!\n");
1656 /* a little pointer arithmetic to fill out 2d array: */
1657 for (i = 1 ; i < n0 ; i++) {
1658 freq2d[i] = freq2d[i-1] + 20;
1661 if (pgpf_type == 0) {
1662 for (qi = 0 ; qi < n0 ; qi++) {
1663 for (rj = 0 ; rj < 20 ; rj++) {
1664 if(1 != fread(&freq, sizeof(double), 1, fp)) {
1665 fprintf(stderr, "Error while reading frequencies!\n");
1668 freq2d[qi][rj] = freq;
1673 for (qi = 0 ; qi < n0 ; qi++) {
1674 if ((fgets(dline,sizeof(dline),fp) ==NULL) ||
1675 (k = sscanf(dline,"%c %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg\n",
1676 &ctmp, &freq2d[qi][0], &freq2d[qi][1], &freq2d[qi][2], &freq2d[qi][3], &freq2d[qi][4],
1677 &freq2d[qi][5], &freq2d[qi][6], &freq2d[qi][7], &freq2d[qi][8], &freq2d[qi][9],
1678 &freq2d[qi][10], &freq2d[qi][11], &freq2d[qi][12], &freq2d[qi][13], &freq2d[qi][14],
1679 &freq2d[qi][15], &freq2d[qi][16], &freq2d[qi][17], &freq2d[qi][18], &freq2d[qi][19]))<1) {
1680 fprintf(stderr, "Error while reading frequencies: %d read!\n",k);
1683 for (rj=0; rj<20; rj++) { freq2d[qi][rj] /= 10.0; } /* reverse scaling */
1687 scale_pssm(ppst->pam2p[0], freq2d, query, n0, ppst->pam2[0],pamscale);
1696 scale_pssm(int **pssm2p, double **freq2d, unsigned char *query, int n0, int **pam2, double pamscale) {
1698 double freq, new_lambda, lambda;
1699 int first, too_high;
1700 double scale, scale_high, scale_low;
1702 for (qi = 0 ; qi < n0 ; qi++) {
1703 for (rj = 0 ; rj < 20 ; rj++) {
1704 if (freq2d[qi][rj] > 1e-20) {
1705 freq = log(freq2d[qi][rj] /((double) (rrcounts[rj+1])/(double) rrtotal));
1706 freq /= pamscale; /* this gets us close to originial pam scores */
1707 freq2d[qi][rj] = freq;
1710 /* when blastpgp decides to leave something out, it puts 0's in all the frequencies
1711 in the binary checkpoint file. In the ascii version, however, it uses BLOSUM62
1712 values. I will put in scoring matrix values as well */
1714 freq2d[qi][rj] = pam2[aascii[query[qi]]][rj+1];
1719 /* now figure out the right scale */
1721 lambda = get_lambda(pam2, 20, 20, ustandard_aa);
1723 /* should be near 1.0 because of our initial scaling by ppst->pamscale */
1724 /* fprintf(stderr, "real_lambda: %g\n", lambda); */
1726 /* get initial high/low scale values: */
1729 fill_pam(pssm2p, n0, 20, freq2d, scale);
1730 new_lambda = get_lambda(pssm2p, n0, 20, query);
1732 if (new_lambda > lambda) {
1735 scale = scale_high = 1.0 + 0.05;
1739 if (!too_high) break;
1740 scale = (scale_high += scale_high - 1.0);
1742 } else if (new_lambda > 0) {
1746 scale = scale_low = 1.0 - 0.05;
1749 if (too_high) break;
1750 scale = (scale_low += scale_low - 1.0);
1753 fprintf(stderr, "new_lambda (%g) <= 0; matrix has positive average score", new_lambda);
1758 /* now do binary search between low and high */
1759 for (i = 0 ; i < 10 ; i++) {
1760 scale = 0.5 * (scale_high + scale_low);
1761 fill_pam(pssm2p, n0, 20, freq2d, scale);
1762 new_lambda = get_lambda(pssm2p, n0, 20, query);
1764 if (new_lambda > lambda) scale_low = scale;
1765 else scale_high = scale;
1768 scale = 0.5 * (scale_high + scale_low);
1769 fill_pam(pssm2p, n0, 20, freq2d, scale);
1772 fprintf(stderr, "final scale: %g\n", scale);
1774 for (qi = 0 ; qi < n0 ; qi++) {
1775 fprintf(stderr, "%4d %c: ", qi+1, query[qi]);
1776 for (rj = 1 ; rj <= 20 ; rj++) {
1777 fprintf(stderr, "%4d", pssm2p[qi][rj]);
1779 fprintf(stderr, "\n");
1784 #if defined(SSEARCH) || (defined(PRSS) && !defined(FASTX))
1786 parse_pssm_asn_fa(FILE *afd, int *n_rows, int *n_cols,
1787 unsigned char **query, double ***freqs,
1788 char *matrix, int *gap_open, int *gap_extend,
1791 /* the ASN.1 pssm includes information about the scoring matrix used
1792 (though not the gap penalty in the current version PSSM:2) The PSSM
1793 scoring matrix and gap penalties should become the default if they
1794 have not been set explicitly.
1798 read_asn_pssm(unsigned char *aa0, int n0, int nsq,
1799 double pamscale, FILE *fp, struct pstruct *ppst) {
1802 int qi, rj; /* qi - index query; rj - index residues (1-20) */
1804 int first, too_high;
1805 unsigned char *query, ctmp;
1807 char matrix[MAX_SSTR];
1809 double freq, **freq2d, lambda, new_lambda;
1810 double scale, scale_high, scale_low;
1811 int gap_open, gap_extend;
1814 pam2p = ppst->pam2p[0];
1816 if (parse_pssm_asn_fa(fp, &n_rows, &n_cols, &query, &freq2d,
1817 matrix, &gap_open, &gap_extend, &psi2_lambda)<=0) {
1823 if (gap_open > 0) {gap_open = -gap_open;}
1824 ppst->gdelval = gap_open;
1826 else if (strncmp(matrix,"BLOSUM62",8)==0) {
1827 ppst->gdelval = -11;
1833 if (gap_extend > 0) {gap_extend = -gap_extend;}
1834 ppst->ggapval = gap_extend;
1836 else if (strncmp(matrix,"BLOSUM62",8)==0) {
1842 if (strncmp(matrix, "BLOSUM62", 8)== 0 && !ppst->pam_set) {
1843 strncpy(ppst->pamfile, "BL62", 120);
1844 standard_pam(ppst->pamfile,ppst,del_set, gap_set);
1845 if (!ppst->have_pam2) {
1846 alloc_pam (MAXSQ, MAXSQ, ppst);
1853 fprintf(stderr, " query length: %d != n_cols: %d\n",n0, n_cols);
1857 scale_pssm(ppst->pam2p[0], freq2d, query, n0, ppst->pam2[0],pamscale);
1868 last_params(unsigned char *aa0, int n0,
1869 struct mngmsg *m_msg,
1870 struct pstruct *ppst
1872 , struct qmng_str *qm_msg
1878 if (n0 < 0) { return;}
1880 ppst->n0 = m_msg->n0;
1882 if (ppst->ext_sq_set) { nsq = ppst->nsqx; }
1883 else {nsq = ppst->nsq;}
1885 /* currently, profiles are only available for SSEARCH, PRSS */
1886 #if defined(SSEARCH) || defined(PRSS)
1888 ppst->pam2p[0] = alloc_pam2p(n0,nsq);
1889 ppst->pam2p[1] = alloc_pam2p(n0,nsq);
1891 if (ppst->pam_pssm) {
1892 if ((ppst->pgpfile_type == 0) && (fp=fopen(ppst->pgpfile,"rb"))) {
1893 read_pssm(aa0, n0, ppst->nsq, ppst->pamscale, fp, 0, ppst);
1894 extend_pssm(aa0, n0, ppst);
1896 else if ((ppst->pgpfile_type == 1) && (fp=fopen(ppst->pgpfile,"r"))) {
1897 read_pssm(aa0, n0, ppst->nsq, ppst->pamscale, fp, 1, ppst);
1898 extend_pssm(aa0, n0, ppst);
1900 #if defined(SSEARCH) || (defined(PRSS) && !defined(FASTX))
1901 else if ((ppst->pgpfile_type == 2) && (fp=fopen(ppst->pgpfile,"rb"))) {
1902 if (read_asn_pssm(aa0, n0, ppst->nsq, ppst->pamscale, fp, ppst)>0) {
1903 extend_pssm(aa0, n0, ppst);
1906 fprintf(stderr," Could not parse PSSM file: %s\n",ppst->pgpfile);
1913 fprintf(stderr," Could not open PSSM file: %s\n",ppst->pgpfile);
1920 #if defined(FASTF) || defined(FASTS) || defined(FASTM)
1922 for (i=0; i<n0; i++)
1923 if (aa0[i]==EOSEQ || aa0[i]==ESS) m_msg->nm0++;
1926 for FASTS, we can do statistics in one of two different ways
1927 if there are <= 10 query fragments, then we calculate probabilistic
1928 scores for every library sequence. If there are > 10 fragments, this
1929 takes much too long and too much memory, so we use the old fashioned
1930 raw score only z-score normalized method initially, and then calculate
1931 the probabilistic scores for the best hits. To scale those scores, we
1932 also need a set of random probabilistic scores. So we do the qshuffle
1935 For FASTF, precalculating probabilities is prohibitively expensive,
1936 so we never do it; FASTF always acts like FASTS with nfrags>10.
1940 #if defined(FASTS) || defined(FASTM)
1941 if (m_msg->nm0 > 10) m_msg->escore_flg = 0;
1942 else m_msg->escore_flg = 1;
1945 if (m_msg->escore_flg && (ppst->zsflag&1)) {
1946 m_msg->last_calc_flg = 0;
1947 m_msg->qshuffle = 0;
1949 else { /* need random query, second set of 2000 scores */
1950 m_msg->last_calc_flg = 1;
1951 m_msg->qshuffle = 1;
1954 m_msg->last_calc_flg = 0;
1955 m_msg->qshuffle = 0;
1956 m_msg->escore_flg = 0;
1960 /* adjust the ktup if appropriate */
1962 if (!ktup_set && pgm_def_arr[ppst->pgm_id].ktup > 0) {
1963 if (m_msg->qdnaseq == SEQT_PROT) {
1964 ppst->param_u.fa.ktup = pgm_def_arr[ppst->pgm_id].ktup;
1965 #if defined(FASTS) || defined(FASTM)
1966 if (n0 > 100) ppst->param_u.fa.ktup = 2;
1968 if (n0 < 40) ppst->param_u.fa.ktup = 1;
1970 else if (m_msg->qdnaseq == SEQT_DNA || m_msg->qdnaseq == SEQT_RNA) {
1971 if (n0 < 20) ppst->param_u.fa.ktup = 1;
1972 #if defined(FASTS) || defined(FASTM)
1973 /* with the current (April 12 2005) dropfs2.c - ktup cannot be > 2 */
1974 else ppst->param_u.fa.ktup = 2;
1976 else if (n0 < 50) ppst->param_u.fa.ktup = 2;
1977 else if (n0 < 100) ppst->param_u.fa.ktup = 3;
1983 qm_msg->nm0 = m_msg->nm0;
1984 qm_msg->escore_flg = m_msg->escore_flg;
1985 qm_msg->qshuffle = m_msg->qshuffle;
1986 qm_msg->pam_pssm = 0;
1990 /* given a good profile in ppst->pam2p[0], make an extended profile
1994 extend_pssm(unsigned char *aa0, int n0, struct pstruct *ppst) {
1997 int sa_x, sa_t, sa_b, sa_z;
1998 int **pam2p0, **pam2p1;
2002 pam2p0 = ppst->pam2p[0];
2003 pam2p1 = ppst->pam2p[1];
2010 /* fill in boundaries, B, Z, *, X */
2011 for (i=0; i<n0; i++) {
2012 pam2p0[i][0] = -BIGNUM;
2013 pam2p0[i][sa_b] = (int)
2014 (((float)pam2p0[i][pascii['N']]+(float)pam2p0[i][pascii['D']]+0.5)/2.0);
2015 pam2p0[i][sa_z] = (int)
2016 (((float)pam2p0[i][pascii['Q']]+(float)pam2p0[i][pascii['E']]+0.5)/2.0);
2017 pam2p0[i][sa_x] = ppst->pam_xm;
2018 pam2p0[i][sa_t] = ppst->pam_xm;
2021 /* copy pam2p0 into pam2p1 */
2022 for (i=0; i<n0; i++) {
2023 pam2p1[i][0] = -BIGNUM;
2024 for (j=1; j<=ppst->nsq; j++) {
2025 pam2p1[i][j] = pam2p0[i][j];
2029 /* then fill in extended characters, if necessary */
2030 if (ppst->ext_sq_set) {
2031 for (i=0; i<n0; i++) {
2032 for (j=1; j<=ppst->nsq; j++) {
2033 pam2p0[i][nsq+j] = pam2p0[i][j];
2034 pam2p1[i][nsq+j] = ppst->pam_xm;