/* initfa.c */ /* $Name: fa_34_26_5 $ - $Id: initfa.c,v 1.148 2007/04/26 18:40:58 wrp Exp $ */ /* copyright (c) 1996, 1997, 1998 William R. Pearson and the U. of Virginia */ /* init??.c files provide function specific initializations */ /* h_init() - called from comp_lib.c, comp_thr.c to initialize pstruct ppst which includes the alphabet, and pam matrix alloc_pam() - allocate pam matrix space init_pam2() - convert from 1D to 2D pam init_pamx() - convert from 1D to 2D pam f_initenv() - set up mngmsg and pstruct defaults f_getopt() - read fasta specific command line options f_getarg() - read ktup resetp() - reset the parameters, scoring matrix for DNA-DNA/DNA-prot query_parm() - ask for ktup last_init() - some things must be done last f_initpam() - set some parameters based on the pam matrix */ #include #include #include #include #include #ifdef UNIX #include #include #endif #include "defs.h" #include "structs.h" #include "param.h" #ifndef PCOMPLIB #include "mw.h" #else #include "p_mw.h" #endif #define XTERNAL #include "upam.h" #include "uascii.h" #undef XTERNAL #define MAXWINDOW 32 int initpam(char *, struct pstruct *); void init_pam2 (struct pstruct *ppst); void extend_pssm(unsigned char *aa0, int n0, struct pstruct *ppst); void build_xascii(int *qascii, char *save_str); void ann_ascii(int *qascii, char *ann_arr); void re_ascii(int *qascii, int *pascii); extern int nrand(int); /* at some point, all the defaults should be driven from this table */ /* #pgm q_seq l_seq p_seq matrix g_open g_ext fr_shft e_cut ktup # -n/-p -s -e -f -h/-j -E argv[3] fasta prot(0) prot(0) prot(0) bl50 -10 -2 - 10.0 2 fasta dna(1) dna(1) dna(1) +5/-4 -14 -4 - 2.0 6 ssearch prot(0) prot(0) prot(0) bl50 -10 -2 - 10.0 - ssearch dna(1) dna(1) dna(1) +5/-4 -14 -4 - 2.0 - fastx dna(1) prot(0) prot(0) BL50 -12 -2 -20 5.0 2 fasty dna(1) prot(0) prot(0) BL50 -12 -2 -20/-24 5.0 2 tfastx dna(1) prot(0) prot(0) BL50 -14 -2 -20 5.0 2 tfasty dna(1) prot(0) prot(0) BL50 -14 -2 -20/-24 5.0 2 fasts prot(0) prot(0) prot(0) MD20-MS - - - 5.0 - fasts dna(1) dna(1) dna(1) +2/-4 - - - 5.0 1 tfasts prot(0) dna(1) prot(0) MD10-MS - - - 2.0 1 fastf prot(0) prot(0) prot(0) MD20 - - - 2.0 1 tfastf prot(0) dna(1) prot(0) MD10 - - - 1.0 1 fastm prot(0) prot(0) prot(0) MD20 - - - 5.0 1 fastm dna(1) dna(1) dna(1) +2/-4 - - - 2.0 1 tfastm prot(0) dna(1) prot(0) MD10 - - - 2.0 1 */ struct pgm_def_str { int pgm_id; char *prog_func; char *pgm_abbr; char *iprompt0; char *ref_str; int PgmDID; char *smstr; int g_open_mod; int gshift; int hshift; int e_cut; int ktup; }; char *ref_str_a[]={ "\nPlease cite:\n W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448\n", "\nPlease cite:\n T. F. Smith and M. S. Waterman, (1981) J. Mol. Biol. 147:195-197; \n W.R. Pearson (1991) Genomics 11:635-650\n", "\nPlease cite:\n Pearson et al, Genomics (1997) 46:24-36\n", "\nPlease cite:\n Mackey et al. Mol. Cell. Proteomics (2002) 1:139-147\n", "\nPlease cite:\n W.R. Pearson (1996) Meth. Enzymol. 266:227-258\n" }; #define FA_PID 1 #define SS_PID 2 #define FX_PID 3 #define FY_PID 4 #define FS_PID 5 #define FF_PID 6 #define FM_PID 7 #define RSS_PID 8 #define RFX_PID 9 #define SSS_PID 10 /* old (slow) non-PG Smith-Waterman */ #define TFA_PID FA_PID+10 #define TFX_PID FX_PID+10 #define TFY_PID FY_PID+10 #define TFS_PID FS_PID+10 #define TFF_PID FF_PID+10 #define TFM_PID FM_PID+10 struct pgm_def_str pgm_def_arr[20] = { {0, "", "", "", NULL, 400, "", 0, 0, 0, 1.0, 0 }, /* 0 */ {FA_PID, "FASTA", "fa", "FASTA searches a protein or DNA sequence data bank", NULL, 401, "BL50", 0, 0, 0, 10.0, 2}, /* 1 - FASTA */ {SS_PID, "SSEARCH","gsw","SSEARCH searches a sequence data bank", NULL, 404, "BL50", 0, 0, 0, 10.0, 0}, /* 2 - SSEARCH */ {FX_PID, "FASTX","fx", "FASTX compares a DNA sequence to a protein sequence data bank", NULL, 405, "BL50", -2, -20, 0, 5.0, 2}, /* 3 - FASTX */ {FY_PID, "FASTY", "fy", "FASTY compares a DNA sequence to a protein sequence data bank", NULL, 405, "BL50", -2, -20, -24, 5.0, 2}, /* 4 - FASTY */ {FS_PID, "FASTS", "fs", "FASTS compares linked peptides to a protein data bank", NULL, 400, "MD20-MS", 0, 0, 0, 5.0, 1}, /* 5 - FASTS */ {FF_PID, "FASTF", "ff", "FASTF compares mixed peptides to a protein databank", NULL, 400, "MD20", 0, 0, 0, 2.0, 1 }, /* 6 - FASTF */ {FM_PID, "FASTM", "fm", "FASTM compares ordered peptides to a protein data bank", NULL, 400, "MD20", 0, 0, 0, 5.0, 1 }, /* 7 - FASTM */ {RSS_PID, "PRSS", "rss", "PRSS evaluates statistical signficance using Smith-Waterman", NULL, 401, "BL50", 0, 0, 0, 1000.0, 0 }, /* 8 - PRSS */ {RFX_PID,"PRFX", "rfx", "PRFX evaluates statistical signficance using FASTX", NULL, 401, "BL50", -2, -20, -24, 1000.0, 2 }, /* 9 - PRFX */ {SSS_PID, "OSEARCH","ssw","OSEARCH searches a sequence data bank", NULL, 404, "BL50", 0, 0, 0, 10.0, 0}, /* 2 - OSEARCH */ {TFA_PID, "TFASTA", "tfa", "TFASTA compares a protein to a translated DNA data bank", NULL, 402, "BL50", -2, 0, 0, 5.0, 2 }, {0, "", "", "", NULL, 400, "", 0, 0, 0, 1.0, 0 }, /* 0 */ {TFX_PID, "TFASTX", "tfx", "TFASTX compares a protein to a translated DNA data bank", NULL, 406, "BL50", -2, -20, 0, 2.0, 2}, {TFY_PID, "TFASTY", "tfy", "TFASTY compares a protein to a translated DNA data bank", NULL, 406, "BL50", -2, -20, -24, 2.0, 2}, {TFS_PID, "TFASTS", "tfs", "TFASTS compares linked peptides to a translated DNA data bank", NULL, 400, "MD10-MS", 0, 0, 0, 2.0, 2 }, {TFF_PID, "TFASTF", "tff", "TFASTF compares mixed peptides to a protein databank", NULL, 400, "MD10", 0, 0, 0, 1.0, 1 }, {TFM_PID, "TFASTM", "tfm", "TFASTM compares ordered peptides to a translated DNA databank", NULL, 400, "MD10", 0, 0, 0, 1.0, 1 } }; struct msg_def_str { int pgm_id; int q_seqt; int l_seqt; int p_seqt; int sw_flag; int stages; int qframe; int nframe; int nrelv, srelv, arelv; char *f_id0, *f_id1, *label; }; /* pgm_id q_seqt l_seqt p_seqt sw_f st qf nf nrv srv arv s_ix */ struct msg_def_str msg_def_arr[20] = { {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "", "", ""}, /* ID=0 */ {FA_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 3, 1, 3, "fa","sw", "opt"}, {SS_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 1, 1, 1, "sw","sw", "s-w"}, {FX_PID, SEQT_DNA, SEQT_PROT, SEQT_PROT, 1, 1, 2, -1, 3, 1, 3, "fx","sx", "opt"}, {FY_PID, SEQT_DNA, SEQT_PROT, SEQT_PROT, 1, 1, 2, -1, 3, 1, 3, "fy","sy", "opt"}, {FS_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 3, 2, 3, "fs","fs", "initn init1"}, {FF_PID, SEQT_PROT,SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 3, 2, 3, "ff","ff", "initn init1"}, {FM_PID, SEQT_PROT,SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 3, 2, 3, "fm","fm","initn init1"}, {RSS_PID, SEQT_UNK,SEQT_PROT, SEQT_PROT, 0, 1, 1, -1, 1, 1, 1, "rss","sw","s-w"}, {RFX_PID, SEQT_DNA,SEQT_PROT, SEQT_PROT, 0, 1, 2, -1, 3, 1, 3, "rfx","sx","opt"}, {SSS_PID, SEQT_UNK,SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 1, 1, 1, "sw","sw", "s-w"}, {TFA_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 0, 1, 1, 6, 3, 1, 3, "tfa","fa","initn init1"}, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "", "", ""}, /* ID=12 */ {TFX_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 2, 3, 2, 3, "tfx","sx","initn opt"}, {TFY_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 2, 3, 2, 3, "tfy","sy","initn opt"}, {TFS_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 6, 3, 2, 3, "tfs","fs","initn init1"}, {TFF_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 6, 3, 2, 3, "tff","ff","initn init1"}, {TFM_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 6, 3, 2, 3, "tfm","fm","initn init1"} }; int get_pgm_id() { int rval=0; #ifdef FASTA #ifndef TFAST pgm_def_arr[FA_PID].ref_str = ref_str_a[0]; rval=FA_PID; #else pgm_def_arr[TFA_PID].ref_str = ref_str_a[0]; rval=TFA_PID; #endif #endif #ifdef FASTX #ifndef TFAST #ifndef PRSS pgm_def_arr[FX_PID].ref_str = ref_str_a[2]; rval=FX_PID; #else pgm_def_arr[RFX_PID].ref_str = ref_str_a[2]; rval=RFX_PID; #endif #else pgm_def_arr[TFX_PID].ref_str = ref_str_a[2]; rval=TFX_PID; #endif #endif #ifdef FASTY #ifndef TFAST pgm_def_arr[FY_PID].ref_str = ref_str_a[2]; rval=FY_PID; #else pgm_def_arr[TFY_PID].ref_str = ref_str_a[2]; rval=TFY_PID; #endif #endif #ifdef FASTS #ifndef TFAST pgm_def_arr[FS_PID].ref_str = ref_str_a[3]; rval=FS_PID; #else pgm_def_arr[TFS_PID].ref_str = ref_str_a[3]; rval=TFS_PID; #endif #endif #ifdef FASTF #ifndef TFAST pgm_def_arr[FF_PID].ref_str = ref_str_a[3]; rval=FF_PID; #else pgm_def_arr[TFF_PID].ref_str = ref_str_a[3]; rval=TFF_PID; #endif #endif #ifdef FASTM #ifndef TFAST pgm_def_arr[FM_PID].ref_str = ref_str_a[3]; rval=FM_PID; #else pgm_def_arr[TFM_PID].ref_str = ref_str_a[3]; rval=TFM_PID; #endif #endif #ifdef SSEARCH pgm_def_arr[SS_PID].ref_str = ref_str_a[1]; rval=SS_PID; #endif #ifdef OSEARCH pgm_def_arr[SSS_PID].ref_str = ref_str_a[1]; rval=SSS_PID; #endif #ifdef PRSS #ifndef FASTX pgm_def_arr[RSS_PID].ref_str = ref_str_a[4]; rval=RSS_PID; #endif #endif return rval; } char *iprompt1=" test sequence file name: "; char *iprompt2=" database file name: "; char *verstr="version 34.26.5 April 26, 2007"; char *s_optstr = "13Ac:f:g:h:j:k:nopP:r:s:St:Ux:y:"; static int mktup=2; static int ktup_set = 0; static int gap_set=0; static int del_set=0; static int mshuff_set = 0; static int prot2dna = 0; extern int max_workers; extern void s_abort(char *, char *); extern void init_ascii(int ext_sq, int *sascii, int dnaseq); extern int standard_pam(char *smstr, struct pstruct *ppst, int del_set, int gap_set); extern void mk_n_pam(int *arr,int siz, int mat, int mis); extern int karlin(int , int, double *, double *, double *); extern void init_karlin_a(struct pstruct *, double *, double **); extern int do_karlin_a(int **, struct pstruct *, double *, double *, double *, double *, double *); #if defined(TFAST) || defined(FASTX) || defined(FASTY) extern void aainit(int tr_type, int debug); #endif char *iprompt0, *prog_func, *refstr; /* Sets defaults assuming a protein sequence */ void h_init (struct pstruct *ppst, struct mngmsg *m_msp, char *pgm_abbr) { struct pgm_def_str pgm_def; int i, pgm_id; ppst->pgm_id = pgm_id = get_pgm_id(); pgm_def = pgm_def_arr[pgm_id]; /* check that pgm_def_arr[] is valid */ if (pgm_def.pgm_id != pgm_id) { fprintf(stderr, "**pgm_def integrity failure: def.pgm_id %d != pgm_id %d**\n", pgm_def.pgm_id, pgm_id); exit(1); } /* check that msg_def_arr[] is valid */ if (msg_def_arr[pgm_id].pgm_id != pgm_id) { fprintf(stderr, "**msg_def integrity failure: def.pgm_id %d != pgm_id %d**\n", msg_def_arr[pgm_id].pgm_id, pgm_id); exit(1); } strncpy(pgm_abbr,pgm_def.pgm_abbr,MAX_SSTR); iprompt0 = pgm_def.iprompt0; refstr = pgm_def.ref_str; prog_func = pgm_def.prog_func; /* MAXTOT = MAXTST + MAXLIB for everything except TFAST, where it is MAXTST + MAXTRN */ m_msp->max_tot = MAXTOT; /* set up DNA query sequence if required*/ if (msg_def_arr[pgm_id].q_seqt == SEQT_DNA) { memcpy(qascii,nascii,sizeof(qascii)); m_msp->qdnaseq = SEQT_DNA; } else { /* when SEQT_UNK, start with protein */ memcpy(qascii,aascii,sizeof(qascii)); m_msp->qdnaseq = msg_def_arr[pgm_id].q_seqt; } #if defined(FASTF) || defined(FASTS) || defined(FASTM) qascii[','] = ESS; /* also initialize aascii, nascii for databases */ qascii['*'] = NA; #endif /* initialize a pam matrix */ strncpy(ppst->pamfile,pgm_def.smstr,MAX_FN); standard_pam(ppst->pamfile,ppst,del_set,gap_set); ppst->have_pam2 = 0; /* this is always protein by default */ ppst->nsq = naa; ppst->nsqx = naax; for (i=0; i<=ppst->nsqx; i++) { ppst->sq[i] = aa[i]; ppst->hsq[i] = haa[i]; ppst->sqx[i]=aax[i]; /* sq = aa */ ppst->hsqx[i]=haax[i]; /* hsq = haa */ } ppst->sq[ppst->nsqx+1] = ppst->sqx[ppst->nsqx+1] = '\0'; /* set up the c_nt[] mapping */ #if defined(FASTS) || defined(FASTF) || defined(FASTM) ppst->c_nt[ESS] = ESS; #endif ppst->c_nt[0]=0; for (i=1; i<=nnt; i++) { ppst->c_nt[i]=gc_nt[i]; ppst->c_nt[i+nnt]=gc_nt[i]+nnt; } } /* * alloc_pam(): allocates memory for the 2D pam matrix as well * as for the integer array used to transmit the pam matrix */ void alloc_pam (int d1, int d2, struct pstruct *ppst) { int i, *d2p; char err_str[128]; if ((ppst->pam2[0] = (int **) malloc (d1 * sizeof (int *))) == NULL) { sprintf(err_str,"Cannot allocate 2D pam matrix: %d",d1); s_abort (err_str,""); } if ((ppst->pam2[1] = (int **) malloc (d1 * sizeof (int *))) == NULL) { sprintf(err_str,"Cannot allocate 2D pam matrix: %d",d1); s_abort (err_str,""); } if ((d2p = pam12 = (int *) calloc (d1 * d2, sizeof (int))) == NULL) { sprintf(err_str,"Cannot allocate 2D pam matrix: %d",d1); s_abort (err_str,""); } for (i = 0; i < d1; i++, d2p += d2) ppst->pam2[0][i] = d2p; if ((d2p=pam12x= (int *) malloc (d1 * d2 * sizeof (int))) == NULL) { sprintf(err_str,"Cannot allocate 2d pam matrix: %d",d2); s_abort (err_str,""); } for (i = 0; i < d1; i++, d2p += d2) ppst->pam2[1][i] = d2p; ppst->have_pam2 = 1; } /* * init_pam2(struct pstruct pst): Converts 1-D pam matrix to 2-D */ void init_pam2 (struct pstruct *ppst) { int i, j, k, nsq; nsq = ppst->nsq; ppst->pam2[0][0][0] = -BIGNUM; ppst->pam_h = -1; ppst->pam_l = 1; k = 0; for (i = 1; i <= nsq; i++) { ppst->pam2[0][0][i] = ppst->pam2[0][i][0] = -BIGNUM; for (j = 1; j <= i; j++) { ppst->pam2[0][j][i] = ppst->pam2[0][i][j] = pam[k++] - ppst->pamoff; if (ppst->pam_l > ppst->pam2[0][i][j]) ppst->pam_l =ppst->pam2[0][i][j]; if (ppst->pam_h < ppst->pam2[0][i][j]) ppst->pam_h =ppst->pam2[0][i][j]; } } } void init_pamx (struct pstruct *ppst) { int i, j, k, nsq, pam_xx, pam_xm; int sa_x, sa_t, tmp; nsq = ppst->nsq; ppst->nt_align = (ppst->dnaseq== SEQT_DNA || ppst->dnaseq == SEQT_RNA); if (ppst->nt_align) { sa_x = pascii['N']; sa_t = sa_x; } else { sa_x = pascii['X']; sa_t = pascii['*']; } if (ppst->dnaseq == SEQT_RNA) { tmp = ppst->pam2[0][nascii['G']][nascii['G']] - 1; ppst->pam2[0][nascii['A']][nascii['G']] = ppst->pam2[0][nascii['C']][nascii['T']] = ppst->pam2[0][nascii['C']][nascii['U']] = tmp; } if (ppst->pam_x_set) { for (i=1; i<=nsq; i++) { ppst->pam2[0][sa_x][i] = ppst->pam2[0][i][sa_x]=ppst->pam_xm; ppst->pam2[0][sa_t][i] = ppst->pam2[0][i][sa_t]=ppst->pam_xm; } ppst->pam2[0][sa_x][sa_x]=ppst->pam_xx; ppst->pam2[0][sa_t][sa_t]=ppst->pam_xx; } else { ppst->pam_xx = ppst->pam2[0][sa_x][sa_x]; ppst->pam_xm = ppst->pam2[0][1][sa_x]; } pam_xx = ppst->pam_xx; pam_xm = ppst->pam_xm; if (ppst->ext_sq_set) { /* using extended alphabet */ /* fill in pam2[1] matrix */ ppst->pam2[1][0][0] = -BIGNUM; /* fill in additional parts of the matrix */ for (i = 1; i <= nsq; i++) { /* -BIGNUM to all matches vs 0 */ ppst->pam2[0][0][i+nsq] = ppst->pam2[0][i+nsq][0] = ppst->pam2[1][0][i+nsq] = ppst->pam2[1][i+nsq][0] = ppst->pam2[1][0][i] = ppst->pam2[1][i][0] = -BIGNUM; for (j = 1; j <= nsq; j++) { /* replicate pam2[0] to i+nsq, j+nsq */ ppst->pam2[0][i+nsq][j] = ppst->pam2[0][i][j+nsq] = ppst->pam2[0][i+nsq][j+nsq] = ppst->pam2[1][i][j] = ppst->pam2[0][i][j]; /* set the high portion of pam2[1] to the corresponding value of pam2[1][sa_x][j] */ ppst->pam2[1][i+nsq][j] = ppst->pam2[1][i][j+nsq]= ppst->pam2[1][i+nsq][j+nsq]=ppst->pam2[0][sa_x][j]; } } } } /* function specific initializations */ void f_initenv (struct mngmsg *m_msp, struct pstruct *ppst, unsigned char **aa0) { struct msg_def_str m_msg_def; int pgm_id; pgm_id = ppst->pgm_id; m_msg_def = msg_def_arr[pgm_id]; m_msp->last_calc_flg=0; strncpy(m_msp->f_id0,m_msg_def.f_id0,sizeof(m_msp->f_id0)); strncpy(m_msp->f_id1,m_msg_def.f_id1,sizeof(m_msp->f_id1)); strncpy (m_msp->label, m_msg_def.label, sizeof(m_msp->label)); #ifndef SSEARCH strncpy (m_msp->alab[0],"initn",20); strncpy (m_msp->alab[1],"init1",20); strncpy (m_msp->alab[2],"opt",20); #else strncpy (m_msp->alab[0],"s-w opt",20); #endif ppst->gdelval += pgm_def_arr[pgm_id].g_open_mod; ppst->sw_flag = m_msg_def.sw_flag; m_msp->e_cut=pgm_def_arr[pgm_id].e_cut; ppst->score_ix = 0; ppst->histint = 2; m_msp->qframe = m_msg_def.qframe; ppst->sw_flag = m_msg_def.sw_flag; m_msp->nframe = m_msg_def.nframe; m_msp->nrelv = m_msg_def.nrelv; m_msp->srelv = m_msg_def.srelv; m_msp->arelv = m_msg_def.arelv; m_msp->stages = m_msg_def.stages; #if defined(PRSS) m_msp->shuff_wid = 0; m_msp->shuff_max = 200; #endif /* see param.h for the definition of all these */ m_msp->qshuffle = 0; m_msp->nm0 = 1; m_msp->escore_flg = 0; /* pam information */ ppst->pam_pssm = 0; #if defined(FASTS) || defined(FASTF) || defined(FASTM) ppst->pam_xx = ppst->pam_xm = 0; #else ppst->pam_xx = 1; /* set >0 to use pam['X']['X'] value */ ppst->pam_xm = -1; /* set >0 to use pam['X']['A-Z'] value */ #endif ppst->pam_x_set = 0; ppst->pam_set = 0; ppst->pam_pssm = 0; ppst->p_d_set = 0; ppst->pamoff = 0; ppst->ext_sq_set = 0; if (pgm_def_arr[ppst->pgm_id].ktup > 0) { mktup = 2; ppst->param_u.fa.bestscale = 300; ppst->param_u.fa.bestoff = 36; ppst->param_u.fa.bkfact = 6; ppst->param_u.fa.scfact = 3; ppst->param_u.fa.bktup = 2; ppst->param_u.fa.ktup = 0; ppst->param_u.fa.bestmax = 50; ppst->param_u.fa.pamfact = 1; ppst->param_u.fa.altflag = 0; ppst->param_u.fa.optflag = 1; ppst->param_u.fa.iniflag = 0; ppst->param_u.fa.optcut = 0; ppst->param_u.fa.optcut_set = 0; ppst->param_u.fa.cgap = 0; ppst->param_u.fa.optwid = MAXWINDOW; } } /* switches for fasta only */ static int shift_set=0; static int subs_set=0; static int sw_flag_set=0; static int nframe_set=0; static int wid_set=0; void f_getopt (char copt, char *optarg, struct mngmsg *m_msg, struct pstruct *ppst) { int pgm_id; char *bp; pgm_id = ppst->pgm_id; switch (copt) { case '1': if (pgm_def_arr[pgm_id].ktup > 0) { ppst->param_u.fa.iniflag=1; } break; case '3': nframe_set = 1; if (pgm_id == TFA_PID) { m_msg->nframe = 3; break; } else { m_msg->nframe = 1; /* for TFASTXY */ m_msg->qframe = 1; /* for FASTA, FASTX */ } break; case 'A': ppst->sw_flag= 1; sw_flag_set = 1; break; case 'c': if (pgm_def_arr[pgm_id].ktup > 0) { sscanf (optarg, "%d", &ppst->param_u.fa.optcut); ppst->param_u.fa.optcut_set = 1; } break; case 'f': sscanf (optarg, "%d", &ppst->gdelval); if (ppst->gdelval > 0) ppst->gdelval = -ppst->gdelval; del_set = 1; break; case 'g': sscanf (optarg, "%d", &ppst->ggapval); if (ppst->ggapval > 0) ppst->ggapval = -ppst->ggapval; gap_set = 1; break; case 'h': sscanf (optarg, "%d", &ppst->gshift); if (ppst->gshift > 0) ppst->gshift = -ppst->gshift; shift_set = 1; break; case 'j': sscanf (optarg, "%d", &ppst->gsubs); subs_set = 1; break; case 'k': sscanf (optarg, "%d", &m_msg->shuff_max); mshuff_set = 1; break; case 'n': m_msg->qdnaseq = SEQT_DNA; re_ascii(qascii,nascii); strncpy(m_msg->sqnam,"nt",4); prot2dna = 1; break; case 'o': if (pgm_def_arr[pgm_id].ktup > 0) { ppst->param_u.fa.optflag = 0; msg_def_arr[pgm_id].nrelv = m_msg->nrelv = 2; } break; case 'p': m_msg->qdnaseq = SEQT_PROT; ppst->dnaseq = SEQT_PROT; strncpy(m_msg->sqnam,"aa",4); break; case 'P': strncpy(ppst->pgpfile,optarg,MAX_FN); if ((bp=strchr(ppst->pgpfile,' '))!=NULL) { *bp='\0'; ppst->pgpfile_type = atoi(bp+1); } else ppst->pgpfile_type = 0; ppst->pgpfile[MAX_FN-1]='\0'; ppst->pam_pssm = 1; break; case 'r': sscanf(optarg,"%d/%d",&ppst->p_d_mat,&ppst->p_d_mis); if (ppst->p_d_mat > 0 && ppst->p_d_mis < 0) { ppst->p_d_set = 1; strncpy(ppst->pamfile,optarg,40); } break; case 's': strncpy (ppst->pamfile, optarg, 120); ppst->pamfile[120-1]='\0'; if (!standard_pam(ppst->pamfile,ppst,del_set, gap_set)) { initpam (ppst->pamfile, ppst); } ppst->pam_set=1; break; case 'S': /* turn on extended alphabet for seg */ ppst->ext_sq_set = 1; break; case 't': if (tolower(optarg[0])=='t') { m_msg->term_code = aascii['*']; optarg++; } if (*optarg) {sscanf (optarg, "%d", &ppst->tr_type);} break; case 'U': m_msg->qdnaseq = SEQT_RNA; memcpy(qascii,nascii,sizeof(qascii)); strncpy(m_msg->sqnam,"nt",4); nt[nascii['T']]='U'; prot2dna=1; break; case 'x': if (strchr(optarg,',')!=NULL) { sscanf (optarg,"%d,%d",&ppst->pam_xx, &ppst->pam_xm); } else { sscanf (optarg,"%d",&ppst->pam_xx); ppst->pam_xm = ppst->pam_xx; } ppst->pam_x_set=1; break; case 'y': if (pgm_def_arr[pgm_id].ktup > 0) { sscanf (optarg, "%d", &ppst->param_u.fa.optwid); wid_set = 1; } break; } } void f_lastenv (struct mngmsg *m_msg, struct pstruct *ppst) { char save_str[MAX_SSTR]; #if !defined(FASTM) && !defined(FASTS) && !defined(FASTF) strncpy(save_str,"*",sizeof(save_str)); #else strncpy(save_str,",",sizeof(save_str)); #endif if (m_msg->qdnaseq == SEQT_UNK) { build_xascii(qascii,save_str); if (m_msg->ann_flg) ann_ascii(qascii,m_msg->ann_arr); } /* this check allows lc DNA sequence queries with FASTX */ #if defined(FASTA) && !defined(FASTS) && !defined(FASTM) && !defined(FASTF) else init_ascii(ppst->ext_sq_set,qascii,m_msg->qdnaseq); #endif } void f_getarg (int argc, char **argv, int optind, struct mngmsg *m_msg, struct pstruct *ppst) { if (pgm_def_arr[ppst->pgm_id].ktup > 0) { if (argc - optind >= 4) { sscanf (argv[optind + 3], "%d", &ppst->param_u.fa.ktup); ktup_set = 1; } else ppst->param_u.fa.ktup = -ppst->param_u.fa.bktup; } if (ppst->pgm_id == RSS_PID && argc - optind > 3) { sscanf (argv[optind + 3], "%d", &m_msg->shuff_max); } if (ppst->pgm_id == RFX_PID && argc - optind > 4) { sscanf (argv[optind + 4], "%d", &m_msg->shuff_max); } } /* fills in the query ascii mapping from the parameter ascii mapping. */ void re_ascii(int *qascii, int *pascii) { int i; for (i=0; i < 128; i++) { if (qascii[i] > '@' || qascii[i] < ESS) { qascii[i] = pascii[i]; } } } /* recode has become function specific to accommodate FASTS/M */ /* modified 28-Dec-2004 to ensure that all mapped characters are valid */ int recode(unsigned char *seq, int n, int *qascii, int nsqx) { int i,j; char save_c; #if defined(FASTS) || defined(FASTM) qascii[',']=ESS; #endif for (i=0; i < n; i++) { save_c = seq[i]; if (seq[i] > '@') seq[i] = qascii[seq[i]]; if (seq[i] > nsqx && seq[i]!=ESS) { fprintf(stderr, "*** Warning - unrecognized residue at %d:%c - %2d\n", i,save_c,save_c); seq[i] = qascii['X']; } } seq[i]=EOSEQ; return i; } /* here we have the query sequence, all the command line options, but we need to set various parameter options based on the type of the query sequence (m_msg->qdnaseq = 0:protein/1:DNA) and the function (FASTA/FASTX/TFASTA) */ /* this resetp is for conventional a FASTA/TFASTXYZ search */ void resetp (struct mngmsg *m_msg, struct pstruct *ppst) { int i, pgm_id; pgm_id = ppst->pgm_id; #if defined(TFAST) if (m_msg->qdnaseq == SEQT_DNA || m_msg->qdnaseq == SEQT_RNA) { fprintf(stderr," %s compares a protein to a translated\n\ DNA sequence library. Do not use a DNA query/scoring matrix.\n",prog_func); exit(1); } #else #if (defined(FASTX) || defined(FASTY)) if (!(m_msg->qdnaseq == SEQT_DNA || m_msg->qdnaseq == SEQT_RNA)) { fprintf(stderr," FASTX/Y compares a DNA sequence to a protein database\n"); fprintf(stderr," Use a DNA query\n"); exit(1); } #endif #endif /* this code changes parameters for programs (FA_PID, SS_PID, FS_PID, RSS_PID) that can examine either protein (initial state) or DNA Modified May, 2006 to reset e_cut for DNA comparisons. */ if (msg_def_arr[pgm_id].q_seqt == SEQT_UNK) { if (m_msg->qdnaseq == SEQT_DNA || m_msg->qdnaseq == SEQT_RNA) { msg_def_arr[pgm_id].q_seqt = m_msg->qdnaseq; msg_def_arr[pgm_id].p_seqt = SEQT_DNA; msg_def_arr[pgm_id].l_seqt = SEQT_DNA; if (m_msg->qdnaseq == SEQT_DNA) msg_def_arr[pgm_id].qframe = 2; pgm_def_arr[pgm_id].e_cut /= 5.0; } else { msg_def_arr[pgm_id].q_seqt = SEQT_PROT; } } ppst->dnaseq = msg_def_arr[pgm_id].p_seqt; if (!sw_flag_set) ppst->sw_flag = msg_def_arr[pgm_id].sw_flag; if (!m_msg->e_cut_set) m_msg->e_cut=pgm_def_arr[pgm_id].e_cut; if (ppst->dnaseq == SEQT_DNA && m_msg->qdnaseq==SEQT_RNA) { ppst->dnaseq = SEQT_RNA; ppst->nt_align = 1; } if (ppst->dnaseq==SEQT_DNA) pascii = &nascii[0]; else if (ppst->dnaseq==SEQT_RNA) { pascii = &nascii[0]; ppst->sq[nascii['T']] = 'U'; } else pascii = &aascii[0]; m_msg->ldnaseq = msg_def_arr[pgm_id].l_seqt; if (m_msg->ldnaseq & SEQT_DNA) { memcpy(lascii,nascii,sizeof(lascii)); #ifndef TFAST #ifdef DNALIB_LC init_ascii(ppst->ext_sq_set,lascii,m_msg->ldnaseq); #endif #else /* no init_ascii() because we translate lower case library sequences */ #endif } else { memcpy(lascii,aascii,sizeof(lascii)); /* initialize lib mapping */ #if defined(FASTF) || defined(FASTS) || defined(FASTM) lascii['*'] = NA; #endif init_ascii(ppst->ext_sq_set,lascii,m_msg->ldnaseq); } if (!nframe_set) { m_msg->qframe = msg_def_arr[pgm_id].qframe; m_msg->nframe = msg_def_arr[pgm_id].nframe; } /* the possibilities: -i -3 qframe revcomp FA_D/FX - - 2 0 FA_D/FX + - 2 1 FA_D/FX - + 1 0 FA_D/FX + + 2 1 */ if (m_msg->qdnaseq == SEQT_DNA) { m_msg->nframe = 1; if (m_msg->qframe == 1 && m_msg->revcomp==1) { m_msg->qframe = m_msg->revcomp+1; } } else if (m_msg->qdnaseq == SEQT_RNA) { m_msg->qframe = m_msg->revcomp+1; m_msg->nframe = 1; } /* change settings for DNA search */ if (ppst->dnaseq == SEQT_DNA || ppst->dnaseq == SEQT_RNA) { ppst->histint = 4; if (!del_set) { #ifdef OLD_FASTA_GAP ppst->gdelval = -16; /* def. del penalty */ #else ppst->gdelval = -12; /* def. open penalty */ #endif } if (!gap_set) ppst->ggapval = -4; /* def. gap penalty */ if (pgm_def_arr[pgm_id].ktup > 0) { /* these parameters are used to scale optcut, they should be replaced by statistically based parameters */ if (!wid_set) ppst->param_u.fa.optwid = 16; ppst->param_u.fa.bestscale = 80; ppst->param_u.fa.bkfact = 5; ppst->param_u.fa.scfact = 1; ppst->param_u.fa.bktup = 6; ppst->param_u.fa.bestmax = 80; ppst->param_u.fa.bestoff = 45; if (!sw_flag_set) { ppst->sw_flag = 0; strncpy(m_msg->f_id1,"bs",sizeof(m_msg->f_id1)); } /* largest ktup */ mktup = 6; if (ppst->param_u.fa.pamfact >= 0) ppst->param_u.fa.pamfact = 0; if (ppst->param_u.fa.ktup < 0) ppst->param_u.fa.ktup = -ppst->param_u.fa.bktup; } ppst->nsq = nnt; ppst->nsqx = nntx; for (i=0; i<=ppst->nsqx; i++) { ppst->hsq[i] = hnt[i]; ppst->sq[i] = nt[i]; ppst->hsqx[i] = hntx[i]; ppst->sqx[i] = ntx[i]; } ppst->sq[ppst->nsqx+1] = ppst->sqx[ppst->nsqx+1] = '\0'; if (!ppst->pam_set) { if (ppst->p_d_set) mk_n_pam(npam,nnt,ppst->p_d_mat,ppst->p_d_mis); #if !defined(FASTS) && !defined(FASTM) else if (ppst->pamfile[0]=='\0' || strncmp(ppst->pamfile,"BL50",4)==0) { strncpy (ppst->pamfile, "+5/-4", sizeof(ppst->pamfile)); } #else else if (strncmp(ppst->pamfile,"MD20",4)==0) { strncpy (ppst->pamfile, "+2/-2", sizeof(ppst->pamfile)); ppst->p_d_mat = +2; ppst->p_d_mis = -2; mk_n_pam(npam,nnt,ppst->p_d_mat,ppst->p_d_mis); } #endif pam = npam; } strncpy (m_msg->sqnam, "nt",sizeof(m_msg->sqnam)); strncpy (m_msg->sqtype, "DNA",sizeof(m_msg->sqtype)); } /* end DNA reset */ else { /* other parameters for protein comparison */ if (pgm_def_arr[pgm_id].ktup > 0) { if (!wid_set) { if (ppst->param_u.fa.ktup==1) ppst->param_u.fa.optwid = 32; else ppst->param_u.fa.optwid = 16; } } if (!del_set) {ppst->gdelval += pgm_def_arr[pgm_id].g_open_mod;} if (!shift_set) {ppst->gshift = pgm_def_arr[pgm_id].gshift;} if (!subs_set) {ppst->gsubs = pgm_def_arr[pgm_id].hshift;} } } /* query_parm() this function asks for any additional parameters that have not been provided. Could be null. */ void query_parm (struct mngmsg *m_msp, struct pstruct *ppst) { char qline[40]; if (pgm_def_arr[ppst->pgm_id].ktup > 0) { if (ppst->param_u.fa.ktup < 0) ppst->param_u.fa.ktup = -ppst->param_u.fa.ktup; if (ppst->param_u.fa.ktup == 0) { printf (" ktup? (1 to %d) [%d] ", mktup, ppst->param_u.fa.bktup); if (fgets (qline, sizeof(qline), stdin) == NULL) exit (0); else sscanf(qline,"%d",&ppst->param_u.fa.ktup); } if (ppst->param_u.fa.ktup == 0) ppst->param_u.fa.ktup = ppst->param_u.fa.bktup; else ktup_set = 1; } #if defined(PRSS) if (m_msp->shuff_max < 10) m_msp->shuff_max = 200; if (!mshuff_set) { printf(" number of shuffles [%d]? ",m_msp->shuff_max); fflush(stdout); if (fgets (qline, sizeof(qline), stdin) == NULL) exit (0); else sscanf(qline,"%d",&m_msp->shuff_max); } if (ppst->zs_win == 0) { printf (" local (window) (w) or uniform (u) shuffle [u]? "); if (fgets (qline, sizeof(qline), stdin) == NULL) exit (0); else if (qline[0]=='w' || qline[0]=='W') { m_msp->shuff_wid = 20; printf(" local shuffle window size [%d]? ",m_msp->shuff_wid); if (fgets (qline, sizeof(qline), stdin) == NULL) exit (0); else sscanf(qline,"%d",&m_msp->shuff_wid); } } #endif } /* last_init() cannot look at aa0, n0, because it is only run once, it is not run before each new aa0 search */ void last_init (struct mngmsg *m_msg, struct pstruct *ppst #ifdef PCOMPLIB ,int nnodes #endif ) { int ix_l, ix_i, i, pgm_id; double *kar_p; double aa0_f[MAXSQ]; pgm_id = ppst->pgm_id; #if defined(FASTF) || defined(FASTS) || defined(FASTM) m_msg->nohist = 1; m_msg->shuff_max = 2000; #ifndef PCOMPLIB ppst->shuff_node = m_msg->shuff_max/max_workers; #else ppst->shuff_node = m_msg->shuff_max/nnodes; #endif #endif if (m_msg->aln.llen < 1) { m_msg->aln.llen = 60; } #ifndef PCOMPLIB #if defined(FASTX) || defined(FASTY) || defined(TFAST) /* set up translation tables: faatran.c */ aainit(ppst->tr_type,ppst->debug_lib); #endif #endif /* a sanity check */ #if !defined(TFAST) if (m_msg->revcomp && m_msg->qdnaseq!=SEQT_DNA && m_msg->qdnaseq!=SEQT_RNA) { fprintf(stderr," cannot reverse complement protein\n"); m_msg->revcomp = 0; } #endif if (pgm_def_arr[pgm_id].ktup > 0) { if (ppst->param_u.fa.ktup < 0) ppst->param_u.fa.ktup = -ppst->param_u.fa.ktup; if (ppst->param_u.fa.ktup < 1 || ppst->param_u.fa.ktup > mktup) { fprintf(stderr," warning ktup = %d out of range [1..%d], reset to %d\n", ppst->param_u.fa.ktup, mktup, ppst->param_u.fa.bktup); ppst->param_u.fa.ktup = ppst->param_u.fa.bktup; } } if (pgm_id == TFA_PID) { m_msg->revcomp *= 3; if (m_msg->nframe == 3) m_msg->nframe += m_msg->revcomp; } else if (pgm_id == TFX_PID || pgm_id == TFY_PID) { if (m_msg->nframe == 1) m_msg->nframe += m_msg->revcomp; } #if !defined(TFAST) /* for fasta/fastx searches, itt iterates the the query strand */ m_msg->nitt1 = m_msg->qframe-1; #else /* for tfasta/tfastxy searches, itt iterates the library frames */ m_msg->nitt1 = m_msg->nframe-1; #endif if (pgm_def_arr[pgm_id].ktup > 0) { if (ppst->param_u.fa.ktup>=2 && !wid_set) { ppst->param_u.fa.optwid=16; switch (pgm_id) { case FA_PID: m_msg->thr_fact = 32; break; case FX_PID: case FY_PID: m_msg->thr_fact = 16; break; case TFA_PID: case TFX_PID: case TFY_PID: m_msg->thr_fact = 8; break; default: m_msg->thr_fact = 4; } } else { m_msg->thr_fact = 4;} } else m_msg->thr_fact = 4; #if defined(PRSS) if (m_msg->shuff_max < 10) m_msg->shuff_max = 200; if (ppst->zsflag < 10) ppst->zsflag += 10; if (ppst->zs_win > 0) { m_msg->shuff_wid = ppst->zs_win; } #endif if (pgm_def_arr[ppst->pgm_id].ktup > 0) { if (ppst->param_u.fa.iniflag) { ppst->score_ix = 1; strncpy (m_msg->label, "initn init1", sizeof(m_msg->label)); } else if (ppst->param_u.fa.optflag) { ppst->score_ix = 2; m_msg->stages = 1; } } if (!ppst->have_pam2) { alloc_pam (MAXSQ, MAXSQ, ppst); init_pam2(ppst); } init_pamx(ppst); if (ppst->pam_ms) { if (m_msg->qdnaseq == SEQT_PROT) { /* code to make 'L'/'I' identical scores */ ix_l = pascii['L']; ix_i = pascii['I']; ppst->pam2[0][ix_l][ix_i] = ppst->pam2[0][ix_i][ix_l] = ppst->pam2[0][ix_l][ix_l] = ppst->pam2[0][ix_i][ix_i] = (ppst->pam2[0][ix_l][ix_l]+ppst->pam2[0][ix_i][ix_i]+1)/2; for (i=1; i<=ppst->nsq; i++) { ppst->pam2[0][i][ix_i] = ppst->pam2[0][i][ix_l] = (ppst->pam2[0][i][ix_l]+ppst->pam2[0][i][ix_i]+1)/2; ppst->pam2[0][ix_i][i] = ppst->pam2[0][ix_l][i] = (ppst->pam2[0][ix_i][i]+ppst->pam2[0][ix_l][i]+1)/2; } /* code to make 'Q'/'K' identical scores */ if (!shift_set) { ix_l = pascii['Q']; ix_i = pascii['K']; ppst->pam2[0][ix_l][ix_i] = ppst->pam2[0][ix_i][ix_l] = ppst->pam2[0][ix_l][ix_l] = ppst->pam2[0][ix_i][ix_i] = (ppst->pam2[0][ix_l][ix_l]+ppst->pam2[0][ix_i][ix_i]+1)/2; for (i=1; i<=ppst->nsq; i++) { ppst->pam2[0][i][ix_i] = ppst->pam2[0][i][ix_l] = (ppst->pam2[0][i][ix_l]+ppst->pam2[0][i][ix_i]+1)/2; ppst->pam2[0][ix_i][i] = ppst->pam2[0][ix_l][i] = (ppst->pam2[0][ix_i][i]+ppst->pam2[0][ix_l][i]+1)/2; } } } } /* print_pam(ppst); */ /* once we have a complete pam matrix, we can calculate Lambda and K for "average" sequences */ kar_p = NULL; init_karlin_a(ppst, aa0_f, &kar_p); do_karlin_a(ppst->pam2[0], ppst, aa0_f, kar_p, &m_msg->Lambda, &m_msg->K, &m_msg->H); free(kar_p); #if defined(FASTF) || defined(FASTS) || defined(FASTM) if (ppst->ext_sq_set) { fprintf(stderr," -S not available on [t]fast[fs]\n"); ppst->ext_sq_set = 0; /* reset sascii to ignore -S, map lc */ init_ascii(0,lascii,0); } #endif } /* this function is left over from the older FASTA format scoring matrices that allowed additional parameters (bktup, bkfact) to be set in the scoring matrix. It is no longer used. A modern version would set parameters based on lambda and K. */ /* void f_initpam (line, ppst) char *line; struct pstruct *ppst; { if (sscanf (line, " %d %d %d %d %d %d %d", &ppst->param_u.fa.scfact, &ppst->param_u.fa.bestoff, &ppst->param_u.fa.bestscale, &ppst->param_u.fa.bkfact, &ppst->param_u.fa.bktup, &ppst->param_u.fa.bestmax, &ppst->histint) != 7) { printf (" bestcut parameters - bad format\n"); exit (1); } } */ /* alloc_pam2 creates a profile structure */ int ** alloc_pam2p(int len, int nsq) { int i; int **pam2p; if ((pam2p = (int **)calloc(len,sizeof(int *)))==NULL) { fprintf(stderr," Cannot allocate pam2p: %d\n",len); return NULL; } if((pam2p[0] = (int *)calloc((nsq+1)*len,sizeof(int)))==NULL) { fprintf(stderr, "Cannot allocate pam2p[0]: %d\n", (nsq+1)*len); free(pam2p); return NULL; } for (i=1; i 0; gap /= 2) for (i = gap; i < nbest; i++) for (j = i - gap; j >= 0; j-= gap) { if (bptr[j]->score[irelv] >= bptr[j + gap]->score[irelv]) break; tmp = bptr[j]; bptr[j] = bptr[j + gap]; bptr[j + gap] = tmp; } } void show_aux(FILE *fp, struct beststr *bptr) {} void header_aux(FILE *fp) {} #else void sortbest (bptr, nbest, irelv) struct beststr **bptr; int nbest, irelv; { int gap, i, j; struct beststr *tmp; for (gap = nbest/2; gap > 0; gap /= 2) for (i = gap; i < nbest; i++) for (j = i - gap; j >= 0; j-= gap) { if (bptr[j]->escore < bptr[j + gap]->escore) break; tmp = bptr[j]; bptr[j] = bptr[j + gap]; bptr[j + gap] = tmp; } } #if defined(FASTS) || defined(FASTM) #ifndef PCOMPLIB /* this shuffle is for FASTS */ /* convert ',' -> '\0', shuffle each of the substrings */ void qshuffle(unsigned char *aa0, int n0, int nm0) { unsigned char **aa0start, *aap, tmp; int i,j,k, ns; if ((aa0start=(unsigned char **)calloc(nm0+1, sizeof(unsigned char *)))==NULL) { fprintf(stderr,"cannot calloc for qshuffle %d\n",nm0); exit(1); } aa0start[0]=aa0; for (k=1,i=0; i1; i--) { j = nrand(i); tmp = aap[j]; aap[j] = aap[i-1]; aap[i-1] = tmp; } aap[ns] = 0; } for (k=1; k 0 ; i--) { /* j = nrand(i); if (i == j) continue;*/ /* shuffle columns */ j = (nmoff -1 ) - i; if (i <= j) break; /* reverse columns */ /* swap all i'th column residues for all j'th column residues */ for(nmpos = 0, k = 0 ; k < nm0 ; k++, nmpos += nmoff+1 ) { tmp = aa0[nmpos + i]; aa0[nmpos + i] = aa0[nmpos + j]; aa0[nmpos + j] = tmp; } } } #endif #endif /* show additional best_str values */ void show_aux(FILE *fp, struct beststr *bptr) { fprintf(fp," %2d %3d",bptr->segnum,bptr->seglen); } void header_aux(FILE *fp) { fprintf(fp, " sn sl"); } #endif void fill_pam(int **pam2p, int n0, int nsq, double **freq2d, double scale) { int i, j; double freq; /* fprintf(stderr, "scale: %g\n", scale); */ /* now fill in the pam matrix: */ for (i = 0 ; i < n0 ; i++) { for (j = 1 ; j <=20 ; j++) { freq = scale * freq2d[i][j-1]; if ( freq < 0.0) freq -= 0.5; else freq += 0.5; pam2p[i][j] = (int)(freq); } } } double get_lambda(int **pam2p, int n0, int nsq, unsigned char *query) { double lambda, H; double *pr, tot, sum; int i, ioff, j, min, max; /* get min and max scores */ min = BIGNUM; max = -BIGNUM; if(pam2p[0][1] == -BIGNUM) { ioff = 1; n0++; } else { ioff = 0; } for (i = ioff ; i < n0 ; i++) { for (j = 1; j <= nsq ; j++) { if (min > pam2p[i][j]) min = pam2p[i][j]; if (max < pam2p[i][j]) max = pam2p[i][j]; } } /* fprintf(stderr, "min: %d\tmax:%d\n", min, max); */ if ((pr = (double *) calloc(max - min + 1, sizeof(double))) == NULL) { fprintf(stderr, "Couldn't allocate memory for score probabilities: %d\n", max - min + 1); exit(1); } tot = (double) rrtotal * (double) rrtotal * (double) n0; for (i = ioff ; i < n0 ; i++) { for (j = 1; j <= nsq ; j++) { pr[pam2p[i][j] - min] += (double) ((double) rrcounts[aascii[query[i]]] * (double) rrcounts[j]) / tot; } } sum = 0.0; for(i = 0 ; i <= max-min ; i++) { sum += pr[i]; /* fprintf(stderr, "%3d: %g %g\n", i+min, pr[i], sum); */ } /* fprintf(stderr, "sum: %g\n", sum); */ for(i = 0 ; i <= max-min ; i++) { pr[i] /= sum; } if (!karlin(min, max, pr, &lambda, &H)) { fprintf(stderr, "Karlin lambda estimation failed\n"); } /* fprintf(stderr, "lambda: %g\n", lambda); */ free(pr); return lambda; } /* *aa0 - query sequence n0 - length pamscale - scaling for pam matrix - provided by apam.c, either 0.346574 = ln(2)/2 (P120, BL62) or 0.231049 = ln(2)/3 (P250, BL50) */ void scale_pssm(int **pssm2p, double **freq2d, unsigned char *query, int n0, int **pam2, double pamscale); static unsigned char ustandard_aa[] ="\0ARNDCQEGHILKMFPSTWYV"; void read_pssm(unsigned char *aa0, int n0, int nsq, double pamscale, FILE *fp, int pgpf_type, struct pstruct *ppst) { int i, j, len, k; int qi, rj; /* qi - index query; rj - index residues (1-20) */ int **pam2p; int first, too_high; unsigned char *query, ctmp; char dline[512]; double freq, **freq2d, lambda, new_lambda; double scale, scale_high, scale_low; pam2p = ppst->pam2p[0]; if (pgpf_type == 0) { if(1 != fread(&len, sizeof(int), 1, fp)) { fprintf(stderr, "error reading from checkpoint file: %d\n", len); exit(1); } if(len != n0) { fprintf(stderr, "profile length (%d) and query length (%d) don't match!\n", len,n0); exit(1); } /* read over query sequence stored in BLAST profile */ if(NULL == (query = (unsigned char *) calloc(len+2, sizeof(char)))) { fprintf(stderr, "Couldn't allocate memory for query!\n"); exit(1); } if(len != fread(query, sizeof(char), len, fp)) { fprintf(stderr, "Couldn't read query sequence from profile: %s\n", query); exit(1); } } else if (pgpf_type == 1) { if ((fgets(dline,sizeof(dline),fp) == NULL) || (1 != sscanf(dline, "%d",&len))) { fprintf(stderr, "error reading from checkpoint file: %d\n", len); exit(1); } if(len != n0) { fprintf(stderr, "profile length (%d) and query length (%d) don't match!\n", len,n0); exit(1); } /* read over query sequence stored in BLAST profile */ if(NULL == (query = (unsigned char *) calloc(len+2, sizeof(char)))) { fprintf(stderr, "Couldn't allocate memory for query!\n"); exit(1); } if (fgets((char *)query,len+2,fp)==NULL) { fprintf(stderr, "Couldn't read query sequence from profile: %s\n", query); exit(1); } } else { fprintf(stderr," Unrecognized PSSM file type: %d\n",pgpf_type); exit(1); } /* currently we don't do anything with query; ideally, we should check to see that it actually matches aa0 ... */ /* quick 2d array alloc: */ if((freq2d = (double **) calloc(n0, sizeof(double *))) == NULL) { fprintf(stderr, "Couldn't allocate memory for frequencies!\n"); exit(1); } if((freq2d[0] = (double *) calloc(n0 * 20, sizeof(double))) == NULL) { fprintf(stderr, "Couldn't allocate memory for frequencies!\n"); exit(1); } /* a little pointer arithmetic to fill out 2d array: */ for (i = 1 ; i < n0 ; i++) { freq2d[i] = freq2d[i-1] + 20; } if (pgpf_type == 0) { for (qi = 0 ; qi < n0 ; qi++) { for (rj = 0 ; rj < 20 ; rj++) { if(1 != fread(&freq, sizeof(double), 1, fp)) { fprintf(stderr, "Error while reading frequencies!\n"); exit(1); } freq2d[qi][rj] = freq; } } } else { for (qi = 0 ; qi < n0 ; qi++) { if ((fgets(dline,sizeof(dline),fp) ==NULL) || (k = sscanf(dline,"%c %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg\n", &ctmp, &freq2d[qi][0], &freq2d[qi][1], &freq2d[qi][2], &freq2d[qi][3], &freq2d[qi][4], &freq2d[qi][5], &freq2d[qi][6], &freq2d[qi][7], &freq2d[qi][8], &freq2d[qi][9], &freq2d[qi][10], &freq2d[qi][11], &freq2d[qi][12], &freq2d[qi][13], &freq2d[qi][14], &freq2d[qi][15], &freq2d[qi][16], &freq2d[qi][17], &freq2d[qi][18], &freq2d[qi][19]))<1) { fprintf(stderr, "Error while reading frequencies: %d read!\n",k); exit(1); } for (rj=0; rj<20; rj++) { freq2d[qi][rj] /= 10.0; } /* reverse scaling */ } } scale_pssm(ppst->pam2p[0], freq2d, query, n0, ppst->pam2[0],pamscale); free(freq2d[0]); free(freq2d); free(query); } void scale_pssm(int **pssm2p, double **freq2d, unsigned char *query, int n0, int **pam2, double pamscale) { int i, qi, rj; double freq, new_lambda, lambda; int first, too_high; double scale, scale_high, scale_low; for (qi = 0 ; qi < n0 ; qi++) { for (rj = 0 ; rj < 20 ; rj++) { if (freq2d[qi][rj] > 1e-20) { freq = log(freq2d[qi][rj] /((double) (rrcounts[rj+1])/(double) rrtotal)); freq /= pamscale; /* this gets us close to originial pam scores */ freq2d[qi][rj] = freq; } else { /* when blastpgp decides to leave something out, it puts 0's in all the frequencies in the binary checkpoint file. In the ascii version, however, it uses BLOSUM62 values. I will put in scoring matrix values as well */ freq2d[qi][rj] = pam2[aascii[query[qi]]][rj+1]; } } } /* now figure out the right scale */ scale = 1.0; lambda = get_lambda(pam2, 20, 20, ustandard_aa); /* should be near 1.0 because of our initial scaling by ppst->pamscale */ /* fprintf(stderr, "real_lambda: %g\n", lambda); */ /* get initial high/low scale values: */ first = 1; while (1) { fill_pam(pssm2p, n0, 20, freq2d, scale); new_lambda = get_lambda(pssm2p, n0, 20, query); if (new_lambda > lambda) { if (first) { first = 0; scale = scale_high = 1.0 + 0.05; scale_low = 1.0; too_high = 1; } else { if (!too_high) break; scale = (scale_high += scale_high - 1.0); } } else if (new_lambda > 0) { if (first) { first = 0; scale_high = 1.0; scale = scale_low = 1.0 - 0.05; too_high = 0; } else { if (too_high) break; scale = (scale_low += scale_low - 1.0); } } else { fprintf(stderr, "new_lambda (%g) <= 0; matrix has positive average score", new_lambda); exit(1); } } /* now do binary search between low and high */ for (i = 0 ; i < 10 ; i++) { scale = 0.5 * (scale_high + scale_low); fill_pam(pssm2p, n0, 20, freq2d, scale); new_lambda = get_lambda(pssm2p, n0, 20, query); if (new_lambda > lambda) scale_low = scale; else scale_high = scale; } scale = 0.5 * (scale_high + scale_low); fill_pam(pssm2p, n0, 20, freq2d, scale); /* fprintf(stderr, "final scale: %g\n", scale); for (qi = 0 ; qi < n0 ; qi++) { fprintf(stderr, "%4d %c: ", qi+1, query[qi]); for (rj = 1 ; rj <= 20 ; rj++) { fprintf(stderr, "%4d", pssm2p[qi][rj]); } fprintf(stderr, "\n"); } */ } #if defined(SSEARCH) || (defined(PRSS) && !defined(FASTX)) int parse_pssm_asn_fa(FILE *afd, int *n_rows, int *n_cols, unsigned char **query, double ***freqs, char *matrix, int *gap_open, int *gap_extend, double *lambda); /* the ASN.1 pssm includes information about the scoring matrix used (though not the gap penalty in the current version PSSM:2) The PSSM scoring matrix and gap penalties should become the default if they have not been set explicitly. */ int read_asn_pssm(unsigned char *aa0, int n0, int nsq, double pamscale, FILE *fp, struct pstruct *ppst) { int i, j, len, k; int qi, rj; /* qi - index query; rj - index residues (1-20) */ int **pam2p; int first, too_high; unsigned char *query, ctmp; char dline[512]; char matrix[MAX_SSTR]; double psi2_lambda; double freq, **freq2d, lambda, new_lambda; double scale, scale_high, scale_low; int gap_open, gap_extend; int n_rows, n_cols; pam2p = ppst->pam2p[0]; if (parse_pssm_asn_fa(fp, &n_rows, &n_cols, &query, &freq2d, matrix, &gap_open, &gap_extend, &psi2_lambda)<=0) { return -1; } if (!gap_set) { if (gap_open) { if (gap_open > 0) {gap_open = -gap_open;} ppst->gdelval = gap_open; } else if (strncmp(matrix,"BLOSUM62",8)==0) { ppst->gdelval = -11; } gap_set = 1; } if (!del_set) { if (gap_extend) { if (gap_extend > 0) {gap_extend = -gap_extend;} ppst->ggapval = gap_extend; } else if (strncmp(matrix,"BLOSUM62",8)==0) { ppst->ggapval = -1; } del_set = 1; } if (strncmp(matrix, "BLOSUM62", 8)== 0 && !ppst->pam_set) { strncpy(ppst->pamfile, "BL62", 120); standard_pam(ppst->pamfile,ppst,del_set, gap_set); if (!ppst->have_pam2) { alloc_pam (MAXSQ, MAXSQ, ppst); } init_pam2(ppst); ppst->pam_set = 1; } if (n_cols < n0) { fprintf(stderr, " query length: %d != n_cols: %d\n",n0, n_cols); exit(1); } scale_pssm(ppst->pam2p[0], freq2d, query, n0, ppst->pam2[0],pamscale); free(freq2d[0]); free(freq2d); free(query); return 1; } #endif void last_params(unsigned char *aa0, int n0, struct mngmsg *m_msg, struct pstruct *ppst #ifdef PCOMPLIB , struct qmng_str *qm_msg #endif ) { int i, nsq; FILE *fp; if (n0 < 0) { return;} ppst->n0 = m_msg->n0; if (ppst->ext_sq_set) { nsq = ppst->nsqx; } else {nsq = ppst->nsq;} /* currently, profiles are only available for SSEARCH, PRSS */ #if defined(SSEARCH) || defined(PRSS) ppst->pam2p[0] = alloc_pam2p(n0,nsq); ppst->pam2p[1] = alloc_pam2p(n0,nsq); if (ppst->pam_pssm) { if ((ppst->pgpfile_type == 0) && (fp=fopen(ppst->pgpfile,"rb"))) { read_pssm(aa0, n0, ppst->nsq, ppst->pamscale, fp, 0, ppst); extend_pssm(aa0, n0, ppst); } else if ((ppst->pgpfile_type == 1) && (fp=fopen(ppst->pgpfile,"r"))) { read_pssm(aa0, n0, ppst->nsq, ppst->pamscale, fp, 1, ppst); extend_pssm(aa0, n0, ppst); } #if defined(SSEARCH) || (defined(PRSS) && !defined(FASTX)) else if ((ppst->pgpfile_type == 2) && (fp=fopen(ppst->pgpfile,"rb"))) { if (read_asn_pssm(aa0, n0, ppst->nsq, ppst->pamscale, fp, ppst)>0) { extend_pssm(aa0, n0, ppst); } else { fprintf(stderr," Could not parse PSSM file: %s\n",ppst->pgpfile); ppst->pam_pssm = 0; return; } } #endif else { fprintf(stderr," Could not open PSSM file: %s\n",ppst->pgpfile); ppst->pam_pssm = 0; return; } } #endif #if defined(FASTF) || defined(FASTS) || defined(FASTM) m_msg->nm0 = 1; for (i=0; inm0++; /* for FASTS, we can do statistics in one of two different ways if there are <= 10 query fragments, then we calculate probabilistic scores for every library sequence. If there are > 10 fragments, this takes much too long and too much memory, so we use the old fashioned raw score only z-score normalized method initially, and then calculate the probabilistic scores for the best hits. To scale those scores, we also need a set of random probabilistic scores. So we do the qshuffle to get them. For FASTF, precalculating probabilities is prohibitively expensive, so we never do it; FASTF always acts like FASTS with nfrags>10. */ #if defined(FASTS) || defined(FASTM) if (m_msg->nm0 > 10) m_msg->escore_flg = 0; else m_msg->escore_flg = 1; #endif if (m_msg->escore_flg && (ppst->zsflag&1)) { m_msg->last_calc_flg = 0; m_msg->qshuffle = 0; } else { /* need random query, second set of 2000 scores */ m_msg->last_calc_flg = 1; m_msg->qshuffle = 1; } #else m_msg->last_calc_flg = 0; m_msg->qshuffle = 0; m_msg->escore_flg = 0; m_msg->nm0 = 1; #endif /* adjust the ktup if appropriate */ if (!ktup_set && pgm_def_arr[ppst->pgm_id].ktup > 0) { if (m_msg->qdnaseq == SEQT_PROT) { ppst->param_u.fa.ktup = pgm_def_arr[ppst->pgm_id].ktup; #if defined(FASTS) || defined(FASTM) if (n0 > 100) ppst->param_u.fa.ktup = 2; #endif if (n0 < 40) ppst->param_u.fa.ktup = 1; } else if (m_msg->qdnaseq == SEQT_DNA || m_msg->qdnaseq == SEQT_RNA) { if (n0 < 20) ppst->param_u.fa.ktup = 1; #if defined(FASTS) || defined(FASTM) /* with the current (April 12 2005) dropfs2.c - ktup cannot be > 2 */ else ppst->param_u.fa.ktup = 2; #else else if (n0 < 50) ppst->param_u.fa.ktup = 2; else if (n0 < 100) ppst->param_u.fa.ktup = 3; #endif } } #ifdef PCOMPLIB qm_msg->nm0 = m_msg->nm0; qm_msg->escore_flg = m_msg->escore_flg; qm_msg->qshuffle = m_msg->qshuffle; qm_msg->pam_pssm = 0; #endif } /* given a good profile in ppst->pam2p[0], make an extended profile in ppst->pam2p[1] */ void extend_pssm(unsigned char *aa0, int n0, struct pstruct *ppst) { int i, j, nsq; int sa_x, sa_t, sa_b, sa_z; int **pam2p0, **pam2p1; nsq = ppst->nsq; pam2p0 = ppst->pam2p[0]; pam2p1 = ppst->pam2p[1]; sa_x = pascii['X']; sa_t = pascii['*']; sa_b = pascii['B']; sa_z = pascii['Z']; /* fill in boundaries, B, Z, *, X */ for (i=0; ipam_xm; pam2p0[i][sa_t] = ppst->pam_xm; } /* copy pam2p0 into pam2p1 */ for (i=0; insq; j++) { pam2p1[i][j] = pam2p0[i][j]; } } /* then fill in extended characters, if necessary */ if (ppst->ext_sq_set) { for (i=0; insq; j++) { pam2p0[i][nsq+j] = pam2p0[i][j]; pam2p1[i][nsq+j] = ppst->pam_xm; } } } }