/*****************************************************************************/ /*** (genwin.c) ***/ /*****************************************************************************/ /*--------------------------------------------------------------(includes)---*/ #include "genwin.h" /*---------------------------------------------------------------(defines)---*/ #define STRSIZE 100 /*----------------------------------------------------------------(protos)---*/ struct Sequence *readentry(); /*---------------------------------------------------------------(globals)---*/ char *blastdbs[] = {"bba", "bbn", "embl", "gbupdate", "genbank", "genpept", "gpupdate", "nr", "nrdb", "nrdb.shuf", "pir", "pseq", "swissprot", "tfdaa"}; int nblastdbs = 14; #ifndef MIN #define MIN(a,b) ((a) <= (b) ? (a) : (b)) #endif char *blastdir = "/net/cruncher/usr/ncbi/db/fasta/"; char *indexdir = "/net/cruncher/usr/ncbi/db/index/"; int nabets; struct Alphabet **abets; int ntvecs; struct TransVector **tvecs; int nsvecs; struct ScoreVector **svecs; int nsmats; struct ScoreMatrix **smats; int aaindex[128]; unsigned char aaflag[128]; char aachar[20]; struct strlist { char string[STRSIZE]; struct strlist *next; } *str, *curstr; /*---------------------------------------------------------------(tmalloc)---*/ #define TESTMAX 1000 void *tmalloc(); int record_ptrs[TESTMAX] = {0,0,0,0}; int rptr = 0; /*------------------------------------------------------------(genwininit)---*/ genwininit() { char *cp, *cp0; int i; char c; for (i = 0; i < sizeof(aaindex)/sizeof(aaindex[0]); ++i) { aaindex[i] = 20; aaflag[i] = TRUE; } for (cp = cp0 = "ACDEFGHIKLMNPQRSTVWY"; (c = *cp) != '\0'; ++cp) { i = cp - cp0; aaindex[c] = i; aaindex[tolower(c)] = i; aachar[i] = tolower(c); aaflag[c] = FALSE; aaflag[tolower(c)] = FALSE; } return; } /*-------------------------------------------------------------(opendbase)---*/ extern struct Database *opendbase(name) char *name; {struct Database *dbase; dbase = (struct Database *) malloc(sizeof(struct Database)); if (blastdb(name)) { dbase->filename = (char *) malloc(strlen(blastdir)+strlen(name)+1); dbase->indexname = (char *) malloc(strlen(indexdir)+strlen(name)+1); strcpy(dbase->filename, blastdir); strcat(dbase->filename, name); strcpy(dbase->indexname, indexdir); strcat(dbase->indexname, name); } else { dbase->filename = (char *) malloc(strlen(name)+1); dbase->indexname = (char *) malloc(strlen(name)+1); strcpy(dbase->filename, name); strcpy(dbase->indexname, name); } if (strcmp(dbase->filename, "-")==0) { dbase->fp = stdin; } else if ((dbase->fp=fopen(dbase->filename, "r"))==NULL) { free(dbase->filename); free(dbase->indexname); free(dbase); return((struct Database *) NULL); } dbase->filepos = 0L; return(dbase); } /*---------------------------------------------------------------(blastdb)---*/ int blastdb(name) char *name; {int i; for (i=0; ifp); free(dbase->filename); free(dbase->indexname); free(dbase); return; } /*--------------------------------------------------------------(firstseq)---*/ extern struct Sequence *firstseq(dbase) struct Database *dbase; { if (dbase->filepos!=0L) { dbase->filepos = 0L; if (fseek(dbase->fp, dbase->filepos, 0)!=0) {fprintf(stderr, "Error positioning file %s for firstseq.\n", dbase->filename); exit(1);} } return(readentry(dbase)); } /*---------------------------------------------------------------(nextseq)---*/ extern struct Sequence *nextseq(dbase) struct Database *dbase; { return(readentry(dbase)); } /*--------------------------------------------------------------(closeseq)---*/ extern closeseq(seq) struct Sequence *seq; { if (seq==NULL) return; if (seq->id!=NULL) free(seq->id); if (seq->name!=NULL) free(seq->name); if (seq->organism!=NULL) free(seq->organism); if (seq->header!=NULL) free(seq->header); if (seq->state!=NULL) free(seq->state); if (seq->composition!=NULL) free(seq->composition); free(seq->seq); free(seq); return; } /*---------------------------------------------------------------(openwin)---*/ extern struct Sequence *openwin(parent, start, length) struct Sequence *parent; int start, length; {struct Sequence *win; int i; if (start<0 || length<0 || start+length>parent->length) { return((struct Sequence *) NULL); } win = (struct Sequence *) malloc(sizeof(struct Sequence)); /*--- ---[set links, up and down]---*/ win->parent = parent; if (parent->root==NULL) {win->root = parent;} else {win->root = parent->root;} win->children = (struct Sequence **) NULL; /* parent->children = ***foo*** ---[not yet implemented]---*/ win->id = (char *) NULL; win->name = (char *) NULL; win->organism = (char *) NULL; win->header = (char *) NULL; /*--- ---[install the local copy of the sequence]---*/ win->start = start; win->length = length; #if 0 win->seq = (char *) malloc(sizeof(char)*length + 1); memcpy(win->seq, (parent->seq)+start, length); win->seq[length] = '\0'; #else win->seq = parent->seq + start; #endif /*--- ---[setup window implementation parameters]---*/ /*--- ---[set local flags]---*/ win->rubberwin = FALSE; win->floatwin = FALSE; win->punctuation = FALSE; /*--- ---[initially unconfiguerd window]---*/ win->entropy = -2.; win->state = (int *) NULL; win->composition = (int *) NULL; win->classvec = (char *) NULL; win->scorevec = (double *) NULL; stateon(win); return win; } /*---------------------------------------------------------------(nextwin)---*/ extern struct Sequence *nextwin(win, shift) struct Sequence *win; int shift; { if ((win->start+shift)<0 || (win->start+win->length+shift)>win->parent->length) { return((struct Sequence *) NULL); } else { return(openwin(win->parent, win->start+shift, win->length)); } } /*--------------------------------------------------------------(shiftwin1)---*/ static void decrementsv(), incrementsv(); extern int shiftwin1(win) struct Sequence *win; { register int j, length; register int *comp; length = win->length; comp = win->composition; if ((++win->start + length) > win->parent->length) { --win->start; return FALSE; } if (!aaflag[j = win->seq[0]]) decrementsv(win->state, comp[aaindex[j]]--); j = win->seq[length]; ++win->seq; if (!aaflag[j]) incrementsv(win->state, comp[aaindex[j]]++); if (win->entropy > -2.) win->entropy = entropy(win->state); return TRUE; } /*--------------------------------------------------------------(closewin)---*/ extern closewin(win) struct Sequence *win; { if (win==NULL) return; if (win->state!=NULL) free(win->state); if (win->composition!=NULL) free(win->composition); if (win->classvec!=NULL) free(win->classvec); if (win->scorevec!=NULL) free(win->scorevec); free(win); return; } /*----------------------------------------------------------------(compon)---*/ extern compon(win) struct Sequence *win; { register int *comp; register int aa; register char *seq, *seqmax; win->composition = comp = (int *) calloc(20*sizeof(*comp), 1); seq = win->seq; seqmax = seq + win->length; while (seq < seqmax) { aa = *seq++; if (!aaflag[aa]) comp[aaindex[aa]]++; } return; } /*---------------------------------------------------------------(stateon)---*/ static int state_cmp(s1, s2) int *s1, *s2; { return *s2 - *s1; } extern stateon(win) struct Sequence *win; { register int aa, nel, c; if (win->composition == NULL) compon(win); win->state = (int *) malloc(21*sizeof(win->state[0])); for (aa = nel = 0; aa < 20; ++aa) { if ((c = win->composition[aa]) == 0) continue; win->state[nel++] = c; } for (aa = nel; aa < 21; ++aa) win->state[aa] = 0; qsort(win->state, nel, sizeof(win->state[0]), state_cmp); return; } /*-----------------------------------------------------------------(enton)---*/ extern enton(win) struct Sequence *win; { if (win->state==NULL) {stateon(win);} win->entropy = entropy(win->state); return; } /*---------------------------------------------------------------(entropy)---*/ static int thewindow; static double *entray; #define LN2 0.69314718055994530941723212145818 void entropy_init(window) int window; { int i; double x, xw; entray = (double *)malloc((window+1) * sizeof(*entray)); xw = window; for (i = 1; i <= window; ++i) { x = i / xw; entray[i] = -x * log(x) / LN2; } thewindow = window; } extern double entropy(sv) register int *sv; { int *sv0 = sv; register double ent; register int i, total; register int *svmax; register double xtotrecip, xsv; for (total = 0; (i = *sv) != 0; ++sv) total += i; svmax = sv; ent = 0.0; if (total == thewindow) { for (sv = sv0; sv < svmax; ) { ent += entray[*sv++]; } return ent; } if (total == 0) return 0.; xtotrecip = 1./(double)total; for (sv = sv0; sv < svmax; ) { xsv = *sv++; ent += xsv * log(xsv * xtotrecip); } return -ent * xtotrecip / LN2; } /*-----------------------------------------------------------(decrementsv)---*/ static void decrementsv(sv, class) register int *sv; register int class; { register int svi; while ((svi = *sv++) != 0) { if (svi == class && *sv < class) { sv[-1] = svi - 1; break; } } } /*-----------------------------------------------------------(incrementsv)---*/ static void incrementsv(sv, class) register int *sv; int class; { for (;;) { if (*sv++ == class) { sv[-1]++; break; } } } /*-------------------------------------------------------------(readentry)---*/ struct Sequence *readentry(dbase) struct Database *dbase; {struct Sequence *seq; int c; seq = (struct Sequence *) malloc(sizeof(struct Sequence)); seq->db = dbase; /*--- ---[backpointers null at the top]---*/ seq->parent = (struct Sequence *) NULL; seq->root = (struct Sequence *) NULL; seq->children = (struct Sequence **) NULL; /*--- ---[set flags]---*/ seq->rubberwin = FALSE; seq->floatwin = FALSE; /*--- ---[read from file]---*/ if (!readhdr(seq)) { return((struct Sequence *) NULL); } while (1) /*---[skip multiple headers]---*/ { c = getc(dbase->fp); if (c == EOF) break; if (c != '>') { ungetc(c, dbase->fp); break; } while ((c=getc(dbase->fp)) != EOF && c !='\n') ; if (c == EOF) break; } readseq(seq); /*--- ---[set implementation parameters]---*/ /*--- ---[initially unconfigured]---*/ seq->entropy = -2.; seq->state = (int *) NULL; seq->composition = (int *) NULL; seq->classvec = (char *) NULL; seq->scorevec = (double *) NULL; return(seq); } /*---------------------------------------------------------------(readhdr)---*/ readhdr(seq) struct Sequence *seq; {FILE *fp; char *bptr, *curpos; int c, i, itotal; int idend, namend, orgend; fp = seq->db->fp; if ((c=getc(fp)) == EOF) { free(seq); return(FALSE); } while (c != EOF && isspace(c)) { c = getc(fp); } if (c!='>') {fprintf(stderr, "Error reading fasta format - '>' not found.\n"); exit(1);} ungetc(c, fp); /* ---[read the header line]---*/ str = (struct strlist *) malloc (sizeof(struct strlist)); str->next = NULL; curstr = str; for (i=0,itotal=0,c=getc(fp); c != EOF; c=getc(fp)) { if (c=='\n') break; if (i==STRSIZE-1) {curstr->string[i] = '\0'; curstr->next = (struct strlist *) malloc (sizeof(struct strlist)); curstr = curstr->next; curstr->next = NULL; i = 0;} curstr->string[i] = c; itotal++; i++; } curstr->string[i] = '\0'; seq->header = (char *) malloc (itotal+2); seq->header[0] = '\0'; for (curstr=str, curpos=seq->header; curstr!=NULL;) { if (curstr->next==NULL) {memccpy(curpos, curstr->string, '\0', STRSIZE);} else {memccpy(curpos, curstr->string, '\0', STRSIZE-1);} str = curstr; curstr = curstr->next; free (str); if (curstr!=NULL) {curpos = curpos+STRSIZE-1;} } bptr = (seq->header)+1; seq->name = (char *) NULL; seq->organism = (char *) NULL; /* ---[parse out the id]---*/ idend = findchar(bptr, ' '); if (idend==-1) {idend = findchar(bptr, '\n');} if (idend==-1) {idend = findchar(bptr, '\0');} if (idend==-1) {fprintf(stderr, "Error parsing header line - id.\n"); fputs(seq->header, fp); exit(1);} seq->id = (char *) malloc((idend+1)*sizeof(char)); memcpy(seq->id, bptr, idend); seq->id[idend] = '\0'; if (bptr[idend]=='\n' || bptr[idend]=='\0') {return(TRUE);} /* ---[parse out the protein name]---*/ bptr = bptr + idend + 1; while (bptr[0]==' ') {bptr++;} namend = findchar(bptr, '-'); if (namend==-1) {namend = findchar(bptr, '\n');} if (namend==-1) {namend = findchar(bptr, '\0');} if (namend==-1) {fprintf(stderr, "Error parsing header line - name.\n"); fputs(seq->header, fp); return(TRUE);} seq->name = (char *) malloc((namend+1)*sizeof(char)); memcpy(seq->name, bptr, namend); seq->name[namend] = '\0'; if (bptr[namend]=='\n' || bptr[namend]=='\0') {return(TRUE);} /* ---[parse out organism]---*/ bptr = bptr + namend + 1; while (bptr[0]==' ') {bptr++;} orgend = findchar(bptr, '|'); if (orgend==-1) {orgend = findchar(bptr, '#');} if (orgend==-1) {orgend = findchar(bptr, '\n');} if (orgend==-1) {orgend = findchar(bptr, '\0');} if (orgend==-1) {fprintf(stderr, "Error parsing header line - organism.\n"); fputs(seq->header, fp); return(TRUE);} seq->organism = (char *) malloc((orgend+1)*sizeof(char)); memcpy(seq->organism, bptr, orgend); seq->organism[orgend] = '\0'; /* ---[skip over multiple header lines]---*/ while (TRUE) { c = getc(fp); if (c == EOF) return(TRUE); if (c=='>') { skipline(fp); } else { ungetc(c,fp); break; } } return(TRUE); } /*--------------------------------------------------------------(skipline)---*/ skipline(fp) FILE *fp; {int c; while ((c=getc(fp))!='\n' && c!=EOF) ; return; } /*--------------------------------------------------------------(findchar)---*/ extern int findchar(str, chr) char *str; char chr; {int i; for (i=0; ; i++) { if (str[i]==chr) { return(i); } if (str[i]=='\0') { return(-1); } } } /*---------------------------------------------------------------(readseq)---*/ readseq(seq) struct Sequence *seq; {FILE *fp; int i, itotal; int c; char *curpos; fp = seq->db->fp; seq->punctuation = FALSE; str = (struct strlist *) malloc (sizeof(struct strlist)); str->next = NULL; curstr = str; for (i = 0, itotal = 0, c = getc(fp); c != EOF; c = getc(fp)) { if (!aaflag[c]) { Keep: if (i < STRSIZE-1) { curstr->string[i++] = c; continue; } itotal += STRSIZE-1; curstr->string[STRSIZE-1] = '\0'; curstr->next = (struct strlist *) malloc(sizeof(*curstr)); curstr = curstr->next; curstr->next = NULL; curstr->string[0] = c; i = 1; continue; } switch (c) { case '>': ungetc(c, fp); goto EndLoop; case '*': case '-': seq->punctuation = TRUE; goto Keep; case 'b': case 'B': case 'u': case 'U': /* selenocysteine */ case 'x': case 'X': case 'z': case 'Z': goto Keep; default: continue; } } EndLoop: itotal += i; curstr->string[i] = '\0'; seq->seq = (char *) malloc (itotal+2); seq->seq[0] = '\0'; for (curstr = str, curpos = seq->seq; curstr != NULL;) { if (curstr->next == NULL) memccpy(curpos, curstr->string, '\0', STRSIZE); else memccpy(curpos, curstr->string, '\0', STRSIZE-1); str = curstr; curstr = curstr->next; free(str); if (curstr != NULL) curpos = curpos+STRSIZE-1; } seq->length = strlen(seq->seq); return; } /*-----------------------------------------------------------------(upper)---*/ extern upper(string, len) register char *string; size_t len; { register char *stringmax, c; for (stringmax = string + len; string < stringmax; ++string) if (islower(c = *string)) *string = toupper(c); } /*-----------------------------------------------------------------(lower)---*/ extern lower(string, len) char *string; size_t len; { register char *stringmax, c; for (stringmax = string + len; string < stringmax; ++string) if (isupper(c = *string)) *string = tolower(c); } /*-------------------------------------------------------------------(min)---*/ int min(a, b) int a, b; { if (aTESTMAX) { exit(2); } record_ptrs[rptr] = (int) ptr; rptr++; return(ptr); } /*-----------------------------------------------------------------(tfree)---*/ tfree(ptr) void *ptr; {int i; for (i=0; i