2 /*****************************************************************************/
4 /*****************************************************************************/
6 /*--------------------------------------------------------------(includes)---*/
10 /*---------------------------------------------------------------(defines)---*/
14 /*----------------------------------------------------------------(protos)---*/
16 struct Sequence *readentry();
18 /*---------------------------------------------------------------(globals)---*/
21 {"bba", "bbn", "embl", "gbupdate", "genbank", "genpept", "gpupdate",
22 "nr", "nrdb", "nrdb.shuf", "pir", "pseq", "swissprot", "tfdaa"};
27 #define MIN(a,b) ((a) <= (b) ? (a) : (b))
30 char *blastdir = "/net/cruncher/usr/ncbi/db/fasta/";
31 char *indexdir = "/net/cruncher/usr/ncbi/db/index/";
34 struct Alphabet **abets;
36 struct TransVector **tvecs;
38 struct ScoreVector **svecs;
40 struct ScoreMatrix **smats;
43 unsigned char aaflag[128];
52 /*---------------------------------------------------------------(tmalloc)---*/
56 int record_ptrs[TESTMAX] = {0,0,0,0};
59 /*------------------------------------------------------------(genwininit)---*/
67 for (i = 0; i < sizeof(aaindex)/sizeof(aaindex[0]); ++i) {
72 for (cp = cp0 = "ACDEFGHIKLMNPQRSTVWY"; (c = *cp) != '\0'; ++cp) {
75 aaindex[tolower(c)] = i;
76 aachar[i] = tolower(c);
78 aaflag[tolower(c)] = FALSE;
83 /*-------------------------------------------------------------(opendbase)---*/
85 extern struct Database *opendbase(name)
88 {struct Database *dbase;
90 dbase = (struct Database *) malloc(sizeof(struct Database));
94 dbase->filename = (char *) malloc(strlen(blastdir)+strlen(name)+1);
95 dbase->indexname = (char *) malloc(strlen(indexdir)+strlen(name)+1);
96 strcpy(dbase->filename, blastdir);
97 strcat(dbase->filename, name);
98 strcpy(dbase->indexname, indexdir);
99 strcat(dbase->indexname, name);
103 dbase->filename = (char *) malloc(strlen(name)+1);
104 dbase->indexname = (char *) malloc(strlen(name)+1);
105 strcpy(dbase->filename, name);
106 strcpy(dbase->indexname, name);
109 if (strcmp(dbase->filename, "-")==0)
113 else if ((dbase->fp=fopen(dbase->filename, "r"))==NULL)
115 free(dbase->filename);
116 free(dbase->indexname);
118 return((struct Database *) NULL);
126 /*---------------------------------------------------------------(blastdb)---*/
133 for (i=0; i<nblastdbs; i++)
135 if (strcmp(name, blastdbs[i])==0) {return(TRUE);}
141 /*------------------------------------------------------------(closedbase)---*/
143 extern closedbase(dbase)
144 struct Database *dbase;
148 free(dbase->filename);
149 free(dbase->indexname);
155 /*--------------------------------------------------------------(firstseq)---*/
157 extern struct Sequence *firstseq(dbase)
158 struct Database *dbase;
161 if (dbase->filepos!=0L)
164 if (fseek(dbase->fp, dbase->filepos, 0)!=0)
165 {fprintf(stderr, "Error positioning file %s for firstseq.\n",
170 return(readentry(dbase));
173 /*---------------------------------------------------------------(nextseq)---*/
175 extern struct Sequence *nextseq(dbase)
176 struct Database *dbase;
179 return(readentry(dbase));
182 /*--------------------------------------------------------------(closeseq)---*/
185 struct Sequence *seq;
188 if (seq==NULL) return;
190 if (seq->id!=NULL) free(seq->id);
191 if (seq->name!=NULL) free(seq->name);
192 if (seq->organism!=NULL) free(seq->organism);
193 if (seq->header!=NULL) free(seq->header);
194 if (seq->state!=NULL) free(seq->state);
195 if (seq->composition!=NULL) free(seq->composition);
201 /*---------------------------------------------------------------(openwin)---*/
203 extern struct Sequence *openwin(parent, start, length)
204 struct Sequence *parent;
207 {struct Sequence *win;
210 if (start<0 || length<0 || start+length>parent->length)
212 return((struct Sequence *) NULL);
215 win = (struct Sequence *) malloc(sizeof(struct Sequence));
217 /*--- ---[set links, up and down]---*/
219 win->parent = parent;
220 if (parent->root==NULL)
221 {win->root = parent;}
223 {win->root = parent->root;}
224 win->children = (struct Sequence **) NULL;
226 /* parent->children = ***foo*** ---[not yet implemented]---*/
228 win->id = (char *) NULL;
229 win->name = (char *) NULL;
230 win->organism = (char *) NULL;
231 win->header = (char *) NULL;
233 /*--- ---[install the local copy of the sequence]---*/
236 win->length = length;
238 win->seq = (char *) malloc(sizeof(char)*length + 1);
239 memcpy(win->seq, (parent->seq)+start, length);
240 win->seq[length] = '\0';
242 win->seq = parent->seq + start;
245 /*--- ---[setup window implementation parameters]---*/
247 /*--- ---[set local flags]---*/
249 win->rubberwin = FALSE;
250 win->floatwin = FALSE;
251 win->punctuation = FALSE;
253 /*--- ---[initially unconfiguerd window]---*/
256 win->state = (int *) NULL;
257 win->composition = (int *) NULL;
258 win->classvec = (char *) NULL;
259 win->scorevec = (double *) NULL;
266 /*---------------------------------------------------------------(nextwin)---*/
268 extern struct Sequence *nextwin(win, shift)
269 struct Sequence *win;
273 if ((win->start+shift)<0 ||
274 (win->start+win->length+shift)>win->parent->length)
276 return((struct Sequence *) NULL);
280 return(openwin(win->parent, win->start+shift, win->length));
284 /*--------------------------------------------------------------(shiftwin1)---*/
285 static void decrementsv(), incrementsv();
287 extern int shiftwin1(win)
288 struct Sequence *win;
290 register int j, length;
293 length = win->length;
294 comp = win->composition;
296 if ((++win->start + length) > win->parent->length) {
301 if (!aaflag[j = win->seq[0]])
302 decrementsv(win->state, comp[aaindex[j]]--);
304 j = win->seq[length];
308 incrementsv(win->state, comp[aaindex[j]]++);
310 if (win->entropy > -2.)
311 win->entropy = entropy(win->state);
316 /*--------------------------------------------------------------(closewin)---*/
319 struct Sequence *win;
322 if (win==NULL) return;
324 if (win->state!=NULL) free(win->state);
325 if (win->composition!=NULL) free(win->composition);
326 if (win->classvec!=NULL) free(win->classvec);
327 if (win->scorevec!=NULL) free(win->scorevec);
333 /*----------------------------------------------------------------(compon)---*/
336 struct Sequence *win;
340 register char *seq, *seqmax;
342 win->composition = comp = (int *) calloc(20*sizeof(*comp), 1);
344 seqmax = seq + win->length;
346 while (seq < seqmax) {
355 /*---------------------------------------------------------------(stateon)---*/
357 static int state_cmp(s1, s2)
364 struct Sequence *win;
366 register int aa, nel, c;
368 if (win->composition == NULL)
371 win->state = (int *) malloc(21*sizeof(win->state[0]));
373 for (aa = nel = 0; aa < 20; ++aa) {
374 if ((c = win->composition[aa]) == 0)
376 win->state[nel++] = c;
378 for (aa = nel; aa < 21; ++aa)
381 qsort(win->state, nel, sizeof(win->state[0]), state_cmp);
386 /*-----------------------------------------------------------------(enton)---*/
389 struct Sequence *win;
392 if (win->state==NULL) {stateon(win);}
394 win->entropy = entropy(win->state);
399 /*---------------------------------------------------------------(entropy)---*/
400 static int thewindow;
401 static double *entray;
403 #define LN2 0.69314718055994530941723212145818
412 entray = (double *)malloc((window+1) * sizeof(*entray));
414 for (i = 1; i <= window; ++i) {
416 entray[i] = -x * log(x) / LN2;
422 extern double entropy(sv)
427 register int i, total;
429 register double xtotrecip, xsv;
431 for (total = 0; (i = *sv) != 0; ++sv)
435 if (total == thewindow) {
436 for (sv = sv0; sv < svmax; ) {
437 ent += entray[*sv++];
444 xtotrecip = 1./(double)total;
445 for (sv = sv0; sv < svmax; ) {
447 ent += xsv * log(xsv * xtotrecip);
449 return -ent * xtotrecip / LN2;
452 /*-----------------------------------------------------------(decrementsv)---*/
455 decrementsv(sv, class)
461 while ((svi = *sv++) != 0) {
462 if (svi == class && *sv < class) {
469 /*-----------------------------------------------------------(incrementsv)---*/
472 incrementsv(sv, class)
477 if (*sv++ == class) {
484 /*-------------------------------------------------------------(readentry)---*/
486 struct Sequence *readentry(dbase)
487 struct Database *dbase;
489 {struct Sequence *seq;
492 seq = (struct Sequence *) malloc(sizeof(struct Sequence));
496 /*--- ---[backpointers null at the top]---*/
498 seq->parent = (struct Sequence *) NULL;
499 seq->root = (struct Sequence *) NULL;
500 seq->children = (struct Sequence **) NULL;
502 /*--- ---[set flags]---*/
504 seq->rubberwin = FALSE;
505 seq->floatwin = FALSE;
507 /*--- ---[read from file]---*/
511 return((struct Sequence *) NULL);
513 while (1) /*---[skip multiple headers]---*/
519 ungetc(c, dbase->fp);
522 while ((c=getc(dbase->fp)) != EOF && c !='\n')
529 /*--- ---[set implementation parameters]---*/
531 /*--- ---[initially unconfigured]---*/
534 seq->state = (int *) NULL;
535 seq->composition = (int *) NULL;
536 seq->classvec = (char *) NULL;
537 seq->scorevec = (double *) NULL;
542 /*---------------------------------------------------------------(readhdr)---*/
545 struct Sequence *seq;
550 int idend, namend, orgend;
554 if ((c=getc(fp)) == EOF)
560 while (c != EOF && isspace(c))
566 {fprintf(stderr, "Error reading fasta format - '>' not found.\n");
569 /* ---[read the header line]---*/
570 str = (struct strlist *) malloc (sizeof(struct strlist));
574 for (i=0,itotal=0,c=getc(fp); c != EOF; c=getc(fp))
579 {curstr->string[i] = '\0';
580 curstr->next = (struct strlist *) malloc (sizeof(struct strlist));
581 curstr = curstr->next;
585 curstr->string[i] = c;
590 curstr->string[i] = '\0';
591 seq->header = (char *) malloc (itotal+2);
592 seq->header[0] = '\0';
594 for (curstr=str, curpos=seq->header; curstr!=NULL;)
596 if (curstr->next==NULL)
597 {memccpy(curpos, curstr->string, '\0', STRSIZE);}
599 {memccpy(curpos, curstr->string, '\0', STRSIZE-1);}
602 curstr = curstr->next;
605 if (curstr!=NULL) {curpos = curpos+STRSIZE-1;}
608 bptr = (seq->header)+1;
609 seq->name = (char *) NULL;
610 seq->organism = (char *) NULL;
611 /* ---[parse out the id]---*/
612 idend = findchar(bptr, ' ');
613 if (idend==-1) {idend = findchar(bptr, '\n');}
614 if (idend==-1) {idend = findchar(bptr, '\0');}
616 {fprintf(stderr, "Error parsing header line - id.\n");
617 fputs(seq->header, fp);
620 seq->id = (char *) malloc((idend+1)*sizeof(char));
621 memcpy(seq->id, bptr, idend);
622 seq->id[idend] = '\0';
624 if (bptr[idend]=='\n' || bptr[idend]=='\0') {return(TRUE);}
626 /* ---[parse out the protein name]---*/
627 bptr = bptr + idend + 1;
628 while (bptr[0]==' ') {bptr++;}
630 namend = findchar(bptr, '-');
631 if (namend==-1) {namend = findchar(bptr, '\n');}
632 if (namend==-1) {namend = findchar(bptr, '\0');}
634 {fprintf(stderr, "Error parsing header line - name.\n");
635 fputs(seq->header, fp);
638 seq->name = (char *) malloc((namend+1)*sizeof(char));
639 memcpy(seq->name, bptr, namend);
640 seq->name[namend] = '\0';
642 if (bptr[namend]=='\n' || bptr[namend]=='\0') {return(TRUE);}
644 /* ---[parse out organism]---*/
645 bptr = bptr + namend + 1;
646 while (bptr[0]==' ') {bptr++;}
648 orgend = findchar(bptr, '|');
649 if (orgend==-1) {orgend = findchar(bptr, '#');}
650 if (orgend==-1) {orgend = findchar(bptr, '\n');}
651 if (orgend==-1) {orgend = findchar(bptr, '\0');}
653 {fprintf(stderr, "Error parsing header line - organism.\n");
654 fputs(seq->header, fp);
657 seq->organism = (char *) malloc((orgend+1)*sizeof(char));
658 memcpy(seq->organism, bptr, orgend);
659 seq->organism[orgend] = '\0';
661 /* ---[skip over multiple header lines]---*/
681 /*--------------------------------------------------------------(skipline)---*/
688 while ((c=getc(fp))!='\n' && c!=EOF)
694 /*--------------------------------------------------------------(findchar)---*/
696 extern int findchar(str, chr)
715 /*---------------------------------------------------------------(readseq)---*/
718 struct Sequence *seq;
727 seq->punctuation = FALSE;
729 str = (struct strlist *) malloc (sizeof(struct strlist));
733 for (i = 0, itotal = 0, c = getc(fp); c != EOF; c = getc(fp)) {
737 curstr->string[i++] = c;
741 curstr->string[STRSIZE-1] = '\0';
742 curstr->next = (struct strlist *) malloc(sizeof(*curstr));
743 curstr = curstr->next;
745 curstr->string[0] = c;
755 seq->punctuation = TRUE;
758 case 'u': case 'U': /* selenocysteine */
769 curstr->string[i] = '\0';
770 seq->seq = (char *) malloc (itotal+2);
773 for (curstr = str, curpos = seq->seq; curstr != NULL;) {
774 if (curstr->next == NULL)
775 memccpy(curpos, curstr->string, '\0', STRSIZE);
777 memccpy(curpos, curstr->string, '\0', STRSIZE-1);
780 curstr = curstr->next;
784 curpos = curpos+STRSIZE-1;
787 seq->length = strlen(seq->seq);
791 /*-----------------------------------------------------------------(upper)---*/
793 extern upper(string, len)
794 register char *string;
797 register char *stringmax, c;
799 for (stringmax = string + len; string < stringmax; ++string)
800 if (islower(c = *string))
801 *string = toupper(c);
804 /*-----------------------------------------------------------------(lower)---*/
806 extern lower(string, len)
810 register char *stringmax, c;
812 for (stringmax = string + len; string < stringmax; ++string)
813 if (isupper(c = *string))
814 *string = tolower(c);
817 /*-------------------------------------------------------------------(min)---*/
823 if (a<b) {return(a);}
827 /*-------------------------------------------------------------------(max)---*/
833 if (a<b) {return(b);}
837 /*---------------------------------------------------------------------------*/
839 /*---------------------------------------------------------------(tmalloc)---*/
846 ptr = (void *) malloc(size);
853 record_ptrs[rptr] = (int) ptr;
859 /*-----------------------------------------------------------------(tfree)---*/
866 for (i=0; i<rptr; i++)
868 if (record_ptrs[i]==(int)ptr)
878 /*---------------------------------------------------------------------------*/