1 /* mmgetaa.c - functions for mmap()ed access to libraries */
3 /* copyright (c) 1999,2000 William R. Pearson */
5 /* version 0 September, 1999 */
8 This is one of two alternative files that can be used to
9 read a database. The two files are nmgetaa.c, and mmgetaa.c
10 (nxgetaa.c has been retired).
12 nmgetlib.c and mmgetaa.c are used together. nmgetlib.c provides
13 the same functions as nxgetaa.c if memory mapping is not used,
14 mmgetaa.c provides the database reading functions if memory
15 mapping is used. The decision to use memory mapping is made on
19 /* $Name: fa_34_26_5 $ - $Id: mmgetaa.c,v 1.41 2006/04/12 18:00:02 wrp Exp $ */
27 #include <sys/types.h>
37 /* #include "upam.h" */
41 extern int nsfnum; /* number of superfamily numbers */
42 extern int sfnum[10]; /* superfamily number from types 0 and 5 */
44 extern int sfnum_n[10];
45 static char tline[MAXLINE];
57 extern MM_OFF bl2_long8_cvt(int64_t);
58 extern int bl2_uint4_cvt(int);
61 long crck(char *, int);
62 extern void src_int4_read(FILE *fd, int *val);
63 extern void src_long4_read(FILE *fd, long *valp);
64 extern void src_long8_read(FILE *fd, int64_t *val);
66 /* load_mmap() loads the d_pos[] and s_pos[] arrays for rapid access */
69 load_mmap(FILE *libi, /* fd for already open ".xin" file */
70 char *sname, /* name of sequence database file */
71 int lib_type, /* 0-Fasta, 5-vms_pir, 6-gcg_binary */
72 int ldnaseq, /* 1 for DNA, 0 for protein */
81 MM_OFF *d_pos_arr, *s_pos_arr;
82 int mm_flag, mm64_flag;
85 /* first check that the necessary indices are up-to-date */
86 /* read the offsets in ".xin" file */
87 if (fread(format,1,4,libi)==0) {
88 fprintf(stderr," cannot read .xin format\n");
92 mm64_flag = (format[2]==1); /* 4 bytes or 8 bytes for long? */
95 if (mm64_flag) {return NULL;}
98 if (format[3]!=lib_type) {
99 fprintf(stderr," cannot read format %d != lib_type %d\n",
104 src_int4_read(libi,&lib_aa);
105 if (lib_aa == ldnaseq) { /* database residue mismatch */
106 fprintf(stderr," residue type mismatch %s != %s (.xin) in %s\n",
107 (lib_aa ? "DNA" : "prot."),(ldnaseq ? "prot." : "DNA"),
112 /* everything looks good, allocate an lmf_str */
114 m_fd->lib_aa = lib_aa;
116 /* get get file size from index */
117 if (mm64_flag) src_long8_read(libi,&f_size);
119 src_long4_read(libi,&lf_size);
123 /* now, start to open mmap()ed file */
124 mm_flag=((m_fd->mmap_fd=open(sname,O_RDONLY))>=0);
126 fprintf(stderr," cannot open %s for mmap()", sname);
128 return NULL; /* file did not open */
131 /* fstat the library file and get size */
132 if(fstat(m_fd->mmap_fd, &statbuf) < 0) {
133 fprintf(stderr," cannot stat %s for mmap()", sname);
139 /* check for identical sizes - if different, do not mmap */
140 if (f_size != statbuf.st_size) {
141 fprintf(stderr," %s file size (%lld) and expected size (%ld) don't match\n",
142 sname,statbuf.st_size,f_size);
147 /* the index file and library file are open and the sizes match */
148 /* allocate the m_file struct and map the file */
150 m_fd->st_size = statbuf.st_size;
151 if((m_fd->mmap_base =
152 mmap(NULL, m_fd->st_size, PROT_READ,
153 MAP_FILE | MAP_SHARED, m_fd->mmap_fd, 0)) == (char *) -1) {
156 fprintf(stderr," cannot mmap %s", sname);
161 close(m_fd->mmap_fd);
162 if (!mm_flag) { return NULL; }
164 /* now finish reading the index file */
165 src_int4_read(libi,&max_cnt);
168 src_long8_read(libi,&m_fd->tot_len);
171 src_long4_read(libi,&lf_size);
172 m_fd->tot_len = lf_size;
174 src_long4_read(libi,&lf_size);
175 m_fd->max_len = lf_size;
179 "%s\tformat: %c%c%d %d; max_cnt: %d; tot_len: %lld max_len: %ld\n",
180 sname,format[0],format[1],format[2],format[3],
181 max_cnt,m_fd->tot_len,m_fd->max_len);
184 /* allocate array of description pointers */
186 if ((tmp_pos_arr=(int *)calloc(max_cnt+1,sizeof(int)))==NULL) {
187 fprintf(stderr," cannot allocate %d for tmp_pos array\n",
192 if ((d_pos_arr=(MM_OFF *)calloc(max_cnt+1, sizeof(MM_OFF)))==NULL) {
193 fprintf(stderr," cannot allocate %d for desc. array\n",max_cnt+1);
197 /* read m_fd->d_pos[max_cnt+1] */
199 if (fread(d_pos_arr,sizeof(MM_OFF),max_cnt+1,libi)!=
201 fprintf(stderr," error reading desc. offsets: %s\n",sname);
206 if (fread(tmp_pos_arr,sizeof(int),max_cnt+1,libi)!=
208 fprintf(stderr," error reading desc. offsets: %s\n",sname);
212 fprintf(stderr,"d_pos_crc: %ld\n",
213 crck((char *)tmp_pos_arr,sizeof(int)*(max_cnt+1)));
218 #ifndef IS_BIG_ENDIAN
220 for (i=0; i<=max_cnt; i++) {
221 d_pos_arr[i] = bl2_long8_cvt(d_pos_arr[i]);
224 for (i=0; i<=max_cnt; i++) {
225 d_pos_arr[i] = bl2_uint4_cvt(tmp_pos_arr[i]);
229 for (i=0; i<=max_cnt; i++) {
230 d_pos_arr[i] = tmp_pos_arr[i];
236 for (i=0; i<max_cnt-1; i++) {
237 if (d_pos_arr[i+1] <= d_pos_arr[i] )
238 fprintf(stderr," ** dpos_error [%d]\t%ld\t%ld\n",
239 i,d_pos_arr[i],d_pos_arr[i+1]);
243 /* allocate array of sequence pointers */
244 if ((s_pos_arr=(MM_OFF *)calloc(max_cnt+1,sizeof(MM_OFF)))==NULL) {
245 fprintf(stderr," cannot allocate %d for seq. array\n",max_cnt+1);
249 /* read m_fd->s_pos[max_cnt+1] */
251 if (fread(s_pos_arr,sizeof(long),max_cnt+1,libi)!=
253 fprintf(stderr," error reading seq offsets: %s\n",sname);
258 if (fread(tmp_pos_arr,sizeof(int),max_cnt+1,libi)!=
260 fprintf(stderr," error reading seq offsets: %s\n",sname);
264 fprintf(stderr,"s_pos_crc: %ld\n",
265 crck((char *)tmp_pos_arr,sizeof(int)*(max_cnt+1)));
269 #ifndef IS_BIG_ENDIAN
271 for (i=0; i<=max_cnt; i++)
272 s_pos_arr[i] = bl2_long8_cvt(s_pos_arr[i]);
274 for (i=0; i<=max_cnt; i++)
275 s_pos_arr[i] = (long)bl2_uint4_cvt(tmp_pos_arr[i]);
278 for (i=0; i<=max_cnt; i++)
279 s_pos_arr[i] = (long)tmp_pos_arr[i];
283 for (i=1; i<max_cnt-1; i++) {
284 if (s_pos_arr[i+1]<s_pos_arr[i])
285 fprintf(stderr," ** spos_error [%d]\t%ld\t%ld\n",
286 i,s_pos_arr[i],s_pos_arr[i]);
290 if (!mm64_flag) free(tmp_pos_arr);
292 m_fd->max_cnt = max_cnt;
293 m_fd->d_pos_arr = d_pos_arr;
294 m_fd->s_pos_arr = s_pos_arr;
297 /* check_mmap(m_fd,-2); */
302 char *mgets (char *s, int n, struct lmf_str *m_fd)
306 mfp = m_fd->mmap_addr;
309 while (--n > 0 && (*mfp != (char)EOF))
310 if ((*cs++ = *mfp++) == '\n') break;
313 m_fd->mmap_addr = mfp;
314 return (*mfp == (char)EOF && cs == s) ? NULL : s;
318 agetlibm(unsigned char *seq,
324 struct lmf_str *m_fd,
327 register unsigned char *cp, *seqp;
330 int lpos; /* entry number in library */
332 unsigned char *seqm, *seqm1;
335 static unsigned char *cp_max;
337 char *bp1, *bpa, *tp;
352 if (lpos >= m_fd->max_cnt) return (-1);
353 seq_len = m_fd->d_pos_arr[lpos+1] - m_fd->s_pos_arr[lpos];
354 if (seq_len < 0 || (seq_len > m_fd->max_len && seq_len > (m_fd->max_len*5)/4)) {
355 fprintf(stderr," ** sequence over-run: %ld at %d\n",seq_len,lpos);
358 *libpos = (fseek_t)lpos;
360 desc = m_fd->mmap_base+m_fd->d_pos_arr[lpos]+1;
361 strncpy(libstr,desc,n_libstr-1);
362 libstr[n_libstr-1]='\0';
363 if ((bp=strchr(libstr,'\r'))!=NULL) *bp='\0';
364 if ((bp=strchr(libstr,'\n'))!=NULL) *bp='\0';
365 if (n_libstr > MAX_UID) {
367 while (*bp++) if ( *bp=='\001' || *bp=='\t') *bp=' ';
370 for (bp = desc; *bp && (*bp != '\n'); *bp++ )
371 if (*bp == '@' && !strncmp(bp+1,"C:",2)) sscanf(bp+3,"%ld",l_off);
375 strncpy(tline,desc,sizeof(tline));
376 tline[MAXLINE-1]='\0';
377 if ((bp=strchr(tline,'\n'))!=NULL) *bp='\0';
378 if ((bp=strchr(tline,' ')) && (bp=strchr(bp+1,SFCHAR))) {
379 if ((bpa = strchr(bp+1,'\001'))!=NULL) *bpa = '\0';
380 if ((bp1=strchr(bp+1,SFCHAR))==NULL) {
381 fprintf(stderr," second %c missing: %s\n",SFCHAR,tline);
386 if ((tp = strtok(bp+1," \t"))!=NULL) {
387 sfnum[i++] = atoi(tp);
388 while ((tp = strtok((char *)NULL," \t")) != (char *)NULL) {
389 sfnum[i++] = atoi(tp);
394 if (nsfnum>1) sf_sort(sfnum,nsfnum);
396 if (nsfnum<1) fprintf(stderr," found | but no sfnum: %s\n",libstr);
401 sfnum[0] = nsfnum = 0;
405 m_fd->mmap_addr = m_fd->mmap_base+m_fd->s_pos_arr[lpos];
406 cp_max = (unsigned char *)(m_fd->mmap_addr+seq_len);
409 for (cp=(unsigned char *)m_fd->mmap_addr; seqp<seqm1; ) {
410 if ((*seqp++=ap[*cp++])<NA &&
411 (*seqp++=ap[*cp++])<NA &&
412 (*seqp++=ap[*cp++])<NA &&
413 (*seqp++=ap[*cp++])<NA &&
414 (*seqp++=ap[*cp++])<NA &&
415 (*seqp++=ap[*cp++])<NA &&
416 (*seqp++=ap[*cp++])<NA &&
417 (*seqp++=ap[*cp++])<NA &&
418 (*seqp++=ap[*cp++])<NA &&
419 (*seqp++=ap[*cp++])<NA) continue;
421 if (cp >= cp_max) break;
423 m_fd->mmap_addr = (char *)cp;
425 if (seqp>=seqm1) (*lcont)++;
432 /* if ((int)(seqp-seq)==0) return 1; */
433 return (int)(seqp-seq);
441 struct lmf_str *m_fd)
449 llen = m_fd->s_pos_arr[lpos]-m_fd->d_pos_arr[lpos];
450 if (llen >= cnt) llen = cnt-1;
452 strncpy(str,m_fd->mmap_base+m_fd->d_pos_arr[lpos]+1,llen);
454 if ((bp = strchr(str,'\r'))!=NULL) *bp='\0';
455 if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
457 while (*bp++) if ( *bp=='\001' || *bp=='\t') *bp=' ';
461 /* there is no vgetlibm() because vgetlibm() and agetlibm() are
462 identical - the difference in the two file formats relates to the
463 location of the sequence, which is already available in spos_arr[].
465 however vranlibm must accomodate both type 5 and 6 files;
466 type 6 has extra stuff after the seq_id.
474 struct lmf_str *m_fd)
482 llen = m_fd->s_pos_arr[lpos]-m_fd->d_pos_arr[lpos];
484 mp = m_fd->mmap_base+m_fd->d_pos_arr[lpos];
486 strncpy(str,mp+4,20);
488 if ((bp=strchr(str,' '))!=NULL) *(bp+1) = '\0';
489 else if ((bp=strchr(str,'\n'))!=NULL) *bp = ' ';
490 bp = strchr(mp,'\n');
493 if (llen > cnt-strlen(str)) llen = cnt-strlen(str)-1;
495 strncat(str,bp+1,llen);
496 if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
502 close_mmap(struct lmf_str *m_fd) {
503 free(m_fd->s_pos_arr);
504 free(m_fd->d_pos_arr);
506 munmap(m_fd->mmap_base,m_fd->st_size);
513 #define min(x,y) ((x) > (y) ? (y) : (x))
516 static int gcg_bton[4]={2,4,1,3};
519 gcg_getlibm(unsigned char *seq,
525 struct lmf_str *m_fd,
531 register unsigned char *cp, *seqp, stmp;
532 register int *ap, lpos;
533 unsigned char *seqm, *seqm1;
534 long r_block, b_block, r_fact, r16_block;
546 if (lpos >= m_fd->max_cnt) return (-1);
547 sscanf(m_fd->mmap_base+m_fd->d_pos_arr[lpos]+4,"%s %s %s %s %ld\n",
548 libstr,gcg_date,gcg_type,dummy,&(m_fd->gcg_len));
550 m_fd->gcg_binary = (gcg_type[0]=='2');
554 m_fd->mmap_addr = m_fd->mmap_base+m_fd->s_pos_arr[lpos];
557 r_block = b_block = min((size_t)(seqm-seqp),m_fd->gcg_len);
558 if (m_fd->gcg_binary) {
559 r_block = (r_block+3)/4;
562 cp=(unsigned char *)m_fd->mmap_addr;
563 if (!m_fd->gcg_binary) {
565 r16_block = r_block/16;
566 while (r16_block-- > 0) {
584 while (seqp<seq+r_block) *seqp++ = ap[*cp++];
586 else if (m_fd->gcg_binary) {
588 r16_block = r_block/8;
589 while(r16_block-- > 0) {
591 *seqp++ = gcg_bton[(stmp>>6) &3];
592 *seqp++ = gcg_bton[(stmp>>4) &3];
593 *seqp++ = gcg_bton[(stmp>>2) &3];
594 *seqp++ = gcg_bton[(stmp) &3];
596 *seqp++ = gcg_bton[(stmp>>6) &3];
597 *seqp++ = gcg_bton[(stmp>>4) &3];
598 *seqp++ = gcg_bton[(stmp>>2) &3];
599 *seqp++ = gcg_bton[(stmp) &3];
601 *seqp++ = gcg_bton[(stmp>>6) &3];
602 *seqp++ = gcg_bton[(stmp>>4) &3];
603 *seqp++ = gcg_bton[(stmp>>2) &3];
604 *seqp++ = gcg_bton[(stmp) &3];
606 *seqp++ = gcg_bton[(stmp>>6) &3];
607 *seqp++ = gcg_bton[(stmp>>4) &3];
608 *seqp++ = gcg_bton[(stmp>>2) &3];
609 *seqp++ = gcg_bton[(stmp) &3];
611 *seqp++ = gcg_bton[(stmp>>6) &3];
612 *seqp++ = gcg_bton[(stmp>>4) &3];
613 *seqp++ = gcg_bton[(stmp>>2) &3];
614 *seqp++ = gcg_bton[(stmp) &3];
616 *seqp++ = gcg_bton[(stmp>>6) &3];
617 *seqp++ = gcg_bton[(stmp>>4) &3];
618 *seqp++ = gcg_bton[(stmp>>2) &3];
619 *seqp++ = gcg_bton[(stmp) &3];
621 *seqp++ = gcg_bton[(stmp>>6) &3];
622 *seqp++ = gcg_bton[(stmp>>4) &3];
623 *seqp++ = gcg_bton[(stmp>>2) &3];
624 *seqp++ = gcg_bton[(stmp) &3];
626 *seqp++ = gcg_bton[(stmp>>6) &3];
627 *seqp++ = gcg_bton[(stmp>>4) &3];
628 *seqp++ = gcg_bton[(stmp>>2) &3];
629 *seqp++ = gcg_bton[(stmp) &3];
632 while (seqp < seq+4*r_block) {
634 *seqp++ = gcg_bton[(stmp>>6) &3];
635 *seqp++ = gcg_bton[(stmp>>4) &3];
636 *seqp++ = gcg_bton[(stmp>>2) &3];
637 *seqp++ = gcg_bton[(stmp) &3];
640 if (r_fact * r_block >= m_fd->gcg_len) {
645 if (m_fd->gcg_binary) b_block = 4*r_block;
646 m_fd->gcg_len -= b_block;
650 seq[b_block] = EOSEQ;
651 /* if (b_block==0) return 1; else */
655 void lget_ann_m(struct lmf_str *lm_fd, char *libstr, int n_libstr);
658 lgetlibm(unsigned char *seq,
664 struct lmf_str *m_fd,
667 register unsigned char *cp, *seqp;
668 register int *ap, lpos;
669 unsigned char *seqm, *seqm1;
674 seqm = &seq[maxs-11];
681 if (lpos >= m_fd->max_cnt) return (-1);
683 if (n_libstr <= 21) {
684 strncpy(libstr,m_fd->mmap_base+m_fd->d_pos_arr[lpos]+12,12);
688 lget_ann_m(m_fd,libstr,n_libstr);
692 m_fd->mmap_addr = m_fd->mmap_base+m_fd->s_pos_arr[lpos];
693 cp = (unsigned char *)m_fd->mmap_addr;
695 else cp = (unsigned char *)m_fd->mmap_addr;
698 if (*cp=='/' && *(cp-1)=='\n') break;
699 if ((*seqp++=ap[*cp++])<NA &&
700 (*seqp++=ap[*cp++])<NA &&
701 (*seqp++=ap[*cp++])<NA &&
702 (*seqp++=ap[*cp++])<NA &&
703 (*seqp++=ap[*cp++])<NA &&
704 (*seqp++=ap[*cp++])<NA &&
705 (*seqp++=ap[*cp++])<NA &&
706 (*seqp++=ap[*cp++])<NA &&
707 (*seqp++=ap[*cp++])<NA &&
708 (*seqp++=ap[*cp++])<NA &&
709 (*seqp++=ap[*cp++])<NA) continue;
711 if (*cp=='\n' && *(cp+1)==' ') cp += 11;
716 m_fd->mmap_addr = (char *)cp;
724 return (int)(seqp-seq);
728 lget_ann_m(struct lmf_str *lm_fd, char *libstr, int n_libstr) {
729 char *bp, *bp_gid, locus[120], desc[120], acc[120], ver[120];
731 /* copy in locus from lm_fd->lline */
732 strncpy(locus,&lm_fd->mmap_addr[12],sizeof(locus));
733 if ((bp=strchr(locus,' '))!=NULL) *(bp+1) = '\0';
735 /* get description */
736 mgets(desc,sizeof(desc),lm_fd);
737 while (desc[0]!='D' || desc[1]!='E' || strncmp(desc,"DEFINITION",10))
738 mgets(desc,sizeof(desc),lm_fd);
739 if ((bp = strchr(&desc[12],'\n'))!=NULL) *bp='\0';
742 mgets(acc,sizeof(acc),lm_fd);
743 while (acc[0]!='A' || acc[1]!='C' || strncmp(acc,"ACCESSION",9)) {
744 mgets(acc,sizeof(acc),lm_fd);
745 if (acc[0]=='O' && acc[1]=='R' && strncmp(acc,"ORIGIN",6)==0)
748 if ((bp = strchr(&acc[12],'\n'))!=NULL) *bp='\0';
749 if ((bp = strchr(&acc[12],' '))!=NULL) *bp='\0';
752 mgets(ver,sizeof(ver),lm_fd);
753 while (ver[0]!='V' || ver[1]!='E' || strncmp(ver,"VERSION",7)) {
754 mgets(ver,sizeof(ver),lm_fd);
755 if (ver[0]=='O' && ver[1]=='R' && strncmp(ver,"ORIGIN",6)==0)
758 if ((bp = strchr(&ver[12],'\n'))!=NULL) *bp='\0';
760 /* extract gi:123456 from version line */
761 bp_gid = strchr(&ver[12],':');
762 if (bp_gid != NULL) {
763 if ((bp=strchr(bp_gid+1,' '))!=NULL) *bp='\0';
766 if ((bp = strchr(&ver[12],' '))!=NULL) *bp='\0';
768 /* build up FASTA header line */
769 if (bp_gid != NULL) {
770 strncpy(libstr,"gi|",n_libstr-1);
771 strncat(libstr,bp_gid,n_libstr-4);
772 strncat(libstr,"|gb|",n_libstr-20);
774 else {libstr[0]='\0';}
776 /* if we have a version number, use it, otherwise accession,
777 otherwise locus/description */
780 strncat(libstr,&ver[12],n_libstr-1-strlen(libstr));
781 strncat(libstr,"|",n_libstr-1-strlen(libstr));
783 else if (acc[0]=='A') {
784 strncat(libstr,&acc[12],n_libstr-1-strlen(libstr));
785 strncat(libstr," ",n_libstr-1-strlen(libstr));
788 strncat(libstr,locus,n_libstr-1-strlen(libstr));
789 strncat(libstr,&desc[11],n_libstr-1-strlen(libstr));
790 libstr[n_libstr-1]='\0';
798 struct lmf_str *m_fd)
801 char acc[MAXLINE], desc[MAXLINE];
803 llp = m_fd->mmap_addr = m_fd->mmap_base + m_fd->d_pos_arr[seek];
805 lget_ann_m(m_fd,str,cnt);
812 static int check_status=0;
815 check_mmap(struct lmf_str *m_fd,long ntt) {
817 int i, seq_len, ok_stat;
820 if ( ++check_status > 5) return;
822 fprintf(stderr," ** checking %s %ld**\n", m_fd->lb_name,ntt);
823 for (i=0; i<m_fd->max_cnt; i++) {
824 seq_len = m_fd->d_pos_arr[i+1] - m_fd->s_pos_arr[i];
825 if (seq_len < 0 || (seq_len > m_fd->max_len && seq_len > (m_fd->max_len*5)/4)) {
826 fprintf(stderr,"%d:\t%ld\t%ld\t%ld\n",
827 i,m_fd->d_pos_arr[i],m_fd->s_pos_arr[i],
828 m_fd->d_pos_arr[i+1]-m_fd->s_pos_arr[i]);
833 if (check_status) fprintf(stderr," ** check_mmap OK %s %ld**\n",
839 /* C H K 3 -- Compute a type-3 Kermit block check. */
841 Calculate the 16-bit CRC of a null-terminated string using a byte-oriented
842 tableless algorithm invented by Andy Lowry (Columbia University). The
843 magic number 010201 is derived from the CRC-CCITT polynomial x^16+x^12+x^5+1.
844 Note - this function could be adapted for strings containing imbedded 0's
845 by including a length argument.
858 q = (crc ^ c) & 017; /* Low-order nibble */
859 crc = (crc >> 4) ^ (q * 010201);
860 q = (crc ^ (c >> 4)) & 017; /* High order nibble */
861 crc = (crc >> 4) ^ (q * 010201);