1 /* ncbl2_lib.c functions to read ncbi-blast format files from
2 formatdb (blast2.0 format files)
4 copyright (c) 1999 William R. Pearson
7 /* $Name: fa_34_26_5 $ - $Id: ncbl2_mlib.c,v 1.56 2007/04/02 18:08:11 wrp Exp $ */
9 /* to turn on mmap()ing for Blast2 files: */
14 #include <sys/types.h>
23 /* ****************************************************************
27 Modified to read NCBI .[np]al and .msk files. The .nal or .pal file
28 provides a way to read sequences from a list of files. The .msk file
29 provides a compact way of indicating the subset of sequences in a
30 larger database (typically nr or nt) that comprise a smaller database
31 (e.g. swissprot or pdbaa). A .pal file (e.g. swissprot.00.pal) that
32 uses a .msk file has the form:
34 # Alias file generated by genmask
35 # Date created: Mon Apr 10 11:24:05 2006
37 TITLE Non-redundant SwissProt sequences
39 OIDLIST swissprot.00.msk
46 To work with this file, we must first load the nr.00 file, and then
47 read the swissprot.00.msk file, and then scan all the entries in the
48 swissprot.00.msk file (which are packed 32 mask-bit to an int) to
49 determine whether a specific libpos index entry is present in the
52 **************************************************************** */
55 /* ****************************************************************
56 This code reads NCBI Blast2 format databases from formatdb version 3 and 4
58 (From NCBI) This section describes the format of the databases.
60 Formatdb creates three main files for proteins containing indices,
61 sequences, and headers with the extensions, respectively, of pin, psq,
62 and phr (for nucleotides these are nin, nsq, and nhr). A number of
63 other ISAM indices are created, but these are described elsewhere.
65 FORMAT OF THE INDEX FILE
66 ------------------------
68 1.) formatdb version number [4 bytes].
70 2.) protein dump flag (1 for a protein database, 0 for a nucleotide
73 3.) length of the database title in bytes [4 bytes].
74 4.) the database title [length given in 3.)].
75 5.) length of the date/time string [4 bytes].
76 6.) the date/time string [length given in 5.)].
77 7.) the number of sequences in the database [4 bytes].
78 8.) the total length of the database in residues/basepairs [4 bytes].
79 9.) the length of the longest sequence in the database [4 bytes].
81 10.) a list of the offsets for definitions (one for each sequence) in
82 the header file. There are num_of_seq+1 of these, where num_of_seq is
83 the number of sequences given in 7.).
85 11.) a list of the offsets for sequences (one for each sequence) in
86 the sequence file. There are num_of_seq+1 of these, where num_of_seq
87 is the number of sequences given in 7.).
89 12.) a list of the offsets for the ambiguity characters (one for each
90 sequence) in the sequence file. This list is only present for
91 nucleotide databases and, since the database is compressed 4/1 for
92 nucleotides, allows the ambiguity characters to be restored when the
93 sequence is generated. There are num_of_seq+1 of these, where
94 num_of_seq is the number of sequences given in 7.).
97 FORMAT OF THE SEQUENCE FILE
98 ---------------------------
100 There are different formats for the protein and nucleotide sequence files.
102 The protein sequence files is quite simple. The first byte in the
103 file is a NULL byte, followed by the sequence in ncbistdaa format
104 (described in the NCBI Software Development Toolkit documentation).
105 Following the sequence is another NULL byte, followed by the next
106 sequence. The file ends with a NULL byte, following the last
109 The nucleotide sequence file contains the nucleotide sequence, with
110 four basepairs compressed into one byte. The format used is NCBI2na,
111 documented in the NCBI Software Development Toolkit manual. Any
112 ambiguity characters present in the original sequence are replaced at
113 random by A, C, G or T. The true value of ambiguity characters are
114 stored at the end of each sequence to allow true reproduction of the
117 FORMAT OF THE HEADER FILE (formatdb version 3)
118 -------------------------
120 The format of the header file depends on whether or not the identifiers in the
121 original file were parsed or not. For the case that they were not, then each
122 entry has the format:
124 gnl|BL_ORD_ID|entry_number my favorite yeast sequence...
126 Here entry_number gives the ordinal number of the sequence in the
127 database (with zero offset). The identifier
128 gnl|BL_ORD_ID|entry_number is used by the BLAST software to identify
129 the entry, if the user has not provided another identifier. If the
130 identifier was parsed, then gnl|BL_ORD_ID|entry_number is replaced by
131 the correct identifier, as described in
132 ftp://ncbi.nlm.nih.gov/blast/db/README .
134 There are no separators between these deflines.
136 For formatdb version 4, the header file contains blast ASN.1 binary
137 deflines, which can parsed with parse_fastadl_asn().
139 FORMAT OF THE .MSK FILE
140 -----------------------
142 The .msk file is simply a packed list of masks for formatdb "oids" for
143 some other file (typically nr). The first value is the last oid
144 available; the remainder are packed 32 oids/mask, so that the number
145 of masks is 1/32 the number of sequences in the file.
147 **************************************************************** */
150 #include <sys/types.h>
151 #include <sys/stat.h>
152 #include <sys/mman.h>
156 #include <sys/fcntl.h>
173 #define SLASH_CHAR '\\'
174 #define SLASH_STR "\\"
176 #define SLASH_CHAR '/'
177 #define SLASH_STR "/"
185 #include "ncbl2_head.h"
190 unsigned int bl2_uint4_cvt(unsigned int);
191 unsigned int bl2_long4_cvt(long);
192 int64_t bl2_long8_cvt(int64_t);
193 void src_int4_read(FILE *fd, int *valp);
194 void src_uint4_read(FILE *fd, unsigned int *valp);
195 void src_long4_read(FILE *fd, long *valp);
196 void ncbi_long8_read(FILE *fd, int64_t *valp);
197 void src_char_read(FILE *fd, char *valp);
198 unsigned char *parse_fastadl_asn(unsigned char *asn_buff, unsigned char *asn_max,
199 int *gi_p, int *db, char *acc, char *name,
200 char *title, int t_len, int *taxid);
202 /* nt_btoa maps from blast 2bit format to ascii characters */
203 static char nt_btoa[5] = {"ACGT"};
205 static char aa_b2toa[27]= {"-ABCDEFGHIKLMNPQRSTVWXYZU*"};
207 static int aa_btof[32]; /* maps to fasta alphabet */
209 static int dbtype, dbformat, amb_cnt;
213 int ncbl2_getliba(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
214 int ncbl2_getlibn(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
216 int ncbl2_getliba_o(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
217 int ncbl2_getlibn_o(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
219 void newname(char *, char *, char *, int);
220 void parse_pal(char *, char *, int *, int *, FILE *);
222 void ncbl2_ranlib(char *, int, fseek_t, char *, struct lmf_str *m_fd);
224 /* ncbl2_openlib() is used to open (and memory map) a BLAST2.0 format
225 file. Ifdef USE_MMAP, then ncbl2_openlib returns a structure that can
226 be used to read the database. */
229 ncbl2_openlib(char *name, int ldnaseq)
241 char *title_str=NULL;
246 int oid_seqs, max_oid;
247 int oid_cnt, oid_len;
248 unsigned int *oid_list, o_max;
254 FILE *ifile; /* index offsets, also DB info */
255 unsigned int *f_pos_arr;
256 struct lmf_str *m_fptr;
258 if (ldnaseq==SEQT_PROT) { /* read a protein database */
259 newname(lname,name,AA_LIST_EXT,(int)sizeof(lname));
260 newname(tname,name,AA_INDEX_EXT,(int)sizeof(tname));
261 newname(hname,name,AA_HEADER_EXT,(int)sizeof(hname));
262 newname(sname,name,AA_SEARCHSEQ_EXT,(int)sizeof(sname));
264 /* initialize map of BLAST2 amino acids to FASTA amino acids */
265 for (i=0; i<sizeof(aa_b2toa); i++) {
266 if ((tmp=aascii[aa_b2toa[i]])<NA) aa_btof[i]=tmp;
267 else if (aa_b2toa[i]=='*') aa_btof[i]=aascii['X'];
269 /* else aa_btof[i]=aascii['X']; */
272 else { /* reading DNA library */
273 newname(lname,name,NT_LIST_EXT,(int)sizeof(lname));
274 newname(tname,name,NT_INDEX_EXT,(int)sizeof(tname));
275 newname(hname,name,NT_HEADER_EXT,(int)sizeof(hname));
276 newname(sname,name,NT_SEARCHSEQ_EXT,(int)sizeof(sname));
280 /* check first for list name */
281 max_oid = oid_seqs = 0;
283 if ((ifile = fopen(lname,"r"))!=NULL) {
285 if ((bp = strrchr(name,SLASH_CHAR))!=NULL) {
287 strncpy(db_dir,name,sizeof(db_dir));
288 strncat(db_dir,SLASH_STR,sizeof(db_dir)-strlen(db_dir)-1);
295 /* we have a list file, we need to parse it */
296 parse_pal(dname, msk_name, &oid_seqs, &max_oid, ifile);
302 /* get the pref_db before adding the directory */
303 if (strncmp(msk_name,"swissprot",9)==0) {
306 else if (strncmp(msk_name,"pdbaa",5)==0) {
310 /* need to add directory to both dname and msk_name */
311 strncpy(tname,db_dir,sizeof(tname));
312 strncat(tname,msk_name, sizeof(tname));
313 strncpy(msk_name, tname, sizeof(msk_name));
315 strncpy(tname,db_dir,sizeof(tname));
316 strncat(tname,dname, sizeof(tname));
317 strncpy(dname,tname,sizeof(dname));
319 if (ldnaseq == SEQT_PROT) {
320 newname(tname,dname,AA_INDEX_EXT,(int)sizeof(tname));
321 newname(hname,dname,AA_HEADER_EXT,(int)sizeof(hname));
322 newname(sname,dname,AA_SEARCHSEQ_EXT,(int)sizeof(sname));
324 else { /* reading DNA library */
325 newname(tname,dname,NT_INDEX_EXT,(int)sizeof(tname));
326 newname(hname,dname,NT_HEADER_EXT,(int)sizeof(hname));
327 newname(sname,dname,NT_SEARCHSEQ_EXT,(int)sizeof(sname));
329 /* now load the oid file */
330 if ((ifile = fopen(msk_name,RBSTR))==NULL) {
331 fprintf(stderr,"error - cannot load %s file\n",msk_name);
335 src_uint4_read(ifile,&o_max);
336 if (o_max != max_oid) {
337 fprintf(stderr," error - oid count mismatch %d != %d\n",max_oid, o_max);
339 oid_len = (max_oid/32+1);
340 if ((oid_list=(unsigned int *)calloc(oid_len,sizeof(int)))==NULL) {
341 fprintf(stderr," error - cannot allocate oid_list[%d]\n",oid_len);
344 if ((oid_cnt=fread(oid_list,sizeof(int),oid_len,ifile))==0) {
345 fprintf(stderr," error - cannot read oid_list[%d]\n",oid_len);
351 else { /* we had a .msk file, but there are no oid's in it.
352 allocate an m_fptr and return it empty */
353 if ((m_fptr=(struct lmf_str *)calloc(1,sizeof(struct lmf_str)))==NULL) {
354 fprintf(stderr," cannot allocate lmf_str\n");
358 m_fptr->tmp_buf_max = 0;
360 /* load the oid info */
362 m_fptr->oid_seqs = 0;
363 m_fptr->oid_list = (unsigned int *)calloc(1,sizeof(int));
366 if (ldnaseq==SEQT_DNA) {
367 m_fptr->getlib = ncbl2_getlibn_o;
368 m_fptr->sascii = nascii;
371 m_fptr->getlib = ncbl2_getliba_o;
372 m_fptr->sascii = aascii;
374 strncpy(m_fptr->lb_name,sname,MAX_FN);
379 /* open the index file */
380 if ((ifile = fopen(tname,RBSTR))==NULL) {
381 fprintf(stderr," cannot open %s (%s) INDEX file",tname,name);
385 src_uint4_read(ifile,(unsigned *)&dbformat); /* get format DB version number */
386 src_uint4_read(ifile,(unsigned *)&dbtype); /* get 1 for protein/0 DNA */
388 if (dbformat != FORMATDBV3 && dbformat!=FORMATDBV4) {
389 fprintf(stderr,"error - %s wrong formatdb version (%d/%d)\n",
390 tname,dbformat,FORMATDBV3);
394 if ((ldnaseq==SEQT_PROT && dbtype != AAFORMAT) ||
395 (ldnaseq==SEQT_DNA && dbtype!=NTFORMAT)) {
396 fprintf(stderr,"error - %s wrong format (%d/%d)\n",
397 tname,dbtype,(ldnaseq ? NTFORMAT: AAFORMAT));
401 /* the files are there - allocate lmf_str */
403 if ((m_fptr=(struct lmf_str *)calloc(1,sizeof(struct lmf_str)))==NULL) {
404 fprintf(stderr," cannot allocate lmf_str\n");
408 m_fptr->tmp_buf_max = 4096;
409 if ((m_fptr->tmp_buf=
410 (char *)calloc(m_fptr->tmp_buf_max,sizeof(char)))==NULL) {
411 fprintf(stderr," cannot allocate lmf_str->tmp_buffer\n");
415 /* load the oid info */
416 m_fptr->max_oid = max_oid;
417 m_fptr->oid_seqs = oid_seqs;
418 m_fptr->oid_list = oid_list;
419 m_fptr->pref_db= pref_db;
421 /* open the header file */
422 if ((m_fptr->hfile = fopen(hname,RBSTR))==NULL) {
423 fprintf(stderr," cannot open %s header file\n",hname);
427 /* ncbl2_ranlib is used for all BLAST2.0 access */
428 m_fptr->ranlib = ncbl2_ranlib;
429 m_fptr->bl_format_ver = dbformat;
431 if (ldnaseq==SEQT_DNA) {
433 m_fptr->getlib = ncbl2_getlibn_o;
436 m_fptr->getlib = ncbl2_getlibn;
438 m_fptr->sascii = nascii;
442 m_fptr->getlib = ncbl2_getliba_o;
445 m_fptr->getlib = ncbl2_getliba;
447 m_fptr->sascii = aascii;
449 strncpy(m_fptr->lb_name,sname,MAX_FN);
451 /* open the sequence file */
453 #if defined (USE_MMAP)
454 m_fptr->mm_flg=((m_fptr->mmap_fd=open(sname,O_RDONLY))>=0);
455 if (!m_fptr->mm_flg) {
456 fprintf(stderr," cannot open %s",sname);
460 if(fstat(m_fptr->mmap_fd, &statbuf) < 0) {
461 fprintf(stderr," cannot fstat %s",sname);
466 m_fptr->st_size = statbuf.st_size;
467 if((m_fptr->mmap_base =
468 mmap(NULL, m_fptr->st_size, PROT_READ,
469 MAP_FILE | MAP_SHARED, m_fptr->mmap_fd, 0)) == (char *) -1) {
470 fprintf(stderr," cannot mmap %s",sname);
475 m_fptr->mmap_addr = m_fptr->mmap_base;
479 /* regardless, close the open()ed version */
480 close(m_fptr->mmap_fd);
486 if (!m_fptr->mm_flg) {
487 if ((m_fptr->libf = fopen(sname,RBSTR))==NULL) {
488 fprintf(stderr," cannot open %s sequence file",sname);
494 /* all files should be open */
496 src_uint4_read(ifile,(unsigned *)&title_len);
499 if ((title_str = calloc((size_t)title_len+1,sizeof(char)))==NULL) {
500 fprintf(stderr," cannot allocate title string (%d)\n",title_len);
503 fread(title_str,(size_t)1,(size_t)title_len,ifile);
506 src_uint4_read(ifile,(unsigned *)&date_len);
509 if ((date_str = calloc((size_t)date_len+1,sizeof(char)))==NULL) {
510 fprintf(stderr," cannot allocate date string (%d)\n",date_len);
513 fread(date_str,(size_t)1,(size_t)date_len,ifile);
517 src_uint4_read(ifile,(unsigned *)&m_fptr->max_cnt);
519 if (dbformat == FORMATDBV3) {
520 src_long4_read(ifile,<mp);
521 m_fptr->tot_len = ltmp;
524 ncbi_long8_read(ifile,&l8tmp);
525 m_fptr->tot_len = ltmp;
528 src_long4_read(ifile,<mp);
529 m_fptr->max_len = ltmp;
531 /* currently we are not using this information, but perhaps later */
532 if (title_str!=NULL) free(title_str);
533 if (date_str!=NULL) free(date_str);
536 fprintf(stderr,"%s format: BL2 (%s) max_cnt: %d, totlen: %lld, maxlen %ld\n",
537 name,m_fptr->mm_flg ? "mmap" : "fopen",
538 m_fptr->max_cnt,m_fptr->tot_len,m_fptr->max_len);
541 /* allocate and read hdr indexes */
542 if ((f_pos_arr=(unsigned int *)calloc((size_t)m_fptr->max_cnt+1,sizeof(int)))==NULL) {
543 fprintf(stderr," cannot allocate tmp header pointers\n");
547 if ((m_fptr->d_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) {
548 fprintf(stderr," cannot allocate header pointers\n");
552 /* allocate and read sequence offsets */
553 if ((m_fptr->s_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) {
554 fprintf(stderr," cannot allocate sequence pointers\n");
559 for (i=0; i<=m_fptr->max_cnt; i++) src_uint4_read(ifile,&m_fptr->d_pos_arr[i]);
560 for (i=0; i<=m_fptr->max_cnt; i++) src_uint4_read(ifile,&m_fptr->s_pos_arr[i]);
562 if (fread(f_pos_arr,(size_t)4,m_fptr->max_cnt+1,ifile)!=m_fptr->max_cnt+1) {
563 fprintf(stderr," error reading hdr offsets: %s\n",tname);
567 for (i=0; i<=m_fptr->max_cnt; i++)
569 m_fptr->d_pos_arr[i] = f_pos_arr[i];
571 m_fptr->d_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]);
574 if (fread(f_pos_arr,(size_t)4,m_fptr->max_cnt+1,ifile)!=m_fptr->max_cnt+1) {
575 fprintf(stderr," error reading seq offsets: %s\n",tname);
578 for (i=0; i<=m_fptr->max_cnt; i++) {
580 m_fptr->s_pos_arr[i] = f_pos_arr[i];
582 m_fptr->s_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]);
586 if (dbtype == NTFORMAT) {
587 /* allocate and ambiguity offsets */
588 if ((m_fptr->a_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) {
589 fprintf(stderr," cannot allocate sequence pointers\n");
594 for (i=0; i<=m_fptr->max_cnt; i++) src_uint4_read(ifile,&m_fptr->a_pos_arr[i]);
597 if (fread(f_pos_arr,(size_t)4,m_fptr->max_cnt+1,ifile)!=m_fptr->max_cnt+1) {
598 fprintf(stderr," error reading seq offsets: %s\n",tname);
601 for (i=0; i<=m_fptr->max_cnt; i++) {
603 m_fptr->a_pos_arr[i] = f_pos_arr[i];
605 m_fptr->a_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]);
611 for (i=0; i < min(m_fptr->max_cnt,10); i++) {
612 fprintf(stderr,"%d: %d %d %d\n",i,m_fptr->s_pos_arr[i],m_fptr->a_pos_arr[i],m_fptr->d_pos_arr[i]);
616 /* all done with ifile, close it */
621 if (!m_fptr->mm_flg) {
622 tmp = fgetc(m_fptr->libf);
624 fprintf(stderr," phase error: %d:%d found\n",0,tmp);
627 m_fptr->bl_lib_pos = 1;
632 /* here if failure after m_fptr allocated */
637 void ncbl2_closelib(struct lmf_str *m_fptr)
639 if (m_fptr->tmp_buf != NULL) {
640 free(m_fptr->tmp_buf);
641 m_fptr->tmp_buf_max = 0;
644 if (m_fptr->s_pos_arr !=NULL) {
645 free(m_fptr->s_pos_arr);
646 m_fptr->s_pos_arr = NULL;
648 if (m_fptr->a_pos_arr!=NULL) {
649 free(m_fptr->a_pos_arr);
650 m_fptr->a_pos_arr = NULL;
653 if (m_fptr->hfile !=NULL ) {
654 fclose(m_fptr->hfile); m_fptr->hfile=NULL;
655 free(m_fptr->d_pos_arr); m_fptr->d_pos_arr = NULL;
658 if (m_fptr->oid_list != NULL) {
659 free(m_fptr->oid_list); m_fptr->oid_list = NULL;
660 m_fptr->oid_seqs = m_fptr->max_oid = 0;
664 if (m_fptr->mm_flg) {
665 munmap(m_fptr->mmap_base,m_fptr->st_size);
666 m_fptr->mmap_fd = -1;
670 if (m_fptr->libf !=NULL ) {fclose(m_fptr->libf); m_fptr->libf=NULL;}
674 ncbl2_getliba_o(unsigned char *seq,
680 struct lmf_str *m_fd,
684 unsigned int t_mask, t_shift, oid_mask;
686 /* get to the next valid pointer */
688 for ( tpos = m_fd->lpos ;tpos <= m_fd->max_oid; tpos++) {
690 t_shift = 31 - (tpos % 32);
691 if ((oid_mask = m_fd->oid_list[t_mask])==0) { continue; }
693 if ((bl2_uint4_cvt(oid_mask) & 0x1 << t_shift)) {
694 if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[tpos],0);
695 m_fd->lpos = tpos; /* already bumped up */
696 m_fd->bl_lib_pos = m_fd->s_pos_arr[tpos];
697 return ncbl2_getliba(seq, maxs, libstr, n_libstr,
698 libpos, lcont, m_fd, l_off);
705 ncbl2_getliba(unsigned char *seq,
711 struct lmf_str *m_fd,
714 unsigned char *sptr, *dptr;
715 int s_chunk, d_len, lib_cnt;
719 #if defined(DEBUG) || defined(PCOMPLIB)
720 int gi, my_db, taxid;
721 char acc[20], title[21], name[20];
726 lib_cnt = m_fd->lpos;
727 *libpos = (fseek_t)m_fd->lpos;
730 if (lib_cnt >= m_fd->max_cnt) return -1; /* no more sequences */
731 seq_len = m_fd->s_pos_arr[lib_cnt+1] - m_fd->s_pos_arr[lib_cnt]; /* value is +1 off to get the NULL */
732 if (m_fd->mm_flg) m_fd->mmap_addr = m_fd->mmap_base+m_fd->s_pos_arr[lib_cnt];
733 #if !defined(DEBUG) && !defined(PCOMPLIB)
736 /* get the name from the header file */
737 fseek(m_fd->hfile,m_fd->d_pos_arr[lib_cnt],0);
739 if (m_fd->bl_format_ver == FORMATDBV3) {
740 d_len = min(n_libstr-1,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1);
741 fread(libstr,(size_t)1,(size_t)d_len,m_fd->hfile);
745 d_len = min(m_fd->tmp_buf_max,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1);
746 fread(m_fd->tmp_buf,(size_t)1,(size_t)d_len,m_fd->hfile);
747 parse_fastadl_asn((unsigned char *)m_fd->tmp_buf, (unsigned char *)m_fd->tmp_buf+d_len,
748 &gi, &my_db, acc, name, title, 20, &taxid);
749 sprintf(libstr,"gi|%d",gi);
753 if (seq_len <= maxs) { /* sequence fits */
758 else { /* doesn't fit */
763 if (m_fd->mm_flg) sptr = (unsigned char *)m_fd->mmap_addr;
765 if ((tmp=fread(seq,(size_t)1,(size_t)seq_len,m_fd->libf))!=(size_t)seq_len) {
766 fprintf(stderr," could not read sequence record: %ld %ld != %ld\n",
767 *libpos,tmp,seq_len);
772 if (seq_len <= maxs) {seqcnt = --seq_len;}
774 /* everything is ready, set up dst. pointer, seq_len */
777 if (aa_b2toa[sptr[seq_len-1]]=='*') seq_len--;
779 while (s_chunk-- > 0) {
780 *dptr++ = aa_btof[*sptr++];
781 *dptr++ = aa_btof[*sptr++];
782 *dptr++ = aa_btof[*sptr++];
783 *dptr++ = aa_btof[*sptr++];
784 *dptr++ = aa_btof[*sptr++];
785 *dptr++ = aa_btof[*sptr++];
786 *dptr++ = aa_btof[*sptr++];
787 *dptr++ = aa_btof[*sptr++];
788 *dptr++ = aa_btof[*sptr++];
789 *dptr++ = aa_btof[*sptr++];
790 *dptr++ = aa_btof[*sptr++];
791 *dptr++ = aa_btof[*sptr++];
792 *dptr++ = aa_btof[*sptr++];
793 *dptr++ = aa_btof[*sptr++];
794 *dptr++ = aa_btof[*sptr++];
795 *dptr++ = aa_btof[*sptr++];
797 while (dptr < seq+seqcnt) *dptr++ = aa_btof[*sptr++];
799 if (m_fd->mm_flg) m_fd->mmap_addr = (char *)sptr;
801 /* we didn't get it all, so reset for more */
802 if (*lcont) seq_len -= seqcnt;
807 error: fprintf(stderr," error reading %s at %ld\n",libstr,*libpos);
813 ncbl2_getlibn_o(unsigned char *seq,
819 struct lmf_str *m_fd,
823 unsigned int t_mask, t_shift, oid_mask;
825 /* get to the next valid pointer */
827 for (tpos = m_fd->lpos; tpos <= m_fd->max_oid; tpos++) {
829 t_shift = 31 - (tpos % 32);
830 if ((oid_mask = m_fd->oid_list[t_mask])==0) { continue; }
832 if ((bl2_uint4_cvt(oid_mask) & 0x1 << t_shift)) {
833 if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[tpos],0);
834 m_fd->lpos = tpos; /* already bumped up */
835 m_fd->bl_lib_pos = m_fd->s_pos_arr[tpos];
836 return ncbl2_getlibn(seq, maxs, libstr, n_libstr,
837 libpos, lcont, m_fd, l_off);
843 static char tmp_amb[4096];
846 ncbl2_getlibn(unsigned char *seq,
852 struct lmf_str *m_fd,
855 unsigned char *sptr, *tptr, stmp;
857 int s_chunk, lib_cnt;
861 static int c_len,c_pad;
862 int c_len_set, d_len;
866 lib_cnt = m_fd->lpos;
867 *libpos = (fseek_t)lib_cnt;
868 if (*lcont==0) { /* not a continuation of previous */
869 if (lib_cnt >= m_fd->max_cnt) return (-1);
870 c_len = m_fd->a_pos_arr[lib_cnt]- m_fd->s_pos_arr[lib_cnt];
872 if (m_fd->bl_lib_pos != m_fd->s_pos_arr[lib_cnt]) { /* are we positioned to read? */
874 if ((m_fd->bl_lib_pos - m_fd->s_pos_arr[lib_cnt]) < sizeof(tmp_amb)) {
875 /* jump over amb_ray */
876 fread(tmp_amb,(size_t)1,(size_t)(m_fd->s_pos_arr[lib_cnt]-m_fd->bl_lib_pos),m_fd->libf);
878 else { /* fseek over amb_ray */
879 fseek(m_fd->libf,m_fd->s_pos_arr[lib_cnt],0);
881 m_fd->bl_lib_pos = m_fd->s_pos_arr[lib_cnt];
884 else m_fd->mmap_addr = m_fd->mmap_base + m_fd->s_pos_arr[lib_cnt];
885 #if !defined(DEBUG) && !defined(PCOMPLIB)
888 /* get the name from the header file */
889 fseek(m_fd->hfile,m_fd->d_pos_arr[lib_cnt],0);
891 d_len = min(n_libstr-1,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1);
892 fread(libstr,(size_t)1,(size_t)d_len,m_fd->hfile);
895 } /* end of *lcont==0 */
897 /* To avoid the situation where c_len <= 1; we must anticipate what
898 c_len will be after this pass. If it will be <= 64, back off this
899 time so next time it will be > 64 */
903 if ((seq_len+4 > maxs) && (seq_len+4 - maxs <= 256)) {
904 /* we won't be done but we will have less than 256 to go */
905 c_len -= 64; seq_len -= 256; c_len_set = 1; maxs -= 256;}
909 fprintf(stderr," lib_cnt: %d %d %d %d\n",lib_cnt,c_len,seq_len,maxs);
912 /* does the rest of the sequence fit? */
913 if (seq_len <= maxs-4 && !c_len_set) {
916 if ((tmp=fread(seq,(size_t)1,(size_t)seqcnt,m_fd->libf))!=(size_t)seqcnt) {
918 " could not read sequence record: %s %lld %ld != %ld: %d\n",
919 libstr,*libpos,tmp,seqcnt,*seq);
922 m_fd->bl_lib_pos += tmp;
925 else sptr = (unsigned char *)(m_fd->mmap_addr+seqcnt);
927 *lcont = 0; /* this is the last chunk */
928 lib_cnt++; /* increment to the next sequence */
929 /* the last byte is either '0' (no remainder) or the last 1-3 chars and the remainder */
931 c_pad &= 0x3; /* get the last (low) 2 bits */
932 seq_len -= (4 - c_pad); /* if the last 2 bits are 0, its a NULL byte */
934 else { /* get the next chunk, but more to come */
935 seqcnt = ((maxs+3)/4)-1;
937 if ((tmp=fread(seq,(size_t)1,(size_t)(seqcnt),m_fd->libf))!=(size_t)(seqcnt)) {
938 fprintf(stderr," could not read sequence record: %lld %ld/%ld\n",
942 m_fd->bl_lib_pos += tmp;
946 sptr = (unsigned char *)(m_fd->mmap_addr+seqcnt);
947 m_fd->mmap_addr += seqcnt;
951 if (c_len_set) {c_len += 64; maxs += 256;}
953 /* hopefully we don't need this because of c_len -= 64. */
956 #if !defined (USE_MMAP)
957 c_pad = fgetc(m_fd->libf);
960 c_pad = *m_fd->mmap_addr++;
961 sptr = m_fd->mmap_addr;
972 /* point to the last packed byte and to the end of the array
973 seqcnt is the exact number of bytes read
974 tptr points to the destination, use multiple of 4 to simplify math
975 sptr points to the source, note that the last byte will be read 4 cycles
979 tptr = seq + 4*seqcnt;
981 while (s_chunk-- > 0) {
983 *--tptr = (stmp&3) +1;
984 *--tptr = ((stmp >>= 2)&3)+1;
985 *--tptr = ((stmp >>= 2)&3)+1;
986 *--tptr = ((stmp >>= 2)&3)+1;
988 *--tptr = (stmp&3) +1;
989 *--tptr = ((stmp >>= 2)&3)+1;
990 *--tptr = ((stmp >>= 2)&3)+1;
991 *--tptr = ((stmp >>= 2)&3)+1;
993 *--tptr = (stmp&3) +1;
994 *--tptr = ((stmp >>= 2)&3)+1;
995 *--tptr = ((stmp >>= 2)&3)+1;
996 *--tptr = ((stmp >>= 2)&3)+1;
998 *--tptr = (stmp&3) +1;
999 *--tptr = ((stmp >>= 2)&3)+1;
1000 *--tptr = ((stmp >>= 2)&3)+1;
1001 *--tptr = ((stmp >>= 2)&3)+1;
1003 *--tptr = (stmp&3) +1;
1004 *--tptr = ((stmp >>= 2)&3)+1;
1005 *--tptr = ((stmp >>= 2)&3)+1;
1006 *--tptr = ((stmp >>= 2)&3)+1;
1008 *--tptr = (stmp&3) +1;
1009 *--tptr = ((stmp >>= 2)&3)+1;
1010 *--tptr = ((stmp >>= 2)&3)+1;
1011 *--tptr = ((stmp >>= 2)&3)+1;
1013 *--tptr = (stmp&3) +1;
1014 *--tptr = ((stmp >>= 2)&3)+1;
1015 *--tptr = ((stmp >>= 2)&3)+1;
1016 *--tptr = ((stmp >>= 2)&3)+1;
1018 *--tptr = (stmp&3) +1;
1019 *--tptr = ((stmp >>= 2)&3)+1;
1020 *--tptr = ((stmp >>= 2)&3)+1;
1021 *--tptr = ((stmp >>= 2)&3)+1;
1025 *--tptr = (stmp&3) +1;
1026 *--tptr = ((stmp >>= 2)&3)+1;
1027 *--tptr = ((stmp >>= 2)&3)+1;
1028 *--tptr = ((stmp >>= 2)&3)+1;
1031 for (sptr=seq; sptr < seq+seq_len; sptr++) {
1032 printf("%c",nt[*sptr]);
1033 if ((int)(sptr-seq) % 60 == 59) printf("\n");
1038 m_fd->lpos = lib_cnt;
1039 if (seqcnt*4 >= seq_len) { /* there was enough room */
1040 seq[seq_len]= EOSEQ;
1041 /* printf("%d\n",seq_len); */
1044 else { /* not enough room */
1045 seq[seqcnt*4]=EOSEQ;
1046 seq_len -= 4*seqcnt;
1050 error: fprintf(stderr," error reading %s at %ld\n",libstr,*libpos);
1056 8 9 10 11 12 13 14 15
1059 *db_type_arr[] = {"lcl","gib","gim","gii","gb","emb","pir","sp",
1060 "pat","ref","gnl","gi","dbj","prf","pdb","tpg",
1064 ncbl2_ranlib(char *str,
1068 struct lmf_str *m_fd)
1072 unsigned char *my_buff=NULL;
1077 char db[5], acc[20], name[20];
1082 lib_cnt = (int)libpos;
1083 llen = m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt];
1085 fseek(m_fd->hfile,m_fd->d_pos_arr[libpos],0);
1087 if (m_fd->bl_format_ver == FORMATDBV3) {
1088 if (llen >= cnt) llen = cnt-1;
1089 fread(str,(size_t)1,(size_t)(llen),m_fd->hfile);
1092 if (llen >= m_fd->tmp_buf_max) {
1093 if ((my_buff=(unsigned char *)calloc(llen,sizeof(char)))==NULL) {
1094 fprintf(stderr," cannot allocate ASN.1 buffer: %d\n",llen);
1095 my_buff = (unsigned char *)m_fd->tmp_buf;
1096 llen = m_fd->tmp_buf_max;
1098 else have_my_buff = 1;
1101 my_buff = (unsigned char *)m_fd->tmp_buf;
1104 fread(my_buff,(size_t)1,llen,m_fd->hfile);
1107 abp = parse_fastadl_asn(abp, my_buff+llen,
1108 &gi, &my_db, acc, name,
1109 title, sizeof(title), &taxid);
1112 sprintf(descr,"gi|%d|%s|%s|%s ",gi,db_type_arr[my_db],acc,name);
1115 if (acc[0] != '\0') sprintf(descr,"%s ",acc);
1116 else descr[0] = '\0';
1117 if (name[0] != '\0' && strcmp(name,"BL_ORD_ID")!=0) sprintf(descr+strlen(descr),"%s ", name);
1119 if (m_fd->pref_db < 0) {
1121 strncpy(str,descr,cnt-1);
1125 strncat(str,"\001",cnt-strlen(str)-1);
1126 strncat(str,descr,cnt-strlen(str)-1);
1128 strncat(str,title,cnt-strlen(str)-1);
1129 if (strlen(str) >= cnt-1) break;
1131 else if (m_fd->pref_db == my_db) {
1133 strncpy(str,descr,cnt-1);
1134 strncat(str,title,cnt-strlen(str)-1);
1140 strncpy(str,descr,cnt-1);
1141 strncat(str,descr,cnt-strlen(str)-1);
1144 if (have_my_buff) free(my_buff);
1150 while((bp=strchr(bp,'\001'))!=NULL) {*bp++=' ';}
1152 if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[libpos],0);
1154 m_fd->lpos = lib_cnt;
1155 m_fd->bl_lib_pos = m_fd->s_pos_arr[lib_cnt];
1158 unsigned int bl2_uint4_cvt(unsigned int val)
1161 #ifdef IS_BIG_ENDIAN
1163 #else /* it better be LITTLE_ENDIAN */
1164 res = ((val&255)*256)+ ((val>>8)&255);
1165 res = (res<<16) + (((val>>16)&255)*256) + ((val>>24)&255);
1170 unsigned int bl2_long4_cvt(long val)
1174 #ifdef IS_BIG_ENDIAN
1177 #else /* it better be LITTLE_ENDIAN */
1178 res = ((val&255)*256)+ ((val>>8)&255);
1179 res = (res<<16) + (((val>>16)&255)*256) + ((val>>24)&255);
1184 int64_t bl2_long8_cvt(int64_t val)
1187 #ifdef IS_BIG_ENDIAN
1189 #else /* it better be LITTLE_ENDIAN */
1190 res = ((val&255)*256)+ ((val>>8)&255);
1191 res = (res<<16) + (((val>>16)&255)*256) + ((val>>24)&255);
1193 res = (res<<16) + (((val>>32)&255)*256) + ((val>>40)&255);
1194 res = (res<<16) + (((val>>48)&255)*256) + ((val>>56)&255);
1196 fprintf(stderr,"Cannot use bl2_long8_cvt without 64-bit longs\n");
1203 void src_int4_read(FILE *fd, int *val)
1205 #ifdef IS_BIG_ENDIAN
1206 fread((char *)val,(size_t)4,(size_t)1,fd);
1210 fread((char *)&b[0],(size_t)1,(size_t)4,fd);
1212 *val = (int)((int)((int)(b[0]<<8)+(int)b[1]<<8)+(int)b[2]<<8)
1217 void src_long4_read(FILE *fd, long *valp)
1220 #ifdef IS_BIG_ENDIAN
1221 fread(&val4,(size_t)4,(size_t)1,fd);
1226 fread((char *)&b[0],(size_t)1,(size_t)4,fd);
1228 val4 = (int)((int)((int)(b[0]<<8)+(int)b[1]<<8)+(int)b[2]<<8)
1234 void src_uint4_read(FILE *fd, unsigned int *valp)
1236 #ifdef IS_BIG_ENDIAN
1237 fread(valp,(size_t)4,(size_t)1,fd);
1241 fread((char *)&b[0],(size_t)1,(size_t)4,fd);
1243 *valp = (unsigned int)((int)((int)(b[0]<<8)+(int)b[1]<<8)+(int)b[2]<<8)
1248 void src_long8_read(FILE *fd, long *val)
1250 #ifdef IS_BIG_ENDIAN
1251 fread((void *)val,(size_t)8,(size_t)1,fd);
1255 fread((char *)&b[0],(size_t)1,(size_t)8,fd);
1257 *val = (long)((((((long)((long)(b[0]<<8)+(long)b[1]<<8)+(long)b[2]<<8)
1258 +(long)b[3]<<8)+(long)b[4]<<8)+(long)b[5]<<8)
1259 +(long)b[6]<<8)+(long)b[7];
1263 void ncbi_long8_read(FILE *fd, int64_t *val)
1267 fread((char *)&b[0],(size_t)1,(size_t)8,fd);
1269 *val = (long)((((((long)((long)(b[7]<<8)+(long)b[6]<<8)+(long)b[5]<<8)
1270 +(long)b[4]<<8)+(long)b[3]<<8)+(long)b[2]<<8)
1271 +(long)b[1]<<8)+(long)b[0];
1274 void src_char_read(FILE *fd, char *val)
1276 fread(val,(size_t)1,(size_t)1,fd);
1279 void src_fstr_read(FILE *fd, char *val, int slen)
1281 fread(val,(size_t)slen,(size_t)1,fd);
1285 newname(char *nname, char *oname, char *suff, int maxn)
1287 strncpy(nname,oname,maxn-1);
1288 strncat(nname,".",1);
1289 strncat(nname,suff,maxn-strlen(nname));
1292 #define ASN_SEQ 0x30
1293 #define ASN_IS_BOOL 1
1294 #define ASN_IS_INT 2
1295 #define ASN_IS_STR 26
1298 get_asn_int(unsigned char *abp, int *val) {
1303 if (*abp++ != ASN_IS_INT) { /* check for int */
1304 fprintf(stderr," int missing\n");
1308 while (v_len-- > 0) {
1312 abp += 2; /* skip over null's */
1319 get_asn_text(unsigned char *abp, char *text, int t_len) {
1323 if (*abp++ != ASN_IS_STR) { /* check for str */
1324 fprintf(stderr," str missing\n");
1327 if ((tch = *abp++) > 128) { /* string length is in next bytes */
1328 tch &= 0x7f; /* get number of bytes for len */
1330 while (tch-- > 0) { at_len = (at_len << 8) + *abp++;}
1336 if ( at_len < t_len-1) {
1337 memcpy(text, abp, at_len);
1338 text[at_len] = '\0';
1341 memcpy(text, abp, t_len-1);
1342 text[t_len-1] = '\0';
1349 /* something to try to skip over stuff we don't want */
1351 get_asn_junk(unsigned char *abp) {
1358 if ( *abp == ASN_SEQ) { abp += 2; seq_cnt++;}
1359 else if ( *abp == ASN_IS_BOOL ) {abp = get_asn_int(abp, &tmp);}
1360 else if ( *abp == ASN_IS_INT ) {abp = get_asn_int(abp, &tmp);}
1361 else if ( *abp == ASN_IS_STR ) {abp = get_asn_text(abp, string, sizeof(string)-1);}
1364 while (seq_cnt-- > 0) abp += 2;
1369 get_asn_textseq_id(unsigned char *abp,
1370 char *name, char *acc)
1372 char release[20], ver_str[10];
1378 if (*abp == ASN_SEQ) { abp += 2; seqcnt++;}
1383 abp = get_asn_text(abp+2, name, 20);
1386 abp = get_asn_text(abp+2, acc, 20);
1389 abp = get_asn_text(abp+2, release, sizeof(release));
1392 abp = get_asn_int(abp+2, &version);
1393 sprintf(ver_str,".%d",version);
1398 while (seqcnt-- > 0) abp += 4;
1399 strncat(acc,ver_str,20-strlen(acc));
1401 return abp; /* skip 2 NULL's */
1405 get_asn_local_id(unsigned char *abp, char *acc)
1409 if (*abp == ASN_SEQ) { abp += 2; seqcnt++;}
1411 abp = get_asn_text(abp+2, acc, 20);
1413 while (seqcnt-- > 0) abp += 4;
1415 return abp; /* skip 2 NULL's */
1419 get_asn_dbtag(unsigned char *abp, char *name, char *str, int *id_p) {
1421 if (*abp == ASN_SEQ) { abp += 2;}
1423 if (*abp == 0xa0) { /* get db */
1424 abp = get_asn_text(abp+2, name, 20);
1427 fprintf(stderr," missing dbtag:db %d %d\n",abp[0],abp[1]);
1431 if (*abp == 0xa1) { /* get tag */
1433 abp += 2; /* skip over id */
1434 if (*abp == 2) abp = get_asn_int(abp,id_p);
1435 else abp = get_asn_text(abp+2, str, 20);
1438 fprintf(stderr," missing dbtag:tag %2x %2x\n",abp[0],abp[1]);
1441 return abp+2; /* skip 2 NULL's */
1445 get_asn_pdb_id(unsigned char *abp, char *acc, char *chain)
1447 int ichain, seq_cnt=0;
1449 if (*abp == ASN_SEQ) { abp += 2; seq_cnt++;}
1453 case 0: abp += 2; break;
1454 case 0xa0: /* mol-id */
1455 abp = get_asn_text(abp+2, acc, 20);
1458 abp = get_asn_int(abp+2, &ichain);
1462 case 0xa2: /* ignore date - scan until NULL's */
1464 abp += 2; /* skip the NULL's */
1469 while (seq_cnt-- > 0) {abp += 4;}
1473 #define ASN_TYPE_MASK 31
1476 *get_asn_seqid(unsigned char *abp,
1477 int *gi_p, int *db, char *acc, char *name) {
1479 int db_type, itmp, seq_cnt=0;
1483 if (*abp != ASN_SEQ) {
1484 fprintf(stderr, "seqid - missing SEQ 1: %2x %2x\n",abp[0], abp[1]);
1487 else { abp += 2; seq_cnt++;}
1489 db_type = (*abp & ASN_TYPE_MASK);
1491 if (db_type == 11) { /* gi */
1492 abp = get_asn_int(abp+2,gi_p);
1495 while (*abp == ASN_SEQ) {abp += 2; seq_cnt++;}
1497 db_type = (*abp & ASN_TYPE_MASK);
1498 if (db_type > 17) {db_type = 0;}
1503 abp = get_asn_local_id(abp+2, acc);
1507 abp = get_asn_int(abp+2,&itmp);
1511 abp = get_asn_int(abp+2,&itmp);
1523 abp = get_asn_textseq_id(abp+2,name,acc);
1526 abp = get_asn_dbtag(abp+2,name,acc,&itmp);
1528 abp = get_asn_pdb_id(abp+2,acc,name);
1533 while (seq_cnt-- > 0) { abp += 4;}
1534 return abp; /* skip over 2 NULL's */
1537 #define ASN_FADL_TITLE 0xa0
1538 #define ASN_FADL_SEQID 0xa1
1539 #define ASN_FADL_TAXID 0xa2
1540 #define ASN_FADL_MEMBERS 0xa3
1541 #define ASN_FADL_LINKS 0xa4
1542 #define ASN_FADL_OTHER 0xa5
1545 parse_fastadl_asn(unsigned char *asn_buff, unsigned char *asn_max,
1546 int *gi_p, int *db, char *acc,
1547 char *name, char *title, int t_len, int *taxid_p) {
1549 char tmp_db[4], tmp_acc[32], tmp_name[32];
1554 acc[0] = name[0] = db[0] = title[0] = '\0';
1557 while ( abp < asn_max && *abp) {
1558 if (*abp == ASN_SEQ) { abp += 2; seq_cnt++; }
1559 else if (*abp == ASN_FADL_TITLE) {
1560 abp = get_asn_text(abp+2, title, t_len);
1562 else if (*abp == ASN_FADL_SEQID ) {
1563 abp = get_asn_seqid(abp+2, gi_p, db, acc, name);
1564 if (*db > 17) *db = 0;
1566 else if (*abp == ASN_FADL_TAXID ) {
1567 abp = get_asn_int(abp+2, taxid_p);
1569 else if (*abp == ASN_FADL_MEMBERS) {
1570 abp = get_asn_junk(abp+2);
1572 else if (*abp == ASN_FADL_LINKS ) {
1573 abp = get_asn_junk(abp+2);
1575 else if (*abp == ASN_FADL_OTHER ) {
1576 abp = get_asn_junk(abp+2);
1579 /* fprintf(stderr, " Error - missing ASN.1 %2x:%2x:%2x:%2x\n",
1580 abp[-2],abp[-1],abp[0],abp[1]); */
1584 while (abp < asn_max && *abp == '\0' ) abp++;
1585 if (abp >= asn_max) return NULL;
1591 parse_pal(char *dname, char *msk_name,
1592 int *oid_seqs, int *max_oid,
1597 while (fgets(line,sizeof(line),fd)) {
1598 if (line[0] == '#') continue;
1600 if (strncmp(line, "DBLIST", 6)==0) {
1601 sscanf(line+7,"%s",dname);
1603 else if (strncmp(line, "OIDLIST", 7)==0) {
1604 sscanf(line+8,"%s",msk_name);
1606 else if (strncmp(line, "NSEQ", 4)==0) {
1607 sscanf(line+5,"%d",oid_seqs);
1609 else if (strncmp(line, "MAXOID", 6)==0) {
1610 sscanf(line+7,"%d",max_oid);