+++ /dev/null
-/* ncbl2_lib.c functions to read ncbi-blast format files from
- formatdb (blast2.0 format files)
-
- copyright (c) 1999 William R. Pearson
-*/
-
-/* $Name: fa_34_26_5 $ - $Id: ncbl2_mlib.c,v 1.56 2007/04/02 18:08:11 wrp Exp $ */
-
-/* to turn on mmap()ing for Blast2 files: */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#ifdef UNIX
-#include <unistd.h>
-#endif
-#include <errno.h>
-
-
-/* ****************************************************************
-
-17-May-2006
-
-Modified to read NCBI .[np]al and .msk files. The .nal or .pal file
-provides a way to read sequences from a list of files. The .msk file
-provides a compact way of indicating the subset of sequences in a
-larger database (typically nr or nt) that comprise a smaller database
-(e.g. swissprot or pdbaa). A .pal file (e.g. swissprot.00.pal) that
-uses a .msk file has the form:
-
- # Alias file generated by genmask
- # Date created: Mon Apr 10 11:24:05 2006
- #
- TITLE Non-redundant SwissProt sequences
- DBLIST nr.00
- OIDLIST swissprot.00.msk
- LENGTH 74351250
- NSEQ 198346
- MAXOID 2617347
- MEMB_BIT 1
- # end of the file
-
-To work with this file, we must first load the nr.00 file, and then
-read the swissprot.00.msk file, and then scan all the entries in the
-swissprot.00.msk file (which are packed 32 mask-bit to an int) to
-determine whether a specific libpos index entry is present in the
-subset database.
-
-**************************************************************** */
-
-
-/* ****************************************************************
-This code reads NCBI Blast2 format databases from formatdb version 3 and 4
-
-(From NCBI) This section describes the format of the databases.
-
-Formatdb creates three main files for proteins containing indices,
-sequences, and headers with the extensions, respectively, of pin, psq,
-and phr (for nucleotides these are nin, nsq, and nhr). A number of
-other ISAM indices are created, but these are described elsewhere.
-
-FORMAT OF THE INDEX FILE
-------------------------
-
-1.) formatdb version number [4 bytes].
-
-2.) protein dump flag (1 for a protein database, 0 for a nucleotide
- database) [4 bytes].
-
-3.) length of the database title in bytes [4 bytes].
-4.) the database title [length given in 3.)].
-5.) length of the date/time string [4 bytes].
-6.) the date/time string [length given in 5.)].
-7.) the number of sequences in the database [4 bytes].
-8.) the total length of the database in residues/basepairs [4 bytes].
-9.) the length of the longest sequence in the database [4 bytes].
-
-10.) a list of the offsets for definitions (one for each sequence) in
-the header file. There are num_of_seq+1 of these, where num_of_seq is
-the number of sequences given in 7.).
-
-11.) a list of the offsets for sequences (one for each sequence) in
-the sequence file. There are num_of_seq+1 of these, where num_of_seq
-is the number of sequences given in 7.).
-
-12.) a list of the offsets for the ambiguity characters (one for each
-sequence) in the sequence file. This list is only present for
-nucleotide databases and, since the database is compressed 4/1 for
-nucleotides, allows the ambiguity characters to be restored when the
-sequence is generated. There are num_of_seq+1 of these, where
-num_of_seq is the number of sequences given in 7.).
-
-
-FORMAT OF THE SEQUENCE FILE
----------------------------
-
-There are different formats for the protein and nucleotide sequence files.
-
-The protein sequence files is quite simple. The first byte in the
-file is a NULL byte, followed by the sequence in ncbistdaa format
-(described in the NCBI Software Development Toolkit documentation).
-Following the sequence is another NULL byte, followed by the next
-sequence. The file ends with a NULL byte, following the last
-sequence.
-
-The nucleotide sequence file contains the nucleotide sequence, with
-four basepairs compressed into one byte. The format used is NCBI2na,
-documented in the NCBI Software Development Toolkit manual. Any
-ambiguity characters present in the original sequence are replaced at
-random by A, C, G or T. The true value of ambiguity characters are
-stored at the end of each sequence to allow true reproduction of the
-original sequence.
-
-FORMAT OF THE HEADER FILE (formatdb version 3)
--------------------------
-
-The format of the header file depends on whether or not the identifiers in the
-original file were parsed or not. For the case that they were not, then each
-entry has the format:
-
-gnl|BL_ORD_ID|entry_number my favorite yeast sequence...
-
-Here entry_number gives the ordinal number of the sequence in the
-database (with zero offset). The identifier
-gnl|BL_ORD_ID|entry_number is used by the BLAST software to identify
-the entry, if the user has not provided another identifier. If the
-identifier was parsed, then gnl|BL_ORD_ID|entry_number is replaced by
-the correct identifier, as described in
-ftp://ncbi.nlm.nih.gov/blast/db/README .
-
-There are no separators between these deflines.
-
-For formatdb version 4, the header file contains blast ASN.1 binary
-deflines, which can parsed with parse_fastadl_asn().
-
-FORMAT OF THE .MSK FILE
------------------------
-
-The .msk file is simply a packed list of masks for formatdb "oids" for
-some other file (typically nr). The first value is the last oid
-available; the remainder are packed 32 oids/mask, so that the number
-of masks is 1/32 the number of sequences in the file.
-
-**************************************************************** */
-
-#ifdef USE_MMAP
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#ifdef IBM_AIX
-#include <fcntl.h>
-#else
-#include <sys/fcntl.h>
-#endif
-#endif
-
-#ifdef USE_MMAP
-#ifndef MAP_FILE
-#define MAP_FILE 0
-#endif
-#endif
-
-#ifdef UNIX
-#define RBSTR "r"
-#else
-#define RBSTR "rb"
-#endif
-
-#ifdef WIN32
-#define SLASH_CHAR '\\'
-#define SLASH_STR "\\"
-#else
-#define SLASH_CHAR '/'
-#define SLASH_STR "/"
-#endif
-
-#define XTERNAL
-#include "uascii.h"
-
-#define XTERNAL
-#include "upam.h"
-#include "ncbl2_head.h"
-
-#include "defs.h"
-#include "mm_file.h"
-
-unsigned int bl2_uint4_cvt(unsigned int);
-unsigned int bl2_long4_cvt(long);
-int64_t bl2_long8_cvt(int64_t);
-void src_int4_read(FILE *fd, int *valp);
-void src_uint4_read(FILE *fd, unsigned int *valp);
-void src_long4_read(FILE *fd, long *valp);
-void ncbi_long8_read(FILE *fd, int64_t *valp);
-void src_char_read(FILE *fd, char *valp);
-unsigned char *parse_fastadl_asn(unsigned char *asn_buff, unsigned char *asn_max,
- int *gi_p, int *db, char *acc, char *name,
- char *title, int t_len, int *taxid);
-
-/* nt_btoa maps from blast 2bit format to ascii characters */
-static char nt_btoa[5] = {"ACGT"};
-
-static char aa_b2toa[27]= {"-ABCDEFGHIKLMNPQRSTVWXYZU*"};
-
-static int aa_btof[32]; /* maps to fasta alphabet */
-
-static int dbtype, dbformat, amb_cnt;
-
-#define NCBIBL20 12
-
-int ncbl2_getliba(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
-int ncbl2_getlibn(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
-
-int ncbl2_getliba_o(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
-int ncbl2_getlibn_o(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
-
-void newname(char *, char *, char *, int);
-void parse_pal(char *, char *, int *, int *, FILE *);
-
-void ncbl2_ranlib(char *, int, fseek_t, char *, struct lmf_str *m_fd);
-
-/* ncbl2_openlib() is used to open (and memory map) a BLAST2.0 format
- file. Ifdef USE_MMAP, then ncbl2_openlib returns a structure that can
- be used to read the database. */
-
-struct lmf_str *
-ncbl2_openlib(char *name, int ldnaseq)
-{
- char lname[256];
- char dname[256];
- char msk_name[256];
- char hname[256];
- char sname[256];
- char tname[256];
- char db_dir[256];
- int pref_db= -1;
- char *bp;
- int title_len;
- char *title_str=NULL;
- int date_len;
- char *date_str=NULL;
- long ltmp;
- int64_t l8tmp;
- int oid_seqs, max_oid;
- int oid_cnt, oid_len;
- unsigned int *oid_list, o_max;
- int tmp;
- int i;
-#ifdef USE_MMAP
- struct stat statbuf;
-#endif
- FILE *ifile; /* index offsets, also DB info */
- unsigned int *f_pos_arr;
- struct lmf_str *m_fptr;
-
- if (ldnaseq==SEQT_PROT) { /* read a protein database */
- newname(lname,name,AA_LIST_EXT,(int)sizeof(lname));
- newname(tname,name,AA_INDEX_EXT,(int)sizeof(tname));
- newname(hname,name,AA_HEADER_EXT,(int)sizeof(hname));
- newname(sname,name,AA_SEARCHSEQ_EXT,(int)sizeof(sname));
-
- /* initialize map of BLAST2 amino acids to FASTA amino acids */
- for (i=0; i<sizeof(aa_b2toa); i++) {
- if ((tmp=aascii[aa_b2toa[i]])<NA) aa_btof[i]=tmp;
- else if (aa_b2toa[i]=='*') aa_btof[i]=aascii['X'];
- else aa_b2toa[i]=0;
-/* else aa_btof[i]=aascii['X']; */
- }
- }
- else { /* reading DNA library */
- newname(lname,name,NT_LIST_EXT,(int)sizeof(lname));
- newname(tname,name,NT_INDEX_EXT,(int)sizeof(tname));
- newname(hname,name,NT_HEADER_EXT,(int)sizeof(hname));
- newname(sname,name,NT_SEARCHSEQ_EXT,(int)sizeof(sname));
-
- }
-
- /* check first for list name */
- max_oid = oid_seqs = 0;
- oid_list = NULL;
- if ((ifile = fopen(lname,"r"))!=NULL) {
-
- if ((bp = strrchr(name,SLASH_CHAR))!=NULL) {
- *bp = '\0';
- strncpy(db_dir,name,sizeof(db_dir));
- strncat(db_dir,SLASH_STR,sizeof(db_dir)-strlen(db_dir)-1);
- *bp = SLASH_CHAR;
- }
- else {
- db_dir[0]='\0';
- }
-
- /* we have a list file, we need to parse it */
- parse_pal(dname, msk_name, &oid_seqs, &max_oid, ifile);
- fclose(ifile);
-
- pref_db = -1;
- if (oid_seqs > 0) {
-
- /* get the pref_db before adding the directory */
- if (strncmp(msk_name,"swissprot",9)==0) {
- pref_db = 7;
- }
- else if (strncmp(msk_name,"pdbaa",5)==0) {
- pref_db = 14;
- }
-
- /* need to add directory to both dname and msk_name */
- strncpy(tname,db_dir,sizeof(tname));
- strncat(tname,msk_name, sizeof(tname));
- strncpy(msk_name, tname, sizeof(msk_name));
-
- strncpy(tname,db_dir,sizeof(tname));
- strncat(tname,dname, sizeof(tname));
- strncpy(dname,tname,sizeof(dname));
-
- if (ldnaseq == SEQT_PROT) {
- newname(tname,dname,AA_INDEX_EXT,(int)sizeof(tname));
- newname(hname,dname,AA_HEADER_EXT,(int)sizeof(hname));
- newname(sname,dname,AA_SEARCHSEQ_EXT,(int)sizeof(sname));
- }
- else { /* reading DNA library */
- newname(tname,dname,NT_INDEX_EXT,(int)sizeof(tname));
- newname(hname,dname,NT_HEADER_EXT,(int)sizeof(hname));
- newname(sname,dname,NT_SEARCHSEQ_EXT,(int)sizeof(sname));
- }
- /* now load the oid file */
- if ((ifile = fopen(msk_name,RBSTR))==NULL) {
- fprintf(stderr,"error - cannot load %s file\n",msk_name);
- return NULL;
- }
- else {
- src_uint4_read(ifile,&o_max);
- if (o_max != max_oid) {
- fprintf(stderr," error - oid count mismatch %d != %d\n",max_oid, o_max);
- }
- oid_len = (max_oid/32+1);
- if ((oid_list=(unsigned int *)calloc(oid_len,sizeof(int)))==NULL) {
- fprintf(stderr," error - cannot allocate oid_list[%d]\n",oid_len);
- return NULL;
- }
- if ((oid_cnt=fread(oid_list,sizeof(int),oid_len,ifile))==0) {
- fprintf(stderr," error - cannot read oid_list[%d]\n",oid_len);
- return NULL;
- }
- fclose(ifile);
- }
- }
- else { /* we had a .msk file, but there are no oid's in it.
- allocate an m_fptr and return it empty */
- if ((m_fptr=(struct lmf_str *)calloc(1,sizeof(struct lmf_str)))==NULL) {
- fprintf(stderr," cannot allocate lmf_str\n");
- return NULL;
- }
-
- m_fptr->tmp_buf_max = 0;
-
- /* load the oid info */
- m_fptr->max_oid = 0;
- m_fptr->oid_seqs = 0;
- m_fptr->oid_list = (unsigned int *)calloc(1,sizeof(int));
- m_fptr->pref_db= -1;
-
- if (ldnaseq==SEQT_DNA) {
- m_fptr->getlib = ncbl2_getlibn_o;
- m_fptr->sascii = nascii;
- }
- else {
- m_fptr->getlib = ncbl2_getliba_o;
- m_fptr->sascii = aascii;
- }
- strncpy(m_fptr->lb_name,sname,MAX_FN);
- return m_fptr;
- }
- }
-
- /* open the index file */
- if ((ifile = fopen(tname,RBSTR))==NULL) {
- fprintf(stderr," cannot open %s (%s) INDEX file",tname,name);
- perror("...");
- return 0;
- }
- src_uint4_read(ifile,(unsigned *)&dbformat); /* get format DB version number */
- src_uint4_read(ifile,(unsigned *)&dbtype); /* get 1 for protein/0 DNA */
-
- if (dbformat != FORMATDBV3 && dbformat!=FORMATDBV4) {
- fprintf(stderr,"error - %s wrong formatdb version (%d/%d)\n",
- tname,dbformat,FORMATDBV3);
- return NULL;
- }
-
- if ((ldnaseq==SEQT_PROT && dbtype != AAFORMAT) ||
- (ldnaseq==SEQT_DNA && dbtype!=NTFORMAT)) {
- fprintf(stderr,"error - %s wrong format (%d/%d)\n",
- tname,dbtype,(ldnaseq ? NTFORMAT: AAFORMAT));
- return NULL;
- }
-
- /* the files are there - allocate lmf_str */
-
- if ((m_fptr=(struct lmf_str *)calloc(1,sizeof(struct lmf_str)))==NULL) {
- fprintf(stderr," cannot allocate lmf_str\n");
- return NULL;
- }
-
- m_fptr->tmp_buf_max = 4096;
- if ((m_fptr->tmp_buf=
- (char *)calloc(m_fptr->tmp_buf_max,sizeof(char)))==NULL) {
- fprintf(stderr," cannot allocate lmf_str->tmp_buffer\n");
- return NULL;
- }
-
- /* load the oid info */
- m_fptr->max_oid = max_oid;
- m_fptr->oid_seqs = oid_seqs;
- m_fptr->oid_list = oid_list;
- m_fptr->pref_db= pref_db;
-
- /* open the header file */
- if ((m_fptr->hfile = fopen(hname,RBSTR))==NULL) {
- fprintf(stderr," cannot open %s header file\n",hname);
- goto error_r;
- }
-
- /* ncbl2_ranlib is used for all BLAST2.0 access */
- m_fptr->ranlib = ncbl2_ranlib;
- m_fptr->bl_format_ver = dbformat;
-
- if (ldnaseq==SEQT_DNA) {
- if (oid_seqs > 0) {
- m_fptr->getlib = ncbl2_getlibn_o;
- }
- else {
- m_fptr->getlib = ncbl2_getlibn;
- }
- m_fptr->sascii = nascii;
- }
- else {
- if (oid_seqs > 0) {
- m_fptr->getlib = ncbl2_getliba_o;
- }
- else {
- m_fptr->getlib = ncbl2_getliba;
- }
- m_fptr->sascii = aascii;
- }
- strncpy(m_fptr->lb_name,sname,MAX_FN);
-
- /* open the sequence file */
-
-#if defined (USE_MMAP)
- m_fptr->mm_flg=((m_fptr->mmap_fd=open(sname,O_RDONLY))>=0);
- if (!m_fptr->mm_flg) {
- fprintf(stderr," cannot open %s",sname);
- perror("...");
- }
- else {
- if(fstat(m_fptr->mmap_fd, &statbuf) < 0) {
- fprintf(stderr," cannot fstat %s",sname);
- perror("...");
- m_fptr->mm_flg = 0;
- }
- else {
- m_fptr->st_size = statbuf.st_size;
- if((m_fptr->mmap_base =
- mmap(NULL, m_fptr->st_size, PROT_READ,
- MAP_FILE | MAP_SHARED, m_fptr->mmap_fd, 0)) == (char *) -1) {
- fprintf(stderr," cannot mmap %s",sname);
- perror("...");
- m_fptr->mm_flg = 0;
- }
- else {
- m_fptr->mmap_addr = m_fptr->mmap_base;
- m_fptr->mm_flg = 1;
- }
- }
- /* regardless, close the open()ed version */
- close(m_fptr->mmap_fd);
- }
-#else
- m_fptr->mm_flg = 0;
-#endif
-
- if (!m_fptr->mm_flg) {
- if ((m_fptr->libf = fopen(sname,RBSTR))==NULL) {
- fprintf(stderr," cannot open %s sequence file",sname);
- perror("...");
- goto error_r;
- }
- }
-
-/* all files should be open */
-
- src_uint4_read(ifile,(unsigned *)&title_len);
-
- if (title_len > 0) {
- if ((title_str = calloc((size_t)title_len+1,sizeof(char)))==NULL) {
- fprintf(stderr," cannot allocate title string (%d)\n",title_len);
- goto error_r;
- }
- fread(title_str,(size_t)1,(size_t)title_len,ifile);
- }
-
- src_uint4_read(ifile,(unsigned *)&date_len);
-
- if (date_len > 0) {
- if ((date_str = calloc((size_t)date_len+1,sizeof(char)))==NULL) {
- fprintf(stderr," cannot allocate date string (%d)\n",date_len);
- goto error_r;
- }
- fread(date_str,(size_t)1,(size_t)date_len,ifile);
- }
-
- m_fptr->lpos = 0;
- src_uint4_read(ifile,(unsigned *)&m_fptr->max_cnt);
-
- if (dbformat == FORMATDBV3) {
- src_long4_read(ifile,<mp);
- m_fptr->tot_len = ltmp;
- }
- else {
- ncbi_long8_read(ifile,&l8tmp);
- m_fptr->tot_len = ltmp;
- }
-
- src_long4_read(ifile,<mp);
- m_fptr->max_len = ltmp;
-
- /* currently we are not using this information, but perhaps later */
- if (title_str!=NULL) free(title_str);
- if (date_str!=NULL) free(date_str);
-
-#ifdef DEBUG
- fprintf(stderr,"%s format: BL2 (%s) max_cnt: %d, totlen: %lld, maxlen %ld\n",
- name,m_fptr->mm_flg ? "mmap" : "fopen",
- m_fptr->max_cnt,m_fptr->tot_len,m_fptr->max_len);
-#endif
-
- /* allocate and read hdr indexes */
- if ((f_pos_arr=(unsigned int *)calloc((size_t)m_fptr->max_cnt+1,sizeof(int)))==NULL) {
- fprintf(stderr," cannot allocate tmp header pointers\n");
- goto error_r;
- }
-
- if ((m_fptr->d_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) {
- fprintf(stderr," cannot allocate header pointers\n");
- goto error_r;
- }
-
- /* allocate and read sequence offsets */
- if ((m_fptr->s_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) {
- fprintf(stderr," cannot allocate sequence pointers\n");
- goto error_r;
- }
-
- /*
- for (i=0; i<=m_fptr->max_cnt; i++) src_uint4_read(ifile,&m_fptr->d_pos_arr[i]);
- for (i=0; i<=m_fptr->max_cnt; i++) src_uint4_read(ifile,&m_fptr->s_pos_arr[i]);
- */
- if (fread(f_pos_arr,(size_t)4,m_fptr->max_cnt+1,ifile)!=m_fptr->max_cnt+1) {
- fprintf(stderr," error reading hdr offsets: %s\n",tname);
- goto error_r;
- }
-
- for (i=0; i<=m_fptr->max_cnt; i++)
-#ifdef IS_BIG_ENDIAN
- m_fptr->d_pos_arr[i] = f_pos_arr[i];
-#else
- m_fptr->d_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]);
-#endif
-
- if (fread(f_pos_arr,(size_t)4,m_fptr->max_cnt+1,ifile)!=m_fptr->max_cnt+1) {
- fprintf(stderr," error reading seq offsets: %s\n",tname);
- goto error_r;
- }
- for (i=0; i<=m_fptr->max_cnt; i++) {
-#ifdef IS_BIG_ENDIAN
- m_fptr->s_pos_arr[i] = f_pos_arr[i];
-#else
- m_fptr->s_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]);
-#endif
- }
-
- if (dbtype == NTFORMAT) {
- /* allocate and ambiguity offsets */
- if ((m_fptr->a_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) {
- fprintf(stderr," cannot allocate sequence pointers\n");
- goto error_r;
- }
-
- /*
- for (i=0; i<=m_fptr->max_cnt; i++) src_uint4_read(ifile,&m_fptr->a_pos_arr[i]);
- */
-
- if (fread(f_pos_arr,(size_t)4,m_fptr->max_cnt+1,ifile)!=m_fptr->max_cnt+1) {
- fprintf(stderr," error reading seq offsets: %s\n",tname);
- goto error_r;
- }
- for (i=0; i<=m_fptr->max_cnt; i++) {
-#ifdef IS_BIG_ENDIAN
- m_fptr->a_pos_arr[i] = f_pos_arr[i];
-#else
- m_fptr->a_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]);
-#endif
- }
- }
-
- /*
- for (i=0; i < min(m_fptr->max_cnt,10); i++) {
- fprintf(stderr,"%d: %d %d %d\n",i,m_fptr->s_pos_arr[i],m_fptr->a_pos_arr[i],m_fptr->d_pos_arr[i]);
- }
- */
-
- /* all done with ifile, close it */
- fclose(ifile);
-
- free(f_pos_arr);
-
- if (!m_fptr->mm_flg) {
- tmp = fgetc(m_fptr->libf);
- if (tmp!=NULLB)
- fprintf(stderr," phase error: %d:%d found\n",0,tmp);
- }
-
- m_fptr->bl_lib_pos = 1;
- amb_cnt = 0;
- return m_fptr;
-
- error_r:
- /* here if failure after m_fptr allocated */
- free(m_fptr);
- return NULL;
-}
-
-void ncbl2_closelib(struct lmf_str *m_fptr)
-{
- if (m_fptr->tmp_buf != NULL) {
- free(m_fptr->tmp_buf);
- m_fptr->tmp_buf_max = 0;
- }
-
- if (m_fptr->s_pos_arr !=NULL) {
- free(m_fptr->s_pos_arr);
- m_fptr->s_pos_arr = NULL;
- }
- if (m_fptr->a_pos_arr!=NULL) {
- free(m_fptr->a_pos_arr);
- m_fptr->a_pos_arr = NULL;
- }
-
- if (m_fptr->hfile !=NULL ) {
- fclose(m_fptr->hfile); m_fptr->hfile=NULL;
- free(m_fptr->d_pos_arr); m_fptr->d_pos_arr = NULL;
- }
-
- if (m_fptr->oid_list != NULL) {
- free(m_fptr->oid_list); m_fptr->oid_list = NULL;
- m_fptr->oid_seqs = m_fptr->max_oid = 0;
- }
-
-#ifdef use_mmap
- if (m_fptr->mm_flg) {
- munmap(m_fptr->mmap_base,m_fptr->st_size);
- m_fptr->mmap_fd = -1;
- }
- else
-#endif
- if (m_fptr->libf !=NULL ) {fclose(m_fptr->libf); m_fptr->libf=NULL;}
-}
-
-int
-ncbl2_getliba_o(unsigned char *seq,
- int maxs,
- char *libstr,
- int n_libstr,
- fseek_t *libpos,
- int *lcont,
- struct lmf_str *m_fd,
- long *l_off)
-{
- int tpos;
- unsigned int t_mask, t_shift, oid_mask;
-
- /* get to the next valid pointer */
-
- for ( tpos = m_fd->lpos ;tpos <= m_fd->max_oid; tpos++) {
- t_mask = tpos / 32;
- t_shift = 31 - (tpos % 32);
- if ((oid_mask = m_fd->oid_list[t_mask])==0) { continue; }
-
- if ((bl2_uint4_cvt(oid_mask) & 0x1 << t_shift)) {
- if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[tpos],0);
- m_fd->lpos = tpos; /* already bumped up */
- m_fd->bl_lib_pos = m_fd->s_pos_arr[tpos];
- return ncbl2_getliba(seq, maxs, libstr, n_libstr,
- libpos, lcont, m_fd, l_off);
- }
- }
- return -1;
-}
-
-int
-ncbl2_getliba(unsigned char *seq,
- int maxs,
- char *libstr,
- int n_libstr,
- fseek_t *libpos,
- int *lcont,
- struct lmf_str *m_fd,
- long *l_off)
-{
- unsigned char *sptr, *dptr;
- int s_chunk, d_len, lib_cnt;
- long seqcnt;
- long tmp;
- static long seq_len;
-#if defined(DEBUG) || defined(PCOMPLIB)
- int gi, my_db, taxid;
- char acc[20], title[21], name[20];
-#endif
-
- *l_off = 1;
-
- lib_cnt = m_fd->lpos;
- *libpos = (fseek_t)m_fd->lpos;
-
- if (*lcont==0) {
- if (lib_cnt >= m_fd->max_cnt) return -1; /* no more sequences */
- seq_len = m_fd->s_pos_arr[lib_cnt+1] - m_fd->s_pos_arr[lib_cnt]; /* value is +1 off to get the NULL */
- if (m_fd->mm_flg) m_fd->mmap_addr = m_fd->mmap_base+m_fd->s_pos_arr[lib_cnt];
-#if !defined(DEBUG) && !defined(PCOMPLIB)
- libstr[0]='\0';
-#else
- /* get the name from the header file */
- fseek(m_fd->hfile,m_fd->d_pos_arr[lib_cnt],0);
-
- if (m_fd->bl_format_ver == FORMATDBV3) {
- d_len = min(n_libstr-1,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1);
- fread(libstr,(size_t)1,(size_t)d_len,m_fd->hfile);
- libstr[d_len]='\0';
- }
- else {
- d_len = min(m_fd->tmp_buf_max,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1);
- fread(m_fd->tmp_buf,(size_t)1,(size_t)d_len,m_fd->hfile);
- parse_fastadl_asn((unsigned char *)m_fd->tmp_buf, (unsigned char *)m_fd->tmp_buf+d_len,
- &gi, &my_db, acc, name, title, 20, &taxid);
- sprintf(libstr,"gi|%d",gi);
- }
-#endif
- }
- if (seq_len <= maxs) { /* sequence fits */
- seqcnt = seq_len;
- m_fd->lpos++;
- *lcont = 0;
- }
- else { /* doesn't fit */
- seqcnt = maxs-1;
- (*lcont)++;
- }
-
- if (m_fd->mm_flg) sptr = (unsigned char *)m_fd->mmap_addr;
- else {
- if ((tmp=fread(seq,(size_t)1,(size_t)seq_len,m_fd->libf))!=(size_t)seq_len) {
- fprintf(stderr," could not read sequence record: %ld %ld != %ld\n",
- *libpos,tmp,seq_len);
- goto error;
- }
- sptr = seq;
- }
- if (seq_len <= maxs) {seqcnt = --seq_len;}
-
- /* everything is ready, set up dst. pointer, seq_len */
- dptr = seq;
-
- if (aa_b2toa[sptr[seq_len-1]]=='*') seq_len--;
- s_chunk = seqcnt/16;
- while (s_chunk-- > 0) {
- *dptr++ = aa_btof[*sptr++];
- *dptr++ = aa_btof[*sptr++];
- *dptr++ = aa_btof[*sptr++];
- *dptr++ = aa_btof[*sptr++];
- *dptr++ = aa_btof[*sptr++];
- *dptr++ = aa_btof[*sptr++];
- *dptr++ = aa_btof[*sptr++];
- *dptr++ = aa_btof[*sptr++];
- *dptr++ = aa_btof[*sptr++];
- *dptr++ = aa_btof[*sptr++];
- *dptr++ = aa_btof[*sptr++];
- *dptr++ = aa_btof[*sptr++];
- *dptr++ = aa_btof[*sptr++];
- *dptr++ = aa_btof[*sptr++];
- *dptr++ = aa_btof[*sptr++];
- *dptr++ = aa_btof[*sptr++];
- }
- while (dptr < seq+seqcnt) *dptr++ = aa_btof[*sptr++];
-
- if (m_fd->mm_flg) m_fd->mmap_addr = (char *)sptr;
-
- /* we didn't get it all, so reset for more */
- if (*lcont) seq_len -= seqcnt;
-
- seq[seqcnt]= EOSEQ;
- return (seqcnt);
-
-error: fprintf(stderr," error reading %s at %ld\n",libstr,*libpos);
- fflush(stderr);
- return (-1);
-}
-
-int
-ncbl2_getlibn_o(unsigned char *seq,
- int maxs,
- char *libstr,
- int n_libstr,
- fseek_t *libpos,
- int *lcont,
- struct lmf_str *m_fd,
- long *l_off)
-{
- int tpos;
- unsigned int t_mask, t_shift, oid_mask;
-
- /* get to the next valid pointer */
-
- for (tpos = m_fd->lpos; tpos <= m_fd->max_oid; tpos++) {
- t_mask = tpos / 32;
- t_shift = 31 - (tpos % 32);
- if ((oid_mask = m_fd->oid_list[t_mask])==0) { continue; }
-
- if ((bl2_uint4_cvt(oid_mask) & 0x1 << t_shift)) {
- if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[tpos],0);
- m_fd->lpos = tpos; /* already bumped up */
- m_fd->bl_lib_pos = m_fd->s_pos_arr[tpos];
- return ncbl2_getlibn(seq, maxs, libstr, n_libstr,
- libpos, lcont, m_fd, l_off);
- }
- }
- return -1;
-}
-
-static char tmp_amb[4096];
-
-int
-ncbl2_getlibn(unsigned char *seq,
- int maxs,
- char *libstr,
- int n_libstr,
- fseek_t *libpos,
- int *lcont,
- struct lmf_str *m_fd,
- long *l_off)
-{
- unsigned char *sptr, *tptr, stmp;
- long seqcnt;
- int s_chunk, lib_cnt;
- size_t tmp;
- char ch;
- static long seq_len;
- static int c_len,c_pad;
- int c_len_set, d_len;
-
- *l_off = 1;
-
- lib_cnt = m_fd->lpos;
- *libpos = (fseek_t)lib_cnt;
- if (*lcont==0) { /* not a continuation of previous */
- if (lib_cnt >= m_fd->max_cnt) return (-1);
- c_len = m_fd->a_pos_arr[lib_cnt]- m_fd->s_pos_arr[lib_cnt];
- if (!m_fd->mm_flg) {
- if (m_fd->bl_lib_pos != m_fd->s_pos_arr[lib_cnt]) { /* are we positioned to read? */
- amb_cnt++;
- if ((m_fd->bl_lib_pos - m_fd->s_pos_arr[lib_cnt]) < sizeof(tmp_amb)) {
- /* jump over amb_ray */
- fread(tmp_amb,(size_t)1,(size_t)(m_fd->s_pos_arr[lib_cnt]-m_fd->bl_lib_pos),m_fd->libf);
- }
- else { /* fseek over amb_ray */
- fseek(m_fd->libf,m_fd->s_pos_arr[lib_cnt],0);
- }
- m_fd->bl_lib_pos = m_fd->s_pos_arr[lib_cnt];
- }
- }
- else m_fd->mmap_addr = m_fd->mmap_base + m_fd->s_pos_arr[lib_cnt];
-#if !defined(DEBUG) && !defined(PCOMPLIB)
- libstr[0]='\0';
-#else
- /* get the name from the header file */
- fseek(m_fd->hfile,m_fd->d_pos_arr[lib_cnt],0);
-
- d_len = min(n_libstr-1,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1);
- fread(libstr,(size_t)1,(size_t)d_len,m_fd->hfile);
- libstr[d_len]='\0';
-#endif
- } /* end of *lcont==0 */
-
- /* To avoid the situation where c_len <= 1; we must anticipate what
- c_len will be after this pass. If it will be <= 64, back off this
- time so next time it will be > 64 */
-
- seq_len = c_len*4;
-
- if ((seq_len+4 > maxs) && (seq_len+4 - maxs <= 256)) {
- /* we won't be done but we will have less than 256 to go */
- c_len -= 64; seq_len -= 256; c_len_set = 1; maxs -= 256;}
- else c_len_set = 0;
-
- /*
- fprintf(stderr," lib_cnt: %d %d %d %d\n",lib_cnt,c_len,seq_len,maxs);
- */
-
- /* does the rest of the sequence fit? */
- if (seq_len <= maxs-4 && !c_len_set) {
- seqcnt = c_len;
- if (!m_fd->mm_flg) {
- if ((tmp=fread(seq,(size_t)1,(size_t)seqcnt,m_fd->libf))!=(size_t)seqcnt) {
- fprintf(stderr,
- " could not read sequence record: %s %lld %ld != %ld: %d\n",
- libstr,*libpos,tmp,seqcnt,*seq);
- goto error;
- }
- m_fd->bl_lib_pos += tmp;
- sptr = seq + seqcnt;
- }
- else sptr = (unsigned char *)(m_fd->mmap_addr+seqcnt);
-
- *lcont = 0; /* this is the last chunk */
- lib_cnt++; /* increment to the next sequence */
- /* the last byte is either '0' (no remainder) or the last 1-3 chars and the remainder */
- c_pad = *(sptr-1);
- c_pad &= 0x3; /* get the last (low) 2 bits */
- seq_len -= (4 - c_pad); /* if the last 2 bits are 0, its a NULL byte */
- }
- else { /* get the next chunk, but more to come */
- seqcnt = ((maxs+3)/4)-1;
- if (!m_fd->mm_flg) {
- if ((tmp=fread(seq,(size_t)1,(size_t)(seqcnt),m_fd->libf))!=(size_t)(seqcnt)) {
- fprintf(stderr," could not read sequence record: %lld %ld/%ld\n",
- *libpos,tmp,seqcnt);
- goto error;
- }
- m_fd->bl_lib_pos += tmp;
- sptr = seq + seqcnt;
- }
- else {
- sptr = (unsigned char *)(m_fd->mmap_addr+seqcnt);
- m_fd->mmap_addr += seqcnt;
- }
- seq_len = 4*seqcnt;
- c_len -= seqcnt;
- if (c_len_set) {c_len += 64; maxs += 256;}
- (*lcont)++;
-/* hopefully we don't need this because of c_len -= 64. */
-/*
- if (c_len == 1) {
-#if !defined (USE_MMAP)
- c_pad = fgetc(m_fd->libf);
- *sptr=c_pad;
-#else
- c_pad = *m_fd->mmap_addr++;
- sptr = m_fd->mmap_addr;
-#endif
- c_pad &= 0x3;
- seq_len += c_pad;
- seqcnt++;
- lib_cnt++;
- *lcont = 0;
- }
-*/
- }
-
- /* point to the last packed byte and to the end of the array
- seqcnt is the exact number of bytes read
- tptr points to the destination, use multiple of 4 to simplify math
- sptr points to the source, note that the last byte will be read 4 cycles
- before it is written
- */
-
- tptr = seq + 4*seqcnt;
- s_chunk = seqcnt/8;
- while (s_chunk-- > 0) {
- stmp = *--sptr;
- *--tptr = (stmp&3) +1;
- *--tptr = ((stmp >>= 2)&3)+1;
- *--tptr = ((stmp >>= 2)&3)+1;
- *--tptr = ((stmp >>= 2)&3)+1;
- stmp = *--sptr;
- *--tptr = (stmp&3) +1;
- *--tptr = ((stmp >>= 2)&3)+1;
- *--tptr = ((stmp >>= 2)&3)+1;
- *--tptr = ((stmp >>= 2)&3)+1;
- stmp = *--sptr;
- *--tptr = (stmp&3) +1;
- *--tptr = ((stmp >>= 2)&3)+1;
- *--tptr = ((stmp >>= 2)&3)+1;
- *--tptr = ((stmp >>= 2)&3)+1;
- stmp = *--sptr;
- *--tptr = (stmp&3) +1;
- *--tptr = ((stmp >>= 2)&3)+1;
- *--tptr = ((stmp >>= 2)&3)+1;
- *--tptr = ((stmp >>= 2)&3)+1;
- stmp = *--sptr;
- *--tptr = (stmp&3) +1;
- *--tptr = ((stmp >>= 2)&3)+1;
- *--tptr = ((stmp >>= 2)&3)+1;
- *--tptr = ((stmp >>= 2)&3)+1;
- stmp = *--sptr;
- *--tptr = (stmp&3) +1;
- *--tptr = ((stmp >>= 2)&3)+1;
- *--tptr = ((stmp >>= 2)&3)+1;
- *--tptr = ((stmp >>= 2)&3)+1;
- stmp = *--sptr;
- *--tptr = (stmp&3) +1;
- *--tptr = ((stmp >>= 2)&3)+1;
- *--tptr = ((stmp >>= 2)&3)+1;
- *--tptr = ((stmp >>= 2)&3)+1;
- stmp = *--sptr;
- *--tptr = (stmp&3) +1;
- *--tptr = ((stmp >>= 2)&3)+1;
- *--tptr = ((stmp >>= 2)&3)+1;
- *--tptr = ((stmp >>= 2)&3)+1;
- }
- while (tptr>seq) {
- stmp = *--sptr;
- *--tptr = (stmp&3) +1;
- *--tptr = ((stmp >>= 2)&3)+1;
- *--tptr = ((stmp >>= 2)&3)+1;
- *--tptr = ((stmp >>= 2)&3)+1;
- }
- /*
- for (sptr=seq; sptr < seq+seq_len; sptr++) {
- printf("%c",nt[*sptr]);
- if ((int)(sptr-seq) % 60 == 59) printf("\n");
- }
- printf("\n");
- */
-
- m_fd->lpos = lib_cnt;
- if (seqcnt*4 >= seq_len) { /* there was enough room */
- seq[seq_len]= EOSEQ;
- /* printf("%d\n",seq_len); */
- return seq_len;
- }
- else { /* not enough room */
- seq[seqcnt*4]=EOSEQ;
- seq_len -= 4*seqcnt;
- return (4*seqcnt);
- }
-
-error: fprintf(stderr," error reading %s at %ld\n",libstr,*libpos);
- fflush(stderr);
- return (-1);
-}
-
- /* 0 1 2 3 4 5 6 7
- 8 9 10 11 12 13 14 15
- 16 17 */
-static char
-*db_type_arr[] = {"lcl","gib","gim","gii","gb","emb","pir","sp",
- "pat","ref","gnl","gi","dbj","prf","pdb","tpg",
- "tpe","tpd"};
-
-void
-ncbl2_ranlib(char *str,
- int cnt,
- fseek_t libpos,
- char *libstr,
- struct lmf_str *m_fd)
-{
- int llen, lib_cnt;
- char *bp;
- unsigned char *my_buff=NULL;
- char descr[2048];
- unsigned char *abp;
- int gi, taxid;
- int my_db;
- char db[5], acc[20], name[20];
- char title[1024];
- int have_my_buff=0;
- int have_descr = 0;
-
- lib_cnt = (int)libpos;
- llen = m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt];
-
- fseek(m_fd->hfile,m_fd->d_pos_arr[libpos],0);
-
- if (m_fd->bl_format_ver == FORMATDBV3) {
- if (llen >= cnt) llen = cnt-1;
- fread(str,(size_t)1,(size_t)(llen),m_fd->hfile);
- }
- else {
- if (llen >= m_fd->tmp_buf_max) {
- if ((my_buff=(unsigned char *)calloc(llen,sizeof(char)))==NULL) {
- fprintf(stderr," cannot allocate ASN.1 buffer: %d\n",llen);
- my_buff = (unsigned char *)m_fd->tmp_buf;
- llen = m_fd->tmp_buf_max;
- }
- else have_my_buff = 1;
- }
- else {
- my_buff = (unsigned char *)m_fd->tmp_buf;
- }
- abp = my_buff;
- fread(my_buff,(size_t)1,llen,m_fd->hfile);
-
- do {
- abp = parse_fastadl_asn(abp, my_buff+llen,
- &gi, &my_db, acc, name,
- title, sizeof(title), &taxid);
-
- if (gi > 0) {
- sprintf(descr,"gi|%d|%s|%s|%s ",gi,db_type_arr[my_db],acc,name);
- }
- else {
- if (acc[0] != '\0') sprintf(descr,"%s ",acc);
- else descr[0] = '\0';
- if (name[0] != '\0' && strcmp(name,"BL_ORD_ID")!=0) sprintf(descr+strlen(descr),"%s ", name);
- }
- if (m_fd->pref_db < 0) {
- if (!have_descr) {
- strncpy(str,descr,cnt-1);
- have_descr = 1;
- }
- else {
- strncat(str,"\001",cnt-strlen(str)-1);
- strncat(str,descr,cnt-strlen(str)-1);
- }
- strncat(str,title,cnt-strlen(str)-1);
- if (strlen(str) >= cnt-1) break;
- }
- else if (m_fd->pref_db == my_db) {
- have_descr = 1;
- strncpy(str,descr,cnt-1);
- strncat(str,title,cnt-strlen(str)-1);
- break;
- }
- } while (abp);
-
- if (!have_descr) {
- strncpy(str,descr,cnt-1);
- strncat(str,descr,cnt-strlen(str)-1);
- }
-
- if (have_my_buff) free(my_buff);
- }
-
- str[cnt-1]='\0';
-
- bp = str;
- while((bp=strchr(bp,'\001'))!=NULL) {*bp++=' ';}
-
- if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[libpos],0);
-
- m_fd->lpos = lib_cnt;
- m_fd->bl_lib_pos = m_fd->s_pos_arr[lib_cnt];
-}
-
-unsigned int bl2_uint4_cvt(unsigned int val)
-{
- unsigned int res;
-#ifdef IS_BIG_ENDIAN
- return val;
-#else /* it better be LITTLE_ENDIAN */
- res = ((val&255)*256)+ ((val>>8)&255);
- res = (res<<16) + (((val>>16)&255)*256) + ((val>>24)&255);
- return res;
-#endif
-}
-
-unsigned int bl2_long4_cvt(long val)
-{
- int val4;
- unsigned int res;
-#ifdef IS_BIG_ENDIAN
- val4 = val;
- return val4;
-#else /* it better be LITTLE_ENDIAN */
- res = ((val&255)*256)+ ((val>>8)&255);
- res = (res<<16) + (((val>>16)&255)*256) + ((val>>24)&255);
- return res;
-#endif
-}
-
-int64_t bl2_long8_cvt(int64_t val)
-{
- int64_t res;
-#ifdef IS_BIG_ENDIAN
- return val;
-#else /* it better be LITTLE_ENDIAN */
- res = ((val&255)*256)+ ((val>>8)&255);
- res = (res<<16) + (((val>>16)&255)*256) + ((val>>24)&255);
-#ifdef BIG_LIB64
- res = (res<<16) + (((val>>32)&255)*256) + ((val>>40)&255);
- res = (res<<16) + (((val>>48)&255)*256) + ((val>>56)&255);
-#else
- fprintf(stderr,"Cannot use bl2_long8_cvt without 64-bit longs\n");
- exit(1);
-#endif
- return res;
-#endif
-}
-
-void src_int4_read(FILE *fd, int *val)
-{
-#ifdef IS_BIG_ENDIAN
- fread((char *)val,(size_t)4,(size_t)1,fd);
-#else
- unsigned char b[4];
-
- fread((char *)&b[0],(size_t)1,(size_t)4,fd);
- *val = 0;
- *val = (int)((int)((int)(b[0]<<8)+(int)b[1]<<8)+(int)b[2]<<8)
- +(int)b[3];
-#endif
-}
-
-void src_long4_read(FILE *fd, long *valp)
-{
- int val4;
-#ifdef IS_BIG_ENDIAN
- fread(&val4,(size_t)4,(size_t)1,fd);
- *valp = val4;
-#else
- unsigned char b[4];
-
- fread((char *)&b[0],(size_t)1,(size_t)4,fd);
- val4 = 0;
- val4 = (int)((int)((int)(b[0]<<8)+(int)b[1]<<8)+(int)b[2]<<8)
- +(int)b[3];
- *valp = val4;
-#endif
-}
-
-void src_uint4_read(FILE *fd, unsigned int *valp)
-{
-#ifdef IS_BIG_ENDIAN
- fread(valp,(size_t)4,(size_t)1,fd);
-#else
- unsigned char b[4];
-
- fread((char *)&b[0],(size_t)1,(size_t)4,fd);
- *valp = 0;
- *valp = (unsigned int)((int)((int)(b[0]<<8)+(int)b[1]<<8)+(int)b[2]<<8)
- +(int)b[3];
-#endif
-}
-
-void src_long8_read(FILE *fd, long *val)
-{
-#ifdef IS_BIG_ENDIAN
- fread((void *)val,(size_t)8,(size_t)1,fd);
-#else
- unsigned char b[8];
-
- fread((char *)&b[0],(size_t)1,(size_t)8,fd);
- *val = 0;
- *val = (long)((((((long)((long)(b[0]<<8)+(long)b[1]<<8)+(long)b[2]<<8)
- +(long)b[3]<<8)+(long)b[4]<<8)+(long)b[5]<<8)
- +(long)b[6]<<8)+(long)b[7];
-#endif
-}
-
-void ncbi_long8_read(FILE *fd, int64_t *val)
-{
- unsigned char b[8];
-
- fread((char *)&b[0],(size_t)1,(size_t)8,fd);
- *val = 0;
- *val = (long)((((((long)((long)(b[7]<<8)+(long)b[6]<<8)+(long)b[5]<<8)
- +(long)b[4]<<8)+(long)b[3]<<8)+(long)b[2]<<8)
- +(long)b[1]<<8)+(long)b[0];
-}
-
-void src_char_read(FILE *fd, char *val)
-{
- fread(val,(size_t)1,(size_t)1,fd);
-}
-
-void src_fstr_read(FILE *fd, char *val, int slen)
-{
- fread(val,(size_t)slen,(size_t)1,fd);
-}
-
-void
-newname(char *nname, char *oname, char *suff, int maxn)
-{
- strncpy(nname,oname,maxn-1);
- strncat(nname,".",1);
- strncat(nname,suff,maxn-strlen(nname));
-}
-
-#define ASN_SEQ 0x30
-#define ASN_IS_BOOL 1
-#define ASN_IS_INT 2
-#define ASN_IS_STR 26
-
-unsigned char *
-get_asn_int(unsigned char *abp, int *val) {
-
- int v_len, v;
-
- v = 0;
- if (*abp++ != ASN_IS_INT) { /* check for int */
- fprintf(stderr," int missing\n");
- }
- else {
- v_len = *abp++;
- while (v_len-- > 0) {
- v *= 256;
- v += *abp++;
- }
- abp += 2; /* skip over null's */
- }
- *val = v;
- return abp;
-}
-
-unsigned char *
-get_asn_text(unsigned char *abp, char *text, int t_len) {
- int tch, at_len;
-
- text[0] = '\0';
- if (*abp++ != ASN_IS_STR) { /* check for str */
- fprintf(stderr," str missing\n");
- }
- else {
- if ((tch = *abp++) > 128) { /* string length is in next bytes */
- tch &= 0x7f; /* get number of bytes for len */
- at_len = 0;
- while (tch-- > 0) { at_len = (at_len << 8) + *abp++;}
- }
- else {
- at_len = tch;
- }
-
- if ( at_len < t_len-1) {
- memcpy(text, abp, at_len);
- text[at_len] = '\0';
- }
- else {
- memcpy(text, abp, t_len-1);
- text[t_len-1] = '\0';
- }
- abp += at_len + 2;
- }
- return abp;
-}
-
-/* something to try to skip over stuff we don't want */
-unsigned char *
-get_asn_junk(unsigned char *abp) {
-
- int seq_cnt = 0;
- int tmp;
- char string[256];
-
- while (*abp) {
- if ( *abp == ASN_SEQ) { abp += 2; seq_cnt++;}
- else if ( *abp == ASN_IS_BOOL ) {abp = get_asn_int(abp, &tmp);}
- else if ( *abp == ASN_IS_INT ) {abp = get_asn_int(abp, &tmp);}
- else if ( *abp == ASN_IS_STR ) {abp = get_asn_text(abp, string, sizeof(string)-1);}
- }
-
- while (seq_cnt-- > 0) abp += 2;
- return abp;
-}
-
-unsigned char *
-get_asn_textseq_id(unsigned char *abp,
- char *name, char *acc)
-{
- char release[20], ver_str[10];
- int version;
- int seqcnt = 0;
-
- ver_str[0]='\0';
-
- if (*abp == ASN_SEQ) { abp += 2; seqcnt++;}
-
- while (*abp) {
- switch (*abp) {
- case 0xa0:
- abp = get_asn_text(abp+2, name, 20);
- break;
- case 0xa1:
- abp = get_asn_text(abp+2, acc, 20);
- break;
- case 0xa2:
- abp = get_asn_text(abp+2, release, sizeof(release));
- break;
- case 0xa3:
- abp = get_asn_int(abp+2, &version);
- sprintf(ver_str,".%d",version);
- break;
- default: abp += 2;
- }
- }
- while (seqcnt-- > 0) abp += 4;
- strncat(acc,ver_str,20-strlen(acc));
- acc[19]='\0';
- return abp; /* skip 2 NULL's */
-}
-
-unsigned char *
-get_asn_local_id(unsigned char *abp, char *acc)
-{
- int seqcnt = 0;
-
- if (*abp == ASN_SEQ) { abp += 2; seqcnt++;}
-
- abp = get_asn_text(abp+2, acc, 20);
-
- while (seqcnt-- > 0) abp += 4;
- acc[19]='\0';
- return abp; /* skip 2 NULL's */
-}
-
-unsigned char *
-get_asn_dbtag(unsigned char *abp, char *name, char *str, int *id_p) {
-
- if (*abp == ASN_SEQ) { abp += 2;}
-
- if (*abp == 0xa0) { /* get db */
- abp = get_asn_text(abp+2, name, 20);
- }
- else {
- fprintf(stderr," missing dbtag:db %d %d\n",abp[0],abp[1]);
- abp += 2;
- }
-
- if (*abp == 0xa1) { /* get tag */
- abp += 2;
- abp += 2; /* skip over id */
- if (*abp == 2) abp = get_asn_int(abp,id_p);
- else abp = get_asn_text(abp+2, str, 20);
- }
- else {
- fprintf(stderr," missing dbtag:tag %2x %2x\n",abp[0],abp[1]);
- abp += 2;
- }
- return abp+2; /* skip 2 NULL's */
-}
-
-unsigned char *
-get_asn_pdb_id(unsigned char *abp, char *acc, char *chain)
-{
- int ichain, seq_cnt=0;
-
- if (*abp == ASN_SEQ) { abp += 2; seq_cnt++;}
-
- while (*abp) {
- switch (*abp) {
- case 0: abp += 2; break;
- case 0xa0: /* mol-id */
- abp = get_asn_text(abp+2, acc, 20);
- break;
- case 0xa1:
- abp = get_asn_int(abp+2, &ichain);
- chain[0] = ichain;
- chain[1] = '\0';
- break;
- case 0xa2: /* ignore date - scan until NULL's */
- while (*abp++) {}
- abp += 2; /* skip the NULL's */
- break;
- default: abp+=2;
- }
- }
- while (seq_cnt-- > 0) {abp += 4;}
- return abp;
-}
-
-#define ASN_TYPE_MASK 31
-
-unsigned char
-*get_asn_seqid(unsigned char *abp,
- int *gi_p, int *db, char *acc, char *name) {
-
- int db_type, itmp, seq_cnt=0;
-
- *gi_p = 0;
-
- if (*abp != ASN_SEQ) {
- fprintf(stderr, "seqid - missing SEQ 1: %2x %2x\n",abp[0], abp[1]);
- return abp;
- }
- else { abp += 2; seq_cnt++;}
-
- db_type = (*abp & ASN_TYPE_MASK);
-
- if (db_type == 11) { /* gi */
- abp = get_asn_int(abp+2,gi_p);
- }
-
- while (*abp == ASN_SEQ) {abp += 2; seq_cnt++;}
-
- db_type = (*abp & ASN_TYPE_MASK);
- if (db_type > 17) {db_type = 0;}
- *db = db_type;
-
- switch(db_type) {
- case 0:
- abp = get_asn_local_id(abp+2, acc);
- break;
- case 1:
- case 2:
- abp = get_asn_int(abp+2,&itmp);
- abp += 2;
- break;
- case 11:
- abp = get_asn_int(abp+2,&itmp);
- break;
- case 4:
- case 5:
- case 6:
- case 7:
- case 9:
- case 12:
- case 13:
- case 15:
- case 16:
- case 17:
- abp = get_asn_textseq_id(abp+2,name,acc);
- break;
- case 10:
- abp = get_asn_dbtag(abp+2,name,acc,&itmp);
- case 14:
- abp = get_asn_pdb_id(abp+2,acc,name);
- break;
- default: abp += 2;
- }
-
- while (seq_cnt-- > 0) { abp += 4;}
- return abp; /* skip over 2 NULL's */
-}
-
-#define ASN_FADL_TITLE 0xa0
-#define ASN_FADL_SEQID 0xa1
-#define ASN_FADL_TAXID 0xa2
-#define ASN_FADL_MEMBERS 0xa3
-#define ASN_FADL_LINKS 0xa4
-#define ASN_FADL_OTHER 0xa5
-
-unsigned char *
-parse_fastadl_asn(unsigned char *asn_buff, unsigned char *asn_max,
- int *gi_p, int *db, char *acc,
- char *name, char *title, int t_len, int *taxid_p) {
- unsigned char *abp;
- char tmp_db[4], tmp_acc[32], tmp_name[32];
- int this_db;
- int seq_cnt = 0;
- int tmp_gi;
-
- acc[0] = name[0] = db[0] = title[0] = '\0';
-
- abp = asn_buff;
- while ( abp < asn_max && *abp) {
- if (*abp == ASN_SEQ) { abp += 2; seq_cnt++; }
- else if (*abp == ASN_FADL_TITLE) {
- abp = get_asn_text(abp+2, title, t_len);
- }
- else if (*abp == ASN_FADL_SEQID ) {
- abp = get_asn_seqid(abp+2, gi_p, db, acc, name);
- if (*db > 17) *db = 0;
- }
- else if (*abp == ASN_FADL_TAXID ) {
- abp = get_asn_int(abp+2, taxid_p);
- }
- else if (*abp == ASN_FADL_MEMBERS) {
- abp = get_asn_junk(abp+2);
- }
- else if (*abp == ASN_FADL_LINKS ) {
- abp = get_asn_junk(abp+2);
- }
- else if (*abp == ASN_FADL_OTHER ) {
- abp = get_asn_junk(abp+2);
- }
- else {
- /* fprintf(stderr, " Error - missing ASN.1 %2x:%2x:%2x:%2x\n",
- abp[-2],abp[-1],abp[0],abp[1]); */
- abp += 2;
- }
- }
- while (abp < asn_max && *abp == '\0' ) abp++;
- if (abp >= asn_max) return NULL;
- else return abp;
-}
-
-
-void
-parse_pal(char *dname, char *msk_name,
- int *oid_seqs, int *max_oid,
- FILE *fd) {
-
- char line[MAX_STR];
-
- while (fgets(line,sizeof(line),fd)) {
- if (line[0] == '#') continue;
-
- if (strncmp(line, "DBLIST", 6)==0) {
- sscanf(line+7,"%s",dname);
- }
- else if (strncmp(line, "OIDLIST", 7)==0) {
- sscanf(line+8,"%s",msk_name);
- }
- else if (strncmp(line, "NSEQ", 4)==0) {
- sscanf(line+5,"%d",oid_seqs);
- }
- else if (strncmp(line, "MAXOID", 6)==0) {
- sscanf(line+7,"%d",max_oid);
- }
- }
-}