X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=website%2Farchive%2Fbinaries%2Fmac%2Fsrc%2Ffasta34%2Fncbl2_mlib.c;fp=website%2Farchive%2Fbinaries%2Fmac%2Fsrc%2Ffasta34%2Fncbl2_mlib.c;h=4ec9f1ad3af4f665c12b446332ac41fa49863fe1;hb=dbde3fb6f00b9bb770343631a517c0e599db8528;hp=0000000000000000000000000000000000000000;hpb=85f830bbd51a7277994bd4233141016304e210c9;p=jabaws.git diff --git a/website/archive/binaries/mac/src/fasta34/ncbl2_mlib.c b/website/archive/binaries/mac/src/fasta34/ncbl2_mlib.c new file mode 100644 index 0000000..4ec9f1a --- /dev/null +++ b/website/archive/binaries/mac/src/fasta34/ncbl2_mlib.c @@ -0,0 +1,1613 @@ +/* ncbl2_lib.c functions to read ncbi-blast format files from + formatdb (blast2.0 format files) + + copyright (c) 1999 William R. Pearson +*/ + +/* $Name: fa_34_26_5 $ - $Id: ncbl2_mlib.c,v 1.56 2007/04/02 18:08:11 wrp Exp $ */ + +/* to turn on mmap()ing for Blast2 files: */ + +#include +#include +#include +#include +#include +#include +#ifdef UNIX +#include +#endif +#include + + +/* **************************************************************** + +17-May-2006 + +Modified to read NCBI .[np]al and .msk files. The .nal or .pal file +provides a way to read sequences from a list of files. The .msk file +provides a compact way of indicating the subset of sequences in a +larger database (typically nr or nt) that comprise a smaller database +(e.g. swissprot or pdbaa). A .pal file (e.g. swissprot.00.pal) that +uses a .msk file has the form: + + # Alias file generated by genmask + # Date created: Mon Apr 10 11:24:05 2006 + # + TITLE Non-redundant SwissProt sequences + DBLIST nr.00 + OIDLIST swissprot.00.msk + LENGTH 74351250 + NSEQ 198346 + MAXOID 2617347 + MEMB_BIT 1 + # end of the file + +To work with this file, we must first load the nr.00 file, and then +read the swissprot.00.msk file, and then scan all the entries in the +swissprot.00.msk file (which are packed 32 mask-bit to an int) to +determine whether a specific libpos index entry is present in the +subset database. + +**************************************************************** */ + + +/* **************************************************************** +This code reads NCBI Blast2 format databases from formatdb version 3 and 4 + +(From NCBI) This section describes the format of the databases. + +Formatdb creates three main files for proteins containing indices, +sequences, and headers with the extensions, respectively, of pin, psq, +and phr (for nucleotides these are nin, nsq, and nhr). A number of +other ISAM indices are created, but these are described elsewhere. + +FORMAT OF THE INDEX FILE +------------------------ + +1.) formatdb version number [4 bytes]. + +2.) protein dump flag (1 for a protein database, 0 for a nucleotide + database) [4 bytes]. + +3.) length of the database title in bytes [4 bytes]. +4.) the database title [length given in 3.)]. +5.) length of the date/time string [4 bytes]. +6.) the date/time string [length given in 5.)]. +7.) the number of sequences in the database [4 bytes]. +8.) the total length of the database in residues/basepairs [4 bytes]. +9.) the length of the longest sequence in the database [4 bytes]. + +10.) a list of the offsets for definitions (one for each sequence) in +the header file. There are num_of_seq+1 of these, where num_of_seq is +the number of sequences given in 7.). + +11.) a list of the offsets for sequences (one for each sequence) in +the sequence file. There are num_of_seq+1 of these, where num_of_seq +is the number of sequences given in 7.). + +12.) a list of the offsets for the ambiguity characters (one for each +sequence) in the sequence file. This list is only present for +nucleotide databases and, since the database is compressed 4/1 for +nucleotides, allows the ambiguity characters to be restored when the +sequence is generated. There are num_of_seq+1 of these, where +num_of_seq is the number of sequences given in 7.). + + +FORMAT OF THE SEQUENCE FILE +--------------------------- + +There are different formats for the protein and nucleotide sequence files. + +The protein sequence files is quite simple. The first byte in the +file is a NULL byte, followed by the sequence in ncbistdaa format +(described in the NCBI Software Development Toolkit documentation). +Following the sequence is another NULL byte, followed by the next +sequence. The file ends with a NULL byte, following the last +sequence. + +The nucleotide sequence file contains the nucleotide sequence, with +four basepairs compressed into one byte. The format used is NCBI2na, +documented in the NCBI Software Development Toolkit manual. Any +ambiguity characters present in the original sequence are replaced at +random by A, C, G or T. The true value of ambiguity characters are +stored at the end of each sequence to allow true reproduction of the +original sequence. + +FORMAT OF THE HEADER FILE (formatdb version 3) +------------------------- + +The format of the header file depends on whether or not the identifiers in the +original file were parsed or not. For the case that they were not, then each +entry has the format: + +gnl|BL_ORD_ID|entry_number my favorite yeast sequence... + +Here entry_number gives the ordinal number of the sequence in the +database (with zero offset). The identifier +gnl|BL_ORD_ID|entry_number is used by the BLAST software to identify +the entry, if the user has not provided another identifier. If the +identifier was parsed, then gnl|BL_ORD_ID|entry_number is replaced by +the correct identifier, as described in +ftp://ncbi.nlm.nih.gov/blast/db/README . + +There are no separators between these deflines. + +For formatdb version 4, the header file contains blast ASN.1 binary +deflines, which can parsed with parse_fastadl_asn(). + +FORMAT OF THE .MSK FILE +----------------------- + +The .msk file is simply a packed list of masks for formatdb "oids" for +some other file (typically nr). The first value is the last oid +available; the remainder are packed 32 oids/mask, so that the number +of masks is 1/32 the number of sequences in the file. + +**************************************************************** */ + +#ifdef USE_MMAP +#include +#include +#include +#ifdef IBM_AIX +#include +#else +#include +#endif +#endif + +#ifdef USE_MMAP +#ifndef MAP_FILE +#define MAP_FILE 0 +#endif +#endif + +#ifdef UNIX +#define RBSTR "r" +#else +#define RBSTR "rb" +#endif + +#ifdef WIN32 +#define SLASH_CHAR '\\' +#define SLASH_STR "\\" +#else +#define SLASH_CHAR '/' +#define SLASH_STR "/" +#endif + +#define XTERNAL +#include "uascii.h" + +#define XTERNAL +#include "upam.h" +#include "ncbl2_head.h" + +#include "defs.h" +#include "mm_file.h" + +unsigned int bl2_uint4_cvt(unsigned int); +unsigned int bl2_long4_cvt(long); +int64_t bl2_long8_cvt(int64_t); +void src_int4_read(FILE *fd, int *valp); +void src_uint4_read(FILE *fd, unsigned int *valp); +void src_long4_read(FILE *fd, long *valp); +void ncbi_long8_read(FILE *fd, int64_t *valp); +void src_char_read(FILE *fd, char *valp); +unsigned char *parse_fastadl_asn(unsigned char *asn_buff, unsigned char *asn_max, + int *gi_p, int *db, char *acc, char *name, + char *title, int t_len, int *taxid); + +/* nt_btoa maps from blast 2bit format to ascii characters */ +static char nt_btoa[5] = {"ACGT"}; + +static char aa_b2toa[27]= {"-ABCDEFGHIKLMNPQRSTVWXYZU*"}; + +static int aa_btof[32]; /* maps to fasta alphabet */ + +static int dbtype, dbformat, amb_cnt; + +#define NCBIBL20 12 + +int ncbl2_getliba(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *); +int ncbl2_getlibn(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *); + +int ncbl2_getliba_o(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *); +int ncbl2_getlibn_o(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *); + +void newname(char *, char *, char *, int); +void parse_pal(char *, char *, int *, int *, FILE *); + +void ncbl2_ranlib(char *, int, fseek_t, char *, struct lmf_str *m_fd); + +/* ncbl2_openlib() is used to open (and memory map) a BLAST2.0 format + file. Ifdef USE_MMAP, then ncbl2_openlib returns a structure that can + be used to read the database. */ + +struct lmf_str * +ncbl2_openlib(char *name, int ldnaseq) +{ + char lname[256]; + char dname[256]; + char msk_name[256]; + char hname[256]; + char sname[256]; + char tname[256]; + char db_dir[256]; + int pref_db= -1; + char *bp; + int title_len; + char *title_str=NULL; + int date_len; + char *date_str=NULL; + long ltmp; + int64_t l8tmp; + int oid_seqs, max_oid; + int oid_cnt, oid_len; + unsigned int *oid_list, o_max; + int tmp; + int i; +#ifdef USE_MMAP + struct stat statbuf; +#endif + FILE *ifile; /* index offsets, also DB info */ + unsigned int *f_pos_arr; + struct lmf_str *m_fptr; + + if (ldnaseq==SEQT_PROT) { /* read a protein database */ + newname(lname,name,AA_LIST_EXT,(int)sizeof(lname)); + newname(tname,name,AA_INDEX_EXT,(int)sizeof(tname)); + newname(hname,name,AA_HEADER_EXT,(int)sizeof(hname)); + newname(sname,name,AA_SEARCHSEQ_EXT,(int)sizeof(sname)); + + /* initialize map of BLAST2 amino acids to FASTA amino acids */ + for (i=0; i 0) { + + /* get the pref_db before adding the directory */ + if (strncmp(msk_name,"swissprot",9)==0) { + pref_db = 7; + } + else if (strncmp(msk_name,"pdbaa",5)==0) { + pref_db = 14; + } + + /* need to add directory to both dname and msk_name */ + strncpy(tname,db_dir,sizeof(tname)); + strncat(tname,msk_name, sizeof(tname)); + strncpy(msk_name, tname, sizeof(msk_name)); + + strncpy(tname,db_dir,sizeof(tname)); + strncat(tname,dname, sizeof(tname)); + strncpy(dname,tname,sizeof(dname)); + + if (ldnaseq == SEQT_PROT) { + newname(tname,dname,AA_INDEX_EXT,(int)sizeof(tname)); + newname(hname,dname,AA_HEADER_EXT,(int)sizeof(hname)); + newname(sname,dname,AA_SEARCHSEQ_EXT,(int)sizeof(sname)); + } + else { /* reading DNA library */ + newname(tname,dname,NT_INDEX_EXT,(int)sizeof(tname)); + newname(hname,dname,NT_HEADER_EXT,(int)sizeof(hname)); + newname(sname,dname,NT_SEARCHSEQ_EXT,(int)sizeof(sname)); + } + /* now load the oid file */ + if ((ifile = fopen(msk_name,RBSTR))==NULL) { + fprintf(stderr,"error - cannot load %s file\n",msk_name); + return NULL; + } + else { + src_uint4_read(ifile,&o_max); + if (o_max != max_oid) { + fprintf(stderr," error - oid count mismatch %d != %d\n",max_oid, o_max); + } + oid_len = (max_oid/32+1); + if ((oid_list=(unsigned int *)calloc(oid_len,sizeof(int)))==NULL) { + fprintf(stderr," error - cannot allocate oid_list[%d]\n",oid_len); + return NULL; + } + if ((oid_cnt=fread(oid_list,sizeof(int),oid_len,ifile))==0) { + fprintf(stderr," error - cannot read oid_list[%d]\n",oid_len); + return NULL; + } + fclose(ifile); + } + } + else { /* we had a .msk file, but there are no oid's in it. + allocate an m_fptr and return it empty */ + if ((m_fptr=(struct lmf_str *)calloc(1,sizeof(struct lmf_str)))==NULL) { + fprintf(stderr," cannot allocate lmf_str\n"); + return NULL; + } + + m_fptr->tmp_buf_max = 0; + + /* load the oid info */ + m_fptr->max_oid = 0; + m_fptr->oid_seqs = 0; + m_fptr->oid_list = (unsigned int *)calloc(1,sizeof(int)); + m_fptr->pref_db= -1; + + if (ldnaseq==SEQT_DNA) { + m_fptr->getlib = ncbl2_getlibn_o; + m_fptr->sascii = nascii; + } + else { + m_fptr->getlib = ncbl2_getliba_o; + m_fptr->sascii = aascii; + } + strncpy(m_fptr->lb_name,sname,MAX_FN); + return m_fptr; + } + } + + /* open the index file */ + if ((ifile = fopen(tname,RBSTR))==NULL) { + fprintf(stderr," cannot open %s (%s) INDEX file",tname,name); + perror("..."); + return 0; + } + src_uint4_read(ifile,(unsigned *)&dbformat); /* get format DB version number */ + src_uint4_read(ifile,(unsigned *)&dbtype); /* get 1 for protein/0 DNA */ + + if (dbformat != FORMATDBV3 && dbformat!=FORMATDBV4) { + fprintf(stderr,"error - %s wrong formatdb version (%d/%d)\n", + tname,dbformat,FORMATDBV3); + return NULL; + } + + if ((ldnaseq==SEQT_PROT && dbtype != AAFORMAT) || + (ldnaseq==SEQT_DNA && dbtype!=NTFORMAT)) { + fprintf(stderr,"error - %s wrong format (%d/%d)\n", + tname,dbtype,(ldnaseq ? NTFORMAT: AAFORMAT)); + return NULL; + } + + /* the files are there - allocate lmf_str */ + + if ((m_fptr=(struct lmf_str *)calloc(1,sizeof(struct lmf_str)))==NULL) { + fprintf(stderr," cannot allocate lmf_str\n"); + return NULL; + } + + m_fptr->tmp_buf_max = 4096; + if ((m_fptr->tmp_buf= + (char *)calloc(m_fptr->tmp_buf_max,sizeof(char)))==NULL) { + fprintf(stderr," cannot allocate lmf_str->tmp_buffer\n"); + return NULL; + } + + /* load the oid info */ + m_fptr->max_oid = max_oid; + m_fptr->oid_seqs = oid_seqs; + m_fptr->oid_list = oid_list; + m_fptr->pref_db= pref_db; + + /* open the header file */ + if ((m_fptr->hfile = fopen(hname,RBSTR))==NULL) { + fprintf(stderr," cannot open %s header file\n",hname); + goto error_r; + } + + /* ncbl2_ranlib is used for all BLAST2.0 access */ + m_fptr->ranlib = ncbl2_ranlib; + m_fptr->bl_format_ver = dbformat; + + if (ldnaseq==SEQT_DNA) { + if (oid_seqs > 0) { + m_fptr->getlib = ncbl2_getlibn_o; + } + else { + m_fptr->getlib = ncbl2_getlibn; + } + m_fptr->sascii = nascii; + } + else { + if (oid_seqs > 0) { + m_fptr->getlib = ncbl2_getliba_o; + } + else { + m_fptr->getlib = ncbl2_getliba; + } + m_fptr->sascii = aascii; + } + strncpy(m_fptr->lb_name,sname,MAX_FN); + + /* open the sequence file */ + +#if defined (USE_MMAP) + m_fptr->mm_flg=((m_fptr->mmap_fd=open(sname,O_RDONLY))>=0); + if (!m_fptr->mm_flg) { + fprintf(stderr," cannot open %s",sname); + perror("..."); + } + else { + if(fstat(m_fptr->mmap_fd, &statbuf) < 0) { + fprintf(stderr," cannot fstat %s",sname); + perror("..."); + m_fptr->mm_flg = 0; + } + else { + m_fptr->st_size = statbuf.st_size; + if((m_fptr->mmap_base = + mmap(NULL, m_fptr->st_size, PROT_READ, + MAP_FILE | MAP_SHARED, m_fptr->mmap_fd, 0)) == (char *) -1) { + fprintf(stderr," cannot mmap %s",sname); + perror("..."); + m_fptr->mm_flg = 0; + } + else { + m_fptr->mmap_addr = m_fptr->mmap_base; + m_fptr->mm_flg = 1; + } + } + /* regardless, close the open()ed version */ + close(m_fptr->mmap_fd); + } +#else + m_fptr->mm_flg = 0; +#endif + + if (!m_fptr->mm_flg) { + if ((m_fptr->libf = fopen(sname,RBSTR))==NULL) { + fprintf(stderr," cannot open %s sequence file",sname); + perror("..."); + goto error_r; + } + } + +/* all files should be open */ + + src_uint4_read(ifile,(unsigned *)&title_len); + + if (title_len > 0) { + if ((title_str = calloc((size_t)title_len+1,sizeof(char)))==NULL) { + fprintf(stderr," cannot allocate title string (%d)\n",title_len); + goto error_r; + } + fread(title_str,(size_t)1,(size_t)title_len,ifile); + } + + src_uint4_read(ifile,(unsigned *)&date_len); + + if (date_len > 0) { + if ((date_str = calloc((size_t)date_len+1,sizeof(char)))==NULL) { + fprintf(stderr," cannot allocate date string (%d)\n",date_len); + goto error_r; + } + fread(date_str,(size_t)1,(size_t)date_len,ifile); + } + + m_fptr->lpos = 0; + src_uint4_read(ifile,(unsigned *)&m_fptr->max_cnt); + + if (dbformat == FORMATDBV3) { + src_long4_read(ifile,<mp); + m_fptr->tot_len = ltmp; + } + else { + ncbi_long8_read(ifile,&l8tmp); + m_fptr->tot_len = ltmp; + } + + src_long4_read(ifile,<mp); + m_fptr->max_len = ltmp; + + /* currently we are not using this information, but perhaps later */ + if (title_str!=NULL) free(title_str); + if (date_str!=NULL) free(date_str); + +#ifdef DEBUG + fprintf(stderr,"%s format: BL2 (%s) max_cnt: %d, totlen: %lld, maxlen %ld\n", + name,m_fptr->mm_flg ? "mmap" : "fopen", + m_fptr->max_cnt,m_fptr->tot_len,m_fptr->max_len); +#endif + + /* allocate and read hdr indexes */ + if ((f_pos_arr=(unsigned int *)calloc((size_t)m_fptr->max_cnt+1,sizeof(int)))==NULL) { + fprintf(stderr," cannot allocate tmp header pointers\n"); + goto error_r; + } + + if ((m_fptr->d_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) { + fprintf(stderr," cannot allocate header pointers\n"); + goto error_r; + } + + /* allocate and read sequence offsets */ + if ((m_fptr->s_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) { + fprintf(stderr," cannot allocate sequence pointers\n"); + goto error_r; + } + + /* + for (i=0; i<=m_fptr->max_cnt; i++) src_uint4_read(ifile,&m_fptr->d_pos_arr[i]); + for (i=0; i<=m_fptr->max_cnt; i++) src_uint4_read(ifile,&m_fptr->s_pos_arr[i]); + */ + if (fread(f_pos_arr,(size_t)4,m_fptr->max_cnt+1,ifile)!=m_fptr->max_cnt+1) { + fprintf(stderr," error reading hdr offsets: %s\n",tname); + goto error_r; + } + + for (i=0; i<=m_fptr->max_cnt; i++) +#ifdef IS_BIG_ENDIAN + m_fptr->d_pos_arr[i] = f_pos_arr[i]; +#else + m_fptr->d_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]); +#endif + + if (fread(f_pos_arr,(size_t)4,m_fptr->max_cnt+1,ifile)!=m_fptr->max_cnt+1) { + fprintf(stderr," error reading seq offsets: %s\n",tname); + goto error_r; + } + for (i=0; i<=m_fptr->max_cnt; i++) { +#ifdef IS_BIG_ENDIAN + m_fptr->s_pos_arr[i] = f_pos_arr[i]; +#else + m_fptr->s_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]); +#endif + } + + if (dbtype == NTFORMAT) { + /* allocate and ambiguity offsets */ + if ((m_fptr->a_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) { + fprintf(stderr," cannot allocate sequence pointers\n"); + goto error_r; + } + + /* + for (i=0; i<=m_fptr->max_cnt; i++) src_uint4_read(ifile,&m_fptr->a_pos_arr[i]); + */ + + if (fread(f_pos_arr,(size_t)4,m_fptr->max_cnt+1,ifile)!=m_fptr->max_cnt+1) { + fprintf(stderr," error reading seq offsets: %s\n",tname); + goto error_r; + } + for (i=0; i<=m_fptr->max_cnt; i++) { +#ifdef IS_BIG_ENDIAN + m_fptr->a_pos_arr[i] = f_pos_arr[i]; +#else + m_fptr->a_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]); +#endif + } + } + + /* + for (i=0; i < min(m_fptr->max_cnt,10); i++) { + fprintf(stderr,"%d: %d %d %d\n",i,m_fptr->s_pos_arr[i],m_fptr->a_pos_arr[i],m_fptr->d_pos_arr[i]); + } + */ + + /* all done with ifile, close it */ + fclose(ifile); + + free(f_pos_arr); + + if (!m_fptr->mm_flg) { + tmp = fgetc(m_fptr->libf); + if (tmp!=NULLB) + fprintf(stderr," phase error: %d:%d found\n",0,tmp); + } + + m_fptr->bl_lib_pos = 1; + amb_cnt = 0; + return m_fptr; + + error_r: + /* here if failure after m_fptr allocated */ + free(m_fptr); + return NULL; +} + +void ncbl2_closelib(struct lmf_str *m_fptr) +{ + if (m_fptr->tmp_buf != NULL) { + free(m_fptr->tmp_buf); + m_fptr->tmp_buf_max = 0; + } + + if (m_fptr->s_pos_arr !=NULL) { + free(m_fptr->s_pos_arr); + m_fptr->s_pos_arr = NULL; + } + if (m_fptr->a_pos_arr!=NULL) { + free(m_fptr->a_pos_arr); + m_fptr->a_pos_arr = NULL; + } + + if (m_fptr->hfile !=NULL ) { + fclose(m_fptr->hfile); m_fptr->hfile=NULL; + free(m_fptr->d_pos_arr); m_fptr->d_pos_arr = NULL; + } + + if (m_fptr->oid_list != NULL) { + free(m_fptr->oid_list); m_fptr->oid_list = NULL; + m_fptr->oid_seqs = m_fptr->max_oid = 0; + } + +#ifdef use_mmap + if (m_fptr->mm_flg) { + munmap(m_fptr->mmap_base,m_fptr->st_size); + m_fptr->mmap_fd = -1; + } + else +#endif + if (m_fptr->libf !=NULL ) {fclose(m_fptr->libf); m_fptr->libf=NULL;} +} + +int +ncbl2_getliba_o(unsigned char *seq, + int maxs, + char *libstr, + int n_libstr, + fseek_t *libpos, + int *lcont, + struct lmf_str *m_fd, + long *l_off) +{ + int tpos; + unsigned int t_mask, t_shift, oid_mask; + + /* get to the next valid pointer */ + + for ( tpos = m_fd->lpos ;tpos <= m_fd->max_oid; tpos++) { + t_mask = tpos / 32; + t_shift = 31 - (tpos % 32); + if ((oid_mask = m_fd->oid_list[t_mask])==0) { continue; } + + if ((bl2_uint4_cvt(oid_mask) & 0x1 << t_shift)) { + if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[tpos],0); + m_fd->lpos = tpos; /* already bumped up */ + m_fd->bl_lib_pos = m_fd->s_pos_arr[tpos]; + return ncbl2_getliba(seq, maxs, libstr, n_libstr, + libpos, lcont, m_fd, l_off); + } + } + return -1; +} + +int +ncbl2_getliba(unsigned char *seq, + int maxs, + char *libstr, + int n_libstr, + fseek_t *libpos, + int *lcont, + struct lmf_str *m_fd, + long *l_off) +{ + unsigned char *sptr, *dptr; + int s_chunk, d_len, lib_cnt; + long seqcnt; + long tmp; + static long seq_len; +#if defined(DEBUG) || defined(PCOMPLIB) + int gi, my_db, taxid; + char acc[20], title[21], name[20]; +#endif + + *l_off = 1; + + lib_cnt = m_fd->lpos; + *libpos = (fseek_t)m_fd->lpos; + + if (*lcont==0) { + if (lib_cnt >= m_fd->max_cnt) return -1; /* no more sequences */ + seq_len = m_fd->s_pos_arr[lib_cnt+1] - m_fd->s_pos_arr[lib_cnt]; /* value is +1 off to get the NULL */ + if (m_fd->mm_flg) m_fd->mmap_addr = m_fd->mmap_base+m_fd->s_pos_arr[lib_cnt]; +#if !defined(DEBUG) && !defined(PCOMPLIB) + libstr[0]='\0'; +#else + /* get the name from the header file */ + fseek(m_fd->hfile,m_fd->d_pos_arr[lib_cnt],0); + + if (m_fd->bl_format_ver == FORMATDBV3) { + d_len = min(n_libstr-1,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1); + fread(libstr,(size_t)1,(size_t)d_len,m_fd->hfile); + libstr[d_len]='\0'; + } + else { + d_len = min(m_fd->tmp_buf_max,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1); + fread(m_fd->tmp_buf,(size_t)1,(size_t)d_len,m_fd->hfile); + parse_fastadl_asn((unsigned char *)m_fd->tmp_buf, (unsigned char *)m_fd->tmp_buf+d_len, + &gi, &my_db, acc, name, title, 20, &taxid); + sprintf(libstr,"gi|%d",gi); + } +#endif + } + if (seq_len <= maxs) { /* sequence fits */ + seqcnt = seq_len; + m_fd->lpos++; + *lcont = 0; + } + else { /* doesn't fit */ + seqcnt = maxs-1; + (*lcont)++; + } + + if (m_fd->mm_flg) sptr = (unsigned char *)m_fd->mmap_addr; + else { + if ((tmp=fread(seq,(size_t)1,(size_t)seq_len,m_fd->libf))!=(size_t)seq_len) { + fprintf(stderr," could not read sequence record: %ld %ld != %ld\n", + *libpos,tmp,seq_len); + goto error; + } + sptr = seq; + } + if (seq_len <= maxs) {seqcnt = --seq_len;} + + /* everything is ready, set up dst. pointer, seq_len */ + dptr = seq; + + if (aa_b2toa[sptr[seq_len-1]]=='*') seq_len--; + s_chunk = seqcnt/16; + while (s_chunk-- > 0) { + *dptr++ = aa_btof[*sptr++]; + *dptr++ = aa_btof[*sptr++]; + *dptr++ = aa_btof[*sptr++]; + *dptr++ = aa_btof[*sptr++]; + *dptr++ = aa_btof[*sptr++]; + *dptr++ = aa_btof[*sptr++]; + *dptr++ = aa_btof[*sptr++]; + *dptr++ = aa_btof[*sptr++]; + *dptr++ = aa_btof[*sptr++]; + *dptr++ = aa_btof[*sptr++]; + *dptr++ = aa_btof[*sptr++]; + *dptr++ = aa_btof[*sptr++]; + *dptr++ = aa_btof[*sptr++]; + *dptr++ = aa_btof[*sptr++]; + *dptr++ = aa_btof[*sptr++]; + *dptr++ = aa_btof[*sptr++]; + } + while (dptr < seq+seqcnt) *dptr++ = aa_btof[*sptr++]; + + if (m_fd->mm_flg) m_fd->mmap_addr = (char *)sptr; + + /* we didn't get it all, so reset for more */ + if (*lcont) seq_len -= seqcnt; + + seq[seqcnt]= EOSEQ; + return (seqcnt); + +error: fprintf(stderr," error reading %s at %ld\n",libstr,*libpos); + fflush(stderr); + return (-1); +} + +int +ncbl2_getlibn_o(unsigned char *seq, + int maxs, + char *libstr, + int n_libstr, + fseek_t *libpos, + int *lcont, + struct lmf_str *m_fd, + long *l_off) +{ + int tpos; + unsigned int t_mask, t_shift, oid_mask; + + /* get to the next valid pointer */ + + for (tpos = m_fd->lpos; tpos <= m_fd->max_oid; tpos++) { + t_mask = tpos / 32; + t_shift = 31 - (tpos % 32); + if ((oid_mask = m_fd->oid_list[t_mask])==0) { continue; } + + if ((bl2_uint4_cvt(oid_mask) & 0x1 << t_shift)) { + if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[tpos],0); + m_fd->lpos = tpos; /* already bumped up */ + m_fd->bl_lib_pos = m_fd->s_pos_arr[tpos]; + return ncbl2_getlibn(seq, maxs, libstr, n_libstr, + libpos, lcont, m_fd, l_off); + } + } + return -1; +} + +static char tmp_amb[4096]; + +int +ncbl2_getlibn(unsigned char *seq, + int maxs, + char *libstr, + int n_libstr, + fseek_t *libpos, + int *lcont, + struct lmf_str *m_fd, + long *l_off) +{ + unsigned char *sptr, *tptr, stmp; + long seqcnt; + int s_chunk, lib_cnt; + size_t tmp; + char ch; + static long seq_len; + static int c_len,c_pad; + int c_len_set, d_len; + + *l_off = 1; + + lib_cnt = m_fd->lpos; + *libpos = (fseek_t)lib_cnt; + if (*lcont==0) { /* not a continuation of previous */ + if (lib_cnt >= m_fd->max_cnt) return (-1); + c_len = m_fd->a_pos_arr[lib_cnt]- m_fd->s_pos_arr[lib_cnt]; + if (!m_fd->mm_flg) { + if (m_fd->bl_lib_pos != m_fd->s_pos_arr[lib_cnt]) { /* are we positioned to read? */ + amb_cnt++; + if ((m_fd->bl_lib_pos - m_fd->s_pos_arr[lib_cnt]) < sizeof(tmp_amb)) { + /* jump over amb_ray */ + fread(tmp_amb,(size_t)1,(size_t)(m_fd->s_pos_arr[lib_cnt]-m_fd->bl_lib_pos),m_fd->libf); + } + else { /* fseek over amb_ray */ + fseek(m_fd->libf,m_fd->s_pos_arr[lib_cnt],0); + } + m_fd->bl_lib_pos = m_fd->s_pos_arr[lib_cnt]; + } + } + else m_fd->mmap_addr = m_fd->mmap_base + m_fd->s_pos_arr[lib_cnt]; +#if !defined(DEBUG) && !defined(PCOMPLIB) + libstr[0]='\0'; +#else + /* get the name from the header file */ + fseek(m_fd->hfile,m_fd->d_pos_arr[lib_cnt],0); + + d_len = min(n_libstr-1,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1); + fread(libstr,(size_t)1,(size_t)d_len,m_fd->hfile); + libstr[d_len]='\0'; +#endif + } /* end of *lcont==0 */ + + /* To avoid the situation where c_len <= 1; we must anticipate what + c_len will be after this pass. If it will be <= 64, back off this + time so next time it will be > 64 */ + + seq_len = c_len*4; + + if ((seq_len+4 > maxs) && (seq_len+4 - maxs <= 256)) { + /* we won't be done but we will have less than 256 to go */ + c_len -= 64; seq_len -= 256; c_len_set = 1; maxs -= 256;} + else c_len_set = 0; + + /* + fprintf(stderr," lib_cnt: %d %d %d %d\n",lib_cnt,c_len,seq_len,maxs); + */ + + /* does the rest of the sequence fit? */ + if (seq_len <= maxs-4 && !c_len_set) { + seqcnt = c_len; + if (!m_fd->mm_flg) { + if ((tmp=fread(seq,(size_t)1,(size_t)seqcnt,m_fd->libf))!=(size_t)seqcnt) { + fprintf(stderr, + " could not read sequence record: %s %lld %ld != %ld: %d\n", + libstr,*libpos,tmp,seqcnt,*seq); + goto error; + } + m_fd->bl_lib_pos += tmp; + sptr = seq + seqcnt; + } + else sptr = (unsigned char *)(m_fd->mmap_addr+seqcnt); + + *lcont = 0; /* this is the last chunk */ + lib_cnt++; /* increment to the next sequence */ + /* the last byte is either '0' (no remainder) or the last 1-3 chars and the remainder */ + c_pad = *(sptr-1); + c_pad &= 0x3; /* get the last (low) 2 bits */ + seq_len -= (4 - c_pad); /* if the last 2 bits are 0, its a NULL byte */ + } + else { /* get the next chunk, but more to come */ + seqcnt = ((maxs+3)/4)-1; + if (!m_fd->mm_flg) { + if ((tmp=fread(seq,(size_t)1,(size_t)(seqcnt),m_fd->libf))!=(size_t)(seqcnt)) { + fprintf(stderr," could not read sequence record: %lld %ld/%ld\n", + *libpos,tmp,seqcnt); + goto error; + } + m_fd->bl_lib_pos += tmp; + sptr = seq + seqcnt; + } + else { + sptr = (unsigned char *)(m_fd->mmap_addr+seqcnt); + m_fd->mmap_addr += seqcnt; + } + seq_len = 4*seqcnt; + c_len -= seqcnt; + if (c_len_set) {c_len += 64; maxs += 256;} + (*lcont)++; +/* hopefully we don't need this because of c_len -= 64. */ +/* + if (c_len == 1) { +#if !defined (USE_MMAP) + c_pad = fgetc(m_fd->libf); + *sptr=c_pad; +#else + c_pad = *m_fd->mmap_addr++; + sptr = m_fd->mmap_addr; +#endif + c_pad &= 0x3; + seq_len += c_pad; + seqcnt++; + lib_cnt++; + *lcont = 0; + } +*/ + } + + /* point to the last packed byte and to the end of the array + seqcnt is the exact number of bytes read + tptr points to the destination, use multiple of 4 to simplify math + sptr points to the source, note that the last byte will be read 4 cycles + before it is written + */ + + tptr = seq + 4*seqcnt; + s_chunk = seqcnt/8; + while (s_chunk-- > 0) { + stmp = *--sptr; + *--tptr = (stmp&3) +1; + *--tptr = ((stmp >>= 2)&3)+1; + *--tptr = ((stmp >>= 2)&3)+1; + *--tptr = ((stmp >>= 2)&3)+1; + stmp = *--sptr; + *--tptr = (stmp&3) +1; + *--tptr = ((stmp >>= 2)&3)+1; + *--tptr = ((stmp >>= 2)&3)+1; + *--tptr = ((stmp >>= 2)&3)+1; + stmp = *--sptr; + *--tptr = (stmp&3) +1; + *--tptr = ((stmp >>= 2)&3)+1; + *--tptr = ((stmp >>= 2)&3)+1; + *--tptr = ((stmp >>= 2)&3)+1; + stmp = *--sptr; + *--tptr = (stmp&3) +1; + *--tptr = ((stmp >>= 2)&3)+1; + *--tptr = ((stmp >>= 2)&3)+1; + *--tptr = ((stmp >>= 2)&3)+1; + stmp = *--sptr; + *--tptr = (stmp&3) +1; + *--tptr = ((stmp >>= 2)&3)+1; + *--tptr = ((stmp >>= 2)&3)+1; + *--tptr = ((stmp >>= 2)&3)+1; + stmp = *--sptr; + *--tptr = (stmp&3) +1; + *--tptr = ((stmp >>= 2)&3)+1; + *--tptr = ((stmp >>= 2)&3)+1; + *--tptr = ((stmp >>= 2)&3)+1; + stmp = *--sptr; + *--tptr = (stmp&3) +1; + *--tptr = ((stmp >>= 2)&3)+1; + *--tptr = ((stmp >>= 2)&3)+1; + *--tptr = ((stmp >>= 2)&3)+1; + stmp = *--sptr; + *--tptr = (stmp&3) +1; + *--tptr = ((stmp >>= 2)&3)+1; + *--tptr = ((stmp >>= 2)&3)+1; + *--tptr = ((stmp >>= 2)&3)+1; + } + while (tptr>seq) { + stmp = *--sptr; + *--tptr = (stmp&3) +1; + *--tptr = ((stmp >>= 2)&3)+1; + *--tptr = ((stmp >>= 2)&3)+1; + *--tptr = ((stmp >>= 2)&3)+1; + } + /* + for (sptr=seq; sptr < seq+seq_len; sptr++) { + printf("%c",nt[*sptr]); + if ((int)(sptr-seq) % 60 == 59) printf("\n"); + } + printf("\n"); + */ + + m_fd->lpos = lib_cnt; + if (seqcnt*4 >= seq_len) { /* there was enough room */ + seq[seq_len]= EOSEQ; + /* printf("%d\n",seq_len); */ + return seq_len; + } + else { /* not enough room */ + seq[seqcnt*4]=EOSEQ; + seq_len -= 4*seqcnt; + return (4*seqcnt); + } + +error: fprintf(stderr," error reading %s at %ld\n",libstr,*libpos); + fflush(stderr); + return (-1); +} + + /* 0 1 2 3 4 5 6 7 + 8 9 10 11 12 13 14 15 + 16 17 */ +static char +*db_type_arr[] = {"lcl","gib","gim","gii","gb","emb","pir","sp", + "pat","ref","gnl","gi","dbj","prf","pdb","tpg", + "tpe","tpd"}; + +void +ncbl2_ranlib(char *str, + int cnt, + fseek_t libpos, + char *libstr, + struct lmf_str *m_fd) +{ + int llen, lib_cnt; + char *bp; + unsigned char *my_buff=NULL; + char descr[2048]; + unsigned char *abp; + int gi, taxid; + int my_db; + char db[5], acc[20], name[20]; + char title[1024]; + int have_my_buff=0; + int have_descr = 0; + + lib_cnt = (int)libpos; + llen = m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]; + + fseek(m_fd->hfile,m_fd->d_pos_arr[libpos],0); + + if (m_fd->bl_format_ver == FORMATDBV3) { + if (llen >= cnt) llen = cnt-1; + fread(str,(size_t)1,(size_t)(llen),m_fd->hfile); + } + else { + if (llen >= m_fd->tmp_buf_max) { + if ((my_buff=(unsigned char *)calloc(llen,sizeof(char)))==NULL) { + fprintf(stderr," cannot allocate ASN.1 buffer: %d\n",llen); + my_buff = (unsigned char *)m_fd->tmp_buf; + llen = m_fd->tmp_buf_max; + } + else have_my_buff = 1; + } + else { + my_buff = (unsigned char *)m_fd->tmp_buf; + } + abp = my_buff; + fread(my_buff,(size_t)1,llen,m_fd->hfile); + + do { + abp = parse_fastadl_asn(abp, my_buff+llen, + &gi, &my_db, acc, name, + title, sizeof(title), &taxid); + + if (gi > 0) { + sprintf(descr,"gi|%d|%s|%s|%s ",gi,db_type_arr[my_db],acc,name); + } + else { + if (acc[0] != '\0') sprintf(descr,"%s ",acc); + else descr[0] = '\0'; + if (name[0] != '\0' && strcmp(name,"BL_ORD_ID")!=0) sprintf(descr+strlen(descr),"%s ", name); + } + if (m_fd->pref_db < 0) { + if (!have_descr) { + strncpy(str,descr,cnt-1); + have_descr = 1; + } + else { + strncat(str,"\001",cnt-strlen(str)-1); + strncat(str,descr,cnt-strlen(str)-1); + } + strncat(str,title,cnt-strlen(str)-1); + if (strlen(str) >= cnt-1) break; + } + else if (m_fd->pref_db == my_db) { + have_descr = 1; + strncpy(str,descr,cnt-1); + strncat(str,title,cnt-strlen(str)-1); + break; + } + } while (abp); + + if (!have_descr) { + strncpy(str,descr,cnt-1); + strncat(str,descr,cnt-strlen(str)-1); + } + + if (have_my_buff) free(my_buff); + } + + str[cnt-1]='\0'; + + bp = str; + while((bp=strchr(bp,'\001'))!=NULL) {*bp++=' ';} + + if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[libpos],0); + + m_fd->lpos = lib_cnt; + m_fd->bl_lib_pos = m_fd->s_pos_arr[lib_cnt]; +} + +unsigned int bl2_uint4_cvt(unsigned int val) +{ + unsigned int res; +#ifdef IS_BIG_ENDIAN + return val; +#else /* it better be LITTLE_ENDIAN */ + res = ((val&255)*256)+ ((val>>8)&255); + res = (res<<16) + (((val>>16)&255)*256) + ((val>>24)&255); + return res; +#endif +} + +unsigned int bl2_long4_cvt(long val) +{ + int val4; + unsigned int res; +#ifdef IS_BIG_ENDIAN + val4 = val; + return val4; +#else /* it better be LITTLE_ENDIAN */ + res = ((val&255)*256)+ ((val>>8)&255); + res = (res<<16) + (((val>>16)&255)*256) + ((val>>24)&255); + return res; +#endif +} + +int64_t bl2_long8_cvt(int64_t val) +{ + int64_t res; +#ifdef IS_BIG_ENDIAN + return val; +#else /* it better be LITTLE_ENDIAN */ + res = ((val&255)*256)+ ((val>>8)&255); + res = (res<<16) + (((val>>16)&255)*256) + ((val>>24)&255); +#ifdef BIG_LIB64 + res = (res<<16) + (((val>>32)&255)*256) + ((val>>40)&255); + res = (res<<16) + (((val>>48)&255)*256) + ((val>>56)&255); +#else + fprintf(stderr,"Cannot use bl2_long8_cvt without 64-bit longs\n"); + exit(1); +#endif + return res; +#endif +} + +void src_int4_read(FILE *fd, int *val) +{ +#ifdef IS_BIG_ENDIAN + fread((char *)val,(size_t)4,(size_t)1,fd); +#else + unsigned char b[4]; + + fread((char *)&b[0],(size_t)1,(size_t)4,fd); + *val = 0; + *val = (int)((int)((int)(b[0]<<8)+(int)b[1]<<8)+(int)b[2]<<8) + +(int)b[3]; +#endif +} + +void src_long4_read(FILE *fd, long *valp) +{ + int val4; +#ifdef IS_BIG_ENDIAN + fread(&val4,(size_t)4,(size_t)1,fd); + *valp = val4; +#else + unsigned char b[4]; + + fread((char *)&b[0],(size_t)1,(size_t)4,fd); + val4 = 0; + val4 = (int)((int)((int)(b[0]<<8)+(int)b[1]<<8)+(int)b[2]<<8) + +(int)b[3]; + *valp = val4; +#endif +} + +void src_uint4_read(FILE *fd, unsigned int *valp) +{ +#ifdef IS_BIG_ENDIAN + fread(valp,(size_t)4,(size_t)1,fd); +#else + unsigned char b[4]; + + fread((char *)&b[0],(size_t)1,(size_t)4,fd); + *valp = 0; + *valp = (unsigned int)((int)((int)(b[0]<<8)+(int)b[1]<<8)+(int)b[2]<<8) + +(int)b[3]; +#endif +} + +void src_long8_read(FILE *fd, long *val) +{ +#ifdef IS_BIG_ENDIAN + fread((void *)val,(size_t)8,(size_t)1,fd); +#else + unsigned char b[8]; + + fread((char *)&b[0],(size_t)1,(size_t)8,fd); + *val = 0; + *val = (long)((((((long)((long)(b[0]<<8)+(long)b[1]<<8)+(long)b[2]<<8) + +(long)b[3]<<8)+(long)b[4]<<8)+(long)b[5]<<8) + +(long)b[6]<<8)+(long)b[7]; +#endif +} + +void ncbi_long8_read(FILE *fd, int64_t *val) +{ + unsigned char b[8]; + + fread((char *)&b[0],(size_t)1,(size_t)8,fd); + *val = 0; + *val = (long)((((((long)((long)(b[7]<<8)+(long)b[6]<<8)+(long)b[5]<<8) + +(long)b[4]<<8)+(long)b[3]<<8)+(long)b[2]<<8) + +(long)b[1]<<8)+(long)b[0]; +} + +void src_char_read(FILE *fd, char *val) +{ + fread(val,(size_t)1,(size_t)1,fd); +} + +void src_fstr_read(FILE *fd, char *val, int slen) +{ + fread(val,(size_t)slen,(size_t)1,fd); +} + +void +newname(char *nname, char *oname, char *suff, int maxn) +{ + strncpy(nname,oname,maxn-1); + strncat(nname,".",1); + strncat(nname,suff,maxn-strlen(nname)); +} + +#define ASN_SEQ 0x30 +#define ASN_IS_BOOL 1 +#define ASN_IS_INT 2 +#define ASN_IS_STR 26 + +unsigned char * +get_asn_int(unsigned char *abp, int *val) { + + int v_len, v; + + v = 0; + if (*abp++ != ASN_IS_INT) { /* check for int */ + fprintf(stderr," int missing\n"); + } + else { + v_len = *abp++; + while (v_len-- > 0) { + v *= 256; + v += *abp++; + } + abp += 2; /* skip over null's */ + } + *val = v; + return abp; +} + +unsigned char * +get_asn_text(unsigned char *abp, char *text, int t_len) { + int tch, at_len; + + text[0] = '\0'; + if (*abp++ != ASN_IS_STR) { /* check for str */ + fprintf(stderr," str missing\n"); + } + else { + if ((tch = *abp++) > 128) { /* string length is in next bytes */ + tch &= 0x7f; /* get number of bytes for len */ + at_len = 0; + while (tch-- > 0) { at_len = (at_len << 8) + *abp++;} + } + else { + at_len = tch; + } + + if ( at_len < t_len-1) { + memcpy(text, abp, at_len); + text[at_len] = '\0'; + } + else { + memcpy(text, abp, t_len-1); + text[t_len-1] = '\0'; + } + abp += at_len + 2; + } + return abp; +} + +/* something to try to skip over stuff we don't want */ +unsigned char * +get_asn_junk(unsigned char *abp) { + + int seq_cnt = 0; + int tmp; + char string[256]; + + while (*abp) { + if ( *abp == ASN_SEQ) { abp += 2; seq_cnt++;} + else if ( *abp == ASN_IS_BOOL ) {abp = get_asn_int(abp, &tmp);} + else if ( *abp == ASN_IS_INT ) {abp = get_asn_int(abp, &tmp);} + else if ( *abp == ASN_IS_STR ) {abp = get_asn_text(abp, string, sizeof(string)-1);} + } + + while (seq_cnt-- > 0) abp += 2; + return abp; +} + +unsigned char * +get_asn_textseq_id(unsigned char *abp, + char *name, char *acc) +{ + char release[20], ver_str[10]; + int version; + int seqcnt = 0; + + ver_str[0]='\0'; + + if (*abp == ASN_SEQ) { abp += 2; seqcnt++;} + + while (*abp) { + switch (*abp) { + case 0xa0: + abp = get_asn_text(abp+2, name, 20); + break; + case 0xa1: + abp = get_asn_text(abp+2, acc, 20); + break; + case 0xa2: + abp = get_asn_text(abp+2, release, sizeof(release)); + break; + case 0xa3: + abp = get_asn_int(abp+2, &version); + sprintf(ver_str,".%d",version); + break; + default: abp += 2; + } + } + while (seqcnt-- > 0) abp += 4; + strncat(acc,ver_str,20-strlen(acc)); + acc[19]='\0'; + return abp; /* skip 2 NULL's */ +} + +unsigned char * +get_asn_local_id(unsigned char *abp, char *acc) +{ + int seqcnt = 0; + + if (*abp == ASN_SEQ) { abp += 2; seqcnt++;} + + abp = get_asn_text(abp+2, acc, 20); + + while (seqcnt-- > 0) abp += 4; + acc[19]='\0'; + return abp; /* skip 2 NULL's */ +} + +unsigned char * +get_asn_dbtag(unsigned char *abp, char *name, char *str, int *id_p) { + + if (*abp == ASN_SEQ) { abp += 2;} + + if (*abp == 0xa0) { /* get db */ + abp = get_asn_text(abp+2, name, 20); + } + else { + fprintf(stderr," missing dbtag:db %d %d\n",abp[0],abp[1]); + abp += 2; + } + + if (*abp == 0xa1) { /* get tag */ + abp += 2; + abp += 2; /* skip over id */ + if (*abp == 2) abp = get_asn_int(abp,id_p); + else abp = get_asn_text(abp+2, str, 20); + } + else { + fprintf(stderr," missing dbtag:tag %2x %2x\n",abp[0],abp[1]); + abp += 2; + } + return abp+2; /* skip 2 NULL's */ +} + +unsigned char * +get_asn_pdb_id(unsigned char *abp, char *acc, char *chain) +{ + int ichain, seq_cnt=0; + + if (*abp == ASN_SEQ) { abp += 2; seq_cnt++;} + + while (*abp) { + switch (*abp) { + case 0: abp += 2; break; + case 0xa0: /* mol-id */ + abp = get_asn_text(abp+2, acc, 20); + break; + case 0xa1: + abp = get_asn_int(abp+2, &ichain); + chain[0] = ichain; + chain[1] = '\0'; + break; + case 0xa2: /* ignore date - scan until NULL's */ + while (*abp++) {} + abp += 2; /* skip the NULL's */ + break; + default: abp+=2; + } + } + while (seq_cnt-- > 0) {abp += 4;} + return abp; +} + +#define ASN_TYPE_MASK 31 + +unsigned char +*get_asn_seqid(unsigned char *abp, + int *gi_p, int *db, char *acc, char *name) { + + int db_type, itmp, seq_cnt=0; + + *gi_p = 0; + + if (*abp != ASN_SEQ) { + fprintf(stderr, "seqid - missing SEQ 1: %2x %2x\n",abp[0], abp[1]); + return abp; + } + else { abp += 2; seq_cnt++;} + + db_type = (*abp & ASN_TYPE_MASK); + + if (db_type == 11) { /* gi */ + abp = get_asn_int(abp+2,gi_p); + } + + while (*abp == ASN_SEQ) {abp += 2; seq_cnt++;} + + db_type = (*abp & ASN_TYPE_MASK); + if (db_type > 17) {db_type = 0;} + *db = db_type; + + switch(db_type) { + case 0: + abp = get_asn_local_id(abp+2, acc); + break; + case 1: + case 2: + abp = get_asn_int(abp+2,&itmp); + abp += 2; + break; + case 11: + abp = get_asn_int(abp+2,&itmp); + break; + case 4: + case 5: + case 6: + case 7: + case 9: + case 12: + case 13: + case 15: + case 16: + case 17: + abp = get_asn_textseq_id(abp+2,name,acc); + break; + case 10: + abp = get_asn_dbtag(abp+2,name,acc,&itmp); + case 14: + abp = get_asn_pdb_id(abp+2,acc,name); + break; + default: abp += 2; + } + + while (seq_cnt-- > 0) { abp += 4;} + return abp; /* skip over 2 NULL's */ +} + +#define ASN_FADL_TITLE 0xa0 +#define ASN_FADL_SEQID 0xa1 +#define ASN_FADL_TAXID 0xa2 +#define ASN_FADL_MEMBERS 0xa3 +#define ASN_FADL_LINKS 0xa4 +#define ASN_FADL_OTHER 0xa5 + +unsigned char * +parse_fastadl_asn(unsigned char *asn_buff, unsigned char *asn_max, + int *gi_p, int *db, char *acc, + char *name, char *title, int t_len, int *taxid_p) { + unsigned char *abp; + char tmp_db[4], tmp_acc[32], tmp_name[32]; + int this_db; + int seq_cnt = 0; + int tmp_gi; + + acc[0] = name[0] = db[0] = title[0] = '\0'; + + abp = asn_buff; + while ( abp < asn_max && *abp) { + if (*abp == ASN_SEQ) { abp += 2; seq_cnt++; } + else if (*abp == ASN_FADL_TITLE) { + abp = get_asn_text(abp+2, title, t_len); + } + else if (*abp == ASN_FADL_SEQID ) { + abp = get_asn_seqid(abp+2, gi_p, db, acc, name); + if (*db > 17) *db = 0; + } + else if (*abp == ASN_FADL_TAXID ) { + abp = get_asn_int(abp+2, taxid_p); + } + else if (*abp == ASN_FADL_MEMBERS) { + abp = get_asn_junk(abp+2); + } + else if (*abp == ASN_FADL_LINKS ) { + abp = get_asn_junk(abp+2); + } + else if (*abp == ASN_FADL_OTHER ) { + abp = get_asn_junk(abp+2); + } + else { + /* fprintf(stderr, " Error - missing ASN.1 %2x:%2x:%2x:%2x\n", + abp[-2],abp[-1],abp[0],abp[1]); */ + abp += 2; + } + } + while (abp < asn_max && *abp == '\0' ) abp++; + if (abp >= asn_max) return NULL; + else return abp; +} + + +void +parse_pal(char *dname, char *msk_name, + int *oid_seqs, int *max_oid, + FILE *fd) { + + char line[MAX_STR]; + + while (fgets(line,sizeof(line),fd)) { + if (line[0] == '#') continue; + + if (strncmp(line, "DBLIST", 6)==0) { + sscanf(line+7,"%s",dname); + } + else if (strncmp(line, "OIDLIST", 7)==0) { + sscanf(line+8,"%s",msk_name); + } + else if (strncmp(line, "NSEQ", 4)==0) { + sscanf(line+5,"%d",oid_seqs); + } + else if (strncmp(line, "MAXOID", 6)==0) { + sscanf(line+7,"%d",max_oid); + } + } +}