/* ncbl2_lib.c functions to read ncbi-blast format files from formatdb (blast2.0 format files) copyright (c) 1999 William R. Pearson */ /* $Name: fa_34_26_5 $ - $Id: ncbl2_mlib.c,v 1.56 2007/04/02 18:08:11 wrp Exp $ */ /* to turn on mmap()ing for Blast2 files: */ #include #include #include #include #include #include #ifdef UNIX #include #endif #include /* **************************************************************** 17-May-2006 Modified to read NCBI .[np]al and .msk files. The .nal or .pal file provides a way to read sequences from a list of files. The .msk file provides a compact way of indicating the subset of sequences in a larger database (typically nr or nt) that comprise a smaller database (e.g. swissprot or pdbaa). A .pal file (e.g. swissprot.00.pal) that uses a .msk file has the form: # Alias file generated by genmask # Date created: Mon Apr 10 11:24:05 2006 # TITLE Non-redundant SwissProt sequences DBLIST nr.00 OIDLIST swissprot.00.msk LENGTH 74351250 NSEQ 198346 MAXOID 2617347 MEMB_BIT 1 # end of the file To work with this file, we must first load the nr.00 file, and then read the swissprot.00.msk file, and then scan all the entries in the swissprot.00.msk file (which are packed 32 mask-bit to an int) to determine whether a specific libpos index entry is present in the subset database. **************************************************************** */ /* **************************************************************** This code reads NCBI Blast2 format databases from formatdb version 3 and 4 (From NCBI) This section describes the format of the databases. Formatdb creates three main files for proteins containing indices, sequences, and headers with the extensions, respectively, of pin, psq, and phr (for nucleotides these are nin, nsq, and nhr). A number of other ISAM indices are created, but these are described elsewhere. FORMAT OF THE INDEX FILE ------------------------ 1.) formatdb version number [4 bytes]. 2.) protein dump flag (1 for a protein database, 0 for a nucleotide database) [4 bytes]. 3.) length of the database title in bytes [4 bytes]. 4.) the database title [length given in 3.)]. 5.) length of the date/time string [4 bytes]. 6.) the date/time string [length given in 5.)]. 7.) the number of sequences in the database [4 bytes]. 8.) the total length of the database in residues/basepairs [4 bytes]. 9.) the length of the longest sequence in the database [4 bytes]. 10.) a list of the offsets for definitions (one for each sequence) in the header file. There are num_of_seq+1 of these, where num_of_seq is the number of sequences given in 7.). 11.) a list of the offsets for sequences (one for each sequence) in the sequence file. There are num_of_seq+1 of these, where num_of_seq is the number of sequences given in 7.). 12.) a list of the offsets for the ambiguity characters (one for each sequence) in the sequence file. This list is only present for nucleotide databases and, since the database is compressed 4/1 for nucleotides, allows the ambiguity characters to be restored when the sequence is generated. There are num_of_seq+1 of these, where num_of_seq is the number of sequences given in 7.). FORMAT OF THE SEQUENCE FILE --------------------------- There are different formats for the protein and nucleotide sequence files. The protein sequence files is quite simple. The first byte in the file is a NULL byte, followed by the sequence in ncbistdaa format (described in the NCBI Software Development Toolkit documentation). Following the sequence is another NULL byte, followed by the next sequence. The file ends with a NULL byte, following the last sequence. The nucleotide sequence file contains the nucleotide sequence, with four basepairs compressed into one byte. The format used is NCBI2na, documented in the NCBI Software Development Toolkit manual. Any ambiguity characters present in the original sequence are replaced at random by A, C, G or T. The true value of ambiguity characters are stored at the end of each sequence to allow true reproduction of the original sequence. FORMAT OF THE HEADER FILE (formatdb version 3) ------------------------- The format of the header file depends on whether or not the identifiers in the original file were parsed or not. For the case that they were not, then each entry has the format: gnl|BL_ORD_ID|entry_number my favorite yeast sequence... Here entry_number gives the ordinal number of the sequence in the database (with zero offset). The identifier gnl|BL_ORD_ID|entry_number is used by the BLAST software to identify the entry, if the user has not provided another identifier. If the identifier was parsed, then gnl|BL_ORD_ID|entry_number is replaced by the correct identifier, as described in ftp://ncbi.nlm.nih.gov/blast/db/README . There are no separators between these deflines. For formatdb version 4, the header file contains blast ASN.1 binary deflines, which can parsed with parse_fastadl_asn(). FORMAT OF THE .MSK FILE ----------------------- The .msk file is simply a packed list of masks for formatdb "oids" for some other file (typically nr). The first value is the last oid available; the remainder are packed 32 oids/mask, so that the number of masks is 1/32 the number of sequences in the file. **************************************************************** */ #ifdef USE_MMAP #include #include #include #ifdef IBM_AIX #include #else #include #endif #endif #ifdef USE_MMAP #ifndef MAP_FILE #define MAP_FILE 0 #endif #endif #ifdef UNIX #define RBSTR "r" #else #define RBSTR "rb" #endif #ifdef WIN32 #define SLASH_CHAR '\\' #define SLASH_STR "\\" #else #define SLASH_CHAR '/' #define SLASH_STR "/" #endif #define XTERNAL #include "uascii.h" #define XTERNAL #include "upam.h" #include "ncbl2_head.h" #include "defs.h" #include "mm_file.h" unsigned int bl2_uint4_cvt(unsigned int); unsigned int bl2_long4_cvt(long); int64_t bl2_long8_cvt(int64_t); void src_int4_read(FILE *fd, int *valp); void src_uint4_read(FILE *fd, unsigned int *valp); void src_long4_read(FILE *fd, long *valp); void ncbi_long8_read(FILE *fd, int64_t *valp); void src_char_read(FILE *fd, char *valp); unsigned char *parse_fastadl_asn(unsigned char *asn_buff, unsigned char *asn_max, int *gi_p, int *db, char *acc, char *name, char *title, int t_len, int *taxid); /* nt_btoa maps from blast 2bit format to ascii characters */ static char nt_btoa[5] = {"ACGT"}; static char aa_b2toa[27]= {"-ABCDEFGHIKLMNPQRSTVWXYZU*"}; static int aa_btof[32]; /* maps to fasta alphabet */ static int dbtype, dbformat, amb_cnt; #define NCBIBL20 12 int ncbl2_getliba(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *); int ncbl2_getlibn(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *); int ncbl2_getliba_o(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *); int ncbl2_getlibn_o(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *); void newname(char *, char *, char *, int); void parse_pal(char *, char *, int *, int *, FILE *); void ncbl2_ranlib(char *, int, fseek_t, char *, struct lmf_str *m_fd); /* ncbl2_openlib() is used to open (and memory map) a BLAST2.0 format file. Ifdef USE_MMAP, then ncbl2_openlib returns a structure that can be used to read the database. */ struct lmf_str * ncbl2_openlib(char *name, int ldnaseq) { char lname[256]; char dname[256]; char msk_name[256]; char hname[256]; char sname[256]; char tname[256]; char db_dir[256]; int pref_db= -1; char *bp; int title_len; char *title_str=NULL; int date_len; char *date_str=NULL; long ltmp; int64_t l8tmp; int oid_seqs, max_oid; int oid_cnt, oid_len; unsigned int *oid_list, o_max; int tmp; int i; #ifdef USE_MMAP struct stat statbuf; #endif FILE *ifile; /* index offsets, also DB info */ unsigned int *f_pos_arr; struct lmf_str *m_fptr; if (ldnaseq==SEQT_PROT) { /* read a protein database */ newname(lname,name,AA_LIST_EXT,(int)sizeof(lname)); newname(tname,name,AA_INDEX_EXT,(int)sizeof(tname)); newname(hname,name,AA_HEADER_EXT,(int)sizeof(hname)); newname(sname,name,AA_SEARCHSEQ_EXT,(int)sizeof(sname)); /* initialize map of BLAST2 amino acids to FASTA amino acids */ for (i=0; i 0) { /* get the pref_db before adding the directory */ if (strncmp(msk_name,"swissprot",9)==0) { pref_db = 7; } else if (strncmp(msk_name,"pdbaa",5)==0) { pref_db = 14; } /* need to add directory to both dname and msk_name */ strncpy(tname,db_dir,sizeof(tname)); strncat(tname,msk_name, sizeof(tname)); strncpy(msk_name, tname, sizeof(msk_name)); strncpy(tname,db_dir,sizeof(tname)); strncat(tname,dname, sizeof(tname)); strncpy(dname,tname,sizeof(dname)); if (ldnaseq == SEQT_PROT) { newname(tname,dname,AA_INDEX_EXT,(int)sizeof(tname)); newname(hname,dname,AA_HEADER_EXT,(int)sizeof(hname)); newname(sname,dname,AA_SEARCHSEQ_EXT,(int)sizeof(sname)); } else { /* reading DNA library */ newname(tname,dname,NT_INDEX_EXT,(int)sizeof(tname)); newname(hname,dname,NT_HEADER_EXT,(int)sizeof(hname)); newname(sname,dname,NT_SEARCHSEQ_EXT,(int)sizeof(sname)); } /* now load the oid file */ if ((ifile = fopen(msk_name,RBSTR))==NULL) { fprintf(stderr,"error - cannot load %s file\n",msk_name); return NULL; } else { src_uint4_read(ifile,&o_max); if (o_max != max_oid) { fprintf(stderr," error - oid count mismatch %d != %d\n",max_oid, o_max); } oid_len = (max_oid/32+1); if ((oid_list=(unsigned int *)calloc(oid_len,sizeof(int)))==NULL) { fprintf(stderr," error - cannot allocate oid_list[%d]\n",oid_len); return NULL; } if ((oid_cnt=fread(oid_list,sizeof(int),oid_len,ifile))==0) { fprintf(stderr," error - cannot read oid_list[%d]\n",oid_len); return NULL; } fclose(ifile); } } else { /* we had a .msk file, but there are no oid's in it. allocate an m_fptr and return it empty */ if ((m_fptr=(struct lmf_str *)calloc(1,sizeof(struct lmf_str)))==NULL) { fprintf(stderr," cannot allocate lmf_str\n"); return NULL; } m_fptr->tmp_buf_max = 0; /* load the oid info */ m_fptr->max_oid = 0; m_fptr->oid_seqs = 0; m_fptr->oid_list = (unsigned int *)calloc(1,sizeof(int)); m_fptr->pref_db= -1; if (ldnaseq==SEQT_DNA) { m_fptr->getlib = ncbl2_getlibn_o; m_fptr->sascii = nascii; } else { m_fptr->getlib = ncbl2_getliba_o; m_fptr->sascii = aascii; } strncpy(m_fptr->lb_name,sname,MAX_FN); return m_fptr; } } /* open the index file */ if ((ifile = fopen(tname,RBSTR))==NULL) { fprintf(stderr," cannot open %s (%s) INDEX file",tname,name); perror("..."); return 0; } src_uint4_read(ifile,(unsigned *)&dbformat); /* get format DB version number */ src_uint4_read(ifile,(unsigned *)&dbtype); /* get 1 for protein/0 DNA */ if (dbformat != FORMATDBV3 && dbformat!=FORMATDBV4) { fprintf(stderr,"error - %s wrong formatdb version (%d/%d)\n", tname,dbformat,FORMATDBV3); return NULL; } if ((ldnaseq==SEQT_PROT && dbtype != AAFORMAT) || (ldnaseq==SEQT_DNA && dbtype!=NTFORMAT)) { fprintf(stderr,"error - %s wrong format (%d/%d)\n", tname,dbtype,(ldnaseq ? NTFORMAT: AAFORMAT)); return NULL; } /* the files are there - allocate lmf_str */ if ((m_fptr=(struct lmf_str *)calloc(1,sizeof(struct lmf_str)))==NULL) { fprintf(stderr," cannot allocate lmf_str\n"); return NULL; } m_fptr->tmp_buf_max = 4096; if ((m_fptr->tmp_buf= (char *)calloc(m_fptr->tmp_buf_max,sizeof(char)))==NULL) { fprintf(stderr," cannot allocate lmf_str->tmp_buffer\n"); return NULL; } /* load the oid info */ m_fptr->max_oid = max_oid; m_fptr->oid_seqs = oid_seqs; m_fptr->oid_list = oid_list; m_fptr->pref_db= pref_db; /* open the header file */ if ((m_fptr->hfile = fopen(hname,RBSTR))==NULL) { fprintf(stderr," cannot open %s header file\n",hname); goto error_r; } /* ncbl2_ranlib is used for all BLAST2.0 access */ m_fptr->ranlib = ncbl2_ranlib; m_fptr->bl_format_ver = dbformat; if (ldnaseq==SEQT_DNA) { if (oid_seqs > 0) { m_fptr->getlib = ncbl2_getlibn_o; } else { m_fptr->getlib = ncbl2_getlibn; } m_fptr->sascii = nascii; } else { if (oid_seqs > 0) { m_fptr->getlib = ncbl2_getliba_o; } else { m_fptr->getlib = ncbl2_getliba; } m_fptr->sascii = aascii; } strncpy(m_fptr->lb_name,sname,MAX_FN); /* open the sequence file */ #if defined (USE_MMAP) m_fptr->mm_flg=((m_fptr->mmap_fd=open(sname,O_RDONLY))>=0); if (!m_fptr->mm_flg) { fprintf(stderr," cannot open %s",sname); perror("..."); } else { if(fstat(m_fptr->mmap_fd, &statbuf) < 0) { fprintf(stderr," cannot fstat %s",sname); perror("..."); m_fptr->mm_flg = 0; } else { m_fptr->st_size = statbuf.st_size; if((m_fptr->mmap_base = mmap(NULL, m_fptr->st_size, PROT_READ, MAP_FILE | MAP_SHARED, m_fptr->mmap_fd, 0)) == (char *) -1) { fprintf(stderr," cannot mmap %s",sname); perror("..."); m_fptr->mm_flg = 0; } else { m_fptr->mmap_addr = m_fptr->mmap_base; m_fptr->mm_flg = 1; } } /* regardless, close the open()ed version */ close(m_fptr->mmap_fd); } #else m_fptr->mm_flg = 0; #endif if (!m_fptr->mm_flg) { if ((m_fptr->libf = fopen(sname,RBSTR))==NULL) { fprintf(stderr," cannot open %s sequence file",sname); perror("..."); goto error_r; } } /* all files should be open */ src_uint4_read(ifile,(unsigned *)&title_len); if (title_len > 0) { if ((title_str = calloc((size_t)title_len+1,sizeof(char)))==NULL) { fprintf(stderr," cannot allocate title string (%d)\n",title_len); goto error_r; } fread(title_str,(size_t)1,(size_t)title_len,ifile); } src_uint4_read(ifile,(unsigned *)&date_len); if (date_len > 0) { if ((date_str = calloc((size_t)date_len+1,sizeof(char)))==NULL) { fprintf(stderr," cannot allocate date string (%d)\n",date_len); goto error_r; } fread(date_str,(size_t)1,(size_t)date_len,ifile); } m_fptr->lpos = 0; src_uint4_read(ifile,(unsigned *)&m_fptr->max_cnt); if (dbformat == FORMATDBV3) { src_long4_read(ifile,<mp); m_fptr->tot_len = ltmp; } else { ncbi_long8_read(ifile,&l8tmp); m_fptr->tot_len = ltmp; } src_long4_read(ifile,<mp); m_fptr->max_len = ltmp; /* currently we are not using this information, but perhaps later */ if (title_str!=NULL) free(title_str); if (date_str!=NULL) free(date_str); #ifdef DEBUG fprintf(stderr,"%s format: BL2 (%s) max_cnt: %d, totlen: %lld, maxlen %ld\n", name,m_fptr->mm_flg ? "mmap" : "fopen", m_fptr->max_cnt,m_fptr->tot_len,m_fptr->max_len); #endif /* allocate and read hdr indexes */ if ((f_pos_arr=(unsigned int *)calloc((size_t)m_fptr->max_cnt+1,sizeof(int)))==NULL) { fprintf(stderr," cannot allocate tmp header pointers\n"); goto error_r; } if ((m_fptr->d_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) { fprintf(stderr," cannot allocate header pointers\n"); goto error_r; } /* allocate and read sequence offsets */ if ((m_fptr->s_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) { fprintf(stderr," cannot allocate sequence pointers\n"); goto error_r; } /* for (i=0; i<=m_fptr->max_cnt; i++) src_uint4_read(ifile,&m_fptr->d_pos_arr[i]); for (i=0; i<=m_fptr->max_cnt; i++) src_uint4_read(ifile,&m_fptr->s_pos_arr[i]); */ if (fread(f_pos_arr,(size_t)4,m_fptr->max_cnt+1,ifile)!=m_fptr->max_cnt+1) { fprintf(stderr," error reading hdr offsets: %s\n",tname); goto error_r; } for (i=0; i<=m_fptr->max_cnt; i++) #ifdef IS_BIG_ENDIAN m_fptr->d_pos_arr[i] = f_pos_arr[i]; #else m_fptr->d_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]); #endif if (fread(f_pos_arr,(size_t)4,m_fptr->max_cnt+1,ifile)!=m_fptr->max_cnt+1) { fprintf(stderr," error reading seq offsets: %s\n",tname); goto error_r; } for (i=0; i<=m_fptr->max_cnt; i++) { #ifdef IS_BIG_ENDIAN m_fptr->s_pos_arr[i] = f_pos_arr[i]; #else m_fptr->s_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]); #endif } if (dbtype == NTFORMAT) { /* allocate and ambiguity offsets */ if ((m_fptr->a_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) { fprintf(stderr," cannot allocate sequence pointers\n"); goto error_r; } /* for (i=0; i<=m_fptr->max_cnt; i++) src_uint4_read(ifile,&m_fptr->a_pos_arr[i]); */ if (fread(f_pos_arr,(size_t)4,m_fptr->max_cnt+1,ifile)!=m_fptr->max_cnt+1) { fprintf(stderr," error reading seq offsets: %s\n",tname); goto error_r; } for (i=0; i<=m_fptr->max_cnt; i++) { #ifdef IS_BIG_ENDIAN m_fptr->a_pos_arr[i] = f_pos_arr[i]; #else m_fptr->a_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]); #endif } } /* for (i=0; i < min(m_fptr->max_cnt,10); i++) { fprintf(stderr,"%d: %d %d %d\n",i,m_fptr->s_pos_arr[i],m_fptr->a_pos_arr[i],m_fptr->d_pos_arr[i]); } */ /* all done with ifile, close it */ fclose(ifile); free(f_pos_arr); if (!m_fptr->mm_flg) { tmp = fgetc(m_fptr->libf); if (tmp!=NULLB) fprintf(stderr," phase error: %d:%d found\n",0,tmp); } m_fptr->bl_lib_pos = 1; amb_cnt = 0; return m_fptr; error_r: /* here if failure after m_fptr allocated */ free(m_fptr); return NULL; } void ncbl2_closelib(struct lmf_str *m_fptr) { if (m_fptr->tmp_buf != NULL) { free(m_fptr->tmp_buf); m_fptr->tmp_buf_max = 0; } if (m_fptr->s_pos_arr !=NULL) { free(m_fptr->s_pos_arr); m_fptr->s_pos_arr = NULL; } if (m_fptr->a_pos_arr!=NULL) { free(m_fptr->a_pos_arr); m_fptr->a_pos_arr = NULL; } if (m_fptr->hfile !=NULL ) { fclose(m_fptr->hfile); m_fptr->hfile=NULL; free(m_fptr->d_pos_arr); m_fptr->d_pos_arr = NULL; } if (m_fptr->oid_list != NULL) { free(m_fptr->oid_list); m_fptr->oid_list = NULL; m_fptr->oid_seqs = m_fptr->max_oid = 0; } #ifdef use_mmap if (m_fptr->mm_flg) { munmap(m_fptr->mmap_base,m_fptr->st_size); m_fptr->mmap_fd = -1; } else #endif if (m_fptr->libf !=NULL ) {fclose(m_fptr->libf); m_fptr->libf=NULL;} } int ncbl2_getliba_o(unsigned char *seq, int maxs, char *libstr, int n_libstr, fseek_t *libpos, int *lcont, struct lmf_str *m_fd, long *l_off) { int tpos; unsigned int t_mask, t_shift, oid_mask; /* get to the next valid pointer */ for ( tpos = m_fd->lpos ;tpos <= m_fd->max_oid; tpos++) { t_mask = tpos / 32; t_shift = 31 - (tpos % 32); if ((oid_mask = m_fd->oid_list[t_mask])==0) { continue; } if ((bl2_uint4_cvt(oid_mask) & 0x1 << t_shift)) { if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[tpos],0); m_fd->lpos = tpos; /* already bumped up */ m_fd->bl_lib_pos = m_fd->s_pos_arr[tpos]; return ncbl2_getliba(seq, maxs, libstr, n_libstr, libpos, lcont, m_fd, l_off); } } return -1; } int ncbl2_getliba(unsigned char *seq, int maxs, char *libstr, int n_libstr, fseek_t *libpos, int *lcont, struct lmf_str *m_fd, long *l_off) { unsigned char *sptr, *dptr; int s_chunk, d_len, lib_cnt; long seqcnt; long tmp; static long seq_len; #if defined(DEBUG) || defined(PCOMPLIB) int gi, my_db, taxid; char acc[20], title[21], name[20]; #endif *l_off = 1; lib_cnt = m_fd->lpos; *libpos = (fseek_t)m_fd->lpos; if (*lcont==0) { if (lib_cnt >= m_fd->max_cnt) return -1; /* no more sequences */ seq_len = m_fd->s_pos_arr[lib_cnt+1] - m_fd->s_pos_arr[lib_cnt]; /* value is +1 off to get the NULL */ if (m_fd->mm_flg) m_fd->mmap_addr = m_fd->mmap_base+m_fd->s_pos_arr[lib_cnt]; #if !defined(DEBUG) && !defined(PCOMPLIB) libstr[0]='\0'; #else /* get the name from the header file */ fseek(m_fd->hfile,m_fd->d_pos_arr[lib_cnt],0); if (m_fd->bl_format_ver == FORMATDBV3) { d_len = min(n_libstr-1,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1); fread(libstr,(size_t)1,(size_t)d_len,m_fd->hfile); libstr[d_len]='\0'; } else { d_len = min(m_fd->tmp_buf_max,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1); fread(m_fd->tmp_buf,(size_t)1,(size_t)d_len,m_fd->hfile); parse_fastadl_asn((unsigned char *)m_fd->tmp_buf, (unsigned char *)m_fd->tmp_buf+d_len, &gi, &my_db, acc, name, title, 20, &taxid); sprintf(libstr,"gi|%d",gi); } #endif } if (seq_len <= maxs) { /* sequence fits */ seqcnt = seq_len; m_fd->lpos++; *lcont = 0; } else { /* doesn't fit */ seqcnt = maxs-1; (*lcont)++; } if (m_fd->mm_flg) sptr = (unsigned char *)m_fd->mmap_addr; else { if ((tmp=fread(seq,(size_t)1,(size_t)seq_len,m_fd->libf))!=(size_t)seq_len) { fprintf(stderr," could not read sequence record: %ld %ld != %ld\n", *libpos,tmp,seq_len); goto error; } sptr = seq; } if (seq_len <= maxs) {seqcnt = --seq_len;} /* everything is ready, set up dst. pointer, seq_len */ dptr = seq; if (aa_b2toa[sptr[seq_len-1]]=='*') seq_len--; s_chunk = seqcnt/16; while (s_chunk-- > 0) { *dptr++ = aa_btof[*sptr++]; *dptr++ = aa_btof[*sptr++]; *dptr++ = aa_btof[*sptr++]; *dptr++ = aa_btof[*sptr++]; *dptr++ = aa_btof[*sptr++]; *dptr++ = aa_btof[*sptr++]; *dptr++ = aa_btof[*sptr++]; *dptr++ = aa_btof[*sptr++]; *dptr++ = aa_btof[*sptr++]; *dptr++ = aa_btof[*sptr++]; *dptr++ = aa_btof[*sptr++]; *dptr++ = aa_btof[*sptr++]; *dptr++ = aa_btof[*sptr++]; *dptr++ = aa_btof[*sptr++]; *dptr++ = aa_btof[*sptr++]; *dptr++ = aa_btof[*sptr++]; } while (dptr < seq+seqcnt) *dptr++ = aa_btof[*sptr++]; if (m_fd->mm_flg) m_fd->mmap_addr = (char *)sptr; /* we didn't get it all, so reset for more */ if (*lcont) seq_len -= seqcnt; seq[seqcnt]= EOSEQ; return (seqcnt); error: fprintf(stderr," error reading %s at %ld\n",libstr,*libpos); fflush(stderr); return (-1); } int ncbl2_getlibn_o(unsigned char *seq, int maxs, char *libstr, int n_libstr, fseek_t *libpos, int *lcont, struct lmf_str *m_fd, long *l_off) { int tpos; unsigned int t_mask, t_shift, oid_mask; /* get to the next valid pointer */ for (tpos = m_fd->lpos; tpos <= m_fd->max_oid; tpos++) { t_mask = tpos / 32; t_shift = 31 - (tpos % 32); if ((oid_mask = m_fd->oid_list[t_mask])==0) { continue; } if ((bl2_uint4_cvt(oid_mask) & 0x1 << t_shift)) { if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[tpos],0); m_fd->lpos = tpos; /* already bumped up */ m_fd->bl_lib_pos = m_fd->s_pos_arr[tpos]; return ncbl2_getlibn(seq, maxs, libstr, n_libstr, libpos, lcont, m_fd, l_off); } } return -1; } static char tmp_amb[4096]; int ncbl2_getlibn(unsigned char *seq, int maxs, char *libstr, int n_libstr, fseek_t *libpos, int *lcont, struct lmf_str *m_fd, long *l_off) { unsigned char *sptr, *tptr, stmp; long seqcnt; int s_chunk, lib_cnt; size_t tmp; char ch; static long seq_len; static int c_len,c_pad; int c_len_set, d_len; *l_off = 1; lib_cnt = m_fd->lpos; *libpos = (fseek_t)lib_cnt; if (*lcont==0) { /* not a continuation of previous */ if (lib_cnt >= m_fd->max_cnt) return (-1); c_len = m_fd->a_pos_arr[lib_cnt]- m_fd->s_pos_arr[lib_cnt]; if (!m_fd->mm_flg) { if (m_fd->bl_lib_pos != m_fd->s_pos_arr[lib_cnt]) { /* are we positioned to read? */ amb_cnt++; if ((m_fd->bl_lib_pos - m_fd->s_pos_arr[lib_cnt]) < sizeof(tmp_amb)) { /* jump over amb_ray */ fread(tmp_amb,(size_t)1,(size_t)(m_fd->s_pos_arr[lib_cnt]-m_fd->bl_lib_pos),m_fd->libf); } else { /* fseek over amb_ray */ fseek(m_fd->libf,m_fd->s_pos_arr[lib_cnt],0); } m_fd->bl_lib_pos = m_fd->s_pos_arr[lib_cnt]; } } else m_fd->mmap_addr = m_fd->mmap_base + m_fd->s_pos_arr[lib_cnt]; #if !defined(DEBUG) && !defined(PCOMPLIB) libstr[0]='\0'; #else /* get the name from the header file */ fseek(m_fd->hfile,m_fd->d_pos_arr[lib_cnt],0); d_len = min(n_libstr-1,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1); fread(libstr,(size_t)1,(size_t)d_len,m_fd->hfile); libstr[d_len]='\0'; #endif } /* end of *lcont==0 */ /* To avoid the situation where c_len <= 1; we must anticipate what c_len will be after this pass. If it will be <= 64, back off this time so next time it will be > 64 */ seq_len = c_len*4; if ((seq_len+4 > maxs) && (seq_len+4 - maxs <= 256)) { /* we won't be done but we will have less than 256 to go */ c_len -= 64; seq_len -= 256; c_len_set = 1; maxs -= 256;} else c_len_set = 0; /* fprintf(stderr," lib_cnt: %d %d %d %d\n",lib_cnt,c_len,seq_len,maxs); */ /* does the rest of the sequence fit? */ if (seq_len <= maxs-4 && !c_len_set) { seqcnt = c_len; if (!m_fd->mm_flg) { if ((tmp=fread(seq,(size_t)1,(size_t)seqcnt,m_fd->libf))!=(size_t)seqcnt) { fprintf(stderr, " could not read sequence record: %s %lld %ld != %ld: %d\n", libstr,*libpos,tmp,seqcnt,*seq); goto error; } m_fd->bl_lib_pos += tmp; sptr = seq + seqcnt; } else sptr = (unsigned char *)(m_fd->mmap_addr+seqcnt); *lcont = 0; /* this is the last chunk */ lib_cnt++; /* increment to the next sequence */ /* the last byte is either '0' (no remainder) or the last 1-3 chars and the remainder */ c_pad = *(sptr-1); c_pad &= 0x3; /* get the last (low) 2 bits */ seq_len -= (4 - c_pad); /* if the last 2 bits are 0, its a NULL byte */ } else { /* get the next chunk, but more to come */ seqcnt = ((maxs+3)/4)-1; if (!m_fd->mm_flg) { if ((tmp=fread(seq,(size_t)1,(size_t)(seqcnt),m_fd->libf))!=(size_t)(seqcnt)) { fprintf(stderr," could not read sequence record: %lld %ld/%ld\n", *libpos,tmp,seqcnt); goto error; } m_fd->bl_lib_pos += tmp; sptr = seq + seqcnt; } else { sptr = (unsigned char *)(m_fd->mmap_addr+seqcnt); m_fd->mmap_addr += seqcnt; } seq_len = 4*seqcnt; c_len -= seqcnt; if (c_len_set) {c_len += 64; maxs += 256;} (*lcont)++; /* hopefully we don't need this because of c_len -= 64. */ /* if (c_len == 1) { #if !defined (USE_MMAP) c_pad = fgetc(m_fd->libf); *sptr=c_pad; #else c_pad = *m_fd->mmap_addr++; sptr = m_fd->mmap_addr; #endif c_pad &= 0x3; seq_len += c_pad; seqcnt++; lib_cnt++; *lcont = 0; } */ } /* point to the last packed byte and to the end of the array seqcnt is the exact number of bytes read tptr points to the destination, use multiple of 4 to simplify math sptr points to the source, note that the last byte will be read 4 cycles before it is written */ tptr = seq + 4*seqcnt; s_chunk = seqcnt/8; while (s_chunk-- > 0) { stmp = *--sptr; *--tptr = (stmp&3) +1; *--tptr = ((stmp >>= 2)&3)+1; *--tptr = ((stmp >>= 2)&3)+1; *--tptr = ((stmp >>= 2)&3)+1; stmp = *--sptr; *--tptr = (stmp&3) +1; *--tptr = ((stmp >>= 2)&3)+1; *--tptr = ((stmp >>= 2)&3)+1; *--tptr = ((stmp >>= 2)&3)+1; stmp = *--sptr; *--tptr = (stmp&3) +1; *--tptr = ((stmp >>= 2)&3)+1; *--tptr = ((stmp >>= 2)&3)+1; *--tptr = ((stmp >>= 2)&3)+1; stmp = *--sptr; *--tptr = (stmp&3) +1; *--tptr = ((stmp >>= 2)&3)+1; *--tptr = ((stmp >>= 2)&3)+1; *--tptr = ((stmp >>= 2)&3)+1; stmp = *--sptr; *--tptr = (stmp&3) +1; *--tptr = ((stmp >>= 2)&3)+1; *--tptr = ((stmp >>= 2)&3)+1; *--tptr = ((stmp >>= 2)&3)+1; stmp = *--sptr; *--tptr = (stmp&3) +1; *--tptr = ((stmp >>= 2)&3)+1; *--tptr = ((stmp >>= 2)&3)+1; *--tptr = ((stmp >>= 2)&3)+1; stmp = *--sptr; *--tptr = (stmp&3) +1; *--tptr = ((stmp >>= 2)&3)+1; *--tptr = ((stmp >>= 2)&3)+1; *--tptr = ((stmp >>= 2)&3)+1; stmp = *--sptr; *--tptr = (stmp&3) +1; *--tptr = ((stmp >>= 2)&3)+1; *--tptr = ((stmp >>= 2)&3)+1; *--tptr = ((stmp >>= 2)&3)+1; } while (tptr>seq) { stmp = *--sptr; *--tptr = (stmp&3) +1; *--tptr = ((stmp >>= 2)&3)+1; *--tptr = ((stmp >>= 2)&3)+1; *--tptr = ((stmp >>= 2)&3)+1; } /* for (sptr=seq; sptr < seq+seq_len; sptr++) { printf("%c",nt[*sptr]); if ((int)(sptr-seq) % 60 == 59) printf("\n"); } printf("\n"); */ m_fd->lpos = lib_cnt; if (seqcnt*4 >= seq_len) { /* there was enough room */ seq[seq_len]= EOSEQ; /* printf("%d\n",seq_len); */ return seq_len; } else { /* not enough room */ seq[seqcnt*4]=EOSEQ; seq_len -= 4*seqcnt; return (4*seqcnt); } error: fprintf(stderr," error reading %s at %ld\n",libstr,*libpos); fflush(stderr); return (-1); } /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 */ static char *db_type_arr[] = {"lcl","gib","gim","gii","gb","emb","pir","sp", "pat","ref","gnl","gi","dbj","prf","pdb","tpg", "tpe","tpd"}; void ncbl2_ranlib(char *str, int cnt, fseek_t libpos, char *libstr, struct lmf_str *m_fd) { int llen, lib_cnt; char *bp; unsigned char *my_buff=NULL; char descr[2048]; unsigned char *abp; int gi, taxid; int my_db; char db[5], acc[20], name[20]; char title[1024]; int have_my_buff=0; int have_descr = 0; lib_cnt = (int)libpos; llen = m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]; fseek(m_fd->hfile,m_fd->d_pos_arr[libpos],0); if (m_fd->bl_format_ver == FORMATDBV3) { if (llen >= cnt) llen = cnt-1; fread(str,(size_t)1,(size_t)(llen),m_fd->hfile); } else { if (llen >= m_fd->tmp_buf_max) { if ((my_buff=(unsigned char *)calloc(llen,sizeof(char)))==NULL) { fprintf(stderr," cannot allocate ASN.1 buffer: %d\n",llen); my_buff = (unsigned char *)m_fd->tmp_buf; llen = m_fd->tmp_buf_max; } else have_my_buff = 1; } else { my_buff = (unsigned char *)m_fd->tmp_buf; } abp = my_buff; fread(my_buff,(size_t)1,llen,m_fd->hfile); do { abp = parse_fastadl_asn(abp, my_buff+llen, &gi, &my_db, acc, name, title, sizeof(title), &taxid); if (gi > 0) { sprintf(descr,"gi|%d|%s|%s|%s ",gi,db_type_arr[my_db],acc,name); } else { if (acc[0] != '\0') sprintf(descr,"%s ",acc); else descr[0] = '\0'; if (name[0] != '\0' && strcmp(name,"BL_ORD_ID")!=0) sprintf(descr+strlen(descr),"%s ", name); } if (m_fd->pref_db < 0) { if (!have_descr) { strncpy(str,descr,cnt-1); have_descr = 1; } else { strncat(str,"\001",cnt-strlen(str)-1); strncat(str,descr,cnt-strlen(str)-1); } strncat(str,title,cnt-strlen(str)-1); if (strlen(str) >= cnt-1) break; } else if (m_fd->pref_db == my_db) { have_descr = 1; strncpy(str,descr,cnt-1); strncat(str,title,cnt-strlen(str)-1); break; } } while (abp); if (!have_descr) { strncpy(str,descr,cnt-1); strncat(str,descr,cnt-strlen(str)-1); } if (have_my_buff) free(my_buff); } str[cnt-1]='\0'; bp = str; while((bp=strchr(bp,'\001'))!=NULL) {*bp++=' ';} if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[libpos],0); m_fd->lpos = lib_cnt; m_fd->bl_lib_pos = m_fd->s_pos_arr[lib_cnt]; } unsigned int bl2_uint4_cvt(unsigned int val) { unsigned int res; #ifdef IS_BIG_ENDIAN return val; #else /* it better be LITTLE_ENDIAN */ res = ((val&255)*256)+ ((val>>8)&255); res = (res<<16) + (((val>>16)&255)*256) + ((val>>24)&255); return res; #endif } unsigned int bl2_long4_cvt(long val) { int val4; unsigned int res; #ifdef IS_BIG_ENDIAN val4 = val; return val4; #else /* it better be LITTLE_ENDIAN */ res = ((val&255)*256)+ ((val>>8)&255); res = (res<<16) + (((val>>16)&255)*256) + ((val>>24)&255); return res; #endif } int64_t bl2_long8_cvt(int64_t val) { int64_t res; #ifdef IS_BIG_ENDIAN return val; #else /* it better be LITTLE_ENDIAN */ res = ((val&255)*256)+ ((val>>8)&255); res = (res<<16) + (((val>>16)&255)*256) + ((val>>24)&255); #ifdef BIG_LIB64 res = (res<<16) + (((val>>32)&255)*256) + ((val>>40)&255); res = (res<<16) + (((val>>48)&255)*256) + ((val>>56)&255); #else fprintf(stderr,"Cannot use bl2_long8_cvt without 64-bit longs\n"); exit(1); #endif return res; #endif } void src_int4_read(FILE *fd, int *val) { #ifdef IS_BIG_ENDIAN fread((char *)val,(size_t)4,(size_t)1,fd); #else unsigned char b[4]; fread((char *)&b[0],(size_t)1,(size_t)4,fd); *val = 0; *val = (int)((int)((int)(b[0]<<8)+(int)b[1]<<8)+(int)b[2]<<8) +(int)b[3]; #endif } void src_long4_read(FILE *fd, long *valp) { int val4; #ifdef IS_BIG_ENDIAN fread(&val4,(size_t)4,(size_t)1,fd); *valp = val4; #else unsigned char b[4]; fread((char *)&b[0],(size_t)1,(size_t)4,fd); val4 = 0; val4 = (int)((int)((int)(b[0]<<8)+(int)b[1]<<8)+(int)b[2]<<8) +(int)b[3]; *valp = val4; #endif } void src_uint4_read(FILE *fd, unsigned int *valp) { #ifdef IS_BIG_ENDIAN fread(valp,(size_t)4,(size_t)1,fd); #else unsigned char b[4]; fread((char *)&b[0],(size_t)1,(size_t)4,fd); *valp = 0; *valp = (unsigned int)((int)((int)(b[0]<<8)+(int)b[1]<<8)+(int)b[2]<<8) +(int)b[3]; #endif } void src_long8_read(FILE *fd, long *val) { #ifdef IS_BIG_ENDIAN fread((void *)val,(size_t)8,(size_t)1,fd); #else unsigned char b[8]; fread((char *)&b[0],(size_t)1,(size_t)8,fd); *val = 0; *val = (long)((((((long)((long)(b[0]<<8)+(long)b[1]<<8)+(long)b[2]<<8) +(long)b[3]<<8)+(long)b[4]<<8)+(long)b[5]<<8) +(long)b[6]<<8)+(long)b[7]; #endif } void ncbi_long8_read(FILE *fd, int64_t *val) { unsigned char b[8]; fread((char *)&b[0],(size_t)1,(size_t)8,fd); *val = 0; *val = (long)((((((long)((long)(b[7]<<8)+(long)b[6]<<8)+(long)b[5]<<8) +(long)b[4]<<8)+(long)b[3]<<8)+(long)b[2]<<8) +(long)b[1]<<8)+(long)b[0]; } void src_char_read(FILE *fd, char *val) { fread(val,(size_t)1,(size_t)1,fd); } void src_fstr_read(FILE *fd, char *val, int slen) { fread(val,(size_t)slen,(size_t)1,fd); } void newname(char *nname, char *oname, char *suff, int maxn) { strncpy(nname,oname,maxn-1); strncat(nname,".",1); strncat(nname,suff,maxn-strlen(nname)); } #define ASN_SEQ 0x30 #define ASN_IS_BOOL 1 #define ASN_IS_INT 2 #define ASN_IS_STR 26 unsigned char * get_asn_int(unsigned char *abp, int *val) { int v_len, v; v = 0; if (*abp++ != ASN_IS_INT) { /* check for int */ fprintf(stderr," int missing\n"); } else { v_len = *abp++; while (v_len-- > 0) { v *= 256; v += *abp++; } abp += 2; /* skip over null's */ } *val = v; return abp; } unsigned char * get_asn_text(unsigned char *abp, char *text, int t_len) { int tch, at_len; text[0] = '\0'; if (*abp++ != ASN_IS_STR) { /* check for str */ fprintf(stderr," str missing\n"); } else { if ((tch = *abp++) > 128) { /* string length is in next bytes */ tch &= 0x7f; /* get number of bytes for len */ at_len = 0; while (tch-- > 0) { at_len = (at_len << 8) + *abp++;} } else { at_len = tch; } if ( at_len < t_len-1) { memcpy(text, abp, at_len); text[at_len] = '\0'; } else { memcpy(text, abp, t_len-1); text[t_len-1] = '\0'; } abp += at_len + 2; } return abp; } /* something to try to skip over stuff we don't want */ unsigned char * get_asn_junk(unsigned char *abp) { int seq_cnt = 0; int tmp; char string[256]; while (*abp) { if ( *abp == ASN_SEQ) { abp += 2; seq_cnt++;} else if ( *abp == ASN_IS_BOOL ) {abp = get_asn_int(abp, &tmp);} else if ( *abp == ASN_IS_INT ) {abp = get_asn_int(abp, &tmp);} else if ( *abp == ASN_IS_STR ) {abp = get_asn_text(abp, string, sizeof(string)-1);} } while (seq_cnt-- > 0) abp += 2; return abp; } unsigned char * get_asn_textseq_id(unsigned char *abp, char *name, char *acc) { char release[20], ver_str[10]; int version; int seqcnt = 0; ver_str[0]='\0'; if (*abp == ASN_SEQ) { abp += 2; seqcnt++;} while (*abp) { switch (*abp) { case 0xa0: abp = get_asn_text(abp+2, name, 20); break; case 0xa1: abp = get_asn_text(abp+2, acc, 20); break; case 0xa2: abp = get_asn_text(abp+2, release, sizeof(release)); break; case 0xa3: abp = get_asn_int(abp+2, &version); sprintf(ver_str,".%d",version); break; default: abp += 2; } } while (seqcnt-- > 0) abp += 4; strncat(acc,ver_str,20-strlen(acc)); acc[19]='\0'; return abp; /* skip 2 NULL's */ } unsigned char * get_asn_local_id(unsigned char *abp, char *acc) { int seqcnt = 0; if (*abp == ASN_SEQ) { abp += 2; seqcnt++;} abp = get_asn_text(abp+2, acc, 20); while (seqcnt-- > 0) abp += 4; acc[19]='\0'; return abp; /* skip 2 NULL's */ } unsigned char * get_asn_dbtag(unsigned char *abp, char *name, char *str, int *id_p) { if (*abp == ASN_SEQ) { abp += 2;} if (*abp == 0xa0) { /* get db */ abp = get_asn_text(abp+2, name, 20); } else { fprintf(stderr," missing dbtag:db %d %d\n",abp[0],abp[1]); abp += 2; } if (*abp == 0xa1) { /* get tag */ abp += 2; abp += 2; /* skip over id */ if (*abp == 2) abp = get_asn_int(abp,id_p); else abp = get_asn_text(abp+2, str, 20); } else { fprintf(stderr," missing dbtag:tag %2x %2x\n",abp[0],abp[1]); abp += 2; } return abp+2; /* skip 2 NULL's */ } unsigned char * get_asn_pdb_id(unsigned char *abp, char *acc, char *chain) { int ichain, seq_cnt=0; if (*abp == ASN_SEQ) { abp += 2; seq_cnt++;} while (*abp) { switch (*abp) { case 0: abp += 2; break; case 0xa0: /* mol-id */ abp = get_asn_text(abp+2, acc, 20); break; case 0xa1: abp = get_asn_int(abp+2, &ichain); chain[0] = ichain; chain[1] = '\0'; break; case 0xa2: /* ignore date - scan until NULL's */ while (*abp++) {} abp += 2; /* skip the NULL's */ break; default: abp+=2; } } while (seq_cnt-- > 0) {abp += 4;} return abp; } #define ASN_TYPE_MASK 31 unsigned char *get_asn_seqid(unsigned char *abp, int *gi_p, int *db, char *acc, char *name) { int db_type, itmp, seq_cnt=0; *gi_p = 0; if (*abp != ASN_SEQ) { fprintf(stderr, "seqid - missing SEQ 1: %2x %2x\n",abp[0], abp[1]); return abp; } else { abp += 2; seq_cnt++;} db_type = (*abp & ASN_TYPE_MASK); if (db_type == 11) { /* gi */ abp = get_asn_int(abp+2,gi_p); } while (*abp == ASN_SEQ) {abp += 2; seq_cnt++;} db_type = (*abp & ASN_TYPE_MASK); if (db_type > 17) {db_type = 0;} *db = db_type; switch(db_type) { case 0: abp = get_asn_local_id(abp+2, acc); break; case 1: case 2: abp = get_asn_int(abp+2,&itmp); abp += 2; break; case 11: abp = get_asn_int(abp+2,&itmp); break; case 4: case 5: case 6: case 7: case 9: case 12: case 13: case 15: case 16: case 17: abp = get_asn_textseq_id(abp+2,name,acc); break; case 10: abp = get_asn_dbtag(abp+2,name,acc,&itmp); case 14: abp = get_asn_pdb_id(abp+2,acc,name); break; default: abp += 2; } while (seq_cnt-- > 0) { abp += 4;} return abp; /* skip over 2 NULL's */ } #define ASN_FADL_TITLE 0xa0 #define ASN_FADL_SEQID 0xa1 #define ASN_FADL_TAXID 0xa2 #define ASN_FADL_MEMBERS 0xa3 #define ASN_FADL_LINKS 0xa4 #define ASN_FADL_OTHER 0xa5 unsigned char * parse_fastadl_asn(unsigned char *asn_buff, unsigned char *asn_max, int *gi_p, int *db, char *acc, char *name, char *title, int t_len, int *taxid_p) { unsigned char *abp; char tmp_db[4], tmp_acc[32], tmp_name[32]; int this_db; int seq_cnt = 0; int tmp_gi; acc[0] = name[0] = db[0] = title[0] = '\0'; abp = asn_buff; while ( abp < asn_max && *abp) { if (*abp == ASN_SEQ) { abp += 2; seq_cnt++; } else if (*abp == ASN_FADL_TITLE) { abp = get_asn_text(abp+2, title, t_len); } else if (*abp == ASN_FADL_SEQID ) { abp = get_asn_seqid(abp+2, gi_p, db, acc, name); if (*db > 17) *db = 0; } else if (*abp == ASN_FADL_TAXID ) { abp = get_asn_int(abp+2, taxid_p); } else if (*abp == ASN_FADL_MEMBERS) { abp = get_asn_junk(abp+2); } else if (*abp == ASN_FADL_LINKS ) { abp = get_asn_junk(abp+2); } else if (*abp == ASN_FADL_OTHER ) { abp = get_asn_junk(abp+2); } else { /* fprintf(stderr, " Error - missing ASN.1 %2x:%2x:%2x:%2x\n", abp[-2],abp[-1],abp[0],abp[1]); */ abp += 2; } } while (abp < asn_max && *abp == '\0' ) abp++; if (abp >= asn_max) return NULL; else return abp; } void parse_pal(char *dname, char *msk_name, int *oid_seqs, int *max_oid, FILE *fd) { char line[MAX_STR]; while (fgets(line,sizeof(line),fd)) { if (line[0] == '#') continue; if (strncmp(line, "DBLIST", 6)==0) { sscanf(line+7,"%s",dname); } else if (strncmp(line, "OIDLIST", 7)==0) { sscanf(line+8,"%s",msk_name); } else if (strncmp(line, "NSEQ", 4)==0) { sscanf(line+5,"%d",oid_seqs); } else if (strncmp(line, "MAXOID", 6)==0) { sscanf(line+7,"%d",max_oid); } } }