1 /* ncbl_lib.c functions to read ncbi-blast format files from
2 setdb (blastp 1.3.2) format files
4 copyright (c) 1992 William R. Pearson
7 /* $Name: fa_34_26_5 $ - $Id: ncbl_lib.c,v 1.9 2006/10/05 18:22:07 wrp Exp $ */
22 #include "ncbl_head.h"
25 int ncbl_getliba(char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
26 int ncbl_getlibn(char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
28 void src_ulong_read();
31 static void src_char_read();
32 static void newname(char *, char *, char *, int);
35 void newname(char *, char *, char *, int);
38 /* nt_btoa maps from blast 2bit format to ascii characters */
39 static char nt_btoa[5] = {"ACGT"};
41 static char aa_btoa[27]= {"-ARNDCQEGHILKMFPSTWYVBZX*"};
42 static int aa_btof[32]; /* maps to fasta alphabet */
44 static FILE *tfile=NULL, /* table of offsets, also DB info */
45 *hfile=NULL, /* description lines */
46 *sfile=NULL; /* binary sequence data */
48 static unsigned long lib_cnt, max_cnt, totlen, mxlen, dbline_len;
49 static unsigned long *seq_beg, *hdr_beg;
50 static unsigned char *ambiguity_ray;
51 static long seq_format, dbtype, dbformat;
52 static char dline[512];
57 ncbl_openlib(char *name, int ldnaseq)
66 unsigned long line_len, c_len, clean_count;
69 newname(tname,name,AA_TABLE_EXT,(int)sizeof(tname));
70 if ((tfile = fopen(tname,RBSTR))==NULL) {
71 fprintf(stderr," cannot open %s (%s.%s) table file\n",
72 name,tname,NT_TABLE_EXT);
75 seq_format = AAFORMAT;
78 newname(tname,name,NT_TABLE_EXT,(int)sizeof(tname));
79 if ((tfile = fopen(tname,RBSTR))==NULL) {
80 fprintf(stderr," cannot open %s (%s.%s) table file\n",
81 name,tname,NT_TABLE_EXT);
84 seq_format = NTFORMAT;
87 src_ulong_read(tfile,&dbtype);
88 src_ulong_read(tfile,&dbformat);
90 if (seq_format == AAFORMAT && (dbformat != seq_format || dbtype !=
92 fprintf(stderr,"error - %s wrong type (%ld/%d) or format (%ld/%ld)\n",
93 tname,dbtype,DB_TYPE_PRO,dbformat,seq_format);
96 else if (seq_format == NTFORMAT && (dbformat != seq_format || dbtype !=
98 fprintf(stderr,"error - %s wrong type (%ld/%d) or format (%ld/%ld)\n",
99 tname,dbtype,DB_TYPE_NUC,dbformat,seq_format);
103 if (seq_format == AAFORMAT) {
104 newname(hname,name,AA_HEADER_EXT,(int)sizeof(hname));
105 if ((hfile = fopen(hname,RBSTR))==NULL) {
106 fprintf(stderr," cannot open %s header file\n",hname);
109 newname(sname,name,AA_SEARCHSEQ_EXT,(int)sizeof(sname));
110 if ((sfile = fopen(sname,RBSTR))==NULL) {
111 fprintf(stderr," cannot open %s sequence file\n",sname);
116 newname(hname,name,NT_HEADER_EXT,(int)sizeof(hname));
117 if ((hfile = fopen(hname,RBSTR))==NULL) {
118 fprintf(stderr," cannot open %s header file\n",hname);
121 newname(sname,name,NT_SEARCHSEQ_EXT,(int)sizeof(sname));
122 if ((sfile = fopen(sname,RBSTR))==NULL) {
123 fprintf(stderr," cannot open %s sequence file\n",sname);
128 /* all files should be open */
130 src_ulong_read(tfile,&title_len);
131 rdtmp = title_len + ((title_len%4 !=0 ) ? 4-(title_len%4) : 0);
132 if ((title_str = calloc((size_t)rdtmp,sizeof(char)))==NULL) {
133 fprintf(stderr," cannot allocate title string (%d)\n",rdtmp);
136 fread(title_str,(size_t)1,(size_t)rdtmp,tfile);
139 if (seq_format == AAFORMAT) {
140 src_ulong_read(tfile,&max_cnt);
141 src_ulong_read(tfile,&totlen);
142 src_ulong_read(tfile,&mxlen);
144 /* fprintf(stderr," max_cnt: %d, totlen: %d\n",max_cnt,totlen); */
146 if ((seq_beg=(unsigned long *)calloc((size_t)max_cnt+1,sizeof(long)))==NULL) {
147 fprintf(stderr," cannot allocate sequence pointers\n");
150 if ((hdr_beg=(unsigned long *)calloc((size_t)max_cnt+1,sizeof(long)))==NULL) {
151 fprintf(stderr," cannot allocate header pointers\n");
154 for (i=0; i<max_cnt+1; i++) src_ulong_read(tfile,&seq_beg[i]);
155 for (i=0; i<max_cnt+1; i++) src_ulong_read(tfile,&hdr_beg[i]);
157 for (i=0; i<sizeof(aa_btoa); i++) {
158 if ((rdtmp=aascii[aa_btoa[i]])<NA) aa_btof[i]=rdtmp;
159 else aa_btof[i]=aascii['X'];
162 else if (seq_format == NTFORMAT) {
163 src_ulong_read(tfile,&dbline_len); /* length of uncompress DB lines */
164 src_ulong_read(tfile,&max_cnt); /* number of entries */
165 src_ulong_read(tfile,&mxlen); /* maximum length sequence */
166 src_ulong_read(tfile,&totlen); /* total count */
167 src_ulong_read(tfile,&c_len); /* compressed db length */
168 src_ulong_read(tfile,&clean_count); /* count of nt's cleaned */
170 fseek(tfile,(size_t)((clean_count)*4),1);
171 /* seek over clean_count */
172 if ((seq_beg=(unsigned long *)calloc((size_t)max_cnt+1,sizeof(long)))==NULL) {
173 fprintf(stderr," cannot allocate sequence pointers\n");
176 if ((hdr_beg=(unsigned long *)calloc((size_t)max_cnt+1,sizeof(long)))==NULL) {
177 fprintf(stderr," cannot allocate header pointers\n");
181 (unsigned char *)calloc((size_t)max_cnt/CHAR_BIT+1,sizeof(char)))==NULL) {
182 fprintf(stderr," cannot allocate ambiguity_ray\n");
186 for (i=0; i<max_cnt+1; i++) src_ulong_read(tfile,&seq_beg[i]);
187 fseek(tfile,(size_t)((max_cnt+1)*4),1);
188 /* seek over seq_beg */
189 for (i=0; i<max_cnt+1; i++) src_ulong_read(tfile,&hdr_beg[i]);
190 for (i=0; i<max_cnt/CHAR_BIT+1; i++)
191 src_char_read(tfile,&ambiguity_ray[i]);
198 if (tfile !=NULL ) {fclose(tfile); tfile=NULL;}
199 if (hfile !=NULL ) {fclose(hfile); hfile=NULL;}
200 if (sfile !=NULL ) {fclose(sfile); sfile=NULL;}
204 ncbl_getliba(char *seq, int maxs,
205 char *libstr, int n_libstr,
217 if (lib_cnt >= max_cnt) return -1;
218 seq_len = seq_beg[lib_cnt+1] - seq_beg[lib_cnt] -1;
219 tmp=(long)fgetc(sfile); /* skip the null byte */
221 fprintf(stderr," phase error: %ld:%ld found\n",lib_cnt,tmp);
225 if (seq_len < maxs) {
226 if ((tmp=fread(seq,(size_t)1,(size_t)seq_len,sfile))!=(size_t)seq_len) {
227 fprintf(stderr," could not read sequence record: %ld %ld != %ld\n",
228 *libpos,tmp,seq_len);
231 if (aa_btoa[seq[seq_len-1]]=='*') seqcnt = seq_len-1;
237 if (fread(seq,(size_t)1,(size_t)(maxs-1),sfile)!=(size_t)(maxs-1)) {
238 fprintf(stderr," could not read sequence record: %ld %ld\n",
248 while (--sptr >= seq) *sptr = aa_btof[*sptr];
253 error: fprintf(stderr," error reading %ld at %ld\n",libstr,*libpos);
259 ncbl_getlibn(char *seq, int maxs,
260 char *libstr, int n_libstr,
261 fseek_t *libpos, int *lcont)
263 register char *sptr, *tptr, stmp;
268 static int c_len,c_pad;
272 if (lib_cnt >= max_cnt) return -1;
273 c_len = seq_beg[lib_cnt+1]/(CHAR_BIT/NBPN)
274 - seq_beg[lib_cnt]/(CHAR_BIT/NBPN);
277 seq_len = c_len*(CHAR_BIT/NBPN);
278 c_pad = seq_beg[lib_cnt] & ((CHAR_BIT/NBPN)-1);
279 if (c_pad != 0) seq_len -= ((CHAR_BIT/NBPN) - c_pad);
281 tmp=fgetc(sfile); /* skip the null byte */
282 if (tmp!=NT_MAGIC_BYTE) {
283 fprintf(stderr," phase error: %ld:%ld (%ld/%d) found\n",
284 lib_cnt,seq_len,tmp,NT_MAGIC_BYTE);
290 if (seq_len < maxs-3) {
291 seqcnt=(seq_len+3)/4;
292 if (seqcnt==0) seqcnt++;
293 if ((tmp=fread(seq,(size_t)1,(size_t)seqcnt,sfile))
296 " could not read sequence record: %s %ld %ld != %ld: %d\n",
297 libstr,*libpos,tmp,seqcnt,*seq);
300 tmp=fgetc(sfile); /* skip the null byte */
301 if (tmp!=(unsigned char)NT_MAGIC_BYTE) {
302 fprintf(stderr," phase2 error: %ld:%ld (%ld/%d) next ",
303 lib_cnt,seqcnt,tmp,NT_MAGIC_BYTE);
311 seqcnt = ((maxs+3)/4)-1;
312 if (fread(seq,(size_t)1,(size_t)(seqcnt),sfile)!=(size_t)(seqcnt)) {
313 fprintf(stderr," could not read sequence record: %s %ld %ld\n",
314 libstr,*libpos,seqcnt);
320 /* point to the last packed byte and to the end of the array
321 seqcnt is the exact number of bytes read
322 tptr points to the destination, use multiple of 4 to simplify math
323 sptr points to the source, note that the last byte will be read 4 cycles
328 tptr = seq + 4*seqcnt;
331 *--tptr = (stmp&3) +1;
332 *--tptr = ((stmp >>= 2)&3)+1;
333 *--tptr = ((stmp >>= 2)&3)+1;
334 *--tptr = ((stmp >>= 2)&3)+1;
337 for (sptr=seq; sptr < seq+seq_len; sptr++) {
338 printf("%c",nt[*sptr]);
339 if ((int)(sptr-seq) % 60 == 59) printf("\n");
343 if (seqcnt*4 >= seq_len) { /* there was enough room */
345 /* printf("%d\n",seq_len); */
348 else { /* not enough room */
354 error: fprintf(stderr," error reading %ld at %ld\n",libstr,*libpos);
360 ncbl_ranlib(str,cnt,libpos)
364 char hline[256], *bp, *bp0;
369 llen = hdr_beg[lib_cnt+1]-hdr_beg[lib_cnt];
370 if (llen > sizeof(hline)) llen = sizeof(hline);
371 fseek(hfile,hdr_beg[lib_cnt]+1,0);
373 fread(hline,(size_t)1,(size_t)(llen-1),hfile);
376 if (hline[9]=='|' || hline[10]=='|') {
377 bp0 = strchr(hline+3,'|');
378 if ((bp=strchr(bp0+1,' '))!=NULL) *bp='\0';
379 if (dbformat == NTFORMAT &&
380 (ambiguity_ray[lib_cnt/CHAR_BIT]&(1<<lib_cnt%CHAR_BIT))) {
381 sprintf(str,"*%-9s ",bp0+1);
383 else sprintf(str,"%-10s ",bp0+1);
384 strncat(str+11,bp+1,cnt-strlen(str));
387 if (dbformat == NTFORMAT &&
388 (ambiguity_ray[lib_cnt/CHAR_BIT]&(1<<lib_cnt%CHAR_BIT))) {
390 strncpy(str+1,hline,cnt-1);
392 else strncpy(str,hline,cnt);
396 if (dbformat == AAFORMAT)
397 fseek(sfile,seq_beg[lib_cnt]-1,0);
399 spos = (seq_beg[lib_cnt])/(CHAR_BIT/NBPN);
400 fseek(sfile,spos-1,0);
404 void src_ulong_read(fd, val)
409 fread((char *)val,(size_t)4,(size_t)1,fd);
413 fread((char *)&b[0],(size_t)1,(size_t)4,fd);
415 *val = (unsigned long)((unsigned long)((unsigned long)(b[0]<<8) +
416 (unsigned long)b[1]<<8) + (unsigned long)b[2]<<8)+(unsigned long)b[3];
420 void src_long_read(fd,val)
425 fread((char *)val,(size_t)4,(size_t)1,fd);
429 fread((char *)&b[0],(size_t)1,(size_t)4,fd);
431 *val = (long)((long)((long)(b[0]<<8)+(long)b[1]<<8)+(long)b[2]<<8)
441 src_char_read(fd, val)
445 fread(val,(size_t)1,(size_t)1,fd);
453 src_fstr_read(fd, val, slen)
458 fread(val,(size_t)slen,(size_t)1,fd);
466 newname(char *nname, char *oname, char *suff, int maxn)
470 if (oname[0]=='@') strncpy(nname,&oname[1],maxn);
471 else strncpy(nname,oname,maxn);
472 for (tptr=nname; *tptr=='.' && *tptr; tptr++);
473 for (; *tptr!='.'&& *tptr; tptr++); /* get to '.' or EOS */
474 *tptr++='.'; *tptr='\0';
475 strncat(nname,suff,maxn);