/* ncbl_lib.c functions to read ncbi-blast format files from setdb (blastp 1.3.2) format files copyright (c) 1992 William R. Pearson */ /* $Name: fa_34_26_5 $ - $Id: ncbl_lib.c,v 1.9 2006/10/05 18:22:07 wrp Exp $ */ #include #include #include #ifndef WIN32 #define RBSTR "r" #else #define RBSTR "rb" #endif #define XTERNAL #include "uascii.h" #include "upam.h" #include "ncbl_head.h" #include "mm_file.h" int ncbl_getliba(char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *); int ncbl_getlibn(char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *); void src_ulong_read(); #ifndef NCBL13_ONLY static void src_char_read(); static void newname(char *, char *, char *, int); #else void src_char_read(); void newname(char *, char *, char *, int); #endif /* nt_btoa maps from blast 2bit format to ascii characters */ static char nt_btoa[5] = {"ACGT"}; static char aa_btoa[27]= {"-ARNDCQEGHILKMFPSTWYVBZX*"}; static int aa_btof[32]; /* maps to fasta alphabet */ static FILE *tfile=NULL, /* table of offsets, also DB info */ *hfile=NULL, /* description lines */ *sfile=NULL; /* binary sequence data */ static unsigned long lib_cnt, max_cnt, totlen, mxlen, dbline_len; static unsigned long *seq_beg, *hdr_beg; static unsigned char *ambiguity_ray; static long seq_format, dbtype, dbformat; static char dline[512]; #define NCBIBL13 11 struct lmf_str * ncbl_openlib(char *name, int ldnaseq) { char hname[256]; char sname[256]; char tname[256]; long title_len; char *title_str; int rdtmp; int i; unsigned long line_len, c_len, clean_count; if (ldnaseq!=1) { newname(tname,name,AA_TABLE_EXT,(int)sizeof(tname)); if ((tfile = fopen(tname,RBSTR))==NULL) { fprintf(stderr," cannot open %s (%s.%s) table file\n", name,tname,NT_TABLE_EXT); return (-1); } seq_format = AAFORMAT; } else { newname(tname,name,NT_TABLE_EXT,(int)sizeof(tname)); if ((tfile = fopen(tname,RBSTR))==NULL) { fprintf(stderr," cannot open %s (%s.%s) table file\n", name,tname,NT_TABLE_EXT); return (-1); } seq_format = NTFORMAT; } src_ulong_read(tfile,&dbtype); src_ulong_read(tfile,&dbformat); if (seq_format == AAFORMAT && (dbformat != seq_format || dbtype != DB_TYPE_PRO)) { fprintf(stderr,"error - %s wrong type (%ld/%d) or format (%ld/%ld)\n", tname,dbtype,DB_TYPE_PRO,dbformat,seq_format); return (-1); } else if (seq_format == NTFORMAT && (dbformat != seq_format || dbtype != DB_TYPE_NUC)) { fprintf(stderr,"error - %s wrong type (%ld/%d) or format (%ld/%ld)\n", tname,dbtype,DB_TYPE_NUC,dbformat,seq_format); return (-1); } if (seq_format == AAFORMAT) { newname(hname,name,AA_HEADER_EXT,(int)sizeof(hname)); if ((hfile = fopen(hname,RBSTR))==NULL) { fprintf(stderr," cannot open %s header file\n",hname); return (-1); } newname(sname,name,AA_SEARCHSEQ_EXT,(int)sizeof(sname)); if ((sfile = fopen(sname,RBSTR))==NULL) { fprintf(stderr," cannot open %s sequence file\n",sname); return (-1); } } else { newname(hname,name,NT_HEADER_EXT,(int)sizeof(hname)); if ((hfile = fopen(hname,RBSTR))==NULL) { fprintf(stderr," cannot open %s header file\n",hname); return (-1); } newname(sname,name,NT_SEARCHSEQ_EXT,(int)sizeof(sname)); if ((sfile = fopen(sname,RBSTR))==NULL) { fprintf(stderr," cannot open %s sequence file\n",sname); return (-1); } } /* all files should be open */ src_ulong_read(tfile,&title_len); rdtmp = title_len + ((title_len%4 !=0 ) ? 4-(title_len%4) : 0); if ((title_str = calloc((size_t)rdtmp,sizeof(char)))==NULL) { fprintf(stderr," cannot allocate title string (%d)\n",rdtmp); return(-1); } fread(title_str,(size_t)1,(size_t)rdtmp,tfile); lib_cnt = 0; if (seq_format == AAFORMAT) { src_ulong_read(tfile,&max_cnt); src_ulong_read(tfile,&totlen); src_ulong_read(tfile,&mxlen); /* fprintf(stderr," max_cnt: %d, totlen: %d\n",max_cnt,totlen); */ if ((seq_beg=(unsigned long *)calloc((size_t)max_cnt+1,sizeof(long)))==NULL) { fprintf(stderr," cannot allocate sequence pointers\n"); return -1; } if ((hdr_beg=(unsigned long *)calloc((size_t)max_cnt+1,sizeof(long)))==NULL) { fprintf(stderr," cannot allocate header pointers\n"); return -1; } for (i=0; i= max_cnt) return -1; seq_len = seq_beg[lib_cnt+1] - seq_beg[lib_cnt] -1; tmp=(long)fgetc(sfile); /* skip the null byte */ if (tmp!=NULLB) fprintf(stderr," phase error: %ld:%ld found\n",lib_cnt,tmp); libstr[0]='\0'; } if (seq_len < maxs) { if ((tmp=fread(seq,(size_t)1,(size_t)seq_len,sfile))!=(size_t)seq_len) { fprintf(stderr," could not read sequence record: %ld %ld != %ld\n", *libpos,tmp,seq_len); goto error; } if (aa_btoa[seq[seq_len-1]]=='*') seqcnt = seq_len-1; else seqcnt=seq_len; lib_cnt++; *lcont = 0; } else { if (fread(seq,(size_t)1,(size_t)(maxs-1),sfile)!=(size_t)(maxs-1)) { fprintf(stderr," could not read sequence record: %ld %ld\n", *libpos,seq_len); goto error; } (*lcont)++; seqcnt = maxs-1; seq_len -= seqcnt; } sptr = seq+seqcnt; while (--sptr >= seq) *sptr = aa_btof[*sptr]; seq[seqcnt]= EOSEQ; return (seqcnt); error: fprintf(stderr," error reading %ld at %ld\n",libstr,*libpos); fflush(stderr); return (-1); } int ncbl_getlibn(char *seq, int maxs, char *libstr, int n_libstr, fseek_t *libpos, int *lcont) { register char *sptr, *tptr, stmp; long seqcnt; long tmp; char ch; static long seq_len; static int c_len,c_pad; *libpos = lib_cnt; if (*lcont==0) { if (lib_cnt >= max_cnt) return -1; c_len = seq_beg[lib_cnt+1]/(CHAR_BIT/NBPN) - seq_beg[lib_cnt]/(CHAR_BIT/NBPN); c_len -= NSENTINELS; seq_len = c_len*(CHAR_BIT/NBPN); c_pad = seq_beg[lib_cnt] & ((CHAR_BIT/NBPN)-1); if (c_pad != 0) seq_len -= ((CHAR_BIT/NBPN) - c_pad); tmp=fgetc(sfile); /* skip the null byte */ if (tmp!=NT_MAGIC_BYTE) { fprintf(stderr," phase error: %ld:%ld (%ld/%d) found\n", lib_cnt,seq_len,tmp,NT_MAGIC_BYTE); goto error; } libstr[0]='\0'; } if (seq_len < maxs-3) { seqcnt=(seq_len+3)/4; if (seqcnt==0) seqcnt++; if ((tmp=fread(seq,(size_t)1,(size_t)seqcnt,sfile)) !=(size_t)seqcnt) { fprintf(stderr, " could not read sequence record: %s %ld %ld != %ld: %d\n", libstr,*libpos,tmp,seqcnt,*seq); goto error; } tmp=fgetc(sfile); /* skip the null byte */ if (tmp!=(unsigned char)NT_MAGIC_BYTE) { fprintf(stderr," phase2 error: %ld:%ld (%ld/%d) next ", lib_cnt,seqcnt,tmp,NT_MAGIC_BYTE); goto error; } *lcont = 0; lib_cnt++; } else { seqcnt = ((maxs+3)/4)-1; if (fread(seq,(size_t)1,(size_t)(seqcnt),sfile)!=(size_t)(seqcnt)) { fprintf(stderr," could not read sequence record: %s %ld %ld\n", libstr,*libpos,seqcnt); goto error; } (*lcont)++; } /* point to the last packed byte and to the end of the array seqcnt is the exact number of bytes read tptr points to the destination, use multiple of 4 to simplify math sptr points to the source, note that the last byte will be read 4 cycles before it is written */ sptr = seq + seqcnt; tptr = seq + 4*seqcnt; while (sptr>seq) { stmp = *--sptr; *--tptr = (stmp&3) +1; *--tptr = ((stmp >>= 2)&3)+1; *--tptr = ((stmp >>= 2)&3)+1; *--tptr = ((stmp >>= 2)&3)+1; } /* for (sptr=seq; sptr < seq+seq_len; sptr++) { printf("%c",nt[*sptr]); if ((int)(sptr-seq) % 60 == 59) printf("\n"); } printf("\n"); */ if (seqcnt*4 >= seq_len) { /* there was enough room */ seq[seq_len]= EOSEQ; /* printf("%d\n",seq_len); */ return seq_len; } else { /* not enough room */ seq[seqcnt*4]=EOSEQ; seq_len -= 4*seqcnt; return (4*seqcnt); } error: fprintf(stderr," error reading %ld at %ld\n",libstr,*libpos); fflush(stderr); return (-1); } void ncbl_ranlib(str,cnt,libpos) char *str; int cnt; long libpos; { char hline[256], *bp, *bp0; int llen; long spos; lib_cnt = libpos; llen = hdr_beg[lib_cnt+1]-hdr_beg[lib_cnt]; if (llen > sizeof(hline)) llen = sizeof(hline); fseek(hfile,hdr_beg[lib_cnt]+1,0); fread(hline,(size_t)1,(size_t)(llen-1),hfile); hline[llen-1]='\0'; if (hline[9]=='|' || hline[10]=='|') { bp0 = strchr(hline+3,'|'); if ((bp=strchr(bp0+1,' '))!=NULL) *bp='\0'; if (dbformat == NTFORMAT && (ambiguity_ray[lib_cnt/CHAR_BIT]&(1<