1 /* map_db.c - read a FASTA or GCG format database and generate a list
2 of indices for rapid memory mapping */
4 /* copyright (c) 1999 William R. Pearson */
6 /* $Name: fa_34_26_5 $ - $Id: map_db.c,v 1.9 2005/09/27 15:32:58 wrp Exp $ */
8 /* input is a libtype 1,5, or 6 sequence database */
9 /* output is a BLAST2 formatdb type index file */
11 /* format of the index file:
13 1) map_db version number ["MP"+2 bytes]
14 2) number of sequences in database [4 bytes]
15 3) total length of database [8 bytes] (MP1, 4 bytes for MP0)
16 4) longest sequence in database [8 bytes] (MP1, 4 bytes for MP0)
17 5) list of offsets to definitions [num_seq+1] int*8 (MP1, 4 bytes for MP0)
18 6) list of offsets to sequences [num_seq+1] int*8 (MP1, 4 bytes for MP0)
19 7) list of flag characters for sequences [num_seq+1]bytes
20 (used for GCG binary to encode 2bit or 4 bit representation)
22 sequence files will be as defined by their format
29 #include <sys/types.h>
33 #include "ncbl2_head.h"
40 int a_get_ent(long *, long *);
41 int v_get_ent(long *, long *);
42 int gcg_get_ent(long *, long *);
43 int gbf_get_ent(long *, long *);
45 void src_int4_write(FILE *, int);
46 void src_int4_read(FILE *, int *);
47 void src_long4_write(FILE *, long);
48 void src_long4_read(FILE *, long *);
49 void src_long8_write(FILE *, long);
50 void src_long8_read(FILE *, long *);
52 void newname(char *nname, char *oname, char *suff, int maxn);
/* Entry-reader dispatch table, indexed by library type:
   0 -> a_get_ent, 1 -> gbf_get_ent, 5 -> v_get_ent, 6 -> gcg_get_ent.
   Types 2-4 have no reader (NULL); main() refuses to index a library
   whose slot is NULL. */
54 int (*get_ent_arr[LASTLIB+1])()={a_get_ent, gbf_get_ent, NULL, NULL, NULL,
55 v_get_ent, gcg_get_ent};
57 long openlib(char *, int);
/* main - build an offset index file for one sequence library.

   Options: "-n" sets lib_aa to 0.
   The library name is argv[1] or, when that is absent, a line read from
   stdin after a prompt on stderr.  A number following the first space in
   the name is taken as lib_type.

   Each entry returned by get_entry() contributes its description and
   sequence offsets to d_pos_arr[] / s_pos_arr[]; the arrays are grown with
   realloc() once nlib reaches lib_size.

   The index is written to the name produced by newname(iname,lname,"xin",..)
   in this order: 4-byte format tag, lib_aa, f_size, nlib, tot_len, max_len,
   nlib+1 description offsets, nlib+1 sequence offsets.  Both 8-byte and
   4-byte forms of the writes appear below; presumably a preprocessor
   conditional that is not visible in this listing selects one of them.

   Review fixes in this block:
   - argv[1] is copied with room for, and an explicit, terminating NUL
     (strncpy() alone does not terminate on truncation);
   - lib_type is range-checked before it indexes get_ent_arr[];
   - the index file is opened in binary mode ("wb"), since it is written
     with fwrite() of raw bytes. */
61 main(int argc, char **argv)
70 int nlib; /* number of entries */
72 long max_len; /* longest sequence */
73 long tot_len; /* total sequence length */
77 long f_size; /* library size returned by openlib(); 0 means open failed */
78 int lib_size; /* current space available - may be realloc'ed */
80 int lib_type; /* library format; index into get_ent_arr[] */
81 int lib_aa; /* 1: sascii = aascii; otherwise sascii = nascii ("-n" sets 0) */
84 long d_pos; /* start of description */
85 long s_pos; /* start of sequence */
86 long *d_pos_arr; /* array of description pointers */
87 long *s_pos_arr; /* array of sequence pointers */
95 while (argc > 1 && *argv[1]=='-') {
96 if (strcmp(argv[1],"-n")==0) lib_aa = 0;
101 /* open the database */
102 if (argc > 1) { strncpy(lname, argv[1],sizeof(lname)-1); lname[sizeof(lname)-1]='\0'; }
104 fprintf(stderr," Entry library name: ");
105 fgets(lname,sizeof(lname),stdin);
106 if ((bp=strchr(lname,'\n'))!=NULL) *bp='\0';
109 if ((bp=strchr(lname,' '))!=NULL) {
110 lib_type = atoi(bp+1);
115 if (lib_type < 0 || lib_type > LASTLIB || get_ent_arr[lib_type] == NULL) {
116 fprintf(stderr," cannot index file %s type %d\n",lname,lib_type);
120 if (lib_type == 6) lib_aa = 0;
121 if (lib_type == 1) lib_aa = 0;
123 if (lib_aa == 1) sascii = aascii;
124 else sascii = nascii;
126 if ((f_size=openlib(lname,lib_type))==0) {
127 fprintf(stderr," cannot open %s (type: %d)\n",lname,lib_type);
131 /* allocate array of description pointers */
132 if ((d_pos_arr=(long *)calloc(lib_size, sizeof(long)))==NULL) {
133 fprintf(stderr," cannot allocate %d for desc. array\n",lib_size);
136 /* allocate array of sequence pointers */
137 if ((s_pos_arr=(long *)calloc(lib_size, sizeof(long)))==NULL) {
138 fprintf(stderr," cannot allocate %d for seq. array\n",lib_size);
142 /* allocate array of sequence flags */
144 nlib = 0; tot_len=0; max_len=-1;
145 while ((n1=get_entry(&d_pos, &s_pos)) > 0) {
146 d_pos_arr[nlib] = d_pos;
147 s_pos_arr[nlib] = s_pos;
150 if (n1 > max_len) max_len = n1;
151 if (nlib >= lib_size) { /* too many entries */
153 if ((d_pos_arr=(long *)realloc(d_pos_arr,lib_size*sizeof(long)))==NULL) {
154 fprintf(stderr," cannot realloc allocate %d for desc.. array\n",
158 if ((s_pos_arr=(long *)realloc(s_pos_arr,lib_size*sizeof(long)))==NULL) {
159 fprintf(stderr," cannot realloc allocate %d for seq. array\n",
166 d_pos_arr[nlib]= d_pos; /* put in the end of the file */
169 /* all the information is in, write it out */
171 newname(iname,lname,"xin",sizeof(iname));
173 if ((libi=fopen(iname,"wb"))==NULL) {
174 fprintf(stderr," cannot open %s for writing\n",iname);
178 /* write out format version */
182 format[2]= 1; /* format 1 for 8-byte offsets */
184 format[2]='\0'; /* format '\0' for original 4-byte */
188 fwrite(format,4,sizeof(char),libi);
190 /* write out sequence type */
191 src_int4_write(libi, lib_aa);
193 /* write out file fstat as integrity check */
195 src_long8_write(libi, f_size);
197 src_int4_write(libi, f_size);
200 /* write out num_seq */
201 src_int4_write(libi, nlib);
204 /* write out tot_len, max_len */
205 src_long8_write(libi, tot_len);
207 src_int4_write(libi, tot_len);
209 src_int4_write(libi, max_len);
/* offset tables: nlib+1 entries each, the extra slot being the one
   stored after the read loop ("put in the end of the file") */
212 for (i=0; i<=nlib; i++) src_long8_write(libi,d_pos_arr[i]);
213 for (i=0; i<=nlib; i++) src_long8_write(libi,s_pos_arr[i]);
215 for (i=0; i<=nlib; i++) src_int4_write(libi,d_pos_arr[i]);
216 for (i=0; i<=nlib; i++) src_int4_write(libi,s_pos_arr[i]);
222 fprintf(stderr," wrote %d sequences (tot=%ld, max=%ld) to %s\n",
223 nlib,tot_len,max_len,iname);
225 fprintf(stderr," wrote %d sequences (tot=%ld, max=%ld) to %s\n",
226 nlib,tot_len,max_len,iname);
235 char lline[MAXLINE+1];
/* openlib - stat() and fopen() the library file lname, install the entry
   reader for lib_type in get_entry, and read the first line into lline.
   Returns 0 when that first line cannot be read; main() reports a 0
   return as "cannot open".  The success return is not visible in this
   listing - presumably f_size. */
238 openlib(char *lname, int lib_type)
241 struct stat stat_buf;
243 if (stat(lname,&stat_buf)<0) {
244 fprintf(stderr," cannot stat library: %s\n",lname);
248 if ((libf=fopen(lname,"r"))==NULL) {
249 fprintf(stderr," cannot open library: %s (type: %d)\n",
/* file size as reported by stat() */
254 f_size = stat_buf.st_size;
/* select the reader for this library format; lib_type is used as an
   index without a range check here */
256 get_entry = get_ent_arr[lib_type];
/* prime lline so the readers can start by inspecting lline[0] */
259 if (fgets(lline,MAXLINE,libf)==NULL) return 0;
/* a_get_ent - entry reader in slot 0 of get_ent_arr[].
   Starting from the line already held in lline, it looks for a line that
   begins with '>' or ';', reads on until that line's newline has been
   seen, stores the file position that follows in *s_pos, and then scans
   the following lines up to the next one beginning with '>', counting in
   n1 the characters whose ap[] code is below NA.
   The assignment to *d_pos and the return statement are not visible in
   this listing; presumably the count n1 is returned. */
264 a_get_ent(long *d_pos, long *s_pos)
267 register int *ap, n1;
/* advance to the header line of the next entry */
271 while (lline[0]!='>' && lline[0]!=';') {
273 if (fgets(lline,sizeof(lline),libf)==NULL) {
281 /* make certain we have the end of the line */
282 while (strchr((char *)lline,'\n')==NULL) {
283 if (fgets(lline,sizeof(lline),libf)==NULL) break;
/* the sequence starts right after the (complete) header line */
286 *s_pos = ftell(libf);
/* note: only '>' ends the sequence here, although ';' is also accepted
   as a header start above */
289 while (fgets(lline,sizeof(lline),libf)!=NULL) {
290 if (lline[0]=='>') break;
/* a sequence line that does not fit in lline is reported */
292 if (strchr(lline,'\n')==NULL) {
293 fprintf(stderr," excessive continuation\n%s",lline);
/* count characters that ap[] maps to a code below NA */
298 for (cp=lline; *cp; ) if (ap[*cp++]<NA) n1++;
/* v_get_ent - entry reader in slot 5 of get_ent_arr[].
   Like a_get_ent, but each entry has two header lines: the seq_id line
   (begins with '>' or ';') and a description line read immediately after
   it.  Returns 0 when the description line cannot be read.  *s_pos is
   the file position after the complete description line; the lines that
   follow are scanned up to the next one beginning with '>', counting in
   n1 the characters whose ap[] code is below NA.
   The assignment to *d_pos and the final return are not visible in this
   listing; presumably n1 is returned. */
305 v_get_ent(long *d_pos, long *s_pos)
313 /* check for seq_id line */
314 while (lline[0]!='>' && lline[0]!=';') {
316 if (fgets(lline,sizeof(lline),libf)==NULL) {
323 /* get the description line */
324 if (fgets(lline,sizeof(lline),libf)==NULL) return 0;
325 /* make certain we have the end of the line */
326 while (strchr((char *)lline,'\n')==NULL) {
327 if (fgets(lline,sizeof(lline),libf)==NULL) break;
/* the sequence starts right after the description line */
330 *s_pos = ftell(libf);
333 while (fgets(lline,sizeof(lline),libf)!=NULL) {
334 if (lline[0]=='>') break;
/* count characters that ap[] maps to a code below NA */
336 for (cp=lline; *cp; ) if (ap[*cp++]<NA) n1++;
/* state parsed from the current GCG entry header by gcg_get_ent() */
342 static char gcg_type[10];
344 static int gcg_bton[4]={2,4,1,3};
/* gcg_get_ent - entry reader in slot 6 of get_ent_arr[].
   Finds the next line beginning with '>', parses five whitespace-separated
   fields starting at lline[4] (the third is the encoding type, the fifth
   the sequence length), reads the description line, stores the file
   position that follows in *s_pos, and then seeks past the sequence
   instead of reading it: (gcg_len+3)/4 bytes when gcg_type begins with
   '2', gcg_len bytes otherwise, plus one for the newline.  The next line
   is then loaded into lline for the following call.
   Returns 0 when the description line cannot be read.  The original code
   used a bare "return;" here, which leaves the value tested by main()'s
   "get_entry(...) > 0" loop undefined; 0 matches what v_get_ent returns
   in the same situation.  The assignment to *d_pos and the final return
   are not visible in this listing.
   NOTE(review): the %s conversions below are unbounded; libstr and dummy
   hold 20 bytes and gcg_type 10, so an over-long header field would
   overflow them.  Field widths should be added once the size of gcg_date
   (declared outside this listing) is confirmed. */
347 gcg_get_ent(long *d_pos, long *s_pos)
351 char libstr[20], dummy[20];
356 /* check for seq_id line */
357 while (lline[0]!='>') {
359 if (fgets(lline,sizeof(lline),libf)==NULL) {
366 /* get the encoding/sequence length info */
368 sscanf(&lline[4],"%s %s %s %s %ld",
369 libstr,gcg_date,gcg_type,dummy,&gcg_len);
371 /* get the description line */
372 if (fgets(lline,MAXLINE,libf)==NULL) return 0;
374 *s_pos = ftell(libf);
375 /* seek to the end of the sequence; +1 to jump over newline */
376 if (gcg_type[0]=='2') {
377 r_block = (gcg_len+3)/4;
378 fseek(libf,r_block+1,SEEK_CUR);
380 else fseek(libf,gcg_len+1,SEEK_CUR);
383 fgets(lline,MAXLINE,libf);
/* gbf_get_ent - entry reader in slot 1 of get_ent_arr[].
   Reads forward to a line beginning with "LOCUS", then to a line
   beginning with "ORIGIN"; returns -1 if the file ends during either
   search.  *s_pos is the file position after the ORIGIN line.  The lines
   that follow are scanned until one begins with '/', counting in n1 the
   characters whose ap[] code is below NA; one more line is then read into
   lline for the next call.
   The two leading character tests in each loop condition short-circuit
   the strncmp() call for lines that cannot match.
   The assignment to *d_pos and the final return are not visible in this
   listing; presumably n1 is returned. */
389 gbf_get_ent(long *d_pos, long *s_pos)
401 while (lline[0]!='L' || lline[1]!='O' ||
402 strncmp(lline,"LOCUS",5)) { /* find LOCUS */
404 if (fgets(lline,MAXLINE,libf)==NULL) return (-1);
408 while (lline[0]!='O' || lline[1]!='R' ||
409 strncmp(lline,"ORIGIN",6)) { /* find ORIGIN */
410 if (fgets(lline,MAXLINE,libf)==NULL) return (-1);
/* the sequence starts right after the ORIGIN line */
412 *s_pos = ftell(libf);
416 while (fgets(lline,MAXLINE,libf)!=NULL) {
417 if (lline[0]=='/') break;
418 for (cp=lline; *cp; ) if (ap[*cp++]<NA) n1++;
/* step past the terminator line so lline holds the line after it */
421 fgets(lline,MAXLINE,libf);
/* src_int4_read - read a 4-byte integer from fd into *val.
   Two implementations are visible: a direct fread() of 4 bytes into *val,
   and a byte-wise read into b[] that is reassembled with b[0] as the most
   significant byte.  Presumably a preprocessor conditional that is not
   visible in this listing selects one of them.  The fread() results are
   not checked. */
426 void src_int4_read(FILE *fd, int *val)
429 fread((char *)val,(size_t)4,(size_t)1,fd);
433 fread((char *)&b[0],(size_t)1,(size_t)4,fd);
/* '+' binds tighter than '<<', so every "<<8" below shifts the whole sum
   accumulated so far; the expression continues past this line */
435 *val = (int)((int)((int)(b[0]<<8)+(int)b[1]<<8)+(int)b[2]<<8)
/* src_int4_write - write val to fd as 4 bytes.
   Two implementations are visible: a direct fwrite() of the 4 bytes of
   val, and a byte-wise form that fills b[] from the low-order end so that
   b[0] receives the most significant byte before b[] is written.
   Presumably a preprocessor conditional that is not visible in this
   listing selects one of them.  The fwrite() results are not checked. */
440 void src_int4_write(FILE *fd, int val)
443 fwrite(&val,(size_t)4,(size_t)1,fd);
/* each step shifts val right by one byte and stores the new low byte;
   the store to b[3] is not visible in this listing */
448 b[2] = (val=val>>8)&255;
449 b[1] = (val=val>>8)&255;
450 b[0] = (val=val>>8)&255;
452 fwrite(b,(size_t)1,(size_t)4,fd);
/* src_long8_write - write val to fd as 8 bytes.
   Two implementations are visible: a direct fwrite() of 8 bytes starting
   at &val, and a byte-wise form that fills b[] from the low-order end so
   that b[0] receives the most significant byte before b[] is written.
   Presumably a preprocessor conditional that is not visible in this
   listing selects one of them.  The fwrite() results are not checked.
   NOTE(review): the direct form reads 8 bytes from a long; confirm it is
   only compiled where sizeof(long) is 8. */
456 void src_long8_write(FILE *fd, long val)
459 fwrite(&val,(size_t)8,(size_t)1,fd);
/* each step shifts val right by one byte and stores the new low byte;
   the store to b[7] is not visible in this listing */
464 b[6] = (val=val>>8)&255;
465 b[5] = (val=val>>8)&255;
466 b[4] = (val=val>>8)&255;
467 b[3] = (val=val>>8)&255;
468 b[2] = (val=val>>8)&255;
469 b[1] = (val=val>>8)&255;
470 b[0] = (val=val>>8)&255;
472 fwrite(b,(size_t)1,(size_t)8,fd);
477 newname(char *nname, char *oname, char *suff, int maxn)
479 strncpy(nname,oname,maxn-1);
480 strncat(nname,".",1);
481 strncat(nname,suff,maxn-strlen(nname));