2 /* $Name: fa_34_26_5 $ - $Id: nmgetlib.c,v 1.35 2007/01/08 15:38:46 wrp Exp $ */
4 /* May, June 1987 - modified for rapid read of database
6 copyright (c) 1987,1988,1989,1992,1995,2000 William R. Pearson
8 revised (split) version of nmgetaa.c -> renamed nmgetlib.c
10 This version seeks to be a thread safe, no global, library
11 reading program. While adjusting the routines in this file
12 should be relatively easy, ncbl2_mlib.c and mysql_lib.c may be
15 nmgetlib.c and mmgetaa.c are used together. nmgetlib.c provides
16 the same functions as nxgetaa.c if memory mapping is not used,
17 mmgetaa.c provides the database reading functions if memory
18 mapping is used. The decision to use memory mapping is made on
21 June 2, 1987 - added TFASTA
22 March 30, 1988 - combined ffgetaa, fgetgb;
23 April 8, 1988 - added PIRLIB format for unix
24 Feb 4, 1989 - added universal subroutines for libraries
25 December, 1995 - added range option file.name:1-1000
26 September, 1999 - added option for mmap()ed files using ".xin" */
30 February 4, 1988 - this starts a major revision of the getaa
31 routines. The goal is to be able to search the following format
34 0 - normal FASTA format
35 1 - full Genbank tape format
36 2 - NBRF/PIR CODATA format
37 3 - EMBL/Swiss-prot format
38 4 - Intelligenetics format
39 5 - NBRF/PIR VMS format
42 11 - NCBI setdb/blastp (1.3.2) AA/NT
43 12 - NCBI setdb/blastp (2.0) AA/NT
46 see file altlib.h to confirm numbers
65 /* #include "upam.h" */
67 #define LFCHAR '\015' /* for MWC 5.5 */
75 #define O_RAW O_BINARY
82 #define RBSTR "rb" /* read file in binary mode */
88 struct lmf_str *load_mmap(FILE *, char *, int, int, struct lmf_str *);
89 struct lmf_str *ncbl2_reopen(struct lmf_str *);
90 struct lmf_str *ncbl2_openlib(char *, int);
92 static struct lmf_str *last_m_fptr=NULL;
95 struct lmf_str *mysql_openlib(char *, int, int *);
96 struct lmf_str *mysql_reopen(struct lmf_str *);
100 struct lmf_str *pgsql_openlib(char *, int, int *);
101 struct lmf_str *pgsql_reopen(struct lmf_str *);
104 void closelib(struct lmf_str *m_fptr);
105 extern void newname(char *nname, char *oname, char *suff, int maxn);
107 /* a file name for openlib may include a library type suffix */
110 openlib(char *lname, int ldnaseq, int *sascii,
111 int outtty, struct lmf_str *om_fptr)
113 char rline[10],sname[MAX_FN], iname[MAX_FN], *bp;
114 char opt_text[MAX_FN]; /* save text after ':' */
120 struct lmf_str *m_fptr=NULL;
122 /* this is currently unavailable - later it can return a value somewhere */
124 if (lname[0]=='#') {return -9;}
127 if (om_fptr != NULL && om_fptr->mm_flg) {
132 wcnt = 0; /* number of times to ask for file name */
134 /* check to see if there is a file option ":1-100" */
136 if ((bp=strchr(lname,':'))!=NULL && *(bp+1)!='\0') {
138 if ((bp=strchr(lname+3,':'))!=NULL && *(bp+1)!='\0') {
140 strncpy(opt_text,bp+1,sizeof(opt_text));
141 opt_text[sizeof(opt_text)-1]='\0';
144 else opt_text[0]='\0';
146 if (lname[0] == '-' || lname[0] == '@') {
151 strncpy(sname,lname,sizeof(sname));
152 sname[sizeof(sname)-1]='\0';
153 /* check for library type */
154 if ((bp=strchr(sname,' '))!=NULL) {
156 sscanf(bp+1,"%d",&libtype);
157 if (libtype<0 || libtype >= LASTLIB) {
158 fprintf(stderr," invalid library type: %d (>%d)- resetting\n%s\n",
159 libtype,LASTLIB,lname);
165 if (use_stdin && libtype !=0) {
166 fprintf(stderr," @/- STDIN libraries must be in FASTA format\n");
170 /* check to see if file can be open()ed? */
173 if (libtype<=LASTTXT) {
175 opnflg=((libf=fopen(sname,RBSTR))!=NULL);
179 strncpy(sname,"STDIN",sizeof(sname));
180 sname[sizeof(sname)-1]='\0';
185 else if (libtype==NCBIBL13) opnflg=(ncbl_openlib(sname,ldnaseq)!= -1);
188 else if (libtype==NCBIBL20) {
189 opnflg=((m_fptr=ncbl2_openlib(sname,ldnaseq))!=NULL);
194 /* a mySQL filename contains mySQL commands, not sequences */
195 else if (libtype==MYSQL_LIB) {
196 opnflg=((m_fptr=mysql_openlib(sname,ldnaseq,sascii))!=NULL);
200 /* a PostgreSQL (pgsql) filename contains SQL commands, not sequences */
201 else if (libtype==PGSQL_LIB) {
202 opnflg=((m_fptr=pgsql_openlib(sname,ldnaseq,sascii))!=NULL);
206 if (!opnflg) { /* here if open failed */
208 fprintf(stderr," cannot open %s library\n",sname);
209 fprintf(stderr," enter new file name or <RET> to quit ");
211 if (fgets(sname,sizeof(sname),stdin)==NULL) return NULL;
212 if ((bp=strchr(sname,'\n'))!=0) *bp='\0';
213 if (strlen(sname)==0) return NULL;
214 if (++wcnt > 10) return NULL;
215 strncpy(lname,sname,sizeof(lname)-1);
216 lname[sizeof(lname)-1]='\0';
222 if (libtype <= LASTTXT) {
223 /* now allocate a buffer for the opened text file */
224 if ((m_fptr = calloc(1,sizeof(struct lmf_str)))==NULL) {
225 fprintf(stderr," cannot allocate lmf_str (%ld) for %s\n",
226 sizeof(struct lmf_str),sname);
229 if ((m_fptr->lline = calloc(MAX_STR,sizeof(char)))==NULL) {
230 fprintf(stderr," cannot allocate lline (%d) for %s\n",
235 strncpy(m_fptr->lb_name,sname,MAX_FN);
236 m_fptr->lb_name[MAX_FN-1]='\0';
237 strncpy(m_fptr->opt_text,opt_text,MAX_FN);
238 m_fptr->opt_text[MAX_FN-1]='\0';
239 m_fptr->sascii = sascii;
242 m_fptr->lb_type = libtype;
243 m_fptr->getlib = getliba[libtype];
244 m_fptr->ranlib = ranliba[libtype];
248 m_fptr->lib_aa = (ldnaseq==0);
250 last_m_fptr = m_fptr;
253 /* check for possible mmap()ed files */
254 if (!use_stdin && (libtype <= LASTTXT) && (getlibam[libtype]!=NULL)) {
255 /* this is a file we can mmap() */
256 /* look for .xin file */
257 newname(iname,sname,"xin",sizeof(iname));
258 if ((libi=fopen(iname,"r"))!=NULL) { /* have a *.xin file, use mmap */
259 if (load_mmap(libi,sname,libtype,ldnaseq,m_fptr)!=NULL) {
260 fclose(libi); /* close index file */
261 m_fptr->lb_type = libtype;
262 m_fptr->getlib = getlibam[libtype];
263 m_fptr->ranlib = ranlibam[libtype];
267 fclose(libi); /* memory mapping failed, but still must close file */
272 if (libtype <= LASTTXT) {
274 if (fgets(m_fptr->lline,MAX_STR,libf)==NULL) return NULL;
280 closelib(struct lmf_str *m_fptr) {
284 if (m_fptr->mm_flag) {
285 /* don't close memory mapped files
292 if (m_fptr->libf!=NULL && m_fptr->libf != stdin) {
293 fclose(m_fptr->libf);
298 if (m_fptr->lb_type == NCBIBL13) ncbl_closelib(m_fptr);
301 if (m_fptr->lb_type == NCBIBL20) ncbl2_closelib(m_fptr);
304 if (m_fptr->lb_type == MYSQL_LIB) mysql_closelib(m_fptr);
309 re_openlib(struct lmf_str *om_fptr, int outtty)
313 /* if the file is mmap()ed and has been opened - use it and return */
314 if (om_fptr->mm_flg) {
318 /* if this is a mysql database - use it and return */
319 else if (om_fptr->lb_type == MYSQL_LIB) {
324 /* data is available, but file is closed or not memory mapped, open it */
325 /* no longer check to memory map - because we could not do it before */
328 if (om_fptr->lb_type<=LASTTXT && om_fptr->libf==NULL)
329 opnflg=((om_fptr->libf=fopen(om_fptr->lb_name,RBSTR))!=NULL);
331 else if (om_fptr->lb_type==NCBIBL13)
332 opnflg=(ncbl_openlib(om_fptr->lb_name,!om_fptr->lib_aa)!= -1);
335 else if (om_fptr->lb_type==NCBIBL20) {
336 opnflg=((om_fptr=ncbl2_openlib(om_fptr->lb_name,!om_fptr->lib_aa))!=NULL);
340 /* a mySQL filename contains mySQL commands, not sequences */
341 else if (om_fptr->lb_type==MYSQL_LIB)
342 opnflg=(mysql_reopen(om_fptr)!=NULL);
346 fprintf(stderr,"*** could not re_open %s\n",om_fptr->lb_name);
350 /* use the old buffer for the opened text file */
352 last_m_fptr = om_fptr;
358 static char tline[512];
359 extern int nsfnum; /* number of superfamily numbers */
360 extern int sfnum[10]; /* superfamily number from types 0 and 5 */
362 extern int sfnum_n[10];
365 void sf_sort(int *, int);
368 agetlib(unsigned char *seq, int maxs,
369 char *libstr, int n_libstr,
372 struct lmf_str *lm_fd,
376 register unsigned char *cp, *seqp;
378 unsigned char *seqm, *seqm1;
379 /* int ic, l_start, l_stop, l_limit, rn; */
380 char *bp, *bp1, *bpa, *tp;
390 while (lm_fd->lline[0]!='>' && lm_fd->lline[0]!=';') {
391 if (lm_fd->libf != stdin) lm_fd->lpos = FTELL(lm_fd->libf);
392 if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
395 strncpy(tline,lm_fd->lline+1,sizeof(tline));
396 tline[sizeof(tline)-1]='\0';
398 if ((bp=strchr(tline,' ')) && (bp=strchr(bp+1,SFCHAR))) {
399 if ((bpa = strchr(bp+1,'\001'))!=NULL) *bpa = '\0';
400 if ((bp1=strchr(bp+1,SFCHAR))==NULL) {
401 /* fprintf(stderr," second %c missing: %s\n",SFCHAR,libstr); */
406 if ((tp = strtok(bp+1," \t"))!=NULL) {
407 sfnum[i++] = atoi(tp);
408 while ((tp = strtok((char *)NULL," \t")) != (char *)NULL) {
409 if (isdigit(*tp)) sfnum[i++] = atoi(tp);
414 if (nsfnum>1) sf_sort(sfnum,nsfnum);
416 if (nsfnum<1) fprintf(stderr," found | but no sfnum: %s\n",libstr);
421 sfnum[0] = nsfnum = 0;
425 if ((bp=strchr(lm_fd->lline,'@'))!=NULL && !strncmp(bp+1,"C:",2)) {
426 sscanf(bp+3,"%ld",l_off);
429 strncpy(libstr,lm_fd->lline+1,n_libstr-1);
430 libstr[n_libstr-1]='\0';
431 if ((bp=strchr(libstr,'\r'))!=NULL) *bp='\0';
432 if ((bp=strchr(libstr,'\n'))!=NULL) *bp='\0';
433 if (n_libstr > MAX_UID) {
435 while (*tp++) if (*tp == '\001' || *tp== '\t') *tp = ' ';
438 *libpos = lm_fd->lpos;
440 /* make certain we have the end of the line */
441 while (strchr((char *)lm_fd->lline,'\n')==NULL) {
442 if (strlen(lm_fd->lline)<MAX_STR/2)
443 fgets(&lm_fd->lline[strlen(lm_fd->lline)],MAX_STR/2,lm_fd->libf);
445 fgets(&lm_fd->lline[MAX_STR/2],MAX_STR/2,lm_fd->libf);
447 lm_fd->lline[MAX_STR-1]='\0';
450 lm_fd->lline[0]='\0';
451 while (seqp<seqm1 && fgets((char *)seqp,(size_t)(seqm-seqp),lm_fd->libf)!=NULL) {
452 if (*seqp=='>') goto new;
454 if (strchr((char *)seqp,'\n')==NULL) goto cont;
458 /* removed - used for @P:1-n
460 for (cp=seqp; seqp<seqm1 && rn < l_stop && (ic=ap[*cp++])<EL; )
461 if (ic < NA && ++rn > l_start) *seqp++ = (unsigned char)ic;
462 if (rn > l_stop) goto finish;
466 for (cp=seqp; seqp<seqm1; ) {
467 if ((*seqp++=ap[*cp++])<NA &&
468 (*seqp++=ap[*cp++])<NA &&
469 (*seqp++=ap[*cp++])<NA &&
470 (*seqp++=ap[*cp++])<NA &&
471 (*seqp++=ap[*cp++])<NA &&
472 (*seqp++=ap[*cp++])<NA &&
473 (*seqp++=ap[*cp++])<NA &&
474 (*seqp++=ap[*cp++])<NA &&
475 (*seqp++=ap[*cp++])<NA) continue;
476 if (*(--seqp)>NA) break;
478 if (*seqp==ES) goto done;
479 if (lm_fd->libf != stdin) lm_fd->lpos = FTELL(lm_fd->libf);
483 strncpy(lm_fd->lline,(char *)seqp,MAX_STR);
484 lm_fd->lline[MAX_STR-1]='\0';
485 /* be certain to get complete line, if possible */
486 if (strchr(lm_fd->lline,'\n')==NULL)
487 fgets(&lm_fd->lline[strlen(lm_fd->lline)],MAX_STR-strlen(lm_fd->lline),lm_fd->libf);
488 lm_fd->lline[MAX_STR-1]='\0';
489 if (strchr(lm_fd->lline,'\n')==NULL && strchr((char *)seqp,'\n')!=NULL)
490 lm_fd->lline[strlen(lm_fd->lline)-1]='\n';
493 /* removed - used for @P:1-n
495 while (lm_fd->lline[0]!='>' &&
496 fgets(lm_fd->lline,MAX_STR,lm_fd->libf)!=NULL) {
497 if (lm_fd->libf != stdin) lm_fd->lpos = FTELL(lm_fd->libf);
502 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
505 if (seqp>=seqm1) (*lcont)++;
511 /* if ((int)(seqp-seq)==0) return 1; */
512 return (int)(seqp-seq);
516 aranlib(char *str, int cnt, fseek_t seek, char *libstr, struct lmf_str *lm_fd)
520 if (lm_fd->libf != stdin) {
521 FSEEK(lm_fd->libf, seek, 0);
522 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
524 if (lm_fd->lline[0]=='>' || lm_fd->lline[0]==';') {
525 strncpy(str,lm_fd->lline+1,cnt);
527 if ((bp = strchr(str,'\r'))!=NULL) *bp='\0';
528 if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
530 if ((bp = strchr(str,SFCHAR))!=NULL) *bp='\0';
531 else if ((bp = strchr(str,'\001'))!=NULL) *bp='\0';
532 else if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
533 else str[cnt-1]='\0';
536 while (*bp++) if (*bp=='\001' || *bp=='\t') *bp=' ';
545 void lget_ann(struct lmf_str *, char *, int);
548 lgetlib(unsigned char *seq,
554 struct lmf_str *lm_fd,
557 register unsigned char *cp, *seqp;
559 unsigned char *seqm, *seqm1;
565 seqm = &seq[maxs-11];
571 while (lm_fd->lline[0]!='L' || lm_fd->lline[1]!='O' ||
572 strncmp(lm_fd->lline,"LOCUS",5)) { /* find LOCUS */
573 lm_fd->lpos = FTELL(lm_fd->libf);
574 if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
575 if (lm_fd->lfflag) getc(lm_fd->libf);
577 *libpos= lm_fd->lpos;
579 if (n_libstr <= 21) {
580 strncpy(libstr,&lm_fd->lline[12],12);
584 lget_ann(lm_fd,libstr,n_libstr);
585 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
588 while (lm_fd->lline[0]!='O' || lm_fd->lline[1]!='R' ||
589 strncmp(lm_fd->lline,"ORIGIN",6)) { /* find ORIGIN */
590 if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
591 if (lm_fd->lfflag) getc(lm_fd->libf);
595 for (cp= lm_fd->cpsave; seqp<seqm1; ) {
596 if ((*seqp++=ap[*cp++])<NA) continue;
597 if (*(--seqp)>NA) break;
601 lm_fd->lline[0]='\0';
602 while (seqp<seqm1 && fgets(lm_fd->lline,MAX_STR,lm_fd->libf)!=NULL) {
603 if (lm_fd->lfflag) getc(lm_fd->libf);
604 if (lm_fd->lline[0]=='/') goto new;
605 for (cp= (unsigned char *)&lm_fd->lline[10]; seqp<seqm1; ) {
606 if ((*seqp++=ap[*cp++])<NA &&
607 (*seqp++=ap[*cp++])<NA &&
608 (*seqp++=ap[*cp++])<NA &&
609 (*seqp++=ap[*cp++])<NA &&
610 (*seqp++=ap[*cp++])<NA &&
611 (*seqp++=ap[*cp++])<NA &&
612 (*seqp++=ap[*cp++])<NA &&
613 (*seqp++=ap[*cp++])<NA &&
614 (*seqp++=ap[*cp++])<NA &&
615 (*seqp++=ap[*cp++])<NA &&
616 (*seqp++=ap[*cp++])<NA) continue;
617 if (*(--seqp)>NA) break;
622 lm_fd->lpos = FTELL(lm_fd->libf);
623 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
624 if (lm_fd->lfflag) getc(lm_fd->libf);
634 /* if ((int)(seqp-seq)==0) return 1; */
635 return (int)(seqp-seq);
639 lget_ann(struct lmf_str *lm_fd, char *libstr, int n_libstr) {
640 char *bp, *bp_gid, locus[120], desc[120], acc[120], ver[120];
642 /* copy in locus from lm_fd->lline */
643 strncpy(locus,&lm_fd->lline[12],sizeof(locus));
644 if ((bp=strchr(locus,' '))!=NULL) *(bp+1) = '\0';
646 /* get description */
647 fgets(desc,sizeof(desc),lm_fd->libf);
648 while (desc[0]!='D' || desc[1]!='E' || strncmp(desc,"DEFINITION",10))
649 fgets(desc,sizeof(desc),lm_fd->libf);
650 if ((bp = strchr(&desc[12],'\n'))!=NULL) *bp='\0';
653 fgets(acc,sizeof(acc),lm_fd->libf);
654 while (acc[0]!='A' || acc[1]!='C' || strncmp(acc,"ACCESSION",9)) {
655 fgets(acc,sizeof(acc),lm_fd->libf);
656 if (acc[0]=='O' && acc[1]=='R' && strncmp(acc,"ORIGIN",6)==0)
659 if ((bp = strchr(&acc[12],'\n'))!=NULL) *bp='\0';
660 if ((bp = strchr(&acc[12],' '))!=NULL) *bp='\0';
663 fgets(ver,sizeof(ver),lm_fd->libf);
664 while (ver[0]!='V' || ver[1]!='E' || strncmp(ver,"VERSION",7)) {
665 fgets(ver,sizeof(ver),lm_fd->libf);
666 if (ver[0]=='O' && ver[1]=='R' && strncmp(ver,"ORIGIN",6)==0)
669 if ((bp = strchr(&ver[12],'\n'))!=NULL) *bp='\0';
671 /* extract gi:123456 from version line */
672 bp_gid = strchr(&ver[12],':');
673 if (bp_gid != NULL) {
674 if ((bp=strchr(bp_gid+1,' '))!=NULL) *bp='\0';
677 if ((bp = strchr(&ver[12],' '))!=NULL) *bp='\0';
679 /* build up FASTA header line */
680 if (bp_gid != NULL) {
681 strncpy(libstr,"gi|",n_libstr-1);
682 strncat(libstr,bp_gid,n_libstr-4);
683 strncat(libstr,"|gb|",n_libstr-20);
685 else {libstr[0]='\0';}
687 /* if we have a version number, use it, otherwise accession,
688 otherwise locus/description */
691 strncat(libstr,&ver[12],n_libstr-1-strlen(libstr));
692 strncat(libstr,"|",n_libstr-1-strlen(libstr));
694 else if (acc[0]=='A') {
695 strncat(libstr,&acc[12],n_libstr-1-strlen(libstr));
696 strncat(libstr," ",n_libstr-1-strlen(libstr));
699 strncat(libstr,locus,n_libstr-1-strlen(libstr));
700 strncat(libstr,&desc[11],n_libstr-1-strlen(libstr));
701 libstr[n_libstr-1]='\0';
705 /* this code seeks to provide both the various accession numbers
706 necessary to identify the sequence, and also some description.
708 Unfortunately, the various contributors to Genbank use three
709 slightly different formats for including the accession number.
711 (1)LOCUS HSJ214M20 107422 bp DNA HTG 16-JUN-2000
712 DEFINITION Homo sapiens chromosome 6 clone RP1-214M20 map p12.1-12.3, ***
713 SEQUENCING IN PROGRESS ***, in unordered pieces.
716 (2)LOCUS AL359201 117444 bp DNA HTG 15-JUN-2000
717 DEFINITION Homo sapiens chromosome 1 clone RP4-671C13 map p13.2-21.1, ***
718 SEQUENCING IN PROGRESS ***, in unordered pieces.
721 (3)LOCUS BB067000 280 bp mRNA EST 19-JUN-2000
722 DEFINITION BB067000 RIKEN full-length enriched, 15 days embryo male testis Mus
723 musculus cDNA clone 8030456L01 3', mRNA sequence.
726 This makes it more difficult to both provide the accession number in a
727 standard location and to conserve definition space
735 struct lmf_str *lm_fd)
737 char *bp, acc[MAX_STR], desc[MAX_STR];
739 FSEEK(lm_fd->libf, seek, 0);
740 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
741 if (lm_fd->lfflag) getc(lm_fd->libf);
743 lget_ann(lm_fd, str, cnt);
746 FSEEK(lm_fd->libf,seek,0);
747 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
748 if (lm_fd->lfflag) getc(lm_fd->libf);
752 pgetlib(unsigned char *seq,
758 struct lmf_str *lm_fd,
762 register unsigned char *cp, *seqp;
764 unsigned char *seqm, *seqm1;
769 seqm = &seq[maxs-11];
775 while (lm_fd->lline[0]!='E' || lm_fd->lline[1]!='N' || strncmp(lm_fd->lline,"ENTRY",5))
777 lm_fd->lpos = FTELL(lm_fd->libf);
778 if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
780 strncpy(libstr,&lm_fd->lline[16],8);
782 *libpos = lm_fd->lpos;
783 while (lm_fd->lline[2]!='Q' || lm_fd->lline[0]!='S' || strncmp(lm_fd->lline,"SEQUENCE",8))
784 { /* find SEQUENCE */
785 if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
787 fgets(lm_fd->lline,MAX_STR,lm_fd->libf); /* get the extra line */
790 for (cp= lm_fd->cpsave; seqp<seqm1; ) {
791 if ((*seqp++=ap[*cp++])<NA) continue;
792 if (*(--seqp)>NA) break;
794 if (*seqp==ES) goto done;
797 lm_fd->lline[0]='\0';
798 while (seqp<seqm1 && fgets(lm_fd->lline,MAX_STR,lm_fd->libf)!=NULL) {
799 if (lm_fd->lline[0]=='/') goto new;
800 for (cp= (unsigned char *)&lm_fd->lline[8]; seqp<seqm1; ) {
801 if ((*seqp++=ap[*cp++])<NA) continue;
802 if (*(--seqp)>NA) break;
804 if (*seqp==ES) goto done;
808 lm_fd->lpos = FTELL(lm_fd->libf);
809 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
819 /* if ((int)(seqp-seq)==0) return 1; */
820 return (int)(seqp-seq);
828 struct lmf_str *lm_fd)
832 FSEEK(lm_fd->libf, seek, 0);
833 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
835 strncpy(str,&lm_fd->lline[16],8);
837 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
838 while (lm_fd->lline[0]!='T' || lm_fd->lline[1]!='I' || strncmp(lm_fd->lline,"TITLE",5))
839 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
840 strncpy(&str[8],&lm_fd->lline[16],cnt-9);
842 if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
844 FSEEK(lm_fd->libf,seek,0);
845 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
849 egetlib(unsigned char *seq,
855 struct lmf_str *lm_fd,
860 register unsigned char *cp, *seqp;
862 unsigned char *seqm, *seqm1;
863 char id[11]; /* Holds Identifier */
868 seqm = &seq[maxs-11];
874 while (lm_fd->lline[0]!='I' || lm_fd->lline[1]!='D') { /* find ID */
875 lm_fd->lpos = FTELL(lm_fd->libf);
876 if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
877 if (lm_fd->lfflag) getc(lm_fd->libf);
879 sscanf(&lm_fd->lline[5],"%s",id);
880 sprintf(libstr,"%-12.12s",id);
882 *libpos = lm_fd->lpos;
883 while (lm_fd->lline[0]!='S' || lm_fd->lline[1]!='Q') { /* find SQ line */
884 if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
885 if (lm_fd->lfflag) getc(lm_fd->libf);
887 sscanf(&lm_fd->lline[14],"%ld",&lm_fd->gcg_len);
890 for (cp= lm_fd->cpsave; seqp<seqm1; ) {
891 if ((*seqp++=ap[*cp++])<NA) continue;
892 if (*(--seqp)>NA) break;
894 if (*seqp==ES) goto done;
897 lm_fd->lline[0]='\0';
898 while (seqp<seqm1 && fgets(lm_fd->lline,MAX_STR,lm_fd->libf)!=NULL) {
899 if (lm_fd->lfflag) getc(lm_fd->libf);
900 if (lm_fd->lline[0]=='/') goto new;
901 lm_fd->lline[70]='\0';
902 for (cp= (unsigned char *)&lm_fd->lline[5]; seqp<seqm1; ) {
903 if ((*seqp++=ap[*cp++])<NA &&
904 (*seqp++=ap[*cp++])<NA &&
905 (*seqp++=ap[*cp++])<NA &&
906 (*seqp++=ap[*cp++])<NA &&
907 (*seqp++=ap[*cp++])<NA &&
908 (*seqp++=ap[*cp++])<NA &&
909 (*seqp++=ap[*cp++])<NA &&
910 (*seqp++=ap[*cp++])<NA &&
911 (*seqp++=ap[*cp++])<NA &&
912 (*seqp++=ap[*cp++])<NA &&
913 (*seqp++=ap[*cp++])<NA) continue;
914 if (*(--seqp)>NA) break;
916 if (*seqp==ES) goto done;
919 new: lm_fd->lpos = FTELL(lm_fd->libf);
920 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
921 if (lm_fd->lfflag) getc(lm_fd->libf);
924 done: if (seqp>=seqm1) {
927 lm_fd->gcg_len -= (long)(seqp-seq);
932 /* if ((int)(seqp-seq)==0) return 1; */
933 /* if (*lcont==0 && (long)(seqp-seq)!=lm_fd->gcg_len)
934 printf("%s read %d of %d\n",libstr,(int)(seqp-seq),lm_fd->gcg_len);
936 return (int)(seqp-seq);
944 struct lmf_str *lm_fd)
947 char id[11]; /* Holds Identifier */
949 FSEEK(lm_fd->libf, seek, 0);
950 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
951 if (lm_fd->lfflag) getc(lm_fd->libf);
953 sscanf(&lm_fd->lline[5],"%s",id);
954 sprintf(str,"%-10.10s ",id);
955 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
956 if (lm_fd->lfflag) getc(lm_fd->libf);
957 while (lm_fd->lline[0]!='D' || lm_fd->lline[1]!='E') fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
958 strncpy(&str[11],&lm_fd->lline[5],cnt-11);
960 if ((bp = strchr(str,'\r'))!=NULL) *bp='\0';
961 if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
963 FSEEK(lm_fd->libf,seek,0);
964 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
965 if (lm_fd->lfflag) getc(lm_fd->libf);
969 igetlib(unsigned char *seq,
975 struct lmf_str *lm_fd,
978 register unsigned char *cp, *seqp;
980 unsigned char *seqm, *seqm1;
992 while (lm_fd->lline[0]!=';') {
993 lm_fd->lpos = FTELL(lm_fd->libf);
994 if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
996 *libpos = lm_fd->lpos;
997 while (lm_fd->lline[0]==';') fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
998 strncpy(libstr,lm_fd->lline+1,12);
1000 if((bp=strchr(libstr,'\n'))!=NULL) *bp='\0';
1003 lm_fd->lline[0]='\0';
1004 while (seqp<seqm1 && fgets((char *)seqp,(size_t)(seqm-seqp),lm_fd->libf)!=NULL) {
1005 if (*seqp=='>') goto new;
1007 if (strchr((char *)seqp,'\n')==NULL) goto cont;
1010 for (cp=seqp; seqp<seqm1; ) {
1011 if ((*seqp++=ap[*cp++])<NA &&
1012 (*seqp++=ap[*cp++])<NA &&
1013 (*seqp++=ap[*cp++])<NA &&
1014 (*seqp++=ap[*cp++])<NA &&
1015 (*seqp++=ap[*cp++])<NA &&
1016 (*seqp++=ap[*cp++])<NA &&
1017 (*seqp++=ap[*cp++])<NA &&
1018 (*seqp++=ap[*cp++])<NA &&
1019 (*seqp++=ap[*cp++])<NA) continue;
1020 if (*(--seqp)>NA) break;
1022 if (*seqp==ES) goto done;
1023 lm_fd->lpos = FTELL(lm_fd->libf);
1026 new: strncpy(lm_fd->lline,(char *)seqp,MAX_STR);
1027 lm_fd->lline[MAX_STR-1]='\0';
1028 if (strchr((char *)seqp,'\n')==NULL)
1029 fgets(lm_fd->lline,MAX_STR-strlen(lm_fd->lline),lm_fd->libf);
1033 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
1036 done: if (seqp>=seqm1) {
1045 /* if ((int)(seqp-seq)==0) return 1; */
1046 return (int)(seqp-seq);
1054 struct lmf_str *lm_fd)
1059 FSEEK(lm_fd->libf, seek, 0);
1060 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
1062 if (lm_fd->lline[0]=='>' || lm_fd->lline[0]==';') {
1063 strncpy(tline,lm_fd->lline+1,sizeof(tline));
1064 tline[sizeof(tline)-1]='\0';
1065 if ((bp = strchr(tline,'\n'))!=NULL) *bp='\0';
1071 while (lm_fd->lline[0]==';') fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
1072 if ((bp=strchr(lm_fd->lline,'\n'))!=NULL) *bp=0;
1073 if ((bp=strchr(lm_fd->lline,' '))!=NULL) *bp=0;
1074 strncpy(str,lm_fd->lline,cnt);
1076 strncat(str," ",cnt-strlen(str)-1);
1077 strncat(str,tline,cnt-strlen(str)-1);
1079 FSEEK(lm_fd->libf,seek,0);
1080 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
1084 vgetlib(unsigned char *seq,
1090 struct lmf_str *lm_fd,
1094 register unsigned char *cp, *seqp;
1096 unsigned char *seqm, *seqm1;
1102 seqm = &seq[maxs-9];
1108 while (lm_fd->lline[0]!='>' && lm_fd->lline[0]!=';') {
1109 lm_fd->lpos = FTELL(lm_fd->libf);
1110 if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
1111 if (lm_fd->lfflag) getc(lm_fd->libf);
1115 if ((bp=strchr(&lm_fd->lline[1],' ')) &&
1116 (bp=strchr(bp+1,SFCHAR))) {
1118 if ((tp = strtok(bp+1," \t\n"))!=NULL) sfnum[i++] = atoi(tp);
1119 while ((tp = strtok(NULL," \t")) != NULL) {
1120 sfnum[i++] = atoi(tp);
1124 if (nsfnum>1) sf_sort(sfnum,nsfnum);
1126 if (nsfnum < 1) fprintf(stderr," found | but no sfnum: %s\n",libstr);
1129 else sfnum[0]=nsfnum=0;
1132 if ((bp=strchr(lm_fd->lline,'\n'))!=NULL) *bp='\0';
1133 strncpy(libstr,&lm_fd->lline[4],12);
1135 if ((bp=strchr(libstr,' '))!=NULL) *bp='\0';
1136 if ((bp=strchr(libstr,'\n'))!=NULL) *bp='\0';
1138 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
1139 if (lm_fd->lfflag) getc(lm_fd->libf);
1141 if (n_libstr > 21) {
1143 strncat(libstr,lm_fd->lline,n_libstr-1-strlen(libstr));
1144 if ((bp=strchr(libstr,'\n'))!=NULL) *bp='\0';
1145 libstr[n_libstr-1]='\0';
1147 *libpos = lm_fd->lpos;
1150 lm_fd->lline[0]='\0';
1151 while (seqp<seqm1 && fgets((char *)seqp,(size_t)(seqm-seqp),lm_fd->libf)!=NULL) {
1152 if (lm_fd->lfflag && (ich=getc(lm_fd->libf))!=LFCHAR) ungetc(ich,lm_fd->libf);
1153 if (*seqp=='>') goto new;
1155 if (strchr((char *)seqp,'\n')==NULL) goto cont;
1158 for (cp=seqp; seqp<seqm1; ) {
1159 if ((*seqp++=ap[*cp++])<NA &&
1160 (*seqp++=ap[*cp++])<NA &&
1161 (*seqp++=ap[*cp++])<NA &&
1162 (*seqp++=ap[*cp++])<NA &&
1163 (*seqp++=ap[*cp++])<NA &&
1164 (*seqp++=ap[*cp++])<NA &&
1165 (*seqp++=ap[*cp++])<NA &&
1166 (*seqp++=ap[*cp++])<NA &&
1167 (*seqp++=ap[*cp++])<NA) continue;
1168 if (*(--seqp)>NA) break;
1170 if (*seqp==ES) goto done;
1171 lm_fd->lpos = FTELL(lm_fd->libf);
1175 strncpy(lm_fd->lline,(char *)seqp,MAX_STR);
1176 lm_fd->lline[MAX_STR-1]='\0';
1177 if (strchr((char *)seqp,'\n')==NULL) {
1178 fgets(&lm_fd->lline[strlen(lm_fd->lline)],MAX_STR-strlen(lm_fd->lline),lm_fd->libf);
1179 if (lm_fd->lfflag && (ich=getc(lm_fd->libf))!=LFCHAR) ungetc(ich,lm_fd->libf);
1184 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
1185 if (lm_fd->lfflag && (ich=getc(lm_fd->libf))!=LFCHAR) ungetc(ich,lm_fd->libf);
1197 /* if ((int)(seqp-seq)==0) return 1;*/
1198 return (int)(seqp-seq);
1206 struct lmf_str *lm_fd)
1210 FSEEK(lm_fd->libf, seek, 0);
1211 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
1212 if (lm_fd->lfflag) getc(lm_fd->libf);
1214 if (lm_fd->lline[0]=='>'&&(lm_fd->lline[3]==';'||lm_fd->lline[3]=='>')) {
1215 strncpy(str,&lm_fd->lline[4],cnt-1);
1218 if ((bp = strchr(str,':'))!=NULL) *bp='\0';
1219 if ((bp=strchr(str,'\r'))!=NULL) *bp='\0';
1220 else if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
1221 else str[cnt-1]='\0';
1223 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
1224 if (lm_fd->lfflag) getc(lm_fd->libf);
1226 /* skip over redundant stuff */
1227 for (llp=lm_fd->lline,bp=str; *llp==*bp; llp++,bp++);
1228 if ((int)(llp-lm_fd->lline)<5) llp = lm_fd->lline;
1230 if ((bp=strchr(llp,'\r'))!=NULL) *bp=' ';
1231 if ((bp=strchr(llp,'\n'))!=NULL) *bp='\0';
1232 strncat(str," ",(size_t)1);
1233 strncat(str,llp,(size_t)cnt-strlen(str)-1);
1239 FSEEK(lm_fd->libf,seek,0);
1240 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
1241 if (lm_fd->lfflag) getc(lm_fd->libf);
1244 static int gcg_bton[4]={2,4,1,3};
1247 gcg_getlib(unsigned char *seq,
1253 struct lmf_str *lm_fd,
1258 register unsigned char *cp, *seqp, stmp;
1261 unsigned char *seqm, *seqm1;
1262 long r_block, b_block;
1268 seqm = &seq[maxs-9];
1274 while (lm_fd->lline[0]!='>' && lm_fd->lline[0]!=';') {
1275 lm_fd->lpos = FTELL(lm_fd->libf);
1276 if (fgets(lm_fd->lline,MAX_STR,lm_fd->libf)==NULL) return (-1);
1278 sscanf(&lm_fd->lline[4],"%s %s %s %s %ld",
1279 libstr,gcg_date,gcg_type,dummy,&(lm_fd->gcg_len));
1281 lm_fd->gcg_binary = (gcg_type[0]=='2');
1283 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
1284 while (strchr((char *)lm_fd->lline,'\n')==NULL) {
1285 if (strlen(lm_fd->lline)<MAX_STR/2)
1286 fgets(&lm_fd->lline[strlen(lm_fd->lline)],MAX_STR/2,lm_fd->libf);
1288 fgets(&lm_fd->lline[strlen(lm_fd->lline)-MAX_STR/2],MAX_STR/2,lm_fd->libf);
1290 lm_fd->lline[MAX_STR-1]='\0';
1291 if (n_libstr <= 21) {
1295 strncat(libstr," ",1);
1296 strncat(libstr,lm_fd->lline,n_libstr-1-strlen(libstr));
1297 if ((bp = strchr(libstr,'\n'))!=NULL) *bp='\0';
1298 libstr[n_libstr-1]='\0';
1300 *libpos = lm_fd->lpos;
1303 lm_fd->lline[0]='\0';
1305 r_block = b_block = min((size_t)(seqm-seqp),lm_fd->gcg_len);
1306 if (lm_fd->gcg_binary) { r_block = (r_block+3)/4; }
1308 fread((char *)seqp,(size_t)r_block,(size_t)1,lm_fd->libf);
1309 if (!lm_fd->gcg_binary)
1310 for (cp=seqp; seqp<seq+r_block; ) *seqp++ = ap[*cp++];
1311 else if (lm_fd->gcg_binary) {
1312 seqp = seq + r_block;
1313 cp = seq + 4*r_block;
1314 while (seqp > seq) {
1316 *--cp = gcg_bton[stmp&3];
1317 *--cp = gcg_bton[(stmp >>= 2)&3];
1318 *--cp = gcg_bton[(stmp >>= 2)&3];
1319 *--cp = gcg_bton[(stmp >>= 2)&3];
1322 if (4 * r_block >= lm_fd->gcg_len) {
1323 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
1327 if (lm_fd->gcg_binary) b_block = 4*r_block;
1328 lm_fd->gcg_len -= b_block;
1332 seq[b_block] = EOSEQ;
1333 /* if (b_block==0) return 1; else */
1338 gcg_ranlib(char *str,
1342 struct lmf_str *lm_fd)
1344 char *bp, *bp1, *llp;
1346 FSEEK(lm_fd->libf, seek, 0);
1347 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
1349 if (lm_fd->lline[0]=='>'&&(lm_fd->lline[3]==';'||lm_fd->lline[3]=='>')) {
1350 strncpy(str,&lm_fd->lline[4],cnt-1);
1352 if ((bp = strchr(str,' '))!=NULL) *bp='\0';
1353 else if ((bp=strchr(str,'\r'))!=NULL) *bp='\0';
1354 else if ((bp = strchr(str,'\n'))!=NULL) *bp='\0';
1355 else str[cnt-1]='\0';
1357 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
1359 /* check whether the beginning of the line duplicates the name in str */
1360 for (llp=lm_fd->lline,bp=str; *llp == *bp; llp++,bp++);
1361 if ((int)(llp-lm_fd->lline)<5) llp = lm_fd->lline;
1363 /* here we would like to skip over some species stuff */
1365 if ((bp1 = strchr(llp,';'))!=NULL && (int)(bp1-llp)<50) {
1366 if ((bp2 = strchr(bp1+1,';'))!=NULL && (int)(bp2-bp1)<50) {
1367 *(bp2+1)='\0'; bp1 = bp2+2;
1371 else if ((bp1=strchr(llp,'.'))!=NULL && *(bp1+1)==' ') {
1372 *(bp1+1) = '\0'; bp1 += 2;}
1377 if ((bp=strchr(bp1,'\r'))!=NULL) *bp='\0';
1378 if ((bp=strchr(bp1,'\n'))!=NULL) *bp='\0';
1379 strncat(str," ",(size_t)1);
1380 strncat(str,bp1,(size_t)cnt-strlen(str));
1381 if (bp1!=llp) strncat(str,llp,(size_t)cnt-strlen(str));
1387 FSEEK(lm_fd->libf,seek,0);
1388 fgets(lm_fd->lline,MAX_STR,lm_fd->libf);
1400 for (i=0; i<n-1; i++)
1401 if (s[i]>s[i+1]) goto l2;
1405 for (gap=n/2; gap>0; gap/=2)
1406 for (i=gap; i<n; i++)
1407 for (j=i-gap; j>=0; j -= gap) {
1408 if (s[j] <= s[j+gap]) break;