--- /dev/null
+/* May, June 1987 - modified for rapid read of database
+
+ copyright (c) 1987,1988,1989,1992,1995,2000 William R. Pearson
+
+ This is one of three alternative files that can be used to
+ read a database. The three files are nxgetaa.c, nmgetaa.c, and
+ mmgetaa.c.
+
+ nxgetaa.c contains the original code for reading databases, and
+ is still used for Mac and PC versions of fasta33 (which do not
+ use mmap).
+
+ nmgetaa.c and mmgetaa.c are used together. nmgetaa.c provides
+ the same functions as nxgetaa.c if memory mapping is not used,
+ mmgetaa.c provides the database reading functions if memory
+ mapping is used. The decision to use memory mapping is made on
+ a file-by-file basis.
+
+ June 2, 1987 - added TFASTA
+ March 30, 1988 - combined ffgetaa, fgetgb;
+ April 8, 1988 - added PIRLIB format for unix
+ Feb 4, 1989 - added universal subroutines for libraries
+ December, 1995 - added range option file.name:1-1000
+ Feb 22, 2002 - fix to allow "plain" text file queries
+
+ getnt.c associated subroutines for matching sequences */
+
+/* $Name: fa_34_26_5 $ - $Id: getseq.c,v 1.13 2006/10/05 18:22:07 wrp Exp $ */
+
+/*
+ 8-April-88
+ The compile time #define PIRLIB allows this routine to be used
+ to read protein and DNA sequence libraries in the NBRF/PIR
+ VAX/VMS library format. That is:
+
+ >P1;LCBO
+ This is a line of description
+ GTYH ... the sequence starts on this line
+
+ This may ease conversion from UWGCG format libraries. It
+ has not been extensively tested.
+
+ In addition, sequence libraries with a '>' in the 4th position
+ are recognized as NBRF format libraries for consistency with
+ UWGCG
+*/
+
+/* Nov 12, 1987 - this version checks to see if the sequence
+ is DNA or protein by asking whether > 85% is A, C, G, T
+
+ May 5, 1988 - modify the DNA/PROTEIN checker by re-reading
+ DNA sequences in order to check for 'U'.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "defs.h"
+#include "structs.h"
+
+#ifndef SFCHAR
+#define SFCHAR ':'
+#endif
+
+#ifdef VMS
+#define PIRLIB
+#endif
+
+#define XTERNAL
+#include "uascii.h"
+#include "upam.h"
+#undef XTERNAL
+
+#define YES 1
+#define NO 0
+#define MAXLINE 512
+
+#ifndef min
+#define min(x,y) ((x) > (y) ? (y) : (x))
+#endif
+
+#ifdef SUPERFAMNUM
+extern int nsfnum; /* number of superfamily numbers */
+extern int sfnum[]; /* superfamily number from types 0 and 5 */
+extern int nsfnum_n;
+extern int sfnum_n[];
+#endif
+
+#define NO_FORMAT 0
+#define FASTA_FORMAT 1
+#define GCG_FORMAT 2
+
+static int seq_format=NO_FORMAT;
+static char seq_title[200];
+
+int scanseq(unsigned char *, int, char *);
+void sf_sort(int *, int);
+extern void init_ascii(int is_ext, int *sascii, int is_dna);
+
+/* getseq - get a query sequence, possibly re-reading to set type
+ returns - length of query sequence or error = 0
+
+ char *filen - name of file to be opened
+ char *seq - destination for query sequence
+ int maxs - maximum length of query
+ char libstr[20] - short description (locus or acc)
+ int *dnaseq - -1 => use scanseq to determine sequence type
+ 0 => must be protein
+ 1 => must be DNA
+ long *sq0off - offset into query specified by query_file:1001-2000
+*/
+
+int
+getseq(char *filen, int *qascii, unsigned char *seq, int maxs, char *libstr, long *sq0off)
+{
+ FILE *fptr;
+ char line[512],*bp, *bp1, *bpn, *tp;
+ int i, rn, n;
+ int ic;
+ int sstart, sstop, sset=0;
+ int llen, l_offset;
+#ifdef SUPERFAMNUM
+ static char tline[MAXLINE];
+#endif
+
+ seq_title[0]='\0';
+ libstr[0]='\0';
+
+ sstart = sstop = -1;
+#ifndef DOS
+ if ((bp=strchr(filen,':'))!=NULL && *(bp+1)!='\0') {
+#else
+ if ((bp=strchr(filen+3,':'))!=NULL && *(bp+1)!='\0') {
+#endif
+ *bp='\0';
+ if (*(bp+1)=='-') {
+ sstart = 0;
+ sscanf(bp+2,"%d",&sstop);
+ }
+ else {
+ sscanf(bp+1,"%d-%d",&sstart,&sstop);
+ sstart--;
+ if (sstop <= 0 ) sstop = BIGNUM;
+ }
+ sset=1;
+ }
+ else {
+ sstart = 0;
+ sstop = BIGNUM;
+ }
+
+ /* check for input from stdin */
+ if (strcmp(filen,"-") && strcmp(filen,"@")) {
+ if ((fptr=fopen(filen,"r"))==NULL) {
+ fprintf(stderr," could not open %s\n",filen);
+ return 0;
+ }
+ }
+ else {
+ fptr = stdin;
+ }
+ rn = n=0;
+
+ while(fgets(line,sizeof(line),fptr)!=NULL) {
+#ifdef PIRLIB
+ if (line[0]=='>'&& (line[3]==';'||line[3]=='>'))
+ fgets(line,sizeof(line),fptr);
+ else
+#endif
+ l_offset = 0;
+ if (line[0]=='>') {
+ seq_format = FASTA_FORMAT;
+#ifdef SUPERFAMNUM
+ sfnum[nsfnum=0]= sfnum_n[nsfnum_n=0]=0;
+ strncpy(tline,line+1,sizeof(tline));
+ tline[sizeof(tline)-1]='\0';
+
+ if ((bp=strchr(tline,' ')) && (bp=strchr(bp+1,SFCHAR))) {
+ if ((bp1=strchr(bp+1,SFCHAR))==NULL) {
+ fprintf(stderr," second %c missing: %s\n",SFCHAR,tline);
+ }
+ else {
+ if ((bpn=strchr(bp+1,NSFCHAR))!=NULL) *bpn = '\0';
+ *bp1 = '\0';
+ i = 0;
+ if ((tp = strtok(bp+1," \t"))!=NULL) {
+ sfnum[i++] = atoi(tp);
+ while ((tp = strtok((char *)NULL," \t")) != (char *)NULL) {
+ if (isdigit(*tp)) sfnum[i++] = atoi(tp);
+ if (i>=9) break;
+ }
+ }
+ sfnum[nsfnum=i]= 0;
+ if (nsfnum>1) sf_sort(sfnum,nsfnum);
+ else {
+ if (nsfnum < 1) fprintf(stderr," found | but no sfnum: %s\n",libstr);
+ }
+ if (bpn != NULL) {
+ tp = strtok(bpn+1," \t");
+ sfnum_n[0]=atoi(tp);
+ i = 1;
+ while ((tp=strtok(NULL," \t"))!=NULL) {
+ sfnum_n[i++] = atoi(tp);
+ if (i >= 10) {
+ fprintf(stderr,
+ " error - too many negative superfamilies: %d\n %s\n",
+ i,tline);
+ break;
+ }
+ }
+ sfnum_n[nsfnum_n=i]=0;
+ sf_sort(sfnum_n,nsfnum_n);
+ }
+ }
+ }
+ else {
+ sfnum[nsfnum = 0] = 0;
+ sfnum_n[nsfnum_n = 0] = 0;
+ }
+#endif
+ if ((bp=(char *)strchr(line,'\n'))!=NULL) *bp='\0';
+ strncpy(seq_title,line+1,sizeof(seq_title));
+ seq_title[sizeof(seq_title)-1]='\0';
+ if ((bp=(char *)strchr(line,' '))!=NULL) *bp='\0';
+ strncpy(libstr,line+1,12);
+ libstr[12]='\0';
+ }
+ else if (seq_format==NO_FORMAT && strcmp(line,"..")==0) {
+ seq_format = GCG_FORMAT;
+/*
+ if (*dnaseq != 1) qascii['*'] = qascii['X'];
+*/
+ l_offset = 10;
+ llen = strlen(line);
+ while (strncmp(&line[llen-3],"..\n",(size_t)3) != 0) {
+ if (fgets(line,sizeof(line),fptr)==NULL) return 0;
+ llen = strlen(line);
+ }
+ bp = strtok(line," \t");
+/*
+ if ((bp=(char *)strchr(line,' '))!=NULL) *bp='\0';
+ else if ((bp=(char *)strchr(line,'\n'))!=NULL) *bp='\0';
+*/
+ if (bp!=NULL) strncpy(libstr,bp,12);
+ else strncpy(libstr,filen,12);
+ libstr[12]='\0';
+ if (fgets(line,sizeof(line),fptr)==NULL) return 0;
+ }
+ else {
+ if (libstr[0]=='\0') strncpy(libstr,filen,12);
+ libstr[12]='\0';
+ }
+
+ if (seq_format==GCG_FORMAT && strlen(line)<l_offset) continue;
+
+ if (line[0]!='>'&& line[0]!=';') {
+ for (i=l_offset; (n<maxs && rn < sstop)&&
+ ((ic=qascii[line[i]&AAMASK])<EL); i++)
+ if (ic<NA && ++rn > sstart) seq[n++]= ic;
+ if (ic == ES || rn > sstop) break;
+ }
+ }
+
+ if (n==maxs) {
+ fprintf(stderr," sequence may be truncated %d %d\n",n,maxs);
+ fflush(stderr);
+ }
+ if ((bp=strchr(libstr,'\n'))!=NULL) *bp = '\0';
+ if ((bp=strchr(libstr,'\r'))!=NULL) *bp = '\0';
+ seq[n]= EOSEQ;
+
+
+ if (seq_format !=GCG_FORMAT)
+ while(fgets(line,sizeof(line),fptr)!=NULL) {
+#ifdef PIRLIB
+ if (line[0]=='>'&& (line[3]==';'||line[3]=='>'))
+ fgets(line,sizeof(line),fptr);
+ else
+#endif
+ if (line[0]!='>'&& line[0]!=';') {
+ for (i=0; (n<maxs && rn < sstop)&&
+ ((ic=qascii[line[i]&AAMASK])<EL); i++)
+ if (ic<NA && ++rn > sstart ) seq[n++]= ic;
+ if (ic == ES || rn > sstop) break;
+ }
+ }
+ else {
+ llen = strlen(line);
+ while (strncmp(&line[llen-3],"..\n",(size_t)3) != 0) {
+ if (fgets(line,sizeof(line),fptr)==NULL) return 0;
+ llen = strlen(line);
+ }
+ while (fgets(line,sizeof(line),fptr)!=NULL) {
+ if (strlen(line)<l_offset) continue;
+ for (i=l_offset; (n<maxs && rn < sstop) &&
+ ((ic=qascii[line[i]&AAMASK])<EL); i++)
+ if (ic<NA && ++rn > sstart ) seq[n++]= ic;
+ if (ic == ES || rn > sstop ) break;
+ }
+ }
+
+ if (n==maxs) {
+ fprintf(stderr," sequence may be truncated %d %d\n",n,maxs);
+ fflush(stderr);
+ }
+ seq[n]= EOSEQ;
+
+ if (fptr!=stdin) fclose(fptr);
+
+ if (sset==1) {
+ sstart++;
+ filen[strlen(filen)]=':';
+ if (*sq0off==1 || sstart>=1) *sq0off = sstart;
+ }
+
+ return n;
+}
+
+int
+gettitle(char *filen, char *title, int len) {
+ FILE *fptr;
+ char line[512];
+ char *bp;
+ int sset;
+#ifdef WIN32
+ char *strpbrk();
+#endif
+
+ sset = 0;
+
+ if (strncmp(filen,"-",1)==0 || strncmp(filen,"@",1)==0) {
+ strncpy(title,seq_title,len);
+ title[len-1]='\0';
+ return (int)strlen(title);
+ }
+
+ if ((bp=strchr(filen,':'))!=NULL) { *bp='\0'; sset=1;}
+
+
+ if ((fptr=fopen(filen,"r"))==NULL) {
+ fprintf(stderr," file %s was not found\n",filen);
+ fflush(stderr);
+ return 0;
+ }
+
+ if (sset==1) filen[strlen(filen)]=':';
+
+ while(fgets(line,sizeof(line),fptr)!=NULL) {
+ if (line[0]=='>'|| line[0]==';') goto found;
+ }
+ fclose(fptr);
+ title[0]='\0';
+ return 0;
+
+ found:
+
+#ifdef PIRLIB
+ if (line[0]=='>'&&(line[3]==';'||line[3]=='>')) {
+ if ((bp = strchr(line,'\n'))!=NULL) *bp='\0';
+ ll=strlen(line); line[ll++]=' '; line[ll]='\0';
+ fgets(&line[ll],sizeof(line)-ll,fptr);
+ }
+#endif
+
+#ifdef WIN32
+ bp = strpbrk(line,"\n\r");
+#else
+ bp = strchr(line,'\n');
+#endif
+ if (bp!=NULL) *bp = 0;
+ strncpy(title,line,len);
+ title[len-1]='\0';
+ fclose(fptr);
+ return strlen(title);
+}
+