1 /* May, June 1987 - modified for rapid read of database
3 copyright (c) 1987,1988,1989,1992,1995,2000 William R. Pearson
5 This is one of three alternative files that can be used to
6 read a database. The three files are nxgetaa.c, nmgetaa.c, and
9 nxgetaa.c contains the original code for reading databases, and
10 is still used for Mac and PC versions of fasta33 (which do not
13 nmgetaa.c and mmgetaa.c are used together. nmgetaa.c provides
14 the same functions as nxgetaa.c if memory mapping is not used,
15 mmgetaa.c provides the database reading functions if memory
16 mapping is used. The decision to use memory mapping is made on
19 June 2, 1987 - added TFASTA
20 March 30, 1988 - combined ffgetaa, fgetgb;
21 April 8, 1988 - added PIRLIB format for unix
22 Feb 4, 1989 - added universal subroutines for libraries
23 December, 1995 - added range option file.name:1-1000
24 Feb 22, 2002 - fix to allow "plain" text file queries
26 getnt.c associated subroutines for matching sequences */
28 /* $Name: fa_34_26_5 $ - $Id: getseq.c,v 1.13 2006/10/05 18:22:07 wrp Exp $ */
32 The compile time #define PIRLIB allows this routine to be used
33 to read protein and DNA sequence libraries in the NBRF/PIR
34 VAX/VMS library format. That is:
37 This is a line of description
38 GTYH ... the sequence starts on this line
40 This may ease conversion from UWGCG format libraries. It
41 has not been extensively tested.
43 In addition, sequence libraries with a '>' in the 4th position
44 are recognized as NBRF format libraries for consistency with
48 /* Nov 12, 1987 - this version checks to see if the sequence
49 is DNA or protein by asking whether > 85% is A, C, G, T
51 May 5, 1988 - modify the DNA/PROTEIN checker by re-reading
52 DNA sequences in order to check for 'U'.
80 #define min(x,y) ((x) > (y) ? (y) : (x))
84 extern int nsfnum; /* number of superfamily numbers */
85 extern int sfnum[]; /* superfamily number from types 0 and 5 */
91 #define FASTA_FORMAT 1
94 static int seq_format=NO_FORMAT;
95 static char seq_title[200];
97 int scanseq(unsigned char *, int, char *);
98 void sf_sort(int *, int);
99 extern void init_ascii(int is_ext, int *sascii, int is_dna);
101 /* getseq - get a query sequence, possibly re-reading to set type
102 returns - length of query sequence or error = 0
104 char *filen - name of file to be opened
105 char *seq - destination for query sequence
106 int maxs - maximum length of query
107 char libstr[20] - short description (locus or acc)
108 int *dnaseq - -1 => use scanseq to determine sequence type
111 long *sq0off - offset into query specified by query_file:1001-2000
115 getseq(char *filen, int *qascii, unsigned char *seq, int maxs, char *libstr, long *sq0off)
118 char line[512],*bp, *bp1, *bpn, *tp;
121 int sstart, sstop, sset=0;
124 static char tline[MAXLINE];
132 if ((bp=strchr(filen,':'))!=NULL && *(bp+1)!='\0') {
134 if ((bp=strchr(filen+3,':'))!=NULL && *(bp+1)!='\0') {
139 sscanf(bp+2,"%d",&sstop);
142 sscanf(bp+1,"%d-%d",&sstart,&sstop);
144 if (sstop <= 0 ) sstop = BIGNUM;
153 /* check for input from stdin */
154 if (strcmp(filen,"-") && strcmp(filen,"@")) {
155 if ((fptr=fopen(filen,"r"))==NULL) {
156 fprintf(stderr," could not open %s\n",filen);
165 while(fgets(line,sizeof(line),fptr)!=NULL) {
167 if (line[0]=='>'&& (line[3]==';'||line[3]=='>'))
168 fgets(line,sizeof(line),fptr);
173 seq_format = FASTA_FORMAT;
175 sfnum[nsfnum=0]= sfnum_n[nsfnum_n=0]=0;
176 strncpy(tline,line+1,sizeof(tline));
177 tline[sizeof(tline)-1]='\0';
179 if ((bp=strchr(tline,' ')) && (bp=strchr(bp+1,SFCHAR))) {
180 if ((bp1=strchr(bp+1,SFCHAR))==NULL) {
181 fprintf(stderr," second %c missing: %s\n",SFCHAR,tline);
184 if ((bpn=strchr(bp+1,NSFCHAR))!=NULL) *bpn = '\0';
187 if ((tp = strtok(bp+1," \t"))!=NULL) {
188 sfnum[i++] = atoi(tp);
189 while ((tp = strtok((char *)NULL," \t")) != (char *)NULL) {
190 if (isdigit(*tp)) sfnum[i++] = atoi(tp);
195 if (nsfnum>1) sf_sort(sfnum,nsfnum);
197 if (nsfnum < 1) fprintf(stderr," found | but no sfnum: %s\n",libstr);
200 tp = strtok(bpn+1," \t");
203 while ((tp=strtok(NULL," \t"))!=NULL) {
204 sfnum_n[i++] = atoi(tp);
207 " error - too many negative superfamilies: %d\n %s\n",
212 sfnum_n[nsfnum_n=i]=0;
213 sf_sort(sfnum_n,nsfnum_n);
218 sfnum[nsfnum = 0] = 0;
219 sfnum_n[nsfnum_n = 0] = 0;
222 if ((bp=(char *)strchr(line,'\n'))!=NULL) *bp='\0';
223 strncpy(seq_title,line+1,sizeof(seq_title));
224 seq_title[sizeof(seq_title)-1]='\0';
225 if ((bp=(char *)strchr(line,' '))!=NULL) *bp='\0';
226 strncpy(libstr,line+1,12);
229 else if (seq_format==NO_FORMAT && strcmp(line,"..")==0) {
230 seq_format = GCG_FORMAT;
232 if (*dnaseq != 1) qascii['*'] = qascii['X'];
236 while (strncmp(&line[llen-3],"..\n",(size_t)3) != 0) {
237 if (fgets(line,sizeof(line),fptr)==NULL) return 0;
240 bp = strtok(line," \t");
242 if ((bp=(char *)strchr(line,' '))!=NULL) *bp='\0';
243 else if ((bp=(char *)strchr(line,'\n'))!=NULL) *bp='\0';
245 if (bp!=NULL) strncpy(libstr,bp,12);
246 else strncpy(libstr,filen,12);
248 if (fgets(line,sizeof(line),fptr)==NULL) return 0;
251 if (libstr[0]=='\0') strncpy(libstr,filen,12);
255 if (seq_format==GCG_FORMAT && strlen(line)<l_offset) continue;
257 if (line[0]!='>'&& line[0]!=';') {
258 for (i=l_offset; (n<maxs && rn < sstop)&&
259 ((ic=qascii[line[i]&AAMASK])<EL); i++)
260 if (ic<NA && ++rn > sstart) seq[n++]= ic;
261 if (ic == ES || rn > sstop) break;
266 fprintf(stderr," sequence may be truncated %d %d\n",n,maxs);
269 if ((bp=strchr(libstr,'\n'))!=NULL) *bp = '\0';
270 if ((bp=strchr(libstr,'\r'))!=NULL) *bp = '\0';
274 if (seq_format !=GCG_FORMAT)
275 while(fgets(line,sizeof(line),fptr)!=NULL) {
277 if (line[0]=='>'&& (line[3]==';'||line[3]=='>'))
278 fgets(line,sizeof(line),fptr);
281 if (line[0]!='>'&& line[0]!=';') {
282 for (i=0; (n<maxs && rn < sstop)&&
283 ((ic=qascii[line[i]&AAMASK])<EL); i++)
284 if (ic<NA && ++rn > sstart ) seq[n++]= ic;
285 if (ic == ES || rn > sstop) break;
290 while (strncmp(&line[llen-3],"..\n",(size_t)3) != 0) {
291 if (fgets(line,sizeof(line),fptr)==NULL) return 0;
294 while (fgets(line,sizeof(line),fptr)!=NULL) {
295 if (strlen(line)<l_offset) continue;
296 for (i=l_offset; (n<maxs && rn < sstop) &&
297 ((ic=qascii[line[i]&AAMASK])<EL); i++)
298 if (ic<NA && ++rn > sstart ) seq[n++]= ic;
299 if (ic == ES || rn > sstop ) break;
304 fprintf(stderr," sequence may be truncated %d %d\n",n,maxs);
309 if (fptr!=stdin) fclose(fptr);
313 filen[strlen(filen)]=':';
314 if (*sq0off==1 || sstart>=1) *sq0off = sstart;
321 gettitle(char *filen, char *title, int len) {
332 if (strncmp(filen,"-",1)==0 || strncmp(filen,"@",1)==0) {
333 strncpy(title,seq_title,len);
335 return (int)strlen(title);
338 if ((bp=strchr(filen,':'))!=NULL) { *bp='\0'; sset=1;}
341 if ((fptr=fopen(filen,"r"))==NULL) {
342 fprintf(stderr," file %s was not found\n",filen);
347 if (sset==1) filen[strlen(filen)]=':';
349 while(fgets(line,sizeof(line),fptr)!=NULL) {
350 if (line[0]=='>'|| line[0]==';') goto found;
359 if (line[0]=='>'&&(line[3]==';'||line[3]=='>')) {
360 if ((bp = strchr(line,'\n'))!=NULL) *bp='\0';
361 ll=strlen(line); line[ll++]=' '; line[ll]='\0';
362 fgets(&line[ll],sizeof(line)-ll,fptr);
367 bp = strpbrk(line,"\n\r");
369 bp = strchr(line,'\n');
371 if (bp!=NULL) *bp = 0;
372 strncpy(title,line,len);
375 return strlen(title);