/* May, June 1987 - modified for rapid read of database copyright (c) 1987,1988,1989,1992,1995,2000 William R. Pearson This is one of three alternative files that can be used to read a database. The three files are nxgetaa.c, nmgetaa.c, and mmgetaa.c. nxgetaa.c contains the original code for reading databases, and is still used for Mac and PC versions of fasta33 (which do not use mmap). nmgetaa.c and mmgetaa.c are used together. nmgetaa.c provides the same functions as nxgetaa.c if memory mapping is not used, mmgetaa.c provides the database reading functions if memory mapping is used. The decision to use memory mapping is made on a file-by-file basis. June 2, 1987 - added TFASTA March 30, 1988 - combined ffgetaa, fgetgb; April 8, 1988 - added PIRLIB format for unix Feb 4, 1989 - added universal subroutines for libraries December, 1995 - added range option file.name:1-1000 Feb 22, 2002 - fix to allow "plain" text file queries getnt.c associated subroutines for matching sequences */ /* $Name: fa_34_26_5 $ - $Id: getseq.c,v 1.13 2006/10/05 18:22:07 wrp Exp $ */ /* 8-April-88 The compile time #define PIRLIB allows this routine to be used to read protein and DNA sequence libraries in the NBRF/PIR VAX/VMS library format. That is: >P1;LCBO This is a line of description GTYH ... the sequence starts on this line This may ease conversion from UWGCG format libraries. It has not been extensively tested. In addition, sequence libraries with a '>' in the 4th position are recognized as NBRF format libraries for consistency with UWGCG */ /* Nov 12, 1987 - this version checks to see if the sequence is DNA or protein by asking whether > 85% is A, C, G, T May 5, 1988 - modify the DNA/PROTEIN checker by re-reading DNA sequences in order to check for 'U'. */ #include #include #include #include "defs.h" #include "structs.h" #ifndef SFCHAR #define SFCHAR ':' #endif #ifdef VMS #define PIRLIB #endif #define XTERNAL #include "uascii.h" #include "upam.h" #undef XTERNAL #define YES 1 #define NO 0 #define MAXLINE 512 #ifndef min #define min(x,y) ((x) > (y) ? (y) : (x)) #endif #ifdef SUPERFAMNUM extern int nsfnum; /* number of superfamily numbers */ extern int sfnum[]; /* superfamily number from types 0 and 5 */ extern int nsfnum_n; extern int sfnum_n[]; #endif #define NO_FORMAT 0 #define FASTA_FORMAT 1 #define GCG_FORMAT 2 static int seq_format=NO_FORMAT; static char seq_title[200]; int scanseq(unsigned char *, int, char *); void sf_sort(int *, int); extern void init_ascii(int is_ext, int *sascii, int is_dna); /* getseq - get a query sequence, possibly re-reading to set type returns - length of query sequence or error = 0 char *filen - name of file to be opened char *seq - destination for query sequence int maxs - maximum length of query char libstr[20] - short description (locus or acc) int *dnaseq - -1 => use scanseq to determine sequence type 0 => must be protein 1 => must be DNA long *sq0off - offset into query specified by query_file:1001-2000 */ int getseq(char *filen, int *qascii, unsigned char *seq, int maxs, char *libstr, long *sq0off) { FILE *fptr; char line[512],*bp, *bp1, *bpn, *tp; int i, rn, n; int ic; int sstart, sstop, sset=0; int llen, l_offset; #ifdef SUPERFAMNUM static char tline[MAXLINE]; #endif seq_title[0]='\0'; libstr[0]='\0'; sstart = sstop = -1; #ifndef DOS if ((bp=strchr(filen,':'))!=NULL && *(bp+1)!='\0') { #else if ((bp=strchr(filen+3,':'))!=NULL && *(bp+1)!='\0') { #endif *bp='\0'; if (*(bp+1)=='-') { sstart = 0; sscanf(bp+2,"%d",&sstop); } else { sscanf(bp+1,"%d-%d",&sstart,&sstop); sstart--; if (sstop <= 0 ) sstop = BIGNUM; } sset=1; } else { sstart = 0; sstop = BIGNUM; } /* check for input from stdin */ if (strcmp(filen,"-") && strcmp(filen,"@")) { if ((fptr=fopen(filen,"r"))==NULL) { fprintf(stderr," could not open %s\n",filen); return 0; } } else { fptr = stdin; } rn = n=0; while(fgets(line,sizeof(line),fptr)!=NULL) { #ifdef PIRLIB if (line[0]=='>'&& (line[3]==';'||line[3]=='>')) fgets(line,sizeof(line),fptr); else #endif l_offset = 0; if (line[0]=='>') { seq_format = FASTA_FORMAT; #ifdef SUPERFAMNUM sfnum[nsfnum=0]= sfnum_n[nsfnum_n=0]=0; strncpy(tline,line+1,sizeof(tline)); tline[sizeof(tline)-1]='\0'; if ((bp=strchr(tline,' ')) && (bp=strchr(bp+1,SFCHAR))) { if ((bp1=strchr(bp+1,SFCHAR))==NULL) { fprintf(stderr," second %c missing: %s\n",SFCHAR,tline); } else { if ((bpn=strchr(bp+1,NSFCHAR))!=NULL) *bpn = '\0'; *bp1 = '\0'; i = 0; if ((tp = strtok(bp+1," \t"))!=NULL) { sfnum[i++] = atoi(tp); while ((tp = strtok((char *)NULL," \t")) != (char *)NULL) { if (isdigit(*tp)) sfnum[i++] = atoi(tp); if (i>=9) break; } } sfnum[nsfnum=i]= 0; if (nsfnum>1) sf_sort(sfnum,nsfnum); else { if (nsfnum < 1) fprintf(stderr," found | but no sfnum: %s\n",libstr); } if (bpn != NULL) { tp = strtok(bpn+1," \t"); sfnum_n[0]=atoi(tp); i = 1; while ((tp=strtok(NULL," \t"))!=NULL) { sfnum_n[i++] = atoi(tp); if (i >= 10) { fprintf(stderr, " error - too many negative superfamilies: %d\n %s\n", i,tline); break; } } sfnum_n[nsfnum_n=i]=0; sf_sort(sfnum_n,nsfnum_n); } } } else { sfnum[nsfnum = 0] = 0; sfnum_n[nsfnum_n = 0] = 0; } #endif if ((bp=(char *)strchr(line,'\n'))!=NULL) *bp='\0'; strncpy(seq_title,line+1,sizeof(seq_title)); seq_title[sizeof(seq_title)-1]='\0'; if ((bp=(char *)strchr(line,' '))!=NULL) *bp='\0'; strncpy(libstr,line+1,12); libstr[12]='\0'; } else if (seq_format==NO_FORMAT && strcmp(line,"..")==0) { seq_format = GCG_FORMAT; /* if (*dnaseq != 1) qascii['*'] = qascii['X']; */ l_offset = 10; llen = strlen(line); while (strncmp(&line[llen-3],"..\n",(size_t)3) != 0) { if (fgets(line,sizeof(line),fptr)==NULL) return 0; llen = strlen(line); } bp = strtok(line," \t"); /* if ((bp=(char *)strchr(line,' '))!=NULL) *bp='\0'; else if ((bp=(char *)strchr(line,'\n'))!=NULL) *bp='\0'; */ if (bp!=NULL) strncpy(libstr,bp,12); else strncpy(libstr,filen,12); libstr[12]='\0'; if (fgets(line,sizeof(line),fptr)==NULL) return 0; } else { if (libstr[0]=='\0') strncpy(libstr,filen,12); libstr[12]='\0'; } if (seq_format==GCG_FORMAT && strlen(line) sstart) seq[n++]= ic; if (ic == ES || rn > sstop) break; } } if (n==maxs) { fprintf(stderr," sequence may be truncated %d %d\n",n,maxs); fflush(stderr); } if ((bp=strchr(libstr,'\n'))!=NULL) *bp = '\0'; if ((bp=strchr(libstr,'\r'))!=NULL) *bp = '\0'; seq[n]= EOSEQ; if (seq_format !=GCG_FORMAT) while(fgets(line,sizeof(line),fptr)!=NULL) { #ifdef PIRLIB if (line[0]=='>'&& (line[3]==';'||line[3]=='>')) fgets(line,sizeof(line),fptr); else #endif if (line[0]!='>'&& line[0]!=';') { for (i=0; (n sstart ) seq[n++]= ic; if (ic == ES || rn > sstop) break; } } else { llen = strlen(line); while (strncmp(&line[llen-3],"..\n",(size_t)3) != 0) { if (fgets(line,sizeof(line),fptr)==NULL) return 0; llen = strlen(line); } while (fgets(line,sizeof(line),fptr)!=NULL) { if (strlen(line) sstart ) seq[n++]= ic; if (ic == ES || rn > sstop ) break; } } if (n==maxs) { fprintf(stderr," sequence may be truncated %d %d\n",n,maxs); fflush(stderr); } seq[n]= EOSEQ; if (fptr!=stdin) fclose(fptr); if (sset==1) { sstart++; filen[strlen(filen)]=':'; if (*sq0off==1 || sstart>=1) *sq0off = sstart; } return n; } int gettitle(char *filen, char *title, int len) { FILE *fptr; char line[512]; char *bp; int sset; #ifdef WIN32 char *strpbrk(); #endif sset = 0; if (strncmp(filen,"-",1)==0 || strncmp(filen,"@",1)==0) { strncpy(title,seq_title,len); title[len-1]='\0'; return (int)strlen(title); } if ((bp=strchr(filen,':'))!=NULL) { *bp='\0'; sset=1;} if ((fptr=fopen(filen,"r"))==NULL) { fprintf(stderr," file %s was not found\n",filen); fflush(stderr); return 0; } if (sset==1) filen[strlen(filen)]=':'; while(fgets(line,sizeof(line),fptr)!=NULL) { if (line[0]=='>'|| line[0]==';') goto found; } fclose(fptr); title[0]='\0'; return 0; found: #ifdef PIRLIB if (line[0]=='>'&&(line[3]==';'||line[3]=='>')) { if ((bp = strchr(line,'\n'))!=NULL) *bp='\0'; ll=strlen(line); line[ll++]=' '; line[ll]='\0'; fgets(&line[ll],sizeof(line)-ll,fptr); } #endif #ifdef WIN32 bp = strpbrk(line,"\n\r"); #else bp = strchr(line,'\n'); #endif if (bp!=NULL) *bp = 0; strncpy(title,line,len); title[len-1]='\0'; fclose(fptr); return strlen(title); }