#include #include #include #include #include "getquery.h" #include "gjutil.h" /* get the next sequence entry from a fasta format file and return a pointer to a structure containing this information. Format expected of the input file is: >IDENT Title here ONE LETTER CODE SEQUENCE ON SEVERAL LINES LIKE THIS >NEXTIDENT Next title etc. The routine reads lines from the file until it finds one that starts in '>'. It then reads this line as an ID, title line before reading all alphabetic characters that follow as the amino acid sequence. The sequence is terminated by the next '>' or End of File. This means that the must ONLY contain sequences. PIR format permits more flexibility in the input file. NOTE: This routine assumes that no line will be longer than the p->MAX_BUFFER_LEN. Any program that calls this routine should have p->MAX_BUFFER_LEN set suitably big. This can be done by pre-checking the database file for line lengths. Author: G. J. Barton (October 1995) */ SEQS *gseq_fasta(FILE *seqfile) { char *buff,*tbuff; SEQS *ret_val; int MAX_BUFFER_LEN = 10000; int MAX_SEQ_LEN = 10000; char *ident = NULL; char *title = NULL; GJ_S_COUNT j; int c; STD_FILES; buff = (char *) GJmalloc(sizeof(char) * MAX_BUFFER_LEN); tbuff = buff; while((buff = fgets(buff,MAX_BUFFER_LEN,seqfile)) != NULL) { if(buff[0] == '>') { ret_val = (SEQS *) GJmalloc(sizeof(SEQS)); ident = strtok(&buff[1]," "); if(ident != NULL) { ident=GJremovechar2(ident,'\n'); ret_val->id = GJstrdup(ident); ret_val->ilen = strlen(ident); }else { GJerror("Something strange with sequence identifier in fasta file"); fprintf(std_err,"Line:%s\n",buff); return NULL; } title = strtok(NULL,"\n"); if(title != NULL) { ret_val->title = GJstrdup(title); ret_val->tlen = strlen(title); }else { /* GJerror("Something strange with sequence title in fasta file");*/ /* fprintf(std_err,"Line:%s\n",buff);*/ /*if(p->VERBOSE > 10)fprintf(std_err,"Title missing in FASTA file - Inserting dummy: %s\n",buff);*/ ret_val->title = GJstrdup("-"); ret_val->tlen = strlen(ret_val->title); /* return NULL;*/ } ret_val->seq = (char *) GJmalloc(sizeof(char) * MAX_SEQ_LEN); ret_val->seq[0] = ' '; j = 0; for(;;) { c = fgetc(seqfile); if(c == EOF || c == '>') { ungetc(c,seqfile); ret_val->seq[j] = '\0'; ret_val->slen = j; ret_val->seq = (char *) GJrealloc(ret_val->seq,sizeof(char) * ret_val->slen); GJfree(buff); return ret_val; }else if(isalpha(c)) { if(j == (MAX_SEQ_LEN - 3)){ fprintf(std_err,"Sequence too long: %s (> %d residues): Increase MAX_SEQ_LEN\n", ret_val->id,j); exit(-1); } ret_val->seq[j++] = toupper(c); } } } } GJfree(tbuff); return(NULL); } /* int main(int argc, char **argv) { FILE *fasta; SEQS *seq; fasta = fopen("/homes/pvtroshin/large.fasta", "r"); do{ seq = gseq_fasta(fasta); if(seq!=NULL) printf("Seq: %s\n",seq->id ); } while(seq != NULL); fclose(fasta); return 0; } */