9 /* get the next sequence entry from a fasta format file and
10 return a pointer to a structure containing this information.
12 Format expected of the input file is:
15 ONE LETTER CODE SEQUENCE ON SEVERAL LINES
17 >NEXTIDENT Next title etc.
19 The routine reads lines from the file until it finds one that starts
20 with '>'. It then reads this line as the ID and title line before reading
21 all alphabetic characters that follow as the amino acid sequence.
22 The sequence is terminated by the next '>' or End of File.
24 This means that the file must ONLY contain sequences. PIR format permits
25 more flexibility in the input file.
27 NOTE: This routine assumes that no line will be longer than the
28 p->MAX_BUFFER_LEN. Any program that calls this routine should
29 have p->MAX_BUFFER_LEN set suitably big. This can be done by
30 pre-checking the database file for line lengths.
32 Author: G. J. Barton (October 1995)
/* gseq_fasta: read the next entry from seqfile and return it as a newly
   allocated SEQS with id/ilen, title/tlen and seq filled in.
   seqfile is an already opened stream; the header line is read with fgets.
   NOTE(review): this listing does not show every line of the routine
   (braces, declarations and several conditions are absent), so the
   comments below describe only the statements that are visible. */
36 SEQS *gseq_fasta(FILE *seqfile)
/* Fixed capacities: size of the line buffer and of the sequence buffer. */
41 int MAX_BUFFER_LEN = 10000;
42 int MAX_SEQ_LEN = 10000;
/* Line buffer handed to fgets below. */
50 buff = (char *) GJmalloc(sizeof(char) * MAX_BUFFER_LEN);
/* Read the file one line at a time.
   NOTE(review): buff is overwritten with the return value of fgets, so once
   fgets returns NULL the pointer obtained from GJmalloc is no longer held in
   buff - confirm the buffer is released some other way. */
53 while((buff = fgets(buff,MAX_BUFFER_LEN,seqfile)) != NULL) {
/* Allocate the record that will be returned. */
55 ret_val = (SEQS *) GJmalloc(sizeof(SEQS));
/* Identifier = first space-delimited token starting at the second character
   of the line (the first character is presumably the leading '>'; the test
   for it is not shown here). */
56 ident = strtok(&buff[1]," ");
/* Presumably strips a newline from the token - confirm against GJremovechar2. */
58 ident=GJremovechar2(ident,'\n');
/* Keep a private copy of the identifier and record its length. */
59 ret_val->id = GJstrdup(ident);
60 ret_val->ilen = strlen(ident);
/* Report a problem with the identifier and echo the offending line. */
62 GJerror("Something strange with sequence identifier in fasta file");
63 fprintf(std_err,"Line:%s\n",buff);
/* Title = remainder of the header line, up to the newline. */
66 title = strtok(NULL,"\n");
68 ret_val->title = GJstrdup(title);
69 ret_val->tlen = strlen(title);
71 /* GJerror("Something strange with sequence title in fasta file");*/
72 /* fprintf(std_err,"Line:%s\n",buff);*/
73 /*if(p->VERBOSE > 10)fprintf(std_err,"Title missing in FASTA file - Inserting dummy: %s\n",buff);*/
/* Fallback title "-"; the disabled message above suggests this is the
   missing-title case (the controlling condition is not shown). */
74 ret_val->title = GJstrdup("-");
75 ret_val->tlen = strlen(ret_val->title);
/* Sequence buffer of MAX_SEQ_LEN characters. Element 0 is set to a space
   before any residue is stored (presumably so that residues start at
   index 1 - TODO confirm). */
78 ret_val->seq = (char *) GJmalloc(sizeof(char) * MAX_SEQ_LEN);
79 ret_val->seq[0] = ' ';
/* End of file or the '>' of the next entry terminates the sequence. */
83 if(c == EOF || c == '>') {
85 ret_val->seq[j] = '\0';
/* Shrink the buffer to slen bytes.
   NOTE(review): the assignment of slen is not shown; verify that the new
   size still includes the terminator written at index j above. */
87 ret_val->seq = (char *) GJrealloc(ret_val->seq,sizeof(char) * ret_val->slen);
/* Only alphabetic characters are treated as residues. */
90 }else if(isalpha(c)) {
/* Capacity check: triggers when j reaches MAX_SEQ_LEN - 3. */
91 if(j == (MAX_SEQ_LEN - 3)){
92 fprintf(std_err,"Sequence too long: %s (> %d residues): Increase MAX_SEQ_LEN\n",
/* Store the residue in upper case and advance the write index. */
96 ret_val->seq[j++] = toupper(c);
/* Driver: opens a hard-coded FASTA file and prints the identifier of every
   entry returned by gseq_fasta until gseq_fasta returns NULL.
   argc and argv are not used in the lines shown. */
106 int main(int argc, char **argv)
/* NOTE(review): the path is machine-specific and the result of fopen is not
   checked in the lines shown - a missing file would pass NULL to gseq_fasta. */
110 fasta = fopen("/homes/pvtroshin/large.fasta", "r");
/* Fetch the next entry and print its id; a NULL result ends the loop. */
113 seq = gseq_fasta(fasta);
114 if(seq!=NULL) printf("Seq: %s\n",seq->id );
115 } while(seq != NULL);