1 /********* Sequence input routines for CLUSTAL W *******************/
2 /* DES was here. FEB. 1994 */
3 /* Now reads PILEUP/MSF and CLUSTAL alignment files */
9 #include "io_lib_header.h"
10 #include "util_lib_header.h"
11 #include "define_header.h"
17 extern Boolean linetype(char *,char *);
18 extern Boolean blankline(char *);
19 extern void warning(char *,...);
20 extern void error(char *,...);
21 extern char * rtrim(char *);
22 extern char * blank_to_(char *);
23 extern void getstr(char *,char *);
25 void fill_chartab(void);
28 static void get_seq(char *,char *,int *,char *);
29 static void get_clustal_seq(char *,char *,int *,char *,int);
30 static void get_msf_seq(char *,char *,int *,char *,int);
31 static void check_infile(int *);
36 static int count_clustal_seqs(void);
37 static int count_msf_seqs(void);
46 char *amino_acid_codes = "ABCDEFGHIKLMNPQRSTUVWXYZ-"; /* DES */
47 char *nucleic_acid_order = "ACGTUN";
49 static char chartab[128];
51 void fill_chartab(void) /* Create translation and check table */
57 for(i=0;i<128;chartab[i++]=0);
58 for(i=0,c=0;c<=amino_acid_codes[i];i++)
59 chartab[c]=chartab[tolower(c)]=c;
63 static void get_msf_seq(char *sname,char *seq,int *len,char *tit,int seqno)
64 /* read the seqno_th. sequence from a PILEUP multiple alignment file */
69 if ( !line)line=vcalloc ( (MAXLINE+1), sizeof (char));
71 fseek(fin,0,0); /* start at the beginning */
73 *len=0; /* initialise length to zero */
75 if(fgets(line,MAXLINE+1,fin)==NULL) return; /* read the title*/
76 if(linetype(line,"/") ) break; /* lines...ignore*/
79 while (fgets(line,MAXLINE+1,fin) != NULL) {
80 if(!blankline(line)) {
82 for(i=1;i<seqno;i++) fgets(line,MAXLINE+1,fin);
83 for(j=0;j<=strlen(line);j++) if(line[j] != ' ') break;
84 for(k=j;k<=strlen(line);k++) if(line[k] == ' ') break;
85 strncpy(sname,line+j,MIN(MAXNAMES,k-j));
86 sname[MIN(MAXNAMES,k-j)]=EOS;
90 for(i=k;*len < SEQ_MAX_LEN;i++) {
94 if(c == '\n' || c == EOS) break; /* EOL */
95 if( (c=chartab[c])) seq[++(*len)]=c;
97 if(*len == SEQ_MAX_LEN) return;
100 if(fgets(line,MAXLINE+1,fin)==NULL) return;
101 if(blankline(line)) break;
108 static void get_clustal_seq(char *sname,char *seq,int *len,char *tit,int seqno)
109 /* read the seqno_th. sequence from a clustal multiple alignment file */
114 if ( !line)line=vcalloc ( (MAXLINE+1), sizeof (char));
115 fseek(fin,0,0); /* start at the beginning */
117 *len=0; /* initialise length to zero */
118 fgets(line,MAXLINE+1,fin); /* read the title line...ignore it */
120 while (fgets(line,MAXLINE+1,fin) != NULL) {
121 if(!blankline(line)) {
123 for(i=1;i<seqno;i++) fgets(line,MAXLINE+1,fin);
124 for(j=0;j<=strlen(line);j++) if(line[j] != ' ') break;
125 strncpy(sname,line+j,MAXNAMES-j); /* remember entryname */
130 for(i=14;*len < SEQ_MAX_LEN;i++) {
132 if(c == '\n' || c == EOS) break; /* EOL */
133 if( (c=chartab[c])) seq[++(*len)]=c;
135 if(*len == SEQ_MAX_LEN) return;
138 if(fgets(line,MAXLINE+1,fin)==NULL) return;
139 if(blankline(line)) break;
146 static void get_seq(char *sname,char *seq,int *len,char *tit)
151 if ( !line)line=vcalloc ( (MAXLINE+1), sizeof (char));
154 /************************************/
156 while( !linetype(line,"ID") )
157 fgets(line,MAXLINE+1,fin);
159 for(i=5;i<=strlen(line);i++) /* DES */
160 if(line[i] != ' ') break;
161 strncpy(sname,line+i,MAXNAMES); /* remember entryname */
165 fprintf ( stderr, "\n%s", sname);
166 while( !linetype(line,"SQ") )
167 fgets(line,MAXLINE+1,fin);
170 while(fgets(line,MAXLINE+1,fin)) {
171 for(i=0;*len < SEQ_MAX_LEN;i++) {
173 if(c == '\n' || c == EOS || c == '/')
178 if(*len == SEQ_MAX_LEN || c == '/') break;
182 /************************************/
187 fgets(line,MAXLINE+1,fin);
189 for(i=4;i<=strlen(line);i++) /* DES */
190 if(line[i] != ' ') break;
191 strncpy(sname,line+i,MAXNAMES); /* remember entryname */
196 fgets(line,MAXLINE+1,fin);
197 strncpy(tit,line,MAXTITLES);
200 if(tit[i-1]=='\n') tit[i-1]=EOS;
203 while(fgets(line,MAXLINE+1,fin)) {
205 for(i=0;*len < SEQ_MAX_LEN;i++)
208 if(c == '\n' || c == EOS || c == '*')
215 if(*len == SEQ_MAX_LEN || c == '*') break;
218 /***********************************************/
223 fgets(line,MAXLINE+1,fin);
225 for(i=1;i<=strlen(line);i++) /* DES */
226 if(line[i] != ' ') break;
227 strncpy(sname,line+i,MAXNAMES); /* remember entryname */
235 while(fgets(line,MAXLINE+1,fin))
237 for(i=0;*len < SEQ_MAX_LEN;i++)
240 if(c == '\n' || c == EOS || c == '>')
247 if(*len == SEQ_MAX_LEN || c == '>') break;
250 /**********************************************/
253 while(*line != '#' ||*line != '%' )
254 fgets(line,MAXLINE+1,fin);
259 for (i=1;i<=MAXNAMES;i++) {
260 if (line[i] == '(' || line[i] == '\n')
265 sname[i-1] = line[i];
269 if (sname[i-1] == '(') sscanf(&line[i],"%d",&offset);
271 for(i=MAXNAMES-1;i > 0;i--)
272 if(isspace(sname[i])) {
282 for (i=0;i<offset;i++) seq[++(*len)] = '-';
283 while(fgets(line,MAXLINE+1,fin)) {
284 if(*line == '%' || *line == '#' || *line == '"') break;
285 for(i=0;*len < SEQ_MAX_LEN;i++) {
287 if(c == '\n' || c == EOS)
293 if(*len == SEQ_MAX_LEN) break;
296 /***********************************************/
299 if(*len == SEQ_MAX_LEN)
300 warning("Sequence %s truncated to %d residues",
301 sname,(pint)SEQ_MAX_LEN);
307 int readseqs(char *saga_file,char ***SAGA_SEQ, char*** SAGA_NAMES, int ***SAGA_LEN) /*first_seq is the #no. of the first seq. to read */
320 if ( !line) line=vcalloc ( (FILENAMELEN+1), sizeof (char));
321 if ( !seq1) seq1=vcalloc ( (SEQ_MAX_LEN+2), sizeof (char));
322 if ( !sname1)sname1=vcalloc ( (MAXNAMES+1), sizeof (char));
323 if ( !title)title=vcalloc ( (MAXTITLES+1), sizeof (char));
327 fin=vfopen(saga_file,"r");
331 check_infile(&no_seqs);
334 return 0; /* return the number of seqs. (zero here)*/
336 SAGA_SEQ[0]= vcalloc ( no_seqs, sizeof ( char*));
337 SAGA_NAMES[0]= vcalloc ( no_seqs, sizeof ( char*));
338 SAGA_LEN[0]= declare_int ( no_seqs,3);
341 for(i=first_seq;i<=first_seq+no_seqs-1;i++)
342 { /* get the seqs now*/
343 if(seqFormat == CLUSTAL)
344 get_clustal_seq(sname1,seq1,&l1,title,i-first_seq+1);
346 get_msf_seq(sname1,seq1,&l1,title,i-first_seq+1);
348 get_seq(sname1,seq1,&l1,title);
352 error("Sequence too long. Maximum is %d",(pint)SEQ_MAX_LEN);
353 return 0; /* also return zero if too many */
358 for ( a=0; a<l1; a++)
362 (SAGA_SEQ[0])[i]=vcalloc ( strlen ( seq1)+1, sizeof ( char));
363 (SAGA_NAMES[0])[i]=vcalloc ( strlen ( sname1)+1, sizeof ( char));
366 (SAGA_LEN[0])[i][0]=l1;
367 (SAGA_SEQ[0])[i][l1]='\0';
369 sprintf ( (SAGA_SEQ[0])[i], "%s", seq1);
370 sprintf ( (SAGA_NAMES[0])[i], "%s", sname1);
372 l=strlen (SAGA_NAMES[0][i]);
373 for ( b=0; b< l; b++)
375 if ( isspace(SAGA_NAMES[0][i][b])){SAGA_NAMES[0][i][b]='\0';break;}
384 check sequence names are all different - otherwise phylip tree is
387 for(i=first_seq;i<=first_seq+no_seqs-1;i++) {
388 for(j=i+1;j<=first_seq+no_seqs-1;j++) {
389 if (strncmp((SAGA_NAMES[0])[i],(SAGA_NAMES[0])[j],MAXNAMES) == 0) {
390 error("Multiple sequences found with same name (first %d chars are significant)", MAXNAMES);
396 return no_seqs; /* return the number of seqs. read in this call */
402 static void check_infile(int *nseqs)
407 if ( !line)line=vcalloc ( (MAXLINE+1), sizeof (char));
409 for ( i=0; i<=MAXLINE; i++)line[i]='a';
411 while (fgets(line,MAXLINE+1,fin) != NULL) {
412 if ((*line != '\n') && (*line != ' ') && (*line != '\t'))
416 for(i=0;i<=6;i++) line[i] = toupper(line[i]);
418 if( linetype(line,"ID") ) { /* EMBL/Swiss-Prot format ? */
422 else if( linetype(line,"CLUSTAL") ) {
425 else if( linetype(line,"PILEUP") ) {
428 else if(*line == '>') { /* no */
429 seqFormat=(line[3] == ';')?PIR:PEARSON; /* distinguish PIR and Pearson */
432 else if((*line == '"') || (*line == '%') || (*line == '#')) {
433 seqFormat=GDE; /* GDE format */
438 else if (*line == '#') {
448 while(fgets(line,MAXLINE+1,fin) != NULL) {
451 if( linetype(line,"ID") )
460 if(( *line == '%' ) )
462 else if (( *line == '#') )
466 *nseqs = count_clustal_seqs();
467 /* DES */ /* fprintf(stdout,"\nnseqs = %d\n",(pint)*nseqs); */
472 *nseqs = count_msf_seqs();
485 static int count_clustal_seqs(void)
486 /* count the number of sequences in a clustal alignment file */
491 if ( !line)line=vcalloc ( (MAXLINE+1), sizeof (char));
493 while (fgets(line,MAXLINE+1,fin) != NULL) {
494 if(!blankline(line)) break; /* Look for next non- */
498 while (fgets(line,MAXLINE+1,fin) != NULL) {
499 if(blankline(line)) return nseqs;
503 return 0; /* if you got to here-funny format/no seqs.*/
506 static int count_msf_seqs(void)
508 /* count the number of sequences in a PILEUP alignment file */
513 if ( !line)line=vcalloc ( (MAXLINE+1), sizeof (char));
515 while (fgets(line,MAXLINE+1,fin) != NULL) {
516 if(linetype(line,"/")) break;
519 while (fgets(line,MAXLINE+1,fin) != NULL) {
520 if(!blankline(line)) break; /* Look for next non- */
524 while (fgets(line,MAXLINE+1,fin) != NULL) {
525 if(blankline(line)) return nseqs;
529 return 0; /* if you got to here-funny format/no seqs.*/
534 /*********************************COPYRIGHT NOTICE**********************************/
535 /*© Centro de Regulacio Genomica */
537 /*Cedric Notredame */
538 /*Tue Oct 27 10:12:26 WEST 2009. */
539 /*All rights reserved.*/
540 /*This file is part of T-COFFEE.*/
542 /* T-COFFEE is free software; you can redistribute it and/or modify*/
543 /* it under the terms of the GNU General Public License as published by*/
544 /* the Free Software Foundation; either version 2 of the License, or*/
545 /* (at your option) any later version.*/
547 /* T-COFFEE is distributed in the hope that it will be useful,*/
548 /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
549 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
550 /* GNU General Public License for more details.*/
552 /* You should have received a copy of the GNU General Public License*/
553 /* along with Foobar; if not, write to the Free Software*/
554 /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
555 /*............................................... |*/
556 /* If you need some more information*/
557 /* cedric.notredame@europe.com*/
558 /*............................................... |*/
562 /*********************************COPYRIGHT NOTICE**********************************/