JWS-109 Fixed the links in the navbar header and footer to match the new documentatio...
[jabaws.git] / binaries / src / tcoffee / t_coffee_source / pb_util_read_sequence.c
1 /********* Sequence input routines for CLUSTAL W *******************/
2 /* DES was here.  FEB. 1994 */
3 /* Now reads PILEUP/MSF and CLUSTAL alignment files */
4
5 #include <stdio.h>
6 #include <string.h>
7 #include <ctype.h>
8 #include <stdlib.h>
9 #include "io_lib_header.h"
10 #include "util_lib_header.h"
11 #include "define_header.h"
12
13 /*
14 *       Prototypes
15 */
16
17 extern Boolean linetype(char *,char *);
18 extern Boolean blankline(char *);
19 extern void warning(char *,...);
20 extern void error(char *,...);
21 extern char *   rtrim(char *);
22 extern char *   blank_to_(char *);
23 extern void     getstr(char *,char *);
24
25 void fill_chartab(void);
26
27
28 static void         get_seq(char *,char *,int *,char *);
29 static void get_clustal_seq(char *,char *,int *,char *,int);
30 static void     get_msf_seq(char *,char *,int *,char *,int);
31 static void check_infile(int *);
32
33
34
35
36 static int count_clustal_seqs(void);
37 static int count_msf_seqs(void);
38
39 /*
40  *      Global variables
41  */
42
43 static FILE *fin;
44
45
46 char *amino_acid_codes   =    "ABCDEFGHIKLMNPQRSTUVWXYZ-";  /* DES */
47 char *nucleic_acid_order =        "ACGTUN";
48 static int seqFormat;
49 static char chartab[128];
50
51 void fill_chartab(void) /* Create translation and check table */
52 {
53         register int i;
54         register int c;
55
56
57         for(i=0;i<128;chartab[i++]=0);
58         for(i=0,c=0;c<=amino_acid_codes[i];i++)
59                 chartab[c]=chartab[tolower(c)]=c;
60
61 }
62
63 static void get_msf_seq(char *sname,char *seq,int *len,char *tit,int seqno)
64 /* read the seqno_th. sequence from a PILEUP multiple alignment file */
65 {
66         static char *line;
67         int i,j,k;
68         unsigned char c;
69         if ( !line)line=vcalloc ( (MAXLINE+1), sizeof (char));
70
71         fseek(fin,0,0);                 /* start at the beginning */
72
73         *len=0;                         /* initialise length to zero */
74         for(i=0;;i++) {
75                 if(fgets(line,MAXLINE+1,fin)==NULL) return; /* read the title*/
76                 if(linetype(line,"/") ) break;              /* lines...ignore*/
77         }
78
79         while (fgets(line,MAXLINE+1,fin) != NULL) {
80                 if(!blankline(line)) {
81
82                         for(i=1;i<seqno;i++) fgets(line,MAXLINE+1,fin);
83                         for(j=0;j<=strlen(line);j++) if(line[j] != ' ') break;
84                         for(k=j;k<=strlen(line);k++) if(line[k] == ' ') break;
85                         strncpy(sname,line+j,MIN(MAXNAMES,k-j)); 
86                         sname[MIN(MAXNAMES,k-j)]=EOS;
87                         rtrim(sname);
88                         blank_to_(sname);
89
90                         for(i=k;*len < SEQ_MAX_LEN;i++) {
91                                 c=line[i];
92                                 if(c == '.') c = '-';
93                                 if(c == '*') c = 'X';
94                                 if(c == '\n' || c == EOS) break; /* EOL */
95                                 if( (c=chartab[c])) seq[++(*len)]=c;
96                         }
97                         if(*len == SEQ_MAX_LEN) return;
98
99                         for(i=0;;i++) {
100                                 if(fgets(line,MAXLINE+1,fin)==NULL) return;
101                                 if(blankline(line)) break;
102                         }
103                 }
104         }
105 }
106
107
108 static void get_clustal_seq(char *sname,char *seq,int *len,char *tit,int seqno)
109 /* read the seqno_th. sequence from a clustal multiple alignment file */
110 {
111         static char *line;
112         int i,j;
113         unsigned char c;
114         if ( !line)line=vcalloc ( (MAXLINE+1), sizeof (char));
115         fseek(fin,0,0);                 /* start at the beginning */
116
117         *len=0;                         /* initialise length to zero */
118         fgets(line,MAXLINE+1,fin);      /* read the title line...ignore it */
119
120         while (fgets(line,MAXLINE+1,fin) != NULL) {
121                 if(!blankline(line)) {
122
123                         for(i=1;i<seqno;i++) fgets(line,MAXLINE+1,fin);
124                         for(j=0;j<=strlen(line);j++) if(line[j] != ' ') break;
125                         strncpy(sname,line+j,MAXNAMES-j); /* remember entryname */
126                         sname[MAXNAMES]=EOS;
127                         rtrim(sname);
128                         blank_to_(sname);
129
130                         for(i=14;*len < SEQ_MAX_LEN;i++) {
131                                 c=line[i];
132                                 if(c == '\n' || c == EOS) break; /* EOL */
133                                 if( (c=chartab[c])) seq[++(*len)]=c;
134                         }
135                         if(*len == SEQ_MAX_LEN) return;
136
137                         for(i=0;;i++) {
138                                 if(fgets(line,MAXLINE+1,fin)==NULL) return;
139                                 if(blankline(line)) break;
140                         }
141                 }
142         }
143 }
144
145
146 static void get_seq(char *sname,char *seq,int *len,char *tit)
147 {
148         static char *line;
149         int i, offset, c=0;
150
151         if ( !line)line=vcalloc ( (MAXLINE+1), sizeof (char));
152         switch(seqFormat) {
153
154 /************************************/
155                 case EMBLSWISS:
156                         while( !linetype(line,"ID") )
157                                 fgets(line,MAXLINE+1,fin);
158                         
159                         for(i=5;i<=strlen(line);i++)  /* DES */
160                                 if(line[i] != ' ') break;
161                         strncpy(sname,line+i,MAXNAMES); /* remember entryname */
162                         sname[MAXNAMES]=EOS;
163                         rtrim(sname);
164                         blank_to_(sname);
165                         fprintf ( stderr, "\n%s", sname);
166                         while( !linetype(line,"SQ") )
167                                 fgets(line,MAXLINE+1,fin);
168                         
169                         *len=0;
170                         while(fgets(line,MAXLINE+1,fin)) {
171                                 for(i=0;*len < SEQ_MAX_LEN;i++) {
172                                         c=line[i];
173                                 if(c == '\n' || c == EOS || c == '/')
174                                         break;                  /* EOL */
175                                 if( (c=chartab[c]))
176                                         seq[++(*len)]=c;
177                                 }
178                         if(*len == SEQ_MAX_LEN || c == '/') break;
179                         }
180                 break;
181                 
182 /************************************/
183                 case PIR:
184                         while(*line != '>')
185                                 {
186                                 
187                                 fgets(line,MAXLINE+1,fin);
188                                  }                      
189                         for(i=4;i<=strlen(line);i++)  /* DES */
190                                 if(line[i] != ' ') break;
191                         strncpy(sname,line+i,MAXNAMES); /* remember entryname */
192                         sname[MAXNAMES]=EOS;
193                         rtrim(sname);
194                         blank_to_(sname);
195
196                         fgets(line,MAXLINE+1,fin);
197                         strncpy(tit,line,MAXTITLES);
198                         tit[MAXTITLES]=EOS;
199                         i=strlen(tit);
200                         if(tit[i-1]=='\n') tit[i-1]=EOS;
201                         
202                         *len=0;
203                         while(fgets(line,MAXLINE+1,fin)) {
204                                 
205                                 for(i=0;*len < SEQ_MAX_LEN;i++) 
206                                    {
207                                    c=line[i];
208                                    if(c == '\n' || c == EOS || c == '*')
209                                         break;                  /* EOL */
210                         
211                                    if( (c=chartab[c]))
212                                         seq[++(*len)]=c;
213
214                                    }
215                         if(*len == SEQ_MAX_LEN || c == '*') break;
216                         }
217                 break;
218 /***********************************************/
219         case (PEARSON):
220                         
221                         while(*line != '>')
222                                 {
223                                 fgets(line,MAXLINE+1,fin);
224                                 }
225                         for(i=1;i<=strlen(line);i++)  /* DES */
226                                 if(line[i] != ' ') break;
227                         strncpy(sname,line+i,MAXNAMES); /* remember entryname */
228                         sname[MAXNAMES]=EOS;
229                         rtrim(sname);
230                         blank_to_(sname);
231
232                         *tit=EOS;
233                         
234                         *len=0;
235                         while(fgets(line,MAXLINE+1,fin)) 
236                                 {
237                                 for(i=0;*len < SEQ_MAX_LEN;i++) 
238                                         {
239                                         c=line[i];
240                                         if(c == '\n' || c == EOS || c == '>')
241                                                 break;                  /* EOL */
242                                                 
243                                         if( (c=chartab[c]))
244                                                 {seq[++(*len)]=c;
245                                                  }
246                                         }
247                                 if(*len == SEQ_MAX_LEN || c == '>') break;
248                                 }
249                         break;
250 /**********************************************/
251                 case GDE:
252                 
253                         while(*line != '#' ||*line != '%' )
254                                         fgets(line,MAXLINE+1,fin);
255                         
256                         
257                         
258                         
259                         for (i=1;i<=MAXNAMES;i++) {
260                                 if (line[i] == '(' || line[i] == '\n')
261                                   {
262                                     i--;
263                                     break;
264                                   }
265                                 sname[i-1] = line[i];
266                         }
267                         sname[i]=EOS;
268                         offset=0;
269                         if (sname[i-1] == '(') sscanf(&line[i],"%d",&offset);
270                         else offset = 0;
271                         for(i=MAXNAMES-1;i > 0;i--) 
272                                 if(isspace(sname[i])) {
273                                         sname[i]=EOS;   
274                                         break;
275                                 }               
276                         blank_to_(sname);
277
278
279                         *tit=EOS;
280                         
281                         *len=0;
282                         for (i=0;i<offset;i++) seq[++(*len)] = '-';
283                         while(fgets(line,MAXLINE+1,fin)) {
284                         if(*line == '%' || *line == '#' || *line == '"') break;
285                                 for(i=0;*len < SEQ_MAX_LEN;i++) {
286                                         c=line[i];
287                                 if(c == '\n' || c == EOS) 
288                                         break;                  /* EOL */
289                         
290                                 if( (c=chartab[c]))
291                                         seq[++(*len)]=c;
292                                 }
293                         if(*len == SEQ_MAX_LEN) break;
294                         }
295                 break;
296 /***********************************************/
297         }
298         
299         if(*len == SEQ_MAX_LEN)
300                 warning("Sequence %s truncated to %d residues",
301                                 sname,(pint)SEQ_MAX_LEN);
302                                 
303         seq[*len+1]=EOS;
304 }
305
306
307 int readseqs(char *saga_file,char ***SAGA_SEQ, char*** SAGA_NAMES, int ***SAGA_LEN) /*first_seq is the #no. of the first seq. to read */
308 {
309         
310         static char *line;
311         static char *seq1;
312         static char *sname1;
313         static char *title;
314         int i,j,no_seqs;
315         static int l1;
316         int a;
317         int b, l;
318         int first_seq=0;
319         
320         if ( !line) line=vcalloc   ( (FILENAMELEN+1), sizeof (char));
321         if ( !seq1) seq1=vcalloc   ( (SEQ_MAX_LEN+2), sizeof (char));
322         if ( !sname1)sname1=vcalloc ( (MAXNAMES+1), sizeof (char));
323         if ( !title)title=vcalloc  ( (MAXTITLES+1), sizeof (char));
324         
325         fill_chartab();
326         
327         fin=vfopen(saga_file,"r");
328         
329         
330         no_seqs=0;
331         check_infile(&no_seqs);
332
333         if(no_seqs == 0)
334                 return 0;       /* return the number of seqs. (zero here)*/
335         
336         SAGA_SEQ[0]= vcalloc ( no_seqs, sizeof ( char*));
337         SAGA_NAMES[0]= vcalloc ( no_seqs, sizeof ( char*));
338         SAGA_LEN[0]= declare_int ( no_seqs,3);
339         
340         
341         for(i=first_seq;i<=first_seq+no_seqs-1;i++) 
342                 {    /* get the seqs now*/
343                 if(seqFormat == CLUSTAL) 
344                         get_clustal_seq(sname1,seq1,&l1,title,i-first_seq+1);
345                 if(seqFormat == MSF)
346                             get_msf_seq(sname1,seq1,&l1,title,i-first_seq+1);
347                 else
348                         get_seq(sname1,seq1,&l1,title);
349                 
350                 if(l1 > SEQ_MAX_LEN) 
351                         {
352                         error("Sequence too long. Maximum is %d",(pint)SEQ_MAX_LEN);
353                         return 0;       /* also return zero if too many */
354                         }
355                 
356                 
357                 
358                 for ( a=0; a<l1; a++)
359                         seq1[a]=seq1[a+1];      
360                 seq1[l1]='\0';
361                 
362                 (SAGA_SEQ[0])[i]=vcalloc ( strlen ( seq1)+1, sizeof ( char));
363                 (SAGA_NAMES[0])[i]=vcalloc ( strlen ( sname1)+1, sizeof ( char));
364                 
365                 l1=strlen (seq1);
366                 (SAGA_LEN[0])[i][0]=l1;
367                 (SAGA_SEQ[0])[i][l1]='\0';
368                 
369                 sprintf ( (SAGA_SEQ[0])[i], "%s", seq1);
370                 sprintf ( (SAGA_NAMES[0])[i], "%s", sname1);
371                 
372                 l=strlen (SAGA_NAMES[0][i]);
373                 for ( b=0; b< l; b++)
374                   {
375                     if ( isspace(SAGA_NAMES[0][i][b])){SAGA_NAMES[0][i][b]='\0';break;}
376                   }
377                           
378                 
379         }
380
381         fclose(fin);
382 /*
383    JULIE
384    check sequence names are all different - otherwise phylip tree is 
385    confused.
386 */
387         for(i=first_seq;i<=first_seq+no_seqs-1;i++) {
388                 for(j=i+1;j<=first_seq+no_seqs-1;j++) {
389                         if (strncmp((SAGA_NAMES[0])[i],(SAGA_NAMES[0])[j],MAXNAMES) == 0) {
390                                 error("Multiple sequences found with same name (first %d chars are significant)", MAXNAMES);
391                                 return -1;
392                         }
393                 }
394         }
395                         
396         return no_seqs;    /* return the number of seqs. read in this call */
397 }
398
399
400
401
402 static void check_infile(int *nseqs)
403 {
404         static char *line;
405         int i;  
406
407         if ( !line)line=vcalloc ( (MAXLINE+1), sizeof (char));
408         
409         for ( i=0; i<=MAXLINE; i++)line[i]='a';
410         *nseqs=0;
411         while (fgets(line,MAXLINE+1,fin) != NULL) {
412                 if ((*line != '\n') && (*line != ' ') && (*line != '\t'))
413                         break;
414         }
415         
416         for(i=0;i<=6;i++) line[i] = toupper(line[i]);
417
418         if( linetype(line,"ID") ) {                                     /* EMBL/Swiss-Prot format ? */
419                 seqFormat=EMBLSWISS;
420                 (*nseqs)++;
421         }
422         else if( linetype(line,"CLUSTAL") ) {
423                 seqFormat=CLUSTAL;
424         }
425         else if( linetype(line,"PILEUP") ) {
426                 seqFormat = MSF;
427         }
428         else if(*line == '>') {                                         /* no */
429                 seqFormat=(line[3] == ';')?PIR:PEARSON; /* distinguish PIR and Pearson */
430                 (*nseqs)++;
431         }
432         else if((*line == '"') || (*line == '%') || (*line == '#')) {
433                 seqFormat=GDE; /* GDE format */
434                 if (*line == '%') {
435                         (*nseqs)++;
436                         
437                 }
438                 else if (*line == '#') {
439                         (*nseqs)++;
440                         
441                 }
442         }
443         else {
444                 seqFormat=UNKNOWN;
445                 return;
446         }
447
448         while(fgets(line,MAXLINE+1,fin) != NULL) {
449                 switch(seqFormat) {
450                         case EMBLSWISS:
451                                 if( linetype(line,"ID") )
452                                         (*nseqs)++;
453                                 break;
454                         case PIR:
455                         case PEARSON:
456                                 if( *line == '>' )
457                                         (*nseqs)++;
458                                 break;
459                         case GDE:
460                                 if(( *line == '%' ) )
461                                         (*nseqs)++;
462                                 else if (( *line == '#') )
463                                         (*nseqs)++;
464                                 break;
465                         case CLUSTAL:
466                                 *nseqs = count_clustal_seqs();
467 /* DES */                       /* fprintf(stdout,"\nnseqs = %d\n",(pint)*nseqs); */
468                                 fseek(fin,0,0);
469                                 return;
470                                 break;
471                         case MSF:
472                                 *nseqs = count_msf_seqs();
473                                 fseek(fin,0,0);
474                                 return;
475                                 break;
476                         case USER:
477                         default:
478                                 break;
479                 }
480         }
481         fseek(fin,0,0);
482 }
483
484
485 static int count_clustal_seqs(void)
486 /* count the number of sequences in a clustal alignment file */
487 {
488         static char *line;
489         int  nseqs;
490
491         if ( !line)line=vcalloc ( (MAXLINE+1), sizeof (char));
492
493         while (fgets(line,MAXLINE+1,fin) != NULL) {
494                 if(!blankline(line)) break;             /* Look for next non- */
495         }                                               /* blank line */
496         nseqs = 1;
497
498         while (fgets(line,MAXLINE+1,fin) != NULL) {
499                 if(blankline(line)) return nseqs;
500                 nseqs++;
501         }
502
503         return 0;       /* if you got to here-funny format/no seqs.*/
504 }
505
506 static int count_msf_seqs(void)
507 {
508 /* count the number of sequences in a PILEUP alignment file */
509
510         static char *line;
511         int  nseqs;
512
513         if ( !line)line=vcalloc ( (MAXLINE+1), sizeof (char));
514
515         while (fgets(line,MAXLINE+1,fin) != NULL) {
516                 if(linetype(line,"/")) break;
517         }
518
519         while (fgets(line,MAXLINE+1,fin) != NULL) {
520                 if(!blankline(line)) break;             /* Look for next non- */
521         }                                               /* blank line */
522         nseqs = 1;
523
524         while (fgets(line,MAXLINE+1,fin) != NULL) {
525                 if(blankline(line)) return nseqs;
526                 nseqs++;
527         }
528
529         return 0;       /* if you got to here-funny format/no seqs.*/
530 }
531
532
533
534 /******************************COPYRIGHT NOTICE*******************************/
535 /*© Centro de Regulacio Genomica */
536 /*and */
537 /*Cedric Notredame */
538 /*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
539 /*All rights reserved.*/
540 /*This file is part of T-COFFEE.*/
541 /**/
542 /*    T-COFFEE is free software; you can redistribute it and/or modify*/
543 /*    it under the terms of the GNU General Public License as published by*/
544 /*    the Free Software Foundation; either version 2 of the License, or*/
545 /*    (at your option) any later version.*/
546 /**/
547 /*    T-COFFEE is distributed in the hope that it will be useful,*/
548 /*    but WITHOUT ANY WARRANTY; without even the implied warranty of*/
549 /*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the*/
550 /*    GNU General Public License for more details.*/
551 /**/
552 /*    You should have received a copy of the GNU General Public License*/
553 /*    along with Foobar; if not, write to the Free Software*/
554 /*    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA*/
555 /*...............................................                                                                                      |*/
556 /*  If you need some more information*/
557 /*  cedric.notredame@europe.com*/
558 /*...............................................                                                                                                                                     |*/
559 /**/
560 /**/
561 /*      */
562 /******************************COPYRIGHT NOTICE*******************************/